face-detection, age-gender-recognition, emotion detection

Marc

Hello Everyone,

I would like to ask if it is possible to use 3 different models at the same time on Oak-D Pro. I'm planning to implement Face detection and age-gender-recognition and emotion detection. Thank you so much to those who can answer.

Sincerely,

Marc

jakaskerl

Hi @Marc
Yes, you can run as many models as the HW is capable of handling. If all three models are lightweight, you can expect them to work great, otherwise you will get very low FPS.

Thanks,
Jaka

Marc

jakaskerl Hi Jaka, It's been 4 days since I started the project, but unfortunately, all the approaches I tried were indeed unsuccessful. I'm really having a hard time figuring out how to send the output of face detection into the two models input. I tried adding image manipulation configurations so that I could resize it into two image outputs: (62,62) for age and gender recognition and (64,64) for emotion recognition. Linking the two outputs into one stream named "recognition" output. But it makes the stream confused about what output it should receive because it can only receive one output at a time, I think, by getting the data. I also tried to add a new script node so that I could assign individually different image manipulations and another two-host multi_message_sync, but still, I'm having a problem linking the face detection output into two models, and it doesn't work. I really need some guidelines on how to do it. Please help.

Sincerely,

Marc

from MultiMsgSync import TwoStageHostSeqSync
import blobconverter
import cv2
import depthai as dai
import numpy as np

print("DepthAI version", dai.__version__)
def frame_norm(frame, bbox):
    normVals = np.full(len(bbox), frame.shape[0])
    normVals[::2] = frame.shape[1]
    return (np.clip(np.array(bbox), 0, 1) * normVals).astype(int)

emotions = ['neutral', 'happy', 'sad', 'surprise', 'anger']

def create_pipeline(stereo):
    pipeline = dai.Pipeline()

    print("Creating Color Camera...")
    cam = pipeline.create(dai.node.ColorCamera)
    cam.setPreviewSize(1080, 1080)
    cam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
    cam.setInterleaved(False)
    cam.setBoardSocket(dai.CameraBoardSocket.RGB)

    # Workaround: remove in 2.18, use `cam.setPreviewNumFramesPool(10)`
    # This manip uses 15*3.5 MB => 52 MB of RAM.
    copy_manip = pipeline.create(dai.node.ImageManip)
    copy_manip.setNumFramesPool(15)
    copy_manip.setMaxOutputFrameSize(3499200)
    cam.preview.link(copy_manip.inputImage)

    cam_xout = pipeline.create(dai.node.XLinkOut)
    cam_xout.setStreamName("color")
    copy_manip.out.link(cam_xout.input)

    # ImageManip will resize the frame before sending it to the Face detection NN node
    face_det_manip = pipeline.create(dai.node.ImageManip)
    face_det_manip.initialConfig.setResize(300, 300)
    face_det_manip.initialConfig.setFrameType(dai.RawImgFrame.Type.RGB888p)
    copy_manip.out.link(face_det_manip.inputImage)

    if stereo:
        monoLeft = pipeline.create(dai.node.MonoCamera)
        monoLeft.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
        monoLeft.setBoardSocket(dai.CameraBoardSocket.LEFT)

        monoRight = pipeline.create(dai.node.MonoCamera)
        monoRight.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
        monoRight.setBoardSocket(dai.CameraBoardSocket.RIGHT)

        stereo = pipeline.create(dai.node.StereoDepth)
        stereo.setDefaultProfilePreset(dai.node.StereoDepth.PresetMode.HIGH_DENSITY)
        stereo.setDepthAlign(dai.CameraBoardSocket.RGB)
        monoLeft.out.link(stereo.left)
        monoRight.out.link(stereo.right)

        # Spatial Detection network if OAK-D
        print("OAK-D detected, app will display spatial coordiantes")
        face_det_nn = pipeline.create(dai.node.MobileNetSpatialDetectionNetwork)
        face_det_nn.setBoundingBoxScaleFactor(0.8)
        face_det_nn.setDepthLowerThreshold(100)
        face_det_nn.setDepthUpperThreshold(5000)
        stereo.depth.link(face_det_nn.inputDepth)
    else: # Detection network if OAK-1
        print("OAK-1 detected, app won't display spatial coordiantes")
        face_det_nn = pipeline.create(dai.node.MobileNetDetectionNetwork)

    face_det_nn.setConfidenceThreshold(0.5)
    face_det_nn.setBlobPath(blobconverter.from_zoo(name="face-detection-retail-0004", shaves=6))
    face_det_nn.input.setQueueSize(1)
    face_det_manip.out.link(face_det_nn.input)

    # Send face detections to the host (for bounding boxes)
    face_det_xout = pipeline.create(dai.node.XLinkOut)
    face_det_xout.setStreamName("detection")
    face_det_nn.out.link(face_det_xout.input)

    # Script node will take the output from the face detection NN as an input and set ImageManipConfig
    # to the 'recognition_manip' to crop the initial frame
    image_manip_script = pipeline.create(dai.node.Script)
    face_det_nn.out.link(image_manip_script.inputs['face_det_in'])

    # Remove in 2.18 and use `imgFrame.getSequenceNum()` in Script node
    face_det_nn.passthrough.link(image_manip_script.inputs['passthrough'])

    copy_manip.out.link(image_manip_script.inputs['preview'])

    image_manip_script.setScript("""
    import time
    msgs = dict()

    def add_msg(msg, name, seq = None):
        global msgs
        if seq is None:
            seq = msg.getSequenceNum()
        seq = str(seq)
        # node.warn(f"New msg {name}, seq {seq}")

        # Each seq number has it's own dict of msgs
        if seq not in msgs:
            msgs[seq] = dict()
        msgs[seq][name] = msg

        # To avoid freezing (not necessary for this ObjDet model)
        if 15 < len(msgs):
            node.warn(f"Removing first element! len {len(msgs)}")
            msgs.popitem() # Remove first element

    def get_msgs():
        global msgs
        seq_remove = [] # Arr of sequence numbers to get deleted
        for seq, syncMsgs in msgs.items():
            seq_remove.append(seq) # Will get removed from dict if we find synced msgs pair
            # node.warn(f"Checking sync {seq}")

            # Check if we have both detections and color frame with this sequence number
            if len(syncMsgs) == 2: # 1 frame, 1 detection
                for rm in seq_remove:
                    del msgs[rm]
                # node.warn(f"synced {seq}. Removed older sync values. len {len(msgs)}")
                return syncMsgs # Returned synced msgs
        return None

    def correct_bb(bb):
        if bb.xmin < 0: bb.xmin = 0.001
        if bb.ymin < 0: bb.ymin = 0.001
        if bb.xmax > 1: bb.xmax = 0.999
        if bb.ymax > 1: bb.ymax = 0.999
        return bb

    while True:
        time.sleep(0.001) # Avoid lazy looping

        preview = node.io['preview'].tryGet()
        if preview is not None:
            add_msg(preview, 'preview')

        face_dets = node.io['face_det_in'].tryGet()
        if face_dets is not None:
            # TODO: in 2.18.0.0 use face_dets.getSequenceNum()
            passthrough = node.io['passthrough'].get()
            seq = passthrough.getSequenceNum()
            add_msg(face_dets, 'dets', seq)

        sync_msgs = get_msgs()
        if sync_msgs is not None:
            img = sync_msgs['preview']
            dets = sync_msgs['dets']
            for i, det in enumerate(dets.detections):
                cfg = ImageManipConfig()
                correct_bb(det)
                cfg.setCropRect(det.xmin, det.ymin, det.xmax, det.ymax)
                # node.warn(f"Sending {i + 1}. det. Seq {seq}. Det {det.xmin}, {det.ymin}, {det.xmax}, {det.ymax}")
                cfg.setResize(62, 62)
                cfg.setKeepAspectRatio(False)
                node.io['manip_cfg'].send(cfg)
                node.io['manip_img'].send(img)
                                 
                cfg1 = ImageManipConfig()
                correct_bb(det)
                cfg1.setCropRect(det.xmin, det.ymin, det.xmax, det.ymax)
                # node.warn(f"Sending {i + 1}. age/gender det. Seq {seq}. Det {det.xmin}, {det.ymin}, {det.xmax}, {det.ymax}")
                cfg1.setResize(64, 64)
                cfg1.setKeepAspectRatio(False)
                node.io['emotions_manip_cfg'].send(cfg1)
                node.io['emotions_manip_img'].send(img)
    """)

    recognition_manip = pipeline.create(dai.node.ImageManip)
    recognition_manip.initialConfig.setResize(62, 62)
    recognition_manip.setWaitForConfigInput(True)
    image_manip_script.outputs['manip_cfg'].link(recognition_manip.inputConfig)
    image_manip_script.outputs['manip_img'].link(recognition_manip.inputImage)

    # Second stange recognition NN
    print("Creating recognition Neural Network...")
    recognition_nn = pipeline.create(dai.node.NeuralNetwork)
    recognition_nn.setBlobPath(blobconverter.from_zoo(name="age-gender-recognition-retail-0013", shaves=6))
    recognition_manip.out.link(recognition_nn.input)

    recognition_xout = pipeline.create(dai.node.XLinkOut)
    recognition_xout.setStreamName("recognition")
    recognition_nn.out.link(recognition_xout.input)

    manip_manip = pipeline.create(dai.node.ImageManip)
    manip_manip.initialConfig.setResize(64, 64)
    manip_manip.setWaitForConfigInput(True)
    image_manip_script.outputs['emotions_manip_cfg'].link(manip_manip.inputConfig)
    image_manip_script.outputs['emotions_manip_img'].link(manip_manip.inputImage)

    #2nd stage
    emotions_nn = pipeline.create(dai.node.NeuralNetwork)
    emotions_nn.setBlobPath(blobconverter.from_zoo(name="emotions-recognition-retail-0003", shaves=6))
    manip_manip.out.link(emotions_nn.input)
    
    #linking output
    emotions_nn.out.link(recognition_xout.input)

    return pipeline

with dai.Device() as device:
    stereo = 1 < len(device.getConnectedCameras())
    device.startPipeline(create_pipeline(stereo))

    sync = TwoStageHostSeqSync()
    queues = {}
    emotion_name = None
    age = None
    gender_str = None
    # Create output queues
    for name in ["color", "detection", "recognition"]:
        queues[name] = device.getOutputQueue(name)

    while True:
        for name, q in queues.items():
            # Add all msgs (color frames, object detections and recognitions) to the Sync class.
            if q.has():
                sync.add_msg(q.get(), name)

        msgs = sync.get_msgs()
        if msgs is not None:
            frame = msgs["color"].getCvFrame()
            detections = msgs["detection"].detections
            recognitions = msgs["recognition"]

            for i, detection in enumerate(detections):
                bbox = frame_norm(frame, (detection.xmin, detection.ymin, detection.xmax, detection.ymax))

                # Decoding of recognition results
                rec = recognitions[i]
                layer_names = rec.getAllLayerNames()
                layers = rec.getData()
                print("layers:", layers)
                emotion_data = np.array(rec.getFirstLayerFp16())
                # Decoding of recognition results
                emotion_results = emotion_data
                emotion_name = emotions[np.argmax(emotion_results)] 
                
                if 'age_conv3' in rec.getAllLayerNames():
                    age_conv3_data = rec.getLayerFp16('age_conv3')
                    
                else:
                    print("not found")

                if 'prob' in rec.getAllLayerNames():
                    prob_data = rec.getLayerFp16('prob')
                    
                else:
                    print("not found")
                
                age = int(float(np.squeeze(np.array(age_conv3_data))) * 100)
                gender = np.squeeze(np.array(prob_data))
                gender_str = "Female" if gender[0] > gender[1] else "Male"

                cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (10, 245, 10), 2)
                y = (bbox[1] + bbox[3]) // 2
                cv2.putText(frame, emotion_name, (bbox[0], y), cv2.FONT_HERSHEY_TRIPLEX, 1.5, (0, 0, 0), 8)
                cv2.putText(frame, emotion_name, (bbox[0], y), cv2.FONT_HERSHEY_TRIPLEX, 1.5, (255, 255, 255), 2)
                cv2.putText(frame, str(age), (bbox[0], y), cv2.FONT_HERSHEY_TRIPLEX, 1.5, (0, 0, 0), 8)
                cv2.putText(frame, str(age), (bbox[0], y), cv2.FONT_HERSHEY_TRIPLEX, 1.5, (255, 255, 255), 2)
                cv2.putText(frame, gender_str, (bbox[0], y + 30), cv2.FONT_HERSHEY_TRIPLEX, 1.5, (0, 0, 0), 8)
                cv2.putText(frame, gender_str, (bbox[0], y + 30), cv2.FONT_HERSHEY_TRIPLEX, 1.5, (255, 255, 255), 2)
                if stereo:
                    # You could also get detection.spatialCoordinates.x and detection.spatialCoordinates.y coordinates
                    coords = "Z: {:.2f} m".format(detection.spatialCoordinates.z/1000)
                    cv2.putText(frame, coords, (bbox[0], y + 60), cv2.FONT_HERSHEY_TRIPLEX, 1, (0, 0, 0), 8)
                    cv2.putText(frame, coords, (bbox[0], y + 60), cv2.FONT_HERSHEY_TRIPLEX, 1, (255, 255, 255), 2)

            cv2.imshow("Camera", frame)
        if cv2.waitKey(1) == ord('q'):
            break

Marc

thank you so much jakaskerl.

jakaskerl

Hi @Marc
Could you make a shorter repro code please?

Another note: Maybe try using sync node instead of TwoStageHostSeqSync. It abstracts the whole syncing part so you don't have to worry about it.

Thanks,
Jaka

Marc

jakaskerl

Hi Jakka, It is now working. Thank you so much for your help. Appreciate it the most. Wishing you all the best.

Sincerely,

Marc

Hi, @jakaskerl, it's me again. Good day! I'm trying the head posture detection main_api.py from depthai-experiments on the Oak-D PRO, but I'm having a problem like this discussion link with the depth accuracy. I tested it at a distance of 0.30m, but it gave me 4.17m. Then I tested it again at a distance of 1 m, but still, it gave me a value of 4.17m. Do you have any idea how to calibrate it?

I'm hoping for your reply.

Sincerely,
Marc

jakaskerl

Hi @Marc
Try first if this example works for you. This will tell you if the device is calibrated properly. If yes there is likely a ROI issue (maybe your wall is 4.17m away from the camera and the roi is incorrectly grabbing points on the wall. Could you add some screenshots?

Thanks,
Jaka

Marc

Hi @jakaskerl ,

Sorry for the late reply. I was in the hospital yesterday, I've just arrived. I tried the code on the link you've shared and it works well. Sorry for the confusion the only problem is the head posture detection code. The following is the screenshots of the output. Any idea on how to fix this?

Warm Regards,

Marc

When I put my hand on my head it will have a better value.

jakaskerl

Hi @Marc
The reason for that is because the MBnetSpatialDetection node doesn't work when you input a stream from ImageManip (likely because there is some metadata missing from the message). I have forwarded this to the FW team to fix.

If you drew the spatialBBmapping output for the detection, you would see that the bounding box is incorrectly scaled. Its located somewhere in the top left of the frame, which is why when you raise your hand, the depth suddenly works correctly.

Thanks,
Jaka

Marc

jakaskerl

Thank you so much for help. Appreciate it the most and wishing you all the best.

Best regards,

Marc

jakaskerl

Hi jakaskerl . Good day! It's me again. I wish you had a great day. I am now in the final stage of my project, thanks to you. But unfortunately, I encountered a new problem: the code is accurate for one person only, and when I try two people, their data changes with each other. I think the problem is with my pipeline. I just used one scrip node and created three image configurations for age-gender, emotion recognition, and pose estimation. Because I don't know how to use the sync nodes. Do you have any ideas on how to fix this kind of problem or am I doing the wrong thing. Your help is very much appreciated. Code below is the pipeline repro code. Once again thanks a lot jakaskerl.

Warm regards,

Marc


def create_pipeline():
    print("Creating pipeline...")
    pipeline = dai.Pipeline()
    print("Creating Color Camera...")
    cam = pipeline.create(dai.node.ColorCamera)
    cam.setPreviewSize(1080, 1080)
    cam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
    cam.setInterleaved(False)
    cam.setBoardSocket(dai.CameraBoardSocket.RGB)
    cam_xout = pipeline.createXLinkOut()
    cam_xout.setStreamName("cam_out")
    cam.preview.link(cam_xout.input)
    
    copy_manip = pipeline.create(dai.node.ImageManip)
    copy_manip.setNumFramesPool(15)
    copy_manip.setMaxOutputFrameSize(3499200)
    cam.preview.link(copy_manip.inputImage) 

    # ImageManip that will crop the frame before sending it to the Face detection NN node
    face_det_manip = pipeline.create(dai.node.ImageManip)
    face_det_manip.initialConfig.setResize(300, 300)
    face_det_manip.initialConfig.setFrameType(dai.RawImgFrame.Type.RGB888p)
    copy_manip.out.link(face_det_manip.inputImage)

    face_det_nn.setConfidenceThreshold(0.5)
    face_det_nn.setBlobPath(blobconverter.from_zoo(name="face-detection-retail-0004", shaves=6))
    face_det_nn.input.setQueueSize(1)
    face_det_manip.out.link(face_det_nn.input)
    # Send face detections to the host (for bounding boxes)
    face_det_xout = pipeline.create(dai.node.XLinkOut)
    face_det_xout.setStreamName("face_det_out")
    face_det_nn.out.link(face_det_xout.input)

    image_manip_script = pipeline.create(dai.node.Script)
    image_manip_script.inputs['face_det_in'].setBlocking(False)
    image_manip_script.inputs['face_det_in'].setQueueSize(4)
    face_det_nn.out.link(image_manip_script.inputs['face_det_in'])
    image_manip_script.setScript("""
while True:
    face_dets = node.io['face_det_in'].get().detections
    # node.warn(f"Faces detected: {len(face_dets)}")
    for det in face_dets:
        cfg = ImageManipConfig()
        cfg.setCropRect(det.xmin, det.ymin, det.xmax, det.ymax)
        cfg.setResize(62, 62)
        cfg.setKeepAspectRatio(False)
        node.io['to_manip'].send(cfg)
                                
        cfg1 = ImageManipConfig()
        cfg1.setCropRect(det.xmin, det.ymin, det.xmax, det.ymax)
        cfg1.setResize(64, 64)
        cfg1.setKeepAspectRatio(False)
        node.io['emotions_manip_cfg'].send(cfg1)
                                 
        cfg2 = ImageManipConfig()
        cfg2.setCropRect(det.xmin, det.ymin, det.xmax, det.ymax)
        cfg2.setResize(60, 60)
        cfg2.setKeepAspectRatio(False)
        node.io['pose_manip_cfg'].send(cfg2)
""")
    
    #image manip for age and gender recogntion
    age_gender_manip = pipeline.create(dai.node.ImageManip)
    age_gender_manip.initialConfig.setResize(62, 62)
    age_gender_manip.setWaitForConfigInput(False)
    image_manip_script.outputs['to_manip'].link(age_gender_manip.inputConfig)
    #image manip for emotion recognition
    emotion_manip = pipeline.create(dai.node.ImageManip)
    emotion_manip.initialConfig.setResize(64, 64)
    emotion_manip.setWaitForConfigInput(False)
    image_manip_script.outputs['emotions_manip_cfg'].link(emotion_manip.inputConfig)
    #image manip for head pose estimation
    pose_manip = pipeline.create(dai.node.ImageManip)
    pose_manip.initialConfig.setResize(60, 60)
    pose_manip.setWaitForConfigInput(False)
    image_manip_script.outputs['pose_manip_cfg'].link(pose_manip.inputConfig)
    #if args.camera:
    cam.preview.link(face_det_manip.inputImage)
    cam.preview.link(age_gender_manip.inputImage)
    cam.preview.link(emotion_manip.inputImage)
    cam.preview.link(pose_manip.inputImage)

    # NeuralNetwork for age gender recognition
    print("Creating age_gender Detection Neural Network...")
    age_gender_nn = pipeline.create(dai.node.NeuralNetwork)
    age_gender_nn.setBlobPath(blobconverter.from_zoo(name="age-gender-recognition-retail-0013", shaves=6))
    age_gender_manip.out.link(age_gender_nn.input)
    age_gender_nn_xout = pipeline.create(dai.node.XLinkOut)
    age_gender_nn_xout.setStreamName("age_gender_out")
    age_gender_nn.out.link(age_gender_nn_xout.input)

    #NeuralNetwork for emotion recogniotn
    print("Creating emotion Detection Neural Network...")
    emotion_nn = pipeline.create(dai.node.NeuralNetwork)
    emotion_nn.setBlobPath(blobconverter.from_zoo(name="emotions-recognition-retail-0003", shaves=6))
    emotion_manip.out.link(emotion_nn.input)
    emotion_nn_xout = pipeline.create(dai.node.XLinkOut)
    emotion_nn_xout.setStreamName("emotion_out")
    emotion_nn.out.link(emotion_nn_xout.input)

    #NeuralNetwork for head post estimation recogniotn
    print("Creating head posture Detection Neural Network...")
    pose_nn = pipeline.create(dai.node.NeuralNetwork)
    pose_nn.setBlobPath(blobconverter.from_zoo(name="head-pose-estimation-adas-0001", shaves=6))
    pose_manip.out.link(pose_nn.input)
    pose_nn_xout = pipeline.create(dai.node.XLinkOut)
    pose_nn_xout.setStreamName("pose_out")
    pose_nn.out.link(pose_nn_xout.input)

    #age_gender_manip.out.link(face_det_xout.input)
    #emotion_manip.out.link(face_det_xout.input)
    #pose_manip.out.link(face_det_xout.input)
    print("Pipeline succesfully created")

Marc

Marc Here's the output result.

jakaskerl

Hi @Marc
Can you add the full code (or MRE repro)? The issue might be in the part where you are drawing the bboxes. The bboxes themselves look ok, perhaps there is a mismatch with how the labels are handled?

Thanks,
Jaka

Marc

jakaskerl

Hi jakaskerl. I hope you're doing great. Thank you so much for your patience and guidance. Here's the full code. I'm sorry I can't provide you with an MRE because I still can't find what really causes the change of data. I'm suspecting the pipeline part because of its lack of synchronization. I'm still trying to find where the error occurs so that I can fix my problem. Any tips and advice are very much appreciated. Please help. I'm using Oak-D Pro. Once again, thanks a lot.

Sincerely,

Marc


from pathlib import Path
import blobconverter
import cv2
import depthai as dai
import numpy as np

MIN_THRESHOLD = 15. # Degrees in yaw/pitch/roll to be considered as head movement

def frame_norm(debug_frame, bbox):
    normVals = np.full(len(bbox),debug_frame.shape[0])
    normVals[::2] = debug_frame.shape[1]
    return (np.clip(np.array(bbox), 0, 1) * normVals).astype(int)


emotions = ['neutral', 'happy', 'sad', 'surprise', 'anger']

def create_pipeline(stereo):
    print("Creating pipeline...")
    pipeline = dai.Pipeline()

    #if args.camera:
        # ColorCamera
    print("Creating Color Camera...")
    cam = pipeline.create(dai.node.ColorCamera)
    cam.setPreviewSize(1080, 1080)
    cam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
    cam.setInterleaved(False)
    cam.setBoardSocket(dai.CameraBoardSocket.RGB)
    cam_xout = pipeline.createXLinkOut()
    cam_xout.setStreamName("cam_out")
    cam.preview.link(cam_xout.input)
    
    copy_manip = pipeline.create(dai.node.ImageManip)
    copy_manip.setNumFramesPool(15)
    copy_manip.setMaxOutputFrameSize(3499200)
    cam.preview.link(copy_manip.inputImage) 

    # ImageManip that will crop the frame before sending it to the Face detection NN node
    face_det_manip = pipeline.create(dai.node.ImageManip)
    face_det_manip.initialConfig.setResize(300, 300)
    face_det_manip.initialConfig.setFrameType(dai.RawImgFrame.Type.RGB888p)
    copy_manip.out.link(face_det_manip.inputImage)

  # NeuralNetwork for  face detection
    # Link Face ImageManip -> Face detection NN node
    if stereo:
        monoLeft = pipeline.create(dai.node.MonoCamera)
        monoLeft.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
        monoLeft.setBoardSocket(dai.CameraBoardSocket.LEFT)

        monoRight = pipeline.create(dai.node.MonoCamera)
        monoRight.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
        monoRight.setBoardSocket(dai.CameraBoardSocket.RIGHT)

        stereo = pipeline.create(dai.node.StereoDepth)
        stereo.setDefaultProfilePreset(dai.node.StereoDepth.PresetMode.HIGH_DENSITY)
        stereo.setDepthAlign(dai.CameraBoardSocket.RGB)
        monoLeft.out.link(stereo.left)
        monoRight.out.link(stereo.right)

        # Spatial Detection network if OAK-D
        print("OAK-D detected, app will display spatial coordiantes")
        face_det_nn = pipeline.create(dai.node.MobileNetSpatialDetectionNetwork)
        face_det_nn.setBoundingBoxScaleFactor(0.8)
        face_det_nn.setDepthLowerThreshold(100)
        face_det_nn.setDepthUpperThreshold(5000)
        stereo.depth.link(face_det_nn.inputDepth)
    else: # Detection network if OAK-1
        print("OAK-1 detected, app won't display spatial coordiantes")
        face_det_nn = pipeline.create(dai.node.MobileNetDetectionNetwork)
    face_det_nn.setConfidenceThreshold(0.5)
    face_det_nn.setBlobPath(blobconverter.from_zoo(name="face-detection-retail-0004", shaves=6))
    face_det_nn.input.setQueueSize(1)
    face_det_manip.out.link(face_det_nn.input)
    # Send face detections to the host (for bounding boxes)
    face_det_xout = pipeline.create(dai.node.XLinkOut)
    face_det_xout.setStreamName("face_det_out")
    face_det_nn.out.link(face_det_xout.input)

    image_manip_script = pipeline.create(dai.node.Script)
    image_manip_script.inputs['face_det_in'].setBlocking(False)
    image_manip_script.inputs['face_det_in'].setQueueSize(4)
    face_det_nn.out.link(image_manip_script.inputs['face_det_in'])
    image_manip_script.setScript("""
while True:
    face_dets = node.io['face_det_in'].get().detections
    # node.warn(f"Faces detected: {len(face_dets)}")
    for det in face_dets:
        cfg = ImageManipConfig()
        cfg.setCropRect(det.xmin, det.ymin, det.xmax, det.ymax)
        cfg.setResize(62, 62)
        cfg.setKeepAspectRatio(False)
        node.io['to_manip'].send(cfg)
                                
        cfg1 = ImageManipConfig()
        cfg1.setCropRect(det.xmin, det.ymin, det.xmax, det.ymax)
        cfg1.setResize(64, 64)
        cfg1.setKeepAspectRatio(False)
        node.io['emotions_manip_cfg'].send(cfg1)
                                 
        cfg2 = ImageManipConfig()
        cfg2.setCropRect(det.xmin, det.ymin, det.xmax, det.ymax)
        cfg2.setResize(60, 60)
        cfg2.setKeepAspectRatio(False)
        node.io['pose_manip_cfg'].send(cfg2)
""")
    
    #image manip for age and gender recogntion
    age_gender_manip = pipeline.create(dai.node.ImageManip)
    age_gender_manip.initialConfig.setResize(62, 62)
    age_gender_manip.setWaitForConfigInput(False)
    image_manip_script.outputs['to_manip'].link(age_gender_manip.inputConfig)
    #image manip for emotion recognition
    emotion_manip = pipeline.create(dai.node.ImageManip)
    emotion_manip.initialConfig.setResize(64, 64)
    emotion_manip.setWaitForConfigInput(False)
    image_manip_script.outputs['emotions_manip_cfg'].link(emotion_manip.inputConfig)
    #image manip for head pose estimation
    pose_manip = pipeline.create(dai.node.ImageManip)
    pose_manip.initialConfig.setResize(60, 60)
    pose_manip.setWaitForConfigInput(False)
    image_manip_script.outputs['pose_manip_cfg'].link(pose_manip.inputConfig)

    cam.preview.link(face_det_manip.inputImage)
    cam.preview.link(age_gender_manip.inputImage)
    cam.preview.link(emotion_manip.inputImage)
    cam.preview.link(pose_manip.inputImage)

    # NeuralNetwork for age gender recognition
    print("Creating age_gender Detection Neural Network...")
    age_gender_nn = pipeline.create(dai.node.NeuralNetwork)
    age_gender_nn.setBlobPath(blobconverter.from_zoo(name="age-gender-recognition-retail-0013", shaves=6))
    age_gender_manip.out.link(age_gender_nn.input)
    age_gender_nn_xout = pipeline.create(dai.node.XLinkOut)
    age_gender_nn_xout.setStreamName("age_gender_out")
    age_gender_nn.out.link(age_gender_nn_xout.input)

    #NeuralNetwork for emotion recogniotn
    print("Creating emotion Detection Neural Network...")
    emotion_nn = pipeline.create(dai.node.NeuralNetwork)
    emotion_nn.setBlobPath(blobconverter.from_zoo(name="emotions-recognition-retail-0003", shaves=6))
    emotion_manip.out.link(emotion_nn.input)
    emotion_nn_xout = pipeline.create(dai.node.XLinkOut)
    emotion_nn_xout.setStreamName("emotion_out")
    emotion_nn.out.link(emotion_nn_xout.input)

    #NeuralNetwork for head post estimation recogniotn
    print("Creating head posture Detection Neural Network...")
    pose_nn = pipeline.create(dai.node.NeuralNetwork)
    pose_nn.setBlobPath(blobconverter.from_zoo(name="head-pose-estimation-adas-0001", shaves=6))
    pose_manip.out.link(pose_nn.input)
    pose_nn_xout = pipeline.create(dai.node.XLinkOut)
    pose_nn_xout.setStreamName("pose_out")
    pose_nn.out.link(pose_nn_xout.input)

    print("Pipeline succesfully created")

    return pipeline
#uploading pipeine to the device
with dai.Device() as device:
    stereo = 1 < len(device.getConnectedCameras())

    device.setLogLevel(dai.LogLevel.WARN)
    device.setLogOutputLevel(dai.LogLevel.WARN)
    print("Starting pipeline...")
    device.startPipeline(create_pipeline(stereo))
    cam_out = device.getOutputQueue("cam_out", 4, False)
    face_q = device.getOutputQueue("face_det_out", 4, False)
    age_gender_q = device.getOutputQueue("age_gender_out", 4, False)
    emotion_q = device.getOutputQueue("emotion_out", 4, False)
    pose_q = device.getOutputQueue("pose_out",4, False)

    def get_frame():
        return True, cam_out.get().getCvFrame()
    
    try:
        while True:
            read_correctly, frame = get_frame()
            if not read_correctly:
                break
            if frame is not None:
                
                debug_frame = frame.copy()

            det_in = face_q.tryGet()
            
            if det_in is not None:
                detections = det_in.detections

                for detection in detections:
                    bbox = frame_norm(debug_frame, (detection.xmin, detection.ymin, detection.xmax, detection.ymax))
                    det = age_gender_q.get()
                    det2 = emotion_q.get()
                    det3  = pose_q.get()

                    emotion_results = np.array(det2.getFirstLayerFp16())
                    emotion_name = emotions[np.argmax(emotion_results)]
                    age = int(float(np.squeeze(np.array(det.getLayerFp16('age_conv3')))) * 100)
                    gender = np.squeeze(np.array(det.getLayerFp16('prob')))
                    gender_str = "female" if gender[0] > gender[1] else "male"
                    confidence = detection.confidence

                    # Decoding of recognition results
                    yaw = det3.getLayerFp16('angle_y_fc')[0]
                    pitch = det3.getLayerFp16('angle_p_fc')[0]
                    roll = det3.getLayerFp16('angle_r_fc')[0]
                    
                    """
                    pitch > 0 Head down, < 0 look up
                    yaw > 0 Turn right < 0 Turn left
                    roll > 0 Tilt right, < 0 Tilt left
                    """
                    #for emotion recognitions decoding the result
                    vals = np.array([abs(pitch),abs(yaw),abs(roll)])
                    max_index = np.argmax(vals)
                    txt = None  

                    if vals[max_index] > MIN_THRESHOLD:

                        cv2.putText(debug_frame, "pitch:{:.0f}, yaw:{:.0f}, roll:{:.0f}".format(pitch,yaw,roll), (bbox[0]+10-15, bbox[1]-15), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 8)

                        if max_index == 1:

                            if yaw > 0:
                                txt = "Turn right"

                            else:
                                txt = "Turn left"

                        elif max_index == 2:

                            if roll > 0: txt = "Tilt right"
                            else: txt = "Tilt left" 
                    
                        cv2.putText(debug_frame, txt, (bbox[0], bbox[1]+30), cv2.FONT_HERSHEY_TRIPLEX, 1, (0, 0, 0), 8)
                        cv2.putText(debug_frame, txt, (bbox[0], bbox[1]+30), cv2.FONT_HERSHEY_TRIPLEX, 1, (255, 255, 255), 2)

                    cv2.rectangle(debug_frame, (bbox[0], bbox[1]),(bbox[2], bbox[3]), (10, 245, 10), 2)    
                    cv2.putText(debug_frame, emotion_name, (bbox[0], bbox[1]-125), cv2.FONT_HERSHEY_TRIPLEX, .5, (0, 0, 0), 8)
                    cv2.putText(debug_frame, emotion_name, (bbox[0], bbox[1]-125), cv2.FONT_HERSHEY_TRIPLEX, .5, (255, 255, 255), 2)
                    cv2.putText(debug_frame, str(age), (bbox[0]+125, bbox[1]-10), cv2.FONT_HERSHEY_TRIPLEX, .5, (0, 0, 0), 8)
                    cv2.putText(debug_frame, str(age), (bbox[0]+125, bbox[1]-10), cv2.FONT_HERSHEY_TRIPLEX, .5, (255, 255, 255), 2)
                    cv2.putText(debug_frame, f"Score:{confidence}", (bbox[0], bbox[1]-50), cv2.FONT_HERSHEY_TRIPLEX, .5, (0, 0, 0), 8)
                    cv2.putText(debug_frame, f"Score:{confidence}", (bbox[0], bbox[1]-50), cv2.FONT_HERSHEY_TRIPLEX, .5, (255, 255, 255), 2)
                    cv2.putText(debug_frame, gender_str, (bbox[0], bbox[1]-10), cv2.FONT_HERSHEY_TRIPLEX, .5, (0, 0, 0), 8)
                    cv2.putText(debug_frame, gender_str, (bbox[0], bbox[1]-10), cv2.FONT_HERSHEY_TRIPLEX, .5, (255, 255, 255), 2)

                    #measuring depth distance
                    if stereo:
                        # You could also get detection.spatialCoordinates.x and detection.spatialCoordinates.y coordinates
                        coords = "{:.2f}".format((detection.spatialCoordinates.z /1000))
                        cv2.putText(debug_frame, coords, (bbox[0], bbox[1] + 60), cv2.FONT_HERSHEY_TRIPLEX, 1.5, (0, 0, 0), 8)
                        cv2.putText(debug_frame, coords, (bbox[0], bbox[1] + 60), cv2.FONT_HERSHEY_TRIPLEX, 1.5, (255, 255, 255), 2)

            aspect_ratio = debug_frame.shape[1] / debug_frame.shape[0]
            cv2.imshow("Camera_view", debug_frame)
            if cv2.waitKey(1) == ord('q'):
                cv2.destroyAllWindows()
                break
    except KeyboardInterrupt:
        pass

jakaskerl

Marc for detection in detections:
bbox = frame_norm(debug_frame, (detection.xmin, detection.ymin, detection.xmax, detection.ymax))
det = age_gender_q.get()
det2 = emotion_q.get()
det3 = pose_q.get()

These are not ordered the same way every time. For each frame, you are getting two detections which stay together, but the other 3 NN's produce detections one after another. This means there is no way for you to know which result corresponds to which detection.

You need to fix this, then it should work.

Thanks,
Jaka