• DepthAI-v2
  • face-detection, age-gender-recognition, emotion detection

Hello Everyone,

I would like to ask if it is possible to run 3 different models at the same time on the OAK-D Pro. I'm planning to implement face detection, age-gender recognition, and emotion recognition. Thank you so much to anyone who can answer.

Sincerely,

Marc

Hi @Marc
Yes, you can run as many models as the HW is capable of handling. If all three models are lightweight, you can expect them to work well; otherwise you will get very low FPS.

Thanks,
Jaka
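
For reference, a minimal sketch of what running two lightweight models in parallel can look like. This only shows several NN nodes coexisting in one pipeline (it does not yet crop detected faces for the second model), and the blob names below are just the usual zoo models:

    import blobconverter
    import depthai as dai

    pipeline = dai.Pipeline()

    cam = pipeline.create(dai.node.ColorCamera)
    cam.setPreviewSize(300, 300)
    cam.setInterleaved(False)

    # Face detection runs directly on the 300x300 preview
    face_nn = pipeline.create(dai.node.MobileNetDetectionNetwork)
    face_nn.setConfidenceThreshold(0.5)
    face_nn.setBlobPath(blobconverter.from_zoo(name="face-detection-retail-0004", shaves=6))
    cam.preview.link(face_nn.input)

    # A second model gets its own resized copy of the preview
    manip = pipeline.create(dai.node.ImageManip)
    manip.initialConfig.setResize(64, 64)
    cam.preview.link(manip.inputImage)

    emotion_nn = pipeline.create(dai.node.NeuralNetwork)
    emotion_nn.setBlobPath(blobconverter.from_zoo(name="emotions-recognition-retail-0003", shaves=6))
    manip.out.link(emotion_nn.input)

    # Each model streams its results to the host on its own XLink stream
    for name, nn in [("face", face_nn), ("emotion", emotion_nn)]:
        xout = pipeline.create(dai.node.XLinkOut)
        xout.setStreamName(name)
        nn.out.link(xout.input)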

  • Marc replied to this.

    Thank you so much, jakaskerl.

    jakaskerl Hi Jaka, it's been 4 days since I started the project, but unfortunately all the approaches I tried were unsuccessful. I'm really having a hard time figuring out how to feed the face detection output into the inputs of the two other models. I tried adding ImageManip configurations so that I could resize the face crop into two outputs: (62, 62) for age-gender recognition and (64, 64) for emotion recognition, and then linked the two recognition outputs into one stream named "recognition". But that confuses the stream about which output it should receive, because I think it can only receive one output at a time when getting the data. I also tried adding a new Script node so that I could assign the different image manipulations individually, plus another two-stage host multi-message sync, but I'm still having a problem linking the face detection output into the two models, and it doesn't work. I really need some guidelines on how to do it. Please help.

    Sincerely,

    Marc

    from MultiMsgSync import TwoStageHostSeqSync
    import blobconverter
    import cv2
    import depthai as dai
    import numpy as np
    
    print("DepthAI version", dai.__version__)
    def frame_norm(frame, bbox):
        normVals = np.full(len(bbox), frame.shape[0])
        normVals[::2] = frame.shape[1]
        return (np.clip(np.array(bbox), 0, 1) * normVals).astype(int)
    
    emotions = ['neutral', 'happy', 'sad', 'surprise', 'anger']
    
    def create_pipeline(stereo):
        pipeline = dai.Pipeline()
    
        print("Creating Color Camera...")
        cam = pipeline.create(dai.node.ColorCamera)
        cam.setPreviewSize(1080, 1080)
        cam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
        cam.setInterleaved(False)
        cam.setBoardSocket(dai.CameraBoardSocket.RGB)
    
        # Workaround: remove in 2.18, use `cam.setPreviewNumFramesPool(10)`
        # This manip uses 15*3.5 MB => 52 MB of RAM.
        copy_manip = pipeline.create(dai.node.ImageManip)
        copy_manip.setNumFramesPool(15)
        copy_manip.setMaxOutputFrameSize(3499200)
        cam.preview.link(copy_manip.inputImage)
    
        cam_xout = pipeline.create(dai.node.XLinkOut)
        cam_xout.setStreamName("color")
        copy_manip.out.link(cam_xout.input)
    
        # ImageManip will resize the frame before sending it to the Face detection NN node
        face_det_manip = pipeline.create(dai.node.ImageManip)
        face_det_manip.initialConfig.setResize(300, 300)
        face_det_manip.initialConfig.setFrameType(dai.RawImgFrame.Type.RGB888p)
        copy_manip.out.link(face_det_manip.inputImage)
    
        if stereo:
            monoLeft = pipeline.create(dai.node.MonoCamera)
            monoLeft.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
            monoLeft.setBoardSocket(dai.CameraBoardSocket.LEFT)
    
            monoRight = pipeline.create(dai.node.MonoCamera)
            monoRight.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
            monoRight.setBoardSocket(dai.CameraBoardSocket.RIGHT)
    
            stereo = pipeline.create(dai.node.StereoDepth)
            stereo.setDefaultProfilePreset(dai.node.StereoDepth.PresetMode.HIGH_DENSITY)
            stereo.setDepthAlign(dai.CameraBoardSocket.RGB)
            monoLeft.out.link(stereo.left)
            monoRight.out.link(stereo.right)
    
            # Spatial Detection network if OAK-D
            print("OAK-D detected, app will display spatial coordiantes")
            face_det_nn = pipeline.create(dai.node.MobileNetSpatialDetectionNetwork)
            face_det_nn.setBoundingBoxScaleFactor(0.8)
            face_det_nn.setDepthLowerThreshold(100)
            face_det_nn.setDepthUpperThreshold(5000)
            stereo.depth.link(face_det_nn.inputDepth)
        else: # Detection network if OAK-1
            print("OAK-1 detected, app won't display spatial coordiantes")
            face_det_nn = pipeline.create(dai.node.MobileNetDetectionNetwork)
    
        face_det_nn.setConfidenceThreshold(0.5)
        face_det_nn.setBlobPath(blobconverter.from_zoo(name="face-detection-retail-0004", shaves=6))
        face_det_nn.input.setQueueSize(1)
        face_det_manip.out.link(face_det_nn.input)
    
        # Send face detections to the host (for bounding boxes)
        face_det_xout = pipeline.create(dai.node.XLinkOut)
        face_det_xout.setStreamName("detection")
        face_det_nn.out.link(face_det_xout.input)
    
        # Script node will take the output from the face detection NN as an input and set ImageManipConfig
        # to the 'recognition_manip' to crop the initial frame
        image_manip_script = pipeline.create(dai.node.Script)
        face_det_nn.out.link(image_manip_script.inputs['face_det_in'])
    
        # Remove in 2.18 and use `imgFrame.getSequenceNum()` in Script node
        face_det_nn.passthrough.link(image_manip_script.inputs['passthrough'])
    
        copy_manip.out.link(image_manip_script.inputs['preview'])
    
        image_manip_script.setScript("""
        import time
        msgs = dict()
    
        def add_msg(msg, name, seq = None):
            global msgs
            if seq is None:
                seq = msg.getSequenceNum()
            seq = str(seq)
            # node.warn(f"New msg {name}, seq {seq}")
    
        # Each seq number has its own dict of msgs
            if seq not in msgs:
                msgs[seq] = dict()
            msgs[seq][name] = msg
    
            # To avoid freezing (not necessary for this ObjDet model)
            if 15 < len(msgs):
                node.warn(f"Removing first element! len {len(msgs)}")
                msgs.popitem() # Remove first element
    
        def get_msgs():
            global msgs
            seq_remove = [] # Arr of sequence numbers to get deleted
            for seq, syncMsgs in msgs.items():
                seq_remove.append(seq) # Will get removed from dict if we find synced msgs pair
                # node.warn(f"Checking sync {seq}")
    
                # Check if we have both detections and color frame with this sequence number
                if len(syncMsgs) == 2: # 1 frame, 1 detection
                    for rm in seq_remove:
                        del msgs[rm]
                    # node.warn(f"synced {seq}. Removed older sync values. len {len(msgs)}")
                    return syncMsgs # Returned synced msgs
            return None
    
        def correct_bb(bb):
            if bb.xmin < 0: bb.xmin = 0.001
            if bb.ymin < 0: bb.ymin = 0.001
            if bb.xmax > 1: bb.xmax = 0.999
            if bb.ymax > 1: bb.ymax = 0.999
            return bb
    
        while True:
            time.sleep(0.001) # Avoid lazy looping
    
            preview = node.io['preview'].tryGet()
            if preview is not None:
                add_msg(preview, 'preview')
    
            face_dets = node.io['face_det_in'].tryGet()
            if face_dets is not None:
                # TODO: in 2.18.0.0 use face_dets.getSequenceNum()
                passthrough = node.io['passthrough'].get()
                seq = passthrough.getSequenceNum()
                add_msg(face_dets, 'dets', seq)
    
            sync_msgs = get_msgs()
            if sync_msgs is not None:
                img = sync_msgs['preview']
                dets = sync_msgs['dets']
                for i, det in enumerate(dets.detections):
                    cfg = ImageManipConfig()
                    correct_bb(det)
                    cfg.setCropRect(det.xmin, det.ymin, det.xmax, det.ymax)
                    # node.warn(f"Sending {i + 1}. det. Seq {seq}. Det {det.xmin}, {det.ymin}, {det.xmax}, {det.ymax}")
                    cfg.setResize(62, 62)
                    cfg.setKeepAspectRatio(False)
                    node.io['manip_cfg'].send(cfg)
                    node.io['manip_img'].send(img)
                                     
                    cfg1 = ImageManipConfig()
                    correct_bb(det)
                    cfg1.setCropRect(det.xmin, det.ymin, det.xmax, det.ymax)
                    # node.warn(f"Sending {i + 1}. age/gender det. Seq {seq}. Det {det.xmin}, {det.ymin}, {det.xmax}, {det.ymax}")
                    cfg1.setResize(64, 64)
                    cfg1.setKeepAspectRatio(False)
                    node.io['emotions_manip_cfg'].send(cfg1)
                    node.io['emotions_manip_img'].send(img)
        """)
    
        recognition_manip = pipeline.create(dai.node.ImageManip)
        recognition_manip.initialConfig.setResize(62, 62)
        recognition_manip.setWaitForConfigInput(True)
        image_manip_script.outputs['manip_cfg'].link(recognition_manip.inputConfig)
        image_manip_script.outputs['manip_img'].link(recognition_manip.inputImage)
    
        # Second stage recognition NN (age/gender)
        print("Creating recognition Neural Network...")
        recognition_nn = pipeline.create(dai.node.NeuralNetwork)
        recognition_nn.setBlobPath(blobconverter.from_zoo(name="age-gender-recognition-retail-0013", shaves=6))
        recognition_manip.out.link(recognition_nn.input)
    
        recognition_xout = pipeline.create(dai.node.XLinkOut)
        recognition_xout.setStreamName("recognition")
        recognition_nn.out.link(recognition_xout.input)
    
        manip_manip = pipeline.create(dai.node.ImageManip)
        manip_manip.initialConfig.setResize(64, 64)
        manip_manip.setWaitForConfigInput(True)
        image_manip_script.outputs['emotions_manip_cfg'].link(manip_manip.inputConfig)
        image_manip_script.outputs['emotions_manip_img'].link(manip_manip.inputImage)
    
        # 2nd stage: emotion recognition NN
        emotions_nn = pipeline.create(dai.node.NeuralNetwork)
        emotions_nn.setBlobPath(blobconverter.from_zoo(name="emotions-recognition-retail-0003", shaves=6))
        manip_manip.out.link(emotions_nn.input)
        
        # Link the emotion NN output into the same 'recognition' stream as well
        emotions_nn.out.link(recognition_xout.input)
    
        return pipeline
    
    with dai.Device() as device:
        stereo = 1 < len(device.getConnectedCameras())
        device.startPipeline(create_pipeline(stereo))
    
        sync = TwoStageHostSeqSync()
        queues = {}
        emotion_name = None
        age = None
        gender_str = None
        # Create output queues
        for name in ["color", "detection", "recognition"]:
            queues[name] = device.getOutputQueue(name)
    
        while True:
            for name, q in queues.items():
                # Add all msgs (color frames, object detections and recognitions) to the Sync class.
                if q.has():
                    sync.add_msg(q.get(), name)
    
            msgs = sync.get_msgs()
            if msgs is not None:
                frame = msgs["color"].getCvFrame()
                detections = msgs["detection"].detections
                recognitions = msgs["recognition"]
    
                for i, detection in enumerate(detections):
                    bbox = frame_norm(frame, (detection.xmin, detection.ymin, detection.xmax, detection.ymax))
    
                    # Decoding of recognition results
                    rec = recognitions[i]
                    layer_names = rec.getAllLayerNames()
                    layers = rec.getData()
                    print("layers:", layers)
                    emotion_results = np.array(rec.getFirstLayerFp16())
                    emotion_name = emotions[np.argmax(emotion_results)]
                    
                    if 'age_conv3' in rec.getAllLayerNames():
                        age_conv3_data = rec.getLayerFp16('age_conv3')
                        
                    else:
                        print("not found")
    
                    if 'prob' in rec.getAllLayerNames():
                        prob_data = rec.getLayerFp16('prob')
                        
                    else:
                        print("not found")
                    
                    age = int(float(np.squeeze(np.array(age_conv3_data))) * 100)
                    gender = np.squeeze(np.array(prob_data))
                    gender_str = "Female" if gender[0] > gender[1] else "Male"
    
                    cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (10, 245, 10), 2)
                    y = (bbox[1] + bbox[3]) // 2
                    cv2.putText(frame, emotion_name, (bbox[0], y), cv2.FONT_HERSHEY_TRIPLEX, 1.5, (0, 0, 0), 8)
                    cv2.putText(frame, emotion_name, (bbox[0], y), cv2.FONT_HERSHEY_TRIPLEX, 1.5, (255, 255, 255), 2)
                    cv2.putText(frame, str(age), (bbox[0], y), cv2.FONT_HERSHEY_TRIPLEX, 1.5, (0, 0, 0), 8)
                    cv2.putText(frame, str(age), (bbox[0], y), cv2.FONT_HERSHEY_TRIPLEX, 1.5, (255, 255, 255), 2)
                    cv2.putText(frame, gender_str, (bbox[0], y + 30), cv2.FONT_HERSHEY_TRIPLEX, 1.5, (0, 0, 0), 8)
                    cv2.putText(frame, gender_str, (bbox[0], y + 30), cv2.FONT_HERSHEY_TRIPLEX, 1.5, (255, 255, 255), 2)
                    if stereo:
                        # You could also get detection.spatialCoordinates.x and detection.spatialCoordinates.y coordinates
                        coords = "Z: {:.2f} m".format(detection.spatialCoordinates.z/1000)
                        cv2.putText(frame, coords, (bbox[0], y + 60), cv2.FONT_HERSHEY_TRIPLEX, 1, (0, 0, 0), 8)
                        cv2.putText(frame, coords, (bbox[0], y + 60), cv2.FONT_HERSHEY_TRIPLEX, 1, (255, 255, 255), 2)
    
                cv2.imshow("Camera", frame)
            if cv2.waitKey(1) == ord('q'):
                break

    Hi @Marc
    Could you put together a shorter repro, please?

    Another note: maybe try using the Sync node instead of TwoStageHostSeqSync. It abstracts away the whole syncing part so you don't have to worry about it.

    Thanks,
    Jaka
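
    For reference, a rough sketch of the Sync-node approach. It assumes a recent depthai release that ships dai.node.Sync (roughly 2.24 and newer); the 50 ms threshold and stream names below are arbitrary:

    from datetime import timedelta
    import blobconverter
    import depthai as dai

    pipeline = dai.Pipeline()

    cam = pipeline.create(dai.node.ColorCamera)
    cam.setPreviewSize(300, 300)
    cam.setInterleaved(False)

    face_det_nn = pipeline.create(dai.node.MobileNetDetectionNetwork)
    face_det_nn.setConfidenceThreshold(0.5)
    face_det_nn.setBlobPath(blobconverter.from_zoo(name="face-detection-retail-0004", shaves=6))
    cam.preview.link(face_det_nn.input)

    # The Sync node groups messages whose timestamps are within the threshold
    sync = pipeline.create(dai.node.Sync)
    sync.setSyncThreshold(timedelta(milliseconds=50))
    cam.preview.link(sync.inputs["color"])
    face_det_nn.out.link(sync.inputs["detections"])

    xout = pipeline.create(dai.node.XLinkOut)
    xout.setStreamName("synced")
    sync.out.link(xout.input)

    with dai.Device(pipeline) as device:
        q = device.getOutputQueue("synced", maxSize=4, blocking=False)
        while True:
            group = q.get()                      # dai.MessageGroup
            frame = group["color"].getCvFrame()  # frame and detections arrive already paired
            detections = group["detections"].detections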

    • Marc replied to this.

      jakaskerl

      Hi Jaka, it is now working. Thank you so much for your help; I appreciate it very much. Wishing you all the best.

      Sincerely,

      Marc

      6 days later

      Hi, @jakaskerl, it's me again. Good day! I'm trying the head-pose detection main_api.py from depthai-experiments on the OAK-D Pro, but I'm having a depth-accuracy problem like the one in this discussion link. I tested it at a distance of 0.30 m, but it gave me 4.17 m. Then I tested it again at a distance of 1 m, but it still gave me a value of 4.17 m. Do you have any idea how to calibrate it?

      I'm hoping for your reply.

      Sincerely,
      Marc

      Hi @Marc
      First try whether this example works for you; that will tell you if the device is calibrated properly. If it is, there is likely an ROI issue (maybe your wall is 4.17 m away from the camera and the ROI is incorrectly grabbing points on the wall). Could you add some screenshots?

      Thanks,
      Jaka
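
      One quick way to sanity-check the raw depth, independent of the head-pose code, is a bare SpatialLocationCalculator reading a fixed ROI in the centre of the frame. This is only a sketch; the ROI and camera defaults below are arbitrary:

      import depthai as dai

      pipeline = dai.Pipeline()

      monoLeft = pipeline.create(dai.node.MonoCamera)
      monoLeft.setBoardSocket(dai.CameraBoardSocket.LEFT)
      monoRight = pipeline.create(dai.node.MonoCamera)
      monoRight.setBoardSocket(dai.CameraBoardSocket.RIGHT)

      stereo = pipeline.create(dai.node.StereoDepth)
      monoLeft.out.link(stereo.left)
      monoRight.out.link(stereo.right)

      # Average depth over a small ROI in the middle of the frame
      calc = pipeline.create(dai.node.SpatialLocationCalculator)
      roi = dai.SpatialLocationCalculatorConfigData()
      roi.roi = dai.Rect(dai.Point2f(0.45, 0.45), dai.Point2f(0.55, 0.55))
      calc.initialConfig.addROI(roi)
      stereo.depth.link(calc.inputDepth)

      xout = pipeline.create(dai.node.XLinkOut)
      xout.setStreamName("spatial")
      calc.out.link(xout.input)

      with dai.Device(pipeline) as device:
          q = device.getOutputQueue("spatial", maxSize=4, blocking=False)
          while True:
              for loc in q.get().getSpatialLocations():
                  print("Z: {:.2f} m".format(loc.spatialCoordinates.z / 1000))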

      Hi @jakaskerl ,

      Sorry for the late reply; I was in the hospital yesterday and have only just arrived home. I tried the code at the link you shared and it works well. Sorry for the confusion: the only problem is the head-pose detection code. Below are screenshots of the output. Any idea how to fix this?

      Warm Regards,

      Marc

      When I put my hand on my head, it gives a better value.

      Hi @Marc
      The reason for that is that the MBnetSpatialDetection node doesn't work when you feed it a stream from ImageManip (likely because some metadata is missing from the message). I have forwarded this to the FW team to fix.

      If you drew the spatialBBmapping output for the detection, you would see that the bounding box is incorrectly scaled. It's located somewhere in the top left of the frame, which is why the depth suddenly reads correctly when you raise your hand.

      Thanks,
      Jaka
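
      For reference, a rough sketch of how that boundingBoxMapping output could be visualised, reusing the pipeline, face_det_nn, device and cv2 from the code above (stream names are illustrative):

      # Pipeline side: expose the depth frame and the ROI actually used for each detection
      xout_bb = pipeline.create(dai.node.XLinkOut)
      xout_bb.setStreamName("bb_mapping")
      face_det_nn.boundingBoxMapping.link(xout_bb.input)

      xout_depth = pipeline.create(dai.node.XLinkOut)
      xout_depth.setStreamName("depth")
      face_det_nn.passthroughDepth.link(xout_depth.input)

      # Host side: colorize the depth frame and draw each depth-sampling ROI on top of it
      depth_q = device.getOutputQueue("depth", 4, False)
      bb_q = device.getOutputQueue("bb_mapping", 4, False)

      depth_frame = depth_q.get().getFrame()
      depth_vis = cv2.normalize(depth_frame, None, 255, 0, cv2.NORM_INF, cv2.CV_8UC1)
      depth_vis = cv2.equalizeHist(depth_vis)
      depth_vis = cv2.applyColorMap(depth_vis, cv2.COLORMAP_HOT)

      for roi_data in bb_q.get().getConfigData():
          roi = roi_data.roi.denormalize(depth_vis.shape[1], depth_vis.shape[0])
          tl, br = roi.topLeft(), roi.bottomRight()
          cv2.rectangle(depth_vis, (int(tl.x), int(tl.y)), (int(br.x), int(br.y)), (255, 255, 255), 1)
      cv2.imshow("depth", depth_vis)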

      • Marc replied to this.

        jakaskerl

        Thank you so much for the help. I appreciate it very much and wish you all the best.

        Best regards,

        Marc

          7 days later

          jakaskerl

          Hi jakaskerl. Good day! It's me again; I hope you're having a great day. I am now in the final stage of my project, thanks to you. Unfortunately, I've encountered a new problem: the code is accurate for one person only, and when I try two people their data gets swapped with each other. I think the problem is with my pipeline: I used just one Script node and created three ImageManip configurations for age-gender, emotion recognition, and pose estimation, because I don't know how to use the sync nodes. Do you have any ideas on how to fix this kind of problem, or am I doing something wrong? Your help is very much appreciated. The code below is the pipeline repro code. Once again, thanks a lot, jakaskerl.

          Warm regards,

          Marc

          
          def create_pipeline():
              print("Creating pipeline...")
              pipeline = dai.Pipeline()
              print("Creating Color Camera...")
              cam = pipeline.create(dai.node.ColorCamera)
              cam.setPreviewSize(1080, 1080)
              cam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
              cam.setInterleaved(False)
              cam.setBoardSocket(dai.CameraBoardSocket.RGB)
              cam_xout = pipeline.createXLinkOut()
              cam_xout.setStreamName("cam_out")
              cam.preview.link(cam_xout.input)
              
              copy_manip = pipeline.create(dai.node.ImageManip)
              copy_manip.setNumFramesPool(15)
              copy_manip.setMaxOutputFrameSize(3499200)
              cam.preview.link(copy_manip.inputImage) 
          
              # ImageManip that will crop the frame before sending it to the Face detection NN node
              face_det_manip = pipeline.create(dai.node.ImageManip)
              face_det_manip.initialConfig.setResize(300, 300)
              face_det_manip.initialConfig.setFrameType(dai.RawImgFrame.Type.RGB888p)
              copy_manip.out.link(face_det_manip.inputImage)
          
              # Face detection NN (MobileNetDetectionNetwork, as in the full code)
              face_det_nn = pipeline.create(dai.node.MobileNetDetectionNetwork)
              face_det_nn.setConfidenceThreshold(0.5)
              face_det_nn.setBlobPath(blobconverter.from_zoo(name="face-detection-retail-0004", shaves=6))
              face_det_nn.input.setQueueSize(1)
              face_det_manip.out.link(face_det_nn.input)
              # Send face detections to the host (for bounding boxes)
              face_det_xout = pipeline.create(dai.node.XLinkOut)
              face_det_xout.setStreamName("face_det_out")
              face_det_nn.out.link(face_det_xout.input)
          
              image_manip_script = pipeline.create(dai.node.Script)
              image_manip_script.inputs['face_det_in'].setBlocking(False)
              image_manip_script.inputs['face_det_in'].setQueueSize(4)
              face_det_nn.out.link(image_manip_script.inputs['face_det_in'])
              image_manip_script.setScript("""
          while True:
              face_dets = node.io['face_det_in'].get().detections
              # node.warn(f"Faces detected: {len(face_dets)}")
              for det in face_dets:
                  cfg = ImageManipConfig()
                  cfg.setCropRect(det.xmin, det.ymin, det.xmax, det.ymax)
                  cfg.setResize(62, 62)
                  cfg.setKeepAspectRatio(False)
                  node.io['to_manip'].send(cfg)
                                          
                  cfg1 = ImageManipConfig()
                  cfg1.setCropRect(det.xmin, det.ymin, det.xmax, det.ymax)
                  cfg1.setResize(64, 64)
                  cfg1.setKeepAspectRatio(False)
                  node.io['emotions_manip_cfg'].send(cfg1)
                                           
                  cfg2 = ImageManipConfig()
                  cfg2.setCropRect(det.xmin, det.ymin, det.xmax, det.ymax)
                  cfg2.setResize(60, 60)
                  cfg2.setKeepAspectRatio(False)
                  node.io['pose_manip_cfg'].send(cfg2)
          """)
              
              # ImageManip for age and gender recognition
              age_gender_manip = pipeline.create(dai.node.ImageManip)
              age_gender_manip.initialConfig.setResize(62, 62)
              age_gender_manip.setWaitForConfigInput(False)
              image_manip_script.outputs['to_manip'].link(age_gender_manip.inputConfig)
              #image manip for emotion recognition
              emotion_manip = pipeline.create(dai.node.ImageManip)
              emotion_manip.initialConfig.setResize(64, 64)
              emotion_manip.setWaitForConfigInput(False)
              image_manip_script.outputs['emotions_manip_cfg'].link(emotion_manip.inputConfig)
              #image manip for head pose estimation
              pose_manip = pipeline.create(dai.node.ImageManip)
              pose_manip.initialConfig.setResize(60, 60)
              pose_manip.setWaitForConfigInput(False)
              image_manip_script.outputs['pose_manip_cfg'].link(pose_manip.inputConfig)
              #if args.camera:
              cam.preview.link(face_det_manip.inputImage)
              cam.preview.link(age_gender_manip.inputImage)
              cam.preview.link(emotion_manip.inputImage)
              cam.preview.link(pose_manip.inputImage)
          
              # NeuralNetwork for age gender recognition
              print("Creating age_gender Detection Neural Network...")
              age_gender_nn = pipeline.create(dai.node.NeuralNetwork)
              age_gender_nn.setBlobPath(blobconverter.from_zoo(name="age-gender-recognition-retail-0013", shaves=6))
              age_gender_manip.out.link(age_gender_nn.input)
              age_gender_nn_xout = pipeline.create(dai.node.XLinkOut)
              age_gender_nn_xout.setStreamName("age_gender_out")
              age_gender_nn.out.link(age_gender_nn_xout.input)
          
              # NeuralNetwork for emotion recognition
              print("Creating emotion Detection Neural Network...")
              emotion_nn = pipeline.create(dai.node.NeuralNetwork)
              emotion_nn.setBlobPath(blobconverter.from_zoo(name="emotions-recognition-retail-0003", shaves=6))
              emotion_manip.out.link(emotion_nn.input)
              emotion_nn_xout = pipeline.create(dai.node.XLinkOut)
              emotion_nn_xout.setStreamName("emotion_out")
              emotion_nn.out.link(emotion_nn_xout.input)
          
              # NeuralNetwork for head pose estimation
              print("Creating head posture Detection Neural Network...")
              pose_nn = pipeline.create(dai.node.NeuralNetwork)
              pose_nn.setBlobPath(blobconverter.from_zoo(name="head-pose-estimation-adas-0001", shaves=6))
              pose_manip.out.link(pose_nn.input)
              pose_nn_xout = pipeline.create(dai.node.XLinkOut)
              pose_nn_xout.setStreamName("pose_out")
              pose_nn.out.link(pose_nn_xout.input)
          
              #age_gender_manip.out.link(face_det_xout.input)
              #emotion_manip.out.link(face_det_xout.input)
              #pose_manip.out.link(face_det_xout.input)
              print("Pipeline succesfully created")

          Hi @Marc
          Can you add the full code (or an MRE)? The issue might be in the part where you are drawing the bboxes. The bboxes themselves look OK; perhaps there is a mismatch in how the labels are handled?

          Thanks,
          Jaka

          • Marc replied to this.

            jakaskerl 

            Hi jakaskerl. I hope you're doing great. Thank you so much for your patience and guidance. Here's the full code. I'm sorry I can't provide you with an MRE, because I still can't find what really causes the data to swap. I suspect the pipeline, because of its lack of synchronization. I'm still trying to find where the error occurs so that I can fix my problem. Any tips and advice are very much appreciated. Please help. I'm using the OAK-D Pro. Once again, thanks a lot.

            Sincerely,

            Marc

            
            from pathlib import Path
            import blobconverter
            import cv2
            import depthai as dai
            import numpy as np
            
            MIN_THRESHOLD = 15. # Degrees in yaw/pitch/roll to be considered as head movement
            
            def frame_norm(debug_frame, bbox):
                normVals = np.full(len(bbox),debug_frame.shape[0])
                normVals[::2] = debug_frame.shape[1]
                return (np.clip(np.array(bbox), 0, 1) * normVals).astype(int)
            
            
            emotions = ['neutral', 'happy', 'sad', 'surprise', 'anger']
            
            def create_pipeline(stereo):
                print("Creating pipeline...")
                pipeline = dai.Pipeline()
            
                #if args.camera:
                    # ColorCamera
                print("Creating Color Camera...")
                cam = pipeline.create(dai.node.ColorCamera)
                cam.setPreviewSize(1080, 1080)
                cam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
                cam.setInterleaved(False)
                cam.setBoardSocket(dai.CameraBoardSocket.RGB)
                cam_xout = pipeline.createXLinkOut()
                cam_xout.setStreamName("cam_out")
                cam.preview.link(cam_xout.input)
                
                copy_manip = pipeline.create(dai.node.ImageManip)
                copy_manip.setNumFramesPool(15)
                copy_manip.setMaxOutputFrameSize(3499200)
                cam.preview.link(copy_manip.inputImage) 
            
                # ImageManip that will crop the frame before sending it to the Face detection NN node
                face_det_manip = pipeline.create(dai.node.ImageManip)
                face_det_manip.initialConfig.setResize(300, 300)
                face_det_manip.initialConfig.setFrameType(dai.RawImgFrame.Type.RGB888p)
                copy_manip.out.link(face_det_manip.inputImage)
            
                # NeuralNetwork for face detection
                # Link Face ImageManip -> Face detection NN node
                if stereo:
                    monoLeft = pipeline.create(dai.node.MonoCamera)
                    monoLeft.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
                    monoLeft.setBoardSocket(dai.CameraBoardSocket.LEFT)
            
                    monoRight = pipeline.create(dai.node.MonoCamera)
                    monoRight.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
                    monoRight.setBoardSocket(dai.CameraBoardSocket.RIGHT)
            
                    stereo = pipeline.create(dai.node.StereoDepth)
                    stereo.setDefaultProfilePreset(dai.node.StereoDepth.PresetMode.HIGH_DENSITY)
                    stereo.setDepthAlign(dai.CameraBoardSocket.RGB)
                    monoLeft.out.link(stereo.left)
                    monoRight.out.link(stereo.right)
            
                    # Spatial Detection network if OAK-D
                    print("OAK-D detected, app will display spatial coordiantes")
                    face_det_nn = pipeline.create(dai.node.MobileNetSpatialDetectionNetwork)
                    face_det_nn.setBoundingBoxScaleFactor(0.8)
                    face_det_nn.setDepthLowerThreshold(100)
                    face_det_nn.setDepthUpperThreshold(5000)
                    stereo.depth.link(face_det_nn.inputDepth)
                else: # Detection network if OAK-1
                    print("OAK-1 detected, app won't display spatial coordiantes")
                    face_det_nn = pipeline.create(dai.node.MobileNetDetectionNetwork)
                face_det_nn.setConfidenceThreshold(0.5)
                face_det_nn.setBlobPath(blobconverter.from_zoo(name="face-detection-retail-0004", shaves=6))
                face_det_nn.input.setQueueSize(1)
                face_det_manip.out.link(face_det_nn.input)
                # Send face detections to the host (for bounding boxes)
                face_det_xout = pipeline.create(dai.node.XLinkOut)
                face_det_xout.setStreamName("face_det_out")
                face_det_nn.out.link(face_det_xout.input)
            
                image_manip_script = pipeline.create(dai.node.Script)
                image_manip_script.inputs['face_det_in'].setBlocking(False)
                image_manip_script.inputs['face_det_in'].setQueueSize(4)
                face_det_nn.out.link(image_manip_script.inputs['face_det_in'])
                image_manip_script.setScript("""
            while True:
                face_dets = node.io['face_det_in'].get().detections
                # node.warn(f"Faces detected: {len(face_dets)}")
                for det in face_dets:
                    cfg = ImageManipConfig()
                    cfg.setCropRect(det.xmin, det.ymin, det.xmax, det.ymax)
                    cfg.setResize(62, 62)
                    cfg.setKeepAspectRatio(False)
                    node.io['to_manip'].send(cfg)
                                            
                    cfg1 = ImageManipConfig()
                    cfg1.setCropRect(det.xmin, det.ymin, det.xmax, det.ymax)
                    cfg1.setResize(64, 64)
                    cfg1.setKeepAspectRatio(False)
                    node.io['emotions_manip_cfg'].send(cfg1)
                                             
                    cfg2 = ImageManipConfig()
                    cfg2.setCropRect(det.xmin, det.ymin, det.xmax, det.ymax)
                    cfg2.setResize(60, 60)
                    cfg2.setKeepAspectRatio(False)
                    node.io['pose_manip_cfg'].send(cfg2)
            """)
                
                # ImageManip for age and gender recognition
                age_gender_manip = pipeline.create(dai.node.ImageManip)
                age_gender_manip.initialConfig.setResize(62, 62)
                age_gender_manip.setWaitForConfigInput(False)
                image_manip_script.outputs['to_manip'].link(age_gender_manip.inputConfig)
                #image manip for emotion recognition
                emotion_manip = pipeline.create(dai.node.ImageManip)
                emotion_manip.initialConfig.setResize(64, 64)
                emotion_manip.setWaitForConfigInput(False)
                image_manip_script.outputs['emotions_manip_cfg'].link(emotion_manip.inputConfig)
                #image manip for head pose estimation
                pose_manip = pipeline.create(dai.node.ImageManip)
                pose_manip.initialConfig.setResize(60, 60)
                pose_manip.setWaitForConfigInput(False)
                image_manip_script.outputs['pose_manip_cfg'].link(pose_manip.inputConfig)
            
                cam.preview.link(face_det_manip.inputImage)
                cam.preview.link(age_gender_manip.inputImage)
                cam.preview.link(emotion_manip.inputImage)
                cam.preview.link(pose_manip.inputImage)
            
                # NeuralNetwork for age gender recognition
                print("Creating age_gender Detection Neural Network...")
                age_gender_nn = pipeline.create(dai.node.NeuralNetwork)
                age_gender_nn.setBlobPath(blobconverter.from_zoo(name="age-gender-recognition-retail-0013", shaves=6))
                age_gender_manip.out.link(age_gender_nn.input)
                age_gender_nn_xout = pipeline.create(dai.node.XLinkOut)
                age_gender_nn_xout.setStreamName("age_gender_out")
                age_gender_nn.out.link(age_gender_nn_xout.input)
            
                # NeuralNetwork for emotion recognition
                print("Creating emotion Detection Neural Network...")
                emotion_nn = pipeline.create(dai.node.NeuralNetwork)
                emotion_nn.setBlobPath(blobconverter.from_zoo(name="emotions-recognition-retail-0003", shaves=6))
                emotion_manip.out.link(emotion_nn.input)
                emotion_nn_xout = pipeline.create(dai.node.XLinkOut)
                emotion_nn_xout.setStreamName("emotion_out")
                emotion_nn.out.link(emotion_nn_xout.input)
            
                # NeuralNetwork for head pose estimation
                print("Creating head posture Detection Neural Network...")
                pose_nn = pipeline.create(dai.node.NeuralNetwork)
                pose_nn.setBlobPath(blobconverter.from_zoo(name="head-pose-estimation-adas-0001", shaves=6))
                pose_manip.out.link(pose_nn.input)
                pose_nn_xout = pipeline.create(dai.node.XLinkOut)
                pose_nn_xout.setStreamName("pose_out")
                pose_nn.out.link(pose_nn_xout.input)
            
                print("Pipeline succesfully created")
            
                return pipeline
            # Upload the pipeline to the device
            with dai.Device() as device:
                stereo = 1 < len(device.getConnectedCameras())
            
                device.setLogLevel(dai.LogLevel.WARN)
                device.setLogOutputLevel(dai.LogLevel.WARN)
                print("Starting pipeline...")
                device.startPipeline(create_pipeline(stereo))
                cam_out = device.getOutputQueue("cam_out", 4, False)
                face_q = device.getOutputQueue("face_det_out", 4, False)
                age_gender_q = device.getOutputQueue("age_gender_out", 4, False)
                emotion_q = device.getOutputQueue("emotion_out", 4, False)
                pose_q = device.getOutputQueue("pose_out",4, False)
            
                def get_frame():
                    return True, cam_out.get().getCvFrame()
                
                try:
                    while True:
                        read_correctly, frame = get_frame()
                        if not read_correctly:
                            break
                        if frame is not None:
                            
                            debug_frame = frame.copy()
            
                        det_in = face_q.tryGet()
                        
                        if det_in is not None:
                            detections = det_in.detections
            
                            for detection in detections:
                                bbox = frame_norm(debug_frame, (detection.xmin, detection.ymin, detection.xmax, detection.ymax))
                                det = age_gender_q.get()
                                det2 = emotion_q.get()
                                det3  = pose_q.get()
            
                                emotion_results = np.array(det2.getFirstLayerFp16())
                                emotion_name = emotions[np.argmax(emotion_results)]
                                age = int(float(np.squeeze(np.array(det.getLayerFp16('age_conv3')))) * 100)
                                gender = np.squeeze(np.array(det.getLayerFp16('prob')))
                                gender_str = "female" if gender[0] > gender[1] else "male"
                                confidence = detection.confidence
            
                                # Decoding of recognition results
                                yaw = det3.getLayerFp16('angle_y_fc')[0]
                                pitch = det3.getLayerFp16('angle_p_fc')[0]
                                roll = det3.getLayerFp16('angle_r_fc')[0]
                                
                                """
                                pitch > 0 Head down, < 0 look up
                                yaw > 0 Turn right < 0 Turn left
                                roll > 0 Tilt right, < 0 Tilt left
                                """
                                # Decode the head-pose result: pick the dominant angle
                                vals = np.array([abs(pitch),abs(yaw),abs(roll)])
                                max_index = np.argmax(vals)
                                txt = None  
            
                                if vals[max_index] > MIN_THRESHOLD:
            
                                    cv2.putText(debug_frame, "pitch:{:.0f}, yaw:{:.0f}, roll:{:.0f}".format(pitch,yaw,roll), (bbox[0]+10-15, bbox[1]-15), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 8)
            
                                    if max_index == 0:
                                        # Pitch dominates: per the note above, > 0 means head down
                                        if pitch > 0:
                                            txt = "Look down"
                                        else:
                                            txt = "Look up"

                                    elif max_index == 1:
            
                                        if yaw > 0:
                                            txt = "Turn right"
            
                                        else:
                                            txt = "Turn left"
            
                                    elif max_index == 2:
            
                                        if roll > 0: txt = "Tilt right"
                                        else: txt = "Tilt left" 
                                
                                    cv2.putText(debug_frame, txt, (bbox[0], bbox[1]+30), cv2.FONT_HERSHEY_TRIPLEX, 1, (0, 0, 0), 8)
                                    cv2.putText(debug_frame, txt, (bbox[0], bbox[1]+30), cv2.FONT_HERSHEY_TRIPLEX, 1, (255, 255, 255), 2)
            
                                cv2.rectangle(debug_frame, (bbox[0], bbox[1]),(bbox[2], bbox[3]), (10, 245, 10), 2)    
                                cv2.putText(debug_frame, emotion_name, (bbox[0], bbox[1]-125), cv2.FONT_HERSHEY_TRIPLEX, .5, (0, 0, 0), 8)
                                cv2.putText(debug_frame, emotion_name, (bbox[0], bbox[1]-125), cv2.FONT_HERSHEY_TRIPLEX, .5, (255, 255, 255), 2)
                                cv2.putText(debug_frame, str(age), (bbox[0]+125, bbox[1]-10), cv2.FONT_HERSHEY_TRIPLEX, .5, (0, 0, 0), 8)
                                cv2.putText(debug_frame, str(age), (bbox[0]+125, bbox[1]-10), cv2.FONT_HERSHEY_TRIPLEX, .5, (255, 255, 255), 2)
                                cv2.putText(debug_frame, f"Score:{confidence}", (bbox[0], bbox[1]-50), cv2.FONT_HERSHEY_TRIPLEX, .5, (0, 0, 0), 8)
                                cv2.putText(debug_frame, f"Score:{confidence}", (bbox[0], bbox[1]-50), cv2.FONT_HERSHEY_TRIPLEX, .5, (255, 255, 255), 2)
                                cv2.putText(debug_frame, gender_str, (bbox[0], bbox[1]-10), cv2.FONT_HERSHEY_TRIPLEX, .5, (0, 0, 0), 8)
                                cv2.putText(debug_frame, gender_str, (bbox[0], bbox[1]-10), cv2.FONT_HERSHEY_TRIPLEX, .5, (255, 255, 255), 2)
            
                                #measuring depth distance
                                if stereo:
                                    # You could also get detection.spatialCoordinates.x and detection.spatialCoordinates.y coordinates
                                    coords = "{:.2f}".format((detection.spatialCoordinates.z /1000))
                                    cv2.putText(debug_frame, coords, (bbox[0], bbox[1] + 60), cv2.FONT_HERSHEY_TRIPLEX, 1.5, (0, 0, 0), 8)
                                    cv2.putText(debug_frame, coords, (bbox[0], bbox[1] + 60), cv2.FONT_HERSHEY_TRIPLEX, 1.5, (255, 255, 255), 2)
            
                        aspect_ratio = debug_frame.shape[1] / debug_frame.shape[0]
                        cv2.imshow("Camera_view", debug_frame)
                        if cv2.waitKey(1) == ord('q'):
                            cv2.destroyAllWindows()
                            break
                except KeyboardInterrupt:
                    pass

              Marc

              for detection in detections:
                  bbox = frame_norm(debug_frame, (detection.xmin, detection.ymin, detection.xmax, detection.ymax))
                  det = age_gender_q.get()
                  det2 = emotion_q.get()
                  det3 = pose_q.get()

              These are not ordered the same way every time. For each frame you get the two detections together in a single message, but the other 3 NNs produce their results one after another, so there is no way for you to know which result corresponds to which detection.

              You need to fix this; then it should work.

              Thanks,
              Jaka
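
              One way to restore that pairing is to group messages by sequence number on the host, which is the same idea the TwoStageHostSeqSync helper uses. A rough sketch, assuming ImageManip and the NeuralNetwork nodes propagate the sequence number of the source frame (so every recognition result carries the sequence number of the frame its face crop came from):

              msgs = {}  # seq -> detections message plus lists of per-face recognition results

              def add_msg(name, msg):
                  seq = str(msg.getSequenceNum())
                  entry = msgs.setdefault(seq, {"dets": None, "age_gender": [], "emotion": [], "pose": []})
                  if name == "dets":
                      entry["dets"] = msg
                  else:
                      entry[name].append(msg)

              def get_synced():
                  # A frame is ready once every recognition queue has produced
                  # exactly one result per detected face for that sequence number
                  for seq in list(msgs):
                      entry = msgs[seq]
                      dets = entry["dets"]
                      if dets is None:
                          continue
                      n = len(dets.detections)
                      if n == 0:
                          del msgs[seq]
                          continue
                      if all(len(entry[k]) == n for k in ("age_gender", "emotion", "pose")):
                          del msgs[seq]
                          return dets, entry["age_gender"], entry["emotion"], entry["pose"]
                  return None

              # In the main loop, feed every queue into add_msg() and only draw once
              # get_synced() returns a complete set for one frame:
              for name, q in [("dets", face_q), ("age_gender", age_gender_q),
                              ("emotion", emotion_q), ("pose", pose_q)]:
                  msg = q.tryGet()
                  if msg is not None:
                      add_msg(name, msg)

              synced = get_synced()
              if synced is not None:
                  dets, age_gender_results, emotion_results, pose_results = synced
                  for i, detection in enumerate(dets.detections):
                      # The i-th entry of each list belongs to the i-th detection
                      det, det2, det3 = age_gender_results[i], emotion_results[i], pose_results[i]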