Hey guys, I am really interested in this GitHub project. I ran it with both Yolov4_tiny and Yolov8n, and both work pretty well for object detection. However, I couldn't manage to add Object Tracking to the project; I tried adding it by exploring other projects as well, but with no success yet. If you know how to do it, or know of a tutorial or source material, please help me. In addition, is it possible to set a custom RoI for this project?
Thanks for your time.
Fakhrullo.

  • jakaskerl replied to this.
  • Hi Fakhrullo
    I just swapped the model to the yolo from https://docs.luxonis.com/projects/api/en/latest/samples/Yolo/tiny_yolo/:

    #!/usr/bin/env python3
    
    from pathlib import Path
    import cv2
    import depthai as dai
    import numpy as np
    import time
    import argparse
    
    # tiny yolo v4 label texts (COCO classes); a tracklet's integer label is used
    # as an index into this list
    labelMap = [
        "person",         "bicycle",    "car",           "motorbike",     "aeroplane",   "bus",           "train",
        "truck",          "boat",       "traffic light", "fire hydrant",  "stop sign",   "parking meter", "bench",
        "bird",           "cat",        "dog",           "horse",         "sheep",       "cow",           "elephant",
        "bear",           "zebra",      "giraffe",       "backpack",      "umbrella",    "handbag",       "tie",
        "suitcase",       "frisbee",    "skis",          "snowboard",     "sports ball", "kite",          "baseball bat",
        "baseball glove", "skateboard", "surfboard",     "tennis racket", "bottle",      "wine glass",    "cup",
        "fork",           "knife",      "spoon",         "bowl",          "banana",      "apple",         "sandwich",
        "orange",         "broccoli",   "carrot",        "hot dog",       "pizza",       "donut",         "cake",
        "chair",          "sofa",       "pottedplant",   "bed",           "diningtable", "toilet",        "tvmonitor",
        "laptop",         "mouse",      "remote",        "keyboard",      "cell phone",  "microwave",     "oven",
        "toaster",        "sink",       "refrigerator",  "book",          "clock",       "vase",          "scissors",
        "teddy bear",     "hair drier", "toothbrush"
    ]

    nnPath = str((Path(__file__).parent / Path('../models/yolo-v4-tiny-tf_openvino_2021.4_6shave.blob')).resolve().absolute())

    # Class handed to the tracker. It is looked up by name because the previously
    # hard-coded 15 (apparently carried over from the MobileNet-SSD example) is
    # "cat" in labelMap above, while "person" is index 0.
    personLabel = labelMap.index("person")

    # Create pipeline
    pipeline = dai.Pipeline()

    # Define sources and outputs
    camRgb = pipeline.create(dai.node.ColorCamera)
    detectionNetwork = pipeline.create(dai.node.YoloDetectionNetwork)
    objectTracker = pipeline.create(dai.node.ObjectTracker)

    xlinkOut = pipeline.create(dai.node.XLinkOut)
    trackerOut = pipeline.create(dai.node.XLinkOut)

    xlinkOut.setStreamName("preview")
    trackerOut.setStreamName("tracklets")

    # Properties (the preview stream is what gets linked to the network input)
    camRgb.setPreviewSize(416, 416)
    camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
    camRgb.setInterleaved(False)
    camRgb.setColorOrder(dai.ColorCameraProperties.ColorOrder.BGR)
    camRgb.setFps(40)

    # YOLO network specific settings
    detectionNetwork.setConfidenceThreshold(0.5)
    detectionNetwork.setNumClasses(80)
    detectionNetwork.setCoordinateSize(4)
    detectionNetwork.setAnchors([10, 14, 23, 27, 37, 58, 81, 82, 135, 169, 344, 319])
    detectionNetwork.setAnchorMasks({"side26": [1, 2, 3], "side13": [3, 4, 5]})
    detectionNetwork.setIouThreshold(0.5)
    detectionNetwork.setBlobPath(nnPath)
    detectionNetwork.setNumInferenceThreads(2)
    detectionNetwork.input.setBlocking(False)

    objectTracker.setDetectionLabelsToTrack([personLabel])  # track only person
    # possible tracking types: ZERO_TERM_COLOR_HISTOGRAM, ZERO_TERM_IMAGELESS, SHORT_TERM_IMAGELESS, SHORT_TERM_KCF
    objectTracker.setTrackerType(dai.TrackerType.ZERO_TERM_COLOR_HISTOGRAM)
    # take the smallest ID when new object is tracked, possible options: SMALLEST_ID, UNIQUE_ID
    objectTracker.setTrackerIdAssignmentPolicy(dai.TrackerIdAssignmentPolicy.SMALLEST_ID)

    # Linking: camera preview -> detector; detector passthrough frame and
    # detections -> tracker; tracker frame and tracklets -> host
    camRgb.preview.link(detectionNetwork.input)
    objectTracker.passthroughTrackerFrame.link(xlinkOut.input)

    detectionNetwork.passthrough.link(objectTracker.inputTrackerFrame)
    detectionNetwork.passthrough.link(objectTracker.inputDetectionFrame)
    detectionNetwork.out.link(objectTracker.inputDetections)
    objectTracker.out.link(trackerOut.input)

    # Connect to device and start pipeline
    with dai.Device(pipeline) as device:

        preview = device.getOutputQueue(name="preview", maxSize=4, blocking=False)
        tracklets = device.getOutputQueue(name="tracklets", maxSize=4, blocking=False)

        startTime = time.monotonic()
        counter = 0
        fps = 0
        color = (255, 0, 0)  # BGR colour used for every overlay

        while True:
            imgFrame = preview.get()
            track = tracklets.get()

            # Re-estimate the frame rate once more than a second has elapsed
            counter += 1
            current_time = time.monotonic()
            if (current_time - startTime) > 1:
                fps = counter / (current_time - startTime)
                counter = 0
                startTime = current_time

            frame = imgFrame.getCvFrame()
            for t in track.tracklets:
                # Tracklet ROIs are normalized; scale them to the frame size
                roi = t.roi.denormalize(frame.shape[1], frame.shape[0])
                x1 = int(roi.topLeft().x)
                y1 = int(roi.topLeft().y)
                x2 = int(roi.bottomRight().x)
                y2 = int(roi.bottomRight().y)

                # Fall back to the numeric label when it is outside labelMap
                label = labelMap[t.label] if 0 <= t.label < len(labelMap) else t.label

                cv2.putText(frame, str(label), (x1 + 10, y1 + 20), cv2.FONT_HERSHEY_TRIPLEX, 0.5, color)
                cv2.putText(frame, f"ID: {[t.id]}", (x1 + 10, y1 + 35), cv2.FONT_HERSHEY_TRIPLEX, 0.5, color)
                cv2.putText(frame, t.status.name, (x1 + 10, y1 + 50), cv2.FONT_HERSHEY_TRIPLEX, 0.5, color)
                # Explicit 1 px outline (a font constant used to be passed as the thickness)
                cv2.rectangle(frame, (x1, y1), (x2, y2), color, 1)

            cv2.putText(frame, "NN fps: {:.2f}".format(fps), (2, frame.shape[0] - 4), cv2.FONT_HERSHEY_TRIPLEX, 0.4, color)

            cv2.imshow("tracker", frame)

            if cv2.waitKey(1) == ord('q'):
                break

    thanks,
    Jaka

    Hi Fakhrullo
    Should be the same as this example just switch the MBNetSSD with YOLO. Both perform on device decoding.
    It is possible to put ROI in as well, but the question is where do you want to incorporate it - before NN, before Tracker?

    Thanks,
    Jaka

      Hi jakaskerl
      Yes, I want to assign the ROI before the Tracker, so in theory there is less space to detect and track, and it should be faster with a higher FPS compared to the default one. It could also give the user more customization over the program. If you have any thoughts about it, please share them with me.

      Thanks,
      Fakhrullo

      jakaskerl
      I run the example you mentioned above, and it worked fine with MBNetSSD. However, When I switched it with YOLO it's giving errors:
      1. First thing I changed is nnpath to blob file:
      nnPathDefault = str((Path(__file__).parent / Path('model/mobilenet-ssd_openvino_2021.4_6shave.blob')).resolve().absolute())
      to
      nnPathDefault = str((Path(__file__).parent / Path('model/tiny-yolo-v4_openvino_2021.2_6shave.blob')).resolve().absolute())
      and then it gave me error like this:
      [DetectionNetwork(1)] [error] Mask is not defined for output layer with width '26'. Define at pipeline build time using: 'setAnchorMasks' for 'side26'.

      2. Then I added this lines to the code:

      # One mask per YOLO output layer, keyed "side<width>" as the device error
      # message asks for
      anchorMasks = dict(side26=[1, 2, 3], side13=[3, 4, 5])
      detectionNetwork.setAnchorMasks(anchorMasks)

      After adding this lines now I'm getting this error:

      with dai.Device(pipeline, device) as device:

      [18443010C190BE0800] [192.168.1.234] [11.263] [system] [critical] Fatal error. Please report to developers. Log: 'Fatal error on MSS CPU: trap: 09, address: 8008EC3C' '0'

      [18443010C190BE0800] [192.168.1.234] [1699258753.950] [host] [warning] Monitor thread (device: 18443010C190BE0800 [192.168.1.234]) - ping was missed, closing the device connection

      Traceback (most recent call last):

      File "/home/fakha/Work Projects/YoloDepthAI/gen2-yolo/device-decoding/adding_tracker.py", line 87, in <module>

      imgFrame = preview.get()

      RuntimeError: Communication exception - possible device error/misconfiguration. Original message 'Couldn't read data from stream: 'preview' (X_LINK_ERROR)'

      Is there something that I'm doing wrong?

      Thank you for your time,
      Fakhrullo

        Hi Fakhrullo
        I just swapped the model to the yolo from https://docs.luxonis.com/projects/api/en/latest/samples/Yolo/tiny_yolo/:

        #!/usr/bin/env python3
        
        from pathlib import Path
        import cv2
        import depthai as dai
        import numpy as np
        import time
        import argparse
        
        # tiny yolo v4 label texts (COCO classes); a tracklet's integer label is used
        # as an index into this list
        labelMap = [
            "person",         "bicycle",    "car",           "motorbike",     "aeroplane",   "bus",           "train",
            "truck",          "boat",       "traffic light", "fire hydrant",  "stop sign",   "parking meter", "bench",
            "bird",           "cat",        "dog",           "horse",         "sheep",       "cow",           "elephant",
            "bear",           "zebra",      "giraffe",       "backpack",      "umbrella",    "handbag",       "tie",
            "suitcase",       "frisbee",    "skis",          "snowboard",     "sports ball", "kite",          "baseball bat",
            "baseball glove", "skateboard", "surfboard",     "tennis racket", "bottle",      "wine glass",    "cup",
            "fork",           "knife",      "spoon",         "bowl",          "banana",      "apple",         "sandwich",
            "orange",         "broccoli",   "carrot",        "hot dog",       "pizza",       "donut",         "cake",
            "chair",          "sofa",       "pottedplant",   "bed",           "diningtable", "toilet",        "tvmonitor",
            "laptop",         "mouse",      "remote",        "keyboard",      "cell phone",  "microwave",     "oven",
            "toaster",        "sink",       "refrigerator",  "book",          "clock",       "vase",          "scissors",
            "teddy bear",     "hair drier", "toothbrush"
        ]

        nnPath = str((Path(__file__).parent / Path('../models/yolo-v4-tiny-tf_openvino_2021.4_6shave.blob')).resolve().absolute())

        # Class handed to the tracker. It is looked up by name because the previously
        # hard-coded 15 (apparently carried over from the MobileNet-SSD example) is
        # "cat" in labelMap above, while "person" is index 0.
        personLabel = labelMap.index("person")

        # Create pipeline
        pipeline = dai.Pipeline()

        # Define sources and outputs
        camRgb = pipeline.create(dai.node.ColorCamera)
        detectionNetwork = pipeline.create(dai.node.YoloDetectionNetwork)
        objectTracker = pipeline.create(dai.node.ObjectTracker)

        xlinkOut = pipeline.create(dai.node.XLinkOut)
        trackerOut = pipeline.create(dai.node.XLinkOut)

        xlinkOut.setStreamName("preview")
        trackerOut.setStreamName("tracklets")

        # Properties (the preview stream is what gets linked to the network input)
        camRgb.setPreviewSize(416, 416)
        camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
        camRgb.setInterleaved(False)
        camRgb.setColorOrder(dai.ColorCameraProperties.ColorOrder.BGR)
        camRgb.setFps(40)

        # YOLO network specific settings
        detectionNetwork.setConfidenceThreshold(0.5)
        detectionNetwork.setNumClasses(80)
        detectionNetwork.setCoordinateSize(4)
        detectionNetwork.setAnchors([10, 14, 23, 27, 37, 58, 81, 82, 135, 169, 344, 319])
        detectionNetwork.setAnchorMasks({"side26": [1, 2, 3], "side13": [3, 4, 5]})
        detectionNetwork.setIouThreshold(0.5)
        detectionNetwork.setBlobPath(nnPath)
        detectionNetwork.setNumInferenceThreads(2)
        detectionNetwork.input.setBlocking(False)

        objectTracker.setDetectionLabelsToTrack([personLabel])  # track only person
        # possible tracking types: ZERO_TERM_COLOR_HISTOGRAM, ZERO_TERM_IMAGELESS, SHORT_TERM_IMAGELESS, SHORT_TERM_KCF
        objectTracker.setTrackerType(dai.TrackerType.ZERO_TERM_COLOR_HISTOGRAM)
        # take the smallest ID when new object is tracked, possible options: SMALLEST_ID, UNIQUE_ID
        objectTracker.setTrackerIdAssignmentPolicy(dai.TrackerIdAssignmentPolicy.SMALLEST_ID)

        # Linking: camera preview -> detector; detector passthrough frame and
        # detections -> tracker; tracker frame and tracklets -> host
        camRgb.preview.link(detectionNetwork.input)
        objectTracker.passthroughTrackerFrame.link(xlinkOut.input)

        detectionNetwork.passthrough.link(objectTracker.inputTrackerFrame)
        detectionNetwork.passthrough.link(objectTracker.inputDetectionFrame)
        detectionNetwork.out.link(objectTracker.inputDetections)
        objectTracker.out.link(trackerOut.input)

        # Connect to device and start pipeline
        with dai.Device(pipeline) as device:

            preview = device.getOutputQueue(name="preview", maxSize=4, blocking=False)
            tracklets = device.getOutputQueue(name="tracklets", maxSize=4, blocking=False)

            startTime = time.monotonic()
            counter = 0
            fps = 0
            color = (255, 0, 0)  # BGR colour used for every overlay

            while True:
                imgFrame = preview.get()
                track = tracklets.get()

                # Re-estimate the frame rate once more than a second has elapsed
                counter += 1
                current_time = time.monotonic()
                if (current_time - startTime) > 1:
                    fps = counter / (current_time - startTime)
                    counter = 0
                    startTime = current_time

                frame = imgFrame.getCvFrame()
                for t in track.tracklets:
                    # Tracklet ROIs are normalized; scale them to the frame size
                    roi = t.roi.denormalize(frame.shape[1], frame.shape[0])
                    x1 = int(roi.topLeft().x)
                    y1 = int(roi.topLeft().y)
                    x2 = int(roi.bottomRight().x)
                    y2 = int(roi.bottomRight().y)

                    # Fall back to the numeric label when it is outside labelMap
                    label = labelMap[t.label] if 0 <= t.label < len(labelMap) else t.label

                    cv2.putText(frame, str(label), (x1 + 10, y1 + 20), cv2.FONT_HERSHEY_TRIPLEX, 0.5, color)
                    cv2.putText(frame, f"ID: {[t.id]}", (x1 + 10, y1 + 35), cv2.FONT_HERSHEY_TRIPLEX, 0.5, color)
                    cv2.putText(frame, t.status.name, (x1 + 10, y1 + 50), cv2.FONT_HERSHEY_TRIPLEX, 0.5, color)
                    # Explicit 1 px outline (a font constant used to be passed as the thickness)
                    cv2.rectangle(frame, (x1, y1), (x2, y2), color, 1)

                cv2.putText(frame, "NN fps: {:.2f}".format(fps), (2, frame.shape[0] - 4), cv2.FONT_HERSHEY_TRIPLEX, 0.4, color)

                cv2.imshow("tracker", frame)

                if cv2.waitKey(1) == ord('q'):
                    break

        thanks,
        Jaka

          Hi jakaskerl
          Great work bro, I appreciate it so much 🙂

          Is it possible to put ROI before detection and tracking?

            Hi Fakhrullo
            Make a manip node with setCropRect() to crop to specified size of the preview stream and setResize() to make sure the input fits the NN first layer size.

            Thanks,
            Jaka

              Hi jakaskerl
              Thank you very much, I will continue working on this project if I have any questions I'll notify you

              Thanks,
              Fakhrullo

              Hi jakaskerl

              How are you doing bro? I have a question regarding that project, I adjusted the project as I want and it's working as expected, now I have to test detection and tracking with crowd, currently I can't put camera to street or any public places, so I wanted to test it with videos. However, I couldn't manage to integrate the video reading in the project, I tried using this method https://docs.luxonis.com/projects/api/en/latest/samples/ObjectTracker/object_tracker_video/#source-code
              In the beginning I tried to add this to my code but I couldn't get it right, so I added Yolo to the code in the documentation I mentioned above. But it's not running on device; how can I test my project with video? Can you help me with that, please?

              Thanks,
              Fakhrullo

              Hi jakaskerl

              I added the manip node as you said, like this:
              # Creating Manip node
              manip = pipeline.create(dai.node.ImageManip)
              # Setting CropRect for the Region of Interest.
              # setCropRect() takes normalized (xmin, ymin, xmax, ymax) corners, so the
              # pixel ROI (x, y, width, height) is scaled by the preview size and clamped
              # to the 0..1 range.
              # NOTE(review): assumes camRgb.setPreviewSize() was already called above.
              roi_x, roi_y, roi_w, roi_h = custom_roi
              preview_w = camRgb.getPreviewWidth()
              preview_h = camRgb.getPreviewHeight()
              manip.initialConfig.setCropRect(
                  max(roi_x / preview_w, 0.0),
                  max(roi_y / preview_h, 0.0),
                  min((roi_x + roi_w) / preview_w, 1.0),
                  min((roi_y + roi_h) / preview_h, 1.0),
              )
              # Setting Resize for the neural network input size
              manip.initialConfig.setResize(640, 640)
              # Setting maximum output frame size based on the desired output dimensions
              max_output_width = 640
              max_output_height = 640
              max_output_frame_size = 3 * max_output_width * max_output_height # Assuming 3 channels for BGR image
              manip.setMaxOutputFrameSize(max_output_frame_size)
              # Connecting Manip node to ColorCamera
              camRgb.preview.link(manip.inputImage)
              # Connecting Manip node to YoloDetectionNetwork
              manip.out.link(detectionNetwork.input)

              But I'm getting a black screen. Where am I making a mistake? Is there any tutorial or documentation on how to do that?

              Hi @Fakhrullo
              Can I see the full code?
              If you are inputting the preview image into the manip, make sure it's larger than the output image, otherwise the crop makes no sense.
              Maybe you could link the .video output and set the frame type to RGB.

              Thanks,
              Jaka

                Hi jakaskerl
                Here's the code

                from pathlib import Path

                import cv2

                import depthai as dai

                import time

                from environs import Env
                env = Env()
                env.read_env()

                MxID = env('MxID')

                # Preview size (width, height); the preview is the ImageManip input
                preview_width, preview_height = 640, 640
                # Network input size (width, height) the cropped frame is resized to
                nn_width, nn_height = 640, 640

                # Set your custom ROI coordinates (x, y, width, height) in preview pixels
                custom_roi = (350, 250, 640, 640)  # Example coordinates, adjust as needed


                def normalize_roi(roi, frame_width, frame_height):
                    """Convert a pixel (x, y, width, height) ROI to normalized corners.

                    setCropRect() takes (xmin, ymin, xmax, ymax) in the 0..1 range, so the
                    ROI is scaled by the frame size and clamped to the frame.

                    Raises:
                        ValueError: if the clamped ROI has no area inside the frame.
                    """
                    x, y, width, height = roi
                    xmin = min(max(x / frame_width, 0.0), 1.0)
                    ymin = min(max(y / frame_height, 0.0), 1.0)
                    xmax = min(max((x + width) / frame_width, 0.0), 1.0)
                    ymax = min(max((y + height) / frame_height, 0.0), 1.0)
                    if xmin >= xmax or ymin >= ymax:
                        raise ValueError(f"ROI {roi} does not overlap a {frame_width}x{frame_height} frame")
                    return xmin, ymin, xmax, ymax


                # COCO label texts; a tracklet's integer label is an index into this list
                labelMap = [
                    "person",         "bicycle",    "car",           "motorbike",     "aeroplane",   "bus",           "train",
                    "truck",          "boat",       "traffic light", "fire hydrant",  "stop sign",   "parking meter", "bench",
                    "bird",           "cat",        "dog",           "horse",         "sheep",       "cow",           "elephant",
                    "bear",           "zebra",      "giraffe",       "backpack",      "umbrella",    "handbag",       "tie",
                    "suitcase",       "frisbee",    "skis",          "snowboard",     "sports ball", "kite",          "baseball bat",
                    "baseball glove", "skateboard", "surfboard",     "tennis racket", "bottle",      "wine glass",    "cup",
                    "fork",           "knife",      "spoon",         "bowl",          "banana",      "apple",         "sandwich",
                    "orange",         "broccoli",   "carrot",        "hot dog",       "pizza",       "donut",         "cake",
                    "chair",          "sofa",       "pottedplant",   "bed",           "diningtable", "toilet",        "tvmonitor",
                    "laptop",         "mouse",      "remote",        "keyboard",      "cell phone",  "microwave",     "oven",
                    "toaster",        "sink",       "refrigerator",  "book",          "clock",       "vase",          "scissors",
                    "teddy bear",     "hair drier", "toothbrush"
                ]

                nnPath = str((Path(__file__).parent / Path('model/yolov6n_coco_640x640_openvino_2022.1_6shave.blob')).resolve().absolute())

                # Create pipeline
                pipeline = dai.Pipeline()

                # Define sources and outputs
                camRgb = pipeline.create(dai.node.ColorCamera)
                detectionNetwork = pipeline.create(dai.node.YoloDetectionNetwork)
                objectTracker = pipeline.create(dai.node.ObjectTracker)

                xlinkOut = pipeline.create(dai.node.XLinkOut)
                trackerOut = pipeline.create(dai.node.XLinkOut)

                xlinkOut.setStreamName("preview")
                trackerOut.setStreamName("tracklets")

                # Creating Manip node
                manip = pipeline.create(dai.node.ImageManip)
                # Setting CropRect for the Region of Interest (normalized corners)
                manip.initialConfig.setCropRect(*normalize_roi(custom_roi, preview_width, preview_height))
                # Setting Resize for the neural network input size
                manip.initialConfig.setResize(nn_width, nn_height)
                # Setting maximum output frame size based on the desired output dimensions
                max_output_frame_size = 3 * nn_width * nn_height  # Assuming 3 channels for BGR image
                manip.setMaxOutputFrameSize(max_output_frame_size)

                # Properties
                if MxID == "14442C10C1AD3FD700":
                    camRgb.setImageOrientation(dai.CameraImageOrientation.HORIZONTAL_MIRROR)
                camRgb.setPreviewSize(preview_width, preview_height)
                camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
                camRgb.setInterleaved(False)
                camRgb.setColorOrder(dai.ColorCameraProperties.ColorOrder.BGR)
                camRgb.setFps(40)

                # Network specific settings
                detectionNetwork.setConfidenceThreshold(0.5)
                detectionNetwork.setNumClasses(80)
                detectionNetwork.setCoordinateSize(4)
                # For YOLOv4 use these instead:
                # detectionNetwork.setAnchors([10, 14, 23, 27, 37, 58, 81, 82, 135, 169, 344, 319])
                # detectionNetwork.setAnchorMasks({"side26": [1, 2, 3], "side13": [3, 4, 5]})
                # For YOLOv5:
                # NOTE(review): the blob loaded above is a YOLOv6n model - confirm these
                # anchors and masks are what it needs.
                detectionNetwork.setAnchors([10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326])
                detectionNetwork.setAnchorMasks({"side52": [0,1,2], "side26": [3,4,5], "side13": [6,7,8]})
                detectionNetwork.setIouThreshold(0.5)
                detectionNetwork.setBlobPath(nnPath)
                detectionNetwork.setNumInferenceThreads(2)
                detectionNetwork.input.setBlocking(False)

                objectTracker.setDetectionLabelsToTrack([0])  # track only person
                # possible tracking types: ZERO_TERM_COLOR_HISTOGRAM, ZERO_TERM_IMAGELESS, SHORT_TERM_IMAGELESS, SHORT_TERM_KCF
                objectTracker.setTrackerType(dai.TrackerType.ZERO_TERM_COLOR_HISTOGRAM)
                # take the smallest ID when new object is tracked, possible options: SMALLEST_ID, UNIQUE_ID
                objectTracker.setTrackerIdAssignmentPolicy(dai.TrackerIdAssignmentPolicy.SMALLEST_ID)

                # Linking
                # Connecting Manip node to ColorCamera
                camRgb.preview.link(manip.inputImage)
                # Connecting Manip node to YoloDetectionNetwork (the network sees the
                # cropped frame instead of the raw preview)
                manip.out.link(detectionNetwork.input)
                objectTracker.passthroughTrackerFrame.link(xlinkOut.input)

                detectionNetwork.passthrough.link(objectTracker.inputTrackerFrame)
                detectionNetwork.passthrough.link(objectTracker.inputDetectionFrame)
                detectionNetwork.out.link(objectTracker.inputDetections)
                objectTracker.out.link(trackerOut.input)

                # Kept under its own name so the Device opened below does not shadow it
                device_info = dai.DeviceInfo(MxID)

                # Connect to device and start pipeline
                with dai.Device(pipeline, device_info) as device:

                    preview = device.getOutputQueue(name="preview", maxSize=4, blocking=False)
                    tracklets = device.getOutputQueue(name="tracklets", maxSize=4, blocking=False)

                    startTime = time.monotonic()
                    counter = 0
                    fps = 0

                    # BGR overlay colours
                    text_color = (0, 0, 255)
                    box_color = (111, 147, 26)

                    while True:
                        imgFrame = preview.get()
                        track = tracklets.get()

                        # Re-estimate the frame rate once more than a second has elapsed
                        counter += 1
                        current_time = time.monotonic()
                        if (current_time - startTime) > 1:
                            fps = counter / (current_time - startTime)
                            counter = 0
                            startTime = current_time

                        frame = imgFrame.getCvFrame()
                        for t in track.tracklets:
                            # Only draw objects that are currently being tracked
                            if t.status.name != "TRACKED":
                                continue

                            # Tracklet ROIs are normalized; scale them to the frame size
                            roi = t.roi.denormalize(frame.shape[1], frame.shape[0])
                            x1 = int(roi.topLeft().x)
                            y1 = int(roi.topLeft().y)
                            x2 = int(roi.bottomRight().x)
                            y2 = int(roi.bottomRight().y)

                            # Fall back to the numeric label when it is outside labelMap
                            label = labelMap[t.label] if 0 <= t.label < len(labelMap) else t.label

                            cv2.putText(frame, str(label), (x1 + 10, y1 + 20), cv2.FONT_HERSHEY_TRIPLEX, 0.5, text_color)
                            cv2.putText(frame, f"ID: {[t.id]}", (x1 + 10, y1 + 45), cv2.FONT_HERSHEY_TRIPLEX, 0.5, text_color)
                            cv2.putText(frame, t.status.name, (x1 + 10, y1 + 70), cv2.FONT_HERSHEY_TRIPLEX, 0.5, text_color)
                            # Explicit 1 px outline (a font constant used to be passed as the thickness)
                            cv2.rectangle(frame, (x1, y1), (x2, y2), box_color, 1)

                        cv2.putText(frame, "FPS: {:.2f}".format(fps), (2, frame.shape[0] - 4), cv2.FONT_HERSHEY_TRIPLEX, 0.6, text_color)

                        cv2.imshow("tracker", frame)

                        if cv2.waitKey(1) == ord('q'):
                            break

                Here's better version:

                from pathlib import Path
                import cv2
                import depthai as dai
                import time
                
                from environs import Env
                
                env = Env()
                env.read_env()

                MxID = env('MxID')

                # Preview size (width, height); the preview is the ImageManip input
                preview_width, preview_height = 640, 640
                # Network input size (width, height) the cropped frame is resized to
                nn_width, nn_height = 640, 640

                # Set your custom ROI coordinates (x, y, width, height) in preview pixels
                custom_roi = (350, 250, 640, 640)  # Example coordinates, adjust as needed


                def normalize_roi(roi, frame_width, frame_height):
                    """Convert a pixel (x, y, width, height) ROI to normalized corners.

                    setCropRect() takes (xmin, ymin, xmax, ymax) in the 0..1 range, so the
                    ROI is scaled by the frame size and clamped to the frame.

                    Raises:
                        ValueError: if the clamped ROI has no area inside the frame.
                    """
                    x, y, width, height = roi
                    xmin = min(max(x / frame_width, 0.0), 1.0)
                    ymin = min(max(y / frame_height, 0.0), 1.0)
                    xmax = min(max((x + width) / frame_width, 0.0), 1.0)
                    ymax = min(max((y + height) / frame_height, 0.0), 1.0)
                    if xmin >= xmax or ymin >= ymax:
                        raise ValueError(f"ROI {roi} does not overlap a {frame_width}x{frame_height} frame")
                    return xmin, ymin, xmax, ymax


                # COCO label texts; a tracklet's integer label is an index into this list
                labelMap = [
                    "person",         "bicycle",    "car",           "motorbike",     "aeroplane",   "bus",           "train",
                    "truck",          "boat",       "traffic light", "fire hydrant",  "stop sign",   "parking meter", "bench",
                    "bird",           "cat",        "dog",           "horse",         "sheep",       "cow",           "elephant",
                    "bear",           "zebra",      "giraffe",       "backpack",      "umbrella",    "handbag",       "tie",
                    "suitcase",       "frisbee",    "skis",          "snowboard",     "sports ball", "kite",          "baseball bat",
                    "baseball glove", "skateboard", "surfboard",     "tennis racket", "bottle",      "wine glass",    "cup",
                    "fork",           "knife",      "spoon",         "bowl",          "banana",      "apple",         "sandwich",
                    "orange",         "broccoli",   "carrot",        "hot dog",       "pizza",       "donut",         "cake",
                    "chair",          "sofa",       "pottedplant",   "bed",           "diningtable", "toilet",        "tvmonitor",
                    "laptop",         "mouse",      "remote",        "keyboard",      "cell phone",  "microwave",     "oven",
                    "toaster",        "sink",       "refrigerator",  "book",          "clock",       "vase",          "scissors",
                    "teddy bear",     "hair drier", "toothbrush"
                ]

                nnPath = str((Path(__file__).parent / Path('model/yolov6n_coco_640x640_openvino_2022.1_6shave.blob')).resolve().absolute())

                # Create pipeline
                pipeline = dai.Pipeline()

                # Define sources and outputs
                camRgb = pipeline.create(dai.node.ColorCamera)
                detectionNetwork = pipeline.create(dai.node.YoloDetectionNetwork)
                objectTracker = pipeline.create(dai.node.ObjectTracker)

                xlinkOut = pipeline.create(dai.node.XLinkOut)
                trackerOut = pipeline.create(dai.node.XLinkOut)

                xlinkOut.setStreamName("preview")
                trackerOut.setStreamName("tracklets")

                # Creating Manip node
                manip = pipeline.create(dai.node.ImageManip)
                # Setting CropRect for the Region of Interest (normalized corners)
                manip.initialConfig.setCropRect(*normalize_roi(custom_roi, preview_width, preview_height))
                # Setting Resize for the neural network input size
                manip.initialConfig.setResize(nn_width, nn_height)
                # Setting maximum output frame size based on the desired output dimensions
                max_output_frame_size = 3 * nn_width * nn_height  # Assuming 3 channels for BGR image
                manip.setMaxOutputFrameSize(max_output_frame_size)

                # Properties
                if MxID == "14442C10C1AD3FD700":
                    camRgb.setImageOrientation(dai.CameraImageOrientation.HORIZONTAL_MIRROR)
                camRgb.setPreviewSize(preview_width, preview_height)
                camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
                camRgb.setInterleaved(False)
                camRgb.setColorOrder(dai.ColorCameraProperties.ColorOrder.BGR)
                camRgb.setFps(40)

                # Network specific settings
                detectionNetwork.setConfidenceThreshold(0.5)
                detectionNetwork.setNumClasses(80)
                detectionNetwork.setCoordinateSize(4)
                # For YOLOv4 use these instead:
                # detectionNetwork.setAnchors([10, 14, 23, 27, 37, 58, 81, 82, 135, 169, 344, 319])
                # detectionNetwork.setAnchorMasks({"side26": [1, 2, 3], "side13": [3, 4, 5]})
                # For YOLOv5:
                # NOTE(review): the blob loaded above is a YOLOv6n model - confirm these
                # anchors and masks are what it needs.
                detectionNetwork.setAnchors([10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326])
                detectionNetwork.setAnchorMasks({"side52": [0,1,2], "side26": [3,4,5], "side13": [6,7,8]})
                detectionNetwork.setIouThreshold(0.5)
                detectionNetwork.setBlobPath(nnPath)
                detectionNetwork.setNumInferenceThreads(2)
                detectionNetwork.input.setBlocking(False)

                objectTracker.setDetectionLabelsToTrack([0])  # track only person
                # possible tracking types: ZERO_TERM_COLOR_HISTOGRAM, ZERO_TERM_IMAGELESS, SHORT_TERM_IMAGELESS, SHORT_TERM_KCF
                objectTracker.setTrackerType(dai.TrackerType.ZERO_TERM_COLOR_HISTOGRAM)
                # take the smallest ID when new object is tracked, possible options: SMALLEST_ID, UNIQUE_ID
                objectTracker.setTrackerIdAssignmentPolicy(dai.TrackerIdAssignmentPolicy.SMALLEST_ID)

                # Linking
                # Connecting Manip node to ColorCamera
                camRgb.preview.link(manip.inputImage)
                # Connecting Manip node to YoloDetectionNetwork (the network sees the
                # cropped frame instead of the raw preview)
                manip.out.link(detectionNetwork.input)
                objectTracker.passthroughTrackerFrame.link(xlinkOut.input)

                detectionNetwork.passthrough.link(objectTracker.inputTrackerFrame)
                detectionNetwork.passthrough.link(objectTracker.inputDetectionFrame)
                detectionNetwork.out.link(objectTracker.inputDetections)
                objectTracker.out.link(trackerOut.input)

                # Kept under its own name so the Device opened below does not shadow it
                device_info = dai.DeviceInfo(MxID)

                # Connect to device and start pipeline
                with dai.Device(pipeline, device_info) as device:

                    preview = device.getOutputQueue(name="preview", maxSize=4, blocking=False)
                    tracklets = device.getOutputQueue(name="tracklets", maxSize=4, blocking=False)

                    startTime = time.monotonic()
                    counter = 0
                    fps = 0

                    # BGR overlay colours
                    text_color = (0, 0, 255)
                    box_color = (111, 147, 26)

                    while True:
                        imgFrame = preview.get()
                        track = tracklets.get()

                        # Re-estimate the frame rate once more than a second has elapsed
                        counter += 1
                        current_time = time.monotonic()
                        if (current_time - startTime) > 1:
                            fps = counter / (current_time - startTime)
                            counter = 0
                            startTime = current_time

                        frame = imgFrame.getCvFrame()
                        for t in track.tracklets:
                            # Only draw objects that are currently being tracked
                            if t.status.name != "TRACKED":
                                continue

                            # Tracklet ROIs are normalized; scale them to the frame size
                            roi = t.roi.denormalize(frame.shape[1], frame.shape[0])
                            x1 = int(roi.topLeft().x)
                            y1 = int(roi.topLeft().y)
                            x2 = int(roi.bottomRight().x)
                            y2 = int(roi.bottomRight().y)

                            # Fall back to the numeric label when it is outside labelMap
                            label = labelMap[t.label] if 0 <= t.label < len(labelMap) else t.label

                            cv2.putText(frame, str(label), (x1 + 10, y1 + 20), cv2.FONT_HERSHEY_TRIPLEX, 0.5, text_color)
                            cv2.putText(frame, f"ID: {[t.id]}", (x1 + 10, y1 + 45), cv2.FONT_HERSHEY_TRIPLEX, 0.5, text_color)
                            cv2.putText(frame, t.status.name, (x1 + 10, y1 + 70), cv2.FONT_HERSHEY_TRIPLEX, 0.5, text_color)
                            # Explicit 1 px outline (a font constant used to be passed as the thickness)
                            cv2.rectangle(frame, (x1, y1), (x2, y2), box_color, 1)

                        cv2.putText(frame, "FPS: {:.2f}".format(fps), (2, frame.shape[0] - 4), cv2.FONT_HERSHEY_TRIPLEX, 0.6, text_color)

                        cv2.imshow("tracker", frame)

                        if cv2.waitKey(1) == ord('q'):
                            break

                  Hi Fakhrullo
                  setCropRect() expects normalized values. Divide it by 416, then I believe it should work.

                  Thanks,
                  Jaka

                    5 days later

                    Hi jakaskerl
                    Thanks it worked fine, I'm getting cropped frame.
                    Is it possible to fuse the cropped frame and the whole frame? I mean, show the whole frame with the cropped region marked inside it, maybe with a box? If that is impossible, how can I show the whole and cropped frames in two different windows?

                      Hi Fakhrullo
                      Sorry, I don't really understand what you are trying to achieve. Both frames are numpy arrays which means you can easily stack one on top of the other.

                      Thanks,
                      Jaka

                        Hi jakaskerl
                        Yeah, basically I want to retrieve both full and cropped frames, to achieve that what should I do?

                        Fakhrullo camRgb.preview.link(manip.inputImage)

                        Here, you can pipe the preview stream to a new XLink node, so you can view it on host side.
                        This will enable you to see both the full preview frame as well as cropped frame (on which the inference was made).

                        Thanks,
                        Jaka