Hi,

I am trying to run Deep Sort with YOLO spatial mono on the SDK, using the implementation I found here: https://github.com/luxonis/depthai-experiments/blob/master/gen2-deepsort-tracking/main.py

I need to run spatial mono because I want to use the OAK-D in a low-light environment.

I am able to get Deep Sort running with spatial in color (there is a significant delay, but at least it works), but I cannot get it working in mono. The only difference between the two versions is that, for mono, I set up a left and a right camera and combine them into a stereo pair. May I get some help on this?

Error message:

Traceback (most recent call last):
  File "...\depthai-experiments\gen2-deepsort-tracking\jae_deepsort_spatial_mono.py", line 36, in <module>
    yolo = oak.create_nn('yolov6nr3_coco_640x352', stereo,  spatial = True)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "...venv\Lib\site-packages\depthai_sdk\oak_camera.py", line 322, in create_nn
    comp = NNComponent(self.device,
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "...venv\Lib\site-packages\depthai_sdk\components\nn_component.py", line 199, in __init__
    raise ValueError(
ValueError: 'input' argument passed on init isn't supported!You can only use NnComponent or CameraComponent as the input.
Sentry is attempting to send 2 pending error messages

Spatial Mono (getting error):

import cv2
import depthai as dai
from depthai_sdk import OakCamera
from depthai_sdk.classes.packets import TwoStagePacket
from depthai_sdk.visualize.configs import TextPosition
from deep_sort_realtime.deepsort_tracker import DeepSort

tracker = DeepSort(max_age=1000, nn_budget=None, embedder=None, nms_max_overlap=1.0, max_cosine_distance=0.2)

def cb(packet: TwoStagePacket):
    detections = packet.img_detections.detections
    vis = packet.visualizer
    # Update the tracker
    object_tracks = tracker.iter(detections, packet.nnData, (640, 640))

    for track in object_tracks:
        if not track.is_confirmed() or \
            track.time_since_update > 1 or \
            track.detection_id >= len(detections) or \
            track.detection_id < 0:
            continue

        det = packet.detections[track.detection_id]
        vis.add_text(f'ID: {track.track_id}',
                        bbox=(*det.top_left, *det.bottom_right),
                        position=TextPosition.MID)
    frame = vis.draw(packet.frame)
    cv2.imshow('DeepSort tracker', frame)


with OakCamera() as oak:
    left = oak.create_camera("left")
    right = oak.create_camera("right")

    stereo = oak.create_stereo(left=left, right=right, fps=15)
    yolo = oak.create_nn('yolov6nr3_coco_640x352', stereo,  spatial = True)
    embedder = oak.create_nn('mobilenetv2_imagenet_embedder_224x224', input=yolo)

    embedder.config_spatial(
        bb_scale_factor=0.5, # Scaling bounding box before averaging the depth in that ROI
        lower_threshold=300, # Discard depth points below 30cm
        upper_threshold=10000, # Discard depth points above 10m
        # Average depth points before calculating X and Y spatial coordinates:
        calc_algo=dai.SpatialLocationCalculatorAlgorithm.AVERAGE
    )

    oak.visualize(embedder, fps=True, callback=cb)
    # oak.show_graph()
    oak.start(blocking=True)

Spatial Color (working):

import cv2
import depthai as dai
from depthai_sdk import OakCamera
from depthai_sdk.classes.packets import TwoStagePacket
from depthai_sdk.visualize.configs import TextPosition
from deep_sort_realtime.deepsort_tracker import DeepSort

tracker = DeepSort(max_age=1000, nn_budget=None, embedder=None, nms_max_overlap=1.0, max_cosine_distance=0.2)

def cb(packet: TwoStagePacket):
    detections = packet.img_detections.detections
    vis = packet.visualizer
    # Update the tracker
    object_tracks = tracker.iter(detections, packet.nnData, (640, 640))

    for track in object_tracks:
        if not track.is_confirmed() or \
            track.time_since_update > 1 or \
            track.detection_id >= len(detections) or \
            track.detection_id < 0:
            continue

        det = packet.detections[track.detection_id]
        vis.add_text(f'ID: {track.track_id}',
                        bbox=(*det.top_left, *det.bottom_right),
                        position=TextPosition.MID)
    frame = vis.draw(packet.frame)
    cv2.imshow('DeepSort tracker', frame)


with OakCamera() as oak:
    color = oak.create_camera('color', fps=15)
    yolo = oak.create_nn('yolov6nr3_coco_640x352', color,  spatial = True)
    embedder = oak.create_nn('mobilenetv2_imagenet_embedder_224x224', input=yolo)

    embedder.config_spatial(
        bb_scale_factor=0.5, # Scaling bounding box before averaging the depth in that ROI
        lower_threshold=300, # Discard depth points below 30cm
        upper_threshold=10000, # Discard depth points above 10m
        # Average depth points before calculating X and Y spatial coordinates:
        calc_algo=dai.SpatialLocationCalculatorAlgorithm.AVERAGE
    )

    oak.visualize(embedder, fps=True, callback=cb)
    # oak.show_graph()
    oak.start(blocking=True)

    Hi jsiic
    I think you just need to pass a single camera to the NN as its input, and the stereo component via the spatial argument, for spatial inference:
    yolo = oak.create_nn('yolov6nr3_coco_640x352', left, spatial = stereo)

    Thanks,
    Jaka

      jakaskerl

      Oh yeah, when I tried that, I got the following error, as if the mono footage were in the wrong format... I am not sure why, because I'd assume the image size is the same in both mono and color? (I probably don't understand the ML side well enough.)

      [2023-10-15 21:41:05] WARNING [root.config_spatial:572] This is not a Spatial Detection network! This configuration attempt will be ignored.
      [18443010A173C51200] [169.254.1.222] [9.733] [NeuralNetwork(8)] [error] Input tensor 'input.1' (0) exceeds available data range. Data size (50176B), tensor offset (0), size (150528B) - skipping inference
      [18443010A173C51200] [169.254.1.222] [9.750] [NeuralNetwork(8)] [error] Input tensor 'input.1' (0) exceeds available data range. Data size (50176B), tensor offset (0), size (150528B) - skipping inference
      (the same error repeats for every frame)

        Hi jsiic
        You also have to edit the DeepSort embedder to expect grayscale. Right now I think it is set to RGB, but I'm no expert in this either. cc @Matija — the first model works; the second needs a change to adapt to grayscale.

        Thanks,
        Jaka

        jsiic

        As mentioned, the grayscale frames are also passed to the DeepSort embedder. Note, however, that those models are usually trained on color images, so performance might be slightly worse.

        I think the error stems from passing a grayscale image to a neural network that expects a color image (the input itself is too small). This means the grayscale image would likely have to be turned into a grayscale RGB image by ImageManip. I'm unsure how that can be done in the SDK itself (cc @jakaskerl on this).
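
        As a quick sanity check on the numbers in that log (my own arithmetic, not from the SDK docs): assuming the embedder takes a 224x224 planar input, as the model name suggests, a 1-channel grayscale frame gives exactly the reported data size and a 3-channel frame gives exactly the tensor size the NN expects:

        w, h = 224, 224                # embedder input resolution, per the model name
        gray_bytes = w * h * 1         # 1-plane grayscale frame -> 50176 B ("Data size" in the log)
        bgr_bytes = w * h * 3          # 3-plane BGR/RGB tensor  -> 150528 B (size the NN expects)
        print(gray_bytes, bgr_bytes)   # 50176 150528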

        10 days later

        Sorry, I haven't really had the time to dive in, since I need to inspect the code more in depth.
        I will be doing that today to see if I can make it work. The embedder needs to be changed manually for this to work (I think) @jsiic

        Thanks and apologies for the wait time.
        Jaka

          Hi jsiic

          with OakCamera() as oak:
              left = oak.create_camera("left")
              right = oak.create_camera("right")
          
              stereo = oak.create_stereo(left=left, right=right, fps=15)
              yolo = oak.create_nn('yolov6nr3_coco_640x352', left,  spatial = stereo)
              embedder = oak.create_nn('mobilenetv2_imagenet_embedder_224x224', input=yolo)
              embedder.image_manip.setFrameType(dai.RawImgFrame.Type.BGR888p)  # convert the grayscale crop to 3-plane BGR so it matches the embedder's expected input size
          
              embedder.config_spatial(
                  bb_scale_factor=0.5, # Scaling bounding box before averaging the depth in that ROI
                  lower_threshold=300, # Discard depth points below 30cm
                  upper_threshold=10000, # Discard depth points above 10m
                  # Average depth points before calculating X and Y spatial coordinates:
                  calc_algo=dai.SpatialLocationCalculatorAlgorithm.AVERAGE
              )
          
          
          
              oak.visualize(embedder, fps=True)
              # oak.show_graph()
              oak.start(blocking=True)
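
          If I read this right, there are two changes relative to your version: the NN now takes the left mono camera as its frame source (with stereo passed via spatial= for depth), and the setFrameType(dai.RawImgFrame.Type.BGR888p) line should make the embedder's ImageManip output a 3-plane BGR frame, so the grayscale crop matches the tensor size the embedder expects.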

          Thanks,
          Jaka

          So it seems to be working!

          But now I am having trouble wrapping my head around how to print out the data. I would like it in a format something like:

          Tracking ID: X-depth value, Y-depth value, Z-depth value

          But I can't quite figure out how to do this in the callback function. The original code in the SDK example is the following:

          def cb(packet: TwoStagePacket):
              detections = packet.img_detections.detections
              vis = packet.visualizer
              # Update the tracker
              object_tracks = tracker.iter(detections, packet.nnData, (640, 640))
          
              for track in object_tracks:
                  if not track.is_confirmed() or \
                      track.time_since_update > 1 or \
                      track.detection_id >= len(detections) or \
                      track.detection_id < 0:
                      continue
          
                  det = packet.detections[track.detection_id]
                  vis.add_text(f'ID: {track.track_id}', bbox=(*det.top_left, *det.bottom_right), position=TextPosition.MID)
          
              frame = vis.draw(packet.frame)
              cv2.imshow('DeepSort tracker', frame)

          I was thinking that I should be able to print the spatial data by having something like this in the for loop:

          print(packet.spatials.detections[track.detection_id].x) 

          but I am getting the error:

          File "C:...\OakD\depthai-experiments\gen2-deepsort-tracking\jae_deepsort_spatial_mono.py", line 26, in cb
              print(packet.spatials.detections[track.detection_id].x)
                    ^^^^^^^^^^^^^^^
          AttributeError: 'TwoStagePacket' object has no attribute 'spatials'

          I am sort of at a loss here again....

            Hi jsiic

            def cb(packet: TwoStagePacket):
                detections = packet.img_detections.detections
                vis = packet.visualizer
                # Update the tracker
                object_tracks = tracker.iter(detections, packet.nnData, (640, 640))
            
                for track in object_tracks:
                    if not track.is_confirmed() or \
                        track.time_since_update > 1 or \
                        track.detection_id >= len(detections) or \
                        track.detection_id < 0:
                        continue
            
                    det = packet.detections[track.detection_id]
                    spatials = detections[track.detection_id].spatialCoordinates
                    print(f'ID: {track.track_id}, Class: {det.label}, BBox: {det.top_left}, {det.bottom_right}, Spatials: {spatials.x}, {spatials.y}, {spatials.z}')
                    vis.add_text(f'ID: {track.track_id}', bbox=(*det.top_left, *det.bottom_right), position=TextPosition.MID)
            
                frame = vis.draw(packet.frame)
                cv2.imshow('DeepSort tracker', frame)
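
            One thing to keep in mind: the spatial coordinates should be in millimetres (the lower_threshold=300 # 30cm comment above uses the same convention), so you can divide by 1000 if you'd rather print metres:

            print(f'ID: {track.track_id}, Spatials [m]: {spatials.x / 1000:.2f}, {spatials.y / 1000:.2f}, {spatials.z / 1000:.2f}')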

            Thanks,
            Jaka

              jakaskerl Hi jaka, I am getting errors with labels and spatials:

                File "C:....\depthai-experiments\gen2-deepsort-tracking\jae_deepsort_spatial_mono.py", line 24, in cb
                  print(f'ID: {track.track_id}, Class: {det.label}, BBox: {det.top_left}, {det.bottom_right}, Spatials: {spatials.x}, {spatials.y}, {spatials.z}')
                                                        ^^^^^^^^^
              AttributeError: 'Detection' object has no attribute 'label'
              Sentry is attempting to send 2 pending error messages

              I tried running it without the label and got the following error for spatials:

              File "C:\...OakD\depthai-experiments\gen2-deepsort-tracking\jae_deepsort_spatial_mono.py", line 24, in cb
                  print(f'ID: {track.track_id}, BBox: {det.top_left}, {det.bottom_right}, Spatials: {spatials.x}, {spatials.y}, {spatials.z}')
                                                                                                     ^^^^^^^^
              NameError: name 'spatials' is not defined
              Sentry is attempting to send 2 pending error messages

              I also tried det.spatials.x and detections.spatials.x, but neither works...

              Hi @jsiic
              Could you paste the code you are using? It might be different from the one I have locally; I didn't really keep track of all the changes.

              Thanks,
              Jaka

                jakaskerl Actually, the spatials x, y, z are working; Class: {det.label} is not. Here it is!

                import cv2
                from depthai_sdk import OakCamera
                import depthai as dai
                from depthai_sdk.classes.packets import TwoStagePacket
                from depthai_sdk.visualize.configs import TextPosition
                from deep_sort_realtime.deepsort_tracker import DeepSort
                
                tracker = DeepSort(max_age=1000, nn_budget=None, embedder=None, nms_max_overlap=1.0, max_cosine_distance=0.2)
                
                def cb(packet: TwoStagePacket):
                    detections = packet.img_detections.detections
                    vis = packet.visualizer
                    # Update the tracker
                    object_tracks = tracker.iter(detections, packet.nnData, (640, 640))
                
                    for track in object_tracks:
                        if not track.is_confirmed() or \
                            track.time_since_update > 1 or \
                            track.detection_id >= len(detections) or \
                            track.detection_id < 0:
                            continue
                
                        det = packet.detections[track.detection_id]
                        spatials = detections[track.detection_id].spatialCoordinates
                        print(
                            f'ID: {track.track_id}, Class: {det.label}, BBox: {det.top_left}, {det.bottom_right}, Spatials: {spatials.x}, {spatials.y}, {spatials.z}')
                
                        vis.add_text(f'ID: {track.track_id}', bbox=(*det.top_left, *det.bottom_right), position=TextPosition.MID)
                
                
                    frame = vis.draw(packet.frame)
                    cv2.imshow('DeepSort tracker', frame)
                
                
                with OakCamera() as oak:
                    left = oak.create_camera("left")
                    right = oak.create_camera("right")
                
                    stereo = oak.create_stereo(left=left, right=right, fps=15)
                    yolo = oak.create_nn('yolov6nr3_coco_640x352', left, spatial=stereo)
                    embedder = oak.create_nn('mobilenetv2_imagenet_embedder_224x224', input=yolo)
                    embedder.image_manip.setFrameType(dai.RawImgFrame.Type.BGR888p)
                
                
                    embedder.config_spatial(
                        bb_scale_factor=0.5, # Scaling bounding box before averaging the depth in that ROI
                        lower_threshold=300, # Discard depth points below 30cm
                        upper_threshold=10000, # Discard depth points above 10m
                        # Average depth points before calculating X and Y spatial coordinates:
                        calc_algo=dai.SpatialLocationCalculatorAlgorithm.AVERAGE
                    )
                
                    oak.visualize(embedder, fps=True, callback = cb)
                    # oak.show_graph()
                    oak.start(blocking=True)

                Hi @jsiic
                Change det.label to det.label_str. I could swear the labels worked before I sent it to you... hmm. For example:
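
                print(f'ID: {track.track_id}, Class: {det.label_str}, BBox: {det.top_left}, {det.bottom_right}, Spatials: {spatials.x}, {spatials.y}, {spatials.z}')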

                Thanks,
                Jaka
