• DepthAI-v2
  • Oak-1-Lite, 4K RGB demo, bounding boxes aren't scaled correctly.

theDesertMoon

You need to set:

stereo.setDepthAlign(dai.CameraBoardSocket.RGB)
stereo.setOutputSize(monoLeft.getResolutionWidth(), monoLeft.getResolutionHeight())

Like here:
https://github.com/luxonis/depthai-python/blob/main/examples/SpatialDetection/spatial_mobilenet.py#L64

Make sure to update to the latest library, 2.15.4.0; we recently fixed some bugs related to stability when RGB alignment was enabled.
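
For reference, here is a minimal sketch of where those two calls sit in a stereo pipeline, roughly along the lines of the linked example (the sockets and mono resolution below are just illustrative choices):

import depthai as dai

pipeline = dai.Pipeline()

# Left/right mono cameras feed a StereoDepth node
monoLeft = pipeline.create(dai.node.MonoCamera)
monoRight = pipeline.create(dai.node.MonoCamera)
stereo = pipeline.create(dai.node.StereoDepth)

monoLeft.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
monoLeft.setBoardSocket(dai.CameraBoardSocket.LEFT)
monoRight.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
monoRight.setBoardSocket(dai.CameraBoardSocket.RIGHT)

# Align the depth output to the RGB camera and keep it at the mono resolution,
# so spatial detections line up with the RGB frame
stereo.setDepthAlign(dai.CameraBoardSocket.RGB)
stereo.setOutputSize(monoLeft.getResolutionWidth(), monoLeft.getResolutionHeight())

monoLeft.out.link(stereo.left)
monoRight.out.link(stereo.right)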

    GergelySzabolcs

    Would this be valid for the Oak-1-Lite? I don't have the two mono cameras for disparity, and I haven't created the stereo pipeline. I did update depthai using the force-reinstall option, but it didn't help in this specific case.

      GergelySzabolcs
      Thank you for your reply.

      While I see several differences between this script and your '15_rgb_mobilenet_4k' example, I'm still seeing the same results.

      Is there some inherent limitation to the Oak-1-Lite?

      The only change I made to the script you just pointed me to was to change the location of the blob file.

      I have an Oak-D-Lite connected to the same system (Raspberry Pi 4B, if that makes a difference) and it exhibits the same behavior.

      GergelySzabolcs
      I just realized that I lied. 🙂 I did make one other change.

      With 'camRgb.setPreviewKeepAspectRatio(False)' the model seems a lot less accurate, so I've habitually set it to 'True'. When I changed it back to 'False' the model lost some accuracy, but when it did recognize an item it mapped the bounding box correctly onto the larger image.

      Is there a way to hit a sweet spot that keeps the model accurate while still allowing a correct mapping to the larger image?

      Thank you!

      @theDesertMoon
      Check this one out; it's without camRgb.setPreviewKeepAspectRatio(False).
      The 1920x1080 frame is cropped to 1080x1080, which is then scaled to 300x300 to preserve the aspect ratio, so you need to add a (1920-1080)/2 offset when displaying bounding boxes at full resolution.

      #!/usr/bin/env python3
      
      from pathlib import Path
      import sys
      import cv2
      import depthai as dai
      import numpy as np
      
      # Get argument first
      nnPath = str((Path(__file__).parent / Path('../models/mobilenet-ssd_openvino_2021.4_5shave.blob')).resolve().absolute())
      if len(sys.argv) > 1:
          nnPath = sys.argv[1]
      
      if not Path(nnPath).exists():
          raise FileNotFoundError(f'Required file/s not found, please run "{sys.executable} install_requirements.py"')
      
      # MobilenetSSD label texts
      labelMap = ["background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow",
                  "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]
      
      # Create pipeline
      pipeline = dai.Pipeline()
      
      # Define sources and outputs
      camRgb = pipeline.create(dai.node.ColorCamera)
      nn = pipeline.create(dai.node.MobileNetDetectionNetwork)
      
      xoutVideo = pipeline.create(dai.node.XLinkOut)
      xoutPreview = pipeline.create(dai.node.XLinkOut)
      nnOut = pipeline.create(dai.node.XLinkOut)
      
      xoutVideo.setStreamName("video")
      xoutPreview.setStreamName("preview")
      nnOut.setStreamName("nn")
      
      # Properties
      camRgb.setPreviewSize(300, 300)    # NN input
      camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
      camRgb.setInterleaved(False)
      # camRgb.setPreviewKeepAspectRatio(False)
      # Define a neural network that will make predictions based on the source frames
      nn.setConfidenceThreshold(0.5)
      nn.setBlobPath(nnPath)
      nn.setNumInferenceThreads(2)
      nn.input.setBlocking(False)
      
      # Linking
      camRgb.video.link(xoutVideo.input)
      camRgb.preview.link(xoutPreview.input)
      camRgb.preview.link(nn.input)
      nn.out.link(nnOut.input)
      
      # Horizontal offset of the square centre crop inside the full-width video frame,
      # e.g. (1920 - 1080) // 2 = 420 for 1080P
      videoSize = camRgb.getVideoSize()
      videoOffset = (videoSize[0] - videoSize[1]) // 2
      print(videoOffset)
      # Connect to device and start pipeline
      with dai.Device(pipeline) as device:
      
          # Output queues will be used to get the frames and nn data from the outputs defined above
          qVideo = device.getOutputQueue(name="video", maxSize=4, blocking=False)
          qPreview = device.getOutputQueue(name="preview", maxSize=4, blocking=False)
          qDet = device.getOutputQueue(name="nn", maxSize=4, blocking=False)
      
          previewFrame = None
          videoFrame = None
          detections = []
      
          # nn data, being the bounding box locations, are in <0..1> range - they need to be normalized with frame width/height
          def frameNorm(frame, bbox):
              normVals = np.full(len(bbox), frame.shape[0])
              normVals[::2] = frame.shape[1]
              return (np.clip(np.array(bbox), 0, 1) * normVals).astype(int)
      
          def displayFrame(name, frame, offsetX = 0):
              color = (255, 0, 0)
              for detection in detections:
                  bbox = frameNorm(frame, (detection.xmin, detection.ymin, detection.xmax, detection.ymax))
                  # Compensate for the centre crop: shift xmin right and xmax left by the crop offset
                  bbox[0] += offsetX
                  bbox[2] -= offsetX
                  cv2.putText(frame, labelMap[detection.label], (bbox[0] + 10, bbox[1] + 20), cv2.FONT_HERSHEY_TRIPLEX, 0.5, color)
                  cv2.putText(frame, f"{int(detection.confidence * 100)}%", (bbox[0] + 10, bbox[1] + 40), cv2.FONT_HERSHEY_TRIPLEX, 0.5, color)
                  cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, 2)
              # Show the frame
              cv2.imshow(name, frame)
      
          cv2.namedWindow("video", cv2.WINDOW_NORMAL)
          cv2.resizeWindow("video", 1280, 720)
          print("Resize video window with mouse drag!")
      
          while True:
              # Instead of get (blocking), we use tryGet (non-blocking) which will return the available data or None otherwise
              inVideo = qVideo.tryGet()
              inPreview = qPreview.tryGet()
              inDet = qDet.tryGet()
      
              if inVideo is not None:
                  videoFrame = inVideo.getCvFrame()
      
              if inPreview is not None:
                  previewFrame = inPreview.getCvFrame()
      
              if inDet is not None:
                  detections = inDet.detections
      
              if videoFrame is not None:
                  displayFrame("video", videoFrame, videoOffset)
      
              if previewFrame is not None:
                  displayFrame("preview", previewFrame)
      
              if cv2.waitKey(1) == ord('q'):
                  break

        GergelySzabolcs
        Thank you for your efforts.

        It's still the same, just different. 🙂
        The 'train' detection seems to transition mostly correctly, but it is also centered. I'm playing with the numbers on my side some too, however.

        GergelySzabolcs
        FWIW, I played with the numbers a bit more!

        I realized that 'train' tended to stay correct, and the further a detection was from center the more out of proportion its dimensions were, so I added the code below to adjust for that. I used your 'offset' value to determine whether I needed to alter the dimensions.

        I still need to redo the math, but it's almost on the right path for anyone with a similar issue.


        def frameNorm(frame, bbox):
            normVals = np.full(len(bbox), frame.shape[0])
            normVals[::2] = frame.shape[1]
            return (np.clip(np.array(bbox), 0, 1) * normVals).astype(int)
        
        def displayFrame(name, frame, offsetX = 0):
            color = (255, 0, 0)
            #print(frame.shape)
            for detection in detections:
                bbox = frameNorm(frame, (detection.xmin, detection.ymin, detection.xmax, detection.ymax))
                if offsetX == 1:
                  #print("*** *** ***")
                  #print(bbox)
                  bbox[0] = bbox[0]+(((frame.shape[1]/2)-bbox[0])*(300/frame.shape[0]))
                  bbox[2] = bbox[2]+(((frame.shape[1]/2)-bbox[2])*(300/frame.shape[0]))
                  bbox[1] = bbox[1]+(((frame.shape[0]/2)-bbox[1])*(300/frame.shape[1]))
                  bbox[3] = bbox[3]+(((frame.shape[0]/2)-bbox[3])*(300/frame.shape[1]))
                  #print(bbox)
                  #print("*** *** ***")
                cv2.putText(frame, labelMap[detection.label], (bbox[0] + 10, bbox[1] + 20), cv2.FONT_HERSHEY_TRIPLEX, 0.5, color)
                cv2.putText(frame, f"{int(detection.confidence * 100)}%", (bbox[0] + 10, bbox[1] + 40), cv2.FONT_HERSHEY_TRIPLEX, 0.5, color)
                cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, 2)
            # Show the frame
            cv2.imshow(name, frame)


        • erik replied to this.

          erik
          Wow, Erik! My face is red. Definitely a case of RTFM on my part. 🙂 Thank you; your maths are far superior to my own, and the results, as you expected, are spot on.

          @GergelySzabolcs, my apologies also. I will slap my hand and dig a little further in the future.
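
          In case it helps anyone else who lands here, this is roughly the mapping involved (my own reconstruction, not a verbatim copy of Erik's reply): with the aspect ratio kept, the 300x300 preview is a scaled-down 1080x1080 centre crop of the 1080P video frame, so the normalized detection coordinates get scaled by the crop size and the x values shifted by the crop offset.

          # Sketch only: map a normalized detection from the square, aspect-ratio-preserving
          # preview back onto the full 1920x1080 video frame (values assume 1080P video)
          def bboxToFullFrame(detection, videoWidth=1920, videoHeight=1080):
              cropSize = videoHeight                  # the preview sees a 1080x1080 centre crop
              offsetX = (videoWidth - cropSize) // 2  # (1920 - 1080) // 2 = 420
              xmin = int(detection.xmin * cropSize) + offsetX
              xmax = int(detection.xmax * cropSize) + offsetX
              ymin = int(detection.ymin * cropSize)
              ymax = int(detection.ymax * cropSize)
              return xmin, ymin, xmax, ymax

          The returned corners can then be drawn straight onto the full videoFrame with cv2.rectangle, the same way displayFrame does above.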

          • erik replied to this.