• DepthAI-v2
  • can I calculate the depth of ROI instead of 1 point?

Hello,
I am trying to output the distance, using the spatialgen2host example as a starting point. In that file there is the line "spatials, centroid = hostSpatials.calc_spatials(depthData, (x1, y1))", which uses calc_spatials() from calc.py. However, I want to feed this function not the centroid but the whole ROI (x1, y1, x2, y2). The camera shows a few frames at first and then freezes, and I have to force quit to stop it. My work is shown below; did I get anything wrong? Any help will be appreciated.

    import cv2
    import depthai as dai
    from calc import HostSpatialsCalc
    import numpy as np
    import math
    import socket
    import json
    from ultralytics import YOLO

    # Create YOLO model
    model = YOLO('yolov8n-seg.pt')

    # Create a DepthAI pipeline
    pipeline = dai.Pipeline()

    # Define source and output
    camRgb = pipeline.create(dai.node.ColorCamera)
    xoutVideo = pipeline.create(dai.node.XLinkOut)
    xoutVideo.setStreamName("video")

    # Define source and output for stereo cam
    monoLeft = pipeline.create(dai.node.MonoCamera)
    monoRight = pipeline.create(dai.node.MonoCamera)
    stereo = pipeline.create(dai.node.StereoDepth)

    # Properties
    camRgb.setBoardSocket(dai.CameraBoardSocket.CAM_A)
    camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
    camRgb.setVideoSize(640, 480)
    xoutVideo.input.setBlocking(False)
    xoutVideo.input.setQueueSize(1)

    # Properties for depth map
    monoLeft.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
    monoLeft.setBoardSocket(dai.CameraBoardSocket.CAM_B)
    monoRight.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
    monoRight.setBoardSocket(dai.CameraBoardSocket.CAM_C)
    stereo.initialConfig.setConfidenceThreshold(255)
    stereo.setLeftRightCheck(True)
    stereo.setSubpixel(False)

    # Linking
    camRgb.video.link(xoutVideo.input)

    # Linking for depth map
    monoLeft.out.link(stereo.left)
    monoRight.out.link(stereo.right)

    xoutDepth = pipeline.create(dai.node.XLinkOut)
    xoutDepth.setStreamName("depth")
    stereo.depth.link(xoutDepth.input)

    xoutDepth = pipeline.create(dai.node.XLinkOut)
    xoutDepth.setStreamName("disp")
    stereo.disparity.link(xoutDepth.input)

    # Connect to device and start pipeline
    with dai.Device(pipeline) as device:
        video = device.getOutputQueue(name="video", maxSize=1, blocking=False)
        # Output queue will be used to get the depth frames from the outputs defined above
        depth = device.getOutputQueue(name="depth")

        hostSpatials = HostSpatialsCalc(device)
        step = 3
        delta = 5
        hostSpatials.setDeltaRoi(delta)

        color = (255, 255, 255)

        while True:
            videoIn = video.get()
            # Get BGR frame from NV12 encoded video frame to show with OpenCV
            frame = videoIn.getCvFrame()

            # Perform YOLOv8 inference on the frame
            results = model(frame)
            # Get annotated frame with YOLOv8 predictions
            annotated_frame = results[0].plot()

            # All data needed to be transferred
            result = results[0]
            output = []

            # DEPTHFRAME
            depthFrame = depth.get().getFrame()
            depthData = depth.get()
            depth_downscaled = depthFrame[::4]
            if np.all(depth_downscaled == 0):
                min_depth = 0  # Set a default minimum depth value when all elements are zero
            else:
                min_depth = np.percentile(depth_downscaled[depth_downscaled != 0], 1)
            max_depth = np.percentile(depth_downscaled, 99)
            depthFrameColor = np.interp(depthFrame, (min_depth, max_depth), (0, 255)).astype(np.uint8)
            depthFrameColor = cv2.applyColorMap(depthFrameColor, cv2.COLORMAP_HOT)

            height = frame.shape[0]
            width = frame.shape[1]

            for box in result.boxes:
                # coordinates of the bounding box
                x1, y1, x2, y2 = [round(x) for x in box.xyxy[0].tolist()]
                roi = (x1, y1, x2, y2)
                # obj id
                class_id = box.cls[0].item()
                id = result.names[class_id]
                output.append([x1, y1, x2, y2, id])
                print(output)

                center_box_x = int((x1 + x2) / 2)
                center_box_y = int((y1 + y2) / 2)

                #### starting ROI, NEED TO DEFINE ROI AS THE BOUNDING BOX
                # Create a dai.Rect object with the normalized coordinates
                roi_rect = dai.Rect(dai.Point2f(x1, y1), dai.Point2f(x2, y2))
                roi_rect = roi_rect.denormalize(depthFrameColor.shape[1], depthFrameColor.shape[0])
                topLeft = roi_rect.topLeft()
                bottomRight = roi_rect.bottomRight()
                xmin = int(topLeft.x)
                ymin = int(topLeft.y)
                xmax = int(bottomRight.x)
                ymax = int(bottomRight.y)
                cv2.rectangle(depthFrameColor, (xmin, ymin), (xmax, ymax), color, 1)

                # Denormalize bounding box
                xL = int(xmin * width)
                xR = int(xmax * width)
                yL = int(ymin * height)
                yR = int(ymax * height)

                cv2.circle(annotated_frame, (x1, y1), 10, (0, 255, 255), -1)  # yellow dot
                # this line from spatial mobile net
                # cv2.putText(frame, f"Z: {int(box.spatialCoordinates.z)} mm", (xL + 10, yL + 80), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
                # this line is from 2d map
                spatials, centroid = hostSpatials.calc_spatials(depthData, roi)
                cv2.putText(annotated_frame, "Z: " + ("{:.1f}m".format(spatials['z'] / 1000) if not math.isnan(spatials['z']) else "--"), (xL + 10, yL + 50), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)

            # Display the annotated frame
            cv2.imshow("YOLOv8 Inference", annotated_frame)
            # Display Depth Frame
            # cv2.imshow("depth", disp)

            if cv2.waitKey(1) == ord('q'):
                break

    cv2.destroyAllWindows()

KwinT Additional info: the spatial calculation never goes through. It only shows a few frames with object detection, and I guess that when it hits the spatials, centroid = hostSpatials.calc_spatials(depthData, roi) line it cannot get past it, so it freezes.
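A quick way to check where it actually gets stuck is to make the queue reads non-blocking; tryGet() returns None when no message is waiting, so the loop itself can never hang on a queue. A minimal sketch of that check:

    # Sketch: non-blocking reads, to see whether the loop stalls on the queues
    # or inside the calc_spatials call. tryGet() returns None if nothing arrived.
    videoIn = video.tryGet()
    depthData = depth.tryGet()
    if videoIn is None or depthData is None:
        continue  # nothing new yet, keep the loop alive
    frame = videoIn.getCvFrame()
    depthFrame = depthData.getFrame()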

Hi @KwinT
Just change the calc.py function to accept an ROI. Either way, when you pass a point to calc_spatials, it will create an ROI around that point and average the depth values inside it.
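For reference, a minimal sketch of the ROI-averaging part (the helper name calc_roi_depth and the 200-30000 mm thresholds are just for illustration, not the exact code in calc.py):

    import numpy as np

    def calc_roi_depth(depthFrame, roi, thresh_low=200, thresh_high=30000):
        """Average depth (mm) inside a pixel ROI (xmin, ymin, xmax, ymax) of the depth frame."""
        xmin, ymin, xmax, ymax = [int(v) for v in roi]
        # Clamp the ROI to the frame so the slice never goes out of bounds
        h, w = depthFrame.shape
        xmin, xmax = max(0, xmin), min(w, xmax)
        ymin, ymax = max(0, ymin), min(h, ymax)

        depthROI = depthFrame[ymin:ymax, xmin:xmax]
        # Ignore invalid pixels: 0 means "no depth measured", and values outside
        # a plausible range are treated as noise
        inRange = (depthROI >= thresh_low) & (depthROI <= thresh_high)
        if not np.any(inRange):
            return float('nan')
        return float(np.mean(depthROI[inRange]))

Note that the YOLO boxes are in the 640x480 RGB frame, while the 400P depth frame is 640x400 and is not aligned to the RGB camera in your pipeline (no stereo.setDepthAlign(dai.CameraBoardSocket.CAM_A)), so the box coordinates would need to be mapped into depth-frame coordinates before slicing.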

Also, here is a GPT "fixed" version of the code that I haven't checked, but it might solve the blocking.

    import cv2
    import depthai as dai
    from calc import HostSpatialsCalc
    import numpy as np
    import math
    from ultralytics import YOLO
    
    # Create YOLO model
    model = YOLO('yolov8n-seg.pt')
    
    # Create a DepthAI pipeline
    pipeline = dai.Pipeline()
    
    # Define source and output for RGB camera
    camRgb = pipeline.create(dai.node.ColorCamera)
    xoutVideo = pipeline.create(dai.node.XLinkOut)
    xoutVideo.setStreamName("video")
    
    # Define source and output for stereo camera
    monoLeft = pipeline.create(dai.node.MonoCamera)
    monoRight = pipeline.create(dai.node.MonoCamera)
    stereo = pipeline.create(dai.node.StereoDepth)
    
    # RGB camera properties
    camRgb.setBoardSocket(dai.CameraBoardSocket.RGB)
    camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
    camRgb.setVideoSize(640, 480)
    xoutVideo.input.setBlocking(False)
    xoutVideo.input.setQueueSize(1)
    
    # Stereo camera properties
    monoLeft.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
    monoLeft.setBoardSocket(dai.CameraBoardSocket.LEFT)
    monoRight.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
    monoRight.setBoardSocket(dai.CameraBoardSocket.RIGHT)
    stereo.initialConfig.setConfidenceThreshold(255)
    stereo.setLeftRightCheck(True)
    stereo.setSubpixel(False)
    
    # Linking
    camRgb.video.link(xoutVideo.input)
    monoLeft.out.link(stereo.left)
    monoRight.out.link(stereo.right)
    
    xoutDepth = pipeline.create(dai.node.XLinkOut)
    xoutDepth.setStreamName("depth")
    stereo.depth.link(xoutDepth.input)
    
    # Connect to device and start pipeline
    with dai.Device(pipeline) as device:
        videoQueue = device.getOutputQueue(name="video", maxSize=1, blocking=False)
        depthQueue = device.getOutputQueue(name="depth", maxSize=1, blocking=False)
        hostSpatials = HostSpatialsCalc(device)
    
        color = (255, 255, 255)
    
        while True:
            inDepth = depthQueue.get()  # Get depth frame
            depthFrame = inDepth.getFrame()
    
            videoIn = videoQueue.get()  # Get video frame
            frame = videoIn.getCvFrame()
    
            # Perform YOLOv8 inference on the frame
            results = model(frame)
    
            # Annotate frame with YOLO predictions
            annotated_frame = results[0].plot()
    
            # Process detections and calculate spatial information
            for box in results[0].boxes:
                # Coordinates of the bounding box (pixels in the RGB frame)
                x1, y1, x2, y2 = box.xyxy[0].tolist()
                
                # Calculate spatial information for the ROI
                spatials, centroid = hostSpatials.calc_spatials(inDepth, (x1, y1, x2, y2))
    
                # Display spatial information on the frame
                cv2.putText(annotated_frame, f"Z: {spatials['z']/1000:.2f}m", (int(x1), int(y1)), cv2.FONT_HERSHEY_TRIPLEX, 0.5, color)
    
            # Display the annotated frame
            cv2.imshow("YOLOv8 Inference", annotated_frame)
    
            if cv2.waitKey(1) == ord('q'):
                break
    
    cv2.destroyAllWindows()
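If I had to guess at the cause of the freeze: your original loop calls depth.get() twice per iteration on a queue created with the default (blocking) settings, so once YOLO inference slows the loop down the queue fills up and the device side stalls waiting for the host, which would explain the stream freezing after a few frames. The code above avoids that by creating the host queues with maxSize=1, blocking=False and reading each queue once per iteration, along these lines:

    # Original: two blocking reads from the same queue every iteration
    # depthFrame = depth.get().getFrame()
    # depthData  = depth.get()

    # Instead: one message per iteration, from a small non-blocking queue
    depthQueue = device.getOutputQueue(name="depth", maxSize=1, blocking=False)
    depthData = depthQueue.get()
    depthFrame = depthData.getFrame()  # reuse the same message for both uses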

Thoughts?