Attached are two screenshots. The first shows gen2 example #26 detecting a "sports ball" and reporting its Z-depth as 67" (displayed via f"Z: {int(detection.spatialCoordinates.z / 25.4)} in"). The second shows my derivation of gen2 example #03, with distance calculated from disparity using the OAK-D intrinsics. Apart from requiring a kludge factor of 0.5 on all distances, it seems quite accurate, showing the ball at 53". The actual distance to the nearest point is about 50". Since my example #03 derivation averages the depths of 10 pixels on either side of the center (all pixels inside the drawn square), it should report a value slightly above 50"... and I'm not using subpixel in this example, so there could be some error from that as well. The accuracy of the disparity-calculated distances is adequate for my purposes, but I need the distances of model-detected objects.
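For reference, the #03 derivation computes distance from disparity with the standard stereo relation; here's a minimal sketch using the same OAK-D numbers that appear in my code below (disparity_px is just a placeholder for the per-pixel disparity value):

baseline = 75      # mm (OAK-D stereo baseline)
focal = 883.15     # px (OAK-D intrinsics, as I'm using them)
depth_mm = baseline * focal / disparity_px
# e.g. for a target at a true 50" (~1270 mm), the expected disparity is
# roughly 883.15 * 75 / 1270, i.e. about 52 px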
I've tested at other distances, with similar results: the disparity-calculated distances remain reasonably accurate; the yolo Z-depth is consistently high and seems to get proportionally more "too high" as actual object distance increases.
I had previously been using the gen1 yolo depths example, and had found that they were also too high, which led me to these experiments with gen2.
The example #26 code I'm using is as provided in https://github.com/luxonis/depthai-python/pull/163 except that I modified the path to the .blob file, and converted distances from mm to inches. I'm using the yolo v3 blob file that the README points to.
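For clarity, the mm-to-inch change amounts to something like this (a sketch only; detection is one of the spatial detections read from the network's output queue in that example, and the text position x1, y1 is just illustrative):

z_inches = int(detection.spatialCoordinates.z / 25.4)  # spatialCoordinates.z is reported in mm
cv2.putText(frame, f"Z: {z_inches} in", (x1, y1 + 60), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))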
The example #03-derived code is attached below.
My real question is, why are the yolo depths consistently too high?
My example #03-derived code follows. Thanks in advance for any guidance!
Derrell
#!/usr/bin/env python3
import cv2
import depthai as dai
import struct
import numpy as np
# Start defining a pipeline
pipeline = dai.Pipeline()
# Define a source - two mono (grayscale) cameras
left = pipeline.createMonoCamera()
left.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
left.setBoardSocket(dai.CameraBoardSocket.LEFT)
right = pipeline.createMonoCamera()
right.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
right.setBoardSocket(dai.CameraBoardSocket.RIGHT)
# Create a node that will produce the depth map (using disparity output as it's easier to visualize depth this way)
depth = pipeline.createStereoDepth()
depth.setConfidenceThreshold(200)
left.out.link(depth.left)
right.out.link(depth.right)
# Create disparity output
xout = pipeline.createXLinkOut()
xout.setStreamName("disparity")
depth.disparity.link(xout.input)
# Create mono (right) output
monoout = pipeline.createXLinkOut()
monoout.setStreamName("mono")
right.out.link(monoout.input)
# OAK-D stereo parameters
baseline = 75  # mm
focal = 883.15  # px
max_disp = 96
disp_type = np.uint8
disp_levels = 1
kludge = 0.5  # each distance is double what it should be. Why???
# Pipeline defined, now the device is connected to
with dai.Device(pipeline) as device:
    # Start pipeline
    device.startPipeline()
    # Output queues will be used to get the disparity and mono frames from the outputs defined above
    q = device.getOutputQueue(name="disparity", maxSize=4, blocking=False)
    qMono = device.getOutputQueue(name="mono", maxSize=4, blocking=False)
    while True:
        in_depth = q.get()  # blocking call, will wait until new data has arrived
        # data is originally represented as a flat 1D array; it needs to be reshaped into HxW form
        frame = in_depth.getData().reshape((in_depth.getHeight(), in_depth.getWidth())).astype(np.uint8)
        height = in_depth.getHeight()
        width = in_depth.getWidth()
        # Compute depth from disparity: depth (mm) = disp_levels * baseline (mm) * focal (px) / disparity (px)
        disp = np.array(in_depth.getData()).astype(np.uint8).view(disp_type).reshape((height, width))
        with np.errstate(divide='ignore'):  # Should be safe to ignore div by zero here
            distData = (disp_levels * baseline * focal / disp).astype(np.uint16)
        distData = distData / 25.4 * kludge  # mm -> in; kludge to fix unknown doubling
        # Find the average distance in the small rectangular region at the center of the frame
        region = []
        for y in range(int(height / 2 - 10), int(height / 2 + 10)):
            for x in range(int(width / 2 - 10), int(width / 2 + 10)):
                region.append(distData[y][x])
        distance = sum(region) / len(region)
        frame = np.ascontiguousarray(frame)
        # frame is transformed, the color map will be applied to highlight the depth info
        frame = cv2.applyColorMap(frame, cv2.COLORMAP_JET)
        # frame is ready to be shown
        cv2.imshow("disparity", frame)
        in_mono = qMono.get()
        monoframe = in_mono.getData().reshape((height, width)).astype(np.uint8)
        # Draw a rectangle in the middle of the frame
        pt1 = int(width / 2 - 10), int(height / 2 - 10)
        pt2 = int(width / 2 + 10), int(height / 2 + 10)
        cv2.rectangle(monoframe, pt1, pt2, (255, 255, 255), 2)
        cv2.putText(monoframe, f"{int(distance)} in",
                    (int(width / 2 - 10), int(height / 2 - 10) - 20),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
        cv2.imshow("mono", monoframe)
        if cv2.waitKey(1) == ord('q'):
            break
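For what it's worth, the averaging loop above is equivalent to taking a numpy slice over the same 20x20-pixel window centered in the frame; I spell it out here only to make the sampling region explicit:

cy, cx = int(height / 2), int(width / 2)
distance = distData[cy - 10:cy + 10, cx - 10:cx + 10].mean()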