Hi,
I am using the OAK-D Lite as a dashcam and trying to measure the distance to vehicles and pedestrians in front of my vehicle, using the pedestrian-and-vehicle-detector-adas-0001 model. I'm getting around 10 FPS, which I am OK with. The object detection works well and fast enough; the tracking works reasonably well, and I think it can be improved further with filters etc.
However, the thing I'm not happy with is the depth sensing. I've set the minimum distance to 100 cm and the maximum to 15 meters. The readings are wildly off, especially the Z coordinate: they oscillate anywhere from 1 meter to 12 meters.
Am I doing something wrong in the code? Or is this expected, given that the OAK-D Lite uses passive stereo depth sensing and vehicles are fairly featureless from some angles?
What are your suggestions? Do you think the OAK-D Lite, or any other OAK-D, is good enough for this application?
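For reference, here is the back-of-the-envelope stereo-error calculation behind my worry (a rough sketch; the ~7.5 cm baseline and ~450 px focal length at 400P are assumed ballpark values, the real numbers come from the device calibration, same formula as in my code below):

# Rough sanity check: theoretical stereo depth error per pixel of disparity error.
# Baseline and focal length are assumed ballpark values for the OAK-D Lite at
# 400P; the exact numbers should be read from the device calibration.
baseline_mm = 75   # ~7.5 cm stereo baseline (assumed)
focal_px = 450     # ~450 px focal length at 640x400 (assumed)
for z_mm in (1000, 5000, 10000, 15000):
    err_mm = z_mm ** 2 / (baseline_mm * focal_px)
    print(f'Z = {z_mm / 1000:>4.0f} m -> ~{err_mm / 1000:.2f} m per px of disparity error')

At 10 m this already works out to roughly 3 m per pixel of disparity error, so meter-scale jumps in Z at distance seem at least in the right ballpark for passive stereo.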
My setup:
Oak D Lite
Raspberry Pi 4B 4GB RAM
USB 3 cable
import depthai as dai
import blobconverter
import cv2
import numpy as np
import time
import psutil
import datetime
label_map = ['unknown', 'vehicle', 'pedestrian']
class KalmanFilter(object):
    def __init__(self, acc_std, meas_std, z, time):
        self.dim_z = len(z)
        self.time = time
        self.acc_std = acc_std
        self.meas_std = meas_std
        # the observation matrix: selects the measured (position) components of the state
        self.H = np.eye(self.dim_z, 3 * self.dim_z)
        # state vector: measurement, velocity and acceleration stacked
        self.x = np.vstack((z, np.zeros((2 * self.dim_z, 1))))
        self.P = np.zeros((3 * self.dim_z, 3 * self.dim_z))
        i, j = np.indices((3 * self.dim_z, 3 * self.dim_z))
        self.P[(i - j) % self.dim_z == 0] = 1e5  # initial vector is a guess -> high estimate uncertainty
    def predict(self, dt):
        # the state transition matrix -> assuming acceleration is constant
        F = np.eye(3 * self.dim_z)
        np.fill_diagonal(F[:2 * self.dim_z, self.dim_z:], dt)
        np.fill_diagonal(F[:self.dim_z, 2 * self.dim_z:], dt ** 2 / 2)
        # the process noise matrix
        A = np.zeros((3 * self.dim_z, 3 * self.dim_z))
        np.fill_diagonal(A[2 * self.dim_z:, 2 * self.dim_z:], 1)
        Q = self.acc_std ** 2 * F @ A @ F.T
        self.x = F @ self.x
        self.P = F @ self.P @ F.T + Q
    def update(self, z):
        if z is None:
            return
        # the measurement uncertainty
        R = self.meas_std ** 2 * np.eye(self.dim_z)
        # the Kalman gain
        K = self.P @ self.H.T @ np.linalg.inv(self.H @ self.P @ self.H.T + R)
        self.x = self.x + K @ (z - self.H @ self.x)
        I = np.eye(3 * self.dim_z)
        # Joseph-form covariance update (numerically stable)
        self.P = (I - K @ self.H) @ self.P @ (I - K @ self.H).T + K @ R @ K.T
# Create pipeline
pipeline = dai.Pipeline()
#imu = pipeline.create(dai.node.IMU)
# enable ACCELEROMETER_RAW and GYROSCOPE_RAW at 100 hz rate
#imu.enableIMUSensor([dai.IMUSensor.ACCELEROMETER_RAW], 100)
# above this threshold packets will be sent in batch of X, if the host is not blocked and USB bandwidth is available
#imu.setBatchReportThreshold(1)
# maximum number of IMU packets in a batch, if it's reached device will block sending until host can receive it
# if lower or equal to batchReportThreshold then the sending is always blocking on device
# useful to reduce device's CPU load and number of lost packets, if CPU load is high on device side due to multiple nodes
#imu.setMaxBatchReports(10)
cam_rgb = pipeline.create(dai.node.ColorCamera)
detection_network = pipeline.create(dai.node.MobileNetSpatialDetectionNetwork)
mono_left = pipeline.create(dai.node.MonoCamera)
mono_right = pipeline.create(dai.node.MonoCamera)
stereo = pipeline.create(dai.node.StereoDepth)
object_tracker = pipeline.create(dai.node.ObjectTracker)
xout_rgb = pipeline.create(dai.node.XLinkOut)
tracker_out = pipeline.create(dai.node.XLinkOut)
xout_rgb.setStreamName('rgb')
tracker_out.setStreamName('tracklets')
cam_rgb.setImageOrientation(dai.CameraImageOrientation.ROTATE_180_DEG)
cam_rgb.setPreviewSize(672, 384)
cam_rgb.setBoardSocket(dai.CameraBoardSocket.RGB)
cam_rgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
cam_rgb.setInterleaved(False)
cam_rgb.setPreviewNumFramesPool(15)
mono_left.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
mono_left.setBoardSocket(dai.CameraBoardSocket.LEFT)
mono_right.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
mono_right.setBoardSocket(dai.CameraBoardSocket.RIGHT)
mono_left.setFps(10)
mono_right.setFps(10)
cam_rgb.setFps(10)
stereo.setDefaultProfilePreset(dai.node.StereoDepth.PresetMode.HIGH_DENSITY)
stereo.setDepthAlign(dai.CameraBoardSocket.RGB)
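# Not in my original pipeline -- options I understand can improve long-range
# depth at some extra compute cost (both exist on the StereoDepth node):
# stereo.setLeftRightCheck(True)  # discard occluded / inconsistent disparity matches
# stereo.setSubpixel(True)        # fractional disparity -> finer depth steps far away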
detection_network.setBlobPath('./pedestrian-and-vehicle-detector-adas-0001.blob')
detection_network.setConfidenceThreshold(0.5)
detection_network.input.setBlocking(False)
detection_network.setBoundingBoxScaleFactor(0.7)
detection_network.setNumInferenceThreads(2)
#detection_network.setNumClasses(2)
detection_network.setDepthLowerThreshold(100)
detection_network.setDepthUpperThreshold(15000)
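# Assumption on my part (needs a recent depthai release): the ROI-to-depth
# reduction can be changed; MEDIAN is less sensitive to background pixels that
# the 0.7-scaled bounding box may still include:
# detection_network.setSpatialCalculationAlgorithm(dai.SpatialLocationCalculatorAlgorithm.MEDIAN)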
object_tracker.setDetectionLabelsToTrack([1, 2])  # track only vehicles and pedestrians
object_tracker.setTrackerType(dai.TrackerType.ZERO_TERM_COLOR_HISTOGRAM)
object_tracker.setTrackerIdAssignmentPolicy(dai.TrackerIdAssignmentPolicy.UNIQUE_ID)
mono_left.out.link(stereo.left)
mono_right.out.link(stereo.right)
cam_rgb.preview.link(detection_network.input)
object_tracker.passthroughTrackerFrame.link(xout_rgb.input)
object_tracker.out.link(tracker_out.input)
detection_network.passthrough.link(object_tracker.inputTrackerFrame)
detection_network.passthrough.link(object_tracker.inputDetectionFrame)
detection_network.out.link(object_tracker.inputDetections)
stereo.depth.link(detection_network.inputDepth)
# Connect to device and start pipeline
with dai.Device(pipeline) as device:
    calibration_handler = device.readCalibration()
    baseline = calibration_handler.getBaselineDistance() * 10  # cm -> mm
    focal_length = calibration_handler.getCameraIntrinsics(dai.CameraBoardSocket.RIGHT, 640, 400)[0][0]
    q_rgb = device.getOutputQueue(name='rgb', maxSize=10, blocking=False)
    q_tracklets = device.getOutputQueue(name='tracklets', maxSize=10, blocking=False)
    kalman_filters = {}
    startTime = time.monotonic()
    counter = 0
    fps = 0
    color = (255, 255, 255)
    # Use the current date as a timestamp to generate unique file names.
    date = datetime.datetime.now().strftime('%m-%d-%Y_%H.%M.%S')
    output_file_name = '/home/pi/data/road_footage/output_' + date + '.avi'
    # Define the codec and create the VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(output_file_name, fourcc, 10.0, (672, 384))
    while True:
        frame = q_rgb.get().getCvFrame()
        tracklets = q_tracklets.get()
        current_time = tracklets.getTimestamp()
        h, w = frame.shape[:2]
        for t in tracklets.tracklets:
            roi = t.roi.denormalize(w, h)
            x1 = int(roi.topLeft().x)
            y1 = int(roi.topLeft().y)
            x2 = int(roi.bottomRight().x)
            y2 = int(roi.bottomRight().y)
            x_space = t.spatialCoordinates.x
            y_space = t.spatialCoordinates.y
            z_space = t.spatialCoordinates.z
            meas_vec_bbox = np.array([[(x1 + x2) / 2], [(y1 + y2) / 2], [x2 - x1], [y2 - y1]])
            meas_vec_space = np.array([[x_space], [y_space], [z_space]])
            # stereo depth noise grows with Z^2 / (baseline * focal length)
            meas_std_space = z_space ** 2 / (baseline * focal_length)
            if t.status.name == 'NEW':
                # Adjust these parameters
                acc_std_space = 10
                acc_std_bbox = 0.1
                meas_std_bbox = 0.05
                kalman_filters[t.id] = {'bbox': KalmanFilter(acc_std_bbox, meas_std_bbox, meas_vec_bbox, current_time),
                                        'space': KalmanFilter(acc_std_space, meas_std_space, meas_vec_space, current_time)}
                print('Created new tracklet:', t.id)
            else:
                try:
                    dt = current_time - kalman_filters[t.id]['bbox'].time
                except KeyError:
                    print('Missed id:', t.id)
                    continue
                dt = dt.total_seconds()
                # keep the measurement noise consistent with the current distance
                kalman_filters[t.id]['space'].meas_std = meas_std_space
                if t.status.name != 'TRACKED':
                    meas_vec_bbox = None
                    meas_vec_space = None
                if z_space == 0:
                    # no valid depth for this ROI -> predict only, skip the update
                    meas_vec_space = None
                kalman_filters[t.id]['bbox'].predict(dt)
                kalman_filters[t.id]['bbox'].update(meas_vec_bbox)
                kalman_filters[t.id]['space'].predict(dt)
                kalman_filters[t.id]['space'].update(meas_vec_space)
                kalman_filters[t.id]['bbox'].time = current_time
                kalman_filters[t.id]['space'].time = current_time
                vec_bbox = kalman_filters[t.id]['bbox'].x
                vec_space = kalman_filters[t.id]['space'].x
                x1_filter = int(vec_bbox[0] - vec_bbox[2] / 2)
                x2_filter = int(vec_bbox[0] + vec_bbox[2] / 2)
                y1_filter = int(vec_bbox[1] - vec_bbox[3] / 2)
                y2_filter = int(vec_bbox[1] + vec_bbox[3] / 2)
                # cv2.rectangle(frame, (x1_filter, y1_filter), (x2_filter, y2_filter), (0, 0, 255), 2)
                # cv2.putText(frame, f'X: {int(vec_space[0])} mm', (x1 + 10, y1 + 110), cv2.FONT_HERSHEY_TRIPLEX, 0.5, (0, 0, 255))
                # cv2.putText(frame, f'Y: {int(vec_space[1])} mm', (x1 + 10, y1 + 125), cv2.FONT_HERSHEY_TRIPLEX, 0.5, (0, 0, 255))
                # cv2.putText(frame, f'Z: {int(vec_space[2])} mm', (x1 + 10, y1 + 140), cv2.FONT_HERSHEY_TRIPLEX, 0.5, (0, 0, 255))
            # print(t.label)
            try:
                label = label_map[t.label]
            except IndexError:
                label = t.label
            cv2.putText(frame, str(label), (x1 + 10, y1 + 20), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
            cv2.putText(frame, f'ID: {t.id}', (x1 + 10, y1 + 35), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
            cv2.putText(frame, t.status.name, (x1 + 10, y1 + 50), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
            cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0))
            cv2.putText(frame, f'X: {int(x_space)} mm', (x1 + 10, y1 + 65), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
            cv2.putText(frame, f'Y: {int(y_space)} mm', (x1 + 10, y1 + 80), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
            cv2.putText(frame, f'Z: {int(z_space)} mm', (x1 + 10, y1 + 95), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
        counter += 1
        current_time = time.monotonic()
        if (current_time - startTime) > 1:
            fps = counter / (current_time - startTime)
            counter = 0
            startTime = current_time
        cv2.putText(frame, 'NN fps: {:.2f}'.format(fps), (2, frame.shape[0] - 4), cv2.FONT_HERSHEY_TRIPLEX, 0.4, color)
        # print('fps: {:.2f}'.format(fps))
        cv2.imshow('tracker', frame)
        bytes_avail = psutil.disk_usage('/').free
        # gigabytes_avail = bytes_avail / 1024 / 1024 / 1024
        # print(gigabytes_avail)
        if bytes_avail > 1073741824:  # keep recording while more than 1 GiB is free
            out.write(frame)
        else:
            break
        if cv2.waitKey(1) == ord('q'):
            break

out.release()
cv2.destroyAllWindows()