I'm doing some non-standard operations with point cloud data, and the built-in point cloud features in the depthai library weren't suited to my use case, so I wrote my own code to create a point cloud from disparity. However, when I calculate coordinates from the disparity I get some inaccuracy that I just can't seem to hunt down. For background, this is being done with an OAK-D Pro Wide.
The process is as follows. Take the disparity from the camera and convert it to depth. Convert that perpendicular depth to true (radial) depth. Use the true depth and the position where the pixel appears in the image to convert from true depth (thought of in spherical coordinates) to a real-world position in Cartesian coordinates. Lastly, take the highest value at each position in the point cloud to create something like a digital elevation model (DEM) or a 2D occupancy grid.
This process mostly works, but I have noticed some inaccuracy in the height values in the point cloud. You can see this in the image, where the values towards the edge of the field of view start to differ considerably from what they should be. The camera is pointed at a corner where one wall meets another, so most of the DEM should just be the flat floor, which is all the same height, plus the walls, which extend taller than the 2-meter max height I selected.
There are two possible causes I can think of. First, when converting from disparity to Cartesian coordinates via spherical coordinates, I assumed that the pixels are linearly spaced in angle: if the horizontal FOV is 90 degrees and the image is 900 pixels wide, then each pixel you move away from the centerline adds 0.1 degrees. This made the math much easier in the calcSphericalConversionArrays function, but it may not be entirely correct. I based the assumption on the idea that if the pixel angles were spaced non-linearly there would be visible distortion in the image, and I don't see much of that.

The second idea is that the linear-spacing assumption is safe, but I simply have the wrong FOV values. I wasn't sure how to calculate the effective FOV of the RGB-aligned depth, so I put the camera close to a wall with measuring tapes and worked it out manually, ending up with HFOV=96 and VFOV=73. I have tried changing these values and checking, but I haven't found a combination that eliminates the problem entirely.
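To make the first concern concrete, here is a quick comparison (illustrative only, using the example numbers above rather than my real calibration) between the linear-spacing assumption and a pinhole model, where the per-pixel angle is an arctangent of the pixel offset rather than a linear function of it:

import numpy as np

# Hypothetical 900 px wide image with a 90 deg HFOV (the example numbers from the text).
W, HFOV = 900, 90.0
u = np.arange(W) - (W - 1) / 2.0                 # pixel offset from the image center
fx = (W / 2) / np.tan(np.deg2rad(HFOV / 2))      # pinhole focal length in pixels (450 here)

linear_deg = u * (HFOV / W)                      # linear assumption: 0.1 deg per pixel
pinhole_deg = np.degrees(np.arctan(u / fx))      # pinhole model: angle = arctan(u / fx)

# The two agree at the center and (by construction) at the edges, but halfway out
# (225 px from the center) linear gives 22.5 deg while the pinhole model gives ~26.6 deg.
print(np.max(np.abs(linear_deg - pinhole_deg)))  # ~4 deg of disagreement inside the FOV

If the pinhole model is the right one, fx/fy and the principal point for the RGB-aligned output could be pulled from the device calibration instead of being estimated with a tape measure (I believe calibData.getCameraIntrinsics(rgbCamSocket, 1280, 800) returns the 3x3 intrinsic matrix at that resolution).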
Here is the pipeline along with the functions I'm using to do the calculations.
import cv2
import numpy as np
import depthai as dai
from datetime import timedelta

# Create pipeline
pipeline = dai.Pipeline()
device = dai.Device()
# Define the nodes
camRgb = pipeline.create(dai.node.Camera)
left = pipeline.create(dai.node.MonoCamera)
right = pipeline.create(dai.node.MonoCamera)
stereo = pipeline.create(dai.node.StereoDepth)
imu = pipeline.create(dai.node.IMU)
sync = pipeline.create(dai.node.Sync)
x_out = pipeline.create(dai.node.XLinkOut)
# Set stream names
x_out.setStreamName("xout")
# Set properties for rgb node
rgbCamSocket = dai.CameraBoardSocket.CAM_A
camRgb.setBoardSocket(rgbCamSocket)
camRgb.setSize(1280, 800)
camRgb.setFps(fps)
camRgb.setMeshSource(dai.CameraProperties.WarpMeshSource.CALIBRATION)
# For now, RGB needs fixed focus to properly align with depth.
try:
    calibData = device.readCalibration2()
    lensPosition = calibData.getLensPosition(rgbCamSocket)
    if lensPosition:
        camRgb.initialControl.setManualFocus(lensPosition)
except:
    raise
# Set properties for left/right nodes
# The disparity is computed at this resolution, then upscaled to RGB resolution
monoResolution = dai.MonoCameraProperties.SensorResolution.THE_400_P
left.setResolution(monoResolution)
left.setCamera("left")
left.setFps(fps)
right.setResolution(monoResolution)
right.setCamera("right")
right.setFps(fps)
# Set properties for stereo node
stereo.setDefaultProfilePreset(dai.node.StereoDepth.PresetMode.HIGH_DENSITY)
stereo.initialConfig.setConfidenceThreshold(200)
stereo.initialConfig.setMedianFilter(dai.MedianFilter.KERNEL_5x5)
stereo.setLeftRightCheck(True) # LR-check is required for depth alignment
stereo.setSubpixel(True) # This adds a lot of lag on 800p mono resolution
stereo.setDepthAlign(rgbCamSocket)
stereo.setDisparityToDepthUseSpecTranslation(False)
stereo.setDepthAlignmentUseSpecTranslation(False)
# Set properties for IMU node
# imu.enableIMUSensor(dai.IMUSensor.ROTATION_VECTOR, 120)
imu.enableIMUSensor(dai.IMUSensor.GAME_ROTATION_VECTOR, 120)
imu.setBatchReportThreshold(1)
imu.setMaxBatchReports(1)
# Set properties for sync node
sync.setSyncThreshold(timedelta(milliseconds=30))
sync.setSyncAttempts(-1) # Default, only sends message if all input nodes are in sync
# Linking
left.out.link(stereo.left)
right.out.link(stereo.right)
stereo.disparity.link(sync.inputs["disparity"])
camRgb.video.link(sync.inputs["video"])
imu.out.link(sync.inputs["imu"])
sync.out.link(x_out.input)
# Get max disparity to normalize the disparity images
disparityMultiplier = 255.0 / stereo.initialConfig.getMaxDisparity()
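On the host side, the synced output is read roughly like this (a simplified sketch, not my exact code; the queue parameters and the msgs dict are just illustrative, and the iteration pattern follows the depthai Sync node examples):

device.startPipeline(pipeline)
q = device.getOutputQueue(name="xout", maxSize=4, blocking=False)

msgGrp = q.get()                                  # dai.MessageGroup produced by the Sync node
msgs = {name: msg for name, msg in msgGrp}        # unpack the group by input name
disparity_frame = msgs["disparity"].getFrame()    # uint16 disparity (subpixel enabled)
rgb_frame = msgs["video"].getCvFrame()            # BGR frame for display
imu_packets = msgs["imu"].packets                 # dai.IMUData -> list of IMU packets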
def calcDepthFromDisparity(disparity_cv2_frame):
    # Calculate the depth frame from disparity
    with np.errstate(divide='ignore', invalid='ignore'):
        depth = FOCAL_LENGTH_PIXELS * BASELINE / (disparity_cv2_frame / 8.0212)  # Produces perpendicular depth in cm
    depth[depth == np.inf] = np.nan  # Set the infinities (div by 0) to NaN
    # Convert from perpendicular depth to true depth
    depth = depth * true_depth_coefs
    depth[depth > DEPTH_CAP] = np.nan  # Set everything past the depth cap to NaN
    # Create an image of depth for cv2
    depth_cv2_frame = (np.nan_to_num(depth) * 255 / DEPTH_CAP).astype(np.uint8)
    depth_cv2_frame = cv2.applyColorMap(depth_cv2_frame, cv2.COLORMAP_HOT)
    return depth, depth_cv2_frame
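Side note on the 8.0212 divisor above: with setSubpixel(True), DepthAI outputs disparity with fractional bits (3 by default, as far as I know), so the nominal divisor would be a power of two. SUBPIXEL_FRACTIONAL_BITS below is my assumption of that default:

# Assumption: 3 subpixel fractional bits (the DepthAI default when subpixel is enabled),
# so raw disparity values are in steps of 1/8 pixel.
SUBPIXEL_FRACTIONAL_BITS = 3
subpixel_divisor = 2 ** SUBPIXEL_FRACTIONAL_BITS  # = 8, close to the 8.0212 tuned above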
def calcTrueDepthCoefs():
    # Heading will always be zero since this is only calculating the distance compensation for spherical camera lens
    # Convert the heading from north to theta in spherical system, equation is 90deg-heading
    theta = 90 - 0
    # Compute the arrays for true depth conversion
    horiz_trig_rads_abs_array = (theta - abs(np.linspace(-HFOV / 2, HFOV / 2, num=IMAGE_WIDTH))) * (np.pi / 180)
    horiz_trig_rads_abs_array = np.tile(horiz_trig_rads_abs_array, (IMAGE_HEIGHT, 1))
    vert_trig_rads_abs_array = (90 - abs(np.linspace(-VFOV / 2, VFOV / 2, num=IMAGE_HEIGHT))) * (np.pi / 180)
    vert_trig_rads_abs_array = np.transpose(np.tile(vert_trig_rads_abs_array, (IMAGE_WIDTH, 1)))
    triang_horiz_hypot = 1 / np.sin(horiz_trig_rads_abs_array)
    triang_vert_hypot = 1 / np.sin(vert_trig_rads_abs_array)
    triang_vert_adj = np.sqrt(np.square(triang_vert_hypot) - 1)
    true_depth_coef_array = np.sqrt(np.square(triang_horiz_hypot) + np.square(triang_vert_adj))
    return true_depth_coef_array
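As a sanity check on the compensation factor itself (separate from the question of which angles to feed it), the coefficient built above reduces to the usual perpendicular-depth-to-ray-length factor sqrt(1 + tan^2(h) + tan^2(v)). A quick illustrative check (the sample angle grids below are arbitrary, just assuming HFOV=96 and VFOV=73):

# Illustrative check: csc^2(90deg - |h|) + (csc^2(90deg - |v|) - 1) == 1 + tan^2(h) + tan^2(v),
# so calcTrueDepthCoefs matches the standard perpendicular-to-radial conversion for whatever
# angle grid it is given.
h = np.deg2rad(np.linspace(-48, 48, 7))
v = np.deg2rad(np.linspace(-36.5, 36.5, 5))
hh, vv = np.meshgrid(h, v)
coef_as_written = np.sqrt(1 / np.sin(np.pi / 2 - np.abs(hh)) ** 2
                          + (1 / np.sin(np.pi / 2 - np.abs(vv)) ** 2 - 1))
coef_standard = np.sqrt(1 + np.tan(hh) ** 2 + np.tan(vv) ** 2)
assert np.allclose(coef_as_written, coef_standard)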
def calcSphericalConversionArrays(heading, phi):
    # Convert the heading from north to theta in spherical system, equation is 90deg-heading
    theta = 90 - heading
    # Compute the arrays for real world position conversion
    horiz_angle_rads_array = (theta + np.linspace(HFOV / 2, -HFOV / 2, num=IMAGE_WIDTH)) * (np.pi / 180)
    horiz_angle_rads_array = np.tile(horiz_angle_rads_array, (IMAGE_HEIGHT, 1))
    vert_angle_rads_array = (phi + np.linspace(-VFOV / 2, VFOV / 2, num=IMAGE_HEIGHT)) * (np.pi / 180)
    vert_angle_rads_array = np.transpose(np.tile(vert_angle_rads_array, (IMAGE_WIDTH, 1)))
    return horiz_angle_rads_array, vert_angle_rads_array
def calcWorldCoordinates(depth, heading, pitch, min_height, max_height, model_voxel_size):
    # Calculate the real world coordinates
    horiz_angle_rads, vert_angle_rads = calcSphericalConversionArrays(heading, pitch)
    # Convert from true depth to world position in meters
    x_coords = depth / 100 * np.sin(vert_angle_rads) * np.cos(horiz_angle_rads)
    y_coords = depth / 100 * np.sin(vert_angle_rads) * np.sin(horiz_angle_rads)
    z_coords = depth / 100 * np.cos(vert_angle_rads)
    z_coords = z_coords + 0.60  # Account for the height of the camera on the tripod
    # Select points within a height range in the point cloud
    selected_indices = np.where((z_coords >= min_height) & (z_coords <= max_height))  # & (x_coords != 0) & (y_coords != 0))
    # Round the x and y coords to the selected resolution and the z coords to 1cm resolution
    x_filtered = np.round(x_coords[selected_indices].flatten() / model_voxel_size, decimals=2)
    y_filtered = np.round(y_coords[selected_indices].flatten() / model_voxel_size, decimals=2)
    z_filtered = np.round(z_coords[selected_indices].flatten(), decimals=2)
    return x_filtered, y_filtered, z_filtered
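For completeness, the last step described at the top (keeping the highest z value in each x/y cell to form the DEM / occupancy grid) works roughly like this (a simplified sketch, not my exact code; buildDEM is just an illustrative name):

def buildDEM(x_filtered, y_filtered, z_filtered):
    # Keep the highest z value seen in each (x, y) cell of the grid.
    dem = {}
    for x, y, z in zip(x_filtered, y_filtered, z_filtered):
        cell = (x, y)
        if cell not in dem or z > dem[cell]:
            dem[cell] = z
    return dem

# Example usage with the outputs of calcWorldCoordinates:
# dem = buildDEM(x_filtered, y_filtered, z_filtered)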