Hello everyone,
I'm having a problem aligning RGB and depth.
Below are the details: hardware, code, and an explanation of the issues.
Hardware
OAK-D S2 POE (RVC2) / depthai-core: 3.0.0
Goal
Final goal: detect an object in the RGB image and measure its height using the depth values.
- transform camera coordinates to world coordinates, where Z is the height above the ground (see the sketch after this list)
- RGB-depth alignment
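For reference, this is the transform I have in mind (a minimal numpy sketch; K comes from getCameraIntrinsics, R and t describe my camera mounting as in the C++ code further down, and pixel_to_world is just an illustrative name):

import numpy as np

def pixel_to_world(u, v, z_m, K, R, t):
    # Back-project pixel (u, v) with depth z_m (meters) into the camera
    # frame, then move it into the world frame; world Z = height above ground.
    fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]
    p_cam = np.array([(u - cx) / fx * z_m,  # X: right
                      (v - cy) / fy * z_m,  # Y: down
                      z_m])                 # Z: forward
    return R @ p_cam + t                    # [Xw, Yw, Zw], Zw = height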
Issue
Depth from StereoDepth is most accurate when the stereo output is 640×480 (I confirmed this). However, I can’t find an RGB output size that aligns exactly with the 640×480 depth image.
I followed the tutorial below, but the depth image always comes out bigger than the RGB image.
Please refer to the attached image.
luxonis/depthai-core/blob/main/examples/python/ImageAlign/depth_align.py
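For reference, here is the aspect-ratio arithmetic that I suspect is behind the mismatch (my assumption, not confirmed):

depth_ar = 640 / 480    # = 1.333... (4:3)  stereo/depth output
rgb_ar   = 1280 / 1152  # = 1.111... (10:9) RGB size I requested
# The ratios differ, so aligning one onto the other has to scale or crop
# somewhere, which would explain why the overlay never lands exactly.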

Questions
How does ImageAlign work (scaling ratio or other settings)?
How do I get an RGB configuration that best fits the 640×480 stereo output? (My current guess is sketched below.)
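My guess so far: the RGB output should share the stereo output's 4:3 aspect ratio, e.g. 1280×960, so ImageAlign only has to scale, not reshape. This uses the same ImgFrameCapability API as in my code below, but I haven't verified it's the right approach:

capRgb43 = dai.ImgFrameCapability()
capRgb43.size.fixed((1280, 960))  # 4:3, same aspect ratio as 640x480 depth
capRgb43.resizeMode = dai.ImgResizeMode.CROP
capRgb43.fps.fixed(FPS)
rgbOut = camRgb.requestOutput(capRgb43, True)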
Code
1. RGB and depth alignment
#!/usr/bin/env python3
import numpy as np
import cv2
import depthai as dai
import time
from datetime import timedelta
FPS = 25.0
RGB_SOCKET = dai.CameraBoardSocket.CAM_A
LEFT_SOCKET = dai.CameraBoardSocket.CAM_B
RIGHT_SOCKET = dai.CameraBoardSocket.CAM_C
class FPSCounter:
def __init__(self):
self.frameTimes = []
def tick(self):
now = time.time()
self.frameTimes.append(now)
self.frameTimes = self.frameTimes[-20:]
def getFps(self):
if len(self.frameTimes) <= 1:
return 0.0
return (len(self.frameTimes) - 1) / (self.frameTimes[-1] - self.frameTimes[0])
def colorizeDepth(frameDepth):
"""frameDepth: np.uint16 (mm). Return BGR u8 colormap."""
invalidMask = (frameDepth == 0)
if frameDepth.size == 0:
return np.zeros((0, 0, 3), np.uint8)
nz = frameDepth[~invalidMask]
if nz.size == 0:
return np.zeros((*frameDepth.shape, 3), np.uint8)
minDepth = np.percentile(nz, 3)
maxDepth = np.percentile(nz, 95)
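    # Clip to the 3rd-95th percentile of valid depth so outliers don't wash out the colormap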
minDepth = max(minDepth, 1.0)
logDepth = np.zeros_like(frameDepth, dtype=np.float32)
np.log(frameDepth, out=logDepth, where=(frameDepth != 0))
logMin = float(np.log(minDepth))
logMax = float(np.log(maxDepth))
logDepth = np.clip(logDepth, logMin, logMax, out=logDepth)
depthU8 = np.uint8(255.0 * (logDepth - logMin) / (logMax - logMin + 1e-12))
depthU8[invalidMask] = 0
cm = cv2.applyColorMap(depthU8, cv2.COLORMAP_JET)
cm[invalidMask] = 0
return cm
rgbWeight = 0.4
depthWeight = 0.6
OFFSET_CENTER = 100
uOffset = OFFSET_CENTER
vOffset = OFFSET_CENTER
Z_global = None
du_px = 0
dv_px = 0
winName = "rgb-depth"
def updateBlendWeights(percentRgb):
global rgbWeight, depthWeight
rgbWeight = float(percentRgb) / 100.0
depthWeight = 1.0 - rgbWeight
def on_u(val):
global uOffset
uOffset = int(val)
def on_v(val):
global vOffset
vOffset = int(val)
def mouse_callback(event, x, y, flags, param):
if event == cv2.EVENT_LBUTTONDOWN and Z_global is not None:
h, w = Z_global.shape[:2]
x_src = x - du_px
y_src = y - dv_px
if 0 <= x_src < w and 0 <= y_src < h:
z = int(Z_global[y_src, x_src])
print(f"[{x},{y}] -> depth(mm) at shifted = {z}")
else:
print(f"[{x},{y}] -> out of bounds after offset")
pipeline = dai.Pipeline()
camRgb = pipeline.create(dai.node.Camera).build(RGB_SOCKET)
left = pipeline.create(dai.node.Camera).build(LEFT_SOCKET)
right = pipeline.create(dai.node.Camera).build(RIGHT_SOCKET)
stereo = pipeline.create(dai.node.StereoDepth)
sync = pipeline.create(dai.node.Sync)
align = pipeline.create(dai.node.ImageAlign)
stereo.setExtendedDisparity(True)
sync.setSyncThreshold(timedelta(seconds=1.0 / (2.0 * FPS)))
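# Half a frame period: RGB/depth messages within the same frame interval get paired.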
cap = dai.ImgFrameCapability()
cap.size.fixed((640, 480))
cap.resizeMode = dai.ImgResizeMode.CROP
cap.fps.fixed(FPS)
capRgb = dai.ImgFrameCapability()
capRgb.size.fixed((1280, 1152))
capRgb.resizeMode = dai.ImgResizeMode.CROP
capRgb.fps.fixed(FPS)
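# NOTE: capRgb (1280x1152) is left over from my output-size experiments;
# rgbOut below currently requests `cap` (640x480) instead.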
rgbOut = camRgb.requestOutput(cap, True)
leftOut = left.requestOutput(cap, True)
rightOut = right.requestOutput(cap, True)
try:
stereo.setOutputSize(640, 480)
stereo.setOutputKeepAspectRatio(False)
except Exception:
pass
rgbOut.link(sync.inputs["rgb"])
leftOut.link(stereo.left)
rightOut.link(stereo.right)
stereo.depth.link(align.input)
rgbOut.link(align.inputAlignTo)
align.outputAligned.link(sync.inputs["depth_aligned"])
queue = sync.out.createOutputQueue()
with pipeline:
pipeline.start()
cv2.namedWindow(winName, cv2.WINDOW_NORMAL)
cv2.resizeWindow(winName, 1280, 720)
cv2.setMouseCallback(winName, mouse_callback)
cv2.createTrackbar("RGB Weight %", winName, int(rgbWeight * 100), 100, updateBlendWeights)
cv2.createTrackbar("u offset (px)", winName, uOffset, 200, on_u)
cv2.createTrackbar("v offset (px)", winName, vOffset, 200, on_v)
fps = FPSCounter()
while True:
msgGrp = queue.get()
fps.tick()
frameRgb = msgGrp["rgb"]
        frameDepth = msgGrp["depth_aligned"]
if frameRgb is None or frameDepth is None:
continue
rgb = frameRgb.getCvFrame()
Z = frameDepth.getFrame()
Z_global = Z
depthColor = colorizeDepth(Z)
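        # Manual u/v nudge (set via the trackbars) to eyeball any residual depth-RGB misalignment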
du = int(uOffset - OFFSET_CENTER)
dv = int(vOffset - OFFSET_CENTER)
du_px = du
dv_px = dv
M = np.float32([[1, 0, du], [0, 1, dv]])
depthColorShift = cv2.warpAffine(
depthColor,
M,
(rgb.shape[1], rgb.shape[0]),
flags=cv2.INTER_NEAREST,
borderValue=0
)
if rgb.ndim == 2:
rgb = cv2.cvtColor(rgb, cv2.COLOR_GRAY2BGR)
blended = cv2.addWeighted(rgb, rgbWeight, depthColorShift, depthWeight, 0)
cv2.putText(
blended, f"FPS: {fps.getFps():.2f}",
(10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2
)
cv2.imshow(winName, blended)
key = cv2.waitKey(1) & 0xFF
if key == ord('q') or key == 27:
break
cv2.destroyAllWindows()
2. Height map (work in progress)
// src/rgb_depth_heightmap.cpp
#include <ros/ros.h>
#include <sensor_msgs/Image.h>
#include <std_msgs/Header.h>
#include <cv_bridge/cv_bridge.h>
#include <opencv2/opencv.hpp>
#include "depthai/depthai.hpp"
#include <iostream>
#include <vector>
#include <cmath>
#include <chrono>
#include <iomanip>
constexpr float FPS = 25.0f;
const dai::CameraBoardSocket RGB_SOCKET = dai::CameraBoardSocket::CAM_A;
const dai::CameraBoardSocket LEFT_SOCKET = dai::CameraBoardSocket::CAM_B;
const dai::CameraBoardSocket RIGHT_SOCKET = dai::CameraBoardSocket::CAM_C;
static const int TARGET_W = 640;
static const int TARGET_H = 480;
static float rgbWeight = 0.4f;
static float depthWeight = 0.6f;
int staticDepthPlane = 0;
static const float PITCH_DEG = 42.905f;
static const float ROLL_DEG = -0.761f;
static const float X_SHIFT = 0.0f;
static const float Y_SHIFT = 0.0f;
static const float Z_SHIFT = 1.85f;
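// Camera pose relative to the ground plane (angles in degrees, shifts in meters)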
static cv::Mat g_X_world, g_Y_world, g_height_map, depth_map; // float32, (H,W)
static std::string g_winName = "RGB-HeightMap";
static void onUpdateBlend(int percentRgb, void*) {
rgbWeight = static_cast<float>(percentRgb) / 100.0f;
depthWeight = 1.0f - rgbWeight;
}
void updateDepthPlane(int depth, void*) {
staticDepthPlane = depth;
}
static void onMouse(int event, int x, int y, int, void*) {
if(event != cv::EVENT_LBUTTONDOWN) return;
    if(g_X_world.empty() || g_Y_world.empty() || g_height_map.empty() || depth_map.empty()) return;
if(x < 0 || y < 0 || x >= g_X_world.cols || y >= g_X_world.rows) return;
float wx = g_X_world.at<float>(y, x);
float wy = g_Y_world.at<float>(y, x);
float h = g_height_map.at<float>(y, x);
float depth = depth_map.at<float>(y, x);
std::cout << "Pixel (" << x << "," << y << ") -> "
<< "X: " << std::fixed << std::setprecision(2) << wx << " m, "
<< "Y: " << std::fixed << std::setprecision(2) << wy << " m, "
<< "Height: " << std::fixed << std::setprecision(3) << h << " m\n";
std::cout << "Z: " << std::fixed << std::setprecision(2) << depth << std::endl;
}
static inline double msSince(const std::chrono::steady_clock::time_point& a,
const std::chrono::steady_clock::time_point& b) {
using namespace std::chrono;
return duration_cast<duration<double, std::milli>>(b - a).count();
}
int main(int argc, char** argv) {
ros::init(argc, argv, "depthai_heightmap_publisher");
ros::NodeHandle nh("~");
bool enable_display = true;
ros::Publisher pub_rgb = nh.advertise<sensor_msgs::Image>("/oak/rgb/image_raw", 1);
ros::Publisher pub_stereo = nh.advertise<sensor_msgs::Image>("/oak/stereo/image_raw", 1);
ros::Publisher pub_height_raw = nh.advertise<sensor_msgs::Image>("/oak/height_map", 1);
ros::Publisher pub_height_vis = nh.advertise<sensor_msgs::Image>("/oak/height_map/visualized", 1);
const int W = TARGET_W, H = TARGET_H;
// Extrinsics
const float pitch = PITCH_DEG * static_cast<float>(M_PI) / 180.0f;
const float roll = ROLL_DEG * static_cast<float>(M_PI) / 180.0f;
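    // Maps the camera optical frame (x right, y down, z forward) into a z-up
    // world frame; pitch tilts the camera down toward the ground.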
cv::Matx33f R_pitch(
1, 0, 0,
0, -std::sin(pitch), std::cos(pitch),
0, -std::cos(pitch), -std::sin(pitch)
);
cv::Matx33f R_roll(
std::cos(roll), -std::sin(roll), 0,
std::sin(roll), std::cos(roll), 0,
0, 0, 1
);
cv::Matx33f R33 = R_pitch * R_roll;
const float r00 = R33(0,0), r01 = R33(0,1), r02 = R33(0,2);
const float r10 = R33(1,0), r11 = R33(1,1), r12 = R33(1,2);
const float r20 = R33(2,0), r21 = R33(2,1), r22 = R33(2,2);
const float t0 = X_SHIFT, t1 = Y_SHIFT, t2 = Z_SHIFT;
std::cout<<"rotation matrix: "<<std::endl;
std::cout<<r00<<" " << r01<<" "<< r02 <<std::endl;
std::cout<<r10<<" " << r11<<" "<< r12 <<std::endl;
std::cout<<r20<<" " << r21<<" "<< r22 <<std::endl;
// DepthAI pipeline
dai::Pipeline pipeline;
auto camRgb = pipeline.create<dai::node::Camera>();
camRgb->build(RGB_SOCKET);
auto left = pipeline.create<dai::node::Camera>();
left->build(LEFT_SOCKET);
auto right = pipeline.create<dai::node::Camera>();
right->build(RIGHT_SOCKET);
auto stereo = pipeline.create<dai::node::StereoDepth>();
auto sync = pipeline.create<dai::node::Sync>();
stereo->setExtendedDisparity(true);
// stereo->setLeftRightCheck(true);
sync->setSyncThreshold(std::chrono::duration<int64_t, std::nano>(static_cast<int64_t>(1e9 / (2.0 * FPS))));
const int OUT_W = W, OUT_H = H;
auto rgbOut = camRgb->requestOutput(std::make_pair(OUT_W, OUT_H), dai::ImgFrame::Type::NV12, dai::ImgResizeMode::CROP, FPS, false);
auto leftOut = left->requestOutput(std::make_pair(OUT_W, OUT_H), dai::ImgFrame::Type::NV12, dai::ImgResizeMode::CROP, FPS, false);
auto rightOut = right->requestOutput(std::make_pair(OUT_W, OUT_H), dai::ImgFrame::Type::NV12, dai::ImgResizeMode::CROP, FPS, false);
rgbOut->link(sync->inputs["rgb"]);
leftOut->link(stereo->left);
rightOut->link(stereo->right);
// stereo->setOutputSize(OUT_W,OUT_H);
stereo->setOutputKeepAspectRatio(false);
stereo->depth.link(sync->inputs["depth_aligned"]);
rgbOut->link(stereo->inputAlignTo);
// auto align = pipeline.create<dai::node::ImageAlign>();
// stereo->depth.link(align->input);
// rgbOut->link(align->inputAlignTo);
// align->outputAligned.link(sync->inputs["depth_aligned"]);
// Output queue
auto queue = sync->out.createOutputQueue();
// auto alignQ = align->inputConfig.createInputQueue();
if(enable_display) {
cv::namedWindow(g_winName, cv::WINDOW_NORMAL);
cv::resizeWindow(g_winName, 1280, 720);
cv::createTrackbar("RGB Weight %", g_winName, nullptr, 100, onUpdateBlend);
cv::setTrackbarPos("RGB Weight %", g_winName, static_cast<int>(rgbWeight * 100));
cv::setMouseCallback(g_winName, onMouse, nullptr);
}
auto dev = pipeline.getDefaultDevice();
auto calib = dev->readCalibration();
auto Kv = calib.getCameraIntrinsics(RGB_SOCKET, W, H);
cv::Matx33f K(
Kv[0][0], Kv[0][1], Kv[0][2],
Kv[1][0], Kv[1][1], Kv[1][2],
Kv[2][0], Kv[2][1], Kv[2][2]
);
float fx = K(0,0), fy = K(1,1), cx = K(0,2), cy = K(1,2);
std::cout << "K:\n" << K << std::endl;
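    // Precompute per-pixel back-projection factors: X = (u - cx)/fx * Z, Y = (v - cy)/fy * Z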
cv::Mat x_scale(H, W, CV_32F), y_scale(H, W, CV_32F);
for(int i = 0; i < H; ++i) {
float* yptr = y_scale.ptr<float>(i);
const float iy = (static_cast<float>(i) - cy) / fy;
for(int j = 0; j < W; ++j) yptr[j] = iy;
}
for(int j = 0; j < W; ++j) {
const float jx = (static_cast<float>(j) - cx) / fx;
for(int i = 0; i < H; ++i) x_scale.at<float>(i, j) = jx;
}
g_X_world = cv::Mat::zeros(H, W, CV_32F);
g_Y_world = cv::Mat::zeros(H, W, CV_32F);
g_height_map = cv::Mat::zeros(H, W, CV_32F);
depth_map = cv::Mat::zeros(H, W, CV_32F);
pipeline.start();
ros::Rate rate(30);
try {
        cv::Mat frameDepthVis; // frameRgb and depth16u are declared per-iteration below (avoids shadowing)
while(ros::ok()) {
using clock = std::chrono::steady_clock;
auto T0 = clock::now();
auto group = queue->get<dai::MessageGroup>();
auto T1 = clock::now();
auto msgRgb = group->get<dai::ImgFrame>("rgb");
cv::Mat frameRgb = msgRgb->getCvFrame();
auto T2 = clock::now();
auto msgDepth = group->get<dai::ImgFrame>("depth_aligned"); // 16UC1(mm)
cv::Mat depth16u, Zm, X, Y, Xr, Yr, Zr;
if(msgDepth) {
depth16u = msgDepth->getFrame();
}
auto T3 = clock::now();
if(!depth16u.empty()) {
depth16u.convertTo(Zm, CV_32F, 0.001); // mm -> m
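                // Zero depth means "no measurement"; mark those pixels NaN so they drop out of the height map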
for(int y = 0; y < Zm.rows; y++){
float* ptr = Zm.ptr<float>(y);
const uint16_t* dptr = depth16u.ptr<uint16_t>(y);
for(int x = 0; x < Zm.cols; x++){
if(dptr[x] == 0) {
ptr[x] = std::numeric_limits<float>::quiet_NaN();
}
}
}
depth_map = Zm;
}
auto T4 = clock::now();
if(!Zm.empty()) {
X = x_scale.mul(Zm);
Y = y_scale.mul(Zm);
}
auto T5 = clock::now();
if(!X.empty()) {
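                // Camera -> world: p_w = R33 * p_c + t; Zr is the height above the ground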
Xr = r00*X + r01*Y + r02*Zm; Xr += t0;
Yr = r10*X + r11*Y + r12*Zm; Yr += t1;
Zr = r20*X + r21*Y + r22*Zm; Zr += t2;
g_X_world = Xr;
g_Y_world = Yr;
g_height_map = Zr;
}
auto T6 = clock::now();
if(!Zr.empty()) {
cv::Mat maskNan = Zr != Zr;
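                // NaN != NaN, so this mask flags the invalid pixels marked earlier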
cv::Mat Zr_clipped;
                cv::max(Zr, -1.0, Zr_clipped);                    // apply lower bound
                cv::min(Zr_clipped, 1.0, Zr_clipped);             // apply upper bound
                cv::Mat Zr_scaled = (Zr_clipped + 1.0f) * 127.5f; // [-1,1] -> [0,255]
cv::Mat Zr_u8;
Zr_scaled.convertTo(Zr_u8, CV_8U);
Zr_u8.setTo(0, maskNan);
cv::applyColorMap(Zr_u8, frameDepthVis, cv::COLORMAP_JET);
}
auto T7 = clock::now();
const ros::Time stamp = ros::Time::now();
if(!depth16u.empty()) {
auto msg = cv_bridge::CvImage(std_msgs::Header(), "mono16", depth16u).toImageMsg();
msg->header.stamp = stamp;
msg->header.frame_id = "oak_stereo_optical_frame";
pub_stereo.publish(msg);
}
if(!Zr.empty()) {
auto msg = cv_bridge::CvImage(std_msgs::Header(), "32FC1", Zr).toImageMsg();
msg->header.stamp = stamp;
msg->header.frame_id = "oak_camera_frame";
pub_height_raw.publish(msg);
if(!frameDepthVis.empty()) {
auto vmsg = cv_bridge::CvImage(std_msgs::Header(), "bgr8", frameDepthVis).toImageMsg();
vmsg->header.stamp = stamp;
vmsg->header.frame_id = "oak_camera_frame";
pub_height_vis.publish(vmsg);
}
}
if(!frameRgb.empty()) {
auto rgb_msg = cv_bridge::CvImage(std_msgs::Header(), "bgr8", frameRgb).toImageMsg();
rgb_msg->header.stamp = stamp;
rgb_msg->header.frame_id = "oak_rgb_optical_frame";
pub_rgb.publish(rgb_msg);
}
auto T8 = clock::now();
if(enable_display) {
if(!frameRgb.empty() && !frameDepthVis.empty()) {
cv::Mat blended;
cv::addWeighted(frameRgb, rgbWeight, frameDepthVis, depthWeight, 0.0, blended);
cv::imshow(g_winName, blended);
} else if(!frameRgb.empty()) {
cv::imshow(g_winName, frameRgb);
}
int key = cv::waitKey(1);
if(key == 'q' || key == 27) break;
}
auto T9 = clock::now();
// auto cfg = align->initialConfig;
// auto alignConfig = std::make_shared<dai::ImageAlignConfig>();
// alignConfig = cfg;
// alignConfig->staticDepthPlane = staticDepthPlane;
// alignQ->send(alignConfig);
ros::spinOnce();
rate.sleep();
auto T10 = clock::now();
}
if(enable_display) cv::destroyAllWindows();
} catch(const std::exception& e) {
ROS_ERROR_STREAM("DepthAI exception: " << e.what());
return 1;
}
return 0;
}