Hello everyone,
I'm having a problem aligning RGB and depth.
Below are the details: hardware, code, and an explanation of the issues.
Hardware
OAK-D S2 POE (RVC2) / depthai-core: 3.0.0
Goal
Final goal: detect an object in the RGB image and measure its height using the depth values.
- transform camera coordinates to world coordinates, where Z is the height above the ground (see the sketch after this list)
- RGB-depth alignment
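For reference, this is the transform I have in mind (a minimal numpy sketch; K comes from getCameraIntrinsics, R and t describe my camera mounting as in the C++ code further down, and pixel_to_world is just an illustrative name):

import numpy as np

def pixel_to_world(u, v, z_m, K, R, t):
    # Back-project pixel (u, v) with depth z_m (meters) into the camera
    # frame, then move it into the world frame; world Z = height above ground.
    fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]
    p_cam = np.array([(u - cx) / fx * z_m,  # X: right
                      (v - cy) / fy * z_m,  # Y: down
                      z_m])                 # Z: forward
    return R @ p_cam + t                    # [Xw, Yw, Zw], Zw = height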
Issue
Depth from StereoDepth is most accurate when the stereo output is 640×480 (I confirmed this). However, I can’t find an RGB output size that aligns exactly with the 640×480 depth image.
I followed the tutorial below, but the depth image always comes out bigger than the RGB image.
Please refer to the attached image.
luxonis/depthai-core/blob/main/examples/python/ImageAlign/depth_align.py
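For reference, here is the aspect-ratio arithmetic that I suspect is behind the mismatch (my assumption, not confirmed):

depth_ar = 640 / 480    # = 1.333... (4:3)  stereo/depth output
rgb_ar   = 1280 / 1152  # = 1.111... (10:9) RGB size I requested
# The ratios differ, so aligning one onto the other has to scale or crop
# somewhere, which would explain why the overlay never lands exactly.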

Questions
How does ImageAlign work (scaling ratio or other settings)?
How do I get an RGB configuration that best fits the 640×480 stereo output? (My current guess is sketched below.)
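My guess so far: the RGB output should share the stereo output's 4:3 aspect ratio, e.g. 1280×960, so ImageAlign only has to scale, not reshape. This uses the same ImgFrameCapability API as in my code below, but I haven't verified it's the right approach:

capRgb43 = dai.ImgFrameCapability()
capRgb43.size.fixed((1280, 960))  # 4:3, same aspect ratio as 640x480 depth
capRgb43.resizeMode = dai.ImgResizeMode.CROP
capRgb43.fps.fixed(FPS)
rgbOut = camRgb.requestOutput(capRgb43, True)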
Code
1. RGB and depth alignment
#!/usr/bin/env python3
import numpy as np
import cv2
import depthai as dai
import time
from datetime import timedelta
FPS = 25.0
RGB_SOCKET = dai.CameraBoardSocket.CAM_A
LEFT_SOCKET = dai.CameraBoardSocket.CAM_B
RIGHT_SOCKET = dai.CameraBoardSocket.CAM_C
class FPSCounter:
def __init__(self):
self.frameTimes = []
def tick(self):
now = time.time()
self.frameTimes.append(now)
self.frameTimes = self.frameTimes[-20:]
def getFps(self):
if len(self.frameTimes) <= 1:
return 0.0
return (len(self.frameTimes) - 1) / (self.frameTimes[-1] - self.frameTimes[0])
def colorizeDepth(frameDepth):
"""frameDepth: np.uint16 (mm). Return BGR u8 colormap."""
invalidMask = (frameDepth == 0)
if frameDepth.size == 0:
return np.zeros((0, 0, 3), np.uint8)
nz = frameDepth[~invalidMask]
if nz.size == 0:
return np.zeros((*frameDepth.shape, 3), np.uint8)
minDepth = np.percentile(nz, 3)
maxDepth = np.percentile(nz, 95)
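    # Clip to the 3rd-95th percentile of valid depth so outliers don't wash out the colormap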
minDepth = max(minDepth, 1.0)
logDepth = np.zeros_like(frameDepth, dtype=np.float32)
np.log(frameDepth, out=logDepth, where=(frameDepth != 0))
logMin = float(np.log(minDepth))
logMax = float(np.log(maxDepth))
logDepth = np.clip(logDepth, logMin, logMax, out=logDepth)
depthU8 = np.uint8(255.0 * (logDepth - logMin) / (logMax - logMin + 1e-12))
depthU8[invalidMask] = 0
cm = cv2.applyColorMap(depthU8, cv2.COLORMAP_JET)
cm[invalidMask] = 0
return cm
rgbWeight = 0.4
depthWeight = 0.6
OFFSET_CENTER = 100
uOffset = OFFSET_CENTER
vOffset = OFFSET_CENTER
Z_global = None
du_px = 0
dv_px = 0
winName = "rgb-depth"
def updateBlendWeights(percentRgb):
global rgbWeight, depthWeight
rgbWeight = float(percentRgb) / 100.0
depthWeight = 1.0 - rgbWeight
def on_u(val):
global uOffset
uOffset = int(val)
def on_v(val):
global vOffset
vOffset = int(val)
def mouse_callback(event, x, y, flags, param):
if event == cv2.EVENT_LBUTTONDOWN and Z_global is not None:
h, w = Z_global.shape[:2]
x_src = x - du_px
y_src = y - dv_px
if 0 <= x_src < w and 0 <= y_src < h:
z = int(Z_global[y_src, x_src])
print(f"[{x},{y}] -> depth(mm) at shifted = {z}")
else:
print(f"[{x},{y}] -> out of bounds after offset")
pipeline = dai.Pipeline()
camRgb = pipeline.create(dai.node.Camera).build(RGB_SOCKET)
left = pipeline.create(dai.node.Camera).build(LEFT_SOCKET)
right = pipeline.create(dai.node.Camera).build(RIGHT_SOCKET)
stereo = pipeline.create(dai.node.StereoDepth)
sync = pipeline.create(dai.node.Sync)
align = pipeline.create(dai.node.ImageAlign)
stereo.setExtendedDisparity(True)
sync.setSyncThreshold(timedelta(seconds=1.0 / (2.0 * FPS)))
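# Half a frame period: RGB/depth messages within the same frame interval get paired.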
cap = dai.ImgFrameCapability()
cap.size.fixed((640, 480))
cap.resizeMode = dai.ImgResizeMode.CROP
cap.fps.fixed(FPS)
capRgb = dai.ImgFrameCapability()
capRgb.size.fixed((1280, 1152))
capRgb.resizeMode = dai.ImgResizeMode.CROP
capRgb.fps.fixed(FPS)
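# NOTE: capRgb (1280x1152) is left over from my output-size experiments;
# rgbOut below currently requests `cap` (640x480) instead.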
rgbOut = camRgb.requestOutput(cap, True)
leftOut = left.requestOutput(cap, True)
rightOut = right.requestOutput(cap, True)
try:
stereo.setOutputSize(640, 480)
stereo.setOutputKeepAspectRatio(False)
except Exception:
pass
rgbOut.link(sync.inputs["rgb"])
leftOut.link(stereo.left)
rightOut.link(stereo.right)
stereo.depth.link(align.input)
rgbOut.link(align.inputAlignTo)
align.outputAligned.link(sync.inputs["depth_aligned"])
queue = sync.out.createOutputQueue()
with pipeline:
pipeline.start()
cv2.namedWindow(winName, cv2.WINDOW_NORMAL)
cv2.resizeWindow(winName, 1280, 720)
cv2.setMouseCallback(winName, mouse_callback)
cv2.createTrackbar("RGB Weight %", winName, int(rgbWeight * 100), 100, updateBlendWeights)
cv2.createTrackbar("u offset (px)", winName, uOffset, 200, on_u)
cv2.createTrackbar("v offset (px)", winName, vOffset, 200, on_v)
fps = FPSCounter()
while True:
msgGrp = queue.get()
fps.tick()
frameRgb = msgGrp["rgb"]
        frameDepth = msgGrp["depth_aligned"]
if frameRgb is None or frameDepth is None:
continue
rgb = frameRgb.getCvFrame()
Z = frameDepth.getFrame()
Z_global = Z
depthColor = colorizeDepth(Z)
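        # Manual u/v nudge (set via the trackbars) to eyeball any residual depth-RGB misalignment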
du = int(uOffset - OFFSET_CENTER)
dv = int(vOffset - OFFSET_CENTER)
du_px = du
dv_px = dv
M = np.float32([[1, 0, du], [0, 1, dv]])
depthColorShift = cv2.warpAffine(
depthColor,
M,
(rgb.shape[1], rgb.shape[0]),
flags=cv2.INTER_NEAREST,
borderValue=0
)
if rgb.ndim == 2:
rgb = cv2.cvtColor(rgb, cv2.COLOR_GRAY2BGR)
blended = cv2.addWeighted(rgb, rgbWeight, depthColorShift, depthWeight, 0)
cv2.putText(
blended, f"FPS: {fps.getFps():.2f}",
(10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2
)
cv2.imshow(winName, blended)
key = cv2.waitKey(1) & 0xFF
if key == ord('q') or key == 27:
break
cv2.destroyAllWindows()
2. Height map (work in progress)
// src/rgb_depth_heightmap.cpp
#include <ros/ros.h>
#include <sensor_msgs/Image.h>
#include <std_msgs/Header.h>
#include <cv_bridge/cv_bridge.h>
#include <opencv2/opencv.hpp>
#include "depthai/depthai.hpp"
#include <iostream>
#include <vector>
#include <cmath>
#include <chrono>
#include <iomanip>
constexpr float FPS = 25.0f;
const dai::CameraBoardSocket RGB_SOCKET = dai::CameraBoardSocket::CAM_A;
const dai::CameraBoardSocket LEFT_SOCKET = dai::CameraBoardSocket::CAM_B;
const dai::CameraBoardSocket RIGHT_SOCKET = dai::CameraBoardSocket::CAM_C;
static const int TARGET_W = 640;
static const int TARGET_H = 480;
static float rgbWeight = 0.4f;
static float depthWeight = 0.6f;
int staticDepthPlane = 0;
static const float PITCH_DEG = 42.905f;
static const float ROLL_DEG = -0.761f;
static const float X_SHIFT = 0.0f;
static const float Y_SHIFT = 0.0f;
static const float Z_SHIFT = 1.85f;
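// Camera pose relative to the ground plane (angles in degrees, shifts in meters)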
static cv::Mat g_X_world, g_Y_world, g_height_map, depth_map; // float32, (H,W)
static std::string g_winName = "RGB-HeightMap";
static void onUpdateBlend(int percentRgb, void*) {
rgbWeight = static_cast<float>(percentRgb) / 100.0f;
depthWeight = 1.0f - rgbWeight;
}
void updateDepthPlane(int depth, void*) {
staticDepthPlane = depth;
}
static void onMouse(int event, int x, int y, int, void*) {
if(event != cv::EVENT_LBUTTONDOWN) return;
    if(g_X_world.empty() || g_Y_world.empty() || g_height_map.empty() || depth_map.empty()) return;
if(x < 0 || y < 0 || x >= g_X_world.cols || y >= g_X_world.rows) return;
float wx = g_X_world.at<float>(y, x);
float wy = g_Y_world.at<float>(y, x);
float h = g_height_map.at<float>(y, x);
float depth = depth_map.at<float>(y, x);
std::cout << "Pixel (" << x << "," << y << ") -> "
<< "X: " << std::fixed << std::setprecision(2) << wx << " m, "
<< "Y: " << std::fixed << std::setprecision(2) << wy << " m, "
<< "Height: " << std::fixed << std::setprecision(3) << h << " m\n";
std::cout << "Z: " << std::fixed << std::setprecision(2) << depth << std::endl;
}
static inline double msSince(const std::chrono::steady_clock::time_point& a,
const std::chrono::steady_clock::time_point& b) {
using namespace std::chrono;
return duration_cast<duration<double, std::milli>>(b - a).count();
}
int main(int argc, char** argv) {
ros::init(argc, argv, "depthai_heightmap_publisher");
ros::NodeHandle nh("~");
bool enable_display = true;
ros::Publisher pub_rgb = nh.advertise<sensor_msgs::Image>("/oak/rgb/image_raw", 1);
ros::Publisher pub_stereo = nh.advertise<sensor_msgs::Image>("/oak/stereo/image_raw", 1);
ros::Publisher pub_height_raw = nh.advertise<sensor_msgs::Image>("/oak/height_map", 1);
ros::Publisher pub_height_vis = nh.advertise<sensor_msgs::Image>("/oak/height_map/visualized", 1);
const int W = TARGET_W, H = TARGET_H;
// Extrinsics
const float pitch = PITCH_DEG * static_cast<float>(M_PI) / 180.0f;
const float roll = ROLL_DEG * static_cast<float>(M_PI) / 180.0f;
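    // Maps the camera optical frame (x right, y down, z forward) into a z-up
    // world frame; pitch tilts the camera down toward the ground.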
cv::Matx33f R_pitch(
1, 0, 0,
0, -std::sin(pitch), std::cos(pitch),
0, -std::cos(pitch), -std::sin(pitch)
);
cv::Matx33f R_roll(
std::cos(roll), -std::sin(roll), 0,
std::sin(roll), std::cos(roll), 0,
0, 0, 1
);
cv::Matx33f R33 = R_pitch * R_roll;
const float r00 = R33(0,0), r01 = R33(0,1), r02 = R33(0,2);
const float r10 = R33(1,0), r11 = R33(1,1), r12 = R33(1,2);
const float r20 = R33(2,0), r21 = R33(2,1), r22 = R33(2,2);
const float t0 = X_SHIFT, t1 = Y_SHIFT, t2 = Z_SHIFT;
std::cout<<"rotation matrix: "<<std::endl;
std::cout<<r00<<" " << r01<<" "<< r02 <<std::endl;
std::cout<<r10<<" " << r11<<" "<< r12 <<std::endl;
std::cout<<r20<<" " << r21<<" "<< r22 <<std::endl;
// DepthAI pipeline
dai::Pipeline pipeline;
auto camRgb = pipeline.create<dai::node::Camera>();
camRgb->build(RGB_SOCKET);
auto left = pipeline.create<dai::node::Camera>();
left->build(LEFT_SOCKET);
auto right = pipeline.create<dai::node::Camera>();
right->build(RIGHT_SOCKET);
auto stereo = pipeline.create<dai::node::StereoDepth>();
auto sync = pipeline.create<dai::node::Sync>();
stereo->setExtendedDisparity(true);
// stereo->setLeftRightCheck(true);
sync->setSyncThreshold(std::chrono::duration<int64_t, std::nano>(static_cast<int64_t>(1e9 / (2.0 * FPS))));
const int OUT_W = W, OUT_H = H;
auto rgbOut = camRgb->requestOutput(std::make_pair(OUT_W, OUT_H), dai::ImgFrame::Type::NV12, dai::ImgResizeMode::CROP, FPS, false);
auto leftOut = left->requestOutput(std::make_pair(OUT_W, OUT_H), dai::ImgFrame::Type::NV12, dai::ImgResizeMode::CROP, FPS, false);
auto rightOut = right->requestOutput(std::make_pair(OUT_W, OUT_H), dai::ImgFrame::Type::NV12, dai::ImgResizeMode::CROP, FPS, false);
rgbOut->link(sync->inputs["rgb"]);
leftOut->link(stereo->left);
rightOut->link(stereo->right);
// stereo->setOutputSize(OUT_W,OUT_H);
stereo->setOutputKeepAspectRatio(false);
stereo->depth.link(sync->inputs["depth_aligned"]);
rgbOut->link(stereo->inputAlignTo);
// auto align = pipeline.create<dai::node::ImageAlign>();
// stereo->depth.link(align->input);
// rgbOut->link(align->inputAlignTo);
// align->outputAligned.link(sync->inputs["depth_aligned"]);
// Output queue
auto queue = sync->out.createOutputQueue();
// auto alignQ = align->inputConfig.createInputQueue();
if(enable_display) {
cv::namedWindow(g_winName, cv::WINDOW_NORMAL);
cv::resizeWindow(g_winName, 1280, 720);
cv::createTrackbar("RGB Weight %", g_winName, nullptr, 100, onUpdateBlend);
cv::setTrackbarPos("RGB Weight %", g_winName, static_cast<int>(rgbWeight * 100));
cv::setMouseCallback(g_winName, onMouse, nullptr);
}
auto dev = pipeline.getDefaultDevice();
auto calib = dev->readCalibration();
auto Kv = calib.getCameraIntrinsics(RGB_SOCKET, W, H);
cv::Matx33f K(
Kv[0][0], Kv[0][1], Kv[0][2],
Kv[1][0], Kv[1][1], Kv[1][2],
Kv[2][0], Kv[2][1], Kv[2][2]
);
float fx = K(0,0), fy = K(1,1), cx = K(0,2), cy = K(1,2);
std::cout << "K:\n" << K << std::endl;
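    // Precompute per-pixel back-projection factors: X = (u - cx)/fx * Z, Y = (v - cy)/fy * Z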
cv::Mat x_scale(H, W, CV_32F), y_scale(H, W, CV_32F);
for(int i = 0; i < H; ++i) {
float* yptr = y_scale.ptr<float>(i);
const float iy = (static_cast<float>(i) - cy) / fy;
for(int j = 0; j < W; ++j) yptr[j] = iy;
}
for(int j = 0; j < W; ++j) {
const float jx = (static_cast<float>(j) - cx) / fx;
for(int i = 0; i < H; ++i) x_scale.at<float>(i, j) = jx;
}
g_X_world = cv::Mat::zeros(H, W, CV_32F);
g_Y_world = cv::Mat::zeros(H, W, CV_32F);
g_height_map = cv::Mat::zeros(H, W, CV_32F);
depth_map = cv::Mat::zeros(H, W, CV_32F);
pipeline.start();
ros::Rate rate(30);
try {
        cv::Mat frameDepthVis; // frameRgb and depth16u are declared per-iteration below (avoids shadowing)
while(ros::ok()) {
using clock = std::chrono::steady_clock;
auto T0 = clock::now();
auto group = queue->get<dai::MessageGroup>();
auto T1 = clock::now();
auto msgRgb = group->get<dai::ImgFrame>("rgb");
cv::Mat frameRgb = msgRgb->getCvFrame();
auto T2 = clock::now();
auto msgDepth = group->get<dai::ImgFrame>("depth_aligned"); // 16UC1(mm)
cv::Mat depth16u, Zm, X, Y, Xr, Yr, Zr;
if(msgDepth) {
depth16u = msgDepth->getFrame();
}
auto T3 = clock::now();
if(!depth16u.empty()) {
depth16u.convertTo(Zm, CV_32F, 0.001); // mm -> m
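                // Zero depth means "no measurement"; mark those pixels NaN so they drop out of the height map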
for(int y = 0; y < Zm.rows; y++){
float* ptr = Zm.ptr<float>(y);
const uint16_t* dptr = depth16u.ptr<uint16_t>(y);
for(int x = 0; x < Zm.cols; x++){
if(dptr[x] == 0) {
ptr[x] = std::numeric_limits<float>::quiet_NaN();
}
}
}
depth_map = Zm;
}
auto T4 = clock::now();
if(!Zm.empty()) {
X = x_scale.mul(Zm);
Y = y_scale.mul(Zm);
}
auto T5 = clock::now();
if(!X.empty()) {
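                // Camera -> world: p_w = R33 * p_c + t; Zr is the height above the ground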
Xr = r00*X + r01*Y + r02*Zm; Xr += t0;
Yr = r10*X + r11*Y + r12*Zm; Yr += t1;
Zr = r20*X + r21*Y + r22*Zm; Zr += t2;
g_X_world = Xr;
g_Y_world = Yr;
g_height_map = Zr;
}
auto T6 = clock::now();
if(!Zr.empty()) {
cv::Mat maskNan = Zr != Zr;
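                // NaN != NaN, so this mask flags the invalid pixels marked earlier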
cv::Mat Zr_clipped;
                cv::max(Zr, -1.0, Zr_clipped);                    // apply lower bound
                cv::min(Zr_clipped, 1.0, Zr_clipped);             // apply upper bound
                cv::Mat Zr_scaled = (Zr_clipped + 1.0f) * 127.5f; // [-1,1] -> [0,255]
cv::Mat Zr_u8;
Zr_scaled.convertTo(Zr_u8, CV_8U);
Zr_u8.setTo(0, maskNan);
cv::applyColorMap(Zr_u8, frameDepthVis, cv::COLORMAP_JET);
}
auto T7 = clock::now();
const ros::Time stamp = ros::Time::now();
if(!depth16u.empty()) {
auto msg = cv_bridge::CvImage(std_msgs::Header(), "mono16", depth16u).toImageMsg();
msg->header.stamp = stamp;
msg->header.frame_id = "oak_stereo_optical_frame";
pub_stereo.publish(msg);
}
if(!Zr.empty()) {
auto msg = cv_bridge::CvImage(std_msgs::Header(), "32FC1", Zr).toImageMsg();
msg->header.stamp = stamp;
msg->header.frame_id = "oak_camera_frame";
pub_height_raw.publish(msg);
if(!frameDepthVis.empty()) {
auto vmsg = cv_bridge::CvImage(std_msgs::Header(), "bgr8", frameDepthVis).toImageMsg();
vmsg->header.stamp = stamp;
vmsg->header.frame_id = "oak_camera_frame";
pub_height_vis.publish(vmsg);
}
}
if(!frameRgb.empty()) {
auto rgb_msg = cv_bridge::CvImage(std_msgs::Header(), "bgr8", frameRgb).toImageMsg();
rgb_msg->header.stamp = stamp;
rgb_msg->header.frame_id = "oak_rgb_optical_frame";
pub_rgb.publish(rgb_msg);
}
auto T8 = clock::now();
if(enable_display) {
if(!frameRgb.empty() && !frameDepthVis.empty()) {
cv::Mat blended;
cv::addWeighted(frameRgb, rgbWeight, frameDepthVis, depthWeight, 0.0, blended);
cv::imshow(g_winName, blended);
} else if(!frameRgb.empty()) {
cv::imshow(g_winName, frameRgb);
}
int key = cv::waitKey(1);
if(key == 'q' || key == 27) break;
}
auto T9 = clock::now();
// auto cfg = align->initialConfig;
// auto alignConfig = std::make_shared<dai::ImageAlignConfig>();
// alignConfig = cfg;
// alignConfig->staticDepthPlane = staticDepthPlane;
// alignQ->send(alignConfig);
ros::spinOnce();
rate.sleep();
auto T10 = clock::now();
}
if(enable_display) cv::destroyAllWindows();
} catch(const std::exception& e) {
ROS_ERROR_STREAM("DepthAI exception: " << e.what());
return 1;
}
return 0;
}