Problem
I am trying to run holistic replay against my pipeline that uses a SpatialDetectionNetwork
node for face detection. This is working, but stops short of the full video. I believe there is backpressure causing the RGB input frames to process more slowly, while the depth frames run normally and finish ahead of RGB. When I remove the SDN
from the pipeline, the RGB and Depth feeds appear to be more in sync (with RGB even slightly ahead). Both replay and record are set to fps=15.
Question
Is holistic replay designed to work with compute-heavy processing nodes like SpatialDetectionNetwork
that weren't present during recording? Should we be recording WITH SpatialDetectionNetwork
enabled to capture the natural pacing, or is there a way to throttle replay speed to match processing capabilities?
Here is my approach on a standard OAK-D camera:
record.py
This saves the .tar file under "recordings/"
import cv2, depthai as dai
from pathlib import Path
OUT = "recordings"
Path(OUT).mkdir(parents=True, exist_ok=True)
with dai.Pipeline(True) as pipeline:
# --- Sources you care about for your pipeline ---
camA = pipeline.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_A, sensorResolution=(1920,1080), sensorFps=15)
outA = camA.requestOutput((1920,1080), fps=15)
camB = pipeline.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_B, sensorFps=15)
outB = camB.requestOutput((640,400), fps=15)
camC = pipeline.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_C, sensorFps=15)
outC = camC.requestOutput((640,400), fps=15)
# --- Holistic Record config (H.264, auto bitrate) ---
cfg = dai.RecordConfig()
cfg.outputDir = OUT
cfg.videoEncoding.enabled = True
cfg.videoEncoding.bitrate = 0
cfg.videoEncoding.profile = dai.VideoEncoderProperties.Profile.H264_MAIN
pipeline.enableHolisticRecord(cfg)
qA = outA.createOutputQueue()
qB = outB.createOutputQueue()
qC = outC.createOutputQueue()
pipeline.start()
print("Recording... press 'q' to stop")
while pipeline.isRunning():
_ = qA.get(); _ = qB.get(); _ = qC.get()
if cv2.waitKey(1) == ord('q'):
break
Replay (without SDN)
This runs with RGB and StereoDepth closely in sync.
python -m replay_no_sdn --replay recordings/test.tar --show-depth
import argparse
import time
from datetime import timedelta
import cv2
import depthai as dai
IN_W, IN_H = 300, 300 # SDN input size
def main():
ap = argparse.ArgumentParser()
ap.add_argument("--replay", required=True, help="Path to holistic replay .tar")
ap.add_argument("--rgb-fps", type=float, default=15)
ap.add_argument("--mono-fps", type=float, default=15)
ap.add_argument("--idle-timeout", type=float, default=2.0)
ap.add_argument("--show-depth", action="store_true")
args = ap.parse_args()
with dai.Pipeline(True) as p:
# ============= RGB Camera (CAM_A) -> 300x300 BGR (undistorted) =============
cam_rgb = p.create(dai.node.Camera).build(
boardSocket=dai.CameraBoardSocket.CAM_A,
sensorResolution=(1920, 1080),
sensorFps=args.rgb_fps,
)
rgb_prev = cam_rgb.requestOutput(
size=(IN_W, IN_H),
type=dai.ImgFrame.Type.BGR888p,
resizeMode=dai.ImgResizeMode.STRETCH,
fps=args.rgb_fps,
enableUndistortion=True,
)
# ============= Stereo Depth aligned to CAM_A =============
left = p.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_B, sensorFps=args.mono_fps)
right = p.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_C, sensorFps=args.mono_fps)
l_gray = left.requestOutput(size=(640, 400), type=dai.ImgFrame.Type.GRAY8, fps=args.mono_fps)
r_gray = right.requestOutput(size=(640, 400), type=dai.ImgFrame.Type.GRAY8, fps=args.mono_fps)
sd = p.create(dai.node.StereoDepth)
sd.setDefaultProfilePreset(dai.node.StereoDepth.PresetMode.FAST_DENSITY)
sd.setLeftRightCheck(True)
sd.setSubpixel(True) # keeps depth output as U16 (mm)
sd.setInputResolution(640, 400)
sd.setDepthAlign(dai.CameraBoardSocket.CAM_A) # align to RGB (undistorted)
l_gray.link(sd.left)
r_gray.link(sd.right)
# Resize U16 depth to 300x300 for SDN; allocate enough buffer
depth_res = p.create(dai.node.ImageManip)
depth_res.initialConfig.setOutputSize(IN_W, IN_H, dai.ImageManipConfig.ResizeMode.STRETCH)
depth_res.setMaxOutputFrameSize(IN_W * IN_H * 2) # U16 bytes
sd.depth.link(depth_res.inputImage)
# ============= Host queues (V3 style) =============
q_rgb = rgb_prev.createOutputQueue(maxSize=4, blocking=False)
q_depth = depth_res.out.createOutputQueue(maxSize=2, blocking=False) if args.show_depth else None
# ============= Holistic replay + start =============
p.enableHolisticReplay(args.replay) # call BEFORE start
p.start()
print("Started. Press 'q' to quit.")
last_rgb = time.perf_counter()
win_rgb = "Holistic Replay (RGB)"
win_depth = "Depth (mm, vis)" if args.show_depth else None
while p.isRunning():
f = q_rgb.tryGet()
if f is not None:
cv2.imshow(win_rgb, f.getCvFrame())
last_rgb = time.perf_counter()
if q_depth is not None:
d = q_depth.tryGet()
if d is not None:
# visualize U16 depth roughly for sanity
dep = d.getFrame() # numpy uint16
# Simple viz: clip & normalize 0.2–2.0m to 0–255
vis = dep.copy()
vis[vis == 0] = 65535
vis = vis.astype("float32")
vis = 255.0 * (1.0 - (vis.clip(200, 2000) - 200) / (2000 - 200))
vis = vis.clip(0, 255).astype("uint8")
cv2.imshow(win_depth, vis)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
if (time.perf_counter() - last_rgb) > args.idle_timeout:
print(f"No new RGB for {args.idle_timeout}s. Assuming replay complete.")
break
cv2.destroyAllWindows()
if __name__ == "__main__":
main()
Replay (with SDN)
This runs with RGB and StereoDepth out of sync.
python -m replay_sdn --replay recordings/test.tar --face-arc models/face-detection-retail-0004.rvc2.tar.xz --show-depth
import argparse
import time
from datetime import timedelta
import cv2
import depthai as dai
IN_W, IN_H = 300, 300 # SDN input size
def main():
ap = argparse.ArgumentParser()
ap.add_argument("--replay", required=True, help="Path to holistic replay .tar")
ap.add_argument("--face-arc", required=True, help="Path to face SDN archive (.rvc2.tar.xz)")
ap.add_argument("--rgb-fps", type=float, default=15)
ap.add_argument("--mono-fps", type=float, default=15)
ap.add_argument("--idle-timeout", type=float, default=2.0)
ap.add_argument("--show-depth", action="store_true")
args = ap.parse_args()
with dai.Pipeline(True) as p:
# ============= RGB Camera (CAM_A) -> 300x300 BGR (undistorted) =============
cam_rgb = p.create(dai.node.Camera).build(
boardSocket=dai.CameraBoardSocket.CAM_A,
sensorResolution=(1920, 1080),
sensorFps=args.rgb_fps,
)
rgb_prev = cam_rgb.requestOutput(
size=(IN_W, IN_H),
type=dai.ImgFrame.Type.BGR888p,
resizeMode=dai.ImgResizeMode.STRETCH,
fps=args.rgb_fps,
enableUndistortion=True,
)
# ============= Stereo Depth aligned to CAM_A =============
left = p.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_B, sensorFps=args.mono_fps)
right = p.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_C, sensorFps=args.mono_fps)
l_gray = left.requestOutput(size=(640, 400), type=dai.ImgFrame.Type.GRAY8, fps=args.mono_fps)
r_gray = right.requestOutput(size=(640, 400), type=dai.ImgFrame.Type.GRAY8, fps=args.mono_fps)
sd = p.create(dai.node.StereoDepth)
sd.setDefaultProfilePreset(dai.node.StereoDepth.PresetMode.FAST_DENSITY)
sd.setLeftRightCheck(True)
sd.setSubpixel(True) # keeps depth output as U16 (mm)
sd.setInputResolution(640, 400)
sd.setDepthAlign(dai.CameraBoardSocket.CAM_A) # align to RGB (undistorted)
l_gray.link(sd.left)
r_gray.link(sd.right)
# Resize U16 depth to 300x300 for SDN; allocate enough buffer
depth_res = p.create(dai.node.ImageManip)
depth_res.initialConfig.setOutputSize(IN_W, IN_H, dai.ImageManipConfig.ResizeMode.STRETCH)
depth_res.setMaxOutputFrameSize(IN_W * IN_H * 2) # U16 bytes
sd.depth.link(depth_res.inputImage)
# ============= Spatial Detection Network (Face) =============
sdn = p.create(dai.node.SpatialDetectionNetwork)
sdn.setNNArchive(dai.NNArchive(args.face_arc))
sdn.setBoundingBoxScaleFactor(0.5)
sdn.setDepthLowerThreshold(100) # 0.1 m
sdn.setDepthUpperThreshold(10000) # 10 m
rgb_in = sdn.input
depth_in = sdn.inputDepth
rgb_prev.link(rgb_in)
depth_res.out.link(depth_in)
# ============= Host queues (V3 style) =============
q_rgb = rgb_prev.createOutputQueue(maxSize=4, blocking=False)
q_det = sdn.out.createOutputQueue(maxSize=4, blocking=False)
q_depth = depth_res.out.createOutputQueue(maxSize=2, blocking=False) if args.show_depth else None
# ============= Holistic replay + start =============
p.enableHolisticReplay(args.replay) # call BEFORE start
p.start()
print("Started. Press 'q' to quit.")
last_rgb = time.perf_counter()
win_rgb = "Holistic Replay (RGB)"
win_depth = "Depth (mm, vis)" if args.show_depth else None
while p.isRunning():
f = q_rgb.tryGet()
if f is not None:
cv2.imshow(win_rgb, f.getCvFrame())
last_rgb = time.perf_counter()
if q_depth is not None:
d = q_depth.tryGet()
if d is not None:
# visualize U16 depth roughly for sanity
dep = d.getFrame() # numpy uint16
# Simple viz: clip & normalize 0.2–2.0m to 0–255
vis = dep.copy()
vis[vis == 0] = 65535
vis = vis.astype("float32")
vis = 255.0 * (1.0 - (vis.clip(200, 2000) - 200) / (2000 - 200))
vis = vis.clip(0, 255).astype("uint8")
cv2.imshow(win_depth, vis)
# Drain detections to avoid backpressure
while True:
pkt = q_det.tryGet()
if pkt is None:
break
# no-op; just draining
if cv2.waitKey(1) & 0xFF == ord('q'):
break
if (time.perf_counter() - last_rgb) > args.idle_timeout:
print(f"No new RGB for {args.idle_timeout}s. Assuming replay complete.")
break
cv2.destroyAllWindows()
if __name__ == "__main__":
main()
SDN Model
I'm unable to upload the face-detection-retail-0004.rvc2.tar.xz
. However, I created it by zipping the config.json and face-detection-retail-0004.blob:
tar -cJf face-detection-retail-0004.rvc2.tar.xz config.json face-detection-retail-0004.blob
config.json
{
"config_version": "1.0",
"model": {
"metadata": {
"name": "face-detection-retail-0004",
"path": "face-detection-retail-0004.blob",
"precision": "float32"
},
"inputs": [
{
"name": "data",
"dtype": "float32",
"input_type": "image",
"shape": [1, 3, 300, 300],
"layout": "NCHW",
"preprocessing": {
"mean": [0.0, 0.0, 0.0],
"scale": [1.0, 1.0, 1.0],
"reverse_channels": false,
"interleaved_to_planar": false
}
}
],
"outputs": [
{ "name": "detection_out", "dtype": "float32" }
],
"heads": [
{
"parser": "SSD",
"metadata": {
"classes": ["background","face"],
"n_classes": 1,
"iou_threshold": 0.5,
"conf_threshold": 0.5,
"max_det": 200,
"anchors": null
},
"outputs": ["detection_out"]
}
]
}
}