Hi there.
I'm trying to implement the "RGB & TinyYolo with spatial data" example using yolov8n_coco_640x352.blob (for 16:9), combined with the "Rotated Spatial Detections" example (the sensor is mounted upside down). I'm implementing it in TouchDesigner. So far everything works great: low latency, nice FPS, and the accuracy is also great. The only problem is that my "outBoundingBoxDepthMapping" output remains empty, so I'm not even sure the network is actually using the spatial data. I've tried to find what's wrong with the code, but all my linking seems correct to me. Could someone have a look?
Thanks in advance for your help!
RGB & TinyYolo with spatial data example: https://oak-api.readthedocs.io/en/stable/samples/SpatialDetection/spatial_tiny_yolo/#rgb-tinyyolo-with-spatial-data
Rotated Spatial Detections example: https://docs.luxonis.com/software/depthai/examples/rotated_spatial_detections/
My code:
model_url = 'https://artifacts.luxonis.com/artifactory/luxonis-depthai-data-local/network/yolov8n_coco_640x352.blob'
def createPipeline(oakDeviceOp):
    # Path to blob
    retDict = op.TDResources.FileDownloader.Download(url=model_url, clear=False)
    nnPath = retDict['path']
    rgbfps = 30.0
    # Create pipeline
    pipeline = dai.Pipeline()
    # models
    detectionNetwork = pipeline.create(dai.node.YoloSpatialDetectionNetwork)
    objectTracker = pipeline.create(dai.node.ObjectTracker)
    # sources
    camRgb = pipeline.create(dai.node.ColorCamera)
    monoLeft = pipeline.create(dai.node.MonoCamera)
    monoRight = pipeline.create(dai.node.MonoCamera)
    stereo = pipeline.create(dai.node.StereoDepth)
    # outputs
    outRgb = pipeline.create(dai.node.XLinkOut)
    outRgb.setStreamName("color")
    outTracklets = pipeline.create(dai.node.XLinkOut)
    outTracklets.setStreamName("tracklets")
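    # boundingBoxDepthMapping carries the SpatialLocationCalculatorConfig ROIs the
    # network used on the depth frame (this is the stream that stays empty for me)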
    outBoundingBoxDepthMapping = pipeline.create(dai.node.XLinkOut)
    outBoundingBoxDepthMapping.setStreamName("boundingBoxDepthMapping")
    outDepth = pipeline.create(dai.node.XLinkOut)
    outDepth.setStreamName("depth")
    # CamRGB Properties
    camRgb.setPreviewSize(640, 352)
    camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
    camRgb.setInterleaved(False)
    camRgb.setColorOrder(dai.ColorCameraProperties.ColorOrder.BGR)
    camRgb.setFps(rgbfps)
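    # the sensor is mounted upside down, so rotate the RGB stream 180 degrees
    # on-device; the preview fed to the NN is then upright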
    camRgb.setImageOrientation(dai.CameraImageOrientation.ROTATE_180_DEG)
    # Stereo Properties
    monoLeft.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
    monoLeft.setBoardSocket(dai.CameraBoardSocket.CAM_B)
    monoLeft.setCamera("left")
    monoRight.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
    monoRight.setBoardSocket(dai.CameraBoardSocket.CAM_C)
    monoRight.setCamera("right")
    # setting node configs
    stereo.setDefaultProfilePreset(dai.node.StereoDepth.PresetMode.HIGH_DENSITY)
    # Align depth map to the perspective of RGB camera, on which inference is done
    stereo.setDepthAlign(dai.CameraBoardSocket.CAM_A)
    stereo.setSubpixel(True)
    stereo.setOutputSize(monoLeft.getResolutionWidth(), monoLeft.getResolutionHeight())
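    # Since the RGB stream is rotated 180 degrees, the aligned depth has to be
    # rotated the same way before it reaches the spatial network; the Rotated
    # Spatial Detections example does this with an ImageManip applying a
    # vertical + horizontal flip to the RAW16 depth frames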
    rotate_stereo_manip = pipeline.createImageManip()
    rotate_stereo_manip.initialConfig.setVerticalFlip(True)
    rotate_stereo_manip.initialConfig.setHorizontalFlip(True)
    rotate_stereo_manip.setFrameType(dai.ImgFrame.Type.RAW16)
    stereo.depth.link(rotate_stereo_manip.inputImage)
    detectionNetwork.setBlobPath(nnPath)
    #detectionNetwork.setNumInferenceThreads(2)
    detectionNetwork.setConfidenceThreshold(0.6)
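    # non-blocking input with a queue size of 1 drops stale frames, which keeps latency low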
    detectionNetwork.input.setBlocking(False)
    detectionNetwork.input.setQueueSize(1)
    detectionNetwork.setBoundingBoxScaleFactor(0.5)
    detectionNetwork.setDepthLowerThreshold(600) # 60 cm
    detectionNetwork.setDepthUpperThreshold(10000) # 10 m
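    # note: yolov8n_coco is an 80-class COCO model, so I'm not sure 1 is right
    # here; it might need to be 80 for the on-device decoding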
    detectionNetwork.setNumClasses(1)
    detectionNetwork.setCoordinateSize(4)
    #detectionNetwork.setAnchors([10, 14, 23, 27, 37, 58, 81, 82, 135, 169, 344, 319])
    #detectionNetwork.setAnchorMasks({"side26": [1, 2, 3], "side13": [3, 4, 5]})
    detectionNetwork.setIouThreshold(0.5)
    # LINKING
    syncNN = True
    rgbOutput = True
    depthOutput = True
    camRgb.preview.link(detectionNetwork.input)
    monoLeft.out.link(stereo.left)
    monoRight.out.link(stereo.right)
    detectionNetwork.boundingBoxMapping.link(outBoundingBoxDepthMapping.input)
    #stereo.depth.link(detectionNetwork.inputDepth)
    rotate_stereo_manip.out.link(detectionNetwork.inputDepth)
    if depthOutput:
        detectionNetwork.passthroughDepth.link(outDepth.input)
    # outRgb must have exactly one upstream link; linking both the NN passthrough
    # and the tracker passthrough to it (as I first did) is a conflict
    if rgbOutput:
        objectTracker.passthroughTrackerFrame.link(outRgb.input)
    elif syncNN:
        detectionNetwork.passthrough.link(outRgb.input)
    else:
        camRgb.preview.link(outRgb.input)
    # Tracker settings
    label_index = int(op('null_tracked_label_index')[0, 0].val)
    objectTracker.setDetectionLabelsToTrack([label_index])
    # possible tracking types: ZERO_TERM_COLOR_HISTOGRAM, ZERO_TERM_IMAGELESS, SHORT_TERM_IMAGELESS, SHORT_TERM_KCF
    objectTracker.setTrackerType(dai.TrackerType.SHORT_TERM_IMAGELESS)
    # ID assignment policy, possible options: SMALLEST_ID (reuse the smallest free ID), UNIQUE_ID (always assign a new ID)
    objectTracker.setTrackerIdAssignmentPolicy(dai.TrackerIdAssignmentPolicy.UNIQUE_ID)
    # Tracker linking
    detectionNetwork.out.link(objectTracker.inputDetections)
    detectionNetwork.passthrough.link(objectTracker.inputDetectionFrame)
    detectionNetwork.passthrough.link(objectTracker.inputTrackerFrame)
    # the "tracklets" stream is fed by the tracker only; also linking
    # detectionNetwork.out to it would give that XLinkOut two sources
    objectTracker.out.link(outTracklets.input)
    return pipeline
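In case it helps: outside of TouchDesigner I check the stream with a minimal host-side script like the one below. It's just a quick sketch; I'm assuming, based on how the spatial_tiny_yolo sample reads its queue, that boundingBoxDepthMapping only delivers a message on frames that actually contain detections, so I poll it non-blocking:

import depthai as dai

# createPipeline() is the function above, with the TouchDesigner-specific
# op() lookups replaced by constants so it runs in a plain Python script
pipeline = createPipeline(None)

with dai.Device(pipeline) as device:
    trackletsQueue = device.getOutputQueue("tracklets", maxSize=4, blocking=False)
    bbMappingQueue = device.getOutputQueue("boundingBoxDepthMapping", maxSize=4, blocking=False)
    while True:
        tracklets = trackletsQueue.get()     # blocks until the next tracker message
        bbMapping = bbMappingQueue.tryGet()  # non-blocking; None when no detections this frame
        if bbMapping is not None:
            roiDatas = bbMapping.getConfigData()
            print(f"{len(roiDatas)} depth ROI(s) used for spatial averaging")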