Hi @jakaskerl
Thank you very much for your reply! The hint regarding the data type of the tensor already helped a lot.
I also tried the decoding method that you linked, as well as an updated version of this method by Ultralytics. However, the results are still not as expected.
Below is an updated example that uses the methods described above. It runs object detection on the attached image bus.jpg. However, the output consists of four identical bounding boxes, all with the class "umbrella", which differs substantially from the detections I get when using the YoloDetectionNetwork as described in my previous post.
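For reference, the on-device decoding I am comparing against simply replaces the NeuralNetwork node in setup_pipeline() below with a YoloDetectionNetwork node, roughly like this (a simplified sketch; the threshold values shown here are just representative, the exact settings are in my previous post):

nn = pipeline.create(dai.node.YoloDetectionNetwork)
nn.setBlobPath(model_path)
nn.setNumClasses(80)             # COCO classes
nn.setCoordinateSize(4)          # x, y, w, h
nn.setConfidenceThreshold(0.5)   # representative value
nn.setIouThreshold(0.5)
# linking stays the same: img_in.out -> nn.input, nn.out -> nnXout.input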
Again, thank you for your help! I would be very grateful for any further pointers.

import blobconverter
import cv2
import depthai as dai
import numpy as np
import time
import torch
import torchvision
from typing import List
import ultralytics.utils.ops
from ultralytics.utils import yaml_load
from ultralytics.utils.checks import check_yaml
def tensor2imgdetection(tensor: torch.Tensor) -> List[dai.ImgDetection]:
    detections = list()
    for res in tensor:
        detection = dai.ImgDetection()  # instantiate a new detection per row
        detection.confidence = float(res[4])
        detection.label = int(res[5])
        detection.xmin = float(res[0])
        detection.ymin = float(res[1])
        detection.xmax = float(res[2])
        detection.ymax = float(res[3])
        detections.append(detection)
    return detections
def xywh2xyxy(x):
    # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2]
    # where xy1=top-left, xy2=bottom-right
    y = torch.zeros_like(x) if isinstance(
        x, torch.Tensor) else np.zeros_like(x)
    y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
    y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
    y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
    y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
    return y
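# Worked example of the conversion above: a box (cx, cy, w, h) = (100, 100, 50, 20)
# maps to (x1, y1, x2, y2) = (100 - 25, 100 - 10, 100 + 25, 100 + 10) = (75, 90, 125, 110),
# i.e. xywh2xyxy(np.array([[100., 100., 50., 20.]])) returns [[75., 90., 125., 110.]].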
def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45):
    # Transpose from (1, 85, n) to (1, n, 85) so each row is one prediction
    prediction = np.array([cv2.transpose(prediction[0])])
    prediction = torch.from_numpy(prediction)
    if prediction.dtype is torch.float16:
        prediction = prediction.float()  # to FP32
    xc = prediction[..., 4] >= conf_thres  # candidates
    detections = []  # fallback in case nothing passes the thresholds
    for xi, x in enumerate(prediction):  # image index, image inference
        # Apply constraints
        x = x[xc[xi]]  # confidence
        # If none remain, process next image
        if not x.shape[0]:
            continue
        # Compute conf
        # x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf
        # Box (center x, center y, width, height) to (x1, y1, x2, y2)
        box = xywh2xyxy(x[:, :4])
        # Detections matrix nx6 (xyxy, conf, cls)
        conf, j = x[:, 5:].max(1, keepdim=True)
        x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]
        # If none remain, process next image
        n = x.shape[0]  # number of boxes
        if not n:
            continue
        boxes, scores = x[:, :4], x[:, 4]
        i = torchvision.ops.boxes.nms(boxes, scores, iou_thres)
        detections = tensor2imgdetection(x[i])
    return detections
def setup_pipeline(model_path: str,
                   image_path: str = None) -> dai.Pipeline:
    pipeline = dai.Pipeline()
    # Input image
    img_in = pipeline.create(dai.node.XLinkIn)
    img_in.setStreamName("img_in")
    # Detection node
    nn = pipeline.create(dai.node.NeuralNetwork)
    nn.setNumPoolFrames(4)
    nn.setBlobPath(model_path)
    nn.setNumInferenceThreads(2)
    # Outputs
    nnXout = pipeline.create(dai.node.XLinkOut)
    nnXout.setStreamName("nn")
    video_out = pipeline.create(dai.node.XLinkOut)
    video_out.setStreamName("video")
    # Linking
    img_in.out.link(nn.input)
    nn.out.link(nnXout.input)
    nn.passthrough.link(video_out.input)
    return pipeline
label_map = yaml_load(check_yaml("coco8.yaml"))["names"]
model_path = blobconverter.from_zoo(name="yolov8n_coco_416x416",
                                    zoo_type="depthai",
                                    shaves=8)
model_width = 416
model_height = 416
tensor_shape = (1, 85, 2704)
def print_detection(detection: dai.ImgDetection) -> None:
    d = detection
    print(f"{label_map[d.label]:<8s} (conf={d.confidence:.2f}) "
          f"[xmin={d.xmin:.2f}, ymin={d.ymin:.2f}, "
          f"xmax={d.xmax:.2f}, ymax={d.ymax:.2f}]")
def to_planar(arr: np.ndarray, shape: tuple) -> list:
    return cv2.resize(arr, shape).transpose(2, 0, 1).flatten()
def main(confidence_threshold, image_path):
    print(f"using NeuralNetwork with confidence threshold of {confidence_threshold}")
    pipeline = setup_pipeline(model_path, image_path)
    with dai.Device(pipeline) as device:
        nn_queue = device.getOutputQueue("nn")
        video_queue = device.getOutputQueue(name="video")
        input_queue = device.getInputQueue("img_in")
        detections = []

        def display_frame(name, frame):
            color = (255, 0, 0)
            for d in detections:
                xmin = int(d.xmin)
                xmax = int(d.xmax)
                ymin = int(d.ymin)
                ymax = int(d.ymax)
                cv2.putText(frame, label_map[d.label],
                            (xmin + 10, ymin + 20),
                            cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
                cv2.putText(frame, f"{int(d.confidence * 100)}%",
                            (xmin + 10, ymin + 40),
                            cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
                cv2.rectangle(frame, (xmin, ymin),
                              (xmax, ymax), color, 2)
            # Show the frame
            cv2.imshow(name, frame)

        frame = cv2.imread(image_path)
        tstamp = time.monotonic()
        img_frame = dai.ImgFrame()
        img_frame.setData(to_planar(frame, (model_width, model_height)))
        img_frame.setTimestamp(tstamp)
        img_frame.setSequenceNum(1)
        img_frame.setType(dai.RawImgFrame.Type.BGR888p)
        img_frame.setWidth(model_width)
        img_frame.setHeight(model_height)
        input_queue.send(img_frame)
        in_nn = nn_queue.get()
        in_video = video_queue.get()
        in_nn = np.array(in_nn.getFirstLayerFp16())
        in_nn = in_nn.reshape(tensor_shape)
        # Variant 1: use ultralytics method
        # detections = tensor2imgdetection(
        #     ultralytics.utils.ops.non_max_suppression(
        #         torch.from_numpy(in_nn),
        #         confidence_threshold, nc=len(label_map))[0]
        # )
        # Variant 2: use method by https://github.com/luxonis/depthai-experiments/blob/master/gen2-yolo/host-decoding/main.py
        detections = non_max_suppression(in_nn, confidence_threshold)
        display_frame("detections", in_video.getCvFrame())
        for d in detections:
            print_detection(d)
        cv2.waitKey(0)
if __name__ == '__main__':
    confidence_threshold = 0.1
    image_path = 'bus.jpg'
    main(confidence_threshold, image_path)