Hi,
I have issues when using the NeuralNetwork node instead of the YoloDetectionNetwork node for object detection.
The MWE below applies the yolov8n_coco_416x416 model from the DepthAI model zoo to the video stream. When using the YoloDetectionNetwork (use_YDN = True), plausible detections are returned. However, when using the NeuralNetwork (use_YDN = False), no detections are returned at all, even when setting the confidence threshold to 0.
For decoding the NeuralNetwork output messages, I followed the example given here: https://github.com/ultralytics/ultralytics/blob/main/examples/YOLOv8-OpenCV-ONNX-Python/main.py
The first abnormality I noticed is that the shape of the resulting tensor seems to be 1x85x2704 instead of the expected 1x84x2704 (80 class scores + 4 bounding box coordinates). In the resulting tensor, all values except the first four bounding box entries are 0, which matches the observation that no objects are detected.
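A minimal sketch of how the raw output can be inspected on the host (it assumes the "nn" output queue from the MWE below; getAllLayerNames() and getFirstLayerFp16() are the depthai NNData accessors, the variable names are only illustrative):

import numpy as np
in_nn = nn_queue.get()                  # NNData message from the "nn" XLinkOut
print(in_nn.getAllLayerNames())         # name(s) of the blob's output layer(s)
raw = np.asarray(in_nn.getFirstLayerFp16())
print(raw.size)                         # 85 * 2704 = 229840 instead of 84 * 2704 = 227136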
This is a toy example, as I could simply use the YoloDetectionNetwork for this. In practice, however, I want to deploy my custom model on the OAK device, which forces me to use the NeuralNetwork node, and I have not yet managed to get it running.
Can somebody spot my error and point me in the right direction? I have been stuck on this issue for quite some time now and would be very happy once I succeed. Thank you very much in advance.
import blobconverter
import cv2
import depthai as dai
import numpy as np
import sys
def setup_pipeline(
    use_YoloDetectionNetwork: bool,
    model_path: str,
    confidence_threshold: float,
    model_width: int = 416,
    model_height: int = 416,
    video_fps: int = 10
) -> dai.Pipeline:
    pipeline = dai.Pipeline()

    # Camera
    cam = pipeline.create(dai.node.ColorCamera)
    cam.setBoardSocket(dai.CameraBoardSocket.CAM_A)
    cam.setInterleaved(False)
    cam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
    cam.setColorOrder(dai.ColorCameraProperties.ColorOrder.RGB)
    cam.setFps(video_fps)
    cam.setPreviewSize(model_width, model_height)
    cam.setPreviewKeepAspectRatio(False)

    # NeuralNetwork
    if use_YoloDetectionNetwork:
        nn = pipeline.create(dai.node.YoloDetectionNetwork)
        nn.setCoordinateSize(4)
        nn.setNumClasses(80)
        nn.setConfidenceThreshold(confidence_threshold)
    else:
        nn = pipeline.create(dai.node.NeuralNetwork)
    nn.setBlobPath(model_path)
    nn.setNumInferenceThreads(2)

    # Outputs
    nnXout = pipeline.create(dai.node.XLinkOut)
    nnXout.setStreamName("nn")
    video_out = pipeline.create(dai.node.XLinkOut)
    video_out.setStreamName("video")

    # Linking
    cam.preview.link(nn.input)
    nn.out.link(nnXout.input)
    nn.passthrough.link(video_out.input)

    return pipeline
def decode_detections(
    outputs: np.ndarray,
    confidence_threshold: float
):
    # Prepare output array: transpose from (1, C, N) to (1, N, C)
    outputs = np.array([cv2.transpose(outputs[0])])
    rows = outputs.shape[1]

    boxes = []
    scores = []
    class_ids = []

    # Iterate through output to collect bounding boxes, confidence scores,
    # and class IDs
    for i in range(rows):
        x, y, w, h = outputs[0][i][:4]
        classes_scores = outputs[0][i][4:]
        (_, maxScore, _, (_, maxClassIndex)) = cv2.minMaxLoc(classes_scores)
        box = [x - (0.5 * w),
               y - (0.5 * h),
               w,
               h]
        if maxScore > confidence_threshold:
            boxes.append(box)
            scores.append(maxScore)
            class_ids.append(maxClassIndex)

    # Apply NMS (Non-maximum suppression)
    result_boxes = cv2.dnn.NMSBoxes(boxes,
                                    scores,
                                    confidence_threshold,
                                    0.45,
                                    0.5)

    # Iterate through NMS results to collect the remaining detections
    detections = []
    for i in range(len(result_boxes)):
        index = result_boxes[i]
        box = boxes[index]
        detection = {
            "confidence": scores[index],
            "label": class_ids[index],
            "xmin": box[0],
            "xmax": box[0] + box[2],
            "ymin": box[1],
            "ymax": box[1] + box[3],
        }
        detections.append(detection)
    return detections
model_path = blobconverter.from_zoo(name="yolov8n_coco_416x416",
                                    zoo_type="depthai",
                                    shaves=6)

use_YDN = eval(sys.argv[1]) if len(sys.argv) > 1 else False
confidence_threshold = float(sys.argv[2]) if len(sys.argv) > 2 else 0.3
print(f"using {'YoloDetectionNetwork' if use_YDN else 'NeuralNetwork'}"
      f" with confidence threshold of {confidence_threshold}")

pipeline = setup_pipeline(use_YDN,
                          model_path,
                          confidence_threshold)
with dai.Device(pipeline) as device:
    nn_queue = device.getOutputQueue("nn")
    video_queue = device.getOutputQueue(name="video")
    while True:
        in_nn = nn_queue.get()
        in_video = video_queue.get()
        if use_YDN:
            # YoloDetectionNetwork decodes on-device
            detections = in_nn.detections
        else:
            # NeuralNetwork returns the raw output tensor; decode on the host
            in_nn = in_nn.getFirstLayerFp16()
            in_nn = np.asarray(in_nn, dtype=np.int32).reshape((1, 85, 2704))
            detections = decode_detections(in_nn, confidence_threshold)
        if len(detections) > 0:
            print(",".join([str(d.label if use_YDN else d["label"])
                            for d in detections]))
        else:
            print("No detections in frame.")