Hello everyone,

I trained a YOLOv4 model in PyTorch for object detection, exported it to ONNX, and converted the ONNX file to a .blob for a customized RVC3-based edge module.

Configuration: --data_type=FP16 --mean_values=[0,0,0] --scale_values=[255,255,255] --reverse_input_channels
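
For reference, the full Model Optimizer command was roughly as follows (the ONNX filename is a placeholder):

mo --input_model=yolov4.onnx --data_type=FP16 --mean_values=[0,0,0] --scale_values=[255,255,255] --reverse_input_channels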

However, when I ran it to detect objects in an image, I encountered two issues:

  1. The detected class names appear correct, but every bounding box lands in the wrong position (not consistently offset in one direction; the errors look random).
  2. setConfidenceThreshold and setIouThreshold do not appear to affect the detection results at all (both are deliberately set to 1 in the code below).

I was trying to mimic the setup from my host-side ONNX inference (which runs well), which uses letterbox preprocessing and non_max_suppression postprocessing. So far I have only ported the letterbox function into this script for testing, since the YoloDetectionNetwork node is supposed to handle decoding and NMS on-device.
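
For reference, the non_max_suppression in the host-side ONNX pipeline is the standard greedy IoU-based routine, roughly like the sketch below (illustrative, not my exact code):

import numpy as np

def nms(boxes, scores, iou_thresh=0.45):
    """Greedy NMS. boxes: (N, 4) as [x1, y1, x2, y2]; scores: (N,)."""
    order = scores.argsort()[::-1]  # indices sorted by descending score
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(int(i))
        rest = order[1:]
        # Intersection of the highest-scoring box with all remaining boxes
        xx1 = np.maximum(boxes[i, 0], boxes[rest, 0])
        yy1 = np.maximum(boxes[i, 1], boxes[rest, 1])
        xx2 = np.minimum(boxes[i, 2], boxes[rest, 2])
        yy2 = np.minimum(boxes[i, 3], boxes[rest, 3])
        inter = np.clip(xx2 - xx1, 0, None) * np.clip(yy2 - yy1, 0, None)
        area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
        areas = (boxes[rest, 2] - boxes[rest, 0]) * (boxes[rest, 3] - boxes[rest, 1])
        iou = inter / (area_i + areas - inter)
        order = rest[iou <= iou_thresh]  # keep only boxes with low overlap
    return keep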

Below is the code I ran on my host to run detection with the blob file. I confirmed that the anchors and anchorMasks are correct and match the training configuration (labelMap is redacted on purpose).

Any ideas would be appreciated! Thank you.

import time
import cv2
import depthai as dai
import numpy as np
from pathlib import Path

###############################################################################
#                         Helper Functions                                   #
###############################################################################
def letterbox(image, size=(416, 416)):
    """
    Resize image with unchanged aspect ratio using padding,
    and return the resized image along with the scaling factor and padding.
    """
    shape = np.shape(image)[:2]  # Get the original image shape (height, width)
    
    if isinstance(size, int):
        size = (size, size)

    # Compute scaling ratio
    r = min(size[0] / shape[0], size[1] / shape[1])
    new_unpad = (int(round(shape[1] * r)), int(round(shape[0] * r)))  # New width, height
    dw, dh = size[1] - new_unpad[0], size[0] - new_unpad[1]  # Padding width and height

    dw /= 2
    dh /= 2

    # Resize the image if necessary
    if shape[::-1] != new_unpad:
        image = cv2.resize(image, new_unpad, interpolation=cv2.INTER_LINEAR)

    # Compute padding values
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))

    # Add border (padding)
    letterboxed = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(128, 128, 128))

    return letterboxed, r, left, top
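
# For reference, postprocessing reverses this mapping later in the script:
#   x_orig = (x_letterboxed - left) / r, and likewise for y with top.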

###############################################################################
#                          Model and Pipeline Setup                          #
###############################################################################

# Load model blob
nnPath = str((Path(__file__).parent / 'model_data/models-0255-reverse_input_channels.blob').resolve())

imagePath = "image.jpg"

# Load class labels
labelMap = [
    "XX"
]

# Create DepthAI pipeline
pipeline = dai.Pipeline()

xIn = pipeline.create(dai.node.XLinkIn)
xIn.setStreamName("inFrame")

detectionNetwork = pipeline.create(dai.node.YoloDetectionNetwork)
detectionNetwork.setBlobPath(nnPath)
detectionNetwork.setConfidenceThreshold(1)  # deliberately 1.0: this should filter out everything, yet detections still appear
detectionNetwork.setNumClasses(len(labelMap))
detectionNetwork.setCoordinateSize(4)
detectionNetwork.setIouThreshold(1)  # deliberately 1.0 as well; it also has no visible effect
detectionNetwork.setNumInferenceThreads(1)
detectionNetwork.input.setBlocking(False)

# Set anchors
anchors = [16,5, 30,9, 48,15, 67,25, 109,35, 84,53, 152,60, 223,96, 367,155]
anchorMasks = {
    "side13": [6, 7, 8],  
    "side26": [3, 4, 5],  
    "side52": [0, 1, 2]   
}
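# Note: the mask keys correspond to the YOLO output grid sizes for a 416x416
# input (416/32 = 13, 416/16 = 26, 416/8 = 52); the largest anchors are assigned
# to the coarse 13x13 grid and the smallest to the fine 52x52 grid.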

detectionNetwork.setAnchors(anchors)
detectionNetwork.setAnchorMasks(anchorMasks)

nnOut = pipeline.create(dai.node.XLinkOut)
nnOut.setStreamName("nn")

xIn.out.link(detectionNetwork.input)
detectionNetwork.out.link(nnOut.input)

###############################################################################
#                    1) Preprocess and Send Image to DepthAI                 #
###############################################################################

# Load image (BGR)
orig_image = cv2.imread(imagePath)
if orig_image is None:
    raise FileNotFoundError(f"Image {imagePath} not found")

# Resize with letterbox
resized_image, scale, pad_x, pad_y = letterbox(orig_image, (416, 416))

resized_image = resized_image.astype(np.uint8)

# Convert to CHW
planar_image = resized_image.transpose(2, 0, 1).copy()
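# Resulting array is (3, 416, 416) planar, matching the BGR888p type set below.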

###############################################################################
#                   2) Run Inference and Process Results                      #
###############################################################################

with dai.Device(pipeline) as device:
    qIn = device.getInputQueue("inFrame")
    qDet = device.getOutputQueue("nn")

    imgFrame = dai.ImgFrame()
    imgFrame.setData(planar_image.tobytes())
    imgFrame.setTimestamp(time.monotonic())
    imgFrame.setWidth(416)
    imgFrame.setHeight(416)
    imgFrame.setType(dai.ImgFrame.Type.BGR888p)
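    # The blob was converted with --reverse_input_channels, so the BGR -> RGB
    # swap should happen inside the network; sending BGR888p here is intentional.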

    qIn.send(imgFrame)

    inDet = qDet.get()
    detections = inDet.detections

    if not detections:
        print("No objects detected.")
    else:
        print("Objects detected, processing...")

    for det in detections:
        # First, get the coordinates in the letterboxed 416x416 image.
        x1_letter = det.xmin * 416
        y1_letter = det.ymin * 416
        x2_letter = det.xmax * 416
        y2_letter = det.ymax * 416

        # Then, reverse the letterbox transformation:
        # subtract the padding and scale by the inverse of the resize scale.
        x1 = int((x1_letter - pad_x) / scale)
        y1 = int((y1_letter - pad_y) / scale)
        x2 = int((x2_letter - pad_x) / scale)
        y2 = int((y2_letter - pad_y) / scale)

        # Ensure coordinates are within the original image bounds
        x1 = max(0, min(orig_image.shape[1], x1))
        y1 = max(0, min(orig_image.shape[0], y1))
        x2 = max(0, min(orig_image.shape[1], x2))
        y2 = max(0, min(orig_image.shape[0], y2))

        # Draw bounding box and label
        cv2.rectangle(orig_image, (x1, y1), (x2, y2), (0, 255, 0), 2)
        class_id = int(det.label) if hasattr(det, 'label') else 0
        label_text = labelMap[class_id] if class_id < len(labelMap) else f"ID {class_id}"
        conf_text = f"{int(det.confidence * 100)}%"  # confidence is reported in [0, 1]
        cv2.putText(orig_image, f"{label_text} {conf_text}", (x1, y1 - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1)
        print(f"Detected: {label_text} ({conf_text}) at [{x1}, {y1}, {x2}, {y2}]")

    cv2.imshow("Detection Results", orig_image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()