Thank you, Jaka.
Regarding whether I'm using RGB or not: I tried running with line 19 of the code below both enabled and commented out (the line "padded_img = padded_img[:, :, ::-1]" in the preproc() function), which I believe reverses the channel order. However, it did not seem to have much effect on the bounding box results.
I have included a version of the code below, shortened as much as possible.
I really appreciate your help.
from pathlib import Path
import numpy as np
import cv2
import depthai as dai
from depthai_sdk.utils import toTensorResult
def preproc(image, input_size, mean=None, std=None, swap=(2, 0, 1)):
    """Resize an image onto a padded canvas and lay it out for the network.

    The image is scaled by a single factor so that it fits inside
    ``input_size`` while keeping its aspect ratio, pasted into the top-left
    corner of a canvas filled with the value 114, has its last (channel)
    axis reversed, and is finally transposed according to ``swap``.

    Args:
        image: Array-like image with shape (height, width, 3).
        input_size: Target (height, width) of the canvas.
        mean: Accepted for interface compatibility.
            NOTE(review): currently not applied to the pixels.
        std: Accepted for interface compatibility.
            NOTE(review): currently not applied to the pixels.
        swap: Axis order passed to ``ndarray.transpose``; the default
            (2, 0, 1) turns HWC into CHW.

    Returns:
        A tuple ``(padded_img, ratio)`` where ``padded_img`` is a contiguous
        float16 array and ``ratio`` is the scale factor that was applied to
        the input image.
    """
    img = np.array(image)
    ratio = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
    new_h = int(img.shape[0] * ratio)
    new_w = int(img.shape[1] * ratio)

    # Canvas pre-filled with the padding value 114.
    padded_img = np.ones((input_size[0], input_size[1], 3)) * 114.0

    # cv2.resize takes the source image first, then the size as (width, height).
    resized_img = cv2.resize(
        img,
        (new_w, new_h),
        interpolation=cv2.INTER_LINEAR,
    )
    padded_img[:new_h, :new_w] = resized_img

    # Reverse the channel axis (e.g. BGR <-> RGB).
    padded_img = padded_img[:, :, ::-1]
    padded_img = padded_img.transpose(swap)
    padded_img = np.ascontiguousarray(padded_img, dtype=np.float16)
    return padded_img, ratio
def nms(boxes, scores, nms_thr):
    """Single class NMS implemented in Numpy.

    Repeatedly keeps the highest-scoring remaining box and discards every
    other remaining box whose IoU with it exceeds ``nms_thr``.

    Args:
        boxes: Array of shape (N, 4) holding (x1, y1, x2, y2) per row.
        scores: Array of shape (N,) with one score per box.
        nms_thr: IoU threshold; boxes with IoU <= ``nms_thr`` survive.

    Returns:
        List of indices into ``boxes`` of the kept boxes, ordered by
        descending score. Empty when ``boxes`` is empty.
    """
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]

    # The "+ 1" treats coordinates as inclusive pixel indices.
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        i = order[0]
        # The current best box is always kept; without this the result
        # would be empty no matter what the input is.
        keep.append(i)

        rest = order[1:]
        xx1 = np.maximum(x1[i], x1[rest])
        yy1 = np.maximum(y1[i], y1[rest])
        xx2 = np.minimum(x2[i], x2[rest])
        yy2 = np.minimum(y2[i], y2[rest])

        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[rest] - inter)

        # ``inds`` indexes into ``rest``, i.e. it is offset by one
        # relative to ``order``.
        inds = np.where(ovr <= nms_thr)[0]
        order = order[inds + 1]

    return keep
def multiclass_nms(boxes, scores, nms_thr, score_thr):
    """Multiclass NMS implemented in Numpy.

    Runs ``nms`` independently for every class column of ``scores``,
    considering only the boxes whose score for that class exceeds
    ``score_thr``.

    Args:
        boxes: Array of shape (N, 4) holding (x1, y1, x2, y2) per row.
        scores: Array of shape (N, num_classes).
        nms_thr: IoU threshold forwarded to ``nms``.
        score_thr: Minimum (exclusive) class score for a box to be considered.

    Returns:
        Array of shape (M, 6) with rows (x1, y1, x2, y2, score, class_index),
        or ``None`` when no box survives.
    """
    final_dets = []
    num_classes = scores.shape[1]
    for cls_ind in range(num_classes):
        cls_scores = scores[:, cls_ind]
        valid_score_mask = cls_scores > score_thr
        if not valid_score_mask.any():
            # Nothing scored above the threshold for this class.
            continue

        valid_scores = cls_scores[valid_score_mask]
        valid_boxes = boxes[valid_score_mask]
        keep = nms(valid_boxes, valid_scores, nms_thr)
        if len(keep) > 0:
            cls_inds = np.ones((len(keep), 1)) * cls_ind
            dets = np.concatenate(
                [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1
            )
            final_dets.append(dets)

    if len(final_dets) == 0:
        return None
    return np.concatenate(final_dets, 0)
def demo_postprocess(outputs, img_size, p6=False):
    """Decode raw grid-relative predictions into input-image coordinates.

    For every stride a grid of cell coordinates is built; the first two
    values of each prediction are offset by the cell position and scaled by
    the stride, and the next two values are exponentiated and scaled by the
    stride.

    Args:
        outputs: Array whose second-to-last axis enumerates all grid cells of
            all strides (in stride order) and whose last axis has at least 4
            entries. It is modified in place.
            NOTE(review): assumes a floating-point array - confirm at caller.
        img_size: Network input size as (height, width).
        p6: When true, use strides [8, 16, 32, 64] instead of [8, 16, 32].

    Returns:
        The same ``outputs`` array, decoded.
    """
    # Exactly one stride list must be selected; the p6 list adds stride 64.
    strides = [8, 16, 32] if not p6 else [8, 16, 32, 64]

    hsizes = [img_size[0] // stride for stride in strides]
    wsizes = [img_size[1] // stride for stride in strides]

    grids = []
    expanded_strides = []
    for hsize, wsize, stride in zip(hsizes, wsizes, strides):
        xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))
        grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
        grids.append(grid)
        shape = grid.shape[:2]
        expanded_strides.append(np.full((*shape, 1), stride))

    grids = np.concatenate(grids, 1)
    expanded_strides = np.concatenate(expanded_strides, 1)
    outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides
    outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides
    return outputs
# Side length, in pixels, of the square camera preview / network input.
SHAPE = 416

# Class names indexed by the class id produced by the network (80 entries).
labelMap = [
    "TestClass0", "TestClass1", "car", "motorbike", "aeroplane", "bus", "train",
    "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
    "bird", "cat", "dog", "horse", "sheep", "cow", "elephant",
    "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie",
    "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
    "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
    "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich",
    "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
    "chair", "sofa", "pottedplant", "bed", "diningtable", "toilet", "tvmonitor",
    "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
    "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
    "teddy bear", "hair drier", "toothbrush",
]
# Build the DepthAI pipeline:
#   camera preview -> host ("camera"), host -> NN ("nnInput"), NN -> host ("nn").
pipeline = dai.Pipeline()

camera = pipeline.create(dai.node.ColorCamera)
camera.setPreviewSize(SHAPE, SHAPE)

nn = pipeline.create(dai.node.NeuralNetwork)
# NOTE(review): the blob path is commented out in this listing; the
# NeuralNetwork node needs a model blob before the pipeline can run.
# nn.setBlobPath(str(Path("./depthai-experiments/gen2-yolo/yolox/yolox_tiny.blob").resolve().absolute()))

# Send camera frames to the host
camera_xout = pipeline.create(dai.node.XLinkOut)
# The stream name must match the queue name requested from the device.
camera_xout.setStreamName("camera")
camera.preview.link(camera_xout.input)

# Send converted frames from the host to the NN
nn_xin = pipeline.create(dai.node.XLinkIn)
# nn_xin.setMaxDataSize(80000000)
nn_xin.setStreamName("nnInput")
nn_xin.out.link(nn.input)

# Send bounding boxes from the NN to the host via XLink
nn_xout = pipeline.create(dai.node.XLinkOut)
nn_xout.setStreamName("nn")
nn.out.link(nn_xout.input)
# Pipeline is defined, now we can connect to the device
with dai.Device(pipeline) as device:
    qCamera = device.getOutputQueue(name="camera", maxSize=4, blocking=False)
    qNnInput = device.getInputQueue("nnInput", maxSize=4, blocking=False)
    qNn = device.getOutputQueue(name="nn", maxSize=4, blocking=True)

    while True:
        inRgb = qCamera.get()
        frame = inRgb.getCvFrame()
        # frame = cv2.imread("./test_img.jpg")
        image, ratio = preproc(frame, (SHAPE, SHAPE))

        # Send the preprocessed tensor to the NN as raw bytes. Without this
        # the blocking qNn.get() below never receives a result.
        # NOTE(review): assumes Buffer.setData accepts this byte view and that
        # the model expects FP32 input - confirm against the blob.
        buff = dai.Buffer()
        flat32_in_int8_array = np.array([image], dtype=np.float32).view(np.int8)
        buff.setData(flat32_in_int8_array)
        qNnInput.send(buff)

        in_nn = qNn.get()
        if in_nn is not None:
            res = toTensorResult(in_nn).get("output")
            # 85 values per prediction: 4 box values, 1 objectness, 80 classes.
            data = np.array(res).reshape(1, -1, 85)
            predictions = demo_postprocess(data, (SHAPE, SHAPE), p6=False)[0]

            boxes = predictions[:, :4]
            scores = predictions[:, 4, None] * predictions[:, 5:]

            # Convert (cx, cy, w, h) to corner format (x1, y1, x2, y2).
            boxes_xyxy = np.ones_like(boxes)
            boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2] / 2.
            boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3] / 2.
            boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2] / 2.
            boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.
            # Undo the preproc() scaling so boxes are in frame coordinates
            # (ratio is 1.0 when the frame is already SHAPE x SHAPE).
            boxes_xyxy /= ratio

            # NOTE(review): thresholds of 0.001 let almost every candidate
            # through and suppress almost any overlap - confirm intended.
            dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.001, score_thr=0.001)

            if dets is not None:
                final_boxes = dets[:, :4]
                final_scores, final_cls_inds = dets[:, 4], dets[:, 5]
                frame_h, frame_w = frame.shape[:2]
                upper = np.array([frame_w - 1, frame_h - 1, frame_w - 1, frame_h - 1])

                for bbox, score, cls_ind in zip(final_boxes, final_scores, final_cls_inds):
                    if score < 0.001:
                        continue
                    class_name = labelMap[int(cls_ind)]

                    # Limit the bounding box to the frame (0..SHAPE-1 for the preview)
                    bbox = np.clip(bbox, 0, upper)
                    xy_min = (int(bbox[0]), int(bbox[1]))
                    xy_max = (int(bbox[2]), int(bbox[3]))

                    # Display detection's BB, label and confidence on the frame
                    cv2.rectangle(frame, xy_min, xy_max, (255, 0, 0), 2)
                    cv2.putText(
                        frame,
                        f"{class_name} {score:.2f}",
                        (xy_min[0], max(xy_min[1] - 5, 12)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5,
                        (255, 0, 0),
                        1,
                    )

        cv2.imshow("rgb", frame)
        if cv2.waitKey(1) == ord('q'):
            break