Yolo4 on video from host instead of mobilenet

Robbal

Hi Guy

Total noob here and working my way through the examples. I am trying to infer on a video sent to the device as I cant mount my OAK-D on my car in the rain. I have seen a example that uses mobilenet but I want to do it with Yolo4. I have managed to piece this python script together but I cant seem to get the oak-d do show the inferencing. Does anyboy have some advice what I might have done wrong.

I apreciate any help

Regards Robby

`from depthai_sdk import Previews, FPSHandler
from depthai_sdk.managers import PipelineManager, PreviewManager, BlobManager, NNetManager
import depthai as dai
import cv2
import argparse
from pathlib import Path
import numpy as np
from time import monotonic

parse arguments

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", help="Provide model path for inference",
default='models/yolo_v3_tiny_openvino_2021.3_6shave.blob', type=str)
parser.add_argument("-c", "--config", help="Provide config path for inference",
default='yolo-tiny.json', type=str)
parser.add_argument("--height", help="Input shape height", default=320, type=int)
parser.add_argument("--width", help="Input shape width", default=512, type=int)
args = parser.parse_args()

parentDir = Path(file).parent
videoPath = str((parentDir / Path('car.mp4')).resolve().absolute())

CONFIG_PATH = args.config
H, W = args.height, args.width

create pipeline manager and camera

pm = PipelineManager()
pm.createColorCam(previewSize=(W, H), xout=True)

create yolo node

bm = BlobManager(blobPath=args.model)
nm = NNetManager(inputSize=(W, H), nnFamily="YOLO")
nm.readConfig(CONFIG_PATH)
nn = nm.createNN(pipeline=pm.pipeline, nodes=pm.nodes, source=Previews.color.name,
blobPath=bm.getBlob(shaves=6, openvinoVersion=pm.pipeline.getOpenVINOVersion()))
pm.addNn(nn)

Define sources and outputs

xinFrame = pm.pipeline.create(dai.node.XLinkIn)
nnOut = pm.pipeline.create(dai.node.XLinkOut)

xinFrame.setStreamName("inFrame")
nnOut.setStreamName("nn")

Linking

xinFrame.out.link(nn.input)
nn.out.link(nnOut.input)

initialize pipeline

with dai.Device(pm.pipeline) as device:

# Input queue will be used to send video frames to the device.
qIn = device.getInputQueue(name="inFrame")
# Output queue will be used to get nn data from the video frames.
qDet = device.getOutputQueue(name="nn", maxSize=4, blocking=False)

frame = None
detections = []

# nn data, being the bounding box locations, are in <0..1> range - they need to be normalized with frame width/height
def frameNorm(frame, bbox):
    normVals = np.full(len(bbox), frame.shape[0])
    normVals[::2] = frame.shape[1]
    return (np.clip(np.array(bbox), 0, 1) * normVals).astype(int)

def to_planar(arr: np.ndarray, shape: tuple) -> np.ndarray:
    return cv2.resize(arr, shape).transpose(2, 0, 1).flatten()

def displayFrame(name, frame):
    for detection in detections:
        if detection.label == 7:
            bbox = frameNorm(frame, (detection.xmin, detection.ymin, detection.xmax, detection.ymax))
            cv2.putText(frame, f"{int(detection.confidence * 100)}%", (bbox[0] + 10, bbox[1] + 40), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
            cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (255, 0, 0), 2)
    # Show the frame
    cv2.imshow(name, frame)

cap = cv2.VideoCapture(videoPath)
while cap.isOpened():
    read_correctly, frame = cap.read()
    if not read_correctly:
        break

    img = dai.ImgFrame()
    img.setData(to_planar(frame, (512, 320)))
    img.setTimestamp(monotonic())
    img.setWidth(512)
    img.setHeight(320)
    qIn.send(img)

    inDet = qDet.tryGet()

    if inDet is not None:
        detections = inDet.detections

    if frame is not None:
        displayFrame("rgb", frame)

    if cv2.waitKey(1) == ord('q'):
        break

Robbal

Ok so I noticed that the lable will aways be 0 as this dataset is only trained for cars. I changed:
if detection.label == 7:
to

if detection.label == 0:

It starts off great but then the bounding boxes freeze and dont change after the first few frames.
Hummm.. getting closer

Robbal

Ok So worked it out... not pretty but maybe this can help somone else. I am not a fan of RoboFlow and wanted to train my own Yolo and test it without taking my device into the field. Gezz I learned a lot. Sorry how the code pastes.. The code tags are a mess

import sys
import cv2
import numpy as np
import depthai as dai
from pathlib import Path
from time import monotonic

videoPath = str((Path(__file__).parent / Path('../PXL_20220701_152159125.mp4')).resolve().absolute())
nnPath = str((Path(__file__).parent / Path('../models/yolo_v4_tiny_openvino_2021.3_6shave.blob')).resolve().absolute())

labelMap = ["Car"]

# Create pipeline
print("Starting pipeline")
pipeline = dai.Pipeline()

# Define sources and outputs
nn = pipeline.create(dai.node.YoloDetectionNetwork)

xinFrame = pipeline.create(dai.node.XLinkIn)
nnOut = pipeline.create(dai.node.XLinkOut)

print("Setting streams")
xinFrame.setStreamName("inVideo")
nnOut.setStreamName("nn")

# Properties
nn.setConfidenceThreshold(0.5)
nn.setNumClasses(1)
nn.setCoordinateSize(4)
nn.setAnchors([10,14, 23,27, 37,58, 81,82, 135,169, 344,319])
nn.setAnchorMasks({"side32": [0, 1, 2], "side16": [3, 4, 5]})
nn.setIouThreshold(0.5)
nn.setBlobPath(nnPath)
nn.setNumInferenceThreads(2)
nn.input.setBlocking(False)

# Linking
xinFrame.out.link(nn.input)
nn.out.link(nnOut.input)

# Connect to device and start pipeline
with dai.Device(pipeline) as device:

    # Input queue will be used to send video frames to the device.
    qIn = device.getInputQueue(name="inVideo")
    # Output queue will be used to get nn data from the video frames.
    qDet = device.getOutputQueue(name="nn", maxSize=4, blocking=False)

    frame = None
    detections = []

    # nn data, being the bounding box locations, are in <0..1> range - they need to be normalized with frame width/height
    def frameNorm(frame, bbox):
        normVals = np.full(len(bbox), frame.shape[0])
        normVals[::2] = frame.shape[1]
        return (np.clip(np.array(bbox), 0, 1) * normVals).astype(int)

    def to_planar(arr: np.ndarray, shape: tuple) -> np.ndarray:
        return cv2.resize(arr, shape).transpose(2, 0, 1).flatten()

    def displayFrame(name, frame):
        for detection in detections:
            bbox = frameNorm(frame, (detection.xmin, detection.ymin, detection.xmax, detection.ymax))
            cv2.putText(frame, labelMap[detection.label], (bbox[0] + 10, bbox[1] + 20), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
            cv2.putText(frame, f"{int(detection.confidence * 100)}%", (bbox[0] + 10, bbox[1] + 40), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
            cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (255, 0, 0), 2)
        # Show the frame
        cv2.imshow(name, frame)

    cap = cv2.VideoCapture(videoPath)
    while cap.isOpened():
        read_correctly, frame = cap.read()
        if not read_correctly:
            break

        img = dai.ImgFrame()
        img.setData(to_planar(frame, (512, 320)))
        img.setTimestamp(monotonic())
        img.setWidth(512)
        img.setHeight(320)
        qIn.send(img)

        inDet = qDet.tryGet()

        if inDet is not None:
            detections = inDet.detections

        if frame is not None:
            displayFrame("Video Out", frame)

        if cv2.waitKey(1) == ord('q'):
            break

maurizio

Robbal Hi, thank you for posting this back. I managed to run your code and load the video, but I am getting no inferences. I have trained a yolov5 model on 640 grayscale image extracted from OAK-D encoded videos and am trying to run inference on the video from the right grayscale cam.

I am getting the following error:

[18443010D197780E00] [1.1.2] [175.730] [DetectionNetwork(0)] [warning] Input image (640x400) does not match NN (640x640)
[18443010D197780E00] [1.1.2] [175.730] [DetectionNetwork(0)] [error] Input tensor 'images' (0) exceeds available data range. Data size (768000B), tensor offset (0), size (1228800B) - skipping inference

I get the model size warning, even though I trained the model in 640p and am running the video on 640 too. But I can't seem to understand the error, tried to look around here but not many people have this issue apparently. Maybe you ran into similar?

Cheers!

jakaskerl

Hi maurizio
Looks like the image you are inputting into the model - via ImgManip node i presume - is of incorrect shape. Can you confirm this/paste your code? The data size problem likely stems from incorrect value type (using U8 but your model expects FP16).

Thanks,
Jaka