DepthAI
Detection problem with Mobile Net on video from host

Hi pierreia.
The only problem with the code above is that it's not synced. You could either sync frames and detections with host-side syncing, or just use the passthrough frame, like I did below:

#!/usr/bin/env python3

from pathlib import Path
import sys
import cv2
import depthai as dai
import numpy as np
from time import monotonic

import blobconverter

# Get argument first
nnPath = 'mobilenet-ssd_openvino_2021.4_8shave.blob'
videoPath = 'traffic_5mn.mp4'
if len(sys.argv) > 2:
    nnPath = sys.argv[1]
    videoPath = sys.argv[2]

if not Path(nnPath).exists() or not Path(videoPath).exists():
    import sys
    raise FileNotFoundError(f'Required file/s not found, please run "{sys.executable} install_requirements.py"')

# MobilenetSSD label texts
labelMap = ["background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow",
            "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]

# Create pipeline
pipeline = dai.Pipeline()

# Define sources and outputs
nn = pipeline.create(dai.node.MobileNetDetectionNetwork)

xinFrame = pipeline.create(dai.node.XLinkIn)
xinFrame.setStreamName("inFrame")
xinFrame.out.link(nn.input)
# Properties
nn.setConfidenceThreshold(0.5)
nn.setBlobPath(nnPath)
nn.setNumInferenceThreads(2)
nn.input.setBlocking(True)

# Linking
nnOut = pipeline.create(dai.node.XLinkOut)
nnOut.setStreamName("nn")
nn.out.link(nnOut.input)

nnPass = pipeline.create(dai.node.XLinkOut)
nnPass.setStreamName("pass")
nn.passthrough.link(nnPass.input)


# Connect to device and start pipeline
with dai.Device(pipeline) as device:

    # Input queue will be used to send video frames to the device.
    qIn = device.getInputQueue(name="inFrame")
    # Output queue will be used to get nn data from the video frames.
    qDet = device.getOutputQueue(name="nn", maxSize=6, blocking=True)
    qPass = device.getOutputQueue("pass")

    frame = None
    detections = []

    # nn data, being the bounding box locations, are in <0..1> range - they need to be normalized with frame width/height
    def frameNorm(frame, bbox):
        normVals = np.full(len(bbox), frame.shape[0])
        normVals[::2] = frame.shape[1]
        return (np.clip(np.array(bbox), 0, 1) * normVals).astype(int)

    def to_planar(arr: np.ndarray, shape: tuple) -> np.ndarray:
        return cv2.resize(arr, shape).transpose(2, 0, 1).flatten()

    def displayFrame(name, frame):
        for detection in detections:
            bbox = frameNorm(frame, (detection.xmin, detection.ymin, detection.xmax, detection.ymax))
            cv2.putText(frame, labelMap[detection.label], (bbox[0] + 10, bbox[1] + 20), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
            cv2.putText(frame, f"{int(detection.confidence * 100)}%", (bbox[0] + 10, bbox[1] + 40), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
            cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (255, 0, 0), 2)
        # Show the frame
        cv2.imshow(name, frame)

    cap = cv2.VideoCapture(videoPath)
    while cap.isOpened():
        read_correctly, frame = cap.read()
        if not read_correctly:
            break

        img = dai.ImgFrame()
        resized = to_planar(frame, (300, 300))
        img.setTimestamp(monotonic())
        img.setType(dai.RawImgFrame.Type.BGR888p)
        img.setSize(300, 300)
        img.setData(resized)
        qIn.send(img)

        inDet = qDet.tryGet()

        if inDet is not None:
            detections = inDet.detections
            frame = qPass.get().getCvFrame()
            displayFrame("passthrough", frame)

        if cv2.waitKey(1) == ord('q'):
            break
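
For completeness, host-side syncing would look roughly like this: a minimal sketch (not tested) of an alternative main loop inside the same with dai.Device(pipeline) as device: block. It reuses qIn, qDet, to_planar and displayFrame from above, tags every ImgFrame sent to the device with setSequenceNum(), and matches each ImgDetections message back to the frame it was computed on by its sequence number.

    # Host-side syncing sketch: keep sent frames keyed by sequence number and
    # pop the matching frame when its detections arrive.
    sentFrames = {}
    seq = 0
    while cap.isOpened():
        read_correctly, frame = cap.read()
        if not read_correctly:
            break

        img = dai.ImgFrame()
        img.setType(dai.RawImgFrame.Type.BGR888p)
        img.setSize(300, 300)
        img.setData(to_planar(frame, (300, 300)))
        img.setTimestamp(monotonic())
        img.setSequenceNum(seq)  # tag the frame so its detections can be matched later
        qIn.send(img)
        sentFrames[seq] = frame
        seq += 1

        inDet = qDet.tryGet()
        if inDet is not None:
            # NN output messages carry the sequence number of the frame they were run on
            matched = sentFrames.pop(inDet.getSequenceNum(), None)
            if matched is not None:
                detections = inDet.detections
                displayFrame("host-synced", matched)

        if cv2.waitKey(1) == ord('q'):
            break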

    erik
    I'm trying live inferencing and video inferencing with the YOLOv8 nano model. Here is my pipeline:

    def init_pipeline():
        pipeline = depthai.Pipeline()

        cam_rgb = pipeline.createColorCamera()
        detection_nn = pipeline.createYoloDetectionNetwork()

        cam_rgb.setResolution(
            depthai.ColorCameraProperties.SensorResolution.THE_4_K)
        cam_rgb.setPreviewSize(640, 640)
        cam_rgb.setInterleaved(True)

        xout_rgb = pipeline.createXLinkOut()
        xout_rgb.setStreamName("rgb")
        cam_rgb.preview.link(xout_rgb.input)
        cam_rgb.setPreviewKeepAspectRatio(False)

        manip1 = pipeline.createImageManip()
        manip1.setMaxOutputFrameSize(1244160)
        manip1.initialConfig.setResize(sizeX, sizeY)
        cam_rgb.preview.link(manip1.inputImage)
        manip1.initialConfig.setFrameType(depthai.ImgFrame.Type.BGR888p)
        manip1.inputImage.setBlocking(True)

        if args.videoPath is not None:
            xinFrame = pipeline.create(depthai.node.XLinkIn)
            xinFrame.setStreamName("inFrame")
            xinFrame.out.link(manip1.inputImage)
            xinFrame.setMaxDataSize(1920*1080*3)

            nnPass = pipeline.create(depthai.node.XLinkOut)
            nnPass.setStreamName("pass")
            detection_nn.passthrough.link(xout_rgb.input)
        else:
            xinFrame = None

        # Extract the values from the JSON
        num_classes = config['nn_config']['NN_specific_metadata']['classes']
        coordinates = config['nn_config']['NN_specific_metadata']['coordinates']
        anchors = config['nn_config']['NN_specific_metadata']['anchors']
        anchor_masks = config['nn_config']['NN_specific_metadata']['anchor_masks']
        iou_threshold = config['nn_config']['NN_specific_metadata']['iou_threshold']

        # Set the values
        detection_nn.setNumClasses(num_classes)
        detection_nn.setCoordinateSize(coordinates)
        detection_nn.setAnchors(anchors)
        detection_nn.setAnchorMasks(anchor_masks)
        detection_nn.setIouThreshold(iou_threshold)
        detection_nn.setConfidenceThreshold(0.5)
        # detection_nn.setNumInferenceThreads(2)
        detection_nn.input.setBlocking(True)

        # Blob is the Neural Network file, compiled for MyriadX. It contains both the definition and weights of the model
        # We're using a blobconverter tool to retrieve the MobileNetSSD blob automatically from OpenVINO Model Zoo
        # detection_nn.setBlobPath(blobconverter.from_zoo(name='mobilenet-ssd', shaves=6))
        # Next, we filter out the detections that are below a confidence threshold. Confidence can be anywhere between <0..1>
        # Next, we link the camera 'preview' output to the neural network detection input, so that it can produce detections
        manip1.out.link(detection_nn.input)

        if customModel is True:
            nnPath = str(
                (parentDir / Path('../../data/' + model)).resolve().absolute())
            # print(nnPath)
            detection_nn.setBlobPath(nnPath)
            print("Custom Model" + nnPath + "Size: " +
                  str(sizeX) + "x" + str(sizeY))
        else:
            detection_nn.setBlobPath(blobconverter.from_zoo(
                name='person-detection-0106', shaves=6))
            print("Model from OpenVINO Zoo" + "Size: " +
                  str(sizeX) + "x" + str(sizeY))

        xout_nn = pipeline.createXLinkOut()
        xout_nn.setStreamName("nn")
        detection_nn.out.link(xout_nn.input)

        return pipeline

    def detect_and_count():
        global outputFrame, lock, zones_current_count, listeners, loop

        pipeline = init_pipeline()
        inputFrameShape = (sizeX, sizeY)

        with depthai.Device(pipeline) as device:
            q_rgb = device.getOutputQueue("rgb")
            q_nn = device.getOutputQueue("nn")
            qPass = device.getOutputQueue("pass")
            # q_manip = device.getInputQueue("")

            baseTs = time.monotonic()
            simulatedFps = 30
            frame = None
            detections = []
            timestamp = datetime.utcnow()
            zone_data = []

            def to_planar(arr: np.ndarray, shape: tuple) -> np.ndarray:
                return cv2.resize(arr, shape).transpose(2, 0, 1).flatten()

            if args.videoPath is not None:
                videoPath = str(
                    (parentDir / Path('../../data/' + video_source)).resolve().absolute())
                cap = cv2.VideoCapture(videoPath, cv2.CAP_FFMPEG)

            # loop over frames from the video stream
            while True:
                if args.videoPath is not None:
                    read_correctly, frame = cap.read()
                    if not read_correctly:
                        break

                    if args.videoPath is not None:
                        q_vid = device.getInputQueue(name="inFrame")
                        img = depthai.ImgFrame()
                        img.setType(depthai.RawImgFrame.Type.BGR888p)
                        img.setData(to_planar(frame, inputFrameShape))
                        img.setTimestamp(baseTs)
                        baseTs += 1/simulatedFps
                        img.setWidth(inputFrameShape[0])
                        img.setHeight(inputFrameShape[1])
                        q_vid.send(img)
                        # in_vid = q_vid.tryGet()
                        print("hello", timestamp)

                        if args.videoPath is not None:
                            print("video")
                            frame = qPass.get().getCvFrame()

                in_rgb = q_rgb.tryGet()
                in_nn = q_nn.tryGet()

                if in_rgb is not None and args.videoPath is None:
                    print("live")
                    frame = in_rgb.getCvFrame()

                if in_nn is not None:
                    print("detect")
                    detections = in_nn.detections
                    zone_data += check_overlap(frame, detections)
                    print("done", timestamp)
                    now = datetime.utcnow()
                    if now.second != timestamp.second:
                        t = threading.Thread(
                            target=insert_data, args=(zone_data, ))
                        t.daemon = True
                        t.start()
                        zone_data = []
                    timestamp = now
                    with lock:
                        outputFrame = frame.copy()
                        print("finish")

                if args.videoPath is not None:
                    ret, frame = cap.read()
                    if not ret:
                        print("video over", timestamp)
                        cap.release()
                        break
                        # at any time, you can press "q" and exit the main loop, therefore exiting the program itself

                if cv2.waitKey(1) == ord('q'):
                    break

    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--videoPath',
                        help="Path to video frame", default=None)
    args = parser.parse_args()
    video_source = args.videoPath

    What's happening is that both live and video inferencing run at the same time, and it stops after 30 seconds.
    Any idea what I'm doing wrong?


      erik
      Hi erik, I have attached the files here.
      The test4 file gives an error where the video keeps changing the size of the preview.

      The test5 file gives an error where the video shifts to live inferencing in between frames.
      The common issue I found is that both of these stop working after 30 seconds.
      Only live inferencing works fine; video inferencing stops after 30 seconds.
      Let me know if you need anything else.

      This isn't reproducible.

       python .\test4.py
      Traceback (most recent call last):
        File "D:\Downloads\yolov8-testing-pt-files-New%20folder\test4.py", line 175, in <module>
          parser = argparse.ArgumentParser()
      NameError: name 'argparse' is not defined

        Again, this is not reproducible. Did you even try running test4.py/test5.py?


          @erik

          Hi, can you please provide code to run inference on a video file using yolov5?

          Thank you @erik

          There are modules for object tracking using mobilenet ssd, but they did not work well for yolov5.
          Can you please provide a module for an object tracker using yolov5?

          @erik

          Can we run a dlib tracker on-device together with yolov5 object detection?

          @Unknown it's exactly the same: you can link YoloDetectionNetwork to the ObjectTracker node the same way you link MobileNetDetectionNetwork.
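
          For reference, a sketch of that linking, based on the standard ObjectTracker example (it assumes depthai is imported as dai, detection_nn is your YoloDetectionNetwork node, and the tracked label IDs are placeholders):

          objectTracker = pipeline.create(dai.node.ObjectTracker)
          objectTracker.setDetectionLabelsToTrack([0])  # placeholder: label IDs you want to track
          objectTracker.setTrackerType(dai.TrackerType.ZERO_TERM_COLOR_HISTOGRAM)
          objectTracker.setTrackerIdAssignmentPolicy(dai.TrackerIdAssignmentPolicy.SMALLEST_ID)

          # Same linking pattern as with MobileNetDetectionNetwork
          detection_nn.passthrough.link(objectTracker.inputTrackerFrame)
          detection_nn.passthrough.link(objectTracker.inputDetectionFrame)
          detection_nn.out.link(objectTracker.inputDetections)

          trackerOut = pipeline.create(dai.node.XLinkOut)
          trackerOut.setStreamName("tracklets")
          objectTracker.out.link(trackerOut.input)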

          @erik okay, sure. Thank you.
          Can you please also answer the question below?
          Can we run a dlib tracker on-device together with yolov5 object detection?

            Unknown
            DepthAI has proprietary tracking algorithms that can run on RVC. Adding custom tracking (on-device) is not possible on RVC2.
            You would have to port the tracking to the host side. The rest of the pipeline (YOLO and the other nodes) should still run on the device.
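
            A rough sketch of that split (not tested; it assumes dlib is installed, the pipeline exposes "rgb" and "nn" XLinkOut streams as in the examples above, and the detections use normalized coordinates):

            import cv2
            import dlib
            import depthai as dai

            with dai.Device(pipeline) as device:
                qRgb = device.getOutputQueue("rgb", maxSize=4, blocking=False)
                qDet = device.getOutputQueue("nn", maxSize=4, blocking=False)
                trackers = []

                while True:
                    frame = qRgb.get().getCvFrame()
                    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # dlib expects RGB
                    h, w = frame.shape[:2]

                    inDet = qDet.tryGet()
                    if inDet is not None:
                        # Re-seed the host-side trackers from the latest on-device YOLO detections
                        trackers = []
                        for d in inDet.detections:
                            rect = dlib.rectangle(int(d.xmin * w), int(d.ymin * h),
                                                  int(d.xmax * w), int(d.ymax * h))
                            t = dlib.correlation_tracker()
                            t.start_track(rgb, rect)
                            trackers.append(t)
                    else:
                        # No fresh detections this iteration: update the trackers on the host
                        for t in trackers:
                            t.update(rgb)

                    for t in trackers:
                        p = t.get_position()
                        cv2.rectangle(frame, (int(p.left()), int(p.top())),
                                      (int(p.right()), int(p.bottom())), (0, 255, 0), 2)

                    cv2.imshow("dlib on host", frame)
                    if cv2.waitKey(1) == ord('q'):
                        break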

            Thanks,
            Jaka