Limited fps when not using objectTracker

TompoussTompousss

Hello,
I don't understand why my test to do detection on video is so slow. It is limited to around 25 fps except if I use a node with objectTracker like below. Do I miss something ? (The only thing I changed is to remove the object tracker node. When I do so, the FPS goes from 60 fps (with tracker) to 25 FPS.

Thank you.

# Create pipeline
pipeline = dai.Pipeline()

# Define sources and outputs
manip = pipeline.create(dai.node.ImageManip)
objectTracker = pipeline.create(dai.node.ObjectTracker)
detectionNetwork = pipeline.create(dai.node.YoloDetectionNetwork)

manipOut = pipeline.create(dai.node.XLinkOut)
xinFrame = pipeline.create(dai.node.XLinkIn)
trackerOut = pipeline.create(dai.node.XLinkOut)
xlinkOut = pipeline.create(dai.node.XLinkOut)
nnOut = pipeline.create(dai.node.XLinkOut)

manipOut.setStreamName("manip")
xinFrame.setStreamName("inFrame")
xlinkOut.setStreamName("trackerFrame")
trackerOut.setStreamName("tracklets")
nnOut.setStreamName("nn")

# Properties
xinFrame.setMaxDataSize(NN_WIDTH*NN_HEIGHT*3)

manip.initialConfig.setResizeThumbnail(NN_WIDTH, NN_HEIGHT)
# manip.initialConfig.setResize(256, 256)
# manip.initialConfig.setKeepAspectRatio(False) #squash the image to not lose FOV
# The NN model expects BGR input. By default ImageManip output type would be same as input (gray in this case)
manip.initialConfig.setFrameType(dai.ImgFrame.Type.BGR888p)
manip.inputImage.setBlocking(True)

# Network specific settings
detectionNetwork.setConfidenceThreshold(confidenceThreshold)
detectionNetwork.setNumClasses(classes)
detectionNetwork.setCoordinateSize(coordinates)
detectionNetwork.setAnchors(anchors)
detectionNetwork.setAnchorMasks(anchorMasks)
detectionNetwork.setIouThreshold(iouThreshold)
detectionNetwork.setBlobPath(args.nnPath)
detectionNetwork.setNumInferenceThreads(2)
detectionNetwork.input.setBlocking(True)

objectTracker.inputTrackerFrame.setBlocking(True)
objectTracker.inputDetectionFrame.setBlocking(True)
objectTracker.inputDetections.setBlocking(True)
objectTracker.setDetectionLabelsToTrack([0]) # track only drone
objectTracker.setTrackerThreshold(0.5)
# possible tracking types: ZERO_TERM_COLOR_HISTOGRAM, ZERO_TERM_IMAGELESS, SHORT_TERM_IMAGELESS, SHORT_TERM_KCF
objectTracker.setTrackerType(dai.TrackerType.ZERO_TERM_COLOR_HISTOGRAM)
# take the smallest ID when new object is tracked, possible options: SMALLEST_ID, UNIQUE_ID
objectTracker.setTrackerIdAssignmentPolicy(dai.TrackerIdAssignmentPolicy.SMALLEST_ID)

# Linking
manip.out.link(manipOut.input)
manip.out.link(detectionNetwork.input)
xinFrame.out.link(manip.inputImage)
xinFrame.out.link(objectTracker.inputTrackerFrame)
detectionNetwork.out.link(nnOut.input)
detectionNetwork.out.link(objectTracker.inputDetections)
detectionNetwork.passthrough.link(objectTracker.inputDetectionFrame)
objectTracker.out.link(trackerOut.input)
objectTracker.passthroughTrackerFrame.link(xlinkOut.input)

# Connect and start the pipeline
with dai.Device(pipeline) as device:

qIn = device.getInputQueue(name="inFrame")
trackerFrameQ = device.getOutputQueue(name="trackerFrame", maxSize=4)
tracklets = device.getOutputQueue(name="tracklets", maxSize=4)
qManip = device.getOutputQueue(name="manip", maxSize=4)
qDet = device.getOutputQueue(name="nn", maxSize=4)

jakaskerl

Hi @TompoussTompousss
Can you create minimal reproducible example that I can test locally. I can't really see the issue in your code.

Thanks,
Jaka

TompoussTompousss

Hi @jakaskerl ,

I can not upload file here, or how should I do ? I just took that example. (object_tracker_video)

First code, I have deleted the tracker, changed the nn for yolo one and just use a passthrough to show the input frame from the nn instead of the input frame of the tracker (as it is done in the example). --> 65 FPS

Second code, I have now deleted the passthrough showing the input frame of the nn since it doesn't do anything anyway. --> 25 FPS

So just using a passthrough or not make the pipeline much faster ? I don't understand why.

First code:

#!/usr/bin/env python3

from pathlib import Path
import cv2
import depthai as dai
import numpy as np
import time
import argparse

labelMap = ["drone", ""]
model_name = 'yolov8_256'

nnPathDefault = str((Path(file).parent / Path(f'/home/touk/Desktop/Fourth_semester/yolov8/model/{model_name}_openvino_2022.1_6shave.blob')).resolve().absolute())
videoPathDefault = str((Path(file).parent / Path('/home/touk/Desktop/Fourth_semester/yolov8/videos/Drones-1-original.mp4')).resolve().absolute())
parser = argparse.ArgumentParser()
parser.add_argument('-nnPath', help="Path to mobilenet detection network blob", default=nnPathDefault)
parser.add_argument('-v', '--videoPath', help="Path to video frame", default=videoPathDefault)

args = parser.parse_args()

# Create pipeline
pipeline = dai.Pipeline()

# Define sources and outputs
manip = pipeline.create(dai.node.ImageManip)

detectionNetwork = pipeline.create(dai.node.YoloDetectionNetwork)

manipOut = pipeline.create(dai.node.XLinkOut)
xinFrame = pipeline.create(dai.node.XLinkIn)

xlinkOut = pipeline.create(dai.node.XLinkOut)
nnOut = pipeline.create(dai.node.XLinkOut)

manipOut.setStreamName("manip")
xinFrame.setStreamName("inFrame")
xlinkOut.setStreamName("nnFrame")
nnOut.setStreamName("nn")

# Properties
xinFrame.setMaxDataSize(256*256*3)

manip.initialConfig.setResizeThumbnail(256, 256)
# manip.initialConfig.setResize(384, 384)
# manip.initialConfig.setKeepAspectRatio(False) #squash the image to not lose FOV
# The NN model expects BGR input. By default ImageManip output type would be same as input (gray in this case)
manip.initialConfig.setFrameType(dai.ImgFrame.Type.BGR888p)
manip.inputImage.setBlocking(True)

# setting node configs
detectionNetwork.setBlobPath(args.nnPath)
detectionNetwork.setConfidenceThreshold(0.5)
detectionNetwork.input.setBlocking(True)

# Linking
manip.out.link(manipOut.input)
manip.out.link(detectionNetwork.input)
xinFrame.out.link(manip.inputImage)

detectionNetwork.out.link(nnOut.input)

detectionNetwork.passthrough.link(xlinkOut.input)

# Connect and start the pipeline
with dai.Device(pipeline) as device:

qIn = device.getInputQueue(name="inFrame")
nnFrameQ = device.getOutputQueue(name="nnFrame", maxSize=4)
qManip = device.getOutputQueue(name="manip", maxSize=4)
qDet = device.getOutputQueue(name="nn", maxSize=4)

startTime = time.monotonic()
counter = 0
fps = 0
detections = []
frame = None

def to_planar(arr: np.ndarray, shape: tuple) -> np.ndarray:
    return cv2.resize(arr, shape).transpose(2, 0, 1).flatten()

# nn data, being the bounding box locations, are in <0..1> range - they need to be normalized with frame width/height
def frameNorm(frame, bbox):
    normVals = np.full(len(bbox), frame.shape[0])
    normVals[::2] = frame.shape[1]
    return (np.clip(np.array(bbox), 0, 1) \* normVals).astype(int)

def displayFrame(name, frame):
    for detection in detections:
        bbox = frameNorm(frame, (detection.xmin, detection.ymin, detection.xmax, detection.ymax))
        cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (255, 0, 0), 2)
        cv2.putText(frame, labelMap[detection.label], (bbox[0] + 10, bbox[1] + 20), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
        cv2.putText(frame, f"{int(detection.confidence \* 100)}%", (bbox[0] + 10, bbox[1] + 40), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
    cv2.imshow(name, frame)

cap = cv2.VideoCapture(args.videoPath)
baseTs = time.monotonic()
simulatedFps = 30
inputFrameShape = (256, 256)

while cap.isOpened():
    read_correctly, frame = cap.read()
    if not read_correctly:
        break

    img = dai.ImgFrame()
    img.setType(dai.ImgFrame.Type.BGR888p)
    img.setData(to_planar(frame, inputFrameShape))
    img.setTimestamp(baseTs)
    baseTs += 1/simulatedFps

    img.setWidth(inputFrameShape[0])
    img.setHeight(inputFrameShape[1])
    qIn.send(img)

    nnFrame = nnFrameQ.tryGet()
    if nnFrame is None:
        continue

    manip = qManip.get()
    inDet = qDet.get()

    counter+=1
    current_time = time.monotonic()
    if (current_time - startTime) > 1 :
        fps = counter / (current_time - startTime)
        counter = 0
        startTime = current_time

    detections = inDet.detections
    manipFrame = manip.getCvFrame()
    displayFrame("nn", manipFrame)

    color = (255, 0, 0)
    nninFrame = nnFrame.getCvFrame()

    cv2.putText(nninFrame, "Fps: {:.2f}".format(fps), (2, nninFrame.shape[0] - 4), cv2.FONT_HERSHEY_TRIPLEX, 0.4, color)

    cv2.imshow("nnframe", nninFrame)

    if cv2.waitKey(1) == ord('q'):
        break

TompoussTompousss

And second code (almost identical, I just don't show the input frame of nn anymore):
#!/usr/bin/env python3

from pathlib import Path
import cv2
import depthai as dai
import numpy as np
import time
import argparse

labelMap = ["drone", ""]
model_name = 'yolov8_256'

args = parser.parse_args()

# Create pipeline
pipeline = dai.Pipeline()

# Define sources and outputs
manip = pipeline.create(dai.node.ImageManip)

detectionNetwork = pipeline.create(dai.node.YoloDetectionNetwork)

manipOut = pipeline.create(dai.node.XLinkOut)
xinFrame = pipeline.create(dai.node.XLinkIn)

nnOut = pipeline.create(dai.node.XLinkOut)

manipOut.setStreamName("manip")
xinFrame.setStreamName("inFrame")
nnOut.setStreamName("nn")

# Properties
xinFrame.setMaxDataSize(256*256*3)

# setting node configs
detectionNetwork.setBlobPath(args.nnPath)
detectionNetwork.setConfidenceThreshold(0.5)
detectionNetwork.input.setBlocking(True)

# Linking
manip.out.link(manipOut.input)
manip.out.link(detectionNetwork.input)
xinFrame.out.link(manip.inputImage)

detectionNetwork.out.link(nnOut.input)

# Connect and start the pipeline
with dai.Device(pipeline) as device:

qIn = device.getInputQueue(name="inFrame")

qManip = device.getOutputQueue(name="manip", maxSize=4)
qDet = device.getOutputQueue(name="nn", maxSize=4)

startTime = time.monotonic()
counter = 0
fps = 0
detections = []
frame = None

def to_planar(arr: np.ndarray, shape: tuple) -> np.ndarray:
    return cv2.resize(arr, shape).transpose(2, 0, 1).flatten()

# nn data, being the bounding box locations, are in <0..1> range - they need to be normalized with frame width/height
def frameNorm(frame, bbox):
    normVals = np.full(len(bbox), frame.shape[0])
    normVals[::2] = frame.shape[1]
    return (np.clip(np.array(bbox), 0, 1) \* normVals).astype(int)

def displayFrame(name, frame):
    for detection in detections:
        bbox = frameNorm(frame, (detection.xmin, detection.ymin, detection.xmax, detection.ymax))
        cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (255, 0, 0), 2)
        cv2.putText(frame, labelMap[detection.label], (bbox[0] + 10, bbox[1] + 20), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
        cv2.putText(frame, f"{int(detection.confidence \* 100)}%", (bbox[0] + 10, bbox[1] + 40), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
    cv2.imshow(name, frame)

cap = cv2.VideoCapture(args.videoPath)
baseTs = time.monotonic()
simulatedFps = 30
inputFrameShape = (256, 256)

while cap.isOpened():
    read_correctly, frame = cap.read()
    if not read_correctly:
        break

    img = dai.ImgFrame()
    img.setType(dai.ImgFrame.Type.BGR888p)
    img.setData(to_planar(frame, inputFrameShape))
    img.setTimestamp(baseTs)
    baseTs += 1/simulatedFps

    img.setWidth(inputFrameShape[0])
    img.setHeight(inputFrameShape[1])
    qIn.send(img)

    manip = qManip.get()
    inDet = qDet.get()

    counter+=1
    current_time = time.monotonic()
    if (current_time - startTime) > 1 :
        fps = counter / (current_time - startTime)
        counter = 0
        startTime = current_time

    detections = inDet.detections
    manipFrame = manip.getCvFrame()
    displayFrame("nn", manipFrame)


    if cv2.waitKey(1) == ord('q'):
        break

jakaskerl

Hi @TompoussTompousss
I could not repro your code since I don't have your model. Can you try switching to some of the stock model available in depthai-python to see if the same issue happens (it should).

Thanks,
Jaka

TompoussTompousss

Hi @jakaskerl ,

You don't need any model to see the fps dropping when the pass through isn't used. In my code I don't even try to decode the output from the model, I am just looking at the fps.

But here below my code using the mask_detection 300x300 (FP16) blob from OpenVINO ZOO. Any random mp4 an you will see the fps drop from one code to another (again, I am just removing a pass through that is used from the nn to an output to display it out).

#!/usr/bin/env python3

from pathlib import Path
import cv2
import depthai as dai
import numpy as np
import time
import argparse

labelMap = ["face", ""]
model_name = 'mask_detection_300x300'
video_name = 'Random_faces'

NN_WIDTH, NN_HEIGHT = 300, 300

nnPathDefault = str((Path(file).parent / Path(f'model/{model_name}.blob')).resolve().absolute())
videoPathDefault = str((Path(file).parent / Path(f'videos/{video_name}.mp4')).resolve().absolute())
parser = argparse.ArgumentParser()
parser.add_argument('-nnPath', help="Path to mobilenet detection network blob", default=nnPathDefault)
parser.add_argument('-v', '--videoPath', help="Path to video frame", default=videoPathDefault)

args = parser.parse_args()

# Create pipeline
pipeline = dai.Pipeline()

# Define sources and outputs
manip = pipeline.create(dai.node.ImageManip)

detectionNetwork = pipeline.create(dai.node.NeuralNetwork)

manipOut = pipeline.create(dai.node.XLinkOut)
xinFrame = pipeline.create(dai.node.XLinkIn)

xlinkOut = pipeline.create(dai.node.XLinkOut)
nnOut = pipeline.create(dai.node.XLinkOut)

manipOut.setStreamName("manip")
xinFrame.setStreamName("inFrame")
xlinkOut.setStreamName("nnFrame")
nnOut.setStreamName("nn")

# Properties
xinFrame.setMaxDataSize(NN_WIDTH*NN_HEIGHT*3)

manip.initialConfig.setResizeThumbnail(NN_WIDTH, NN_HEIGHT)
# manip.initialConfig.setResize(384, 384)
# manip.initialConfig.setKeepAspectRatio(False) #squash the image to not lose FOV
# The NN model expects BGR input. By default ImageManip output type would be same as input (gray in this case)
manip.initialConfig.setFrameType(dai.ImgFrame.Type.BGR888p)
manip.inputImage.setBlocking(True)

# setting node configs
detectionNetwork.setBlobPath(args.nnPath)
detectionNetwork.input.setBlocking(True)

# Linking
manip.out.link(manipOut.input)
manip.out.link(detectionNetwork.input)
xinFrame.out.link(manip.inputImage)

detectionNetwork.out.link(nnOut.input)

detectionNetwork.passthrough.link(xlinkOut.input)

# Connect and start the pipeline
with dai.Device(pipeline) as device:

qIn = device.getInputQueue(name="inFrame")
nnFrameQ = device.getOutputQueue(name="nnFrame", maxSize=4)
qManip = device.getOutputQueue(name="manip", maxSize=4)
qDet = device.getOutputQueue(name="nn", maxSize=4)

startTime = time.monotonic()
counter = 0
fps = 0
detections = []
frame = None

def to_planar(arr: np.ndarray, shape: tuple) -> np.ndarray:
    return cv2.resize(arr, shape).transpose(2, 0, 1).flatten()

def displayFrame(name, frame, fps):
    color = (255, 0, 0)
    cv2.putText(frame, "Fps: {:.2f}".format(fps), (2, frame.shape[0] - 4), cv2.FONT_HERSHEY_TRIPLEX, 0.4,
                color)
    cv2.imshow(name, frame)

cap = cv2.VideoCapture(args.videoPath)
baseTs = time.monotonic()
simulatedFps = 35
inputFrameShape = (NN_WIDTH, NN_HEIGHT)

while cap.isOpened():
    read_correctly, frame = cap.read()
    if not read_correctly:
        break

    img = dai.ImgFrame()
    img.setType(dai.ImgFrame.Type.BGR888p)
    img.setData(to_planar(frame, inputFrameShape))
    img.setTimestamp(baseTs)
    baseTs += 1/simulatedFps

    img.setWidth(inputFrameShape[0])
    img.setHeight(inputFrameShape[1])
    qIn.send(img)

    nnFrame = nnFrameQ.tryGet()
    if nnFrame is None:
        continue

    manip = qManip.get()
    inDet = qDet.get()

    counter+=1
    current_time = time.monotonic()
    if (current_time - startTime) > 1 :
        fps = counter / (current_time - startTime)
        counter = 0
        startTime = current_time

    manipFrame = manip.getCvFrame()
    displayFrame("nn", manipFrame, fps)

    color = (255, 0, 0)
    nninFrame = nnFrame.getCvFrame()

    cv2.putText(nninFrame, "Fps: {:.2f}".format(fps), (2, nninFrame.shape[0] - 4), cv2.FONT_HERSHEY_TRIPLEX, 0.4, color)

    cv2.imshow("nnframe", nninFrame)

    if cv2.waitKey(1) == ord('q'):
        break

TompoussTompousss

And the code with the fps drop.

Thanks for your help.

#!/usr/bin/env python3

from pathlib import Path
import cv2
import depthai as dai
import numpy as np
import time
import argparse

labelMap = ["face", ""]
model_name = 'mask_detection_300x300'
video_name = 'Random_faces'

NN_WIDTH, NN_HEIGHT = 300, 300

args = parser.parse_args()

# Create pipeline
pipeline = dai.Pipeline()

# Define sources and outputs
manip = pipeline.create(dai.node.ImageManip)

detectionNetwork = pipeline.create(dai.node.NeuralNetwork)

manipOut = pipeline.create(dai.node.XLinkOut)
xinFrame = pipeline.create(dai.node.XLinkIn)

nnOut = pipeline.create(dai.node.XLinkOut)

manipOut.setStreamName("manip")
xinFrame.setStreamName("inFrame")
nnOut.setStreamName("nn")

# Properties
xinFrame.setMaxDataSize(NN_WIDTH*NN_HEIGHT*3)

# setting node configs
detectionNetwork.setBlobPath(args.nnPath)

detectionNetwork.input.setBlocking(True)

# Linking
manip.out.link(manipOut.input)
manip.out.link(detectionNetwork.input)
xinFrame.out.link(manip.inputImage)

detectionNetwork.out.link(nnOut.input)

# Connect and start the pipeline
with dai.Device(pipeline) as device:

qIn = device.getInputQueue(name="inFrame")

qManip = device.getOutputQueue(name="manip", maxSize=4)
qDet = device.getOutputQueue(name="nn", maxSize=4)

startTime = time.monotonic()
counter = 0
fps = 0
detections = []
frame = None

def to_planar(arr: np.ndarray, shape: tuple) -> np.ndarray:
    return cv2.resize(arr, shape).transpose(2, 0, 1).flatten()

def displayFrame(name, frame, fps):
    color = (255, 0, 0)
    cv2.putText(frame, "Fps: {:.2f}".format(fps), (2, frame.shape[0] - 4), cv2.FONT_HERSHEY_TRIPLEX, 0.4,
                color)
    cv2.imshow(name, frame)

cap = cv2.VideoCapture(args.videoPath)
baseTs = time.monotonic()
simulatedFps = 30
inputFrameShape = (NN_WIDTH, NN_HEIGHT)

while cap.isOpened():
    read_correctly, frame = cap.read()
    if not read_correctly:
        break

    img = dai.ImgFrame()
    img.setType(dai.ImgFrame.Type.BGR888p)
    img.setData(to_planar(frame, inputFrameShape))
    img.setTimestamp(baseTs)
    baseTs += 1/simulatedFps

    img.setWidth(inputFrameShape[0])
    img.setHeight(inputFrameShape[1])
    qIn.send(img)

    manip = qManip.get()
    inDet = qDet.get()

    counter+=1
    current_time = time.monotonic()
    if (current_time - startTime) > 1 :
        fps = counter / (current_time - startTime)
        counter = 0
        startTime = current_time

    manipFrame = manip.getCvFrame()
    displayFrame("nn", manipFrame, fps)


    if cv2.waitKey(1) == ord('q'):
        break

jakaskerl

Hi @TompoussTompousss
Host side issue. cv2.resize() takes too long (2.7ms vs 50ms):

    def to_planar(arr: np.ndarray, shape: tuple) -> np.ndarray:
        return cv2.resize(arr, shape).transpose(2, 0, 1).flatten()

Thanks,
Jaka