Hi @jakaskerl,
I can't upload a file here, or how should I do that? I just started from that example (object_tracker_video).
First code: I deleted the tracker, swapped the NN for a YOLO one, and just used a passthrough to show the NN's input frame instead of the tracker's input frame (as is done in the example). --> 65 FPS
Second code: I then also deleted the passthrough showing the NN's input frame, since it doesn't do anything useful anyway (only the main loop changes; see the sketch after the first code below). --> 25 FPS
So merely having a passthrough or not makes the pipeline that much faster? I don't understand why. My only guess is that the tryGet()/continue around the passthrough queue lets the host keep sending frames while the device is still busy, but I'm not sure that explains such a big difference.
First code:
#!/usr/bin/env python3
from pathlib import Path
import cv2
import depthai as dai
import numpy as np
import time
import argparse
labelMap = ["drone", ""]
model_name = 'yolov8_256'
nnPathDefault = str((Path(__file__).parent / Path(f'/home/touk/Desktop/Fourth_semester/yolov8/model/{model_name}_openvino_2022.1_6shave.blob')).resolve().absolute())
videoPathDefault = str((Path(__file__).parent / Path('/home/touk/Desktop/Fourth_semester/yolov8/videos/Drones-1-original.mp4')).resolve().absolute())
parser = argparse.ArgumentParser()
parser.add_argument('-nnPath', help="Path to YOLO detection network blob", default=nnPathDefault)
parser.add_argument('-v', '--videoPath', help="Path to video file", default=videoPathDefault)
args = parser.parse_args()
# Create pipeline
pipeline = dai.Pipeline()
# Define sources and outputs
manip = pipeline.create(dai.node.ImageManip)
detectionNetwork = pipeline.create(dai.node.YoloDetectionNetwork)
manipOut = pipeline.create(dai.node.XLinkOut)
xinFrame = pipeline.create(dai.node.XLinkIn)
xlinkOut = pipeline.create(dai.node.XLinkOut)
nnOut = pipeline.create(dai.node.XLinkOut)
manipOut.setStreamName("manip")
xinFrame.setStreamName("inFrame")
xlinkOut.setStreamName("nnFrame")
nnOut.setStreamName("nn")
# Properties
xinFrame.setMaxDataSize(256*256*3)
manip.initialConfig.setResizeThumbnail(256, 256)
# manip.initialConfig.setResize(384, 384)
# manip.initialConfig.setKeepAspectRatio(False) #squash the image to not lose FOV
# The NN model expects BGR input. By default ImageManip output type would be same as input (gray in this case)
manip.initialConfig.setFrameType(dai.ImgFrame.Type.BGR888p)
manip.inputImage.setBlocking(True)
# setting node configs
detectionNetwork.setBlobPath(args.nnPath)
detectionNetwork.setConfidenceThreshold(0.5)
detectionNetwork.input.setBlocking(True)
# Linking
manip.out.link(manipOut.input)
manip.out.link(detectionNetwork.input)
xinFrame.out.link(manip.inputImage)
detectionNetwork.out.link(nnOut.input)
detectionNetwork.passthrough.link(xlinkOut.input)
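# Resulting graph:
#   inFrame (host) -> manip -> manipOut ("manip" stream, host)
#                          '-> detectionNetwork -> nnOut ("nn" stream, host)
#                              detectionNetwork.passthrough -> xlinkOut ("nnFrame" stream, host)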
# Connect and start the pipeline
with dai.Device(pipeline) as device:
    qIn = device.getInputQueue(name="inFrame")
    nnFrameQ = device.getOutputQueue(name="nnFrame", maxSize=4)
    qManip = device.getOutputQueue(name="manip", maxSize=4)
    qDet = device.getOutputQueue(name="nn", maxSize=4)

    startTime = time.monotonic()
    counter = 0
    fps = 0
    detections = []
    frame = None

    def to_planar(arr: np.ndarray, shape: tuple) -> np.ndarray:
        return cv2.resize(arr, shape).transpose(2, 0, 1).flatten()

    # nn data, being the bounding box locations, are in <0..1> range - they need to be normalized with frame width/height
    def frameNorm(frame, bbox):
        normVals = np.full(len(bbox), frame.shape[0])
        normVals[::2] = frame.shape[1]
        return (np.clip(np.array(bbox), 0, 1) * normVals).astype(int)

    def displayFrame(name, frame):
        for detection in detections:
            bbox = frameNorm(frame, (detection.xmin, detection.ymin, detection.xmax, detection.ymax))
            cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (255, 0, 0), 2)
            cv2.putText(frame, labelMap[detection.label], (bbox[0] + 10, bbox[1] + 20), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
            cv2.putText(frame, f"{int(detection.confidence * 100)}%", (bbox[0] + 10, bbox[1] + 40), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
        cv2.imshow(name, frame)

    cap = cv2.VideoCapture(args.videoPath)
    baseTs = time.monotonic()
    simulatedFps = 30
    inputFrameShape = (256, 256)

    while cap.isOpened():
        read_correctly, frame = cap.read()
        if not read_correctly:
            break

        img = dai.ImgFrame()
        img.setType(dai.ImgFrame.Type.BGR888p)
        img.setData(to_planar(frame, inputFrameShape))
        img.setTimestamp(baseTs)
        baseTs += 1/simulatedFps
        img.setWidth(inputFrameShape[0])
        img.setHeight(inputFrameShape[1])
        qIn.send(img)
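        # Non-blocking read of the passthrough stream: when nothing has arrived
        # yet we skip the blocking get() calls below, so the host can keep
        # sending frames (this tryGet()/continue is what the second code drops)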
        nnFrame = nnFrameQ.tryGet()
        if nnFrame is None:
            continue

        manip = qManip.get()
        inDet = qDet.get()

        counter += 1
        current_time = time.monotonic()
        if (current_time - startTime) > 1:
            fps = counter / (current_time - startTime)
            counter = 0
            startTime = current_time

        detections = inDet.detections
        manipFrame = manip.getCvFrame()
        displayFrame("nn", manipFrame)

        color = (255, 0, 0)
        nninFrame = nnFrame.getCvFrame()
        cv2.putText(nninFrame, "Fps: {:.2f}".format(fps), (2, nninFrame.shape[0] - 4), cv2.FONT_HERSHEY_TRIPLEX, 0.4, color)
        cv2.imshow("nnframe", nninFrame)

        if cv2.waitKey(1) == ord('q'):
            break
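Second code, only the part that differs (a sketch, since I can't attach the file; everything outside the loop is identical except that the "nnFrame" stream and its passthrough link are removed): without nnFrameQ and the tryGet()/continue, every send now waits directly on the blocking get() calls.
    while cap.isOpened():
        read_correctly, frame = cap.read()
        if not read_correctly:
            break

        img = dai.ImgFrame()
        img.setType(dai.ImgFrame.Type.BGR888p)
        img.setData(to_planar(frame, inputFrameShape))
        img.setTimestamp(baseTs)
        baseTs += 1/simulatedFps
        img.setWidth(inputFrameShape[0])
        img.setHeight(inputFrameShape[1])
        qIn.send(img)

        # no passthrough queue and no tryGet()/continue anymore:
        # every frame now blocks here until the device returns its results
        manip = qManip.get()
        inDet = qDet.get()
        # ... rest identical to the first code (FPS counting and display)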