MohamedAsker

  • 8 days ago
  • MohamedAsker I also tried not using an XLinkIn node; instead of using the build function, I set the NNArchive directly, like this:

    neuralNetwork = pipeline.create(ParsingNeuralNetwork)
    neuralNetwork.setNNArchive(modelArchive)       # set the archive directly instead of build()
    qRgb = neuralNetwork.input.createInputQueue()  # host -> device frames
    qDet = neuralNetwork.out.createOutputQueue()   # device -> host detections

    and I still get 53 FPS. Is this the expected behaviour, or is there something that could improve it? I also get this warning before inference starts:

    [3260625470] [192.168.178.22] [1742473775.004] [DetectionParser(3)] [warning] Did not get the input image sizes from the imageIn input. Defaulting to 416 x 416
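    One thing I may still be missing (an assumption on my part, not a confirmed fix): the generated pipeline in the reply below shows the DetectionParser's imageIn input unconnected, which matches the warning text, so setting the frame dimensions explicitly before sending might let the parser pick up the real input size:

    inputFrame = dai.ImgFrame()
    inputFrame.setCvFrame(testImg, frameType)
    # Assumption: the parser may read the size from the frame metadata;
    # these are the same setters used in my older script further down.
    inputFrame.setWidth(inputSize[0])
    inputFrame.setHeight(inputSize[1])
    qRgb.send(inputFrame)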

    • MohamedAsker Here is the generated pipeline, in case it helps:

      {"connections":[{"node1Id":7,"node1Output":"out","node1OutputGroup":"","node2Id":2,"node2Input":"in","node2InputGroup":""}

      ,{"node1Id":4,"node1Output":"out","node1OutputGroup":"","node2Id":8,"node2Input":"in","node2InputGroup":""}

      ,{"node1Id":2,"node1Output":"out","node1OutputGroup":"","node2Id":4,"node2Input":"in","node2InputGroup":""}

      ,{"node1Id":0,"node1Output":"out","node1OutputGroup":"","node2Id":2,"node2Input":"in","node2InputGroup":""}

      ],"globalProperties":{"calibData":null,"cameraTuningBlobSize":null,"cameraTuningBlobUri":"","leonCssFrequencyHz":700000000.0,"leonMssFrequencyHz":700000000.0,"pipelineName":null,"pipelineVersion":null,"sippBufferSize":18432,"sippDmaBufferSize":16384,"xlinkChunkSize":-1}

      ,"nodes":[[8,{"alias":"","id":8,"ioInfo":[[["","in"],{"blocking":true,"group":"","id":8,"name":"in","queueSize":3,"type":3,"waitForMessage":false}

      ]],"logLevel":3,"name":"XLinkOut","parentId":-1,"properties":[185,3,136,0,0,128,191,189,9,95,95,120,95,52,95,111,117,116,0]}

      ],[7,{"alias":"","id":7,"ioInfo":[[["","out"],{"blocking":false,"group":"","id":7,"name":"out","queueSize":8,"type":0,"waitForMessage":false}

      ]],"logLevel":3,"name":"XLinkIn","parentId":-1,"properties":[185,3,189,9,95,95,120,95,50,95,95,105,110,130,0,0,80,0,8]}

      ],[4,{"alias":"","id":4,"ioInfo":[[["","out"],{"blocking":false,"group":"","id":6,"name":"out","queueSize":8,"type":0,"waitForMessage":false}

      ],[["","imageIn"],{"blocking":true,"group":"","id":5,"name":"imageIn","queueSize":5,"type":3,"waitForMessage":true}

      ],[["","in"],{"blocking":true,"group":"","id":4,"name":"in","queueSize":5,"type":3,"waitForMessage":true}

      ]],"logLevel":3,"name":"DetectionParser","parentId":-1,"properties":[185,3,8,187,0,185,8,0,136,0,0,0,63,8,4,186,0,187,0,186,0,136,0,0,0,63]}

      ],[2,{"alias":"","id":2,"ioInfo":[[["","passthrough"],{"blocking":false,"group":"","id":3,"name":"passthrough","queueSize":8,"type":0,"waitForMessage":false}

      ],[["","out"],{"blocking":false,"group":"","id":2,"name":"out","queueSize":8,"type":0,"waitForMessage":false}

      ],[["","in"],{"blocking":true,"group":"","id":1,"name":"in","queueSize":3,"type":3,"waitForMessage":true}

      ]],"logLevel":3,"name":"NeuralNetwork","parentId":-1,"properties":[185,10,1,190,189,0,189,13,97,115,115,101,116,58,95,95,109,111,100,101,108,8,0,0,0,189,0,187,0]}

      ],[0,{"alias":"","id":0,"ioInfo":[[["","out"],{"blocking":false,"group":"","id":0,"name":"out","queueSize":8,"type":0,"waitForMessage":false}

      ]],"logLevel":3,"name":"XLinkIn","parentId":-1,"properties":[185,3,189,0,130,0,0,80,0,8]}

      ]]}

      • Thanks for your reply. I now have a script that runs, but I had to create an XLinkIn node and connect it to the neural network to be able to build it. The problem is that I am running a YOLOv6n 640 model and it only reaches 53 FPS, while it ran at more than 200 FPS during benchmarking. What could cause such a large drop?

        Here is the script that I have now:

        #!/usr/bin/env python3

        import random
        import time

        import cv2
        import depthai as dai
        import numpy as np
        from depthai_nodes import ParsingNeuralNetwork

        device = dai.Device()
        modelPath = "./.depthai_cached_models/bk_gh_od_02_12_24_640_best_objects.rvc4.tar.xz"
        modelArchive = dai.NNArchive(modelPath)
        inputSize = modelArchive.getInputSize()

        type = modelArchive.getConfig().model.inputs[0].preprocessing.daiType
        if type:
            try:
                frameType = dai.ImgFrame.Type.BGR888i
            except AttributeError:
                type = None

        testImg = cv2.imread("/home/asker/oak4/depthai-core/test.jpg")
        testImg2 = cv2.imread("/home/asker/oak4/depthai-core/test2.jpg")
        testImg = cv2.resize(testImg, (inputSize[0], inputSize[1]))
        testImg2 = cv2.resize(testImg2, (inputSize[0], inputSize[1]))

        with dai.Pipeline(device) as pipeline:
            xLinkIn = pipeline.create(dai.node.XLinkIn)
            neuralNetwork = pipeline.create(ParsingNeuralNetwork).build(
                xLinkIn.out,
                modelArchive,
            )
            qRgb = neuralNetwork.input.createInputQueue(blocking=False, maxSize=8)
            qDet = neuralNetwork.out.createOutputQueue()
            pipeline.start()

            frame = None
            detections = []
            startTime = time.time()
            counter = 0
            color2 = (255, 255, 255)

            inputFrame = dai.ImgFrame()
            if random.choice([True, False]):
                inputFrame.setCvFrame(testImg, frameType)
            else:
                inputFrame.setCvFrame(testImg2, frameType)

            while pipeline.isRunning():
                qRgb.send(inputFrame)
                inDet: dai.ImgDetections = qDet.get()
                if inDet is not None:
                    detections = inDet.detections
                    counter += 1
                if time.time() - startTime > 1:
                    print("FPS: {:.2f}".format(counter / (time.time() - startTime)))
                    counter = 0
                    startTime = time.time()
                if cv2.waitKey(1) == ord("q"):
                    pipeline.stop()
                    break
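        One variant I am considering for the FPS question (a sketch, under the assumption that the blocking qDet.get() after each send is what serializes the loop; not verified): feed frames from a separate thread so inference never waits on a host round trip, using the same qRgb/qDet queues as above:

        import threading

        def feeder():
            # Keep the device input queue topped up; the non-blocking queue
            # created above (maxSize=8) will drop excess frames.
            while pipeline.isRunning():
                qRgb.send(inputFrame)

        threading.Thread(target=feeder, daemon=True).start()

        while pipeline.isRunning():
            inDet = qDet.get()  # only the result path stays in the main loop
            if inDet is not None:
                detections = inDet.detections
                counter += 1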

        • Hello, I am trying to run inference on an OAK 4 S with a custom YOLO model, but feeding images that are loaded locally. In the code I am providing, I just alternate randomly between two frames, but ideally this would be a stream of images. I could run the pipeline using the benchmark node, but it seems to pick up only the first image and ignore the following ones, despite my feeding them to the input queue. I also tried using XLinkIn and XLinkOut, but that does not work for me, and I am lacking documentation for these two nodes in the new version of the API. Here is the code that I created.

          import random
          import time

          import cv2
          import depthai as dai
          import numpy as np

          device = dai.Device()
          modelPath = "/home/asker/oak4/depthai-core/examples/python/.depthai_cached_models/bk_gh_od_02_12_24_640_best_objects.rvc4.tar.xz"
          modelArchive = dai.NNArchive(modelPath)
          inputSize = modelArchive.getInputSize()

          type = modelArchive.getConfig().model.inputs[0].preprocessing.daiType
          if type:
              try:
                  frameType = dai.ImgFrame.Type.BGR888i
              except AttributeError:
                  type = None

          testImg = cv2.imread("/home/asker/oak4/depthai-core/test.jpg")
          print(inputSize)
          testImg = cv2.resize(testImg, (inputSize[0], inputSize[1]))
          # Construct the input (white) image for benchmarking
          img = np.ones((inputSize[1], inputSize[0], 3), np.uint8) * 255
          inputFrame = dai.ImgFrame()
          inputFrame.setCvFrame(testImg, frameType)

          with dai.Pipeline(device) as p:
              benchmarkOut = p.create(dai.node.BenchmarkOut)
              benchmarkOut.setRunOnHost(False)  # The node can run on host or on device
              benchmarkOut.setFps(-1)  # As fast as possible

              neuralNetwork = p.create(dai.node.DetectionNetwork).build(
                  benchmarkOut.out, modelArchive)
              # labelMap = neuralNetwork.getClasses()

              benchmarkIn = p.create(dai.node.BenchmarkIn)
              benchmarkIn.setRunOnHost(False)  # The node can run on host or on device
              benchmarkIn.sendReportEveryNMessages(100)
              benchmarkIn.logReportsAsWarnings(False)
              neuralNetwork.out.link(benchmarkIn.input)

              outputQueue = benchmarkIn.report.createOutputQueue()
              inputQueue = benchmarkOut.input.createInputQueue()
              qRgb = neuralNetwork.passthrough.createOutputQueue()
              qDet = neuralNetwork.out.createOutputQueue()
              p.start()

              frame = None
              detections = []
              startTime = time.monotonic()
              counter = 0
              color2 = (255, 255, 255)

              # nn data (bounding box locations) are in the <0..1> range -
              # they need to be normalized with the frame width/height
              def frameNorm(frame, bbox):
                  normVals = np.full(len(bbox), frame.shape[0])
                  normVals[::2] = frame.shape[1]
                  return (np.clip(np.array(bbox), 0, 1) * normVals).astype(int)

              def displayFrame(name, frame):
                  color = (255, 0, 0)
                  for detection in detections:
                      bbox = frameNorm(
                          frame,
                          (detection.xmin, detection.ymin, detection.xmax, detection.ymax),
                      )
                      print(f"{bbox=}")
                      cv2.putText(frame, "class", (bbox[0] + 10, bbox[1] + 20),
                                  cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
                      cv2.putText(frame, f"{int(detection.confidence * 100)}%",
                                  (bbox[0] + 10, bbox[1] + 40),
                                  cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
                      cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, 2)
                  # Show the frame
                  cv2.imshow(name, frame)

              while p.isRunning():
                  if random.choice([True, False]):
                      inputFrame.setCvFrame(testImg, frameType)
                  else:
                      inputFrame.setCvFrame(img, frameType)
                  inputQueue.send(inputFrame)  # Send the input image only once
                  # benchmarkReport = outputQueue.get()
                  # time.sleep(0.01)
                  inRgb: dai.ImgFrame = qRgb.get()
                  inDet: dai.ImgDetections = qDet.get()
                  if inRgb is not None:
                      frame = inRgb.getCvFrame()
                      cv2.putText(frame,
                                  "NN fps: {:.2f}".format(counter / (time.monotonic() - startTime)),
                                  (2, frame.shape[0] - 4),
                                  cv2.FONT_HERSHEY_TRIPLEX, 0.4, color2)
                  if inDet is not None:
                      detections = inDet.detections
                      print(inDet.detections)
                      counter += 1
                  if frame is not None:
                      displayFrame("rgb", frame)
                      print("FPS: {:.2f}".format(counter / (time.monotonic() - startTime)))
                      if cv2.waitKey(1) == ord("q"):
                          p.stop()
                          break
                  # assert isinstance(benchmarkReport, dai.BenchmarkReport)
                  # print(f"FPS is {benchmarkReport.fps}")

          The question is: what is the best way to feed images to a neural network using the new pipeline? Should I use XLink, or is there a new node that can handle this? If it is XLink, can you provide some guidance on how to use it and how to define the pipeline (queues, links, etc.)?
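          For reference, here is a minimal sketch of what I mean by feeding images directly; it assumes frames can be sent straight to an input queue created on the network's input (as in the newer script above in this thread), with the BenchmarkOut link removed, since BenchmarkOut seems to keep replaying the first message it receives:

          # Hypothetical variant: create the host-side queue on the network input itself.
          inputQueue = neuralNetwork.input.createInputQueue()
          qDet = neuralNetwork.out.createOutputQueue()
          p.start()

          while p.isRunning():
              inputFrame = dai.ImgFrame()
              inputFrame.setCvFrame(testImg, frameType)
              inputQueue.send(inputFrame)  # a fresh message every iteration
              inDet = qDet.get()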

          Thanks in advance.

          • Hello, I am running some preliminary experiments with an OAK 4 S camera, using a custom YOLOv6n model with an input size of 640 x 640. I am feeding a static image, so in this case I am not using the camera feed as input to the neural network. Everything seems to run at roughly 88 FPS, but I get this warning at the beginning of each run:
            [3260625470] [192.168.178.22] [1742226352.711] [DetectionParser(4)] [warning] Did not get the input image sizes from the imageIn input. Defaulting to 416 x 416

            I am not sure where I should set this, or whether it affects the inference or not.

            Here is the code I used:

            import depthai as dai

            import numpy as np

            import time

            import cv2

            from depthai_nodes import ParsingNeuralNetwork

            device = dai.Device()
            modelPath = "./depthai-core/examples/python/.depthai_cached_models/bk_gh_od_02_12_24_640_best_objects.rvc4.tar.xz"

            modelArchive = dai.NNArchive(modelPath)
            inputSize = modelArchive.getInputSize()

            type = modelArchive.getConfig().model.inputs[0].preprocessing.daiType
            if type:
                try:
                    frameType = dai.ImgFrame.Type.BGR888i
                except AttributeError:
                    type = None

            testImg = cv2.imread("/home/asker/oak4/depthai-core/test.jpg")
            print(inputSize)
            testImg = cv2.resize(testImg, (inputSize[0], inputSize[1]))

            inputFrame = dai.ImgFrame()

            inputFrame.setWidth(inputSize[0])

            inputFrame.setHeight(inputSize[1])

            inputFrame.setCvFrame(testImg, frameType)

            print(inputFrame)

            with dai.Pipeline(device) as p:
                benchmarkOut = p.create(dai.node.BenchmarkOut)
                benchmarkOut.setRunOnHost(False)
                benchmarkOut.setFps(-1)

                neuralNetwork = p.create(ParsingNeuralNetwork).build(
                    benchmarkOut.out,
                    modelArchive,
                )
                parser_output_queue = neuralNetwork.out.createOutputQueue()

                benchmarkIn = p.create(dai.node.BenchmarkIn)
                benchmarkIn.setRunOnHost(False)
                benchmarkIn.sendReportEveryNMessages(100)
                benchmarkIn.logReportsAsWarnings(False)
                neuralNetwork.out.link(benchmarkIn.input)

                outputQueue = benchmarkIn.report.createOutputQueue()
                inputQueue = benchmarkOut.input.createInputQueue()
                qRgb = neuralNetwork.passthrough.createOutputQueue()
                qDet = neuralNetwork.out.createOutputQueue()
                p.start()

                frame = None
                detections = []
                startTime = time.time()
                counter = 0
                color2 = (255, 255, 255)

                def frameNorm(frame, bbox):
                    normVals = np.full(len(bbox), frame.shape[0])
                    normVals[::2] = frame.shape[1]
                    return (np.clip(np.array(bbox), 0, 1) * normVals).astype(int)

                def displayFrame(name, frame):
                    color = (255, 0, 0)
                    for detection in detections:
                        bbox = frameNorm(
                            frame,
                            (detection.xmin, detection.ymin, detection.xmax, detection.ymax),
                        )
                        # print(f"{bbox=}")
                        cv2.putText(frame, "class", (bbox[0] + 10, bbox[1] + 20),
                                    cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
                        cv2.putText(frame, f"{int(detection.confidence * 100)}%",
                                    (bbox[0] + 10, bbox[1] + 40),
                                    cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
                        cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, 2)
                    cv2.imshow(name, frame)

                while p.isRunning():
                    inputFrame = dai.ImgFrame()
                    inputFrame.setWidth(inputSize[0])
                    inputFrame.setHeight(inputSize[1])
                    inputFrame.setCvFrame(testImg, frameType)
                    inputQueue.send(inputFrame)
                    # benchmarkReport = outputQueue.get()
                    # time.sleep(0.01)
                    # print(f"{benchmarkReport=}")
                    inRgb: dai.ImgFrame = qRgb.get()
                    inDet: dai.ImgDetections = qDet.get()
                    if inRgb is not None:
                        frame = inRgb.getCvFrame()
                        cv2.putText(frame,
                                    "NN fps: {:.2f}".format(counter / (time.time() - startTime)),
                                    (2, frame.shape[0] - 4),
                                    cv2.FONT_HERSHEY_TRIPLEX, 0.4, color2)
                    if inDet is not None:
                        detections = inDet.detections
                        counter += 1
                    if frame is not None:
                        displayFrame("rgb", frame)
                    if time.time() - startTime > 10:
                        print("FPS: {:.2f}".format(counter / (time.time() - startTime)))
                        counter = 0
                        startTime = time.time()
                    if cv2.waitKey(1) == ord("q"):
                        p.stop()
                        break
                    # assert isinstance(benchmarkReport, dai.BenchmarkReport)
                    # print(f"FPS is {benchmarkReport.fps}")

            • Thanks for your reply. I already tried removing this part from the loop, and the Python code still runs at a higher FPS. One thing that might be interesting: when I use get() instead of tryGet(), the C++ code becomes faster, at 19.5 FPS compared to 18.25 with Python.
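              For context, the difference between the two calls as I understand it (standard queue semantics, shown for illustration only):

              in_det = q_det.get()     # blocking: waits until the next ImgDetections arrives
              in_det = q_det.tryGet()  # non-blocking: returns None if nothing is ready yet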

              • Hello,

                I've been running a YOLOv6n model (512*512) on an OAK-D Wide camera, using a static image loaded from disk as the feed to the NN to get detections with a custom YOLO model. I used both C++ and Python with almost exactly the same code, and I can see that the C++ code runs at 38-39 FPS while the Python code runs at 40-41 FPS. I was expecting the C++ code to be at least slightly faster, and I could not figure out why it is the other way around.
                Here is the Python code:
                import cv2

                import depthai as dai

                import numpy as np

                import time

                import cProfile

                import pstats
                label_map = ["1", "2", "3", "4", "5", "6", "7", "8"]
                nn_path = "./configuration/yolov6n_classification_real.blob"
                pipeline = dai.Pipeline()

                xin_frame = pipeline.create(dai.node.XLinkIn)

                nn = pipeline.create(dai.node.YoloDetectionNetwork)

                nn_out = pipeline.create(dai.node.XLinkOut)
                xin_frame.setStreamName("input")

                nn_out.setStreamName("nn")
                nn.setBlobPath(nn_path)

                nn.setNumClasses(8)

                nn.setCoordinateSize(4)

                nn.setIouThreshold(0.5)

                nn.input.setQueueSize(2)

                nn.setNumInferenceThreads(2)

                nn.input.setBlocking(False)
                xin_frame.out.link(nn.input)

                nn.out.link(nn_out.input)
                device = dai.Device(pipeline)

                q_input = device.getInputQueue("input")

                q_det = device.getOutputQueue("nn", 4, False)
                image_path = "./ee.jpg"

                frame = cv2.imread(image_path)
                if frame is None:
                    print(f"Failed to load image: {image_path}")
                    exit(-1)
                profiler = cProfile.Profile()

                profiler.enable()
                start_time = time.time()

                counter = 0

                for i in range(20000):
                    resized_frame = cv2.resize(frame, (512, 512))
                    img_data = resized_frame.transpose(2, 0, 1).flatten()
                    img_frame = dai.ImgFrame()
                    img_frame.setData(img_data)
                    img_frame.setWidth(512)
                    img_frame.setHeight(512)
                    img_frame.setType(dai.ImgFrame.Type.BGR888p)
                    q_input.send(img_frame)
                    in_det = q_det.tryGet()

                    if time.time() - start_time > 10:
                        fps = counter / (time.time() - start_time)
                        print(f"FPS: {fps:.2f}")
                        counter = 0
                        start_time = time.time()

                    if in_det:
                        counter += 1
                        detections = in_det.detections
                        for detection in detections:
                            x1 = int(detection.xmin * resized_frame.shape[1])
                            y1 = int(detection.ymin * resized_frame.shape[0])
                            x2 = int(detection.xmax * resized_frame.shape[1])
                            y2 = int(detection.ymax * resized_frame.shape[0])
                            confidence = detection.confidence * 100
                            if confidence >= 50:
                                label_str = (label_map[detection.label]
                                             if detection.label < len(label_map)
                                             else str(detection.label))
                                cv2.rectangle(resized_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                                cv2.putText(resized_frame, label_str, (x1, y1 - 10),
                                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

                profiler.disable()
                stats = pstats.Stats(profiler)
                stats.sort_stats(pstats.SortKey.TIME)
                stats.print_stats(20)

                cv2.destroyAllWindows()

                and here is the C++ code:

                #include <chrono>

                #include <cstdio>

                #include <iostream>

                #include <filesystem>

                #include <opencv2/opencv.hpp>

                #include "depthai/depthai.hpp"

                using namespace std;

                using namespace std::chrono;
                static const std::vector<std::string> labelMap = { "1", "2", "3", "4", "5", "6", "7", "8"};

                int main(int argc, char** argv) {
                    std::string nnPath = "./configuration/yolov6n_classification_real.blob";

                    dai::Pipeline pipeline;
                    auto xinFrame = pipeline.create<dai::node::XLinkIn>();
                    xinFrame->setStreamName("input");
                    auto nn = pipeline.create<dai::node::YoloDetectionNetwork>();
                    auto nnOut = pipeline.create<dai::node::XLinkOut>();
                    nnOut->setStreamName("nn");

                    nn->setNumClasses(8);
                    nn->setCoordinateSize(4);
                    nn->setIouThreshold(0.5);
                    nn->input.setQueueSize(2);
                    nn->setNumInferenceThreads(2);
                    nn->input.setBlocking(false);

                    dai::OpenVINO::Blob blob(nnPath);
                    nn->setBlob(blob);

                    xinFrame->out.link(nn->input);
                    nn->out.link(nnOut->input);

                    dai::Device device(pipeline, dai::UsbSpeed::SUPER_PLUS);
                    auto qInput = device.getInputQueue("input");
                    auto qDet = device.getOutputQueue("nn", 4, false);

                    std::string imagePath = "ee.jpg";
                    cv::Mat frame = cv::imread(imagePath);
                    if(frame.empty()) {
                        std::cerr << "Failed to load image: " << imagePath << std::endl;
                        return -1;
                    }

                    auto startTime = steady_clock::now();
                    int counter = 0;
                    float fps = 0;

                    for(int i = 0; i < 20000; i++) {
                        cv::Mat resizedFrame;
                        cv::resize(frame, resizedFrame, cv::Size(512, 512));

                        std::vector<uint8_t> imgData(512 * 512 * 3);
                        std::vector<cv::Mat> channels(3);
                        cv::split(resizedFrame, channels);
                        std::memcpy(imgData.data(), channels[0].data, 512 * 512);
                        std::memcpy(imgData.data() + 512 * 512, channels[1].data, 512 * 512);
                        std::memcpy(imgData.data() + 2 * 512 * 512, channels[2].data, 512 * 512);

                        auto imgFrame = std::make_shared<dai::ImgFrame>();
                        imgFrame->setData(imgData);
                        imgFrame->setWidth(512);
                        imgFrame->setHeight(512);
                        imgFrame->setType(dai::ImgFrame::Type::BGR888p);
                        qInput->send(imgFrame);

                        std::shared_ptr<dai::ImgDetections> inDet = qDet->tryGet<dai::ImgDetections>();

                        auto currentTime = steady_clock::now();
                        auto elapsed = duration_cast<duration<float>>(currentTime - startTime);
                        if(elapsed > seconds(1)) {
                            fps = counter / elapsed.count();
                            std::cout << fps << std::endl;
                            counter = 0;
                            startTime = steady_clock::now();
                        }

                        if(inDet) {
                            counter++;
                            std::vector<dai::ImgDetection> detections = inDet->detections;
                            for(auto& detection : detections) {
                                int x1 = detection.xmin * resizedFrame.cols;
                                int y1 = detection.ymin * resizedFrame.rows;
                                int x2 = detection.xmax * resizedFrame.cols;
                                int y2 = detection.ymax * resizedFrame.rows;
                                int confidence = detection.confidence * 100;
                                if(confidence >= 50) {
                                    std::string labelStr = (detection.label < labelMap.size())
                                                               ? labelMap[detection.label]
                                                               : std::to_string(detection.label);
                                    cv::rectangle(resizedFrame, cv::Rect(cv::Point(x1, y1), cv::Point(x2, y2)), cv::Scalar(0, 255, 0), 2);
                                    cv::putText(resizedFrame, labelStr, cv::Point(x1, y1 - 10), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 0), 1);
                                }
                            }
                        }
                    }

                    return 0;
                }

                Can you please point out what could be the reason for this difference, and is there a way to optimize the C++ code to reduce latency?
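                For what it's worth, one optimization that applies to both versions (a sketch, assuming the per-frame host work matters here at all): the input image never changes, so the resize and planar repacking can be hoisted out of the loop. Shown on the Python side; the same idea applies to the cv::split/memcpy block in the C++ version:

                # Prepare the frame once; nothing about it changes inside the loop.
                resized_frame = cv2.resize(frame, (512, 512))
                img_frame = dai.ImgFrame()
                img_frame.setData(resized_frame.transpose(2, 0, 1).flatten())
                img_frame.setWidth(512)
                img_frame.setHeight(512)
                img_frame.setType(dai.ImgFrame.Type.BGR888p)

                for i in range(20000):
                    q_input.send(img_frame)  # only send/receive remain per iteration
                    in_det = q_det.tryGet()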