• DepthAI
  • C++ API runs slower than Python API for YOLO inference

Hello,

I've been running a YOLOv6n model (512×512) on an OAK-D Wide camera, feeding a static image loaded from disk into the NN to get detections with a custom YOLO model. I used both C++ and Python with almost exactly the same code, and the C++ code runs at 38-39 FPS while the Python code runs at 40-41 FPS. I was expecting the C++ code to be at least slightly faster and I couldn't figure out why it is the other way around.
Here is the Python code:
import cv2
import depthai as dai
import numpy as np
import time
import cProfile
import pstats

label_map = ["1", "2", "3", "4", "5", "6", "7", "8"]
nn_path = "./configuration/yolov6n_classification_real.blob"

# Pipeline: XLinkIn -> YoloDetectionNetwork -> XLinkOut
pipeline = dai.Pipeline()

xin_frame = pipeline.create(dai.node.XLinkIn)
nn = pipeline.create(dai.node.YoloDetectionNetwork)
nn_out = pipeline.create(dai.node.XLinkOut)

xin_frame.setStreamName("input")
nn_out.setStreamName("nn")

nn.setBlobPath(nn_path)
nn.setNumClasses(8)
nn.setCoordinateSize(4)
nn.setIouThreshold(0.5)
nn.input.setQueueSize(2)
nn.setNumInferenceThreads(2)
nn.input.setBlocking(False)

xin_frame.out.link(nn.input)
nn.out.link(nn_out.input)

device = dai.Device(pipeline)
q_input = device.getInputQueue("input")
q_det = device.getOutputQueue("nn", 4, False)

image_path = "./ee.jpg"
frame = cv2.imread(image_path)
if frame is None:
    print(f"Failed to load image: {image_path}")
    exit(-1)

profiler = cProfile.Profile()
profiler.enable()

start_time = time.time()
counter = 0

for i in range(20000):
    # Resize and repack the static image to planar BGR, then send it to the device
    resized_frame = cv2.resize(frame, (512, 512))
    img_data = resized_frame.transpose(2, 0, 1).flatten()

    img_frame = dai.ImgFrame()
    img_frame.setData(img_data)
    img_frame.setWidth(512)
    img_frame.setHeight(512)
    img_frame.setType(dai.ImgFrame.Type.BGR888p)
    q_input.send(img_frame)

    in_det = q_det.tryGet()

    if time.time() - start_time > 10:
        fps = counter / (time.time() - start_time)
        print(f"FPS: {fps:.2f}")
        counter = 0
        start_time = time.time()

    if in_det:
        counter += 1
        detections = in_det.detections
        for detection in detections:
            x1 = int(detection.xmin * resized_frame.shape[1])
            y1 = int(detection.ymin * resized_frame.shape[0])
            x2 = int(detection.xmax * resized_frame.shape[1])
            y2 = int(detection.ymax * resized_frame.shape[0])
            confidence = detection.confidence * 100
            if confidence >= 50:
                label_str = label_map[detection.label] if detection.label < len(label_map) else str(detection.label)
                cv2.rectangle(resized_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(resized_frame, label_str, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

profiler.disable()
stats = pstats.Stats(profiler)
stats.sort_stats(pstats.SortKey.TIME)
stats.print_stats(20)

cv2.destroyAllWindows()

and here is the C++ code:

#include <chrono>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <filesystem>

#include <opencv2/opencv.hpp>

#include "depthai/depthai.hpp"

using namespace std;
using namespace std::chrono;

static const std::vector<std::string> labelMap = {"1", "2", "3", "4", "5", "6", "7", "8"};

int main(int argc, char** argv) {
    std::string nnPath = "./configuration/yolov6n_classification_real.blob";

    // Pipeline: XLinkIn -> YoloDetectionNetwork -> XLinkOut
    dai::Pipeline pipeline;

    auto xinFrame = pipeline.create<dai::node::XLinkIn>();
    xinFrame->setStreamName("input");

    auto nn = pipeline.create<dai::node::YoloDetectionNetwork>();
    auto nnOut = pipeline.create<dai::node::XLinkOut>();
    nnOut->setStreamName("nn");

    nn->setNumClasses(8);
    nn->setCoordinateSize(4);
    nn->setIouThreshold(0.5);
    nn->input.setQueueSize(2);
    nn->setNumInferenceThreads(2);
    nn->input.setBlocking(false);

    dai::OpenVINO::Blob blob(nnPath);
    nn->setBlob(blob);

    xinFrame->out.link(nn->input);
    nn->out.link(nnOut->input);

    dai::Device device(pipeline, dai::UsbSpeed::SUPER_PLUS);

    auto qInput = device.getInputQueue("input");
    auto qDet = device.getOutputQueue("nn", 4, false);

    std::string imagePath = "ee.jpg";
    cv::Mat frame = cv::imread(imagePath);
    if (frame.empty()) {
        std::cerr << "Failed to load image: " << imagePath << std::endl;
        return -1;
    }

    auto startTime = steady_clock::now();
    int counter = 0;
    float fps = 0;

    for (int i = 0; i < 20000; i++) {
        // Resize and repack the static image into a planar BGR buffer, then send it to the device
        cv::Mat resizedFrame;
        cv::resize(frame, resizedFrame, cv::Size(512, 512));

        std::vector<uint8_t> imgData(512 * 512 * 3);
        std::vector<cv::Mat> channels(3);
        cv::split(resizedFrame, channels);
        std::memcpy(imgData.data(), channels[0].data, 512 * 512);
        std::memcpy(imgData.data() + 512 * 512, channels[1].data, 512 * 512);
        std::memcpy(imgData.data() + 2 * 512 * 512, channels[2].data, 512 * 512);

        auto imgFrame = std::make_shared<dai::ImgFrame>();
        imgFrame->setData(imgData);
        imgFrame->setWidth(512);
        imgFrame->setHeight(512);
        imgFrame->setType(dai::ImgFrame::Type::BGR888p);

        qInput->send(imgFrame);

        std::shared_ptr<dai::ImgDetections> inDet = qDet->tryGet<dai::ImgDetections>();

        auto currentTime = steady_clock::now();
        auto elapsed = duration_cast<duration<float>>(currentTime - startTime);
        if (elapsed > seconds(1)) {
            fps = counter / elapsed.count();
            std::cout << fps << std::endl;
            counter = 0;
            startTime = steady_clock::now();
        }

        if (inDet) {
            counter++;
            std::vector<dai::ImgDetection> detections = inDet->detections;
            for (auto& detection : detections) {
                int x1 = detection.xmin * resizedFrame.cols;
                int y1 = detection.ymin * resizedFrame.rows;
                int x2 = detection.xmax * resizedFrame.cols;
                int y2 = detection.ymax * resizedFrame.rows;
                int confidence = detection.confidence * 100;
                if (confidence >= 50) {
                    std::string labelStr = (detection.label < labelMap.size()) ? labelMap[detection.label] : std::to_string(detection.label);
                    cv::rectangle(resizedFrame, cv::Rect(cv::Point(x1, y1), cv::Point(x2, y2)), cv::Scalar(0, 255, 0), 2);
                    cv::putText(resizedFrame, labelStr, cv::Point(x1, y1 - 10), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 0), 1);
                }
            }
        }
    }

    return 0;
}

Can you please point out what could be causing this difference, and is there a way to optimize the C++ code to reduce latency?

    MohamedAsker
    cv::split(resizedFrame, channels);
    std::memcpy(imgData.data(), channels[0].data, 512 * 512);
    std::memcpy(imgData.data() + 512 * 512, channels[1].data, 512 * 512);
    std::memcpy(imgData.data() + 2 * 512 * 512, channels[2].data, 512 * 512);

    vs

    MohamedAsker
    img_data = resized_frame.transpose(2, 0, 1).flatten()

    The Python one uses NumPy, which is very fast and does less memory copying than your C++ implementation.
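
    A rough sketch (untested) of how those extra copies could be trimmed on the C++ side: give cv::split destination Mat headers that view the planar buffer directly, so the three std::memcpy calls are no longer needed. (toPlanar is just an illustrative helper name, not part of the original code or the DepthAI API.)

    #include <cstdint>
    #include <vector>
    #include <opencv2/opencv.hpp>

    // Sketch: pack a 512x512 interleaved BGR image into a planar (BGR888p) buffer.
    std::vector<uint8_t> toPlanar(const cv::Mat& bgr) {
        std::vector<uint8_t> imgData(512 * 512 * 3);
        // Mat headers that point into imgData; since sizes and types already match,
        // cv::split writes straight into this buffer instead of allocating new
        // planes that then have to be copied.
        cv::Mat planes[3] = {
            cv::Mat(512, 512, CV_8UC1, imgData.data()),
            cv::Mat(512, 512, CV_8UC1, imgData.data() + 512 * 512),
            cv::Mat(512, 512, CV_8UC1, imgData.data() + 2 * 512 * 512)};
        cv::split(bgr, planes);
        return imgData;
    }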

    Thanks for your reply. I already tried removing this part from the loop, and the Python code still runs at a higher FPS. One thing that might be interesting is that when I use get() instead of tryGet(), the C++ code becomes faster: 19.5 FPS compared to 18.25 with Python.

      MohamedAsker
      Do a tryGet() on both and time the host loops for both cases. I see no reason why the device side would behave any differently. get() and tryGet() have the same implementation in Python and C++, since the Python API is just bindings to the C++ calls.
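
      A minimal sketch of the kind of host-loop timing meant here (doHostWork is just a placeholder for the per-frame resize/split/send/tryGet work, not a DepthAI call):

      #include <chrono>
      #include <cstdint>
      #include <iostream>

      int main() {
          using namespace std::chrono;
          // Placeholder for the per-iteration host-side work
          // (resize, split, queue send, tryGet).
          auto doHostWork = [] { /* ... */ };

          int64_t totalUs = 0;
          const int reportEvery = 1000;
          for (int i = 1; i <= 20000; i++) {
              auto t0 = steady_clock::now();
              doHostWork();
              auto t1 = steady_clock::now();
              totalUs += duration_cast<microseconds>(t1 - t0).count();
              if (i % reportEvery == 0) {
                  // Average host time per iteration over the last block of frames.
                  std::cout << "avg host time: "
                            << totalUs / static_cast<double>(reportEvery) << " us/iter\n";
                  totalUs = 0;
              }
          }
          return 0;
      }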

      Thanks,
      Jaka