Hello,
I've been running a custom YOLOv6n model (512x512 input) on an OAK-D Wide camera, feeding the NN a static image loaded from disk to get detections. I used both C++ and Python with almost exactly the same code, and the C++ version runs at 38-39 FPS while the Python version runs at 40-41 FPS. I expected the C++ code to be at least slightly faster and can't figure out why it is the other way around.
Here is the Python code:
import cv2
import depthai as dai
import numpy as np
import time
import cProfile
import pstats
label_map = ["1", "2", "3", "4", "5", "6", "7", "8"]
nn_path = "./configuration/yolov6n_classification_real.blob"
pipeline = dai.Pipeline()
xin_frame = pipeline.create(dai.node.XLinkIn)
nn = pipeline.create(dai.node.YoloDetectionNetwork)
nn_out = pipeline.create(dai.node.XLinkOut)
xin_frame.setStreamName("input")
nn_out.setStreamName("nn")
nn.setBlobPath(nn_path)
nn.setNumClasses(8)
nn.setCoordinateSize(4)
nn.setIouThreshold(0.5)
nn.input.setQueueSize(2)
nn.setNumInferenceThreads(2)
nn.input.setBlocking(False)
xin_frame.out.link(nn.input)
nn.out.link(nn_out.input)
device = dai.Device(pipeline)
q_input = device.getInputQueue("input")
q_det = device.getOutputQueue("nn", 4, False)
image_path = "./ee.jpg"
frame = cv2.imread(image_path)
if frame is None:
    print(f"Failed to load image: {image_path}")
    exit(-1)
profiler = cProfile.Profile()
profiler.enable()
start_time = time.time()
counter = 0
for i in range(20000):
    resized_frame = cv2.resize(frame, (512, 512))
    img_data = resized_frame.transpose(2, 0, 1).flatten()
    img_frame = dai.ImgFrame()
    img_frame.setData(img_data)
    img_frame.setWidth(512)
    img_frame.setHeight(512)
    img_frame.setType(dai.ImgFrame.Type.BGR888p)
    q_input.send(img_frame)
    in_det = q_det.tryGet()
    if time.time() - start_time > 10:
        fps = counter / (time.time() - start_time)
        print(f"FPS: {fps:.2f}")
        counter = 0
        start_time = time.time()
    if in_det:
        counter += 1
        detections = in_det.detections
        for detection in detections:
            x1 = int(detection.xmin * resized_frame.shape[1])
            y1 = int(detection.ymin * resized_frame.shape[0])
            x2 = int(detection.xmax * resized_frame.shape[1])
            y2 = int(detection.ymax * resized_frame.shape[0])
            confidence = detection.confidence * 100
            if confidence >= 50:
                label_str = label_map[detection.label] if detection.label < len(label_map) else str(detection.label)
                cv2.rectangle(resized_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(resized_frame, label_str, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
profiler.disable()
stats = pstats.Stats(profiler)
stats.sort_stats(pstats.SortKey.TIME)
stats.print_stats(20)
cv2.destroyAllWindows()
And here is the C++ code:
#include <chrono>
#include <cstdio>
#include <iostream>
#include <filesystem>
#include <opencv2/opencv.hpp>
#include "depthai/depthai.hpp"
using namespace std;
using namespace std::chrono;
static const std::vector<std::string> labelMap = { "1", "2", "3", "4", "5", "6", "7", "8"};
int main(int argc, char** argv) {
    std::string nnPath = "./configuration/yolov6n_classification_real.blob";
    dai::Pipeline pipeline;
    auto xinFrame = pipeline.create<dai::node::XLinkIn>();
    xinFrame->setStreamName("input");
    auto nn = pipeline.create<dai::node::YoloDetectionNetwork>();
    auto nnOut = pipeline.create<dai::node::XLinkOut>();
    nnOut->setStreamName("nn");
    nn->setNumClasses(8);
    nn->setCoordinateSize(4);
    nn->setIouThreshold(0.5);
    nn->input.setQueueSize(2);
    nn->setNumInferenceThreads(2);
    nn->input.setBlocking(false);
    dai::OpenVINO::Blob blob(nnPath);
    nn->setBlob(blob);
    xinFrame->out.link(nn->input);
    nn->out.link(nnOut->input);
    dai::Device device(pipeline, dai::UsbSpeed::SUPER_PLUS);
    auto qInput = device.getInputQueue("input");
    auto qDet = device.getOutputQueue("nn", 4, false);
    std::string imagePath = "ee.jpg";
    cv::Mat frame = cv::imread(imagePath);
    if (frame.empty()) {
        std::cerr << "Failed to load image: " << imagePath << std::endl;
        return -1;
    }
    auto startTime = steady_clock::now();
    int counter = 0;
    float fps = 0;
    for (int i = 0; i < 20000; i++) {
        cv::Mat resizedFrame;
        cv::resize(frame, resizedFrame, cv::Size(512, 512));
        std::vector<uint8_t> imgData(512 * 512 * 3);
        std::vector<cv::Mat> channels(3);
        cv::split(resizedFrame, channels);
        std::memcpy(imgData.data(), channels[0].data, 512 * 512);
        std::memcpy(imgData.data() + 512 * 512, channels[1].data, 512 * 512);
        std::memcpy(imgData.data() + 2 * 512 * 512, channels[2].data, 512 * 512);
        auto imgFrame = std::make_shared<dai::ImgFrame>();
        imgFrame->setData(imgData);
        imgFrame->setWidth(512);
        imgFrame->setHeight(512);
        imgFrame->setType(dai::ImgFrame::Type::BGR888p);
        qInput->send(imgFrame);
        std::shared_ptr<dai::ImgDetections> inDet = qDet->tryGet<dai::ImgDetections>();
        auto currentTime = steady_clock::now();
        auto elapsed = duration_cast<duration<float>>(currentTime - startTime);
        if (elapsed > seconds(1)) {
            fps = counter / elapsed.count();
            std::cout << fps << std::endl;
            counter = 0;
            startTime = steady_clock::now();
        }
        if (inDet) {
            counter++;
            std::vector<dai::ImgDetection> detections = inDet->detections;
            for (auto& detection : detections) {
                int x1 = detection.xmin * resizedFrame.cols;
                int y1 = detection.ymin * resizedFrame.rows;
                int x2 = detection.xmax * resizedFrame.cols;
                int y2 = detection.ymax * resizedFrame.rows;
                int confidence = detection.confidence * 100;
                if (confidence >= 50) {
                    std::string labelStr = (detection.label < labelMap.size()) ? labelMap[detection.label] : std::to_string(detection.label);
                    cv::rectangle(resizedFrame, cv::Rect(cv::Point(x1, y1), cv::Point(x2, y2)), cv::Scalar(0, 255, 0), 2);
                    cv::putText(resizedFrame, labelStr, cv::Point(x1, y1 - 10), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 0), 1);
                }
            }
        }
    }
    return 0;
}
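One thing I'm wondering about: since the input image never changes, I assume the per-iteration resize / cv::split / memcpy work on the host could be hoisted out of the loop, leaving only the send and tryGet calls inside it. Below is a rough, untested sketch of that variant. It reuses the same pipeline, device, and queues (qInput, qDet) and the same frame as in the C++ code above, and assumes resending the same ImgFrame object every iteration is fine because its data never changes. Would that help isolate whether the host-side copies are the bottleneck?

// Untested variant: prepare the ImgFrame once, reuse it inside the benchmark loop.
// Assumes the same pipeline/device/queue setup (qInput, qDet, frame) as above.
cv::Mat resizedFrame;
cv::resize(frame, resizedFrame, cv::Size(512, 512));
std::vector<uint8_t> imgData(512 * 512 * 3);
std::vector<cv::Mat> channels(3);
cv::split(resizedFrame, channels);
std::memcpy(imgData.data(), channels[0].data, 512 * 512);
std::memcpy(imgData.data() + 512 * 512, channels[1].data, 512 * 512);
std::memcpy(imgData.data() + 2 * 512 * 512, channels[2].data, 512 * 512);
auto imgFrame = std::make_shared<dai::ImgFrame>();
imgFrame->setData(imgData);
imgFrame->setWidth(512);
imgFrame->setHeight(512);
imgFrame->setType(dai::ImgFrame::Type::BGR888p);
auto startTime = steady_clock::now();
int counter = 0;
for (int i = 0; i < 20000; i++) {
    qInput->send(imgFrame);                              // same frame every iteration
    auto inDet = qDet->tryGet<dai::ImgDetections>();
    if (inDet) counter++;
    auto elapsed = duration_cast<duration<float>>(steady_clock::now() - startTime);
    if (elapsed > seconds(1)) {
        std::cout << "FPS: " << counter / elapsed.count() << std::endl;
        counter = 0;
        startTime = steady_clock::now();
    }
}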
Can you please point out what could be causing this difference, and is there a way to optimize the C++ code to reduce latency?