MohamedAsker

  • 8 days ago
  • MohamedAsker I also tried not using an XLinkIn node; instead of using the build function, I set the NNArchive directly, like this:

    neuralNetwork = pipeline.create(ParsingNeuralNetwork)
    neuralNetwork.setNNArchive(modelArchive)       # set the archive directly instead of build()
    qRgb = neuralNetwork.input.createInputQueue()  # host -> device frames
    qDet = neuralNetwork.out.createOutputQueue()   # device -> host detections

    and I still get 53 FPS. Is this the expected behaviour, or is there something that could improve it? I also get this warning before inference starts:

    [3260625470] [192.168.178.22] [1742473775.004] [DetectionParser(3)] [warning] Did not get the input image sizes from the imageIn input. Defaulting to 416 x 416
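    One thing I may still be missing (an assumption on my part, not a confirmed fix): the generated pipeline in the reply below shows the DetectionParser's imageIn input unconnected, which matches the warning text, so setting the frame dimensions explicitly before sending might let the parser pick up the real input size:

    inputFrame = dai.ImgFrame()
    inputFrame.setCvFrame(testImg, frameType)
    # Assumption: the parser may read the size from the frame metadata;
    # these are the same setters used in my older script further down.
    inputFrame.setWidth(inputSize[0])
    inputFrame.setHeight(inputSize[1])
    qRgb.send(inputFrame)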

    • MohamedAsker Here is the generated pipeline, in case it helps:

      {"connections":[{"node1Id":7,"node1Output":"out","node1OutputGroup":"","node2Id":2,"node2Input":"in","node2InputGroup":""}

      ,{"node1Id":4,"node1Output":"out","node1OutputGroup":"","node2Id":8,"node2Input":"in","node2InputGroup":""}

      ,{"node1Id":2,"node1Output":"out","node1OutputGroup":"","node2Id":4,"node2Input":"in","node2InputGroup":""}

      ,{"node1Id":0,"node1Output":"out","node1OutputGroup":"","node2Id":2,"node2Input":"in","node2InputGroup":""}

      ],"globalProperties":{"calibData":null,"cameraTuningBlobSize":null,"cameraTuningBlobUri":"","leonCssFrequencyHz":700000000.0,"leonMssFrequencyHz":700000000.0,"pipelineName":null,"pipelineVersion":null,"sippBufferSize":18432,"sippDmaBufferSize":16384,"xlinkChunkSize":-1}

      ,"nodes":[[8,{"alias":"","id":8,"ioInfo":[[["","in"],{"blocking":true,"group":"","id":8,"name":"in","queueSize":3,"type":3,"waitForMessage":false}

      ]],"logLevel":3,"name":"XLinkOut","parentId":-1,"properties":[185,3,136,0,0,128,191,189,9,95,95,120,95,52,95,111,117,116,0]}

      ],[7,{"alias":"","id":7,"ioInfo":[[["","out"],{"blocking":false,"group":"","id":7,"name":"out","queueSize":8,"type":0,"waitForMessage":false}

      ]],"logLevel":3,"name":"XLinkIn","parentId":-1,"properties":[185,3,189,9,95,95,120,95,50,95,95,105,110,130,0,0,80,0,8]}

      ],[4,{"alias":"","id":4,"ioInfo":[[["","out"],{"blocking":false,"group":"","id":6,"name":"out","queueSize":8,"type":0,"waitForMessage":false}

      ],[["","imageIn"],{"blocking":true,"group":"","id":5,"name":"imageIn","queueSize":5,"type":3,"waitForMessage":true}

      ],[["","in"],{"blocking":true,"group":"","id":4,"name":"in","queueSize":5,"type":3,"waitForMessage":true}

      ]],"logLevel":3,"name":"DetectionParser","parentId":-1,"properties":[185,3,8,187,0,185,8,0,136,0,0,0,63,8,4,186,0,187,0,186,0,136,0,0,0,63]}

      ],[2,{"alias":"","id":2,"ioInfo":[[["","passthrough"],{"blocking":false,"group":"","id":3,"name":"passthrough","queueSize":8,"type":0,"waitForMessage":false}

      ],[["","out"],{"blocking":false,"group":"","id":2,"name":"out","queueSize":8,"type":0,"waitForMessage":false}

      ],[["","in"],{"blocking":true,"group":"","id":1,"name":"in","queueSize":3,"type":3,"waitForMessage":true}

      ]],"logLevel":3,"name":"NeuralNetwork","parentId":-1,"properties":[185,10,1,190,189,0,189,13,97,115,115,101,116,58,95,95,109,111,100,101,108,8,0,0,0,189,0,187,0]}

      ],[0,{"alias":"","id":0,"ioInfo":[[["","out"],{"blocking":false,"group":"","id":0,"name":"out","queueSize":8,"type":0,"waitForMessage":false}

      ]],"logLevel":3,"name":"XLinkIn","parentId":-1,"properties":[185,3,189,0,130,0,0,80,0,8]}

      ]]}

      • Thanks for your reply. I now have a script that runs, but I had to create an XLinkIn node and connect it to the neural network to be able to build it. The problem is that I am running a YOLOv6n 640 model and it only reaches 53 FPS, while it ran at more than 200 FPS during benchmarking. What could cause such a large drop?

        Here is the script that I have now:

        #!/usr/bin/env python3

        import random
        import time

        import cv2
        import depthai as dai
        import numpy as np
        from depthai_nodes import ParsingNeuralNetwork

        device = dai.Device()
        modelPath = "./.depthai_cached_models/bk_gh_od_02_12_24_640_best_objects.rvc4.tar.xz"
        modelArchive = dai.NNArchive(modelPath)
        inputSize = modelArchive.getInputSize()

        type = modelArchive.getConfig().model.inputs[0].preprocessing.daiType
        if type:
            try:
                frameType = dai.ImgFrame.Type.BGR888i
            except AttributeError:
                type = None

        testImg = cv2.imread("/home/asker/oak4/depthai-core/test.jpg")
        testImg2 = cv2.imread("/home/asker/oak4/depthai-core/test2.jpg")
        testImg = cv2.resize(testImg, (inputSize[0], inputSize[1]))
        testImg2 = cv2.resize(testImg2, (inputSize[0], inputSize[1]))

        with dai.Pipeline(device) as pipeline:
            xLinkIn = pipeline.create(dai.node.XLinkIn)
            neuralNetwork = pipeline.create(ParsingNeuralNetwork).build(
                xLinkIn.out,
                modelArchive,
            )
            qRgb = neuralNetwork.input.createInputQueue(blocking=False, maxSize=8)
            qDet = neuralNetwork.out.createOutputQueue()
            pipeline.start()

            frame = None
            detections = []
            startTime = time.time()
            counter = 0
            color2 = (255, 255, 255)

            inputFrame = dai.ImgFrame()
            if random.choice([True, False]):
                inputFrame.setCvFrame(testImg, frameType)
            else:
                inputFrame.setCvFrame(testImg2, frameType)

            while pipeline.isRunning():
                qRgb.send(inputFrame)
                inDet: dai.ImgDetections = qDet.get()
                if inDet is not None:
                    detections = inDet.detections
                    counter += 1
                if time.time() - startTime > 1:
                    print("FPS: {:.2f}".format(counter / (time.time() - startTime)))
                    counter = 0
                    startTime = time.time()
                if cv2.waitKey(1) == ord("q"):
                    pipeline.stop()
                    break
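        One variant I am considering for the FPS question (a sketch, under the assumption that the blocking qDet.get() after each send is what serializes the loop; not verified): feed frames from a separate thread so inference never waits on a host round trip, using the same qRgb/qDet queues as above:

        import threading

        def feeder():
            # Keep the device input queue topped up; the non-blocking queue
            # created above (maxSize=8) will drop excess frames.
            while pipeline.isRunning():
                qRgb.send(inputFrame)

        threading.Thread(target=feeder, daemon=True).start()

        while pipeline.isRunning():
            inDet = qDet.get()  # only the result path stays in the main loop
            if inDet is not None:
                detections = inDet.detections
                counter += 1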

        • Hello, I am trying to run inference on an OAK 4 S with a custom YOLO model, but feeding images that are loaded locally. In the code I am providing, I just alternate randomly between two frames, but ideally this would be a stream of images. I could run the pipeline using the benchmark node, but it seems to pick up only the first image and ignore the following ones, despite my feeding them to the input queue. I also tried using XLinkIn and XLinkOut, but that does not work for me, and I am lacking documentation for these two nodes in the new version of the API. Here is the code that I created.

          import random
          import time

          import cv2
          import depthai as dai
          import numpy as np

          device = dai.Device()
          modelPath = "/home/asker/oak4/depthai-core/examples/python/.depthai_cached_models/bk_gh_od_02_12_24_640_best_objects.rvc4.tar.xz"
          modelArchive = dai.NNArchive(modelPath)
          inputSize = modelArchive.getInputSize()

          type = modelArchive.getConfig().model.inputs[0].preprocessing.daiType
          if type:
              try:
                  frameType = dai.ImgFrame.Type.BGR888i
              except AttributeError:
                  type = None

          testImg = cv2.imread("/home/asker/oak4/depthai-core/test.jpg")
          print(inputSize)
          testImg = cv2.resize(testImg, (inputSize[0], inputSize[1]))
          # Construct the input (white) image for benchmarking
          img = np.ones((inputSize[1], inputSize[0], 3), np.uint8) * 255
          inputFrame = dai.ImgFrame()
          inputFrame.setCvFrame(testImg, frameType)

          with dai.Pipeline(device) as p:
              benchmarkOut = p.create(dai.node.BenchmarkOut)
              benchmarkOut.setRunOnHost(False)  # The node can run on host or on device
              benchmarkOut.setFps(-1)  # As fast as possible

              neuralNetwork = p.create(dai.node.DetectionNetwork).build(
                  benchmarkOut.out, modelArchive)
              # labelMap = neuralNetwork.getClasses()

              benchmarkIn = p.create(dai.node.BenchmarkIn)
              benchmarkIn.setRunOnHost(False)  # The node can run on host or on device
              benchmarkIn.sendReportEveryNMessages(100)
              benchmarkIn.logReportsAsWarnings(False)
              neuralNetwork.out.link(benchmarkIn.input)

              outputQueue = benchmarkIn.report.createOutputQueue()
              inputQueue = benchmarkOut.input.createInputQueue()
              qRgb = neuralNetwork.passthrough.createOutputQueue()
              qDet = neuralNetwork.out.createOutputQueue()
              p.start()

              frame = None
              detections = []
              startTime = time.monotonic()
              counter = 0
              color2 = (255, 255, 255)

              # nn data (bounding box locations) are in the <0..1> range -
              # they need to be normalized with the frame width/height
              def frameNorm(frame, bbox):
                  normVals = np.full(len(bbox), frame.shape[0])
                  normVals[::2] = frame.shape[1]
                  return (np.clip(np.array(bbox), 0, 1) * normVals).astype(int)

              def displayFrame(name, frame):
                  color = (255, 0, 0)
                  for detection in detections:
                      bbox = frameNorm(
                          frame,
                          (detection.xmin, detection.ymin, detection.xmax, detection.ymax),
                      )
                      print(f"{bbox=}")
                      cv2.putText(frame, "class", (bbox[0] + 10, bbox[1] + 20),
                                  cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
                      cv2.putText(frame, f"{int(detection.confidence * 100)}%",
                                  (bbox[0] + 10, bbox[1] + 40),
                                  cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
                      cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, 2)
                  # Show the frame
                  cv2.imshow(name, frame)

              while p.isRunning():
                  if random.choice([True, False]):
                      inputFrame.setCvFrame(testImg, frameType)
                  else:
                      inputFrame.setCvFrame(img, frameType)
                  inputQueue.send(inputFrame)  # Send the input image only once
                  # benchmarkReport = outputQueue.get()
                  # time.sleep(0.01)
                  inRgb: dai.ImgFrame = qRgb.get()
                  inDet: dai.ImgDetections = qDet.get()
                  if inRgb is not None:
                      frame = inRgb.getCvFrame()
                      cv2.putText(frame,
                                  "NN fps: {:.2f}".format(counter / (time.monotonic() - startTime)),
                                  (2, frame.shape[0] - 4),
                                  cv2.FONT_HERSHEY_TRIPLEX, 0.4, color2)
                  if inDet is not None:
                      detections = inDet.detections
                      print(inDet.detections)
                      counter += 1
                  if frame is not None:
                      displayFrame("rgb", frame)
                      print("FPS: {:.2f}".format(counter / (time.monotonic() - startTime)))
                      if cv2.waitKey(1) == ord("q"):
                          p.stop()
                          break
                  # assert isinstance(benchmarkReport, dai.BenchmarkReport)
                  # print(f"FPS is {benchmarkReport.fps}")

          The question is: what is the best way to feed images to a neural network using the new pipeline? Should I use XLink, or is there a new node that can handle this? If it is XLink, can you provide some guidance on how to use it and how to define the pipeline (queues, links, etc.)?
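          For reference, here is a minimal sketch of what I mean by feeding images directly; it assumes frames can be sent straight to an input queue created on the network's input (as in the newer script above in this thread), with the BenchmarkOut link removed, since BenchmarkOut seems to keep replaying the first message it receives:

          # Hypothetical variant: create the host-side queue on the network input itself.
          inputQueue = neuralNetwork.input.createInputQueue()
          qDet = neuralNetwork.out.createOutputQueue()
          p.start()

          while p.isRunning():
              inputFrame = dai.ImgFrame()
              inputFrame.setCvFrame(testImg, frameType)
              inputQueue.send(inputFrame)  # a fresh message every iteration
              inDet = qDet.get()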

          Thanks in advance.

          • Hello, I am running some preliminary experiments with an OAK 4 S camera, using a custom YOLOv6n model with an input size of 640 x 640. I am feeding a static image, so in this case I am not using the camera feed as input to the neural network. Everything seems to run at roughly 88 FPS, but I get this warning at the beginning of each run:
            [3260625470] [192.168.178.22] [1742226352.711] [DetectionParser(4)] [warning] Did not get the input image sizes from the imageIn input. Defaulting to 416 x 416

            I am not sure where I should set this, or whether it affects the inference or not.

            Here is the code I used:

            import depthai as dai

            import numpy as np

            import time

            import cv2

            from depthai_nodes import ParsingNeuralNetwork

            device = dai.Device()
            modelPath = "./depthai-core/examples/python/.depthai_cached_models/bk_gh_od_02_12_24_640_best_objects.rvc4.tar.xz"

            modelArchive = dai.NNArchive(modelPath)
            inputSize = modelArchive.getInputSize()

            type = modelArchive.getConfig().model.inputs[0].preprocessing.daiType
            if type:
                try:
                    frameType = dai.ImgFrame.Type.BGR888i
                except AttributeError:
                    type = None

            testImg = cv2.imread("/home/asker/oak4/depthai-core/test.jpg")
            print(inputSize)
            testImg = cv2.resize(testImg, (inputSize[0], inputSize[1]))

            inputFrame = dai.ImgFrame()

            inputFrame.setWidth(inputSize[0])

            inputFrame.setHeight(inputSize[1])

            inputFrame.setCvFrame(testImg, frameType)

            print(inputFrame)

            with dai.Pipeline(device) as p:
                benchmarkOut = p.create(dai.node.BenchmarkOut)
                benchmarkOut.setRunOnHost(False)
                benchmarkOut.setFps(-1)

                neuralNetwork = p.create(ParsingNeuralNetwork).build(
                    benchmarkOut.out,
                    modelArchive,
                )
                parser_output_queue = neuralNetwork.out.createOutputQueue()

                benchmarkIn = p.create(dai.node.BenchmarkIn)
                benchmarkIn.setRunOnHost(False)
                benchmarkIn.sendReportEveryNMessages(100)
                benchmarkIn.logReportsAsWarnings(False)
                neuralNetwork.out.link(benchmarkIn.input)

                outputQueue = benchmarkIn.report.createOutputQueue()
                inputQueue = benchmarkOut.input.createInputQueue()
                qRgb = neuralNetwork.passthrough.createOutputQueue()
                qDet = neuralNetwork.out.createOutputQueue()
                p.start()

                frame = None
                detections = []
                startTime = time.time()
                counter = 0
                color2 = (255, 255, 255)

                def frameNorm(frame, bbox):
                    normVals = np.full(len(bbox), frame.shape[0])
                    normVals[::2] = frame.shape[1]
                    return (np.clip(np.array(bbox), 0, 1) * normVals).astype(int)

                def displayFrame(name, frame):
                    color = (255, 0, 0)
                    for detection in detections:
                        bbox = frameNorm(
                            frame,
                            (detection.xmin, detection.ymin, detection.xmax, detection.ymax),
                        )
                        # print(f"{bbox=}")
                        cv2.putText(frame, "class", (bbox[0] + 10, bbox[1] + 20),
                                    cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
                        cv2.putText(frame, f"{int(detection.confidence * 100)}%",
                                    (bbox[0] + 10, bbox[1] + 40),
                                    cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
                        cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, 2)
                    cv2.imshow(name, frame)

                while p.isRunning():
                    inputFrame = dai.ImgFrame()
                    inputFrame.setWidth(inputSize[0])
                    inputFrame.setHeight(inputSize[1])
                    inputFrame.setCvFrame(testImg, frameType)
                    inputQueue.send(inputFrame)
                    # benchmarkReport = outputQueue.get()
                    # time.sleep(0.01)
                    # print(f"{benchmarkReport=}")
                    inRgb: dai.ImgFrame = qRgb.get()
                    inDet: dai.ImgDetections = qDet.get()
                    if inRgb is not None:
                        frame = inRgb.getCvFrame()
                        cv2.putText(frame,
                                    "NN fps: {:.2f}".format(counter / (time.time() - startTime)),
                                    (2, frame.shape[0] - 4),
                                    cv2.FONT_HERSHEY_TRIPLEX, 0.4, color2)
                    if inDet is not None:
                        detections = inDet.detections
                        counter += 1
                    if frame is not None:
                        displayFrame("rgb", frame)
                    if time.time() - startTime > 10:
                        print("FPS: {:.2f}".format(counter / (time.time() - startTime)))
                        counter = 0
                        startTime = time.time()
                    if cv2.waitKey(1) == ord("q"):
                        p.stop()
                        break
                    # assert isinstance(benchmarkReport, dai.BenchmarkReport)
                    # print(f"FPS is {benchmarkReport.fps}")

            • Thanks for your reply. I already tried removing this part from the loop, and the Python code still runs at a higher FPS. One thing that might be interesting: when I use get() instead of tryGet(), the C++ code becomes faster, at 19.5 FPS compared to 18.25 with Python.
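              For context, the difference between the two calls as I understand it (standard queue semantics, shown for illustration only):

              in_det = q_det.get()     # blocking: waits until the next ImgDetections arrives
              in_det = q_det.tryGet()  # non-blocking: returns None if nothing is ready yet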

              • Hello,

                I've been running a YOLOv6n model (512*512) on an OAK-D Wide camera, using a static image loaded from disk as the feed to the NN to get detections with a custom YOLO model. I used both C++ and Python with almost exactly the same code, and I can see that the C++ code runs at 38-39 FPS while the Python code runs at 40-41 FPS. I was expecting the C++ code to be at least slightly faster, and I could not figure out why it is the other way around.
                Here is the Python code:
                import cv2

                import depthai as dai

                import numpy as np

                import time

                import cProfile

                import pstats
                label_map = ["1", "2", "3", "4", "5", "6", "7", "8"]
                nn_path = "./configuration/yolov6n_classification_real.blob"
                pipeline = dai.Pipeline()

                xin_frame = pipeline.create(dai.node.XLinkIn)

                nn = pipeline.create(dai.node.YoloDetectionNetwork)

                nn_out = pipeline.create(dai.node.XLinkOut)
                xin_frame.setStreamName("input")

                nn_out.setStreamName("nn")
                nn.setBlobPath(nn_path)

                nn.setNumClasses(8)

                nn.setCoordinateSize(4)

                nn.setIouThreshold(0.5)

                nn.input.setQueueSize(2)

                nn.setNumInferenceThreads(2)

                nn.input.setBlocking(False)
                xin_frame.out.link(nn.input)

                nn.out.link(nn_out.input)
                device = dai.Device(pipeline)

                q_input = device.getInputQueue("input")

                q_det = device.getOutputQueue("nn", 4, False)
                image_path = "./ee.jpg"

                frame = cv2.imread(image_path)
                if frame is None:
                    print(f"Failed to load image: {image_path}")
                    exit(-1)
                profiler = cProfile.Profile()

                profiler.enable()
                start_time = time.time()

                counter = 0

                for i in range(20000):
                    resized_frame = cv2.resize(frame, (512, 512))
                    img_data = resized_frame.transpose(2, 0, 1).flatten()
                    img_frame = dai.ImgFrame()
                    img_frame.setData(img_data)
                    img_frame.setWidth(512)
                    img_frame.setHeight(512)
                    img_frame.setType(dai.ImgFrame.Type.BGR888p)
                    q_input.send(img_frame)
                    in_det = q_det.tryGet()

                    if time.time() - start_time > 10:
                        fps = counter / (time.time() - start_time)
                        print(f"FPS: {fps:.2f}")
                        counter = 0
                        start_time = time.time()

                    if in_det:
                        counter += 1
                        detections = in_det.detections
                        for detection in detections:
                            x1 = int(detection.xmin * resized_frame.shape[1])
                            y1 = int(detection.ymin * resized_frame.shape[0])
                            x2 = int(detection.xmax * resized_frame.shape[1])
                            y2 = int(detection.ymax * resized_frame.shape[0])
                            confidence = detection.confidence * 100
                            if confidence >= 50:
                                label_str = (label_map[detection.label]
                                             if detection.label < len(label_map)
                                             else str(detection.label))
                                cv2.rectangle(resized_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                                cv2.putText(resized_frame, label_str, (x1, y1 - 10),
                                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

                profiler.disable()
                stats = pstats.Stats(profiler)
                stats.sort_stats(pstats.SortKey.TIME)
                stats.print_stats(20)

                cv2.destroyAllWindows()

                and here is the C++ code:

                #include <chrono>

                #include <cstdio>

                #include <iostream>

                #include <filesystem>

                #include <opencv2/opencv.hpp>

                #include "depthai/depthai.hpp"

                using namespace std;

                using namespace std::chrono;
                static const std::vector<std::string> labelMap = { "1", "2", "3", "4", "5", "6", "7", "8"};

                int main(int argc, char** argv) {
                    std::string nnPath = "./configuration/yolov6n_classification_real.blob";

                    dai::Pipeline pipeline;
                    auto xinFrame = pipeline.create<dai::node::XLinkIn>();
                    xinFrame->setStreamName("input");
                    auto nn = pipeline.create<dai::node::YoloDetectionNetwork>();
                    auto nnOut = pipeline.create<dai::node::XLinkOut>();
                    nnOut->setStreamName("nn");

                    nn->setNumClasses(8);
                    nn->setCoordinateSize(4);
                    nn->setIouThreshold(0.5);
                    nn->input.setQueueSize(2);
                    nn->setNumInferenceThreads(2);
                    nn->input.setBlocking(false);

                    dai::OpenVINO::Blob blob(nnPath);
                    nn->setBlob(blob);

                    xinFrame->out.link(nn->input);
                    nn->out.link(nnOut->input);

                    dai::Device device(pipeline, dai::UsbSpeed::SUPER_PLUS);
                    auto qInput = device.getInputQueue("input");
                    auto qDet = device.getOutputQueue("nn", 4, false);

                    std::string imagePath = "ee.jpg";
                    cv::Mat frame = cv::imread(imagePath);
                    if(frame.empty()) {
                        std::cerr << "Failed to load image: " << imagePath << std::endl;
                        return -1;
                    }

                    auto startTime = steady_clock::now();
                    int counter = 0;
                    float fps = 0;

                    for(int i = 0; i < 20000; i++) {
                        cv::Mat resizedFrame;
                        cv::resize(frame, resizedFrame, cv::Size(512, 512));

                        std::vector<uint8_t> imgData(512 * 512 * 3);
                        std::vector<cv::Mat> channels(3);
                        cv::split(resizedFrame, channels);
                        std::memcpy(imgData.data(), channels[0].data, 512 * 512);
                        std::memcpy(imgData.data() + 512 * 512, channels[1].data, 512 * 512);
                        std::memcpy(imgData.data() + 2 * 512 * 512, channels[2].data, 512 * 512);

                        auto imgFrame = std::make_shared<dai::ImgFrame>();
                        imgFrame->setData(imgData);
                        imgFrame->setWidth(512);
                        imgFrame->setHeight(512);
                        imgFrame->setType(dai::ImgFrame::Type::BGR888p);
                        qInput->send(imgFrame);

                        std::shared_ptr<dai::ImgDetections> inDet = qDet->tryGet<dai::ImgDetections>();

                        auto currentTime = steady_clock::now();
                        auto elapsed = duration_cast<duration<float>>(currentTime - startTime);
                        if(elapsed > seconds(1)) {
                            fps = counter / elapsed.count();
                            std::cout << fps << std::endl;
                            counter = 0;
                            startTime = steady_clock::now();
                        }

                        if(inDet) {
                            counter++;
                            std::vector<dai::ImgDetection> detections = inDet->detections;
                            for(auto& detection : detections) {
                                int x1 = detection.xmin * resizedFrame.cols;
                                int y1 = detection.ymin * resizedFrame.rows;
                                int x2 = detection.xmax * resizedFrame.cols;
                                int y2 = detection.ymax * resizedFrame.rows;
                                int confidence = detection.confidence * 100;
                                if(confidence >= 50) {
                                    std::string labelStr = (detection.label < labelMap.size())
                                                               ? labelMap[detection.label]
                                                               : std::to_string(detection.label);
                                    cv::rectangle(resizedFrame, cv::Rect(cv::Point(x1, y1), cv::Point(x2, y2)), cv::Scalar(0, 255, 0), 2);
                                    cv::putText(resizedFrame, labelStr, cv::Point(x1, y1 - 10), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 0), 1);
                                }
                            }
                        }
                    }

                    return 0;
                }

                Can you please point out what could be the reason for this difference, and is there a way to optimize the C++ code to reduce latency?
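                For what it's worth, one optimization that applies to both versions (a sketch, assuming the per-frame host work matters here at all): the input image never changes, so the resize and planar repacking can be hoisted out of the loop. Shown on the Python side; the same idea applies to the cv::split/memcpy block in the C++ version:

                # Prepare the frame once; nothing about it changes inside the loop.
                resized_frame = cv2.resize(frame, (512, 512))
                img_frame = dai.ImgFrame()
                img_frame.setData(resized_frame.transpose(2, 0, 1).flatten())
                img_frame.setWidth(512)
                img_frame.setHeight(512)
                img_frame.setType(dai.ImgFrame.Type.BGR888p)

                for i in range(20000):
                    q_input.send(img_frame)  # only send/receive remain per iteration
                    in_det = q_det.tryGet()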