Hi @AleksNet ,
Yes, you'd use --ip FP16. You can use the NeuralNetwork node's passthrough output to receive the exact frame on which inference ran.
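
Roughly like this (a minimal sketch; the blob path and preview size are assumptions based on your setup):

import cv2
import depthai as dai

pipeline = dai.Pipeline()

camRgb = pipeline.create(dai.node.ColorCamera)
camRgb.setPreviewSize(640, 640)
camRgb.setInterleaved(False)

nn = pipeline.create(dai.node.NeuralNetwork)
nn.setBlobPath("model_fp16_full.blob")  # assumption: your blob path
camRgb.preview.link(nn.input)

# passthrough re-emits the exact frame that was fed to the NN
xoutPass = pipeline.create(dai.node.XLinkOut)
xoutPass.setStreamName("nn_passthrough")
nn.passthrough.link(xoutPass.input)

xoutNn = pipeline.create(dai.node.XLinkOut)
xoutNn.setStreamName("nn")
nn.out.link(xoutNn.input)

with dai.Device(pipeline) as device:
    qPass = device.getOutputQueue("nn_passthrough", maxSize=4, blocking=False)
    qNn = device.getOutputQueue("nn", maxSize=4, blocking=False)
    while True:
        frame = qPass.get().getCvFrame()  # the frame inference actually ran on
        inNn = qNn.tryGet()               # matching NN results (may lag by a frame)
        cv2.imshow("passthrough", frame)
        if cv2.waitKey(1) == ord('q'):
            break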

I have tried inRgb.getFrame() and fed it to the ONNX model, but it outputs nothing…

Can you share an MRE (minimal reproducible example) of this?
Thanks, Erik

    Hi erik,
    Sorry for the delay.
    Can you please provide an example of how to get the frame from the passthrough?
    Here is the MRE: the Python script, the blob and ONNX models, and an image of the cubes (which the model should recognize).

    #!/usr/bin/env python3
    from pathlib import Path
    import sys
    import cv2
    import depthai as dai
    import numpy as np
    import time
    import tensorflow as tf
    import keras_cv
    import keras
    import onnxruntime
    nnPath = str((Path('./models/YOLO KERAS/model_fp16_full.blob')).resolve().absolute())
    nnPath_onnx = str((Path('./models/YOLO KERAS/model_fp16_full.onnx')).resolve().absolute())
    session= onnxruntime.InferenceSession(nnPath_onnx)
    input_name=session.get_inputs()[0].name
    output_name0=session.get_outputs()[0].name
    output_name1=session.get_outputs()[1].name
    image_path = "all.jpg"
    BOX_REGRESSION_CHANNELS=64
    def decode_regression_to_boxes(preds):
        """Decodes the results of the YOLOV8Detector forward-pass into boxes.
        Returns left / top / right / bottom predictions with respect to anchor
        points.
        Each coordinate is encoded with 16 predicted values. Those predictions are
        softmaxed and multiplied by [0..15] to make predictions. The resulting
        predictions are relative to the stride of an anchor box (and correspondingly
        relative to the scale of the feature map from which the predictions came).
        """
        preds_bbox = keras.layers.Reshape((-1, 4, BOX_REGRESSION_CHANNELS // 4))(
            preds
        )
        preds_bbox = tf.nn.softmax(preds_bbox, axis=-1) * tf.range(
            BOX_REGRESSION_CHANNELS // 4, dtype="float32"
        )
        return tf.reduce_sum(preds_bbox, axis=-1)
    def dist2bbox(distance, anchor_points):
        """Decodes distance predictions into xyxy boxes.
        Input left / top / right / bottom predictions are transformed into xyxy box
        predictions based on anchor points.
        The resulting xyxy predictions must be scaled by the stride of their
        corresponding anchor points to yield an absolute xyxy box.
        """
        left_top, right_bottom = tf.split(distance, 2, axis=-1)
        x1y1 = anchor_points - left_top
        x2y2 = anchor_points + right_bottom
        return tf.concat((x1y1, x2y2), axis=-1)  # xyxy bbox
    def get_anchors(
        image_shape,
        strides=[8, 16, 32],
        base_anchors=[0.5, 0.5],
    ):
        """Gets anchor points for YOLOV8.
        YOLOV8 uses anchor points representing the center of proposed boxes, and
        matches ground truth boxes to anchors based on center points.
        Args:
            image_shape: tuple or list of two integers representing the height and
                width of input images, respectively.
            strides: tuple of list of integers, the size of the strides across the
                image size that should be used to create anchors.
            base_anchors: tuple or list of two integers representing the offset from
                (0,0) to start creating the center of anchor boxes, relative to the
                stride. For example, using the default (0.5, 0.5) creates the first
                anchor box for each stride such that its center is half of a stride
                from the edge of the image.
        Returns:
            A tuple of anchor centerpoints and anchor strides. Multiplying the
            two together will yield the centerpoints in absolute x,y format.
        """
        base_anchors = tf.constant(base_anchors, dtype="float32")
        all_anchors = []
        all_strides = []
        for stride in strides:
            hh_centers = tf.range(0, image_shape[0], stride)
            ww_centers = tf.range(0, image_shape[1], stride)
            ww_grid, hh_grid = tf.meshgrid(ww_centers, hh_centers)
            grid = tf.cast(
                tf.reshape(tf.stack([hh_grid, ww_grid], 2), [-1, 1, 2]),
                "float32",
            )
            anchors = (
                tf.expand_dims(
                    base_anchors * tf.constant([stride, stride], "float32"), 0
                )
                + grid
            )
            anchors = tf.reshape(anchors, [-1, 2])
            all_anchors.append(anchors)
            all_strides.append(tf.repeat(stride, anchors.shape[0]))
        all_anchors = tf.cast(tf.concat(all_anchors, axis=0), "float32")
        all_strides = tf.cast(tf.concat(all_strides, axis=0), "float32")
        all_anchors = all_anchors / all_strides[:, None]
        # Swap the x and y coordinates of the anchors.
        all_anchors = tf.concat(
            [all_anchors[:, 1, None], all_anchors[:, 0, None]], axis=-1
        )
        return all_anchors, all_strides
    def decode_predictions(
            boxes_,
            scores_,
            images,
        ):
            boxes = boxes_
            scores = scores_
            boxes = decode_regression_to_boxes(boxes)
            anchor_points, stride_tensor = get_anchors(image_shape=(640,640,3))
            stride_tensor = tf.expand_dims(stride_tensor, axis=-1)
            box_preds = dist2bbox(boxes, anchor_points) * stride_tensor
            prediction_decoder = keras_cv.layers.MultiClassNonMaxSuppression(
                        bounding_box_format="xyxy",
                        from_logits=False,
                        iou_threshold = 0.5, confidence_threshold = 0.5
                    )
          
            return prediction_decoder(box_preds, scores)
    # Get argument first
    labelMap = [
        "green",         "pink",    "orange"
    ]
    # Create pipeline
    pipeline = dai.Pipeline()
    camRgb = pipeline.create(dai.node.ColorCamera)
    camRgb.setPreviewSize(640, 640)
    camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
    camRgb.setInterleaved(False)
    camRgb.setColorOrder(dai.ColorCameraProperties.ColorOrder.BGR)
    camRgb.setFp16(True) # Model requires FP16 input
    # NN that detects faces in the image
    nn = pipeline.create(dai.node.NeuralNetwork)
    nn.setBlobPath(nnPath)
    nn.setNumInferenceThreads(2)
    camRgb.preview.link(nn.input)
    # Send bounding boxes from the NN to the host via XLink
    nn_xout = pipeline.create(dai.node.XLinkOut)
    nn_xout.setStreamName("nn")
    nn.out.link(nn_xout.input)
    # Send rgb frames to the host
    rgb_xout = pipeline.create(dai.node.XLinkOut)
    rgb_xout.setStreamName("rgb")
    nn.passthrough.link(rgb_xout.input)
    # Connect to device and start pipeline
    with dai.Device(pipeline) as device:
        # Output queues will be used to get the rgb frames and nn data from the outputs defined above
        qRgb = device.getOutputQueue(name="rgb", maxSize=4, blocking=False)
        qDet = device.getOutputQueue(name="nn", maxSize=4, blocking=False)
        detections = []
        while True:
            inRgb = qRgb.get()
            # Passthrough frame is FP16 planar (CHW); reinterpret, transpose to HWC and cast to uint8 for display
            frame = np.array(inRgb.getData()).view(np.float16).reshape((3,640,640)).transpose(1, 2, 0).astype(np.uint8).copy()
            in_nn = qDet.tryGet()
            if in_nn is not None:
                # [print(f"Layer name: {l.name}, Type: {l.dataType}, Dimensions: {l.dims}") for l in inDet.getAllLayers()]
                # Extract the output shape: (batch_size, channels, num_predictions)
                boxes = np.array(in_nn.getLayerFp16('box')).reshape(1, 8400, 64).astype(dtype=np.float32)
                classes = np.array(in_nn.getLayerFp16('class')).reshape(1, 8400, 3).astype(dtype=np.float32)
                detections = []
                # print(classes)
                result = decode_predictions(boxes, classes, np.expand_dims(np.array(frame), axis=0))
                # print(result)
                result_boxes = result["boxes"]
                num_of_dects = result["num_detections"]
                print("num_of_dects")
                print(num_of_dects)
                if result_boxes[0][0][0] != -1.0:
                    detection = {
                        "label": 1,
                        "confidence": 0.1,
                        "box": result_boxes[0][0]}
                    detections.append(detection)
                # Load and preprocess the image
                image = cv2.imread(image_path)
                image = cv2.resize(image, (640, 640))  # Resize to model's input size
                image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
                image = inRgb.getCvFrame()
                print(image.dtype)
                image = np.expand_dims(image, axis=0)
                image = np.reshape(image, (1,3,640,640))
                res = session.run(
                    output_names=[output_name0, output_name1],
                    input_feed={input_name: image}
                )
                result = decode_predictions(res[0], res[1], np.expand_dims(np.array(frame), axis=0))
                # print(result)
                result_boxes = result["boxes"]
                num_of_dects = result["num_detections"]
                print("num_of_dects onnx")
                print(num_of_dects)

            cv2.imshow("rgb", frame)
            if cv2.waitKey(1) == ord('q'):
                break

    - blob model

    - onnx model
    Converted with these params: --data_type=FP16 --mean_values=[0,0,0] --scale_values=[1,1,1] --layout=NHWC --input_shape=[1,3,640,640], compiled with --ip FP16, shaves: 6, version: 2022.1

    (image of the cubes attached)

    9 days later

    AleksNet
    The code looks OK at first glance. I assume you get the desired image when viewing frames from the inRgb queue. Do you get any output from the nn node at all? If so, it's probably the mean and scale values that are off.
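
    If it is the mean/scale, one quick host-side check is to run your ONNX session with a few different normalizations and see which one produces detections (a rough sketch; planar is assumed to be the FP16 passthrough data reshaped to (1, 3, 640, 640) with values in 0..255):

    import numpy as np

    for mean, scale in [([0, 0, 0], 1.0), ([0, 0, 0], 255.0), ([127.5, 127.5, 127.5], 127.5)]:
        inp = ((planar - np.array(mean).reshape(1, 3, 1, 1)) / scale).astype(np.float16)
        out = session.run([output_name0, output_name1], {input_name: inp})
        print(mean, scale, "max class score:", float(np.max(out[1])))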

    Thanks,
    Jaka

      Hi jakaskerl!

      I get outputs from the nn, but after postprocessing I get 0 results, which is the normal behavior when the image preprocessing is incorrect. As I understand it, the best way to have a look at the raw input to the nn is inRgb.getCvFrame() and then print it out?
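
      For example (a minimal sketch, assuming the FP16 planar passthrough from the script above):

      inRgb = qRgb.get()                               # nn.passthrough frame
      raw = np.array(inRgb.getData()).view(np.float16)
      print(raw.shape, raw.min(), raw.max())           # expect 640*640*3 values in the 0..255 range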

      Hi jakaskerl!
      Is there a way to pass through an image (stored on my PC) rather than the video input from the camera, so I would have static data to test on?
      Happy New Year and best regards,
      Aleks

        Hi AleksNet
        Yes, you can use the XLinkIn node to send frames to the device and process them:

        1. Create an ImgFrame on the host
        2. Set its width, height and type
        3. Send it via XLinkIn (see the sketch below)
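
        Roughly like this (a minimal sketch; the blob path, the 640x640 size and the BGR888p type are assumptions — for an FP16-input blob you would send float16 bytes instead):

        import cv2
        import depthai as dai

        pipeline = dai.Pipeline()

        nn = pipeline.create(dai.node.NeuralNetwork)
        nn.setBlobPath("model_fp16_full.blob")   # assumption: your blob

        xin = pipeline.create(dai.node.XLinkIn)
        xin.setStreamName("frameIn")
        xin.out.link(nn.input)

        xoutNn = pipeline.create(dai.node.XLinkOut)
        xoutNn.setStreamName("nn")
        nn.out.link(xoutNn.input)

        with dai.Device(pipeline) as device:
            qIn = device.getInputQueue("frameIn")
            qNn = device.getOutputQueue("nn", maxSize=4, blocking=True)

            # 1. Create ImgFrame on host from a static image
            img = cv2.resize(cv2.imread("all.jpg"), (640, 640))   # assumption: your test image
            imgFrame = dai.ImgFrame()
            # 2. Set width, height and type (planar BGR, as the NN expects CHW data)
            imgFrame.setType(dai.ImgFrame.Type.BGR888p)
            imgFrame.setWidth(640)
            imgFrame.setHeight(640)
            imgFrame.setData(img.transpose(2, 0, 1).flatten())    # HWC -> planar CHW
            # 3. Send via XLinkIn and read the result
            qIn.send(imgFrame)
            inNn = qNn.get()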

        Thanks,
        Jaka

          16 days later

          Hi jakaskerl,
          Can you please provide an example? When I try to setSize on the ImgFrame I get an error about wrong input params. I also found this approach https://discuss.luxonis.com/d/331-send-array-to-device-using-xlink/2, but with it I get this error: Input tensor 'nchw_input' (0) exceeds available data range. Data size (1228800B), tensor offset (0), size (2457600B) - skipping inference
          Code example:

          # Create pipeline
          pipeline = dai.Pipeline()

          camRgb = pipeline.create(dai.node.ColorCamera)
          camRgb.setPreviewSize(640, 640)
          camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
          camRgb.setInterleaved(False)
          camRgb.setColorOrder(dai.ColorCameraProperties.ColorOrder.BGR)
          camRgb.setFp16(True)  # Model requires FP16 input

          # NN that detects faces in the image
          nn = pipeline.create(dai.node.NeuralNetwork)
          nn.setBlobPath(nnPath)
          nn.setNumInferenceThreads(2)
          nn.input.setBlocking(True)

          xinArray = pipeline.createXLinkIn()
          nnOut = pipeline.createXLinkOut()
          xinArray.setStreamName("inArray")
          nnOut.setStreamName("nn")
          xinArray.out.link(nn.input)
          nn.out.link(nnOut.input)

          # Connect to device and start pipeline
          with dai.Device(pipeline) as device:
              # Output queues will be used to get the rgb frames and nn data from the outputs defined above
              qIn = device.getInputQueue(name="inArray", maxSize=4, blocking=False)
              qDet = device.getOutputQueue(name="nn", maxSize=4, blocking=False)
              detections = []

              # Load and preprocess the image
              image = cv2.imread(image_path)
              image = cv2.resize(image, (640, 640))  # Resize to model's input size
              image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
              image = np.expand_dims(image, axis=0)
              image = np.reshape(image, (1,3,640,640)).astype(dtype=np.float16)
              image = image.flatten()
              print(image.dtype)

              data = dai.NNData()
              data.setLayer("nchw_input", image.data)

              while True:
                  qIn.send(data)
                  in_nn = qDet.tryGet()
                  if in_nn is not None:
                      # [print(f"Layer name: {l.name}, Type: {l.dataType}, Dimensions: {l.dims}") for l in inDet.getAllLayers()]
                      # Extract the output shape: (batch_size, channels, num_predictions)
                      boxes = np.array(in_nn.getLayerFp16('box')).reshape(1, 8400, 64).astype(dtype=np.float32)
                      classes = np.array(in_nn.getLayerFp16('class')).reshape(1, 8400, 3).astype(dtype=np.float32)
                      detections = []
                      # print(classes)
                      result = decode_predictions(boxes, classes, image)
                      # print(result)
                      result_boxes = result["boxes"]
                      num_of_dects = result["num_detections"]
                      print("num_of_dects")
                      print(num_of_dects)
                      if result_boxes[0][0][0] != -1.0:
                          detection = {
                              "label": 1,
                              "confidence": 0.1,
                              "box": result_boxes[0][0]}
                          detections.append(detection)

                      res = session.run(
                          output_names=[output_name0, output_name1],
                          input_feed={input_name: image}
                      )
                      result = decode_predictions(res[0], res[1], image)
                      # print(result)
                      result_boxes = result["boxes"]
                      num_of_dects = result["num_detections"]
                      print("num_of_dects onnx")
                      print(num_of_dects)

                  # cv2.imshow("rgb", frame)
                  if cv2.waitKey(1) == ord('q'):
                      break

          Best regards,
          Aleks

            AleksNet
            Something like this

            ret, frame = cap.read()                 # cap is a cv2.VideoCapture (webcam)
            if not ret:
                break
            frame = cv2.resize(frame, (800, 800))
            h, w, c = frame.shape
            bgr_planar = frame.transpose(2, 0, 1).flatten()   # HWC -> planar CHW bytes
            imgFrame = dai.ImgFrame()
            imgFrame.setType(dai.ImgFrame.Type.BGR888p)
            imgFrame.setWidth(w)
            imgFrame.setHeight(h)
            imgFrame.setData(bgr_planar)
            seq_num += 1
            imgFrame.setSequenceNum(seq_num)
            frame_buffer[seq_num] = frame.copy()              # keep a host-side copy keyed by sequence number
            hostInQ.send(imgFrame)                            # input queue of the XLinkIn stream

              jakaskerl
              I have tried saving camRgb.getCvFrame() and the data from it, and it looks like I am getting the right data from the camera. I have also noticed that when I move the pink cube in front of the camera and send this data to the ONNX framework I get some results, but only some… And the blob NN still cannot find anything.

                AleksNet
                cap is the webcam capture; it's merely there to illustrate how to send custom frames/video to the device.

                Can you show some images of that? Particularly the nn.passthrough output?

                Thanks
                Jaka

                  jakaskerl, Sure
                  Here is the image; the red rectangle is where it thinks the cube is.

                  And if I load this image with OpenCV and then run inference on it with ONNX, I get precise results.

                    jakaskerl, sure.
                    Here it is:
                    #!/usr/bin/env python3
                    from pathlib import Path
                    import sys
                    import cv2
                    import depthai as dai
                    import numpy as np
                    import time
                    import tensorflow as tf
                    import keras_cv
                    import keras
                    import onnxruntime

                    nnPath = str((Path('./models/YOLO KERAS/model_fp16_full_ov.blob')).resolve().absolute())
                    nnPath_onnx = str((Path('./models/YOLO KERAS/model_fp16_full.onnx')).resolve().absolute())
                    session = onnxruntime.InferenceSession(nnPath_onnx)
                    input_name = session.get_inputs()[0].name
                    output_name0 = session.get_outputs()[0].name
                    output_name1 = session.get_outputs()[1].name
                    image_path = "image.jpg"
                    BOX_REGRESSION_CHANNELS = 64

                    def decode_regression_to_boxes(preds):
                        """Decodes the results of the YOLOV8Detector forward-pass into boxes.
                        Returns left / top / right / bottom predictions with respect to anchor
                        points.
                        Each coordinate is encoded with 16 predicted values. Those predictions are
                        softmaxed and multiplied by [0..15] to make predictions. The resulting
                        predictions are relative to the stride of an anchor box (and correspondingly
                        relative to the scale of the feature map from which the predictions came).
                        """
                        preds_bbox = keras.layers.Reshape((-1, 4, BOX_REGRESSION_CHANNELS // 4))(
                            preds
                        )
                        preds_bbox = tf.nn.softmax(preds_bbox, axis=-1) * tf.range(
                            BOX_REGRESSION_CHANNELS // 4, dtype="float32"
                        )
                        return tf.reduce_sum(preds_bbox, axis=-1)

                    def dist2bbox(distance, anchor_points):
                        """Decodes distance predictions into xyxy boxes.
                        Input left / top / right / bottom predictions are transformed into xyxy box
                        predictions based on anchor points.
                        The resulting xyxy predictions must be scaled by the stride of their
                        corresponding anchor points to yield an absolute xyxy box.
                        """
                        left_top, right_bottom = tf.split(distance, 2, axis=-1)
                        x1y1 = anchor_points - left_top
                        x2y2 = anchor_points + right_bottom
                        return tf.concat((x1y1, x2y2), axis=-1)  # xyxy bbox

                    def get_anchors(
                        image_shape,
                        strides=[8, 16, 32],
                        base_anchors=[0.5, 0.5],
                    ):
                        """Gets anchor points for YOLOV8.
                        YOLOV8 uses anchor points representing the center of proposed boxes, and
                        matches ground truth boxes to anchors based on center points.
                        Args:
                            image_shape: tuple or list of two integers representing the height and
                                width of input images, respectively.
                            strides: tuple of list of integers, the size of the strides across the
                                image size that should be used to create anchors.
                            base_anchors: tuple or list of two integers representing the offset from
                                (0,0) to start creating the center of anchor boxes, relative to the
                                stride. For example, using the default (0.5, 0.5) creates the first
                                anchor box for each stride such that its center is half of a stride
                                from the edge of the image.
                        Returns:
                            A tuple of anchor centerpoints and anchor strides. Multiplying the
                            two together will yield the centerpoints in absolute x,y format.
                        """
                        base_anchors = tf.constant(base_anchors, dtype="float32")
                        all_anchors = []
                        all_strides = []
                        for stride in strides:
                            hh_centers = tf.range(0, image_shape[0], stride)
                            ww_centers = tf.range(0, image_shape[1], stride)
                            ww_grid, hh_grid = tf.meshgrid(ww_centers, hh_centers)
                            grid = tf.cast(
                                tf.reshape(tf.stack([hh_grid, ww_grid], 2), [-1, 1, 2]),
                                "float32",
                            )
                            anchors = (
                                tf.expand_dims(
                                    base_anchors * tf.constant([stride, stride], "float32"), 0
                                )
                                + grid
                            )
                            anchors = tf.reshape(anchors, [-1, 2])
                            all_anchors.append(anchors)
                            all_strides.append(tf.repeat(stride, anchors.shape[0]))
                        all_anchors = tf.cast(tf.concat(all_anchors, axis=0), "float32")
                        all_strides = tf.cast(tf.concat(all_strides, axis=0), "float32")
                        all_anchors = all_anchors / all_strides[:, None]
                        # Swap the x and y coordinates of the anchors.
                        all_anchors = tf.concat(
                            [all_anchors[:, 1, None], all_anchors[:, 0, None]], axis=-1
                        )
                        return all_anchors, all_strides

                    def decode_predictions(
                        boxes_,
                        scores_,
                        images,
                    ):
                        boxes = boxes_
                        scores = scores_
                        boxes = decode_regression_to_boxes(boxes)
                        anchor_points, stride_tensor = get_anchors(image_shape=(640,640,3))
                        stride_tensor = tf.expand_dims(stride_tensor, axis=-1)
                        box_preds = dist2bbox(boxes, anchor_points) * stride_tensor
                        prediction_decoder = keras_cv.layers.MultiClassNonMaxSuppression(
                            bounding_box_format="xyxy",
                            from_logits=False,
                            iou_threshold=0.5,
                            confidence_threshold=0.5
                        )
                        return prediction_decoder(box_preds, scores)

                    # Get argument first
                    labelMap = [
                        "green", "pink", "orange"
                    ]

                    # Create pipeline
                    pipeline = dai.Pipeline()
                    camRgb = pipeline.create(dai.node.ColorCamera)
                    camRgb.setPreviewSize(640, 640)
                    camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
                    camRgb.setInterleaved(False)
                    camRgb.setColorOrder(dai.ColorCameraProperties.ColorOrder.BGR)
                    camRgb.setFp16(True)  # Model requires FP16 input

                    # NN that detects faces in the image
                    nn = pipeline.create(dai.node.NeuralNetwork)
                    nn.setBlobPath(nnPath)
                    nn.setNumInferenceThreads(2)
                    camRgb.preview.link(nn.input)

                    # Send bounding boxes from the NN to the host via XLink
                    nn_xout = pipeline.create(dai.node.XLinkOut)
                    nn_xout.setStreamName("nn")
                    nn.out.link(nn_xout.input)

                    # Send rgb frames to the host
                    rgb_xout = pipeline.create(dai.node.XLinkOut)
                    rgb_xout.setStreamName("rgb")
                    nn.passthrough.link(rgb_xout.input)

                    # Connect to device and start pipeline
                    with dai.Device(pipeline) as device:
                        # Output queues will be used to get the rgb frames and nn data from the outputs defined above
                        qRgb = device.getOutputQueue(name="rgb", maxSize=4, blocking=False)
                        qDet = device.getOutputQueue(name="nn", maxSize=4, blocking=False)
                        detections = []
                        while True:
                            inRgb = qRgb.get()
                            frame = np.array(inRgb.getData()).view(np.float16).reshape((3,640,640)).transpose(1, 2, 0).astype(np.uint8).copy()
                            in_nn = qDet.tryGet()
                            if in_nn is not None:
                                # [print(f"Layer name: {l.name}, Type: {l.dataType}, Dimensions: {l.dims}") for l in inDet.getAllLayers()]
                                # Extract the output shape: (batch_size, channels, num_predictions)
                                boxes = np.array(in_nn.getLayerFp16('box')).reshape(1, 8400, 64).astype(dtype=np.float32)
                                classes = np.array(in_nn.getLayerFp16('class')).reshape(1, 8400, 3).astype(dtype=np.float32)
                                detections = []
                                # print(classes)
                                result = decode_predictions(boxes, classes, frame)
                                # print(result)
                                result_boxes = result["boxes"]
                                num_of_dects = result["num_detections"]
                                if (num_of_dects[0] > 0):
                                    print("num_of_dects")
                                    print(num_of_dects)
                                    for bbox_data in result_boxes[0]:
                                        bbox = [np.int64(bbox_data[0]), np.int64(bbox_data[1]), np.int64(bbox_data[2]), np.int64(bbox_data[3])]
                                        frame = cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0,255,0), 2)
                                image = inRgb.getCvFrame()
                                image = np.expand_dims(image, axis=0)
                                image = np.reshape(image, (1,3,640,640))
                                res = session.run(
                                    output_names=[output_name0, output_name1],
                                    input_feed={input_name: image}
                                )
                                result = decode_predictions(res[0], res[1], np.expand_dims(np.array(frame), axis=0))
                                # print(result)
                                result_boxes = result["boxes"]
                                num_of_dects = result["num_detections"]
                                if (num_of_dects[0] > 0):
                                    print("num_of_dects onnx")
                                    print(num_of_dects)
                                    for bbox_data in result_boxes[0]:
                                        bbox = [np.int64(bbox_data[0]), np.int64(bbox_data[1]), np.int64(bbox_data[2]), np.int64(bbox_data[3])]
                                        frame = cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0,0,255), 2)

                            cv2.imshow("rgb", frame)
                            if cv2.waitKey(1) == ord('q'):
                                break
                            if cv2.waitKey(1) == ord("s"):
                                cv2.imwrite("img.png", frame)
                                print(frame)
                                reshaped_frame = frame.reshape(-1, frame.shape[2])  # Reshape to (height*width, channels)
                                np.savetxt("img.txt", reshaped_frame, fmt="%.6f")
                                break

                    and a Google Drive folder with the ONNX and blob models:
                    https://drive.google.com/drive/folders/1cXhwfOF7TG81ZSIZ4NJjKl3dlUGRLctz?usp=drive_link

                    And I set the preview frame to the same size as my model's input layer: 640x640.

                      AleksNet
                      Ok, denormalization is not the issue. Why are you using FP16 streams? When converting to blob you can specify the datatype, so you can just use UINT8 input and don't need to perform the conversion each time; this is likely a cause of the issue as well. Also, I am not getting any detections on the blob side, only from ONNX (and those are wrong, like yours). Did you configure the scale and offset when converting to blob?

                      https://docs.luxonis.com/software/ai-inference/conversion/#Conversion-Advanced%20Settings-Model%20Compiler%20Flags
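
                      For example, with the blobconverter Python package you can request a U8 input at compile time (a rough sketch; the path, shave count and flags are assumptions based on this thread, not a verified command for your model):

                      import blobconverter

                      blob_path = blobconverter.from_onnx(
                          model="model_fp16_full.onnx",      # assumed path to your ONNX model
                          data_type="FP16",                  # weights stay FP16 on the VPU
                          shaves=6,
                          version="2022.1",
                          optimizer_params=[
                              "--mean_values=[0,0,0]",
                              "--scale_values=[1,1,1]",
                              "--layout=NCHW",
                              "--input_shape=[1,3,640,640]",
                          ],
                          compile_params=["-ip U8"],         # U8 input; the device converts to FP16 internally
                      )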

                      Thanks,
                      Jaka

                        Hi jakaskerl,
                        I am using FP16 because my model originally used FP32, and I at least managed to reduce it to FP16. I convert from the OpenVINO format, where there is no option for this. But when I converted from ONNX to blob I used these parameters: --data_type=FP16 --mean_values=[0,0,0] --scale_values=[1,1,1] --layout=NCHW --input_shape=[1,3,640,640], so my data stays in the 0-255 range.

                          AleksNet
                          That seems OK, but you can make the model input UINT8; then you can omit setFp16. I'm not sure what the FP16 camera output looks like, perhaps it's the wrong endianness...

                          Thanks,
                          Jaka

                            Hi jakaskerl!
                            I have added a layer that converts the input from uint8 to fp16 and removed the setFp16 flag, and now it works, but only for ONNX. It still does not work for the blob. I converted the ONNX to OpenVINO and then to blob. Here are the code and files:

                            #!/usr/bin/env python3
                            from pathlib import Path
                            import sys
                            import cv2
                            import depthai as dai
                            import numpy as np
                            import time
                            import tensorflow as tf
                            import keras_cv
                            import keras
                            import onnxruntime

                            nnPath = str((Path('./models/YOLO KERAS/model_with_cast_uint8_to_fp16_ov.blob')).resolve().absolute())
                            nnPath_onnx = str((Path('./models/YOLO KERAS/model_with_cast_uint8_to_fp16.onnx')).resolve().absolute())
                            session = onnxruntime.InferenceSession(nnPath_onnx)
                            input_name = session.get_inputs()[0].name
                            output_name0 = session.get_outputs()[0].name
                            output_name1 = session.get_outputs()[1].name
                            image_path = "image.jpg"
                            BOX_REGRESSION_CHANNELS = 64

                            def decode_regression_to_boxes(preds):
                                """Decodes the results of the YOLOV8Detector forward-pass into boxes.
                                Returns left / top / right / bottom predictions with respect to anchor
                                points.
                                Each coordinate is encoded with 16 predicted values. Those predictions are
                                softmaxed and multiplied by [0..15] to make predictions. The resulting
                                predictions are relative to the stride of an anchor box (and correspondingly
                                relative to the scale of the feature map from which the predictions came).
                                """
                                preds_bbox = keras.layers.Reshape((-1, 4, BOX_REGRESSION_CHANNELS // 4))(
                                    preds
                                )
                                preds_bbox = tf.nn.softmax(preds_bbox, axis=-1) * tf.range(
                                    BOX_REGRESSION_CHANNELS // 4, dtype="float32"
                                )
                                return tf.reduce_sum(preds_bbox, axis=-1)

                            def dist2bbox(distance, anchor_points):
                                """Decodes distance predictions into xyxy boxes.
                                Input left / top / right / bottom predictions are transformed into xyxy box
                                predictions based on anchor points.
                                The resulting xyxy predictions must be scaled by the stride of their
                                corresponding anchor points to yield an absolute xyxy box.
                                """
                                left_top, right_bottom = tf.split(distance, 2, axis=-1)
                                x1y1 = anchor_points - left_top
                                x2y2 = anchor_points + right_bottom
                                return tf.concat((x1y1, x2y2), axis=-1)  # xyxy bbox

                            def get_anchors(
                                image_shape,
                                strides=[8, 16, 32],
                                base_anchors=[0.5, 0.5],
                            ):
                                """Gets anchor points for YOLOV8.
                                YOLOV8 uses anchor points representing the center of proposed boxes, and
                                matches ground truth boxes to anchors based on center points.
                                Args:
                                    image_shape: tuple or list of two integers representing the height and
                                        width of input images, respectively.
                                    strides: tuple of list of integers, the size of the strides across the
                                        image size that should be used to create anchors.
                                    base_anchors: tuple or list of two integers representing the offset from
                                        (0,0) to start creating the center of anchor boxes, relative to the
                                        stride. For example, using the default (0.5, 0.5) creates the first
                                        anchor box for each stride such that its center is half of a stride
                                        from the edge of the image.
                                Returns:
                                    A tuple of anchor centerpoints and anchor strides. Multiplying the
                                    two together will yield the centerpoints in absolute x,y format.
                                """
                                base_anchors = tf.constant(base_anchors, dtype="float32")
                                all_anchors = []
                                all_strides = []
                                for stride in strides:
                                    hh_centers = tf.range(0, image_shape[0], stride)
                                    ww_centers = tf.range(0, image_shape[1], stride)
                                    ww_grid, hh_grid = tf.meshgrid(ww_centers, hh_centers)
                                    grid = tf.cast(
                                        tf.reshape(tf.stack([hh_grid, ww_grid], 2), [-1, 1, 2]),
                                        "float32",
                                    )
                                    anchors = (
                                        tf.expand_dims(
                                            base_anchors * tf.constant([stride, stride], "float32"), 0
                                        )
                                        + grid
                                    )
                                    anchors = tf.reshape(anchors, [-1, 2])
                                    all_anchors.append(anchors)
                                    all_strides.append(tf.repeat(stride, anchors.shape[0]))
                                all_anchors = tf.cast(tf.concat(all_anchors, axis=0), "float32")
                                all_strides = tf.cast(tf.concat(all_strides, axis=0), "float32")
                                all_anchors = all_anchors / all_strides[:, None]
                                # Swap the x and y coordinates of the anchors.
                                all_anchors = tf.concat(
                                    [all_anchors[:, 1, None], all_anchors[:, 0, None]], axis=-1
                                )
                                return all_anchors, all_strides

                            def decode_predictions(
                                boxes_,
                                scores_,
                                images,
                            ):
                                boxes = boxes_
                                scores = scores_
                                boxes = decode_regression_to_boxes(boxes)
                                anchor_points, stride_tensor = get_anchors(image_shape=(640,640,3))
                                stride_tensor = tf.expand_dims(stride_tensor, axis=-1)
                                box_preds = dist2bbox(boxes, anchor_points) * stride_tensor
                                prediction_decoder = keras_cv.layers.MultiClassNonMaxSuppression(
                                    bounding_box_format="xyxy",
                                    from_logits=False,
                                    iou_threshold=0.2,
                                    confidence_threshold=0.2
                                )
                                return prediction_decoder(box_preds, scores)

                            # Get argument first
                            labelMap = [
                                "green", "pink", "orange"
                            ]

                            # Create pipeline
                            pipeline = dai.Pipeline()
                            camRgb = pipeline.create(dai.node.ColorCamera)
                            camRgb.setPreviewSize(640, 640)
                            camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
                            camRgb.setInterleaved(False)
                            camRgb.setColorOrder(dai.ColorCameraProperties.ColorOrder.RGB)
                            # camRgb.setFp16(True)  # Model requires FP16 input

                            # NN that detects faces in the image
                            nn = pipeline.create(dai.node.NeuralNetwork)
                            nn.setBlobPath(nnPath)
                            nn.setNumInferenceThreads(2)
                            camRgb.preview.link(nn.input)

                            # Send bounding boxes from the NN to the host via XLink
                            nn_xout = pipeline.create(dai.node.XLinkOut)
                            nn_xout.setStreamName("nn")
                            nn.out.link(nn_xout.input)

                            # Send rgb frames to the host
                            rgb_xout = pipeline.create(dai.node.XLinkOut)
                            rgb_xout.setStreamName("rgb")
                            nn.passthrough.link(rgb_xout.input)

                            # Connect to device and start pipeline
                            with dai.Device(pipeline) as device:
                                # Output queues will be used to get the rgb frames and nn data from the outputs defined above
                                qRgb = device.getOutputQueue(name="rgb", maxSize=4, blocking=False)
                                qDet = device.getOutputQueue(name="nn", maxSize=4, blocking=False)
                                detections = []
                                while True:
                                    inRgb = qRgb.get()
                                    frame = inRgb.getCvFrame()
                                    in_nn = qDet.tryGet()
                                    if in_nn is not None:
                                        # [print(f"Layer name: {l.name}, Type: {l.dataType}, Dimensions: {l.dims}") for l in in_nn.getAllLayers()]
                                        # Extract the output shape: (batch_size, channels, num_predictions)
                                        boxes = np.array(in_nn.getLayerFp16('box')).reshape(1, 8400, 64)
                                        classes = np.array(in_nn.getLayerFp16('class')).reshape(1, 8400, 3)
                                        detections = []
                                        # print(classes)
                                        result = decode_predictions(boxes, classes, frame)
                                        # print(result)
                                        result_boxes = result["boxes"]
                                        num_of_dects = result["num_detections"]
                                        if (num_of_dects[0] > 0):
                                            print("num_of_dects")
                                            print(num_of_dects)
                                            for bbox_data in result_boxes[0]:
                                                bbox = [np.int64(bbox_data[0]), np.int64(bbox_data[1]), np.int64(bbox_data[2]), np.int64(bbox_data[3])]
                                                frame = cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0,255,0), 2)
                                        image = inRgb.getCvFrame()
                                        image = np.expand_dims(image, axis=0)
                                        image = np.reshape(image, (1,3,640,640))
                                        res = session.run(
                                            output_names=[output_name0, output_name1],
                                            input_feed={input_name: image}
                                        )
                                        result = decode_predictions(res[0], res[1], np.expand_dims(np.array(frame), axis=0))
                                        # print(result)
                                        result_boxes = result["boxes"]
                                        num_of_dects = result["num_detections"]
                                        if (num_of_dects[0] > 0):
                                            print("num_of_dects onnx")
                                            print(num_of_dects)
                                            for bbox_data in result_boxes[0]:
                                                bbox = [np.int64(bbox_data[0]), np.int64(bbox_data[1]), np.int64(bbox_data[2]), np.int64(bbox_data[3])]
                                                frame = cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0,0,255), 2)

                                    cv2.imshow("rgb", frame)
                                    if cv2.waitKey(1) == ord('q'):
                                        break
                                    if cv2.waitKey(1) == ord("s"):
                                        cv2.imwrite("img.png", frame)
                                        print(frame)
                                        reshaped_frame = frame.reshape(-1, frame.shape[2])  # Reshape to (height*width, channels)
                                        np.savetxt("img.txt", reshaped_frame, fmt="%.6f")
                                        break

                            Files:
                            https://drive.google.com/drive/folders/1cXhwfOF7TG81ZSIZ4NJjKl3dlUGRLctz
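
                            For anyone following along, a uint8 -> fp16 cast input can be added to an ONNX graph roughly like this (a sketch; the tensor names and shape are assumptions):

                            import onnx
                            from onnx import TensorProto, helper

                            model = onnx.load("model_fp16_full.onnx")        # assumed original model
                            graph = model.graph

                            old_input = graph.input[0]
                            old_name = old_input.name

                            # New UINT8 graph input with the same NCHW shape
                            new_input = helper.make_tensor_value_info(
                                "uint8_input", TensorProto.UINT8, [1, 3, 640, 640]
                            )

                            # Cast UINT8 -> FLOAT16 and feed the result to the old input name
                            cast = helper.make_node(
                                "Cast", inputs=["uint8_input"], outputs=[old_name],
                                to=TensorProto.FLOAT16, name="uint8_to_fp16",
                            )

                            graph.node.insert(0, cast)
                            graph.input.remove(old_input)
                            graph.input.append(new_input)

                            onnx.checker.check_model(model)
                            onnx.save(model, "model_with_cast_uint8_to_fp16.onnx")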

                            thanks,
                            Aleks

                            7 days later

                            Hi @AleksNet,

                            thank you for the update! I want to update you as well. I have compared the predictions of the ONNX and OpenVINO IR models and both work, which suggests that the issue lies in the conversion from IR to blob. I have also tried changing the dynamic input shape of the ONNX model to a static one, but the resulting blob also didn't work. I'm investigating the IR -> blob conversion at the moment and will keep you updated.

                            Best,
                            Jan