Export parameters for a model from Keras YOLOv8
Hi jakaskerl,
Can you please provide me an example? When I try to setSize for ImgFrame I get an error about wrong input params. I also found this approach https://discuss.luxonis.com/d/331-send-array-to-device-using-xlink/2, but with it I get this error: Input tensor 'nchw_input' (0) exceeds available data range. Data size (1228800B), tensor offset (0), size (2457600B) - skipping inference
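(For reference: the reported data size, 1228800 B, is exactly half of the expected tensor size, 2457600 B = 1x3x640x640x2 bytes, i.e. a uint8-sized buffer where an FP16 tensor is expected. A quick, purely illustrative check of that arithmetic:)
# Illustrative only - byte counts for a 1x3x640x640 tensor in uint8 vs FP16
import numpy as np
fp16_bytes = 1 * 3 * 640 * 640 * np.dtype(np.float16).itemsize  # 2457600 B, what the blob expects
uint8_bytes = 1 * 3 * 640 * 640 * np.dtype(np.uint8).itemsize   # 1228800 B, the size reported in the error
print(fp16_bytes, uint8_bytes)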
Code example:
# Create pipeline
pipeline = dai.Pipeline()
camRgb = pipeline.create(dai.node.ColorCamera)
camRgb.setPreviewSize(640, 640)
camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
camRgb.setInterleaved(False)
camRgb.setColorOrder(dai.ColorCameraProperties.ColorOrder.BGR)
camRgb.setFp16(True) # Model requires FP16 input
# NN that detects faces in the image
nn = pipeline.create(dai.node.NeuralNetwork)
nn.setBlobPath(nnPath)
nn.setNumInferenceThreads(2)
nn.input.setBlocking(True)
xinArray = pipeline.createXLinkIn()
nnOut = pipeline.createXLinkOut()
xinArray.setStreamName("inArray")
nnOut.setStreamName("nn")
xinArray.out.link(nn.input)
nn.out.link(nnOut.input)
# Connect to device and start pipeline
with dai.Device(pipeline) as device:
    # Output queues will be used to get the rgb frames and nn data from the outputs defined above
    qIn = device.getInputQueue(name="inArray", maxSize=4, blocking=False)
    qDet = device.getOutputQueue(name="nn", maxSize=4, blocking=False)
    detections = []
    # Load and preprocess the image
    image = cv2.imread(image_path)
    image = cv2.resize(image, (640, 640)) # Resize to model's input size
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    image = np.expand_dims(image, axis=0)
    image = np.reshape(image, (1,3,640,640)).astype(dtype=np.float16)
    image = image.flatten()
    print(image.dtype)
    data = dai.NNData()
    data.setLayer("nchw_input", image.data)
    while True:
        qIn.send(data)
        in_nn = qDet.tryGet()
        if in_nn is not None:
            # [print(f"Layer name: {l.name}, Type: {l.dataType}, Dimensions: {l.dims}") for l in inDet.getAllLayers()]
            # Extract the output shape: (batch_size, channels, num_predictions)
            boxes = np.array(in_nn.getLayerFp16('box')).reshape(1, 8400, 64).astype(dtype=np.float32)
            classes = np.array(in_nn.getLayerFp16('class')).reshape(1, 8400, 3).astype(dtype=np.float32)
            detections=[]
            # print(classes)
            result=decode_predictions(boxes, classes, image)
            # print(result)
            result_boxes=result["boxes"]
            num_of_dects=result["num_detections"]
            print("num_of_dects")
            print(num_of_dects)
            if result_boxes[0][0][0] !=-1.0:
                detection = {
                    "label": 1,
                    "confidence": 0.1,
                    "box": result_boxes[0][0]}
                detections.append(detection)
            res = session.run(
                output_names=[output_name0, output_name1],
                input_feed={input_name: image}
            )
            result=decode_predictions(res[0], res[1], image)
            # print(result)
            result_boxes=result["boxes"]
            num_of_dects=result["num_detections"]
            print("num_of_dects onnx")
            print(num_of_dects)
        # cv2.imshow("rgb", frame)
        if cv2.waitKey(1) == ord('q'):
            break
Best regards,
Aleks
AleksNet
Something like this
ret, frame = cap.read()
if not ret:
    break
frame = cv2.resize(frame, (800, 800))
h, w, c = frame.shape
bgr_planar = frame.transpose(2, 0, 1).flatten()
imgFrame = dai.ImgFrame()
imgFrame.setType(dai.ImgFrame.Type.BGR888p)
imgFrame.setWidth(w)
imgFrame.setHeight(h)
imgFrame.setData(bgr_planar)
seq_num += 1
imgFrame.setSequenceNum(seq_num)
frame_buffer[seq_num] = frame.copy()
hostInQ.send(imgFrame)
jakaskerl
I have tried to save camRgb.getCvFrame() and the data from it, and it looks like I get the right data from the camera. I have now figured out that when I move the pink cube in front of the camera and send this data to the ONNX framework I get some results, but only some… And the blob NN still cannot find anything.
AleksNet
Ok. I assume the model works correctly since only one bbox is displayed (no issue with scale/image type/etc.). I think the bbox is just not rescaled back. Can I see the code please? It looks like you are inputting a smaller (e.g. 300x300) frame, but drawing over e.g. a 1080p one.
see https://docs.luxonis.com/software/depthai/resolution-techniques/
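A minimal sketch of that rescaling, assuming xyxy boxes in NN-input coordinates (the function and names below are illustrative, not from the code in this thread):
# Sketch only: map a bbox from the NN input size (e.g. 640x640) back to the
# displayed frame size (e.g. 1920x1080) before drawing it.
def rescale_bbox(bbox, nn_size, frame_shape):
    x1, y1, x2, y2 = bbox
    nn_w, nn_h = nn_size
    frame_h, frame_w = frame_shape[:2]
    sx, sy = frame_w / nn_w, frame_h / nn_h
    return int(x1 * sx), int(y1 * sy), int(x2 * sx), int(y2 * sy)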
Thanks,
Jaka
jakaskerl, sure.
Here it is:
#!/usr/bin/env python3
from pathlib import Path
import sys
import cv2
import depthai as dai
import numpy as np
import time
import tensorflow as tf
import keras_cv
import keras
import onnxruntime
nnPath = str((Path('./models/YOLO KERAS/model_fp16_full_ov.blob')).resolve().absolute())
nnPath_onnx = str((Path('./models/YOLO KERAS/model_fp16_full.onnx')).resolve().absolute())
session= onnxruntime.InferenceSession(nnPath_onnx)
input_name=session.get_inputs()[0].name
output_name0=session.get_outputs()[0].name
output_name1=session.get_outputs()[1].name
image_path = "image.jpg"
BOX_REGRESSION_CHANNELS=64
def decode_regression_to_boxes(preds):
    """Decodes the results of the YOLOV8Detector forward-pass into boxes.
    Returns left / top / right / bottom predictions with respect to anchor
    points.
    Each coordinate is encoded with 16 predicted values. Those predictions are
    softmaxed and multiplied by [0..15] to make predictions. The resulting
    predictions are relative to the stride of an anchor box (and correspondingly
    relative to the scale of the feature map from which the predictions came).
    """
    preds_bbox = keras.layers.Reshape((-1, 4, BOX_REGRESSION_CHANNELS // 4))(
        preds
    )
    preds_bbox = tf.nn.softmax(preds_bbox, axis=-1) * tf.range(
        BOX_REGRESSION_CHANNELS // 4, dtype="float32"
    )
    return tf.reduce_sum(preds_bbox, axis=-1)
def dist2bbox(distance, anchor_points):
    """Decodes distance predictions into xyxy boxes.
    Input left / top / right / bottom predictions are transformed into xyxy box
    predictions based on anchor points.
    The resulting xyxy predictions must be scaled by the stride of their
    corresponding anchor points to yield an absolute xyxy box.
    """
    left_top, right_bottom = tf.split(distance, 2, axis=-1)
    x1y1 = anchor_points - left_top
    x2y2 = anchor_points + right_bottom
    return tf.concat((x1y1, x2y2), axis=-1)  # xyxy bbox
def get_anchors(
    image_shape,
    strides=[8, 16, 32],
    base_anchors=[0.5, 0.5],
):
    """Gets anchor points for YOLOV8.
    YOLOV8 uses anchor points representing the center of proposed boxes, and
    matches ground truth boxes to anchors based on center points.
    Args:
        image_shape: tuple or list of two integers representing the height and
            width of input images, respectively.
        strides: tuple of list of integers, the size of the strides across the
            image size that should be used to create anchors.
        base_anchors: tuple or list of two integers representing the offset from
            (0,0) to start creating the center of anchor boxes, relative to the
            stride. For example, using the default (0.5, 0.5) creates the first
            anchor box for each stride such that its center is half of a stride
            from the edge of the image.
    Returns:
        A tuple of anchor centerpoints and anchor strides. Multiplying the
        two together will yield the centerpoints in absolute x,y format.
    """
    base_anchors = tf.constant(base_anchors, dtype="float32")
    all_anchors = []
    all_strides = []
    for stride in strides:
        hh_centers = tf.range(0, image_shape[0], stride)
        ww_centers = tf.range(0, image_shape[1], stride)
        ww_grid, hh_grid = tf.meshgrid(ww_centers, hh_centers)
        grid = tf.cast(
            tf.reshape(tf.stack([hh_grid, ww_grid], 2), [-1, 1, 2]),
            "float32",
        )
        anchors = (
            tf.expand_dims(
                base_anchors * tf.constant([stride, stride], "float32"), 0
            )
            + grid
        )
        anchors = tf.reshape(anchors, [-1, 2])
        all_anchors.append(anchors)
        all_strides.append(tf.repeat(stride, anchors.shape[0]))
    all_anchors = tf.cast(tf.concat(all_anchors, axis=0), "float32")
    all_strides = tf.cast(tf.concat(all_strides, axis=0), "float32")
    all_anchors = all_anchors / all_strides[:, None]
    # Swap the x and y coordinates of the anchors.
    all_anchors = tf.concat(
        [all_anchors[:, 1, None], all_anchors[:, 0, None]], axis=-1
    )
    return all_anchors, all_strides
def decode_predictions(
    boxes_,
    scores_,
    images,
):
    boxes = boxes_
    scores = scores_
    boxes = decode_regression_to_boxes(boxes)
    anchor_points, stride_tensor = get_anchors(image_shape=(640,640,3))
    stride_tensor = tf.expand_dims(stride_tensor, axis=-1)
    box_preds = dist2bbox(boxes, anchor_points) * stride_tensor
    prediction_decoder = keras_cv.layers.MultiClassNonMaxSuppression(
        bounding_box_format="xyxy",
        from_logits=False,
        iou_threshold=0.5,
        confidence_threshold=0.5
    )
    return prediction_decoder(box_preds, scores)
# Get argument first
labelMap = [
"green", "pink", "orange"
]
# Create pipeline
pipeline = dai.Pipeline()
camRgb = pipeline.create(dai.node.ColorCamera)
camRgb.setPreviewSize(640, 640)
camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
camRgb.setInterleaved(False)
camRgb.setColorOrder(dai.ColorCameraProperties.ColorOrder.BGR)
camRgb.setFp16(True) # Model requires FP16 input
# NN that detects faces in the image
nn = pipeline.create(dai.node.NeuralNetwork)
nn.setBlobPath(nnPath)
nn.setNumInferenceThreads(2)
camRgb.preview.link(nn.input)
# Send bounding boxes from the NN to the host via XLink
nn_xout = pipeline.create(dai.node.XLinkOut)
nn_xout.setStreamName("nn")
nn.out.link(nn_xout.input)
# Send rgb frames to the host
rgb_xout = pipeline.create(dai.node.XLinkOut)
rgb_xout.setStreamName("rgb")
nn.passthrough.link(rgb_xout.input)
# Connect to device and start pipeline
with dai.Device(pipeline) as device:
    # Output queues will be used to get the rgb frames and nn data from the outputs defined above
    qRgb = device.getOutputQueue(name="rgb", maxSize=4, blocking=False)
    qDet = device.getOutputQueue(name="nn", maxSize=4, blocking=False)
    detections = []
    while True:
        inRgb = qRgb.get()
        frame = np.array(inRgb.getData()).view(np.float16).reshape((3,640,640)).transpose(1, 2, 0).astype(np.uint8).copy()
        in_nn = qDet.tryGet()
        if in_nn is not None:
            # [print(f"Layer name: {l.name}, Type: {l.dataType}, Dimensions: {l.dims}") for l in inDet.getAllLayers()]
            # Extract the output shape: (batch_size, channels, num_predictions)
            boxes = np.array(in_nn.getLayerFp16('box')).reshape(1, 8400, 64).astype(dtype=np.float32)
            classes = np.array(in_nn.getLayerFp16('class')).reshape(1, 8400, 3).astype(dtype=np.float32)
            detections=[]
            # print(classes)
            result=decode_predictions(boxes, classes, frame)
            # print(result)
            result_boxes=result["boxes"]
            num_of_dects=result["num_detections"]
            if(num_of_dects[0] >0):
                print("num_of_dects")
                print(num_of_dects)
                for bbox_data in result_boxes[0]:
                    bbox = [np.int64(bbox_data[0]),np.int64(bbox_data[1]),np.int64(bbox_data[2]),np.int64(bbox_data[3])]
                    frame = cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0,255,0), 2)
        image = inRgb.getCvFrame()
        image = np.expand_dims(image, axis=0)
        image = np.reshape(image, (1,3,640,640))
        res = session.run(
            output_names=[output_name0, output_name1],
            input_feed={input_name: image}
        )
        result=decode_predictions(res[0], res[1], np.expand_dims(np.array(frame),axis=0))
        # print(result)
        result_boxes=result["boxes"]
        num_of_dects=result["num_detections"]
        if(num_of_dects[0] >0):
            print("num_of_dects onnx")
            print(num_of_dects)
            for bbox_data in result_boxes[0]:
                bbox = [np.int64(bbox_data[0]),np.int64(bbox_data[1]),np.int64(bbox_data[2]),np.int64(bbox_data[3])]
                frame = cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0,0,255), 2)
        cv2.imshow("rgb", frame)
        if cv2.waitKey(1) == ord('q'):
            break
        if cv2.waitKey(1) == ord("s"):
            cv2.imwrite("img.png", frame)
            print(frame)
            reshaped_frame = frame.reshape(-1, frame.shape[2]) # Reshape to (height*width, channels)
            np.savetxt("img.txt", reshaped_frame, fmt="%.6f")
            break
And a Google Drive folder with the ONNX and blob models:
https://drive.google.com/drive/folders/1cXhwfOF7TG81ZSIZ4NJjKl3dlUGRLctz?usp=drive_link
And I set the preview frame to the same size as the model's input layer (640x640).
AleksNet
Ok, denormalization is not the issue. Why are you using FP16 streams? When converting to blob, you can specify the datatype, so you can just use UINT8 input and don't need to perform the conversion each time. That is likely a cause of the issue as well. Also, I am not getting any detections on the blob side, only from ONNX (and those are wrong, like yours). Did you configure the scale and offset when converting to blob?
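As a rough sketch of what that could look like with the blobconverter package (paths and shave count below are placeholders, so treat it as an assumption rather than a tested recipe):
import blobconverter

# Sketch only: compile the IR to a blob that takes U8 input, so plain 8-bit
# preview frames can be linked to the NN without an FP16 conversion step.
blob_path = blobconverter.from_openvino(
    xml="model.xml",                 # placeholder paths
    bin="model.bin",
    data_type="FP16",                # internal weight/activation precision
    shaves=6,
    compile_params=["-ip U8"],       # input precision: unsigned 8-bit
)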
Thanks,
Jaka
Hi jakaskerl,
I am using FP16 because my model originally used FP32, but I changed it at least to FP16. And I convert from the OpenVINO format, where there are no options for this. But when I converted from ONNX to blob, I used these parameters: --data_type=FP16 --mean_values=[0,0,0] --scale_values=[1,1,1] --layout=NCHW --input_shape=[1,3,640,640], so my data is in the 0-255 range.
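For illustration, roughly the same flags can also be passed through blobconverter when going straight from ONNX to blob; the snippet below is an assumed, untested sketch with placeholder paths:
import blobconverter

# Sketch only: ONNX -> blob with the Model Optimizer flags quoted above,
# plus U8 input precision for the compile step.
blob_path = blobconverter.from_onnx(
    model="model_fp16_full.onnx",    # placeholder path
    data_type="FP16",
    shaves=6,
    optimizer_params=[
        "--mean_values=[0,0,0]",
        "--scale_values=[1,1,1]",
        "--layout=NCHW",
        "--input_shape=[1,3,640,640]",
    ],
    compile_params=["-ip U8"],
)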
Hi jakaskerl!
I have added a layer to convert from uint8 to fp16 at the input and removed the setFp16 flag, and now it works, but only for ONNX. It still does not work for the blob. I converted the ONNX to OpenVINO and then to blob. Here are the code and files:
#!/usr/bin/env python3
from pathlib import Path
import sys
import cv2
import depthai as dai
import numpy as np
import time
import tensorflow as tf
import keras_cv
import keras
import onnxruntime
nnPath = str((Path('./models/YOLO KERAS/model_with_cast_uint8_to_fp16_ov.blob')).resolve().absolute())
nnPath_onnx = str((Path('./models/YOLO KERAS/model_with_cast_uint8_to_fp16.onnx')).resolve().absolute())
session= onnxruntime.InferenceSession(nnPath_onnx)
input_name=session.get_inputs()[0].name
output_name0=session.get_outputs()[0].name
output_name1=session.get_outputs()[1].name
image_path = "image.jpg"
BOX_REGRESSION_CHANNELS=64
def decode_regression_to_boxes(preds):
    """Decodes the results of the YOLOV8Detector forward-pass into boxes.
    Returns left / top / right / bottom predictions with respect to anchor
    points.
    Each coordinate is encoded with 16 predicted values. Those predictions are
    softmaxed and multiplied by [0..15] to make predictions. The resulting
    predictions are relative to the stride of an anchor box (and correspondingly
    relative to the scale of the feature map from which the predictions came).
    """
    preds_bbox = keras.layers.Reshape((-1, 4, BOX_REGRESSION_CHANNELS // 4))(
        preds
    )
    preds_bbox = tf.nn.softmax(preds_bbox, axis=-1) * tf.range(
        BOX_REGRESSION_CHANNELS // 4, dtype="float32"
    )
    return tf.reduce_sum(preds_bbox, axis=-1)
def dist2bbox(distance, anchor_points):
    """Decodes distance predictions into xyxy boxes.
    Input left / top / right / bottom predictions are transformed into xyxy box
    predictions based on anchor points.
    The resulting xyxy predictions must be scaled by the stride of their
    corresponding anchor points to yield an absolute xyxy box.
    """
    left_top, right_bottom = tf.split(distance, 2, axis=-1)
    x1y1 = anchor_points - left_top
    x2y2 = anchor_points + right_bottom
    return tf.concat((x1y1, x2y2), axis=-1)  # xyxy bbox
def get_anchors(
    image_shape,
    strides=[8, 16, 32],
    base_anchors=[0.5, 0.5],
):
    """Gets anchor points for YOLOV8.
    YOLOV8 uses anchor points representing the center of proposed boxes, and
    matches ground truth boxes to anchors based on center points.
    Args:
        image_shape: tuple or list of two integers representing the height and
            width of input images, respectively.
        strides: tuple of list of integers, the size of the strides across the
            image size that should be used to create anchors.
        base_anchors: tuple or list of two integers representing the offset from
            (0,0) to start creating the center of anchor boxes, relative to the
            stride. For example, using the default (0.5, 0.5) creates the first
            anchor box for each stride such that its center is half of a stride
            from the edge of the image.
    Returns:
        A tuple of anchor centerpoints and anchor strides. Multiplying the
        two together will yield the centerpoints in absolute x,y format.
    """
    base_anchors = tf.constant(base_anchors, dtype="float32")
    all_anchors = []
    all_strides = []
    for stride in strides:
        hh_centers = tf.range(0, image_shape[0], stride)
        ww_centers = tf.range(0, image_shape[1], stride)
        ww_grid, hh_grid = tf.meshgrid(ww_centers, hh_centers)
        grid = tf.cast(
            tf.reshape(tf.stack([hh_grid, ww_grid], 2), [-1, 1, 2]),
            "float32",
        )
        anchors = (
            tf.expand_dims(
                base_anchors * tf.constant([stride, stride], "float32"), 0
            )
            + grid
        )
        anchors = tf.reshape(anchors, [-1, 2])
        all_anchors.append(anchors)
        all_strides.append(tf.repeat(stride, anchors.shape[0]))
    all_anchors = tf.cast(tf.concat(all_anchors, axis=0), "float32")
    all_strides = tf.cast(tf.concat(all_strides, axis=0), "float32")
    all_anchors = all_anchors / all_strides[:, None]
    # Swap the x and y coordinates of the anchors.
    all_anchors = tf.concat(
        [all_anchors[:, 1, None], all_anchors[:, 0, None]], axis=-1
    )
    return all_anchors, all_strides
def decode_predictions(
    boxes_,
    scores_,
    images,
):
    boxes = boxes_
    scores = scores_
    boxes = decode_regression_to_boxes(boxes)
    anchor_points, stride_tensor = get_anchors(image_shape=(640,640,3))
    stride_tensor = tf.expand_dims(stride_tensor, axis=-1)
    box_preds = dist2bbox(boxes, anchor_points) * stride_tensor
    prediction_decoder = keras_cv.layers.MultiClassNonMaxSuppression(
        bounding_box_format="xyxy",
        from_logits=False,
        iou_threshold=0.2,
        confidence_threshold=0.2
    )
    return prediction_decoder(box_preds, scores)
# Get argument first
labelMap = [
    "green", "pink", "orange"
]
# Create pipeline
pipeline = dai.Pipeline()
camRgb = pipeline.create(dai.node.ColorCamera)
camRgb.setPreviewSize(640, 640)
camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
camRgb.setInterleaved(False)
camRgb.setColorOrder(dai.ColorCameraProperties.ColorOrder.RGB)
# camRgb.setFp16(True) # Model requires FP16 input
# NN that detects faces in the image
nn = pipeline.create(dai.node.NeuralNetwork)
nn.setBlobPath(nnPath)
nn.setNumInferenceThreads(2)
camRgb.preview.link(nn.input)
# Send bounding boxes from the NN to the host via XLink
nn_xout = pipeline.create(dai.node.XLinkOut)
nn_xout.setStreamName("nn")
nn.out.link(nn_xout.input)
# Send rgb frames to the host
rgb_xout = pipeline.create(dai.node.XLinkOut)
rgb_xout.setStreamName("rgb")
nn.passthrough.link(rgb_xout.input)
# Connect to device and start pipeline
with dai.Device(pipeline) as device:
    # Output queues will be used to get the rgb frames and nn data from the outputs defined above
    qRgb = device.getOutputQueue(name="rgb", maxSize=4, blocking=False)
    qDet = device.getOutputQueue(name="nn", maxSize=4, blocking=False)
    detections = []
    while True:
        inRgb = qRgb.get()
        frame = inRgb.getCvFrame()
        in_nn = qDet.tryGet()
        if in_nn is not None:
            # [print(f"Layer name: {l.name}, Type: {l.dataType}, Dimensions: {l.dims}") for l in in_nn.getAllLayers()]
            # Extract the output shape: (batch_size, channels, num_predictions)
            boxes = np.array(in_nn.getLayerFp16('box')).reshape(1, 8400, 64)
            classes = np.array(in_nn.getLayerFp16('class')).reshape(1, 8400, 3)
            detections=[]
            # print(classes)
            result=decode_predictions(boxes, classes, frame)
            # print(result)
            result_boxes=result["boxes"]
            num_of_dects=result["num_detections"]
            if(num_of_dects[0] >0):
                print("num_of_dects")
                print(num_of_dects)
                for bbox_data in result_boxes[0]:
                    bbox = [np.int64(bbox_data[0]),np.int64(bbox_data[1]),np.int64(bbox_data[2]),np.int64(bbox_data[3])]
                    frame = cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0,255,0), 2)
        image = inRgb.getCvFrame()
        image = np.expand_dims(image, axis=0)
        image = np.reshape(image, (1,3,640,640))
        res = session.run(
            output_names=[output_name0, output_name1],
            input_feed={input_name: image}
        )
        result=decode_predictions(res[0], res[1], np.expand_dims(np.array(frame),axis=0))
        # print(result)
        result_boxes=result["boxes"]
        num_of_dects=result["num_detections"]
        if(num_of_dects[0] >0):
            print("num_of_dects onnx")
            print(num_of_dects)
            for bbox_data in result_boxes[0]:
                bbox = [np.int64(bbox_data[0]),np.int64(bbox_data[1]),np.int64(bbox_data[2]),np.int64(bbox_data[3])]
                frame = cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0,0,255), 2)
        cv2.imshow("rgb", frame)
        if cv2.waitKey(1) == ord('q'):
            break
        if cv2.waitKey(1) == ord("s"):
            cv2.imwrite("img.png", frame)
            print(frame)
            reshaped_frame = frame.reshape(-1, frame.shape[2]) # Reshape to (height*width, channels)
            np.savetxt("img.txt", reshaped_frame, fmt="%.6f")
            break
Files:
https://drive.google.com/drive/folders/1cXhwfOF7TG81ZSIZ4NJjKl3dlUGRLctz
Thanks,
Aleks
Hi @AleksNet,
thank you for the update! I want to update you as well. I have compared the predictions of the ONNX and OpenVINO IR models, and both models work. This suggests that the issue lies in the conversion from IR to blob. I have also tried changing the dynamic input shape of the ONNX model to static, but the resulting blob also didn't work. I'm investigating the IR -> blob conversion at the moment. I'll keep you updated.
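A minimal sketch of how such a comparison can be done (assuming an FP32-input model and placeholder file names, not the exact script used here):
import numpy as np
import onnxruntime
from openvino.runtime import Core

# Sketch only: feed the same random NCHW input to ONNX Runtime and to the
# OpenVINO IR model, then compare the outputs element-wise.
x = (np.random.rand(1, 3, 640, 640) * 255).astype(np.float32)

sess = onnxruntime.InferenceSession("model.onnx")
onnx_out = sess.run(None, {sess.get_inputs()[0].name: x})

core = Core()
compiled = core.compile_model(core.read_model("model.xml"), "CPU")
ir_out = list(compiled([x]).values())

for a, b in zip(onnx_out, ir_out):
    print("max abs diff:", np.max(np.abs(np.asarray(a) - np.asarray(b))))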
Best,
Jan