Hi Letty
GPT:
Understanding the Output Structure
Output Shapes:
• Output1: [1, 85, 44, 80] (Large scale)
• Output2: [1, 85, 22, 40] (Medium scale)
• Output3: [1, 85, 11, 20] (Small scale)
Interpretation:
• Batch Size: 1
• Channels: 85 (4 bounding-box parameters (x, y, w, h) + 1 objectness score + 80 class probabilities for the COCO dataset)
• Height & Width: Grid dimensions for that scale.
Grid Sizes:
• Output1 (Large scale): Grid size 80 x 44
• Output2 (Medium scale): Grid size 40 x 22
• Output3 (Small scale): Grid size 20 x 11
Strides:
Given that your input image size is 640 x 352 (width x height), each stride is the input width divided by the grid width:
• Stride1: 8 (640 / 80)
• Stride2: 16 (640 / 40)
• Stride3: 32 (640 / 20)
import cv2
import numpy as np
from openvino.runtime import Core
# Create the OpenVINO runtime, load the IR model from disk and compile it
# for the CPU device.
ie = Core()
model = ie.read_model(
    model="models/yolov8n_coco_640x352/yolov8n_coco_640x352.xml",
)
compiled_model = ie.compile_model(model=model, device_name="CPU")

# Port handles of the compiled model: every output (the decode loop iterates
# over these) and the first input.
output_layers_ir = compiled_model.outputs
input_layer_ir = compiled_model.input(0)
# Read and preprocess the image
image = cv2.imread("../test.jpg")
if image is None:
    # cv2.imread reports a missing or unreadable file by returning None
    # instead of raising; fail here with a clear message rather than with an
    # obscure error inside cv2.resize.
    raise FileNotFoundError("Could not read image: ../test.jpg")

# Network input resolution (the model expects 640 x 352, width x height).
input_height, input_width = 352, 640

# cv2.resize takes the target size as (width, height).
resized_image = cv2.resize(image, (input_width, input_height))
input_image = resized_image.transpose(2, 0, 1)  # HWC to CHW

# Scale pixel values to [0, 1] and add the batch dimension. The division by
# 255.0 promotes the data to float64, so cast explicitly to float32 instead
# of relying on the runtime to convert the input.
# NOTE(review): the image stays in OpenCV's BGR channel order - confirm this
# matches what the model was trained/exported with.
input_image = (input_image[np.newaxis, :] / 255.0).astype(np.float32)
# Post-processing settings: class count plus the confidence and IoU
# thresholds used for filtering and non-maximum suppression.
num_classes = 80
conf_threshold = 0.25
iou_threshold = 0.45

# Per-output stride and (grid_h, grid_w), keyed by output index:
#   0 -> Output1 [1,85,44,80]
#   1 -> Output2 [1,85,22,40]
#   2 -> Output3 [1,85,11,20]
strides = [8, 16, 32]
output_shapes = dict(enumerate([(44, 80), (22, 40), (11, 20)]))

# Detections from every output are accumulated here.
all_detections = []

# Run a single forward pass on the preprocessed image.
outputs = compiled_model([input_image])
# Decode every output tensor into [x1, y1, x2, y2, score, class_id] rows
# (coordinates are in network-input pixels) and collect them in
# all_detections.
for output_layer in output_layers_ir:
    output = outputs[output_layer]

    # Read the grid size from the tensor itself ([1, C, H, W]) and derive the
    # stride from it, so decoding does not depend on the order in which the
    # runtime lists the outputs or on hard-coded shape tables.
    _, num_channels, grid_h, grid_w = output.shape
    stride = input_width // grid_w

    # [C, H, W] -> [H, W, C] -> [H * W, C]; row k is grid cell
    # (k // grid_w, k % grid_w).
    output = output[0].transpose(1, 2, 0).reshape(-1, num_channels)

    # Sigmoid over objectness (channel 4) and class scores (channels 5+),
    # computed into a new array so the inference result is left untouched.
    probs = 1.0 / (1.0 + np.exp(-output[:, 4:]))
    objectness = probs[:, 0]

    # Filter out cells with low objectness
    mask = objectness > conf_threshold
    if not mask.any():
        continue

    # Final per-class score = class probability * objectness
    scores = probs[mask, 1:] * objectness[mask, np.newaxis]
    class_ids = np.argmax(scores, axis=1)
    class_scores = scores[np.arange(len(scores)), class_ids]

    # Only keep detections whose best class score is above the threshold
    keep = class_scores > conf_threshold
    cells = np.flatnonzero(mask)[keep]
    class_ids = class_ids[keep]
    class_scores = class_scores[keep]

    # Recover the grid coordinates of the surviving cells from their
    # row-major index.
    grid_y, grid_x = np.divmod(cells, grid_w)

    # Decode bounding boxes: centre is an offset from the cell, size is
    # exponential, both scaled by the stride.
    raw = output[cells, :4]
    x = (raw[:, 0] + grid_x) * stride
    y = (raw[:, 1] + grid_y) * stride
    w = np.exp(raw[:, 2]) * stride
    h = np.exp(raw[:, 3]) * stride

    # Convert centre/size to corners and append all rows at once
    detections = np.column_stack(
        [x - w / 2, y - h / 2, x + w / 2, y + h / 2, class_scores, class_ids]
    )
    all_detections.extend(detections.tolist())
# Convert to numpy array: one row per detection,
# [x1, y1, x2, y2, score, class_id] in network-input pixel coordinates.
all_detections = np.array(all_detections)

# Apply Non-Maximum Suppression
if len(all_detections) > 0:
    boxes = all_detections[:, :4]
    scores = all_detections[:, 4]
    class_ids = all_detections[:, 5].astype(int)

    # cv2.dnn.NMSBoxes expects boxes as [x, y, width, height], not as corner
    # pairs, so convert a copy for NMS and keep the corners for drawing.
    nms_boxes = boxes.copy()
    nms_boxes[:, 2:] -= nms_boxes[:, :2]

    # Perform NMS
    indices = cv2.dnn.NMSBoxes(
        bboxes=nms_boxes.tolist(),
        scores=scores.tolist(),
        score_threshold=conf_threshold,
        nms_threshold=iou_threshold,
    )
    # Depending on the OpenCV version the result is a flat index array, an
    # (N, 1) array, or an empty tuple; normalise all of them to a flat array.
    indices = np.asarray(indices, dtype=int).reshape(-1)

    # Boxes were decoded at the network input resolution, but they are drawn
    # on the original (unresized) image, so map them back to its size.
    scale_x = image.shape[1] / input_width
    scale_y = image.shape[0] / input_height

    # Draw detections
    for i in indices:
        x1, y1, x2, y2 = boxes[i]
        conf = scores[i]
        class_id = class_ids[i]
        top_left = (int(x1 * scale_x), int(y1 * scale_y))
        bottom_right = (int(x2 * scale_x), int(y2 * scale_y))

        # Draw bounding box
        cv2.rectangle(
            image,
            top_left,
            bottom_right,
            color=(0, 255, 0),
            thickness=2,
        )

        # Put label just above the box's top-left corner
        label = f"{class_id}: {conf:.2f}"
        cv2.putText(
            image,
            label,
            (top_left[0], top_left[1] - 10),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 255, 0),
            2,
        )

    # Show the image
    cv2.imshow("Detections", image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
else:
    print("No detections")
Thanks,
Jaka