Hi @erik and everyone,
I trained a ResNet18 model with PyTorch to classify excavator actions and converted it to a .blob file for the OAK D camera. However, its accuracy was lower on the camera than on the host.
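For context, the conversion path was PyTorch -> ONNX -> .blob. The export step looked roughly like this (a simplified sketch; the file names and class count are placeholders, and my exact arguments may have differed):

import torch
import torchvision

NUM_CLASSES = 4  # placeholder for my actual number of excavator actions
model = torchvision.models.resnet18(num_classes=NUM_CLASSES)
model.load_state_dict(torch.load("resnet18_excavator.pth"))
model.eval()

# Export with a dummy 1x3x224x224 input, matching the NN input size
dummy = torch.randn(1, 3, 224, 224)
torch.onnx.export(model, dummy, "resnet18_excavator.onnx",
                  input_names=["input"], output_names=["output"],
                  opset_version=12)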
This is part of the script running on my OAK D camera:
import cv2
import depthai as dai
import numpy as np

def softmax(x):
    """Compute softmax values for each set of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

def to_planar(arr: np.ndarray, shape: tuple) -> np.ndarray:
    """Resize a frame and flatten it from HWC to planar (CHW) layout."""
    resized = cv2.resize(arr, shape)
    return resized.transpose(2, 0, 1).flatten()
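(The pipeline definition itself is omitted here. It is essentially an XLinkIn feeding a NeuralNetwork node, with the NN result and the passthrough frame returned over XLinkOut; roughly this sketch, where blobPath points at the converted .blob:)

pipeline = dai.Pipeline()

# Host -> device: frames sent in from the video file
xin = pipeline.create(dai.node.XLinkIn)
xin.setStreamName("inFrame")

# NeuralNetwork node running the converted .blob
nn = pipeline.create(dai.node.NeuralNetwork)
nn.setBlobPath(blobPath)
xin.out.link(nn.input)

# Device -> host: NN results and passthrough frames
xoutNN = pipeline.create(dai.node.XLinkOut)
xoutNN.setStreamName("nn")
nn.out.link(xoutNN.input)

xoutPass = pipeline.create(dai.node.XLinkOut)
xoutPass.setStreamName("pass")
nn.passthrough.link(xoutPass.input)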
# Pipeline defined, now the device is assigned and the pipeline is started
with dai.Device(pipeline) as device:
    # Input queue will be used to send video frames to the device.
    qIn = device.getInputQueue(name="inFrame")
    # Output queue will be used to get NN data from the video frames.
    qDet = device.getOutputQueue(name="nn", maxSize=4, blocking=False)  # (tried maxSize=6, blocking=True as well)
    qPass = device.getOutputQueue("pass")

    frame = None
    result = None
    cap = cv2.VideoCapture(videoPath)
    while cap.isOpened():
        read_correctly, frame = cap.read()
        if not read_correctly:
            break

        # Resize to the NN input size and flatten to planar (CHW) layout
        frame_planar = to_planar(frame, (224, 224))

        # Create a dai.ImgFrame and send it to the device
        img = dai.ImgFrame()
        img.setType(dai.RawImgFrame.Type.BGR888p)
        img.setSize(224, 224)
        img.setData(frame_planar)
        qIn.send(img)

        inDet = qDet.tryGet()
        if inDet is not None:
            data = softmax(inDet.getFirstLayerFp16())
            result_conf = np.max(data)
            if result_conf > 0.2:
                result = {
                    "name": labels[np.argmax(data)],
                    "conf": round(100 * result_conf, 2)
                }
            else:
                result = None

        frame_main = qPass.get().getCvFrame()
        if result is not None:
            cv2.putText(frame_main, "{}".format(result["name"]), (5, 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 0, 255))
            cv2.putText(frame_main, "{}%".format(result["conf"]), (5, 20),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 0, 255))
        cv2.imshow("passthrough", cv2.resize(frame_main, (224, 224)))
        # waitKey is required for imshow to actually refresh the window
        if cv2.waitKey(1) == ord('q'):
            break
And this is how I run the model on my host (without the OAK D camera):
import cv2
import torch
import torch.nn.functional as F

# model, class_names and video_path are defined earlier in the script
cap = cv2.VideoCapture(video_path)
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Convert the frame to RGB for model inference
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    # Resize the frame to 224x224
    frame_resized = cv2.resize(frame_rgb, (224, 224))
    # Scale to 0..1 and add the batch dimension
    img_tensor = torch.from_numpy(frame_resized / 255.0).permute(2, 0, 1).float().unsqueeze(0)

    # Inference with the ResNet18 model
    with torch.no_grad():
        outputs = model(img_tensor)
        probs = F.softmax(outputs, dim=1)

    # Get the class with the highest confidence
    confidences, class_idx = probs.squeeze(0).max(0)
    label = f'{class_names[class_idx]} {confidences:.2f}'

    # Draw the class and confidence on the frame
    cv2.putText(frame_resized, label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)
    # Convert back to BGR for display
    frame_display = cv2.cvtColor(frame_resized, cv2.COLOR_RGB2BGR)
    # Display the frame
    cv2.imshow('Video', frame_display)
    # waitKey is required for imshow to actually refresh the window
    if cv2.waitKey(1) == ord('q'):
        break
I ran the model on both the OAK D and my host. While both ran without errors, the OAK D's results were significantly less accurate. Could the difference in neural network processing (e.g., input preprocessing or FP16 precision) be the cause?
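One difference I can see between the two paths: on the host I convert BGR to RGB and divide by 255.0, while on the device I send raw planar BGR values in the 0-255 range, and I'm not sure the blob applies any normalization. To check how much that alone matters, I could mimic the device-side input on the host with something like this (a sketch reusing model from the host script above):

# Feed the model the same kind of input the device receives:
# raw BGR channel order, values in 0..255, no normalization
frame_resized = cv2.resize(frame, (224, 224))  # note: no BGR -> RGB conversion
img_tensor = torch.from_numpy(frame_resized).permute(2, 0, 1).float().unsqueeze(0)
with torch.no_grad():
    outputs = model(img_tensor)
    probs = F.softmax(outputs, dim=1)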
I also trained a YOLOv5s model and observed the same pattern: better accuracy on the host than on the OAK D. This suggests the issue might not be with the models themselves.
I've been confused about this for quite a while, and I just want the NN node on my OAK D to produce results as accurate as the host's.
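If a preprocessing mismatch does turn out to be the cause, my understanding is that the mean/scale and channel order can be baked into the blob at conversion time, so the device can keep receiving raw 0-255 BGR frames. Something like this with the blobconverter package (a sketch based on my host preprocessing; I haven't verified these exact flags):

import blobconverter

blob_path = blobconverter.from_onnx(
    model="resnet18_excavator.onnx",  # placeholder file name
    data_type="FP16",
    shaves=6,
    optimizer_params=[
        "--mean_values=[0,0,0]",
        "--scale_values=[255,255,255]",   # replicates the /255.0 on the host
        "--reverse_input_channels",       # matches the BGR -> RGB conversion on the host
    ],
)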
Glad to hear your thoughts!
Regards,
Austin