Hello, I'm trying to deploy a custom late-fusion multi-modal network onto an OAK-D-IoT-40.
The network takes an image (NCHW, 1x3x360x640) and sensor data (NWC, 1x200x5) as inputs.
My model is in both ONNX and OpenVINO IR format, and both give sane results when
tested on the training dataset (99% accuracy).
But when I convert them to a blob with blobconverter, the training accuracy drops to about 37%.
I've tried many things but I can't figure out what is going on.
I wondered whether I was passing in wrong mean and scale values, but the same parameters
worked perfectly fine and gave good results when I used them to convert to OpenVINO IR.
I assume either:
a. something went wrong during the conversion from OpenVINO IR to blob, or
b. some part of my code is causing the major drop in accuracy.
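For reference, the conversion step looks roughly like this (a minimal sketch: the model path, mean/scale values, and shave count below are placeholders, not my exact settings):

import blobconverter

# Sketch of the ONNX -> blob conversion (placeholder paths and values).
# optimizer_params are forwarded to OpenVINO's Model Optimizer (this is where
# mean/scale get baked in); compile_params go to myriad_compile, where -ip
# sets the expected precision of ALL inputs (U8 vs FP16) unless overridden
# per input, e.g. "-iop image_input:U8,mec_input:FP16" (if I read the docs right).
blob_path = blobconverter.from_onnx(
    model="model.onnx",  # placeholder path
    data_type="FP16",
    shaves=6,            # placeholder shave count
    optimizer_params=[
        "--mean_values=image_input[123.675,116.28,103.53]",  # placeholder means
        "--scale_values=image_input[58.395,57.12,57.375]",   # placeholder scales
    ],
    compile_params=["-ip U8"],
)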
I was hoping I could get some help. Here's my code for testing the blob on my device:
import numpy as np
import cv2
import depthai as dai
import os
import time
#####################################################
# Example: Minimal code for .blob inference on DepthAI
# with internal mean/std for image branch
# and no normalization for mechanical branch
#####################################################
# Label mapping (adjust if you have different labels)
class_label = {
    0: 'unpaved ramp ascent',
    1: 'paved ramp ascent',
    2: 'paved ramp descent',
    3: 'unpaved ramp descent',
    4: 'unpaved flat',
    5: 'paved flat',
    6: 'stair ascent',
    7: 'stair descent'
}
# Softmax for final logits
def softmax(logits):
    exp_logits = np.exp(logits - np.max(logits))
    return exp_logits / np.sum(exp_logits)
def preprocess_image_depthai(image_path):
    bgr_img = cv2.imread(image_path)
    if bgr_img is None:
        return None
    # Resize to (640, 360) => (W, H)
    resized_bgr = cv2.resize(bgr_img, (640, 360))
    # Convert BGR to RGB
    rgb_img = cv2.cvtColor(resized_bgr, cv2.COLOR_BGR2RGB)
    # Transpose (H, W, C) to (C, H, W) = (3, 360, 640)
    transposed = np.transpose(rgb_img, (2, 0, 1))
    # dtype stays uint8 (assuming the original image was uint8)
    # Add a batch dimension => (1, 3, 360, 640)
    # (np.expand_dims inserts a new axis, e.g. shape (3,) -> (1, 3) with axis=0)
    return np.expand_dims(transposed, axis=0)
# Example file paths (adjust to your own):
image_dir = '/train/img/'
mech_data_np = '/train/mec_posimumag.npy'
label_path = '/train/gt.npy' # Path to ground truth labels
blob_path = "0108_nwc _output_named.blob"
# Load mechanical data and labels
mech_data = np.load(mech_data_np).astype(np.float16)
labels = np.load(label_path)
# Create DepthAI pipeline
pipeline = dai.Pipeline()
# Create XLinkIn nodes (image + mechanical)
cam_xin = pipeline.create(dai.node.XLinkIn)
cam_xin.setStreamName("cam_input")
mech_xin = pipeline.create(dai.node.XLinkIn)
mech_xin.setStreamName("mech_input")
# ImageManip passthrough between XLinkIn and the NN (no crop/resize configured)
manip = pipeline.create(dai.node.ImageManip)
nn = pipeline.create(dai.node.NeuralNetwork)
nn.setBlobPath(blob_path) # .blob with internal mean/std (for image_input) and none for mec_input
# Link nodes
cam_xin.out.link(manip.inputImage)
manip.out.link(nn.inputs['image_input'])
mech_xin.out.link(nn.inputs['mec_input'])
xout_nn = pipeline.create(dai.node.XLinkOut)
xout_nn.setStreamName("nn")
nn.out.link(xout_nn.input)
# Inference loop
with dai.Device(pipeline) as device:
    cam_input_queue = device.getInputQueue("cam_input", maxSize=4, blocking=False)
    mech_input_queue = device.getInputQueue("mech_input", maxSize=4, blocking=False)
    q_nn = device.getOutputQueue("nn", maxSize=4, blocking=False)
    correct_predictions = 0
    total_images = 0
    times = []
    N = len(labels)
    for idx in range(N):
        start_time = time.time()
        image_path = os.path.join(image_dir, f"{idx}.png")
        preproc_img = preprocess_image_depthai(image_path)
        if preproc_img is None:
            print(f"Failed to load image {image_path}, skipping.")
            continue
        # Convert to ImgFrame
        imgFrame = dai.ImgFrame()
        # Flatten to 1D (data stays uint8)
        imgFrame.setData(preproc_img.flatten())
        imgFrame.setType(dai.RawImgFrame.Type.RGB888p)
        imgFrame.setWidth(640)
        imgFrame.setHeight(360)
        cam_input_queue.send(imgFrame)
        # Mechanical data: shape (1, 200, 5), FP16
        mech_sample = mech_data[idx].reshape(1, 200, 5)
        mec_tensor = dai.NNData()
        # NNData.setLayer with float data should be stored as FP16 on the device side
        mec_tensor.setLayer("mec_input", mech_sample.flatten())
        mech_input_queue.send(mec_tensor)
        # Get NN output (get() blocks until a result arrives)
        in_nn = q_nn.get()
        if in_nn:
            logits = np.array(in_nn.getLayerFp16("output"))  # Expect length 8 for a single 8-class output
            probabilities = softmax(logits)
            predicted_class = np.argmax(probabilities)
            # Compare to ground truth
            if predicted_class == labels[idx]:
                correct_predictions += 1
            total_images += 1
            # Debug
            iter_time = time.time() - start_time
            times.append(iter_time)
            accuracy = (correct_predictions / total_images) * 100
            print(f"[{idx}] Predicted: {class_label[predicted_class]}, "
                  f"GT: {class_label[labels[idx]]} "
                  f"Acc: {accuracy:.2f}%, Iter time: {iter_time:.4f}s")
# Final stats
overall_acc = (correct_predictions / total_images) * 100 if total_images > 0 else 0
avg_time = np.mean(times) if len(times) > 0 else 0
print(f"Final Accuracy: {overall_acc:.2f}%")
print(f"Avg Iter Time: {avg_time:.4f}s")