Big drop in accuracy after converting to blob

jiwon

Hello, I'm trying to deploy late-fusion multi-modal custom network onto Oak-D-IoT-40.

The network takes image(NCHW)(1x3x360x640) and sensor data(NWC)(1x200x5) for inputs.

My model is in onnx and openvino format, both of which are giving sane results when

tested with training dataset (99% accuracy)

But when I convert them to blob with blobconverter, the training accuracy drops to about 37%.

I've tried many things but I can't seem to figure out what this is about.

I was wondering if I were putting in wrong mean values and scale values, but when applied the

same parameters while converting to openvino ir, they worked perfectly fine and showed good results.

I assume

a. something went wrong during conversion from openvino to blob or

b. there's some part of my code that is causing a major drop in accuracy.

I was hoping if I could get some help. Here's my code for testing blob on my device :

import numpy as np
import cv2
import depthai as dai
import os
import time

#####################################################
# Example: Minimal code for .blob inference on DepthAI
#          with internal mean/std for image branch 
#          and no normalization for mechanical branch
#####################################################

# Label mapping (adjust if you have different labels)
class_label = {
    0: 'unpaved ramp ascent',
    1: 'paved ramp ascent',
    2: 'paved ramp descent',
    3: 'unpaved ramp descent',
    4: 'unpaved flat',
    5: 'paved flat',
    6: 'stair ascent',
    7: 'stair descent'
}

# Softmax for final logits
def softmax(logits):
    exp_logits = np.exp(logits - np.max(logits))
    return exp_logits / np.sum(exp_logits)

def preprocess_image_depthai(image_path):
    bgr_img = cv2.imread(image_path)
    if bgr_img is None:
        return None
    # Resize to (640,360) => (W,H)
    resized_bgr = cv2.resize(bgr_img, (640, 360))
    # Convert BGR to RGB
    rgb_img = cv2.cvtColor(resized_bgr, cv2.COLOR_BGR2RGB)
    # Transpose to (C,H,W) = (3,360,640)
    transposed = np.transpose(rgb_img, (2, 0, 1))
    # shape is now (3,360,640), dtype=uint8 (assuming original was uint8)
    # Expand batch dimension => (1,3,360,640)
    # expand dims : (1, 2, 3) - shape (1, 3) -> [[1],[2],[3]] - shape (3, 1)으로 바꿈 
    return np.expand_dims(transposed, axis=0)

# Example file paths (adjust to your own):
image_dir = '/train/img/'
mech_data_np = '/train/mec_posimumag.npy'
label_path = '/train/gt.npy'  # Path to ground truth labels
blob_path    = "0108_nwc _output_named.blob"

# Load mechanical data and labels
mech_data = np.load(mech_data_np).astype(np.float16)
labels    = np.load(label_path)

# Create DepthAI pipeline
pipeline = dai.Pipeline()

# Create XLinkIn nodes (image + mechanical)
cam_xin = pipeline.create(dai.node.XLinkIn)
cam_xin.setStreamName("cam_input")

mech_xin = pipeline.create(dai.node.XLinkIn)
mech_xin.setStreamName("mech_input")

# for later use
manip = pipeline.create(dai.node.ImageManip)

nn = pipeline.create(dai.node.NeuralNetwork)
nn.setBlobPath(blob_path)  # .blob with internal mean/std (for image_input) and none for mec_input

# Link nodes
cam_xin.out.link(manip.inputImage)
manip.out.link(nn.inputs['image_input'])
mech_xin.out.link(nn.inputs['mec_input'])

xout_nn = pipeline.create(dai.node.XLinkOut)
xout_nn.setStreamName("nn")
nn.out.link(xout_nn.input)

# Inference loop
with dai.Device(pipeline) as device:
    cam_input_queue  = device.getInputQueue("cam_input",  maxSize=4, blocking=False)
    mech_input_queue = device.getInputQueue("mech_input", maxSize=4, blocking=False)
    q_nn             = device.getOutputQueue("nn", maxSize=4, blocking=False)

    correct_predictions = 0
    total_images        = 0
    times               = []
    N = len(labels)

    for idx in range(N):
        start_time  = time.time()
        image_path  = os.path.join(image_dir, f"{idx}.png")
        preproc_img = preprocess_image_depthai(image_path)
        if preproc_img is None:
            print(f"Failed to load image {image_path}, skipping.")
            continue

        # Convert to ImgFrame
        imgFrame = dai.ImgFrame()
        # Flatten to 1D
        imgFrame.setData(preproc_img.flatten())
        imgFrame.setType(dai.RawImgFrame.Type.RGB888p)
        imgFrame.setWidth(640)
        imgFrame.setHeight(360)
        cam_input_queue.send(imgFrame)

        # Mechanical data: shape 
        mech_sample = mech_data[idx].reshape(1, 200, 5)
        mec_tensor  = dai.NNData()
        mec_tensor.setLayer("mec_input", mech_sample.flatten())
        mech_input_queue.send(mec_tensor)
        # Get NN output
        in_nn = q_nn.get()
        if in_nn:
            logits = np.array(in_nn.getLayerFp16("output"))  # Expect length = 8 if single 8-class output
            probabilities   = softmax(logits)
            predicted_class = np.argmax(probabilities)

            # Compare to ground truth
            if predicted_class == labels[idx]:
                correct_predictions += 1
            total_images += 1

            # Debug
            iter_time  = time.time() - start_time
            times.append(iter_time)
            accuracy = (correct_predictions / total_images) * 100
            print(f"[{idx}] Predicted: {class_label[predicted_class]}, "
                  f"GT: {class_label[labels[idx]]} "
                  f"Acc: {accuracy:.2f}%, Iter time: {iter_time:.4f}s")

    # Final stats
    overall_acc = (correct_predictions / total_images) * 100 if total_images > 0 else 0
    avg_time    = np.mean(times) if len(times) > 0 else 0
    print(f"Final Accuracy: {overall_acc:.2f}%")
    print(f"Avg Iter Time: {avg_time:.4f}s")

jakaskerl

jiwon
Use nn.passthrough outptu to view how the frame that are received by the nn node look like. Perhaps there is something wrong there - maybe color order, interleaved/planar, squeezed/cropped image...

Thanks,
Jaka

jiwon

Thank you, I used passthrough to inspect the input for the nodes, and I figured that there is a conversion from fp16 to int8.

I wonder why this conversion happens and what can I do about it. I've set parameter to -ip FP16. I've tried setting getData().astype(fp16), but it only gets the data from 255 to 255.0 .

jakaskerl

jiwon
IP should be U8 like here:

Thanks,
Jaka