• DepthAI
  • grayscale frame input using color camera

Hi @erik, can we pass a grayscale frame to the blob YOLOv5 model for detection while using the color camera? If yes, how?

Note: We want the blob model to run detections on the grayscale frame instead of the RGB frame while still using the color camera.

# Properties
camRgb.setPreviewSize(300, 300)
camRgb.setInterleaved(False)
camRgb.setColorOrder(dai.ColorCameraProperties.ColorOrder.RGB)

# manip
manip = pipeline.create(dai.node.ImageManip)
manip.initialConfig.setFrameType(dai.ImgFrame.Type.GRAY8)

manip.out.link(xoutRgb.input)

# Linking
camRgb.preview.link(manip.inputImage)

Hi @jakaskerl,
the above code does not work if the model is trained on an image size of 896x896.

#!/usr/bin/env python3
"""
The code is adapted from the docs (https://docs.luxonis.com/projects/api/en/latest/samples/Yolo/tiny_yolo/).
We add parsing of a JSON configuration file and run the detections on a grayscale frame.
"""

from pathlib import Path
import cv2
import depthai as dai
import numpy as np
import time
import json
import blobconverter
import os

def run_inference(model_path, config_path):
    # parse config
    configPath = Path(config_path)
    if not configPath.exists():
        raise ValueError("Path {} does not exist!".format(configPath))

    with configPath.open() as f:
        config = json.load(f)
    nnConfig = config.get("nn_config", {})

    # parse input shape
    if "input_size" in nnConfig:
        W, H = tuple(map(int, nnConfig.get("input_size").split('x')))

    # extract metadata
    metadata = nnConfig.get("NN_specific_metadata", {})
    classes = metadata.get("classes", {})
    coordinates = metadata.get("coordinates", {})
    anchors = metadata.get("anchors", {})
    anchorMasks = metadata.get("anchor_masks", {})
    iouThreshold = metadata.get("iou_threshold", {})
    confidenceThreshold = metadata.get("confidence_threshold", {})

    # parse labels
    nnMappings = config.get("mappings", {})
    labels = nnMappings.get("labels", {})

    # get model path
    nnPath = model_path
    if not Path(nnPath).exists():
        print("No blob found at {}. Looking into DepthAI model zoo.".format(nnPath))
        nnPath = str(blobconverter.from_zoo(model_path, shaves=6, zoo_type="depthai", use_cache=True))

    # Create pipeline
    pipeline = dai.Pipeline()

    # Define sources and outputs
    camRgb = pipeline.create(dai.node.ColorCamera)
    manip = pipeline.create(dai.node.ImageManip)
    detectionNetwork = pipeline.create(dai.node.YoloDetectionNetwork)
    xoutRgb = pipeline.create(dai.node.XLinkOut)
    nnOut = pipeline.create(dai.node.XLinkOut)

    xoutRgb.setStreamName("rgb")
    nnOut.setStreamName("nn")

    # Camera properties
    camRgb.setPreviewSize(W, H)
    camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
    camRgb.setInterleaved(False)
    camRgb.setColorOrder(dai.ColorCameraProperties.ColorOrder.RGB)
    camRgb.setFps(30)

    # Convert to grayscale
    manip.initialConfig.setFrameType(dai.ImgFrame.Type.GRAY8)
    manip.setMaxOutputFrameSize(W * H)  # Ensure correct output size allocation for the grayscale (single-channel) frame
    camRgb.video.link(manip.inputImage)

    # Network specific settings
    detectionNetwork.setConfidenceThreshold(confidenceThreshold)
    detectionNetwork.setNumClasses(classes)
    detectionNetwork.setCoordinateSize(coordinates)
    detectionNetwork.setAnchors(anchors)
    detectionNetwork.setAnchorMasks(anchorMasks)
    detectionNetwork.setIouThreshold(iouThreshold)
    detectionNetwork.setBlobPath(nnPath)
    detectionNetwork.setNumInferenceThreads(2)
    detectionNetwork.input.setBlocking(False)

    # Linking grayscale image to the detection network
    manip.out.link(detectionNetwork.input)
    detectionNetwork.passthrough.link(xoutRgb.input)
    detectionNetwork.out.link(nnOut.input)

    # Connect to device and start pipeline
    with dai.Device(pipeline) as device:
        qRgb = device.getOutputQueue(name="rgb", maxSize=4, blocking=False)
        qDet = device.getOutputQueue(name="nn", maxSize=4, blocking=False)

        frame = None
        detections = []
        startTime = time.monotonic()
        counter = 0

        while True:
            inRgb = qRgb.get()
            inDet = qDet.get()

            if inRgb is not None:
                frame = inRgb.getCvFrame()
                cv2.putText(frame, "NN fps: {:.2f}".format(counter / (time.monotonic() - startTime)),
                            (2, frame.shape[0] - 4), cv2.FONT_HERSHEY_TRIPLEX, 0.4, (255, 255, 255))
                print(counter / (time.monotonic() - startTime))
            
            if inDet is not None:
                detections = inDet.detections
                counter += 1

            if frame is not None:
                cv2.imshow("Grayscale Detection", frame)

            if cv2.waitKey(1) == ord('q'):
                break

# Example usage
model_path = '../model/896_yolov5n6_blob/best_openvino_2022.1_6shave.blob'
config_path = '../model/896_yolov5n6_blob/best.json'
run_inference(model_path, config_path)

@jakaskerl I am getting this error continuously…
[1844301051C738F500] [1.3] [3.097] [ImageManip(1)] [error] Output image is bigger (2073600B) than maximum frame size specified in properties (802816B) - skipping frame. Please use the setMaxOutputFrameSize API to explicitly config the [maximum] output size.

Please note: the model is trained at 896x896 resolution.

Please share code that can run such models without errors.

    The input image is not really 896x896 because you are linking the video output while setting the frame size on the preview output. Instead of camRgb.setPreviewSize(W, H), use camRgb.setVideoSize(W, H), or just link the preview.
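
    A minimal sketch of both options, assuming the node names (camRgb, manip, W, H) from your script:

    # Option A: keep linking camRgb.video, but make the video output match the NN input size
    camRgb.setVideoSize(W, H)
    camRgb.video.link(manip.inputImage)

    # Option B: link the preview instead, since setPreviewSize(W, H) already matches the NN input
    # camRgb.preview.link(manip.inputImage)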

    Thanks,
    Jaka

    Hi @jakaskerl, even after replacing camRgb.setPreviewSize(W, H) with camRgb.setVideoSize(W, H),

    I am still getting this error - [1844301051C738F500] [1.3] [4.342] [DetectionNetwork(2)] [error] Input tensor 'images' (0) exceeds available data range. Data size (802816B), tensor offset (0), size (2408448B) - skipping inference

    Can you please give me the correct code with all the fixes applied?

      It looks to me like the model you are using expects an 896x896x3 input, while a mono frame has only one channel, so two channels are missing, hence the error (896 x 896 x 1 = 802816 B of data versus the 896 x 896 x 3 = 2408448 B the input tensor expects). Make sure the model is also trained for mono (single-channel) image inference.

      Thanks,
      Jaka

      Hi @jakaskerl, thanks for the support.
      I want to use a model trained on 3-channel images. Can we convert the grayscale frame to 3 channels before passing it to the blob model for detection? If yes, how?

        Then do the exact opposite. I am not sure which camera you actually want to use.

        • If the color camera: use one ImageManip to convert the frame to GRAY8, then another to convert it to RGB888p (three identical mono planes); a rough sketch is below. Though I am not sure what you gain by doing that.
        • If a mono camera: use ImageManip to convert it to RGB888p, which will stack the mono frame into a W x H x 3 frame.
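
        A rough sketch of the color-camera path, assuming the node names from your script (pipeline, camRgb, detectionNetwork, W, H); the two chained ImageManip nodes below are illustration only, not tested code:

        # Color frame -> GRAY8 -> RGB888p (gray plane replicated to 3 planes) -> NN
        manipGray = pipeline.create(dai.node.ImageManip)
        manipGray.initialConfig.setFrameType(dai.ImgFrame.Type.GRAY8)
        manipGray.setMaxOutputFrameSize(W * H)
        camRgb.preview.link(manipGray.inputImage)

        manipRgb = pipeline.create(dai.node.ImageManip)
        manipRgb.initialConfig.setFrameType(dai.ImgFrame.Type.RGB888p)
        manipRgb.setMaxOutputFrameSize(W * H * 3)
        manipGray.out.link(manipRgb.inputImage)

        manipRgb.out.link(detectionNetwork.input)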

        Thanks,
        Jaka

        Thanks @jakaskerl. My aim is to use the color camera and run the detections on the grayscale frame; the model I am using is trained on 3-channel images.

          In that case you are wasting compute power. I'd suggest compiling the model for single-channel input.

          Thanks.
          Jaka

          Hi @jakaskerl, is the code given below for detection using the mono camera correct?
          Does it activate the IR flood light so it can detect at night?

          #!/usr/bin/env python3
          """
          Mono Camera Object Detection using DepthAI YOLO
          Reads configuration from JSON and runs inference.
          """
          
          import cv2
          import depthai as dai
          import numpy as np
          import json
          import time
          import os
          from pathlib import Path
          from datetime import datetime
          import blobconverter
          
          def run_inference(model_path, config_path):
              # Load configuration file
              configPath = Path(config_path)
              if not configPath.exists():
                  raise ValueError(f"Path {configPath} does not exist!")
          
              with configPath.open() as f:
                  config = json.load(f)
              
              nnConfig = config.get("nn_config", {})
          
              # Parse input size
              if "input_size" in nnConfig:
                  W, H = tuple(map(int, nnConfig.get("input_size").split('x')))
          
              # Extract YOLO-specific metadata
              metadata = nnConfig.get("NN_specific_metadata", {})
              classes = metadata.get("classes", 80)  # Default to 80 if missing
              coordinates = metadata.get("coordinates", 4)
              anchors = metadata.get("anchors", [])
              anchorMasks = metadata.get("anchor_masks", {})
              iouThreshold = metadata.get("iou_threshold", 0.5)
              confidenceThreshold = metadata.get("confidence_threshold", 0.5)
          
              # Parse labels
              nnMappings = config.get("mappings", {})
              labels = nnMappings.get("labels", {})
          
              # Check if model exists, else download from DepthAI zoo
              nnPath = model_path
              if not Path(nnPath).exists():
                  print(f"No blob found at {nnPath}. Downloading from DepthAI Model Zoo...")
                  nnPath = str(blobconverter.from_zoo(model_path, shaves=6, zoo_type="depthai", use_cache=True))
          
              # Create DepthAI Pipeline
              pipeline = dai.Pipeline()
          
              # Define Mono Camera
              mono = pipeline.create(dai.node.MonoCamera)
              mono.setResolution(dai.MonoCameraProperties.SensorResolution.THE_720_P)
              mono.setFps(30)
          
              # Image Manipulation to Resize Input to NN
              manip = pipeline.create(dai.node.ImageManip)
              manip.initialConfig.setResize(W, H)
              manip.initialConfig.setFrameType(dai.ImgFrame.Type.BGR888p)
              manip.setMaxOutputFrameSize(W * H * 3)  # Allocate buffer for the 3-channel (mono stacked to RGB) output
          
              mono.out.link(manip.inputImage)
          
              # Define YOLO Detection Network
              detectionNetwork = pipeline.create(dai.node.YoloDetectionNetwork)
              detectionNetwork.setBlobPath(nnPath)
              detectionNetwork.setConfidenceThreshold(confidenceThreshold)
              detectionNetwork.setNumClasses(classes)
              detectionNetwork.setCoordinateSize(coordinates)
              detectionNetwork.setAnchors(anchors)
              detectionNetwork.setAnchorMasks(anchorMasks)
              detectionNetwork.setIouThreshold(iouThreshold)
              detectionNetwork.setNumInferenceThreads(2)
              detectionNetwork.input.setBlocking(False)
          
              # Linking Mono Camera to NN
              manip.out.link(detectionNetwork.input)
          
              # Define Output Streams
              xoutMono = pipeline.create(dai.node.XLinkOut)
              xoutNN = pipeline.create(dai.node.XLinkOut)
              xoutMono.setStreamName("mono")
              xoutNN.setStreamName("nn")
          
              # Linking Outputs
              detectionNetwork.passthrough.link(xoutMono.input)
              detectionNetwork.out.link(xoutNN.input)
          
              # Create output directory with timestamp
              timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
              output_dir = "output_mono"
              os.makedirs(output_dir, exist_ok=True)
          
              raw_video_path = os.path.join(output_dir, f"raw_{timestamp}.avi")
              detected_video_path = os.path.join(output_dir, f"detected_{timestamp}.avi")
          
              # Define Video Writers
              fourcc = cv2.VideoWriter_fourcc(*'XVID')
              fps = 30
              raw_writer = cv2.VideoWriter(raw_video_path, fourcc, fps, (W, H))
              detected_writer = cv2.VideoWriter(detected_video_path, fourcc, fps, (W, H))
          
              # Helper function for normalization
              def frameNorm(frame, bbox):
                  normVals = np.full(len(bbox), frame.shape[0])
                  normVals[::2] = frame.shape[1]
                  return (np.clip(np.array(bbox), 0, 1) * normVals).astype(int)
          
              def displayFrame(name, frame, detections):
                  color = (255, 0, 0)
                  for detection in detections:
                      bbox = frameNorm(frame, (detection.xmin, detection.ymin, detection.xmax, detection.ymax))
                      cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, 2)
                      cv2.putText(frame, labels[detection.label], (bbox[0] + 10, bbox[1] + 20),
                                  cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
                      cv2.putText(frame, f"{int(detection.confidence * 100)}%", (bbox[0] + 10, bbox[1] + 40),
                                  cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
          
                  detected_writer.write(frame)
                  cv2.imshow(name, frame)
          
              # Start Pipeline Execution
              with dai.Device(pipeline) as device:

                  # Enable the IR emitters (dot projector + flood light) for low-light detection
                  ir_emitter_brightness = 1200  # In mA; set between 0 and 1500 (max)
                  device.setIrLaserDotProjectorBrightness(ir_emitter_brightness)
                  device.setIrFloodLightBrightness(ir_emitter_brightness)

                  qMono = device.getOutputQueue(name="mono", maxSize=4, blocking=False)
                  qDet = device.getOutputQueue(name="nn", maxSize=4, blocking=False)
          
                  frame = None
                  detections = []
                  startTime = time.monotonic()
                  counter = 0
          
                  while True:
                      inMono = qMono.get()
                      inDet = qDet.get()
          
                      if inMono is not None:
                          frame = inMono.getCvFrame()
                          raw_writer.write(frame)
          
                          cv2.putText(frame, "NN FPS: {:.2f}".format(counter / (time.monotonic() - startTime)),
                                      (2, frame.shape[0] - 4), cv2.FONT_HERSHEY_TRIPLEX, 0.4, (255, 255, 255))
          
                      if inDet is not None:
                          detections = inDet.detections
                          counter += 1
          
                      if frame is not None:
                          displayFrame("Mono Camera Detection", frame, detections)
          
                      if cv2.waitKey(1) == ord('q'):
                          break
          
              # Release Resources
              raw_writer.release()
              detected_writer.release()
              cv2.destroyAllWindows()
          
          
          
          # Example usage
          model_path = './model/drone_rc_birds_1000epoch/best_dorne_rc_birds_1000_openvino_2022.1_6shave.blob'
          config_path = './model/drone_rc_birds_1000epoch/best_dorne_rc_birds_1000.json'
          # Alternative model:
          # model_path = './model/blob_640_nano_rc_b_d_1k/best_openvino_2022.1_6shave.blob'
          # config_path = './model/blob_640_nano_rc_b_d_1k/best.json'
          run_inference(model_path, config_path)

            Looks good. One thing I'd change is to use setIrLaserDotProjectorIntensity instead of "brightness", so you can set a value between 0 and 1. Right now, 800 mA is where the brightness peaks.
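
            For reference, a minimal sketch of that change (the 0.8 value is only an example; recent depthai releases also expose a matching setIrFloodLightIntensity, if it is available in your version):

            device.setIrLaserDotProjectorIntensity(0.8)  # normalized, 0.0 .. 1.0
            device.setIrFloodLightIntensity(0.8)         # normalized, 0.0 .. 1.0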

            Thanks,
            Jaka