• DepthAI-v2
  • Run inference on a user-defined area

Hey, I am trying to run predictions on an area that the user defines by drawing a rectangle. Right now I run inference on the full image, and if the center of a detected object falls inside the user-defined rectangle, I display it.

I don't think this is the best approach. I want to run inference only on the given area, not on the full image.

Is that possible?


Here is my code:

import cv2
import numpy as np
import depthai as dai

# path to .blob model
nnPath = "Models/chess-yolov5n-blob/last_openvino_2022.1_6shave.blob"

# labels
labelMap = ['black-bishop', 'black-king', 'black-knight', 'black-pawn',
            'black-queen', 'black-rook', 'white-bishop', 'white-king', 'white-knight',
            'white-pawn', 'white-queen', 'white-rook']

previewSize = (640, 640)

# Create pipeline
pipeline = dai.Pipeline()

# Define source and outputs
camRgb = pipeline.create(dai.node.ColorCamera)
camRgb.setPreviewSize(previewSize)
camRgb.setInterleaved(False)

# Define a neural network that will make predictions based on the source frames
nn = pipeline.create(dai.node.YoloDetectionNetwork)
nn.setConfidenceThreshold(0.8)
nn.setIouThreshold(0.9)
nn.setBlobPath(nnPath)
nn.setNumInferenceThreads(2)
nn.input.setBlocking(False)
nn.setAnchors([10, 14, 23, 27, 37, 58, 81, 82, 135, 169, 344, 319])
nn.setAnchorMasks({
    'side80': [0, 1, 2],
    'side40': [3, 4, 5],
    'side20': [6, 7, 8]
})
nn.setNumClasses(12)
nn.setCoordinateSize(4)
camRgb.preview.link(nn.input)

# Linking
xoutRgb = pipeline.create(dai.node.XLinkOut)
xoutRgb.setStreamName("rgb")
camRgb.preview.link(xoutRgb.input)

nnOut = pipeline.create(dai.node.XLinkOut)
nnOut.setStreamName("nn")
nn.out.link(nnOut.input)

# User-defined rectangle, moved/resized from the keyboard on the host side
class InferenceRegion:
    step = 10
    position = (0, 0)
    size = (100, 100)
    maxDims = previewSize[0], previewSize[1]

    def grow(self, x=0, y=0):
        self.size = (
            max(1, self.size[0] + x),
            max(1, self.size[1] + y)
        )

    def move(self, x=0, y=0):
        self.position = (
            max(0, self.position[0] + x),
            max(0, self.position[1] + y)
        )

    def endPosition(self):
        return (
            min(self.position[0] + self.size[0], self.maxDims[0]),
            min(self.position[1] + self.size[1], self.maxDims[1]),
        )

# Connect to device and start pipeline
with dai.Device(pipeline) as device:
    qRgb = device.getOutputQueue(name="rgb", maxSize=4, blocking=False)
    qDet = device.getOutputQueue(name="nn", maxSize=4, blocking=False)

    frame = None
    detections = []
    region = InferenceRegion()

    def frameNorm(frame, bbox):
        # Map normalized (0..1) bbox coordinates to pixel coordinates
        normVals = np.full(len(bbox), frame.shape[0])
        normVals[::2] = frame.shape[1]
        return (np.clip(np.array(bbox), 0, 1) * normVals).astype(int)

    def displayFrame(name, frame):
        for detection in detections:
            bbox = frameNorm(frame, (detection.xmin, detection.ymin, detection.xmax, detection.ymax))
            center_x = (bbox[0] + bbox[2]) // 2
            center_y = (bbox[1] + bbox[3]) // 2
            # Check if the center of the detected object is within the region
            if (region.position[0] <= center_x <= region.endPosition()[0] and
                    region.position[1] <= center_y <= region.endPosition()[1]):
                cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (255, 0, 0), 2)
                cv2.putText(frame, labelMap[detection.label], (bbox[0] + 10, bbox[1] + 20), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
                cv2.putText(frame, f"{int(detection.confidence * 100)}%", (bbox[0] + 10, bbox[1] + 40), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
        # Draw the user-defined region itself
        cv2.rectangle(frame, region.position, region.endPosition(), (0, 255, 0), 2)
        cv2.imshow(name, frame)

    while True:
        inRgb = qRgb.tryGet()
        inDet = qDet.tryGet()

        if inRgb is not None:
            frame = inRgb.getCvFrame()

        if inDet is not None:
            detections = inDet.detections

        if frame is not None:
            displayFrame("rgb", frame)

        # WASD moves the region, +/- resizes it, q quits
        key = cv2.waitKey(1)
        if key == ord('w'):
            region.move(y=-region.step)
        elif key == ord('s'):
            region.move(y=region.step)
        elif key == ord('a'):
            region.move(x=-region.step)
        elif key == ord('d'):
            region.move(x=region.step)
        elif key == ord('+'):
            region.grow(x=10, y=10)
            region.step = region.step + 1
        elif key == ord('-'):
            region.grow(x=-10, y=-10)
            region.step = max(region.step - 1, 1)

        if key == ord('q'):
            break

cv2.destroyAllWindows()

    Hi siromer
    You could train a model with a smaller input (e.g. 416 x 416) and have an ImageManip node crop the ColorCamera frame to that size and send it to the NN. That way inference could be faster since the model takes fewer resources, but the accuracy might be lower since the input size is also smaller.

    Thanks,
    Jaka
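
A minimal sketch of the crop-then-infer pipeline suggested above, under a few assumptions: the 416x416 blob path, the crop rectangle, and the stream names are placeholders, and the YOLO anchors, masks, and thresholds would be reused from the original script. An ImageManip node crops the preview to the user rectangle on-device and feeds only that region to the network, while an XLinkIn stream lets the host push a new crop whenever the user redraws the rectangle:

import depthai as dai

nnPath = "Models/chess-yolov5n-416.blob"   # placeholder: blob retrained for 416x416 input
nnInputSize = (416, 416)

pipeline = dai.Pipeline()

camRgb = pipeline.create(dai.node.ColorCamera)
camRgb.setPreviewSize(640, 640)
camRgb.setInterleaved(False)

# ImageManip crops the 640x640 preview to the user-defined rectangle (normalized
# coordinates) and resizes it to the NN input, so only that region is inferred
manip = pipeline.create(dai.node.ImageManip)
manip.initialConfig.setCropRect(0.25, 0.25, 0.75, 0.75)  # xmin, ymin, xmax, ymax in 0..1
manip.initialConfig.setResize(*nnInputSize)
manip.setMaxOutputFrameSize(nnInputSize[0] * nnInputSize[1] * 3)

nn = pipeline.create(dai.node.YoloDetectionNetwork)
nn.setBlobPath(nnPath)
nn.setConfidenceThreshold(0.8)
nn.setNumClasses(12)
nn.setCoordinateSize(4)
# ... anchors, anchor masks, IoU threshold etc. as in the original script
nn.input.setBlocking(False)

camRgb.preview.link(manip.inputImage)
manip.out.link(nn.input)

# XLinkIn lets the host update the crop at runtime when the user moves the rectangle
xinCfg = pipeline.create(dai.node.XLinkIn)
xinCfg.setStreamName("manipCfg")
xinCfg.out.link(manip.inputConfig)

xoutNN = pipeline.create(dai.node.XLinkOut)
xoutNN.setStreamName("nn")
nn.out.link(xoutNN.input)

with dai.Device(pipeline) as device:
    cfgQueue = device.getInputQueue("manipCfg")

    # Whenever the user redraws the rectangle, send an updated crop config
    cfg = dai.ImageManipConfig()
    cfg.setCropRect(0.1, 0.1, 0.6, 0.6)  # new normalized region
    cfg.setResize(*nnInputSize)
    cfgQueue.send(cfg)

Note that the detections returned by the network are normalized to the cropped frame, so to draw them on the full preview you would scale them by the crop size and offset them by the crop's top-left corner.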