Hi, I would like to do people tracking from a top-down view, and I'm running into some issues. First of all, I'm using https://docs.luxonis.com/projects/api/en/latest/samples/ObjectTracker/object_tracker/#object-tracker-on-rgb as a base, but MobileNet is not able to detect people from the top, although it is pretty good at detecting them from the side.
From the side:
Because of the MobileNet issue I decided to move to YOLO, but with YOLOv4-tiny I get many overlapping tracks. I have been trying to modify the IoU and confidence thresholds, but without success.
Here is the code:
#!/usr/bin/env python3
from pathlib import Path
import cv2
import depthai as dai
import numpy as np
import time
import argparse
import logging
logging.basicConfig(filename='test.log', format='%(asctime)s | %(levelname)s | %(message)s',
                    level=logging.DEBUG)
# labelMap = ["background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow",
# "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]
labelMap = [
"person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train",
"truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
"bird", "cat", "dog", "horse", "sheep", "cow", "elephant",
"bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie",
"suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
"baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
"fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich",
"orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
"chair", "sofa", "pottedplant", "bed", "diningtable", "toilet", "tvmonitor",
"laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
"toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
"teddy bear", "hair drier", "toothbrush"
]
nnPathDefault = str((Path(__file__).parent / Path('../models/yolo-v4-tiny-tf_openvino_2021.4_6shave.blob')).resolve().absolute())
parser = argparse.ArgumentParser()
parser.add_argument('nnPath', nargs='?', help="Path to YOLO detection network blob", default=nnPathDefault)
parser.add_argument('-ff', '--full_frame', action="store_true", help="Perform tracking on full RGB frame", default=False)
args = parser.parse_args()
fullFrameTracking = args.full_frame
# Create pipeline
pipeline = dai.Pipeline()
# Define sources and outputs
camRgb = pipeline.create(dai.node.ColorCamera)
detectionNetwork = pipeline.create(dai.node.YoloDetectionNetwork)
objectTracker = pipeline.create(dai.node.ObjectTracker)
xlinkOut = pipeline.create(dai.node.XLinkOut)
trackerOut = pipeline.create(dai.node.XLinkOut)
nnout = pipeline.create(dai.node.XLinkOut)
xlinkOut.setStreamName("preview")
trackerOut.setStreamName("tracklets")
nnout.setStreamName("nn")
# Properties
camRgb.setPreviewSize(416, 416)
camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
camRgb.setInterleaved(False)
camRgb.setColorOrder(dai.ColorCameraProperties.ColorOrder.BGR)
camRgb.setFps(25)
# Network-specific settings (YOLOv4-tiny)
detectionNetwork.setConfidenceThreshold(0.9)
detectionNetwork.setNumClasses(80)
detectionNetwork.setCoordinateSize(4)
detectionNetwork.setAnchors([10, 14, 23, 27, 37, 58, 81, 82, 135, 169, 344, 319])
detectionNetwork.setAnchorMasks({"side26": [1, 2, 3], "side13": [3, 4, 5]})
detectionNetwork.setIouThreshold(0.9)
detectionNetwork.setBlobPath(args.nnPath)
detectionNetwork.setNumInferenceThreads(2)
detectionNetwork.input.setBlocking(False)
objectTracker.setDetectionLabelsToTrack([0]) # track only person
# possible tracking types: ZERO_TERM_COLOR_HISTOGRAM, ZERO_TERM_IMAGELESS, SHORT_TERM_IMAGELESS, SHORT_TERM_KCF
objectTracker.setTrackerType(dai.TrackerType.ZERO_TERM_COLOR_HISTOGRAM)
# take the smallest ID when new object is tracked, possible options: SMALLEST_ID, UNIQUE_ID
objectTracker.setTrackerIdAssignmentPolicy(dai.TrackerIdAssignmentPolicy.SMALLEST_ID)
# Linking
camRgb.preview.link(detectionNetwork.input)
objectTracker.passthroughTrackerFrame.link(xlinkOut.input)
if fullFrameTracking:
camRgb.video.link(objectTracker.inputTrackerFrame)
else:
detectionNetwork.passthrough.link(objectTracker.inputTrackerFrame)
detectionNetwork.passthrough.link(objectTracker.inputDetectionFrame)
detectionNetwork.out.link(objectTracker.inputDetections)
detectionNetwork.out.link(nnout.input)
objectTracker.out.link(trackerOut.input)
result = cv2.VideoWriter('filename.avi',
                         cv2.VideoWriter_fourcc(*'MJPG'),
                         30, (600, 600))
# Connect to device and start pipeline
with dai.Device(pipeline) as device:
    preview = device.getOutputQueue("preview", 4, False)
    tracklets = device.getOutputQueue("tracklets", 4, False)
    nn = device.getOutputQueue("nn", 4, False)

    startTime = time.monotonic()
    counter = 0
    fps = 0
    frame = None
    to_right = 0
    to_left = 0
    diff = 0
    entry = {}
    while True:
        imgFrame = preview.get()
        track = tracklets.get()
        nnData = nn.get()
        print("got image")
        counter += 1
        current_time = time.monotonic()
        if (current_time - startTime) > 1:
            fps = counter / (current_time - startTime)
            counter = 0
            startTime = current_time

        color = (255, 0, 0)
        frame = imgFrame.getCvFrame()
        trackletsData = track.tracklets
        print("got tracklets")
        print("Tracklets Content: ", trackletsData)
        print("Tracklets Length: ", len(trackletsData))
        print("nnData.detections: ", nnData.detections)
        # if len(nnData.detections) > 0:
        #     print("nnDataLabel: ", nnData.detections[0].label)
        #     print("nnDataConfidence: ", nnData.detections[0].confidence)
        #     print("nnDataBoundingBox: ", nnData.detections[0].xmin, nnData.detections[0].ymin, nnData.detections[0].xmax, nnData.detections[0].ymax)
        for t in trackletsData:
            roi = t.roi.denormalize(frame.shape[1], frame.shape[0])
            x1 = int(roi.topLeft().x)
            y1 = int(roi.topLeft().y)
            x2 = int(roi.bottomRight().x)
            y2 = int(roi.bottomRight().y)

            try:
                label = labelMap[t.label]
            except IndexError:
                label = t.label

            print(x1, y1, x2, y2, label)
            centroid_x = int((x1 + x2) / 2)
            centroid_y = int((y1 + y2) / 2)

            cv2.putText(frame, str(label), (x1 + 10, y1 + 20), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
            cv2.putText(frame, f"ID: {[t.id]}", (x1 + 10, y1 + 35), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
            cv2.putText(frame, t.status.name, (x1 + 10, y1 + 50), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
            cv2.rectangle(frame, (x1, y1), (x2, y2), color, cv2.FONT_HERSHEY_SIMPLEX)
            cv2.circle(frame, (centroid_x, centroid_y), 5, color, -1)
            logging.info(f"ID: {[t.id]} | Label: {label} | Status: {t.status.name} | Coordinates: {[x1, y1, x2, y2]} | Centroids: {[centroid_x, centroid_y]}")

            if t.status.name == "NEW" and t.id not in entry:
                entry[t.id] = centroid_x
            if t.status.name == "REMOVED" and t.id in entry and centroid_x > frame.shape[1]*0.75:
                if abs(entry[t.id] - centroid_x) > frame.shape[1]*0.4:
                    to_right += 1
                    diff = to_right - to_left
                del entry[t.id]
            if t.status.name == "REMOVED" and t.id in entry and centroid_x < frame.shape[1]*0.25:
                if abs(entry[t.id] - centroid_x) > frame.shape[1]*0.4:
                    to_left += 1
                    diff = to_right - to_left
                del entry[t.id]
cv2.putText(frame, "NN fps: {:.2f}".format(fps), (2, frame.shape[0] - 4), cv2.FONT_HERSHEY_TRIPLEX, 0.4, color)
cv2.putText(frame, "left: {:.0f}".format(to_left), (2, 10), cv2.FONT_HERSHEY_TRIPLEX, 0.4, color)
cv2.putText(frame, "right: {:.0f}".format(to_right), (2, 25), cv2.FONT_HERSHEY_TRIPLEX, 0.4, color)
cv2.putText(frame, "Diff: {:.0f}".format(diff), (2, 40), cv2.FONT_HERSHEY_TRIPLEX, 0.4, color)
resized = cv2.resize(frame, (600, 600), interpolation = cv2.INTER_AREA)
result.write(resized)
cv2.imshow("tracker", resized)
if cv2.waitKey(1) == ord('q'):
result.release()
break
Any suggestions on how to avoid those overlapping tracklets are welcome.
With all of the above in mind, I'm considering two solutions:
1 - Look for a way to retrain MobileNet to detect people from the top. Is that a viable solution? If I provide a new MobileNet .blob, retrained on top-down people, via detectionNetwork.setBlobPath(args.nnPath), will the node work out of the box? Do you have any example of retraining MobileNet for use with the MobileNetDetectionNetwork node? (A rough sketch of what I mean is below, after option 2.)
2 - Train or use a NN that already detects people from the top, and implement a Script node that takes the raw neural network output and generates the ImgDetections message that the ObjectTracker requires as input. Do you have any idea whether that is feasible, or is it too heavy a process to run on the device? (A sketch of this idea is below as well.)
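For option 1, this is roughly what I have in mind. It is only a sketch of how I picture swapping the blob, assuming a retrained top-down MobileNet exists; the blob path "mobilenet_top_down.blob", the 0.5 confidence value and the label index 15 are placeholders based on the standard mobilenet-ssd sample, not a tested setup:

# Sketch for option 1 (untested): same pipeline structure as above, but with a
# MobileNetDetectionNetwork pointed at a retrained top-down-person blob.
import depthai as dai

pipeline = dai.Pipeline()

camRgb = pipeline.create(dai.node.ColorCamera)
detectionNetwork = pipeline.create(dai.node.MobileNetDetectionNetwork)
objectTracker = pipeline.create(dai.node.ObjectTracker)

camRgb.setPreviewSize(300, 300)   # mobilenet-ssd expects a 300x300 input
camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
camRgb.setInterleaved(False)

detectionNetwork.setBlobPath("mobilenet_top_down.blob")  # retrained blob (placeholder path)
detectionNetwork.setConfidenceThreshold(0.5)
detectionNetwork.input.setBlocking(False)

objectTracker.setDetectionLabelsToTrack([15])  # "person" in the original VOC-style label map
objectTracker.setTrackerType(dai.TrackerType.ZERO_TERM_COLOR_HISTOGRAM)

camRgb.preview.link(detectionNetwork.input)
detectionNetwork.passthrough.link(objectTracker.inputTrackerFrame)
detectionNetwork.passthrough.link(objectTracker.inputDetectionFrame)
detectionNetwork.out.link(objectTracker.inputDetections)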
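And for option 2, this is the kind of Script node I'm imagining: a plain NeuralNetwork node runs the top-down person model, and the Script decodes its raw output into an ImgDetections message for the ObjectTracker. The blob name, stream names and the decode step are placeholders, and I'm not even sure ImgDetections can be constructed inside a Script node, which is part of my question:

# Sketch for option 2 (untested): decode raw NN output in a Script node and
# feed ImgDetections to the ObjectTracker. Assumes ImgDetections/ImgDetection
# can be created inside the Script node's MicroPython environment.
import depthai as dai

pipeline = dai.Pipeline()

camRgb = pipeline.create(dai.node.ColorCamera)
nn = pipeline.create(dai.node.NeuralNetwork)
script = pipeline.create(dai.node.Script)
objectTracker = pipeline.create(dai.node.ObjectTracker)

camRgb.setPreviewSize(416, 416)   # whatever input size the top-down model expects
camRgb.setInterleaved(False)

nn.setBlobPath("top_down_person.blob")   # placeholder model
camRgb.preview.link(nn.input)
nn.out.link(script.inputs['nn_in'])

script.setScript("""
while True:
    nndata = node.io['nn_in'].get()

    # Placeholder decode step: turn the raw output tensor into a list of
    # (xmin, ymin, xmax, ymax, confidence) tuples, normalized to 0..1.
    # raw = nndata.getLayerFp16('output')
    boxes = []

    dets = ImgDetections()
    decoded = []
    for (xmin, ymin, xmax, ymax, conf) in boxes:
        d = ImgDetection()
        d.label = 0           # person
        d.confidence = conf
        d.xmin, d.ymin, d.xmax, d.ymax = xmin, ymin, xmax, ymax
        decoded.append(d)
    dets.detections = decoded
    node.io['dets_out'].send(dets)
""")

camRgb.preview.link(objectTracker.inputTrackerFrame)
camRgb.preview.link(objectTracker.inputDetectionFrame)
script.outputs['dets_out'].link(objectTracker.inputDetections)
objectTracker.setDetectionLabelsToTrack([0])
objectTracker.setTrackerType(dai.TrackerType.ZERO_TERM_COLOR_HISTOGRAM)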