Object tracker reused with a different neural network or retrained

RafaelRodriguez

Hi I would like to do people tracking from the top, so I'm having some issues, First of all I'm using as a base https://docs.luxonis.com/projects/api/en/latest/samples/ObjectTracker/object_tracker/#object-tracker-on-rgb but mobilenet is not able to detect people from top, but is pretty good detecting from the side.
From side:

mobilenet.mp4

6MB

Based on the mobilenet issue I decided to move to Yolo but when using yolov4 I got many overlapping tracks and I have been trying to modify the IoU and confidence threshold but have not success

yoloasdetector.mp4

1MB

Here is the code:

#!/usr/bin/env python3

from pathlib import Path
import cv2
import depthai as dai
import numpy as np
import time
import argparse
import logging
logging.basicConfig(filename='test.log', format='%(asctime)s | %(levelname)s | %(message)s',
                    level=logging.DEBUG)

# labelMap = ["background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow",
#             "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]
labelMap = [
    "person",         "bicycle",    "car",           "motorbike",     "aeroplane",   "bus",           "train",
    "truck",          "boat",       "traffic light", "fire hydrant",  "stop sign",   "parking meter", "bench",
    "bird",           "cat",        "dog",           "horse",         "sheep",       "cow",           "elephant",
    "bear",           "zebra",      "giraffe",       "backpack",      "umbrella",    "handbag",       "tie",
    "suitcase",       "frisbee",    "skis",          "snowboard",     "sports ball", "kite",          "baseball bat",
    "baseball glove", "skateboard", "surfboard",     "tennis racket", "bottle",      "wine glass",    "cup",
    "fork",           "knife",      "spoon",         "bowl",          "banana",      "apple",         "sandwich",
    "orange",         "broccoli",   "carrot",        "hot dog",       "pizza",       "donut",         "cake",
    "chair",          "sofa",       "pottedplant",   "bed",           "diningtable", "toilet",        "tvmonitor",
    "laptop",         "mouse",      "remote",        "keyboard",      "cell phone",  "microwave",     "oven",
    "toaster",        "sink",       "refrigerator",  "book",          "clock",       "vase",          "scissors",
    "teddy bear",     "hair drier", "toothbrush"
]

nnPathDefault = str((Path(__file__).parent / Path('../models/yolo-v4-tiny-tf_openvino_2021.4_6shave.blob')).resolve().absolute())
parser = argparse.ArgumentParser()
parser.add_argument('nnPath', nargs='?', help="Path to mobilenet detection network blob", default=nnPathDefault)
parser.add_argument('-ff', '--full_frame', action="store_true", help="Perform tracking on full RGB frame", default=False)

args = parser.parse_args()

fullFrameTracking = args.full_frame

# Create pipeline
pipeline = dai.Pipeline()

# Define sources and outputs
camRgb = pipeline.create(dai.node.ColorCamera)
detectionNetwork = pipeline.create(dai.node.YoloDetectionNetwork)
objectTracker = pipeline.create(dai.node.ObjectTracker)

xlinkOut = pipeline.create(dai.node.XLinkOut)
trackerOut = pipeline.create(dai.node.XLinkOut)
nnout = pipeline.create(dai.node.XLinkOut)

xlinkOut.setStreamName("preview")
trackerOut.setStreamName("tracklets")
nnout.setStreamName("nn")

# Properties
camRgb.setPreviewSize(416, 416)
camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
camRgb.setInterleaved(False)
camRgb.setColorOrder(dai.ColorCameraProperties.ColorOrder.BGR)
camRgb.setFps(25)

# testing MobileNet DetectionNetwork
detectionNetwork.setConfidenceThreshold(0.9)
detectionNetwork.setNumClasses(80)
detectionNetwork.setCoordinateSize(4)
detectionNetwork.setAnchors([10, 14, 23, 27, 37, 58, 81, 82, 135, 169, 344, 319])
detectionNetwork.setAnchorMasks({"side26": [1, 2, 3], "side13": [3, 4, 5]})
detectionNetwork.setIouThreshold(0.9)
detectionNetwork.setBlobPath(args.nnPath)
detectionNetwork.setNumInferenceThreads(2)
detectionNetwork.input.setBlocking(False)

objectTracker.setDetectionLabelsToTrack([0])  # track only person
# possible tracking types: ZERO_TERM_COLOR_HISTOGRAM, ZERO_TERM_IMAGELESS, SHORT_TERM_IMAGELESS, SHORT_TERM_KCF
objectTracker.setTrackerType(dai.TrackerType.ZERO_TERM_COLOR_HISTOGRAM)
# take the smallest ID when new object is tracked, possible options: SMALLEST_ID, UNIQUE_ID
objectTracker.setTrackerIdAssignmentPolicy(dai.TrackerIdAssignmentPolicy.SMALLEST_ID)

# Linking
camRgb.preview.link(detectionNetwork.input)
objectTracker.passthroughTrackerFrame.link(xlinkOut.input)

if fullFrameTracking:
    camRgb.video.link(objectTracker.inputTrackerFrame)
else:
    detectionNetwork.passthrough.link(objectTracker.inputTrackerFrame)

detectionNetwork.passthrough.link(objectTracker.inputDetectionFrame)
detectionNetwork.out.link(objectTracker.inputDetections)
detectionNetwork.out.link(nnout.input)
objectTracker.out.link(trackerOut.input)
result = cv2.VideoWriter('filename.avi', 
                         cv2.VideoWriter_fourcc(*'MJPG'),
                         30, (600, 600))
# Connect to device and start pipeline
with dai.Device(pipeline) as device:

    preview = device.getOutputQueue("preview", 4, False)
    tracklets = device.getOutputQueue("tracklets", 4, False)
    nn = device.getOutputQueue("nn", 4, False)

    startTime = time.monotonic()
    counter = 0
    fps = 0
    frame = None
    to_right = 0
    to_left = 0
    diff = 0
    entry = {}
    while(True):
        imgFrame = preview.get()
        track = tracklets.get()
        nnData = nn.get()
        print("got image")
        counter+=1
        current_time = time.monotonic()
        if (current_time - startTime) > 1 :
            fps = counter / (current_time - startTime)
            counter = 0
            startTime = current_time

        color = (255, 0, 0)
        frame = imgFrame.getCvFrame()
        trackletsData = track.tracklets
        print("got traklets")
        print("Tracklets Content: ", trackletsData)
        print("Tracklets Length: ", len(trackletsData))
        print("nnData.detections: ", nnData.detections)
        # if len(nnData.detections) > 0:
        #     print("nnDataLabel: ", nnData.detections[0].label)
        #     print("nnDataConfidence: ", nnData.detections[0].confidence)
        #     print("nnDataBoundingBox: ", nnData.detections[0].xmin, nnData.detections[0].ymin, nnData.detections[0].xmax, nnData.detections[0].ymax)
        for t in trackletsData:
            roi = t.roi.denormalize(frame.shape[1], frame.shape[0])
            x1 = int(roi.topLeft().x)
            y1 = int(roi.topLeft().y)
            x2 = int(roi.bottomRight().x)
            y2 = int(roi.bottomRight().y)
            try:
                label = labelMap[t.label]
            except:
                label = t.label
            print(x1, y1, x2, y2, label) 
            centroid_x = int((x1 + x2) / 2)
            centroid_y = int((y1 + y2) / 2)
            cv2.putText(frame, str(label), (x1 + 10, y1 + 20), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
            cv2.putText(frame, f"ID: {[t.id]}", (x1 + 10, y1 + 35), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
            cv2.putText(frame, t.status.name, (x1 + 10, y1 + 50), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
            cv2.rectangle(frame, (x1, y1), (x2, y2), color, cv2.FONT_HERSHEY_SIMPLEX)
            cv2.circle(frame, (centroid_x, centroid_y), 5, color, -1)
            logging.info(f"ID: {[t.id]} | Label: {label} | Status: {t.status.name} | Coordinates: {[x1, y1, x2, y2]} | Centroids: {[centroid_x, centroid_y]}")
            if t.status.name == "NEW" and t.id not in entry:
                entry[t.id] = centroid_x
            if t.status.name == "REMOVED" and centroid_x > frame.shape[1]*0.75:
                if abs(entry[t.id] - centroid_x) > frame.shape[1]*0.4:
                    to_right += 1
                    diff = to_right - to_left
                del entry[t.id]
            if t.status.name == "REMOVED" and centroid_x < frame.shape[1]*0.25:
                if abs(entry[t.id] - centroid_x) > frame.shape[1]*0.4:
                    to_left += 1
                    diff = to_right - to_left
                del entry[t.id]
        cv2.putText(frame, "NN fps: {:.2f}".format(fps), (2, frame.shape[0] - 4), cv2.FONT_HERSHEY_TRIPLEX, 0.4, color)
        cv2.putText(frame, "left: {:.0f}".format(to_left), (2, 10), cv2.FONT_HERSHEY_TRIPLEX, 0.4, color)
        cv2.putText(frame, "right: {:.0f}".format(to_right), (2, 25), cv2.FONT_HERSHEY_TRIPLEX, 0.4, color)
        cv2.putText(frame, "Diff: {:.0f}".format(diff), (2, 40), cv2.FONT_HERSHEY_TRIPLEX, 0.4, color)
        resized = cv2.resize(frame, (600, 600), interpolation = cv2.INTER_AREA)
        result.write(resized)
        cv2.imshow("tracker", resized)

        if cv2.waitKey(1) == ord('q'):
            result.release()
            break

Any suggestions on how to avoid those overlapping tracklets are welcome.

With all the previous in mind I'm thinking on 2 solutions:
1 - Look for a way to retrain Mobilenet to detect people from top , is that a viable solution? if I provide a new mobilenet .blob in the (detectionNetwork.setBlobPath(args.nnPath)) retrained with people from top the node will it work out-of-the-box? do you have any example of retraining mobilenet to be used with the MobileNetDetectionNetwork?

2- Train or use a NN that already detects people from the top and implement a script node to get the neural network output and generate the ImgDetections object required by the Object detector as input? Do you have any idea if that is feasible to do or is it a so heavy process to do it in the device?

jakaskerl

Hi RafaelRodriguez

I'm not too familiar with retraining NN models, but perhaps you should try following one of our guides at https://github.com/luxonis/depthai-ml-training/tree/master/colab-notebooks. There are also some MobileNet retraining guides, which are deprecated, but might still work.
For yolo, I think you need to play around and change the anchors as well.
You can probably just use a NN node and parse the detections on the host side (you can use a scipt node though, as it most likely won't be to heavy on processing).

Thanks,
Jaka