Here's the code:
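(In short: the full-resolution stream is downscaled to 300x300 for a MobileNet face detector, and a Script node turns each detection into a crop config that is fed back to the ColorCamera, so the fixed 1920x1080 video output pans across the sensor to follow the face, similar to Luxonis's lossless-zooming example.)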
 
#!/usr/bin/env python3
import cv2
import depthai as dai
import blobconverter
# Create pipeline
pipeline = dai.Pipeline()
# Define source and output
camRgb = pipeline.create(dai.node.ColorCamera)
xoutVideo = pipeline.create(dai.node.XLinkOut)
xoutVideo.setStreamName("video")
# Properties
camRgb.setBoardSocket(dai.CameraBoardSocket.RGB)
# The Script node below assumes the full 5312x6000 sensor mode of the IMX582
# (its ORIGINAL_SIZE), so the sensor resolution must match it, not 1080P
camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_5312X6000)
camRgb.setVideoSize(1920, 1080)
xoutVideo.input.setBlocking(False)
xoutVideo.input.setQueueSize(1)
# Create MobileNet detection network
mobilenet = pipeline.create(dai.node.MobileNetDetectionNetwork)
mobilenet.setBlobPath(
    blobconverter.from_zoo(name="face-detection-retail-0004", shaves=3)
)
mobilenet.setConfidenceThreshold(0.7)
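# Note: blobconverter fetches and compiles the model from the OpenVINO model
# zoo on first run, then caches the blob locally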
# Downscale the full-resolution ISP frames to the 300x300 BGR planar input
# that face-detection-retail-0004 expects, and feed them to the detector.
# (If the full-size frames turn out to be too heavy for a single ImageManip,
# camRgb.setIspScale() can downscale the ISP output first.)
crop_manip = pipeline.create(dai.node.ImageManip)
crop_manip.initialConfig.setResize(300, 300)
crop_manip.initialConfig.setFrameType(dai.ImgFrame.Type.BGR888p)
camRgb.isp.link(crop_manip.inputImage)
crop_manip.out.link(mobilenet.input)
# Script node
script = pipeline.create(dai.node.Script)
mobilenet.out.link(script.inputs["dets"])
script.outputs["cam_cfg"].link(camRgb.inputConfig)
script.outputs["cam_ctrl"].link(camRgb.inputControl)
script.setScript(
    """
    ORIGINAL_SIZE = (5312, 6000) # 48MP with size constraints described on IMX582 luxonis page
    SCENE_SIZE = (1920, 1080) # 1080P
    x_arr = []
    y_arr = []
    AVG_MAX_NUM = 7
    limits = [0, 0] # xmin and ymin limits; xmax and ymax are appended below
    limits.append((ORIGINAL_SIZE[0] - SCENE_SIZE[0]) / ORIGINAL_SIZE[0]) # xmax limit
    limits.append((ORIGINAL_SIZE[1] - SCENE_SIZE[1]) / ORIGINAL_SIZE[1]) # ymax limit
    cfg = ImageManipConfig()
    ctrl = CameraControl()
    def average_filter(x, y):
        # Moving average over the last AVG_MAX_NUM positions, clamped to the crop limits
        x_arr.append(x)
        y_arr.append(y)
        if len(x_arr) > AVG_MAX_NUM: x_arr.pop(0)
        if len(y_arr) > AVG_MAX_NUM: y_arr.pop(0)
        x_avg = sum(x_arr) / len(x_arr)
        y_avg = sum(y_arr) / len(y_arr)
        x_avg = min(max(x_avg, limits[0]), limits[2])
        y_avg = min(max(y_avg, limits[1]), limits[3])
        return x_avg, y_avg
    while True:
        dets = node.io['dets'].get().detections
        if len(dets) == 0: continue
        coords = dets[0] # take the first detection
        # Bounding box in sensor pixels, used for the AF/AE region
        width = (coords.xmax - coords.xmin) * ORIGINAL_SIZE[0]
        height = (coords.ymax - coords.ymin) * ORIGINAL_SIZE[1]
        x_pixel = int(max(0, coords.xmin * ORIGINAL_SIZE[0]))
        y_pixel = int(max(0, coords.ymin * ORIGINAL_SIZE[1]))
        ctrl.setAutoFocusRegion(x_pixel, y_pixel, int(width), int(height))
        ctrl.setAutoExposureRegion(x_pixel, y_pixel, int(width), int(height))
        # Detection center, shifted by half the normalized crop size so the
        # crop's top-left corner centers the face in the 1080p window
        x = (coords.xmin + coords.xmax) / 2 - SCENE_SIZE[0] / ORIGINAL_SIZE[0] / 2
        y = (coords.ymin + coords.ymax) / 2 - SCENE_SIZE[1] / ORIGINAL_SIZE[1] / 2
        x_avg, y_avg = average_filter(x, y)
        # node.warn(f"{x_avg=} {y_avg=}")
        # Only the top-left corner is needed; the crop size is fixed by
        # setVideoSize(1920, 1080) on the camera
        cfg.setCropRect(x_avg, y_avg, 0, 0)
        node.io['cam_cfg'].send(cfg)
        node.io['cam_ctrl'].send(ctrl)
    """
)
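# Sanity check on the limits in the script above: with ORIGINAL_SIZE = (5312, 6000)
# and SCENE_SIZE = (1920, 1080), the crop's top-left corner ranges from (0, 0) to
# ((5312 - 1920) / 5312, (6000 - 1080) / 6000) ≈ (0.64, 0.82) in normalized
# coordinates, which is exactly what the limits list encodes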
# Linking
camRgb.video.link(xoutVideo.input)
# Connect to device and start pipeline
with dai.Device(pipeline) as device:
    video = device.getOutputQueue(name="video", maxSize=1, blocking=False)
    while True:
        videoIn = video.get()
        print("Done in seconds")
        # Get BGR frame from NV12 encoded video frame to show with opencv
        # Visualizing the frame on slower hosts might have overhead
        cv2.imshow("video", videoIn.getCvFrame())
        if cv2.waitKey(1) == ord('q'):
            break
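If you want to sanity-check what the detector returns while tuning this, you can also expose the raw detections to the host. A minimal sketch (the "dets_host" stream and the polling below are my additions, not part of the pipeline above; a node output can feed several inputs, so mobilenet.out can go to both the Script node and an XLinkOut):

# Added alongside the other pipeline nodes:
xoutDets = pipeline.create(dai.node.XLinkOut)
xoutDets.setStreamName("dets_host")
mobilenet.out.link(xoutDets.input)

# Added inside the `with dai.Device(pipeline) as device:` block:
detQueue = device.getOutputQueue(name="dets_host", maxSize=4, blocking=False)

# And inside the while loop:
inDet = detQueue.tryGet()  # non-blocking; None if nothing has arrived yet
if inDet is not None:
    for d in inDet.detections:
        # Coordinates are normalized to the NN input (full field of view)
        print(f"face {d.confidence:.2f} bbox=({d.xmin:.2f}, {d.ymin:.2f}, {d.xmax:.2f}, {d.ymax:.2f})")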