This is the code I am using for clustering. When the anchors are not used (anchor lines commented out), it raises the error shown below.
#!/usr/bin/env python3
import depthai as dai
import cv2
import numpy as np
import time
from pathlib import Path
from sklearn.cluster import OPTICS
from sklearn.neighbors import NearestNeighbors
# Path to blob file (update to your YOLO blob path)
nnBlobPath = str((Path(__file__).parent / Path('../yoloe/2608_best.blob')).resolve().absolute())
if not Path(nnBlobPath).exists():
    raise FileNotFoundError(f"Blob file not found at {nnBlobPath}")
# Label map
labelMap = ["cotton"]
# Create pipeline
pipeline = dai.Pipeline()
# Define nodes
camRgb = pipeline.create(dai.node.ColorCamera)
stereo = pipeline.create(dai.node.StereoDepth)
monoLeft = pipeline.create(dai.node.MonoCamera)
monoRight = pipeline.create(dai.node.MonoCamera)
manip = pipeline.create(dai.node.ImageManip)
spatialDetectionNetwork = pipeline.create(dai.node.YoloSpatialDetectionNetwork)
xoutRgb = pipeline.create(dai.node.XLinkOut)
xoutDepth = pipeline.create(dai.node.XLinkOut)
xoutNN = pipeline.create(dai.node.XLinkOut)
manipOut = pipeline.create(dai.node.XLinkOut)
xoutRgb.setStreamName("rgb")
xoutDepth.setStreamName("depth")
xoutNN.setStreamName("detections")
manipOut.setStreamName("manip")
# Configure color camera
camRgb.setPreviewSize(1920, 1080)
camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
camRgb.setInterleaved(False)
camRgb.setColorOrder(dai.ColorCameraProperties.ColorOrder.BGR)
camRgb.setFps(30)
# Configure manip
manip.initialConfig.setResizeThumbnail(416, 416)
# Configure stereo depth
stereo.setDefaultProfilePreset(dai.node.StereoDepth.PresetMode.DEFAULT)  # preset first, so it does not overwrite the manual settings below
stereo.initialConfig.setConfidenceThreshold(200)  # Lowered for better depth
stereo.initialConfig.setMedianFilter(dai.MedianFilter.KERNEL_7x7)
stereo.setLeftRightCheck(True)
stereo.setSubpixel(True)
stereo.setExtendedDisparity(True)
stereo.setDepthAlign(dai.CameraBoardSocket.CAM_A)
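# NOTE: subpixel (x8 fractional steps) combined with extended disparity raises
# the maximum disparity to 190 * 8 = 1520, beyond the median filter's limit of
# 1024 -- this is what the "Disabling median filter!" log lines below report.
# Disabling one of the two modes should keep the 7x7 median filter active.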
# Configure mono cameras
monoLeft.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
monoLeft.setCamera("left")
monoRight.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
monoRight.setCamera("right")
# Configure neural network
spatialDetectionNetwork.setBlobPath(nnBlobPath)
spatialDetectionNetwork.setConfidenceThreshold(0.5)
spatialDetectionNetwork.setNumClasses(1)
spatialDetectionNetwork.setCoordinateSize(4)
#spatialDetectionNetwork.setAnchors(np.array([10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326]))
#spatialDetectionNetwork.setAnchorMasks({"side52": np.array([0,1,2]), "side26": np.array([3,4,5]), "side13": np.array([6,7,8])})
spatialDetectionNetwork.setIouThreshold(0.5)
spatialDetectionNetwork.setBoundingBoxScaleFactor(0.5)
spatialDetectionNetwork.setDepthLowerThreshold(100)
spatialDetectionNetwork.setDepthUpperThreshold(5000)
spatialDetectionNetwork.input.setBlocking(False)
spatialDetectionNetwork.setNumInferenceThreads(2)
# Linking
camRgb.preview.link(manip.inputImage)
manip.out.link(spatialDetectionNetwork.input)
stereo.depth.link(spatialDetectionNetwork.inputDepth)
manip.out.link(manipOut.input)
spatialDetectionNetwork.passthrough.link(xoutRgb.input)
monoLeft.out.link(stereo.left)
monoRight.out.link(stereo.right)
stereo.depth.link(xoutDepth.input)
spatialDetectionNetwork.out.link(xoutNN.input)
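# Note: xoutRgb carries the network passthrough, i.e. the 416x416 frame
# produced by manip, so all drawing in the host loop happens at that resolution.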
# Statistical outlier removal
def remove_outliers(points, k=10, std_ratio=2.0):
    """Drop points whose mean k-NN distance is more than std_ratio standard
    deviations above the average; returns the kept points and their indices."""
    if len(points) < k:
        return points, np.arange(len(points))
    nbrs = NearestNeighbors(n_neighbors=k).fit(points)
    distances, _ = nbrs.kneighbors(points)
    mean_dist = np.mean(distances, axis=1)
    mask = mean_dist < (np.mean(mean_dist) + std_ratio * np.std(mean_dist))
    return points[mask], np.where(mask)[0]
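# Quick sanity check with made-up data (hypothetical, not part of the pipeline):
#   pts = np.vstack([np.random.rand(50, 3), [[10.0, 10.0, 10.0]]])
#   kept, idx = remove_outliers(pts, k=5)
#   -> the far-away last point should be dropped, so len(kept) == 50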
# Connect to device
with dai.Device(pipeline, usb2Mode=True) as device:  # usb2Mode is deprecated; newer API: dai.Device(pipeline, maxUsbSpeed=dai.UsbSpeed.HIGH)
    q_rgb = device.getOutputQueue(name="rgb", maxSize=4, blocking=False)
    q_depth = device.getOutputQueue(name="depth", maxSize=4, blocking=False)
    q_nn = device.getOutputQueue(name="detections", maxSize=4, blocking=False)
    q_manip = device.getOutputQueue(name="manip", maxSize=4, blocking=False)
    print("Waiting 5 seconds for camera to stabilize...")
    time.sleep(5)
    frame_count = 0
    while True:
        try:
            in_rgb = q_rgb.get()
            in_depth = q_depth.get()
            in_nn = q_nn.get()
            print("Frame received successfully")
        except RuntimeError:  # a bare except would also swallow KeyboardInterrupt
            print("Could not get frame, XLink error")
            continue
        frame = in_rgb.getCvFrame()
        depth_frame = in_depth.getFrame()
        detections = in_nn.detections
        # Ensure depth frame matches RGB resolution (nearest-neighbor, so no
        # in-between depth values are invented at object edges)
        if frame.shape[:2] != depth_frame.shape:
            depth_frame = cv2.resize(depth_frame, (frame.shape[1], frame.shape[0]), interpolation=cv2.INTER_NEAREST)
        print(f"Number of detections: {len(detections)}")
        # Visualize YOLO detections
        if len(detections) > 0:
            print("Detections found, drawing boxes...")
            for detection in detections:
                x1 = int(detection.xmin * frame.shape[1])
                x2 = int(detection.xmax * frame.shape[1])
                y1 = int(detection.ymin * frame.shape[0])
                y2 = int(detection.ymax * frame.shape[0])
                x1, y1 = max(0, x1), max(0, y1)
                x2, y2 = min(frame.shape[1], x2), min(frame.shape[0], y2)
                if x2 <= x1 or y2 <= y1:
                    continue
                cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0), 2)  # Blue for YOLO
                cv2.putText(frame, labelMap[detection.label], (x1 + 10, y1 + 20), cv2.FONT_HERSHEY_TRIPLEX, 0.5, (255, 0, 0), 2)
                cv2.putText(frame, f"Conf: {detection.confidence:.2f}", (x1 + 10, y1 + 40), cv2.FONT_HERSHEY_TRIPLEX, 0.5, (255, 0, 0), 2)
                cv2.putText(frame, f"Z: {int(detection.spatialCoordinates.z)}mm", (x1 + 10, y1 + 60), cv2.FONT_HERSHEY_TRIPLEX, 0.5, (255, 0, 0), 2)
        # Depth-based clustering
        for detection in detections:
            x1 = int(detection.xmin * frame.shape[1])
            x2 = int(detection.xmax * frame.shape[1])
            y1 = int(detection.ymin * frame.shape[0])
            y2 = int(detection.ymax * frame.shape[0])
            x1, y1 = max(0, x1), max(0, y1)
            x2, y2 = min(frame.shape[1], x2), min(frame.shape[0], y2)
            if x2 <= x1 or y2 <= y1:
                continue
            print(f"Processing ROI at ({x1}, {y1}) to ({x2}, {y2})")
            roi_rgb = frame[y1:y2, x1:x2].copy()
            roi_depth = depth_frame[y1:y2, x1:x2]
            # Normalize depth to 8-bit for medianBlur
            roi_depth_8u = cv2.normalize(roi_depth, None, 0, 255, cv2.NORM_MINMAX, cv2.CV_8U)
            roi_depth_8u = cv2.medianBlur(roi_depth_8u, 9)
            # Dynamic depth threshold
            detection_z = detection.spatialCoordinates.z
            if detection_z <= 0:
                detection_z = 500  # Fallback
                print("Warning: Invalid detection_z, using fallback 500mm")
            depth_range = (max(100, detection_z - 500), min(5000, detection_z + 500))
            depth_mask = (roi_depth >= depth_range[0]) & (roi_depth <= depth_range[1])
            # Create cotton mask using HSV
            hsv_roi = cv2.cvtColor(roi_rgb, cv2.COLOR_BGR2HSV)
            cotton_hsv_range = (np.array([0, 0, 150]), np.array([180, 30, 255]))  # Tighter for cotton
            mask = cv2.inRange(hsv_roi, cotton_hsv_range[0], cotton_hsv_range[1])
            mask = cv2.dilate(mask, np.ones((5, 5), np.uint8), iterations=1)
            mask = cv2.erode(mask, np.ones((3, 3), np.uint8), iterations=1)  # Remove small noise
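            # The inRange bounds above keep low-saturation, high-value pixels
            # (near-white regions); the S/V cutoffs are a tuning choice and
            # will likely need adjusting for different lighting.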
            # Combine masks
            combined_mask = cv2.bitwise_and(mask, depth_mask.astype(np.uint8) * 255)
            # Save ROI and mask for debugging
            cv2.imwrite(f"roi_rgb_{frame_count}.jpg", roi_rgb)
            cv2.imwrite(f"roi_depth_{frame_count}.png", roi_depth_8u)
            cv2.imwrite(f"roi_mask_{frame_count}.png", combined_mask)
            print(f"Saved ROI images: roi_rgb_{frame_count}.jpg, roi_depth_{frame_count}.png, roi_mask_{frame_count}.png")
            # Downsample points
            y_coords, x_coords = np.where(combined_mask > 0)
            if len(y_coords) == 0:
                print("No valid points after masking")
                continue
            sample_indices = np.arange(0, len(y_coords), 8)
            y_coords = y_coords[sample_indices]
            x_coords = x_coords[sample_indices]
            valid_depths = roi_depth[y_coords, x_coords]
            brightness_vals = hsv_roi[y_coords, x_coords, 2]
            print(f"Valid points after downsampling: {len(valid_depths)}")
            print(f"Depth stats: min={valid_depths.min():.0f}mm, max={valid_depths.max():.0f}mm, mean={valid_depths.mean():.0f}mm")
            # Fallback to brightness if depth is invalid
            if valid_depths.max() - valid_depths.min() < 10:
                print("Warning: Invalid depth, falling back to brightness clustering")
                points = np.column_stack((x_coords, y_coords, brightness_vals))
                points_norm = np.zeros_like(points, dtype=float)
                points_norm[:, 0] = points[:, 0] / max(1, roi_rgb.shape[1])
                points_norm[:, 1] = points[:, 1] / max(1, roi_rgb.shape[0])
                points_norm[:, 2] = points[:, 2] / 255.0
            else:
                points = np.column_stack((x_coords, y_coords, valid_depths))
                points_norm = np.zeros_like(points, dtype=float)
                points_norm[:, 0] = points[:, 0] / max(1, roi_rgb.shape[1])
                points_norm[:, 1] = points[:, 1] / max(1, roi_rgb.shape[0])
                points_norm[:, 2] = (points[:, 2] - points[:, 2].min()) / max(1e-6, points[:, 2].max() - points[:, 2].min())
            # Statistical outlier removal
            points_norm, valid_indices = remove_outliers(points_norm)
            points = points[valid_indices]
            # OPTICS clustering
            min_samples = max(20, int(len(points_norm) * 0.05))  # 5% of points, at least 20
            if len(points_norm) < min_samples:
                print("Too few points for OPTICS, skipping ROI")  # OPTICS raises if n_samples < min_samples
                continue
            optics = OPTICS(min_samples=min_samples, xi=0.05, min_cluster_size=min_samples).fit(points_norm)
            labels = optics.labels_
            unique_labels = np.unique(labels)
            n_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)
            print(f"OPTICS clusters: {n_clusters} (min_samples={min_samples})")
            # Select top 5 clusters by size
            cluster_sizes = [(label, len(points_norm[labels == label])) for label in unique_labels if label != -1]
            cluster_sizes.sort(key=lambda x: x[1], reverse=True)
            top_clusters = [label for label, size in cluster_sizes[:5] if size >= min_samples]
            # Draw clusters
            for label in top_clusters:
                cluster_size = len(points_norm[labels == label])
                print(f"Cluster {label}: {cluster_size} points")
                cluster_points = points[labels == label]
                x_min, y_min = int(cluster_points[:, 0].min()), int(cluster_points[:, 1].min())
                x_max, y_max = int(cluster_points[:, 0].max()), int(cluster_points[:, 1].max())
                cv2.rectangle(frame, (x1 + x_min, y1 + y_min), (x1 + x_max, y1 + y_max), (0, 255, 0), 2)
        # Display output
        cv2.imshow("Clustered Frame", frame)
        print("Press 'q' to quit.")
        if cv2.waitKey(1) == ord('q'):
            break
        frame_count += 1
cv2.destroyAllWindows()
Error:
101: DeprecationWarning: Use constructor taking 'UsbSpeed' instead
with dai.Device(pipeline, usb2Mode=True) as device:
[18443010510C9F1200] [1.1.1] [2.854] [SpatialDetectionNetwork(5)] [warning] Network compiled for 6 shaves, maximum available 7, compiling for 3 shaves likely will yield in better performance
Waiting 5 seconds for camera to stabilize...
[18443010510C9F1200] [1.1.1] [3.292] [StereoDepth(1)] [error] Maximum disparity value '1520' exceeds the maximum supported '1024' by median filter. Disabling median filter!
[18443010510C9F1200] [1.1.1] [3.377] [StereoDepth(1)] [error] Maximum disparity value '1520' exceeds the maximum supported '1024' by median filter. Disabling median filter!
[18443010510C9F1200] [1.1.1] [3.892] [SpatialDetectionNetwork(5)] [error] Mask is not defined for output layer with width '3549'. Define at pipeline build time using: 'setAnchorMasks' for 'side3549'.
[18443010510C9F1200] [1.1.1] [4.835] [SpatialDetectionNetwork(5)] [error] Mask is not defined for output layer with width '3549'. Define at pipeline build time using: 'setAnchorMasks' for 'side3549'.
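For reference, the last two log lines are the anchor error itself: the firmware wants anchors and anchor masks registered for an output layer it names 'side3549'. Notably 13*13 + 26*26 + 52*52 = 3549, i.e. the summed grid-cell counts of the three YOLO scales for a 416x416 input, which suggests the blob was exported with a single concatenated output tensor instead of three per-scale outputs; if that is the case, the on-device decoder likely cannot parse it even with anchors defined, and the blob would need re-exporting with per-scale outputs. Purely as a sketch of the API calls the error message asks for (anchor values copied from the commented-out lines above, not verified against this model):

spatialDetectionNetwork.setAnchors([10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326])
# One entry per output side, keyed by the exact side name the firmware reports;
# each mask lists the indices of the anchor pairs belonging to that scale.
spatialDetectionNetwork.setAnchorMasks({"side3549": [6, 7, 8]})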