Sharing the code for anyone else who needs it.
main.py
import threading

import image_processing as IP

if __name__ == "__main__":
    camera_type = "rgb"
    roi = IP.ROI(1700, 1520)
    thread = threading.Thread(target=IP.process_images, args=(camera_type, roi))
    thread.start()
    thread.join()
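main.py just launches the processing in a worker thread: camera_type is only used to label the OpenCV windows, and the ROI (1700, 1520) is the top-left corner of the 1024x1024 crop that image_processing.py takes out of the full 12 MP ISP frame.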
image_processing.py
import cv2
import numpy as np
import depthai as dai
import east
import blobconverter
class ROI:
    """Region of interest: (x, y) is the top-left corner of a fixed 1024x1024 crop."""
    def __init__(self, x, y):
        self.x = x
        self.y = y
        self.width = 1024
        self.height = 1024
class HostSeqSync:
    """Buffers messages on the host and retrieves them by sequence number."""
    def __init__(self):
        self.imgFrames = []

    def add_msg(self, msg):
        self.imgFrames.append(msg)

    def get_msg(self, target_seq):
        # Drop everything older than the requested sequence number,
        # then return the matching (now oldest) frame
        for i, imgFrame in enumerate(self.imgFrames):
            if target_seq == imgFrame.getSequenceNum():
                self.imgFrames = self.imgFrames[i:]
                break
        return self.imgFrames[0]
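# Example: after add_msg() of frames with sequence numbers 5, 6 and 7,
# get_msg(6) discards frame 5 and returns frame 6 (frames 6 and 7 stay buffered).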
def process_images(camera_type, roi):
    pipeline = dai.Pipeline()
    version = "2022.1"
    pipeline.setOpenVINOVersion(version=dai.OpenVINO.Version.VERSION_2022_1)

    colorCam = pipeline.create(dai.node.ColorCamera)
    colorCam.setPreviewSize(256, 256)
    #colorCam.setVideoSize(2048, 2048)  # 4 times larger on both axes
    colorCam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_12_MP)
    colorCam.setInterleaved(False)
    colorCam.setBoardSocket(dai.CameraBoardSocket.RGB)
    colorCam.setFps(10)
    # MODIFICATION -> creation of the video ImageManip
    manip_video = pipeline.create(dai.node.ImageManip)
    RrVideo = dai.RotatedRect()
    # Convert the ROI's top-left corner into the rectangle's center
    RrVideo.center.x, RrVideo.center.y = int(roi.x + (roi.width / 2)), int(roi.y + (roi.height / 2))
    RrVideo.size.width, RrVideo.size.height = 1024, 1024
    manip_video.initialConfig.setCropRotatedRect(RrVideo, False)
    manip_video.setResize(1024, 1024)
    manip_video.setMaxOutputFrameSize(1024 * 1024 * 3)
    manip_video.setFrameType(dai.RawImgFrame.Type.BGR888p)
    colorCam.isp.link(manip_video.inputImage)
    # MODIFICATION -> creation of the preview ImageManip
    manip_preview = pipeline.create(dai.node.ImageManip)
    manip_preview.setResize(256, 256)
    manip_preview.setMaxOutputFrameSize(int(256 * 256 * 3))
    manip_preview.setFrameType(dai.RawImgFrame.Type.BGR888p)
    manip_video.out.link(manip_preview.inputImage)
    controlIn = pipeline.create(dai.node.XLinkIn)
    controlIn.setStreamName('control')
    controlIn.out.link(colorCam.inputControl)

    cam_xout = pipeline.create(dai.node.XLinkOut)
    cam_xout.setStreamName('video')
    manip_video.out.link(cam_xout.input)
    # ---------------------------------------
    # 1st stage NN - text-detection
    # ---------------------------------------
    nn = pipeline.create(dai.node.NeuralNetwork)
    nn.setBlobPath(blobconverter.from_zoo(name="east_text_detection_256x256", zoo_type="depthai", shaves=6, version=version))
    manip_preview.out.link(nn.input)

    nn_xout = pipeline.create(dai.node.XLinkOut)
    nn_xout.setStreamName('detections')
    nn.out.link(nn_xout.input)
    # ---------------------------------------
    # 2nd stage NN - text-recognition-0012
    # ---------------------------------------
    manip = pipeline.create(dai.node.ImageManip)
    manip.setWaitForConfigInput(True)

    manip_img = pipeline.create(dai.node.XLinkIn)
    manip_img.setStreamName('manip_img')
    manip_img.out.link(manip.inputImage)

    manip_cfg = pipeline.create(dai.node.XLinkIn)
    manip_cfg.setStreamName('manip_cfg')
    manip_cfg.out.link(manip.inputConfig)

    manip_xout = pipeline.create(dai.node.XLinkOut)
    manip_xout.setStreamName('manip_out')

    nn2 = pipeline.create(dai.node.NeuralNetwork)
    nn2.setBlobPath(blobconverter.from_zoo(name="text-recognition-0012", shaves=6, version=version))
    nn2.setNumInferenceThreads(2)
    manip.out.link(nn2.input)
    manip.out.link(manip_xout.input)

    nn2_xout = pipeline.create(dai.node.XLinkOut)
    nn2_xout.setStreamName("recognitions")
    nn2.out.link(nn2_xout.input)
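    # Pipeline graph built above, for reference:
    #   colorCam.isp -> manip_video (1024x1024 ROI crop) -> cam_xout ("video")
    #   manip_video.out -> manip_preview (256x256) -> nn (EAST) -> nn_xout ("detections")
    #   manip_img / manip_cfg (from host) -> manip -> nn2 (text-recognition-0012) -> nn2_xout ("recognitions")
    #   manip.out also goes to manip_xout ("manip_out") so the host can display each crop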
    def to_tensor_result(packet):
        return {
            name: np.array(packet.getLayerFp16(name))
            for name in [tensor.name for tensor in packet.getRaw().tensors]
        }

    def to_planar(frame):
        # Interleaved HWC -> planar CHW, as dai.ImgFrame expects
        return frame.transpose(2, 0, 1).flatten()
    with dai.Device(pipeline) as device:
        q_vid = device.getOutputQueue("video", 4, blocking=False)
        #q_prev = device.getOutputQueue("preview", 4, blocking=False)
        #q1 = device.getOutputQueue("crop", 4, blocking=False)
        q_det = device.getOutputQueue("detections", 4, blocking=False)
        q_rec = device.getOutputQueue("recognitions", 4, blocking=True)
        q_manip_img = device.getInputQueue("manip_img")
        q_manip_cfg = device.getInputQueue("manip_cfg")
        q_manip_out = device.getOutputQueue("manip_out", 4, blocking=False)
        controlQueue = device.getInputQueue('control')

        frame = None
        cropped_stacked = None
        rotated_rectangles = []
        rec_pushed = 0
        rec_received = 0
        host_sync = HostSeqSync()
        class CTCCodec(object):
            """Convert between text-label and text-index"""
            def __init__(self, characters):
                dict_character = list(characters)
                self.dict = {}
                for i, char in enumerate(dict_character):
                    self.dict[char] = i + 1
                self.characters = dict_character

            def decode(self, preds):
                """Convert text-index into text-label."""
                texts = []
                index = 0
                preds = preds.astype(np.float16)
                preds_index = np.argmax(preds, 2)
                preds_index = preds_index.transpose(1, 0)
                preds_index_reshape = preds_index.reshape(-1)
                preds_sizes = np.array([preds_index.shape[1]] * preds_index.shape[0])
                for l in preds_sizes:
                    t = preds_index_reshape[index:index + l]
                    if t.shape[0] == 0:
                        continue
                    char_list = []
                    for i in range(l):
                        # Collapse repeated characters and drop the CTC blank ('#')
                        if not (i > 0 and t[i - 1] == t[i]):
                            if self.characters[t[i]] != '#':
                                char_list.append(self.characters[t[i]])
                    text = ''.join(char_list)
                    texts.append(text)
                    index += l
                return texts
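        # Worked example of the greedy CTC decode above: with the character set
        # '0123456789abcdefghijklmnopqrstuvwxyz#', a per-timestep argmax sequence
        # [0, 0, 10, 36, 1] collapses the repeated 0s, maps 10 -> 'a', drops the
        # blank (36 -> '#') and keeps 1 -> '1', yielding the text "0a1".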
        characters = '0123456789abcdefghijklmnopqrstuvwxyz#'
        codec = CTCCodec(characters)

        ctrl = dai.CameraControl()
        ctrl.setAutoFocusMode(dai.CameraControl.AutoFocusMode.CONTINUOUS_VIDEO)
        ctrl.setAutoFocusTrigger()
        controlQueue.send(ctrl)
        while True:
            vid_in = q_vid.tryGet()
            #prev_in = q_prev.tryGet()
            if vid_in is not None:
                host_sync.add_msg(vid_in)
            #if prev_in is not None:
            #    host_sync.add_msg(prev_in)

            # Drain all recognition results that arrived since the last pass
            while True:
                #framepreview = host_sync.get_msg(prev_in.getSequenceNum()).getCvFrame().copy()
                #cv2.imshow("preview", framepreview)
                in_rec = q_rec.tryGet()
                if in_rec is None:
                    break
                rec_data = np.array(in_rec.getFirstLayerFp16()).reshape(30, 1, 37)
                decoded_text = codec.decode(rec_data)[0]
                pos = rotated_rectangles[rec_received]
                print("{:2}: {:20}".format(rec_received, decoded_text),
                      "center({:3},{:3}) size({:3},{:3}) angle{:5.1f} deg".format(
                          int(pos[0][0]), int(pos[0][1]), pos[1][0], pos[1][1], pos[2]))
                if cropped_stacked is not None:
                    cv2.putText(cropped_stacked, decoded_text,
                                (120 + 10, 32 * rec_received + 24),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
                    cv2.imshow(f"cropped_stacked {camera_type}", cropped_stacked)
                rec_received += 1

            if cv2.waitKey(1) == ord('q'):
                break
            if rec_received >= rec_pushed:
                in_det = q_det.tryGet()
                if in_det is not None:
                    frame = host_sync.get_msg(in_det.getSequenceNum()).getCvFrame().copy()
                    scores, geom1, geom2 = to_tensor_result(in_det).values()
                    scores = np.reshape(scores, (1, 1, 64, 64))
                    geom1 = np.reshape(geom1, (1, 4, 64, 64))
                    geom2 = np.reshape(geom2, (1, 1, 64, 64))
                    bboxes, confs, angles = east.decode_predictions(scores, geom1, geom2)
                    boxes, angles = east.non_max_suppression(np.array(bboxes), probs=confs, angles=np.array(angles))
                    rotated_rectangles = [
                        east.get_cv_rotated_rect(bbox, angle * -1)
                        for (bbox, angle) in zip(boxes, angles)
                    ]
                    rec_received = 0
                    rec_pushed = len(rotated_rectangles)
                    if rec_pushed:
                        print("====== Pushing for recognition, count:", rec_pushed)
                    cropped_stacked = None
                    for idx, rotated_rect in enumerate(rotated_rectangles):
                        # Detection ran on the 256x256 preview; scale back up to the 1024x1024 frame
                        rotated_rect[0][0] = rotated_rect[0][0] * 4
                        rotated_rect[0][1] = rotated_rect[0][1] * 4
                        rotated_rect[1][0] = rotated_rect[1][0] * 4
                        rotated_rect[1][1] = rotated_rect[1][1] * 4
                        points = cv2.boxPoints(rotated_rect).astype(np.int32)
                        print(rotated_rect)
                        cv2.polylines(frame, [points], isClosed=True, color=(255, 0, 0), thickness=1, lineType=cv2.LINE_8)

                        rr = dai.RotatedRect()
                        rr.center.x = rotated_rect[0][0]
                        rr.center.y = rotated_rect[0][1]
                        rr.size.width = rotated_rect[1][0]
                        rr.size.height = rotated_rect[1][1]
                        rr.angle = rotated_rect[2]
                        cfg = dai.ImageManipConfig()
                        cfg.setCropRotatedRect(rr, False)
                        cfg.setResize(120, 32)
                        if idx == 0:
                            # Send the full frame only once; later crops reuse it
                            h, w, c = frame.shape
                            imgFrame = dai.ImgFrame()
                            imgFrame.setData(to_planar(frame))
                            imgFrame.setType(dai.ImgFrame.Type.BGR888p)
                            imgFrame.setWidth(w)
                            imgFrame.setHeight(h)
                            q_manip_img.send(imgFrame)
                        else:
                            cfg.setReusePreviousImage(True)
                        q_manip_cfg.send(cfg)

                        transformed = q_manip_out.get().getCvFrame()
                        # Placeholder strip on the right where the recognized text is drawn later
                        rec_placeholder_img = np.zeros((32, 200, 3), np.uint8)
                        transformed = np.hstack((transformed, rec_placeholder_img))
                        if cropped_stacked is None:
                            cropped_stacked = transformed
                        else:
                            cropped_stacked = np.vstack((cropped_stacked, transformed))
                    if cropped_stacked is not None:
                        cv2.imshow(f"cropped_stacked {camera_type}", cropped_stacked)
            if frame is not None:
                cv2.imshow(f'frame {camera_type}', frame)

            key = cv2.waitKey(1)
            if key == ord('q'):
                break
            elif key == ord('t'):
                print("Autofocus trigger (and disable continuous)")
                ctrl = dai.CameraControl()
                ctrl.setAutoFocusMode(dai.CameraControl.AutoFocusMode.AUTO)
                ctrl.setAutoFocusTrigger()
                controlQueue.send(ctrl)
Don't forget east.py, which is provided in the GitHub repo.
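If you don't have east.py at hand, the interface the code above relies on looks roughly like this (signatures inferred from the calls, not copied from the actual file):

# east.py - expected interface (stubs only; use the real implementation from the repo)
def decode_predictions(scores, geometry1, geometry2):
    """Turn the EAST score map (1, 1, 64, 64) and geometry maps (1, 4, 64, 64) /
    (1, 1, 64, 64) into parallel lists of candidate boxes, confidences and angles."""
    ...

def non_max_suppression(boxes, probs, angles):
    """Filter overlapping candidates; return the surviving boxes and their angles."""
    ...

def get_cv_rotated_rect(bbox, angle):
    """Convert a box plus angle into OpenCV's ((cx, cy), (w, h), angle) rotated-rect format."""
    ...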