Skip to content

Commit 01ca0af

Browse files
authored
Add example for Interactive OCR using Tesseract (#108)
1 parent 1e7914b commit 01ca0af

File tree

3 files changed

+170
-0
lines changed

3 files changed

+170
-0
lines changed
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
2+
3+
## Interactive BBOX OCR using Tesseract
4+
This example uses an OCR engine for interactive ML-assisted labeling; it can speed up annotation for layout detection, classification, and recognition models.
5+
6+
Tesseract is used for OCR but minimal adaptation is needed to connect other OCR engines or models.
7+
8+
Tested with LabelStudio v1.4.1.post1, and assuming data for annotation is stored in AWS S3 (some adaptation is needed if using other storage methods).
9+
10+
### Setup process
11+
0. Install label-studio-ml and Tesseract
12+
13+
1. Start LabelStudio and create a new project
14+
15+
2. In the project **Settings**, set up the **Cloud storage**. Add your Source and Target storage by connecting to AWS S3 Bucket
16+
17+
3. In the project **Settings**, set up the **Labeling Interface**
18+
Fill in the following template code; it is important to specify `smart="true"` in RectangleLabels.
19+
```
20+
<View>
21+
<View style="display:flex;align-items:start;gap:0px;flex-direction:column-reverse">
22+
<View style="padding:0px; border: 1px solid #555;">
23+
<Image name="image" value="$ocr" zoom="true" zoomControl="false" rotateControl="true" width="100%" height="100%" maxHeight="auto" maxWidth="auto"/>
24+
</View>
25+
<RectangleLabels name="bbox" toName="image" strokeWidth="1" smart="true">
26+
<Label value="Label1" background="green"/>
27+
<Label value="Label2" background="blue"/>
28+
<Label value="Label3" background="red"/>
29+
</RectangleLabels>
30+
</View>
31+
<TextArea name="transcription" toName="image" editable="true" perRegion="true" required="false" maxSubmissions="1" rows="5" placeholder="Recognized Text" displayMode="region-list"/>
32+
</View>
33+
```
34+
35+
4. Setup Tesseract ML backend:
36+
```
37+
pip install -r label_studio_ml/examples/tesseract/requirements.txt
38+
label-studio-ml init my-ml-backend --from label_studio_ml/examples/tesseract/ner_ml_backend.py --force
39+
label-studio-ml start my-ml-backend -d -p=9090 --debug
40+
```
41+
42+
5. Open the **Machine Learning** settings and click **Add Model**. Add the URL `http://localhost:9090` and save the model as an ML backend.
43+
44+
6. To use this functionality, activate `Auto-Annotation` and use the `Auto-Detect` rectangle tool for drawing boxes.
45+
46+
Example below:
47+
48+
![ls_demo_ocr](https://user-images.githubusercontent.com/17755198/165186574-05f0236f-a5f2-4179-ac90-ef11123927bc.gif)
49+
50+
Reference links:
51+
- https://labelstud.io/blog/Improve-OCR-quality-with-Tesseract-and-Label-Studio.html
52+
- https://labelstud.io/blog/release-130.html
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
pytesseract==0.3.9
2+
boto3==1.22.0
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
2+
from PIL import Image
3+
import pytesseract as pt
4+
import boto3
5+
from label_studio_ml.model import LabelStudioMLBase
6+
import pathlib
7+
import os
8+
import logging
9+
10+
logger = logging.getLogger(__name__)

# NOTE: the original `global OCR_config, aws_credentials` statement was removed —
# `global` is a no-op at module level; these names are module globals already.

# Tesseract configuration passed to image_to_string.
# "--psm 6" assumes a single uniform block of text inside the drawn box.
OCR_config = "--psm 6"

# Credentials used by download_S3_file to reach the S3 bucket.
# Fill these in before starting the backend (empty strings will fail at runtime).
aws_credentials = {"aws_access_key_id": "",
                   "aws_secret_access_key": "",
                   "aws_session_token": ""
                   }
17+
18+
def split_s3_path(s3_path):
    """Split an ``s3://bucket/key/...`` URL into a ``(bucket, key)`` pair.

    The key is everything after the first ``/`` following the bucket name
    (empty string when the URL names only a bucket).
    """
    without_scheme = s3_path.replace("s3://", "")
    bucket, _, key = without_scheme.partition("/")
    return bucket, key
23+
24+
def download_S3_file(img_path_url=None, aws_credentials=None):
    """Download an image from S3 into the working directory.

    The file is saved as ``tmp.<ext>`` (extension taken from the S3 key), so
    concurrent calls overwrite each other's download — acceptable for this
    single-task example backend.

    :param img_path_url: full ``s3://bucket/key`` URL of the image
    :param aws_credentials: dict with ``aws_access_key_id``,
        ``aws_secret_access_key`` and ``aws_session_token`` keys
    :return: local filename of the downloaded image (``tmp.<ext>``)
    :raises ValueError: if credentials are not provided
    """
    if aws_credentials is None:
        # Fail early with a clear message instead of a TypeError on subscript.
        raise ValueError("aws_credentials dict is required to access S3")
    session = boto3.Session(
        aws_access_key_id=aws_credentials["aws_access_key_id"],
        aws_secret_access_key=aws_credentials["aws_secret_access_key"],
        aws_session_token=aws_credentials["aws_session_token"]
    )
    resource = session.resource('s3')
    bucket, key = split_s3_path(img_path_url)
    # Reuse the original extension so PIL can sniff the format from the name.
    file_extension = pathlib.Path(key).suffix
    key_basename = "tmp{}".format(file_extension)
    my_bucket = resource.Bucket(bucket)
    my_bucket.download_file(key, key_basename)
    return key_basename
42+
43+
class BBOXOCR(LabelStudioMLBase):
    """Label Studio ML backend: OCR the rectangle a user just drew.

    Expects a labeling config with an ``<Image value="$ocr">`` tag, a smart
    ``RectangleLabels`` control and a ``TextArea name="transcription"`` for
    the recognized text (see the example README template).
    """

    def __init__(self, **kwargs):
        super(BBOXOCR, self).__init__(**kwargs)

    def predict(self, tasks, **kwargs):
        """Run Tesseract on the bbox from the interactive context.

        :param tasks: list of Label Studio tasks; only the first is used
        :param kwargs: must contain ``context`` with the user-drawn region
            for an interactive prediction; otherwise returns no predictions
        :return: a one-element list with the original region plus a
            ``textarea`` result carrying the OCR text, or ``[]``
        """
        task = tasks[0]
        img_path_url = task["data"]["ocr"]
        context = kwargs.get('context')
        if not context or not context["result"]:
            # No interactive region to transcribe (non-interactive call).
            return []
        result = context["result"][0]
        meta = self._extract_meta({**task, **result})
        # Region coordinates arrive as percentages; convert to pixels
        # so PIL can crop the image.
        x = meta["x"] * meta["original_width"] / 100
        y = meta["y"] * meta["original_height"] / 100
        w = meta["width"] * meta["original_width"] / 100
        h = meta["height"] * meta["original_height"] / 100
        filepath = download_S3_file(img_path_url, aws_credentials)
        # Context manager closes the image file handle (the original
        # Image.open call leaked it).
        with Image.open(filepath) as img:
            result_text = pt.image_to_string(img.crop((x, y, x + w, y + h)),
                                             config=OCR_config)
        meta["text"] = result_text
        temp = {
            "original_width": meta["original_width"],
            "original_height": meta["original_height"],
            "image_rotation": 0,
            "value": {
                # Converted back to percentages, as Label Studio expects.
                "x": x / meta["original_width"] * 100,
                "y": y / meta["original_height"] * 100,
                "width": w / meta["original_width"] * 100,
                "height": h / meta["original_height"] * 100,
                "rotation": 0,
                "text": [
                    meta["text"]
                ]
            },
            "id": meta["id"],
            # Hardcoded to the TextArea name in the example labeling config.
            "from_name": "transcription",
            "to_name": meta['to_name'],
            "type": "textarea",
            "origin": "manual"
        }
        return [{
            'result': [result, temp],
            'score': 0
        }]

    @staticmethod
    def _extract_meta(task):
        """Flatten a merged task+region dict into the fields predict needs.

        Returns an empty dict for a falsy input.
        """
        meta = dict()
        if task:
            meta['id'] = task['id']
            meta['from_name'] = task['from_name']
            meta['to_name'] = task['to_name']
            meta['type'] = task['type']
            # Region geometry, in percentages of the original image size.
            meta['x'] = task['value']['x']
            meta['y'] = task['value']['y']
            meta['width'] = task['value']['width']
            meta['height'] = task['value']['height']
            meta["original_width"] = task['original_width']
            meta["original_height"] = task['original_height']
        return meta

0 commit comments

Comments
 (0)