4
4
5
5
import argparse
6
6
import logging
7
- from typing import Union
7
+ from concurrent .futures import ThreadPoolExecutor
8
+ from math import floor , ceil
8
9
10
+ import numpy as np
11
+ import torch
12
+ from clams import ClamsApp , Restifier
13
+ from doctr .models import ocr_predictor
9
14
from lapps .discriminators import Uri
15
+ from mmif import Mmif , View , Annotation , Document , AnnotationTypes , DocumentTypes
16
+ from mmif .utils import video_document_helper as vdh
10
17
11
- from concurrent .futures import ThreadPoolExecutor
12
18
13
19
# Imports needed for Clams and MMIF.
14
20
# Non-NLP Clams applications will require AnnotationTypes
15
21
16
- from clams import ClamsApp , Restifier
17
- from mmif import Mmif , View , Annotation , Document , AnnotationTypes , DocumentTypes
18
- from mmif .utils import video_document_helper as vdh
19
-
20
- from doctr .models import ocr_predictor
21
- import torch
22
- import numpy as np
23
22
23
def rel_coords_to_abs(coords, width, height):
    """
    Convert relative (fractional) rectangle coordinates to absolute pixel coordinates.

    :param coords: ((x1, y1), (x2, y2)) relative corner points, where x is the
        horizontal fraction of the image width and y the vertical fraction of the
        image height (DocTR geometry convention, top-left and bottom-right corners).
    :param width: image width in pixels
    :param height: image height in pixels
    :return: [(x1, y1), (x2, y2)] in pixels, flooring the top-left corner and
        ceiling the bottom-right corner so the absolute box never shrinks the region.
    """
    x1, y1 = coords[0]
    x2, y2 = coords[1]
    # Fix: x (horizontal) must scale by width and y (vertical) by height; the
    # previous code multiplied x by height and y by width, which silently swapped
    # the axes and produced wrong boxes on any non-square frame.
    return [(floor(x1 * width), floor(y1 * height)), (ceil(x2 * width), ceil(y2 * height))]
32
+
24
33
25
34
def create_bbox(view: View, coordinates, box_type, time_point):
    """
    Add a BoundingBox annotation to *view* and return it.

    The box carries its pixel coordinates, a "label" describing the kind of
    region (e.g. "text"), and the id of the TimePoint it was extracted from.
    """
    box = view.new_annotation(AnnotationTypes.BoundingBox)
    # property order mirrors the original: coordinates, label, timePoint
    for prop, value in (("coordinates", coordinates),
                        ("label", box_type),
                        ("timePoint", time_point)):
        box.add_property(prop, value)
    return box
31
40
@@ -50,40 +59,24 @@ def __init__(self):
50
59
self .gpu = False
51
60
52
61
def _appmetadata(self):
    """Intentionally a no-op: app metadata is supplied by ``metadata.py``."""
    pass
57
-
58
class LingUnit(object):
    """
    A thin wrapper for LAPPS linguistic unit annotations (paragraphs, sentences)
    that represent different geometric levels from DocTR OCR output.

    Wraps a region annotation, binds it to its source text document, and
    accumulates child units so their ids can later be recorded as "targets".
    """

    def __init__(self, region: Annotation, document: Document):
        self.region = region
        self.region.add_property("document", document.id)
        # child units (sentences under a paragraph, tokens under a sentence)
        self.children = []

    def add_child(self, sentence):
        # append keeps insertion order, which collect_targets relies on
        self.children.append(sentence)

    def collect_targets(self):
        # record the ids of all collected children on the wrapped region
        child_ids = [unit.region.id for unit in self.children]
        self.region.add_property("targets", child_ids)
87
80
88
81
class Token :
89
82
"""
@@ -96,72 +89,78 @@ def __init__(self, region: Annotation, document: Document, start: int, end: int)
96
89
self .region .add_property ("end" , end )
97
90
98
91
def process_timepoint(self, representative: Annotation, new_view: View, video_doc: Document):
    """
    OCR the single video frame identified by a representative TimePoint and
    record the recognized text as a new TextDocument in *new_view*.

    Returns the (text_document, representative) pair so the caller can align
    the produced document with its source time point.
    """
    # Translate the timestamp into a frame index, honoring the annotation's
    # own time unit and the video document's fps property.
    frame_idx = vdh.convert(representative.get("timePoint"),
                            representative.get("timeUnit"), "frame",
                            video_doc.get("fps"))
    frame: np.ndarray = vdh.extract_frames_as_images(video_doc, [frame_idx], as_PIL=False)[0]
    ocr_result = self.reader([frame])
    # one image in -> one page out, so only pages[0] is relevant
    page_blocks = ocr_result.pages[0].blocks
    text_document: Document = new_view.new_textdocument(ocr_result.render())

    frame_h, frame_w = frame.shape[:2]
    for ocr_block in page_blocks:
        try:
            self.process_block(ocr_block, new_view, text_document, representative, frame_w, frame_h)
        except Exception as e:
            # a bad block should not abort the whole frame
            self.logger.error(f"Error processing block: {e}")
            continue

    return text_document, representative
114
110
115
def process_block(self, block, view, text_document, representative, img_width, img_height):
    """
    Map one DocTR block to a LAPPS Paragraph annotation, give it a pixel-space
    bounding box aligned to the source time point, and recurse into its lines.
    """
    paragraph = self.LingUnit(view.new_annotation(at_type=Uri.PARAGRAPH), text_document)
    abs_coords = rel_coords_to_abs(block.geometry, img_width, img_height)
    block_box = create_bbox(view, abs_coords, "text", representative.id)
    create_alignment(view, paragraph.region.id, block_box.id)

    for ocr_line in block.lines:
        try:
            sent = self.process_line(ocr_line, view, text_document, representative, img_width, img_height)
        except Exception as e:
            # skip just this line; the rest of the block is still processed
            self.logger.error(f"Error processing line: {e}")
            continue
        paragraph.add_child(sent)
    paragraph.collect_targets()
128
124
129
def process_line(self, line, view, text_document, representative, img_width, img_height):
    """
    Map one DocTR line to a LAPPS Sentence annotation with an aligned pixel-space
    bounding box, then create Token annotations for its confident words.

    Returns the wrapped Sentence unit (targets already collected).
    """
    sentence = self.LingUnit(view.new_annotation(at_type=Uri.SENTENCE), text_document)
    line_box = create_bbox(view, rel_coords_to_abs(line.geometry, img_width, img_height), "text", representative.id)
    create_alignment(view, sentence.region.id, line_box.id)

    for word in line.words:
        # guard clause: drop low-confidence recognitions entirely
        if word.confidence <= 0.4:
            continue
        # NOTE(review): str.find only locates the FIRST occurrence of the word in
        # the rendered document, so repeated words all share one offset and a miss
        # yields start == -1; confirm downstream consumers tolerate this.
        start = text_document.text_value.find(word.value)
        end = start + len(word.value)
        token = self.Token(view.new_annotation(at_type=Uri.TOKEN), text_document, start, end)
        word_box = create_bbox(view, rel_coords_to_abs(word.geometry, img_width, img_height), "text", representative.id)
        create_alignment(view, token.region.id, word_box.id)
        sentence.add_child(token)

    sentence.collect_targets()
    return sentence
145
141
146
- def _annotate (self , mmif : Union [ str , dict , Mmif ] , ** parameters ) -> Mmif :
142
+ def _annotate (self , mmif : Mmif , ** parameters ) -> Mmif :
147
143
if self .gpu :
148
144
self .logger .debug ("running app on GPU" )
149
145
else :
150
146
self .logger .debug ("running app on CPU" )
151
147
video_doc : Document = mmif .get_documents_by_type (DocumentTypes .VideoDocument )[0 ]
152
- input_view : View = mmif .get_views_for_document (video_doc .properties .id )[0 ]
148
+ input_view : View = mmif .get_views_for_document (video_doc .properties .id )[- 1 ]
153
149
154
150
new_view : View = mmif .new_view ()
155
151
self .sign_view (new_view , parameters )
156
152
157
153
with ThreadPoolExecutor () as executor :
158
154
futures = []
159
155
for timeframe in input_view .get_annotations (AnnotationTypes .TimeFrame ):
160
- representative_ids = timeframe .get ("representatives" )
161
- representatives = [
162
- input_view . get_annotation_by_id ( representative_id ) for representative_id in representative_ids ]
163
- for representative in representatives :
156
+ for rep_id in timeframe .get ("representatives" ):
157
+ if Mmif . id_delimiter not in rep_id :
158
+ rep_id = f' { input_view . id } { Mmif . id_delimiter } { rep_id } '
159
+ representative = mmif [ rep_id ]
164
160
futures .append (executor .submit (self .process_timepoint , representative , new_view , video_doc ))
161
+ if len (futures ) == 0 :
162
+ # TODO (krim @ 4/18/24): if "representatives" is not present, process just the middle frame
163
+ pass
165
164
166
165
for future in futures :
167
166
try :
0 commit comments