33"""
44
55import argparse
6+ import json
67import logging
78from concurrent .futures import ThreadPoolExecutor
89from math import floor , ceil
10+ from typing import Tuple
911
1012import numpy as np
1113import torch
1618from mmif .utils import video_document_helper as vdh
1719
1820
19- # Imports needed for Clams and MMIF.
20- # Non-NLP Clams applications will require AnnotationTypes
21-
22-
23- def rel_coords_to_abs (coords , width , height ):
24- """
25- Simple conversion from relative coordinates (percentage) to absolute coordinates (pixel).
26- Assumes the passed shape is a rectangle, represented by top-left and bottom-right corners,
27- and compute floor and ceiling based on the geometry.
28- """
29- x1 , y1 = coords [0 ]
30- x2 , y2 = coords [1 ]
31- return [(floor (x1 * height ), floor (y1 * width )), (ceil (x2 * height ), ceil (y2 * width ))]
32-
33-
34- def create_bbox (view : View , coordinates , box_type , time_point ):
35- bbox = view .new_annotation (AnnotationTypes .BoundingBox )
36- bbox .add_property ("coordinates" , coordinates )
37- bbox .add_property ("label" , box_type )
38- bbox .add_property ("timePoint" , time_point )
39- return bbox
40-
41-
42- def create_alignment (view : View , source , target ) -> None :
43- alignment = view .new_annotation (AnnotationTypes .Alignment )
44- alignment .add_property ("source" , source )
45- alignment .add_property ("target" , target )
46-
47-
4821class DoctrWrapper (ClamsApp ):
4922
5023 def __init__ (self ):
@@ -61,83 +34,63 @@ def __init__(self):
6134 def _appmetadata (self ):
6235 # using metadata.py
6336 pass
64-
65- class LingUnit (object ):
66- """
67- A thin wrapper for LAPPS linguistic unit annotations that
68- represent different geometric levels from DocTR OCR output.
69- """
70- def __init__ (self , region : Annotation , document : Document ):
71- self .region = region
72- self .region .add_property ("document" , document .id )
73- self .children = []
74-
75- def add_child (self , sentence ):
76- self .children .append (sentence )
77-
78- def collect_targets (self ):
79- self .region .add_property ("targets" , [child .region .id for child in self .children ])
8037
81- class Token :
@staticmethod
def rel_coords_to_abs(coords: Tuple[Tuple[float, float], Tuple[float, float]],
                      width: int, height: int) -> Tuple[Tuple[int, int], Tuple[int, int]]:
    """
    Convert a rectangle from relative coordinates (fractions of the image size)
    to absolute pixel coordinates.

    The rectangle is given as ``((x1, y1), (x2, y2))``: its top-left and
    bottom-right corners. The top-left corner is rounded down and the
    bottom-right corner is rounded up, so the resulting pixel box never
    shrinks below the relative one.

    :param coords: ``((x1, y1), (x2, y2))`` in relative units
    :param width: image width in pixels, used to scale x values
    :param height: image height in pixels, used to scale y values
    :return: ``((x1, y1), (x2, y2))`` in pixels
    """
    (x1, y1), (x2, y2) = coords
    # x runs along the horizontal axis and therefore scales with the width,
    # y with the height (the two factors used to be swapped).
    top_left = (floor(x1 * width), floor(y1 * height))
    bottom_right = (ceil(x2 * width), ceil(y2 * height))
    return top_left, bottom_right
48+
@staticmethod
def create_bbox(new_view: View,
                coordinates: Tuple[Tuple[int, int], Tuple[int, int]],
                timepoint_ann: Annotation, text_ann: Annotation) -> None:
    """
    Add a BoundingBox labelled "text" to ``new_view`` and align it with the
    annotations it belongs to.

    Two Alignment annotations are created, in this order, both with the new
    bounding box as ``target``: one whose ``source`` is ``timepoint_ann`` and
    one whose ``source`` is ``text_ann``.

    :param new_view: view that receives the BoundingBox and both Alignments
    :param coordinates: pair of corner points stored as the box coordinates
    :param timepoint_ann: annotation whose id becomes the first alignment source
    :param text_ann: annotation whose id becomes the second alignment source
    """
    bbox_ann = new_view.new_annotation(AnnotationTypes.BoundingBox, coordinates=coordinates, label="text")
    for source_ann in (timepoint_ann, text_ann):
        new_view.new_annotation(AnnotationTypes.Alignment, source=source_ann.id, target=bbox_ann.id)
9056
def process_timepoint(self, representative: Annotation, new_view: View, video_doc: Document):
    """
    Run OCR on the video frame of one time point and record the result in
    ``new_view``.

    The frame is extracted from ``video_doc`` and passed to ``self.reader``.
    When the rendered text is non-empty, a TextDocument holding it is created
    and aligned with ``representative``. Then, for every block / line / word
    of the first result page, a PARAGRAPH / SENTENCE / TOKEN annotation and a
    matching bounding box are added. Paragraphs list their sentences and
    sentences list their tokens in a ``targets`` property; tokens carry
    ``start`` / ``end`` character offsets into the rendered text.

    :param representative: annotation providing ``timePoint`` and ``timeUnit``
    :param new_view: view that receives all new annotations
    :param video_doc: video document to read the frame from; its ``fps``
                      property is used for the frame conversion
    :return: ``(timePoint, rendered_text)``; the text is ``None`` when the
             OCR result renders to an empty string (nothing is added then)
    """
    time_point = representative.get("timePoint")
    rep_frame_index = vdh.convert(time_point,
                                  representative.get("timeUnit"), "frame",
                                  video_doc.get("fps"))
    image: np.ndarray = vdh.extract_frames_as_images(video_doc, [rep_frame_index], as_PIL=False)[0]
    h, w = image.shape[:2]
    result = self.reader([image])
    text_content = result.render()
    if not text_content:
        return time_point, None
    # reuse the string rendered above instead of rendering the result twice
    text_document: Document = new_view.new_textdocument(text_content)
    td_id = text_document.id
    new_view.new_annotation(AnnotationTypes.Alignment, source=representative.id, target=td_id)

    # Search position in text_content; it only moves forward so that a
    # repeated word is matched at its next occurrence, not its first one.
    cursor = 0
    # assume only one page, as we are passing one image at a time
    for block in result.pages[0].blocks:
        para_ann = new_view.new_annotation(Uri.PARAGRAPH, document=td_id, text=block.render())
        self.create_bbox(new_view, self.rel_coords_to_abs(block.geometry, w, h), representative, para_ann)
        target_sents = []

        for line in block.lines:
            sent_ann = new_view.new_annotation(Uri.SENTENCE, document=td_id, text=line.render())
            target_sents.append(sent_ann.id)
            self.create_bbox(new_view, self.rel_coords_to_abs(line.geometry, w, h), representative, sent_ann)
            target_tokens = []

            for word in line.words:
                start = text_content.find(word.value, cursor)
                if start == -1:
                    # str.find reports a miss as -1. Using that value would give
                    # a negative start offset and rewind the cursor for every
                    # following word, so fall back to an empty span at the
                    # current position and leave the cursor untouched.
                    self.logger.warning(f'Word "{word.value}" not found in rendered text after offset {cursor}')
                    start = end = cursor
                else:
                    end = start + len(word.value)
                    cursor = end
                token_ann = new_view.new_annotation(Uri.TOKEN, document=td_id, start=start, end=end, text=word.value)
                target_tokens.append(token_ann.id)
                self.create_bbox(new_view, self.rel_coords_to_abs(word.geometry, w, h), representative, token_ann)
            sent_ann.add_property("targets", target_tokens)
        para_ann.add_property("targets", target_sents)

    return time_point, text_content
14194
14295 def _annotate (self , mmif : Mmif , ** parameters ) -> Mmif :
14396 if self .gpu :
@@ -149,6 +102,12 @@ def _annotate(self, mmif: Mmif, **parameters) -> Mmif:
149102
150103 new_view : View = mmif .new_view ()
151104 self .sign_view (new_view , parameters )
105+ new_view .new_contain (DocumentTypes .TextDocument )
106+ new_view .new_contain (AnnotationTypes .BoundingBox )
107+ new_view .new_contain (AnnotationTypes .Alignment )
108+ new_view .new_contain (Uri .PARAGRAPH )
109+ new_view .new_contain (Uri .SENTENCE )
110+ new_view .new_contain (Uri .TOKEN )
152111
153112 with ThreadPoolExecutor () as executor :
154113 futures = []
@@ -163,13 +122,8 @@ def _annotate(self, mmif: Mmif, **parameters) -> Mmif:
163122 pass
164123
165124 for future in futures :
166- try :
167- text_document , representative = future .result ()
168- self .logger .debug (text_document .get ('text' ))
169- create_alignment (new_view , representative .id , text_document .id )
170- except Exception as e :
171- self .logger .error (f"Error processing timeframe: { e } " )
172- continue
125+ timestemp , text_content = future .result ()
126+ self .logger .debug (f'Processed timepoint: { timestemp } , recognized text: "{ json .dumps (text_content )} "' )
173127
174128 return mmif
175129
0 commit comments