3
3
"""
4
4
5
5
import argparse
6
+ import json
6
7
import logging
7
8
from concurrent .futures import ThreadPoolExecutor
8
9
from math import floor , ceil
10
+ from typing import Tuple
9
11
10
12
import numpy as np
11
13
import torch
16
18
from mmif .utils import video_document_helper as vdh
17
19
18
20
19
- # Imports needed for Clams and MMIF.
20
- # Non-NLP Clams applications will require AnnotationTypes
21
-
22
-
23
- def rel_coords_to_abs (coords , width , height ):
24
- """
25
- Simple conversion from relative coordinates (percentage) to absolute coordinates (pixel).
26
- Assumes the passed shape is a rectangle, represented by top-left and bottom-right corners,
27
- and compute floor and ceiling based on the geometry.
28
- """
29
- x1 , y1 = coords [0 ]
30
- x2 , y2 = coords [1 ]
31
- return [(floor (x1 * height ), floor (y1 * width )), (ceil (x2 * height ), ceil (y2 * width ))]
32
-
33
-
34
- def create_bbox (view : View , coordinates , box_type , time_point ):
35
- bbox = view .new_annotation (AnnotationTypes .BoundingBox )
36
- bbox .add_property ("coordinates" , coordinates )
37
- bbox .add_property ("label" , box_type )
38
- bbox .add_property ("timePoint" , time_point )
39
- return bbox
40
-
41
-
42
- def create_alignment (view : View , source , target ) -> None :
43
- alignment = view .new_annotation (AnnotationTypes .Alignment )
44
- alignment .add_property ("source" , source )
45
- alignment .add_property ("target" , target )
46
-
47
-
48
21
class DoctrWrapper (ClamsApp ):
49
22
50
23
def __init__ (self ):
@@ -61,83 +34,63 @@ def __init__(self):
61
34
    def _appmetadata(self):
        # App metadata is generated by the separate metadata.py module, so this
        # ClamsApp-interface hook intentionally does nothing.
        # using metadata.py
        pass
64
-
65
- class LingUnit (object ):
66
- """
67
- A thin wrapper for LAPPS linguistic unit annotations that
68
- represent different geometric levels from DocTR OCR output.
69
- """
70
- def __init__ (self , region : Annotation , document : Document ):
71
- self .region = region
72
- self .region .add_property ("document" , document .id )
73
- self .children = []
74
-
75
- def add_child (self , sentence ):
76
- self .children .append (sentence )
77
-
78
- def collect_targets (self ):
79
- self .region .add_property ("targets" , [child .region .id for child in self .children ])
80
37
81
- class Token :
38
+ @staticmethod
39
+ def rel_coords_to_abs (coords : Tuple [Tuple [float , float ]], width : int , height : int ) -> Tuple [Tuple [int , int ]]:
82
40
"""
83
- Span annotation corresponding to a DocTR Word object. Start and end are character offsets in the text document.
41
+ Simple conversion from relative coordinates (percentage) to absolute coordinates (pixel).
42
+ Assumes the passed shape is a rectangle, represented by top-left and bottom-right corners,
43
+ and compute floor and ceiling based on the geometry.
84
44
"""
85
- def __init__ (self , region : Annotation , document : Document , start : int , end : int ):
86
- self .region = region
87
- self .region .add_property ("document" , document .id )
88
- self .region .add_property ("start" , start )
89
- self .region .add_property ("end" , end )
45
+ x1 , y1 = coords [0 ]
46
+ x2 , y2 = coords [1 ]
47
+ return (floor (x1 * height ), floor (y1 * width )), (ceil (x2 * height ), ceil (y2 * width ))
48
+
49
+ @staticmethod
50
+ def create_bbox (new_view : View ,
51
+ coordinates : Tuple [Tuple [int , int ]],
52
+ timepoint_ann : Annotation , text_ann : Annotation ):
53
+ bbox_ann = new_view .new_annotation (AnnotationTypes .BoundingBox , coordinates = coordinates , label = "text" )
54
+ new_view .new_annotation (AnnotationTypes .Alignment , source = timepoint_ann .id , target = bbox_ann .id )
55
+ new_view .new_annotation (AnnotationTypes .Alignment , source = text_ann .id , target = bbox_ann .id )
90
56
91
57
def process_timepoint (self , representative : Annotation , new_view : View , video_doc : Document ):
92
- rep_frame_index = vdh .convert (representative .get ("timePoint" ),
58
+ rep_frame_index = vdh .convert (representative .get ("timePoint" ),
93
59
representative .get ("timeUnit" ), "frame" ,
94
60
video_doc .get ("fps" ))
95
61
image : np .ndarray = vdh .extract_frames_as_images (video_doc , [rep_frame_index ], as_PIL = False )[0 ]
62
+ h , w = image .shape [:2 ]
96
63
result = self .reader ([image ])
97
64
# assume only one page, as we are passing one image at a time
98
- blocks = result .pages [0 ].blocks
65
+ text_content = result .render ()
66
+ if not text_content :
67
+ return representative .get ('timePoint' ), None
99
68
text_document : Document = new_view .new_textdocument (result .render ())
100
-
101
- h , w = image .shape [:2 ]
102
- for block in blocks :
103
- try :
104
- self .process_block (block , new_view , text_document , representative , w , h )
105
- except Exception as e :
106
- self .logger .error (f"Error processing block: { e } " )
107
- continue
108
-
109
- return text_document , representative
110
-
111
- def process_block (self , block , view , text_document , representative , img_width , img_height ):
112
- paragraph = self .LingUnit (view .new_annotation (at_type = Uri .PARAGRAPH ), text_document )
113
- paragraph_bb = create_bbox (view , rel_coords_to_abs (block .geometry , img_width , img_height ), "text" , representative .id )
114
- create_alignment (view , paragraph .region .id , paragraph_bb .id )
115
-
116
- for line in block .lines :
117
- try :
118
- sentence = self .process_line (line , view , text_document , representative , img_width , img_height )
119
- except Exception as e :
120
- self .logger .error (f"Error processing line: { e } " )
121
- continue
122
- paragraph .add_child (sentence )
123
- paragraph .collect_targets ()
124
-
125
- def process_line (self , line , view , text_document , representative , img_width , img_height ):
126
- sentence = self .LingUnit (view .new_annotation (at_type = Uri .SENTENCE ), text_document )
127
- sentence_bb = create_bbox (view , rel_coords_to_abs (line .geometry , img_width , img_height ), "text" , representative .id )
128
- create_alignment (view , sentence .region .id , sentence_bb .id )
129
-
130
- for word in line .words :
131
- if word .confidence > 0.4 :
132
- start = text_document .text_value .find (word .value )
133
- end = start + len (word .value )
134
- token = self .Token (view .new_annotation (at_type = Uri .TOKEN ), text_document , start , end )
135
- token_bb = create_bbox (view , rel_coords_to_abs (word .geometry , img_width , img_height ), "text" , representative .id )
136
- create_alignment (view , token .region .id , token_bb .id )
137
- sentence .add_child (token )
138
-
139
- sentence .collect_targets ()
140
- return sentence
69
+ td_id = text_document .id
70
+ new_view .new_annotation (AnnotationTypes .Alignment , source = representative .id , target = td_id )
71
+
72
+ e = 0
73
+ for block in result .pages [0 ].blocks :
74
+ para_ann = new_view .new_annotation (Uri .PARAGRAPH , document = td_id , text = block .render ())
75
+ self .create_bbox (new_view , self .rel_coords_to_abs (block .geometry , w , h ), representative , para_ann )
76
+ target_sents = []
77
+
78
+ for line in block .lines :
79
+ sent_ann = new_view .new_annotation (Uri .SENTENCE , document = td_id , text = line .render ())
80
+ target_sents .append (sent_ann .id )
81
+ self .create_bbox (new_view , self .rel_coords_to_abs (line .geometry , w , h ), representative , sent_ann )
82
+ target_tokens = []
83
+
84
+ for word in line .words :
85
+ s = text_content .find (word .value , e )
86
+ e = s + len (word .value )
87
+ token_ann = new_view .new_annotation (Uri .TOKEN , document = td_id , start = s , end = e , text = word .value )
88
+ target_tokens .append (token_ann .id )
89
+ self .create_bbox (new_view , self .rel_coords_to_abs (word .geometry , w , h ), representative , token_ann )
90
+ sent_ann .add_property ("targets" , target_tokens )
91
+ para_ann .add_property ("targets" , target_sents )
92
+
93
+ return representative .get ('timePoint' ), text_content
141
94
142
95
def _annotate (self , mmif : Mmif , ** parameters ) -> Mmif :
143
96
if self .gpu :
@@ -149,6 +102,12 @@ def _annotate(self, mmif: Mmif, **parameters) -> Mmif:
149
102
150
103
new_view : View = mmif .new_view ()
151
104
self .sign_view (new_view , parameters )
105
+ new_view .new_contain (DocumentTypes .TextDocument )
106
+ new_view .new_contain (AnnotationTypes .BoundingBox )
107
+ new_view .new_contain (AnnotationTypes .Alignment )
108
+ new_view .new_contain (Uri .PARAGRAPH )
109
+ new_view .new_contain (Uri .SENTENCE )
110
+ new_view .new_contain (Uri .TOKEN )
152
111
153
112
with ThreadPoolExecutor () as executor :
154
113
futures = []
@@ -163,13 +122,8 @@ def _annotate(self, mmif: Mmif, **parameters) -> Mmif:
163
122
pass
164
123
165
124
for future in futures :
166
- try :
167
- text_document , representative = future .result ()
168
- self .logger .debug (text_document .get ('text' ))
169
- create_alignment (new_view , representative .id , text_document .id )
170
- except Exception as e :
171
- self .logger .error (f"Error processing timeframe: { e } " )
172
- continue
125
+ timestemp , text_content = future .result ()
126
+ self .logger .debug (f'Processed timepoint: { timestemp } , recognized text: "{ json .dumps (text_content )} "' )
173
127
174
128
return mmif
175
129
0 commit comments