4
4
5
5
import argparse
6
6
import logging
7
- from typing import Union
7
+ from concurrent .futures import ThreadPoolExecutor
8
+ from math import floor , ceil
8
9
10
+ import numpy as np
11
+ import torch
12
+ from clams import ClamsApp , Restifier
13
+ from doctr .models import ocr_predictor
9
14
from lapps .discriminators import Uri
15
+ from mmif import Mmif , View , Annotation , Document , AnnotationTypes , DocumentTypes
16
+ from mmif .utils import video_document_helper as vdh
10
17
11
- from concurrent .futures import ThreadPoolExecutor
12
18
13
19
# Imports needed for Clams and MMIF.
14
20
# Non-NLP Clams applications will require AnnotationTypes
15
21
16
- from clams import ClamsApp , Restifier
17
- from mmif import Mmif , View , Annotation , Document , AnnotationTypes , DocumentTypes
18
- from mmif .utils import video_document_helper as vdh
19
-
20
- from doctr .models import ocr_predictor
21
- import torch
22
- import numpy as np
23
22
23
def rel_coords_to_abs(coords, width, height):
    """
    Convert relative (fractional) rectangle coordinates to absolute pixel coordinates.

    :param coords: ((x1, y1), (x2, y2)) relative corner points, where x is the
        horizontal fraction of the image width and y the vertical fraction of the
        image height (DocTR geometry convention, top-left and bottom-right corners).
    :param width: image width in pixels
    :param height: image height in pixels
    :return: [(x1, y1), (x2, y2)] in pixels, flooring the top-left corner and
        ceiling the bottom-right corner so the absolute box never shrinks the region.
    """
    x1, y1 = coords[0]
    x2, y2 = coords[1]
    # Fix: x (horizontal) must scale by width and y (vertical) by height; the
    # previous code multiplied x by height and y by width, which silently swapped
    # the axes and produced wrong boxes on any non-square frame.
    return [(floor(x1 * width), floor(y1 * height)), (ceil(x2 * width), ceil(y2 * height))]
32
+
24
33
25
34
def create_bbox(view: View, coordinates, box_type, time_point):
    """
    Add a BoundingBox annotation to *view* and return it.

    The box carries its pixel coordinates, a "label" describing the kind of
    region (e.g. "text"), and the id of the TimePoint it was extracted from.
    """
    box = view.new_annotation(AnnotationTypes.BoundingBox)
    # property order mirrors the original: coordinates, label, timePoint
    for prop, value in (("coordinates", coordinates),
                        ("label", box_type),
                        ("timePoint", time_point)):
        box.add_property(prop, value)
    return box
31
40
@@ -50,40 +59,24 @@ def __init__(self):
50
59
self .gpu = False
51
60
52
61
def _appmetadata(self):
    """Intentionally a no-op: app metadata is supplied by ``metadata.py``."""
    pass
57
-
58
class LingUnit(object):
    """
    A thin wrapper for LAPPS linguistic unit annotations (paragraphs, sentences)
    that represent different geometric levels from DocTR OCR output.

    Wraps a region annotation, binds it to its source text document, and
    accumulates child units so their ids can later be recorded as "targets".
    """

    def __init__(self, region: Annotation, document: Document):
        self.region = region
        self.region.add_property("document", document.id)
        # child units (sentences under a paragraph, tokens under a sentence)
        self.children = []

    def add_child(self, sentence):
        # append keeps insertion order, which collect_targets relies on
        self.children.append(sentence)

    def collect_targets(self):
        # record the ids of all collected children on the wrapped region
        child_ids = [unit.region.id for unit in self.children]
        self.region.add_property("targets", child_ids)
87
80
88
81
class Token :
89
82
"""
@@ -96,72 +89,78 @@ def __init__(self, region: Annotation, document: Document, start: int, end: int)
96
89
self .region .add_property ("end" , end )
97
90
98
91
def process_timepoint(self, representative: Annotation, new_view: View, video_doc: Document):
    """
    OCR the single video frame identified by a representative TimePoint and
    record the recognized text as a new TextDocument in *new_view*.

    Returns the (text_document, representative) pair so the caller can align
    the produced document with its source time point.
    """
    # Translate the timestamp into a frame index, honoring the annotation's
    # own time unit and the video document's fps property.
    frame_idx = vdh.convert(representative.get("timePoint"),
                            representative.get("timeUnit"), "frame",
                            video_doc.get("fps"))
    frame: np.ndarray = vdh.extract_frames_as_images(video_doc, [frame_idx], as_PIL=False)[0]
    ocr_result = self.reader([frame])
    # one image in -> one page out, so only pages[0] is relevant
    page_blocks = ocr_result.pages[0].blocks
    text_document: Document = new_view.new_textdocument(ocr_result.render())

    frame_h, frame_w = frame.shape[:2]
    for ocr_block in page_blocks:
        try:
            self.process_block(ocr_block, new_view, text_document, representative, frame_w, frame_h)
        except Exception as e:
            # a bad block should not abort the whole frame
            self.logger.error(f"Error processing block: {e}")
            continue

    return text_document, representative
114
110
115
def process_block(self, block, view, text_document, representative, img_width, img_height):
    """
    Map one DocTR block to a LAPPS Paragraph annotation, give it a pixel-space
    bounding box aligned to the source time point, and recurse into its lines.
    """
    paragraph = self.LingUnit(view.new_annotation(at_type=Uri.PARAGRAPH), text_document)
    abs_coords = rel_coords_to_abs(block.geometry, img_width, img_height)
    block_box = create_bbox(view, abs_coords, "text", representative.id)
    create_alignment(view, paragraph.region.id, block_box.id)

    for ocr_line in block.lines:
        try:
            sent = self.process_line(ocr_line, view, text_document, representative, img_width, img_height)
        except Exception as e:
            # skip just this line; the rest of the block is still processed
            self.logger.error(f"Error processing line: {e}")
            continue
        paragraph.add_child(sent)
    paragraph.collect_targets()
128
124
129
def process_line(self, line, view, text_document, representative, img_width, img_height):
    """
    Map one DocTR line to a LAPPS Sentence annotation with an aligned pixel-space
    bounding box, then create Token annotations for its confident words.

    Returns the wrapped Sentence unit (targets already collected).
    """
    sentence = self.LingUnit(view.new_annotation(at_type=Uri.SENTENCE), text_document)
    line_box = create_bbox(view, rel_coords_to_abs(line.geometry, img_width, img_height), "text", representative.id)
    create_alignment(view, sentence.region.id, line_box.id)

    for word in line.words:
        # guard clause: drop low-confidence recognitions entirely
        if word.confidence <= 0.4:
            continue
        # NOTE(review): str.find only locates the FIRST occurrence of the word in
        # the rendered document, so repeated words all share one offset and a miss
        # yields start == -1; confirm downstream consumers tolerate this.
        start = text_document.text_value.find(word.value)
        end = start + len(word.value)
        token = self.Token(view.new_annotation(at_type=Uri.TOKEN), text_document, start, end)
        word_box = create_bbox(view, rel_coords_to_abs(word.geometry, img_width, img_height), "text", representative.id)
        create_alignment(view, token.region.id, word_box.id)
        sentence.add_child(token)

    sentence.collect_targets()
    return sentence
145
141
146
- def _annotate (self , mmif : Union [ str , dict , Mmif ] , ** parameters ) -> Mmif :
142
+ def _annotate (self , mmif : Mmif , ** parameters ) -> Mmif :
147
143
if self .gpu :
148
144
self .logger .debug ("running app on GPU" )
149
145
else :
150
146
self .logger .debug ("running app on CPU" )
151
147
video_doc : Document = mmif .get_documents_by_type (DocumentTypes .VideoDocument )[0 ]
152
- input_view : View = mmif .get_views_for_document (video_doc .properties .id )[0 ]
148
+ input_view : View = mmif .get_views_for_document (video_doc .properties .id )[- 1 ]
153
149
154
150
new_view : View = mmif .new_view ()
155
151
self .sign_view (new_view , parameters )
156
152
157
153
with ThreadPoolExecutor () as executor :
158
154
futures = []
159
155
for timeframe in input_view .get_annotations (AnnotationTypes .TimeFrame ):
160
- representative_ids = timeframe .get ("representatives" )
161
- representatives = [
162
- input_view . get_annotation_by_id ( representative_id ) for representative_id in representative_ids ]
163
- for representative in representatives :
156
+ for rep_id in timeframe .get ("representatives" ):
157
+ if Mmif . id_delimiter not in rep_id :
158
+ rep_id = f' { input_view . id } { Mmif . id_delimiter } { rep_id } '
159
+ representative = mmif [ rep_id ]
164
160
futures .append (executor .submit (self .process_timepoint , representative , new_view , video_doc ))
161
+ if len (futures ) == 0 :
162
+ # TODO (krim @ 4/18/24): if "representatives" is not present, process just the middle frame
163
+ pass
165
164
166
165
for future in futures :
167
166
try :
0 commit comments