Skip to content

Commit b1fa2ca

Browse files
committed
major refactoring and fixes for #2
1 parent d056fcd commit b1fa2ca

File tree

1 file changed

+56
-102
lines changed

1 file changed

+56
-102
lines changed

app.py

+56-102
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@
33
"""
44

55
import argparse
6+
import json
67
import logging
78
from concurrent.futures import ThreadPoolExecutor
89
from math import floor, ceil
10+
from typing import Tuple
911

1012
import numpy as np
1113
import torch
@@ -16,35 +18,6 @@
1618
from mmif.utils import video_document_helper as vdh
1719

1820

19-
# Imports needed for Clams and MMIF.
20-
# Non-NLP Clams applications will require AnnotationTypes
21-
22-
23-
def rel_coords_to_abs(coords, width, height):
24-
"""
25-
Simple conversion from relative coordinates (percentage) to absolute coordinates (pixel).
26-
Assumes the passed shape is a rectangle, represented by top-left and bottom-right corners,
27-
and compute floor and ceiling based on the geometry.
28-
"""
29-
x1, y1 = coords[0]
30-
x2, y2 = coords[1]
31-
return [(floor(x1 * height), floor(y1 * width)), (ceil(x2 * height), ceil(y2 * width))]
32-
33-
34-
def create_bbox(view: View, coordinates, box_type, time_point):
35-
bbox = view.new_annotation(AnnotationTypes.BoundingBox)
36-
bbox.add_property("coordinates", coordinates)
37-
bbox.add_property("label", box_type)
38-
bbox.add_property("timePoint", time_point)
39-
return bbox
40-
41-
42-
def create_alignment(view: View, source, target) -> None:
43-
alignment = view.new_annotation(AnnotationTypes.Alignment)
44-
alignment.add_property("source", source)
45-
alignment.add_property("target", target)
46-
47-
4821
class DoctrWrapper(ClamsApp):
4922

5023
def __init__(self):
@@ -61,83 +34,63 @@ def __init__(self):
6134
def _appmetadata(self):
6235
# using metadata.py
6336
pass
64-
65-
class LingUnit(object):
66-
"""
67-
A thin wrapper for LAPPS linguistic unit annotations that
68-
represent different geometric levels from DocTR OCR output.
69-
"""
70-
def __init__(self, region: Annotation, document: Document):
71-
self.region = region
72-
self.region.add_property("document", document.id)
73-
self.children = []
74-
75-
def add_child(self, sentence):
76-
self.children.append(sentence)
77-
78-
def collect_targets(self):
79-
self.region.add_property("targets", [child.region.id for child in self.children])
8037

81-
class Token:
38+
@staticmethod
39+
def rel_coords_to_abs(coords: Tuple[Tuple[float, float]], width: int, height: int) -> Tuple[Tuple[int, int]]:
8240
"""
83-
Span annotation corresponding to a DocTR Word object. Start and end are character offsets in the text document.
41+
Simple conversion from relative coordinates (percentage) to absolute coordinates (pixel).
42+
Assumes the passed shape is a rectangle, represented by top-left and bottom-right corners,
43+
and computes floor and ceiling based on the geometry.
8444
"""
85-
def __init__(self, region: Annotation, document: Document, start: int, end: int):
86-
self.region = region
87-
self.region.add_property("document", document.id)
88-
self.region.add_property("start", start)
89-
self.region.add_property("end", end)
45+
x1, y1 = coords[0]
46+
x2, y2 = coords[1]
47+
return (floor(x1 * height), floor(y1 * width)), (ceil(x2 * height), ceil(y2 * width))
48+
49+
@staticmethod
50+
def create_bbox(new_view: View,
51+
coordinates: Tuple[Tuple[int, int]],
52+
timepoint_ann: Annotation, text_ann: Annotation):
53+
bbox_ann = new_view.new_annotation(AnnotationTypes.BoundingBox, coordinates=coordinates, label="text")
54+
new_view.new_annotation(AnnotationTypes.Alignment, source=timepoint_ann.id, target=bbox_ann.id)
55+
new_view.new_annotation(AnnotationTypes.Alignment, source=text_ann.id, target=bbox_ann.id)
9056

9157
def process_timepoint(self, representative: Annotation, new_view: View, video_doc: Document):
92-
rep_frame_index = vdh.convert(representative.get("timePoint"),
58+
rep_frame_index = vdh.convert(representative.get("timePoint"),
9359
representative.get("timeUnit"), "frame",
9460
video_doc.get("fps"))
9561
image: np.ndarray = vdh.extract_frames_as_images(video_doc, [rep_frame_index], as_PIL=False)[0]
62+
h, w = image.shape[:2]
9663
result = self.reader([image])
9764
# assume only one page, as we are passing one image at a time
98-
blocks = result.pages[0].blocks
65+
text_content = result.render()
66+
if not text_content:
67+
return representative.get('timePoint'), None
9968
text_document: Document = new_view.new_textdocument(result.render())
100-
101-
h, w = image.shape[:2]
102-
for block in blocks:
103-
try:
104-
self.process_block(block, new_view, text_document, representative, w, h)
105-
except Exception as e:
106-
self.logger.error(f"Error processing block: {e}")
107-
continue
108-
109-
return text_document, representative
110-
111-
def process_block(self, block, view, text_document, representative, img_width, img_height):
112-
paragraph = self.LingUnit(view.new_annotation(at_type=Uri.PARAGRAPH), text_document)
113-
paragraph_bb = create_bbox(view, rel_coords_to_abs(block.geometry, img_width, img_height), "text", representative.id)
114-
create_alignment(view, paragraph.region.id, paragraph_bb.id)
115-
116-
for line in block.lines:
117-
try:
118-
sentence = self.process_line(line, view, text_document, representative, img_width, img_height)
119-
except Exception as e:
120-
self.logger.error(f"Error processing line: {e}")
121-
continue
122-
paragraph.add_child(sentence)
123-
paragraph.collect_targets()
124-
125-
def process_line(self, line, view, text_document, representative, img_width, img_height):
126-
sentence = self.LingUnit(view.new_annotation(at_type=Uri.SENTENCE), text_document)
127-
sentence_bb = create_bbox(view, rel_coords_to_abs(line.geometry, img_width, img_height), "text", representative.id)
128-
create_alignment(view, sentence.region.id, sentence_bb.id)
129-
130-
for word in line.words:
131-
if word.confidence > 0.4:
132-
start = text_document.text_value.find(word.value)
133-
end = start + len(word.value)
134-
token = self.Token(view.new_annotation(at_type=Uri.TOKEN), text_document, start, end)
135-
token_bb = create_bbox(view, rel_coords_to_abs(word.geometry, img_width, img_height), "text", representative.id)
136-
create_alignment(view, token.region.id, token_bb.id)
137-
sentence.add_child(token)
138-
139-
sentence.collect_targets()
140-
return sentence
69+
td_id = text_document.id
70+
new_view.new_annotation(AnnotationTypes.Alignment, source=representative.id, target=td_id)
71+
72+
e = 0
73+
for block in result.pages[0].blocks:
74+
para_ann = new_view.new_annotation(Uri.PARAGRAPH, document=td_id, text=block.render())
75+
self.create_bbox(new_view, self.rel_coords_to_abs(block.geometry, w, h), representative, para_ann)
76+
target_sents = []
77+
78+
for line in block.lines:
79+
sent_ann = new_view.new_annotation(Uri.SENTENCE, document=td_id, text=line.render())
80+
target_sents.append(sent_ann.id)
81+
self.create_bbox(new_view, self.rel_coords_to_abs(line.geometry, w, h), representative, sent_ann)
82+
target_tokens = []
83+
84+
for word in line.words:
85+
s = text_content.find(word.value, e)
86+
e = s + len(word.value)
87+
token_ann = new_view.new_annotation(Uri.TOKEN, document=td_id, start=s, end=e, text=word.value)
88+
target_tokens.append(token_ann.id)
89+
self.create_bbox(new_view, self.rel_coords_to_abs(word.geometry, w, h), representative, token_ann)
90+
sent_ann.add_property("targets", target_tokens)
91+
para_ann.add_property("targets", target_sents)
92+
93+
return representative.get('timePoint'), text_content
14194

14295
def _annotate(self, mmif: Mmif, **parameters) -> Mmif:
14396
if self.gpu:
@@ -149,6 +102,12 @@ def _annotate(self, mmif: Mmif, **parameters) -> Mmif:
149102

150103
new_view: View = mmif.new_view()
151104
self.sign_view(new_view, parameters)
105+
new_view.new_contain(DocumentTypes.TextDocument)
106+
new_view.new_contain(AnnotationTypes.BoundingBox)
107+
new_view.new_contain(AnnotationTypes.Alignment)
108+
new_view.new_contain(Uri.PARAGRAPH)
109+
new_view.new_contain(Uri.SENTENCE)
110+
new_view.new_contain(Uri.TOKEN)
152111

153112
with ThreadPoolExecutor() as executor:
154113
futures = []
@@ -163,13 +122,8 @@ def _annotate(self, mmif: Mmif, **parameters) -> Mmif:
163122
pass
164123

165124
for future in futures:
166-
try:
167-
text_document, representative = future.result()
168-
self.logger.debug(text_document.get('text'))
169-
create_alignment(new_view, representative.id, text_document.id)
170-
except Exception as e:
171-
self.logger.error(f"Error processing timeframe: {e}")
172-
continue
125+
timestamp, text_content = future.result()
126+
self.logger.debug(f'Processed timepoint: {timestamp}, recognized text: "{json.dumps(text_content)}"')
173127

174128
return mmif
175129

0 commit comments

Comments
 (0)