Skip to content

Commit c631b93

Browse files
committed
converting relative geometry from docTR into abs pixel coordinates
1 parent b0aedb9 commit c631b93

File tree

1 file changed

+24
-7
lines changed

1 file changed

+24
-7
lines changed

Diff for: app.py

+24-7
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import argparse
66
import logging
77
from concurrent.futures import ThreadPoolExecutor
8+
from math import floor, ceil
89

910
import numpy as np
1011
import torch
@@ -19,6 +20,17 @@
1920
# Non-NLP Clams applications will require AnnotationTypes
2021

2122

23+
def rel_coords_to_abs(coords, width, height):
    """
    Convert a rectangle given in relative coordinates into absolute pixel coordinates.

    The rectangle is assumed to be described by two corner points, top-left
    first and bottom-right second, each expressed as a fraction of the image
    size.

    :param coords: ``((x1, y1), (x2, y2))`` in relative units
    :param width: image width in pixels; scales the x values
    :param height: image height in pixels; scales the y values
    :return: ``[(x1, y1), (x2, y2)]`` as integers. The top-left corner is
        rounded down and the bottom-right corner is rounded up, so the pixel
        box fully covers the relative box.
    """
    x1, y1 = coords[0]
    x2, y2 = coords[1]
    # x runs along the horizontal axis and therefore scales with the width;
    # y runs along the vertical axis and scales with the height.
    return [(floor(x1 * width), floor(y1 * height)), (ceil(x2 * width), ceil(y2 * height))]
32+
33+
2234
def create_bbox(view: View, coordinates, box_type, time_point):
2335
bbox = view.new_annotation(AnnotationTypes.BoundingBox)
2436
bbox.add_property("coordinates", coordinates)
@@ -82,43 +94,45 @@ def process_timepoint(self, representative: Annotation, new_view: View, video_do
8294
video_doc.get("fps"))
8395
image: np.ndarray = vdh.extract_frames_as_images(video_doc, [rep_frame_index], as_PIL=False)[0]
8496
result = self.reader([image])
97+
# assume only one page, as we are passing one image at a time
8598
blocks = result.pages[0].blocks
8699
text_document: Document = new_view.new_textdocument(result.render())
87100

101+
h, w = image.shape[:2]
88102
for block in blocks:
89103
try:
90-
self.process_block(block, new_view, text_document, representative)
104+
self.process_block(block, new_view, text_document, representative, w, h)
91105
except Exception as e:
92106
self.logger.error(f"Error processing block: {e}")
93107
continue
94108

95109
return text_document, representative
96110

97-
def process_block(self, block, view, text_document, representative):
111+
def process_block(self, block, view, text_document, representative, img_width, img_height):
    """
    Annotate one OCR block as a paragraph and attach its lines as sentences.

    A PARAGRAPH annotation is created in ``view`` and wrapped in ``LingUnit``,
    a bounding box is created from the block geometry converted to pixel
    coordinates, and the two are aligned. Each line of the block is then
    handed to ``process_line``; a line that raises is logged and skipped so
    the remaining lines are still processed.

    :param block: OCR block exposing ``geometry`` and ``lines``
    :param view: view that receives the new annotations
    :param text_document: text document the paragraph is tied to
    :param representative: annotation whose ``id`` is recorded on the bounding box
    :param img_width: image width in pixels, used for coordinate conversion
    :param img_height: image height in pixels, used for coordinate conversion
    """
    paragraph = self.LingUnit(view.new_annotation(at_type=Uri.PARAGRAPH), text_document)
    abs_box = rel_coords_to_abs(block.geometry, img_width, img_height)
    paragraph_bb = create_bbox(view, abs_box, "text", representative.id)
    create_alignment(view, paragraph.region.id, paragraph_bb.id)

    for line in block.lines:
        try:
            sentence = self.process_line(line, view, text_document, representative, img_width, img_height)
        except Exception as err:
            # Best effort: one bad line must not drop the rest of the block.
            self.logger.error(f"Error processing line: {err}")
        else:
            paragraph.add_child(sentence)
    paragraph.collect_targets()
110124

111-
def process_line(self, line, view, text_document, representative):
125+
def process_line(self, line, view, text_document, representative, img_width, img_height):
112126
sentence = self.LingUnit(view.new_annotation(at_type=Uri.SENTENCE), text_document)
113-
sentence_bb = create_bbox(view, line.geometry, "text", representative.id)
127+
sentence_bb = create_bbox(view, rel_coords_to_abs(line.geometry, img_width, img_height), "text", representative.id)
114128
create_alignment(view, sentence.region.id, sentence_bb.id)
115129

116130
for word in line.words:
117131
if word.confidence > 0.4:
118132
start = text_document.text_value.find(word.value)
119133
end = start + len(word.value)
120134
token = self.Token(view.new_annotation(at_type=Uri.TOKEN), text_document, start, end)
121-
token_bb = create_bbox(view, word.geometry, "text", representative.id)
135+
token_bb = create_bbox(view, rel_coords_to_abs(word.geometry, img_width, img_height), "text", representative.id)
122136
create_alignment(view, token.region.id, token_bb.id)
123137
sentence.add_child(token)
124138

@@ -144,6 +158,9 @@ def _annotate(self, mmif: Mmif, **parameters) -> Mmif:
144158
rep_id = f'{input_view.id}{Mmif.id_delimiter}{rep_id}'
145159
representative = mmif[rep_id]
146160
futures.append(executor.submit(self.process_timepoint, representative, new_view, video_doc))
161+
if len(futures) == 0:
162+
# TODO (krim @ 4/18/24): if "representatives" is not present, process just the middle frame
163+
pass
147164

148165
for future in futures:
149166
try:

0 commit comments

Comments
 (0)