Commit d056fcd

Merge pull request #4 from clamsproject/3-fix-relative-box-coordinates
converting relative geometry from docTR into abs pixel coordinates
2 parents 780430f + c631b93 · commit d056fcd

File tree

3 files changed: +61 −62 lines changed

Containerfile (+1 −1)

```diff
@@ -1,5 +1,5 @@
 # Use the same base image version as the clams-python python library version
-FROM ghcr.io/clamsproject/clams-python:1.0.9
+FROM ghcr.io/clamsproject/clams-python:1.2.0
 # See https://github.com/orgs/clamsproject/packages?tab=packages&q=clams-python for more base images
 # IF you want to automatically publish this image to the clamsproject organization,
 # 1. you should have generated this template without --no-github-actions flag
```

app.py (+53 −54)

```diff
@@ -4,28 +4,37 @@
 
 import argparse
 import logging
-from typing import Union
+from concurrent.futures import ThreadPoolExecutor
+from math import floor, ceil
 
+import numpy as np
+import torch
+from clams import ClamsApp, Restifier
+from doctr.models import ocr_predictor
 from lapps.discriminators import Uri
+from mmif import Mmif, View, Annotation, Document, AnnotationTypes, DocumentTypes
+from mmif.utils import video_document_helper as vdh
 
-from concurrent.futures import ThreadPoolExecutor
 
 # Imports needed for Clams and MMIF.
 # Non-NLP Clams applications will require AnnotationTypes
 
-from clams import ClamsApp, Restifier
-from mmif import Mmif, View, Annotation, Document, AnnotationTypes, DocumentTypes
-from mmif.utils import video_document_helper as vdh
-
-from doctr.models import ocr_predictor
-import torch
-import numpy as np
 
+def rel_coords_to_abs(coords, width, height):
+    """
+    Simple conversion from relative coordinates (percentage) to absolute coordinates (pixel).
+    Assumes the passed shape is a rectangle, represented by top-left and bottom-right corners,
+    and compute floor and ceiling based on the geometry.
+    """
+    x1, y1 = coords[0]
+    x2, y2 = coords[1]
+    return [(floor(x1 * height), floor(y1 * width)), (ceil(x2 * height), ceil(y2 * width))]
+
 
 def create_bbox(view: View, coordinates, box_type, time_point):
     bbox = view.new_annotation(AnnotationTypes.BoundingBox)
     bbox.add_property("coordinates", coordinates)
-    bbox.add_property("boxType", box_type)
+    bbox.add_property("label", box_type)
     bbox.add_property("timePoint", time_point)
     return bbox
 
```
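The new `rel_coords_to_abs` helper is the core of this PR and is small enough to exercise standalone. Below is a minimal sketch of the conversion; the geometry and frame size are made up, and the docTR-style geometry is assumed to be a ((x1, y1), (x2, y2)) pair of 0–1 fractions:

```python
from math import floor, ceil

def rel_coords_to_abs(coords, width, height):
    # As committed: the first component of each point is scaled by height,
    # the second by width; floor/ceil snap the box to whole pixels.
    x1, y1 = coords[0]
    x2, y2 = coords[1]
    return [(floor(x1 * height), floor(y1 * width)), (ceil(x2 * height), ceil(y2 * width))]

# hypothetical relative geometry on a 1280x720 frame
geometry = ((0.1, 0.25), (0.4, 0.5))
print(rel_coords_to_abs(geometry, width=1280, height=720))
# -> [(72, 320), (288, 640)]
```

Flooring the first corner and ceiling the second means the absolute box never shrinks relative to the fractional one, so detected glyphs are not clipped at the box edges.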
```diff
@@ -50,40 +59,24 @@ def __init__(self):
         self.gpu = False
 
     def _appmetadata(self):
-        # see https://sdk.clams.ai/autodoc/clams.app.html#clams.app.ClamsApp._load_appmetadata
-        # Also check out ``metadata.py`` in this directory.
-        # When using the ``metadata.py`` leave this do-nothing "pass" method here.
+        # using metadata.py
         pass
-
-    class Paragraph:
-        """
-        lapps annotation corresponding to a DocTR Block object targeting contained sentences.
-        """
-        def __init__(self, region: Annotation, document: Document):
-            self.region = region
-            self.region.add_property("document", document.id)
-            self.sentences = []
-
-        def add_sentence(self, sentence):
-            self.sentences.append(sentence)
-
-        def collect_targets(self):
-            self.region.add_property("targets", [s.region.id for s in self.sentences])
-
-    class Sentence:
+
+    class LingUnit(object):
         """
-        Span annotation corresponding to a DocTR Line object targeting contained tokens.
+        A thin wrapper for LAPPS linguistic unit annotations that
+        represent different geometric levels from DocTR OCR output.
         """
         def __init__(self, region: Annotation, document: Document):
             self.region = region
             self.region.add_property("document", document.id)
-            self.tokens = []
+            self.children = []
 
-        def add_token(self, token):
-            self.tokens.append(token)
+        def add_child(self, sentence):
+            self.children.append(sentence)
 
         def collect_targets(self):
-            self.region.add_property("targets", [t.region.id for t in self.tokens])
+            self.region.add_property("targets", [child.region.id for child in self.children])
 
     class Token:
         """
```
```diff
@@ -96,72 +89,78 @@ def __init__(self, region: Annotation, document: Document, start: int, end: int)
             self.region.add_property("end", end)
 
     def process_timepoint(self, representative: Annotation, new_view: View, video_doc: Document):
-        rep_frame_index = vdh.convert(representative.get("timePoint"), "milliseconds",
-                                      "frame", vdh.get_framerate(video_doc))
+        rep_frame_index = vdh.convert(representative.get("timePoint"),
+                                      representative.get("timeUnit"), "frame",
+                                      video_doc.get("fps"))
         image: np.ndarray = vdh.extract_frames_as_images(video_doc, [rep_frame_index], as_PIL=False)[0]
         result = self.reader([image])
+        # assume only one page, as we are passing one image at a time
         blocks = result.pages[0].blocks
         text_document: Document = new_view.new_textdocument(result.render())
 
+        h, w = image.shape[:2]
         for block in blocks:
             try:
-                self.process_block(block, new_view, text_document, representative)
+                self.process_block(block, new_view, text_document, representative, w, h)
             except Exception as e:
                 self.logger.error(f"Error processing block: {e}")
                 continue
 
         return text_document, representative
 
-    def process_block(self, block, view, text_document, representative):
-        paragraph = self.Paragraph(view.new_annotation(at_type=Uri.PARAGRAPH), text_document)
-        paragraph_bb = create_bbox(view, block.geometry, "text", representative.id)
+    def process_block(self, block, view, text_document, representative, img_width, img_height):
+        paragraph = self.LingUnit(view.new_annotation(at_type=Uri.PARAGRAPH), text_document)
+        paragraph_bb = create_bbox(view, rel_coords_to_abs(block.geometry, img_width, img_height), "text", representative.id)
         create_alignment(view, paragraph.region.id, paragraph_bb.id)
 
         for line in block.lines:
             try:
-                sentence = self.process_line(line, view, text_document, representative)
+                sentence = self.process_line(line, view, text_document, representative, img_width, img_height)
             except Exception as e:
                 self.logger.error(f"Error processing line: {e}")
                 continue
-            paragraph.add_sentence(sentence)
+            paragraph.add_child(sentence)
         paragraph.collect_targets()
 
-    def process_line(self, line, view, text_document, representative):
-        sentence = self.Sentence(view.new_annotation(at_type=Uri.SENTENCE), text_document)
-        sentence_bb = create_bbox(view, line.geometry, "text", representative.id)
+    def process_line(self, line, view, text_document, representative, img_width, img_height):
+        sentence = self.LingUnit(view.new_annotation(at_type=Uri.SENTENCE), text_document)
+        sentence_bb = create_bbox(view, rel_coords_to_abs(line.geometry, img_width, img_height), "text", representative.id)
         create_alignment(view, sentence.region.id, sentence_bb.id)
 
         for word in line.words:
             if word.confidence > 0.4:
                 start = text_document.text_value.find(word.value)
                 end = start + len(word.value)
                 token = self.Token(view.new_annotation(at_type=Uri.TOKEN), text_document, start, end)
-                token_bb = create_bbox(view, word.geometry, "text", representative.id)
+                token_bb = create_bbox(view, rel_coords_to_abs(word.geometry, img_width, img_height), "text", representative.id)
                 create_alignment(view, token.region.id, token_bb.id)
-                sentence.add_token(token)
+                sentence.add_child(token)
 
         sentence.collect_targets()
         return sentence
 
-    def _annotate(self, mmif: Union[str, dict, Mmif], **parameters) -> Mmif:
+    def _annotate(self, mmif: Mmif, **parameters) -> Mmif:
         if self.gpu:
             self.logger.debug("running app on GPU")
         else:
             self.logger.debug("running app on CPU")
         video_doc: Document = mmif.get_documents_by_type(DocumentTypes.VideoDocument)[0]
-        input_view: View = mmif.get_views_for_document(video_doc.properties.id)[0]
+        input_view: View = mmif.get_views_for_document(video_doc.properties.id)[-1]
 
         new_view: View = mmif.new_view()
         self.sign_view(new_view, parameters)
 
         with ThreadPoolExecutor() as executor:
             futures = []
             for timeframe in input_view.get_annotations(AnnotationTypes.TimeFrame):
-                representative_ids = timeframe.get("representatives")
-                representatives = [
-                    input_view.get_annotation_by_id(representative_id) for representative_id in representative_ids]
-                for representative in representatives:
+                for rep_id in timeframe.get("representatives"):
+                    if Mmif.id_delimiter not in rep_id:
+                        rep_id = f'{input_view.id}{Mmif.id_delimiter}{rep_id}'
+                    representative = mmif[rep_id]
                     futures.append(executor.submit(self.process_timepoint, representative, new_view, video_doc))
+            if len(futures) == 0:
+                # TODO (krim @ 4/18/24): if "representatives" is not present, process just the middle frame
+                pass
 
         for future in futures:
             try:
```
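One more behavioral change in `_annotate`: representative ids listed on a TimeFrame may now arrive in short form, so the code prefixes them with the source view id before looking them up on the Mmif object. A standalone sketch of that normalization (the ":" delimiter is an assumption standing in for `Mmif.id_delimiter`):

```python
ID_DELIMITER = ":"  # assumed value of Mmif.id_delimiter

def normalize_rep_id(rep_id: str, view_id: str) -> str:
    # Short-form ids ("tp_1") get the view id prefixed;
    # long-form ids ("v_0:tp_1") pass through unchanged.
    if ID_DELIMITER not in rep_id:
        rep_id = f"{view_id}{ID_DELIMITER}{rep_id}"
    return rep_id

print(normalize_rep_id("tp_1", "v_0"))      # -> v_0:tp_1
print(normalize_rep_id("v_0:tp_1", "v_0"))  # -> v_0:tp_1
```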

requirements.txt (+7 −7)

```diff
@@ -1,8 +1,8 @@
 # Make sure clams-python version is explicitly specified, at least the lower bound
-clams-python==1.1.3
-mmif-python[cv]==1.0.10
-python-doctr[torch]==0.7.0
-torch~=2.1.2
-numpy~=1.24.4
-Pillow==10.2.0
-lapps~=0.0.2
+clams-python==1.2.0
+mmif-python[cv]
+python-doctr[torch]>=0.7.0
+torch>=2.*
+numpy
+Pillow
+lapps
```
