converting relative geometry from docTR into abs pixel coordinates #4

Merged · 3 commits · Apr 18, 2024
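The heart of the change is the new rel_coords_to_abs() helper in app.py: docTR reports each box geometry as a relative ((x1, y1), (x2, y2)) pair in [0, 1], and the helper scales both corners to integer pixels, flooring the first corner and ceiling the second so the rounded box never shrinks. A minimal standalone sketch of the merged helper, exercised with made-up geometry and frame dimensions:

from math import floor, ceil

def rel_coords_to_abs(coords, width, height):
    # scale the two relative corners to pixel values, as in this PR's app.py
    x1, y1 = coords[0]
    x2, y2 = coords[1]
    return [(floor(x1 * height), floor(y1 * width)), (ceil(x2 * height), ceil(y2 * width))]

# hypothetical docTR word geometry on a 640x480 frame
print(rel_coords_to_abs(((0.1, 0.2), (0.5, 0.6)), 640, 480))
# -> [(48, 128), (240, 384)]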
Containerfile (2 changes: 1 addition & 1 deletion)
@@ -1,5 +1,5 @@
 # Use the same base image version as the clams-python python library version
-FROM ghcr.io/clamsproject/clams-python:1.0.9
+FROM ghcr.io/clamsproject/clams-python:1.2.0
 # See https://github.com/orgs/clamsproject/packages?tab=packages&q=clams-python for more base images
 # IF you want to automatically publish this image to the clamsproject organization,
 # 1. you should have generated this template without --no-github-actions flag
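The base image bump above keeps the container's clams-python in step with the pin in requirements.txt further down. To rebuild locally, something like the following should work (the image tag is a made-up example; podman would pick up the Containerfile automatically):

docker build -f Containerfile -t app-doctr-wrapper .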
app.py (107 changes: 53 additions & 54 deletions)
@@ -4,28 +4,37 @@
 
 import argparse
 import logging
-from typing import Union
+from concurrent.futures import ThreadPoolExecutor
+from math import floor, ceil
+
+import numpy as np
+import torch
+from clams import ClamsApp, Restifier
+from doctr.models import ocr_predictor
+from lapps.discriminators import Uri
+from mmif import Mmif, View, Annotation, Document, AnnotationTypes, DocumentTypes
+from mmif.utils import video_document_helper as vdh
 
-from concurrent.futures import ThreadPoolExecutor
-
-# Imports needed for Clams and MMIF.
-# Non-NLP Clams applications will require AnnotationTypes
-
-from clams import ClamsApp, Restifier
-from mmif import Mmif, View, Annotation, Document, AnnotationTypes, DocumentTypes
-from mmif.utils import video_document_helper as vdh
-
-from doctr.models import ocr_predictor
-import torch
-import numpy as np
 
+def rel_coords_to_abs(coords, width, height):
+    """
+    Simple conversion from relative coordinates (percentage) to absolute coordinates (pixel).
+    Assumes the passed shape is a rectangle, represented by top-left and bottom-right corners,
+    and compute floor and ceiling based on the geometry.
+    """
+    x1, y1 = coords[0]
+    x2, y2 = coords[1]
+    return [(floor(x1 * height), floor(y1 * width)), (ceil(x2 * height), ceil(y2 * width))]
+
 
 def create_bbox(view: View, coordinates, box_type, time_point):
     bbox = view.new_annotation(AnnotationTypes.BoundingBox)
     bbox.add_property("coordinates", coordinates)
-    bbox.add_property("boxType", box_type)
+    bbox.add_property("label", box_type)
     bbox.add_property("timePoint", time_point)
     return bbox

@@ -50,40 +59,24 @@ def __init__(self):
         self.gpu = False
 
     def _appmetadata(self):
-        # see https://sdk.clams.ai/autodoc/clams.app.html#clams.app.ClamsApp._load_appmetadata
-        # Also check out ``metadata.py`` in this directory.
-        # When using the ``metadata.py`` leave this do-nothing "pass" method here.
+        # using metadata.py
         pass
 
-    class Paragraph:
-        """
-        lapps annotation corresponding to a DocTR Block object targeting contained sentences.
-        """
-        def __init__(self, region: Annotation, document: Document):
-            self.region = region
-            self.region.add_property("document", document.id)
-            self.sentences = []
-
-        def add_sentence(self, sentence):
-            self.sentences.append(sentence)
-
-        def collect_targets(self):
-            self.region.add_property("targets", [s.region.id for s in self.sentences])
-
-    class Sentence:
+    class LingUnit(object):
         """
-        Span annotation corresponding to a DocTR Line object targeting contained tokens.
+        A thin wrapper for LAPPS linguistic unit annotations that
+        represent different geometric levels from DocTR OCR output.
         """
         def __init__(self, region: Annotation, document: Document):
             self.region = region
             self.region.add_property("document", document.id)
-            self.tokens = []
+            self.children = []
 
-        def add_token(self, token):
-            self.tokens.append(token)
+        def add_child(self, sentence):
+            self.children.append(sentence)
 
         def collect_targets(self):
-            self.region.add_property("targets", [t.region.id for t in self.tokens])
+            self.region.add_property("targets", [child.region.id for child in self.children])
 
     class Token:
         """
@@ -96,72 +89,78 @@ def __init__(self, region: Annotation, document: Document, start: int, end: int)
             self.region.add_property("end", end)
 
     def process_timepoint(self, representative: Annotation, new_view: View, video_doc: Document):
-        rep_frame_index = vdh.convert(representative.get("timePoint"), "milliseconds",
-                                      "frame", vdh.get_framerate(video_doc))
+        rep_frame_index = vdh.convert(representative.get("timePoint"),
+                                      representative.get("timeUnit"), "frame",
+                                      video_doc.get("fps"))
         image: np.ndarray = vdh.extract_frames_as_images(video_doc, [rep_frame_index], as_PIL=False)[0]
         result = self.reader([image])
         # assume only one page, as we are passing one image at a time
         blocks = result.pages[0].blocks
         text_document: Document = new_view.new_textdocument(result.render())
 
+        h, w = image.shape[:2]
         for block in blocks:
             try:
-                self.process_block(block, new_view, text_document, representative)
+                self.process_block(block, new_view, text_document, representative, w, h)
             except Exception as e:
                 self.logger.error(f"Error processing block: {e}")
                 continue
 
         return text_document, representative
 
-    def process_block(self, block, view, text_document, representative):
-        paragraph = self.Paragraph(view.new_annotation(at_type=Uri.PARAGRAPH), text_document)
-        paragraph_bb = create_bbox(view, block.geometry, "text", representative.id)
+    def process_block(self, block, view, text_document, representative, img_width, img_height):
+        paragraph = self.LingUnit(view.new_annotation(at_type=Uri.PARAGRAPH), text_document)
+        paragraph_bb = create_bbox(view, rel_coords_to_abs(block.geometry, img_width, img_height), "text", representative.id)
         create_alignment(view, paragraph.region.id, paragraph_bb.id)
 
         for line in block.lines:
             try:
-                sentence = self.process_line(line, view, text_document, representative)
+                sentence = self.process_line(line, view, text_document, representative, img_width, img_height)
             except Exception as e:
                 self.logger.error(f"Error processing line: {e}")
                 continue
-            paragraph.add_sentence(sentence)
+            paragraph.add_child(sentence)
         paragraph.collect_targets()
 
-    def process_line(self, line, view, text_document, representative):
-        sentence = self.Sentence(view.new_annotation(at_type=Uri.SENTENCE), text_document)
-        sentence_bb = create_bbox(view, line.geometry, "text", representative.id)
+    def process_line(self, line, view, text_document, representative, img_width, img_height):
+        sentence = self.LingUnit(view.new_annotation(at_type=Uri.SENTENCE), text_document)
+        sentence_bb = create_bbox(view, rel_coords_to_abs(line.geometry, img_width, img_height), "text", representative.id)
         create_alignment(view, sentence.region.id, sentence_bb.id)
 
         for word in line.words:
             if word.confidence > 0.4:
                 start = text_document.text_value.find(word.value)
                 end = start + len(word.value)
                 token = self.Token(view.new_annotation(at_type=Uri.TOKEN), text_document, start, end)
-                token_bb = create_bbox(view, word.geometry, "text", representative.id)
+                token_bb = create_bbox(view, rel_coords_to_abs(word.geometry, img_width, img_height), "text", representative.id)
                 create_alignment(view, token.region.id, token_bb.id)
-                sentence.add_token(token)
+                sentence.add_child(token)
 
         sentence.collect_targets()
         return sentence
 
-    def _annotate(self, mmif: Union[str, dict, Mmif], **parameters) -> Mmif:
+    def _annotate(self, mmif: Mmif, **parameters) -> Mmif:
         if self.gpu:
             self.logger.debug("running app on GPU")
         else:
             self.logger.debug("running app on CPU")
         video_doc: Document = mmif.get_documents_by_type(DocumentTypes.VideoDocument)[0]
-        input_view: View = mmif.get_views_for_document(video_doc.properties.id)[0]
+        input_view: View = mmif.get_views_for_document(video_doc.properties.id)[-1]
 
         new_view: View = mmif.new_view()
         self.sign_view(new_view, parameters)
 
         with ThreadPoolExecutor() as executor:
             futures = []
             for timeframe in input_view.get_annotations(AnnotationTypes.TimeFrame):
-                representative_ids = timeframe.get("representatives")
-                representatives = [
-                    input_view.get_annotation_by_id(representative_id) for representative_id in representative_ids]
-                for representative in representatives:
+                for rep_id in timeframe.get("representatives"):
+                    if Mmif.id_delimiter not in rep_id:
+                        rep_id = f'{input_view.id}{Mmif.id_delimiter}{rep_id}'
+                    representative = mmif[rep_id]
                     futures.append(executor.submit(self.process_timepoint, representative, new_view, video_doc))
+            if len(futures) == 0:
+                # TODO (krim @ 4/18/24): if "representatives" is not present, process just the middle frame
+                pass
 
         for future in futures:
             try:
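One detail worth noting in the _annotate() rewrite above: a TimeFrame's "representatives" list may hold short, view-local annotation IDs or full view-prefixed ones, so the loop prefixes short IDs with the source view's ID before resolving them on the Mmif object. A standalone sketch of that normalization, assuming ':' is the ID delimiter as in mmif-python:

def normalize_rep_id(rep_id: str, view_id: str, delimiter: str = ':') -> str:
    # prefix view-local ids so they resolve globally, mirroring _annotate() above
    if delimiter not in rep_id:
        rep_id = f'{view_id}{delimiter}{rep_id}'
    return rep_id

print(normalize_rep_id('tp_1', 'v_0'))      # -> 'v_0:tp_1'
print(normalize_rep_id('v_0:tp_1', 'v_0'))  # -> 'v_0:tp_1'

The same rewrite also folds the old Paragraph and Sentence wrappers into a single LingUnit class, since both did nothing more than collect child annotations and record their IDs as targets.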
requirements.txt (14 changes: 7 additions & 7 deletions)
@@ -1,8 +1,8 @@
 # Make sure clams-python version is explicitly specified, at least the lower bound
-clams-python==1.1.3
-mmif-python[cv]==1.0.10
-python-doctr[torch]==0.7.0
-torch~=2.1.2
-numpy~=1.24.4
-Pillow==10.2.0
-lapps~=0.0.2
+clams-python==1.2.0
+mmif-python[cv]
+python-doctr[torch]>=0.7.0
+torch>=2.*
+numpy
+Pillow
+lapps