diff --git a/Containerfile b/Containerfile
index 53e58a1..e1a7dd1 100644
--- a/Containerfile
+++ b/Containerfile
@@ -1,5 +1,5 @@
 # Use the same base image version as the clams-python python library version
-FROM ghcr.io/clamsproject/clams-python:1.0.9
+FROM ghcr.io/clamsproject/clams-python:1.2.0
 # See https://github.com/orgs/clamsproject/packages?tab=packages&q=clams-python for more base images
 # IF you want to automatically publish this image to the clamsproject organization,
 # 1. you should have generated this template without --no-github-actions flag
diff --git a/app.py b/app.py
index 657094e..1aaca52 100644
--- a/app.py
+++ b/app.py
@@ -4,28 +4,37 @@
 
 import argparse
 import logging
-from typing import Union
+from concurrent.futures import ThreadPoolExecutor
+from math import floor, ceil
 
+import numpy as np
+import torch
+from clams import ClamsApp, Restifier
+from doctr.models import ocr_predictor
 from lapps.discriminators import Uri
+from mmif import Mmif, View, Annotation, Document, AnnotationTypes, DocumentTypes
+from mmif.utils import video_document_helper as vdh
 
-from concurrent.futures import ThreadPoolExecutor
 
 
 # Imports needed for Clams and MMIF.
 # Non-NLP Clams applications will require AnnotationTypes
-from clams import ClamsApp, Restifier
-from mmif import Mmif, View, Annotation, Document, AnnotationTypes, DocumentTypes
-from mmif.utils import video_document_helper as vdh
-
-from doctr.models import ocr_predictor
-import torch
-import numpy as np
+def rel_coords_to_abs(coords, width, height):
+    """
+    Simple conversion from relative coordinates (percentage) to absolute coordinates (pixel).
+    Assumes the passed shape is a rectangle, represented by top-left and bottom-right corners,
+    and compute floor and ceiling based on the geometry.
+    """
+    x1, y1 = coords[0]
+    x2, y2 = coords[1]
+    return [(floor(x1 * height), floor(y1 * width)), (ceil(x2 * height), ceil(y2 * width))]
+
 
 
 def create_bbox(view: View, coordinates, box_type, time_point):
     bbox = view.new_annotation(AnnotationTypes.BoundingBox)
     bbox.add_property("coordinates", coordinates)
-    bbox.add_property("boxType", box_type)
+    bbox.add_property("label", box_type)
     bbox.add_property("timePoint", time_point)
     return bbox
 
@@ -50,40 +59,24 @@ def __init__(self):
         self.gpu = False
 
     def _appmetadata(self):
-        # see https://sdk.clams.ai/autodoc/clams.app.html#clams.app.ClamsApp._load_appmetadata
-        # Also check out ``metadata.py`` in this directory.
-        # When using the ``metadata.py`` leave this do-nothing "pass" method here.
+        # using metadata.py
         pass
 
-
-    class Paragraph:
-        """
-        lapps annotation corresponding to a DocTR Block object targeting contained sentences.
-        """
-        def __init__(self, region: Annotation, document: Document):
-            self.region = region
-            self.region.add_property("document", document.id)
-            self.sentences = []
-
-        def add_sentence(self, sentence):
-            self.sentences.append(sentence)
-
-        def collect_targets(self):
-            self.region.add_property("targets", [s.region.id for s in self.sentences])
-
-    class Sentence:
+
+    class LingUnit(object):
         """
-        Span annotation corresponding to a DocTR Line object targeting contained tokens.
+        A thin wrapper for LAPPS linguistic unit annotations that
+        represent different geometric levels from DocTR OCR output.
         """
         def __init__(self, region: Annotation, document: Document):
             self.region = region
             self.region.add_property("document", document.id)
-            self.tokens = []
+            self.children = []
 
-        def add_token(self, token):
-            self.tokens.append(token)
+        def add_child(self, sentence):
+            self.children.append(sentence)
 
         def collect_targets(self):
-            self.region.add_property("targets", [t.region.id for t in self.tokens])
+            self.region.add_property("targets", [child.region.id for child in self.children])
 
     class Token:
         """
@@ -96,39 +89,42 @@ def __init__(self, region: Annotation, document: Document, start: int, end: int)
         self.region.add_property("end", end)
 
     def process_timepoint(self, representative: Annotation, new_view: View, video_doc: Document):
-        rep_frame_index = vdh.convert(representative.get("timePoint"), "milliseconds",
-                                      "frame", vdh.get_framerate(video_doc))
+        rep_frame_index = vdh.convert(representative.get("timePoint"),
+                                      representative.get("timeUnit"), "frame",
+                                      video_doc.get("fps"))
         image: np.ndarray = vdh.extract_frames_as_images(video_doc, [rep_frame_index], as_PIL=False)[0]
         result = self.reader([image])
 
+        # assume only one page, as we are passing one image at a time
         blocks = result.pages[0].blocks
         text_document: Document = new_view.new_textdocument(result.render())
+        h, w = image.shape[:2]
 
         for block in blocks:
             try:
-                self.process_block(block, new_view, text_document, representative)
+                self.process_block(block, new_view, text_document, representative, w, h)
             except Exception as e:
                 self.logger.error(f"Error processing block: {e}")
                 continue
 
         return text_document, representative
 
-    def process_block(self, block, view, text_document, representative):
-        paragraph = self.Paragraph(view.new_annotation(at_type=Uri.PARAGRAPH), text_document)
-        paragraph_bb = create_bbox(view, block.geometry, "text", representative.id)
+    def process_block(self, block, view, text_document, representative, img_width, img_height):
+        paragraph = self.LingUnit(view.new_annotation(at_type=Uri.PARAGRAPH), text_document)
+        paragraph_bb = create_bbox(view, rel_coords_to_abs(block.geometry, img_width, img_height), "text", representative.id)
         create_alignment(view, paragraph.region.id, paragraph_bb.id)
 
         for line in block.lines:
             try:
-                sentence = self.process_line(line, view, text_document, representative)
+                sentence = self.process_line(line, view, text_document, representative, img_width, img_height)
             except Exception as e:
                 self.logger.error(f"Error processing line: {e}")
                 continue
-            paragraph.add_sentence(sentence)
+            paragraph.add_child(sentence)
         paragraph.collect_targets()
 
-    def process_line(self, line, view, text_document, representative):
-        sentence = self.Sentence(view.new_annotation(at_type=Uri.SENTENCE), text_document)
-        sentence_bb = create_bbox(view, line.geometry, "text", representative.id)
+    def process_line(self, line, view, text_document, representative, img_width, img_height):
+        sentence = self.LingUnit(view.new_annotation(at_type=Uri.SENTENCE), text_document)
+        sentence_bb = create_bbox(view, rel_coords_to_abs(line.geometry, img_width, img_height), "text", representative.id)
         create_alignment(view, sentence.region.id, sentence_bb.id)
 
         for word in line.words:
@@ -136,20 +132,20 @@ def process_line(self, line, view, text_document, representative):
             start = text_document.text_value.find(word.value)
             end = start + len(word.value)
             token = self.Token(view.new_annotation(at_type=Uri.TOKEN), text_document, start, end)
-            token_bb = create_bbox(view, word.geometry, "text", representative.id)
+            token_bb = create_bbox(view, rel_coords_to_abs(word.geometry, img_width, img_height), "text", representative.id)
             create_alignment(view, token.region.id, token_bb.id)
-            sentence.add_token(token)
+            sentence.add_child(token)
         sentence.collect_targets()
         return sentence
 
-    def _annotate(self, mmif: Union[str, dict, Mmif], **parameters) -> Mmif:
+    def _annotate(self, mmif: Mmif, **parameters) -> Mmif:
         if self.gpu:
             self.logger.debug("running app on GPU")
         else:
             self.logger.debug("running app on CPU")
 
         video_doc: Document = mmif.get_documents_by_type(DocumentTypes.VideoDocument)[0]
-        input_view: View = mmif.get_views_for_document(video_doc.properties.id)[0]
+        input_view: View = mmif.get_views_for_document(video_doc.properties.id)[-1]
 
         new_view: View = mmif.new_view()
         self.sign_view(new_view, parameters)
@@ -157,11 +153,14 @@ def _annotate(self, mmif: Union[str, dict, Mmif], **parameters) -> Mmif:
         with ThreadPoolExecutor() as executor:
             futures = []
             for timeframe in input_view.get_annotations(AnnotationTypes.TimeFrame):
-                representative_ids = timeframe.get("representatives")
-                representatives = [
-                    input_view.get_annotation_by_id(representative_id) for representative_id in representative_ids]
-                for representative in representatives:
+                for rep_id in timeframe.get("representatives"):
+                    if Mmif.id_delimiter not in rep_id:
+                        rep_id = f'{input_view.id}{Mmif.id_delimiter}{rep_id}'
+                    representative = mmif[rep_id]
                     futures.append(executor.submit(self.process_timepoint, representative, new_view, video_doc))
+            if len(futures) == 0:
+                # TODO (krim @ 4/18/24): if "representatives" is not present, process just the middle frame
+                pass
 
             for future in futures:
                 try:
diff --git a/requirements.txt b/requirements.txt
index c23f7bf..55c0b21 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,8 @@
 # Make sure clams-python version is explicitly specified, at least the lower bound
-clams-python==1.1.3
-mmif-python[cv]==1.0.10
-python-doctr[torch]==0.7.0
-torch~=2.1.2
-numpy~=1.24.4
-Pillow==10.2.0
-lapps~=0.0.2
\ No newline at end of file
+clams-python==1.2.0
+mmif-python[cv]
+python-doctr[torch]>=0.7.0
+torch>=2.*
+numpy
+Pillow
+lapps
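Note on the new rel_coords_to_abs() helper: DocTR geometries are corner pairs normalized to [0, 1], and the helper floors the top-left corner while ceiling the bottom-right one, so the absolute box never cuts into the detected region. The first member of each pair is scaled by the image height and the second by the width, which is why the call sites pass w and h unpacked from image.shape[:2]. A minimal standalone sketch of the behavior; the sample geometry and frame size are invented for illustration:

    from math import floor, ceil

    def rel_coords_to_abs(coords, width, height):
        # floor the top-left corner, ceil the bottom-right corner, so the
        # pixel box fully covers the relative box it was derived from
        x1, y1 = coords[0]
        x2, y2 = coords[1]
        return [(floor(x1 * height), floor(y1 * width)), (ceil(x2 * height), ceil(y2 * width))]

    # hypothetical DocTR geometry on a 1280x720 frame
    print(rel_coords_to_abs(((0.25, 0.10), (0.75, 0.20)), width=1280, height=720))
    # -> [(180, 128), (540, 256)]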
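Note on the representative lookup in _annotate(): entries in a TimeFrame's "representatives" list may be view-local short IDs, so the diff prefixes bare IDs with the producing view's ID before indexing into the Mmif object. A self-contained sketch of just that normalization step, assuming the ':' delimiter that mmif-python exposes as Mmif.id_delimiter; the IDs are invented:

    # a TimeFrame in view "v_1" lists a representative by its short id "tp_3"
    input_view_id = "v_1"
    rep_id = "tp_3"
    id_delimiter = ":"  # stands in for Mmif.id_delimiter

    # bare short ids get the source view's id prefixed, so that a
    # document-wide lookup (mmif[rep_id] in the diff) can resolve them
    if id_delimiter not in rep_id:
        rep_id = f"{input_view_id}{id_delimiter}{rep_id}"
    print(rep_id)  # -> v_1:tp_3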