diff --git a/.gitignore b/.gitignore index 0a9b11b..11c7e30 100644 --- a/.gitignore +++ b/.gitignore @@ -73,4 +73,6 @@ tags # VSCode .devcontainer -devcontainer.json \ No newline at end of file +devcontainer.json + +static/mmif-viz-cache \ No newline at end of file diff --git a/app.py b/app.py index 3d8f887..a92f92c 100644 --- a/app.py +++ b/app.py @@ -3,16 +3,20 @@ import secrets import sys from threading import Thread +from shutil import rmtree -from flask import request, render_template, flash, send_from_directory, redirect +from flask import Flask, request, render_template, flash, send_from_directory, redirect from mmif.serialize import Mmif +from mmif.vocabulary import DocumentTypes import cache from cache import set_last_access, cleanup -from utils import app, render_ocr, documents_to_htmls, prep_annotations, prepare_ocr_visualization import traceback -import utils -from utils import app +from render import render_documents, render_annotations, prepare_ocr, render_ocr_page + +# these two static folder-related params are important, do not remove +app = Flask(__name__, static_folder='static', static_url_path='') +app.secret_key = 'your_secret_key_here' @app.route('/') @@ -22,24 +26,12 @@ def index(): @app.route('/ocr', methods=['POST']) def ocr(): - try: - data = dict(request.json) - mmif_str = open(cache.get_cache_root() / data["mmif_id"] / "file.mmif").read() - mmif = Mmif(mmif_str) - ocr_view = mmif.get_view_by_id(data["view_id"]) - return utils.prepare_ocr_visualization(mmif, ocr_view, data["mmif_id"]) - except Exception as e: - app.logger.error(f"{e}\n{traceback.format_exc()}") - return f'

Error: {e} Check the server log for more information.' - - -@app.route('/ocrpage', methods=['POST']) -def ocrpage(): - data = request.json - try: - return utils.render_ocr(data["mmif_id"], data['vid_path'], data["view_id"], data["page_number"]) - except Exception as e: - return f'

Unexpected error of type {type(e)}: {e}' + if "page_number" not in request.json: + build_ocr_tab(request.json) + request.json["page_number"] = 0 + # return serve_first_ocr_page(request.json) + # else: + return serve_ocr_page(request.json) @app.route('/upload', methods=['GET', 'POST']) @@ -93,7 +85,7 @@ def display(viz_id): return html_file else: app.logger.debug(f"Visualization {viz_id} not found in cache.") - os.remove(path) + rmtree(path) flash("File not found -- please upload again (it may have been deleted to clear up cache space).") return redirect("/upload") @@ -103,6 +95,45 @@ def send_js(path): return send_from_directory("uv", path) +def render_mmif(mmif_str, viz_id): + mmif = Mmif(mmif_str) + rendered_documents = render_documents(mmif, viz_id) + rendered_annotations = render_annotations(mmif, viz_id) + return render_template('player.html', + docs=rendered_documents, + viz_id=viz_id, + annotations=rendered_annotations) + + +def build_ocr_tab(data): + """ + Prepares OCR (at load time, due to lazy loading) + """ + try: + data = dict(request.json) + mmif_str = open(cache.get_cache_root() / + data["mmif_id"] / "file.mmif").read() + mmif = Mmif(mmif_str) + ocr_view = mmif.get_view_by_id(data["view_id"]) + prepare_ocr(mmif, ocr_view, data["mmif_id"]) + request.json["vid_path"] = mmif.get_documents_by_type(DocumentTypes.VideoDocument)[ + 0].location_path() + + except Exception as e: + app.logger.error(f"{e}\n{traceback.format_exc()}") + return f'

Error: {e} Check the server log for more information.' + + +def serve_ocr_page(data): + """ + Serves subsequent OCR pages + """ + try: + return render_ocr_page(data["mmif_id"], data['vid_path'], data["view_id"], data["page_number"]) + except Exception as e: + return f'

Unexpected error of type {type(e)}: {e}' + + def upload_file(in_mmif): # Save file locally in_mmif_bytes = in_mmif if isinstance(in_mmif, bytes) else in_mmif.read() @@ -117,13 +148,7 @@ def upload_file(in_mmif): with open(path / 'file.mmif', 'w') as in_mmif_file: app.logger.debug(f"Writing original MMIF to {path / 'file.mmif'}") in_mmif_file.write(in_mmif_str) - mmif = Mmif(in_mmif_str) - htmlized_docs = utils.documents_to_htmls(mmif, viz_id) - app.logger.debug(f"Prepared document: {[d[0] for d in htmlized_docs]}") - annotations = utils.prep_annotations(mmif, viz_id) - app.logger.debug(f"Prepared Annotations: {[annotation[0] for annotation in annotations]}") - html_page = render_template('player.html', - docs=htmlized_docs, viz_id=viz_id, annotations=annotations) + html_page = render_mmif(in_mmif_str, viz_id) with open(os.path.join(path, "index.html"), "w") as f: f.write(html_page) except FileExistsError: @@ -133,7 +158,6 @@ def upload_file(in_mmif): t = Thread(target=cleanup) t.daemon = True t.run() - agent = request.headers.get('User-Agent') if 'curl' in agent.lower(): return f"Visualization ID is {viz_id}\nYou can access the visualized file at {request.url_root}display/{viz_id}\n" @@ -143,7 +167,8 @@ def upload_file(in_mmif): if __name__ == '__main__': # Make path for temp files cache_path = cache.get_cache_root() - cache_symlink_path = os.path.join(app.static_folder, cache._CACHE_DIR_SUFFIX) + cache_symlink_path = os.path.join( + app.static_folder, cache._CACHE_DIR_SUFFIX) if os.path.islink(cache_symlink_path): os.unlink(cache_symlink_path) elif os.path.exists(cache_symlink_path): @@ -158,5 +183,5 @@ def upload_file(in_mmif): port = 5000 if len(sys.argv) > 2 and sys.argv[1] == '-p': port = int(sys.argv[2]) - + app.run(port=port, host='0.0.0.0', debug=True, use_reloader=True) diff --git a/displacy/__init__.py b/displacy/__init__.py index 0ae0ffe..5a76760 100644 --- a/displacy/__init__.py +++ b/displacy/__init__.py @@ -48,11 +48,11 @@ def read_text(textdoc, app_root): # container, see the comment in html_text() in ../app.py) if not os.path.isfile(location): if location.startswith('file:///'): - location = location[8:] + location = location[7:] else: # this should not happen anymore, but keeping it anyway location = location[1:] - location = os.path.join(app_root, 'static', location) + # location = os.path.join(app_root, 'static', location) with open(location) as fh: text = fh.read() else: diff --git a/examples/whisper-spacy.json b/examples/whisper-spacy.json index 967a3d4..9a164e6 100644 --- a/examples/whisper-spacy.json +++ b/examples/whisper-spacy.json @@ -8,7 +8,7 @@ "properties": { "mime": "video", "id": "d1", - "location": "file:///data/video/service-mbrs-ntscrm-01181182.mp4" + "location": "file:///data/service-mbrs-ntscrm-01181182.mp4" } }, { @@ -16,7 +16,7 @@ "properties": { "mime": "audio", "id": "d2", - "location": "file:///data/audio/service-mbrs-ntscrm-01181182.wav" + "location": "file:///data/service-mbrs-ntscrm-01181182.wav" } }, { @@ -24,7 +24,7 @@ "properties": { "mime": "text", "id": "d3", - "location": "file:///data/text/service-mbrs-ntscrm-01181182.txt" + "location": "file:///data/service-mbrs-ntscrm-01181182.txt" } } ], diff --git a/ocr.py b/ocr.py index a964296..dc1bbc6 100644 --- a/ocr.py +++ b/ocr.py @@ -1,14 +1,13 @@ import datetime -import pathlib import cv2 -import tempfile import json import re -import os, shutil +import os +import shutil +from mmif.vocabulary.annotation_types import AnnotationTypes +from mmif.vocabulary.document_types import DocumentTypes -from flask import render_template -from mmif import AnnotationTypes, DocumentTypes, Mmif from mmif.utils.video_document_helper import convert_timepoint, convert_timeframe import cache @@ -51,14 +50,17 @@ def update(self, anno, mmif): elif anno.at_type.shortname == "Paragraph": view = mmif.get_view_by_id(anno.parent) - text_anno = mmif[anno.properties.get("document")] + text_anno = view.get_annotation_by_id( + anno.properties.get("document")) self.add_text_document(text_anno) - def add_bounding_box(self, anno, mmif: Mmif): - timepoint_anno = None + def add_bounding_box(self, anno, mmif): if "timePoint" in anno.properties: timepoint_anno = mmif[anno.get("timePoint")] - + + if timepoint_anno: + self.add_timepoint(timepoint_anno, mmif, + skip_if_view_has_frames=False) else: for alignment_anns in mmif.get_alignments(AnnotationTypes.BoundingBox, AnnotationTypes.TimePoint).values(): for alignment_ann in alignment_anns: @@ -88,9 +90,11 @@ def add_bounding_box(self, anno, mmif: Mmif): def add_timeframe(self, anno, mmif): # If annotation has multiple targets, pick the first and last as start and end if "targets" in anno.properties: - start_id, end_id = anno.properties.get("targets")[0], anno.properties.get("targets")[-1] + start_id, end_id = anno.properties.get( + "targets")[0], anno.properties.get("targets")[-1] anno_parent = mmif.get_view_by_id(anno.parent) - start_anno, end_anno = mmif[start_id], mmif[end_id] + start_anno, end_anno = anno_parent.get_annotation_by_id( + start_id), anno_parent.get_annotation_by_id(end_id) start = convert_timepoint(mmif, start_anno, "frames") end = convert_timepoint(mmif, end_anno, "frames") start_secs = convert_timepoint(mmif, start_anno, "seconds") @@ -99,7 +103,8 @@ def add_timeframe(self, anno, mmif): start, end = convert_timeframe(mmif, anno, "frames") start_secs, end_secs = convert_timeframe(mmif, anno, "seconds") self.range = (start, end) - self.timestamp_range = (str(datetime.timedelta(seconds=start_secs)), str(datetime.timedelta(seconds=end_secs))) + self.timestamp_range = (str(datetime.timedelta(seconds=start_secs)), str( + datetime.timedelta(seconds=end_secs))) self.sec_range = (start_secs, end_secs) if anno.properties.get("frameType"): self.frametype = str(anno.properties.get("frameType")) @@ -107,24 +112,43 @@ def add_timeframe(self, anno, mmif): self.frametype = str(anno.properties.get("label")) def add_timepoint(self, anno, mmif, skip_if_view_has_frames=True): - parent = mmif.get_view_by_id(anno.parent) - other_annotations = [k for k in parent.metadata.contains.keys() if k != anno.id] - # If there are TimeFrames in the same view, they most likely represent - # condensed information about representative frames (e.g. SWT). In this - # case, only render the TimeFrames and ignore the TimePoints. - if any([anno == AnnotationTypes.TimeFrame for anno in other_annotations]) and skip_if_view_has_frames: - return - self.frame_num = convert_timepoint(mmif, anno, "frames") - self.secs = convert_timepoint(mmif, anno, "seconds") - self.timestamp = str(datetime.timedelta(seconds=self.secs)) - if anno.properties.get("label"): - self.frametype = anno.properties.get("label") + parent = mmif.get_view_by_id(anno.parent) + other_annotations = [ + k for k in parent.metadata.contains.keys() if k != anno.id] + # If there are TimeFrames in the same view, they most likely represent + # condensed information about representative frames (e.g. SWT). In this + # case, only render the TimeFrames and ignore the TimePoints. + if any([anno.shortname == "TimeFrame" for anno in other_annotations]) and skip_if_view_has_frames: + return + self.frame_num = convert_timepoint(mmif, anno, "frames") + self.secs = convert_timepoint(mmif, anno, "seconds") + self.timestamp = str(datetime.timedelta(seconds=self.secs)) + if anno.properties.get("label"): + self.frametype = anno.properties.get("label") def add_text_document(self, anno): - t = anno.properties.get("text_value") or anno.properties.get("text").value + t = anno.properties.get( + "text_value") or anno.properties.get("text").value if t: text_val = re.sub(r'([\\\/\|\"\'])', r'\1 ', t) - self.text = self.text + [text_val] if text_val not in self.text else self.text + self.text = self.text + \ + [text_val] if text_val not in self.text else self.text + + +def prepare_ocr(mmif, view, viz_id): + """ + Prepares list of frames that will be passed back and forth between server + and client, and renders the first page of the OCR. + """ + ocr_frames = get_ocr_frames(view, mmif) + + # Generate pages (necessary to reduce IO cost) and render + frames_list = [(k, vars(v)) for k, v in ocr_frames.items()] + frames_list = find_duplicates(frames_list) + frames_pages = paginate(frames_list) + # Save page list as temp file + save_json(frames_pages, view.id, viz_id) + def get_ocr_frames(view, mmif): @@ -149,7 +173,7 @@ def get_ocr_frames(view, mmif): frames[i].update(target, mmif) else: frames[i] = frame - + else: for annotation in view.get_annotations(): frame = OCRFrame(annotation, mmif) @@ -160,7 +184,6 @@ def get_ocr_frames(view, mmif): frames[i].update(annotation, mmif) else: frames[i] = frame - print(frames) return frames @@ -183,45 +206,9 @@ def paginate(frames_list): return {i: page for (i, page) in enumerate(pages)} -def render_ocr(mmif_id, vid_path, view_id, page_number): - """ - Iterate through frames and display the contents/alignments. - """ - # Path for storing temporary images generated by cv2 - cv2_vid = cv2.VideoCapture(vid_path) - tn_data_fname = cache.get_cache_root() / mmif_id / f"{view_id}-pages.json" - thumbnail_pages = json.load(open(tn_data_fname)) - page = thumbnail_pages[str(page_number)] - prev_frame_cap = None - path = make_image_directory(mmif_id) - for frame_num, frame in page: - # If index is range instead of frame... - if frame.get("range"): - frame_num = (int(frame["range"][0]) + int(frame["range"][1])) / 2 - cv2_vid.set(1, frame_num) - _, frame_cap = cv2_vid.read() - if frame_cap is None: - raise FileNotFoundError(f"Video file {vid_path} not found!") - - # Double check histogram similarity of "repeat" frames -- if they're significantly different, un-mark as repeat - if prev_frame_cap is not None and frame["repeat"] and not is_duplicate_image(prev_frame_cap, frame_cap, - cv2_vid): - frame["repeat"] = False - with tempfile.NamedTemporaryFile(dir=str(path), suffix=".jpg", delete=False) as tf: - cv2.imwrite(tf.name, frame_cap) - # "id" is just the name of the temp image file - frame["id"] = pathlib.Path(tf.name).name - prev_frame_cap = frame_cap - - tn_page_html = render_template( - 'ocr.html', vid_path=vid_path, view_id=view_id, page=page, - n_pages=len(thumbnail_pages), page_number=str(page_number), mmif_id=mmif_id) - return tn_page_html - - -def make_image_directory(mmif_id): +def make_image_directory(mmif_id, view_id): # Make path for temp OCR image files or clear image files if it exists - path = cache.get_cache_root() / mmif_id / "img" + path = cache.get_cache_root() / mmif_id / "img" / view_id if os.path.exists(path): shutil.rmtree(path) os.makedirs(path) @@ -266,10 +253,14 @@ def is_duplicate_image(prev_frame, frame, cv2_vid): img2_hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV) # Calculate the histogram and normalize it - hist_img1 = cv2.calcHist([img1_hsv], [0, 1], None, [180, 256], [0, 180, 0, 256]) - cv2.normalize(hist_img1, hist_img1, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX); - hist_img2 = cv2.calcHist([img2_hsv], [0, 1], None, [180, 256], [0, 180, 0, 256]) - cv2.normalize(hist_img2, hist_img2, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX); + hist_img1 = cv2.calcHist([img1_hsv], [0, 1], None, [ + 180, 256], [0, 180, 0, 256]) + cv2.normalize(hist_img1, hist_img1, alpha=0, + beta=1, norm_type=cv2.NORM_MINMAX) + hist_img2 = cv2.calcHist([img2_hsv], [0, 1], None, [ + 180, 256], [0, 180, 0, 256]) + cv2.normalize(hist_img2, hist_img2, alpha=0, + beta=1, norm_type=cv2.NORM_MINMAX) # Find the metric value metric_val = cv2.compareHist(hist_img1, hist_img2, cv2.HISTCMP_CHISQR) @@ -289,29 +280,6 @@ def round_boxes(boxes): return rounded_boxes -def get_ocr_views(mmif): - """Returns all CV views, which contain timeframes or bounding boxes""" - views = [] - required_types = ["TimeFrame", "BoundingBox", "TimePoint"] - for view in mmif.views: - for anno_type, anno in view.metadata.contains.items(): - # Annotation belongs to a CV view if it is a TimeFrame/BB and it refers to a VideoDocument - # if anno.get("document") is None: - # continue - # if anno_type.shortname in required_types and mmif.get_document_by_id( - # anno["document"]).at_type.shortname == "VideoDocument": - # views.append(view) - # continue - if anno_type.shortname in required_types: - views.append(view) - break - # TODO: Couldn't find a simple way to show if an alignment view is a CV/Frames-type view - elif "parseq" in view.metadata.app: - views.append(view) - break - return views - - def save_json(data, view_id, mmif_id): path = cache.get_cache_root() / mmif_id / f"{view_id}-pages.json" with open(path, 'w') as f: diff --git a/render.py b/render.py new file mode 100644 index 0000000..dc4b758 --- /dev/null +++ b/render.py @@ -0,0 +1,321 @@ +import os +import pathlib +from io import StringIO +from collections import Counter +from flask import render_template, current_app +import re + +from mmif import DocumentTypes +from lapps.discriminators import Uri +import displacy +import traceback + +from utils import get_status, get_properties, get_abstract_view_type, url2posix, get_vtt_file +from ocr import prepare_ocr, make_image_directory, is_duplicate_image +import cv2 +import json +import tempfile + +import cache + +""" +Methods to render MMIF documents and their annotations in various formats. +""" + +# -- Render methods -- + + +def render_documents(mmif, viz_id): + """ + Returns HTML Tab representation of all documents in the MMIF object. + """ + tabs = [] + for document in mmif.documents: + if document.at_type == DocumentTypes.TextDocument: + tabs.append(TextTab(document, viz_id)) + elif document.at_type == DocumentTypes.ImageDocument: + tabs.append(ImageTab(document, viz_id)) + elif document.at_type == DocumentTypes.AudioDocument: + tabs.append(AudioTab(document, viz_id)) + elif document.at_type == DocumentTypes.VideoDocument: + tabs.append(VideoTab(document, mmif, viz_id)) + + return tabs + + +def render_annotations(mmif, viz_id): + """ + Returns HTML Tab representation of all annotations in the MMIF object. + """ + tabs = [] + # These tabs should always be present + tabs.append(InfoTab(mmif)) + tabs.append(AnnotationTableTab(mmif)) + tabs.append(JSTreeTab(mmif)) + # These tabs are optional + for view in mmif.views: + abstract_view_type = get_abstract_view_type(view, mmif) + if abstract_view_type == "NER": + tabs.append(NERTab(mmif, view)) + elif abstract_view_type == "ASR": + tabs.append(VTTTab(mmif, view, viz_id)) + elif abstract_view_type == "OCR": + tabs.append(OCRTab(mmif, view, viz_id)) + + return tabs + + +# -- Base Tab Class -- + +class DocumentTab(): + def __init__(self, document, viz_id): + self.id = document.id + self.tab_name = document.at_type.shortname + self.viz_id = viz_id + + try: + # Add symbolic link to document to static folder, so it can be accessed + # by the browser. + self.doc_path = document.location_path() + self.doc_symlink_path = pathlib.Path( + current_app.static_folder) / cache._CACHE_DIR_SUFFIX / viz_id / (f"{document.id}.{self.doc_path.split('.')[-1]}") + os.symlink(self.doc_path, self.doc_symlink_path) + self.doc_symlink_rel_path = '/' + \ + self.doc_symlink_path.relative_to( + current_app.static_folder).as_posix() + + self.html = self.render() + + except Exception as e: + self.html = f"Error rendering document:

{traceback.format_exc()}
" + + def __str__(self): + return f"Tab: {self.tab_name} ({self.id})" + + +class AnnotationTab(): + def __init__(self, mmif, view=None): + self.mmif = mmif + # Some AnnotationTab sub-classes don't refer to a specific view, and so + # they specify their own ids and tab names. For ones that do refer to + # a specific view, we set the ids/tab names based on view properties. + if view: + self.view = view + # Workaround to deal with the fact that some apps have a version number + # in the URL + app_url = view.metadata.app if re.search( + r"\/v\d+\.?\d?$", view.metadata.app) else view.metadata.app + "/v1" + app_shortname = app_url.split("/")[-2] + + self.id = view.id + self.tab_name = f"{app_shortname}-{view.id}" + try: + self.html = self.render() + except Exception as e: + self.html = f"Error rendering view:

{traceback.format_exc()}
" + + +# -- Document Classes -- + +class TextTab(DocumentTab): + def __init__(self, document, viz_id): + super().__init__(document, viz_id) + + def render(self): + with open(self.doc_path) as t_file: + content = t_file.read().replace("\n", "
\n") + return f"{content}\n" + + +class ImageTab(DocumentTab): + def __init__(self, document, viz_id): + super().__init__(document, viz_id) + + def render(self): + img_path = url2posix(self.doc_path) + html = StringIO() + html.write( + f'Image\n') + return html.getvalue() + + +class AudioTab(DocumentTab): + def __init__(self, document, viz_id): + super().__init__(document, viz_id) + + def render(self): + audio_path = url2posix(self.doc_symlink_rel_path) + html = StringIO() + html.write('\n") + return html.getvalue() + + +class VideoTab(DocumentTab): + def __init__(self, document, mmif, viz_id): + # VideoTab needs access to the MMIF object to get the VTT file + self.mmif = mmif + super().__init__(document, viz_id) + + def render(self): + vid_path = url2posix(self.doc_symlink_rel_path) + html = StringIO() + html.write('\n") + return html.getvalue() + + +# -- Annotation Classes -- + +class InfoTab(AnnotationTab): + def __init__(self, mmif): + self.id = "info" + self.tab_name = "Info" + super().__init__(mmif) + + def render(self): + mmif = self.mmif + s = StringIO('Howdy') + s.write("
")
+        for document in mmif.documents:
+            at_type = document.at_type.shortname
+            location = document.location
+            s.write("%s  %s\n" % (at_type, location))
+        s.write('\n')
+        for view in mmif.views:
+            app = view.metadata.app
+            status = get_status(view)
+            s.write('%s  %s  %s  %d\n' %
+                    (view.id, app, status, len(view.annotations)))
+            if len(view.annotations) > 0:
+                s.write('\n')
+                types = Counter([a.at_type.shortname
+                                for a in view.annotations])
+                for attype, count in types.items():
+                    s.write('    %4d %s\n' % (count, attype))
+            s.write('\n')
+        s.write("
") + return s.getvalue() + + +class AnnotationTableTab(AnnotationTab): + def __init__(self, mmif): + self.id = "annotations" + self.tab_name = "Annotations" + super().__init__(mmif) + + def render(self): + mmif = self.mmif + s = StringIO('Howdy') + for view in mmif.views: + status = get_status(view) + s.write('

%s %s %s %d annotations

\n' + % (view.id, view.metadata.app, status, len(view.annotations))) + s.write("
\n") + s.write("\n") + def limit_len(str): return str[:500] + \ + " . . . }" if len(str) > 500 else str + for annotation in view.annotations: + s.write(' \n') + s.write(' \n' % annotation.id) + s.write(' \n' % annotation.at_type.shortname) + s.write(' \n' % + limit_len(get_properties(annotation))) + s.write(' \n') + s.write("
%s%s%s
\n") + s.write("
\n") + return s.getvalue() + + +class JSTreeTab(AnnotationTab): + def __init__(self, mmif): + self.id = "tree" + self.tab_name = "Tree" + super().__init__(mmif) + + def render(self): + mmif = self.mmif + return render_template('interactive.html', mmif=mmif, aligned_views=[]) + + +class NERTab(AnnotationTab): + def __init__(self, mmif, view): + super().__init__(mmif, view) + + def render(self): + metadata = self.view.metadata.contains.get(Uri.NE) + ner_document = metadata.get('document') + return displacy.visualize_ner(self.mmif, self.view, ner_document, current_app.root_path) + + +class VTTTab(AnnotationTab): + def __init__(self, mmif, view, viz_id): + self.viz_id = viz_id + super().__init__(mmif, view) + + def render(self): + vtt_filename = get_vtt_file(self.view, self.viz_id) + with open(vtt_filename) as vtt_file: + vtt_content = vtt_file.read() + return f"
{vtt_content}
" + + +class OCRTab(AnnotationTab): + def __init__(self, mmif, view, viz_id): + self.viz_id = viz_id + self.vid_path = mmif.get_documents_by_type(DocumentTypes.VideoDocument)[ + 0].location_path() + + super().__init__(mmif, view) + + def render(self): + return render_template("pre-ocr.html", view_id=self.view.id, tabname=self.tab_name, mmif_id=self.viz_id) + # prepare_ocr(self.mmif, self.view, self.viz_id) + # return render_ocr_page(self.viz_id, self.vid_path, self.view.id, 0) + + +def render_ocr_page(mmif_id, vid_path, view_id, page_number): + """ + Renders a single OCR page by iterating through frames and displaying the + contents/alignments. Note: this needs to be a separate function (not a method + in OCRTab) because it is called by the server when the page is changed. + """ + # Path for storing temporary images generated by cv2 + cv2_vid = cv2.VideoCapture(vid_path) + tn_data_fname = cache.get_cache_root() / mmif_id / f"{view_id}-pages.json" + thumbnail_pages = json.load(open(tn_data_fname)) + page = thumbnail_pages[str(page_number)] + prev_frame_cap = None + path = make_image_directory(mmif_id, view_id) + for frame_num, frame in page: + # If index is range instead of frame... + if frame.get("range"): + frame_num = (int(frame["range"][0]) + int(frame["range"][1])) / 2 + cv2_vid.set(1, frame_num) + _, frame_cap = cv2_vid.read() + if frame_cap is None: + raise FileNotFoundError(f"Video file {vid_path} not found!") + + # Double check histogram similarity of "repeat" frames -- if they're significantly different, un-mark as repeat + if prev_frame_cap is not None and frame["repeat"] and not is_duplicate_image(prev_frame_cap, frame_cap, + cv2_vid): + frame["repeat"] = False + with tempfile.NamedTemporaryFile(dir=str(path), suffix=".jpg", delete=False) as tf: + cv2.imwrite(tf.name, frame_cap) + # "id" is just the name of the temp image file + frame["id"] = pathlib.Path(tf.name).name + prev_frame_cap = frame_cap + + tn_page_html = render_template( + 'ocr.html', vid_path=vid_path, view_id=view_id, page=page, + n_pages=len(thumbnail_pages), page_number=str(page_number), mmif_id=mmif_id) + return tn_page_html diff --git a/templates/ocr.html b/templates/ocr.html index 7daea2b..9bf370f 100644 --- a/templates/ocr.html +++ b/templates/ocr.html @@ -1,7 +1,7 @@
{% for frame_num, frame in page %} - {% set filename = "/mmif-viz-cache/" + mmif_id + "/img/" + frame["id"] %} + {% set filename = "/mmif-viz-cache/" + mmif_id + "/img/" + view_id + "/" + frame["id"] %} {% set id = frame["id"] %} {% set boxes = frame["boxes"] %} {% set secs = frame["secs"] %} @@ -142,7 +142,7 @@

if (data["page_number"] >= 0 && data["page_number"] < parseInt("{{n_pages}}")) { $.ajax({ type:'POST', - url:'/ocrpage', + url:'/ocr', contentType: "application/json", data: JSON.stringify(data), success: function(res_html){ diff --git a/templates/player.html b/templates/player.html index 3f56ea8..af62b47 100644 --- a/templates/player.html +++ b/templates/player.html @@ -117,27 +117,27 @@

Visualizing MMIF

-
+

-

{{ docs[0][2] }}

- {{ docs[0][3] | safe }} + + {{ docs[0]['html'] | safe }}
{% for medium in docs[1:] %} -
+

-

{{ medium[2] }}

- {{ medium[3] | safe }} + + {{ medium['html'] | safe }}
{% endfor %}
@@ -148,8 +148,8 @@

Visualizing MMIF

@@ -157,9 +157,9 @@

Visualizing MMIF

{% for annotation in annotations %} -
+

- {{ annotation[1] | safe }} + {{ annotation['html'] | safe }}
{% endfor %}
diff --git a/templates/pre-ocr.html b/templates/pre-ocr.html index eba08d3..c2bf537 100644 --- a/templates/pre-ocr.html +++ b/templates/pre-ocr.html @@ -1,3 +1,5 @@ + +
diff --git a/templates/tab-placeholder.html b/templates/tab-placeholder.html new file mode 100644 index 0000000..c2bf537 --- /dev/null +++ b/templates/tab-placeholder.html @@ -0,0 +1,33 @@ + + +
+
+
+ + \ No newline at end of file diff --git a/utils.py b/utils.py index ce2df4c..5f44679 100644 --- a/utils.py +++ b/utils.py @@ -1,374 +1,17 @@ -from collections import Counter -from datetime import timedelta -from io import StringIO - -from flask import Flask, url_for -from lapps.discriminators import Uri -from mmif.serialize.annotation import Text, Document -from mmif.utils.timeunit_helper import UNIT_NORMALIZATION - -import displacy -import iiif_utils -from ocr import * - -# Get Properties from MMIF file --- - -# these two static folder-related params are important, do not remove -app = Flask(__name__, static_folder='static', static_url_path='') -app.secret_key = 'your_secret_key_here' - - -def normalize_timeunit(tu_str): - if tu_str in UNIT_NORMALIZATION: - return UNIT_NORMALIZATION[tu_str] - else: - return tu_str - - -def asr_alignments_to_vtt(alignment_view, viz_id): - vtt_filename = cache.get_cache_root() / viz_id / f"{alignment_view.id.replace(':', '-')}.vtt" - if vtt_filename.exists(): - return str(vtt_filename) - vtt_file = open(vtt_filename, 'w') - vtt_file.write("WEBVTT\n\n") - annotations = alignment_view.annotations - timeframe_at_type = [at_type for at_type in alignment_view.metadata.contains if at_type.shortname == "TimeFrame"][0] - timeunit = normalize_timeunit(alignment_view.metadata.contains[timeframe_at_type]["timeUnit"]) - # make plural so that this key can be used in timedelta init - if timeunit[-1] != 's': - timeunit += 's' - # TODO: wanted to use "mmif.get_alignments(AnnotationTypes.TimeFrame, Uri.TOKEN)" - # but that gave errors so I gave up on it - token_idx = {a.id: a for a in annotations if a.at_type.shortname == "Token"} - timeframe_idx = {a.id: a for a in annotations if a.at_type.shortname == "TimeFrame"} - alignments = [a for a in annotations if a.at_type.shortname == "Alignment"] - vtt_start = None - texts = [] - for alignment in alignments: - start_end_text = build_alignment(alignment, token_idx, timeframe_idx) - if start_end_text is not None: - # VTT specifically requires timestamps expressed in miliseconds and - # must be be in one of these formats: mm:ss.ttt or hh:mm:ss.ttt - # (https://developer.mozilla.org/en-US/docs/Web/API/WebVTT_API) - # ISO format can have up to 6 below the decimal point, on the other hand - # Assuming here that start and end are in miliseconds - start, end, text = start_end_text - start_kwarg, end_kwarg = {timeunit: float(start)}, {timeunit: float(end)} - start, end = timedelta(**start_kwarg), timedelta(**end_kwarg) - s_mins, s_secs = divmod(start.seconds, 60) - e_mins, e_secs = divmod(end.seconds, 60) - if not vtt_start: - vtt_start = f'{s_mins:02d}:{s_secs:02d}.{((s_secs - int(s_secs)) * 1000):03d}' - texts.append(text) - if len(texts) > 8: - vtt_end = f'{e_mins:02d}:{e_secs:02d}.{((e_secs - int(e_secs)) * 1000):03d}' - vtt_file.write(f'{vtt_start} --> {vtt_end}\n{" ".join(texts)}\n\n') - vtt_start = None - texts = [] - return vtt_file.name - - -def build_alignment(alignment, token_idx, timeframe_idx): - target = alignment.get('target') - source = alignment.get('source') - timeframe = timeframe_idx.get(source) - token = token_idx.get(target) - if timeframe and token: - start = timeframe.get('start') - end = timeframe.get('end') - for text_key in ['text', 'word']: - if text_key in token: - text = token.get(text_key) - return start, end, text - - -def get_src_media_symlink_basename(doc: Document): - doc_path = doc.location_path() - return f"{doc.id}.{doc_path.split('.')[-1]}" - - -def get_symlink_relurl(viz_id, symlink_fname): - static_folder = pathlib.Path(app.static_folder) - symlink_path = pathlib.Path(cache._CACHE_DIR_SUFFIX) / viz_id / symlink_fname - return static_folder / symlink_path - - -def symlink_to_static(viz_id, original_path, symlink_fname) -> str: - static_folder = pathlib.Path(app.static_folder) - symlink_path = pathlib.Path(cache._CACHE_DIR_SUFFIX) / viz_id / symlink_fname - app.logger.debug(f"Symlinking {original_path} to {symlink_path}") - try: - os.symlink(original_path, static_folder / symlink_path) - except Exception as e: - app.logger.error(f"SOME ERROR when symlinking: {str(e)}") - app.logger.debug(f"{original_path} is symlinked to {symlink_path}") - symlink_rel_path = url_for('static', filename=symlink_path) - app.logger.debug(f"and exposable as {symlink_rel_path}") - return symlink_rel_path - - -def documents_to_htmls(mmif, viz_id): - """ - Returns a list of tuples, one for each element in the documents list of - the MMIF object, following the order in that list. Each tuple has four - elements: document type, document identifier, document path and the HTML - visualization. - """ - htmlized = [] - for document in mmif.documents: - doc_path = document.location_path() - app.logger.debug(f"MMIF on AV asset: {doc_path}") - linked = symlink_to_static(viz_id, doc_path, get_src_media_symlink_basename(document)) - if document.at_type == DocumentTypes.TextDocument: - html = html_text(linked) - elif document.at_type == DocumentTypes.VideoDocument: - fa_views = get_alignment_views(mmif) - fa_view = fa_views[0] if fa_views else None - html = html_video(viz_id, linked, fa_view) - elif document.at_type == DocumentTypes.AudioDocument: - html = html_audio(linked) - elif document.at_type == DocumentTypes.ImageDocument: - boxes = get_boxes(mmif) - html = html_img(linked, boxes) - htmlized.append((document.at_type.shortname, document.id, doc_path, html)) - manifest_filename = iiif_utils.generate_iiif_manifest(mmif, viz_id) - app.logger.debug(f"Generated IIIF manifest: {manifest_filename}") - man = os.path.basename(manifest_filename) - app.logger.debug(f"Manifest filename: {man}") - symlink_to_static(viz_id, manifest_filename, man) - app.logger.debug(f"Symlinked IIIF manifest: {None}") - temp = render_template("uv_player.html", manifest=man, mmif_id=viz_id) - # TODO (krim @ 2024-03-12): Turning off IIIF added to the HTML page since - # 1. current IIIF manifest conversion is based on old version of manifest API, and quite brittle - # 2. the conversion code at the moment can only convert TimeFrame annotation to "jump-able" IIIF canvases, - # but the case is already covered by `Thumbnails` tab (look for usage of `pre-ocr.html` template) - # htmlized.append(('UV', "", "", temp)) - return htmlized - - -def get_boxes(mmif): - # TODO: this gives you the last view with BoundingBoxes, should - # perhaps use get_views_contain() instead, should also select just - # the bounding boxes and add information from alignments to text - # documents. - tbox_view = mmif.get_view_contains(str(AnnotationTypes.BoundingBox)) - tbox_annotations = tbox_view.annotations - # For the boxes we pull some information from the annotation: the - # identifier, boxType and the (x,y,w,h) coordinates used by the - # Javascript code that draws the rectangle. - boxes = [] - for a in tbox_annotations: - coordinates = a.get("coordinates") - x = coordinates[0][0] - y = coordinates[0][1] - w = coordinates[1][0] - x - h = coordinates[2][1] - y - box = [a.get("id"), a.get("boxType"), [x, y, w, h]] - boxes.append(box) - return boxes - - -def prep_annotations(mmif, viz_id): - """Prepare annotations from the views, and return a list of pairs of tabname - and tab content. The first tab is alway the full MMIF pretty print.""" - tabs = [] - tabs.append(("Info", "
" + create_info(mmif) + "
")) - app.logger.debug(f"Prepared INFO Tab: {tabs[-1][0]}") - # tabs.append(("MMIF", "
" + mmif.serialize(pretty=True) + "
")) - # app.logger.debug(f"Prepared RAW Tab: {tabs[-1][0]}") - tabs.append(("Annotations", create_annotation_tables(mmif))) - app.logger.debug(f"Prepared SUMMARY Tab: {tabs[-1][0]}") - tabs.append(("Tree", render_interactive_mmif(mmif))) - app.logger.debug(f"Prepared JSTREE Tab: {tabs[-1][0]}") - # TODO: since this uses the same tab-name this will only show the same - # stuff; it does a loop but for now we assume there is just one file with - # alignments (generated by Kaldi) - for fa_view in get_alignment_views(mmif): - vtt_file = asr_alignments_to_vtt(fa_view, viz_id) - tabs.append(("WebVTT", '
' + open(vtt_file).read() + '
')) - app.logger.debug(f"Prepared a VTT Tab: {tabs[-1][0]}") - ner_views = get_ner_views(mmif) - use_id = True if len(ner_views) > 1 else False - for ner_view in ner_views: - if not ner_view.annotations: - continue - visualization = create_ner_visualization(mmif, ner_view) - tabname = "Entities-%s" % ner_view.id if use_id else "Entities" - tabs.append((tabname, visualization)) - app.logger.debug(f"Prepared a displaCy Tab: {tabs[-1][0]}") - # TODO: somewhat hackish - ocr_views = get_ocr_views(mmif) - use_id = True if len(ocr_views) > 1 else False - for ocr_view in ocr_views: - if not ocr_view.annotations: - continue - tabname = "Thumbnails-%s" % ocr_view.id - visualization = render_template("pre-ocr.html", view_id=ocr_view.id, tabname=tabname, mmif_id=viz_id) - tabs.append((tabname, visualization)) - app.logger.debug(f"Prepared a Thumbnails Tab: {tabs[-1][0]}") - return tabs - - -def create_info(mmif): - s = StringIO('Howdy') - for document in mmif.documents: - at_type = document.at_type.shortname - location = document.location - s.write("%s %s\n" % (at_type, location)) - s.write('\n') - for view in mmif.views: - app = view.metadata.app - status = get_status(view) - s.write('%s %s %s %d\n' % (view.id, app, status, len(view.annotations))) - if len(view.annotations) > 0: - s.write('\n') - types = Counter([a.at_type.shortname - for a in view.annotations]) - for attype, count in types.items(): - s.write(' %4d %s\n' % (count, attype)) - s.write('\n') - return s.getvalue() - - -def create_annotation_tables(mmif): - s = StringIO('Howdy') - for view in mmif.views: - status = get_status(view) - s.write('

%s %s %s %d annotations

\n' - % (view.id, view.metadata.app, status, len(view.annotations))) - s.write("
\n") - s.write("\n") - limit_len = lambda str: str[:500] + " . . . }" if len(str) > 500 else str - for annotation in view.annotations: - s.write(' \n') - s.write(' \n' % annotation.id) - s.write(' \n' % annotation.at_type.shortname) - s.write(' \n' % limit_len(get_properties(annotation))) - s.write(' \n') - s.write("
%s%s%s
\n") - s.write("
\n") - return s.getvalue() - - -def get_document_ids(view, annotation_type): - metadata = view.metadata.contains.get(annotation_type) - ids = set([metadata['document']]) if 'document' in metadata else set() - for annotation in view.annotations: - if annotation.at_type.shortname == str(annotation_type): - try: - ids.add(annotation.get("document")) - except KeyError: - pass - return list(ids) - - -def get_alignment_views(mmif): - """Return alignment views which have at least TextDocument, Token, TimeFrame and - Alignment annotations.""" - views = [] - needed_types = set(['TextDocument', 'Token', 'TimeFrame', 'Alignment']) - for view in mmif.views: - annotation_types = view.metadata.contains.keys() - annotation_types = [at.shortname for at in annotation_types] - if needed_types.issubset(annotation_types): - views.append(view) - return views - - -# Render documents as HTML ------------ - -def html_video(viz_id, vpath, vtt_srcview=None): - vpath = url2posix(vpath) - html = StringIO() - html.write('\n") - return html.getvalue() - - -def html_text(tpath): - """Return the content of the text document, but with some HTML tags added.""" - if not os.path.isfile(tpath): - raise FileNotFoundError(f"File not found: {tpath}") - with open(tpath) as t_file: - content = t_file.read().replace("\n", "
\n") - return f"{content}\n" - - -def html_img(ipath, boxes=None, id="imgCanvas"): - ipath = url2posix(ipath) - boxes = [] if boxes is None else boxes - return render_template('image.html', filename=ipath, boxes=boxes, id=id) - - -def html_audio(apath): - apath = url2posix(apath) - return f"" +from mmif.serialize.annotation import Text +from flask import current_app +import cache def url2posix(path): """For the visualizer we often want a POSIX path and not a URL so we strip off the protocol if there is one.""" - if path.startswith('file:///'): + if str(path).startswith('file:///'): path = path[7:] return path -# Interactive MMIF Tab ----------- - -def render_interactive_mmif(mmif): - return render_template('interactive.html', mmif=mmif, aligned_views=get_aligned_views(mmif)) - - -# Functions for checking if view can be rendered with alignment highlighting -def get_aligned_views(mmif): - """Return list of properly aligned views (for tree display)""" - aligned_views = [] - for view in mmif.views: - if any([at_type.shortname == "Alignment" for at_type in view.metadata.contains]): - if check_view_alignment(view.annotations) == True: - aligned_views.append(view.id) - return aligned_views - - -def check_view_alignment(annotations): - anno_stack = [] - for annotation in annotations: - if annotation.at_type.shortname == "Alignment": - anno_stack.insert(0, annotation.properties) - else: - anno_stack.append(annotation.id) - if len(anno_stack) == 3: - if type(anno_stack[0]) == str or not ( - anno_stack[0]["source"] in anno_stack and anno_stack[0]["target"] in anno_stack): - return False - anno_stack = [] - return True - - -# NER Tools ---------------------- - -def get_ner_views(mmif): - return [v for v in mmif.views if Uri.NE in v.metadata.contains] - - -def create_ner_visualization(mmif, view): - metadata = view.metadata.contains.get(Uri.NE) - try: - # all the view's named entities refer to the same text document (kaldi) - document_ids = get_document_ids(view, Uri.NE) - return displacy.visualize_ner(mmif, view, document_ids[0], app.root_path) - except KeyError as e: - # the view's entities refer to more than one text document (tessearct) - pass - - -def get_status(view): +def get_status(view): return 'ERROR' if 'message' in view.metadata.error else 'OKAY' @@ -384,19 +27,95 @@ def get_properties(annotation): return '{ %s }' % ', '.join(props_list) -# OCR Tools ---------------------- +def get_abstract_view_type(view, mmif): + annotation_types = [a.shortname for a in view.metadata.contains.keys()] + if "NamedEntity" in annotation_types: + return "NER" + elif all([anno_type in annotation_types for anno_type in ["Token", "TimeFrame", "Alignment"]]): + return "ASR" + ocr_apps = ["swt-detection", "doctr-wrapper", "pyscenedetect-wrapper", "easyocr-wrapper", + "slatedetection", "fewshotclassifier", "barsdetection", "east-textdetection", + "parseqocr-wrapper", "tesseractocr-wrapper", "chyron-detection", "paddleocr-wrapper"] + if any([app in view.metadata.app for app in ocr_apps]): + return "OCR" + # Define an OCR view as one that refers to a video and doesn't contain Sentences + # or Tokens + # else: + # for configuration in view.metadata.contains.values(): + # if "document" in configuration \ + # and mmif.get_document_by_id(configuration["document"]).at_type.shortname == "VideoDocument": + # if not any([anno_type in annotation_types for anno_type in ["Sentence", "Token"]]): + # return "OCR" + + +def get_vtt_file(view, viz_id): + vtt_filename = cache.get_cache_root() / viz_id / \ + f"{view.id.replace(':', '-')}.vtt" + if not vtt_filename.exists(): + with open(vtt_filename, 'w') as vtt_file: + vtt_file.write(write_vtt(view, viz_id)) + return str(vtt_filename) + + +def write_vtt(view, viz_id): + vtt = "WEBVTT\n\n" + timeunit = "milliseconds" + for a in view.metadata.contains.values(): + if "timeUnit" in a: + timeunit = a["timeUnit"] + break + token_idx = { + a.id: a for a in view.annotations if a.at_type.shortname == "Token"} + timeframe_idx = { + a.id: a for a in view.annotations if a.at_type.shortname == "TimeFrame"} + alignments = [ + a for a in view.annotations if a.at_type.shortname == "Alignment"] + vtt_start = None + texts = [] + for alignment in alignments: + start_end_text = build_alignment(alignment, token_idx, timeframe_idx) + if start_end_text is None: + continue + start, end, text = start_end_text + if not vtt_start: + vtt_start = format_time(start, timeunit) + texts.append(text) + if len(texts) > 8: + vtt_end = format_time(end, timeunit) + vtt += f"{vtt_start} --> {vtt_end}\n{' '.join(texts)}\n\n" + vtt_start = None + texts = [] + return vtt -def prepare_ocr_visualization(mmif, view, mmif_id): - """ Visualize OCR by extracting image frames with BoundingBoxes from video""" - # frames, text_docs, alignments = {}, {}, {} - vid_path = mmif.get_documents_by_type(DocumentTypes.VideoDocument)[0].location_path() - ocr_frames = get_ocr_frames(view, mmif) +def build_alignment(alignment, token_idx, timeframe_idx): + target = alignment.properties['target'] + source = alignment.properties['source'] + timeframe = timeframe_idx.get(source) + token = token_idx.get(target) + if timeframe and token: + start = timeframe.properties['start'] + end = timeframe.properties['end'] + text = token.properties['word'] + return start, end, text - # Generate pages (necessary to reduce IO cost) and render - frames_list = [(k, vars(v)) for k, v in ocr_frames.items()] - frames_list = find_duplicates(frames_list) - frames_pages = paginate(frames_list) - # Save page list as temp file - save_json(frames_pages, view.id, mmif_id) - return render_ocr(mmif_id, vid_path, view.id, 0) + +def format_time(time, unit): + """ + Formats a time in seconds as a string in the format "hh:mm:ss.fff" + VTT specifically requires timestamps expressed in miliseconds and + must be be in one of these formats: mm:ss.ttt or hh:mm:ss.ttt + (https://developer.mozilla.org/en-US/docs/Web/API/WebVTT_API) + ISO format can have up to 6 below the decimal point, on the other hand + """ + if unit == "seconds": + time_in_ms = int(time * 1000) + else: + time_in_ms = int(time) + hours = time_in_ms // (1000 * 60 * 60) + time_in_ms %= (1000 * 60 * 60) + minutes = time_in_ms // (1000 * 60) + time_in_ms %= (1000 * 60) + seconds = time_in_ms // 1000 + time_in_ms %= 1000 + return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{time_in_ms:03d}"