1
1
import os
2
2
import sys
3
- import json
4
3
import secrets
4
+ import html
5
+ import datetime
6
+ import ast
5
7
6
- from io import StringIO
7
- from string import Template
8
-
9
- import displacy
10
- import requests
11
- import tempfile
12
-
13
- from flask import Flask , request , render_template , flash , redirect
8
+ from flask import request , render_template , flash , redirect , send_from_directory
14
9
from werkzeug .utils import secure_filename
10
+ from mmif .serialize import Mmif
15
11
16
- from mmif .serialize import Mmif , View
17
- from mmif .vocabulary import AnnotationTypes
18
- from mmif .vocabulary import DocumentTypes
19
- from lapps .discriminators import Uri
20
-
21
-
22
- # these two static folder-related params are important, do not remove
23
- app = Flask (__name__ , static_folder = 'static' , static_url_path = '' )
12
+ from utils import app , render_ocr , get_media , prep_annotations
24
13
25
14
@app.route('/')
def index():
    """Render the 'index.html' template for the root URL."""
    landing_page = render_template('index.html')
    return landing_page
28
17
18
@app.route('/ocrpage', methods=['POST'])
def ocrpage():
    """Render one page of OCR results from the posted form data.

    Reads three form fields: ``frames_pages`` (an HTML-escaped Python
    literal), ``page_number`` (parsed as an integer) and ``vid_path``, and
    hands them to ``render_ocr``.

    Returns:
        The result of ``render_ocr`` on success; otherwise a small HTML
        fragment describing the error.
    """
    data = request.form
    try:
        # The form is client-controlled, so parse the literal with
        # ast.literal_eval rather than executing it with eval().
        # NOTE(review): assumes frames_pages only ever contains plain
        # literals (lists/dicts/numbers/strings) - confirm against the template.
        frames_pages = ast.literal_eval(html.unescape(data['frames_pages']))
        page_number = int(data['page_number'])
        return render_ocr(data['vid_path'], frames_pages, page_number)
    except Exception as e:
        # Request boundary: report the problem to the page instead of a 500.
        # .get() so a missing field cannot raise again inside the handler.
        print(html.unescape(data.get('frames_pages', '')))
        # Escape before embedding: str(type(e)) contains '<' and '>', and the
        # message may echo request data.
        error_type = html.escape(str(type(e)))
        error_text = html.escape(str(e))
        return f'<p class="error">Unexpected error of type {error_type}: {error_text}</p>'
29
31
30
32
@app .route ('/upload' , methods = ['GET' , 'POST' ])
31
33
def upload ():
@@ -52,6 +54,11 @@ def upload():
52
54
return render_template ('upload.html' )
53
55
54
56
57
@app.route('/uv/<path:path>')
def send_js(path):
    """Serve the file at `path` from the "uv" directory."""
    uv_directory = "uv"
    return send_from_directory(uv_directory, path)
60
+
61
+
55
62
def render_mmif (mmif_str ):
56
63
mmif = Mmif (mmif_str )
57
64
media = get_media (mmif )
@@ -60,270 +67,18 @@ def render_mmif(mmif_str):
60
67
mmif = mmif , media = media , annotations = annotations )
61
68
62
69
63
def view_to_vtt(alignment_view):
    """Write the view's alignments as a VTT file and return its short path.

    The returned string consists of the last two components of the written
    file's name (parent directory and file name), joined with ``os.sep``.
    """
    written = get_alignments(alignment_view)
    path_parts = written.name.split(os.sep)
    tail = path_parts[-2:]
    return os.sep.join(tail)
67
-
68
-
69
def _vtt_timestamp(milliseconds):
    """Format an integer millisecond offset as a WebVTT timestamp.

    Produces ``mm:ss.ttt``, or ``hh:mm:ss.ttt`` once the offset reaches one
    hour; every field is zero-padded.
    """
    seconds, millis = divmod(milliseconds, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    if hours:
        return f'{hours:02d}:{minutes:02d}:{seconds:02d}.{millis:03d}'
    return f'{minutes:02d}:{seconds:02d}.{millis:03d}'


def get_alignments(alignment_view):
    """Write the token/time-frame alignments of a view to a VTT file.

    Tokens are grouped into cues of nine words. Each cue spans from the start
    of its first token to the end of its last one.

    Args:
        alignment_view: object whose ``annotations`` are iterated; each
            annotation is read through ``id``, ``at_type`` and ``properties``.

    Returns:
        The (already closed) named temporary file object created under
        ``static/``; its ``name`` attribute holds the path of the VTT file.
    """
    annotations = alignment_view.annotations
    # TODO: wanted to use "mmif.get_alignments(AnnotationTypes.TimeFrame, Uri.TOKEN)"
    # but that gave errors so I gave up on it
    token_idx = {a.id: a for a in annotations if str(a.at_type).endswith('Token')}
    timeframe_idx = {a.id: a for a in annotations if str(a.at_type).endswith('TimeFrame')}
    alignments = [a for a in annotations if str(a.at_type).endswith('Alignment')]
    words_per_cue = 9

    # The context manager flushes and closes the file, so callers that re-open
    # it by name see the complete content; delete=False keeps it on disk.
    with tempfile.NamedTemporaryFile('w', dir="static/", suffix='.vtt', delete=False) as vtt_file:
        vtt_file.write("WEBVTT\n\n")
        cue_start = None
        cue_end = None
        texts = []
        for alignment in alignments:
            start_end_text = build_alignment(alignment, token_idx, timeframe_idx)
            if start_end_text is None:
                continue
            # Assuming here that start and end are in milliseconds, which is
            # what VTT timestamps are built from.
            start, end, text = start_end_text
            if cue_start is None:
                cue_start = start
            cue_end = end
            texts.append(text)
            if len(texts) >= words_per_cue:
                vtt_file.write(
                    f'{_vtt_timestamp(cue_start)} --> {_vtt_timestamp(cue_end)}\n{" ".join(texts)}\n\n')
                cue_start = None
                texts = []
        if texts:
            # Emit the final, shorter cue instead of dropping the last words.
            vtt_file.write(
                f'{_vtt_timestamp(cue_start)} --> {_vtt_timestamp(cue_end)}\n{" ".join(texts)}\n\n')
    return vtt_file
100
-
101
-
102
def build_alignment(alignment, token_idx, timeframe_idx):
    """Resolve one alignment into a ``(start, end, word)`` triple.

    The alignment's ``source`` property is looked up in `timeframe_idx` and
    its ``target`` property in `token_idx`. Returns ``None`` when either
    lookup yields nothing (or a falsy value).
    """
    props = alignment.properties
    token_id, frame_id = props['target'], props['source']
    frame = timeframe_idx.get(frame_id)
    word = token_idx.get(token_id)
    if not (frame and word):
        return None
    frame_props = frame.properties
    return frame_props['start'], frame_props['end'], word.properties['word']
112
-
113
-
114
def html_video(vpath, vtt_srcview=None):
    """Build the ``<video>`` element for `vpath` as an HTML string.

    When `vtt_srcview` is given, a VTT file is generated from it and attached
    as an English subtitle track.
    """
    vpath = url2posix(vpath)
    parts = ["<video controls>\n", f'    <source src=\"{vpath}\">\n']
    if vtt_srcview is not None:
        # use only basename because "static" directory is mapped to '' route by
        # `static_url_path` param
        src = os.path.basename(view_to_vtt(vtt_srcview))
        parts.append(f'    <track kind="subtitles" srclang="en" src="{src}" default>\n')
    parts.append("</video>\n")
    return "".join(parts)
127
-
128
-
129
def html_text(tpath):
    """Return the content of the text document with ``<br/>`` line breaks added."""
    if not os.path.isfile(tpath):
        # This is to fix a problem when running this from a local machine where
        # /data/text may not be available (it always is available from the
        # container). The same problem occurs in displacy/__init__.py.
        # A 'file:///' prefix is dropped entirely; otherwise only the first
        # character is dropped (this should not happen anymore, but keeping it
        # anyway). The remainder is resolved under the app's static folder.
        prefix_len = 8 if tpath.startswith('file:///') else 1
        tpath = os.path.join(app.root_path, 'static', tpath[prefix_len:])
    with open(tpath) as t_file:
        raw = t_file.read()
    return "{}\n".format(raw.replace("\n", "<br/>\n"))
145
-
146
-
147
def html_img(ipath, boxes=None):
    """Fill the image template with the image path and its bounding boxes.

    Args:
        ipath: location of the image; a ``file:///`` prefix is stripped.
        boxes: optional list of boxes substituted into the template;
            defaults to an empty list.

    Returns:
        The content of ``templates/image.html`` with ``filename`` and
        ``boxes`` substituted.
    """
    ipath = url2posix(ipath)
    boxes = [] if boxes is None else boxes
    # Read through a context manager so the template file handle is closed.
    with open('templates/image.html') as template_file:
        t = Template(template_file.read())
    return t.substitute(filename=ipath, boxes=boxes)
152
-
153
-
154
def html_audio(apath):
    """Return an ``<audio>`` element that plays the file at `apath`.

    A ``file:///`` prefix is stripped from the path first.
    """
    apath = url2posix(apath)
    # Quote the attribute value: an unquoted src ends at the first space.
    return f'<audio controls src="{apath}"></audio>'
157
-
158
-
159
def url2posix(path):
    """Turn a ``file:///`` URL into a POSIX path.

    For the visualizer we often want a POSIX path and not a URL, so the
    protocol is stripped if there is one; the leading slash of the path is
    kept. Any other string is returned unchanged.
    """
    scheme = 'file://'
    return path[len(scheme):] if path.startswith(scheme + '/') else path
165
-
166
-
167
def _bounding_boxes(mmif):
    """Collect ``[id, boxType, [x, y, w, h]]`` entries for the image view."""
    # TODO: this gives you the last view with BoundingBoxes, should
    # perhaps use get_views_contain() instead, should also select just
    # the bounding boxes and add information from alignments to text
    # documents
    tbox_view = mmif.get_view_contains(str(AnnotationTypes.BoundingBox))
    if tbox_view is None:
        # NOTE(review): guards the case where no such view is returned -
        # confirm what get_view_contains yields when nothing matches.
        return []
    boxes = []
    for a in tbox_view.annotations:
        # For the boxes we pull some information from the annotation: the
        # identifier, boxType and the (x,y,w,h) coordinates used by the
        # Javascript code that draws the rectangle.
        coordinates = a.properties["coordinates"]
        x, y = coordinates[0][0], coordinates[0][1]
        w = coordinates[1][0] - x
        h = coordinates[2][1] - y
        boxes.append([a.properties["id"], a.properties["boxType"], [x, y, w, h]])
    return boxes


def get_media(mmif):
    """Build the visualization for every document in the MMIF object.

    Returns a list of tuples, one for each element in the documents list of
    the MMIF object, following the order in that list. Each tuple has four
    elements: document type, document identifier, document path and the HTML
    visualization. Documents of an unrecognized type get an empty
    visualization.
    """
    media = []
    for document in mmif.documents:
        doc_type = get_document_type_short_form(document)
        doc_path = document.location
        print('>>>', doc_path)
        if doc_type == 'Text':
            visualization = html_text(doc_path)
        elif doc_type == 'Video':
            fa_views = get_alignment_views(mmif)
            fa_view = fa_views[0] if fa_views else None
            visualization = html_video(doc_path, fa_view)
        elif doc_type == 'Audio':
            visualization = html_audio(doc_path)
        elif doc_type == 'Image':
            visualization = html_img(doc_path, _bounding_boxes(mmif))
        else:
            # Without this branch the name would be unbound on the first
            # document, or silently reuse the previous document's HTML.
            visualization = ''
        media.append((doc_type, document.id, doc_path, visualization))
    return media
207
-
208
-
209
def get_document_type_short_form(document):
    """Return the short document type, e.g. 'Video', 'Text', 'Audio' or 'Image'.

    Takes the last path component of the document's ``at_type`` and drops its
    final eight characters (the length of the 'Document' suffix).
    """
    type_name = os.path.basename(str(document.at_type))
    suffix_length = len('Document')
    return type_name[:-suffix_length]
214
-
215
-
216
def prep_annotations(mmif):
    """Prepare annotations from the views as a list of (tabname, content) pairs.

    The first tab is always the full MMIF pretty print. It is followed by one
    "WEBVTT" tab per alignment view, then one tab per non-empty NER view and
    one per non-empty OCR view. The view id is appended to the NER/OCR tab
    name only when there is more than one view of that kind.
    """
    tabs = [("MMIF", "<pre>" + mmif.serialize(pretty=True) + "</pre>")]
    # TODO: since this uses the same tab-name this will only show the same
    # stuff; it does a loop but for now we assume there is just one file with
    # alignments (generated by Kaldi)
    for fa_view in get_alignment_views(mmif):
        vtt_file = view_to_vtt(fa_view)
        # Context manager so the VTT file handle is closed after reading.
        with open(vtt_file) as vtt:
            tabs.append(("WEBVTT", '<pre>' + vtt.read() + '</pre>'))
    ner_views = get_ner_views(mmif)
    use_id = len(ner_views) > 1
    for ner_view in ner_views:
        if not ner_view.annotations:
            continue
        visualization = create_ner_visualization(mmif, ner_view)
        tabname = "Entities-%s" % ner_view.id if use_id else "Entities"
        tabs.append((tabname, visualization))
    # TODO: somewhat hackish
    ocr_views = get_ocr_views(mmif)
    use_id = len(ocr_views) > 1
    for ocr_view in ocr_views:
        if not ocr_view.annotations:
            continue
        visualization = create_ocr_visualization(mmif, ocr_view)
        tabname = "OCR-%s" % ocr_view.id if use_id else "OCR"
        tabs.append((tabname, visualization))
    return tabs
244
-
245
-
246
def create_ner_visualization(mmif, view):
    """Return the displacy NER visualization for `view`, or None on KeyError.

    The first document id collected for the view's named entities is passed
    to ``displacy.visualize_ner`` together with the app's root path.
    """
    try:
        # all the view's named entities refer to the same text document (kaldi)
        document_ids = get_document_ids(view, Uri.NE)
        return displacy.visualize_ner(mmif, view, document_ids[0], app.root_path)
    except KeyError:
        # the view's entities refer to more than one text document (tesseract)
        return None
255
-
256
-
257
def create_ocr_visualization(mmif, view):
    """Return the text of the view's TextDocument annotations inside ``<pre>``.

    Each annotation whose type ends in 'TextDocument' contributes one line:
    its ``_value`` property with all whitespace runs collapsed to single
    spaces. Annotations without a usable ``_value`` and empty texts are
    skipped.

    Args:
        mmif: unused; kept so the signature matches the other visualizers.
        view: object whose ``annotations`` are iterated.
    """
    # TODO: the text boxes had no timePoint so I could not create a VTT
    # TODO: no app in the metadata
    lines = []
    for anno in view.annotations:
        # str() so that non-string type objects are matched as well.
        if not str(anno.at_type).endswith('TextDocument'):
            continue
        try:
            # TODO: this is a hack because the text documents do not have a text
            # field, instead they have an @value field
            cleaned = ' '.join(anno.properties['_value'].split())
        except (KeyError, TypeError, AttributeError):
            # Best effort: skip annotations with a missing or non-text value.
            continue
        if cleaned:
            lines.append(cleaned + '\n')
    return '<pre>' + ''.join(lines) + '</pre>'
274
-
275
-
276
def get_document_ids(view, annotation_type):
    """Return the distinct document ids referenced for `annotation_type`.

    Ids come from the ``document`` entry of the view metadata for that type
    (if present) and from the ``document`` property of every annotation whose
    type ends with `annotation_type`. The result keeps first-seen order, with
    the metadata id first, so indexing into it is deterministic.
    """
    metadata = view.metadata.contains.get(annotation_type)
    # A dict is used as an insertion-ordered set.
    ids = {}
    # The view may carry no metadata at all for this type.
    if metadata is not None and 'document' in metadata:
        ids[metadata['document']] = None
    type_suffix = str(annotation_type)
    for annotation in view.annotations:
        if not str(annotation.at_type).endswith(type_suffix):
            continue
        try:
            ids[annotation.properties["document"]] = None
        except KeyError:
            # Annotation does not name a document; nothing to record.
            pass
    return list(ids)
286
-
287
-
288
def get_alignment_views(mmif):
    """Return the views that contain at least TextDocument, Token, TimeFrame
    and Alignment annotations, in the order they appear in the MMIF object."""
    required = {'TextDocument', 'Token', 'TimeFrame', 'Alignment'}

    def short_names(view):
        # Only the last path component of each annotation type is compared.
        return {os.path.split(str(at))[-1] for at in view.metadata.contains.keys()}

    return [view for view in mmif.views if required <= short_names(view)]
299
-
300
-
301
def get_ocr_views(mmif):
    """Return OCR views: those whose metadata lists TextDocument and Alignment
    annotations and no other annotation types."""
    # TODO: not sure why we use the full URL
    vocabulary = "http://mmif.clams.ai/0.2.1/vocabulary/"
    needed_types = {vocabulary + "TextDocument", vocabulary + "Alignment"}
    ocr_views = []
    for candidate in mmif.views:
        declared = candidate.metadata.contains.keys()
        if not needed_types.issubset(declared):
            continue
        if len(declared) != 2:
            continue
        ocr_views.append(candidate)
    return ocr_views
314
-
315
-
316
def get_ner_views(mmif):
    """Return the views whose metadata lists the named-entity type (Uri.NE)."""
    ner_views = []
    for candidate in mmif.views:
        if Uri.NE in candidate.metadata.contains:
            ner_views.append(candidate)
    return ner_views
318
-
319
-
320
70
# Not sure what this was for, it had a route /display, but that did not work
321
71
# def display_file():
322
72
# mmif_str = requests.get(request.args["file"]).text
323
73
# return display_mmif(mmif_str)
324
74
325
75
326
76
if __name__ == '__main__' :
77
+ # Make path for temp files
78
+ tmp_path = '/app/static/tmp'
79
+ if not os .path .exists (tmp_path ):
80
+ os .makedirs (tmp_path )
81
+
327
82
328
83
# to avoid runtime errors for missing keys when using flash()
329
84
alphabet = 'abcdefghijklmnopqrstuvwxyz1234567890'
0 commit comments