Adding support for news items

glenrobson · glenrobson · commit 969de0faffe8 · 2024-09-19T19:16:45.000+01:00
diff --git a/iiify/app.py b/iiify/app.py
@@ -3,12 +3,12 @@
 import os
 import time
 import requests
-from flask import Flask, send_file, jsonify, abort, request, render_template, redirect
+from flask import Flask, send_file, jsonify, abort, request, render_template, redirect, make_response
 from flask_cors import CORS
 from flask_caching import Cache
 from iiif2 import iiif, web
 from .resolver import ia_resolver, create_manifest, create_manifest3, getids, collection, \
-    purify_domain, cantaloupe_resolver, create_collection3, IsCollection, create_annotations
+    purify_domain, cantaloupe_resolver, create_collection3, IsCollection, create_annotations, create_vtt_stream
 from .configs import options, cors, approot, cache_root, media_root, \
     cache_expr, version, image_server, cache_timeouts
 from urllib.parse import quote
@@ -197,6 +197,13 @@ def annnotations(version, identifier, fileName, canvas_no):
     domain = purify_domain(request.args.get('domain', request.url_root))
     return ldjsonify(create_annotations(version, identifier, fileName, canvas_no, domain=domain))
 
+@app.route('/iiif/vtt/streaming/<identifier>.vtt')
+@cache.cached(timeout=cache_timeouts["long"], forced_update=cache_bust)
+def vtt_stream(identifier):  # serves the caption track built by create_vtt_stream for one item
+    captions = make_response(create_vtt_stream(identifier))
+    captions.headers['Content-Type'] = 'text/vtt'  # label the body as WebVTT rather than the default type
+    return captions
+
 @app.route('/iiif/<identifier>/manifest.json')
 @cache.cached(timeout=cache_timeouts["long"], forced_update=cache_bust)
 def manifest(identifier):
diff --git a/iiify/resolver.py b/iiify/resolver.py
@@ -12,6 +12,7 @@
 import math 
 import re
 import xml.etree.ElementTree as ET
+from datetime import timedelta
 
 IMG_CTX = 'http://iiif.io/api/image/2/context.json'
 PRZ_CTX = 'http://iiif.io/api/presentation/2/context.json'
@@ -658,77 +659,131 @@ def create_manifest3(identifier, domain=None, page=None):
                     vttfiles[sourceFilename] = []    
                     
                 vttfiles[sourceFilename].append(f)    
-            
-        # create the canvases for each original
-        for file in [f for f in originals if f['format'] in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']]:
-            normalised_id = file['name'].rsplit(".", 1)[0]
+
+        if 'access-restricted-item' in metadata['metadata'] and metadata['metadata']['access-restricted-item']:
+            # this is a news item so has to be treated differently
+            # https://ia801803.us.archive.org/29/items/CSPAN3_20180217_164800_Poplar_Forest_Archaeology/CSPAN3_20180217_164800_Poplar_Forest_Archaeology.mp4?start=0&end=360&ignore=x.mp4&cnt=0
+            mp4File = None
+            duration = 0.0
+            filedata = None
+            for file in metadata['files']:
+                if file['name'].endswith('.mp4'):
+                    mp4File = file['name']
+                    duration = float(file['length'])
+                    filedata = file
+
+            normalised_id = mp4File.rsplit(".", 1)[0]
             slugged_id = normalised_id.replace(" ", "-")
             c_id = f"{URI_PRIFIX}/{identifier}/{slugged_id}/canvas"
-            c = Canvas(id=c_id, label=normalised_id, duration=float(file['length']), height=int(file['height']), width=int(file['width']))
-
-            # Add vtt if present
-            if vttfiles and normalised_id in vttfiles:
-                vttAPId = f"{URI_PRIFIX}/{identifier}/{slugged_id}/vtt"
-
-                vttNo = 1
-                for vttFile in vttfiles[normalised_id]:
-                    vtAnno = c.make_annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/vtt/{vttNo}", 
-                                               motivation="supplementing", 
-                                               target=c.id, 
-                                               anno_page_id=vttAPId,
-                                               body={"id": f"{domain}resource/{identifier}/{vttFile['name']}",
-                                                     "type": "Text",
-                                                    "format": "text/vtt",
-                                                    })
-                    # add label and language
-                    if vttFile['name'].endswith("autogenerated.vtt"):
-                        vtAnno.body.label = { 'en': ['autogenerated']}
-                    else:
-                        # Assume language
-                        splitName = vttFile['name'].split(".")
-                        lang = splitName[-2]
-                        vtAnno.body.add_label(lang, language="none")
-                        vtAnno.body.language = lang
-
-                    vttNo += 1
-
-            # create intermediary objects
+            c = Canvas(id=c_id, label=normalised_id, duration=duration, height=int(filedata['height']), width=int(filedata['width']))        
             ap = AnnotationPage(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/page")
-            anno = Annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation", motivation="painting", target=c.id)
 
-            # create body based on whether there are derivatives or not:
-            if file['name'] in derivatives:
-                body = Choice(items=[])
-                # add the choices in order per https://github.com/ArchiveLabs/iiif.archivelab.org/issues/77#issuecomment-1499672734
-                for format in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']:
-                    if format in derivatives[file['name']]:
-                        r = ResourceItem(id=f"https://archive.org/download/{identifier}/{derivatives[file['name']][format]['name'].replace(' ', '%20')}",
-                                         type='Video',
-                                         format=to_mimetype(format),
-                                         label={"none": [format]},
-                                         duration=float(file['length']), 
-                                         height=int(file['height']),
-                                         width=int(file['width']),                      
-                        )
-                        body.items.append(r)
-                    elif file['format'] == format:
-                        r = ResourceItem(
-                            id=f"https://archive.org/download/{identifier}/{file['name'].replace(' ', '%20')}",
-                            type='Video',
-                            format=to_mimetype(format),
-                            label={"none": [format]},
-                            duration=float(file['length']),
-                            height=int(file['height']),
-                            width=int(file['width']))
-                        body.items.append(r)
-            else:
-                # todo: deal with instances where there are no derivatives for whatever reason
-                pass
-
-            anno.body = body
-            ap.add_item(anno)
+            vttAPId = f"{URI_PRIFIX}/{identifier}/{slugged_id}/vtt"
+            vtAnno = c.make_annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/vtt/streamed", 
+                                                motivation="supplementing", 
+                                                target=c.id, 
+                                                anno_page_id=vttAPId,
+                                                body={"id": f"{domain}vtt/streaming/{identifier}.vtt",
+                                                        "type": "Text",
+                                                        "format": "text/vtt",
+                                                        })
+
+            segments = math.floor(duration / 60)
+            for i in range(segments):
+                start = i * 60
+                if i == segments - 1:
+                    end = int(duration)
+                else:
+                    end = (i + 1) * 60
+
+                #print (f"Start: {start} End: {end}, Duration: {float(end) - float(start)} full duration: {duration}")
+                anno = Annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/{i}", motivation="painting", target=f"{c.id}#t={start},{end}")
+                streamurl = f"https://{metadata['server']}{metadata['dir']}/{mp4File}?start={start}&end={end}&ignore=x.mp4&cnt=0"        
+                body = ResourceItem(id=streamurl,
+                                    type='Video',
+                                    format="video/mp4",
+                                    label={"en": [f"Part {i + 1} of {segments}"]},
+                                    duration=end - start, 
+                                    height=int(filedata['height']),
+                                    width=int(filedata['width']),                      
+                                )
+
+                anno.body = body
+                ap.add_item(anno)
+            
             c.add_item(ap)
             manifest.add_item(c)
+        else:
+            # create the canvases for each original
+            for file in [f for f in originals if f['format'] in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']]:
+                normalised_id = file['name'].rsplit(".", 1)[0]
+                slugged_id = normalised_id.replace(" ", "-")
+                c_id = f"{URI_PRIFIX}/{identifier}/{slugged_id}/canvas"
+                c = Canvas(id=c_id, label=normalised_id, duration=float(file['length']), height=int(file['height']), width=int(file['width']))
+
+                # Add vtt if present
+                if vttfiles and normalised_id in vttfiles:
+                    vttAPId = f"{URI_PRIFIX}/{identifier}/{slugged_id}/vtt"
+
+                    vttNo = 1
+                    for vttFile in vttfiles[normalised_id]:
+                        vtAnno = c.make_annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/vtt/{vttNo}", 
+                                                motivation="supplementing", 
+                                                target=c.id, 
+                                                anno_page_id=vttAPId,
+                                                body={"id": f"{domain}resource/{identifier}/{vttFile['name']}",
+                                                        "type": "Text",
+                                                        "format": "text/vtt",
+                                                        })
+                        # add label and language
+                        if vttFile['name'].endswith("autogenerated.vtt"):
+                            vtAnno.body.label = { 'en': ['autogenerated']}
+                        else:
+                            # Assume language
+                            splitName = vttFile['name'].split(".")
+                            lang = splitName[-2]
+                            vtAnno.body.add_label(lang, language="none")
+                            vtAnno.body.language = lang
+
+                        vttNo += 1
+
+                # create intermediary objects
+                ap = AnnotationPage(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/page")
+                anno = Annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation", motivation="painting", target=c.id)
+
+                # create body based on whether there are derivatives or not:
+                if file['name'] in derivatives:
+                    body = Choice(items=[])
+                    # add the choices in order per https://github.com/ArchiveLabs/iiif.archivelab.org/issues/77#issuecomment-1499672734
+                    for format in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']:
+                        if format in derivatives[file['name']]:
+                            r = ResourceItem(id=f"https://archive.org/download/{identifier}/{derivatives[file['name']][format]['name'].replace(' ', '%20')}",
+                                            type='Video',
+                                            format=to_mimetype(format),
+                                            label={"none": [format]},
+                                            duration=float(file['length']), 
+                                            height=int(file['height']),
+                                            width=int(file['width']),                      
+                            )
+                            body.items.append(r)
+                        elif file['format'] == format:
+                            r = ResourceItem(
+                                id=f"https://archive.org/download/{identifier}/{file['name'].replace(' ', '%20')}",
+                                type='Video',
+                                format=to_mimetype(format),
+                                label={"none": [format]},
+                                duration=float(file['length']),
+                                height=int(file['height']),
+                                width=int(file['width']))
+                            body.items.append(r)
+                else:
+                    # todo: deal with instances where there are no derivatives for whatever reason
+                    pass
+
+                anno.body = body
+                ap.add_item(anno)
+                c.add_item(ap)
+                manifest.add_item(c)
     elif mediatype == "collection":
         raise IsCollection
     else:
@@ -785,6 +840,73 @@ def create_annotations(version, identifier, fileName, canvas_no, domain=None):
 
     return json.loads(annotationPage.jsonld())
 
+def create_vtt_stream(identifier):
+    """
+    Build one WebVTT document for an item by fetching its SRT captions a slice at a time from URLs such as:
+    https://archive.org/download/CSPAN3_20180217_164800_Poplar_Forest_Archaeology/CSPAN3_20180217_164800_Poplar_Forest_Archaeology.cc5.srt?t=0/360
+    The t=start/end values are seconds. Cue times are shifted by the slice start; slices that do not return 200 are skipped.
+    """
+
+    metadata = requests.get('%s/metadata/%s' % (ARCHIVE, identifier)).json()
+    filename = ""
+    duration = 0.0
+    for file in metadata['files']:
+        if file['name'].endswith('.mpg') and file['source'] == 'original':
+            duration = float(file['length'])
+        # There can be several srt files and it is unclear how they differ, so the last one listed is used
+        if file['name'].endswith('.srt'):
+            filename = file['name']
+
+    # Initialize the vtt content with the WEBVTT header
+    vtt_content = ["WEBVTT\n"]
+
+    segments = math.floor(duration / 60) if filename else 0  # no srt file: return the header only
+    for i in range(segments):
+        start = i * 60
+        if i == segments - 1:
+            end = int(duration)  # the last slice runs to the (truncated) end of the item
+        else:
+            end = (i + 1) * 60
+
+        # Slice timestamps are treated as relative to the slice start, hence the offset added below
+        response = requests.get(f"https://archive.org/download/{identifier}/{filename}?t={start}/{end}")
+
+        if response.status_code == 200:
+            # Get the content of the SRT file as a string
+            srt_content = response.text
+            # Split the srt file by lines
+            lines = srt_content.splitlines()
+            for line in lines:
+                # Rewrite timing lines: 00:00:00,000 --> 00:00:01,000 becomes 00:00:00.000 --> 00:00:01.000
+                if "-->" in line:
+                    splitline = line.split("-->")
+                    starttime = timeToDelta(splitline[0].strip()) + timedelta(seconds=start)
+                    endtime = timeToDelta(splitline[1].strip()) + timedelta(seconds=start)
+                    line = f"{formatTimeVTT(starttime)} --> {formatTimeVTT(endtime)}"
+
+                vtt_content.append(line)
+
+            vtt_content.append("")
+
+    # Join the list into a single string
+    return "\n".join(vtt_content)
+
+def formatTimeVTT(time):
+    total = time.days * 86400 + time.seconds  # whole seconds of the timedelta; milliseconds come from .microseconds
+    hh, mm, ss = total // 3600, total % 3600 // 60, total % 60
+    return f"{hh:02}:{mm:02}:{ss:02}.{time.microseconds // 1000:03}"
+
+def timeToDelta(time):
+    """
+    Convert an SRT formatted time (HH:MM:SS,mmm) to a timedelta.
+    A '.' before the milliseconds is accepted as well as the standard ','.
+    """
+    parts = time.strip().replace(".", ",").split(",")
+    hour, minute, second = (int(field) for field in parts[0].split(":")[:3])
+    # a timestamp without a fractional part counts as zero milliseconds
+    milliseconds = int(parts[1]) if len(parts) > 1 else 0
+    return timedelta(hours=hour, minutes=minute, seconds=second, milliseconds=milliseconds)
+
 def coerce_list(value):
     if isinstance(value, list):
         return ". ".join(value)
diff --git a/tests/test_video.py b/tests/test_video.py
@@ -2,6 +2,7 @@
 os.environ["FLASK_CACHE_DISABLE"] = "true"
 
 import unittest
+import math
 from flask.testing import FlaskClient
 from iiify.app import app
 
@@ -66,6 +67,34 @@ def test_vtt_multilingual(self):
             if item['body']['language'] == 'cy':
                 self.assertEqual(item['body']['id'], 'https://localhost/iiif/resource/cruz-test/cruz-test.cy.vtt', 'Unexpected link for the Welsh vtt file')
 
+    def test_newsitem(self):  # checks the one-minute video segments and the streamed vtt of a news item
+        resp = self.test_app.get("/iiif/3/CSPAN3_20180217_164800_Poplar_Forest_Archaeology/manifest.json")
+        self.assertEqual(resp.status_code, 200)
+        manifest = resp.json
+
+        canvas = manifest['items'][0]
+        annoPages = canvas['items'][0]
+        annotations = annoPages['items']
+        self.assertEqual(len(annotations), math.floor(780.89 / 60), 'Expected the video to contain the 13min video split into 1 minute segments')
+
+        # Check vtt file
+        self.assertIn('annotations', canvas, "Expected canvas to have annotations")
+        vttFile = canvas['annotations'][0]['items'][0]['body']['id']
+        self.assertTrue(vttFile.endswith("/iiif/vtt/streaming/CSPAN3_20180217_164800_Poplar_Forest_Archaeology.vtt"),f"Expected vttFile to be located at /iiif/vtt/streaming/CSPAN3_20180217_164800_Poplar_Forest_Archaeology.vtt but found it at {vttFile}")
+
+        resp = self.test_app.get("/iiif/vtt/streaming/CSPAN3_20180217_164800_Poplar_Forest_Archaeology.vtt")
+        self.assertEqual(resp.status_code, 200, "Expected the streamed vtt file to be served")
+        lines = resp.text.split("\n")
+        cueIndex = next((i for i, line in enumerate(lines) if line.strip() == "28"), None)
+        self.assertIsNotNone(cueIndex, "Expected to find cue 28 in the streamed vtt file")
+        # Only the start of cue 28 is pinned: it sits in the second video, so it must carry the 60 second offset
+        # 28
+        # 00:01:02.000 ...
+        # I AM THE DIRECTOR OF ARCHAEOLOGY
+        timecode = lines[cueIndex + 1]
+        self.assertTrue(timecode.startswith("00:01:02.000 "), f"Expected the timecode to be over a minute as its the second video but found {timecode}")
+
+
 
 if __name__ == '__main__':
     unittest.main()