Skip to content

Commit 969de0f

Browse files
committed
Adding support for news items
1 parent 0d9a212 commit 969de0f

File tree

3 files changed

+225
-67
lines changed

3 files changed

+225
-67
lines changed

iiify/app.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@
33
import os
44
import time
55
import requests
6-
from flask import Flask, send_file, jsonify, abort, request, render_template, redirect
6+
from flask import Flask, send_file, jsonify, abort, request, render_template, redirect, make_response
77
from flask_cors import CORS
88
from flask_caching import Cache
99
from iiif2 import iiif, web
1010
from .resolver import ia_resolver, create_manifest, create_manifest3, getids, collection, \
11-
purify_domain, cantaloupe_resolver, create_collection3, IsCollection, create_annotations
11+
purify_domain, cantaloupe_resolver, create_collection3, IsCollection, create_annotations, create_vtt_stream
1212
from .configs import options, cors, approot, cache_root, media_root, \
1313
cache_expr, version, image_server, cache_timeouts
1414
from urllib.parse import quote
@@ -197,6 +197,13 @@ def annnotations(version, identifier, fileName, canvas_no):
197197
domain = purify_domain(request.args.get('domain', request.url_root))
198198
return ldjsonify(create_annotations(version, identifier, fileName, canvas_no, domain=domain))
199199

200+
@app.route('/iiif/vtt/streaming/<identifier>.vtt')
@cache.cached(timeout=cache_timeouts["long"], forced_update=cache_bust)
def vtt_stream(identifier):
    """Serve the caption track built by create_vtt_stream for an item.

    The response is cached with the "long" cache timeout and is sent with a
    ``text/vtt`` content type.
    """
    vtt_body = create_vtt_stream(identifier)
    vtt_response = make_response(vtt_body)
    vtt_response.headers['Content-Type'] = 'text/vtt'
    return vtt_response
206+
200207
@app.route('/iiif/<identifier>/manifest.json')
201208
@cache.cached(timeout=cache_timeouts["long"], forced_update=cache_bust)
202209
def manifest(identifier):

iiify/resolver.py

Lines changed: 187 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import math
1313
import re
1414
import xml.etree.ElementTree as ET
15+
from datetime import timedelta
1516

1617
IMG_CTX = 'http://iiif.io/api/image/2/context.json'
1718
PRZ_CTX = 'http://iiif.io/api/presentation/2/context.json'
@@ -658,77 +659,131 @@ def create_manifest3(identifier, domain=None, page=None):
658659
vttfiles[sourceFilename] = []
659660

660661
vttfiles[sourceFilename].append(f)
661-
662-
# create the canvases for each original
663-
for file in [f for f in originals if f['format'] in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']]:
664-
normalised_id = file['name'].rsplit(".", 1)[0]
662+
663+
if 'access-restricted-item' in metadata['metadata'] and metadata['metadata']['access-restricted-item']:
664+
# this is a news item so has to be treated differently
665+
# https://ia801803.us.archive.org/29/items/CSPAN3_20180217_164800_Poplar_Forest_Archaeology/CSPAN3_20180217_164800_Poplar_Forest_Archaeology.mp4?start=0&end=360&ignore=x.mp4&cnt=0
666+
mp4File = None
667+
duration = 0.0
668+
filedata = None
669+
for file in metadata['files']:
670+
if file['name'].endswith('.mp4'):
671+
mp4File = file['name']
672+
duration = float(file['length'])
673+
filedata = file
674+
675+
normalised_id = mp4File.rsplit(".", 1)[0]
665676
slugged_id = normalised_id.replace(" ", "-")
666677
c_id = f"{URI_PRIFIX}/{identifier}/{slugged_id}/canvas"
667-
c = Canvas(id=c_id, label=normalised_id, duration=float(file['length']), height=int(file['height']), width=int(file['width']))
668-
669-
# Add vtt if present
670-
if vttfiles and normalised_id in vttfiles:
671-
vttAPId = f"{URI_PRIFIX}/{identifier}/{slugged_id}/vtt"
672-
673-
vttNo = 1
674-
for vttFile in vttfiles[normalised_id]:
675-
vtAnno = c.make_annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/vtt/{vttNo}",
676-
motivation="supplementing",
677-
target=c.id,
678-
anno_page_id=vttAPId,
679-
body={"id": f"{domain}resource/{identifier}/{vttFile['name']}",
680-
"type": "Text",
681-
"format": "text/vtt",
682-
})
683-
# add label and language
684-
if vttFile['name'].endswith("autogenerated.vtt"):
685-
vtAnno.body.label = { 'en': ['autogenerated']}
686-
else:
687-
# Assume language
688-
splitName = vttFile['name'].split(".")
689-
lang = splitName[-2]
690-
vtAnno.body.add_label(lang, language="none")
691-
vtAnno.body.language = lang
692-
693-
vttNo += 1
694-
695-
# create intermediary objects
678+
c = Canvas(id=c_id, label=normalised_id, duration=duration, height=int(filedata['height']), width=int(filedata['width']))
696679
ap = AnnotationPage(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/page")
697-
anno = Annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation", motivation="painting", target=c.id)
698680

699-
# create body based on whether there are derivatives or not:
700-
if file['name'] in derivatives:
701-
body = Choice(items=[])
702-
# add the choices in order per https://github.com/ArchiveLabs/iiif.archivelab.org/issues/77#issuecomment-1499672734
703-
for format in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']:
704-
if format in derivatives[file['name']]:
705-
r = ResourceItem(id=f"https://archive.org/download/{identifier}/{derivatives[file['name']][format]['name'].replace(' ', '%20')}",
706-
type='Video',
707-
format=to_mimetype(format),
708-
label={"none": [format]},
709-
duration=float(file['length']),
710-
height=int(file['height']),
711-
width=int(file['width']),
712-
)
713-
body.items.append(r)
714-
elif file['format'] == format:
715-
r = ResourceItem(
716-
id=f"https://archive.org/download/{identifier}/{file['name'].replace(' ', '%20')}",
717-
type='Video',
718-
format=to_mimetype(format),
719-
label={"none": [format]},
720-
duration=float(file['length']),
721-
height=int(file['height']),
722-
width=int(file['width']))
723-
body.items.append(r)
724-
else:
725-
# todo: deal with instances where there are no derivatives for whatever reason
726-
pass
727-
728-
anno.body = body
729-
ap.add_item(anno)
681+
vttAPId = f"{URI_PRIFIX}/{identifier}/{slugged_id}/vtt"
682+
vtAnno = c.make_annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/vtt/streamed",
683+
motivation="supplementing",
684+
target=c.id,
685+
anno_page_id=vttAPId,
686+
body={"id": f"{domain}vtt/streaming/{identifier}.vtt",
687+
"type": "Text",
688+
"format": "text/vtt",
689+
})
690+
691+
segments = math.floor(duration / 60)
692+
for i in range(segments):
693+
start = i * 60
694+
if i == segments - 1:
695+
end = int(duration)
696+
else:
697+
end = (i + 1) * 60
698+
699+
#print (f"Start: {start} End: {end}, Duration: {float(end) - float(start)} full duration: {duration}")
700+
anno = Annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/{i}", motivation="painting", target=f"{c.id}#t={start},{end}")
701+
streamurl = f"https://{metadata['server']}{metadata['dir']}/{mp4File}?start={start}&end={end}&ignore=x.mp4&cnt=0"
702+
body = ResourceItem(id=streamurl,
703+
type='Video',
704+
format="video/mp4",
705+
label={"en": [f"Part {i + 1} of {segments}"]},
706+
duration=end - start,
707+
height=int(filedata['height']),
708+
width=int(filedata['width']),
709+
)
710+
711+
anno.body = body
712+
ap.add_item(anno)
713+
730714
c.add_item(ap)
731715
manifest.add_item(c)
716+
else:
717+
# create the canvases for each original
718+
for file in [f for f in originals if f['format'] in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']]:
719+
normalised_id = file['name'].rsplit(".", 1)[0]
720+
slugged_id = normalised_id.replace(" ", "-")
721+
c_id = f"{URI_PRIFIX}/{identifier}/{slugged_id}/canvas"
722+
c = Canvas(id=c_id, label=normalised_id, duration=float(file['length']), height=int(file['height']), width=int(file['width']))
723+
724+
# Add vtt if present
725+
if vttfiles and normalised_id in vttfiles:
726+
vttAPId = f"{URI_PRIFIX}/{identifier}/{slugged_id}/vtt"
727+
728+
vttNo = 1
729+
for vttFile in vttfiles[normalised_id]:
730+
vtAnno = c.make_annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/vtt/{vttNo}",
731+
motivation="supplementing",
732+
target=c.id,
733+
anno_page_id=vttAPId,
734+
body={"id": f"{domain}resource/{identifier}/{vttFile['name']}",
735+
"type": "Text",
736+
"format": "text/vtt",
737+
})
738+
# add label and language
739+
if vttFile['name'].endswith("autogenerated.vtt"):
740+
vtAnno.body.label = { 'en': ['autogenerated']}
741+
else:
742+
# Assume language
743+
splitName = vttFile['name'].split(".")
744+
lang = splitName[-2]
745+
vtAnno.body.add_label(lang, language="none")
746+
vtAnno.body.language = lang
747+
748+
vttNo += 1
749+
750+
# create intermediary objects
751+
ap = AnnotationPage(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/page")
752+
anno = Annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation", motivation="painting", target=c.id)
753+
754+
# create body based on whether there are derivatives or not:
755+
if file['name'] in derivatives:
756+
body = Choice(items=[])
757+
# add the choices in order per https://github.com/ArchiveLabs/iiif.archivelab.org/issues/77#issuecomment-1499672734
758+
for format in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']:
759+
if format in derivatives[file['name']]:
760+
r = ResourceItem(id=f"https://archive.org/download/{identifier}/{derivatives[file['name']][format]['name'].replace(' ', '%20')}",
761+
type='Video',
762+
format=to_mimetype(format),
763+
label={"none": [format]},
764+
duration=float(file['length']),
765+
height=int(file['height']),
766+
width=int(file['width']),
767+
)
768+
body.items.append(r)
769+
elif file['format'] == format:
770+
r = ResourceItem(
771+
id=f"https://archive.org/download/{identifier}/{file['name'].replace(' ', '%20')}",
772+
type='Video',
773+
format=to_mimetype(format),
774+
label={"none": [format]},
775+
duration=float(file['length']),
776+
height=int(file['height']),
777+
width=int(file['width']))
778+
body.items.append(r)
779+
else:
780+
# todo: deal with instances where there are no derivatives for whatever reason
781+
pass
782+
783+
anno.body = body
784+
ap.add_item(anno)
785+
c.add_item(ap)
786+
manifest.add_item(c)
732787
elif mediatype == "collection":
733788
raise IsCollection
734789
else:
@@ -785,6 +840,73 @@ def create_annotations(version, identifier, fileName, canvas_no, domain=None):
785840

786841
return json.loads(annotationPage.jsonld())
787842

843+
def create_vtt_stream(identifier):
    """
    Build a WebVTT caption track for a streamed item from its SRT captions.

    The SRT file is read in one-minute windows using URLs of the form:
    https://archive.org/download/CSPAN3_20180217_164800_Poplar_Forest_Archaeology/CSPAN3_20180217_164800_Poplar_Forest_Archaeology.cc5.srt?t=0/360
    where the ``t`` parameter takes a start/end pair in seconds. Every cue
    timing line is converted to the WebVTT form and shifted by the start of
    the window it was fetched for (presumably each response restarts its
    timings at zero - confirm against the service).

    :param identifier: archive.org item identifier
    :return: the WebVTT document as one string; only the ``WEBVTT`` header
             when the item has no .srt file or no usable duration
    """

    metadata = requests.get('%s/metadata/%s' % (ARCHIVE, identifier)).json()
    filename = ""
    duration = 0.0
    for file in metadata['files']:
        if file['name'].endswith('.mpg') and file['source'] == 'original':
            duration = float(file['length'])
        # There seems to be multiple srt files but unclear how they are different
        # (the last one listed wins)
        if file['name'].endswith('.srt'):
            filename = file['name']

    # Initialize the vtt content with the WEBVTT header
    vtt_content = ["WEBVTT\n"]

    # Without a caption file the download URL would point at the item itself,
    # so return an empty (header only) track instead of fetching it.
    if not filename:
        return "\n".join(vtt_content)

    # Same windowing as the video segments in create_manifest3: whole minutes,
    # with the final window stretched to the end of the recording.
    segments = math.floor(duration / 60)
    for i in range(segments):
        start = i * 60
        if i == segments - 1:
            end = int(duration)
        else:
            end = (i + 1) * 60

        response = requests.get(f"https://archive.org/download/{identifier}/{filename}?t={start}/{end}")
        if response.status_code != 200:
            # Best effort: a window that cannot be fetched is left out
            continue

        offset = timedelta(seconds=start)
        for line in response.text.splitlines():
            # Convert time format: 00:00:00,000 -> 00:00:00.000
            if "-->" in line:
                splitline = line.split("-->")
                starttime = timeToDelta(splitline[0].strip()) + offset
                # The end time is the part after the arrow
                endtime = timeToDelta(splitline[1].strip()) + offset
                # WebVTT cue timings must be separated by "-->"
                line = f"{formatTimeVTT(starttime)} --> {formatTimeVTT(endtime)}"

            vtt_content.append(line)

        # A truly empty line is what ends a cue in WebVTT; it keeps the last
        # cue of this window apart from the first cue of the next one.
        vtt_content.append("")

    # Join the list into a single string
    return "\n".join(vtt_content)
893+
894+
def formatTimeVTT(time):
    """
    Render a timedelta as a WebVTT style timestamp: HH:MM:SS.mmm

    The hour field is taken from the total number of seconds, so it is not
    wrapped at 24.
    """
    whole_hours, leftover = divmod(time.total_seconds(), 3600)
    whole_minutes, secs = divmod(leftover, 60)
    millis = time.microseconds // 1000
    return "{:02}:{:02}:{:02}.{:03}".format(
        int(whole_hours), int(whole_minutes), int(secs), millis
    )
898+
899+
def timeToDelta(time):
    """
    Convert an SRT formatted time (HH:MM:SS,mmm) to a timedelta
    """
    pieces = time.split(",")
    # The milliseconds follow the comma
    millis = int(pieces[1])
    clock = pieces[0].split(":")
    hrs = int(clock[0])
    mins = int(clock[1])
    secs = int(clock[2])
    return timedelta(hours=hrs, minutes=mins, seconds=secs, milliseconds=millis)
909+
788910
def coerce_list(value):
789911
if isinstance(value, list):
790912
return ". ".join(value)

tests/test_video.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
os.environ["FLASK_CACHE_DISABLE"] = "true"
33

44
import unittest
5+
import math
56
from flask.testing import FlaskClient
67
from iiify.app import app
78

@@ -66,6 +67,34 @@ def test_vtt_multilingual(self):
6667
if item['body']['language'] == 'cy':
6768
self.assertEqual(item['body']['id'], 'https://localhost/iiif/resource/cruz-test/cruz-test.cy.vtt', 'Unexpected link for the Welsh vtt file')
6869

70+
def test_newsitem(self):
    """Check the manifest and the streamed captions produced for a news item.

    The manifest must split the video into one minute painting annotations
    and link a streamed vtt file; the vtt file must carry timings that are
    offset by the start of the segment they were read from.
    """
    identifier = "CSPAN3_20180217_164800_Poplar_Forest_Archaeology"
    resp = self.test_app.get(f"/iiif/3/{identifier}/manifest.json")
    self.assertEqual(resp.status_code, 200)
    manifest = resp.json

    canvas = manifest['items'][0]
    annoPages = canvas['items'][0]
    annotations = annoPages['items']
    self.assertEqual(len(annotations), math.floor(780.89 / 60), 'Expected the video to contain the 13min video split into 1 minute segments')

    # Check vtt file
    self.assertTrue('annotations' in canvas, "Expected canvas to have annotations")
    vttFile = canvas['annotations'][0]['items'][0]['body']['id']
    vttPath = f"/iiif/vtt/streaming/{identifier}.vtt"
    self.assertTrue(vttFile.endswith(vttPath), f"Expected vttFile to be located at {vttPath} but found it at {vttFile}")

    resp = self.test_app.get(vttPath)
    self.assertEqual(resp.status_code, 200, "Expected the streamed vtt file to be served")

    # Locate cue 28, the first cue checked from the second one minute segment:
    # 28
    # 00:01:02.000 ...
    # I AM THE DIRECTOR OF ARCHAEOLOGY
    lines = resp.text.split("\n")
    cueIndex = next((i for i, line in enumerate(lines) if line.startswith("28")), None)
    # Fail loudly rather than silently skipping the timing check
    self.assertIsNotNone(cueIndex, "Expected to find cue 28 in the streamed vtt file")
    self.assertLess(cueIndex + 1, len(lines), "Expected a timing line after cue 28")

    # Only the start time is pinned: it is what shows the segment offset was
    # applied. Splitting on '->' accepts both '->' and the WebVTT '-->' arrow.
    cueStart = lines[cueIndex + 1].split("->")[0].rstrip("- ").strip()
    self.assertEqual("00:01:02.000", cueStart, "Expected the timecode to be over a minute as its the second video")
96+
97+
6998

7099
if __name__ == '__main__':
71100
unittest.main()

0 commit comments

Comments
 (0)