diff --git a/.gitignore b/.gitignore index ec3a9ad..22e6381 100644 --- a/.gitignore +++ b/.gitignore @@ -58,3 +58,12 @@ docs/_build/ # PyBuilder target/ + +# Cache dir +cache/ + +# PyCharm +.idea/ + +# Old folder from v2? +media/ \ No newline at end of file diff --git a/README.md b/README.md index ee45d44..757ce1d 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,11 @@ Unit tests are in the `tests` folder and can be run with: python -m unittest discover -s tests ``` +Run single test: +``` +python -m unittest tests.test_video.TestVideo.test_vtt_autogenerated +``` + Retrieve large.jpg as 800px wide JPEG * http://127.0.0.1:8080/iiif/large.jpg/full/800,/0/default.jpg diff --git a/iiify/app.py b/iiify/app.py index 8f8a041..e78e60a 100755 --- a/iiify/app.py +++ b/iiify/app.py @@ -8,7 +8,7 @@ from flask_caching import Cache from iiif2 import iiif, web from .resolver import ia_resolver, create_manifest, create_manifest3, getids, collection, \ - purify_domain, cantaloupe_resolver, create_collection3, IsCollection + purify_domain, cantaloupe_resolver, create_collection3, IsCollection, create_annotations from .configs import options, cors, approot, cache_root, media_root, \ cache_expr, version, image_server, cache_timeouts from urllib.parse import quote @@ -17,7 +17,7 @@ app = Flask(__name__) # disabling sorting of the output json app.config['JSON_SORT_KEYS'] = False -app.config['CACHE_TYPE'] = "FileSystemCache" +app.config['CACHE_TYPE'] = "FileSystemCache" if os.environ.get("FLASK_CACHE_DISABLE", None) != "true" else "NullCache" app.config['CACHE_DIR'] = "cache" cors = CORS(app) if cors else None cache = Cache(app) @@ -191,6 +191,11 @@ def manifest3(identifier): raise excpt # abort(404) +@app.route('/iiif//annotations///.json') +@cache.cached(timeout=cache_timeouts["long"], forced_update=cache_bust) +def annnotations(version, identifier, fileName, canvas_no): + domain = purify_domain(request.args.get('domain', request.url_root)) + return 
ldjsonify(create_annotations(version, identifier, fileName, canvas_no, domain=domain)) @app.route('/iiif//manifest.json') @cache.cached(timeout=cache_timeouts["long"], forced_update=cache_bust) @@ -199,7 +204,7 @@ def manifest(identifier): @app.route('/iiif/2//manifest.json') def manifest2(identifier): - domain = "https://iiif.archivelab.org/iiif/" + domain = purify_domain(request.args.get('domain', request.url_root)) page = None if '$' in identifier: identifier, page = identifier.split('$') diff --git a/iiify/configs/__init__.py b/iiify/configs/__init__.py index ef4bce7..9bee0ff 100644 --- a/iiify/configs/__init__.py +++ b/iiify/configs/__init__.py @@ -14,6 +14,7 @@ import sys import types import configparser +import json path = os.path.dirname(os.path.realpath(__file__)) approot = os.path.abspath(os.path.join(path, os.pardir)) @@ -75,3 +76,6 @@ def getdef(self, section, option, default_value): "long": 432000, # 5 days "longest": 2592000 # 30 days } + +with open('%s/links.json' % path, 'r') as file: + LINKS = json.load(file) \ No newline at end of file diff --git a/iiify/configs/links.json b/iiify/configs/links.json new file mode 100644 index 0000000..58f2174 --- /dev/null +++ b/iiify/configs/links.json @@ -0,0 +1,137 @@ +{ + "Animated GIF": { + "field": "rendering", + "type": "Image", + "format": "image/gif" + }, + "Text PDF": { + "field": "rendering", + "type": "Text", + "format": "application/pdf" + }, + "Abbyy GZ": { + "field": "rendering", + "type": "Dataset", + "format": "application/gzip" + }, + "Archive BitTorrent": { + "field": "rendering", + "type": "Dataset", + "format": "application/x-bittorrent" + }, + "Grayscale PDF": { + "field": "rendering", + "type": "Text", + "format": "application/pdf" + }, + "chOCR": { + "field": "rendering", + "type": "Text", + "format": "application/gzip" + }, + "DjVuTXT": { + "field": "rendering", + "type": "Text", + "format": "text/plain" + }, + "Djvu XML": { + "field": "rendering", + "type": "Dataset", + "format": 
"application/xml" + }, + "hOCR": { + "field": "rendering", + "type": "Text", + "format": "text/html" + }, + "Single Page Processed JP2 ZIP": { + "field": "rendering", + "type": "Image", + "format": "application/zip" + }, + "OCR Search Text": { + "field": "rendering", + "type": "Text", + "format": "application/gzip" + }, + "Single Page Original JP2 Tar": { + "field": "rendering", + "type": "Image", + "format": "application/x-tar" + }, + "DjVu": { + "field": "rendering", + "type": "Image", + "format": "image/vnd.djvu" + }, + "Cloth Cover Detection Log": { + "field": "seeAlso", + "type": "Text", + "format": "text/plain" + }, + "Dublin Core": { + "field": "seeAlso", + "type": "Dataset", + "format": "application/xml" + }, + "OCR Page Index": { + "field": "seeAlso", + "type": "Dataset", + "format": "application/json" + }, + "MARC": { + "field": "seeAlso", + "type": "Dataset", + "format": "application/xml" + }, + "MARC Binary": { + "field": "seeAlso", + "type": "Dataset", + "format": "application/marc" + }, + "MARC Source": { + "field": "seeAlso", + "type": "Dataset", + "format": "application/xml" + }, + "Page Numbers JSON": { + "field": "seeAlso", + "type": "Dataset", + "format": "application/json" + }, + "Scandata": { + "field": "seeAlso", + "type": "Dataset", + "format": "application/xml" + }, + "SubRip": { + "field": "rendering", + "type": "Text", + "format": "text/plain" + }, + "Web Video Text Tracks": { + "field": "rendering", + "type": "Text", + "format": "text/vtt" + }, + "Intermediate ASR JSON": { + "field": "rendering", + "type": "Text", + "format": "application/json" + }, + "Whisper ASR JSON": { + "field": "rendering", + "type": "Text", + "format": "application/json" + }, + "Storj Upload Log": { + "field": "seeAlso", + "type": "Text", + "format": "text/plain" + }, + "Storj Upload Trigger": { + "field": "seeAlso", + "type": "Text", + "format": "text/plain" + } +} \ No newline at end of file diff --git a/iiify/resolver.py b/iiify/resolver.py index 
def checkMultiItem(metadata):
    """Report whether an item holds several original files of one format.

    Counts the entries of ``metadata['files']`` whose ``source`` is
    ``"original"``, grouped by their ``format`` value, then looks for the
    first format (in order of first appearance) that occurs more than once
    and whose lower-cased name is listed in the module-level
    ``valid_filetypes``.

    Args:
        metadata: Item metadata dict containing a ``files`` list of dicts.

    Returns:
        ``(True, format_name)`` for the first qualifying format, otherwise
        ``(False, None)``.
    """
    # Maybe add call to book stack to see if that works first

    # Count the original files per format. Dicts keep insertion order, so the
    # formats are examined below in the order they first appeared.
    formatCounts = {}
    for fileMd in metadata['files']:
        fileFormat = fileMd.get('format')
        # Entries lacking a source or format cannot take part in the count.
        if fileMd.get('source') == "original" and fileFormat is not None:
            formatCounts[fileFormat] = formatCounts.get(fileFormat, 0) + 1

    # If there are multiple files of the same type then return the first format.
    # Will have to see if there are objects with multiple images and formats.
    for fileFormat, total in formatCounts.items():
        if total > 1 and fileFormat.lower() in valid_filetypes:
            return (True, fileFormat)

    return (False, None)
def _linkEntries(identifier, files, field):
    """Build the link dicts for every file whose format LINKS maps to ``field``.

    ``LINKS`` (loaded from configs/links.json) maps an Internet Archive file
    format name to the IIIF property it belongs in (``field``) plus the IIIF
    ``type`` and media ``format`` to advertise. Files whose format is absent
    from LINKS, or mapped to a different field, are skipped.
    """
    entries = []
    for file in files:
        link = LINKS.get(file.get('format'))
        if link is None or link['field'] != field:
            continue
        entries.append(
            {"id": f"{ARCHIVE}/download/{identifier}/{file['name']}",
             "type": link['type'],
             "label": {"en": [file["format"]]},
             "format": link['format']
             })
    return entries


def addSeeAlso(manifest, identifier, files):
    """Set ``manifest.seeAlso`` to the item metadata link plus mapped files.

    The first entry always points at the item's metadata JSON; it is followed
    by one entry per file whose format is configured as ``seeAlso`` in LINKS.
    """
    manifest.seeAlso = [
        {"id": f"{ARCHIVE}/metadata/{identifier}",
         "type": "Metadata",
         "label": {"en": ["Item Metadata"]},
         "format": "application/json"}
    ]
    # Extend the attribute itself (not a local list), mirroring the original
    # assign-then-append sequence on the manifest object.
    manifest.seeAlso.extend(_linkEntries(identifier, files, 'seeAlso'))


def addRendering(manifest, identifier, files):
    """Set ``manifest.rendering`` to the files configured as ``rendering``.

    The property is always assigned, so it is an empty list when no file
    qualifies.
    """
    manifest.rendering = []
    manifest.rendering.extend(_linkEntries(identifier, files, 'rendering'))


def addThumbnails(manifest, identifier, files):
    """Set ``manifest.thumbnail`` from the files whose format is "Thumbnail".

    Each thumbnail is advertised as ``image/png`` when its name ends in
    ``.png`` and as ``image/jpeg`` otherwise. ``manifest.thumbnail`` is left
    untouched when the item has no thumbnail files.
    """
    thumbnails = [
        {
            "id": f"{ARCHIVE}/download/{identifier}/{file['name']}",
            "type": "Image",
            "format": "image/png" if file['name'].endswith('.png') else "image/jpeg",
        }
        for file in files
        if file.get('format') == "Thumbnail"
    ]

    if thumbnails:
        manifest.thumbnail = thumbnails
for filenames and height / width of image) # subprefix can be different from the identifier use the scandata filename to find the correct prefix # if not present fall back to identifier subprefix = identifier + djvuFile = "" for fileMd in metadata['files']: if fileMd['name'].endswith('_scandata.xml'): subprefix = fileMd['name'].replace('_scandata.xml', '') + if fileMd['format'] == 'Djvu XML': + djvuFile = fileMd['name'] bookReaderURL = f"https://{metadata.get('server')}/BookReader/BookReaderJSIA.php?id={identifier}&itemPath={metadata.get('dir')}&server={metadata.get('server')}&format=jsonp&subPrefix={subprefix}" @@ -446,9 +544,46 @@ def create_manifest3(identifier, domain=None, page=None): except: pass + # Add annotations if djvu file is present + if djvuFile: + count = 1 + for canvas in manifest.items: + if 'annotations' in canvas: + annotations = canvas.annotations + else: + annotations = [] + annotations.append( + AnnotationPageRef(id=f"{domain}3/annotations/{identifier}/{quote(djvuFile, safe='()')}/{count}.json", type="AnnotationPage") + ) + canvas.annotations = annotations + count += 1 elif mediatype == 'image': - singleImage(metadata, identifier, manifest, uri) + (multiFile, format) = checkMultiItem(metadata) + print (f"Checking multiFile {multiFile} {format}") + if multiFile: + # Create multi file manifest + pageCount = 0 + for file in metadata['files']: + if file['source'] == "original" and file['format'] == format: + imgId = f"{identifier}/{file['name']}".replace('/','%2f') + imgURL = f"{IMG_SRV}/3/{imgId}" + pageCount += 1 + + try: + manifest.make_canvas_from_iiif(url=imgURL, + id=f"{URI_PRIFIX}/{identifier}${pageCount}/canvas", + label=f"{file['name']}", + anno_page_id=f"{uri}/annotationPage/1", + anno_id=f"{uri}/annotation/1") + except requests.exceptions.HTTPError as error: + print (f'Failed to get {imgURL}') + manifest.make_canvas(label=f"Failed to load {file['name']} from Image Server", + summary=f"Got {error}", + 
id=f"{URI_PRIFIX}/{identifier}/canvas", + height=1800, width=1200) + else: + singleImage(metadata, identifier, manifest, uri) elif mediatype == 'audio' or mediatype == 'etree': # sort the files into originals and derivatives, splitting the derivatives into buckets based on the original originals = [] @@ -506,6 +641,7 @@ def create_manifest3(identifier, domain=None, page=None): # sort the files into originals and derivatives, splitting the derivatives into buckets based on the original originals = [] derivatives = {} + vttfiles = {} for f in metadata['files']: if f['source'] == 'derivative': if f['original'] in derivatives: @@ -514,6 +650,14 @@ def create_manifest3(identifier, domain=None, page=None): derivatives[f['original']] = {f['format']: f} elif f['source'] == 'original': originals.append(f) + + if f['format'] == 'Web Video Text Tracks': + # Example: cruz-test.en.vtt and 34C3_-_International_Image_Interoperability_Framework_IIIF_Kulturinstitutionen_schaffen_interop-SvH4fbjOT0A.autogenerated.vtt + sourceFilename = re.sub('\.[a-zA-H-]*\.vtt', '', f['name']) + if sourceFilename not in vttfiles: + vttfiles[sourceFilename] = [] + + vttfiles[sourceFilename].append(f) # create the canvases for each original for file in [f for f in originals if f['format'] in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']]: @@ -522,6 +666,32 @@ def create_manifest3(identifier, domain=None, page=None): c_id = f"{URI_PRIFIX}/{identifier}/{slugged_id}/canvas" c = Canvas(id=c_id, label=normalised_id, duration=float(file['length']), height=int(file['height']), width=int(file['width'])) + # Add vtt if present + if vttfiles and normalised_id in vttfiles: + vttAPId = f"{URI_PRIFIX}/{identifier}/{slugged_id}/vtt" + + vttNo = 1 + for vttFile in vttfiles[normalised_id]: + vtAnno = c.make_annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/vtt/{vttNo}", + motivation="supplementing", 
def create_annotations(version, identifier, fileName, canvas_no, domain=None):
    """Build an AnnotationPage with one annotation per OCR word on a page.

    Downloads ``fileName`` from the item's archive.org download directory,
    parses it as XML, selects the ``canvas_no``-th ``OBJECT`` element
    (1-based) and emits a "supplementing" annotation for each ``WORD`` found
    inside it. Each annotation targets the zero-based canvas
    ``{identifier}${canvas_no - 1}`` with an ``xywh`` fragment derived from
    the word's ``coords`` attribute.

    Args:
        version: Presentation API version segment used in the page id.
        identifier: Internet Archive item identifier.
        fileName: Name of the XML file inside the item.
        canvas_no: 1-based page number (string or int).
        domain: URL prefix used to build the AnnotationPage id.

    Returns:
        The AnnotationPage as a plain dict (parsed from its JSON-LD).

    Raises:
        ValueError: if ``canvas_no`` is not a positive integer, the file
            cannot be downloaded or parsed, or the page does not exist.
    """
    annotationPage = AnnotationPage(id=f"{domain}{version}/annotations/{identifier}/{quote(fileName, safe='()')}/{canvas_no}.json")
    annotationPage.items = []
    pageNo = int(canvas_no)
    if pageNo < 1:
        # ElementTree rejects position predicates below 1 with a SyntaxError;
        # fail with the same exception type as the function's other errors.
        raise ValueError(f"Invalid page number {canvas_no}")
    index = pageNo - 1
    url = f"{ARCHIVE}/download/{identifier}/{fileName}"
    try:
        # Fetch the remote XML file.
        # NOTE(review): no timeout is set, so a stalled download blocks the
        # caller indefinitely; consider passing timeout= once a limit is agreed.
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad status codes

        # Parse the XML content
        djvu = ET.fromstring(response.content)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the XML file: {e}")
        raise ValueError(f"Failed to retrieve {url}") from e
    except ET.ParseError as e:
        print(f"Error parsing the XML content: {e}")
        raise ValueError(f"Failed to process {url}") from e

    pages = djvu.findall(f".//OBJECT[{pageNo}]")
    if not pages:
        raise ValueError(f"Page {canvas_no} not found in {url}")

    for count, word in enumerate(pages[0].findall(".//WORD"), start=1):
        # coords are ordered left, bottom, right, top, so:
        #   x = left, y = top, w = right - left, h = bottom - top
        # Only the first four values are used; any trailing values are ignored
        # (presumably a baseline in some exports; TODO confirm).
        (left_x, bottom_y, right_x, top_y) = word.attrib['coords'].split(',')[:4]
        x = left_x
        y = top_y
        width = int(right_x) - int(left_x)
        height = int(bottom_y) - int(top_y)
        annotationPage.items.append({
            "id": f"https://iiif.archive.org/iiif/{identifier}/canvas/{index}/anno/{count}",
            "type": "Annotation",
            "motivation": "supplementing",
            "body": {
                "type": "TextualBody",
                "format": "text/plain",
                "value": word.text
            },
            "target": f"https://iiif.archive.org/iiif/{identifier}${index}/canvas#xywh={x},{y},{width},{height}"
        })

    return json.loads(annotationPage.jsonld())
class TestAnnotations(unittest.TestCase):
    """Checks the v3 manifest's annotation references and the annotation page endpoint."""

    def setUp(self) -> None:
        self.test_app = FlaskClient(app)

    def test_v3_manifest_has_annotations(self):
        """Every canvas must reference its own AnnotationPage without embedding it."""
        resp = self.test_app.get("/iiif/3/journalofexpedit00ford/manifest.json?recache=true")
        self.assertEqual(resp.status_code, 200)
        manifest = resp.json

        # Annotation page URLs are numbered from 1, in canvas order.
        for count, canvas in enumerate(manifest['items'], start=1):
            self.assertIn('annotations', canvas, f"Expected annotations in canvas {canvas['id']}")
            annotations_url = f"https://localhost/iiif/3/annotations/journalofexpedit00ford/journalofexpedit00ford_djvu.xml/{count}.json"
            found = False
            for anno in canvas['annotations']:
                if anno['id'] == annotations_url:
                    found = True
                    self.assertNotIn('items', anno, "As a referenced AnnotationPage it shouldn't contain items.")
                    self.assertEqual(anno.get('type'), "AnnotationPage", f"Expected annotation page to have a type {anno}")

            self.assertTrue(found, f"Expected to find {annotations_url} in {canvas['annotations']}")

    def test_v3_annotations(self):
        """The first page's AnnotationPage must list well-formed word annotations."""
        resp = self.test_app.get("/iiif/3/annotations/journalofexpedit00ford/journalofexpedit00ford_djvu.xml/1.json?recache=true")
        self.assertEqual(resp.status_code, 200)
        annotations = resp.json

        self.assertEqual(annotations['id'], "https://localhost/iiif/3/annotations/journalofexpedit00ford/journalofexpedit00ford_djvu.xml/1.json", "Unexpected id")
        self.assertEqual(annotations['@context'], "http://iiif.io/api/presentation/3/context.json", "Unexpected context")
        self.assertEqual(annotations['type'], "AnnotationPage", "Unexpected type, expected AnnotationPage")
        annotationList = annotations['items']
        self.assertEqual(len(annotationList), 6, "Unexpected number of annotations")

        ids = []
        for position, anno in enumerate(annotationList):
            self.assertNotIn(anno['id'], ids, f"Duplicate ID: {anno['id']}")
            ids.append(anno['id'])
            self.assertEqual(anno['type'], "Annotation", "Expected type of Annotation")
            self.assertTrue("body" in anno and "target" in anno, f"Body or target missing from annotation {anno}")
            self.assertEqual(anno['body']['type'], "TextualBody", "Expected body to be a TextualBody")
            self.assertEqual(anno['body']['format'], "text/plain", "Expected format to be a text/plain")
            self.assertEqual(anno['target'].split('#')[0], "https://iiif.archive.org/iiif/journalofexpedit00ford$0/canvas")
            # Only the first word's geometry and text are pinned exactly.
            if position == 0:
                self.assertEqual(anno['target'].split('#')[1], "xywh=592,1742,460,118")
                self.assertEqual(anno['body']['value'], "JOURNAL ")

            self.assertEqual(anno['motivation'], "supplementing", "Expected motivation of supplementing")
a/tests/test_collections.py +++ b/tests/test_collections.py @@ -1,7 +1,9 @@ +import os +os.environ["FLASK_ENV"] = "testing" + import unittest from flask.testing import FlaskClient from iiify.app import app - class TestCollections(unittest.TestCase): def setUp(self) -> None: diff --git a/tests/test_linking.py b/tests/test_linking.py new file mode 100644 index 0000000..edb326e --- /dev/null +++ b/tests/test_linking.py @@ -0,0 +1,105 @@ +import os +os.environ["FLASK_CACHE_DISABLE"] = "true" + +import unittest +from flask.testing import FlaskClient +from iiify.app import app + +class TestLinking(unittest.TestCase): + + def setUp(self) -> None: + self.test_app = FlaskClient(app) + + def convertListToHash(self, items): + map = {} + for item in items: + map[item['label']['en'][0]] = item + return map + + def checkLink(self, map, field, name, value): + self.assertTrue(name in map, f"Expected to find {name} in {field}") + + self.assertEqual(map[name]['id'], value, f"Expected {value} in {map[name]}") + + def test_v3_image_links(self): + resp = self.test_app.get("/iiif/3/journalofexpedit00ford/manifest.json?recache=true") + self.assertEqual(resp.status_code, 200) + manifest = resp.json + + self.assertTrue('rendering' in manifest, "Expected rendering in Manifest") + renderingMap = self.convertListToHash(manifest['rendering']) + # Animated GIF - rendering + self.checkLink(renderingMap, "rendering", "Animated GIF", "https://archive.org/download/journalofexpedit00ford/journalofexpedit00ford.gif") + # Text PDF - rendering + self.checkLink(renderingMap, "rendering", "Text PDF", "https://archive.org/download/journalofexpedit00ford/journalofexpedit00ford.pdf") + # Abbyy GZ - rendering + self.checkLink(renderingMap, "rendering", "Abbyy GZ", "https://archive.org/download/journalofexpedit00ford/journalofexpedit00ford_abbyy.gz") + # Archive BitTorrent - rendering + self.checkLink(renderingMap, "rendering", "Archive BitTorrent", 
"https://archive.org/download/journalofexpedit00ford/journalofexpedit00ford_archive.torrent") + # Grayscale PDF - rendering + self.checkLink(renderingMap, "rendering", "Grayscale PDF", "https://archive.org/download/journalofexpedit00ford/journalofexpedit00ford_bw.pdf") + # chOCR - rendering + self.checkLink(renderingMap, "rendering", "chOCR", "https://archive.org/download/journalofexpedit00ford/journalofexpedit00ford_chocr.html.gz") + # DjVuTXT - rendering + self.checkLink(renderingMap, "rendering", "DjVuTXT", "https://archive.org/download/journalofexpedit00ford/journalofexpedit00ford_djvu.txt") + # Djvu XML - rendering + self.checkLink(renderingMap, "rendering", "Djvu XML", "https://archive.org/download/journalofexpedit00ford/journalofexpedit00ford_djvu.xml") + # hOCR - rendering + self.checkLink(renderingMap, "rendering", "hOCR", "https://archive.org/download/journalofexpedit00ford/journalofexpedit00ford_hocr.html") + # Single Page Processed JP2 ZIP - rendering + self.checkLink(renderingMap, "rendering", "Single Page Processed JP2 ZIP", "https://archive.org/download/journalofexpedit00ford/journalofexpedit00ford_jp2.zip") + # OCR Search Text - rendering + self.checkLink(renderingMap, "rendering", "OCR Search Text", "https://archive.org/download/journalofexpedit00ford/journalofexpedit00ford_hocr_searchtext.txt.gz") + # Single Page Original JP2 Tar - rendering + self.checkLink(renderingMap, "rendering", "Single Page Original JP2 Tar", "https://archive.org/download/journalofexpedit00ford/journalofexpedit00ford_orig_jp2.tar") + # DjVu - rendering + self.checkLink(renderingMap, "rendering", "DjVu", "https://archive.org/download/journalofexpedit00ford/journalofexpedit00ford.djvu") + + self.assertTrue('seeAlso' in manifest, "Expected seeAlso in Manifest") + seeAlsoMap = self.convertListToHash(manifest['seeAlso']) + # Cloth Cover Detection Log - seeAlso + self.checkLink(seeAlsoMap, "seeAlso", "Cloth Cover Detection Log", 
"https://archive.org/download/journalofexpedit00ford/journalofexpedit00ford_cloth_detection.log") + # Dublin Core - seeAlso + self.checkLink(seeAlsoMap, "seeAlso", "Dublin Core", "https://archive.org/download/journalofexpedit00ford/journalofexpedit00ford_dc.xml") + # OCR Page Index - seeAlso + self.checkLink(seeAlsoMap, "seeAlso", "OCR Page Index", "https://archive.org/download/journalofexpedit00ford/journalofexpedit00ford_hocr_pageindex.json.gz") + # MARC - seeAlso + self.checkLink(seeAlsoMap, "seeAlso", "MARC", "https://archive.org/download/journalofexpedit00ford/journalofexpedit00ford_marc.xml") + # MARC Binary - seeAlso + self.checkLink(seeAlsoMap, "seeAlso", "MARC Binary", "https://archive.org/download/journalofexpedit00ford/journalofexpedit00ford_meta.mrc") + # MARC Source - seeAlso + self.checkLink(seeAlsoMap, "seeAlso", "MARC Source", "https://archive.org/download/journalofexpedit00ford/journalofexpedit00ford_metasource.xml") + # Page Numbers JSON - seeAlso + self.checkLink(seeAlsoMap, "seeAlso", "Page Numbers JSON", "https://archive.org/download/journalofexpedit00ford/journalofexpedit00ford_page_numbers.json") + # Scandata - seeAlso + self.checkLink(seeAlsoMap, "seeAlso", "Scandata", "https://archive.org/download/journalofexpedit00ford/journalofexpedit00ford_scandata.xml") + + def test_v3_video_links(self): + resp = self.test_app.get("/iiif/3/DuckandC1951/manifest.json?recache=true") + self.assertEqual(resp.status_code, 200) + manifest = resp.json + + self.assertTrue('rendering' in manifest, "Expected rendering in Manifest") + renderingMap = self.convertListToHash(manifest['rendering']) + seeAlsoMap = self.convertListToHash(manifest['seeAlso']) + self.assertTrue("Unknown" not in renderingMap and "Unknown" not in seeAlsoMap, "Found Unknown in rendering or seeAlso where it shouldn't be.") + + # SubRip - rendering + self.checkLink(renderingMap, "rendering", "SubRip", "https://archive.org/download/DuckandC1951/DuckandC1951.asr.srt") + # Web Video Text Tracks - 
rendering + self.checkLink(renderingMap, "rendering", "Web Video Text Tracks", "https://archive.org/download/DuckandC1951/DuckandC1951.asr.vtt") + # Archive BitTorrent - rendering + self.checkLink(renderingMap, "rendering", "Archive BitTorrent", "https://archive.org/download/DuckandC1951/DuckandC1951_archive.torrent") + # Intermediate ASR JSON - rendering + self.checkLink(renderingMap, "rendering", "Intermediate ASR JSON", "https://archive.org/download/DuckandC1951/DuckandC1951_intermediate_asr.json") + # Whisper ASR JSON + self.checkLink(renderingMap, "rendering", "Whisper ASR JSON", "https://archive.org/download/DuckandC1951/DuckandC1951_whisper_asr.json") + + # Storj Upload Log - seeAlso + self.checkLink(seeAlsoMap, "seeAlso", "Storj Upload Log", "https://archive.org/download/DuckandC1951/DuckandC1951.storj-store.log") + # Storj Upload Trigger - seeAlso + self.checkLink(seeAlsoMap, "seeAlso", "Storj Upload Trigger", "https://archive.org/download/DuckandC1951/DuckandC1951.storj-store.trigger") + + # Thumbnail - thumbnail + # 19 thumbs + self.assertEqual(len(manifest['thumbnail']), 19, f"Expected 19 thumbnails: {manifest['thumbnail']}") \ No newline at end of file diff --git a/tests/test_manifests.py b/tests/test_manifests.py index 3fa25ba..ce34c21 100644 --- a/tests/test_manifests.py +++ b/tests/test_manifests.py @@ -1,3 +1,6 @@ +import os +os.environ["FLASK_CACHE_DISABLE"] = "true" + import unittest from flask.testing import FlaskClient from iiify.app import app @@ -42,21 +45,13 @@ def test_v3_single_text_manifest(self): self.assertEqual(manifest['type'], "Manifest", f"Unexpected type. 
Expected Manifest go {manifest['type']}") self.assertEqual(len(manifest['items']),1,f"Expected 1 canvas but got: {len(manifest['items'])}") - def test_v3_vermont_Life_Magazine(self): resp = self.test_app.get("/iiif/3/rbmsbk_ap2-v4_2001_V55N4/manifest.json") self.assertEqual(resp.status_code, 200) manifest = resp.json self.assertEqual(len(manifest['items']),116,f"Expected 116 canvas but got: {len(manifest['items'])}") - - def test_v3_single_video_manifest(self): - resp = self.test_app.get("/iiif/3/youtube-7w8F2Xi3vFw/manifest.json") - self.assertEqual(resp.status_code, 200) - manifest = resp.json - - self.assertEqual(len(manifest['items']),1,f"Expected 1 canvas but got: {len(manifest['items'])}") - + #logic to cover etree mediatype github issue #123 def test_v3_etree_mediatype(self): resp = self.test_app.get("/iiif/3/gd72-04-14.aud.vernon.23662.sbeok.shnf/manifest.json") @@ -66,7 +61,6 @@ def test_v3_etree_mediatype(self): self.assertEqual(len(manifest['items']),36,f"Expected 36 canvases but got: {len(manifest['items'])}") self.assertEqual(manifest['items'][0]['items'][0]['items'][0]['body']['items'][0]['type'],"Sound",f"Expected 'Sound' but got: {manifest['items'][0]['items'][0]['items'][0]['body']['items'][0]['type']}") - def test_v3_64Kbps_MP3(self): resp = self.test_app.get("/iiif/3/TvQuran.com__Alafasi/manifest.json") self.assertEqual(resp.status_code, 200) @@ -74,7 +68,6 @@ def test_v3_64Kbps_MP3(self): self.assertEqual(len(manifest['items']),114,f"Expected 114 canvases but got: {len(manifest['items'])}") self.assertEqual("64Kbps MP3".lower() in resp.text.lower(), True, f"Expected the string '64Kbps MP3'") - def test_v3_128Kbps_MP3(self): resp = self.test_app.get("/iiif/3/alice_in_wonderland_librivox/manifest.json") self.assertEqual(resp.status_code, 200) @@ -82,14 +75,6 @@ def test_v3_128Kbps_MP3(self): self.assertEqual(len(manifest['items']),12,f"Expected 12 canvases but got: {len(manifest['items'])}") self.assertEqual("128kbps mp3".lower() in 
resp.text.lower(), True, f"Expected the string '128kbps mp3'") - def test_v3_h264_MPEG4_OGG_Theora(self): - resp = self.test_app.get("/iiif/3/taboca_201002_03/manifest.json") - self.assertEqual(resp.status_code, 200) - manifest = resp.json - self.assertEqual(len(manifest['items']),251,f"Expected 251 canvases but got: {len(manifest['items'])}") - self.assertEqual("h.264 MPEG4".lower() in resp.text.lower(), True, f"Expected the string 'h.264 MPEG4'") - self.assertEqual("OGG Theora".lower() in resp.text.lower(), True, f"Expected the string 'OGG Theora'") - def test_v3_aiff(self): resp = self.test_app.get("/iiif/3/PDextend_AIFF/manifest.json") self.assertEqual(resp.status_code, 200) @@ -144,6 +129,22 @@ def test_metadata_array(self): manifest = resp.json self.assertTrue(len(manifest['summary']['none']) > 1, f"Expected multiple summary values, but got {manifest['summary']['none']}") + def test_multi_file_image(self): + resp = self.test_app.get("/iiif/3/arkivkopia.se-lms-G70-48.3/manifest.json") + self.assertEqual(resp.status_code, 200) + manifest = resp.json + self.assertEqual(len(manifest['items']),3, f"Expected three canvases, but got {len(manifest['items'])}") + + firstCanvasId = manifest['items'][0]['id'] + for i in range(1, len(manifest['items'])): + self.assertNotEqual(manifest['items'][i]['id'], firstCanvasId, 'Canvas Ids need to be unique') + + def test_multi_file(self): + resp = self.test_app.get("/iiif/3/st-anthony-relics-01/manifest.json") + self.assertEqual(resp.status_code, 200) + manifest = resp.json + self.assertEqual(len(manifest['items']),6, f"Expected six canvases, but got {len(manifest['items'])}") + ''' to test: kaled_jalil (no derivatives) diff --git a/tests/test_manifests_v2.py b/tests/test_manifests_v2.py index 2380259..b0ba8a1 100644 --- a/tests/test_manifests_v2.py +++ b/tests/test_manifests_v2.py @@ -1,3 +1,6 @@ +import os +os.environ["FLASK_CACHE_DISABLE"] = "true" + import unittest from flask.testing import FlaskClient from iiify.app import
app @@ -12,12 +15,47 @@ def test_v2_image_manifest(self): self.assertEqual(resp.status_code, 200) manifest = resp.json - self.assertEqual(manifest['@id'], 'https://iiif.archivelab.org/iiif/rashodgson68/manifest.json', 'V2 Manifest ID has changed') + self.assertEqual(manifest['@id'], 'https://localhost/iiif/rashodgson68/manifest.json', 'V2 Manifest ID is using new infrastructure changed') self.assertEqual(manifest['@type'], "sc:Manifest", f"Unexpected type. Expected Manifest got {manifest['@type']}") self.assertEqual(len(manifest['sequences'][0]['canvases']),32,f"Expected 32 canvases but got: {len(manifest['sequences'][0]['canvases'])}") self.assertEqual(manifest['sequences'][0]['canvases'][0]['@id'],"https://iiif.archivelab.org/iiif/rashodgson68$0/canvas",f"v2 canvas id has changed") + def test_v2_image_api(self): + resp = self.test_app.get("/iiif/2/1991-12-compute-magazine/manifest.json") + self.assertEqual(resp.status_code, 200) + manifest = resp.json + + self.assertEqual(manifest['@id'], 'https://localhost/iiif/1991-12-compute-magazine/manifest.json', 'V2 Manifest ID is using new infrastructure changed') + image = manifest['sequences'][0]['canvases'][0]['images'][0]['resource'] + self.assertEqual(image['@id'], "https://localhost/iiif/1991-12-compute-magazine$0/full/full/0/default.jpg", "Resource not using new image server") + self.assertEqual(image['service']['@id'], 'https://localhost/iiif/1991-12-compute-magazine$0', "V2 service not using the new image server") + + def test_v2_single_image(self): + resp = self.test_app.get("/iiif/2/img-8664_202009/manifest.json") + self.assertEqual(resp.status_code, 200) + manifest = resp.json + + self.assertEqual(manifest['@id'], 'https://localhost/iiif/img-8664_202009/manifest.json', 'V2 Manifest ID is using new infrastructure changed') + canvas = manifest['sequences'][0]['canvases'][0] + self.assertEqual(canvas['@id'], 'https://iiif.archivelab.org/iiif/img-8664_202009/canvas', 'Expected canvas id to be the same') + image = 
canvas['images'][0]['resource'] + self.assertEqual(image['@id'], "https://localhost/iiif/img-8664_202009/full/full/0/default.jpg", "Resource not using new image server") + self.assertEqual(image['service']['@id'], 'https://localhost/iiif/img-8664_202009', "V2 service not using the new image server") + + def test_v2_single_text_manifest(self): + resp = self.test_app.get("/iiif/2/fbf_3chords_1_/manifest.json") + self.assertEqual(resp.status_code, 200) + manifest = resp.json + + self.assertEqual(manifest['@id'], 'https://localhost/iiif/fbf_3chords_1_/manifest.json', 'V2 Manifest ID is using new infrastructure changed') + canvas = manifest['sequences'][0]['canvases'][0] + self.assertEqual(canvas['@id'], 'https://iiif.archivelab.org/iiif/fbf_3chords_1_$0/canvas', 'Expected canvas id to be the same') + image = canvas['images'][0]['resource'] + self.assertEqual(image['@id'], "https://localhost/iiif/fbf_3chords_1_$0/full/full/0/default.jpg", "Resource not using new image server") + self.assertEqual(image['service']['@id'], 'https://localhost/iiif/fbf_3chords_1_$0', "V2 service not using the new image server") + + def test_text_which_is_image(self): resp = self.test_app.get("/iiif/2/fbf_3chords_1_/manifest.json") diff --git a/tests/test_resolver.py b/tests/test_resolver.py index fe01b99..b14f327 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -1,3 +1,6 @@ +import os +os.environ["FLASK_CACHE_DISABLE"] = "true" + import unittest from iiify.resolver import purify_domain, collection, manifest_page diff --git a/tests/test_video.py b/tests/test_video.py new file mode 100644 index 0000000..117d14c --- /dev/null +++ b/tests/test_video.py @@ -0,0 +1,71 @@ +import os +os.environ["FLASK_CACHE_DISABLE"] = "true" + +import unittest +from flask.testing import FlaskClient +from iiify.app import app + +class TestVideo(unittest.TestCase): + + def setUp(self) -> None: + self.test_app = FlaskClient(app) + + def test_v3_single_video_manifest(self): + resp = 
self.test_app.get("/iiif/3/youtube-7w8F2Xi3vFw/manifest.json") + self.assertEqual(resp.status_code, 200) + manifest = resp.json + + self.assertEqual(len(manifest['items']),1,f"Expected 1 canvas but got: {len(manifest['items'])}") + + def test_v3_h264_MPEG4_OGG_Theora(self): + resp = self.test_app.get("/iiif/3/taboca_201002_03/manifest.json") + self.assertEqual(resp.status_code, 200) + manifest = resp.json + self.assertEqual(len(manifest['items']),251,f"Expected 251 canvases but got: {len(manifest['items'])}") + self.assertEqual("h.264 MPEG4".lower() in resp.text.lower(), True, f"Expected the string 'h.264 MPEG4'") + self.assertEqual("OGG Theora".lower() in resp.text.lower(), True, f"Expected the string 'OGG Theora'") + + def test_vtt_autogenerated(self): + resp = self.test_app.get("/iiif/3/youtube-SvH4fbjOT0A/manifest.json?recache=true") + self.assertEqual(resp.status_code, 200) + manifest = resp.json + + self.assertEqual(len(manifest['items']),1,f"Expected 1 canvas but got: {len(manifest['items'])}") + self.assertTrue('annotations' in manifest['items'][0], "Expected annotations in manifest") + self.assertTrue(isinstance(manifest['items'][0]['annotations'], list), "Expected annotations to be a list") + self.assertEqual(len(manifest['items'][0]['annotations']), 1, "Expected 1 item in annotations") + annotationPage = manifest['items'][0]['annotations'][0] + self.assertEqual(annotationPage['type'], 'AnnotationPage', "Expected annotations to contain annotation page") + + self.assertTrue('items' in annotationPage and isinstance(annotationPage['items'],list) and len(annotationPage['items']) == 1, f"Expected annotation page to contain a list of items which contains 1 item. 
Found {annotationPage['items']}") + annotation = annotationPage['items'][0] + self.assertEqual(annotation['type'], 'Annotation', "Expected annotationPage to contain annotations") + self.assertEqual(annotation['motivation'], 'supplementing', "Expected annotation to have the supplementing motivation") + self.assertTrue('body' in annotation, "Expected annotation to have a body") + body = annotation['body'] + self.assertEqual(body['type'],'Text', "Expected body to have a type text") + self.assertEqual(body['format'],'text/vtt', "Expected body to have the format text/vtt") + self.assertEqual(body['label']['en'][0], "autogenerated", "Expected VTT file to have the label autogenerated") + self.assertFalse("language" in body, "We don't know the language for this item so there shouldn't be a language specified") + self.assertEqual(body['id'], "https://localhost/iiif/resource/youtube-SvH4fbjOT0A/34C3_-_International_Image_Interoperability_Framework_IIIF_Kulturinstitutionen_schaffen_interop-SvH4fbjOT0A.autogenerated.vtt","Unexpected URL for the VTT file") + + def test_vtt_multilingual(self): + resp = self.test_app.get("/iiif/3/cruz-test/manifest.json?recache=true") + self.assertEqual(resp.status_code, 200) + manifest = resp.json + + canvas = manifest['items'][0] + self.assertTrue('annotations' in canvas, 'Expected annotations in Canvas') + self.assertEqual(len(canvas['annotations']), 1, 'Expected one AnnotationPage') + annotations = canvas['annotations'][0]['items'] + self.assertEqual(len(annotations), 104, 'Expected all 104 languages') + + # Check Welsh + for item in annotations: + self.assertTrue('language' in item['body'], f"All vtt files should have a language: {item}") + if item['body']['language'] == 'cy': + self.assertEqual(item['body']['id'], 'https://localhost/iiif/resource/cruz-test/cruz-test.cy.vtt', 'Unexpected link for the Welsh vtt file') + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file