Skip to content

Commit 0b1ed87

Browse files
Merge pull request #28 from clamsproject/url_access
sub-page/sub-URL generation
2 parents c1b4c9a + 9a28864 commit 0b1ed87

11 files changed

+322
-163
lines changed

Diff for: README.md

+11-2
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ In these notes we assume that the data are in a local directory named `/Users/Sh
4747
$ docker run --rm -d -p 5000:5000 -v /Users/Shared/archive:/data clams-mmif-visualizer
4848
```
4949

50-
After this, all you need to do is point your browser at [http://0.0.0.0:5000/upload](http://0.0.0.0:5000/upload), click "Choose File", select a MMIF file and then click "Visualize". See the *Data source repository and input MMIF file* section below for a description of the MMIF file. Assuming you have not made any changes to the directory structure you can use the example MMIF files in the `input` folder.
50+
See the *Data source repository and input MMIF file* section below for a description of the MMIF file. Assuming you have not made any changes to the directory structure you can use the example MMIF files in the `input` folder.
5151

5252
**Some background**
5353

@@ -89,8 +89,17 @@ To run the server do:
8989
$ python app.py
9090
```
9191

92-
Then point your browser at [http://0.0.0.0:5000/upload](http://0.0.0.0:5000/upload), click "Choose File" and then click "Visualize".
9392

93+
## Uploading Files
94+
MMIF files can be uploaded to the visualization server in one of two ways:
95+
* Point your browser to http://0.0.0.0:5000/upload, click "Choose File" and then click "Visualize". This will generate a static URL containing the visualization of the input file (e.g. `http://localhost:5000/display/HaTxbhDfwakewakmzdXu5e`). Once the file is uploaded, the page will automatically redirect to the file's visualization.
96+
* Using a command line, enter:
97+
```
98+
curl -X POST -F "file=@<filename>" -s http://localhost:5000/upload
99+
```
100+
This will upload the file and print the unique identifier for the file visualization. The visualization can be accessed at `http://localhost:5000/display/<id>`
101+
102+
The server will maintain a cache of up to 500MB for these temporary files, so the visualizations can be repeatedly accessed without needing to re-upload any files. Once this limit is reached, the server will delete stored visualizations until enough space is reclaimed, starting with the oldest/least recently accessed pages. If you attempt to access the /display URL of a deleted file, you will be redirected back to the upload page instead.
94103

95104

96105
## Data source repository and input MMIF file

Diff for: app.py

+82-26
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,16 @@
1+
import hashlib
12
import os
2-
import pathlib
3-
import sys
43
import secrets
5-
import json
6-
import html
4+
import sys
5+
from threading import Thread
76

8-
from flask import request, render_template, flash, redirect, send_from_directory, session
9-
from werkzeug.utils import secure_filename
7+
from flask import request, render_template, flash, send_from_directory, redirect
108
from mmif.serialize import Mmif
119

12-
from utils import app, render_ocr, get_media, prep_annotations, prepare_ocr_visualization
10+
import cache
11+
from cache import set_last_access, cleanup
12+
from utils import app, render_ocr, documents_to_htmls, prep_annotations, prepare_ocr_visualization
13+
1314

1415
@app.route('/')
1516
def index():
@@ -20,10 +21,10 @@ def index():
2021
def ocr():
    """Build the OCR visualization for one view of a cached MMIF file.

    Reads the JSON request body, which must provide ``mmif_id`` (the name of
    the cache directory holding ``file.mmif``) and ``view_id`` (the view to
    visualize).

    Returns:
        The result of ``prepare_ocr_visualization``, or an HTML error
        paragraph if anything goes wrong along the way.
    """
    # Local import: only needed to escape the error text below.
    import html

    try:
        data = dict(request.json)
        mmif_path = cache.get_cache_path() / data["mmif_id"] / "file.mmif"
        # Use a context manager so the file handle is released right away.
        with open(mmif_path) as mmif_file:
            mmif_str = mmif_file.read()
        mmif = Mmif(mmif_str)
        ocr_view = mmif.get_view_by_id(data["view_id"])
        return prepare_ocr_visualization(mmif, ocr_view, data["mmif_id"])
    except Exception as e:
        # Request-handler boundary: report the problem in the page instead of
        # failing the request. The message is escaped because it can echo
        # request data, and the paragraph is closed with a matching </p>.
        return f'<p class="error">{html.escape(str(e))}</p>'
2930

@@ -32,10 +33,11 @@ def ocr():
3233
def ocrpage():
    """Render a single page of an OCR visualization.

    Reads the JSON request body, which must provide ``mmif_id``, ``vid_path``,
    ``view_id`` and ``page_number``; all four are handed to ``render_ocr``.

    Returns:
        The result of ``render_ocr``, or an HTML error paragraph naming the
        exception type and message if rendering fails.
    """
    # Local import: only needed to escape the error text below.
    import html

    data = request.json
    try:
        return render_ocr(data["mmif_id"], data['vid_path'], data["view_id"], data["page_number"])
    except Exception as e:
        # str(type(e)) looks like "<class 'KeyError'>"; unescaped, a browser
        # treats it as an unknown tag and hides it, so escape both parts.
        error_type = html.escape(str(type(e)))
        error_text = html.escape(str(e))
        return f'<p class="error">Unexpected error of type {error_type}: {error_text}</p>'
3839

40+
3941
@app.route('/upload', methods=['GET', 'POST'])
4042
def upload():
4143
# NOTE. Uses of flash() originally gaven a RuntimeError (The session is
@@ -44,7 +46,7 @@ def upload():
4446
if request.method == 'POST':
4547
# Check if request is coming from elasticsearch
4648
if 'data' in request.form:
47-
return render_mmif(request.form['data'])
49+
return upload_file(request.form['data'])
4850
# Otherwise, check if the post request has the file part
4951
elif 'file' not in request.files:
5052
flash('WARNING: post request has no file part')
@@ -56,34 +58,87 @@ def upload():
5658
flash('WARNING: no file was selected')
5759
return redirect(request.url)
5860
if file:
59-
filename = secure_filename(file.filename)
60-
file.save(os.path.join('temp', filename))
61-
with open("temp/" + filename) as fh:
62-
session["mmif_file"] = fh.name
63-
mmif_str = fh.read()
64-
return render_mmif(mmif_str)
61+
return upload_file(file)
62+
6563
return render_template('upload.html')
6664

6765

66+
@app.route('/decache', methods=['GET', 'POST'])
def invalidate_cache():
    """Drop cached visualizations, optionally regenerating a single one.

    Without a ``viz_id`` query argument the whole cache is wiped and the
    client is redirected to the upload page. With a ``viz_id``, the stored
    MMIF of that entry is read, the entry is removed, and the MMIF is pushed
    through ``upload_file`` again so the visualization is rebuilt.

    Returns:
        A redirect to ``/upload`` (cache wiped, invalid id, or unknown id) or
        the response produced by ``upload_file``.
    """
    app.logger.debug(f"Request to invalidate cache on {request.args}")
    viz_id = request.args.get('viz_id')
    if not viz_id:
        # Pass the argument explicitly: a falsy value asks
        # cache.invalidate_cache to wipe the whole cache.
        cache.invalidate_cache(None)
        return redirect("/upload")

    # viz_id comes straight from the query string and ends up in a path that
    # is read and then deleted, so refuse anything that is not a plain
    # directory name.
    if viz_id in ('.', '..') or '/' in viz_id or '\\' in viz_id:
        flash("Invalid visualization ID.")
        return redirect("/upload")

    mmif_path = cache.get_cache_path() / viz_id / 'file.mmif'
    try:
        with open(mmif_path, 'rb') as mmif_file:
            in_mmif = mmif_file.read()
    except FileNotFoundError:
        # Same recovery as the display route: send the user back to upload.
        flash("File not found -- please upload again (it may have been deleted to clear up cache space).")
        return redirect("/upload")

    cache.invalidate_cache([viz_id])
    return upload_file(in_mmif)
76+
77+
78+
@app.route('/display/<viz_id>')
def display(viz_id):
    """Serve the cached visualization page stored under *viz_id*.

    The entry's access time is refreshed before its ``index.html`` is read.
    If either step hits a missing file, a message is flashed and the client
    is redirected to the upload page.
    """
    viz_dir = cache.get_cache_path() / viz_id
    try:
        set_last_access(viz_dir)
        with open(os.path.join(viz_dir, "index.html")) as page:
            return page.read()
    except FileNotFoundError:
        flash("File not found -- please upload again (it may have been deleted to clear up cache space).")
    return redirect("/upload")
89+
90+
6891
@app.route('/uv/<path:path>')
def send_js(path):
    """Serve the file at *path* from the ``uv`` directory."""
    uv_directory = "uv"
    return send_from_directory(uv_directory, path)
7194

7295

73-
def render_mmif(mmif_str, viz_id):
    """Render the ``player.html`` page for an MMIF string.

    The MMIF is parsed, its documents and annotations are prepared for the
    cache entry *viz_id*, and the first element of each prepared item is
    logged at debug level before the template is rendered.
    """
    parsed = Mmif(mmif_str)

    media = documents_to_htmls(parsed, viz_id)
    media_names = [entry[0] for entry in media]
    app.logger.debug(f"Prepared Media: {media_names}")

    annotations = prep_annotations(parsed, viz_id)
    annotation_names = [entry[0] for entry in annotations]
    app.logger.debug(f"Prepared Annotations: {annotation_names}")

    return render_template(
        'player.html',
        media=media,
        viz_id=viz_id,
        annotations=annotations,
    )
104+
105+
106+
def upload_file(in_mmif):
    """Cache an MMIF file, render its visualization and direct the client to it.

    The visualization ID is the SHA-1 hex digest of the file content, so
    uploading the same content again reuses the existing cache entry.

    Args:
        in_mmif: the MMIF content as ``bytes``, as ``str`` (e.g. a form
            field), or as a file-like object with a ``read()`` method.

    Returns:
        For clients whose User-Agent contains "curl", a plain-text message
        with the visualization ID; otherwise a redirect to
        ``/display/<viz_id>``.
    """
    # Normalize the input to bytes; a plain string has no read() method.
    raw = in_mmif if isinstance(in_mmif, (bytes, str)) else in_mmif.read()
    in_mmif_bytes = raw.encode('utf-8') if isinstance(raw, str) else raw
    in_mmif_str = in_mmif_bytes.decode('utf-8')

    viz_id = hashlib.sha1(in_mmif_bytes).hexdigest()
    app.logger.debug(f"Visualization ID: {viz_id}")
    path = cache.get_cache_path() / viz_id
    app.logger.debug(f"Visualization Directory: {path}")
    try:
        os.makedirs(path)
    except FileExistsError:
        app.logger.debug("Visualization already cached")
    else:
        try:
            set_last_access(path)
            with open(path / 'file.mmif', 'w') as in_mmif_file:
                app.logger.debug(f"Writing original MMIF to {path / 'file.mmif'}")
                in_mmif_file.write(in_mmif_str)
            html_page = render_mmif(in_mmif_str, viz_id)
            with open(os.path.join(path, "index.html"), "w") as f:
                f.write(html_page)
        except Exception:
            # Remove the half-built entry; otherwise the next upload of the
            # same file would be treated as cached although it has no page.
            cache.invalidate_cache([viz_id])
            raise
    finally:
        # Perform cleanup in the background. start() spawns the thread;
        # run() would execute cleanup synchronously inside this request.
        Thread(target=cleanup, daemon=True).start()

    # The header may be absent, in which case get() returns None.
    agent = request.headers.get('User-Agent') or ''
    if 'curl' in agent.lower():
        return f"Visualization ID is {viz_id}\nYou can access the visualized file at /display/{viz_id}\n"
    # NOTE(review): 301 is a permanent redirect that clients may cache;
    # consider 303 for this POST-then-GET flow. Left unchanged here.
    return redirect(f"/display/{viz_id}", code=301)
79135

80136

81137
if __name__ == '__main__':
82138
# Make path for temp files
83-
tmp_path = pathlib.Path(__file__).parent /'static'/'tmp'
84-
if not os.path.exists(tmp_path):
85-
os.makedirs(tmp_path)
86-
139+
cache_path = cache.get_cache_path()
140+
if not os.path.exists(cache_path):
141+
os.makedirs(cache_path)
87142

88143
# to avoid runtime errors for missing keys when using flash()
89144
alphabet = 'abcdefghijklmnopqrstuvwxyz1234567890'
@@ -92,4 +147,5 @@ def render_mmif(mmif_str):
92147
port = 5000
93148
if len(sys.argv) > 2 and sys.argv[1] == '-p':
94149
port = int(sys.argv[2])
95-
app.run(port=port, host='0.0.0.0', debug=True)
150+
151+
app.run(port=port, host='0.0.0.0', debug=True, use_reloader=False)

Diff for: cache.py

+68
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
import os
2+
import time
3+
import shutil
4+
import threading
5+
import pathlib
6+
7+
from utils import app
8+
9+
# Held by cleanup() for the whole size check and eviction loop.
lock = threading.Lock()
10+
11+
12+
def get_cache_path():
    """Return the cache directory: ``tmp`` inside the app's static folder."""
    static_root = pathlib.Path(app.static_folder)
    return static_root.joinpath("tmp")
14+
15+
16+
def get_cache_relpath(full_path):
    """Return *full_path* as a string with the static-folder prefix cut off.

    The cut is made by character count (the length of ``app.static_folder``),
    so everything after that many characters is returned unchanged.
    """
    prefix_length = len(app.static_folder)
    path_text = str(full_path)
    return path_text[prefix_length:]
18+
19+
20+
def invalidate_cache(viz_ids=None):
    """Remove cached visualizations.

    Args:
        viz_ids: an iterable of visualization IDs (cache directory names) to
            remove. When omitted, ``None`` or empty, the entire cache
            directory is deleted and recreated empty.

    Raises:
        FileNotFoundError: if a directory to be removed does not exist
            (raised by ``shutil.rmtree``).
    """
    cache_path = get_cache_path()
    if not viz_ids:
        app.logger.debug("Invalidating entire cache.")
        shutil.rmtree(cache_path)
        os.makedirs(cache_path)
        return
    for v in viz_ids:
        app.logger.debug(f"Invalidating {v} from cache.")
        shutil.rmtree(cache_path / v)
29+
30+
31+
def set_last_access(path):
    """Record the current time as the last access of the cache entry *path*.

    The timestamp (seconds since the epoch, as text) overwrites
    ``last_access.txt`` inside *path*.
    """
    stamp_file = pathlib.Path(path, "last_access.txt")
    stamp_file.write_text(str(time.time()))
34+
35+
36+
def scan_tmp_directory():
    """Measure the cache and find its least recently accessed entry.

    Returns:
        A ``(total_size, oldest_dir)`` tuple: the summed size in bytes of all
        files under the cache directory, and the path of the entry with the
        smallest recorded access time (``None`` if there is no candidate).
    """
    cache_root = get_cache_path()
    total_size = sum(f.stat().st_size for f in cache_root.glob('**/*') if f.is_file())

    oldest_dir = None
    oldest_time = None
    # Each sub-directory of the cache is one visualization ID.
    for p in cache_root.glob('*'):
        if not p.is_dir():
            # Stray files are not cache entries and cannot be rmtree'd.
            continue
        try:
            # Read the file exactly once; a second read() on the same handle
            # returns an empty string.
            with open(p / 'last_access.txt') as f:
                timestamp = f.read().strip()
        except FileNotFoundError:
            # No access record at all: treat as the oldest possible entry.
            access_time = 0.0
        else:
            if timestamp == '':
                # Empty record: skip this entry, as the original code did.
                continue
            try:
                access_time = float(timestamp)
            except ValueError:
                # Unreadable record: evict first instead of crashing the scan.
                access_time = 0.0
        if oldest_time is None or access_time < oldest_time:
            oldest_dir, oldest_time = p, access_time
    return total_size, oldest_dir
57+
58+
59+
def cleanup(max_size=500000000):
    """Evict the oldest cache entries until the cache fits within *max_size*.

    Args:
        max_size: maximum total cache size in bytes (default 500MB).

    The whole check-and-evict loop runs while holding the module-level lock.
    """
    with lock:
        print("Checking visualization cache...")
        folder_size, oldest_dir = scan_tmp_directory()
        while folder_size > max_size:
            if oldest_dir is None:
                # Nothing evictable is left although the cache is still over
                # the limit; stop instead of failing on basename(None).
                print("Maximum cache size reached, but no cached visualization can be deleted.")
                break
            print(f"Maximum cache size reached. Deleting {os.path.basename(oldest_dir)}.")
            shutil.rmtree(oldest_dir)
            folder_size, oldest_dir = scan_tmp_directory()

Diff for: iiif_utils.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
1+
import datetime
12
import json
23
import os
3-
import pathlib
44
import tempfile
55
from typing import Dict
66

77
import mmif
88
from flask import url_for
99
from mmif import AnnotationTypes, DocumentTypes, Mmif
10-
import datetime
10+
11+
import cache
1112

1213

13-
def generate_iiif_manifest(in_mmif: mmif.Mmif):
14+
def generate_iiif_manifest(in_mmif: mmif.Mmif, viz_id):
1415
iiif_json = {
1516
"@context": "http://iiif.io/api/presentation/2/context.json",
1617
"id": "http://0.0.0.0:5000/mmif_example_manifest.json",
@@ -28,7 +29,7 @@ def generate_iiif_manifest(in_mmif: mmif.Mmif):
2829
}
2930
add_canvas_from_documents(in_mmif, iiif_json)
3031
add_structure_from_timeframe(in_mmif, iiif_json)
31-
return save_manifest(iiif_json)
32+
return save_manifest(iiif_json, viz_id)
3233

3334

3435
def add_canvas_from_documents(in_mmif, iiif_json):
@@ -105,9 +106,10 @@ def add_structure_from_timeframe(in_mmif: Mmif, iiif_json: Dict):
105106
iiif_json["structures"].append(view_range)
106107

107108

108-
def save_manifest(iiif_json: Dict, viz_id) -> str:
    """Write the IIIF manifest to a JSON file in the visualization's cache directory.

    Args:
        iiif_json: the manifest to serialize.
        viz_id: name of the cache sub-directory that receives the file.

    Returns:
        The path of the newly created ``.json`` file. The file is created
        with ``delete=False``, so it remains on disk after it is closed.
    """
    # generate a iiif manifest and save output file; the context manager
    # flushes and closes it before the path is handed to the caller
    with tempfile.NamedTemporaryFile(
            'w', dir=str(cache.get_cache_path() / viz_id), suffix='.json', delete=False) as manifest:
        json.dump(iiif_json, manifest, indent=4)
    return manifest.name
113115

0 commit comments

Comments
 (0)