-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstorage_api.py
202 lines (182 loc) · 9.04 KB
/
storage_api.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
from mmif import Mmif
from clams import mmif_utils
from flask import Flask, request, jsonify, Blueprint
from typing import List, Dict
from typing_extensions import Annotated
import os
import hashlib
import json
# make blueprint of app to be used in __init__.py
blueprint = Blueprint('app', __name__)
# get post request from user
# read mmif inside post request, get view metadata
# store in nested directory relating to view metadata
@blueprint.route("/")
def root():
    """Landing endpoint: identifies this service to a probing client."""
    greeting = {"message": "Storage api for pipelined mmif files"}
    return greeting
@blueprint.route("/storeapi/mmif/", methods=["POST"])
def upload_mmif():
    """
    Store a POSTed mmif file under STORAGE_DIR in a nested directory tree.

    Each view contributes path segments <app-name>/<version>/<md5-of-params>
    derived from the view's app url and parameter dict. The mmif itself is
    written to the deepest directory as <guid>.mmif, and each level gets a
    parameters.json recording that view's parameter dict.

    Returns:
        ("Success", 201) on success, or a jsonify'd error with 400 when any
        view's app version is missing or unresolvable.
    """
    body = request.get_data(as_text=True)
    # read local storage directory from the environment (.env)
    directory = os.environ.get('STORAGE_DIR')
    mmif = Mmif(body)
    # get guid from the first document's location
    # NOTE(review): assumes location looks like scheme://<guid>.<ext> -- confirm
    document = mmif.documents['d1']['properties'].location.split('/')[2].split('.')[0]
    # append '.mmif' to guid to form the stored filename
    document = document + '.mmif'
    # Map each accumulated path level to its param dict so that directories can
    # be created after the loop and each param dict dumped in its proper spot.
    param_path_dict = {}
    for view in mmif.views:
        # back half of the app url: just app name and version number
        subdir_list = view.metadata.app.split('/')[3:]
        # create path string for this view
        view_path = os.path.join('', *subdir_list)
        # Reject mmifs with nonexistent or unresolvable version numbers --
        # storing them would cause path conflicts.
        if len(subdir_list) < 2 or subdir_list[1] == "unresolvable":
            # BUGFIX: when the app url has no trailing segments at all,
            # subdir_list[0] used to raise IndexError (-> 500); fall back to
            # the full app url in the error message.
            app_name = subdir_list[0] if subdir_list else view.metadata.app
            return jsonify({'error': f'app {app_name} version is underspecified'}), 400
        # Convert the parameter dictionary to a sorted, comma-joined string and
        # hash it; the hash names the next subdirectory level.
        try:
            param_dict = view.metadata["parameters"]
            # BUGFIX: build "k=v" pairs with an f-string so non-string values
            # (ints, bools) don't crash str.join with a TypeError
            param_list = [f'{k}={v}' for k, v in param_dict.items()]
            param_list.sort()
            param_string = ','.join(param_list)
        except KeyError:
            # BUGFIX: use an empty dict (not "") so parameters.json below is
            # dumped as a JSON object, matching the with-parameters case
            param_dict = {}
            param_string = ""
        # NOTE: md5 is used for directory bucketing, *not* security.
        param_hash = hashlib.md5(param_string.encode('utf-8')).hexdigest()
        view_path = os.path.join(view_path, param_hash)
        # Skip duplicate views: same app, version number, AND parameter hash.
        # NOTE(review): substring containment is a loose duplicate test -- a
        # short view path inside any longer path also matches; confirm intended.
        if view_path in directory:
            continue
        # descend: extend the accumulated directory with this view's path
        directory = os.path.join(directory, view_path)
        # remember which param dict belongs at this level
        param_path_dict[directory] = param_dict
    # All views processed: create the leaf directory (and all parents), then
    # dump each recorded param dict at its level.
    os.makedirs(directory, exist_ok=True)
    for path in param_path_dict:
        # NOTE(review): dirname() places parameters.json *beside* the hash
        # directory, not inside it -- confirm this placement is intended
        file_path = os.path.join(os.path.dirname(path), 'parameters.json')
        with open(file_path, "w") as f:
            json.dump(param_path_dict[path], f)
    # put mmif into the lowest level directory with filename based on guid
    file_path = os.path.join(directory, document)
    with open(file_path, "w") as f:
        f.write(mmif.serialize())
    return "Success", 201
@blueprint.route("/searchapi/mmif/", methods=["POST"])
def download_mmif():
    """
    Retrieve stored mmif file(s) by pipeline and guid.

    The request json must contain a 'pipeline' mapping; 'guid' is optional and
    may be a single string or a list. With no guid, only the resolved local
    pipeline path is returned (lets clients work with local files directly).
    When no exact file matches, falls back to rewinding a longer stored mmif
    whose pipeline is a superset of the requested one.

    Returns:
        The serialized mmif (single guid), a json object keyed by guid (list),
        the resolved pipeline path (no guid), or a jsonify'd error.
    """
    data = json.loads(request.data.decode('utf-8'))
    # build the storage sub-path for the requested pipeline
    pipeline = pipeline_from_param_json(data)
    # number of requested views -- needed if we have to rewind a longer mmif
    num_views = len(data['pipeline'])
    guid = data.get('guid')
    # validate existence of pipeline; guid is not required
    if not pipeline:
        # BUGFIX: return an explicit 400 instead of an implicit 200 with an error body
        return jsonify({'error': 'Missing required parameters: need at least a pipeline'}), 400
    # anchor the pipeline path under the local storage directory
    directory = os.environ.get('STORAGE_DIR')
    pipeline = os.path.join(directory, pipeline)
    # "zero-guid" request: client only wants the resolved local storage path
    if not guid:
        return jsonify({'pipeline': pipeline})
    # guid may be a single value or a list
    if not isinstance(guid, list):
        guid = guid + ".mmif"
        path = os.path.join(pipeline, guid)
        # if the file exists, return it directly
        try:
            with open(path, 'r') as file:
                return file.read()
        # Otherwise use the rewinder: this assumes the requested pipeline is a
        # subset of a previously stored mmif's pipeline.
        except FileNotFoundError:
            try:
                return rewind_time(pipeline, guid, num_views)
            except FileNotFoundError:
                # BUGFIX: the list branch below catches this, but the single-guid
                # path used to let it escape as a 500; surface a 404 instead
                return jsonify({'error': f'no mmif found for guid {guid}'}), 404
    else:
        # Multiple guids requested: build {guid: serialized mmif} for each.
        mmifs_by_guid = dict()
        for curr_guid in guid:
            curr_guid = curr_guid + ".mmif"
            path = os.path.join(pipeline, curr_guid)
            # if the file exists, record it under its bare guid (extension stripped)
            try:
                with open(path, 'r') as file:
                    mmif = file.read()
                mmifs_by_guid[curr_guid.split('.')[0]] = mmif
            # Otherwise try the rewinder, as in the single-guid case; on a second
            # miss, store a placeholder string for that guid.
            except FileNotFoundError:
                try:
                    mmif = rewind_time(pipeline, curr_guid, num_views)
                    mmifs_by_guid[curr_guid.split('.')[0]] = mmif
                except FileNotFoundError:
                    # TODO: figure out a good way to mark file not found
                    mmifs_by_guid[curr_guid.split('.')[0]] = "File not found"
        # serialize the guid -> mmif mapping and return it
        return json.dumps(mmifs_by_guid)
# helper method for extracting pipeline
def pipeline_from_param_json(param_json):
    """
    Build a storage-path string from a pipeline description.

    Reads a json/dict containing the names of the pipelined apps mapped to
    their respective parameters, and builds a path of alternating app names
    and md5 hashes of each app's sorted parameter string.

    Args:
        param_json: mapping with a "pipeline" key; its value maps each app
            name to a dict of parameters (or an empty non-dict for no params).

    Returns:
        The path string "app1/<hash1>/app2/<hash2>/..." with no leading slash.
    """
    pipeline = ""
    for clams_app in param_json["pipeline"]:
        # not using os.path.join until later for testing purposes
        pipeline = pipeline + "/" + clams_app
        try:
            # BUGFIX: build "k=v" pairs with an f-string so non-string
            # parameter values don't crash str.join with a TypeError
            param_list = [f'{k}={v}' for k, v in param_json["pipeline"][clams_app].items()]
            param_list.sort()
            param_string = ','.join(param_list)
        # throws AttributeError if empty (because empty means it's a set and not dict)
        except AttributeError:
            param_string = ""
        # hash parameters -- md5 is for directory bucketing, not security
        param_hash = hashlib.md5(param_string.encode('utf-8')).hexdigest()
        pipeline = pipeline + "/" + param_hash
    # removing first "/" so it doesn't mess with os.path.join later
    pipeline = pipeline[1:]
    return pipeline
def rewind_time(pipeline, guid, num_views):
    """
    Locate a stored mmif under ``pipeline`` and rewind it to ``num_views`` views.

    Walks every directory beneath the given pipeline path; the first file whose
    name equals ``guid`` is loaded, rewound so that only the views indicated by
    the pipeline remain, and returned serialized.

    Raises:
        FileNotFoundError: when no file matching ``guid`` exists under the path.
    """
    for dirpath, _dirs, filenames in os.walk(pipeline):
        if guid not in filenames:
            continue
        # found a guid-matching mmif: load and rewind it
        with open(os.path.join(dirpath, guid), 'r') as f:
            stored = Mmif(f.read())
        # rewind away however many trailing views exceed the requested count
        rewound = mmif_utils.rewind.rewind_mmif(stored, len(stored.views) - num_views)
        return rewound.serialize()
    raise FileNotFoundError
# if __name__ == "__main__":
# blueprint.run(port=8912)