Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prototype for storage API #7

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ FLASK_RUN_PORT=8001
FLASK_RUN_HOST=127.0.0.1
ASSET_DIR=/mnt
DOWNLOAD_DIR=${ASSET_DIR}/downloads
STORAGE_DIR=/home/jordanyouner/storage_test
BUILD_DB=1
9 changes: 8 additions & 1 deletion api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,19 @@
from datetime import date
from pathlib import Path
from string import Template
from api.storage_api import blueprint as storage_bp
from mmif import Mmif
from clams import mmif_utils
import hashlib
import json

from flask import Flask, render_template, request, Blueprint
from flask import Flask, render_template, request, Blueprint, jsonify

DATABASE = Path(__file__).parent / 'database.db'
SEARCH_DIRECTORY = os.environ.get('ASSET_DIR')
RESULT_DIRECTORY = os.environ.get('DOWNLOAD_DIR')
BUILD_DB = bool(os.environ.get('BUILD_DB'))
STORAGE_DIRECTORY = os.environ.get('STORAGE_DIR')

bp = Blueprint('app', __name__, template_folder='templates')

Expand Down Expand Up @@ -207,5 +213,6 @@ def create_app(build_db=BUILD_DB):
app = Flask(__name__)
app.config.from_prefixed_env()
app.register_blueprint(bp)
app.register_blueprint(storage_bp, name='storage_app')

return app
202 changes: 202 additions & 0 deletions api/storage_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
from mmif import Mmif
from clams import mmif_utils
from flask import Flask, request, jsonify, Blueprint
from typing import List, Dict
from typing_extensions import Annotated
import os
import hashlib
import json

# make blueprint of app to be used in __init__.py
blueprint = Blueprint('app', __name__)
# get post request from user
# read mmif inside post request, get view metadata
# store in nested directory relating to view metadata



@blueprint.route("/")
def root():
return {"message": "Storage api for pipelined mmif files"}


@blueprint.route("/storeapi/mmif/", methods=["POST"])
def upload_mmif():
body = request.get_data(as_text=True)
# read local storage directory from .env
directory = os.environ.get('STORAGE_DIR')
mmif = Mmif(body)
# get guid from location
document = mmif.documents['d1']['properties'].location.split('/')[2].split('.')[0]
# append '.mmif' to guid
document = document + '.mmif'
# IMPORTANT: In order to enable directory creation after this loop and also store each parameter
# dictionary in its proper directory, I create a dictionary to associate the current path level with
# its param dict. After this loop, I create the dirs and then iterate through this dictionary to
# place the param dicts in their proper spots.
param_path_dict = {}
for view in mmif.views:
# this should return the back half of the app url, so just app name and version number
subdir_list = view.metadata.app.split('/')[3:]
# create path string for this view
view_path = os.path.join('', *subdir_list)
# IMPORTANT: We must check for both nonexistent and unresolvable version numbers.
# In both of these cases we do not want to store the mmif, as it would cause conflicts.
# TODO: Check back on this because I might be making assumptions about this data that aren't always
# TODO: true, e.g about list length. Far as I can tell though it aligns with typical mmif metadata.
if len(subdir_list) < 2 or subdir_list[1] == "unresolvable":
return jsonify({'error': f'app {subdir_list[0]} version is underspecified'}), 400
# now we want to convert the parameter dictionary to a string and then hash it.
# this hash will be the name of another subdirectory.
try:
param_dict = view.metadata["parameters"]
param_list = ['='.join(pair) for pair in param_dict.items()]
param_list.sort()
param_string = ','.join(param_list)
except KeyError:
param_dict = ""
param_string = ""
# hash the (sorted and concatenated list of params) string and join with path
# NOTE: this is *not* for security purposes, so the usage of md5 is not an issue.
param_hash = hashlib.md5(param_string.encode('utf-8')).hexdigest()
view_path = os.path.join(view_path, param_hash)
# check if this is a duplicate view. if it is, skip the current view.
# NOTE: duplicate views are those with the same app, version number, AND parameter dict.
if view_path in directory:
continue
# create path by joining directory with the current view path
directory = os.path.join(directory, view_path)
# now that we know it's not a duplicate view and we have the proper path location, we
# store it and the associated param dict inside param_path_dict.
param_path_dict[directory] = param_dict
# we have finished looping through the views. now time to create the directories
# and dump the param dicts
os.makedirs(directory, exist_ok=True)
for path in param_path_dict:
file_path = os.path.join(os.path.dirname(path), 'parameters.json')
with open(file_path, "w") as f:
json.dump(param_path_dict[path], f)
# put mmif into the lowest level directory with filename based on guid
file_path = os.path.join(directory, document)
with open(file_path, "w") as f:
f.write(mmif.serialize())
return "Success", 201


@blueprint.route("/searchapi/mmif/", methods=["POST"])
def download_mmif():
data = json.loads(request.data.decode('utf-8'))
# get both pipeline and guid from data
# obtain pipeline using helper method
pipeline = pipeline_from_param_json(data)
# get number of views for rewind if necessary
num_views = len(data['pipeline'])
guid = data.get('guid')
# validate existence of pipeline, guid is not necessary if you just want the pipeline returned
if not pipeline:
return jsonify({'error': 'Missing required parameters: need at least a pipeline'})
# load environment variables to concat pipeline with local storage path
directory = os.environ.get('STORAGE_DIR')
pipeline = os.path.join(directory, pipeline)
# if this is a "zero-guid" request, the user will receive just the local storage pipeline
# this allows clients to utilize the api without downloading files (for working with local files)
if not guid:
return jsonify({'pipeline': pipeline})
# CHECK IF GUID IS SINGLE VALUE OR LIST
if not isinstance(guid, list):
guid = guid + ".mmif"
# get file from storage directory
path = os.path.join(pipeline, guid)
# if file exists, we can return it
try:
with open(path, 'r') as file:
mmif = file.read()
return mmif
# otherwise we will use the rewinder
# this assumes the user has provided a subset of a mmif pipeline that we have previously stored
# in the case where this is not true, we return a FileNotFound error.
except FileNotFoundError:
return rewind_time(pipeline, guid, num_views)
else:
# in the case where we want multiple mmifs retrieved, we construct a json to store
# each guid as a key and each mmif as the value.
mmifs_by_guid = dict()
for curr_guid in guid:
curr_guid = curr_guid + ".mmif"
# get file from storage directory
path = os.path.join(pipeline, curr_guid)
# if file exists, we can put it in the json
try:
with open(path, 'r') as file:
mmif = file.read()
# place serialized mmif into dictionary/json with guid key (remove file ext)
mmifs_by_guid[curr_guid.split('.')[0]] = mmif
# otherwise we will use the rewinder
# as with the single-guid case, this assumes the pipeline is a proper subset of
# another guid-matching mmif's pipeline.
# otherwise we store a string representing the lack of a file for that guid.
except FileNotFoundError:
try:
mmif = rewind_time(pipeline, curr_guid, num_views)
mmifs_by_guid[curr_guid.split('.')[0]] = mmif
except FileNotFoundError:
# TODO: figure out a good way to mark file not found
mmifs_by_guid[curr_guid.split('.')[0]] = "File not found"
# now turn the dictionary into a json and return it
return json.dumps(mmifs_by_guid)


# helper method for extracting pipeline
def pipeline_from_param_json(param_json):
"""
This method reads in a json containing the names of the pipelined apps and their
respective parameters, and then builds a path out of the pipelined apps and hashed
parameters.
"""
pipeline = ""
for clams_app in param_json["pipeline"]:
# not using os path join until later for testing purposes
pipeline = pipeline + "/" + clams_app
# try to get param items
try:
param_list = ['='.join(pair) for pair in param_json["pipeline"][clams_app].items()]
param_list.sort()
param_string = ','.join(param_list)
# throws attribute error if empty (because empty means it's a set and not dict)
except AttributeError:
param_string = ""
# hash parameters
param_hash = hashlib.md5(param_string.encode('utf-8')).hexdigest()
pipeline = pipeline + "/" + param_hash
# removing first "/" so it doesn't mess with os.path.join later
pipeline = pipeline[1:]
return pipeline


def rewind_time(pipeline, guid, num_views):
"""
This method takes in a pipeline (path), a guid, and a number of views, and uses os.walk to iterate through
directories that begin with that pipeline. It takes the first mmif file that matches the guid and uses the
rewind feature to include only the views indicated by the pipeline.
"""
for home, dirs, files in os.walk(pipeline):
# find mmif with matching guid to rewind
for file in files:
if guid == file:
# rewind the mmif
with open(os.path.join(home, file), 'r') as f:
mmif = Mmif(f.read())
# we need to calculate the number of views to rewind
rewound = mmif_utils.rewind.rewind_mmif(mmif, len(mmif.views) - num_views)
return rewound.serialize()
raise FileNotFoundError




# if __name__ == "__main__":
# blueprint.run(port=8912)




1 change: 1 addition & 0 deletions prototype/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
storage_dir: C:\Users\eljor\desktop\fast_playground\
22 changes: 22 additions & 0 deletions prototype/test.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"pipeline": {
"swt-detection/unresolvable": {
},

"simple-timepoints-stitcher/v1.3": {
"labelMap": "['B:bars', 'S:slate', 'S-H:slate', 'S-C:slate', 'S-D:slate', 'S-G:slate', 'I:chyron', 'N:chyron', 'Y:chyron', 'C:credits']"
},

"doctr-wrapper/unresolvable": {
"tfLabel": "chyron",
"pretty": ""
},

"doctr-wrapper": {
"tfLabel": "['credits', 'credit']",
"pretty": ""
}

},
"guid": "cpb-aacip-0515ac167c0"
}
13 changes: 13 additions & 0 deletions prototype/test_multiple_retrieve.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"pipeline": {
"distil-whisper-wrapper/v1.1": {
"pretty": "True"
},

"spacy-wrapper/v2.0": {
"pretty": "True"
}

},
"guid":["cpb-aacip-507-154dn40c26", "cpb-aacip-507-1v5bc3tf81"]
}
12 changes: 12 additions & 0 deletions prototype/test_rewind.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"pipeline": {
"swt-detection/unresolvable": {
},

"simple-timepoints-stitcher/v1.3": {
"labelMap": "['B:bars', 'S:slate', 'S-H:slate', 'S-C:slate', 'S-D:slate', 'S-G:slate', 'I:chyron', 'N:chyron', 'Y:chyron', 'C:credits']"
}

},
"guid": "cpb-aacip-0515ac167c0"
}
21 changes: 21 additions & 0 deletions prototype/test_zero_guid.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"pipeline": {
"swt-detection/unresolvable": {
},

"simple-timepoints-stitcher/v1.3": {
"labelMap": "['B:bars', 'S:slate', 'S-H:slate', 'S-C:slate', 'S-D:slate', 'S-G:slate', 'I:chyron', 'N:chyron', 'Y:chyron', 'C:credits']"
},

"doctr-wrapper/unresolvable": {
"tfLabel": "chyron",
"pretty": ""
},

"doctr-wrapper": {
"tfLabel": "['credits', 'credit']",
"pretty": ""
}

}
}
6 changes: 5 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
flask==2.3.2
gunicorn==20.1.0
python-dotenv
python-dotenv~=1.0.1
flask-dotenv

pyyaml~=6.0.1
pydantic~=1.10.17
clams