Commit: generate many versions
Volodymyr Savchenko authored and committed on May 24, 2024
1 parent e451314, commit 61ccf92
Showing 7 changed files with 150 additions and 53 deletions.
tests/test_absolutize.py (2 changes: 1 addition & 1 deletion)
@@ -31,7 +31,7 @@ def test_make_paths_absolute(self, workflow_id):
             rdflib.Graph().parse(data=json.dumps(json_data), format="json-ld")
         )
 
-        json_data_abs_paths = make_paths_absolute(json_data, BASE_URL, 41)
+        json_data_abs_paths = make_paths_absolute(json_data, BASE_URL, 41, 1)
 
         parsed_graph = rdflib.Graph().parse(
            data=json.dumps(json_data_abs_paths), format="json-ld"
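Note: the new fourth argument is the workflow version (see the absolutize.py change below). A minimal sketch of the URL it now controls, using the f-string from the diff and an illustrative base URL:

    # The version-qualified RO-Crate URL used as the JSON-LD '@base'.
    # "https://workflowhub.eu" is an illustrative value, not taken from this commit.
    workflowhub_url, workflow_id, workflow_version = "https://workflowhub.eu", 41, 1
    workflow_url = (
        f"{workflowhub_url}/workflows/{workflow_id}/ro_crate?version={workflow_version}"
    )
    print(workflow_url)  # https://workflowhub.eu/workflows/41/ro_crate?version=1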
tests/test_source_crates.py (41 changes: 32 additions & 9 deletions)
@@ -123,22 +123,45 @@ def test_download_workflow_ids_404(self, mock_requests_get):
 
 
 class TestProcessWorkflowIds:
+    @patch("workflowhub_graph.source_crates.get_dot_json_endpoint")
     @patch("workflowhub_graph.source_crates.download_and_extract_json_from_zip")
+    @pytest.mark.parametrize("all_versions", [True, False])
     def test_process_workflow_ids(
-        self, mock_download_and_extract_json_from_zip, setup_output_dir
+        self,
+        mock_download_and_extract_json_from_zip,
+        get_dot_json_endpoint,
+        setup_output_dir,
+        all_versions,
     ):
         """Mock a successful download and extraction of a JSON file from a zip file"""
 
         mock_download_and_extract_json_from_zip.return_value = b'{"name": "test"}'
 
+        get_dot_json_endpoint.return_value = {
+            "data": {
+                "attributes": {
+                    "latest_version": 12,
+                    "versions": [{"version": 3}, {"version": 4}],
+                }
+            }
+        }
+
         workflows_data = {"data": [{"id": "883"}]}
 
         output_dir = setup_output_dir
-        process_workflow_ids(workflows_data, output_dir)
-
-        expected_file_path = os.path.join(output_dir, "883_ro-crate-metadata.json")
-        assert os.path.exists(expected_file_path)
-
-        with open(expected_file_path, "rb") as f:
-            content = f.read()
-            assert content == b'{"name": "test"}'
+        process_workflow_ids(workflows_data, output_dir, all_versions=all_versions)
+
+        if all_versions:
+            expected_versions = [3, 4]
+        else:
+            expected_versions = [12]
+
+        for version in expected_versions:
+            expected_file_path = os.path.join(
+                output_dir, f"883_{version}_ro-crate-metadata.json"
+            )
+            assert os.path.exists(expected_file_path)
+
+            with open(expected_file_path, "rb") as f:
+                content = f.read()
+                assert content == b'{"name": "test"}'
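Note: stacked @patch decorators are applied bottom-up, so the decorator closest to the function supplies the first mock argument; that is why mock_download_and_extract_json_from_zip precedes get_dot_json_endpoint in the new signature, and pytest.mark.parametrize then runs the whole test once per all_versions value. A standalone illustration of the ordering rule (not from the repository):

    from unittest.mock import patch

    @patch("os.path.exists")  # outer decorator -> injected second
    @patch("os.path.join")    # inner decorator -> injected first
    def demo(mock_join, mock_exists):
        # Both arguments are MagicMocks replacing the patched names.
        assert mock_join is not mock_exists

    demo()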
workflowhub_graph/absolutize.py (13 changes: 8 additions & 5 deletions)
@@ -27,8 +27,9 @@ def is_all_absolute(G: rdflib.Graph) -> bool:
 # normative schema.org dev docs: https://schema.org/docs/developers.html
 # TODO: make a note for validation of the graph
 
+
 def make_paths_absolute(
-    json_data: dict, workflowhub_url: str, workflow_id: int
+    json_data: dict, workflowhub_url: str, workflow_id: int, workflow_version: int
 ) -> dict:
     """
     Makes all paths in the JSON content absolute by adding an '@base' key to the JSON-LD context.
@@ -40,8 +41,9 @@ def make_paths_absolute(
     :raises ValueError: If '@context' key is missing or if '@base' key already exists in the JSON content.
     """
 
-    # TODO: where version comes from?
-    workflow_url = f"{workflowhub_url}/workflows/{workflow_id}/ro_crate?version=1"
+    workflow_url = (
+        f"{workflowhub_url}/workflows/{workflow_id}/ro_crate?version={workflow_version}"
+    )
 
     if "@context" not in json_data:
         raise ValueError(
@@ -69,7 +71,8 @@ def main():
     )
     parser.add_argument("json_file", help="The JSON file to process.")
     parser.add_argument("output_file", help="The output file.")
-    parser.add_argument("workflow_id", help="The WorkflowHub ID.")
+    parser.add_argument("workflow_id", help="The Workflow ID.")
+    parser.add_argument("workflow_version", help="The Workflow version.")
     parser.add_argument(
         "-u",
         "--workflowhub-url",
@@ -83,7 +86,7 @@ def main():
         json_data = json.load(f)
 
     processed_json_data = make_paths_absolute(
-        json_data, args.workflowhub_url, args.workflow_id
+        json_data, args.workflowhub_url, args.workflow_id, args.workflow_version
     )
 
     if args.output_file == "-":
workflowhub_graph/constants.py (3 changes: 2 additions & 1 deletion)
@@ -15,7 +15,8 @@
 BASE_URL = BASE_URL_DEV
 WORKFLOWS_URL = WORKFLOWS_URL_DEV
 
-METADATA_ENDPOINT = "/workflows/{w_id}/ro_crate_metadata"
+DOT_JSON_ENDPOINT = "/workflows/{w_id}.json"
+METADATA_ENDPOINT = "/workflows/{w_id}/ro_crate_metadata?version={w_version}"
 ZIP_ENDPOINT = "/workflows/{w_id}/ro_crate?version={w_version}"
 
 TARGET_FILE_NAME = "ro-crate-metadata.json"
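Note: all three templates expand with str.format; a quick sketch of the resulting paths (883 mirrors the test fixture, version 4 is an arbitrary example):

    DOT_JSON_ENDPOINT = "/workflows/{w_id}.json"
    METADATA_ENDPOINT = "/workflows/{w_id}/ro_crate_metadata?version={w_version}"
    ZIP_ENDPOINT = "/workflows/{w_id}/ro_crate?version={w_version}"

    print(DOT_JSON_ENDPOINT.format(w_id=883))               # /workflows/883.json
    print(METADATA_ENDPOINT.format(w_id=883, w_version=4))  # /workflows/883/ro_crate_metadata?version=4
    print(ZIP_ENDPOINT.format(w_id=883, w_version=4))       # /workflows/883/ro_crate?version=4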
workflowhub_graph/merge.py (16 changes: 14 additions & 2 deletions)
@@ -2,6 +2,7 @@
 import glob
 import json
 import os
+import re
 
 import rdflib
 
@@ -33,9 +34,20 @@ def merge_all_files(
         with open(fn, "r") as f:
             print(f"Processing {fn}, {i}/{len(filenames)}")
 
-            w_id = int(os.path.basename(fn).split("_")[0])
+            basename = os.path.basename(fn)
 
-            json_data = make_paths_absolute(json.load(f), base_url, w_id)
+            if matched := re.match("([0-9]+?)_ro-crate-metadata.json", basename):
+                w_id = int(matched.group(1))
+                w_version = 1
+            elif matched := re.match(
+                "([0-9]+?)_([0-9]+?)_ro-crate-metadata.json", basename
+            ):
+                w_id = int(matched.group(1))
+                w_version = int(matched.group(2))
+            else:
+                raise ValueError(f"Could not match the filename {basename}")
+
+            json_data = make_paths_absolute(json.load(f), base_url, w_id, w_version)
 
             # TODO: Is there an issue here? Linting shows "Expected type 'str | bytes | None', got 'dict' instead"
             with patch_rdflib_urlopen(**cache_kwargs):
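Note: the walrus-operator dispatch above accepts both the legacy un-versioned filenames and the new <id>_<version> names, defaulting legacy files to version 1. A standalone sketch of the same matching logic:

    import re

    # re.match() anchors only at the start of the string, so the first pattern
    # cannot consume a versioned name: in "883_4_...", "_ro-crate" never follows
    # the leading digits, and matching falls through to the second pattern.
    for basename in ["883_ro-crate-metadata.json", "883_4_ro-crate-metadata.json"]:
        if matched := re.match("([0-9]+?)_ro-crate-metadata.json", basename):
            w_id, w_version = int(matched.group(1)), 1
        elif matched := re.match(
            "([0-9]+?)_([0-9]+?)_ro-crate-metadata.json", basename
        ):
            w_id, w_version = int(matched.group(1)), int(matched.group(2))
        else:
            raise ValueError(f"Could not match the filename {basename}")
        print(basename, "->", w_id, w_version)  # 883 1, then 883 4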
workflowhub_graph/source_crates.py (100 changes: 79 additions & 21 deletions)
@@ -1,4 +1,5 @@
 import argparse
+import json
 import os
 import traceback
 
@@ -9,6 +10,7 @@
 from workflowhub_graph.constants import (
     BASE_URL_DEV,
     BASE_URL_PROD,
+    DOT_JSON_ENDPOINT,
     TARGET_FILE_NAME,
     METADATA_ENDPOINT,
     WORKFLOWS_URL_DEV,
@@ -17,6 +19,33 @@
 )
 
 
+def get_dot_json_endpoint(target_url) -> dict | None:
+    """
+    Fetches the JSON document that WorkflowHub serves for a workflow and returns it.
+    :param target_url: The URL of the workflow's .json endpoint.
+    :return: The parsed JSON content, or None if the download fails.
+    """
+    try:
+        response = requests.get(target_url)
+        response.raise_for_status()
+
+        if "application/vnd.api+json" in response.headers["Content-Type"]:
+            return response.json()
+        else:
+            print(f"No JSON file found at: '{target_url}'")
+            return None
+
+    except requests.RequestException as e:
+        print(f"Failed to download the JSON file from {target_url}. Error: {e}")
+        return None
+    except Exception as e:
+        print(
+            f"An error occurred while downloading the JSON file from {target_url}. Error: {e}"
+        )
+        return None
+
+
 def download_and_extract_json_from_metadata_endpoint(target_url: str) -> bytes | None:
     """
     Downloads a JSON file from WorkflowHub (or specified URL) of a specific workflow and returns its content.
@@ -116,6 +145,7 @@ def process_workflow_ids(
     output_dir: str = "data",
     is_metadata_endpoint: bool = False,
     base_url: str = BASE_URL_DEV,
+    all_versions: bool = False,
 ) -> None:
     """
     Utilises the JSON file downloaded by download_workflow_ids(). This file is used to download a list
@@ -141,29 +171,57 @@ def process_workflow_ids(
                 f"Processing workflow ID {workflow_id} ({i_workflow + 1}/{len(workflows)})..."
             )
 
-            # TODO: Remove dev WorkflowHub URL:
-            if is_metadata_endpoint:
-                endpoint = METADATA_ENDPOINT.format(w_id=workflow_id)
-                json_content = download_and_extract_json_from_metadata_endpoint(
-                    base_url + endpoint
-                )
-            else:
-                # TODO: Where does version come from?
-                # TODO: make mutliple versions or the latest version
-                endpoint = ZIP_ENDPOINT.format(w_id=workflow_id, w_version=1)
-                json_content = download_and_extract_json_from_zip(base_url + endpoint)
-
-            if json_content:
-                output_file_path = os.path.join(
-                    output_dir, f"{workflow_id}_ro-crate-metadata.json"
-                )
-                with open(output_file_path, "wb") as output_file:
-                    output_file.write(json_content)
-                print(f"Content saved to {output_file_path}")
-                n_successful += 1
-
-            else:
-                print(f"Failed to process workflow ID {workflow_id}")
+            workflow_json = get_dot_json_endpoint(
+                base_url + DOT_JSON_ENDPOINT.format(w_id=workflow_id)
+            )
+
+            try:
+                if all_versions:
+                    workflow_versions = [
+                        workflow_version_dict["version"]
+                        for workflow_version_dict in workflow_json["data"][
+                            "attributes"
+                        ]["versions"]
+                    ]
+                else:
+                    workflow_versions = [
+                        workflow_json["data"]["attributes"]["latest_version"]
+                    ]
+            except (KeyError, TypeError):
+                print(
+                    f"Failed to extract workflow versions from:\n",
+                    json.dumps(workflow_json, indent=4),
+                )
+                continue
+
+            # TODO: Remove dev WorkflowHub URL:
+            for w_version in workflow_versions:
+                if is_metadata_endpoint:
+                    endpoint = METADATA_ENDPOINT.format(
+                        w_id=workflow_id, w_version=w_version
+                    )
+                    json_content = download_and_extract_json_from_metadata_endpoint(
+                        base_url + endpoint
+                    )
+                else:
+                    endpoint = ZIP_ENDPOINT.format(
+                        w_id=workflow_id, w_version=w_version
+                    )
+                    json_content = download_and_extract_json_from_zip(
+                        base_url + endpoint
+                    )
+
+                if json_content:
+                    output_file_path = os.path.join(
+                        output_dir, f"{workflow_id}_{w_version}_ro-crate-metadata.json"
+                    )
+                    with open(output_file_path, "wb") as output_file:
+                        output_file.write(json_content)
+                    print(f"Content saved to {output_file_path}")
+                    n_successful += 1
+
+                else:
+                    print(f"Failed to process workflow ID {workflow_id}")
 
     except Exception as e:
         print(f"An error occurred while processing workflow IDs. Error: {e}")
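Note: version discovery relies on the JSON:API document WorkflowHub serves at /workflows/<id>.json; the test above mocks the same shape. A minimal sketch of the selection logic in isolation (values are illustrative):

    workflow_json = {
        "data": {
            "attributes": {
                "latest_version": 12,
                "versions": [{"version": 3}, {"version": 4}],
            }
        }
    }

    for all_versions in (True, False):
        if all_versions:
            versions = [
                v["version"]
                for v in workflow_json["data"]["attributes"]["versions"]
            ]
        else:
            versions = [workflow_json["data"]["attributes"]["latest_version"]]
        print(all_versions, versions)  # True [3, 4], then False [12]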
workflowhub_graph/upload.py (28 changes: 14 additions & 14 deletions)
@@ -4,28 +4,28 @@
 
 def main():
     # TODO: this is very preliminary, needs to be improved
 
     token = os.getenv("ZENODO_ACCESS_TOKEN")
 
     if token is None:
         raise Exception("ZENODO_ACCESS_TOKEN environment variable is not set")
 
     headers = {"Content-Type": "application/json"}
-    params = {'access_token': token}
-    r = requests.post('https://sandbox.zenodo.org/api/deposit/depositions',
-                      params=params,
-                      json={},
-                      headers=headers
-                      )
-
+    params = {"access_token": token}
+    r = requests.post(
+        "https://sandbox.zenodo.org/api/deposit/depositions",
+        params=params,
+        json={},
+        headers=headers,
+    )
 
     if r.status_code != 201:
-        raise Exception(f'Failed to create deposition: {r} {r.text}')
+        raise Exception(f"Failed to create deposition: {r} {r.text}")
 
     print(r.json())
 
     bucket_url = r.json()["links"]["bucket"]
 
 
     for filename in "merged.ttl", "merged.pdf":
         path = filename
 
@@ -37,8 +37,8 @@ def main():
         )
         r.json()
         if r.status_code != 201:
-            raise Exception(f'Failed to upload file: {r} {r.text}')
+            raise Exception(f"Failed to upload file: {r} {r.text}")
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
