Commit: generate many versions
Volodymyr Savchenko authored and committed on May 24, 2024
1 parent e451314, commit 61ccf92
Showing 7 changed files with 150 additions and 53 deletions.
tests/test_absolutize.py (2 changes: 1 addition & 1 deletion)
@@ -31,7 +31,7 @@ def test_make_paths_absolute(self, workflow_id):
             rdflib.Graph().parse(data=json.dumps(json_data), format="json-ld")
         )
 
-        json_data_abs_paths = make_paths_absolute(json_data, BASE_URL, 41)
+        json_data_abs_paths = make_paths_absolute(json_data, BASE_URL, 41, 1)
 
         parsed_graph = rdflib.Graph().parse(
            data=json.dumps(json_data_abs_paths), format="json-ld"
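Note: the new fourth argument is the workflow version (see the absolutize.py change below). A minimal sketch of the URL it now controls, using the f-string from the diff and an illustrative base URL:

    # The version-qualified RO-Crate URL used as the JSON-LD '@base'.
    # "https://workflowhub.eu" is an illustrative value, not taken from this commit.
    workflowhub_url, workflow_id, workflow_version = "https://workflowhub.eu", 41, 1
    workflow_url = (
        f"{workflowhub_url}/workflows/{workflow_id}/ro_crate?version={workflow_version}"
    )
    print(workflow_url)  # https://workflowhub.eu/workflows/41/ro_crate?version=1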
tests/test_source_crates.py (41 changes: 32 additions & 9 deletions)
@@ -123,22 +123,45 @@ def test_download_workflow_ids_404(self, mock_requests_get):
 
 
 class TestProcessWorkflowIds:
+    @patch("workflowhub_graph.source_crates.get_dot_json_endpoint")
     @patch("workflowhub_graph.source_crates.download_and_extract_json_from_zip")
+    @pytest.mark.parametrize("all_versions", [True, False])
     def test_process_workflow_ids(
-        self, mock_download_and_extract_json_from_zip, setup_output_dir
+        self,
+        mock_download_and_extract_json_from_zip,
+        get_dot_json_endpoint,
+        setup_output_dir,
+        all_versions,
     ):
         """Mock a successful download and extraction of a JSON file from a zip file"""
 
         mock_download_and_extract_json_from_zip.return_value = b'{"name": "test"}'
 
+        get_dot_json_endpoint.return_value = {
+            "data": {
+                "attributes": {
+                    "latest_version": 12,
+                    "versions": [{"version": 3}, {"version": 4}],
+                }
+            }
+        }
+
         workflows_data = {"data": [{"id": "883"}]}
 
         output_dir = setup_output_dir
-        process_workflow_ids(workflows_data, output_dir)
-
-        expected_file_path = os.path.join(output_dir, "883_ro-crate-metadata.json")
-        assert os.path.exists(expected_file_path)
-
-        with open(expected_file_path, "rb") as f:
-            content = f.read()
-            assert content == b'{"name": "test"}'
+        process_workflow_ids(workflows_data, output_dir, all_versions=all_versions)
+
+        if all_versions:
+            expected_versions = [3, 4]
+        else:
+            expected_versions = [12]
+
+        for version in expected_versions:
+            expected_file_path = os.path.join(
+                output_dir, f"883_{version}_ro-crate-metadata.json"
+            )
+            assert os.path.exists(expected_file_path)
+
+            with open(expected_file_path, "rb") as f:
+                content = f.read()
+                assert content == b'{"name": "test"}'
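Note: stacked @patch decorators are applied bottom-up, so the decorator closest to the function supplies the first mock argument; that is why mock_download_and_extract_json_from_zip precedes get_dot_json_endpoint in the new signature, and pytest.mark.parametrize then runs the whole test once per all_versions value. A standalone illustration of the ordering rule (not from the repository):

    from unittest.mock import patch

    @patch("os.path.exists")  # outer decorator -> injected second
    @patch("os.path.join")    # inner decorator -> injected first
    def demo(mock_join, mock_exists):
        # Both arguments are MagicMocks replacing the patched names.
        assert mock_join is not mock_exists

    demo()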
workflowhub_graph/absolutize.py (13 changes: 8 additions & 5 deletions)
@@ -27,8 +27,9 @@ def is_all_absolute(G: rdflib.Graph) -> bool:
 # normative schema.org dev docs: https://schema.org/docs/developers.html
 # TODO: make a note for validation of the graph
 
+
 def make_paths_absolute(
-    json_data: dict, workflowhub_url: str, workflow_id: int
+    json_data: dict, workflowhub_url: str, workflow_id: int, workflow_version: int
 ) -> dict:
     """
     Makes all paths in the JSON content absolute by adding an '@base' key to the JSON-LD context.
@@ -40,8 +41,9 @@ def make_paths_absolute(
     :raises ValueError: If '@context' key is missing or if '@base' key already exists in the JSON content.
     """
 
-    # TODO: where version comes from?
-    workflow_url = f"{workflowhub_url}/workflows/{workflow_id}/ro_crate?version=1"
+    workflow_url = (
+        f"{workflowhub_url}/workflows/{workflow_id}/ro_crate?version={workflow_version}"
+    )
 
     if "@context" not in json_data:
         raise ValueError(
@@ -69,7 +71,8 @@ def main():
     )
     parser.add_argument("json_file", help="The JSON file to process.")
     parser.add_argument("output_file", help="The output file.")
-    parser.add_argument("workflow_id", help="The WorkflowHub ID.")
+    parser.add_argument("workflow_id", help="The Workflow ID.")
+    parser.add_argument("workflow_version", help="The Workflow version.")
     parser.add_argument(
         "-u",
         "--workflowhub-url",
@@ -83,7 +86,7 @@ def main():
         json_data = json.load(f)
 
     processed_json_data = make_paths_absolute(
-        json_data, args.workflowhub_url, args.workflow_id
+        json_data, args.workflowhub_url, args.workflow_id, args.workflow_version
     )
 
     if args.output_file == "-":
workflowhub_graph/constants.py (3 changes: 2 additions & 1 deletion)
@@ -15,7 +15,8 @@
 BASE_URL = BASE_URL_DEV
 WORKFLOWS_URL = WORKFLOWS_URL_DEV
 
-METADATA_ENDPOINT = "/workflows/{w_id}/ro_crate_metadata"
+DOT_JSON_ENDPOINT = "/workflows/{w_id}.json"
+METADATA_ENDPOINT = "/workflows/{w_id}/ro_crate_metadata?version={w_version}"
 ZIP_ENDPOINT = "/workflows/{w_id}/ro_crate?version={w_version}"
 
 TARGET_FILE_NAME = "ro-crate-metadata.json"
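Note: all three templates expand with str.format; a quick sketch of the resulting paths (883 mirrors the test fixture, version 4 is an arbitrary example):

    DOT_JSON_ENDPOINT = "/workflows/{w_id}.json"
    METADATA_ENDPOINT = "/workflows/{w_id}/ro_crate_metadata?version={w_version}"
    ZIP_ENDPOINT = "/workflows/{w_id}/ro_crate?version={w_version}"

    print(DOT_JSON_ENDPOINT.format(w_id=883))               # /workflows/883.json
    print(METADATA_ENDPOINT.format(w_id=883, w_version=4))  # /workflows/883/ro_crate_metadata?version=4
    print(ZIP_ENDPOINT.format(w_id=883, w_version=4))       # /workflows/883/ro_crate?version=4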
workflowhub_graph/merge.py (16 changes: 14 additions & 2 deletions)
@@ -2,6 +2,7 @@
 import glob
 import json
 import os
+import re
 
 import rdflib
 
@@ -33,9 +34,20 @@ def merge_all_files(
         with open(fn, "r") as f:
             print(f"Processing {fn}, {i}/{len(filenames)}")
 
-            w_id = int(os.path.basename(fn).split("_")[0])
+            basename = os.path.basename(fn)
 
-            json_data = make_paths_absolute(json.load(f), base_url, w_id)
+            if matched := re.match("([0-9]+?)_ro-crate-metadata.json", basename):
+                w_id = int(matched.group(1))
+                w_version = 1
+            elif matched := re.match(
+                "([0-9]+?)_([0-9]+?)_ro-crate-metadata.json", basename
+            ):
+                w_id = int(matched.group(1))
+                w_version = int(matched.group(2))
+            else:
+                raise ValueError(f"Could not match the filename {basename}")
+
+            json_data = make_paths_absolute(json.load(f), base_url, w_id, w_version)
 
             # TODO: Is there an issue here? Linting shows "Expected type 'str | bytes | None', got 'dict' instead"
             with patch_rdflib_urlopen(**cache_kwargs):
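Note: the walrus-operator dispatch above accepts both the legacy un-versioned filenames and the new <id>_<version> names, defaulting legacy files to version 1. A standalone sketch of the same matching logic:

    import re

    # re.match() anchors only at the start of the string, so the first pattern
    # cannot consume a versioned name: in "883_4_...", "_ro-crate" never follows
    # the leading digits, and matching falls through to the second pattern.
    for basename in ["883_ro-crate-metadata.json", "883_4_ro-crate-metadata.json"]:
        if matched := re.match("([0-9]+?)_ro-crate-metadata.json", basename):
            w_id, w_version = int(matched.group(1)), 1
        elif matched := re.match(
            "([0-9]+?)_([0-9]+?)_ro-crate-metadata.json", basename
        ):
            w_id, w_version = int(matched.group(1)), int(matched.group(2))
        else:
            raise ValueError(f"Could not match the filename {basename}")
        print(basename, "->", w_id, w_version)  # 883 1, then 883 4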
workflowhub_graph/source_crates.py (100 changes: 79 additions & 21 deletions)
@@ -1,4 +1,5 @@
 import argparse
+import json
 import os
 import traceback
 
@@ -9,6 +10,7 @@
 from workflowhub_graph.constants import (
     BASE_URL_DEV,
     BASE_URL_PROD,
+    DOT_JSON_ENDPOINT,
     TARGET_FILE_NAME,
     METADATA_ENDPOINT,
     WORKFLOWS_URL_DEV,
@@ -17,6 +19,33 @@
 )
 
 
+def get_dot_json_endpoint(target_url) -> dict | None:
+    """
+    Fetches the JSON document that WorkflowHub serves for a workflow and returns it.
+    :param target_url: The URL of the workflow's .json endpoint.
+    :return: The parsed JSON content, or None if the download fails.
+    """
+    try:
+        response = requests.get(target_url)
+        response.raise_for_status()
+
+        if "application/vnd.api+json" in response.headers["Content-Type"]:
+            return response.json()
+        else:
+            print(f"No JSON file found at: '{target_url}'")
+            return None
+
+    except requests.RequestException as e:
+        print(f"Failed to download the JSON file from {target_url}. Error: {e}")
+        return None
+    except Exception as e:
+        print(
+            f"An error occurred while downloading the JSON file from {target_url}. Error: {e}"
+        )
+        return None
+
+
 def download_and_extract_json_from_metadata_endpoint(target_url: str) -> bytes | None:
     """
     Downloads a JSON file from WorkflowHub (or specified URL) of a specific workflow and returns its content.
@@ -116,6 +145,7 @@ def process_workflow_ids(
     output_dir: str = "data",
     is_metadata_endpoint: bool = False,
     base_url: str = BASE_URL_DEV,
+    all_versions: bool = False,
 ) -> None:
     """
     Utilises the JSON file downloaded by download_workflow_ids(). This file is used to download a list
@@ -141,29 +171,57 @@ def process_workflow_ids(
                 f"Processing workflow ID {workflow_id} ({i_workflow + 1}/{len(workflows)})..."
             )
 
-            # TODO: Remove dev WorkflowHub URL:
-            if is_metadata_endpoint:
-                endpoint = METADATA_ENDPOINT.format(w_id=workflow_id)
-                json_content = download_and_extract_json_from_metadata_endpoint(
-                    base_url + endpoint
-                )
-            else:
-                # TODO: Where does version come from?
-                # TODO: make mutliple versions or the latest version
-                endpoint = ZIP_ENDPOINT.format(w_id=workflow_id, w_version=1)
-                json_content = download_and_extract_json_from_zip(base_url + endpoint)
-
-            if json_content:
-                output_file_path = os.path.join(
-                    output_dir, f"{workflow_id}_ro-crate-metadata.json"
-                )
-                with open(output_file_path, "wb") as output_file:
-                    output_file.write(json_content)
-                print(f"Content saved to {output_file_path}")
-                n_successful += 1
-
-            else:
-                print(f"Failed to process workflow ID {workflow_id}")
+            workflow_json = get_dot_json_endpoint(
+                base_url + DOT_JSON_ENDPOINT.format(w_id=workflow_id)
+            )
+
+            try:
+                if all_versions:
+                    workflow_versions = [
+                        workflow_version_dict["version"]
+                        for workflow_version_dict in workflow_json["data"][
+                            "attributes"
+                        ]["versions"]
+                    ]
+                else:
+                    workflow_versions = [
+                        workflow_json["data"]["attributes"]["latest_version"]
+                    ]
+            except (KeyError, TypeError):
+                print(
+                    f"Failed to extract workflow versions from:\n",
+                    json.dumps(workflow_json, indent=4),
+                )
+                continue
+
+            # TODO: Remove dev WorkflowHub URL:
+            for w_version in workflow_versions:
+                if is_metadata_endpoint:
+                    endpoint = METADATA_ENDPOINT.format(
+                        w_id=workflow_id, w_version=w_version
+                    )
+                    json_content = download_and_extract_json_from_metadata_endpoint(
+                        base_url + endpoint
+                    )
+                else:
+                    endpoint = ZIP_ENDPOINT.format(
+                        w_id=workflow_id, w_version=w_version
+                    )
+                    json_content = download_and_extract_json_from_zip(
+                        base_url + endpoint
+                    )
+
+                if json_content:
+                    output_file_path = os.path.join(
+                        output_dir, f"{workflow_id}_{w_version}_ro-crate-metadata.json"
+                    )
+                    with open(output_file_path, "wb") as output_file:
+                        output_file.write(json_content)
+                    print(f"Content saved to {output_file_path}")
+                    n_successful += 1
+
+                else:
+                    print(f"Failed to process workflow ID {workflow_id}")
 
     except Exception as e:
         print(f"An error occurred while processing workflow IDs. Error: {e}")
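Note: version discovery relies on the JSON:API document WorkflowHub serves at /workflows/<id>.json; the test above mocks the same shape. A minimal sketch of the selection logic in isolation (values are illustrative):

    workflow_json = {
        "data": {
            "attributes": {
                "latest_version": 12,
                "versions": [{"version": 3}, {"version": 4}],
            }
        }
    }

    for all_versions in (True, False):
        if all_versions:
            versions = [
                v["version"]
                for v in workflow_json["data"]["attributes"]["versions"]
            ]
        else:
            versions = [workflow_json["data"]["attributes"]["latest_version"]]
        print(all_versions, versions)  # True [3, 4], then False [12]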
workflowhub_graph/upload.py (28 changes: 14 additions & 14 deletions)
@@ -4,28 +4,28 @@
 
 def main():
     # TODO: this is very preliminary, needs to be improved
 
     token = os.getenv("ZENODO_ACCESS_TOKEN")
 
     if token is None:
         raise Exception("ZENODO_ACCESS_TOKEN environment variable is not set")
 
     headers = {"Content-Type": "application/json"}
-    params = {'access_token': token}
-    r = requests.post('https://sandbox.zenodo.org/api/deposit/depositions',
-                      params=params,
-                      json={},
-                      headers=headers
-                      )
-
+    params = {"access_token": token}
+    r = requests.post(
+        "https://sandbox.zenodo.org/api/deposit/depositions",
+        params=params,
+        json={},
+        headers=headers,
+    )
 
     if r.status_code != 201:
-        raise Exception(f'Failed to create deposition: {r} {r.text}')
+        raise Exception(f"Failed to create deposition: {r} {r.text}")
 
     print(r.json())
 
     bucket_url = r.json()["links"]["bucket"]
 
 
     for filename in "merged.ttl", "merged.pdf":
         path = filename
 
@@ -37,8 +37,8 @@ def main():
         )
         r.json()
         if r.status_code != 201:
-            raise Exception(f'Failed to upload file: {r} {r.text}')
+            raise Exception(f"Failed to upload file: {r} {r.text}")
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
