
Commit 7acea3a

Update example RO crate (#39)
- Addresses #39
- RO crate now includes source files and better conforms to workflow run RO crate
- Includes mainEntity, conformsTo and references to source files (see the sketch below)
- Snakemake component now represented correctly
1 parent 06cf57e commit 7acea3a
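As context for the bullets above, here is an illustrative sketch (not the committed file) of the kind of entities a workflow RO-Crate carries: a metadata descriptor with conformsTo, a root dataset whose mainEntity points at the workflow, hasPart references to source files, and a Snakemake language entity. All identifiers and file names below are hypothetical; the structure is written as a Python dict mirroring ro-crate-metadata.json.

example_crate = {
    "@context": "https://w3id.org/ro/crate/1.1/context",
    "@graph": [
        {
            # metadata descriptor declaring which RO-Crate spec the crate conforms to
            "@id": "ro-crate-metadata.json",
            "@type": "CreativeWork",
            "about": {"@id": "./"},
            "conformsTo": {"@id": "https://w3id.org/ro/crate/1.1"},
        },
        {
            # root dataset: mainEntity points at the workflow, hasPart lists source files
            "@id": "./",
            "@type": "Dataset",
            "conformsTo": [{"@id": "https://w3id.org/workflowhub/workflow-ro-crate/1.0"}],
            "mainEntity": {"@id": "Snakefile"},
            "hasPart": [{"@id": "Snakefile"}, {"@id": "scripts/helper.py"}],
        },
        {
            # the Snakemake workflow itself, typed as a computational workflow
            "@id": "Snakefile",
            "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow"],
            "programmingLanguage": {"@id": "#snakemake"},
        },
        {"@id": "#snakemake", "@type": "ComputerLanguage", "name": "Snakemake"},
    ],
}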

File tree

17 files changed (+4,882 / -238,794 lines)


ro-crate-metadata/#12c6426a-fe66-48e6-9863-bde836ce0b16/__init__.py

Whitespace-only changes.
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
import argparse
import copy
import json
from urllib.parse import urlparse

import arcp
import rdflib


# TODO: following https://github.com/workflowhub-eu/workflowhub-graph/issues/12
# building upon is_all_absolute
# add extended RO-Crate profile validation
# get information like schema.org domain and check if the graph is compliant with the schema
# normative schema.org dev docs: https://schema.org/docs/developers.html
# make a note for validation of the graph


def is_all_absolute(G: rdflib.Graph) -> bool:
    """Check that every URIRef in the graph is an absolute URI (mailto: URIs are exempt)."""
    for triple in G:
        for item in triple:
            if isinstance(item, rdflib.URIRef):
                # TODO: is this enough?
                parsed = urlparse(item)

                # we accept file:// with a netloc, even if netloc is not a FQDN,
                # see https://github.com/workflowhub-eu/workflowhub-graph/issues/1#issuecomment-2127351752
                if parsed.netloc == "" and parsed.scheme != "mailto":
                    print(
                        f"found non-absolute path <{item}> {parsed.netloc}, {urlparse(item)}"
                    )
                    return False
    return True


def make_paths_absolute(
    json_data: dict, workflowhub_url: str, workflow_id: int, workflow_version: int
) -> dict:
    """
    Makes all paths in the JSON content absolute by adding an '@base' key to the JSON-LD context.

    :param json_data: The JSON content as a dictionary.
    :param workflowhub_url: The base URL for WorkflowHub.
    :param workflow_id: The workflow ID to construct the absolute paths.
    :param workflow_version: The workflow version.
    :return: The modified JSON content with absolute paths.
    :raises ValueError: If '@context' key is missing or if '@base' key already exists in the JSON content.
    """

    json_data = copy.deepcopy(json_data)

    workflow_url = (
        f"{workflowhub_url}/workflows/{workflow_id}/ro_crate?version={workflow_version}"
    )

    if "@context" not in json_data:
        raise ValueError(
            "The JSON content does not contain a '@context' key, refusing to add it, can not absolutize paths"
        )

    if not isinstance(json_data["@context"], list):
        json_data["@context"] = [json_data["@context"]]

    if any(
        isinstance(item, dict) and "@base" in item for item in json_data["@context"]
    ):
        raise ValueError(
            "The JSON content already contains an '@base' key, it was probably already processed."
        )

    json_data["@context"].append({"@base": arcp.arcp_location(workflow_url)})

    return json_data


def main():
    parser = argparse.ArgumentParser(
        description="Make all paths in a JSON file absolute."
    )
    parser.add_argument("json_file", help="The JSON file to process.")
    parser.add_argument("output_file", help="The output file.")
    parser.add_argument("workflow_id", help="The Workflow ID.")
    parser.add_argument("workflow_version", help="The Workflow version.")
    parser.add_argument(
        "-u",
        "--workflowhub-url",
        help="The WorkflowHub URL.",
        default="https://workflowhub.eu",
    )

    args = parser.parse_args()

    with open(args.json_file, "r") as f:
        json_data = json.load(f)

    processed_json_data = make_paths_absolute(
        json_data, args.workflowhub_url, args.workflow_id, args.workflow_version
    )

    if args.output_file == "-":
        print(json.dumps(processed_json_data, indent=2))
    else:
        with open(args.output_file, "w") as f:
            json.dump(processed_json_data, f, indent=2)


if __name__ == "__main__":
    main()
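A minimal usage sketch of make_paths_absolute (hypothetical workflow ID and version): the function only touches the JSON-LD context, appending an arcp-based @base derived from the crate download URL.

crate = {
    "@context": "https://w3id.org/ro/crate/1.1/context",
    "@graph": [{"@id": "ro-crate-metadata.json"}],
}

absolute = make_paths_absolute(
    crate, "https://workflowhub.eu", workflow_id=1189, workflow_version=1
)
print(absolute["@context"])
# e.g. ['https://w3id.org/ro/crate/1.1/context', {'@base': 'arcp://uuid,.../'}]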
Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
import json
import os
import re
from unittest.mock import patch, MagicMock
from contextlib import contextmanager
import io
from urllib.parse import urlparse
from urllib.request import urlopen


def url_to_filename(url):
    """
    Converts a URL to a filename by removing non-alphanumeric characters and replacing them with dashes.
    :param url: The URL to convert.
    :return: The filename.
    """

    parsed = urlparse(url)
    if parsed.scheme not in ["http", "https"]:
        raise ValueError(f"Unsupported scheme {parsed.scheme}")

    return re.sub("[^0-9a-z]+", "-", (parsed.netloc + parsed.path).lower().strip("_"))


@contextmanager
def patch_rdflib_urlopen(
    cache_base_dir=None,
    write_cache=True,
    allowed_urls_pattern=r"https://w3id.org/ro/crate/1\.[01]/context",
):
    """
    Context manager to patch rdflib.parser.urlopen to cache and return the content of a URL.
    :param cache_base_dir: The base directory to store the cached files.
    :param write_cache: Whether to write the cache if the file is not found.
    :param allowed_urls_pattern: A regex pattern to match the allowed URLs to cache.
    """

    allowed_urls_re = re.compile(allowed_urls_pattern)
    if cache_base_dir is None:
        cache_base_dir = "cached_urlopen"
    os.makedirs(cache_base_dir, exist_ok=True)

    def cached_urlopen(request):
        url = request.get_full_url()

        class Response(io.StringIO):
            # minimal stand-in for the response object urlopen would return
            content_type = "text/html"
            headers = {"Content-Type": "text/html"}

            def info(self):
                return self.headers

            def geturl(self):
                return url

        if not allowed_urls_re.match(url):
            # URLs outside the allow-list get an empty JSON-LD context instead of a network call
            return Response(json.dumps({"@context": {}}))
            # raise ValueError(
            #     f"URL {url} not allowed to cache, allowed: {allowed_urls_pattern}"
            # )

        cached_filename = os.path.join(cache_base_dir, url_to_filename(url))

        if not os.path.exists(cached_filename):
            if write_cache:
                response = urlopen(request)
                content = response.read().decode("utf-8")

                with open(cached_filename, "wt") as f:
                    f.write(content)
            else:
                raise ValueError(
                    f"Cache file {cached_filename} not found, not allowed to download and update cache"
                )

        content = open(cached_filename, "rt").read()

        return Response(content)

    with patch("rdflib.parser.urlopen", cached_urlopen):
        yield
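A usage sketch (illustrative, hypothetical file name): parsing an RO-Crate JSON-LD file with rdflib inside the context manager, so the fetch of the https://w3id.org/ro/crate/1.1/context document is served from, and written to, the on-disk cache instead of hitting the network on every test run.

import rdflib

with patch_rdflib_urlopen(cache_base_dir="cached_urlopen"):
    g = rdflib.Graph()
    g.parse("1189_1_ro-crate-metadata.json", format="json-ld")  # hypothetical input file
    print(len(g), "triples parsed")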
Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
import argparse
import json
import os
import re


def parse_args() -> argparse.Namespace:
    """
    Parse command-line arguments.

    :return: Parsed command-line arguments.
    """
    parser = argparse.ArgumentParser(
        description="Generate list of created files based on workflow IDs and versions."
    )
    parser.add_argument(
        "--workflow-ids",
        type=str,
        help="Range of workflow IDs to process (e.g., '1-10').",
    )
    parser.add_argument(
        "--versions",
        type=str,
        required=True,
        help="Comma-separated list of versions to process (e.g., '1,2,3').",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="data",
        help="Directory where the output files are stored (default: 'data').",
    )
    return parser.parse_args()


def get_max_id_from_files(output_dir: str) -> int:
    """
    If no workflow ID parameter is provided, get the maximum workflow ID from the files in the output directory.

    :param output_dir: The directory where output files are stored.
    :return: The maximum workflow ID.
    """
    max_id = 0
    pattern = re.compile(r"^(\d+)_\d+_ro-crate-metadata\.json$")
    for filename in os.listdir(output_dir):
        match = pattern.match(filename)
        if match:
            wf_id = int(match.group(1))
            if wf_id > max_id:
                max_id = wf_id
    return max_id


def generate_expected_files(
    output_dir: str, workflow_ids: range, versions: list[str]
) -> list[str]:
    """
    Generate a list of expected file paths based on the workflow IDs and versions.

    :param output_dir: The directory where output files are stored.
    :param workflow_ids: The range of workflow IDs to process.
    :param versions: The list of versions to process.

    :return: A list of expected file paths.
    """

    expected_files = []
    for wf_id in workflow_ids:
        for ver in versions:
            expected_files.append(f"{output_dir}/{wf_id}_{ver}_ro-crate-metadata.json")
    return expected_files


def verify_created_files(expected_files: list[str]) -> list[str]:
    """
    Verify which files from the list of expected files actually exist.

    :param expected_files: The list of expected file paths.
    :return: A list of file paths that actually exist.
    """
    return [f for f in expected_files if os.path.exists(f)]


def main():
    args = parse_args()

    if args.workflow_ids:
        min_id, max_id = map(int, args.workflow_ids.split("-"))
        workflow_ids = range(min_id, max_id + 1)
    else:
        max_id = get_max_id_from_files(args.output_dir)
        workflow_ids = range(1, max_id + 1)

    versions = args.versions.split(",")

    # Generate expected file paths
    expected_files = generate_expected_files(args.output_dir, workflow_ids, versions)

    # Check which files were actually created
    created_files = verify_created_files(expected_files)

    # Output the list of created files to a JSON file
    with open("created_files.json", "w") as f:
        json.dump(created_files, f)

    print("\nFile names written to created_files.json")


if __name__ == "__main__":
    main()
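A small illustrative example of how these helpers combine (hypothetical IDs and versions), following the <workflow_id>_<version>_ro-crate-metadata.json naming convention matched above:

expected = generate_expected_files("data", range(1, 4), ["1", "2"])
# ['data/1_1_ro-crate-metadata.json', 'data/1_2_ro-crate-metadata.json',
#  'data/2_1_ro-crate-metadata.json', ..., 'data/3_2_ro-crate-metadata.json']
existing = verify_created_files(expected)  # keeps only the paths that exist on disk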
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
import sys


def update_progress_bar(progress: int, total: int, bar_length: int = 50):
    """
    Updates the progress bar.

    :param progress: Current progress.
    :param total: The total value when the progress is complete.
    :param bar_length: The length of the progress bar in characters.
    """
    fraction = progress / total
    arrow = int(fraction * bar_length) * "="
    padding = int(bar_length - len(arrow)) * " "
    percent = int(fraction * 100)
    sys.stdout.write(f"\r[{arrow}{padding}] {percent}%")
    sys.stdout.flush()
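A minimal usage sketch (hypothetical loop): the carriage return in the write keeps the bar on a single line while it is redrawn.

import time

total = 200
for i in range(total + 1):
    update_progress_bar(i, total)
    time.sleep(0.01)  # stand-in for real work
print()  # move to a new line once the bar reaches 100%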
Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
import os

import certifi

os.environ["SSL_CERT_FILE"] = certifi.where()


BASE_URL_PROD = "https://workflowhub.eu"
WORKFLOWS_URL_PROD = "https://workflowhub.eu/workflows.json"

BASE_URL_DEV = "https://dev.workflowhub.eu"
WORKFLOWS_URL_DEV = "https://dev.workflowhub.eu/workflows.json"

# TODO: Why are we duplicating the URLs?
BASE_URL = BASE_URL_DEV
WORKFLOWS_URL = WORKFLOWS_URL_DEV

DOT_JSON_ENDPOINT = "/workflows/{w_id}.json"
METADATA_ENDPOINT = "/workflows/{w_id}/ro_crate_metadata?version={w_version}"
ZIP_ENDPOINT = "/workflows/{w_id}/ro_crate?version={w_version}"

TARGET_FILE_NAME = "ro-crate-metadata.json"
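An illustrative use of these endpoint templates (hypothetical workflow ID and version): they are joined with the base URL and filled with str.format().

url = BASE_URL + METADATA_ENDPOINT.format(w_id=1189, w_version=1)
# e.g. https://dev.workflowhub.eu/workflows/1189/ro_crate_metadata?version=1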
