
Commit 7acea3a

Update example RO crate (#39)
- Addresses #39
- RO crate now includes source files and better conforms to workflow run RO crate
- Includes mainEntity, conformsTo and references to source files (see the sketch below)
- Snakemake component now represented correctly
1 parent 06cf57e commit 7acea3a
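As context for the bullets above, here is an illustrative sketch (not the committed file) of the kind of entities a workflow RO-Crate carries: a metadata descriptor with conformsTo, a root dataset whose mainEntity points at the workflow, hasPart references to source files, and a Snakemake language entity. All identifiers and file names below are hypothetical; the structure is written as a Python dict mirroring ro-crate-metadata.json.

example_crate = {
    "@context": "https://w3id.org/ro/crate/1.1/context",
    "@graph": [
        {
            # metadata descriptor declaring which RO-Crate spec the crate conforms to
            "@id": "ro-crate-metadata.json",
            "@type": "CreativeWork",
            "about": {"@id": "./"},
            "conformsTo": {"@id": "https://w3id.org/ro/crate/1.1"},
        },
        {
            # root dataset: mainEntity points at the workflow, hasPart lists source files
            "@id": "./",
            "@type": "Dataset",
            "conformsTo": [{"@id": "https://w3id.org/workflowhub/workflow-ro-crate/1.0"}],
            "mainEntity": {"@id": "Snakefile"},
            "hasPart": [{"@id": "Snakefile"}, {"@id": "scripts/helper.py"}],
        },
        {
            # the Snakemake workflow itself, typed as a computational workflow
            "@id": "Snakefile",
            "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow"],
            "programmingLanguage": {"@id": "#snakemake"},
        },
        {"@id": "#snakemake", "@type": "ComputerLanguage", "name": "Snakemake"},
    ],
}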

File tree

17 files changed (+4,882 / -238,794 lines)


ro-crate-metadata/#12c6426a-fe66-48e6-9863-bde836ce0b16/__init__.py

Whitespace-only changes.
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
import argparse
import copy
import json
from urllib.parse import urlparse

import arcp
import rdflib


# TODO: following https://github.com/workflowhub-eu/workflowhub-graph/issues/12
# building upon is_all_absolute
# add extended RO-Crate profile validation
# get information like schema.org domain and check if the graph is compliant with the schema
# normative schema.org dev docs: https://schema.org/docs/developers.html
# make a note for validation of the graph


def is_all_absolute(G: rdflib.Graph) -> bool:
    """Check that every URIRef in the graph is an absolute URI (mailto: URIs are exempt)."""
    for triple in G:
        for item in triple:
            if isinstance(item, rdflib.URIRef):
                # TODO: is this enough?
                parsed = urlparse(item)

                # we accept file:// with a netloc, even if netloc is not a FQDN,
                # see https://github.com/workflowhub-eu/workflowhub-graph/issues/1#issuecomment-2127351752
                if parsed.netloc == "" and parsed.scheme != "mailto":
                    print(
                        f"found non-absolute path <{item}> {parsed.netloc}, {urlparse(item)}"
                    )
                    return False
    return True


def make_paths_absolute(
    json_data: dict, workflowhub_url: str, workflow_id: int, workflow_version: int
) -> dict:
    """
    Makes all paths in the JSON content absolute by adding an '@base' key to the JSON-LD context.

    :param json_data: The JSON content as a dictionary.
    :param workflowhub_url: The base URL for WorkflowHub.
    :param workflow_id: The workflow ID to construct the absolute paths.
    :param workflow_version: The workflow version.
    :return: The modified JSON content with absolute paths.
    :raises ValueError: If '@context' key is missing or if '@base' key already exists in the JSON content.
    """

    json_data = copy.deepcopy(json_data)

    workflow_url = (
        f"{workflowhub_url}/workflows/{workflow_id}/ro_crate?version={workflow_version}"
    )

    if "@context" not in json_data:
        raise ValueError(
            "The JSON content does not contain a '@context' key, refusing to add it, can not absolutize paths"
        )

    if not isinstance(json_data["@context"], list):
        json_data["@context"] = [json_data["@context"]]

    if any(
        isinstance(item, dict) and "@base" in item for item in json_data["@context"]
    ):
        raise ValueError(
            "The JSON content already contains an '@base' key, it was probably already processed."
        )

    json_data["@context"].append({"@base": arcp.arcp_location(workflow_url)})

    return json_data


def main():
    parser = argparse.ArgumentParser(
        description="Make all paths in a JSON file absolute."
    )
    parser.add_argument("json_file", help="The JSON file to process.")
    parser.add_argument("output_file", help="The output file.")
    parser.add_argument("workflow_id", help="The Workflow ID.")
    parser.add_argument("workflow_version", help="The Workflow version.")
    parser.add_argument(
        "-u",
        "--workflowhub-url",
        help="The WorkflowHub URL.",
        default="https://workflowhub.eu",
    )

    args = parser.parse_args()

    with open(args.json_file, "r") as f:
        json_data = json.load(f)

    processed_json_data = make_paths_absolute(
        json_data, args.workflowhub_url, args.workflow_id, args.workflow_version
    )

    if args.output_file == "-":
        print(json.dumps(processed_json_data, indent=2))
    else:
        with open(args.output_file, "w") as f:
            json.dump(processed_json_data, f, indent=2)


if __name__ == "__main__":
    main()
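A minimal usage sketch of make_paths_absolute (hypothetical workflow ID and version): the function only touches the JSON-LD context, appending an arcp-based @base derived from the crate download URL.

crate = {
    "@context": "https://w3id.org/ro/crate/1.1/context",
    "@graph": [{"@id": "ro-crate-metadata.json"}],
}

absolute = make_paths_absolute(
    crate, "https://workflowhub.eu", workflow_id=1189, workflow_version=1
)
print(absolute["@context"])
# e.g. ['https://w3id.org/ro/crate/1.1/context', {'@base': 'arcp://uuid,.../'}]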
Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
import json
import os
import re
from unittest.mock import patch, MagicMock
from contextlib import contextmanager
import io
from urllib.parse import urlparse
from urllib.request import urlopen


def url_to_filename(url):
    """
    Converts a URL to a filename by removing non-alphanumeric characters and replacing them with dashes.
    :param url: The URL to convert.
    :return: The filename.
    """

    parsed = urlparse(url)
    if parsed.scheme not in ["http", "https"]:
        raise ValueError(f"Unsupported scheme {parsed.scheme}")

    return re.sub("[^0-9a-z]+", "-", (parsed.netloc + parsed.path).lower().strip("_"))


@contextmanager
def patch_rdflib_urlopen(
    cache_base_dir=None,
    write_cache=True,
    allowed_urls_pattern=r"https://w3id.org/ro/crate/1\.[01]/context",
):
    """
    Context manager to patch rdflib.parser.urlopen to cache and return the content of a URL.
    :param cache_base_dir: The base directory to store the cached files.
    :param write_cache: Whether to write the cache if the file is not found.
    :param allowed_urls_pattern: A regex pattern to match the allowed URLs to cache.
    """

    allowed_urls_re = re.compile(allowed_urls_pattern)
    if cache_base_dir is None:
        cache_base_dir = "cached_urlopen"
    os.makedirs(cache_base_dir, exist_ok=True)

    def cached_urlopen(request):
        url = request.get_full_url()

        class Response(io.StringIO):
            # minimal stand-in for the response object urlopen would return
            content_type = "text/html"
            headers = {"Content-Type": "text/html"}

            def info(self):
                return self.headers

            def geturl(self):
                return url

        if not allowed_urls_re.match(url):
            # URLs outside the allow-list get an empty JSON-LD context instead of a network call
            return Response(json.dumps({"@context": {}}))
            # raise ValueError(
            #     f"URL {url} not allowed to cache, allowed: {allowed_urls_pattern}"
            # )

        cached_filename = os.path.join(cache_base_dir, url_to_filename(url))

        if not os.path.exists(cached_filename):
            if write_cache:
                response = urlopen(request)
                content = response.read().decode("utf-8")

                with open(cached_filename, "wt") as f:
                    f.write(content)
            else:
                raise ValueError(
                    f"Cache file {cached_filename} not found, not allowed to download and update cache"
                )

        content = open(cached_filename, "rt").read()

        return Response(content)

    with patch("rdflib.parser.urlopen", cached_urlopen):
        yield
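A usage sketch (illustrative, hypothetical file name): parsing an RO-Crate JSON-LD file with rdflib inside the context manager, so the fetch of the https://w3id.org/ro/crate/1.1/context document is served from, and written to, the on-disk cache instead of hitting the network on every test run.

import rdflib

with patch_rdflib_urlopen(cache_base_dir="cached_urlopen"):
    g = rdflib.Graph()
    g.parse("1189_1_ro-crate-metadata.json", format="json-ld")  # hypothetical input file
    print(len(g), "triples parsed")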
Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
import argparse
import json
import os
import re


def parse_args() -> argparse.Namespace:
    """
    Parse command-line arguments.

    :return: Parsed command-line arguments.
    """
    parser = argparse.ArgumentParser(
        description="Generate list of created files based on workflow IDs and versions."
    )
    parser.add_argument(
        "--workflow-ids",
        type=str,
        help="Range of workflow IDs to process (e.g., '1-10').",
    )
    parser.add_argument(
        "--versions",
        type=str,
        required=True,
        help="Comma-separated list of versions to process (e.g., '1,2,3').",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="data",
        help="Directory where the output files are stored (default: 'data').",
    )
    return parser.parse_args()


def get_max_id_from_files(output_dir: str) -> int:
    """
    If no workflow ID parameter is provided, get the maximum workflow ID from the files in the output directory.

    :param output_dir: The directory where output files are stored.
    :return: The maximum workflow ID.
    """
    max_id = 0
    pattern = re.compile(r"^(\d+)_\d+_ro-crate-metadata\.json$")
    for filename in os.listdir(output_dir):
        match = pattern.match(filename)
        if match:
            wf_id = int(match.group(1))
            if wf_id > max_id:
                max_id = wf_id
    return max_id


def generate_expected_files(
    output_dir: str, workflow_ids: range, versions: list[str]
) -> list[str]:
    """
    Generate a list of expected file paths based on the workflow IDs and versions.

    :param output_dir: The directory where output files are stored.
    :param workflow_ids: The range of workflow IDs to process.
    :param versions: The list of versions to process.

    :return: A list of expected file paths.
    """

    expected_files = []
    for wf_id in workflow_ids:
        for ver in versions:
            expected_files.append(f"{output_dir}/{wf_id}_{ver}_ro-crate-metadata.json")
    return expected_files


def verify_created_files(expected_files: list[str]) -> list[str]:
    """
    Verify which files from the list of expected files actually exist.

    :param expected_files: The list of expected file paths.
    :return: A list of file paths that actually exist.
    """
    return [f for f in expected_files if os.path.exists(f)]


def main():
    args = parse_args()

    if args.workflow_ids:
        min_id, max_id = map(int, args.workflow_ids.split("-"))
        workflow_ids = range(min_id, max_id + 1)
    else:
        max_id = get_max_id_from_files(args.output_dir)
        workflow_ids = range(1, max_id + 1)

    versions = args.versions.split(",")

    # Generate expected file paths
    expected_files = generate_expected_files(args.output_dir, workflow_ids, versions)

    # Check which files were actually created
    created_files = verify_created_files(expected_files)

    # Output the list of created files to a JSON file
    with open("created_files.json", "w") as f:
        json.dump(created_files, f)

    print("\nFile names written to created_files.json")


if __name__ == "__main__":
    main()
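A small illustrative example of how these helpers combine (hypothetical IDs and versions), following the <workflow_id>_<version>_ro-crate-metadata.json naming convention matched above:

expected = generate_expected_files("data", range(1, 4), ["1", "2"])
# ['data/1_1_ro-crate-metadata.json', 'data/1_2_ro-crate-metadata.json',
#  'data/2_1_ro-crate-metadata.json', ..., 'data/3_2_ro-crate-metadata.json']
existing = verify_created_files(expected)  # keeps only the paths that exist on disk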
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
import sys


def update_progress_bar(progress: int, total: int, bar_length: int = 50):
    """
    Updates the progress bar.

    :param progress: Current progress.
    :param total: The total value when the progress is complete.
    :param bar_length: The length of the progress bar in characters.
    """
    fraction = progress / total
    arrow = int(fraction * bar_length) * "="
    padding = int(bar_length - len(arrow)) * " "
    percent = int(fraction * 100)
    sys.stdout.write(f"\r[{arrow}{padding}] {percent}%")
    sys.stdout.flush()
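A minimal usage sketch (hypothetical loop): the carriage return in the write keeps the bar on a single line while it is redrawn.

import time

total = 200
for i in range(total + 1):
    update_progress_bar(i, total)
    time.sleep(0.01)  # stand-in for real work
print()  # move to a new line once the bar reaches 100%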
Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
import os

import certifi

os.environ["SSL_CERT_FILE"] = certifi.where()


BASE_URL_PROD = "https://workflowhub.eu"
WORKFLOWS_URL_PROD = "https://workflowhub.eu/workflows.json"

BASE_URL_DEV = "https://dev.workflowhub.eu"
WORKFLOWS_URL_DEV = "https://dev.workflowhub.eu/workflows.json"

# TODO: Why are we duplicating the URLs?
BASE_URL = BASE_URL_DEV
WORKFLOWS_URL = WORKFLOWS_URL_DEV

DOT_JSON_ENDPOINT = "/workflows/{w_id}.json"
METADATA_ENDPOINT = "/workflows/{w_id}/ro_crate_metadata?version={w_version}"
ZIP_ENDPOINT = "/workflows/{w_id}/ro_crate?version={w_version}"

TARGET_FILE_NAME = "ro-crate-metadata.json"
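An illustrative use of these endpoint templates (hypothetical workflow ID and version): they are joined with the base URL and filled with str.format().

url = BASE_URL + METADATA_ENDPOINT.format(w_id=1189, w_version=1)
# e.g. https://dev.workflowhub.eu/workflows/1189/ro_crate_metadata?version=1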
