1
- import copy
2
- import pdb
3
1
import datetime
4
- import logging
5
2
import urllib
6
3
import uuid
7
4
import json
8
- from io import BytesIO
9
5
from pathlib import PurePath , PurePosixPath
10
- from socket import getfqdn
11
6
from typing import (
12
7
Any ,
13
8
Dict ,
14
- Iterable ,
15
9
List ,
16
- MutableMapping ,
17
10
MutableSequence ,
18
11
Optional ,
19
12
Tuple ,
23
16
24
17
from prov .identifier import Identifier
25
18
from prov .model import PROV , PROV_LABEL , PROV_TYPE , PROV_VALUE , ProvDocument , ProvEntity
26
- from schema_salad .sourceline import SourceLine
27
- from typing_extensions import TYPE_CHECKING
28
- from tools .load_ga_export import load_ga_history_export , GalaxyJob , GalaxyDataset
19
+ from tools .load_ga_export import load_ga_history_export , GalaxyJob
29
20
from ast import literal_eval
30
21
import os
31
22
36
27
from rocrate .provenance_constants import (
37
28
ACCOUNT_UUID ,
38
29
CWLPROV ,
39
- ENCODING ,
40
- FOAF ,
41
30
METADATA ,
42
31
ORE ,
43
32
PROVENANCE ,
44
33
RO ,
45
34
SCHEMA ,
46
35
SHA1 ,
47
- SHA256 ,
48
- TEXT_PLAIN ,
49
36
UUID ,
50
37
WF4EVER ,
51
38
WFDESC ,
59
46
# from rocrate.provenance import ResearchObject
60
47
61
48
from pathlib import Path
62
- import rocrate . rocrate as roc
49
+
63
50
64
51
def posix_path(local_path: str) -> str:
    """Return *local_path* rendered with POSIX-style forward-slash separators."""
    # Path(...) normalizes the native form; PurePosixPath re-joins its parts with "/".
    native = Path(local_path)
    return str(PurePosixPath(native))
66
53
54
+
67
55
def remove_escapes(s):
    """Return *s* with ASCII control characters (0x01-0x1F) removed.

    Bug fix: the previous version discarded the result of ``str.translate``
    (strings are immutable, so ``s.translate(...)`` has no effect unless the
    return value is used) and implicitly returned ``None``.
    """
    escapes = ''.join([chr(char) for char in range(1, 32)])
    translator = str.maketrans('', '', escapes)
    return s.translate(translator)
71
60
72
61
def reassign (d ):
73
62
for k , v in d .items ():
@@ -78,16 +67,17 @@ def reassign(d):
78
67
except ValueError :
79
68
pass
80
69
70
+
81
71
class ProvenanceProfile :
82
- """
72
+ """\
83
73
Provenance profile.
84
74
85
75
Populated from a galaxy workflow export.
86
76
"""
87
77
88
78
def __init__ (
89
79
self ,
90
- ga_export : Dict ,
80
+ ga_export : Dict ,
91
81
full_name : str = None ,
92
82
orcid : str = None ,
93
83
# prov_name: str = None,
@@ -112,12 +102,11 @@ def __init__(
112
102
self .base_uri = "arcp://uuid,%s/" % self .ro_uuid
113
103
self .document = ProvDocument ()
114
104
# TODO extract engine_uuid from galaxy, type: str
115
- self .engine_uuid = "urn:uuid:%s" % uuid .uuid4 () #type: str
105
+ self .engine_uuid = "urn:uuid:%s" % uuid .uuid4 () # type: str
116
106
self .full_name = full_name
117
107
self .workflow_run_uuid = run_uuid or uuid .uuid4 ()
118
108
self .workflow_run_uri = self .workflow_run_uuid .urn # type: str
119
-
120
- # move to separate function
109
+ # move to separate function
121
110
metadata_export = load_ga_history_export (ga_export )
122
111
self .generate_prov_doc ()
123
112
self .jobs = []
@@ -143,7 +132,7 @@ def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
143
132
# PROV_TYPE: FOAF["OnlineAccount"],
144
133
# TODO: change how we register galaxy version, probably a declare_version func
145
134
# self.galaxy_version = self.ga_export["jobs_attrs"][0]["galaxy_version"]
146
- # TODO: change notation to already imported namespaces?
135
+ # TODO: change notation to already imported namespaces?
147
136
self .document .add_namespace ("wfprov" , "http://purl.org/wf4ever/wfprov#" )
148
137
# document.add_namespace('prov', 'http://www.w3.org/ns/prov#')
149
138
self .document .add_namespace ("wfdesc" , "http://purl.org/wf4ever/wfdesc#" )
@@ -166,7 +155,7 @@ def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
166
155
"provenance" , self .base_uri + posix_path (PROVENANCE ) + "/"
167
156
)
168
157
# TODO: use appropriate refs for ga_export and related inputs
169
- ro_identifier_workflow = self .base_uri + "ga_export" + "/"
158
+ ro_identifier_workflow = self .base_uri + "ga_export" + "/"
170
159
self .wf_ns = self .document .add_namespace ("wf" , ro_identifier_workflow )
171
160
ro_identifier_input = (
172
161
self .base_uri + "ga_export/datasets#"
@@ -230,15 +219,15 @@ def declare_process(
230
219
"""Record the start of each Process."""
231
220
if process_run_id is None :
232
221
process_run_id = uuid .uuid4 ().urn
233
-
234
- cmd = ga_export_jobs_attrs ["command_line" ]
222
+
223
+ # cmd = ga_export_jobs_attrs["command_line"]
235
224
process_name = ga_export_jobs_attrs ["tool_id" ]
236
- tool_version = ga_export_jobs_attrs ["tool_version" ]
225
+ # tool_version = ga_export_jobs_attrs["tool_version"]
237
226
prov_label = "Run of ga_export/jobs_attrs.txt#" + process_name
238
227
start_time = ga_export_jobs_attrs ["create_time" ]
239
228
end_time = ga_export_jobs_attrs ["update_time" ]
240
229
241
- #TODO: Find out how to include commandline as a string
230
+ # TODO: Find out how to include commandline as a string
242
231
# cmd = self.document.entity(
243
232
# uuid.uuid4().urn,
244
233
# {PROV_TYPE: WFPROV["Artifact"], PROV_LABEL: ga_export_jobs_attrs["command_line"]}
@@ -249,9 +238,9 @@ def declare_process(
249
238
start_time ,
250
239
end_time ,
251
240
{
252
- PROV_TYPE : WFPROV ["ProcessRun" ],
253
- PROV_LABEL : prov_label ,
254
- #TODO: Find out how to include commandline as a string
241
+ PROV_TYPE : WFPROV ["ProcessRun" ],
242
+ PROV_LABEL : prov_label ,
243
+ # TODO: Find out how to include commandline as a string
255
244
# PROV_LABEL: cmd
256
245
},
257
246
)
@@ -279,7 +268,7 @@ def used_artefacts(
279
268
base += "/" + process_name
280
269
tool_id = process_metadata ["tool_id" ]
281
270
base += "/" + tool_id
282
- items = ["inputs" ,"outputs" ,"parameters" ]
271
+ items = ["inputs" , "outputs" , "parameters" ]
283
272
# print(process_metadata["params"])
284
273
for item in items :
285
274
# print(item)
@@ -293,8 +282,8 @@ def used_artefacts(
293
282
value = json .loads (value )
294
283
if isinstance (key , str ):
295
284
key = key .replace ("|" , "_" )
296
- if isinstance (value , str ):
297
- val = value .replace ("|" , "_" )
285
+ if isinstance (value , str ):
286
+ value = value .replace ("|" , "_" )
298
287
299
288
prov_role = self .wf_ns [f"{ base } /{ key } " ]
300
289
@@ -307,7 +296,6 @@ def used_artefacts(
307
296
308
297
# for artefact in value:
309
298
try :
310
- # pdb.set_trace()
311
299
entity = self .declare_artefact (value )
312
300
self .document .used (
313
301
process_run_id ,
@@ -346,7 +334,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
346
334
# byte_s = BytesIO(value)
347
335
# data_file = self.research_object.add_data_file(byte_s)
348
336
# FIXME: Don't naively assume add_data_file uses hash in filename!
349
- data_id = "data:%s" % str (value ) # PurePosixPath(data_file).stem
337
+ data_id = "data:%s" % str (value ) # PurePosixPath(data_file).stem
350
338
return self .document .entity (
351
339
data_id ,
352
340
{PROV_TYPE : WFPROV ["Artifact" ], PROV_VALUE : str (value )},
@@ -383,7 +371,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
383
371
)
384
372
385
373
if value .get ("class" ):
386
- #_logger.warning("Unknown data class %s.", value["class"])
374
+ # _logger.warning("Unknown data class %s.", value["class"])
387
375
# FIXME: The class might be "http://example.com/somethingelse"
388
376
coll .add_asserted_type (CWLPROV [value ["class" ]])
389
377
@@ -393,7 +381,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
393
381
# clean up unwanted characters
394
382
if isinstance (key , str ):
395
383
key = key .replace ("|" , "_" )
396
- if isinstance (val , str ):
384
+ if isinstance (val , str ):
397
385
val = val .replace ("|" , "_" )
398
386
399
387
v_ent = self .declare_artefact (val )
@@ -440,7 +428,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
440
428
# FIXME: list value does not support adding "@id"
441
429
return coll
442
430
except TypeError :
443
- #_logger.warning("Unrecognized type %s of %r", type(value), value)
431
+ # _logger.warning("Unrecognized type %s of %r", type(value), value)
444
432
# Let's just fall back to Python repr()
445
433
entity = self .document .entity (uuid .uuid4 ().urn , {PROV_LABEL : repr (value )})
446
434
# self.research_object.add_uri(entity.identifier.uri)
@@ -455,7 +443,7 @@ def declare_file(self, value: Dict) -> Tuple[ProvEntity, ProvEntity, str]:
455
443
if "checksum" in value :
456
444
csum = cast (str , value ["checksum" ])
457
445
(method , checksum ) = csum .split ("$" , 1 )
458
- if method == SHA1 : # and self.research_object.has_data_file(checksum):
446
+ if method == SHA1 : # and self.research_object.has_data_file(checksum):
459
447
entity = self .document .entity ("data:" + checksum )
460
448
461
449
if not entity and "location" in value :
@@ -502,8 +490,8 @@ def declare_file(self, value: Dict) -> Tuple[ProvEntity, ProvEntity, str]:
502
490
503
491
# Check for secondaries
504
492
for sec in cast (
505
- # MutableSequence[CWLObjectType],
506
- value .get ("secondaryFiles" , [])
493
+ # MutableSequence[CWLObjectType],
494
+ value .get ("secondaryFiles" , []) # noqa
507
495
):
508
496
# TODO: Record these in a specializationOf entity with UUID?
509
497
if sec ["class" ] == "File" :
@@ -524,8 +512,10 @@ def declare_file(self, value: Dict) -> Tuple[ProvEntity, ProvEntity, str]:
524
512
525
513
return file_entity , entity , checksum
526
514
527
- def declare_directory (self
528
- # , value: CWLObjectType
515
+ def declare_directory (
516
+ self ,
517
+ # value: CWLObjectType
518
+ value
529
519
) -> ProvEntity :
530
520
"""Register any nested files/directories."""
531
521
# FIXME: Calculate a hash-like identifier for directory
@@ -636,12 +626,11 @@ def declare_string(self, value: str) -> Tuple[ProvEntity, str]:
636
626
# checksum = PurePosixPath(data_file).name
637
627
# FIXME: Don't naively assume add_data_file uses hash in filename!
638
628
value = str (value ).replace ("|" , "_" )
639
- data_id = "data:%s" % str (value ) # PurePosixPath(data_file).stem
629
+ data_id = "data:%s" % str (value ) # PurePosixPath(data_file).stem
640
630
entity = self .document .entity (
641
631
data_id , {PROV_TYPE : WFPROV ["Artifact" ], PROV_VALUE : str (value )}
642
632
) # type: ProvEntity
643
- return entity #, checksum
644
-
633
+ return entity # , checksum
645
634
646
635
def generate_output_prov (
647
636
self ,
@@ -724,7 +713,7 @@ def activity_has_provenance(self, activity, prov_ids):
724
713
self .document .activity (activity , other_attributes = attribs )
725
714
# Tip: we can't use https://www.w3.org/TR/prov-links/#term-mention
726
715
# as prov:mentionOf() is only for entities, not activities
727
- uris = [i .uri for i in prov_ids ]
716
+ # uris = [i.uri for i in prov_ids]
728
717
# self.research_object.add_annotation(activity, uris, PROV["has_provenance"].uri)
729
718
730
719
def finalize_prov_profile (self , name = None , out_path = None ):
@@ -759,7 +748,7 @@ def finalize_prov_profile(self, name=None, out_path=None):
759
748
760
749
# https://www.w3.org/TR/prov-xml/
761
750
# serialized_prov_docs["xml"] = self.document.serialize(format="xml", indent=4)
762
- prov_ids .append (self .provenance_ns [filename + ".xml" ])
751
+ prov_ids .append (self .provenance_ns [filename + ".xml" ])
763
752
with open (basename + ".xml" , "w" ) as provenance_file :
764
753
self .document .serialize (provenance_file , format = "xml" , indent = 4 )
765
754
@@ -768,7 +757,6 @@ def finalize_prov_profile(self, name=None, out_path=None):
768
757
prov_ids .append (self .provenance_ns [filename + ".provn" ])
769
758
with open (basename + ".provn" , "w" ) as provenance_file :
770
759
self .document .serialize (provenance_file , format = "provn" , indent = 2 )
771
-
772
760
773
761
# https://www.w3.org/Submission/prov-json/
774
762
# serialized_prov_docs["json"] = self.document.serialize(format="json", indent=2)
@@ -799,7 +787,6 @@ def finalize_prov_profile(self, name=None, out_path=None):
799
787
prov_ids .append (self .provenance_ns [filename + ".jsonld" ])
800
788
with open (basename + ".jsonld" , "w" ) as provenance_file :
801
789
self .document .serialize (provenance_file , format = "rdf" , rdf_format = "json-ld" )
802
-
803
790
804
- #_logger.debug("[provenance] added provenance: %s", prov_ids)
791
+ # _logger.debug("[provenance] added provenance: %s", prov_ids)
805
792
return (serialized_prov_docs , prov_ids )
0 commit comments