Skip to content

Commit 5f93354

Browse files
authored
scandeps should not apply mergedirs at every level (#1615)
* scandeps should not apply mergedirs at every level When getting back a list of dependencies, and using "nestdirs", in the specific case it there are references to both a directory and files within that directory, it may report multiple dependencies for the same directory. The mergedir method merges those references into one. * Add comments to scandeps * Add test. Fix mypy/format
1 parent 7cef4dd commit 5f93354

File tree

4 files changed

+100
-24
lines changed

4 files changed

+100
-24
lines changed

cwltool/main.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@
7373
CWL_IANA,
7474
Process,
7575
add_sizes,
76+
mergedirs,
7677
scandeps,
7778
shortname,
7879
use_custom_schema,
@@ -620,7 +621,9 @@ def loadref(base: str, uri: str) -> Union[CommentedMap, CommentedSeq, str, None]
620621
nestdirs=nestdirs,
621622
)
622623
if sfs is not None:
623-
deps["secondaryFiles"] = cast(MutableSequence[CWLOutputAtomType], sfs)
624+
deps["secondaryFiles"] = cast(
625+
MutableSequence[CWLOutputAtomType], mergedirs(sfs)
626+
)
624627

625628
return deps
626629

cwltool/process.py

+42-23
Original file line numberDiff line numberDiff line change
@@ -1104,42 +1104,38 @@ def nestdir(base: str, deps: CWLObjectType) -> CWLObjectType:
11041104
sp = s2.split("/")
11051105
sp.pop()
11061106
while sp:
1107+
loc = dirname + "/".join(sp)
11071108
nx = sp.pop()
1108-
deps = {"class": "Directory", "basename": nx, "listing": [deps]}
1109+
deps = {
1110+
"class": "Directory",
1111+
"basename": nx,
1112+
"listing": [deps],
1113+
"location": loc,
1114+
}
11091115
return deps
11101116

11111117

1112-
def mergedirs(listing: List[CWLObjectType]) -> List[CWLObjectType]:
1118+
def mergedirs(
1119+
listing: MutableSequence[CWLObjectType],
1120+
) -> MutableSequence[CWLObjectType]:
11131121
r = [] # type: List[CWLObjectType]
11141122
ents = {} # type: Dict[str, CWLObjectType]
1115-
collided = set() # type: Set[str]
11161123
for e in listing:
11171124
basename = cast(str, e["basename"])
11181125
if basename not in ents:
11191126
ents[basename] = e
1127+
elif e["location"] != ents[basename]["location"]:
1128+
raise ValidationException(
1129+
"Conflicting basename in listing or secondaryFiles, '%s' used by both '%s' and '%s'"
1130+
% (basename, e["location"], ents[basename]["location"])
1131+
)
11201132
elif e["class"] == "Directory":
11211133
if e.get("listing"):
1134+
# name already in entries
1135+
# merge it into the existing listing
11221136
cast(
11231137
List[CWLObjectType], ents[basename].setdefault("listing", [])
11241138
).extend(cast(List[CWLObjectType], e["listing"]))
1125-
if cast(str, ents[basename]["location"]).startswith("_:"):
1126-
ents[basename]["location"] = e["location"]
1127-
elif e["location"] != ents[basename]["location"]:
1128-
# same basename, different location, collision,
1129-
# rename both.
1130-
collided.add(basename)
1131-
e2 = ents[basename]
1132-
1133-
e["basename"] = urllib.parse.quote(cast(str, e["location"]), safe="")
1134-
e2["basename"] = urllib.parse.quote(cast(str, e2["location"]), safe="")
1135-
1136-
e["nameroot"], e["nameext"] = os.path.splitext(cast(str, e["basename"]))
1137-
e2["nameroot"], e2["nameext"] = os.path.splitext(cast(str, e2["basename"]))
1138-
1139-
ents[cast(str, e["basename"])] = e
1140-
ents[cast(str, e2["basename"])] = e2
1141-
for c in collided:
1142-
del ents[c]
11431139
for e in ents.values():
11441140
if e["class"] == "Directory" and "listing" in e:
11451141
e["listing"] = cast(
@@ -1162,6 +1158,30 @@ def scandeps(
11621158
urljoin: Callable[[str, str], str] = urllib.parse.urljoin,
11631159
nestdirs: bool = True,
11641160
) -> MutableSequence[CWLObjectType]:
1161+
1162+
"""Given a CWL document or input object, search for dependencies
1163+
(references to external files) of 'doc' and return them as a list
1164+
of File or Directory objects.
1165+
1166+
The 'base' is the base URL for relative references.
1167+
1168+
Looks for objects with 'class: File' or 'class: Directory' and
1169+
adds them to the list of dependencies.
1170+
1171+
Anything in 'urlfields' is also added as a File dependency.
1172+
1173+
Anything in 'reffields' (such as workflow step 'run') will be
1174+
added as a dependency and also loaded (using the 'loadref'
1175+
function) and recursively scanned for dependencies. Those
1176+
dependencies will be added as secondary files to the primary file.
1177+
1178+
If "nestdirs" is true, create intermediate directory objects when
1179+
a file is located in a subdirectory under the starting directory.
1180+
This is so that if the dependencies are materialized, they will
1181+
produce the same relative file system locations.
1182+
1183+
"""
1184+
11651185
r: MutableSequence[CWLObjectType] = []
11661186
if isinstance(doc, MutableMapping):
11671187
if "id" in doc:
@@ -1268,7 +1288,7 @@ def scandeps(
12681288
)
12691289
if sf:
12701290
deps2["secondaryFiles"] = cast(
1271-
MutableSequence[CWLOutputAtomType], sf
1291+
MutableSequence[CWLOutputAtomType], mergedirs(sf)
12721292
)
12731293
if nestdirs:
12741294
deps2 = nestdir(base, deps2)
@@ -1313,7 +1333,6 @@ def scandeps(
13131333

13141334
if r:
13151335
normalizeFilesDirs(r)
1316-
r = mergedirs(cast(List[CWLObjectType], r))
13171336

13181337
return r
13191338

tests/test_examples.py

+47
Original file line numberDiff line numberDiff line change
@@ -512,6 +512,53 @@ def loadref(
512512
assert scanned_deps2 == expected_deps
513513

514514

515+
def test_scandeps_samedirname() -> None:
516+
obj: CWLObjectType = {
517+
"dir1": {"class": "Directory", "location": "tests/wf/dir1/foo"},
518+
"dir2": {"class": "Directory", "location": "tests/wf/dir2/foo"},
519+
}
520+
521+
def loadref(
522+
base: str, p: Union[CommentedMap, CommentedSeq, str, None]
523+
) -> Union[CommentedMap, CommentedSeq, str, None]:
524+
if isinstance(p, dict):
525+
return p
526+
raise Exception("test case can't load things")
527+
528+
scanned_deps = cast(
529+
List[Dict[str, Any]],
530+
cwltool.process.scandeps(
531+
"",
532+
obj,
533+
{"$import", "run"},
534+
{"$include", "$schemas", "location"},
535+
loadref,
536+
nestdirs=False,
537+
),
538+
)
539+
540+
scanned_deps.sort(key=lambda k: cast(str, k["basename"]))
541+
542+
expected_deps = [
543+
{"basename": "foo", "class": "Directory", "location": "tests/wf/dir1/foo"},
544+
{"basename": "foo", "class": "Directory", "location": "tests/wf/dir2/foo"},
545+
]
546+
547+
assert scanned_deps == expected_deps
548+
549+
550+
def test_scandeps_collision() -> None:
551+
stream = StringIO()
552+
553+
assert (
554+
main(
555+
["--print-deps", "--debug", get_data("tests/wf/dir_deps.json")],
556+
stdout=stream,
557+
)
558+
== 1
559+
)
560+
561+
515562
def test_trick_scandeps() -> None:
516563
stream = StringIO()
517564

tests/wf/dir_deps.json

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
"dir1": {
2+
"class": "Directory",
3+
"listing": [
4+
{"class": "File", "basename": "foo", "location": "tests/wf/foo1"},
5+
{"class": "File", "basename": "foo", "location": "tests/wf/foo2"}
6+
]
7+
}

0 commit comments

Comments
 (0)