Skip to content

Commit 96ebb49

Browse files
AlexTatemr-c
andauthored
Fix erroneous URL encoding of final output filenames (#1446)
* This commit addresses issue #1445. Specifically, it corrects the erroneous URL encoding of output file basenames and nameroots when collecting final outputs. * Further URL encoding bugfixes for CommandLineTool but focused in rev_map(). The changes in collect_output() are formatting only. Unit test for collect_output() are complete Co-authored-by: Michael R. Crusoe <[email protected]>
1 parent 63c7c72 commit 96ebb49

File tree

2 files changed

+106
-28
lines changed

2 files changed

+106
-28
lines changed

Diff for: cwltool/command_line_tool.py

+31-25
Original file line numberDiff line numberDiff line change
@@ -200,9 +200,7 @@ def revmap_file(
200200
outside the container. Recognizes files in the pathmapper or remaps
201201
internal output directories to the external directory.
202202
"""
203-
split = urllib.parse.urlsplit(outdir)
204-
if not split.scheme:
205-
outdir = file_uri(str(outdir))
203+
outdir_uri, outdir_path = file_uri(str(outdir)), outdir
206204

207205
# builder.outdir is the inner (container/compute node) output directory
208206
# outdir is the outer (host/storage system) output directory
@@ -236,21 +234,20 @@ def revmap_file(
236234
):
237235
f["location"] = revmap_f[1]
238236
elif (
239-
uripath == outdir
240-
or uripath.startswith(outdir + os.sep)
241-
or uripath.startswith(outdir + "/")
237+
uripath == outdir_uri
238+
or uripath.startswith(outdir_uri + os.sep)
239+
or uripath.startswith(outdir_uri + "/")
242240
):
243-
f["location"] = file_uri(path)
241+
f["location"] = uripath
244242
elif (
245243
path == builder.outdir
246244
or path.startswith(builder.outdir + os.sep)
247245
or path.startswith(builder.outdir + "/")
248246
):
249-
f["location"] = builder.fs_access.join(
250-
outdir, path[len(builder.outdir) + 1 :]
247+
joined_path = builder.fs_access.join(
248+
outdir_path, path[len(builder.outdir) + 1 :]
251249
)
252-
elif not os.path.isabs(path):
253-
f["location"] = builder.fs_access.join(outdir, path)
250+
f["location"] = file_uri(joined_path)
254251
else:
255252
raise WorkflowException(
256253
"Output file path %s must be within designated output directory (%s) or an input "
@@ -1337,6 +1334,15 @@ def collect_output(
13371334
)
13381335
try:
13391336
prefix = fs_access.glob(outdir)
1337+
sorted_glob_result = sorted(
1338+
fs_access.glob(fs_access.join(outdir, gb)),
1339+
key=cmp_to_key(
1340+
cast(
1341+
Callable[[str, str], int],
1342+
locale.strcoll,
1343+
)
1344+
),
1345+
)
13401346
r.extend(
13411347
[
13421348
{
@@ -1347,24 +1353,24 @@ def collect_output(
13471353
g[len(prefix[0]) + 1 :]
13481354
),
13491355
),
1350-
"basename": os.path.basename(g),
1351-
"nameroot": os.path.splitext(
1352-
os.path.basename(g)
1353-
)[0],
1354-
"nameext": os.path.splitext(
1355-
os.path.basename(g)
1356-
)[1],
1356+
"basename": decoded_basename,
1357+
"nameroot": os.path.splitext(decoded_basename)[
1358+
0
1359+
],
1360+
"nameext": os.path.splitext(decoded_basename)[
1361+
1
1362+
],
13571363
"class": "File"
13581364
if fs_access.isfile(g)
13591365
else "Directory",
13601366
}
1361-
for g in sorted(
1362-
fs_access.glob(fs_access.join(outdir, gb)),
1363-
key=cmp_to_key(
1364-
cast(
1365-
Callable[[str, str], int],
1366-
locale.strcoll,
1367-
)
1367+
for g, decoded_basename in zip(
1368+
sorted_glob_result,
1369+
map(
1370+
lambda x: os.path.basename(
1371+
urllib.parse.unquote(x)
1372+
),
1373+
sorted_glob_result,
13681374
),
13691375
)
13701376
]

Diff for: tests/test_path_checks.py

+75-3
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,19 @@
1-
from pathlib import Path
2-
31
import pytest
2+
import urllib.parse
43

5-
from cwltool.main import main
4+
from pathlib import Path
5+
from typing import cast
6+
from ruamel.yaml.comments import CommentedMap
7+
from schema_salad.sourceline import cmap
68

9+
from cwltool.main import main
10+
from cwltool.command_line_tool import CommandLineTool
11+
from cwltool.context import LoadingContext, RuntimeContext
12+
from cwltool.update import INTERNAL_VERSION
13+
from cwltool.utils import CWLObjectType
714
from .util import needs_docker
815

16+
917
script = """
1018
#!/usr/bin/env cwl-runner
1119
cwlVersion: v1.0
@@ -95,3 +103,67 @@ def test_unicode_in_output_files(tmp_path: Path, filename: str) -> None:
95103
filename,
96104
]
97105
assert main(params) == 0
106+
107+
108+
def test_clt_returns_specialchar_names(tmp_path: Path) -> None:
109+
"""Confirm that special characters in filenames do not cause problems."""
110+
loading_context = LoadingContext(
111+
{
112+
"metadata": {
113+
"cwlVersion": INTERNAL_VERSION,
114+
"http://commonwl.org/cwltool#original_cwlVersion": INTERNAL_VERSION,
115+
}
116+
}
117+
)
118+
clt = CommandLineTool(
119+
cast(
120+
CommentedMap,
121+
cmap(
122+
{
123+
"cwlVersion": INTERNAL_VERSION,
124+
"class": "CommandLineTool",
125+
"inputs": [],
126+
"outputs": [],
127+
"requirements": [],
128+
}
129+
),
130+
),
131+
loading_context,
132+
)
133+
134+
# Reserved characters will be URL encoded during the creation of a file URI
135+
# Internal references to files are in URI form, and are therefore URL encoded
136+
# Final output files should not retain their URL encoded filenames
137+
rfc_3986_gen_delims = [":", "/", "?", "#", "[", "]", "@"]
138+
rfc_3986_sub_delims = ["!", "$", "&", "'", "(", ")", "*", "+", ",", ";", "="]
139+
unix_reserved = ["/", "\0"]
140+
reserved = [
141+
special_char
142+
for special_char in (rfc_3986_gen_delims + rfc_3986_sub_delims)
143+
if special_char not in unix_reserved
144+
]
145+
146+
# Mock an "output" file with the above special characters in its name
147+
special = "".join(reserved)
148+
output_schema = cast(
149+
CWLObjectType, {"type": "File", "outputBinding": {"glob": special}}
150+
)
151+
mock_output = tmp_path / special
152+
mock_output.touch()
153+
154+
# Prepare minimal arguments for CommandLineTool.collect_output()
155+
builder = clt._init_job({}, RuntimeContext())
156+
builder.pathmapper = clt.make_path_mapper(
157+
builder.files, builder.stagedir, RuntimeContext(), True
158+
)
159+
fs_access = builder.make_fs_access(str(tmp_path))
160+
161+
result = cast(
162+
CWLObjectType,
163+
clt.collect_output(output_schema, builder, str(tmp_path), fs_access),
164+
)
165+
166+
assert result["class"] == "File"
167+
assert result["basename"] == special
168+
assert result["nameroot"] == special
169+
assert str(result["location"]).endswith(urllib.parse.quote(special))

0 commit comments

Comments
 (0)