diff --git a/nextstrain_profiles/nextstrain-open/nextstrain_description.md b/nextstrain_profiles/nextstrain-open/nextstrain_description.md index e67effca7..22c2143e5 100644 --- a/nextstrain_profiles/nextstrain-open/nextstrain_description.md +++ b/nextstrain_profiles/nextstrain-open/nextstrain_description.md @@ -37,11 +37,13 @@ To maximize the utility and visibility of these generously shared data, [we prov #### Subsampled sequences and intermediate files -The files below exist for each region / timespan combination, eg `global/6m`, `global/all-time`, `africa/6m`, `africa/all-time`, etc... The links below refer to the `${BUILD}` build, substitute `${BUILD}` with another build name in the links if desired. - -- [${BUILD} metadata.tsv.xz](https://data.nextstrain.org/files/ncov/open/${BUILD}/metadata.tsv.xz) -- [${BUILD} sequences.fasta.xz](https://data.nextstrain.org/files/ncov/open/${BUILD}/sequences.fasta.xz) -- [${BUILD} aligned.fasta.xz](https://data.nextstrain.org/files/ncov/open/${BUILD}/aligned.fasta.xz) -- [${BUILD} Auspice tree](https://data.nextstrain.org/files/ncov/open/${BUILD}/${BUILD}.json) -- [${BUILD} Auspice root sequence](https://data.nextstrain.org/files/ncov/open/${BUILD}/${BUILD}_root-sequence.json) -- [${BUILD} Auspice tip frequencies](https://data.nextstrain.org/files/ncov/open/${BUILD}/${BUILD}_tip-frequencies.json) +The files below exist for every region (`global`, `africa`, `asia`, `europe`, `north-america`, `oceania` and `south-america`) and correspond to each region's 6-month timespan build (e.g. `global/6m`, `africa/6m`, `asia/6m`, etc.). +Files for the `all-time` builds (e.g. `global/all-time`, etc.) are not yet available. +The links below refer to the `${BUILD_PART_0}` region; substitute `${BUILD_PART_0}` with another region name in the links if desired. 
+ +- [${BUILD_PART_0}/6m metadata.tsv.xz](https://data.nextstrain.org/files/ncov/open/${BUILD_PART_0}/metadata.tsv.xz) +- [${BUILD_PART_0}/6m sequences.fasta.xz](https://data.nextstrain.org/files/ncov/open/${BUILD_PART_0}/sequences.fasta.xz) +- [${BUILD_PART_0}/6m aligned.fasta.xz](https://data.nextstrain.org/files/ncov/open/${BUILD_PART_0}/aligned.fasta.xz) +- [${BUILD_PART_0}/6m Auspice tree](https://data.nextstrain.org/files/ncov/open/${BUILD_PART_0}/${BUILD_PART_0}.json) +- [${BUILD_PART_0}/6m Auspice root sequence](https://data.nextstrain.org/files/ncov/open/${BUILD_PART_0}/${BUILD_PART_0}_root-sequence.json) +- [${BUILD_PART_0}/6m Auspice tip frequencies](https://data.nextstrain.org/files/ncov/open/${BUILD_PART_0}/${BUILD_PART_0}_tip-frequencies.json) diff --git a/workflow/snakemake_rules/common.smk b/workflow/snakemake_rules/common.smk index fe580992b..e9a6e6bc8 100644 --- a/workflow/snakemake_rules/common.smk +++ b/workflow/snakemake_rules/common.smk @@ -1,6 +1,7 @@ """Small, shared functions used to generate inputs and parameters. """ import datetime +from itertools import product from urllib.parse import urlsplit def numeric_date(dt=None): @@ -209,16 +210,35 @@ def _get_upload_inputs(wildcards): origin = config["S3_DST_ORIGINS"][0] + # This function bakes in these assumptions here about the build names used + # for the nextstrain.org/ncov/gisaid and …/open builds and then + # special-cases them below. + regions = {"global", "africa", "asia", "europe", "north-america", "oceania", "south-america"} + timespans = {"6m", "all-time"} + region_timespan_builds = [f"{region}_{timespan}" for region, timespan in product(regions, timespans)] + # mapping of remote → local filenames build_files = {} for build_name in config["builds"]: + if build_name in region_timespan_builds: + region, timespan = build_name.split("_") + + # We name remote files only by region (for now), so only include + # the 6m timespan builds. 
+ if timespan != "6m": + continue + + upload_name = region + else: + upload_name = build_name + build_files.update({ - f"{build_name}/sequences.fasta.xz": f"results/{build_name}/{build_name}_subsampled_sequences.fasta.xz", # from `rule combine_samples` - f"{build_name}/metadata.tsv.xz": f"results/{build_name}/{build_name}_subsampled_metadata.tsv.xz", # from `rule combine_samples` - f"{build_name}/aligned.fasta.xz": f"results/{build_name}/aligned.fasta.xz", # from `rule build_align` + f"{upload_name}/sequences.fasta.xz": f"results/{build_name}/{build_name}_subsampled_sequences.fasta.xz", # from `rule combine_samples` + f"{upload_name}/metadata.tsv.xz": f"results/{build_name}/{build_name}_subsampled_metadata.tsv.xz", # from `rule combine_samples` + f"{upload_name}/aligned.fasta.xz": f"results/{build_name}/aligned.fasta.xz", # from `rule build_align` # export the auspice dataset which matches the subsampled sequences / metadata (see `rule finalize`) - f"{build_name}/{build_name}.json": f"auspice/{config['auspice_json_prefix']}_{build_name}.json", - f"{build_name}/{build_name}_tip-frequencies.json": f"auspice/{config['auspice_json_prefix']}_{build_name}_tip-frequencies.json", - f"{build_name}/{build_name}_root-sequence.json": f"auspice/{config['auspice_json_prefix']}_{build_name}_root-sequence.json" + f"{upload_name}/{upload_name}.json": f"auspice/{config['auspice_json_prefix']}_{build_name}.json", + f"{upload_name}/{upload_name}_tip-frequencies.json": f"auspice/{config['auspice_json_prefix']}_{build_name}_tip-frequencies.json", + f"{upload_name}/{upload_name}_root-sequence.json": f"auspice/{config['auspice_json_prefix']}_{build_name}_root-sequence.json" }) return build_files diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 33c3f3533..73038aa4b 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -1365,13 +1365,22 @@ rule build_description: log: 
"logs/build_description_{build_name}.txt" conda: config["conda_environment"] - shell: - """ - env BUILD={wildcards.build_name:q} \ - perl -pe 's/\$\{{BUILD\}}/$ENV{{BUILD}}/g' \ - < {input.description:q} \ - > {output.description:q} - """ + run: + from string import Template + + context = { + "BUILD": wildcards.build_name, + **{ + f"BUILD_PART_{idx}": part + for idx, part + in enumerate(wildcards.build_name.split("_"))}, + } + + with open(input.description, "r", encoding = "utf-8") as i: + template = Template(i.read()) + + with open(output.description, "w", encoding = "utf-8") as o: + o.write(template.safe_substitute(context)) rule export: message: "Exporting data files for Auspice"