From 5ddf7cc861eca7fd879310ccbf00d80ccc6d18d4 Mon Sep 17 00:00:00 2001 From: Trevor Bedford Date: Sun, 10 Apr 2022 09:27:50 -0700 Subject: [PATCH] Split GISAID profile to "six-month" and "all-time" builds This commit splits the existing regional builds ("global", "africa", etc.) in the "nextstrain-gisaid" profile into "six-month" builds that focus subsampling on the previous six months and "all-time" builds that subsample evenly across time. This uses the new relative dates functionality in "augur filter" to make these subsampling strategies easier to implement and more obvious. Frequency timespans are set to match subsampling ranges. The general subsampling logic is cleaned up in a few ways: 1. North America and Oceania are subsampled and traits reconstructed at the "division" level, while Africa, Asia, Europe and South America are subsampled and traits reconstructed at the "country" level. Previously this behavior had been inconsistent between subsampling, traits, etc. 2. For global builds, all regions are now sampled at equal frequency except for Oceania, which is sampled at 33% of that frequency. Previous overemphasis on Europe and North America is no longer justified. 3. There is a consistent 4:1 emphasis on recent vs early samples for the "six-month" builds and a consistent 4:1 emphasis on focal vs context for the regional builds.
--- defaults/parameters.yaml | 8 +- .../nextstrain-gisaid/builds.yaml | 419 +++++++++++++----- .../nextstrain-gisaid/subsampling_ranges.smk | 14 - workflow/snakemake_rules/common.smk | 8 +- 4 files changed, 313 insertions(+), 136 deletions(-) delete mode 100644 nextstrain_profiles/nextstrain-gisaid/subsampling_ranges.smk diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml index 263e3ec0d..8a4391c58 100644 --- a/defaults/parameters.yaml +++ b/defaults/parameters.yaml @@ -81,8 +81,8 @@ filter: exclude_where: "division='USA'" exclude_ambiguous_dates_by: "any" - # Exclude sequences which are from before late 2019 (likely date mix-ups) - min_date: 2019.74 + # Exclude sequences which are from before Dec 2019 (likely date mix-ups) + min_date: "2019-12-01" # When choosing contextual samples for a focal set, applying crowding penalty # will help reduce the number of genetically identical strains that get chosen, @@ -136,10 +136,10 @@ frequencies: # min_date is set by default to 1 year before present # but can be explicitly set if desired - # Number of months between pivots + # Number of weeks between pivots pivot_interval: 1 - # Measure pivots in weeks rather than months + # Measure pivots in weeks pivot_interval_units: "weeks" # KDE bandwidths in proportion of a year to use per strain. diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml index 14765df7c..3d37c9d85 100644 --- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml +++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml @@ -3,7 +3,6 @@ auspice_json_prefix: ncov_gisaid # Define custom rules for pre- or post-standard workflow processing of data. custom_rules: - workflow/snakemake_rules/export_for_nextstrain.smk - - nextstrain_profiles/nextstrain-gisaid/subsampling_ranges.smk # These parameters are only used by the `export_for_nextstrain` rule and shouldn't need to be modified. 
# To modify the s3 _source_ bucket, specify this directly in the `inputs` section of the config. @@ -14,8 +13,17 @@ S3_DST_ORIGINS: ["gisaid"] upload: - build-files +# Deploy and Slack options are related to Nextstrain live builds and don't need to be modified for local builds +deploy_url: s3://nextstrain-data +slack_token: ~ +slack_channel: "#ncov-gisaid-updates" + genes: ["ORF1a", "ORF1b", "S", "ORF3a", "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "N", "ORF9b"] use_nextalign: true +include_hcov19_prefix: True + +files: + description: "nextstrain_profiles/nextstrain-gisaid/nextstrain_description.md" # Note: unaligned sequences are provided as "aligned" sequences to avoid an initial full-DB alignment # as we re-align everything after subsampling. @@ -29,192 +37,371 @@ inputs: # For each build we specify a subsampling scheme via an explicit key. # These subsampling schemes are defined at the bottom of this file. # (They override the defaults) +# North America and Oceania are subsampled at the "division" level +# Africa, Asia, Europe and South America are subsampled at the "country" level builds: reference: - subsampling_scheme: nextstrain_clades + subsampling_scheme: nextstrain_reference auspice_config: "nextstrain_profiles/nextstrain-gisaid/global_auspice_config.json" title: Genomic epidemiology of SARS-CoV-2 with clade-focused subsampling - global: - subsampling_scheme: nextstrain_region_global + global_six-months: + subsampling_scheme: nextstrain_global_6m + auspice_config: "nextstrain_profiles/nextstrain-gisaid/global_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally over the past 6 months + global_all-time: + subsampling_scheme: nextstrain_global_all_time auspice_config: "nextstrain_profiles/nextstrain-gisaid/global_auspice_config.json" - title: Genomic epidemiology of SARS-CoV-2 with global subsampling - africa: - subsampling_scheme: nextstrain_region + title: Genomic epidemiology of SARS-CoV-2 with subsampling 
focused globally since pandemic start + africa_six-months: + subsampling_scheme: nextstrain_region_grouped_by_country_6m region: Africa auspice_config: "nextstrain_profiles/nextstrain-gisaid/africa_auspice_config.json" - title: Genomic epidemiology of SARS-CoV-2 with Africa-focused subsampling - asia: - subsampling_scheme: nextstrain_region + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa over the past 6 months + africa_all-time: + subsampling_scheme: nextstrain_region_grouped_by_country_all_time + region: Africa + auspice_config: "nextstrain_profiles/nextstrain-gisaid/africa_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa since pandemic start + asia_six-months: + subsampling_scheme: nextstrain_region_grouped_by_country_6m region: Asia auspice_config: "nextstrain_profiles/nextstrain-gisaid/asia_auspice_config.json" - title: Genomic epidemiology of SARS-CoV-2 with Asia-focused subsampling - europe: - subsampling_scheme: nextstrain_region_grouped_by_country + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 6 months + asia_all-time: + subsampling_scheme: nextstrain_region_grouped_by_country_all_time + region: Asia + auspice_config: "nextstrain_profiles/nextstrain-gisaid/asia_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia since pandemic start + europe_six-months: + subsampling_scheme: nextstrain_region_grouped_by_country_6m region: Europe auspice_config: "nextstrain_profiles/nextstrain-gisaid/europe_auspice_config.json" - title: Genomic epidemiology of SARS-CoV-2 with Europe-focused subsampling - north-america: - subsampling_scheme: nextstrain_region + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe over the past 6 months + europe_all-time: + subsampling_scheme: nextstrain_region_grouped_by_country_all_time + region: Europe + auspice_config: 
"nextstrain_profiles/nextstrain-gisaid/europe_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe since pandemic start + north-america_six-months: + subsampling_scheme: nextstrain_region_grouped_by_division_6m + region: North America + auspice_config: "nextstrain_profiles/nextstrain-gisaid/north-america_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America over the past 6 months + north-america_all-time: + subsampling_scheme: nextstrain_region_grouped_by_division_all_time region: North America auspice_config: "nextstrain_profiles/nextstrain-gisaid/north-america_auspice_config.json" - title: Genomic epidemiology of SARS-CoV-2 with North America-focused subsampling - oceania: - subsampling_scheme: nextstrain_region + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America since pandemic start + oceania_six-months: + subsampling_scheme: nextstrain_region_grouped_by_division_6m + region: Oceania + auspice_config: "nextstrain_profiles/nextstrain-gisaid/oceania_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania over the past 6 months + oceania_all-time: + subsampling_scheme: nextstrain_region_grouped_by_division_all_time region: Oceania auspice_config: "nextstrain_profiles/nextstrain-gisaid/oceania_auspice_config.json" - title: Genomic epidemiology of SARS-CoV-2 with Oceania-focused subsampling - south-america: - subsampling_scheme: nextstrain_region + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania since pandemic start + south-america_six-months: + subsampling_scheme: nextstrain_region_grouped_by_country_6m + region: South America + auspice_config: "nextstrain_profiles/nextstrain-gisaid/south-america_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America over the past 6 months + south-america_all-time: + 
subsampling_scheme: nextstrain_region_grouped_by_country_all_time region: South America auspice_config: "nextstrain_profiles/nextstrain-gisaid/south-america_auspice_config.json" - title: Genomic epidemiology of SARS-CoV-2 with South America-focused subsampling + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America since pandemic start -# remove S dropout sequences and sequences without division label in US +# remove sequences without division label in US filter: - exclude_where: "division='USA' purpose_of_sequencing='S dropout'" + exclude_where: "division='USA'" +subsampling: -# if different traits should be reconstructed for some builds, specify here -# otherwise the default trait config in defaults/parameters.yaml will used -traits: - global: - sampling_bias_correction: 2.5 - columns: ["region"] - europe: - sampling_bias_correction: 2.5 - columns: ["country"] - africa: - sampling_bias_correction: 2.5 - columns: ["country"] - asia: - sampling_bias_correction: 2.5 - columns: ["country"] - south-america: - sampling_bias_correction: 2.5 - columns: ["country"] - north-america: - sampling_bias_correction: 2.5 - columns: ["division"] - oceania: - sampling_bias_correction: 2.5 - columns: ["division"] - -files: - description: "nextstrain_profiles/nextstrain-gisaid/nextstrain_description.md" + # Custom subsampling logic for group by clade + nextstrain_reference: + clades: + group_by: "Nextstrain_clade" + max_sequences: 300 -# Deploy and Slack options are related to Nextstrain live builds and don't need to be modified for local builds -deploy_url: s3://nextstrain-data -slack_token: ~ -slack_channel: "#ncov-gisaid-updates" + # Custom subsampling logic for regions over 6m + # Grouping by division for North America and Oceania + # 4000 total + # 4:1 ratio of recent to early + # 4:1 ratio of focal to context + nextstrain_region_grouped_by_division_6m: + # Early focal samples for region + focal_early: + group_by: "division year month" + 
max_sequences: 640 + max_date: "--max-date 6M" + exclude: "--exclude-where 'region!={region}'" + # Early contextual samples from the rest of the world + context_early: + group_by: "country year month" + max_sequences: 160 + max_date: "--max-date 6M" + exclude: "--exclude-where 'region={region}'" + # Recent focal samples for region + focal_recent: + group_by: "division year month" + max_sequences: 2560 + min_date: "--min-date 6M" + exclude: "--exclude-where 'region!={region}'" + # Recent contextual samples from the rest of the world + context_recent: + group_by: "country year month" + max_sequences: 640 + min_date: "--min-date 6M" + exclude: "--exclude-where 'region={region}'" -subsampling: - # Custom subsampling logic for regions - nextstrain_region: + # Custom subsampling logic for regions over all-time + # Grouping by division for North America and Oceania + # 4000 total + # 4:1 ratio of focal to context + nextstrain_region_grouped_by_division_all_time: # Focal samples for region - region_early: + focal: group_by: "division year month" + max_sequences: 3200 + exclude: "--exclude-where 'region!={region}'" + # Contextual samples from the rest of the world + context: + group_by: "country year month" max_sequences: 800 + exclude: "--exclude-where 'region={region}'" + + # Custom subsampling logic for regions over 6m + # Grouping by country for Africa, Asia, Europe and South America + # 4000 total + # 4:1 ratio of recent to early + # 4:1 ratio of focal to context + nextstrain_region_grouped_by_country_6m: + # Early focal samples for region + focal_early: + group_by: "country year month" + max_sequences: 640 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!={region}'" - # Contextual samples for region from the rest of the world - global_early: + # Early contextual samples from the rest of the world + context_early: group_by: "country year month" - max_sequences: 600 + max_sequences: 160 + max_date: "--max-date 6M" + exclude: "--exclude-where
'region={region}'" + # Recent focal samples for region + focal_recent: + group_by: "country year month" + max_sequences: 2560 + min_date: "--min-date 6M" + exclude: "--exclude-where 'region!={region}'" + # Recent contextual samples from the rest of the world + context_recent: + group_by: "country year month" + max_sequences: 640 + min_date: "--min-date 6M" exclude: "--exclude-where 'region={region}'" - region_late: - group_by: "division year month" - max_sequences: 1700 + # Custom subsampling logic for regions over all-time + # Grouping by country for Africa, Asia, Europe and South America + # 4000 total + # 4:1 ratio of focal to context + nextstrain_region_grouped_by_country_all_time: + # Focal samples for region + focal: + group_by: "country year month" + max_sequences: 3200 exclude: "--exclude-where 'region!={region}'" - # Contextual samples for region from the rest of the world - global_late: + # Contextual samples from the rest of the world + context: group_by: "country year month" - max_sequences: 1000 + max_sequences: 800 exclude: "--exclude-where 'region={region}'" - # Custom subsampling logic for global region.
- nextstrain_region_global: + # Custom subsampling logic for global region over 6m + # 4000 total + # 4:1 ratio of recent to early + # all regions equal except Oceania at 33% + nextstrain_global_6m: africa_early: group_by: "country year month" - max_sequences: 200 + max_sequences: 150 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Africa'" asia_early: group_by: "country year month" - max_sequences: 300 + max_sequences: 150 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Asia'" europe_early: group_by: "country year month" - max_sequences: 200 + max_sequences: 150 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Europe'" north_america_early: group_by: "division year month" - max_sequences: 200 + max_sequences: 150 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=North America'" south_america_early: group_by: "country year month" - max_sequences: 200 + max_sequences: 150 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=South America'" oceania_early: group_by: "division year month" - max_sequences: 200 + max_sequences: 50 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Oceania'" - - africa_late: + africa_recent: group_by: "country year month" - max_sequences: 400 + max_sequences: 600 + min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Africa'" - asia_late: + asia_recent: group_by: "country year month" - max_sequences: 400 + max_sequences: 600 + min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Asia'" - europe_late: + europe_recent: group_by: "country year month" max_sequences: 600 + min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Europe'" - north_america_late: + north_america_recent: group_by: "division year month" max_sequences: 600 + min_date: "--min-date 6M" exclude: "--exclude-where 'region!=North America'" - south_america_late: + south_america_recent: group_by: "country year month" - max_sequences: 400 + max_sequences: 600 + min_date:
"--min-date 6M" exclude: "--exclude-where 'region!=South America'" - oceania_late: + oceania_recent: group_by: "division year month" - max_sequences: 300 + max_sequences: 200 + min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Oceania'" - # Custom subsampling for regions like Europe where grouping by country - # is the smallest resolution requied - nextstrain_region_grouped_by_country: - # Focal samples for region - region_late: + # Custom subsampling logic for global region over all-time + # 4000 total + # all regions equal except Oceania at 33% + nextstrain_global_all_time: + africa: group_by: "country year month" - max_sequences: 1700 - exclude: "--exclude-where 'region!={region}'" - # Contextual samples for region from the rest of the world - global_late: + max_sequences: 750 + exclude: "--exclude-where 'region!=Africa'" + asia: group_by: "country year month" - max_sequences: 1000 - exclude: "--exclude-where 'region={region}'" - # Focal samples for region - region_early: + max_sequences: 750 + exclude: "--exclude-where 'region!=Asia'" + europe: group_by: "country year month" - max_sequences: 800 - exclude: "--exclude-where 'region!={region}'" - # Contextual samples for region from the rest of the world - global_early: + max_sequences: 750 + exclude: "--exclude-where 'region!=Europe'" + north_america: + group_by: "division year month" + max_sequences: 750 + exclude: "--exclude-where 'region!=North America'" + south_america: group_by: "country year month" - max_sequences: 500 - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for group by clade - nextstrain_clades: - clades: - group_by: "Nextstrain_clade" - max_sequences: 300 + max_sequences: 750 + exclude: "--exclude-where 'region!=South America'" + oceania: + group_by: "division year month" + max_sequences: 250 + exclude: "--exclude-where 'region!=Oceania'" + +# if different traits should be reconstructed for some builds, specify here +# otherwise the default trait config 
in defaults/parameters.yaml will be used +traits: + global_six-months: + sampling_bias_correction: 2.5 + columns: ["region"] + global_all-time: + sampling_bias_correction: 2.5 + columns: ["region"] + africa_six-months: + sampling_bias_correction: 2.5 + columns: ["country"] + africa_all-time: + sampling_bias_correction: 2.5 + columns: ["country"] + asia_six-months: + sampling_bias_correction: 2.5 + columns: ["country"] + asia_all-time: + sampling_bias_correction: 2.5 + columns: ["country"] + europe_six-months: + sampling_bias_correction: 2.5 + columns: ["country"] + europe_all-time: + sampling_bias_correction: 2.5 + columns: ["country"] + north-america_six-months: + sampling_bias_correction: 2.5 + columns: ["division"] + north-america_all-time: + sampling_bias_correction: 2.5 + columns: ["division"] + oceania_six-months: + sampling_bias_correction: 2.5 + columns: ["division"] + oceania_all-time: + sampling_bias_correction: 2.5 + columns: ["division"] + south-america_six-months: + sampling_bias_correction: 2.5 + columns: ["country"] + south-america_all-time: + sampling_bias_correction: 2.5 + columns: ["country"] -# Define frequencies parameters.
+# Define frequencies parameters +# Target frequencies to "six-month" vs "all-time" builds frequencies: + global_six-months: recent_days_to_censor: 7 - -include_hcov19_prefix: True + min_date: "2021-10-09" + global_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + africa_six-months: + recent_days_to_censor: 7 + min_date: "2021-10-09" + africa_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + asia_six-months: + recent_days_to_censor: 7 + min_date: "2021-10-09" + asia_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + europe_six-months: + recent_days_to_censor: 7 + min_date: "2021-10-09" + europe_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + north-america_six-months: + recent_days_to_censor: 7 + min_date: "2021-10-09" + north-america_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + oceania_six-months: + recent_days_to_censor: 7 + min_date: "2021-10-09" + oceania_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + south-america_six-months: + recent_days_to_censor: 7 + min_date: "2021-10-09" + south-america_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" diff --git a/nextstrain_profiles/nextstrain-gisaid/subsampling_ranges.smk b/nextstrain_profiles/nextstrain-gisaid/subsampling_ranges.smk deleted file mode 100644 index 8609eef7f..000000000 --- a/nextstrain_profiles/nextstrain-gisaid/subsampling_ranges.smk +++ /dev/null @@ -1,14 +0,0 @@ -import datetime - -# Set subsampling max date to today. -today = datetime.date.today() - -# Set the earliest date to roughly 4 months ago (18 weeks). 
-early_late_cutoff = today - datetime.timedelta(weeks=18) - -for build in config["subsampling"]: - for scheme in config["subsampling"][build]: - if "_early" in scheme: - config["subsampling"][build][scheme]["max_date"] = f"--max-date {early_late_cutoff.strftime('%Y-%m-%d')}" - if "_late" in scheme: - config["subsampling"][build][scheme]["min_date"] = f"--min-date {early_late_cutoff.strftime('%Y-%m-%d')}" diff --git a/workflow/snakemake_rules/common.smk b/workflow/snakemake_rules/common.smk index 92ea875d1..68928f727 100644 --- a/workflow/snakemake_rules/common.smk +++ b/workflow/snakemake_rules/common.smk @@ -167,7 +167,9 @@ def _get_sampling_bias_correction_for_wildcards(wildcards): return config["traits"]["default"]["sampling_bias_correction"] def _get_min_date_for_frequencies(wildcards): - if "frequencies" in config and "min_date" in config["frequencies"]: + if wildcards.build_name in config["frequencies"] and "min_date" in config["frequencies"][wildcards.build_name]: + return config["frequencies"][wildcards.build_name]["min_date"] + elif "frequencies" in config and "min_date" in config["frequencies"]: return config["frequencies"]["min_date"] else: # If not explicitly specified, default to 1 year back from the present @@ -177,7 +179,9 @@ def _get_min_date_for_frequencies(wildcards): ) def _get_max_date_for_frequencies(wildcards): - if "frequencies" in config and "max_date" in config["frequencies"]: + if wildcards.build_name in config["frequencies"] and "max_date" in config["frequencies"][wildcards.build_name]: + return config["frequencies"][wildcards.build_name]["max_date"] + elif "frequencies" in config and "max_date" in config["frequencies"]: return config["frequencies"]["max_date"] else: # Allow users to censor the N most recent days to minimize effects of