diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml index 263e3ec0d..8a4391c58 100644 --- a/defaults/parameters.yaml +++ b/defaults/parameters.yaml @@ -81,8 +81,8 @@ filter: exclude_where: "division='USA'" exclude_ambiguous_dates_by: "any" - # Exclude sequences which are from before late 2019 (likely date mix-ups) - min_date: 2019.74 + # Exclude sequences which are from before Dec 2019 (likely date mix-ups) + min_date: "2019-12-01" # When choosing contextual samples for a focal set, applying crowding penalty # will help reduce the number of genetically identical strains that get chosen, @@ -136,10 +136,10 @@ frequencies: # min_date is set by default to 1 year before present # but can be explicitly set if desired - # Number of months between pivots + # Number of weeks between pivots pivot_interval: 1 - # Measure pivots in weeks rather than months + # Measure pivots in weeks pivot_interval_units: "weeks" # KDE bandwidths in proportion of a year to use per strain. diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml index 14765df7c..3d37c9d85 100644 --- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml +++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml @@ -3,7 +3,6 @@ auspice_json_prefix: ncov_gisaid # Define custom rules for pre- or post-standard workflow processing of data. custom_rules: - workflow/snakemake_rules/export_for_nextstrain.smk - - nextstrain_profiles/nextstrain-gisaid/subsampling_ranges.smk # These parameters are only used by the `export_for_nextstrain` rule and shouldn't need to be modified. # To modify the s3 _source_ bucket, specify this directly in the `inputs` section of the config. 
@@ -14,8 +13,17 @@ S3_DST_ORIGINS: ["gisaid"] upload: - build-files +# Deploy and Slack options are related to Nextstrain live builds and don't need to be modified for local builds +deploy_url: s3://nextstrain-data +slack_token: ~ +slack_channel: "#ncov-gisaid-updates" + genes: ["ORF1a", "ORF1b", "S", "ORF3a", "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "N", "ORF9b"] use_nextalign: true +include_hcov19_prefix: True + +files: + description: "nextstrain_profiles/nextstrain-gisaid/nextstrain_description.md" # Note: unaligned sequences are provided as "aligned" sequences to avoid an initial full-DB alignment # as we re-align everything after subsampling. @@ -29,192 +37,371 @@ inputs: # For each build we specify a subsampling scheme via an explicit key. # These subsampling schemes are defined at the bottom of this file. # (They override the defaults) +# North America and Oceania are subsampled at the "division" level +# Africa, Asia, Europe and South America are subsampled at the "country" level builds: reference: - subsampling_scheme: nextstrain_clades + subsampling_scheme: nextstrain_reference auspice_config: "nextstrain_profiles/nextstrain-gisaid/global_auspice_config.json" title: Genomic epidemiology of SARS-CoV-2 with clade-focused subsampling - global: - subsampling_scheme: nextstrain_region_global + global_six-months: + subsampling_scheme: nextstrain_global_6m + auspice_config: "nextstrain_profiles/nextstrain-gisaid/global_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally over the past 6 months + global_all-time: + subsampling_scheme: nextstrain_global_all_time auspice_config: "nextstrain_profiles/nextstrain-gisaid/global_auspice_config.json" - title: Genomic epidemiology of SARS-CoV-2 with global subsampling - africa: - subsampling_scheme: nextstrain_region + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally since pandemic start + africa_six-months: + subsampling_scheme: 
nextstrain_region_grouped_by_country_6m region: Africa auspice_config: "nextstrain_profiles/nextstrain-gisaid/africa_auspice_config.json" - title: Genomic epidemiology of SARS-CoV-2 with Africa-focused subsampling - asia: - subsampling_scheme: nextstrain_region + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa over the past 6 months + africa_all-time: + subsampling_scheme: nextstrain_region_grouped_by_country_all_time + region: Africa + auspice_config: "nextstrain_profiles/nextstrain-gisaid/africa_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa since pandemic start + asia_six-months: + subsampling_scheme: nextstrain_region_grouped_by_country_6m region: Asia auspice_config: "nextstrain_profiles/nextstrain-gisaid/asia_auspice_config.json" - title: Genomic epidemiology of SARS-CoV-2 with Asia-focused subsampling - europe: - subsampling_scheme: nextstrain_region_grouped_by_country + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 6 months + asia_all-time: + subsampling_scheme: nextstrain_region_grouped_by_country_all_time + region: Asia + auspice_config: "nextstrain_profiles/nextstrain-gisaid/asia_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia since pandemic start + europe_six-months: + subsampling_scheme: nextstrain_region_grouped_by_country_6m region: Europe auspice_config: "nextstrain_profiles/nextstrain-gisaid/europe_auspice_config.json" - title: Genomic epidemiology of SARS-CoV-2 with Europe-focused subsampling - north-america: - subsampling_scheme: nextstrain_region + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe over the past 6 months + europe_all-time: + subsampling_scheme: nextstrain_region_grouped_by_country_all_time + region: Europe + auspice_config: "nextstrain_profiles/nextstrain-gisaid/europe_auspice_config.json" + title: Genomic epidemiology of 
SARS-CoV-2 with subsampling focused on Europe since pandemic start + north-america_six-months: + subsampling_scheme: nextstrain_region_grouped_by_division_6m + region: North America + auspice_config: "nextstrain_profiles/nextstrain-gisaid/north-america_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America over the past 6 months + north-america_all-time: + subsampling_scheme: nextstrain_region_grouped_by_division_all_time region: North America auspice_config: "nextstrain_profiles/nextstrain-gisaid/north-america_auspice_config.json" - title: Genomic epidemiology of SARS-CoV-2 with North America-focused subsampling - oceania: - subsampling_scheme: nextstrain_region + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America since pandemic start + oceania_six-months: + subsampling_scheme: nextstrain_region_grouped_by_division_6m + region: Oceania + auspice_config: "nextstrain_profiles/nextstrain-gisaid/oceania_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania over the past 6 months + oceania_all-time: + subsampling_scheme: nextstrain_region_grouped_by_division_all_time region: Oceania auspice_config: "nextstrain_profiles/nextstrain-gisaid/oceania_auspice_config.json" - title: Genomic epidemiology of SARS-CoV-2 with Oceania-focused subsampling - south-america: - subsampling_scheme: nextstrain_region + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania since pandemic start + south-america_six-months: + subsampling_scheme: nextstrain_region_grouped_by_country_6m + region: South America + auspice_config: "nextstrain_profiles/nextstrain-gisaid/south-america_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America over the past 6 months + south-america_all-time: + subsampling_scheme: nextstrain_region_grouped_by_country_all_time region: South America auspice_config: 
"nextstrain_profiles/nextstrain-gisaid/south-america_auspice_config.json" - title: Genomic epidemiology of SARS-CoV-2 with South America-focused subsampling + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America since pandemic start -# remove S dropout sequences and sequences without division label in US +# remove sequences without division label in US filter: - exclude_where: "division='USA' purpose_of_sequencing='S dropout'" + exclude_where: "division='USA'" +subsampling: -# if different traits should be reconstructed for some builds, specify here -# otherwise the default trait config in defaults/parameters.yaml will used -traits: - global: - sampling_bias_correction: 2.5 - columns: ["region"] - europe: - sampling_bias_correction: 2.5 - columns: ["country"] - africa: - sampling_bias_correction: 2.5 - columns: ["country"] - asia: - sampling_bias_correction: 2.5 - columns: ["country"] - south-america: - sampling_bias_correction: 2.5 - columns: ["country"] - north-america: - sampling_bias_correction: 2.5 - columns: ["division"] - oceania: - sampling_bias_correction: 2.5 - columns: ["division"] - -files: - description: "nextstrain_profiles/nextstrain-gisaid/nextstrain_description.md" + # Custom subsampling logic for group by clade + nextstrain_reference: + clades: + group_by: "Nextstrain_clade" + max_sequences: 300 -# Deploy and Slack options are related to Nextstrain live builds and don't need to be modified for local builds -deploy_url: s3://nextstrain-data -slack_token: ~ -slack_channel: "#ncov-gisaid-updates" + # Custom subsampling logic for regions over 6m + # Grouping by division for North America and Oceania + # 4000 total + # 4:1 ratio of recent to early + # 4:1 ratio of focal to context + nextstrain_region_grouped_by_division_6m: + # Early focal samples for region + focal_early: + group_by: "division year month" + max_sequences: 640 + max_date: "--max-date 6M" + exclude: "--exclude-where 'region!={region}'" + # Early contextual 
samples from the rest of the world + context_early: + group_by: "country year month" + max_sequences: 160 + max_date: "--max-date 6M" + exclude: "--exclude-where 'region={region}'" + # Recent focal samples for region + focal_recent: + group_by: "division year month" + max_sequences: 2560 + min_date: "--min-date 6M" + exclude: "--exclude-where 'region!={region}'" + # Recent contextual samples from the rest of the world + context_recent: + group_by: "country year month" + max_sequences: 640 + min_date: "--min-date 6M" + exclude: "--exclude-where 'region={region}'" -subsampling: - # Custom subsampling logic for regions - nextstrain_region: + # Custom subsampling logic for regions over all-time + # Grouping by division for North America and Oceania + # 4000 total + # 4:1 ratio of focal to context + nextstrain_region_grouped_by_division_all_time: # Focal samples for region - region_early: + focal: group_by: "division year month" + max_sequences: 3200 + exclude: "--exclude-where 'region!={region}'" + # Contextual samples from the rest of the world + context: + group_by: "country year month" max_sequences: 800 + exclude: "--exclude-where 'region={region}'" + + # Custom subsampling logic for regions over 6m + # Grouping by country for Africa, Asia, Europe and South America + # 4000 total + # 4:1 ratio of recent to early + # 4:1 ratio of focal to context + nextstrain_region_grouped_by_country_6m: + # Early focal samples for region + focal_early: + group_by: "country year month" + max_sequences: 640 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!={region}'" - # Contextual samples for region from the rest of the world - global_early: + # Early contextual samples from the rest of the world + context_early: group_by: "country year month" - max_sequences: 600 + max_sequences: 160 + max_date: "--max-date 6M" + exclude: "--exclude-where 'region={region}'" + # Recent focal samples for region + focal_recent: + group_by: "country year month" + max_sequences: 2560 + 
min_date: "--min-date 6M" + exclude: "--exclude-where 'region!={region}'" + # Recent contextual samples from the rest of the world + context_recent: + group_by: "country year month" + max_sequences: 640 + min_date: "--min-date 6M" exclude: "--exclude-where 'region={region}'" - region_late: - group_by: "division year month" - max_sequences: 1700 + # Custom subsampling logic for regions over all-time + # Grouping by country for Africa, Asia, Europe and South America + # 4000 total + # 4:1 ratio of focal to context + nextstrain_region_grouped_by_country_all_time: + # Focal samples for region + focal: + group_by: "country year month" + max_sequences: 3200 exclude: "--exclude-where 'region!={region}'" - # Contextual samples for region from the rest of the world - global_late: + # Contextual samples from the rest of the world + context: group_by: "country year month" - max_sequences: 1000 + max_sequences: 800 exclude: "--exclude-where 'region={region}'" - # Custom subsampling logic for global region. 
- nextstrain_region_global: + # Custom subsampling logic for global region over 6m + # 4000 total + # 4:1 ratio of recent to early + # all regions equal except Oceania at 33% + nextstrain_global_6m: africa_early: group_by: "country year month" - max_sequences: 200 + max_sequences: 150 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Africa'" asia_early: group_by: "country year month" - max_sequences: 300 + max_sequences: 150 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Asia'" europe_early: group_by: "country year month" - max_sequences: 200 + max_sequences: 150 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Europe'" north_america_early: group_by: "division year month" - max_sequences: 200 + max_sequences: 150 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=North America'" south_america_early: group_by: "country year month" - max_sequences: 200 + max_sequences: 150 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=South America'" oceania_early: group_by: "division year month" - max_sequences: 200 + max_sequences: 50 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Oceania'" - - africa_late: + africa_recent: group_by: "country year month" - max_sequences: 400 + max_sequences: 600 + min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Africa'" - asia_late: + asia_recent: group_by: "country year month" - max_sequences: 400 + max_sequences: 600 + min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Asia'" - europe_late: + europe_recent: group_by: "country year month" max_sequences: 600 + min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Europe'" - north_america_late: + north_america_recent: group_by: "division year month" max_sequences: 600 + min_date: "--min-date 6M" exclude: "--exclude-where 'region!=North America'" - south_america_late: + south_america_recent: group_by: "country year month" - max_sequences: 400 + max_sequences: 600 + min_date: 
"--min-date 6M" exclude: "--exclude-where 'region!=South America'" - oceania_late: + oceania_recent: group_by: "division year month" - max_sequences: 300 + max_sequences: 200 + min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Oceania'" - # Custom subsampling for regions like Europe where grouping by country - # is the smallest resolution requied - nextstrain_region_grouped_by_country: - # Focal samples for region - region_late: + # Custom subsampling logic for global region over all-time + # 4000 total + # all regions equal except Oceania at 33% + nextstrain_global_all_time: + africa: group_by: "country year month" - max_sequences: 1700 - exclude: "--exclude-where 'region!={region}'" - # Contextual samples for region from the rest of the world - global_late: + max_sequences: 750 + exclude: "--exclude-where 'region!=Africa'" + asia: group_by: "country year month" - max_sequences: 1000 - exclude: "--exclude-where 'region={region}'" - # Focal samples for region - region_early: + max_sequences: 750 + exclude: "--exclude-where 'region!=Asia'" + europe: group_by: "country year month" - max_sequences: 800 - exclude: "--exclude-where 'region!={region}'" - # Contextual samples for region from the rest of the world - global_early: + max_sequences: 750 + exclude: "--exclude-where 'region!=Europe'" + north_america: + group_by: "division year month" + max_sequences: 750 + exclude: "--exclude-where 'region!=North America'" + south_america: group_by: "country year month" - max_sequences: 500 - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for group by clade - nextstrain_clades: - clades: - group_by: "Nextstrain_clade" - max_sequences: 300 + max_sequences: 750 + exclude: "--exclude-where 'region!=South America'" + oceania: + group_by: "division year month" + max_sequences: 250 + exclude: "--exclude-where 'region!=Oceania'" + +# if different traits should be reconstructed for some builds, specify here +# otherwise the default trait config 
in defaults/parameters.yaml will be used +traits: + global_six-months: + sampling_bias_correction: 2.5 + columns: ["region"] + global_all-time: + sampling_bias_correction: 2.5 + columns: ["region"] + africa_six-months: + sampling_bias_correction: 2.5 + columns: ["country"] + africa_all-time: + sampling_bias_correction: 2.5 + columns: ["country"] + asia_six-months: + sampling_bias_correction: 2.5 + columns: ["country"] + asia_all-time: + sampling_bias_correction: 2.5 + columns: ["country"] + europe_six-months: + sampling_bias_correction: 2.5 + columns: ["country"] + europe_all-time: + sampling_bias_correction: 2.5 + columns: ["country"] + north-america_six-months: + sampling_bias_correction: 2.5 + columns: ["division"] + north-america_all-time: + sampling_bias_correction: 2.5 + columns: ["division"] + oceania_six-months: + sampling_bias_correction: 2.5 + columns: ["division"] + oceania_all-time: + sampling_bias_correction: 2.5 + columns: ["division"] + south-america_six-months: + sampling_bias_correction: 2.5 + columns: ["country"] + south-america_all-time: + sampling_bias_correction: 2.5 + columns: ["country"] -# Define frequencies parameters. 
+# Define frequencies parameters +# Target frequencies to "six-month" vs "all-time" builds frequencies: + global_six-months: recent_days_to_censor: 7 - -include_hcov19_prefix: True + min_date: "2021-10-09" + global_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + africa_six-months: + recent_days_to_censor: 7 + min_date: "2021-10-09" + africa_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + asia_six-months: + recent_days_to_censor: 7 + min_date: "2021-10-09" + asia_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + europe_six-months: + recent_days_to_censor: 7 + min_date: "2021-10-09" + europe_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + north-america_six-months: + recent_days_to_censor: 7 + min_date: "2021-10-09" + north-america_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + oceania_six-months: + recent_days_to_censor: 7 + min_date: "2021-10-09" + oceania_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + south-america_six-months: + recent_days_to_censor: 7 + min_date: "2021-10-09" + south-america_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" diff --git a/nextstrain_profiles/nextstrain-gisaid/subsampling_ranges.smk b/nextstrain_profiles/nextstrain-gisaid/subsampling_ranges.smk deleted file mode 100644 index 8609eef7f..000000000 --- a/nextstrain_profiles/nextstrain-gisaid/subsampling_ranges.smk +++ /dev/null @@ -1,14 +0,0 @@ -import datetime - -# Set subsampling max date to today. -today = datetime.date.today() - -# Set the earliest date to roughly 4 months ago (18 weeks). 
-early_late_cutoff = today - datetime.timedelta(weeks=18) - -for build in config["subsampling"]: - for scheme in config["subsampling"][build]: - if "_early" in scheme: - config["subsampling"][build][scheme]["max_date"] = f"--max-date {early_late_cutoff.strftime('%Y-%m-%d')}" - if "_late" in scheme: - config["subsampling"][build][scheme]["min_date"] = f"--min-date {early_late_cutoff.strftime('%Y-%m-%d')}" diff --git a/workflow/snakemake_rules/common.smk b/workflow/snakemake_rules/common.smk index a3295fef1..fe580992b 100644 --- a/workflow/snakemake_rules/common.smk +++ b/workflow/snakemake_rules/common.smk @@ -167,7 +167,9 @@ def _get_sampling_bias_correction_for_wildcards(wildcards): return config["traits"]["default"]["sampling_bias_correction"] def _get_min_date_for_frequencies(wildcards): - if "frequencies" in config and "min_date" in config["frequencies"]: + if wildcards.build_name in config["frequencies"] and "min_date" in config["frequencies"][wildcards.build_name]: + return config["frequencies"][wildcards.build_name]["min_date"] + elif "frequencies" in config and "min_date" in config["frequencies"]: return config["frequencies"]["min_date"] else: # If not explicitly specified, default to 1 year back from the present @@ -177,7 +179,9 @@ def _get_min_date_for_frequencies(wildcards): ) def _get_max_date_for_frequencies(wildcards): - if "frequencies" in config and "max_date" in config["frequencies"]: + if wildcards.build_name in config["frequencies"] and "max_date" in config["frequencies"][wildcards.build_name]: + return config["frequencies"][wildcards.build_name]["max_date"] + elif "frequencies" in config and "max_date" in config["frequencies"]: return config["frequencies"]["max_date"] else: # Allow users to censor the N most recent days to minimize effects of