diff --git a/.github/workflows/rebuild-gisaid.yml b/.github/workflows/rebuild-gisaid.yml index 50944d877..1de85ff64 100644 --- a/.github/workflows/rebuild-gisaid.yml +++ b/.github/workflows/rebuild-gisaid.yml @@ -50,14 +50,14 @@ jobs: nextstrain build \ --aws-batch \ --detach \ - --cpus 36 \ - --memory 70GiB \ + --cpus 72 \ + --memory 140GiB \ . \ deploy \ upload \ --config "${config[@]}" \ --profile nextstrain_profiles/nextstrain-gisaid \ - --set-threads tree=16 \ + --set-threads tree=8 \ |& tee build-launch.log env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} diff --git a/.github/workflows/rebuild-open.yml b/.github/workflows/rebuild-open.yml index 50095dbb6..e42d13522 100644 --- a/.github/workflows/rebuild-open.yml +++ b/.github/workflows/rebuild-open.yml @@ -51,14 +51,14 @@ jobs: nextstrain build \ --aws-batch \ --detach \ - --cpus 36 \ - --memory 70GiB \ + --cpus 72 \ + --memory 140GiB \ . \ deploy \ upload \ --config "${config[@]}" \ --profile nextstrain_profiles/nextstrain-open \ - --set-threads tree=16 \ + --set-threads tree=8 \ |& tee build-launch.log env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml index 25035d2a2..4f91e7fd9 100644 --- a/defaults/parameters.yaml +++ b/defaults/parameters.yaml @@ -81,8 +81,8 @@ filter: exclude_where: "division='USA'" exclude_ambiguous_dates_by: "any" - # Exclude sequences which are from before late 2019 (likely date mix-ups) - min_date: 2019.74 + # Exclude sequences which are from before Dec 2019 (likely date mix-ups) + min_date: "2019-12-01" # When choosing contextual samples for a focal set, applying crowding penalty # will help reduce the number of genetically identical strains that get chosen, @@ -136,10 +136,10 @@ frequencies: # min_date is set by default to 1 year before present # but can be explicitly set if desired - # Number of months between pivots + # Number of weeks between pivots pivot_interval: 1 - # Measure pivots in weeks 
rather than months + # Measure pivots in weeks pivot_interval_units: "weeks" # KDE bandwidths in proportion of a year to use per strain. diff --git a/docs/src/reference/change_log.md b/docs/src/reference/change_log.md index 37fd8547e..fb20d8271 100644 --- a/docs/src/reference/change_log.md +++ b/docs/src/reference/change_log.md @@ -5,6 +5,8 @@ We also use this change log to document new features that maintain backward comp ## New features since last version update +- 29 April 2022: Include multiple timespans in Nextstrain profile builds. [PR 910](https://github.com/nextstrain/ncov/pull/910) + - 29 April 2022: Update default mask parameters to mask 200 bases from the end of the genome rather than the existing 50. This was necessary because there is a large deletion in this region in circulating 21L viruses. This deletion is causing problems with alignment and the resulting mis-alignment appears as excess mutations in the tree. [PR 939](https://github.com/nextstrain/ncov/pull/939). - 27 April 2022: Include new clades 22A, 22B and 22C, where 22A corresponds to Pango lineage BA.4, 22B corresponds to Pango lineage BA.5 and 22C corresponds to Pango lineage BA.2.12.1. Please see [PR 933](https://github.com/nextstrain/ncov/pull/933) for rationale behind these clade updates. diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml index 14765df7c..9af669cce 100644 --- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml +++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml @@ -3,7 +3,6 @@ auspice_json_prefix: ncov_gisaid # Define custom rules for pre- or post-standard workflow processing of data. custom_rules: - workflow/snakemake_rules/export_for_nextstrain.smk - - nextstrain_profiles/nextstrain-gisaid/subsampling_ranges.smk # These parameters are only used by the `export_for_nextstrain` rule and shouldn't need to be modified. 
# To modify the s3 _source_ bucket, specify this directly in the `inputs` section of the config. @@ -14,8 +13,17 @@ S3_DST_ORIGINS: ["gisaid"] upload: - build-files +# Deploy and Slack options are related to Nextstrain live builds and don't need to be modified for local builds +deploy_url: s3://nextstrain-data +slack_token: ~ +slack_channel: "#ncov-gisaid-updates" + genes: ["ORF1a", "ORF1b", "S", "ORF3a", "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "N", "ORF9b"] use_nextalign: true +include_hcov19_prefix: True + +files: + description: "nextstrain_profiles/nextstrain-gisaid/nextstrain_description.md" # Note: unaligned sequences are provided as "aligned" sequences to avoid an initial full-DB alignment # as we re-align everything after subsampling. @@ -29,192 +37,371 @@ inputs: # For each build we specify a subsampling scheme via an explicit key. # These subsampling schemes are defined at the bottom of this file. # (They override the defaults) +# North America and Oceania are subsampled at the "division" level +# Africa, Asia, Europe and South America are subsampled at the "country" level builds: reference: - subsampling_scheme: nextstrain_clades + subsampling_scheme: nextstrain_reference auspice_config: "nextstrain_profiles/nextstrain-gisaid/global_auspice_config.json" title: Genomic epidemiology of SARS-CoV-2 with clade-focused subsampling - global: - subsampling_scheme: nextstrain_region_global + global_6m: + subsampling_scheme: nextstrain_global_6m + auspice_config: "nextstrain_profiles/nextstrain-gisaid/global_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally over the past 6 months + global_all-time: + subsampling_scheme: nextstrain_global_all_time auspice_config: "nextstrain_profiles/nextstrain-gisaid/global_auspice_config.json" - title: Genomic epidemiology of SARS-CoV-2 with global subsampling - africa: - subsampling_scheme: nextstrain_region + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused 
globally since pandemic start + africa_6m: + subsampling_scheme: nextstrain_region_grouped_by_country_6m region: Africa auspice_config: "nextstrain_profiles/nextstrain-gisaid/africa_auspice_config.json" - title: Genomic epidemiology of SARS-CoV-2 with Africa-focused subsampling - asia: - subsampling_scheme: nextstrain_region + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa over the past 6 months + africa_all-time: + subsampling_scheme: nextstrain_region_grouped_by_country_all_time + region: Africa + auspice_config: "nextstrain_profiles/nextstrain-gisaid/africa_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa since pandemic start + asia_6m: + subsampling_scheme: nextstrain_region_grouped_by_country_6m region: Asia auspice_config: "nextstrain_profiles/nextstrain-gisaid/asia_auspice_config.json" - title: Genomic epidemiology of SARS-CoV-2 with Asia-focused subsampling - europe: - subsampling_scheme: nextstrain_region_grouped_by_country + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 6 months + asia_all-time: + subsampling_scheme: nextstrain_region_grouped_by_country_all_time + region: Asia + auspice_config: "nextstrain_profiles/nextstrain-gisaid/asia_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia since pandemic start + europe_6m: + subsampling_scheme: nextstrain_region_grouped_by_country_6m region: Europe auspice_config: "nextstrain_profiles/nextstrain-gisaid/europe_auspice_config.json" - title: Genomic epidemiology of SARS-CoV-2 with Europe-focused subsampling - north-america: - subsampling_scheme: nextstrain_region + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe over the past 6 months + europe_all-time: + subsampling_scheme: nextstrain_region_grouped_by_country_all_time + region: Europe + auspice_config: 
"nextstrain_profiles/nextstrain-gisaid/europe_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe since pandemic start + north-america_6m: + subsampling_scheme: nextstrain_region_grouped_by_division_6m + region: North America + auspice_config: "nextstrain_profiles/nextstrain-gisaid/north-america_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America over the past 6 months + north-america_all-time: + subsampling_scheme: nextstrain_region_grouped_by_division_all_time region: North America auspice_config: "nextstrain_profiles/nextstrain-gisaid/north-america_auspice_config.json" - title: Genomic epidemiology of SARS-CoV-2 with North America-focused subsampling - oceania: - subsampling_scheme: nextstrain_region + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America since pandemic start + oceania_6m: + subsampling_scheme: nextstrain_region_grouped_by_division_6m + region: Oceania + auspice_config: "nextstrain_profiles/nextstrain-gisaid/oceania_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania over the past 6 months + oceania_all-time: + subsampling_scheme: nextstrain_region_grouped_by_division_all_time region: Oceania auspice_config: "nextstrain_profiles/nextstrain-gisaid/oceania_auspice_config.json" - title: Genomic epidemiology of SARS-CoV-2 with Oceania-focused subsampling - south-america: - subsampling_scheme: nextstrain_region + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania since pandemic start + south-america_6m: + subsampling_scheme: nextstrain_region_grouped_by_country_6m + region: South America + auspice_config: "nextstrain_profiles/nextstrain-gisaid/south-america_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America over the past 6 months + south-america_all-time: + subsampling_scheme: 
nextstrain_region_grouped_by_country_all_time region: South America auspice_config: "nextstrain_profiles/nextstrain-gisaid/south-america_auspice_config.json" - title: Genomic epidemiology of SARS-CoV-2 with South America-focused subsampling + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America since pandemic start -# remove S dropout sequences and sequences without division label in US +# remove sequences without division label in US filter: - exclude_where: "division='USA' purpose_of_sequencing='S dropout'" + exclude_where: "division='USA'" +subsampling: -# if different traits should be reconstructed for some builds, specify here -# otherwise the default trait config in defaults/parameters.yaml will used -traits: - global: - sampling_bias_correction: 2.5 - columns: ["region"] - europe: - sampling_bias_correction: 2.5 - columns: ["country"] - africa: - sampling_bias_correction: 2.5 - columns: ["country"] - asia: - sampling_bias_correction: 2.5 - columns: ["country"] - south-america: - sampling_bias_correction: 2.5 - columns: ["country"] - north-america: - sampling_bias_correction: 2.5 - columns: ["division"] - oceania: - sampling_bias_correction: 2.5 - columns: ["division"] - -files: - description: "nextstrain_profiles/nextstrain-gisaid/nextstrain_description.md" + # Custom subsampling logic for group by clade + nextstrain_reference: + clades: + group_by: "Nextstrain_clade" + max_sequences: 300 -# Deploy and Slack options are related to Nextstrain live builds and don't need to be modified for local builds -deploy_url: s3://nextstrain-data -slack_token: ~ -slack_channel: "#ncov-gisaid-updates" + # Custom subsampling logic for regions over 6m + # Grouping by division for North America and Oceania + # 4000 total + # 4:1 ratio of recent to early + # 4:1 ratio of focal to context + nextstrain_region_grouped_by_division_6m: + # Early focal samples for region + focal_early: + group_by: "division year month" + max_sequences: 640 + max_date: 
"--max-date 6M" + exclude: "--exclude-where 'region!={region}'" + # Early contextual samples from the rest of the world + context_early: + group_by: "country year month" + max_sequences: 160 + max_date: "--max-date 6M" + exclude: "--exclude-where 'region={region}'" + # Recent focal samples for region + focal_recent: + group_by: "division year month" + max_sequences: 2560 + min_date: "--min-date 6M" + exclude: "--exclude-where 'region!={region}'" + # Recent contextual samples from the rest of the world + context_recent: + group_by: "country year month" + max_sequences: 640 + min_date: "--min-date 6M" + exclude: "--exclude-where 'region={region}'" -subsampling: - # Custom subsampling logic for regions - nextstrain_region: + # Custom subsampling logic for regions over all-time + # Grouping by division for North America and Oceania + # 4000 total + # 4:1 ratio of focal to context + nextstrain_region_grouped_by_division_all_time: # Focal samples for region - region_early: + focal: group_by: "division year month" + max_sequences: 3200 + exclude: "--exclude-where 'region!={region}'" + # Contextual samples from the rest of the world + context: + group_by: "country year month" max_sequences: 800 + exclude: "--exclude-where 'region={region}'" + + # Custom subsampling logic for regions over 6m + # Grouping by country for Africa, Asia, Europe and South America + # 4000 total + # 4:1 ratio of recent to early + # 4:1 ratio of focal to context + nextstrain_region_grouped_by_country_6m: + # Early focal samples for region + focal_early: + group_by: "country year month" + max_sequences: 640 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!={region}'" - # Contextual samples for region from the rest of the world - global_early: + # Early contextual samples from the rest of the world + context_early: group_by: "country year month" - max_sequences: 600 + max_sequences: 160 + max_date: "--max-date 6M" + exclude: "--exclude-where 'region={region}'" + # Recent focal samples for
region + focal_recent: + group_by: "country year month" + max_sequences: 2560 + min_date: "--min-date 6M" + exclude: "--exclude-where 'region!={region}'" + # Recent contextual samples from the rest of the world + context_recent: + group_by: "country year month" + max_sequences: 640 + min_date: "--min-date 6M" exclude: "--exclude-where 'region={region}'" - region_late: - group_by: "division year month" - max_sequences: 1700 + # Custom subsampling logic for regions over all-time + # Grouping by country for Africa, Asia, Europe and South America + # 4000 total + # 4:1 ratio of focal to context + nextstrain_region_grouped_by_country_all_time: + # Focal samples for region + focal: + group_by: "country year month" + max_sequences: 3200 exclude: "--exclude-where 'region!={region}'" - # Contextual samples for region from the rest of the world - global_late: + # Contextual samples from the rest of the world + context: group_by: "country year month" - max_sequences: 1000 + max_sequences: 800 exclude: "--exclude-where 'region={region}'" - # Custom subsampling logic for global region.
- nextstrain_region_global: + # Custom subsampling logic for global region over 6m + # 4000 total + # 4:1 ratio of recent to early + # all regions equal except Oceania at 33% + nextstrain_global_6m: africa_early: group_by: "country year month" - max_sequences: 200 + max_sequences: 150 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Africa'" asia_early: group_by: "country year month" - max_sequences: 300 + max_sequences: 150 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Asia'" europe_early: group_by: "country year month" - max_sequences: 200 + max_sequences: 150 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Europe'" north_america_early: group_by: "division year month" - max_sequences: 200 + max_sequences: 150 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=North America'" south_america_early: group_by: "country year month" - max_sequences: 200 + max_sequences: 150 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=South America'" oceania_early: group_by: "division year month" - max_sequences: 200 + max_sequences: 50 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Oceania'" - - africa_late: + africa_recent: group_by: "country year month" - max_sequences: 400 + max_sequences: 600 + min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Africa'" - asia_late: + asia_recent: group_by: "country year month" - max_sequences: 400 + max_sequences: 600 + min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Asia'" - europe_late: + europe_recent: group_by: "country year month" max_sequences: 600 + min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Europe'" - north_america_late: + north_america_recent: group_by: "division year month" max_sequences: 600 + min_date: "--min-date 6M" exclude: "--exclude-where 'region!=North America'" - south_america_late: + south_america_recent: group_by: "country year month" - max_sequences: 400 + max_sequences: 600 + min_date:
"--min-date 6M" exclude: "--exclude-where 'region!=South America'" - oceania_late: + oceania_recent: group_by: "division year month" - max_sequences: 300 + max_sequences: 200 + min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Oceania'" - # Custom subsampling for regions like Europe where grouping by country - # is the smallest resolution requied - nextstrain_region_grouped_by_country: - # Focal samples for region - region_late: + # Custom subsampling logic for global region over all-time + # 4000 total + # all regions equal except Oceania at 33% + nextstrain_global_all_time: + africa: group_by: "country year month" - max_sequences: 1700 - exclude: "--exclude-where 'region!={region}'" - # Contextual samples for region from the rest of the world - global_late: + max_sequences: 750 + exclude: "--exclude-where 'region!=Africa'" + asia: group_by: "country year month" - max_sequences: 1000 - exclude: "--exclude-where 'region={region}'" - # Focal samples for region - region_early: + max_sequences: 750 + exclude: "--exclude-where 'region!=Asia'" + europe: group_by: "country year month" - max_sequences: 800 - exclude: "--exclude-where 'region!={region}'" - # Contextual samples for region from the rest of the world - global_early: + max_sequences: 750 + exclude: "--exclude-where 'region!=Europe'" + north_america: + group_by: "division year month" + max_sequences: 750 + exclude: "--exclude-where 'region!=North America'" + south_america: group_by: "country year month" - max_sequences: 500 - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for group by clade - nextstrain_clades: - clades: - group_by: "Nextstrain_clade" - max_sequences: 300 + max_sequences: 750 + exclude: "--exclude-where 'region!=South America'" + oceania: + group_by: "division year month" + max_sequences: 250 + exclude: "--exclude-where 'region!=Oceania'" + +# if different traits should be reconstructed for some builds, specify here +# otherwise the default trait config 
in defaults/parameters.yaml will be used +traits: + global_6m: + sampling_bias_correction: 2.5 + columns: ["region"] + global_all-time: + sampling_bias_correction: 2.5 + columns: ["region"] + africa_6m: + sampling_bias_correction: 2.5 + columns: ["country"] + africa_all-time: + sampling_bias_correction: 2.5 + columns: ["country"] + asia_6m: + sampling_bias_correction: 2.5 + columns: ["country"] + asia_all-time: + sampling_bias_correction: 2.5 + columns: ["country"] + europe_6m: + sampling_bias_correction: 2.5 + columns: ["country"] + europe_all-time: + sampling_bias_correction: 2.5 + columns: ["country"] + north-america_6m: + sampling_bias_correction: 2.5 + columns: ["division"] + north-america_all-time: + sampling_bias_correction: 2.5 + columns: ["division"] + oceania_6m: + sampling_bias_correction: 2.5 + columns: ["division"] + oceania_all-time: + sampling_bias_correction: 2.5 + columns: ["division"] + south-america_6m: + sampling_bias_correction: 2.5 + columns: ["country"] + south-america_all-time: + sampling_bias_correction: 2.5 + columns: ["country"] -# Define frequencies parameters.
+# Define frequencies parameters +# Target frequencies to "6m" vs "all-time" builds frequencies: + global_6m: recent_days_to_censor: 7 - -include_hcov19_prefix: True + min_date: "6M" + global_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + africa_6m: + recent_days_to_censor: 7 + min_date: "6M" + africa_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + asia_6m: + recent_days_to_censor: 7 + min_date: "6M" + asia_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + europe_6m: + recent_days_to_censor: 7 + min_date: "6M" + europe_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + north-america_6m: + recent_days_to_censor: 7 + min_date: "6M" + north-america_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + oceania_6m: + recent_days_to_censor: 7 + min_date: "6M" + oceania_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + south-america_6m: + recent_days_to_censor: 7 + min_date: "6M" + south-america_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" diff --git a/nextstrain_profiles/nextstrain-gisaid/nextstrain_description.md b/nextstrain_profiles/nextstrain-gisaid/nextstrain_description.md index 12b972c2b..c4247ada9 100644 --- a/nextstrain_profiles/nextstrain-gisaid/nextstrain_description.md +++ b/nextstrain_profiles/nextstrain-gisaid/nextstrain_description.md @@ -1,14 +1,31 @@ -Compiled Nextstrain SARS-CoV-2 resources are available at [nextstrain.org/sars-cov-2](https://nextstrain.org/sars-cov-2/). Follow [@nextstrain](https://twitter.com/nextstrain) for continual data updates. +Compiled Nextstrain SARS-CoV-2 resources are available at [nextstrain.org/sars-cov-2](https://nextstrain.org/sars-cov-2/). Follow [@nextstrain](https://twitter.com/nextstrain) for updates. -This phylogeny shows evolutionary relationships of SARS-CoV-2 viruses from the ongoing COVID-19 pandemic. 
Although the genetic relationships among sampled viruses are quite clear, there is considerable uncertainty surrounding estimates of specific transmission dates and in reconstruction of geographic spread. Please be aware that specific inferred geographic transmission patterns and temporal estimates are only a hypothesis. +This phylogeny shows evolutionary relationships of SARS-CoV-2 viruses from the ongoing COVID-19 pandemic. Although the genetic relationships among sampled viruses are generally quite clear, there is considerable uncertainty surrounding estimates of specific transmission dates and in reconstruction of geographic spread. Please be aware that specific inferred geographic transmission patterns and temporal estimates are only a hypothesis. -There are millions of complete SARS-CoV-2 genomes available and this number increases every day. This visualization can only handle ~4000 genomes in a single view for performance and legibility reasons. Because of this we subsample available genome data for these analysis views. Our primary [global analysis](/ncov/global/) subsamples to ~600 genomes per continental region with ~400 from the previous 4 months and ~200 from before this. This results in a more equitable global sequence distribution, but hides samples available from regions that are doing lots of sequencing. To mitigate against this, we've set up separate analyses to focus on particular regions. They are available on the "Dataset" dropdown on the left or by clicking on the following links: [Africa](/ncov/africa?f_region=Africa), [Asia](/ncov/asia?f_region=Asia), [Europe](/ncov/europe?f_region=Europe), [North America](/ncov/north-america?f_region=North%20America), [Oceania](/ncov/oceania?f_region=Oceania) and [South America](/ncov/south-america?f_region=South%20America). +There are millions of complete SARS-CoV-2 genomes available and this number increases every day. 
This visualization can only handle ~4000 genomes in a single view for performance and legibility reasons. Because of this we subsample available genome data for our analysis views. We provision multiple views to focus subsampling on different geographic regions and different time periods. These views are available through the "Dataset" dropdown on the left or by clicking on the following links: + +region | time period | URL +------------- | ------------- | --- +global | past 6 months | [/ncov/gisaid/global/6m](/ncov/gisaid/global/6m) +Africa | past 6 months | [/ncov/gisaid/africa/6m](/ncov/gisaid/africa/6m?f_region=Africa) +Asia | past 6 months | [/ncov/gisaid/asia/6m](/ncov/gisaid/asia/6m?f_region=Asia) +Europe | past 6 months | [/ncov/gisaid/europe/6m](/ncov/gisaid/europe/6m?f_region=Europe) +North America | past 6 months | [/ncov/gisaid/north-america/6m](/ncov/gisaid/north-america/6m?f_region=North%20America) +Oceania | past 6 months | [/ncov/gisaid/oceania/6m](/ncov/gisaid/oceania/6m?f_region=Oceania) +South America | past 6 months | [/ncov/gisaid/south-america/6m](/ncov/gisaid/south-america/6m?f_region=South%20America) +global | all time | [/ncov/gisaid/global/all-time](/ncov/gisaid/global/all-time) +Africa | all time | [/ncov/gisaid/africa/all-time](/ncov/gisaid/africa/all-time?f_region=Africa) +Asia | all time | [/ncov/gisaid/asia/all-time](/ncov/gisaid/asia/all-time?f_region=Asia) +Europe | all time | [/ncov/gisaid/europe/all-time](/ncov/gisaid/europe/all-time?f_region=Europe) +North America | all time | [/ncov/gisaid/north-america/all-time](/ncov/gisaid/north-america/all-time?f_region=North%20America) +Oceania | all time | [/ncov/gisaid/oceania/all-time](/ncov/gisaid/oceania/all-time?f_region=Oceania) +South America | all time | [/ncov/gisaid/south-america/all-time](/ncov/gisaid/south-america/all-time?f_region=South%20America) Site numbering and genome structure uses [Wuhan-Hu-1/2019](https://www.ncbi.nlm.nih.gov/nuccore/MN908947) as reference. 
The phylogeny is rooted relative to early samples from Wuhan. Temporal resolution assumes a nucleotide substitution rate of 8 × 10^-4 subs per site per year. Mutational fitness is calculated using results from [Obermeyer et al (under review)](https://www.medrxiv.org/content/10.1101/2021.09.07.21263228v1). Full details on bioinformatic processing can be found [here](https://github.com/nextstrain/ncov). -We gratefully acknowledge the authors, originating and submitting laboratories of the genetic sequence and metadata made available through [GISAID](https://gisaid.org) on which this research is based. An attribution table is available by clicking on "Download Data" at the bottom of the page and then clicking on "Acknowledgments" in the resulting dialog box. +We gratefully acknowledge the authors, originating and submitting laboratories of the genetic sequences and metadata made available through [GISAID](https://gisaid.org) on which this research is based. An attribution table is available by clicking on "Download Data" at the bottom of the page and then clicking on "Acknowledgments" in the resulting dialog box. At the specific request of GISAID, we: - maintain the prefix `hCoV-19/` in the names of viral isolates - - disable download of full metadata TSV and provide instead an acknowledgments TSV in the "download data" link at the bottom of the page + - disable download of full metadata TSV and provide instead an acknowledgments TSV in the "Download Data" link at the bottom of the page - refrain from sharing alignments or other intermediate files computed in our pipeline diff --git a/nextstrain_profiles/nextstrain-gisaid/subsampling_ranges.smk b/nextstrain_profiles/nextstrain-gisaid/subsampling_ranges.smk deleted file mode 100644 index 8609eef7f..000000000 --- a/nextstrain_profiles/nextstrain-gisaid/subsampling_ranges.smk +++ /dev/null @@ -1,14 +0,0 @@ -import datetime - -# Set subsampling max date to today. 
-today = datetime.date.today() - -# Set the earliest date to roughly 4 months ago (18 weeks). -early_late_cutoff = today - datetime.timedelta(weeks=18) - -for build in config["subsampling"]: - for scheme in config["subsampling"][build]: - if "_early" in scheme: - config["subsampling"][build][scheme]["max_date"] = f"--max-date {early_late_cutoff.strftime('%Y-%m-%d')}" - if "_late" in scheme: - config["subsampling"][build][scheme]["min_date"] = f"--min-date {early_late_cutoff.strftime('%Y-%m-%d')}" diff --git a/nextstrain_profiles/nextstrain-open/builds.yaml b/nextstrain_profiles/nextstrain-open/builds.yaml index 37d273516..d426c387e 100644 --- a/nextstrain_profiles/nextstrain-open/builds.yaml +++ b/nextstrain_profiles/nextstrain-open/builds.yaml @@ -3,7 +3,6 @@ auspice_json_prefix: ncov_open # Define custom rules for pre- or post-standard workflow processing of data. custom_rules: - workflow/snakemake_rules/export_for_nextstrain.smk - - nextstrain_profiles/nextstrain-gisaid/subsampling_ranges.smk # These parameters are only used by the `export_for_nextstrain` rule and shouldn't need to be modified. # To modify the s3 _source_ bucket, specify this directly in the `inputs` section of the config. @@ -14,6 +13,18 @@ S3_DST_ORIGINS: ["open"] upload: - build-files +# Deploy and Slack options are related to Nextstrain live builds and don't need to be modified for local builds +deploy_url: s3://nextstrain-data +slack_token: ~ +slack_channel: "#ncov-genbank-updates" + +genes: ["ORF1a", "ORF1b", "S", "ORF3a", "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "N", "ORF9b"] +use_nextalign: true +include_hcov19_prefix: False + +files: + description: "nextstrain_profiles/nextstrain-open/nextstrain_description.md" + # Note: unaligned sequences are provided as "aligned" sequences to avoid an initial full-DB alignment # as we re-align everything after subsampling. inputs: @@ -26,167 +37,376 @@ inputs: # For each build we specify a subsampling scheme via an explicit key. 
# These subsampling schemes are defined at the bottom of this file. # (They override the defaults) +# North America and Oceania are subsampled at the "division" level +# Africa, Asia, Europe and South America are subsampled at the "country" level builds: reference: - subsampling_scheme: nextstrain_clades - auspice_config: nextstrain_profiles/nextstrain-open/global_auspice_config.json + subsampling_scheme: nextstrain_reference + auspice_config: "nextstrain_profiles/nextstrain-open/global_auspice_config.json" title: Genomic epidemiology of SARS-CoV-2 with clade-focused subsampling - global: - subsampling_scheme: nextstrain_region_global - auspice_config: nextstrain_profiles/nextstrain-open/global_auspice_config.json - title: Genomic epidemiology of SARS-CoV-2 with global subsampling - africa: - subsampling_scheme: nextstrain_region + global_6m: + subsampling_scheme: nextstrain_global_6m + auspice_config: "nextstrain_profiles/nextstrain-open/global_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally over the past 6 months + global_all-time: + subsampling_scheme: nextstrain_global_all_time + auspice_config: "nextstrain_profiles/nextstrain-open/global_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally since pandemic start + africa_6m: + subsampling_scheme: nextstrain_region_grouped_by_country_6m + region: Africa + auspice_config: "nextstrain_profiles/nextstrain-open/africa_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa over the past 6 months + africa_all-time: + subsampling_scheme: nextstrain_region_grouped_by_country_all_time region: Africa - auspice_config: nextstrain_profiles/nextstrain-open/africa_auspice_config.json - title: Genomic epidemiology of SARS-CoV-2 with Africa-focused subsampling - asia: - subsampling_scheme: nextstrain_region + auspice_config: "nextstrain_profiles/nextstrain-open/africa_auspice_config.json" 
+ title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa since pandemic start + asia_6m: + subsampling_scheme: nextstrain_region_grouped_by_country_6m region: Asia - auspice_config: nextstrain_profiles/nextstrain-open/asia_auspice_config.json - title: Genomic epidemiology of SARS-CoV-2 with Asia-focused subsampling - europe: - subsampling_scheme: nextstrain_region + auspice_config: "nextstrain_profiles/nextstrain-open/asia_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 6 months + asia_all-time: + subsampling_scheme: nextstrain_region_grouped_by_country_all_time + region: Asia + auspice_config: "nextstrain_profiles/nextstrain-open/asia_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia since pandemic start + europe_6m: + subsampling_scheme: nextstrain_region_grouped_by_country_6m + region: Europe + auspice_config: "nextstrain_profiles/nextstrain-open/europe_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe over the past 6 months + europe_all-time: + subsampling_scheme: nextstrain_region_grouped_by_country_all_time region: Europe - auspice_config: nextstrain_profiles/nextstrain-open/europe_auspice_config.json - title: Genomic epidemiology of SARS-CoV-2 with Europe-focused subsampling - north-america: - subsampling_scheme: nextstrain_region + auspice_config: "nextstrain_profiles/nextstrain-open/europe_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe since pandemic start + north-america_6m: + subsampling_scheme: nextstrain_region_grouped_by_division_6m region: North America - auspice_config: nextstrain_profiles/nextstrain-open/north-america_auspice_config.json - title: Genomic epidemiology of SARS-CoV-2 with North America-focused subsampling - oceania: - subsampling_scheme: nextstrain_region + auspice_config: 
"nextstrain_profiles/nextstrain-open/north-america_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America over the past 6 months + north-america_all-time: + subsampling_scheme: nextstrain_region_grouped_by_division_all_time + region: North America + auspice_config: "nextstrain_profiles/nextstrain-open/north-america_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America since pandemic start + oceania_6m: + subsampling_scheme: nextstrain_region_grouped_by_division_6m + region: Oceania + auspice_config: "nextstrain_profiles/nextstrain-open/oceania_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania over the past 6 months + oceania_all-time: + subsampling_scheme: nextstrain_region_grouped_by_division_all_time region: Oceania - auspice_config: nextstrain_profiles/nextstrain-open/oceania_auspice_config.json - title: Genomic epidemiology of SARS-CoV-2 with Oceania-focused subsampling - south-america: - subsampling_scheme: nextstrain_region + auspice_config: "nextstrain_profiles/nextstrain-open/oceania_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania since pandemic start + south-america_6m: + subsampling_scheme: nextstrain_region_grouped_by_country_6m region: South America - auspice_config: nextstrain_profiles/nextstrain-open/south-america_auspice_config.json - title: Genomic epidemiology of SARS-CoV-2 with South America-focused subsampling - + auspice_config: "nextstrain_profiles/nextstrain-open/south-america_auspice_config.json" + title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America over the past 6 months + south-america_all-time: + subsampling_scheme: nextstrain_region_grouped_by_country_all_time + region: South America + auspice_config: "nextstrain_profiles/nextstrain-open/south-america_auspice_config.json" + title: Genomic epidemiology 
of SARS-CoV-2 with subsampling focused on South America since pandemic start -# remove S dropout sequences and sequences without division label in US +# remove sequences without division label in US filter: - exclude_where: "division='USA' purpose_of_sequencing='S dropout'" - -# if different traits should be reconstructed for some builds, specify here -# otherwise the default trait config in defaults/parameters.yaml will used -traits: - global: - sampling_bias_correction: 2.5 - columns: ["region"] - europe: - sampling_bias_correction: 2.5 - columns: ["country"] - africa: - sampling_bias_correction: 2.5 - columns: ["country"] - asia: - sampling_bias_correction: 2.5 - columns: ["country"] - south-america: - sampling_bias_correction: 2.5 - columns: ["country"] - north-america: - sampling_bias_correction: 2.5 - columns: ["division"] - oceania: - sampling_bias_correction: 2.5 - columns: ["division"] + exclude_where: "division='USA'" -files: - description: "nextstrain_profiles/nextstrain-open/nextstrain_description.md" +subsampling: -# GenBank data includes "Wuhan-Hu-1/2019" which we use as the root for this build -# as Wuhan/Hu-1/2019 is not in the data. 
-refine: - root: "Wuhan-Hu-1/2019" + # Custom subsampling logic for group by clade + nextstrain_reference: + clades: + group_by: "Nextstrain_clade" + max_sequences: 300 -# Deploy and Slack options are related to Nextstrain live builds and don't need to be modified for local builds -deploy_url: s3://nextstrain-data -slack_token: ~ -slack_channel: "#ncov-genbank-updates" + # Custom subsampling logic for regions over 6m + # Grouping by division for North America and Oceania + # 4000 total + # 4:1 ratio of recent to early + # 4:1 ratio of focal to context + nextstrain_region_grouped_by_division_6m: + # Early focal samples for region + focal_early: + group_by: "division year month" + max_sequences: 640 + max_date: "--max-date 6M" + exclude: "--exclude-where 'region!={region}'" + # Early contextual samples from the rest of the world + context_early: + group_by: "country year month" + max_sequences: 160 + max_date: "--max-date 6M" + exclude: "--exclude-where 'region={region}'" + # Recent focal samples for region + focal_recent: + group_by: "division year month" + max_sequences: 2560 + min_date: "--min-date 6M" + exclude: "--exclude-where 'region!={region}'" + # Recent contextual samples from the rest of the world + context_recent: + group_by: "country year month" + max_sequences: 640 + min_date: "--min-date 6M" + exclude: "--exclude-where 'region={region}'" -subsampling: - # Custom subsampling logic for regions - nextstrain_region: + # Custom subsampling logic for regions over all-time + # Grouping by division for North America and Oceania + # 4000 total + # 4:1 ratio of focal to context + nextstrain_region_grouped_by_division_all_time: # Focal samples for region - region_early: + focal: group_by: "division year month" + max_sequences: 3200 + exclude: "--exclude-where 'region!={region}'" + # Contextual samples from the rest of the world + context: + group_by: "country year month" max_sequences: 800 + exclude: "--exclude-where 'region={region}'" + + # Custom subsampling 
logic for regions over 6m + # Grouping by country for Africa, Asia, Europe and South America + # 4000 total + # 4:1 ratio of recent to early + # 4:1 ratio of focal to context + nextstrain_region_grouped_by_country_6m: + # Early focal samples for region + focal_early: + group_by: "country year month" + max_sequences: 640 + max_date: "--max-date 6M" + exclude: "--exclude-where 'region!={region}'" + # Early contextual samples from the rest of the world + context_early: + group_by: "country year month" + max_sequences: 160 + max_date: "--max-date 6M" + exclude: "--exclude-where 'region={region}'" + # Recent focal samples for region + focal_recent: + group_by: "country year month" + max_sequences: 2560 + min_date: "--min-date 6M" exclude: "--exclude-where 'region!={region}'" - # Contextual samples for region from the rest of the world - global_early: + # Recent contextual samples from the rest of the world + context_recent: group_by: "country year month" - max_sequences: 400 + max_sequences: 640 + min_date: "--min-date 6M" exclude: "--exclude-where 'region={region}'" - region_late: - group_by: "division year month" - max_sequences: 2000 + # Custom subsampling logic for regions over all-time + # Grouping by country for Africa, Asia, Europe and South America + # 4000 total + # 4:1 ratio of focal to context + nextstrain_region_grouped_by_country_all_time: + # Focal samples for region + focal: + group_by: "country year month" + max_sequences: 3200 exclude: "--exclude-where 'region!={region}'" - # Contextual samples for region from the rest of the world - global_late: + # Contextual samples from the rest of the world + context: group_by: "country year month" max_sequences: 800 exclude: "--exclude-where 'region={region}'" - # Custom subsampling logic for global region. 
- nextstrain_region_global: + # Custom subsampling logic for global region over 6m + # 4000 total + # 4:1 ratio of recent to early + # all regions equal except Oceania at 33% + nextstrain_global_6m: africa_early: group_by: "country year month" - max_sequences: 200 + max_sequences: 150 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Africa'" asia_early: group_by: "country year month" - max_sequences: 200 + max_sequences: 150 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Asia'" europe_early: group_by: "country year month" - max_sequences: 200 + max_sequences: 150 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Europe'" north_america_early: group_by: "division year month" - max_sequences: 200 + max_sequences: 150 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=North America'" south_america_early: group_by: "country year month" - max_sequences: 200 + max_sequences: 150 + max_date: "--max-date 6M" exclude: "--exclude-where 'region!=South America'" oceania_early: + group_by: "division year month" + max_sequences: 50 + max_date: "--max-date 6M" + exclude: "--exclude-where 'region!=Oceania'" + africa_recent: + group_by: "country year month" + max_sequences: 600 + min_date: "--min-date 6M" + exclude: "--exclude-where 'region!=Africa'" + asia_recent: + group_by: "country year month" + max_sequences: 600 + min_date: "--min-date 6M" + exclude: "--exclude-where 'region!=Asia'" + europe_recent: + group_by: "country year month" + max_sequences: 600 + min_date: "--min-date 6M" + exclude: "--exclude-where 'region!=Europe'" + north_america_recent: + group_by: "division year month" + max_sequences: 600 + min_date: "--min-date 6M" + exclude: "--exclude-where 'region!=North America'" + south_america_recent: + group_by: "country year month" + max_sequences: 600 + min_date: "--min-date 6M" + exclude: "--exclude-where 'region!=South America'" + oceania_recent: group_by: "division year month" max_sequences: 200 + min_date: 
"--min-date 6M" exclude: "--exclude-where 'region!=Oceania'" - africa_late: + # Custom subsampling logic for global region over all-time + # 4000 total + # all regions equal except Oceania at 33% + nextstrain_global_all_time: + africa: group_by: "country year month" - max_sequences: 500 + max_sequences: 750 exclude: "--exclude-where 'region!=Africa'" - asia_late: + asia: group_by: "country year month" - max_sequences: 500 + max_sequences: 750 exclude: "--exclude-where 'region!=Asia'" - europe_late: + europe: group_by: "country year month" - max_sequences: 500 + max_sequences: 750 exclude: "--exclude-where 'region!=Europe'" - north_america_late: + north_america: group_by: "division year month" - max_sequences: 500 + max_sequences: 750 exclude: "--exclude-where 'region!=North America'" - south_america_late: + south_america: group_by: "country year month" - max_sequences: 500 + max_sequences: 750 exclude: "--exclude-where 'region!=South America'" - oceania_late: + oceania: group_by: "division year month" - max_sequences: 300 + max_sequences: 250 exclude: "--exclude-where 'region!=Oceania'" - - # Custom subsampling logic for clades. - nextstrain_clades: - clades: - group_by: "Nextstrain_clade" - max_sequences: 300 + +# GenBank data includes "Wuhan-Hu-1/2019" which we use as the root for this build +# as Wuhan/Hu-1/2019 is not in the data. 
+refine: + root: "Wuhan-Hu-1/2019" + +# if different traits should be reconstructed for some builds, specify here +# otherwise the default trait config in defaults/parameters.yaml will be used +traits: + global_6m: + sampling_bias_correction: 2.5 + columns: ["region"] + global_all-time: + sampling_bias_correction: 2.5 + columns: ["region"] + africa_6m: + sampling_bias_correction: 2.5 + columns: ["country"] + africa_all-time: + sampling_bias_correction: 2.5 + columns: ["country"] + asia_6m: + sampling_bias_correction: 2.5 + columns: ["country"] + asia_all-time: + sampling_bias_correction: 2.5 + columns: ["country"] + europe_6m: + sampling_bias_correction: 2.5 + columns: ["country"] + europe_all-time: + sampling_bias_correction: 2.5 + columns: ["country"] + north-america_6m: + sampling_bias_correction: 2.5 + columns: ["division"] + north-america_all-time: + sampling_bias_correction: 2.5 + columns: ["division"] + oceania_6m: + sampling_bias_correction: 2.5 + columns: ["division"] + oceania_all-time: + sampling_bias_correction: 2.5 + columns: ["division"] + south-america_6m: + sampling_bias_correction: 2.5 + columns: ["country"] + south-america_all-time: + sampling_bias_correction: 2.5 + columns: ["country"] + +# Define frequencies parameters +# Target frequencies to "6m" vs "all-time" builds +frequencies: + global_6m: + recent_days_to_censor: 7 + min_date: "6M" + global_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + africa_6m: + recent_days_to_censor: 7 + min_date: "6M" + africa_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + asia_6m: + recent_days_to_censor: 7 + min_date: "6M" + asia_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + europe_6m: + recent_days_to_censor: 7 + min_date: "6M" + europe_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + north-america_6m: + recent_days_to_censor: 7 + min_date: "6M" + north-america_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + oceania_6m: + 
recent_days_to_censor: 7 + min_date: "6M" + oceania_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" + south-america_6m: + recent_days_to_censor: 7 + min_date: "6M" + south-america_all-time: + recent_days_to_censor: 7 + min_date: "2020-01-01" diff --git a/nextstrain_profiles/nextstrain-open/nextstrain_description.md b/nextstrain_profiles/nextstrain-open/nextstrain_description.md index 858953ea6..22c2143e5 100644 --- a/nextstrain_profiles/nextstrain-open/nextstrain_description.md +++ b/nextstrain_profiles/nextstrain-open/nextstrain_description.md @@ -1,38 +1,49 @@ -Compiled Nextstrain SARS-CoV-2 resources are available at [nextstrain.org/sars-cov-2](https://nextstrain.org/sars-cov-2/). Follow [@nextstrain](https://twitter.com/nextstrain) for continual data updates. - -This phylogeny shows evolutionary relationships of SARS-CoV-2 viruses from the ongoing COVID-19 pandemic. Although the genetic relationships among sampled viruses are quite clear, there is considerable uncertainty surrounding estimates of specific transmission dates and in reconstruction of geographic spread. Please be aware that specific inferred geographic transmission patterns and temporal estimates are only a hypothesis. - -There are hundreds of thousands of complete SARS-CoV-2 genomes available on open databases and this number increases every day, but geographical representation varies. This visualization can only handle ~3000 genomes in a single view for performance and legibility reasons. Because of this we subsample available genome data for these analysis views. Our primary [global analysis](/ncov/open/global/) subsamples to ~600 genomes per continental region with ~400 from the previous 4 months and ~200 from before this. This results in a more equitable global sequence distribution, but hides samples available from regions that are doing lots of sequencing. To mitigate against this, we've set up separate analyses to focus on particular regions. 
They are available on the "Dataset" dropdown on the left or by clicking on the following links: [Africa](/ncov/open/africa?f_region=Africa), [Asia](/ncov/open/asia?f_region=Asia), [Europe](/ncov/open/europe?f_region=Europe), [North America](/ncov/open/north-america?f_region=North%20America), [Oceania](/ncov/open/oceania?f_region=Oceania) and [South America](/ncov/open/south-america?f_region=South%20America). +Compiled Nextstrain SARS-CoV-2 resources are available at [nextstrain.org/sars-cov-2](https://nextstrain.org/sars-cov-2/). Follow [@nextstrain](https://twitter.com/nextstrain) for updates. + +This phylogeny shows evolutionary relationships of SARS-CoV-2 viruses from the ongoing COVID-19 pandemic. Although the genetic relationships among sampled viruses are generally quite clear, there is considerable uncertainty surrounding estimates of specific transmission dates and in reconstruction of geographic spread. Please be aware that specific inferred geographic transmission patterns and temporal estimates are only a hypothesis. + +There are millions of complete SARS-CoV-2 genomes available on open databases and this number increases every day. This visualization can only handle ~4000 genomes in a single view for performance and legibility reasons. Because of this we subsample available genome data for our analysis views. We provision multiple views to focus subsampling on different geographic regions and different time periods. 
These views are available through the "Dataset" dropdown on the left or by clicking on the following links: + +region | time period | URL +------------- | ------------- | --- +global | past 6 months | [/ncov/open/global/6m](/ncov/open/global/6m) +Africa | past 6 months | [/ncov/open/africa/6m](/ncov/open/africa/6m?f_region=Africa) +Asia | past 6 months | [/ncov/open/asia/6m](/ncov/open/asia/6m?f_region=Asia) +Europe | past 6 months | [/ncov/open/europe/6m](/ncov/open/europe/6m?f_region=Europe) +North America | past 6 months | [/ncov/open/north-america/6m](/ncov/open/north-america/6m?f_region=North%20America) +Oceania | past 6 months | [/ncov/open/oceania/6m](/ncov/open/oceania/6m?f_region=Oceania) +South America | past 6 months | [/ncov/open/south-america/6m](/ncov/open/south-america/6m?f_region=South%20America) +global | all time | [/ncov/open/global/all-time](/ncov/open/global/all-time) +Africa | all time | [/ncov/open/africa/all-time](/ncov/open/africa/all-time?f_region=Africa) +Asia | all time | [/ncov/open/asia/all-time](/ncov/open/asia/all-time?f_region=Asia) +Europe | all time | [/ncov/open/europe/all-time](/ncov/open/europe/all-time?f_region=Europe) +North America | all time | [/ncov/open/north-america/all-time](/ncov/open/north-america/all-time?f_region=North%20America) +Oceania | all time | [/ncov/open/oceania/all-time](/ncov/open/oceania/all-time?f_region=Oceania) +South America | all time | [/ncov/open/south-america/all-time](/ncov/open/south-america/all-time?f_region=South%20America) Site numbering and genome structure uses [Wuhan-Hu-1/2019](https://www.ncbi.nlm.nih.gov/nuccore/MN908947) as reference. The phylogeny is rooted relative to early samples from Wuhan. Temporal resolution assumes a nucleotide substitution rate of 8 × 10^-4 subs per site per year. Mutational fitness is calculated using results from [Obermeyer et al (under review)](https://www.medrxiv.org/content/10.1101/2021.09.07.21263228v1). 
Full details on bioinformatic processing can be found [here](https://github.com/nextstrain/ncov). -The analysis on this page uses data from NCBI GenBank as a source following Open Data principles, such that we can make input data and intermediate files available for further analysis (see below). Open Data is data that can be freely used, re-used and redistributed by anyone - subject only, at most, to the requirement to attribute and sharealike. But be aware that not all regions are well represented in open databases and some of the above trees might lack recent data from particular geographic regions. +The analysis on this page uses data from NCBI GenBank as a source following [Open Data principles](https://opendatahandbook.org/guide/en/what-is-open-data/), such that we can make input data and intermediate files available for further analysis. Open Data is data that can be freely used, re-used and redistributed by anyone - subject only, at most, to the requirement to attribute and sharealike. But be aware that not all regions are well represented in open databases and some of the above trees might lack recent data from particular geographic regions. We gratefully acknowledge the authors, originating and submitting laboratories of the genetic sequences and metadata for sharing their work in open databases. Please note that although data generators have generously shared data in an open fashion, that does not mean there should be free license to publish on this data. Data generators should be cited where possible and collaborations should be sought in some circumstances. Please try to avoid scooping someone else's work. Reach out if uncertain. An attribution table is available by clicking on "Download Data" at the bottom of the page and then clicking on "Strain Metadata" in the resulting dialog box. 
To maximize the utility and visibility of these generously shared data, [we provide preprocessed files that can serve as a starting point for additional analyses](https://docs.nextstrain.org/projects/ncov/en/latest/reference/remote_inputs.html). -### All sequences and metadata - -#### Ingested and parsed data - - * [sequences.fasta.xz](https://data.nextstrain.org/files/ncov/open/sequences.fasta.xz) - * [metadata.tsv.gz](https://data.nextstrain.org/files/ncov/open/metadata.tsv.gz) - -#### Pre-processed files +#### All sequences and metadata - * [aligned.fasta.xz](https://data.nextstrain.org/files/ncov/open/aligned.fasta.xz) - * [filtered.fasta.xz](https://data.nextstrain.org/files/ncov/open/filtered.fasta.xz) - * [mutation-summary.tsv.xz](https://data.nextstrain.org/files/ncov/open/mutation-summary.tsv.xz) +- [metadata.tsv.gz](https://data.nextstrain.org/files/ncov/open/metadata.tsv.gz) +- [sequences.fasta.xz](https://data.nextstrain.org/files/ncov/open/sequences.fasta.xz) +- [aligned.fasta.xz](https://data.nextstrain.org/files/ncov/open/aligned.fasta.xz) -### Subsampled sequences and intermediate files +#### Subsampled sequences and intermediate files -The files below exist for the `global` and the regional builds (`africa`, `asia`, `europe`, `north-america`, `oceania` and `south-america`). -The links below refer to the `${BUILD}` build, substitute `${BUILD}` with another build name in the links if desired. +The files below exist for every region (`global`, `africa`, `asia`, `europe`, `north-america`, `oceania` and `south-america`) and correspond to each region's 6 month timespan build (e.g. `global/6m`, `africa/6m`, `asia/6m`, etc). +Files for the `all-time` builds (e.g. `global/all-time`, etc.) are not yet available. +The links below refer to the `${BUILD_PART_0}` region; substitute `${BUILD_PART_0}` with another region name in the links if desired. 
- * [${BUILD}/sequences.fasta.xz](https://data.nextstrain.org/files/ncov/open/${BUILD}/sequences.fasta.xz) - * [${BUILD}/metadata.tsv.xz](https://data.nextstrain.org/files/ncov/open/${BUILD}/metadata.tsv.xz) - * [${BUILD}/aligned.fasta.xz](https://data.nextstrain.org/files/ncov/open/${BUILD}/aligned.fasta.xz) - * [${BUILD} auspice tree](https://data.nextstrain.org/files/ncov/open/${BUILD}/${BUILD}.json) - * [${BUILD} auspice root sequence](https://data.nextstrain.org/files/ncov/open/${BUILD}/${BUILD}_root-sequence.json) - * [${BUILD} auspice tip frequencies](https://data.nextstrain.org/files/ncov/open/${BUILD}/${BUILD}_tip-frequencies.json) +- [${BUILD_PART_0}/6m metadata.tsv.xz](https://data.nextstrain.org/files/ncov/open/${BUILD_PART_0}/metadata.tsv.xz) +- [${BUILD_PART_0}/6m sequences.fasta.xz](https://data.nextstrain.org/files/ncov/open/${BUILD_PART_0}/sequences.fasta.xz) +- [${BUILD_PART_0}/6m aligned.fasta.xz](https://data.nextstrain.org/files/ncov/open/${BUILD_PART_0}/aligned.fasta.xz) +- [${BUILD_PART_0}/6m Auspice tree](https://data.nextstrain.org/files/ncov/open/${BUILD_PART_0}/${BUILD_PART_0}.json) +- [${BUILD_PART_0}/6m Auspice root sequence](https://data.nextstrain.org/files/ncov/open/${BUILD_PART_0}/${BUILD_PART_0}_root-sequence.json) +- [${BUILD_PART_0}/6m Auspice tip frequencies](https://data.nextstrain.org/files/ncov/open/${BUILD_PART_0}/${BUILD_PART_0}_tip-frequencies.json) diff --git a/workflow/snakemake_rules/common.smk b/workflow/snakemake_rules/common.smk index a3295fef1..e9a6e6bc8 100644 --- a/workflow/snakemake_rules/common.smk +++ b/workflow/snakemake_rules/common.smk @@ -1,6 +1,7 @@ """Small, shared functions used to generate inputs and parameters. 
""" import datetime +from itertools import product from urllib.parse import urlsplit def numeric_date(dt=None): @@ -167,7 +168,9 @@ def _get_sampling_bias_correction_for_wildcards(wildcards): return config["traits"]["default"]["sampling_bias_correction"] def _get_min_date_for_frequencies(wildcards): - if "frequencies" in config and "min_date" in config["frequencies"]: + if wildcards.build_name in config["frequencies"] and "min_date" in config["frequencies"][wildcards.build_name]: + return config["frequencies"][wildcards.build_name]["min_date"] + elif "frequencies" in config and "min_date" in config["frequencies"]: return config["frequencies"]["min_date"] else: # If not explicitly specified, default to 1 year back from the present @@ -177,7 +180,9 @@ def _get_min_date_for_frequencies(wildcards): ) def _get_max_date_for_frequencies(wildcards): - if "frequencies" in config and "max_date" in config["frequencies"]: + if wildcards.build_name in config["frequencies"] and "max_date" in config["frequencies"][wildcards.build_name]: + return config["frequencies"][wildcards.build_name]["max_date"] + elif "frequencies" in config and "max_date" in config["frequencies"]: return config["frequencies"]["max_date"] else: # Allow users to censor the N most recent days to minimize effects of @@ -205,16 +210,35 @@ def _get_upload_inputs(wildcards): origin = config["S3_DST_ORIGINS"][0] + # This function bakes in these assumptions here about the build names used + # for the nextstrain.org/ncov/gisaid and …/open builds and then + # special-cases them below. 
+ regions = {"global", "africa", "asia", "europe", "north-america", "oceania", "south-america"} + timespans = {"6m", "all-time"} + region_timespan_builds = [f"{region}_{timespan}" for region, timespan in product(regions, timespans)] + # mapping of remote → local filenames build_files = {} for build_name in config["builds"]: + if build_name in region_timespan_builds: + region, timespan = build_name.split("_") + + # We name remote files only by region (for now), so only include + # the 6m timespan builds. + if timespan != "6m": + continue + + upload_name = region + else: + upload_name = build_name + build_files.update({ - f"{build_name}/sequences.fasta.xz": f"results/{build_name}/{build_name}_subsampled_sequences.fasta.xz", # from `rule combine_samples` - f"{build_name}/metadata.tsv.xz": f"results/{build_name}/{build_name}_subsampled_metadata.tsv.xz", # from `rule combine_samples` - f"{build_name}/aligned.fasta.xz": f"results/{build_name}/aligned.fasta.xz", # from `rule build_align` + f"{upload_name}/sequences.fasta.xz": f"results/{build_name}/{build_name}_subsampled_sequences.fasta.xz", # from `rule combine_samples` + f"{upload_name}/metadata.tsv.xz": f"results/{build_name}/{build_name}_subsampled_metadata.tsv.xz", # from `rule combine_samples` + f"{upload_name}/aligned.fasta.xz": f"results/{build_name}/aligned.fasta.xz", # from `rule build_align` # export the auspice dataset which matches the subsampled sequences / metadata (see `rule finalize`) - f"{build_name}/{build_name}.json": f"auspice/{config['auspice_json_prefix']}_{build_name}.json", - f"{build_name}/{build_name}_tip-frequencies.json": f"auspice/{config['auspice_json_prefix']}_{build_name}_tip-frequencies.json", - f"{build_name}/{build_name}_root-sequence.json": f"auspice/{config['auspice_json_prefix']}_{build_name}_root-sequence.json" + f"{upload_name}/{upload_name}.json": f"auspice/{config['auspice_json_prefix']}_{build_name}.json", + f"{upload_name}/{upload_name}_tip-frequencies.json": 
f"auspice/{config['auspice_json_prefix']}_{build_name}_tip-frequencies.json", + f"{upload_name}/{upload_name}_root-sequence.json": f"auspice/{config['auspice_json_prefix']}_{build_name}_root-sequence.json" }) return build_files diff --git a/workflow/snakemake_rules/export_for_nextstrain.smk b/workflow/snakemake_rules/export_for_nextstrain.smk index 1300b2869..65977dd14 100644 --- a/workflow/snakemake_rules/export_for_nextstrain.smk +++ b/workflow/snakemake_rules/export_for_nextstrain.smk @@ -19,6 +19,7 @@ # snakemake --profile nextstrain_profiles/nextstrain-gisaid all_regions # to produce the final Auspice files! +import re import requests import json from workflow.lib.persistent_dict import PersistentDict, NoSuchEntryError @@ -138,8 +139,11 @@ rule dated_json: benchmark: "benchmarks/dated_json_{prefix}_{build_name}_{date}.txt" wildcard_constraints: - # Allow build names to contain alphanumeric characters, underscores, and hyphens - # but not special strings used for Nextstrain builds. + # Allow build names to contain alphanumeric characters, underscores, and + # hyphens but not special strings used for Nextstrain builds. Include + # the user-defined prefix as a constraint, so Snakemake does not parse + # parts of the actual build names as part of the prefix. 
+ prefix = re.escape(config["auspice_json_prefix"]), build_name = r'(?:[-a-zA-Z0-9_](?!(tip-frequencies|\d{4}-\d{2}-\d{2})))+', date = r"\d{4}-\d{2}-\d{2}" conda: config["conda_environment"] diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 33c3f3533..73038aa4b 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -1365,13 +1365,22 @@ rule build_description: log: "logs/build_description_{build_name}.txt" conda: config["conda_environment"] - shell: - """ - env BUILD={wildcards.build_name:q} \ - perl -pe 's/\$\{{BUILD\}}/$ENV{{BUILD}}/g' \ - < {input.description:q} \ - > {output.description:q} - """ + run: + from string import Template + + context = { + "BUILD": wildcards.build_name, + **{ + f"BUILD_PART_{idx}": part + for idx, part + in enumerate(wildcards.build_name.split("_"))}, + } + + with open(input.description, "r", encoding = "utf-8") as i: + template = Template(i.read()) + + with open(output.description, "w", encoding = "utf-8") as o: + o.write(template.safe_substitute(context)) rule export: message: "Exporting data files for Auspice"