Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add lineages coloring from https://dengue-lineages.org/ #93

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
5 changes: 3 additions & 2 deletions ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -126,12 +126,13 @@ curate:
nextclade:
min_length: 1000 # E gene length is approximately 1400nt
min_seed_cover: 0.1
gene: ["E","C","M","pr","NS1","NS2A","NS2B","NS3","NS4A","2K","NS4B","NS5"]
gene: ["E","C","prM","NS1","NS2A","NS2B","NS3","NS4A","2K","NS4B","NS5"]
# Nextclade Fields to rename to metadata field names.
field_map:
seqName: accession # ID field used to merge annotations
clade: genotype_nextclade
alignmentStart: alignmentStart
alignmentEnd: alignmentEnd
coverage: genome_coverage
failedCdses: failedCdses
failedCdses: failedCdses
id_field: accession
195 changes: 142 additions & 53 deletions ingest/rules/nextclade.smk
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,23 @@ https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html
SUPPORTED_NEXTCLADE_SEROTYPES = ['denv1', 'denv2', 'denv3', 'denv4']
SEROTYPE_CONSTRAINTS = '|'.join(SUPPORTED_NEXTCLADE_SEROTYPES)

rule nextclade_denvX:
rule get_nextclade_dataset:
"""Download Nextclade dataset"""
output:
dataset="data/nextclade_data/v-gen-lab/{serotype}.zip",
params:
dataset_name=lambda wildcards: f"community/v-gen-lab/dengue/{wildcards.serotype}",
wildcard_constraints:
serotype=SEROTYPE_CONSTRAINTS,
shell:
r"""
nextclade3 dataset get \
--name={params.dataset_name:q} \
--output-zip={output.dataset} \
--verbose
"""

rule run_nextclade:
"""
For each type, classify into the appropriate Dengue genotype
1. Capture the alignment
Expand All @@ -26,89 +42,79 @@ rule nextclade_denvX:
"""
input:
sequences="results/sequences_{serotype}.fasta",
dataset="../nextclade_data/{serotype}",
dataset="data/nextclade_data/v-gen-lab/{serotype}.zip",
output:
nextclade_denvX="data/nextclade_results/nextclade_{serotype}.tsv",
nextclade_alignment="results/aligned_{serotype}.fasta",
nextclade_translations=expand("data/translations/{{serotype}}/{gene}/seqs.gene.fasta", gene=config["nextclade"]["gene"]),
nextclade="results/v-gen-lab/{serotype}/nextclade.tsv",
alignment="results/v-gen-lab/{serotype}/alignment.fasta",
translations=expand("data/v-gen-lab/{{serotype}}/translations/{gene}/seqs.gene.fasta", gene=config["nextclade"]["gene"]),
threads: 4
params:
min_length=config["nextclade"]["min_length"],
min_seed_cover=config["nextclade"]["min_seed_cover"],
output_translations = lambda wildcards: f"data/translations/{wildcards.serotype}/{{cds}}/seqs.gene.fasta",
output_translations = lambda wildcards: f"data/v-gen-lab/{wildcards.serotype}/translations/{{cds}}/seqs.gene.fasta",
log:
"logs/v-gen-lab/{serotype}/run_nextclade.txt",
benchmark:
"benchmarks/v-gen-lab/{serotype}/run_nextclade.txt",
wildcard_constraints:
serotype=SEROTYPE_CONSTRAINTS
shell:
"""
nextclade run \
--input-dataset {input.dataset} \
-j {threads} \
--output-tsv {output.nextclade_denvX} \
--min-length {params.min_length} \
--min-seed-cover {params.min_seed_cover} \
--silent \
--output-fasta {output.nextclade_alignment} \
--output-translations {params.output_translations} \
{input.sequences}
r"""
nextclade3 run \
--input-dataset {input.dataset} \
-j {threads} \
--output-tsv {output.nextclade} \
--min-length {params.min_length} \
--min-seed-cover {params.min_seed_cover} \
--silent \
--output-fasta {output.alignment} \
--output-translations {params.output_translations} \
{input.sequences} \
&> {log:q}
"""

rule concat_genotype_nextclade_results:
"""
Concatenate all the nextclade results for dengue genotype classification
"""
input:
nextclade_results_files = expand("data/nextclade_results/nextclade_{serotype}.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES),
nextclade_files=expand("results/v-gen-lab/{serotype}/nextclade.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES),
output:
genotype_nextclade="results/nextclade_genotypes.tsv",
genotype_nextclade=temp("results/v-gen-lab/nextclade_metadata.tsv"),
params:
input_nextclade_fields=",".join([f'{key}' for key, value in config["nextclade"]["field_map"].items()]),
output_nextclade_fields=",".join([f'{value}' for key, value in config["nextclade"]["field_map"].items()]),
log:
"logs/v-gen-lab/concat_genotype_nextclade_results.txt",
benchmark:
"benchmarks/v-gen-lab/concat_genotype_nextclade_results.txt",
shell:
"""
echo "{params.output_nextclade_fields}" \
| tr ',' '\t' \
> {output.genotype_nextclade}

tsv-select -H -f "{params.input_nextclade_fields}" {input.nextclade_results_files} \
tsv-select -H -f "{params.input_nextclade_fields}" {input.nextclade_files} \
| awk 'NR>1 {{print}}' \
>> {output.genotype_nextclade}
"""

rule append_nextclade_columns:
"""
Append the nextclade results to the metadata
"""
input:
metadata="data/metadata_all.tsv",
genotype_nextclade="results/nextclade_genotypes.tsv",
output:
metadata_all="data/metadata_nextclade.tsv",
params:
id_field=list(config["nextclade"]["field_map"].values())[0],
output_nextclade_fields=",".join([f'{value}' for key, value in config["nextclade"]["field_map"].items()][1:]),
shell:
"""
tsv-join -H \
--filter-file {input.genotype_nextclade} \
--key-fields {params.id_field} \
--append-fields {params.output_nextclade_fields} \
--write-all ? \
{input.metadata} \
> {output.metadata_all}
"""

rule calculate_gene_coverage:
"""
Calculate the coverage of the gene of interest
"""
input:
nextclade_translation="data/translations/{serotype}/{gene}/seqs.gene.fasta",
nextclade_translation="data/v-gen-lab/{serotype}/translations/{gene}/seqs.gene.fasta",
output:
gene_coverage="data/translations/{serotype}/{gene}/gene_coverage.tsv",
gene_coverage="data/v-gen-lab/{serotype}/translations/{gene}/gene_coverage.tsv",
wildcard_constraints:
serotype=SEROTYPE_CONSTRAINTS,
params:
id_field=config["curate"]["output_id_field"],
log:
"logs/v-gen-lab/{serotype}/{gene}/calculate_gene_coverage.txt",
benchmark:
"benchmarks/v-gen-lab/{serotype}/{gene}/calculate_gene_coverage.txt",
shell:
"""
python scripts/calculate-gene-converage-from-nextclade-translation.py \
Expand All @@ -123,40 +129,119 @@ rule aggregate_gene_coverage_by_gene:
Aggregate the gene coverage results by gene
"""
input:
gene_coverage=expand("data/translations/{serotype}/{{gene}}/gene_coverage.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES),
gene_coverage=expand("data/v-gen-lab/{serotype}/translations/{{gene}}/gene_coverage.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES),
output:
gene_coverage_all="results/{gene}/gene_coverage_all.tsv",
log:
"logs/v-gen-lab/{gene}/aggregate_gene_coverage_by_gene.txt",
benchmark:
"benchmarks/v-gen-lab/{gene}/aggregate_gene_coverage_by_gene.txt",
shell:
"""
tsv-append -H {input.gene_coverage} > {output.gene_coverage_all}
"""

rule append_gene_coverage_columns:
rule combine_gene_coverage_columns:
"""
Append the gene coverage results to the metadata
Since gene coverage values should be a value between 0 and 1, empty fields should be filled with 0's
"""
input:
metadata="data/metadata_nextclade.tsv",
metadata="data/metadata_all.tsv",
gene_coverage=expand("results/{gene}/gene_coverage_all.tsv", gene=config["nextclade"]["gene"])
output:
metadata_all="results/metadata_all.tsv",
gene_coverage_combined="results/gene_coverage_combined.tsv",
params:
id_field=config["curate"]["output_id_field"],
log:
"logs/v-gen-lab/combine_gene_coverage_columns.txt",
benchmark:
"benchmarks/v-gen-lab/combine_gene_coverage_columns.txt",
shell:
"""
cp {input.metadata} {output.metadata_all}
tsv-select -H -f "{params.id_field}" {input.metadata} > {output.gene_coverage_combined}
for FILE in {input.gene_coverage}; do
tsv-join -H \
--filter-file $FILE \
--key-fields {params.id_field} \
--append-fields '*_coverage' \
--write-all 0 \
{output.metadata_all} \
{output.gene_coverage_combined} \
> results/temp_aggregate_gene_coverage.tsv
mv results/temp_aggregate_gene_coverage.tsv {output.metadata_all}
mv results/temp_aggregate_gene_coverage.tsv {output.gene_coverage_combined}
done
"""

rule append_nextclade_and_gene_coverage_columns:
"""
Append the nextclade results to the metadata
"""
input:
metadata="data/metadata_all.tsv",
genotype_nextclade="results/v-gen-lab/nextclade_metadata.tsv",
gene_coverage="results/gene_coverage_combined.tsv",
output:
metadata="data/metadata_nextclade.tsv",
params:
metadata_id_field=config["curate"]["output_id_field"],
nextclade_id_field=config["nextclade"]["id_field"],
log:
"logs/v-gen-lab/append_nextclade_and_gene_coverage_columns.txt",
benchmark:
"benchmarks/v-gen-lab/append_nextclade_and_gene_coverage_columns.txt",
shell:
"""
augur merge \
--metadata \
metadata={input.metadata:q} \
nextclade={input.genotype_nextclade:q} \
gene_coverage={input.gene_coverage:q} \
--metadata-id-columns \
metadata={params.metadata_id_field:q} \
nextclade={params.nextclade_id_field:q} \
gene_coverage={params.metadata_id_field:q} \
--output-metadata {output.metadata:q} \
--no-source-columns \
&> {log:q}
"""

rule infer_major_lineage:
"""
Infer Major dengue lineages
Reference: https://dengue-lineages.org/assets/img/homepage-img-01.png
For example:
Minor lineage -> Major lineage
1I_A.2.3 -> 1I_A
2II_B -> 2II_B
4III -> 4III
"""
input:
metadata="data/metadata_nextclade.tsv",
output:
metadata="results/metadata_all.tsv",
params:
nextclade_field="genotype_nextclade",
log:
"logs/v-gen-lab/infer_major_lineage.txt",
benchmark:
"benchmarks/v-gen-lab/infer_major_lineage.txt",
shell:
"""
cat {input.metadata:q} \
| csvtk -tl mutate \
-f {params.nextclade_field} \
-n genotype \
-p "^([0-9][A-Z]+)" \
| csvtk -tl mutate \
-f {params.nextclade_field} \
-n major_lineage \
-p "^([0-9][A-Z]+(?:_[A-Z])?)" \
| csvtk -tl mutate \
-f {params.nextclade_field} \
-n minor_lineage \
> {output.metadata:q}
"""

rule split_metadata_by_serotype:
"""
Split the metadata by serotype
Expand All @@ -169,7 +254,11 @@ rule split_metadata_by_serotype:
serotype=SEROTYPE_CONSTRAINTS
params:
serotype_field=config["curate"]["serotype_field"],
log:
"logs/split_metadata_by_serotype_{serotype}.txt",
benchmark:
"benchmarks/split_metadata_by_serotype_{serotype}.txt",
shell:
"""
tsv-filter -H --str-eq {params.serotype_field}:{wildcards.serotype} {input.metadata} > {output.serotype_metadata}
"""
"""
Loading