From 121b61364c6af6a3071e66e5659bf346981ad043 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Wed, 21 Feb 2024 10:14:00 -0800 Subject: [PATCH] Remove unused variables and refactor `GENE_LIST` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Noted in previous PRs that the `GENES` and `GENES_SPACE_DELIMITED` variables are not needed¹ or used in the workflow,² so refactor the `GENE_LIST` to be a hardcoded list of genes. If we want to ensure that we do not miss any genes from the Nextclade dataset, we could parse out the gene names from the dataset's genome_annotation.gff3 file. However, I think that will over-complicate the Snakemake workflow so I'm leaving the hardcoded list. ¹ https://github.com/nextstrain/ncov-ingest/pull/372#discussion_r1046020969 ² https://github.com/nextstrain/ncov-ingest/pull/435#discussion_r1496332575 --- Snakefile | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Snakefile b/Snakefile index 919d09ea..54c2b196 100644 --- a/Snakefile +++ b/Snakefile @@ -5,9 +5,12 @@ import os # Snakemake 7.7.0 introduced `retries` directive used in fetch_sequences min_version("7.7.0") -GENES = "E,M,N,ORF1a,ORF1b,ORF3a,ORF6,ORF7a,ORF7b,ORF8,ORF9b,S" -GENES_SPACE_DELIMITED = GENES.replace(",", " ") -GENE_LIST = GENES.split(",") +# Hardcoded gene list used to create the DAG for both nextclade.smk and upload.smk +# It does _not_ need to be supplied to the `nextclade run` invocations because +# it matches the genes listed in the SARS-CoV-2 Nextclade dataset genome_annotations.gff +# https://github.com/nextstrain/nextclade_data/blob/244058e7d599a8295d748b12cffdd25cec6d3e7b/data/nextstrain/sars-cov-2/wuhan-hu-1/orfs/genome_annotation.gff3 +# - Jover, 21 Feb 2024 +GENE_LIST = ['E', 'M', 'N', 'ORF1a', 'ORF1b', 'ORF3a', 'ORF6', 'ORF7a', 'ORF7b', 'ORF8', 'ORF9b', 'S'] ################################################################# ####################### general setup ###########################