Skip to content

Commit

Permalink
Updated criterion for curated counts
Browse files Browse the repository at this point in the history
  • Loading branch information
lucasesta committed Dec 10, 2024
1 parent c17cfef commit aed4fab
Show file tree
Hide file tree
Showing 7 changed files with 1,108 additions and 6,060 deletions.
8 changes: 5 additions & 3 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,17 @@ rule annotate_counts:
counts=rules.get_counts_table.output.csv,
clade_founder=rules.get_clade_founder.output.csv,
output:
csv=temp("results/mut_counts_by_clade.csv"),
counts_csv=temp("results/mut_counts_by_clade.csv"),
founder_csv="results/clade_founder.csv",
notebook:
"notebook/counts_by_clade.py.ipynb"

rule curated_counts:
message:
"Create training dataset to infer the General Linear Model for mutations in Omicron and pre-Omicron sequences"
input:
mut_counts=rules.annotate_counts.output.csv,
mut_counts=rules.annotate_counts.output.counts_csv,
clade_founder=rules.annotate_counts.output.founder_csv,
output:
pre_omicron="results/curated/curated_mut_counts_pre_omicron.csv",
omicron="results/curated/curated_mut_counts_omicron.csv"
Expand All @@ -84,7 +86,7 @@ rule predicted_counts:
message:
"Add predicted counts, based on the inferred mutation rate model, to the table with observed mutation counts."
input:
counts_df=rules.annotate_counts.output.csv,
counts_df=rules.annotate_counts.output.counts_csv,
pre_omicron=rules.curated_counts.output.pre_omicron,
omicron=rules.curated_counts.output.omicron,
output:
Expand Down
18 changes: 14 additions & 4 deletions notebook/counts_by_clade.py.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
"clade_founder = snakemake.input.clade_founder\n",
"counts = snakemake.input.counts\n",
"rna_struct = snakemake.input.rna_struct\n",
"output_csv = snakemake.output.csv"
"founder_csv = snakemake.output.founder_csv\n",
"counts_csv = snakemake.output.counts_csv"
]
},
{
Expand Down Expand Up @@ -95,6 +96,15 @@
"founder_df.rename(columns={'site': 'nt_site'}, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"founder_df.to_csv(founder_csv, index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -416,14 +426,14 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Save to file\n",
"counts_df.drop(columns=['subset'], inplace=True)\n",
"if not os.path.isfile(output_csv):\n",
" counts_df.to_csv(output_csv, index=False)"
"if not os.path.isfile(counts_csv):\n",
" counts_df.to_csv(counts_csv, index=False)"
]
},
{
Expand Down
93 changes: 65 additions & 28 deletions notebook/curate_counts_pre_post_omicron.py.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"outputs": [],
"source": [
"counts_df_file = snakemake.input.mut_counts\n",
"clade_founder = snakemake.input.clade_founder\n",
"outfile_pre_omicron = snakemake.output.pre_omicron\n",
"outfile_omicron = snakemake.output.omicron"
]
Expand All @@ -46,7 +47,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read in and curate counts data"
"## Read-in counts and clade founders"
]
},
{
Expand All @@ -59,6 +60,48 @@
"counts_df = pd.read_csv(counts_df_file, low_memory=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Read in clade founder\n",
"founder_df = pd.read_csv(clade_founder)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Curate counts data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Determine conserved sites"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Identify sites where the codon and motif are conserved across all clade founders\n",
"# by subsetting data to entries with identical codons/motifs to reference, then\n",
"# identifying sites that still have entries for all clades\n",
"data = founder_df[\n",
" (founder_df['codon'] == founder_df['ref_codon']) &\n",
" (founder_df['motif'] == founder_df['ref_motif'])\n",
"]\n",
"site_counts = data['nt_site'].value_counts()\n",
"nclades = len(founder_df['clade'].unique())\n",
"conserved_sites = site_counts[site_counts == nclades].index"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -68,7 +111,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -299,14 +342,12 @@
" 28826, 28854, 29700, 4050, 13402, 11083, 15324, 21575\n",
"]\n",
"\n",
"# Retain only non-excluded sites\n",
"# Retain only non-excluded and conserved sites\n",
"curated_counts_df = counts_df[\n",
" counts_df['nt_site'].isin(conserved_sites) &\n",
" ~(counts_df['nt_site'].isin(sites_to_ignore))\n",
"]\n",
"\n",
"# Save curated counts to an output file\n",
"print(sum(curated_counts_df['motif'] != curated_counts_df['ref_motif']))\n",
"\n",
"curated_counts_df.head()"
]
},
Expand All @@ -319,22 +360,20 @@
"* Cluster of pre-Omicron clades\n",
"* Cluster of Omicron clades\n",
"\n",
"Sites are retained if the corresponding codon and motif are conserved. Then aggregates mutation counts accros clades belonging to the same cluster"
"Mutation counts are aggregated accros clades belonging to the same cluster"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Split Omicron/non-Omicron clades\n",
"counts_pre_omicron = curated_counts_df.query('pre_omicron_or_omicron == \"pre_omicron\"')\n",
"counts_omicron = curated_counts_df.query('pre_omicron_or_omicron == \"omicron\"')\n",
"curated_pre_omicron = curated_counts_df.query('pre_omicron_or_omicron == \"pre_omicron\"')\n",
"curated_omicron = curated_counts_df.query('pre_omicron_or_omicron == \"omicron\"')\n",
"\n",
"# Retain only sites with conserved motifs\n",
"curated_pre_omicron = counts_pre_omicron.query('motif == ref_motif')\n",
"curated_omicron = counts_omicron.query('motif == ref_motif')\n",
"# Check that motifs are conserved\n",
"assert sum(curated_pre_omicron['motif'] != curated_pre_omicron['ref_motif']) == 0\n",
"assert sum(curated_omicron['motif'] != curated_omicron['ref_motif']) == 0\n",
"\n",
Expand All @@ -354,22 +393,11 @@
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Drop multiple nt_mutation which correspond to different clade founder codons\n",
"curated_pre_omicron.drop_duplicates(subset='nt_mutation', inplace=True)\n",
"curated_omicron.drop_duplicates(subset='nt_mutation', inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check \n",
"# Check there are no duplicate n.t. mutations\n",
"assert sum(curated_pre_omicron['nt_mutation'].duplicated(keep=False)) == 0\n",
"assert sum(curated_omicron['nt_mutation'].duplicated(keep=False)) == 0"
]
Expand Down Expand Up @@ -471,22 +499,31 @@
"curated_omicron['mut_class'].value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Save curated counts dataframes"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Drop columns for site exclusions and masking\n",
"curated_pre_omicron.drop(columns=['exclude', 'masked_in_usher'], inplace=True)\n",
"curated_omicron.drop(columns=['exclude', 'masked_in_usher'], inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Write curated dataframes to file\n",
"if not os.path.isfile(outfile_pre_omicron):\n",
" curated_pre_omicron.to_csv(outfile_pre_omicron, index=False)\n",
"\n",
Expand Down
Loading

0 comments on commit aed4fab

Please sign in to comment.