diff --git a/Cargo.toml b/Cargo.toml index d6ac268a9..8475f0865 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -180,6 +180,7 @@ indexing_slicing = "allow" integer_division = "allow" iter_nth_zero = "allow" large_digit_groups = "allow" +len_without_is_empty = "allow" len_zero = "allow" let_underscore_must_use = "allow" manual_string_new = "allow" diff --git a/packages/nextclade/src/alphabet/aa.rs b/packages/nextclade/src/alphabet/aa.rs index cbc67d4a8..a0b3b6133 100644 --- a/packages/nextclade/src/alphabet/aa.rs +++ b/packages/nextclade/src/alphabet/aa.rs @@ -66,6 +66,7 @@ impl Display for Aa { impl Letter for Aa { const GAP: Aa = Aa::Gap; + const UNKNOWN: Aa = Aa::X; #[inline] fn is_gap(&self) -> bool { diff --git a/packages/nextclade/src/alphabet/letter.rs b/packages/nextclade/src/alphabet/letter.rs index a5623e2ea..25b644b6b 100644 --- a/packages/nextclade/src/alphabet/letter.rs +++ b/packages/nextclade/src/alphabet/letter.rs @@ -11,6 +11,7 @@ pub trait ScoreMatrixLookup { /// Generic representation of a character defining nucleotide or amino acid pub trait Letter: Copy + Display + Eq + Ord + ScoreMatrixLookup { const GAP: L; + const UNKNOWN: L; fn is_gap(&self) -> bool; diff --git a/packages/nextclade/src/alphabet/nuc.rs b/packages/nextclade/src/alphabet/nuc.rs index 4b839285c..6043b6d61 100644 --- a/packages/nextclade/src/alphabet/nuc.rs +++ b/packages/nextclade/src/alphabet/nuc.rs @@ -57,6 +57,7 @@ impl Display for Nuc { impl Letter for Nuc { const GAP: Nuc = Nuc::Gap; + const UNKNOWN: Nuc = Nuc::N; #[inline] fn is_gap(&self) -> bool { diff --git a/packages/nextclade/src/analyze/aa_alignment.rs b/packages/nextclade/src/analyze/aa_alignment.rs new file mode 100644 index 000000000..db4102387 --- /dev/null +++ b/packages/nextclade/src/analyze/aa_alignment.rs @@ -0,0 +1,52 @@ +use crate::alphabet::aa::Aa; +use crate::analyze::aa_sub_min::AaSubMin; +use crate::coord::position::{AaRefPosition, PositionLike}; +use crate::gene::cds::Cds; +use crate::translate::translate_genes::CdsTranslation; + +/// Represents a pair of aligned amino acid sequences (resulting from pairwise alignment +pub struct AaAlignment<'c, 'r, 'q> { + cds: &'c Cds, + ref_tr: &'r CdsTranslation, + qry_tr: &'q CdsTranslation, +} + +impl<'c, 'r, 'q> AaAlignment<'c, 'r, 'q> { + pub fn new(cds: &'c Cds, ref_tr: &'r CdsTranslation, qry_tr: &'q CdsTranslation) -> Self { + assert_eq!(ref_tr.seq.len(), qry_tr.seq.len()); + Self { cds, ref_tr, qry_tr } + } + + pub const fn cds(&self) -> &Cds { + self.cds + } + + pub fn len(&self) -> usize { + self.ref_tr.seq.len() + } + + pub fn ref_at(&self, pos: AaRefPosition) -> Aa { + self.ref_tr.seq[pos.as_usize()] + } + + pub fn qry_at(&self, pos: AaRefPosition) -> Aa { + self.qry_tr.seq[pos.as_usize()] + } + + pub fn mut_at(&self, pos: AaRefPosition) -> AaSubMin { + AaSubMin { + ref_aa: self.ref_at(pos), + pos, + qry_aa: self.qry_at(pos), + } + } + + pub fn is_sequenced(&self, pos: AaRefPosition) -> bool { + pos >= 0 + && self + .qry_tr + .alignment_ranges + .iter() + .any(|aa_alignment_range| aa_alignment_range.contains(pos)) + } +} diff --git a/packages/nextclade/src/analyze/aa_change_with_context.rs b/packages/nextclade/src/analyze/aa_change_with_context.rs new file mode 100644 index 000000000..af4c382b5 --- /dev/null +++ b/packages/nextclade/src/analyze/aa_change_with_context.rs @@ -0,0 +1,107 @@ +use crate::alphabet::aa::Aa; +use crate::alphabet::letter::{serde_deserialize_seq, serde_serialize_seq}; +use crate::alphabet::nuc::Nuc; +use crate::analyze::aa_sub_min::AaSubMin; +use crate::analyze::abstract_mutation::{AbstractMutation, MutParams, Pos, QryLetter, RefLetter}; +use crate::analyze::nuc_alignment::NucAlignment; +use crate::coord::coord_map_cds_to_global::cds_codon_pos_to_ref_range; +use crate::coord::position::{AaRefPosition, NucRefGlobalPosition}; +use crate::coord::range::NucRefGlobalRange; +use crate::gene::cds::Cds; +use crate::gene::gene::GeneStrand; +use crate::translate::complement::reverse_complement_in_place; +use itertools::Itertools; +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Default, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct AaChangeWithContext { + pub cds_name: String, + pub pos: AaRefPosition, + pub ref_aa: Aa, + pub qry_aa: Aa, + pub nuc_pos: NucRefGlobalPosition, + + #[schemars(with = "String")] + #[serde(serialize_with = "serde_serialize_seq")] + #[serde(deserialize_with = "serde_deserialize_seq")] + pub ref_triplet: Vec, + + #[schemars(with = "String")] + #[serde(serialize_with = "serde_serialize_seq")] + #[serde(deserialize_with = "serde_deserialize_seq")] + pub qry_triplet: Vec, + pub nuc_ranges: Vec, +} + +impl Pos for AaChangeWithContext { + fn pos(&self) -> AaRefPosition { + self.pos + } +} + +impl QryLetter for AaChangeWithContext { + fn qry_letter(&self) -> Aa { + self.qry_aa + } +} + +impl RefLetter for AaChangeWithContext { + fn ref_letter(&self) -> Aa { + self.ref_aa + } +} + +impl AbstractMutation for AaChangeWithContext { + fn clone_with(&self, params: MutParams) -> Self { + Self { + cds_name: self.cds_name.clone(), + pos: params.pos, + ref_aa: params.ref_letter, + qry_aa: params.qry_letter, + ..self.clone() + } + } +} + +impl AaChangeWithContext { + pub fn new(cds: &Cds, sub: &AaSubMin, aln: &NucAlignment) -> Self { + let AaSubMin { pos, ref_aa, qry_aa } = *sub; + let nuc_ranges = cds_codon_pos_to_ref_range(cds, pos); + + let ref_triplet = nuc_ranges + .iter() + .flat_map(|(range, strand)| { + let mut nucs = aln.ref_range(range).to_vec(); + if strand == &GeneStrand::Reverse { + reverse_complement_in_place(&mut nucs); + } + nucs + }) + .collect_vec(); + + let qry_triplet = nuc_ranges + .iter() + .flat_map(|(range, strand)| { + let mut nucs = aln.qry_range(range).to_vec(); + if strand == &GeneStrand::Reverse { + reverse_complement_in_place(&mut nucs); + } + nucs + }) + .collect_vec(); + + let nuc_ranges = nuc_ranges.into_iter().map(|(range, _)| range).collect_vec(); + + Self { + cds_name: cds.name.clone(), + pos, + ref_aa, + qry_aa, + nuc_pos: nuc_ranges[0].begin, + nuc_ranges, + ref_triplet, + qry_triplet, + } + } +} diff --git a/packages/nextclade/src/analyze/aa_changes.rs b/packages/nextclade/src/analyze/aa_changes.rs deleted file mode 100644 index e4a0e514e..000000000 --- a/packages/nextclade/src/analyze/aa_changes.rs +++ /dev/null @@ -1,420 +0,0 @@ -use crate::alphabet::aa::Aa; -use crate::alphabet::letter::Letter; -use crate::alphabet::letter::{serde_deserialize_seq, serde_serialize_seq}; -use crate::alphabet::nuc::Nuc; -use crate::analyze::aa_del::AaDel; -use crate::analyze::aa_sub::AaSub; -use crate::analyze::nuc_del::NucDelRange; -use crate::analyze::nuc_sub::NucSub; -use crate::coord::coord_map_cds_to_global::cds_codon_pos_to_ref_range; -use crate::coord::position::{AaRefPosition, NucRefGlobalPosition, PositionLike}; -use crate::coord::range::{have_intersection, AaRefRange, NucRefGlobalRange}; -use crate::gene::cds::Cds; -use crate::gene::gene::GeneStrand; -use crate::gene::gene_map::GeneMap; -use crate::translate::complement::reverse_complement_in_place; -use crate::translate::translate_genes::{CdsTranslation, Translation}; -use crate::utils::collections::extend_map_of_vecs; -use either::Either; -use eyre::Report; -use itertools::{Itertools, MinMaxResult}; -use serde::{Deserialize, Serialize}; -use std::collections::BTreeMap; - -#[derive(Clone, Debug, Default, Serialize, Deserialize, schemars::JsonSchema)] -#[serde(rename_all = "camelCase")] -pub struct AaChangeWithContext { - pub cds_name: String, - pub pos: AaRefPosition, - pub ref_aa: Aa, - pub qry_aa: Aa, - pub nuc_pos: NucRefGlobalPosition, - - #[schemars(with = "String")] - #[serde(serialize_with = "serde_serialize_seq")] - #[serde(deserialize_with = "serde_deserialize_seq")] - pub ref_triplet: Vec, - - #[schemars(with = "String")] - #[serde(serialize_with = "serde_serialize_seq")] - #[serde(deserialize_with = "serde_deserialize_seq")] - pub qry_triplet: Vec, - pub nuc_ranges: Vec, -} - -impl AaChangeWithContext { - pub fn new( - cds: &Cds, - pos: AaRefPosition, - qry_seq: &[Nuc], - ref_seq: &[Nuc], - ref_tr: &CdsTranslation, - qry_tr: &CdsTranslation, - ) -> Self { - let ref_aa = ref_tr.seq[pos.as_usize()]; - let qry_aa = qry_tr.seq[pos.as_usize()]; - let nuc_ranges = cds_codon_pos_to_ref_range(cds, pos); - - let ref_triplet = nuc_ranges - .iter() - .flat_map(|(range, strand)| { - let mut nucs = ref_seq[range.to_std()].to_vec(); - if strand == &GeneStrand::Reverse { - reverse_complement_in_place(&mut nucs); - } - nucs - }) - .collect_vec(); - - let qry_triplet = nuc_ranges - .iter() - .flat_map(|(range, strand)| { - let mut nucs = qry_seq[range.clamp_range(0, qry_seq.len()).to_std()].to_vec(); - if strand == &GeneStrand::Reverse { - reverse_complement_in_place(&mut nucs); - } - nucs - }) - .collect_vec(); - - let nuc_ranges = nuc_ranges.into_iter().map(|(range, _)| range).collect_vec(); - - Self { - cds_name: cds.name.clone(), - pos, - ref_aa, - qry_aa, - nuc_pos: nuc_ranges[0].begin, - nuc_ranges, - ref_triplet, - qry_triplet, - } - } - - #[inline] - pub fn is_mutated_or_deleted(&self) -> bool { - is_aa_mutated_or_deleted(self.ref_aa, self.qry_aa) - } -} - -#[derive(Clone, Debug, Default, Serialize, Deserialize, schemars::JsonSchema)] -#[serde(rename_all = "camelCase")] -pub struct AaChangesGroup { - name: String, - range: AaRefRange, - changes: Vec, - nuc_subs: Vec, - nuc_dels: Vec, -} - -impl AaChangesGroup { - pub fn new(name: impl AsRef) -> Self { - Self::with_changes(name, vec![]) - } - - pub fn with_changes(name: impl AsRef, changes: Vec) -> Self { - Self { - name: name.as_ref().to_owned(), - range: Self::find_codon_range(&changes), - changes, - nuc_subs: vec![], - nuc_dels: vec![], - } - } - - pub fn push(&mut self, change: AaChangeWithContext) { - self.changes.push(change); - self.range = Self::find_codon_range(&self.changes); - } - - pub fn last(&self) -> Option<&AaChangeWithContext> { - self.changes.last() - } - - fn find_codon_range(changes: &[AaChangeWithContext]) -> AaRefRange { - match changes.iter().minmax_by_key(|change| change.pos) { - MinMaxResult::NoElements => AaRefRange::from_isize(0, 0), - MinMaxResult::OneElement(one) => AaRefRange::new(one.pos, one.pos + 1), - MinMaxResult::MinMax(first, last) => AaRefRange::new(first.pos, last.pos + 1), - } - } -} - -#[derive(Clone, Debug, Default, Serialize, Deserialize, schemars::JsonSchema)] -#[serde(rename_all = "camelCase")] -pub struct FindAaChangesOutput { - pub aa_changes_groups: Vec, - pub aa_substitutions: Vec, - pub aa_deletions: Vec, - pub nuc_to_aa_muts: BTreeMap>, -} - -/// Finds aminoacid substitutions and deletions in query peptides relative to reference peptides, in all genes -/// -/// ## Precondition -/// Nucleotide sequences and peptides are required to be stripped from insertions -pub fn find_aa_changes( - ref_seq: &[Nuc], - qry_seq: &[Nuc], - ref_translation: &Translation, - qry_translation: &Translation, - gene_map: &GeneMap, - nuc_subs: &[NucSub], - nuc_dels: &[NucDelRange], -) -> Result { - let mut changes = qry_translation - .iter_cdses() - .map(|(qry_name, qry_cds_tr)| { - let ref_cds_tr = ref_translation.get_cds(qry_name)?; - let cds = gene_map.get_cds(&qry_cds_tr.name)?; - Ok(find_aa_changes_for_cds( - cds, qry_seq, ref_seq, ref_cds_tr, qry_cds_tr, nuc_subs, nuc_dels, - )) - }) - .collect::, Report>>()? - .into_iter() - // Merge changes from all CDSes into one struct - .fold(FindAaChangesOutput::default(), |mut output, changes| { - output.aa_changes_groups.extend(changes.aa_changes_groups); - output.aa_substitutions.extend(changes.aa_substitutions); - output.aa_deletions.extend(changes.aa_deletions); - extend_map_of_vecs(&mut output.nuc_to_aa_muts, changes.nuc_to_aa_muts); - output - }); - - changes.aa_substitutions.sort(); - changes.aa_deletions.sort(); - changes.nuc_to_aa_muts.iter_mut().for_each(|(_, vals)| { - vals.sort(); - vals.dedup(); - }); - - Ok(changes) -} - -/// Finds aminoacid substitutions and deletions in query peptides relative to reference peptides, in one gene -/// -/// ## Precondition -/// Nucleotide sequences and peptides are required to be stripped from insertions -/// -/// -/// ## Implementation details -/// We compare reference and query peptides (extracted by the preceding call to Nextalign), -/// one aminoacid at at time, and deduce changes. We then report the change and relevant nucleotide context surrounding -/// this change. -/// Previously we reported one-to-one mapping of aminoacid changes to corresponding nucleotide changes. However, it -/// was not always accurate, because if there are multiple nucleotide changes in a codon, the direct correspondence -/// might not always be established without knowing the order in which nucleotide changes have occurred. And in the -/// context of Nextclade we don't have this information. -fn find_aa_changes_for_cds( - cds: &Cds, - qry_seq: &[Nuc], - ref_seq: &[Nuc], - ref_tr: &CdsTranslation, - qry_tr: &CdsTranslation, - nuc_subs: &[NucSub], - nuc_dels: &[NucDelRange], -) -> FindAaChangesOutput { - assert_eq!(ref_tr.seq.len(), qry_tr.seq.len()); - assert_eq!(qry_seq.len(), ref_seq.len()); - - let aa_alignment_ranges = &qry_tr.alignment_ranges; - let mut aa_changes_groups = vec![AaChangesGroup::new(&cds.name)]; - let mut curr_group = aa_changes_groups.last_mut().unwrap(); - for codon in AaRefRange::from_usize(0, qry_tr.seq.len()).iter() { - if !is_codon_sequenced(aa_alignment_ranges, codon) { - continue; - } - - let ref_aa = ref_tr.seq[codon.as_usize()]; - let qry_aa = qry_tr.seq[codon.as_usize()]; - if is_aa_mutated_or_deleted(ref_aa, qry_aa) { - match curr_group.last() { - // If current group is empty, then we are about to insert the first codon into the first group. - None => { - if codon > 0 && is_codon_sequenced(aa_alignment_ranges, codon - 1) { - // Also prepend one codon to the left, for additional context, to start the group - curr_group.push(AaChangeWithContext::new( - cds, - codon - 1, - qry_seq, - ref_seq, - ref_tr, - qry_tr, - )); - } - - // The current codon itself - curr_group.push(AaChangeWithContext::new(cds, codon, qry_seq, ref_seq, ref_tr, qry_tr)); - } - // Current group is not empty - Some(prev) => { - // If previous codon in the group is adjacent or almost adjacent (there is 1 item in between), - // then append to the group. - if codon <= prev.pos + 2 { - // If previous codon in the group is not exactly adjacent, there is 1 item in between, - // then cover the hole by inserting previous codon. - if codon == prev.pos + 2 && is_codon_sequenced(aa_alignment_ranges, codon - 1) { - curr_group.push(AaChangeWithContext::new( - cds, - codon - 1, - qry_seq, - ref_seq, - ref_tr, - qry_tr, - )); - } - - // And insert the current codon - curr_group.push(AaChangeWithContext::new(cds, codon, qry_seq, ref_seq, ref_tr, qry_tr)); - } - // If previous codon in the group is not adjacent, then terminate the current group and start a new group. - else { - // Add one codon to the right, for additional context, to finalize the current group - if is_codon_sequenced(aa_alignment_ranges, prev.pos + 1) { - curr_group.push(AaChangeWithContext::new( - cds, - prev.pos + 1, - qry_seq, - ref_seq, - ref_tr, - qry_tr, - )); - } - - let mut new_group = AaChangesGroup::new(&cds.name); - - // Start a new group and push the current codon into it. - if is_codon_sequenced(aa_alignment_ranges, codon - 1) { - // Also prepend one codon to the left, for additional context, to start the new group. - new_group.push(AaChangeWithContext::new( - cds, - codon - 1, - qry_seq, - ref_seq, - ref_tr, - qry_tr, - )); - } - - // Push the current codon to the new group - new_group.push(AaChangeWithContext::new(cds, codon, qry_seq, ref_seq, ref_tr, qry_tr)); - - aa_changes_groups.push(new_group); - - curr_group = aa_changes_groups.last_mut().unwrap(); - } - } - } - } - } - - // Add one codon to the right, for additional context, to finalize the last group - if let Some(last) = curr_group.last() { - if is_codon_sequenced(aa_alignment_ranges, last.pos + 1) { - curr_group.push(AaChangeWithContext::new( - cds, - last.pos + 1, - qry_seq, - ref_seq, - ref_tr, - qry_tr, - )); - } - } - - // Keep only non-empty groups - aa_changes_groups.retain(|group| !group.range.is_empty() && !group.changes.is_empty()); - - aa_changes_groups.iter_mut().for_each(|group| { - let ranges = group - .range - .iter() - .flat_map(|codon| { - cds_codon_pos_to_ref_range(cds, codon) - .into_iter() - .map(|(range, _)| range) - }) - .collect_vec(); - - group.nuc_subs = nuc_subs - .iter() - .filter(|nuc_sub| ranges.iter().any(|range| range.contains(nuc_sub.pos))) - .cloned() - .collect_vec(); - - group.nuc_dels = nuc_dels - .iter() - .filter(|nuc_del| ranges.iter().any(|range| have_intersection(range, nuc_del.range()))) - .cloned() - .collect_vec(); - }); - - let (aa_substitutions, aa_deletions): (Vec, Vec) = aa_changes_groups - .iter() - .flat_map(|aa_changes_group| &aa_changes_group.changes) - .filter(|change| is_aa_mutated_or_deleted(change.ref_aa, change.qry_aa)) - .partition_map(|change| { - if change.qry_aa.is_gap() { - Either::Right(AaDel { - cds_name: cds.name.clone(), - ref_aa: change.ref_aa, - pos: change.pos, - }) - } else { - Either::Left(AaSub { - cds_name: cds.name.clone(), - ref_aa: change.ref_aa, - pos: change.pos, - qry_aa: change.qry_aa, - }) - } - }); - - // Associate nuc positions with aa mutations. - let nuc_to_aa_muts: BTreeMap> = aa_changes_groups - .iter() - .flat_map(|group| { - group - .changes - .iter() - .filter(|change| AaChangeWithContext::is_mutated_or_deleted(change)) - .flat_map(|change| { - change.nuc_ranges.iter().flat_map(move |range| { - range.iter() - // TODO: We convert position to string here, because when communicating with WASM we will pass through - // JSON schema, and JSON object keys must be strings. Maybe there is a way to keep the keys as numbers? - .map(move |pos| (pos.to_string(), AaSub::from(change))) - }) - }) - }) - .into_group_map() - .into_iter() - .map(|(pos, mut aa_muts)| { - aa_muts.sort(); - aa_muts.dedup(); - (pos, aa_muts) - }) - .collect(); - - FindAaChangesOutput { - aa_changes_groups, - aa_substitutions, - aa_deletions, - nuc_to_aa_muts, - } -} - -/// Check whether a given pair if reference and query aminoacids constitute a mutation or deletion -#[inline] -fn is_aa_mutated_or_deleted(ref_aa: Aa, qry_aa: Aa) -> bool { - // NOTE: We chose to ignore mutations to `X`. - qry_aa != ref_aa && qry_aa != Aa::X -} - -/// Check whether a given codon position corresponds to a sequenced aminoacid -fn is_codon_sequenced(aa_alignment_ranges: &[AaRefRange], codon: AaRefPosition) -> bool { - aa_alignment_ranges - .iter() - .any(|aa_alignment_range| aa_alignment_range.contains(codon)) -} diff --git a/packages/nextclade/src/analyze/aa_changes_find.rs b/packages/nextclade/src/analyze/aa_changes_find.rs new file mode 100644 index 000000000..5a3425868 --- /dev/null +++ b/packages/nextclade/src/analyze/aa_changes_find.rs @@ -0,0 +1,48 @@ +use crate::analyze::aa_alignment::AaAlignment; +use crate::analyze::aa_changes_find_for_cds::{aa_changes_find_for_cds, FindAaChangesOutput}; +use crate::analyze::nuc_alignment::NucAlignment; +use crate::analyze::nuc_del::NucDelRange; +use crate::analyze::nuc_sub::NucSub; +use crate::gene::gene_map::GeneMap; +use crate::translate::translate_genes::Translation; +use crate::utils::collections::extend_map_of_vecs; +use eyre::Report; + +/// Finds aminoacid substitutions and deletions in query peptides relative to reference peptides, in all genes +/// +/// ## Precondition +/// Nucleotide sequences and peptides are required to be stripped from insertions +pub fn aa_changes_find( + aln: &NucAlignment, + ref_translation: &Translation, + qry_translation: &Translation, + gene_map: &GeneMap, + nuc_subs: &[NucSub], + nuc_dels: &[NucDelRange], +) -> Result { + let mut changes = qry_translation.iter_cdses().map(|(qry_name, qry_tr)| { + let ref_tr = ref_translation.get_cds(qry_name)?; + let cds = gene_map.get_cds(&qry_tr.name)?; + let tr = AaAlignment::new(cds, ref_tr, qry_tr); + Ok(aa_changes_find_for_cds(aln, &tr, nuc_subs, nuc_dels)) + }) + .collect::, Report>>()? + .into_iter() + // Merge changes from all CDSes into one struct + .fold(FindAaChangesOutput::default(), |mut output, changes| { + output.aa_changes_groups.extend(changes.aa_changes_groups); + output.aa_substitutions.extend(changes.aa_substitutions); + output.aa_deletions.extend(changes.aa_deletions); + extend_map_of_vecs(&mut output.nuc_to_aa_muts, changes.nuc_to_aa_muts); + output + }); + + changes.aa_substitutions.sort(); + changes.aa_deletions.sort(); + changes.nuc_to_aa_muts.iter_mut().for_each(|(_, vals)| { + vals.sort(); + vals.dedup(); + }); + + Ok(changes) +} diff --git a/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs b/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs new file mode 100644 index 000000000..03c3f29d5 --- /dev/null +++ b/packages/nextclade/src/analyze/aa_changes_find_for_cds.rs @@ -0,0 +1,211 @@ +use crate::alphabet::aa::Aa; +use crate::alphabet::letter::Letter; +use crate::analyze::aa_alignment::AaAlignment; +use crate::analyze::aa_change_with_context::AaChangeWithContext; +use crate::analyze::aa_changes_group::AaChangesGroup; +use crate::analyze::aa_del::AaDel; +use crate::analyze::aa_sub::AaSub; +use crate::analyze::abstract_mutation::AbstractMutation; +use crate::analyze::nuc_alignment::NucAlignment; +use crate::analyze::nuc_del::NucDelRange; +use crate::analyze::nuc_sub::NucSub; +use crate::coord::coord_map_cds_to_global::cds_codon_pos_to_ref_range; +use crate::coord::position::AaRefPosition; +use crate::coord::range::{have_intersection, AaRefRange}; +use either::Either; +use itertools::Itertools; +use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; + +#[derive(Clone, Debug, Default, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct FindAaChangesOutput { + pub aa_changes_groups: Vec, + pub aa_substitutions: Vec, + pub aa_deletions: Vec, + pub nuc_to_aa_muts: BTreeMap>, +} + +/// Finds aminoacid substitutions and deletions in query peptides relative to reference peptides, in one gene +/// +/// ## Precondition +/// Nucleotide sequences and peptides are required to be stripped from insertions +/// +/// +/// ## Implementation details +/// We compare reference and query peptides (extracted by the preceding call to Nextalign), +/// one aminoacid at a time, and deduce changes. We then report the change and relevant nucleotide context surrounding +/// this change. +/// Previously we reported one-to-one mapping of aminoacid changes to corresponding nucleotide changes. However, it +/// was not always accurate, because if there are multiple nucleotide changes in a codon, the direct correspondence +/// might not always be established without knowing the order in which nucleotide changes have occurred. And in the +/// context of Nextclade we don't have this information. +pub fn aa_changes_find_for_cds( + aln: &NucAlignment, + tr: &AaAlignment, + nuc_subs: &[NucSub], + nuc_dels: &[NucDelRange], +) -> FindAaChangesOutput { + let aa_changes = AaRefRange::from_usize(0, tr.len()) + .iter() + .filter_map(|codon| tr.is_sequenced(codon).then_some(tr.mut_at(codon))) + .collect_vec(); + + aa_changes_group(&aa_changes, aln, tr, nuc_subs, nuc_dels) +} + +pub fn aa_changes_group>( + aa_muts: &[M], + aln: &NucAlignment, + tr: &AaAlignment, + nuc_subs: &[NucSub], + nuc_dels: &[NucDelRange], +) -> FindAaChangesOutput { + let cds = tr.cds(); + let mut aa_changes_groups = vec![AaChangesGroup::new(&cds.name)]; + let mut curr_group = aa_changes_groups.last_mut().unwrap(); + + for aa_mut in aa_muts { + let codon = aa_mut.pos(); + + if aa_mut.is_mutated_and_not_unknown() { + match curr_group.last() { + // If current group is empty, then we are about to insert the first codon into the first group. + None => { + if codon > 0 && tr.is_sequenced(codon - 1) { + // Also prepend one codon to the left, for additional context, to start the group + curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon - 1), aln)); + } + + // The current codon itself + curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon), aln)); + } + // Current group is not empty + Some(prev) => { + // If previous codon in the group is adjacent or almost adjacent (there is 1 item in between), + // then append to the group. + if codon <= prev.pos + 2 { + // If previous codon in the group is not exactly adjacent, there is 1 item in between, + // then cover the hole by inserting previous codon. + if codon == prev.pos + 2 && tr.is_sequenced(codon - 1) { + curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon - 1), aln)); + } + + // And insert the current codon + curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon), aln)); + } + // If previous codon in the group is not adjacent, then terminate the current group and start a new group. + else { + // Add one codon to the right, for additional context, to finalize the current group + if tr.is_sequenced(prev.pos + 1) { + curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(prev.pos + 1), aln)); + } + + let mut new_group = AaChangesGroup::new(&cds.name); + + // Start a new group and push the current codon into it. + if tr.is_sequenced(codon - 1) { + // Also prepend one codon to the left, for additional context, to start the new group. + new_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon - 1), aln)); + } + + // Push the current codon to the new group + new_group.push(AaChangeWithContext::new(cds, &tr.mut_at(codon), aln)); + + aa_changes_groups.push(new_group); + + curr_group = aa_changes_groups.last_mut().unwrap(); + } + } + } + } + } + + // Add one codon to the right, for additional context, to finalize the last group + if let Some(last) = curr_group.last() { + if tr.is_sequenced(last.pos + 1) { + curr_group.push(AaChangeWithContext::new(cds, &tr.mut_at(last.pos + 1), aln)); + } + } + + // Keep only non-empty groups + aa_changes_groups.retain(|group| !group.range.is_empty() && !group.changes.is_empty()); + + aa_changes_groups.iter_mut().for_each(|group| { + let ranges = group + .range + .iter() + .flat_map(|codon| { + cds_codon_pos_to_ref_range(cds, codon) + .into_iter() + .map(|(range, _)| range) + }) + .collect_vec(); + + group.nuc_subs = nuc_subs + .iter() + .filter(|nuc_sub| ranges.iter().any(|range| range.contains(nuc_sub.pos))) + .cloned() + .collect_vec(); + + group.nuc_dels = nuc_dels + .iter() + .filter(|nuc_del| ranges.iter().any(|range| have_intersection(range, nuc_del.range()))) + .cloned() + .collect_vec(); + }); + + let (aa_substitutions, aa_deletions): (Vec, Vec) = aa_changes_groups + .iter() + .flat_map(|aa_changes_group| &aa_changes_group.changes) + .filter(|change| change.is_mutated_and_not_unknown()) + .partition_map(|change| { + if change.qry_aa.is_gap() { + Either::Right(AaDel { + cds_name: cds.name.clone(), + ref_aa: change.ref_aa, + pos: change.pos, + }) + } else { + Either::Left(AaSub { + cds_name: cds.name.clone(), + ref_aa: change.ref_aa, + pos: change.pos, + qry_aa: change.qry_aa, + }) + } + }); + + // Associate nuc positions with aa mutations. + let nuc_to_aa_muts: BTreeMap> = aa_changes_groups + .iter() + .flat_map(|group| { + group + .changes + .iter() + .filter(|change| change.is_mutated_and_not_unknown()) + .flat_map(|change| { + change.nuc_ranges.iter().flat_map(move |range| { + range.iter() + // TODO: We convert position to string here, because when communicating with WASM we will pass through + // JSON schema, and JSON object keys must be strings. Maybe there is a way to keep the keys as numbers? + .map(move |pos| (pos.to_string(), AaSub::from(change))) + }) + }) + }) + .into_group_map() + .into_iter() + .map(|(pos, mut aa_muts)| { + aa_muts.sort(); + aa_muts.dedup(); + (pos, aa_muts) + }) + .collect(); + + FindAaChangesOutput { + aa_changes_groups, + aa_substitutions, + aa_deletions, + nuc_to_aa_muts, + } +} diff --git a/packages/nextclade/src/analyze/aa_changes_group.rs b/packages/nextclade/src/analyze/aa_changes_group.rs new file mode 100644 index 000000000..0733aeb8b --- /dev/null +++ b/packages/nextclade/src/analyze/aa_changes_group.rs @@ -0,0 +1,49 @@ +use crate::analyze::aa_change_with_context::AaChangeWithContext; +use crate::analyze::nuc_del::NucDelRange; +use crate::analyze::nuc_sub::NucSub; +use crate::coord::range::AaRefRange; +use itertools::{Itertools, MinMaxResult}; +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Default, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct AaChangesGroup { + pub name: String, + pub range: AaRefRange, + pub changes: Vec, + pub nuc_subs: Vec, + pub nuc_dels: Vec, +} + +impl AaChangesGroup { + pub fn new(name: impl AsRef) -> Self { + Self::with_changes(name, vec![]) + } + + pub fn with_changes(name: impl AsRef, changes: Vec) -> Self { + Self { + name: name.as_ref().to_owned(), + range: Self::find_codon_range(&changes), + changes, + nuc_subs: vec![], + nuc_dels: vec![], + } + } + + pub fn push(&mut self, change: AaChangeWithContext) { + self.changes.push(change); + self.range = Self::find_codon_range(&self.changes); + } + + pub fn last(&self) -> Option<&AaChangeWithContext> { + self.changes.last() + } + + fn find_codon_range(changes: &[AaChangeWithContext]) -> AaRefRange { + match changes.iter().minmax_by_key(|change| change.pos) { + MinMaxResult::NoElements => AaRefRange::from_isize(0, 0), + MinMaxResult::OneElement(one) => AaRefRange::new(one.pos, one.pos + 1), + MinMaxResult::MinMax(first, last) => AaRefRange::new(first.pos, last.pos + 1), + } + } +} diff --git a/packages/nextclade/src/analyze/aa_sub.rs b/packages/nextclade/src/analyze/aa_sub.rs index ec3a122cd..8a5f0d962 100644 --- a/packages/nextclade/src/analyze/aa_sub.rs +++ b/packages/nextclade/src/analyze/aa_sub.rs @@ -1,6 +1,6 @@ use crate::alphabet::aa::{from_aa, Aa}; use crate::alphabet::letter::Letter; -use crate::analyze::aa_changes::AaChangeWithContext; +use crate::analyze::aa_change_with_context::AaChangeWithContext; use crate::analyze::aa_del::AaDel; use crate::analyze::abstract_mutation::{AbstractMutation, MutParams, Pos, QryLetter, RefLetter}; use crate::coord::position::AaRefPosition; @@ -53,11 +53,6 @@ impl Pos for AaSub { } impl AaSub { - /// Checks whether this substitution is a deletion (substitution of letter `Gap`) - pub fn is_del(&self) -> bool { - self.qry_aa.is_gap() - } - pub fn from_str_and_gene(mut_str: impl AsRef, cds_name: impl AsRef) -> Result { Self::from_str(&format!("{}:{}", cds_name.as_ref(), mut_str.as_ref())) } @@ -152,3 +147,16 @@ impl From<&AaChangeWithContext> for AaSub { } } } + +// /// Check whether a given pair if reference and query aminoacids constitute a mutation or deletion +// pub fn is_aa_mutated_or_deleted(ref_aa: Aa, qry_aa: Aa) -> bool { +// // NOTE: We chose to ignore mutations to `X`. +// qry_aa != ref_aa && qry_aa != Aa::X +// } +// +// /// Check whether a given codon position corresponds to a sequenced aminoacid +// pub fn is_codon_sequenced(aa_alignment_ranges: &[AaRefRange], codon: AaRefPosition) -> bool { +// aa_alignment_ranges +// .iter() +// .any(|aa_alignment_range| aa_alignment_range.contains(codon)) +// } diff --git a/packages/nextclade/src/analyze/aa_sub_min.rs b/packages/nextclade/src/analyze/aa_sub_min.rs new file mode 100644 index 000000000..2ef1bd3ad --- /dev/null +++ b/packages/nextclade/src/analyze/aa_sub_min.rs @@ -0,0 +1,61 @@ +use crate::alphabet::aa::{from_aa, Aa}; +use crate::analyze::abstract_mutation::{AbstractMutation, MutParams, Pos, QryLetter, RefLetter}; +use crate::coord::position::AaRefPosition; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use std::fmt::{Display, Formatter}; + +/// Represents minimal aminoacid substitution (without feature name) +#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct AaSubMin { + pub pos: AaRefPosition, + pub ref_aa: Aa, + pub qry_aa: Aa, +} + +impl AbstractMutation for AaSubMin { + fn clone_with(&self, params: MutParams) -> Self { + Self { + pos: params.pos, + ref_aa: params.ref_letter, + qry_aa: params.qry_letter, + } + } +} + +impl QryLetter for AaSubMin { + fn qry_letter(&self) -> Aa { + self.qry_aa + } +} + +impl RefLetter for AaSubMin { + fn ref_letter(&self) -> Aa { + self.ref_aa + } +} + +impl Pos for AaSubMin { + fn pos(&self) -> AaRefPosition { + self.pos + } +} + +impl AaSubMin { + #[must_use] + pub const fn invert(&self) -> Self { + Self { + ref_aa: self.qry_aa, + pos: self.pos, + qry_aa: self.ref_aa, + } + } +} + +impl Display for AaSubMin { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + // NOTE: by convention, in bioinformatics, amino acids are numbered starting from 1, however our arrays are 0-based + write!(f, "{}{}{}", from_aa(self.ref_aa), self.pos + 1, from_aa(self.qry_aa)) + } +} diff --git a/packages/nextclade/src/analyze/abstract_mutation.rs b/packages/nextclade/src/analyze/abstract_mutation.rs index 5f8f42034..066775a88 100644 --- a/packages/nextclade/src/analyze/abstract_mutation.rs +++ b/packages/nextclade/src/analyze/abstract_mutation.rs @@ -16,6 +16,13 @@ pub trait RefLetter> { fn ref_letter(&self) -> L; } +pub trait Sub +where + P: PositionLike, + L: Letter, +{ +} + pub struct MutParams where P: PositionLike, @@ -35,4 +42,29 @@ where /// Creates a copy of the mutation, overwriting some of the fields according to the provided parameters #[must_use] fn clone_with(&self, params: MutParams) -> Self; + + #[must_use] + fn is_mutated(&self) -> bool { + self.qry_letter() != self.ref_letter() + } + + #[must_use] + fn is_sub(&self) -> bool { + self.is_mutated() && !self.qry_letter().is_gap() && !self.is_unknown() + } + + #[must_use] + fn is_del(&self) -> bool { + self.is_mutated() && self.qry_letter().is_gap() + } + + #[must_use] + fn is_unknown(&self) -> bool { + self.qry_letter().is_unknown() + } + + #[must_use] + fn is_mutated_and_not_unknown(&self) -> bool { + self.is_sub() || self.is_del() + } } diff --git a/packages/nextclade/src/analyze/mod.rs b/packages/nextclade/src/analyze/mod.rs index b4833b2e6..371b85a76 100644 --- a/packages/nextclade/src/analyze/mod.rs +++ b/packages/nextclade/src/analyze/mod.rs @@ -1,6 +1,11 @@ -pub mod aa_changes; +pub mod aa_alignment; +pub mod aa_change_with_context; +pub mod aa_changes_find; +pub mod aa_changes_find_for_cds; +pub mod aa_changes_group; pub mod aa_del; pub mod aa_sub; +pub mod aa_sub_min; pub mod abstract_mutation; pub mod count_gaps; pub mod divergence; @@ -11,6 +16,7 @@ pub mod find_private_nuc_mutations; pub mod is_sequenced; pub mod letter_composition; pub mod letter_ranges; +pub mod nuc_alignment; pub mod nuc_changes; pub mod nuc_del; pub mod nuc_sub; diff --git a/packages/nextclade/src/analyze/nuc_alignment.rs b/packages/nextclade/src/analyze/nuc_alignment.rs new file mode 100644 index 000000000..1b1f32821 --- /dev/null +++ b/packages/nextclade/src/analyze/nuc_alignment.rs @@ -0,0 +1,56 @@ +use crate::alphabet::nuc::Nuc; +use crate::analyze::nuc_sub::NucSub; +use crate::coord::position::{NucRefGlobalPosition, PositionLike}; +use crate::coord::range::NucRefGlobalRange; + +/// Represents a pair of aligned amino acid sequences (resulting from pairwise alignment +pub struct NucAlignment<'r, 'q, 'a> { + ref_seq: &'r [Nuc], + qry_seq: &'q [Nuc], + alignment_range: &'a NucRefGlobalRange, +} + +impl<'r, 'q, 'a> NucAlignment<'r, 'q, 'a> { + pub fn new(ref_seq: &'r [Nuc], qry_seq: &'q [Nuc], alignment_range: &'a NucRefGlobalRange) -> Self { + assert_eq!(ref_seq.len(), qry_seq.len()); + Self { + ref_seq, + qry_seq, + alignment_range, + } + } + + pub const fn len(&self) -> usize { + self.ref_seq.len() + } + + pub fn ref_at(&self, pos: NucRefGlobalPosition) -> Nuc { + self.ref_seq[pos.as_usize()] + } + + pub fn qry_at(&self, pos: NucRefGlobalPosition) -> Nuc { + self.qry_seq[pos.as_usize()] + } + + pub fn ref_range(&self, range: &NucRefGlobalRange) -> &[Nuc] { + let range = range.clamp_range(0, self.len()); + &self.ref_seq[range.to_std()] + } + + pub fn qry_range(&self, range: &NucRefGlobalRange) -> &[Nuc] { + let range = range.clamp_range(0, self.len()); + &self.qry_seq[range.to_std()] + } + + pub fn mut_at(&self, pos: NucRefGlobalPosition) -> NucSub { + NucSub { + ref_nuc: self.ref_at(pos), + pos, + qry_nuc: self.qry_at(pos), + } + } + + pub fn is_sequenced(&self, pos: NucRefGlobalPosition) -> bool { + pos >= 0 && self.alignment_range.contains(pos) + } +} diff --git a/packages/nextclade/src/analyze/nuc_sub.rs b/packages/nextclade/src/analyze/nuc_sub.rs index 48fdce9b5..7e3f94d6b 100644 --- a/packages/nextclade/src/analyze/nuc_sub.rs +++ b/packages/nextclade/src/analyze/nuc_sub.rs @@ -50,11 +50,6 @@ impl Pos for NucSub { } impl NucSub { - /// Checks whether this substitution is a deletion (substitution of letter `Gap`) - pub fn is_del(&self) -> bool { - self.qry_nuc.is_gap() - } - pub const fn genotype(&self) -> Genotype { Genotype { pos: self.pos, diff --git a/packages/nextclade/src/run/nextclade_run_one.rs b/packages/nextclade/src/run/nextclade_run_one.rs index 1fd99de25..0ed47f6ba 100644 --- a/packages/nextclade/src/run/nextclade_run_one.rs +++ b/packages/nextclade/src/run/nextclade_run_one.rs @@ -3,7 +3,9 @@ use crate::align::insertions_strip::{get_aa_insertions, insertions_strip, AaIns, use crate::alphabet::aa::Aa; use crate::alphabet::letter::Letter; use crate::alphabet::nuc::Nuc; -use crate::analyze::aa_changes::{find_aa_changes, AaChangesGroup, FindAaChangesOutput}; +use crate::analyze::aa_changes_find::aa_changes_find; +use crate::analyze::aa_changes_find_for_cds::FindAaChangesOutput; +use crate::analyze::aa_changes_group::AaChangesGroup; use crate::analyze::aa_del::AaDel; use crate::analyze::aa_sub::AaSub; use crate::analyze::divergence::calculate_branch_length; @@ -15,6 +17,7 @@ use crate::analyze::letter_composition::get_letter_composition; use crate::analyze::letter_ranges::{ find_aa_letter_ranges, find_letter_ranges, find_letter_ranges_by, CdsAaRange, NucRange, }; +use crate::analyze::nuc_alignment::NucAlignment; use crate::analyze::nuc_changes::{find_nuc_changes, FindNucChangesOutput}; use crate::analyze::nuc_del::NucDelRange; use crate::analyze::pcr_primer_changes::get_pcr_primer_changes; @@ -108,6 +111,8 @@ pub fn nextclade_run_one( alignment_range, } = find_nuc_changes(&stripped.qry_seq, ref_seq); + let aln = NucAlignment::new(ref_seq, &stripped.qry_seq, &alignment_range); + let total_substitutions = substitutions.len(); let total_deletions = deletions.iter().map(NucDelRange::len).sum(); @@ -198,9 +203,8 @@ pub fn nextclade_run_one( aa_substitutions, aa_deletions, nuc_to_aa_muts, - } = find_aa_changes( - ref_seq, - &stripped.qry_seq, + } = aa_changes_find( + &aln, ref_translation, &translation, gene_map, @@ -259,11 +263,11 @@ pub fn nextclade_run_one( let nearest_nodes = params.general.include_nearest_node_info.then_some( nearest_node_candidates - .iter() - // Choose all nodes with distance equal to the distance of the nearest node - .filter(|n| n.distance == nearest_node_candidates[0].distance) - .map(|n| Ok(graph.get_node(n.node_key)?.payload().name.clone())) - .collect::, Report>>()?, + .iter() + // Choose all nodes with distance equal to the distance of the nearest node + .filter(|n| n.distance == nearest_node_candidates[0].distance) + .map(|n| Ok(graph.get_node(n.node_key)?.payload().name.clone())) + .collect::, Report>>()?, ); let clade = nearest_node.clade(); diff --git a/packages/nextclade/src/types/outputs.rs b/packages/nextclade/src/types/outputs.rs index e603e65ae..135bdf4dd 100644 --- a/packages/nextclade/src/types/outputs.rs +++ b/packages/nextclade/src/types/outputs.rs @@ -1,6 +1,6 @@ use crate::align::insertions_strip::{AaIns, Insertion}; use crate::alphabet::nuc::Nuc; -use crate::analyze::aa_changes::AaChangesGroup; +use crate::analyze::aa_changes_group::AaChangesGroup; use crate::analyze::aa_del::AaDel; use crate::analyze::aa_sub::AaSub; use crate::analyze::find_aa_motifs_changes::{AaMotifsChangesMap, AaMotifsMap};