From 4483423eff8f1cf5714c94cb9a51ece5184939fc Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Wed, 8 May 2024 18:04:43 +0200 Subject: [PATCH] fix: ensure loss of aa motif detected when it's the only motif Currently Nextclade fails to detect a lost motif if the motif is the only one in its category. This is due to incorrect iteration of the category keys (names) of motifs. Here I implemented an iterator to correctly visit both the reference and query motif maps: it will visit the category even if it is entirely missing in one or the other map. Previously, the category missing from one map would not be considered, which caused silent failure to detect the disappearance of a motif. This was initially reported for the flu h5 datasets - the loss of `polybasic_cleavage_site` motif would not be detected. I tested by comparing the outputs of smoke tests from master branch and from this branch. The `polybasic_cleavage_site` motif loss is now detected in h5 datasets and there are no other changes. --- .../src/analyze/find_aa_motifs_changes.rs | 27 ++-- packages/nextclade/src/utils/map.rs | 63 ++++++++ packages/nextclade/src/utils/mod.rs | 2 + packages/nextclade/src/utils/zip_map.rs | 148 ++++++++++++++++++ 4 files changed, 224 insertions(+), 16 deletions(-) create mode 100644 packages/nextclade/src/utils/map.rs create mode 100644 packages/nextclade/src/utils/zip_map.rs diff --git a/packages/nextclade/src/analyze/find_aa_motifs_changes.rs b/packages/nextclade/src/analyze/find_aa_motifs_changes.rs index 9f4b5b6e6..dfb93d501 100644 --- a/packages/nextclade/src/analyze/find_aa_motifs_changes.rs +++ b/packages/nextclade/src/analyze/find_aa_motifs_changes.rs @@ -3,12 +3,14 @@ use crate::analyze::find_aa_motifs::{AaMotif, AaMotifWithoutSeq}; use crate::coord::position::AaRefPosition; use crate::coord::range::{intersect, Range}; use crate::translate::translate_genes::Translation; -use crate::utils::collections::{cloned_into, zip_map_hashmap}; +use crate::utils::collections::cloned_into; +use crate::utils::zip_map::zip_by_key; use eyre::Report; use itertools::{Either, Itertools, Zip}; use serde::{Deserialize, Serialize}; use std::collections::{BTreeMap, HashSet}; use std::fmt::Debug; +use std::ops::Deref; pub type AaMotifsMap = BTreeMap>; pub type AaMotifsChangesMap = BTreeMap; @@ -40,21 +42,14 @@ pub fn find_aa_motifs_changes( ref_translation: &Translation, qry_translation: &Translation, ) -> Result { - let query_motif_keys = aa_motifs_qry.keys().collect_vec(); - - // Exclude ref motifs missing in query - let aa_motifs_ref: AaMotifsMap = aa_motifs_ref - .iter() - .filter(|(name, _)| query_motif_keys.contains(name)) - .map(|(key, val)| (key.clone(), val.clone())) - .collect(); - - zip_map_hashmap(&aa_motifs_ref, aa_motifs_qry, |name, motifs_ref, motifs_qry| { - let changes = find_aa_motifs_changes_one(motifs_ref, motifs_qry, ref_translation, qry_translation)?; - - Ok((name.clone(), changes)) - }) - .collect() + zip_by_key(aa_motifs_ref, aa_motifs_qry) + .map(|(name, motifs_ref, motifs_qry)| { + let motifs_ref = motifs_ref.map(Deref::deref).unwrap_or_default(); + let motifs_qry = motifs_qry.map(Deref::deref).unwrap_or_default(); + let changes = find_aa_motifs_changes_one(motifs_ref, motifs_qry, ref_translation, qry_translation)?; + Ok((name.clone(), changes)) + }) + .collect() } fn find_aa_motifs_changes_one( diff --git a/packages/nextclade/src/utils/map.rs b/packages/nextclade/src/utils/map.rs new file mode 100644 index 000000000..ec45f57a9 --- /dev/null +++ b/packages/nextclade/src/utils/map.rs @@ -0,0 +1,63 @@ +use std::collections::{BTreeMap, HashMap}; +use std::hash::Hash; + +#[cfg(feature = "indexmap")] +use indexmap::IndexMap; + +/// Generic interface for maps: HashMap, BTreeMap, IndexMap +pub trait Map { + fn keys<'a>(&'a self) -> impl Iterator + where + K: 'a; + + fn get(&self, key: &K) -> Option<&V>; +} + +impl Map for BTreeMap +where + K: Ord, +{ + fn keys<'a>(&'a self) -> impl Iterator + where + K: 'a, + { + self.keys() + } + + fn get(&self, key: &K) -> Option<&V> { + self.get(key) + } +} + +impl Map for HashMap +where + K: Eq + Hash, +{ + fn keys<'a>(&'a self) -> impl Iterator + where + K: 'a, + { + self.keys() + } + + fn get(&self, key: &K) -> Option<&V> { + self.get(key) + } +} + +#[cfg(feature = "indexmap")] +impl Map for IndexMap +where + K: Eq + Hash, +{ + fn keys<'a>(&'a self) -> impl Iterator + where + K: 'a, + { + self.keys() + } + + fn get(&self, key: &K) -> Option<&V> { + IndexMap::get(self, key) + } +} diff --git a/packages/nextclade/src/utils/mod.rs b/packages/nextclade/src/utils/mod.rs index 6e8c4909e..1b6f01f18 100644 --- a/packages/nextclade/src/utils/mod.rs +++ b/packages/nextclade/src/utils/mod.rs @@ -7,8 +7,10 @@ pub mod fs; pub mod getenv; pub mod global_init; pub mod info; +pub mod map; pub mod num; pub mod option; pub mod string; pub mod vec2d; pub mod wraparound; +pub mod zip_map; diff --git a/packages/nextclade/src/utils/zip_map.rs b/packages/nextclade/src/utils/zip_map.rs new file mode 100644 index 000000000..126bca188 --- /dev/null +++ b/packages/nextclade/src/utils/zip_map.rs @@ -0,0 +1,148 @@ +use crate::utils::map::Map; +use itertools::{chain, Itertools}; +use std::fmt::Debug; +use std::hash::Hash; +use std::marker::PhantomData; + +/// Iterate a pair of maps synchronized by keys +pub fn zip_by_key<'a, M1, M2, K, V1, V2>(map1: &'a M1, map2: &'a M2) -> ZipByKeyIter<'a, K, V1, V2, M1, M2> +where + K: Ord + Debug + Hash + 'a, + V1: 'a, + V2: 'a, + M1: Map, + M2: Map, +{ + ZipByKeyIter::new(map1, map2) +} + +/// Iterator for a pair of maps, synchronized by keys +pub struct ZipByKeyIter<'a, K, V1, V2, M1: Map, M2: Map> { + keys: Vec<&'a K>, + map1: &'a M1, + map2: &'a M2, + index: usize, + _phantom: PhantomData<(V1, V2)>, +} + +impl<'a, K, V1, V2, M1, M2> ZipByKeyIter<'a, K, V1, V2, M1, M2> +where + K: Ord + Debug + Hash + 'a, + V1: 'a, + V2: 'a, + M1: Map, + M2: Map, +{ + pub fn new(map1: &'a M1, map2: &'a M2) -> Self { + let keys = chain!(map1.keys(), map2.keys()).unique().sorted().collect_vec(); + ZipByKeyIter { + keys, + map1, + map2, + index: 0, + _phantom: PhantomData, + } + } +} + +impl<'a, K, V1, V2, M1, M2> Iterator for ZipByKeyIter<'a, K, V1, V2, M1, M2> +where + K: Ord + 'a, + V1: 'a, + V2: 'a, + M1: Map, + M2: Map, +{ + type Item = (&'a K, Option<&'a V1>, Option<&'a V2>); + + fn next(&mut self) -> Option { + if self.index >= self.keys.len() { + return None; + } + let key = self.keys[self.index]; + let value1 = self.map1.get(key); + let value2 = self.map2.get(key); + self.index += 1; + Some((key, value1, value2)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use maplit::{btreemap, hashmap}; + use pretty_assertions::assert_eq; + + #[cfg(feature = "indexmap")] + use indexmap::indexmap; + + #[test] + fn test_zip_by_key_hashmap() { + let map1 = hashmap! { + 'a' => "alpha", + 'b' => "bravo", + 'c' => "charlie", + }; + + let map2 = hashmap! { + 'b' => "beans", + 'c' => "carrots", + 'd' => "dill", + }; + + let mut iter = zip_by_key(&map1, &map2); + + assert_eq!(iter.next(), Some((&'a', Some(&"alpha"), None))); + assert_eq!(iter.next(), Some((&'b', Some(&"bravo"), Some(&"beans")))); + assert_eq!(iter.next(), Some((&'c', Some(&"charlie"), Some(&"carrots")))); + assert_eq!(iter.next(), Some((&'d', None, Some(&"dill")))); + assert_eq!(iter.next(), None); + } + + #[test] + fn test_zip_by_key_btreemap() { + let map1 = btreemap! { + 'a' => "alpha", + 'b' => "bravo", + 'c' => "charlie", + }; + + let map2 = btreemap! { + 'b' => "beans", + 'c' => "carrots", + 'd' => "dill", + }; + + let mut iter = zip_by_key(&map1, &map2); + + assert_eq!(iter.next(), Some((&'a', Some(&"alpha"), None))); + assert_eq!(iter.next(), Some((&'b', Some(&"bravo"), Some(&"beans")))); + assert_eq!(iter.next(), Some((&'c', Some(&"charlie"), Some(&"carrots")))); + assert_eq!(iter.next(), Some((&'d', None, Some(&"dill")))); + assert_eq!(iter.next(), None); + } + + #[cfg(feature = "indexmap")] + #[test] + fn test_zip_by_key_indexmap() { + let map1 = indexmap! { + 'a' => "alpha", + 'b' => "bravo", + 'c' => "charlie", + }; + + let map2 = indexmap! { + 'b' => "beans", + 'c' => "carrots", + 'd' => "dill", + }; + + let mut iter = zip_by_key(&map1, &map2); + + assert_eq!(iter.next(), Some((&'a', Some(&"alpha"), None))); + assert_eq!(iter.next(), Some((&'b', Some(&"bravo"), Some(&"beans")))); + assert_eq!(iter.next(), Some((&'c', Some(&"charlie"), Some(&"carrots")))); + assert_eq!(iter.next(), Some((&'d', None, Some(&"dill")))); + assert_eq!(iter.next(), None); + } +}