fix: ensure loss of aa motif detected when it's the only motif

Currently Nextclade fails to detect a lost motif if the motif is the only one in its category. This is due to incorrect iteration of the category keys (names) of motifs. Here I implemented an iterator to correctly visit both the reference and query motif maps: it will visit the category even if it is entirely missing in one or the other map. Previously, the category missing from one map would not be considered, which caused silent failure to detect the disappearance of a motif. This was initially reported for the flu h5 datasets - the loss of `polybasic_cleavage_site` motif would not be detected. I tested by comparing the outputs of smoke tests from master branch and from this branch. The `polybasic_cleavage_site` motif loss is now detected in h5 datasets and there are no other changes.
nextstrain · May 8, 2024 · 4483423 · 4483423
1 parent 32cdeb0
commit 4483423
Show file tree

Hide file tree

Showing 4 changed files with 224 additions and 16 deletions.
diff --git a/packages/nextclade/src/analyze/find_aa_motifs_changes.rs b/packages/nextclade/src/analyze/find_aa_motifs_changes.rs
@@ -3,12 +3,14 @@ use crate::analyze::find_aa_motifs::{AaMotif, AaMotifWithoutSeq};
 use crate::coord::position::AaRefPosition;
 use crate::coord::range::{intersect, Range};
 use crate::translate::translate_genes::Translation;
-use crate::utils::collections::{cloned_into, zip_map_hashmap};
+use crate::utils::collections::cloned_into;
+use crate::utils::zip_map::zip_by_key;
 use eyre::Report;
 use itertools::{Either, Itertools, Zip};
 use serde::{Deserialize, Serialize};
 use std::collections::{BTreeMap, HashSet};
 use std::fmt::Debug;
+use std::ops::Deref;
 
 pub type AaMotifsMap = BTreeMap<String, Vec<AaMotif>>;
 pub type AaMotifsChangesMap = BTreeMap<String, AaMotifChanges>;
@@ -40,21 +42,14 @@ pub fn find_aa_motifs_changes(
   ref_translation: &Translation,
   qry_translation: &Translation,
 ) -> Result<AaMotifsChangesMap, Report> {
-  let query_motif_keys = aa_motifs_qry.keys().collect_vec();
-
-  // Exclude ref motifs missing in query
-  let aa_motifs_ref: AaMotifsMap = aa_motifs_ref
-    .iter()
-    .filter(|(name, _)| query_motif_keys.contains(name))
-    .map(|(key, val)| (key.clone(), val.clone()))
-    .collect();
-
-  zip_map_hashmap(&aa_motifs_ref, aa_motifs_qry, |name, motifs_ref, motifs_qry| {
-    let changes = find_aa_motifs_changes_one(motifs_ref, motifs_qry, ref_translation, qry_translation)?;
-
-    Ok((name.clone(), changes))
-  })
-  .collect()
+  zip_by_key(aa_motifs_ref, aa_motifs_qry)
+    .map(|(name, motifs_ref, motifs_qry)| {
+      let motifs_ref = motifs_ref.map(Deref::deref).unwrap_or_default();
+      let motifs_qry = motifs_qry.map(Deref::deref).unwrap_or_default();
+      let changes = find_aa_motifs_changes_one(motifs_ref, motifs_qry, ref_translation, qry_translation)?;
+      Ok((name.clone(), changes))
+    })
+    .collect()
 }
 
 fn find_aa_motifs_changes_one(

diff --git a/packages/nextclade/src/utils/map.rs b/packages/nextclade/src/utils/map.rs
@@ -0,0 +1,63 @@
+use std::collections::{BTreeMap, HashMap};
+use std::hash::Hash;
+
+#[cfg(feature = "indexmap")]
+use indexmap::IndexMap;
+
+/// Generic interface for maps: HashMap, BTreeMap, IndexMap
+pub trait Map<K, V> {
+  fn keys<'a>(&'a self) -> impl Iterator<Item = &'a K>
+  where
+    K: 'a;
+
+  fn get(&self, key: &K) -> Option<&V>;
+}
+
+impl<K, V> Map<K, V> for BTreeMap<K, V>
+where
+  K: Ord,
+{
+  fn keys<'a>(&'a self) -> impl Iterator<Item = &'a K>
+  where
+    K: 'a,
+  {
+    self.keys()
+  }
+
+  fn get(&self, key: &K) -> Option<&V> {
+    self.get(key)
+  }
+}
+
+impl<K, V> Map<K, V> for HashMap<K, V>
+where
+  K: Eq + Hash,
+{
+  fn keys<'a>(&'a self) -> impl Iterator<Item = &'a K>
+  where
+    K: 'a,
+  {
+    self.keys()
+  }
+
+  fn get(&self, key: &K) -> Option<&V> {
+    self.get(key)
+  }
+}
+
+#[cfg(feature = "indexmap")]
+impl<K, V> Map<K, V> for IndexMap<K, V>
+where
+  K: Eq + Hash,
+{
+  fn keys<'a>(&'a self) -> impl Iterator<Item = &'a K>
+  where
+    K: 'a,
+  {
+    self.keys()
+  }
+
+  fn get(&self, key: &K) -> Option<&V> {
+    IndexMap::get(self, key)
+  }
+}
diff --git a/packages/nextclade/src/utils/mod.rs b/packages/nextclade/src/utils/mod.rs
@@ -7,8 +7,10 @@ pub mod fs;
 pub mod getenv;
 pub mod global_init;
 pub mod info;
+pub mod map;
 pub mod num;
 pub mod option;
 pub mod string;
 pub mod vec2d;
 pub mod wraparound;
+pub mod zip_map;
diff --git a/packages/nextclade/src/utils/zip_map.rs b/packages/nextclade/src/utils/zip_map.rs
@@ -0,0 +1,148 @@
+use crate::utils::map::Map;
+use itertools::{chain, Itertools};
+use std::fmt::Debug;
+use std::hash::Hash;
+use std::marker::PhantomData;
+
+/// Iterate a pair of maps synchronized by keys
+pub fn zip_by_key<'a, M1, M2, K, V1, V2>(map1: &'a M1, map2: &'a M2) -> ZipByKeyIter<'a, K, V1, V2, M1, M2>
+where
+  K: Ord + Debug + Hash + 'a,
+  V1: 'a,
+  V2: 'a,
+  M1: Map<K, V1>,
+  M2: Map<K, V2>,
+{
+  ZipByKeyIter::new(map1, map2)
+}
+
+/// Iterator for a pair of maps, synchronized by keys
+pub struct ZipByKeyIter<'a, K, V1, V2, M1: Map<K, V1>, M2: Map<K, V2>> {
+  keys: Vec<&'a K>,
+  map1: &'a M1,
+  map2: &'a M2,
+  index: usize,
+  _phantom: PhantomData<(V1, V2)>,
+}
+
+impl<'a, K, V1, V2, M1, M2> ZipByKeyIter<'a, K, V1, V2, M1, M2>
+where
+  K: Ord + Debug + Hash + 'a,
+  V1: 'a,
+  V2: 'a,
+  M1: Map<K, V1>,
+  M2: Map<K, V2>,
+{
+  pub fn new(map1: &'a M1, map2: &'a M2) -> Self {
+    let keys = chain!(map1.keys(), map2.keys()).unique().sorted().collect_vec();
+    ZipByKeyIter {
+      keys,
+      map1,
+      map2,
+      index: 0,
+      _phantom: PhantomData,
+    }
+  }
+}
+
+impl<'a, K, V1, V2, M1, M2> Iterator for ZipByKeyIter<'a, K, V1, V2, M1, M2>
+where
+  K: Ord + 'a,
+  V1: 'a,
+  V2: 'a,
+  M1: Map<K, V1>,
+  M2: Map<K, V2>,
+{
+  type Item = (&'a K, Option<&'a V1>, Option<&'a V2>);
+
+  fn next(&mut self) -> Option<Self::Item> {
+    if self.index >= self.keys.len() {
+      return None;
+    }
+    let key = self.keys[self.index];
+    let value1 = self.map1.get(key);
+    let value2 = self.map2.get(key);
+    self.index += 1;
+    Some((key, value1, value2))
+  }
+}
+
+#[cfg(test)]
+mod tests {
+  use super::*;
+  use maplit::{btreemap, hashmap};
+  use pretty_assertions::assert_eq;
+
+  #[cfg(feature = "indexmap")]
+  use indexmap::indexmap;
+
+  #[test]
+  fn test_zip_by_key_hashmap() {
+    let map1 = hashmap! {
+        'a' => "alpha",
+        'b' => "bravo",
+        'c' => "charlie",
+    };
+
+    let map2 = hashmap! {
+        'b' => "beans",
+        'c' => "carrots",
+        'd' => "dill",
+    };
+
+    let mut iter = zip_by_key(&map1, &map2);
+
+    assert_eq!(iter.next(), Some((&'a', Some(&"alpha"), None)));
+    assert_eq!(iter.next(), Some((&'b', Some(&"bravo"), Some(&"beans"))));
+    assert_eq!(iter.next(), Some((&'c', Some(&"charlie"), Some(&"carrots"))));
+    assert_eq!(iter.next(), Some((&'d', None, Some(&"dill"))));
+    assert_eq!(iter.next(), None);
+  }
+
+  #[test]
+  fn test_zip_by_key_btreemap() {
+    let map1 = btreemap! {
+        'a' => "alpha",
+        'b' => "bravo",
+        'c' => "charlie",
+    };
+
+    let map2 = btreemap! {
+        'b' => "beans",
+        'c' => "carrots",
+        'd' => "dill",
+    };
+
+    let mut iter = zip_by_key(&map1, &map2);
+
+    assert_eq!(iter.next(), Some((&'a', Some(&"alpha"), None)));
+    assert_eq!(iter.next(), Some((&'b', Some(&"bravo"), Some(&"beans"))));
+    assert_eq!(iter.next(), Some((&'c', Some(&"charlie"), Some(&"carrots"))));
+    assert_eq!(iter.next(), Some((&'d', None, Some(&"dill"))));
+    assert_eq!(iter.next(), None);
+  }
+
+  #[cfg(feature = "indexmap")]
+  #[test]
+  fn test_zip_by_key_indexmap() {
+    let map1 = indexmap! {
+        'a' => "alpha",
+        'b' => "bravo",
+        'c' => "charlie",
+    };
+
+    let map2 = indexmap! {
+        'b' => "beans",
+        'c' => "carrots",
+        'd' => "dill",
+    };
+
+    let mut iter = zip_by_key(&map1, &map2);
+
+    assert_eq!(iter.next(), Some((&'a', Some(&"alpha"), None)));
+    assert_eq!(iter.next(), Some((&'b', Some(&"bravo"), Some(&"beans"))));
+    assert_eq!(iter.next(), Some((&'c', Some(&"charlie"), Some(&"carrots"))));
+    assert_eq!(iter.next(), Some((&'d', None, Some(&"dill"))));
+    assert_eq!(iter.next(), None);
+  }
+}