Skip to content

Commit

Permalink
fix: ensure loss of aa motif detected when it's the only motif
Browse files Browse the repository at this point in the history
Currently Nextclade fails to detect a lost motif if the motif is the only one in its category. This is due to incorrect iteration of the category keys (names) of motifs.

Here I implemented an iterator to correctly visit both the reference and query motif maps: it will visit the category even if it is entirely missing in one or the other map. Previously, the category missing from one map would not be considered, which caused silent failure to detect the disappearance of a motif.

This was initially reported for the flu h5 datasets - the loss of `polybasic_cleavage_site` motif would not be detected.

I tested by comparing the outputs of smoke tests from master branch and from this branch. The `polybasic_cleavage_site` motif loss is now detected in h5 datasets and there are no other changes.
  • Loading branch information
ivan-aksamentov committed May 8, 2024
1 parent 32cdeb0 commit 4483423
Show file tree
Hide file tree
Showing 4 changed files with 224 additions and 16 deletions.
27 changes: 11 additions & 16 deletions packages/nextclade/src/analyze/find_aa_motifs_changes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@ use crate::analyze::find_aa_motifs::{AaMotif, AaMotifWithoutSeq};
use crate::coord::position::AaRefPosition;
use crate::coord::range::{intersect, Range};
use crate::translate::translate_genes::Translation;
use crate::utils::collections::{cloned_into, zip_map_hashmap};
use crate::utils::collections::cloned_into;
use crate::utils::zip_map::zip_by_key;
use eyre::Report;
use itertools::{Either, Itertools, Zip};
use serde::{Deserialize, Serialize};
use std::collections::{BTreeMap, HashSet};
use std::fmt::Debug;
use std::ops::Deref;

pub type AaMotifsMap = BTreeMap<String, Vec<AaMotif>>;
pub type AaMotifsChangesMap = BTreeMap<String, AaMotifChanges>;
Expand Down Expand Up @@ -40,21 +42,14 @@ pub fn find_aa_motifs_changes(
ref_translation: &Translation,
qry_translation: &Translation,
) -> Result<AaMotifsChangesMap, Report> {
let query_motif_keys = aa_motifs_qry.keys().collect_vec();

// Exclude ref motifs missing in query
let aa_motifs_ref: AaMotifsMap = aa_motifs_ref
.iter()
.filter(|(name, _)| query_motif_keys.contains(name))
.map(|(key, val)| (key.clone(), val.clone()))
.collect();

zip_map_hashmap(&aa_motifs_ref, aa_motifs_qry, |name, motifs_ref, motifs_qry| {
let changes = find_aa_motifs_changes_one(motifs_ref, motifs_qry, ref_translation, qry_translation)?;

Ok((name.clone(), changes))
})
.collect()
zip_by_key(aa_motifs_ref, aa_motifs_qry)
.map(|(name, motifs_ref, motifs_qry)| {
let motifs_ref = motifs_ref.map(Deref::deref).unwrap_or_default();
let motifs_qry = motifs_qry.map(Deref::deref).unwrap_or_default();
let changes = find_aa_motifs_changes_one(motifs_ref, motifs_qry, ref_translation, qry_translation)?;
Ok((name.clone(), changes))
})
.collect()
}

fn find_aa_motifs_changes_one(
Expand Down
63 changes: 63 additions & 0 deletions packages/nextclade/src/utils/map.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
use std::collections::{BTreeMap, HashMap};
use std::hash::Hash;

#[cfg(feature = "indexmap")]
use indexmap::IndexMap;

/// Generic interface for maps: HashMap, BTreeMap, IndexMap
pub trait Map<K, V> {
fn keys<'a>(&'a self) -> impl Iterator<Item = &'a K>
where
K: 'a;

fn get(&self, key: &K) -> Option<&V>;
}

impl<K, V> Map<K, V> for BTreeMap<K, V>
where
K: Ord,
{
fn keys<'a>(&'a self) -> impl Iterator<Item = &'a K>
where
K: 'a,
{
self.keys()
}

fn get(&self, key: &K) -> Option<&V> {
self.get(key)
}
}

impl<K, V> Map<K, V> for HashMap<K, V>
where
K: Eq + Hash,
{
fn keys<'a>(&'a self) -> impl Iterator<Item = &'a K>
where
K: 'a,
{
self.keys()
}

fn get(&self, key: &K) -> Option<&V> {
self.get(key)
}
}

#[cfg(feature = "indexmap")]
impl<K, V> Map<K, V> for IndexMap<K, V>
where
K: Eq + Hash,
{
fn keys<'a>(&'a self) -> impl Iterator<Item = &'a K>
where
K: 'a,
{
self.keys()
}

fn get(&self, key: &K) -> Option<&V> {
IndexMap::get(self, key)
}
}
2 changes: 2 additions & 0 deletions packages/nextclade/src/utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@ pub mod fs;
pub mod getenv;
pub mod global_init;
pub mod info;
pub mod map;
pub mod num;
pub mod option;
pub mod string;
pub mod vec2d;
pub mod wraparound;
pub mod zip_map;
148 changes: 148 additions & 0 deletions packages/nextclade/src/utils/zip_map.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
use crate::utils::map::Map;
use itertools::{chain, Itertools};
use std::fmt::Debug;
use std::hash::Hash;
use std::marker::PhantomData;

/// Iterate a pair of maps synchronized by keys
pub fn zip_by_key<'a, M1, M2, K, V1, V2>(map1: &'a M1, map2: &'a M2) -> ZipByKeyIter<'a, K, V1, V2, M1, M2>
where
K: Ord + Debug + Hash + 'a,
V1: 'a,
V2: 'a,
M1: Map<K, V1>,
M2: Map<K, V2>,
{
ZipByKeyIter::new(map1, map2)
}

/// Iterator for a pair of maps, synchronized by keys
pub struct ZipByKeyIter<'a, K, V1, V2, M1: Map<K, V1>, M2: Map<K, V2>> {
keys: Vec<&'a K>,
map1: &'a M1,
map2: &'a M2,
index: usize,
_phantom: PhantomData<(V1, V2)>,
}

impl<'a, K, V1, V2, M1, M2> ZipByKeyIter<'a, K, V1, V2, M1, M2>
where
K: Ord + Debug + Hash + 'a,
V1: 'a,
V2: 'a,
M1: Map<K, V1>,
M2: Map<K, V2>,
{
pub fn new(map1: &'a M1, map2: &'a M2) -> Self {
let keys = chain!(map1.keys(), map2.keys()).unique().sorted().collect_vec();
ZipByKeyIter {
keys,
map1,
map2,
index: 0,
_phantom: PhantomData,
}
}
}

impl<'a, K, V1, V2, M1, M2> Iterator for ZipByKeyIter<'a, K, V1, V2, M1, M2>
where
K: Ord + 'a,
V1: 'a,
V2: 'a,
M1: Map<K, V1>,
M2: Map<K, V2>,
{
type Item = (&'a K, Option<&'a V1>, Option<&'a V2>);

fn next(&mut self) -> Option<Self::Item> {
if self.index >= self.keys.len() {
return None;
}
let key = self.keys[self.index];
let value1 = self.map1.get(key);
let value2 = self.map2.get(key);
self.index += 1;
Some((key, value1, value2))
}
}

#[cfg(test)]
mod tests {
use super::*;
use maplit::{btreemap, hashmap};
use pretty_assertions::assert_eq;

#[cfg(feature = "indexmap")]
use indexmap::indexmap;

#[test]
fn test_zip_by_key_hashmap() {
let map1 = hashmap! {
'a' => "alpha",
'b' => "bravo",
'c' => "charlie",
};

let map2 = hashmap! {
'b' => "beans",
'c' => "carrots",
'd' => "dill",
};

let mut iter = zip_by_key(&map1, &map2);

assert_eq!(iter.next(), Some((&'a', Some(&"alpha"), None)));
assert_eq!(iter.next(), Some((&'b', Some(&"bravo"), Some(&"beans"))));
assert_eq!(iter.next(), Some((&'c', Some(&"charlie"), Some(&"carrots"))));
assert_eq!(iter.next(), Some((&'d', None, Some(&"dill"))));
assert_eq!(iter.next(), None);
}

#[test]
fn test_zip_by_key_btreemap() {
let map1 = btreemap! {
'a' => "alpha",
'b' => "bravo",
'c' => "charlie",
};

let map2 = btreemap! {
'b' => "beans",
'c' => "carrots",
'd' => "dill",
};

let mut iter = zip_by_key(&map1, &map2);

assert_eq!(iter.next(), Some((&'a', Some(&"alpha"), None)));
assert_eq!(iter.next(), Some((&'b', Some(&"bravo"), Some(&"beans"))));
assert_eq!(iter.next(), Some((&'c', Some(&"charlie"), Some(&"carrots"))));
assert_eq!(iter.next(), Some((&'d', None, Some(&"dill"))));
assert_eq!(iter.next(), None);
}

#[cfg(feature = "indexmap")]
#[test]
fn test_zip_by_key_indexmap() {
let map1 = indexmap! {
'a' => "alpha",
'b' => "bravo",
'c' => "charlie",
};

let map2 = indexmap! {
'b' => "beans",
'c' => "carrots",
'd' => "dill",
};

let mut iter = zip_by_key(&map1, &map2);

assert_eq!(iter.next(), Some((&'a', Some(&"alpha"), None)));
assert_eq!(iter.next(), Some((&'b', Some(&"bravo"), Some(&"beans"))));
assert_eq!(iter.next(), Some((&'c', Some(&"charlie"), Some(&"carrots"))));
assert_eq!(iter.next(), Some((&'d', None, Some(&"dill"))));
assert_eq!(iter.next(), None);
}
}

0 comments on commit 4483423

Please sign in to comment.