diff --git a/examples/hdbscan.rs b/examples/hdbscan.rs index 6be664d..c3324e2 100644 --- a/examples/hdbscan.rs +++ b/examples/hdbscan.rs @@ -15,7 +15,7 @@ fn main() { let data: Vec = rdr .deserialize() .map(|v| { - let r: Vec = v.expect("corruptted data"); + let r: Vec = v.expect("corrupted data"); if nfeatures < 1 { nfeatures = r.len(); } @@ -41,7 +41,7 @@ fn main() { metric: Euclidean::default(), boruvka: true, }; - let (clusters, outliers) = clustering.fit(&data.view()); + let (clusters, outliers, _outlier_scores) = clustering.fit(&data.view()); println!("========= Report ========="); println!("# of events processed: {}", data.nrows()); println!("# of features provided: {}", data.ncols()); diff --git a/src/hdbscan.rs b/src/hdbscan.rs index a3bede6..05ce276 100644 --- a/src/hdbscan.rs +++ b/src/hdbscan.rs @@ -45,15 +45,19 @@ where } } -impl Fit, (HashMap>, Vec)> for HDbscan +impl Fit, (HashMap>, Vec, Vec)> + for HDbscan where A: AddAssign + DivAssign + FloatCore + FromPrimitive + Sync + Send, S: Data, M: Metric + Clone + Sync + Send, { - fn fit(&mut self, input: &ArrayBase) -> (HashMap>, Vec) { + fn fit( + &mut self, + input: &ArrayBase, + ) -> (HashMap>, Vec, Vec) { if input.is_empty() { - return (HashMap::new(), Vec::new()); + return (HashMap::new(), Vec::new(), Vec::new()); } let input = input.as_standard_layout(); let db = BallTree::new(input.view(), self.metric.clone()).expect("non-empty array"); @@ -87,8 +91,10 @@ where mst.sort_unstable_by(|a, b| a.2.partial_cmp(&(b.2)).expect("invalid distance")); let sorted_mst = Array1::from_vec(mst); let labeled = label(sorted_mst); - let condensed = Array1::from_vec(condense_mst(labeled.view(), self.min_cluster_size)); - find_clusters(&condensed.view()) + let condensed = condense_mst(labeled.view(), self.min_cluster_size); + let outlier_scores = glosh(&condensed, self.min_cluster_size); + let (clusters, outliers) = find_clusters(&Array1::from_vec(condensed).view()); + (clusters, outliers, outlier_scores) } } @@ -387,6 +393,84 @@ fn find_clusters( (res_clusters, outliers) } +// GLOSH: Global-Local Outlier Score from Hierarchies +// Reference: https://dl.acm.org/doi/10.1145/2733381 +// +// Given the following hierarchy (min_cluster_size = 3), +// Root +// / \ +// A ... +// eps_x -> / \ +// x A +// / \ +// y A +// /|\ <- eps_A: A is still a cluster w.r.t. min_cluster_size at this level +// a b c +// +// To compute the outlier score of point x, we need: +// - eps_x: eps that x joins to cluster A (A is the first cluster that x joins to) +// - eps_A: lowest eps that A or any of A's child clusters survives w.r.t. min_cluster_size. +// Then, the outlier score of x is defined as: +// score(x) = 1 - eps_A / eps_x +// +// Since we are working with density lambda values (where lambda = 1/eps): +// lambda_x = 1 / eps_x +// lambda_A = 1 / eps_C +// score(x) = 1 - lambda_x / lambda_C +fn glosh( + condensed_mst: &[(usize, usize, A, usize)], + min_cluster_size: usize, +) -> Vec { + let deaths = max_lambdas(condensed_mst, min_cluster_size); + let num_events = condensed_mst + .iter() + .map(|(_, child, _, _)| *child) + .max() + .map_or(0, |max_child| max_child + 1); + + let mut scores = vec![A::zero(); num_events]; + for (parent, child, lambda, _) in condensed_mst { + if *child >= num_events { + continue; + } + let lambda_max = deaths[*parent]; + if lambda_max == A::zero() { + scores[*child] = A::zero(); + } else { + scores[*child] = (lambda_max - *lambda) / lambda_max; + } + } + scores +} + +// Return the maximum lambda value (min eps) for each cluster C such that +// the cluster or any of its child clusters has at least min_cluster_size points. +fn max_lambdas( + condensed_mst: &[(usize, usize, A, usize)], + min_cluster_size: usize, +) -> Vec { + let largest_parent = condensed_mst + .iter() + .max_by_key(|(parent, _, _, _)| *parent) + .unwrap() + .0; + + // bottom-up traverse the hierarchy to keep track of the maximum lambda values + // (same with the reverse order iteration on the condensed_mst) + let mut parent_sizes: Vec = vec![0; largest_parent + 1]; + let mut deaths_arr: Vec = vec![A::zero(); largest_parent + 1]; + for (parent, child, lambda, child_size) in condensed_mst.iter().rev() { + parent_sizes[*parent] += *child_size; + if parent_sizes[*parent] >= min_cluster_size { + deaths_arr[*parent] = deaths_arr[*parent].max(*lambda); + } + if *child_size >= min_cluster_size { + deaths_arr[*parent] = deaths_arr[*parent].max(deaths_arr[*child]); + } + } + deaths_arr +} + fn bfs_tree(tree: &[(usize, usize)], root: usize) -> Vec { let mut result = vec![]; let mut to_process = HashSet::new(); @@ -936,7 +1020,7 @@ mod test { metric: Euclidean::default(), boruvka: false, }; - let (clusters, outliers) = hdbscan.fit(&data); + let (clusters, outliers, _) = hdbscan.fit(&data); assert_eq!(clusters.len(), 2); assert_eq!( outliers.len(), @@ -967,7 +1051,7 @@ mod test { metric: Euclidean::default(), boruvka: false, }; - let (clusters, outliers) = hdbscan.fit(&data); + let (clusters, outliers, _) = hdbscan.fit(&data); assert_eq!(clusters.len(), 2); assert_eq!( outliers.len(), @@ -1044,6 +1128,69 @@ mod test { assert_eq!(answer, mst); } + #[test] + fn outlier_scores() { + use ndarray::array; + use petal_neighbors::distance::Euclidean; + + use crate::Fit; + + let data = array![ + // cluster1: + [1.0, 1.0], + [1.0, 2.0], + [2.0, 1.0], + [2.0, 2.0], + // cluster2: + [4.0, 1.0], + [4.0, 2.0], + [5.0, 1.0], + [5.0, 2.0], + // cluster3: + [9.0, 1.0], + [9.0, 2.0], + [10.0, 1.0], + [10.0, 2.0], + [11.0, 1.0], + [11.0, 2.0], + // outlier1: + [2.0, 5.0], + // outlier2: + [10.0, 8.0], + ]; + let mut hdbscan = super::HDbscan { + eps: 0.5, + alpha: 1., + min_samples: 4, + min_cluster_size: 4, + metric: Euclidean::default(), + boruvka: false, + }; + let (_, _, outlier_scores) = hdbscan.fit(&data); + + // The first 14 data objects immediately form their clusters at eps = √2 + // The outlier scores of these objects are all 0: + // glosh(x) = 1 - √2 / √2 = 0 + for i in 0..14 { + assert_eq!(outlier_scores[i], 0.0); + } + + // Outlier1 joins the cluster C = {cluster1 ∪ cluster2} at: + // eps_outlier1 = √13 + // The lowest eps that C or any of its child clusters survives w.r.t. min_cluster_size = 4 is: + // eps_C = √2 (due to cluster1 or cluster2) + // Then the outlier score of outlier1 is: + // glosh(outlier1) = 1 - √2 / √13 = 0.60776772972 + assert_eq!(outlier_scores[14], 1.0 - 2.0_f64.sqrt() / 13.0_f64.sqrt()); + + // Outlier2 joins the root cluster at at eps = √37 + // The lowest eps that the root cluster survives w.r.t. min_cluster_size = 4 is: + // eps_root = √2 + // Then the outlier score of outlier2 is: + // glosh(outlier2) = 1 - √2 / √37 = 0.76750472251 + assert_eq!(outlier_scores[15], 1.0 - 2.0_f64.sqrt() / 37.0_f64.sqrt()); + } + #[test] fn tree_union_find() { use succinct::{BitVecMut, BitVector};