Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Outlier scores for HDBSCAN #73

Merged
merged 11 commits into from
Nov 29, 2024
4 changes: 2 additions & 2 deletions examples/hdbscan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ fn main() {
let data: Vec<f64> = rdr
.deserialize()
.map(|v| {
let r: Vec<f64> = v.expect("corruptted data");
let r: Vec<f64> = v.expect("corrupted data");
if nfeatures < 1 {
nfeatures = r.len();
}
Expand All @@ -41,7 +41,7 @@ fn main() {
metric: Euclidean::default(),
boruvka: true,
};
let (clusters, outliers) = clustering.fit(&data.view());
let (clusters, outliers, _outlier_scores) = clustering.fit(&data.view());
azizkayumov marked this conversation as resolved.
Show resolved Hide resolved
println!("========= Report =========");
println!("# of events processed: {}", data.nrows());
println!("# of features provided: {}", data.ncols());
Expand Down
161 changes: 154 additions & 7 deletions src/hdbscan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,19 @@ where
}
}

impl<S, A, M> Fit<ArrayBase<S, Ix2>, (HashMap<usize, Vec<usize>>, Vec<usize>)> for HDbscan<A, M>
impl<S, A, M> Fit<ArrayBase<S, Ix2>, (HashMap<usize, Vec<usize>>, Vec<usize>, Vec<A>)>
for HDbscan<A, M>
where
A: AddAssign + DivAssign + FloatCore + FromPrimitive + Sync + Send,
S: Data<Elem = A>,
M: Metric<A> + Clone + Sync + Send,
{
fn fit(&mut self, input: &ArrayBase<S, Ix2>) -> (HashMap<usize, Vec<usize>>, Vec<usize>) {
fn fit(
&mut self,
input: &ArrayBase<S, Ix2>,
) -> (HashMap<usize, Vec<usize>>, Vec<usize>, Vec<A>) {
if input.is_empty() {
return (HashMap::new(), Vec::new());
return (HashMap::new(), Vec::new(), Vec::new());
}
let input = input.as_standard_layout();
let db = BallTree::new(input.view(), self.metric.clone()).expect("non-empty array");
Expand Down Expand Up @@ -87,8 +91,10 @@ where
mst.sort_unstable_by(|a, b| a.2.partial_cmp(&(b.2)).expect("invalid distance"));
let sorted_mst = Array1::from_vec(mst);
let labeled = label(sorted_mst);
let condensed = Array1::from_vec(condense_mst(labeled.view(), self.min_cluster_size));
find_clusters(&condensed.view())
let condensed = condense_mst(labeled.view(), self.min_cluster_size);
let outlier_scores = glosh(&condensed, self.min_cluster_size);
azizkayumov marked this conversation as resolved.
Show resolved Hide resolved
let (clusters, outliers) = find_clusters(&Array1::from_vec(condensed).view());
(clusters, outliers, outlier_scores)
Comment on lines +94 to +97
Copy link

@coderabbitai coderabbitai bot Oct 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Optimize conversion of condensed data structure

In the line let (clusters, outliers) = find_clusters(&Array1::from_vec(condensed).view());, consider avoiding the conversion from Vec<(usize, usize, A, usize)> to Array1 if not necessary. Operating directly on the Vec could improve performance by reducing unnecessary allocations.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This suggestion requires a change in the parameter types of find_clusters, which is not the scope of this PR.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@azizkayumov, understood. Thank you for clarifying.

}
}

Expand Down Expand Up @@ -387,6 +393,84 @@ fn find_clusters<A: FloatCore + FromPrimitive + AddAssign + Sub>(
(res_clusters, outliers)
}

// GLOSH: Global-Local Outlier Score from Hierarchies
// Reference: https://dl.acm.org/doi/10.1145/2733381
//
// Given the following hierarchy (min_cluster_size = 3),
// Root
// / \
// A ...
// eps_x -> / \
// x A
// / \
// y A
// /|\ <- eps_A: A is still a cluster w.r.t. min_cluster_size at this level
// a b c
//
// To compute the outlier score of point x, we need:
// - eps_x: eps that x joins to cluster A (A is the first cluster that x joins to)
// - eps_A: lowest eps that A or any of A's child clusters survives w.r.t. min_cluster_size.
// Then, the outlier score of x is defined as:
// score(x) = 1 - eps_A / eps_x
//
// Since we are working with density lambda values (where lambda = 1/eps):
// lambda_x = 1 / eps_x
// lambda_A = 1 / eps_C
// score(x) = 1 - lambda_x / lambda_C
fn glosh<A: FloatCore>(
condensed_mst: &[(usize, usize, A, usize)],
min_cluster_size: usize,
) -> Vec<A> {
let deaths = max_lambdas(condensed_mst, min_cluster_size);
let num_events = condensed_mst
.iter()
.map(|(_, child, _, _)| *child)
.max()
.map_or(0, |max_child| max_child + 1);

let mut scores = vec![A::zero(); num_events];
for (parent, child, lambda, _) in condensed_mst {
if *child >= num_events {
continue;
}
let lambda_max = deaths[*parent];
if lambda_max == A::zero() {
scores[*child] = A::zero();
} else {
scores[*child] = (lambda_max - *lambda) / lambda_max;
}
}
scores
}

// Return the maximum lambda value (min eps) for each cluster C such that
// the cluster or any of its child clusters has at least min_cluster_size points.
fn max_lambdas<A: FloatCore>(
condensed_mst: &[(usize, usize, A, usize)],
min_cluster_size: usize,
) -> Vec<A> {
let largest_parent = condensed_mst
.iter()
.max_by_key(|(parent, _, _, _)| *parent)
.unwrap()
.0;

// bottom-up traverse the hierarchy to keep track of the maximum lambda values
// (same with the reverse order iteration on the condensed_mst)
let mut parent_sizes: Vec<usize> = vec![0; largest_parent + 1];
let mut deaths_arr: Vec<A> = vec![A::zero(); largest_parent + 1];
for (parent, child, lambda, child_size) in condensed_mst.iter().rev() {
parent_sizes[*parent] += *child_size;
if parent_sizes[*parent] >= min_cluster_size {
deaths_arr[*parent] = deaths_arr[*parent].max(*lambda);
}
if *child_size >= min_cluster_size {
deaths_arr[*parent] = deaths_arr[*parent].max(deaths_arr[*child]);
}
}
deaths_arr
}

fn bfs_tree(tree: &[(usize, usize)], root: usize) -> Vec<usize> {
let mut result = vec![];
let mut to_process = HashSet::new();
Expand Down Expand Up @@ -936,7 +1020,7 @@ mod test {
metric: Euclidean::default(),
boruvka: false,
};
let (clusters, outliers) = hdbscan.fit(&data);
let (clusters, outliers, _) = hdbscan.fit(&data);
assert_eq!(clusters.len(), 2);
assert_eq!(
outliers.len(),
Expand Down Expand Up @@ -967,7 +1051,7 @@ mod test {
metric: Euclidean::default(),
boruvka: false,
};
let (clusters, outliers) = hdbscan.fit(&data);
let (clusters, outliers, _) = hdbscan.fit(&data);
assert_eq!(clusters.len(), 2);
assert_eq!(
outliers.len(),
Expand Down Expand Up @@ -1044,6 +1128,69 @@ mod test {
assert_eq!(answer, mst);
}

#[test]
fn outlier_scores() {
use ndarray::array;
use petal_neighbors::distance::Euclidean;

use crate::Fit;

let data = array![
// cluster1:
[1.0, 1.0],
[1.0, 2.0],
[2.0, 1.0],
[2.0, 2.0],
// cluster2:
[4.0, 1.0],
[4.0, 2.0],
[5.0, 1.0],
[5.0, 2.0],
// cluster3:
[9.0, 1.0],
[9.0, 2.0],
[10.0, 1.0],
[10.0, 2.0],
[11.0, 1.0],
[11.0, 2.0],
// outlier1:
[2.0, 5.0],
// outlier2:
[10.0, 8.0],
];
let mut hdbscan = super::HDbscan {
eps: 0.5,
alpha: 1.,
min_samples: 4,
min_cluster_size: 4,
metric: Euclidean::default(),
boruvka: false,
};
let (_, _, outlier_scores) = hdbscan.fit(&data);

// The first 14 data objects immediately form their clusters at eps = √2
// The outlier scores of these objects are all 0:
// glosh(x) = 1 - √2 / √2 = 0
for i in 0..14 {
assert_eq!(outlier_scores[i], 0.0);
}

// Outlier1 joins the cluster C = {cluster1 ∪ cluster2} at:
// eps_outlier1 = √13
// The lowest eps that C or any of its child clusters survives w.r.t. min_cluster_size = 4 is:
// eps_C = √2 (due to cluster1 or cluster2)
// Then the outlier score of outlier1 is:
// glosh(outlier1) = 1 - √2 / √13 = 0.60776772972
assert_eq!(outlier_scores[14], 1.0 - 2.0_f64.sqrt() / 13.0_f64.sqrt());

// Outlier2 joins the root cluster at at eps = √37
// The lowest eps that the root cluster survives w.r.t. min_cluster_size = 4 is:
// eps_root = √2
// Then the outlier score of outlier2 is:
// glosh(outlier2) = 1 - √2 / √37 = 0.76750472251
assert_eq!(outlier_scores[15], 1.0 - 2.0_f64.sqrt() / 37.0_f64.sqrt());
}

#[test]
fn tree_union_find() {
use succinct::{BitVecMut, BitVector};
Expand Down