Skip to content
This repository was archived by the owner on Jan 7, 2025. It is now read-only.

Commit 7bb9bee

Browse files
authored
chore: move HyperLogLog generics to method (#112)
As title.
1 parent 066ae17 commit 7bb9bee

File tree

1 file changed

+15
-23
lines changed

1 file changed

+15
-23
lines changed

optd-gungnir/src/stats/hyperloglog.rs

+15-23
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
//! number of collisions and eliminate the need for a large range correction estimator.
77
88
use crate::stats::murmur2::murmur_hash;
9-
use std::{cmp::max, marker::PhantomData};
9+
use std::cmp::max;
1010

1111
/// Trait to transform any object into a stream of bytes.
1212
pub trait ByteSerializable {
@@ -15,13 +15,11 @@ pub trait ByteSerializable {
1515

1616
/// The HyperLogLog (HLL) structure to provide a statistical estimate of NDistinct.
1717
/// For safety reasons, HLLs can only count elements of the same ByteSerializable type.
18-
pub struct HyperLogLog<T: ByteSerializable> {
18+
pub struct HyperLogLog {
1919
registers: Vec<u8>, // The buckets to estimate HLL on (i.e. upper p bits).
2020
precision: u8, // The precision (p) of our HLL; 4 <= p <= 16.
2121
m: usize, // The number of HLL buckets; 2^p.
2222
alpha: f64, // The normal HLL multiplier factor.
23-
24-
hll_type: PhantomData<T>, // A marker for the data type of our HLL (to silence warnings).
2523
}
2624

2725
// Serialize common data types for hashing (String).
@@ -50,29 +48,27 @@ impl_byte_serializable_for_numeric!(usize, isize);
5048
impl_byte_serializable_for_numeric!(f64, f32);
5149

5250
// Self-contained implementation of the HyperLogLog data structure.
53-
impl<T> HyperLogLog<T>
54-
where
55-
T: ByteSerializable,
56-
{
51+
impl HyperLogLog {
5752
/// Creates and initializes a new empty HyperLogLog.
5853
pub fn new(precision: u8) -> Self {
5954
assert!((4..=16).contains(&precision));
6055

6156
let m = 1 << precision;
6257
let alpha = compute_alpha(m);
6358

64-
HyperLogLog::<T> {
59+
HyperLogLog {
6560
registers: vec![0; m],
6661
precision,
6762
m,
6863
alpha,
69-
70-
hll_type: PhantomData,
7164
}
7265
}
7366

7467
/// Digests an array of ByteSerializable data into the HLL.
75-
pub fn aggregate(&mut self, data: &[T]) {
68+
pub fn aggregate<T>(&mut self, data: &[T])
69+
where
70+
T: ByteSerializable,
71+
{
7672
for d in data {
7773
let hash = murmur_hash(&d.to_bytes(), 0); // TODO: We ignore DoS attacks (seed).
7874
let mask = (1 << (self.precision)) - 1;
@@ -84,7 +80,7 @@ where
8480
/// Merges two HLLs together and returns a new one.
8581
/// Particularly useful for parallel execution.
8682
/// NOTE: Takes ownership of self and other.
87-
pub fn merge(self, other: HyperLogLog<T>) -> Self {
83+
pub fn merge(self, other: HyperLogLog) -> Self {
8884
assert!(self.precision == other.precision);
8985

9086
let merged_registers = self
@@ -94,13 +90,11 @@ where
9490
.map(|(x, y)| x.max(y))
9591
.collect();
9692

97-
HyperLogLog::<T> {
93+
HyperLogLog {
9894
registers: merged_registers,
9995
precision: self.precision,
10096
m: self.m,
10197
alpha: self.alpha,
102-
103-
hll_type: PhantomData,
10498
}
10599
}
106100

@@ -158,7 +152,7 @@ mod tests {
158152

159153
#[test]
160154
fn hll_small_strings() {
161-
let mut hll = HyperLogLog::<String>::new(12);
155+
let mut hll = HyperLogLog::new(12);
162156

163157
let data = vec!["a".to_string(), "b".to_string()];
164158
hll.aggregate(&data);
@@ -167,7 +161,7 @@ mod tests {
167161

168162
#[test]
169163
fn hll_small_u64() {
170-
let mut hll = HyperLogLog::<u64>::new(12);
164+
let mut hll = HyperLogLog::new(12);
171165

172166
let data = vec![1, 2];
173167
hll.aggregate(&data);
@@ -203,7 +197,7 @@ mod tests {
203197
#[test]
204198
fn hll_big() {
205199
let precision = 12;
206-
let mut hll = HyperLogLog::<String>::new(precision);
200+
let mut hll = HyperLogLog::new(precision);
207201
let n_distinct = 100000;
208202
let relative_error = 0.05; // We allow a 5% relative error rate.
209203

@@ -224,14 +218,12 @@ mod tests {
224218
let n_jobs = 16;
225219
let relative_error = 0.05; // We allow a 5% relative error rate.
226220

227-
let result_hll = Arc::new(Mutex::new(Option::Some(HyperLogLog::<String>::new(
228-
precision,
229-
))));
221+
let result_hll = Arc::new(Mutex::new(Option::Some(HyperLogLog::new(precision))));
230222
let job_id = AtomicUsize::new(0);
231223
thread::scope(|s| {
232224
for _ in 0..n_jobs {
233225
s.spawn(|_| {
234-
let mut local_hll = HyperLogLog::<String>::new(precision);
226+
let mut local_hll = HyperLogLog::new(precision);
235227
let curr_job_id = job_id.fetch_add(1, Ordering::SeqCst);
236228

237229
let strings = generate_random_strings(n_distinct, 100, curr_job_id);

0 commit comments

Comments
 (0)