6
6
//! number of collisions and eliminate the need for a large range correction estimator.
7
7
8
8
use crate :: stats:: murmur2:: murmur_hash;
9
- use std:: { cmp:: max, marker :: PhantomData } ;
9
+ use std:: cmp:: max;
10
10
11
11
/// Trait to transform any object into a stream of bytes.
12
12
pub trait ByteSerializable {
@@ -15,13 +15,11 @@ pub trait ByteSerializable {
15
15
16
16
/// The HyperLogLog (HLL) structure to provide a statistical estimate of NDistinct.
17
17
/// For safety reasons, an HLL should only count elements of the same ByteSerializable type (checked per `aggregate` call, not by the struct's type).
18
- pub struct HyperLogLog < T : ByteSerializable > {
18
+ pub struct HyperLogLog {
19
19
registers : Vec < u8 > , // The buckets to estimate HLL on (i.e. upper p bits).
20
20
precision : u8 , // The precision (p) of our HLL; 4 <= p <= 16.
21
21
m : usize , // The number of HLL buckets; 2^p.
22
22
alpha : f64 , // The normal HLL multiplier factor.
23
-
24
- hll_type : PhantomData < T > , // A marker to the data type of our HLL (to silent warnings).
25
23
}
26
24
27
25
// Serialize common data types for hashing (String).
@@ -50,29 +48,27 @@ impl_byte_serializable_for_numeric!(usize, isize);
50
48
impl_byte_serializable_for_numeric ! ( f64 , f32 ) ;
51
49
52
50
// Self-contained implementation of the HyperLogLog data structure.
53
- impl < T > HyperLogLog < T >
54
- where
55
- T : ByteSerializable ,
56
- {
51
+ impl HyperLogLog {
57
52
/// Creates and initializes a new empty HyperLogLog.
58
53
pub fn new ( precision : u8 ) -> Self {
59
54
assert ! ( ( 4 ..=16 ) . contains( & precision) ) ;
60
55
61
56
let m = 1 << precision;
62
57
let alpha = compute_alpha ( m) ;
63
58
64
- HyperLogLog :: < T > {
59
+ HyperLogLog {
65
60
registers : vec ! [ 0 ; m] ,
66
61
precision,
67
62
m,
68
63
alpha,
69
-
70
- hll_type : PhantomData ,
71
64
}
72
65
}
73
66
74
67
/// Digests an array of ByteSerializable data into the HLL.
75
- pub fn aggregate ( & mut self , data : & [ T ] ) {
68
+ pub fn aggregate < T > ( & mut self , data : & [ T ] )
69
+ where
70
+ T : ByteSerializable ,
71
+ {
76
72
for d in data {
77
73
let hash = murmur_hash ( & d. to_bytes ( ) , 0 ) ; // TODO: We ignore DoS attacks (seed).
78
74
let mask = ( 1 << ( self . precision ) ) - 1 ;
84
80
/// Merges two HLLs together and returns a new one.
85
81
/// Particularly useful for parallel execution.
86
82
/// NOTE: Takes ownership of self and other.
87
- pub fn merge ( self , other : HyperLogLog < T > ) -> Self {
83
+ pub fn merge ( self , other : HyperLogLog ) -> Self {
88
84
assert ! ( self . precision == other. precision) ;
89
85
90
86
let merged_registers = self
@@ -94,13 +90,11 @@ where
94
90
. map ( |( x, y) | x. max ( y) )
95
91
. collect ( ) ;
96
92
97
- HyperLogLog :: < T > {
93
+ HyperLogLog {
98
94
registers : merged_registers,
99
95
precision : self . precision ,
100
96
m : self . m ,
101
97
alpha : self . alpha ,
102
-
103
- hll_type : PhantomData ,
104
98
}
105
99
}
106
100
@@ -158,7 +152,7 @@ mod tests {
158
152
159
153
#[ test]
160
154
fn hll_small_strings ( ) {
161
- let mut hll = HyperLogLog :: < String > :: new ( 12 ) ;
155
+ let mut hll = HyperLogLog :: new ( 12 ) ;
162
156
163
157
let data = vec ! [ "a" . to_string( ) , "b" . to_string( ) ] ;
164
158
hll. aggregate ( & data) ;
@@ -167,7 +161,7 @@ mod tests {
167
161
168
162
#[ test]
169
163
fn hll_small_u64 ( ) {
170
- let mut hll = HyperLogLog :: < u64 > :: new ( 12 ) ;
164
+ let mut hll = HyperLogLog :: new ( 12 ) ;
171
165
172
166
let data = vec ! [ 1 , 2 ] ;
173
167
hll. aggregate ( & data) ;
@@ -203,7 +197,7 @@ mod tests {
203
197
#[ test]
204
198
fn hll_big ( ) {
205
199
let precision = 12 ;
206
- let mut hll = HyperLogLog :: < String > :: new ( precision) ;
200
+ let mut hll = HyperLogLog :: new ( precision) ;
207
201
let n_distinct = 100000 ;
208
202
let relative_error = 0.05 ; // We allow a 5% relative error rate.
209
203
@@ -224,14 +218,12 @@ mod tests {
224
218
let n_jobs = 16 ;
225
219
let relative_error = 0.05 ; // We allow a 5% relative error rate.
226
220
227
- let result_hll = Arc :: new ( Mutex :: new ( Option :: Some ( HyperLogLog :: < String > :: new (
228
- precision,
229
- ) ) ) ) ;
221
+ let result_hll = Arc :: new ( Mutex :: new ( Option :: Some ( HyperLogLog :: new ( precision) ) ) ) ;
230
222
let job_id = AtomicUsize :: new ( 0 ) ;
231
223
thread:: scope ( |s| {
232
224
for _ in 0 ..n_jobs {
233
225
s. spawn ( |_| {
234
- let mut local_hll = HyperLogLog :: < String > :: new ( precision) ;
226
+ let mut local_hll = HyperLogLog :: new ( precision) ;
235
227
let curr_job_id = job_id. fetch_add ( 1 , Ordering :: SeqCst ) ;
236
228
237
229
let strings = generate_random_strings ( n_distinct, 100 , curr_job_id) ;
0 commit comments