Skip to content
This repository was archived by the owner on Jan 7, 2025. It is now read-only.

Commit 3477898

Browse files
authored
feat: add mcv data structure for statistics (#133)
This pull request implements the MCV (Most-Common-Values) data structure. This data structure gathers exact frequencies of the common values reported by the first Misra-Gries ANALYZE sweep. It supports a fully parallelizable, memory-bounded MCV computation scheme through an easy-to-use API. The implementation includes unit tests for i32 values. Next steps: Integrate into DataFusion.
1 parent 026a01e commit 3477898

File tree

3 files changed

+159
-2
lines changed

3 files changed

+159
-2
lines changed

optd-gungnir/src/stats.rs

+1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
pub mod hyperloglog;
2+
pub mod mcv;
23
pub mod misragries;
34
pub mod murmur2;
45
pub mod tdigest;

optd-gungnir/src/stats/mcv.rs

+156
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
//! A hash-based MCV implementation that will track exact frequencies for
2+
//! an array of prespecified elements.
3+
4+
use std::{collections::HashMap, hash::Hash};
5+
6+
/// The MCV (most-common-values) structure, which tracks exact frequencies for
/// a fixed set of elements chosen at construction time.
///
/// Elements that were not passed to [`MCV::new`] are never added to the map;
/// occurrences reported for them are silently ignored.
pub struct MCV<T: PartialEq + Eq + Hash + Clone> {
    frequencies: HashMap<T, i32>, // The exact frequency of each tracked element.
}

// Self-contained implementation of the MCV data structure.
impl<T> MCV<T>
where
    T: PartialEq + Eq + Hash + Clone,
{
    /// Creates a new MCV that tracks every element of `to_track`, each
    /// starting with a frequency of zero.
    ///
    /// Duplicate entries in `to_track` collapse into a single tracked element.
    pub fn new(to_track: &[T]) -> Self {
        let frequencies = to_track.iter().cloned().map(|item| (item, 0)).collect();

        MCV { frequencies }
    }

    /// Adds `occ` occurrences to `elem` if it is being tracked; otherwise does
    /// nothing.
    ///
    /// The counter is a plain `i32` addition, so overflow follows Rust's
    /// default integer-overflow behavior for the current build profile.
    pub fn insert_element(&mut self, elem: T, occ: i32) {
        self.add_occurrences(&elem, occ);
    }

    /// Digests an array of data into the MCV structure, counting one
    /// occurrence per tracked element encountered.
    pub fn aggregate(&mut self, data: &[T]) {
        // Look keys up by reference: no clone is needed to bump a counter.
        data.iter().for_each(|key| self.add_occurrences(key, 1));
    }

    /// Merges another MCV into the current one.
    /// Particularly useful for parallel execution.
    ///
    /// Only elements tracked by `self` are updated; elements tracked solely by
    /// `other` are ignored.
    pub fn merge(&mut self, other: &MCV<T>) {
        other
            .frequencies
            .iter()
            .for_each(|(key, occ)| self.add_occurrences(key, *occ));
    }

    /// Returns the frequencies of the most common values.
    pub fn frequencies(&self) -> &HashMap<T, i32> {
        &self.frequencies
    }

    /// Adds `occ` to the counter of `elem` when it is tracked, borrowing the
    /// key so that callers holding a reference never have to clone it.
    fn add_occurrences(&mut self, elem: &T, occ: i32) {
        if let Some(frequency) = self.frequencies.get_mut(elem) {
            *frequency += occ;
        }
    }
}
54+
55+
// Start of unit testing section.
56+
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::sync::Mutex;

    use crossbeam::thread;
    use rand::seq::SliceRandom;
    use rand::{rngs::StdRng, SeedableRng};

    use super::MCV;

    /// Hardcoded `(value, frequency)` pairs used as ground truth by the tests.
    const FREQUENCIES: [(i32, i32); 6] = [(0, 2), (1, 4), (2, 9), (3, 8), (4, 50), (5, 6)];

    // Returns the hardcoded frequencies as a map, along with a flattened,
    // shuffled array in which every value appears exactly `frequency` times.
    fn generate_frequencies() -> (HashMap<i32, i32>, Vec<i32>) {
        let frequencies: HashMap<i32, i32> = FREQUENCIES.iter().copied().collect();

        // Flatten from the fixed-order array rather than from the map: map
        // iteration order is unspecified, which would make the seeded shuffle
        // below produce a different sequence from run to run.
        let mut flattened: Vec<i32> = FREQUENCIES
            .iter()
            .flat_map(|&(key, count)| (0..count).map(move |_| key))
            .collect();

        let mut rng = StdRng::seed_from_u64(0);
        flattened.shuffle(&mut rng);

        (frequencies, flattened)
    }

    // Aggregating the whole data set must yield the exact frequency of every
    // tracked element and must not start tracking any other element.
    #[test]
    fn aggregate() {
        let to_track = vec![0, 1, 2, 3];
        let mut mcv = MCV::<i32>::new(&to_track);

        let (frequencies, flattened) = generate_frequencies();

        mcv.aggregate(&flattened);

        let mcv_freq = mcv.frequencies();
        assert_eq!(mcv_freq.len(), to_track.len());

        to_track.iter().for_each(|item| {
            assert!(mcv_freq.contains_key(item));
            assert_eq!(mcv_freq.get(item), frequencies.get(item));
        });
    }

    // Several workers each build a local MCV and merge it into a shared one;
    // the merged result must equal the sum of all local frequencies.
    #[test]
    fn merge() {
        let to_track = vec![0, 1, 2, 3];
        let n_jobs = 16;

        // Scoped threads borrow these directly, so no `Arc` is required.
        let total_frequencies = Mutex::new(HashMap::<i32, i32>::new());
        let result_mcv = Mutex::new(MCV::<i32>::new(&to_track));
        thread::scope(|s| {
            for _ in 0..n_jobs {
                s.spawn(|_| {
                    let mut local_mcv = MCV::<i32>::new(&to_track);

                    let (local_frequencies, flattened) = generate_frequencies();
                    {
                        // Keep the guard in its own scope so the lock is
                        // released before the aggregation work starts;
                        // otherwise the workers would run one at a time.
                        let mut totals = total_frequencies.lock().unwrap();
                        for (&key, &value) in &local_frequencies {
                            *totals.entry(key).or_insert(0) += value;
                        }
                    }

                    local_mcv.aggregate(&flattened);

                    let mcv_local_freq = local_mcv.frequencies();
                    assert_eq!(mcv_local_freq.len(), to_track.len());

                    to_track.iter().for_each(|item| {
                        assert!(mcv_local_freq.contains_key(item));
                        assert_eq!(mcv_local_freq.get(item), local_frequencies.get(item));
                    });

                    // The guard is a temporary, so the lock is held only for
                    // the duration of the merge itself.
                    result_mcv.lock().unwrap().merge(&local_mcv);
                });
            }
        })
        .unwrap();

        // All workers have finished: take the values out of their mutexes
        // once instead of re-locking for every assertion.
        let mcv = result_mcv.into_inner().unwrap();
        let total_frequencies = total_frequencies.into_inner().unwrap();
        let mcv_freq = mcv.frequencies();

        to_track.iter().for_each(|item| {
            assert!(mcv_freq.contains_key(item));
            assert_eq!(mcv_freq.get(item), total_frequencies.get(item));
        });
    }
}

optd-gungnir/src/stats/misragries.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ mod tests {
104104
use rand::{rngs::StdRng, SeedableRng};
105105

106106
#[test]
107-
fn aggregate_full_size() {
107+
fn aggregate_simple() {
108108
let data = vec![0, 1, 2, 3];
109109
let mut misra_gries = MisraGries::<i32>::new(data.len() as u16);
110110

@@ -116,7 +116,7 @@ mod tests {
116116
}
117117

118118
#[test]
119-
fn aggregate_half_size() {
119+
fn aggregate_double() {
120120
let data = vec![0, 1, 2, 3];
121121
let data_dup = [data.as_slice(), data.as_slice()].concat();
122122

0 commit comments

Comments
 (0)