Skip to content
This repository was archived by the owner on Jan 7, 2025. It is now read-only.

Commit 3ebe94e

Browse files
authored
feat: add an arithmetic encoder to transform strings into f64 (#134)
This PR implements an arithmetic encoder (see [Wikipedia](https://en.wikipedia.org/wiki/Arithmetic_coding)) that maps alphanumeric strings to f64 values while preserving their ordering, so that string columns can be summarized with t-digests. Characters outside the supported character table are treated as identical to one another and are ordered last ('heaviest').
1 parent 204758e commit 3ebe94e

File tree

6 files changed

+78
-1
lines changed

6 files changed

+78
-1
lines changed

Cargo.lock

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

optd-gungnir/Cargo.toml

+2-1
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,5 @@ edition = "2021"
99
itertools = "0.11"
1010
rand = "0.8"
1111
crossbeam = "0.8"
12-
serde = {version = "1.0", features = ["derive"]}
12+
lazy_static = "1.4"
13+
serde = {version = "1.0", features = ["derive"]}

optd-gungnir/src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
#![allow(clippy::new_without_default)]
22

33
pub mod stats;
4+
pub mod utils;

optd-gungnir/src/stats/misragries.rs

+2
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ use std::{cmp::min, collections::HashMap, hash::Hash};
99

1010
use itertools::Itertools;
1111

12+
/// Default value for `k`.
///
/// NOTE(review): presumably the number of most-frequent elements a
/// Misra-Gries summary tracks when the caller has no better choice — confirm
/// against the call sites, which are not visible here.
pub const DEFAULT_K_TO_TRACK: u16 = 100;
13+
1214
/// The Misra-Gries structure to approximate the k most frequent elements in
1315
/// a stream of N elements. It will always identify elements with frequency
1416
/// f >= (n/k), and include additional leftovers.

optd-gungnir/src/utils.rs

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
pub mod arith_encoder;
+71
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
//! This module provides an encoder that converts alpha-numeric strings
2+
//! into f64 values, designed to maintain the natural ordering of strings.
3+
//!
4+
//! While the encoding is theoretically lossless, in practice, it may suffer
5+
//! from precision loss due to floating-point errors.
6+
//!
7+
//! Characters that are not in the supported table (anything other than printable
8+
//! ASCII) all share one slot at the end of the ordering, so they sort last and are
//! indistinguishable from one another in this context.
9+
10+
use std::collections::HashMap;
11+
12+
use lazy_static::lazy_static;
13+
14+
// The ordering used by the encoder: the 95 printable ASCII characters
// (U+0020 through U+007E), listed as space and punctuation first, then
// digits, then uppercase letters, then lowercase letters. Note that this is
// NOT plain ASCII order (e.g. ':' is placed before '0' here).
// A character's index in this array is its rank in the ordering.
15+
const ALPHANUMERIC_ORDER: [char; 95] = [
16+
' ', '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<',
17+
'=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '0', '1', '2', '3', '4',
18+
'5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
19+
'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
20+
'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
21+
];
22+
23+
const PMF: f64 = 1.0 / (ALPHANUMERIC_ORDER.len() as f64);
24+
25+
lazy_static! {
26+
static ref CDF: HashMap<char, f64> = {
27+
let length = ALPHANUMERIC_ORDER.len() + 1; // To account for non-alpha-numeric characters.
28+
let mut cdf = HashMap::with_capacity(length);
29+
for (index, &char) in ALPHANUMERIC_ORDER.iter().enumerate() {
30+
cdf.insert(char, (index as f64) / (length as f64));
31+
}
32+
cdf
33+
};
34+
}
35+
36+
pub fn encode(string: &str) -> f64 {
37+
let mut left = 0.0;
38+
let mut right = f64::MAX;
39+
40+
for char in string.chars() {
41+
let cdf = CDF.get(&char).unwrap_or(&1.0);
42+
let distance = right - left;
43+
right = left + distance * (cdf + PMF);
44+
left += distance * cdf;
45+
}
46+
47+
left
48+
}
49+
50+
// Start of unit testing section.
#[cfg(test)]
mod tests {
    use super::encode;

    /// The string ordering must carry over to the encoded values.
    #[test]
    fn encode_preserves_ordering() {
        assert!(encode("") < encode("abc"));
        assert!(encode("abc") < encode("bcd"));

        assert!(encode("a") < encode("aaa"));
        assert!(encode("!a") < encode("a!"));
        assert!(encode("Alexis") < encode("Schlomer"));

        assert!(encode("Gungnir Rules!") < encode("Schlomer"));
    }

    /// Character classes are ranked punctuation < digits < uppercase < lowercase.
    #[test]
    fn encode_ranks_character_classes() {
        assert!(encode("!") < encode("0"));
        assert!(encode("9") < encode("A"));
        assert!(encode("Z") < encode("a"));
    }

    /// Equal inputs must produce equal outputs, and the empty string is the minimum.
    #[test]
    fn encode_is_deterministic() {
        assert_eq!(encode(""), 0.0);
        assert_eq!(encode(" "), encode(" "));
        assert_eq!(encode("Same"), encode("Same"));
    }

    /// Characters outside the supported table sort after every listed character.
    #[test]
    fn encode_puts_unlisted_characters_last() {
        assert!(encode("z") < encode("💰"));
        assert!(encode("Nicolas ") < encode("Nicolas💰💼"));
    }
}

0 commit comments

Comments
 (0)