Skip to content

Commit eed8d24

Browse files
Add bloom filters video
1 parent c13fcff commit eed8d24

File tree

2 files changed

+192
-0
lines changed

2 files changed

+192
-0
lines changed

README.md

+2
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,8 @@ James and his team are available for consulting, contracting, code reviews, and
1212

1313
| N | Code | Video |
1414
|-----| --- |--- |
15+
| 132 | [src](videos/132_bloom_filter) | [Bloom Filters](https://youtu.be/qZNJTh2NEiU) |
16+
| 131 | no src | [Actually, you CAN divide by zero.](https://youtu.be/eR23nPNqf6A) |
1517
| 130 | [src](videos/130_python_312_release) | [Python 3.12 is HERE!](https://youtu.be/8l4UWz48Elc) |
1618
| 129 | [src](videos/129_lambda_in_a_loop_is_a_code_smell) | [Lambda in a Loop is a Code Smell](https://youtu.be/fZE6ZWde-Os) |
1719
| 128 | [src](videos/128_id_mapping) | [A forbidden Python technique to put ANYTHING in a dict or set.](https://youtu.be/NpdNDTncxwA) |
+190
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,190 @@
1+
import array
2+
import hashlib
3+
import itertools
4+
import math
5+
import random
6+
import string
7+
import time
8+
from collections.abc import Callable, Iterable, MutableSequence
9+
from dataclasses import dataclass
10+
11+
12+
def _8_bools_to_int(bools) -> int:
13+
bin_str = ''.join('1' if b else '0' for b in reversed(bools))
14+
return int(bin_str, 2)
15+
16+
17+
@dataclass
18+
class BitArray:
19+
data: array.array[int]
20+
size: int
21+
22+
@classmethod
23+
def _to_bytes(cls, iterable, iter_len_out: list):
24+
iterable = (bool(x) for x in iterable)
25+
iterable = itertools.batched(iterable, 8)
26+
iter_len = 0
27+
for x in iterable:
28+
iter_len += len(x)
29+
yield _8_bools_to_int(x)
30+
31+
iter_len_out[0] = iter_len
32+
33+
@classmethod
34+
def from_iterable(cls, iterable: Iterable):
35+
iter_len = [0]
36+
iterable = cls._to_bytes(iterable, iter_len_out=iter_len)
37+
data = array.array('B', iterable)
38+
size = iter_len[0]
39+
return cls(data=data, size=size)
40+
41+
@classmethod
42+
def zeros(cls, n: int):
43+
arr_size, remainder = divmod(n, 8)
44+
if remainder:
45+
arr_size += 1
46+
data = array.array('B', (0 for _ in range(arr_size)))
47+
return cls(data=data, size=n)
48+
49+
def _check_index(self, n):
50+
if not isinstance(n, int):
51+
raise TypeError("expected int")
52+
if not 0 <= n < self.size:
53+
raise IndexError(n)
54+
55+
def __getitem__(self, n):
56+
self._check_index(n)
57+
arr_idx, bit_idx = divmod(n, 8)
58+
return (self.data[arr_idx] >> bit_idx) & 0b1
59+
60+
def __setitem__(self, n, bit):
61+
self._check_index(n)
62+
arr_idx, bit_idx = divmod(n, 8)
63+
data = self.data[arr_idx]
64+
data &= ~(1 << bit_idx) # clear bit
65+
data |= (bool(bit) * (1 << bit_idx)) # set bit
66+
self.data[arr_idx] = data
67+
68+
def __repr__(self):
69+
return f"{self.__class__.__name__}({list(self)})"
70+
71+
def __len__(self):
72+
return self.size
73+
74+
75+
@dataclass
76+
class BloomFilter[T]:
77+
mem: MutableSequence[int]
78+
calc_hashes: Callable[[T], Iterable[int]]
79+
80+
@staticmethod
81+
def estimate_false_positive_rate(n_hashes: int, mem_size: int, n_items: int):
82+
return (1.0 - math.exp(- n_hashes * n_items / mem_size)) ** n_hashes
83+
84+
def add(self, item: T):
85+
for h in self.calc_hashes(item):
86+
self.mem[h % len(self.mem)] = 1
87+
88+
def __contains__(self, item: T):
89+
return all(self.mem[h % len(self.mem)] for h in self.calc_hashes(item))
90+
91+
92+
def split_long_hash[T](
93+
hash_fn: Callable[[T], int],
94+
digest_size: int,
95+
hashes: int,
96+
bytes_per_hash: int,
97+
) -> Callable[[T], list[int]]:
98+
if digest_size // hashes < bytes_per_hash:
99+
raise ValueError("digest not long enough")
100+
101+
def calc_hashes(item):
102+
item_hash = hash_fn(item)
103+
hash_bytes = item_hash.to_bytes(digest_size)
104+
return [
105+
int.from_bytes(hash_bytes[i * bytes_per_hash:(i + 1) * bytes_per_hash])
106+
for i in range(hashes)
107+
]
108+
109+
return calc_hashes
110+
111+
112+
# Alphabet used for generated strings: everything in ``string.printable``.
nice_chars = string.printable


def random_str(length: int) -> str:
    """Return *length* characters drawn with replacement from ``nice_chars``."""
    picked = random.choices(nice_chars, k=length)
    return ''.join(picked)
117+
118+
119+
def bitarray_example():
    """Print the length and contents of a small ``BitArray``, then a zeroed one."""
    sample = [1, 1, 0, 1, 1, 1, 0, 1]
    bits = BitArray.from_iterable(sample)
    print(len(bits))
    # Optional experiment: clear the first bit with ``bits[0] = 0``.
    print(bits)

    empty_pair = BitArray.zeros(2)
    print(empty_pair)
125+
126+
127+
@dataclass
class Timer:
    """Context manager that prints a label and the time its block took.

    On entry it prints ``msg`` followed by ``": "`` (no newline); on exit it
    prints the elapsed seconds with two decimals.  ``start`` and ``end``
    hold the ``time.perf_counter()`` readings, so the elapsed time is also
    available afterwards via ``with Timer(...) as t``.
    """

    msg: str
    start: float = 0.0  # perf_counter reading taken on entry
    end: float = 0.0  # perf_counter reading taken on exit

    def __enter__(self) -> "Timer":
        # The label has no trailing newline, so flush explicitly; otherwise a
        # line-buffered stdout would not show it until the block finishes.
        print(self.msg, end=": ", flush=True)
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        # Returns None, so any exception raised inside the block propagates.
        self.end = time.perf_counter()
        print(f"{self.end - self.start:.02f}s")
140+
141+
142+
def bloom_example():
    """Build a large Bloom filter and compare estimated vs. measured error.

    Fills the filter with random 16-character strings, asserts that none of
    them is reported missing, then probes with random 15-character strings
    (which cannot be among the inserted ones) to count false positives.
    Each phase is timed and the results are printed.
    """
    def long_hash(s: str) -> int:
        # SHA-256 digest of the encoded string, read as one big integer.
        digest = hashlib.sha256(s.encode()).digest()
        return int.from_bytes(digest)

    n_hashes = 5
    bytes_per_hash = 6
    mem_size = 80_000_000
    elem_count = 10_000_000

    calc_hashes = split_long_hash(
        long_hash,
        digest_size=256 // 8,
        hashes=n_hashes,
        bytes_per_hash=bytes_per_hash,
    )

    # Alternative storage: mem = [0] * mem_size
    # (~ 8 bytes per element, since each element is a pointer).
    mem = BitArray.zeros(mem_size)  # ~ 1 bit per element
    bloom = BloomFilter[str](mem=mem, calc_hashes=calc_hashes)

    with Timer("Making strs"):
        strs = {random_str(16) for _ in range(elem_count)}

    with Timer("Adding strs"):
        for inserted in strs:
            bloom.add(inserted)

    with Timer("checking no false negatives"):
        assert all(inserted in bloom for inserted in strs)

    with Timer("checking false positives"):
        false_positives = 0
        for _ in range(elem_count):
            if random_str(15) in bloom:
                false_positives += 1

    fpr_estimated = bloom.estimate_false_positive_rate(n_hashes, mem_size, elem_count)
    fpr_empirical = false_positives / elem_count

    print(f"False positive estimate: {fpr_estimated * 100:.03f}%")
    print(f"False positives: {false_positives} ({fpr_empirical * 100:.03f}%)")
182+
183+
184+
def main():
    """Script entry point: run the Bloom filter demo."""
    # The smaller BitArray-only demo is bitarray_example(); it is not run here.
    bloom_example()
187+
188+
189+
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)