Skip to content

Commit

Permalink
Add package bloom
Browse files Browse the repository at this point in the history
  • Loading branch information
hanke0 committed Dec 15, 2023
1 parent ca48599 commit 1338d5d
Show file tree
Hide file tree
Showing 2 changed files with 280 additions and 0 deletions.
143 changes: 143 additions & 0 deletions bloom/bloom.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
// Package bloom provides a bloom filter implementation inspired by leveldb.
package bloom

import "unsafe"

// Bloom is a bloom filter whose bit array is stored in a byte slice.
// With about 10 bits per key the false positive rate is expected to stay
// below 2%.
type Bloom struct {
	// k is the number of bit positions probed for every key.
	k int
	// bits is the number of addressable bits in filter.
	bits uint32
	// filter is the bit array, eight bits per element.
	filter []byte
}

// New creates a bloom filter sized for expectLength keys at bitsPerKey bits
// per key.
//
// The filter occupies roughly bitsPerKey * expectLength bits, rounded up to a
// whole number of bytes, never fewer than 64 bits and never more than fits in
// a uint32 bit count. bitsPerKey = 10 is usually enough. The number of probes
// per key is bitsPerKey * ln(2), clamped to the range [1, 30].
func New(bitsPerKey, expectLength int) *Bloom {
	// The bit count is stored in a uint32, so cap it at the largest multiple
	// of 8 that fits. Without the cap a large request is truncated by the
	// conversion, leaving bits out of step with len(filter) or even zero,
	// which makes the modulo in the probe loops panic.
	const maxBits int64 = 1<<32 - 8

	k := int(float64(bitsPerKey) * 0.69) // 0.69 =~ ln(2)
	if k < 1 {
		k = 1
	}
	if k > 30 {
		k = 30
	}

	bits := int64(expectLength) * int64(bitsPerKey)
	// Reduce the false positive rate for small expectLength.
	if bits < 64 {
		bits = 64
	}
	if bits > maxBits {
		bits = maxBits
	}
	// Round up to whole bytes so every bit position has backing storage.
	nbytes := (bits + 7) / 8
	return &Bloom{
		k:      k,
		bits:   uint32(nbytes * 8),
		filter: make([]uint8, int(nbytes)),
	}
}

// unsafeToBytes returns a byte slice that shares memory with s, avoiding a
// copy. The result must be treated as read-only: strings are immutable and
// writing through the slice is not allowed.
func unsafeToBytes(s string) []byte {
	// A string header has no capacity word, so reinterpreting *string as
	// *[]byte would read the slice capacity from unrelated memory. Build the
	// slice from the string's data pointer and length instead.
	return unsafe.Slice(unsafe.StringData(s), len(s))
}

// Add inserts the string s into the filter.
// It is the string counterpart of AddBytes.
func (b *Bloom) Add(s string) {
	raw := unsafeToBytes(s)
	b.AddBytes(raw)
}

// AddBytes inserts data into the filter by setting one bit per probe.
//
// The probe positions come from double hashing: a single hash value is
// advanced by a fixed step (the hash rotated right by 17 bits) for each of
// the k probes. See the analysis in
// Kirsch, A., Mitzenmacher, M. (2006). Less Hashing, Same Performance:
// Building a Better Bloom Filter.
func (b *Bloom) AddBytes(data []byte) {
	cur := bloomhash(data)
	step := cur<<15 | cur>>17 // rotate right by 17 bits
	for n := b.k; n > 0; n-- {
		pos := cur % b.bits
		idx, mask := pos>>3, uint8(1)<<(pos&7)
		b.filter[idx] |= mask
		cur += step
	}
}

// MayExists reports whether the string s may have been added to the filter.
// It is the string counterpart of MayExistsBytes.
func (b *Bloom) MayExists(s string) bool {
	raw := unsafeToBytes(s)
	return b.MayExistsBytes(raw)
}

// MayExistsBytes reports whether data may have been added to the filter.
// It returns false as soon as one of the k probed bits is found clear, and
// true when every probed bit is set.
func (b *Bloom) MayExistsBytes(data []byte) bool {
	cur := bloomhash(data)
	step := cur<<15 | cur>>17 // rotate right by 17 bits
	for n := b.k; n > 0; n-- {
		pos := cur % b.bits
		if b.filter[pos>>3]>>(pos&7)&1 == 0 {
			return false
		}
		cur += step
	}
	return true
}

// TestAndAdd reports whether the string s may already be in the filter and
// records it if it is not. It is the string counterpart of TestAndAddBytes.
func (b *Bloom) TestAndAdd(s string) bool {
	raw := unsafeToBytes(s)
	return b.TestAndAddBytes(raw)
}

// TestAndAddBytes reports whether every probe bit for data was already set,
// and sets any probe bit that was clear, so the test and the insertion share
// one pass over the k probes.
func (b *Bloom) TestAndAddBytes(data []byte) bool {
	cur := bloomhash(data)
	step := cur<<15 | cur>>17 // rotate right by 17 bits
	missing := 0
	for n := b.k; n > 0; n-- {
		pos := cur % b.bits
		slot := &b.filter[pos>>3]
		if mask := uint8(1) << (pos & 7); *slot&mask == 0 {
			*slot |= mask
			missing++
		}
		cur += step
	}
	return missing == 0
}

// bloomhash hashes data with the fixed seed used for every filter probe.
func bloomhash(data []byte) uint32 {
	const seed uint32 = 0xbc9f1d34
	return hash(data, seed)
}

// hash computes a 32-bit hash of data mixed with seed; it is similar to
// murmur hash.
//
// The input is consumed as little-endian 32-bit words. Any remaining one to
// three bytes are folded in as a final partial word.
func hash(data []byte, seed uint32) uint32 {
	const (
		m uint32 = 0xc6a4a793
		r uint32 = 24
	)
	h := seed ^ (uint32(len(data)) * m)
	// Consume every complete word. The condition must be >= 4: with > 4 the
	// last word of an input whose length is a multiple of four is skipped,
	// so inputs that differ only in that word would hash identically.
	for len(data) >= 4 {
		h += decodeFixed32(data[:4])
		data = data[4:]
		h *= m
		h ^= h >> 16
	}
	// Fold in the trailing bytes, lowest byte in the lowest position.
	var tail uint32
	switch len(data) {
	case 3:
		tail = uint32(data[2]) << 16
		fallthrough
	case 2:
		tail |= uint32(data[1]) << 8
		fallthrough
	case 1:
		tail |= uint32(data[0])
		h += tail
		h *= m
		h ^= h >> r
	}
	return h
}

// decodeFixed32 reads the first four bytes of data as a little-endian uint32.
// It panics if data holds fewer than four bytes.
func decodeFixed32(data []byte) uint32 {
	var v uint32
	for i := 0; i < 4; i++ {
		v |= uint32(data[i]) << uint(8*i)
	}
	return v
}
137 changes: 137 additions & 0 deletions bloom/bloom_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
package bloom

import (
"fmt"
"strconv"
"testing"
)

// expectTrue fails the test through t.Fatal unless ok is true.
func expectTrue(t tb, ok bool) {
	t.Helper()
	if ok {
		return
	}
	t.Fatal("expect true got false")
}

// TestEmpty checks that a filter with nothing added reports no key as present.
func TestEmpty(t *testing.T) {
	b := New(10, 1000000)
	for _, key := range []string{"hello", "world"} {
		expectTrue(t, !b.MayExists(key))
	}
}

// TestSmall checks a filter sized for two keys: both added keys must be
// reported present and two keys that were never added must be reported absent.
func TestSmall(t *testing.T) {
	b := New(10, 2)
	added := []string{"hello", "world"}
	for _, key := range added {
		b.Add(key)
	}
	for _, key := range added {
		expectTrue(t, b.MayExists(key))
	}
	for _, key := range []string{"x", "w"} {
		expectTrue(t, !b.MayExists(key))
	}
}

// TestTestAndAdd checks that TestAndAdd reports a key as absent the first
// time it is seen and as present the second time, and that keys which were
// never added stay absent.
func TestTestAndAdd(t *testing.T) {
	b := New(10, 2)
	keys := []string{"hello", "world"}
	for _, key := range keys {
		expectTrue(t, !b.TestAndAdd(key))
	}
	for _, key := range keys {
		expectTrue(t, b.TestAndAdd(key))
	}
	for _, key := range []string{"x", "w"} {
		expectTrue(t, !b.MayExists(key))
	}
}

// tb is the small set of methods the helpers in this file need from a test
// or benchmark handle, so the same helpers can be called with either a
// *testing.T or a *testing.B.
type tb interface {
	// Helper marks the caller as a test helper.
	Helper()
	// Fatal reports a failure and stops the test.
	Fatal(args ...interface{})
	// Fatalf is the formatted variant of Fatal.
	Fatalf(format string, args ...interface{})
}

// getSetBloom builds a filter sized for n keys at 10 bits per key, adds the
// decimal strings "0" through "n-1" to it, and fails t if any added key is
// afterwards reported as absent. It returns the populated filter.
func getSetBloom(t tb, n int) *Bloom {
	t.Helper()
	b := New(10, n)
	// Insert n distinct keys; the loop index, not n, is the key.
	for i := 0; i < n; i++ {
		b.Add(strconv.Itoa(i))
	}
	// A bloom filter must never report an added key as absent.
	for i := 0; i < n; i++ {
		if !b.MayExists(strconv.Itoa(i)) {
			t.Fatalf("expect true got false, length %d; key %d", n, i)
		}
	}
	return b
}

// testNextLength returns the next filter length to test after n. The step
// grows with n: 1 below 10, 10 below 100, 100 below 1000, and 1000 from
// there on.
func testNextLength(n int) int {
	step := 1000
	switch {
	case n < 10:
		step = 1
	case n < 100:
		step = 10
	case n < 1000:
		step = 100
	}
	return n + step
}

// falsePositive probes b with the 10000 decimal keys starting at 1000000000
// and returns the fraction of them that b reports as present. The parameter
// n is not used.
// NOTE(review): the result is only a false positive rate if none of the
// probe keys were added to b — confirm against the callers.
func falsePositive(b *Bloom, n int) float64 {
	const probes = 10000
	hits := 0
	for i := 0; i < probes; i++ {
		if b.MayExists(strconv.Itoa(1000000000 + i)) {
			hits++
		}
	}
	return float64(hits) / float64(probes)
}

// TestFalsePositive measures the false positive rate for filter lengths from
// 1 to 10000. It fails if any single rate exceeds 2%, or if rates above
// 1.25% ("mediocre") occur more often than one fifth as often as the rest
// ("good").
func TestFalsePositive(t *testing.T) {
	var mediocre, good int
	for i := 1; i <= 10000; i = testNextLength(i) {
		b := getSetBloom(t, i)
		rate := falsePositive(b, i)
		// Must not be over 2%
		if rate > 0.02 {
			t.Fatalf("False positive: %5.2f%%, length = %d, bytes = %d", rate*100, i, len(b.filter))
		}
		if rate > 0.0125 { // Allowed, but not too often
			mediocre++
		} else {
			good++
		}
	}
	// The message states the expectation that was violated, not the
	// condition that triggered the failure.
	if mediocre > good/5 {
		t.Fatalf("expect mediocre <= good/5, mediocre = %d, good = %d", mediocre, good)
	}
}

// BenchmarkBloom runs one sub-benchmark per filter length from 1 to 10000.
// Each iteration builds and populates a filter, measures its false positive
// rate, and fails the benchmark if the rate exceeds 2%.
func BenchmarkBloom(b *testing.B) {
	b.ReportAllocs()
	for i := 1; i <= 10000; i = testNextLength(i) {
		size := i
		name := fmt.Sprintf("n=%d", size)
		b.Run(name, func(b *testing.B) {
			b.ReportAllocs()
			for j := 0; j < b.N; j++ {
				bloom := getSetBloom(b, size)
				rate := falsePositive(bloom, size)
				// Must not be over 2%
				if rate <= 0.02 {
					continue
				}
				b.Fatalf("False positive: %5.2f%%, length = %d, bytes = %d", rate*100, size, len(bloom.filter))
			}
		})
	}
}

// BenchmarkBloomTestAndMatch measures TestAndAdd by inserting the decimal
// strings "0" through "b.N-1" into a filter sized for b.N keys at 10 bits
// per key.
//
// NOTE(review): a hit is only counted when i == 0, and TestAndAdd cannot
// report a hit on a filter that has no bits set yet, so the rate check at the
// end never fires — confirm whether every hit was meant to be counted.
func BenchmarkBloomTestAndMatch(b *testing.B) {
	b.ReportAllocs()
	bloom := New(10, b.N)
	hits := 0
	for i := 0; i < b.N; i++ {
		// TestAndAdd must run on every iteration, so it comes first in the
		// condition.
		if bloom.TestAndAdd(strconv.Itoa(i)) && i == 0 {
			hits++
		}
	}
	if r := float64(hits) / float64(b.N); r > 0.02 {
		b.Fatalf("false positive > 0.02: %5.2f%%", r*100)
	}
}

0 comments on commit 1338d5d

Please sign in to comment.