-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
280 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
// Package bloom provides a bloom filter implementation inspired by leveldb. | ||
package bloom | ||
|
||
import "unsafe" | ||
|
||
// Bloom is a bloom filter over byte strings. It is intended to keep the
// false positive rate below 2% when sized with about 10 bits per key.
type Bloom struct {
	k      int     // number of bit positions probed per key
	bits   uint32  // number of bits in filter; always 8 * len(filter)
	filter []uint8 // the bit array, 8 bits per element
}

// New creates a new bloom filter.
//
// The filter occupies roughly bitsPerKey * expectLength bits of memory;
// bitsPerKey = 10 is usually enough. The bit count is raised to at least 64
// (which lowers the false positive rate for very small filters), rounded up
// to a whole number of bytes, and capped so that it always fits the uint32
// used for bit addressing.
func New(bitsPerKey, expectLength int) *Bloom {
	const (
		minBits int64 = 64
		// Largest multiple of 8 representable in a uint32. Without this cap
		// the uint32 conversion below could truncate the bit count, in the
		// worst case to 0, which makes every "h % b.bits" panic.
		maxBits int64 = 1<<32 - 8
	)

	k := int(float64(bitsPerKey) * 0.69) // 0.69 =~ ln(2)
	if k < 1 {
		k = 1
	}
	if k > 30 {
		k = 30
	}

	// Multiply in 64 bits so the product cannot overflow a 32-bit int.
	bits := int64(expectLength) * int64(bitsPerKey)
	if bits > maxBits {
		bits = maxBits
	}
	if bits < minBits {
		bits = minBits
	}

	// Round up to whole bytes and keep bits in sync with the slice length.
	nbytes := (bits + 7) / 8
	bits = nbytes * 8
	return &Bloom{
		k:      k,
		bits:   uint32(bits),
		filter: make([]uint8, int(nbytes)),
	}
}
|
||
// unsafeToBytes returns a []byte view of s without copying.
//
// The returned slice shares its memory with s, so callers must treat it as
// read-only. A string header has only two words (pointer, length) while a
// slice header has three (pointer, length, capacity); casting the string
// header directly would read the capacity from unrelated memory. A complete
// three-word header is therefore built first and reinterpreted as a slice.
func unsafeToBytes(s string) []byte {
	hdr := struct {
		str string
		cap int
	}{s, len(s)}
	return *(*[]byte)(unsafe.Pointer(&hdr))
}
|
||
// Add records s exists in bloom. | ||
func (b *Bloom) Add(s string) { | ||
b.AddBytes(unsafeToBytes(s)) | ||
} | ||
|
||
// AddBytes records data exists in bloom. | ||
func (b *Bloom) AddBytes(data []byte) { | ||
// Use double-hashing to generate a sequence of hash values. | ||
// See analysis in | ||
// Kirsch, A., Mitzenmacher, M. (2006). Less Hashing, Same Performance: Building a Better Bloom Filter. | ||
h := bloomhash(data) | ||
delta := h>>17 | h<<15 // Rotate right 17 bits | ||
for i := 0; i < b.k; i++ { | ||
bitpos := h % b.bits | ||
b.filter[bitpos/8] |= 1 << (bitpos % 8) | ||
h += delta | ||
} | ||
} | ||
|
||
// MayExists tests if s exists in bloom. | ||
func (b *Bloom) MayExists(s string) bool { | ||
return b.MayExistsBytes(unsafeToBytes(s)) | ||
} | ||
|
||
// MayExistsBytes tests if data exists in bloom. | ||
func (b *Bloom) MayExistsBytes(data []byte) bool { | ||
h := bloomhash(data) | ||
delta := h>>17 | h<<15 // Rotate right 17 bits | ||
for i := 0; i < b.k; i++ { | ||
bitpos := h % b.bits | ||
if b.filter[bitpos/8]&(1<<(bitpos%8)) == 0 { | ||
return false | ||
} | ||
h += delta | ||
} | ||
return true | ||
} | ||
|
||
// TestAndAdd tests if s exists in bloom and records it into bloom if it not. | ||
func (b *Bloom) TestAndAdd(s string) bool { | ||
return b.TestAndAddBytes(unsafeToBytes(s)) | ||
} | ||
|
||
// TestAndAddBytes tests if data exists in bloom and records it into bloom if it not. | ||
func (b *Bloom) TestAndAddBytes(data []byte) bool { | ||
exists := true | ||
h := bloomhash(data) | ||
delta := h>>17 | h<<15 // Rotate right 17 bits | ||
for i := 0; i < b.k; i++ { | ||
bitpos := h % b.bits | ||
if b.filter[bitpos/8]&(1<<(bitpos%8)) == 0 { | ||
b.filter[bitpos/8] |= 1 << (bitpos % 8) | ||
exists = false | ||
} | ||
h += delta | ||
} | ||
return exists | ||
|
||
} | ||
|
||
func bloomhash(data []byte) uint32 { | ||
return hash(data, 0xbc9f1d34) | ||
} | ||
|
||
// hash returns a 32-bit hash of data mixed with seed. The scheme is similar
// to murmur hash: the input is consumed as little-endian 32-bit words, and
// the remaining one to three bytes, if any, are folded in at the end.
func hash(data []byte, seed uint32) uint32 {
	const (
		m uint32 = 0xc6a4a793
		r uint32 = 24
	)
	h := seed ^ (uint32(len(data)) * m)

	// Consume every complete 4-byte word. The comparison must be ">=": with
	// ">" a final word of exactly four bytes would never be mixed in, so all
	// 4-byte inputs would share one hash value.
	for len(data) >= 4 {
		h += decodeFixed32(data)
		data = data[4:]
		h *= m
		h ^= h >> 16
	}

	// Fold in the trailing bytes. Each case adds one byte at its
	// little-endian position and falls through to the shared final mix.
	switch len(data) {
	case 3:
		h += uint32(data[2]) << 16
		fallthrough
	case 2:
		h += uint32(data[1]) << 8
		fallthrough
	case 1:
		h += uint32(data[0])
		h *= m
		h ^= h >> r
	}
	return h
}

// decodeFixed32 decodes the first four bytes of data as a little-endian
// uint32. It panics if data holds fewer than four bytes.
func decodeFixed32(data []byte) uint32 {
	return uint32(data[0]) | uint32(data[1])<<8 |
		uint32(data[2])<<16 | uint32(data[3])<<24
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
package bloom | ||
|
||
import ( | ||
"fmt" | ||
"strconv" | ||
"testing" | ||
) | ||
|
||
func expectTrue(t tb, ok bool) { | ||
t.Helper() | ||
if !ok { | ||
t.Fatal("expect true got false") | ||
} | ||
} | ||
|
||
func TestEmpty(t *testing.T) { | ||
b := New(10, 1000000) | ||
expectTrue(t, !b.MayExists("hello")) | ||
expectTrue(t, !b.MayExists("world")) | ||
} | ||
|
||
func TestSmall(t *testing.T) { | ||
b := New(10, 2) | ||
b.Add("hello") | ||
b.Add("world") | ||
expectTrue(t, b.MayExists("hello")) | ||
expectTrue(t, b.MayExists("world")) | ||
expectTrue(t, !b.MayExists("x")) | ||
expectTrue(t, !b.MayExists("w")) | ||
} | ||
|
||
func TestTestAndAdd(t *testing.T) { | ||
b := New(10, 2) | ||
expectTrue(t, !b.TestAndAdd("hello")) | ||
expectTrue(t, !b.TestAndAdd("world")) | ||
expectTrue(t, b.TestAndAdd("hello")) | ||
expectTrue(t, b.TestAndAdd("world")) | ||
expectTrue(t, !b.MayExists("x")) | ||
expectTrue(t, !b.MayExists("w")) | ||
} | ||
|
||
// tb lists the reporting methods the helpers in this file call. Both
// *testing.T and *testing.B are passed where a tb is expected.
type tb interface {
	Fatal(args ...interface{})
	Fatalf(format string, args ...interface{})
	Helper()
}
|
||
func getSetBloom(t tb, n int) *Bloom { | ||
t.Helper() | ||
b := New(10, n) | ||
for i := 0; i < n; i++ { | ||
b.Add(strconv.Itoa(n)) | ||
} | ||
for i := 0; i < n; i++ { | ||
if !b.MayExists(strconv.Itoa(n)) { | ||
t.Fatalf("expect true got false, length %d; key %d", i, n) | ||
} | ||
} | ||
return b | ||
} | ||
|
||
// testNextLength returns the next filter length to try after n. The step
// grows with n: 1 below 10, 10 below 100, 100 below 1000, and 1000 from
// there on.
func testNextLength(n int) int {
	switch {
	case n < 10:
		return n + 1
	case n < 100:
		return n + 10
	case n < 1000:
		return n + 100
	default:
		return n + 1000
	}
}
|
||
func falsePositive(b *Bloom, n int) float64 { | ||
var m, f float64 | ||
for i := 0; i < 10000; i++ { | ||
if b.MayExists(strconv.Itoa(i + 1000000000)) { | ||
f++ | ||
} | ||
m++ | ||
} | ||
return f / m | ||
} | ||
|
||
func TestFalsePositive(t *testing.T) { | ||
var mediocre, good int | ||
for i := 1; i <= 10000; i = testNextLength(i) { | ||
b := getSetBloom(t, i) | ||
rate := falsePositive(b, i) | ||
// Must not be over 2% | ||
if rate > 0.02 { | ||
t.Fatalf("False positive: %5.2f%%, length = %d, bytes = %d", rate*100, i, len(b.filter)) | ||
} | ||
if rate > 0.0125 { // Allowed, but not too often | ||
mediocre++ | ||
} else { | ||
good++ | ||
} | ||
} | ||
if mediocre > good/5 { | ||
t.Fatalf("expect mediocre > good/5, mediocre = %d, good = %d", mediocre, good) | ||
} | ||
} | ||
|
||
func BenchmarkBloom(b *testing.B) { | ||
b.ReportAllocs() | ||
for i := 1; i <= 10000; i = testNextLength(i) { | ||
b.Run(fmt.Sprintf("n=%d", i), func(b *testing.B) { | ||
b.ReportAllocs() | ||
for j := 0; j < b.N; j++ { | ||
bloom := getSetBloom(b, i) | ||
rate := falsePositive(bloom, i) | ||
// Must not be over 2% | ||
if rate > 0.02 { | ||
b.Fatalf("False positive: %5.2f%%, length = %d, bytes = %d", rate*100, i, len(bloom.filter)) | ||
} | ||
} | ||
}) | ||
} | ||
} | ||
|
||
func BenchmarkBloomTestAndMatch(b *testing.B) { | ||
b.ReportAllocs() | ||
bloom := New(10, b.N) | ||
var n, f float64 | ||
for i := 0; i < b.N; i++ { | ||
if bloom.TestAndAdd(strconv.Itoa(i)) { | ||
if i == 0 { | ||
f++ | ||
} | ||
} | ||
n++ | ||
} | ||
if r := f / n; r > 0.02 { | ||
b.Fatalf("false positive > 0.02: %5.2f%%", r*100) | ||
} | ||
} |