|
| 1 | +package filter |
| 2 | + |
| 3 | +import ( |
| 4 | + "github.com/spaolacci/murmur3" |
| 5 | + "math" |
| 6 | +) |
| 7 | + |
const (
	// DEFAULT_PROB is the false-positive probability that Initialize applies
	// when a BloomFilter's FPProb has been left at its zero value.
	DEFAULT_PROB = 0.000001
)
| 11 | + |
| 12 | +// TODO: Hash in directories |
| 13 | + |
// BloomFilter holds the state of a Bloom filter built for an image file.
// For a given file path the filter answers either "definitely not in the
// image file" or "maybe in the image file", the latter with a certain
// probability of being wrong.
// This struct implements the Filter interface.
type BloomFilter struct {
	// FPProb (False Positive Probability) is the desired probability (p) of
	// the filter reporting a false positive.
	FPProb float64

	// NumHashes is the number of hash functions used by the filter (k).
	NumHashes uint64

	// NumElem is the number of elements in the filter (n).
	NumElem uint64

	// FilterSize is the number of bits in the filter (m).
	FilterSize uint64

	// BitSet is the array of bits backing the filter, one bool per bit.
	BitSet []bool
}
| 34 | + |
| 35 | + |
| 36 | +// Initialize implements Filter.Initialize. Assumes b.NumElem is set to number of expected elements |
| 37 | +// and FPProb is set |
| 38 | +func (b *BloomFilter) Initialize() { |
| 39 | + // Check for error conditions |
| 40 | + if (b.NumElem < 1) { |
| 41 | + // Return error |
| 42 | + return |
| 43 | + } |
| 44 | + |
| 45 | + // Initialize FPProb |
| 46 | + if (b.FPProb == 0) { b.FPProb = DEFAULT_PROB } |
| 47 | + |
| 48 | + // Compute filter size and initialize bitarray |
| 49 | + b.FilterSize = b.calcFilterSize() |
| 50 | + b.BitSet = make([]bool, b.FilterSize) |
| 51 | + |
| 52 | + // Compute number of hashes (k) |
| 53 | + b.NumHashes = b.calcNumHashes() |
| 54 | + |
| 55 | +} |
| 56 | + |
| 57 | +// calcFilterSize calculates the optimal size of bit array given prob and elements |
| 58 | +// Assumes FPProb and NumElem is set |
| 59 | +// m = ceil((n*log(p)) / log(1 / pow(2, log(2))) |
| 60 | +func (b *BloomFilter) calcFilterSize() uint64 { |
| 61 | + return uint64(math.Ceil((float64(b.NumElem) * math.Log(b.FPProb)) / math.Log(1 / math.Pow(2, math.Log(2))))) |
| 62 | +} |
| 63 | + |
| 64 | +// calcNumHashes calculates the aptimal number of hashes given the filter size and the number of elements |
| 65 | +// Assumes FilterSize and NumElem set |
| 66 | +// k = round((m / n) * log(2)) |
| 67 | +func (b *BloomFilter) calcNumHashes() uint64 { |
| 68 | + return uint64(math.Round(float64(b.FilterSize / b.NumElem) * math.Log(2))) |
| 69 | +} |
| 70 | + |
| 71 | +// AddElement implements Filter.AddElement |
| 72 | +func (b *BloomFilter) AddElement(elem []byte) { |
| 73 | + // Get the hashed value of the element |
| 74 | + h1, h2 := b.hashElement(elem) |
| 75 | + |
| 76 | + intHash := h1 |
| 77 | + |
| 78 | + // Set bits in bitset to represent added element -> TODO: Does int cast affect anything? |
| 79 | + for i:=0; i < int(b.NumHashes); i++ { |
| 80 | + intHash += (b.NumHashes*h2) |
| 81 | + bitToSet := intHash % b.FilterSize |
| 82 | + b.setBit(bitToSet) |
| 83 | + } |
| 84 | + |
| 85 | +} |
| 86 | + |
| 87 | +// hashElement hashes the elem passed in based on the murmur hash function |
| 88 | +// TODO: Unsure if Sum128 is correct |
| 89 | +func (b *BloomFilter) hashElement(elem []byte) (uint64, uint64) { |
| 90 | + return murmur3.Sum128(elem) |
| 91 | +} |
| 92 | + |
| 93 | +// setBits will set bit at position to true |
| 94 | +func (b *BloomFilter) setBit(position uint64) { |
| 95 | + b.BitSet[position] = true |
| 96 | +} |
| 97 | + |
| 98 | +// RemoveElement removes an element from the filter |
| 99 | +// |
| 100 | +// |
| 101 | +func (b *BloomFilter) RemoveElement() { |
| 102 | + // No-op for bloom filter |
| 103 | +} |
| 104 | + |
| 105 | +// TestElement implements Filter.TestElement |
| 106 | +func (b *BloomFilter) TestElement(elem []byte) bool { |
| 107 | + // TODO: Make this modular with add element |
| 108 | + // Get the hashed value of the element |
| 109 | + h1, h2 := b.hashElement(elem) |
| 110 | + |
| 111 | + intHash := h1 |
| 112 | + |
| 113 | + // TODO: Look into this, may be perf issue.. |
| 114 | + //var testFilter = make([]bool, b.FilterSize) |
| 115 | + |
| 116 | + // Create a test bit array |
| 117 | + //copy(testFilter, b.BitSet) |
| 118 | + |
| 119 | + // Set bits in bitset to represent added element |
| 120 | + for i:=0; i < int(b.NumHashes); i++ { |
| 121 | + intHash += (b.NumHashes*h2) |
| 122 | + bitToSet := intHash % b.FilterSize |
| 123 | + |
| 124 | + // Check if bitset in filter |
| 125 | + if (!b.BitSet[bitToSet]) { |
| 126 | + return false |
| 127 | + } |
| 128 | + } |
| 129 | + |
| 130 | + // Test if found by checking that all bits set are same as original |
| 131 | + return true |
| 132 | +} |
| 133 | + |
| 134 | +// checkBitSetEquality checks if a test bloom filter equals the current bloom filter |
| 135 | +func (b *BloomFilter) checkBitSetEquality(test []bool) (bool) { |
| 136 | + if (len(test) != len(b.BitSet)) { return false } |
| 137 | + |
| 138 | + for i:=0; i < int(b.FilterSize); i++ { |
| 139 | + if (b.BitSet[i] != test[i]) { return false } |
| 140 | + } |
| 141 | + |
| 142 | + return true |
| 143 | +} |
0 commit comments