Skip to content

Commit f0d833a

Browse files
committed
Tweak text and code style
Add examples to description Add playground Add test project
1 parent ce78581 commit f0d833a

File tree

8 files changed

+488
-78
lines changed

8 files changed

+488
-78
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
//: Playground - noun: a place where people can play
2+
3+
/// A Bloom filter: a fixed-size, probabilistic set-membership structure.
///
/// Every inserted element is passed through each hash function; each hash is
/// mapped to an index of an internal bit array and that bit is set. A query
/// answers `true` only when all bits for the queried value are set, so it may
/// report false positives but never false negatives. Bits are never cleared,
/// so elements cannot be removed.
public class BloomFilter<T> {
  /// Bit storage; `true` marks a bit that some inserted element has set.
  private var array: [Bool]

  /// Hash functions applied to every inserted or queried value.
  private let hashFunctions: [T -> Int]

  /// Creates an empty filter.
  ///
  /// - Parameter size: Number of bits in the filter. Must be greater than zero.
  /// - Parameter hashFunctions: Functions used to derive bit indices from a
  ///   value. With an empty list, `insert` sets nothing and every `query`
  ///   returns `true`.
  public init(size: Int = 1024, hashFunctions: [T -> Int]) {
    // A zero-length bit array would make the modulo in computeHashes() trap on
    // the first insert or query, so reject it at construction time instead.
    precondition(size > 0, "BloomFilter size must be greater than zero")
    self.array = [Bool](count: size, repeatedValue: false)
    self.hashFunctions = hashFunctions
  }

  /// Maps `value` to one bit index per hash function, each in `0..<array.count`.
  private func computeHashes(value: T) -> [Int] {
    // The remainder lies strictly between -count and +count, so abs() cannot
    // overflow even when the raw hash is Int.min.
    return hashFunctions.map { hashFunc in abs(hashFunc(value) % array.count) }
  }

  /// Records `element` by setting every bit its hashes map to.
  public func insert(element: T) {
    for index in computeHashes(element) {
      array[index] = true
    }
  }

  /// Records every element of `values`.
  public func insert(values: [T]) {
    for value in values {
      insert(value)
    }
  }

  /// Reports whether `value` may have been inserted.
  ///
  /// - Returns: `false` if `value` was definitely never inserted; `true` if
  ///   all of its bits are set, which means it was *possibly* inserted (other
  ///   elements may have set the same bits).
  public func query(value: T) -> Bool {
    // Stop at the first unset bit: one missing bit is enough to prove that
    // the value was never inserted.
    for index in computeHashes(value) where !array[index] {
      return false
    }
    return true
  }

  /// Reports whether no bit has been set yet.
  public func isEmpty() -> Bool {
    // contains() stops scanning at the first set bit.
    return !array.contains(true)
  }
}
49+
50+
51+
52+
/* Two hash functions, adapted from http://www.cse.yorku.ca/~oz/hash.html */
53+
54+
/// djb2 string hash: starts from 5381 and, for each character, replaces the
/// running hash with `(hash << 5) + hash + character.hashValue`, using
/// wrapping addition so overflow never traps.
func djb2(x: String) -> Int {
  return x.characters.reduce(5381) { (accumulated: Int, character: Character) -> Int in
    ((accumulated << 5) &+ accumulated) &+ character.hashValue
  }
}
61+
62+
/// sdbm string hash: starts from 0 and, for each character, replaces the
/// running hash with `character.hashValue + (hash << 6) + (hash << 16) - hash`,
/// using wrapping arithmetic so overflow never traps.
func sdbm(x: String) -> Int {
  return x.characters.reduce(0) { (accumulated: Int, character: Character) -> Int in
    character.hashValue &+ (accumulated << 6) &+ (accumulated << 16) &- accumulated
  }
}
69+
70+
71+
72+
/* A simple test */
73+
74+
let bloom = BloomFilter<String>(size: 17, hashFunctions: [djb2, sdbm]) // 17-bit filter using both string hashes
75+
76+
bloom.insert("Hello world!")
77+
print(bloom.array) // show which bits the first insert set
78+
79+
bloom.query("Hello world!") // true: this string was inserted
80+
bloom.query("Hello WORLD") // false: at least one of its bits is still unset
81+
82+
bloom.insert("Bloom Filterz")
83+
print(bloom.array) // show the bits after the second insert
84+
85+
bloom.query("Bloom Filterz") // true: this string was inserted
86+
bloom.query("Hello WORLD") // true, but a false positive: this string was never inserted
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
2+
<playground version='5.0' target-platform='osx'>
3+
<timeline fileName='timeline.xctimeline'/>
4+
</playground>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<Timeline
3+
version = "3.0">
4+
<TimelineItems>
5+
</TimelineItems>
6+
</Timeline>

Bloom Filter/BloomFilter.swift

+39-51
Original file line numberDiff line numberDiff line change
@@ -1,58 +1,46 @@
1-
import Foundation
2-
31
public class BloomFilter<T> {
4-
private(set) private var arr: [Bool]
5-
private(set) private var hashFunctions: [T -> Int]
6-
7-
public init(size: Int = 1024, hashFunctions: [T -> Int]) {
8-
self.arr = Array<Bool>(count: size, repeatedValue: false)
9-
self.hashFunctions = hashFunctions
2+
private var array: [Bool]
3+
private var hashFunctions: [T -> Int]
4+
5+
public init(size: Int = 1024, hashFunctions: [T -> Int]) {
6+
self.array = .init(count: size, repeatedValue: false)
7+
self.hashFunctions = hashFunctions
8+
}
9+
10+
private func computeHashes(value: T) -> [Int] {
11+
return hashFunctions.map() { hashFunc in abs(hashFunc(value) % array.count) }
12+
}
13+
14+
public func insert(element: T) {
15+
for hashValue in computeHashes(element) {
16+
array[hashValue] = true
1017
}
11-
12-
private func computeHashes(value: T) -> [Int] {
13-
return hashFunctions.map() { hashFunc in
14-
abs(hashFunc(value) % self.arr.count)
15-
}
18+
}
19+
20+
public func insert(values: [T]) {
21+
for value in values {
22+
insert(value)
1623
}
24+
}
25+
26+
public func query(value: T) -> Bool {
27+
let hashValues = computeHashes(value)
1728

18-
public func insert(toInsert: T) {
19-
let hashValues: [Int] = self.computeHashes(toInsert)
20-
21-
for hashValue in hashValues {
22-
self.arr[hashValue] = true
23-
}
24-
}
29+
// Map hashes to indices in the Bloom Filter
30+
let results = hashValues.map() { hashValue in array[hashValue] }
2531

26-
public func insert(values: [T]) {
27-
for value in values {
28-
self.insert(value)
29-
}
30-
}
32+
// All values must be 'true' for the query to return true
3133

32-
public func query(value: T) -> Bool {
33-
let hashValues = self.computeHashes(value)
34-
35-
// Map hashes to indices in the Bloom filter
36-
let results = hashValues.map() { hashValue in
37-
self.arr[hashValue]
38-
}
39-
40-
// All values must be 'true' for the query to return true
41-
42-
// This does NOT imply that the value is in the Bloom filter,
43-
// only that it may be. If the query returns false, however,
44-
// you can be certain that the value was not added.
45-
46-
let exists = results.reduce(true, combine: { $0 && $1 })
47-
48-
return exists
49-
}
50-
51-
public func isEmpty() -> Bool {
52-
// Reduce list; as soon as the reduction hits a 'true' value, the && condition will fail
53-
return arr.reduce(true) { prev, next in
54-
prev && !next
55-
}
56-
}
34+
// This does NOT imply that the value is in the Bloom filter,
35+
// only that it may be. If the query returns false, however,
36+
// you can be certain that the value was not added.
5737

58-
}
38+
let exists = results.reduce(true, combine: { $0 && $1 })
39+
return exists
40+
}
41+
42+
public func isEmpty() -> Bool {
43+
// As soon as the reduction hits a 'true' value, the && condition will fail.
44+
return array.reduce(true) { prev, next in prev && !next }
45+
}
46+
}

Bloom Filter/README.markdown

+68-25
Original file line numberDiff line numberDiff line change
@@ -2,57 +2,100 @@
22

33
## Introduction
44

5-
A Bloom Filter is a space-efficient data structure to check for an element in a set, that guarantees that there are no false negatives on queries. In other words, a query to a Bloom filter either returns "false", meaning the element is definitely not in the set, or "true", meaning that the element could be in the set. At first, this may not seem too useful. However, it's important in applications like cache filtering and data synchronization.
5+
A Bloom Filter is a space-efficient data structure that tells you whether or not an element is present in a set.
66

7-
An advantage of the Bloom Filter over a hash table is that the former maintains constant memory usage and constant-time insert and search. For a large number of elements in a set, the performance difference between a hash table and a Bloom Filter is significant, and it is a viable option if you do not need the guarantee of no false positives.
7+
This is a probabilistic data structure: a query to a Bloom filter either returns `false`, meaning the element is definitely not in the set, or `true`, meaning that the element *might* be in the set.
88

9-
## Implementation
9+
There is a small probability of false positives, where the element isn't actually in the set even though the query returned `true`. But there will never be any false negatives: you're guaranteed that if the query returns `false`, then the element really isn't in the set.
1010

11-
A Bloom Filter is essentially a fixed-length bit vector. To insert an element in the filter, it is hashed with *m* different hash functions, which map to indices in the array. The bits at these indices are set to `1`, or `true`, when an element is inserted.
11+
So a Bloom Filter tells you, "definitely not" or "probably yes".
1212

13-
Querying, similarly, is accomplished by hashing the expected value, and checking to see if all of the bits at the indices are `true`. If even one of the bits is not `true`, the element could not have been inserted - and the query returns `false`. If all the bits are `true`, the query returns likewise. If there are "collisions", the query may erroneously return `true` even though the element was not inserted - bringing about the issue with false positives mentioned earlier.
13+
At first, this may not seem too useful. However, it's important in applications like cache filtering and data synchronization.
14+
15+
An advantage of the Bloom Filter over a hash table is that the former maintains constant memory usage and constant-time insert and search. For sets with a large number of elements, the performance difference between a hash table and a Bloom Filter is significant, and it is a viable option if you do not need the guarantee of no false positives.
16+
17+
> **Note:** Unlike a hash table, the Bloom Filter does not store the actual objects. It just remembers what objects you’ve seen (with a degree of uncertainty) and which ones you haven’t.
18+
19+
## How it works
20+
21+
A Bloom Filter is essentially a fixed-length [bit vector](../Bit%20Set/), an array of bits. When we insert objects, we set some of these bits to `1`, and when we query for objects we check if certain bits are `0` or `1`. Both operations use hash functions.
22+
23+
To insert an element in the filter, the element is hashed with several different hash functions. Each hash function returns a value that we map to an index in the array. We set the bits at these indices to `1` or true.
24+
25+
For example, let's say this is our array of bits. We have 17 bits and initially they are all `0` or false:
26+
27+
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ]
28+
29+
Now we want to insert the string `"Hello world!"` into the Bloom Filter. We apply two hash functions to this string. The first one gives the value 1999532104120917762. We map this hash value to an index into our array by taking the modulo of the array length: `1999532104120917762 % 17 = 4`. This means we set the bit at index 4 to `1` or true:
30+
31+
[ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ]
32+
33+
Then we hash the original string again but this time with a different hash function. It gives the hash value 9211818684948223801. Modulo 17 that is 12, and we set the bit at index 12 to `1` as well:
34+
35+
[ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0 ]
36+
37+
These two bits are enough to tell the Bloom Filter that it now contains the string `"Hello world!"`.
38+
39+
Querying, similarly, is accomplished by first hashing the expected value, which gives several array indices, and then checking to see if all of the bits at those indices are `true`. If even one of the bits is not `true`, the element could not have been inserted and the query returns `false`. If all the bits are `true`, the query returns likewise.
40+
41+
For example, if we query for the string `"Hello WORLD"`, then the first hash function returns 5383892684077141175, which modulo 17 is 12. That bit is `1`. But the second hash function gives 5625257205398334446, which maps to array index 9. That bit is `0`. This means the string `"Hello WORLD"` is not in the filter and the query returns `false`.
42+
43+
The fact that the first hash function mapped to a `1` bit is a coincidence (it has nothing to do with the fact that both strings start with `"Hello "`). Too many such coincidences can lead to "collisions". If there are collisions, the query may erroneously return `true` even though the element was not inserted -- bringing about the issue with false positives mentioned earlier.
44+
45+
Let's say we insert some other element, `"Bloom Filterz"`, which sets bits 7 and 9. Now the array looks like this:
46+
47+
[ 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0 ]
48+
49+
If you query for `"Hello WORLD"` again, the filter sees that bit 12 is true and bit 9 is now true as well. It reports that `"Hello WORLD"` is indeed present in the set, even though it isn't... because we never inserted that particular string. It's a false positive. This example shows why a Bloom Filter will never say, "definitely yes", only "probably yes".
50+
51+
You can fix such issues by using an array with more bits and using additional hash functions. Of course, the more hash functions you use the slower the Bloom Filter will be. So you have to strike a balance.
1452

1553
Deletion is not possible with a Bloom Filter, since any one bit might have been set by multiple elements inserted. Once you add an element, it's in there for good.
1654

17-
## The Code
55+
Performance of a Bloom Filter is **O(k)** where **k** is the number of hashing functions.
56+
57+
## The code
1858

19-
The code is extremely straightforward, as you can imagine. The internal bit array is set to a fixed length on initialization, which cannot be mutated once it is initialized. Several hash functions should be specified at initialization, which will depend on the types you're using. You can see some examples in the tests - the djb2 and sdbm hash functions for strings.
59+
The code is quite straightforward. The internal bit array is set to a fixed length on initialization, which cannot be mutated once it is initialized.
2060

2161
```swift
2262
public init(size: Int = 1024, hashFunctions: [T -> Int]) {
23-
self.arr = Array<Bool>(count: size, repeatedValue: false)
24-
self.hashFunctions = hashFunctions
63+
self.array = .init(count: size, repeatedValue: false)
64+
self.hashFunctions = hashFunctions
2565
}
2666
```
2767

68+
Several hash functions should be specified at initialization. Which hash functions you use will depend on the datatypes of the elements you'll be adding to the set. You can see some examples in the playground and the tests -- the `djb2` and `sdbm` hash functions for strings.
69+
2870
Insertion just flips the required bits to `true`:
2971

3072
```swift
31-
public func insert(toInsert: T) {
32-
let hashValues: [Int] = self.computeHashes(toInsert)
73+
public func insert(element: T) {
74+
for hashValue in computeHashes(element) {
75+
array[hashValue] = true
76+
}
77+
}
78+
```
3379

34-
for hashValue in hashValues {
35-
self.arr[hashValue] = true
36-
}
80+
This uses the `computeHashes()` function, which loops through the specified `hashFunctions` and returns an array of indices:
81+
82+
```swift
83+
private func computeHashes(value: T) -> [Int] {
84+
return hashFunctions.map() { hashFunc in abs(hashFunc(value) % array.count) }
3785
}
3886
```
3987

4088
And querying checks to make sure the bits at the hashed values are `true`:
4189

4290
```swift
4391
public func query(value: T) -> Bool {
44-
let hashValues = self.computeHashes(value)
45-
46-
let results = hashValues.map() { hashValue in
47-
self.arr[hashValue]
48-
}
49-
50-
let exists = results.reduce(true, combine: { $0 && $1 })
51-
52-
return exists
92+
let hashValues = computeHashes(value)
93+
let results = hashValues.map() { hashValue in array[hashValue] }
94+
let exists = results.reduce(true, combine: { $0 && $1 })
95+
return exists
5396
}
5497
```
5598

56-
If you're coming from another imperative language, you might notice the unusual syntax in the `exists` constant assignment. Swift makes use of functional paradigms when it makes code more consise and readable, and in this case, `reduce` is a much more consise way to check if all the required bits are `true` than a `for` loop.
99+
If you're coming from another imperative language, you might notice the unusual syntax in the `exists` assignment. Swift makes use of functional paradigms when it makes code more concise and readable, and in this case, `reduce` is a much more concise way to check if all the required bits are `true` than a `for` loop.
57100

58-
*Written for Swift Algorithm Club by Jamil Dhanani*
101+
*Written for Swift Algorithm Club by Jamil Dhanani. Edited by Matthijs Hollemans.*

Bloom Filter/BloomFilterTests/BloomFilterTests.swift renamed to Bloom Filter/Tests/BloomFilterTests.swift

-2
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
import XCTest
2-
import BloomFilter
32

43
/* Two hash functions, adapted from
54
http://www.cse.yorku.ca/~oz/hash.html */
65

7-
86
func djb2(x: String) -> Int {
97
var hash = 5381
108

Bloom Filter/Tests/Info.plist

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3+
<plist version="1.0">
4+
<dict>
5+
<key>CFBundleDevelopmentRegion</key>
6+
<string>en</string>
7+
<key>CFBundleExecutable</key>
8+
<string>$(EXECUTABLE_NAME)</string>
9+
<key>CFBundleIdentifier</key>
10+
<string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
11+
<key>CFBundleInfoDictionaryVersion</key>
12+
<string>6.0</string>
13+
<key>CFBundleName</key>
14+
<string>$(PRODUCT_NAME)</string>
15+
<key>CFBundlePackageType</key>
16+
<string>BNDL</string>
17+
<key>CFBundleShortVersionString</key>
18+
<string>1.0</string>
19+
<key>CFBundleSignature</key>
20+
<string>????</string>
21+
<key>CFBundleVersion</key>
22+
<string>1</string>
23+
</dict>
24+
</plist>

0 commit comments

Comments
 (0)