Skip to content

Commit 93b0138

Browse files
committed
BasicReconcile
1 parent 150f5cc commit 93b0138

File tree

7 files changed

+191
-138
lines changed

7 files changed

+191
-138
lines changed

hybridestimator.go

-3
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@ func (h *HybridEstimator) BuildSignature(keys [][]byte) {
4646
}
4747

4848
func (h *HybridEstimator) EstimateSizeDifference(remote *HybridEstimator) int {
49-
5049
count := 0
5150

5251
for level := h.Depth - 1; level >= -1; level-- {
@@ -55,9 +54,7 @@ func (h *HybridEstimator) EstimateSizeDifference(remote *HybridEstimator) int {
5554
} else if level < 2 { //MinHash
5655
mh := h.MinHashset[level]
5756
fmt.Println(mh.Difference(remote.MinHashset[level]))
58-
5957
} else { //IBF Strata
60-
6158
ibf := h.IBFset[level]
6259
remotelevel := remote.IBFset[level]
6360
ibf.Subtract(remotelevel)

ibf.go

+25
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,31 @@ func (f *IBF) UnmarshalJSON(data []byte) error {
8585
return nil
8686
}
8787

88+
//SetIBF sets the IBF's fields directly from an IBFSerialization, hex-decoding its Data field into the bitset
89+
func (f *IBF) SetIBF(data IBFSerialization) error {
90+
bitset, err := hex.DecodeString(data.Data)
91+
if err != nil {
92+
return err
93+
}
94+
95+
f.Size = data.Size
96+
f.Keysize = data.Keysize
97+
f.Hashset = data.Hashset
98+
f.Countset = data.Countset
99+
f.Bitset = bitset
100+
101+
return nil
102+
}
103+
104+
func (f *IBF) GetIBF() IBFSerialization {
105+
return IBFSerialization{
106+
f.Size,
107+
f.Keysize,
108+
f.Hashset,
109+
f.Countset,
110+
hex.EncodeToString(f.Bitset)}
111+
}
112+
88113
// Hashes returns an array of hash values resulting from the specified `key`.
89114
// This implementation uses the 128-bit x86 murmur3 hash and returns the
90115
// following, in order:

minhash.go

+52-56
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
package reconcile
22

33
import (
4-
"fmt"
54
"math"
65
)
76

@@ -18,69 +17,66 @@ import (
1817
//if h(row) < M(i, c)
1918
//M(i,c) = h(row)
2019

21-
func MinHashDifference(sigA [][]uint64, sigB [][]uint64) int {
22-
if len(sigA) != len(sigB) {
23-
fmt.Println("Hash count doesn't match")
24-
return 0 //should throw error
25-
}
26-
if len(sigA[0]) != len(sigB[0]) {
27-
fmt.Println("Keysizes don't match")
28-
return 0 //should throw error
29-
}
30-
31-
diff := 0
32-
bitwidth := len(sigA[0])
33-
for row := range sigA {
34-
for col := 0; col < bitwidth; col++ {
35-
if sigA[row][col] == sigB[row][col] {
36-
if sigA[row][col] != math.MaxUint64 && sigB[row][col] != math.MaxUint64 {
37-
diff++
38-
}
39-
}
40-
}
41-
}
42-
43-
return diff
20+
type MinHash struct {
21+
keysize int
22+
signature [][]uint64
23+
hashcount uint32
24+
keycount int
4425
}
4526

46-
//Should probably return a custom struct
47-
func GetMinHashSignature(keys [][]byte, hashcount uint32) [][]uint64 {
48-
//dont precompute permutation hashset
49-
//build signature table of hashcount rows
50-
//hashseed is signature row
51-
keysize := len(keys[0])
27+
func NewMinHash(hashcount uint32, keysize int) *MinHash {
28+
var hashseed uint32
5229

53-
mh := make([][]uint64, hashcount)
54-
for row := range mh {
55-
mh[row] = make([]uint64, keysize*8)
30+
//Initialise signature with maximum values
31+
signature := make([][]uint64, hashcount)
32+
for row := range signature {
33+
signature[row] = make([]uint64, keysize*8)
5634
}
5735

58-
var hashseed uint32
59-
60-
//init signature with values (maximum)
6136
for hashseed = 0; hashseed < hashcount; hashseed++ { //0 to hashcount
6237
for bitindex := 0; bitindex < (keysize * 8); bitindex++ { //for each bit in key
63-
mh[hashseed][bitindex] = math.MaxUint64
38+
signature[hashseed][bitindex] = math.MaxUint64
6439
} //scan through bits
6540
} //for each hash
41+
return &MinHash{keysize, signature, hashcount, 0}
42+
}
43+
44+
func (mh *MinHash) Add(key []byte) {
45+
var hashseed uint32
6646

67-
for _, key := range keys {
68-
for hashseed = 0; hashseed < hashcount; hashseed++ { //0 to hashcount
69-
sum := Sum128x32(key, hashseed)
70-
hash := uint64(sum[0]) % uint64(len(keys))
71-
for byteindex, keybyte := range key { //for each byte of key
72-
var pattern uint8 = 1
73-
for bitindex := 0; bitindex < 8; bitindex++ { //for each bit in key
74-
if keybyte&pattern != 0 { //do if found bit & not already found
75-
if hash < mh[hashseed][(byteindex*8)+bitindex] {
76-
mh[hashseed][(byteindex*8)+bitindex] = hash
77-
} //if hash is less than then update
78-
} //if bit is 1
79-
pattern <<= 1
80-
} //scan through bits
81-
} //scan through bytes of key
82-
} //for each hash
83-
} //for each key
47+
for hashseed = 0; hashseed < mh.hashcount; hashseed++ { //0 to hashcount
48+
sum := Sum128x32(key, hashseed)
49+
hash := uint64(sum[0]) % uint64(mh.hashcount)
50+
for byteindex, keybyte := range key { //for each byte of key
51+
var pattern uint8 = 1
52+
for bitindex := 0; bitindex < 8; bitindex++ { //for each bit in key
53+
if keybyte&pattern != 0 { //do if found bit & not already found
54+
if hash < mh.signature[hashseed][(byteindex*8)+bitindex] {
55+
mh.signature[hashseed][(byteindex*8)+bitindex] = hash
56+
} //if hash is less than then update
57+
} //if bit is 1
58+
pattern <<= 1
59+
} //scan through bits
60+
} //scan through bytes of key
61+
} //for each hash
62+
mh.keycount++
63+
}
8464

85-
return mh
86-
} //GetSignature
65+
//Returns Jacquard similiarity score
66+
func (mh *MinHash) Difference(remote *MinHash) int {
67+
//TODO: catch errors (e.g. hashcount or keysize of remote not matching mh)
68+
69+
match := 0
70+
bitwidth := mh.keysize * 8
71+
for row := range mh.signature {
72+
for col := 0; col < bitwidth; col++ {
73+
if mh.signature[row][col] == remote.signature[row][col] {
74+
if mh.signature[row][col] != math.MaxUint64 && remote.signature[row][col] != math.MaxUint64 {
75+
match++
76+
}
77+
} //if they match
78+
} //for each bit
79+
} //for each hash/row in signature
80+
score := float64(match) / float64(bitwidth*int(mh.hashcount))
81+
return int(((1.0 - score) / (1.0 + score)) * float64(mh.keycount+remote.keycount))
82+
}

minhash_test.go

+11-16
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,12 @@ import (
77
)
88

99
func TestMinHash(t *testing.T) {
10-
numElements := 1000
11-
numDifferences := 1000
10+
numElements := 100
11+
numDifferences := 80
1212
var hashCount uint32 = 100
1313
keysize := 32
14-
15-
localkeys := [][]byte{}
16-
remotekeys := [][]byte{}
14+
local := NewMinHash(hashCount, keysize)
15+
remote := NewMinHash(hashCount, keysize)
1716

1817
for i := 0; i < numElements; i++ {
1918
element := make([]byte, keysize)
@@ -22,8 +21,8 @@ func TestMinHash(t *testing.T) {
2221
t.Error("Could not get random bytes for set element")
2322
return
2423
}
25-
localkeys = append(localkeys, element)
26-
remotekeys = append(remotekeys, element)
24+
local.Add(element)
25+
remote.Add(element)
2726
}
2827

2928
for i := 0; i < numDifferences; i++ {
@@ -34,20 +33,16 @@ func TestMinHash(t *testing.T) {
3433
return
3534
}
3635
// Add to a set at random
37-
diffSet := &localkeys
36+
diffSet := local
3837
if rand.Intn(2) == 0 {
39-
diffSet = &remotekeys
38+
diffSet = remote
4039
}
41-
*diffSet = append(*diffSet, element)
40+
diffSet.Add(element)
4241
}
4342

44-
sigA := GetMinHashSignature(localkeys, hashCount)
45-
sigB := GetMinHashSignature(remotekeys, hashCount)
46-
47-
diff := MinHashDifference(sigA, sigB)
43+
diff := local.Difference(remote)
4844

49-
fmt.Printf("MinHash Diff %v out of %v\n", diff, int(hashCount)*keysize*8)
50-
//fmt.Printf("%v%%\n", diff, int(hashCount)*keysize*8)
45+
fmt.Printf("MinHash Diff %v vs actual: %v\n", diff, numDifferences)
5146

5247
fmt.Printf("End MinHash \n")
5348
}

reconcile.go

+59-37
Original file line numberDiff line numberDiff line change
@@ -1,46 +1,68 @@
11
package reconcile
22

3-
//Inputs:
4-
//-A local set
5-
//-A remote strata estimator
6-
//-A remote MiHash signature
7-
8-
//Estimates the difference between the local and remote set
9-
//Builds an IBF of size estimated * factor
10-
//Returns two sets of what is missing from either data sets
11-
12-
//Usage steps:
13-
//NewSetReconciler(set)
14-
//Estimate size of difference
15-
//Get Missing Elements
16-
17-
//Size of difference involves partitioning keys into n levels
18-
//where size of leveln = len(set) / (2^n)
19-
//Large Partitions are handed to minhash estimator
20-
//Level 1 - 2
21-
//Strata estimator operates on 3..
22-
23-
//1: Both parties
24-
// Generate Two Signatures
25-
// level1-2 minhash
26-
// further strata
27-
//filter := NewIBF(cells, keysize)
28-
29-
//create the reconciler
30-
//exchange set sizes and agree on minimum size
31-
//TEMPORARY!!
32-
//use local set to build minhash and strata estimator data structure
33-
//exchange with other party
34-
//estimate difference size
35-
//build ibf
36-
//exchange ibfs
37-
//get difference
3+
import (
4+
"math"
5+
)
6+
7+
//Create reconciler with local keys knowing the remote set size
8+
//Generate strata signature and transmit
9+
//Receive remote strata signature, estimate size
10+
//Build IBF of 'size' and transmit signature
11+
//Receive remote IBF signature and calculate difference
3812

3913
type Reconcile struct {
14+
Keyset [][]byte
15+
Estimator *Strata
16+
Depth int
17+
DiffSize int //default 0
18+
}
19+
20+
//Creates a set reconciler and populates a size estimator with all local keys
21+
func NewReconcile(keys [][]byte, remotesetsize int) *Reconcile {
22+
//Ugly: depth is ceil(log2(n)), where n is the larger of the local and remote set sizes
23+
var depth int
24+
if remotesetsize > len(keys) {
25+
depth = int(math.Ceil(math.Log2(float64(remotesetsize))))
26+
} else {
27+
depth = int(math.Ceil(math.Log2(float64(len(keys)))))
28+
}
29+
30+
estimator := NewStrata(80, len(keys[0]), depth)
31+
estimator.Populate(keys)
32+
33+
return &Reconcile{keys, estimator, depth, 0}
34+
}
35+
36+
func (r *Reconcile) GetDifferenceSizeEstimator() ([]byte, error) {
37+
return r.Estimator.MarshalStrataJSON()
38+
}
39+
40+
//Takes JSON estimator data from remote and estimates size of difference
41+
func (r *Reconcile) EstimateDifferenceSize(data []byte) error {
42+
remote := NewStrata(80, len(r.Keyset[0]), r.Depth)
43+
remote.UnmarshalStrataJSON(data)
44+
r.DiffSize = r.Estimator.Estimate(remote)
45+
return nil
46+
}
4047

48+
//Generates signature of ibf dataset
49+
//Must be called after estimating difference size
50+
func (r *Reconcile) GetIBFSignature(size int) ([]byte, error) {
51+
ibf := NewIBF(size, len(r.Keyset[0]))
52+
for _, key := range r.Keyset {
53+
ibf.Add(key)
54+
}
55+
return ibf.MarshalJSON()
4156
}
4257

43-
func NewSetReconciler(keys [][]byte) {
58+
func (r *Reconcile) GetDifference(size int, remotesignature []byte) (a [][]byte, b [][]byte, ok bool) {
59+
ibf := NewIBF(size, len(r.Keyset[0]))
60+
for _, key := range r.Keyset {
61+
ibf.Add(key)
62+
}
63+
remoteibf := NewIBF(size, len(r.Keyset[0]))
64+
remoteibf.UnmarshalJSON(remotesignature)
65+
ibf.Subtract(remoteibf)
66+
return ibf.Decode()
4467

45-
IBFset := make([]*IBF, )
4668
}

0 commit comments

Comments
 (0)