Skip to content
This repository was archived by the owner on Jun 20, 2023. It is now read-only.

Commit 21b0c06

Browse files
authored Oct 7, 2019
Merge pull request #16 from ipfs/feat/buzhash
Implement buzhash
2 parents 9a794d0 + 57fa659 commit 21b0c06

7 files changed

+315
-59
lines changed
 

‎benchmark_test.go

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
package chunk
2+
3+
import (
4+
"bytes"
5+
"io"
6+
"math/rand"
7+
"testing"
8+
)
9+
10+
// newSplitter builds a Splitter that reads its input from the given reader.
// The benchmark helpers take one so the same harness can drive any chunker.
type newSplitter func(io.Reader) Splitter
11+
12+
// bencSpec describes one benchmark case: how much input to chunk and the
// sub-benchmark name it is reported under.
type bencSpec struct {
13+
size int // input length in bytes
14+
name string // label passed to b.Run
15+
}
16+
17+
var bSizes = []bencSpec{
18+
{1 << 10, "1K"},
19+
{1 << 20, "1M"},
20+
{16 << 20, "16M"},
21+
{100 << 20, "100M"},
22+
}
23+
24+
func benchmarkChunker(b *testing.B, ns newSplitter) {
25+
for _, s := range bSizes {
26+
s := s
27+
b.Run(s.name, func(b *testing.B) {
28+
benchmarkChunkerSize(b, ns, s.size)
29+
})
30+
}
31+
}
32+
33+
func benchmarkChunkerSize(b *testing.B, ns newSplitter, size int) {
34+
rng := rand.New(rand.NewSource(1))
35+
data := make([]byte, size)
36+
rng.Read(data)
37+
38+
b.SetBytes(int64(size))
39+
b.ReportAllocs()
40+
b.ResetTimer()
41+
42+
var res uint64
43+
44+
for i := 0; i < b.N; i++ {
45+
r := ns(bytes.NewReader(data))
46+
47+
for {
48+
chunk, err := r.NextBytes()
49+
if err != nil {
50+
if err == io.EOF {
51+
break
52+
}
53+
b.Fatal(err)
54+
}
55+
res = res + uint64(len(chunk))
56+
}
57+
}
58+
Res = Res + res
59+
}

‎buzhash.go

+128
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
package chunk
2+
3+
import (
4+
"io"
5+
"math/bits"
6+
7+
pool "github.com/libp2p/go-buffer-pool"
8+
)
9+
10+
// Tuning parameters of the Buzhash chunker.
const (
	// buzMin is the offset at which NextBytes starts looking for a chunk
	// boundary; only the final chunk of a stream may be shorter (128 KiB).
	buzMin = 128 * 1024
	// buzMax is the size of the working buffer and therefore the largest
	// chunk that can be produced (512 KiB).
	buzMax = 512 * 1024
	// buzMask selects the low 17 bits of the rolling hash; a boundary is
	// declared where all of them are zero.
	buzMask = (1 << 17) - 1
)
15+
16+
// Buzhash splits the data read from r into chunks whose boundaries are chosen
// by a rolling hash over the content (see NextBytes).
type Buzhash struct {
17+
r io.Reader // source of the data being chunked
18+
buf []byte // working buffer of buzMax bytes taken from the buffer pool; nil once chunking has finished
19+
n int // number of bytes at the start of buf left over from the previous NextBytes call
20+
21+
err error // sticky error (including io.EOF) returned by every later NextBytes call
22+
}
23+
24+
func NewBuzhash(r io.Reader) *Buzhash {
25+
return &Buzhash{
26+
r: r,
27+
buf: pool.Get(buzMax),
28+
}
29+
}
30+
31+
// Reader returns the io.Reader this chunker was created with.
func (b *Buzhash) Reader() io.Reader {
32+
return b.r
33+
}
34+
35+
func (b *Buzhash) NextBytes() ([]byte, error) {
36+
if b.err != nil {
37+
return nil, b.err
38+
}
39+
40+
n, err := io.ReadFull(b.r, b.buf[b.n:])
41+
if err != nil {
42+
if err == io.ErrUnexpectedEOF || err == io.EOF {
43+
if b.n+n < buzMin {
44+
b.err = io.EOF
45+
res := make([]byte, b.n+n)
46+
copy(res, b.buf)
47+
48+
pool.Put(b.buf)
49+
b.buf = nil
50+
return res, nil
51+
}
52+
} else {
53+
b.err = err
54+
pool.Put(b.buf)
55+
b.buf = nil
56+
return nil, err
57+
}
58+
}
59+
60+
i := buzMin - 32
61+
62+
var state uint32 = 0
63+
64+
for ; i < buzMin; i++ {
65+
state = bits.RotateLeft32(state, 1)
66+
state = state ^ bytehash[b.buf[i]]
67+
}
68+
69+
if b.n+n > len(b.buf) {
70+
panic("this is impossible, but gives +9 to performance")
71+
}
72+
73+
for ; state&buzMask != 0 && i < b.n+n; i++ {
74+
state = bits.RotateLeft32(state, 1) ^ bytehash[b.buf[i-32]] ^ bytehash[b.buf[i]]
75+
}
76+
77+
res := make([]byte, i)
78+
copy(res, b.buf)
79+
80+
b.n = copy(b.buf, b.buf[i:b.n+n])
81+
82+
return res, nil
83+
}
84+
85+
// bytehash maps every possible byte value to the 32-bit word that NextBytes
// mixes into its rolling hash. TestBuzhashBitsHashBias requires each of the 32
// bit positions to be set in exactly 128 of the 256 entries.
// NOTE(review): presumably the output of gen/main.go, whose header says it
// generates the bytehash LUT — confirm before regenerating.
var bytehash = [256]uint32{
86+
0x6236e7d5, 0x10279b0b, 0x72818182, 0xdc526514, 0x2fd41e3d, 0x777ef8c8,
87+
0x83ee5285, 0x2c8f3637, 0x2f049c1a, 0x57df9791, 0x9207151f, 0x9b544818,
88+
0x74eef658, 0x2028ca60, 0x0271d91a, 0x27ae587e, 0xecf9fa5f, 0x236e71cd,
89+
0xf43a8a2e, 0xbb13380, 0x9e57912c, 0x89a26cdb, 0x9fcf3d71, 0xa86da6f1,
90+
0x9c49f376, 0x346aecc7, 0xf094a9ee, 0xea99e9cb, 0xb01713c6, 0x88acffb,
91+
0x2960a0fb, 0x344a626c, 0x7ff22a46, 0x6d7a1aa5, 0x6a714916, 0x41d454ca,
92+
0x8325b830, 0xb65f563, 0x447fecca, 0xf9d0ea5e, 0xc1d9d3d4, 0xcb5ec574,
93+
0x55aae902, 0x86edc0e7, 0xd3a9e33, 0xe70dc1e1, 0xe3c5f639, 0x9b43140a,
94+
0xc6490ac5, 0x5e4030fb, 0x8e976dd5, 0xa87468ea, 0xf830ef6f, 0xcc1ed5a5,
95+
0x611f4e78, 0xddd11905, 0xf2613904, 0x566c67b9, 0x905a5ccc, 0x7b37b3a4,
96+
0x4b53898a, 0x6b8fd29d, 0xaad81575, 0x511be414, 0x3cfac1e7, 0x8029a179,
97+
0xd40efeda, 0x7380e02, 0xdc9beffd, 0x2d049082, 0x99bc7831, 0xff5002a8,
98+
0x21ce7646, 0x1cd049b, 0xf43994f, 0xc3c6c5a5, 0xbbda5f50, 0xec15ec7,
99+
0x9adb19b6, 0xc1e80b9, 0xb9b52968, 0xae162419, 0x2542b405, 0x91a42e9d,
100+
0x6be0f668, 0x6ed7a6b9, 0xbc2777b4, 0xe162ce56, 0x4266aad5, 0x60fdb704,
101+
0x66f832a5, 0x9595f6ca, 0xfee83ced, 0x55228d99, 0x12bf0e28, 0x66896459,
102+
0x789afda, 0x282baa8, 0x2367a343, 0x591491b0, 0x2ff1a4b1, 0x410739b6,
103+
0x9b7055a0, 0x2e0eb229, 0x24fc8252, 0x3327d3df, 0xb0782669, 0x1c62e069,
104+
0x7f503101, 0xf50593ae, 0xd9eb275d, 0xe00eb678, 0x5917ccde, 0x97b9660a,
105+
0xdd06202d, 0xed229e22, 0xa9c735bf, 0xd6316fe6, 0x6fc72e4c, 0x206dfa2,
106+
0xd6b15c5a, 0x69d87b49, 0x9c97745, 0x13445d61, 0x35a975aa, 0x859aa9b9,
107+
0x65380013, 0xd1fb6391, 0xc29255fd, 0x784a3b91, 0xb9e74c26, 0x63ce4d40,
108+
0xc07cbe9e, 0xe6e4529e, 0xfb3632f, 0x9438d9c9, 0x682f94a8, 0xf8fd4611,
109+
0x257ec1ed, 0x475ce3d6, 0x60ee2db1, 0x2afab002, 0x2b9e4878, 0x86b340de,
110+
0x1482fdca, 0xfe41b3bf, 0xd4a412b0, 0xe09db98c, 0xc1af5d53, 0x7e55e25f,
111+
0xd3346b38, 0xb7a12cbd, 0x9c6827ba, 0x71f78bee, 0x8c3a0f52, 0x150491b0,
112+
0xf26de912, 0x233e3a4e, 0xd309ebba, 0xa0a9e0ff, 0xca2b5921, 0xeeb9893c,
113+
0x33829e88, 0x9870cc2a, 0x23c4b9d0, 0xeba32ea3, 0xbdac4d22, 0x3bc8c44c,
114+
0x1e8d0397, 0xf9327735, 0x783b009f, 0xeb83742, 0x2621dc71, 0xed017d03,
115+
0x5c760aa1, 0x5a69814b, 0x96e3047f, 0xa93c9cde, 0x615c86f5, 0xb4322aa5,
116+
0x4225534d, 0xd2e2de3, 0xccfccc4b, 0xbac2a57, 0xf0a06d04, 0xbc78d737,
117+
0xf2d1f766, 0xf5a7953c, 0xbcdfda85, 0x5213b7d5, 0xbce8a328, 0xd38f5f18,
118+
0xdb094244, 0xfe571253, 0x317fa7ee, 0x4a324f43, 0x3ffc39d9, 0x51b3fa8e,
119+
0x7a4bee9f, 0x78bbc682, 0x9f5c0350, 0x2fe286c, 0x245ab686, 0xed6bf7d7,
120+
0xac4988a, 0x3fe010fa, 0xc65fe369, 0xa45749cb, 0x2b84e537, 0xde9ff363,
121+
0x20540f9a, 0xaa8c9b34, 0x5bc476b3, 0x1d574bd7, 0x929100ad, 0x4721de4d,
122+
0x27df1b05, 0x58b18546, 0xb7e76764, 0xdf904e58, 0x97af57a1, 0xbd4dc433,
123+
0xa6256dfd, 0xf63998f3, 0xf1e05833, 0xe20acf26, 0xf57fd9d6, 0x90300b4d,
124+
0x89df4290, 0x68d01cbc, 0xcf893ee3, 0xcc42a046, 0x778e181b, 0x67265c76,
125+
0xe981a4c4, 0x82991da1, 0x708f7294, 0xe6e2ae62, 0xfc441870, 0x95e1b0b6,
126+
0x445f825, 0x5a93b47f, 0x5e9cf4be, 0x84da71e7, 0x9d9582b0, 0x9bf835ef,
127+
0x591f61e2, 0x43325985, 0x5d2de32e, 0x8d8fbf0f, 0x95b30f38, 0x7ad5b6e,
128+
0x4e934edf, 0x3cd4990e, 0x9053e259, 0x5c41857d}

‎buzhash_test.go

+71
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
package chunk
2+
3+
import (
4+
"bytes"
5+
"fmt"
6+
"io"
7+
"testing"
8+
9+
util "github.com/ipfs/go-ipfs-util"
10+
)
11+
12+
func TestBuzhashChunking(t *testing.T) {
13+
data := make([]byte, 1024*1024*16)
14+
util.NewTimeSeededRand().Read(data)
15+
16+
r := NewBuzhash(bytes.NewReader(data))
17+
18+
var chunks [][]byte
19+
20+
for {
21+
chunk, err := r.NextBytes()
22+
if err != nil {
23+
if err == io.EOF {
24+
break
25+
}
26+
t.Fatal(err)
27+
}
28+
29+
chunks = append(chunks, chunk)
30+
}
31+
32+
t.Logf("average block size: %d\n", len(data)/len(chunks))
33+
34+
unchunked := bytes.Join(chunks, nil)
35+
if !bytes.Equal(unchunked, data) {
36+
fmt.Printf("%d %d\n", len(unchunked), len(data))
37+
//ioutil.WriteFile("./incorrect", unchunked, 0777)
38+
//ioutil.WriteFile("./correct", data, 0777)
39+
t.Fatal("data was chunked incorrectly")
40+
}
41+
}
42+
43+
func TestBuzhashChunkReuse(t *testing.T) {
44+
newBuzhash := func(r io.Reader) Splitter {
45+
return NewBuzhash(r)
46+
}
47+
testReuse(t, newBuzhash)
48+
}
49+
50+
func BenchmarkBuzhash2(b *testing.B) {
51+
benchmarkChunker(b, func(r io.Reader) Splitter {
52+
return NewBuzhash(r)
53+
})
54+
}
55+
56+
func TestBuzhashBitsHashBias(t *testing.T) {
57+
counts := make([]byte, 32)
58+
for _, h := range bytehash {
59+
for i := 0; i < 32; i++ {
60+
if h&1 == 1 {
61+
counts[i]++
62+
}
63+
h = h >> 1
64+
}
65+
}
66+
for i, c := range counts {
67+
if c != 128 {
68+
t.Errorf("Bit balance in position %d broken, %d ones", i, c)
69+
}
70+
}
71+
}

‎gen/main.go

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
// This program generates the bytehash lookup table (LUT) and prints it as a Go literal.
2+
package main
3+
4+
import (
5+
"fmt"
6+
"math/rand"
7+
)
8+
9+
// nRounds is how many times every bit column of the table is shuffled.
const nRounds = 200

// main builds a 256-entry table of 32-bit words and prints it with %#v.
//
// The table starts with its first half all ones and its second half all
// zeros, so each bit position is set in exactly 128 entries. Each round then
// swaps one bit at a time between pairs of entries picked by a seeded
// permutation. A swap moves bits around but never changes how many entries
// have a given bit set, so the printed table keeps that 128/128 balance.
func main() {
	rng := rand.New(rand.NewSource(0))

	table := make([]uint32, 256)
	for k := range table[:len(table)/2] {
		table[k] = ^uint32(0)
	}

	for round := 0; round < nRounds; round++ {
		for bit := uint32(0); bit < 32; bit++ {
			sel := uint32(1) << bit
			for x, y := range rng.Perm(256) {
				// Exchange only the selected bit between entries x and y.
				a, b := table[x], table[y]
				table[x] = a&^sel | b&sel
				table[y] = b&^sel | a&sel
			}
		}
	}

	fmt.Printf("%#v", table)
}

‎parse.go

+5-2
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ var (
1414
)
1515

1616
// FromString returns a Splitter depending on the given string:
17-
// it supports "default" (""), "size-{size}", "rabin", "rabin-{blocksize}" and
18-
// "rabin-{min}-{avg}-{max}".
17+
// it supports "default" (""), "size-{size}", "rabin", "rabin-{blocksize}",
18+
// "rabin-{min}-{avg}-{max}" and "buzhash".
1919
func FromString(r io.Reader, chunker string) (Splitter, error) {
2020
switch {
2121
case chunker == "" || chunker == "default":
@@ -34,6 +34,9 @@ func FromString(r io.Reader, chunker string) (Splitter, error) {
3434
case strings.HasPrefix(chunker, "rabin"):
3535
return parseRabinString(r, chunker)
3636

37+
case chunker == "buzhash":
38+
return NewBuzhash(r), nil
39+
3740
default:
3841
return nil, fmt.Errorf("unrecognized chunker option: %s", chunker)
3942
}

‎rabin_test.go

+16-30
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ func TestRabinChunking(t *testing.T) {
3939
}
4040
}
4141

42-
func chunkData(t *testing.T, data []byte) map[string]blocks.Block {
43-
r := NewRabin(bytes.NewReader(data), 1024*256)
42+
func chunkData(t *testing.T, newC newSplitter, data []byte) map[string]blocks.Block {
43+
r := newC(bytes.NewReader(data))
4444

4545
blkmap := make(map[string]blocks.Block)
4646

@@ -60,12 +60,12 @@ func chunkData(t *testing.T, data []byte) map[string]blocks.Block {
6060
return blkmap
6161
}
6262

63-
func TestRabinChunkReuse(t *testing.T) {
63+
func testReuse(t *testing.T, cr newSplitter) {
6464
data := make([]byte, 1024*1024*16)
6565
util.NewTimeSeededRand().Read(data)
6666

67-
ch1 := chunkData(t, data[1000:])
68-
ch2 := chunkData(t, data)
67+
ch1 := chunkData(t, cr, data[1000:])
68+
ch2 := chunkData(t, cr, data)
6969

7070
var extra int
7171
for k := range ch2 {
@@ -76,35 +76,21 @@ func TestRabinChunkReuse(t *testing.T) {
7676
}
7777

7878
if extra > 2 {
79-
t.Log("too many spare chunks made")
79+
t.Logf("too many spare chunks made: %d", extra)
8080
}
8181
}
8282

83+
func TestRabinChunkReuse(t *testing.T) {
84+
newRabin := func(r io.Reader) Splitter {
85+
return NewRabin(r, 256*1024)
86+
}
87+
testReuse(t, newRabin)
88+
}
89+
8390
var Res uint64
8491

8592
func BenchmarkRabin(b *testing.B) {
86-
data := make([]byte, 16<<20)
87-
util.NewTimeSeededRand().Read(data)
88-
89-
b.SetBytes(16 << 20)
90-
b.ReportAllocs()
91-
b.ResetTimer()
92-
93-
var res uint64
94-
95-
for i := 0; i < b.N; i++ {
96-
r := NewRabin(bytes.NewReader(data), 1024*256)
97-
98-
for {
99-
chunk, err := r.NextBytes()
100-
if err != nil {
101-
if err == io.EOF {
102-
break
103-
}
104-
b.Fatal(err)
105-
}
106-
res = res + uint64(len(chunk))
107-
}
108-
}
109-
Res = Res + res
93+
benchmarkChunker(b, func(r io.Reader) Splitter {
94+
return NewRabin(r, 256<<10)
95+
})
11096
}

0 commit comments

Comments
 (0)