Skip to content
This repository was archived by the owner on Jul 7, 2020. It is now read-only.

Commit c49e035

Browse files
committed
Merge pull request #6 from jkff/count-min-sketch
Count min sketch
2 parents 9cdac2d + 87a0cde commit c49e035

File tree

4 files changed

+253
-1
lines changed

4 files changed

+253
-1
lines changed

README.mdown

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
A Java library for summarizing data in streams for which it is
55
infeasible to store all events. More specifically, there are classes
66
for estimating: cardinality (i.e. counting things); set membership;
7-
and the top-k elements. One particularly useful feature is that
7+
top-k elements and frequency. One particularly useful feature is that
88
cardinality estimators with compatible configurations may be safely
99
merged.
1010

@@ -112,3 +112,8 @@ NY, USA, 2003. ACM.
112112
computation of frequent and top-k elements in data streams. pages
113113
398–412. 2005.
114114

115+
#### Frequency
116+
117+
* Graham Cormode and S. Muthukrishnan. An improved data stream
118+
summary: The Count-Min sketch and its applications. 2004.
119+
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
package com.clearspring.analytics.stream.frequency;
2+
3+
import java.io.ByteArrayInputStream;
4+
import java.io.ByteArrayOutputStream;
5+
import java.io.DataInputStream;
6+
import java.io.DataOutputStream;
7+
import java.io.IOException;
8+
import java.util.Random;
9+
10+
/**
11+
* Count-Min Sketch datastructure.
12+
* An Improved Data Stream Summary: The Count-Min Sketch and its Applications
13+
* http://www.eecs.harvard.edu/~michaelm/CS222/countmin.pdf
14+
*/
15+
public class CountMinSketch implements IFrequency
16+
{
17+
public static final long PRIME_MODULUS = (1L << 31) - 1;
18+
private int depth;
19+
private int width;
20+
private long[][] table;
21+
private long[] hashA;
22+
private long size;
23+
private double eps;
24+
private double confidence;
25+
26+
private CountMinSketch()
27+
{
28+
}
29+
30+
public CountMinSketch(int depth, int width, int seed)
31+
{
32+
this.depth = depth;
33+
this.width = width;
34+
this.eps = 2.0 / width;
35+
this.confidence = 1 - 1 / Math.pow(2, depth);
36+
initTablesWith(depth, width, seed);
37+
}
38+
39+
public CountMinSketch(double epsOfTotalCount, double confidence, int seed)
40+
{
41+
// 2/w = eps ; w = 2/eps
42+
// 1/2^depth <= 1-confidence ; depth >= -log2 (1-confidence)
43+
this.eps = epsOfTotalCount;
44+
this.confidence = confidence;
45+
this.width = (int) Math.ceil(2 / epsOfTotalCount);
46+
this.depth = (int) Math.ceil(-Math.log(1 - confidence) / Math.log(2));
47+
initTablesWith(depth, width, seed);
48+
}
49+
50+
private void initTablesWith(int depth, int width, int seed)
51+
{
52+
this.table = new long[depth][width];
53+
this.hashA = new long[depth];
54+
Random r = new Random(seed);
55+
// We're using a linear hash functions
56+
// of the form (a*x+b) mod p.
57+
// a,b are chosen independently for each hash function.
58+
// However we can set b = 0 as all it does is shift the results
59+
// without compromising their uniformity or independence with
60+
// the other hashes.
61+
for (int i = 0; i < depth; ++i)
62+
{
63+
hashA[i] = r.nextInt(Integer.MAX_VALUE);
64+
}
65+
}
66+
67+
public double getRelativeError()
68+
{
69+
return eps;
70+
}
71+
72+
public double getConfidence()
73+
{
74+
return confidence;
75+
}
76+
77+
private int hash(long item, int i)
78+
{
79+
long hash = hashA[i] * item;
80+
// A super fast way of computing x mod 2^p-1
81+
// See http://www.cs.princeton.edu/courses/archive/fall09/cos521/Handouts/universalclasses.pdf
82+
// page 149, right after Proposition 7.
83+
hash += hash >> 32;
84+
hash &= PRIME_MODULUS;
85+
// Doing "%" after (int) conversion is ~2x faster than %'ing longs.
86+
return ((int) hash) % width;
87+
}
88+
89+
@Override
90+
public void add(long item, long count)
91+
{
92+
if (count < 0)
93+
{
94+
// Actually for negative increments we'll need to use the median
95+
// instead of minimum, and accuracy will suffer somewhat.
96+
// Probably makes sense to add an "allow negative increments"
97+
// parameter to constructor.
98+
throw new IllegalArgumentException("Negative increments not implemented");
99+
}
100+
for (int i = 0; i < depth; ++i)
101+
{
102+
table[i][hash(item, i)] += count;
103+
}
104+
size += count;
105+
}
106+
107+
@Override
108+
public long size()
109+
{
110+
return size;
111+
}
112+
113+
/**
114+
* The estimate is correct within 'epsilon' * (total item count),
115+
* with probability 'confidence'.
116+
*/
117+
@Override
118+
public long estimateCount(long item)
119+
{
120+
long res = Long.MAX_VALUE;
121+
for (int i = 0; i < depth; ++i)
122+
{
123+
res = Math.min(res, table[i][hash(item, i)]);
124+
}
125+
return res;
126+
}
127+
128+
public static byte[] serialize(CountMinSketch sketch)
129+
{
130+
ByteArrayOutputStream bos = new ByteArrayOutputStream();
131+
DataOutputStream s = new DataOutputStream(bos);
132+
try
133+
{
134+
s.writeLong(sketch.size);
135+
s.writeInt(sketch.depth);
136+
s.writeInt(sketch.width);
137+
for (int i = 0; i < sketch.depth; ++i)
138+
{
139+
s.writeLong(sketch.hashA[i]);
140+
for (int j = 0; j < sketch.width; ++j)
141+
{
142+
s.writeLong(sketch.table[i][j]);
143+
}
144+
}
145+
return bos.toByteArray();
146+
} catch (IOException e)
147+
{
148+
// Shouldn't happen
149+
throw new RuntimeException(e);
150+
}
151+
}
152+
153+
public static CountMinSketch deserialize(byte[] data)
154+
{
155+
ByteArrayInputStream bis = new ByteArrayInputStream(data);
156+
DataInputStream s = new DataInputStream(bis);
157+
try
158+
{
159+
CountMinSketch sketch = new CountMinSketch();
160+
sketch.size = s.readLong();
161+
sketch.depth = s.readInt();
162+
sketch.width = s.readInt();
163+
sketch.eps = 2.0 / sketch.width;
164+
sketch.confidence = 1 - 1 / Math.pow(2, sketch.depth);
165+
sketch.hashA = new long[sketch.depth];
166+
sketch.table = new long[sketch.depth][sketch.width];
167+
for (int i = 0; i < sketch.depth; ++i)
168+
{
169+
sketch.hashA[i] = s.readLong();
170+
for (int j = 0; j < sketch.width; ++j)
171+
{
172+
sketch.table[i][j] = s.readLong();
173+
}
174+
}
175+
return sketch;
176+
}
177+
catch (IOException e)
178+
{
179+
// Shouldn't happen
180+
throw new RuntimeException(e);
181+
}
182+
}
183+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
package com.clearspring.analytics.stream.frequency;
2+
3+
/**
 * Contract for structures that track how often {@code long}-valued items occur.
 */
public interface IFrequency
{
    /**
     * @return the implementation's notion of total size; in CountMinSketch this
     *         is the sum of all counts passed to {@link #add(long, long)}
     */
    long size();

    /**
     * Records {@code count} occurrences of {@code item}.
     *
     * @param item  the item being counted
     * @param count how many occurrences to record
     */
    void add(long item, long count);

    /**
     * @param item the item to look up
     * @return the estimated number of recorded occurrences of {@code item}
     */
    long estimateCount(long item);
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
package com.clearspring.analytics.stream.frequency;
2+
3+
import org.junit.Test;
4+
5+
import java.util.Random;
6+
7+
import static org.junit.Assert.assertTrue;
8+
9+
public class CountMinSketchTest
10+
{
11+
@Test
12+
public void testAccuracy()
13+
{
14+
int seed = 7364181;
15+
Random r = new Random(seed);
16+
int numItems = 1000000;
17+
int[] xs = new int[numItems];
18+
int maxScale = 20;
19+
for (int i = 0; i < xs.length; ++i)
20+
{
21+
int scale = r.nextInt(maxScale);
22+
xs[i] = r.nextInt(1 << scale);
23+
}
24+
25+
double epsOfTotalCount = 0.0001;
26+
double confidence = 0.99;
27+
28+
CountMinSketch sketch = new CountMinSketch(epsOfTotalCount, confidence, seed);
29+
for (int x : xs)
30+
{
31+
sketch.add(x, 1);
32+
}
33+
34+
int[] actualFreq = new int[1 << maxScale];
35+
for (int x : xs)
36+
{
37+
actualFreq[x]++;
38+
}
39+
40+
sketch = CountMinSketch.deserialize(CountMinSketch.serialize(sketch));
41+
42+
int numErrors = 0;
43+
for (int i = 0; i < actualFreq.length; ++i)
44+
{
45+
double ratio = 1.0 * (sketch.estimateCount(i) - actualFreq[i]) / xs.length;
46+
if (ratio > 1.0001)
47+
{
48+
numErrors++;
49+
}
50+
}
51+
double pCorrect = 1 - 1.0 * numErrors / actualFreq.length;
52+
assertTrue("Confidence not reached: required " + confidence + ", reached " + pCorrect, pCorrect > confidence);
53+
}
54+
}

0 commit comments

Comments
 (0)