Skip to content

Commit f6d707d

Browse files
✨ feat: First draft for reservoir sampling.
Fixes #18.
1 parent dfb68b6 commit f6d707d

File tree

6 files changed

+137
-0
lines changed

6 files changed

+137
-0
lines changed

package.json

+1
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@
6969
"@aureooms/js-functools": "2.0.3",
7070
"@aureooms/js-itertools": "5.1.0",
7171
"@aureooms/js-memory": "4.0.0",
72+
"@aureooms/js-red-black-tree": "^9.0.0",
7273
"@aureooms/js-type": "1.0.4",
7374
"@babel/core": "7.13.14",
7475
"@babel/preset-env": "7.13.12",

src/api/reservoir.js

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
import _waterman from '../kernel/_waterman.js';
2+
import randint from './randint.js';
3+
4+
/**
5+
* Reservoir sampling.
6+
*
7+
* @function
8+
* @param {number} k The size of the sample.
9+
* @param {Iterable} iterable The input iterable.
10+
* @param {Array} [output=new Array(k)] The output array.
11+
* @return {Array} The output array.
12+
*/
13+
// Public sampler: the Algorithm R factory from ../kernel/_waterman.js bound to
// the randint imported from ./randint.js.
const reservoir = _waterman(randint);
14+
export default reservoir;

src/index.js

+2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ export {default as randfloat} from './api/randfloat.js';
33
export {default as randint} from './api/randint.js';
44
export {default as random} from './api/random.js';
55
export {default as randrange} from './api/randrange.js';
6+
export {default as reservoir} from './api/reservoir.js';
67
export {default as sample} from './api/sample.js';
78
export {default as shuffle} from './api/shuffle.js';
89
export {default as shuffled} from './api/shuffled.js';
@@ -12,3 +13,4 @@ export {default as _fisheryates_inside_out} from './kernel/_fisheryates_inside_o
1213
export {default as _randfloat} from './kernel/_randfloat.js';
1314
export {default as _randint} from './kernel/_randint.js';
1415
export {default as _shuffle} from './kernel/_shuffle.js';
16+
export {default as _waterman} from './kernel/_waterman.js';

src/kernel/_waterman.js

+77
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/**
 * Construct a sampling function using Algorithm R due to Alan Waterman (both
 * name and attribution are due to Knuth).
 *
 * @param {Function} randint The randint function. It is called as
 * randint(i, j) and must return an integer drawn uniformly at random from the
 * half-open interval [i, j) (i included, j excluded).
 * @return {Function} The sample function.
 */
const _waterman = (randint) => {
	/**
	 * Samples k items uniformly at random from an iterable of unknown size.
	 *
	 * We want each item to have probability k/n of being selected.
	 *
	 * The algorithm works as follows:
	 *   1. We initialize a candidate sample with the first k items.
	 *   2. For each remaining item i, decide whether to insert it in the
	 *      candidate sample with probability k/i, evicting an item from the
	 *      candidate sample at random, or to discard it immediately (with
	 *      probability 1-k/i).
	 *
	 * To prove that the obtained probability of inclusion for each item is
	 * correct we multiply two probabilities:
	 *   1. The probability of entering the candidate sample.
	 *   2. The probability of staying in the candidate sample until the end.
	 *
	 * For items 1 to k, probability 1. is 1, and probability 2. is
	 *   (1-1/(k+1))(1-1/(k+2))...(1-1/n)
	 *   = (k/(k+1))((k+1)/(k+2))...((n-1)/n) which telescopes to k/n.
	 *
	 * For items i = k+1 to n, probability 1. is k/i, and probability 2. is
	 *   (1-1/(i+1))(1-1/(i+2))...(1-1/n)
	 *   = (i/(i+1))((i+1)/(i+2))...((n-1)/n) which telescopes to i/n.
	 * Their product is (k/i)(i/n) = k/n.
	 *
	 * NOTE: Could also implement so that it yields after each input item.
	 * NOTE: One can reduce the expected number of random bits needed by
	 * avoiding generating any number above k-1:
	 *   - First we branch on whether i < k.
	 *   - Then we generate the random number between 0 and k-1 only if needed.
	 *
	 * To decide on the branch, flip a biased coin with parameter p = k/n.
	 * To do so, flip a fair coin until it differs from the binary
	 * representation of k/n (0.10110101...).
	 * The computation can be made efficient by realizing several things:
	 *   - k is fixed and smaller than n (so divmod step can be skipped)
	 *   - k/(n+1) < k/n (so we can avoid recomputing if the biased flip > k/n)
	 *
	 * This would reduce the number of necessary random bits from O(n log n) to
	 * expected O(n).
	 *
	 * If the iterable yields fewer than k items, only the first entries of the
	 * output array are written; the remaining entries are left untouched.
	 *
	 * @param {number} k The size of the sample.
	 * @param {Iterable} iterable The input iterable.
	 * @param {Array} [output=new Array(k)] The output array.
	 * @return {Array} The output array.
	 */
	const sample = (k, iterable, output = new Array(k)) => {
		const it = iterable[Symbol.iterator]();

		// Number of items consumed from the iterable so far.
		let n = 0;

		// Fill the reservoir with the first k items.
		for (; n < k; ++n) {
			const {value, done} = it.next();
			if (done) return output;
			output[n] = value;
		}

		for (; ; ++n) {
			const {value, done} = it.next();
			if (done) return output;
			// The current item is item number n + 1. It must be kept with
			// probability k/(n+1), so the slot is drawn among n + 1 equally
			// likely values: 0, 1, ..., n (the upper bound is exclusive).
			const i = randint(0, n + 1);
			if (i < k) output[i] = value;
		}
	};

	return sample;
};
76+
77+
export default _waterman;

test/src/reservoir.js

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import test from 'ava';
2+
import {range} from '@aureooms/js-itertools';
3+
import {increasing} from '@aureooms/js-compare';
4+
import {RedBlackTree} from '@aureooms/js-red-black-tree';
5+
import {reservoir, _waterman, randint} from '../../src/index.js';
6+
7+
/**
 * AVA macro checking the output of a reservoir sampling function on range(n).
 *
 * It asserts that the returned array has length k, that each of the first
 * min(k, n) sampled values can be removed from a tree built from range(n),
 * and that any entries at indices n to k-1 are undefined.
 *
 * @param {Object} t The AVA execution context.
 * @param {string} _ The algorithm label (only used by macro.title).
 * @param {Function} reservoir The sampling function under test.
 * @param {number} k The size of the sample.
 * @param {number} n The size of the input.
 */
const macro = (t, _, reservoir, k, n) => {
	const sample = reservoir(k, range(n));
	// We cannot use a Set as it would smoosh input duplicates
	const source = RedBlackTree.from(increasing, range(n));

	t.is(sample.length, k);
	for (const i of range(Math.min(k, n))) t.true(source.remove(sample[i]));
	for (const i of range(n, k)) t.true(sample[i] === undefined);
};

// Use the explicit title when one is given, otherwise derive it from the
// algorithm label and the parameters.
macro.title = (title, algo, _, k, n) =>
	title || `[${algo}] reservoir(${k}, range(${n}))`;
20+
21+
// Sampling functions under test, each paired with the label shown in the
// generated test titles.
const algorithms = [
	['Waterman', _waterman(randint)],
	['API', reservoir],
];

// Pairs [k, n]: sample size and input size.
const params = [
	[0, 10],
	[5, 10],
	[10, 5],
	[10, 10],
	[50, 1000],
];

// Register one test per (algorithm, parameters) combination.
algorithms.forEach(([name, algorithm]) => {
	params.forEach(([k, n]) => {
		test(macro, name, algorithm, k, n);
	});
});

yarn.lock

+5
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@
4242
resolved "https://registry.yarnpkg.com/@aureooms/js-memory/-/js-memory-4.0.0.tgz#db87dc64b948f672d73b434ebde047b05869712c"
4343
integrity sha1-24fcZLlI9nLXO0NOveBHsFhpcSw=
4444

45+
"@aureooms/js-red-black-tree@^9.0.0":
46+
version "9.0.0"
47+
resolved "https://registry.yarnpkg.com/@aureooms/js-red-black-tree/-/js-red-black-tree-9.0.0.tgz#ee006f24af42749546232b2d0baa13910c98f7b2"
48+
integrity sha512-sUtY0HnwQnBUjrfwysKc6H4BJO4O2+NnrUHLqTYJyT1l1VSI+oXGffjjmMJTFpIl4L/4FEZAN0L3BiQxgR1T8g==
49+
4550
"@aureooms/[email protected]":
4651
version "1.0.4"
4752
resolved "https://registry.yarnpkg.com/@aureooms/js-type/-/js-type-1.0.4.tgz#7f9de5f5f8506ff9c8958731744b7427b62e92b7"

0 commit comments

Comments
 (0)