✨ feat: First draft for reservoir sampling.

make-github-pseudonymous-again · make-github-pseudonymous-again · commit f6d707d6a763 · 2021-04-08T22:48:04.000+02:00
Fixes #18.
diff --git a/package.json b/package.json
@@ -69,6 +69,7 @@
     "@aureooms/js-functools": "2.0.3",
     "@aureooms/js-itertools": "5.1.0",
     "@aureooms/js-memory": "4.0.0",
+    "@aureooms/js-red-black-tree": "^9.0.0",
     "@aureooms/js-type": "1.0.4",
     "@babel/core": "7.13.14",
     "@babel/preset-env": "7.13.12",
diff --git a/src/api/reservoir.js b/src/api/reservoir.js
@@ -0,0 +1,14 @@
+import _waterman from '../kernel/_waterman.js';
+import randint from './randint.js';
+
+/**
+ * Reservoir sampling.
+ *
+ * @function
+ * @param {number} k The size of the sample.
+ * @param {Iterable} iterable The input iterable.
+ * @param {Array} [output=new Array(k)] The output array.
+ * @return {Array} The output array.
+ */
+const reservoir = _waterman(randint);
+export default reservoir;
diff --git a/src/index.js b/src/index.js
@@ -3,6 +3,7 @@ export {default as randfloat} from './api/randfloat.js';
 export {default as randint} from './api/randint.js';
 export {default as random} from './api/random.js';
 export {default as randrange} from './api/randrange.js';
+export {default as reservoir} from './api/reservoir.js';
 export {default as sample} from './api/sample.js';
 export {default as shuffle} from './api/shuffle.js';
 export {default as shuffled} from './api/shuffled.js';
@@ -12,3 +13,4 @@ export {default as _fisheryates_inside_out} from './kernel/_fisheryates_inside_o
 export {default as _randfloat} from './kernel/_randfloat.js';
 export {default as _randint} from './kernel/_randint.js';
 export {default as _shuffle} from './kernel/_shuffle.js';
+export {default as _waterman} from './kernel/_waterman.js';
diff --git a/src/kernel/_waterman.js b/src/kernel/_waterman.js
@@ -0,0 +1,77 @@
+/**
+ * Construct a sampling function using Algorithm R due to Alan Waterman (both
+ * name and attribution are due to Knuth).
+ *
+ * @param {Function} randint The randint function.
+ * @return {Function} The sample function.
+ */
+const _waterman = (randint) => {
+	/**
+	 * Samples k items uniformly at random from an iterable of unknown size.
+	 *
+	 * We want each item to have probability k/n of being selected.
+	 *
+	 * The algorithm works as follows:
+	 *   1. We initialize a candidate sample with the first k items.
+	 *   2. For each remaining item i, decide whether to insert it in the
+	 *   candidate sample with probability k/i, evicting an item from the
+	 *   candidate sample at random, or to discard it immediately (with
+	 *   probability 1-k/i),
+	 *
+	 * To prove that the obtained probability of inclusion for each item is correct
+	 * we multiply two probabilities:
+	 *   1. The probability of entering the candidate sample.
+	 *   2. The probability of staying in the candidate sample until the end.
+	 *
+	 * For items 1 to k, probability 1. is 1, and probability 2. is
+	 * (1-1/(k+1))(1-1/(k+2))...(1-1/n)
+	 * = (k/(k+1))((k+1)/(k+2))...((n-1)/n) which telescopes to k/n.
+	 *
+	 * For items i = k+1 to n, where probability 1. is k/i, and probability 2.
+	 * is (1-1/(i+1))(1-1/(i+2))...(1-1/n)
+	 * = (i/(i+1))((i+1)/(i+2))...((n-1)/n) which telescopes to i/n.
+	 *
+	 * NOTE: Could also implement so that it yields after each input item.
+	 * NOTE: One can reduce the expected number of random bits needed by
+	 * avoiding generating any number above k-1:
+	 *   - First we branch on whether i < k.
+	 *   - Then we generate the random number between 0 and k-1 only if needed.
+	 *
+	 * To decide on the branch, flip a biased coin with parameter p = k/n.
+	 * To do so, flip a fair coin until it differs from the binary
+	 * representation of k/n (0.10110101...).
+	 * The computation can be made efficient by realizing several things:
+	 *   - k is fixed and smaller than n (so divmod step can be skipped)
+	 *   - k/(n+1) < k/n (so we can avoid recomputing if the biased flip > k/n)
+	 *
+	 * This would reduce the number of necessary random bits from O(n log n) to
+	 * expected O(n).
+	 *
+	 * @param {number} k The size of the sample.
+	 * @param {Iterable} iterable The input iterable.
+	 * @param {Array} [output=new Array(k)] The output array.
+	 * @return {Array} The output array.
+	 */
+	const sample = (k, iterable, output = new Array(k)) => {
+		const it = iterable[Symbol.iterator]();
+
+		let n = 0;
+
+		for (; n < k; ++n) {
+			const {value, done} = it.next();
+			if (done) return output;
+			output[n] = value;
+		}
+
+		for (; ; ++n) {
+			const {value, done} = it.next();
+			if (done) return output;
+			const i = randint(0, n);
+			if (i < k) output[i] = value;
+		}
+	};
+
+	return sample;
+};
+
+export default _waterman;
diff --git a/test/src/reservoir.js b/test/src/reservoir.js
@@ -0,0 +1,38 @@
+import test from 'ava';
+import {range} from '@aureooms/js-itertools';
+import {increasing} from '@aureooms/js-compare';
+import {RedBlackTree} from '@aureooms/js-red-black-tree';
+import {reservoir, _waterman, randint} from '../../src/index.js';
+
+const macro = (t, _, reservoir, k, n) => {
+	const sample = reservoir(k, range(n));
+	const source = RedBlackTree.from(increasing, range(n));
+	// We cannot use a Set as it would smoosh input duplicates
+
+	console.debug({sample});
+	t.is(sample.length, k);
+	for (const i of range(Math.min(k, n))) t.true(source.remove(sample[i]));
+	for (const i of range(n, k)) t.true(sample[i] === undefined);
+};
+
+macro.title = (title, algo, _, k, n) =>
+	title || `[${algo}] reservoir(${k}, range(${n}))`;
+
+const algorithms = [
+	['Waterman', _waterman(randint)],
+	['API', reservoir],
+];
+
+const params = [
+	[0, 10],
+	[5, 10],
+	[10, 5],
+	[10, 10],
+	[50, 1000],
+];
+
+for (const [name, algorithm] of algorithms) {
+	for (const [k, input] of params) {
+		test(macro, name, algorithm, k, input);
+	}
+}
diff --git a/yarn.lock b/yarn.lock
@@ -42,6 +42,11 @@
   resolved "https://registry.yarnpkg.com/@aureooms/js-memory/-/js-memory-4.0.0.tgz#db87dc64b948f672d73b434ebde047b05869712c"
   integrity sha1-24fcZLlI9nLXO0NOveBHsFhpcSw=
 
+"@aureooms/js-red-black-tree@^9.0.0":
+  version "9.0.0"
+  resolved "https://registry.yarnpkg.com/@aureooms/js-red-black-tree/-/js-red-black-tree-9.0.0.tgz#ee006f24af42749546232b2d0baa13910c98f7b2"
+  integrity sha512-sUtY0HnwQnBUjrfwysKc6H4BJO4O2+NnrUHLqTYJyT1l1VSI+oXGffjjmMJTFpIl4L/4FEZAN0L3BiQxgR1T8g==
+
 "@aureooms/js-type@1.0.4":
   version "1.0.4"
   resolved "https://registry.yarnpkg.com/@aureooms/js-type/-/js-type-1.0.4.tgz#7f9de5f5f8506ff9c8958731744b7427b62e92b7"