|
| 1 | +/** |
| 2 | + * Construct a sampling function using Algorithm R due to Alan Waterman (both |
| 3 | + * name and attribution are due to Knuth). |
| 4 | + * |
| 5 | + * @param {Function} randint The randint function. |
| 6 | + * @return {Function} The sample function. |
| 7 | + */ |
| 8 | +const _waterman = (randint) => { |
| 9 | + /** |
| 10 | + * Samples k items uniformly at random from an iterable of unknown size. |
| 11 | + * |
| 12 | + * We want each item to have probability k/n of being selected. |
| 13 | + * |
| 14 | + * The algorithm works as follows: |
| 15 | + * 1. We initialize a candidate sample with the first k items. |
| 16 | + * 2. For each remaining item i, decide whether to insert it in the |
| 17 | + * candidate sample with probability k/i, evicting an item from the |
| 18 | + * candidate sample at random, or to discard it immediately (with |
| 19 | + * probability 1-k/i), |
| 20 | + * |
| 21 | + * To prove that the obtained probability of inclusion for each item is correct |
| 22 | + * we multiply two probabilities: |
| 23 | + * 1. The probability of entering the candidate sample. |
| 24 | + * 2. The probability of staying in the candidate sample until the end. |
| 25 | + * |
| 26 | + * For items 1 to k, probability 1. is 1, and probability 2. is |
| 27 | + * (1-1/(k+1))(1-1/(k+2))...(1-1/n) |
| 28 | + * = (k/(k+1))((k+1)/(k+2))...((n-1)/n) which telescopes to k/n. |
| 29 | + * |
| 30 | + * For items i = k+1 to n, where probability 1. is k/i, and probability 2. |
| 31 | + * is (1-1/(i+1))(1-1/(i+2))...(1-1/n) |
| 32 | + * = (i/(i+1))((i+1)/(i+2))...((n-1)/n) which telescopes to i/n. |
| 33 | + * |
| 34 | + * NOTE: Could also implement so that it yields after each input item. |
| 35 | + * NOTE: One can reduce the expected number of random bits needed by |
| 36 | + * avoiding generating any number above k-1: |
| 37 | + * - First we branch on whether i < k. |
| 38 | + * - Then we generate the random number between 0 and k-1 only if needed. |
| 39 | + * |
| 40 | + * To decide on the branch, flip a biased coin with parameter p = k/n. |
| 41 | + * To do so, flip a fair coin until it differs from the binary |
| 42 | + * representation of k/n (0.10110101...). |
| 43 | + * The computation can be made efficient by realizing several things: |
| 44 | + * - k is fixed and smaller than n (so divmod step can be skipped) |
| 45 | + * - k/(n+1) < k/n (so we can avoid recomputing if the biased flip > k/n) |
| 46 | + * |
| 47 | + * This would reduce the number of necessary random bits from O(n log n) to |
| 48 | + * expected O(n). |
| 49 | + * |
| 50 | + * @param {number} k The size of the sample. |
| 51 | + * @param {Iterable} iterable The input iterable. |
| 52 | + * @param {Array} [output=new Array(k)] The output array. |
| 53 | + * @return {Array} The output array. |
| 54 | + */ |
| 55 | + const sample = (k, iterable, output = new Array(k)) => { |
| 56 | + const it = iterable[Symbol.iterator](); |
| 57 | + |
| 58 | + let n = 0; |
| 59 | + |
| 60 | + for (; n < k; ++n) { |
| 61 | + const {value, done} = it.next(); |
| 62 | + if (done) return output; |
| 63 | + output[n] = value; |
| 64 | + } |
| 65 | + |
| 66 | + for (; ; ++n) { |
| 67 | + const {value, done} = it.next(); |
| 68 | + if (done) return output; |
| 69 | + const i = randint(0, n); |
| 70 | + if (i < k) output[i] = value; |
| 71 | + } |
| 72 | + }; |
| 73 | + |
| 74 | + return sample; |
| 75 | +}; |
| 76 | + |
| 77 | +export default _waterman; |
0 commit comments