Reimplement stable_random_distribution

Donaim · Donaim · commit 9ccbdf95326e · 2025-02-14T16:17:31.000-08:00
diff --git a/micall/tests/test_stable_random_distribution.py b/micall/tests/test_stable_random_distribution.py
@@ -2,29 +2,47 @@
 from micall.utils.stable_random_distribution import stable_random_distribution
 import numpy as np
 from itertools import islice
+from typing import Set
 
 
 def test_indices_in_range():
-    """Test that each index generated is within the range [0, maximum)."""
+    """Test that each index generated is within the range [0, high)."""
 
-    maximum = 10
-    gen = stable_random_distribution(maximum, seed=123)
+    high = 10
+    gen = stable_random_distribution(high, seed=123)
     # Grab a bunch of values from the infinite generator
 
     for _ in range(1000):
         idx = next(gen)
-        assert 0 <= idx < maximum, f"Index {idx} out of range [0,{maximum})"
+        assert 0 <= idx < high, f"Index {idx} out of range [0,{high})"
 
 
 def test_bounds_are_reachable():
     """Test that both min and max-1 can be generated."""
 
-    maximum = 999
-    gen = stable_random_distribution(maximum, seed=123)
+    high = 999
+    gen = stable_random_distribution(high, seed=123)
     lst = islice(gen, 1000)
 
     assert 0 in lst
-    assert (maximum-1) in lst
+    assert (high-1) in lst
+
+
+def test_everything_is_reachable():
+    """Test that all numbers in the range [0, high) can be generated."""
+
+    high = 30
+    fun = stable_random_distribution
+    # def fun(high, seed):
+    #     import random
+    #     while True:
+    #         yield random.randint(0, high)
+
+    gen = fun(high, seed=123)
+    lst = tuple(map(int, islice(gen, 1000)))
+
+    for x in range(high):
+        assert x in lst
 
 
 def test_deterministic_output_with_seed():
@@ -33,10 +51,10 @@ def test_deterministic_output_with_seed():
     re-seeded with the same seed.
     """
 
-    maximum = 15
+    high = 15
     seed = 456
-    gen1 = stable_random_distribution(maximum, seed=seed)
-    gen2 = stable_random_distribution(maximum, seed=seed)
+    gen1 = stable_random_distribution(high, seed=seed)
+    gen2 = stable_random_distribution(high, seed=seed)
 
     # Compare the first 50 generated values.
     values1 = [next(gen1) for _ in range(50)]
@@ -50,9 +68,9 @@ def test_different_seeds_differ():
     A sanity check that different seeds usually lead to a different sequence.
     """
 
-    maximum = 15
-    gen1 = stable_random_distribution(maximum, seed=789)
-    gen2 = stable_random_distribution(maximum, seed=987)
+    high = 15
+    gen1 = stable_random_distribution(high, seed=789)
+    gen2 = stable_random_distribution(high, seed=987)
 
     # Compare the first 50 generated values: while not guaranteed to
     # be different, it is extremely unlikely that the two sequences
@@ -76,18 +94,18 @@ def test_fair_distribution_behavior():
       - With the adaptive update, values should tend to be farther apart.
     """
 
-    maximum = 1_000
+    high = 100
     num_samples = 3_000
     for seed in range(100):
         # Gather samples from our generator.
-        gen = stable_random_distribution(maximum, seed=seed)
+        gen = stable_random_distribution(high, seed=seed)
         samples = np.array([next(gen) for _ in range(num_samples)])
         diff_stable = np.abs(np.diff(np.sort(samples))) ** 2
         avg_diff_stable = diff_stable.mean()
 
         # For comparison, generate num_samples indices uniformly at random.
         rng = np.random.default_rng(seed)
-        uniform_samples = rng.choice(maximum, size=num_samples)
+        uniform_samples = rng.choice(high, size=num_samples)
         diff_uniform = np.abs(np.diff(np.sort(uniform_samples))) ** 2
         avg_diff_uniform = diff_uniform.mean()
 
@@ -98,3 +116,38 @@ def test_fair_distribution_behavior():
             f"Expected stable generator to have a higher average jump than a uniform generator: "
             f"stable {avg_diff_stable} vs uniform {avg_diff_uniform}"
         )
+
+
+def test_fill_domain_speed():
+    """
+    Test that stable_random_distribution fills out the domain
+    more quickly than a simple uniform generator.
+
+    The idea is similar to the previous test.
+    """
+
+    high = 100
+    trials = 100
+    wins = 0
+
+    for seed in range(trials):
+        # Gather samples from our generator.
+        gen = stable_random_distribution(high, seed=seed)
+        stable_bucket: Set[int] = set()
+        stable_steps = 0
+        while len(stable_bucket) < high:
+            stable_bucket.add(next(gen))
+            stable_steps += 1
+
+        # For comparison, draw uniform random indices until the domain is full.
+        rng = np.random.default_rng(seed)
+        uniform_bucket: Set[int] = set()
+        uniform_steps = 0
+        while len(uniform_bucket) < high:
+            uniform_bucket.add(rng.integers(0, high))
+            uniform_steps += 1
+
+        if stable_steps < uniform_steps:
+            wins += 1
+
+    assert wins / trials > 0.85
diff --git a/micall/utils/stable_random_distribution.py b/micall/utils/stable_random_distribution.py
@@ -1,32 +1,21 @@
-from typing import Iterator, Sequence
+from typing import Iterator
 
 import numpy as np
-import random
 
+DUPLICATION_FACTOR = 1
 
-def stable_random_distribution(maximum: int, seed: int = 42) -> Iterator[int]:
-    if maximum <= 0:
+
+def stable_random_distribution(high: int, seed: int = 42) -> Iterator[int]:
+    if high <= 0:
         return
 
-    n = maximum
-    rng = random.Random(seed)
+    rng = np.random.default_rng(seed)
+    block = np.arange(high)
+    population = np.concatenate([block] * DUPLICATION_FACTOR, axis=0)
 
-    population = np.arange(n)
-    forward = np.arange(1, n + 1)
-    backwards = np.copy(np.flip(forward))
-    np_weights = np.zeros(n)
+    assert len(population) == DUPLICATION_FACTOR * len(block)
 
     while True:
-        top = np.max(np_weights) + 1
-        weights: Sequence[float] = top - np_weights  # type: ignore
-        index = rng.choices(population=population, weights=weights)[0]
+        index = rng.choice(population)
         yield index
-
-        if index == 0:
-            np_weights += backwards
-        else:
-            np_weights[:(index + 1)] += forward[-(index + 1):]
-            np_weights[(index + 1):] += backwards[1:-index]
-
-        # Prevent overflow.
-        np_weights -= np_weights.min()
+        population[index] = rng.integers(low=0, high=high)