Add ConstrainedPartitionsIntRange (#192)

gsakkis · web-flow · commit 1e7a75589393 · 2022-10-03T15:55:35.000+03:00
* Fix InclusiveRange.equal_values(x, y) between IntRange and WeightedRange

* Drop InclusiveRange.factory, add WeightedRange.from_mapping

* Add ConstrainedPartitionsIntRange

* Refactor ConstrainedPartitionsIntRange

* Update InclusiveRange.__getstate__ to work with inherited __slots__
diff --git a/tests/readers/test_ranges.py b/tests/readers/test_ranges.py
@@ -6,60 +6,35 @@
 import pytest
 
 from tiledb.ml.readers._tensor_schema.ranges import (
-    InclusiveRange,
+    ConstrainedPartitionsIntRange,
     IntRange,
     WeightedRange,
 )
 
 
-@pytest.mark.parametrize("values", [None, 42, 3.14])
-def test_inclusive_range_factory_type_error(values):
-    with pytest.raises(TypeError) as excinfo:
-        InclusiveRange.factory(values)
-    assert "Cannot create inclusive range" in str(excinfo.value)
-
-
 class TestIntRange:
-    values = range(10, 20)
-    r = InclusiveRange.factory(values)
+    r = IntRange(10, 19)
 
     def test_basic(self):
         assert self.r.min == 10
         assert self.r.max == 19
         assert self.r.weight == 10
         assert len(self.r) == 10
 
-    @pytest.mark.parametrize(
-        "values",
-        [
-            values,
-            list(values),
-            set(values),
-            iter(values),
-            reversed(values),
-            Counter(values),
-            np.array(values),
-            range(19, 9, -1),
-            np.arange(19, 9, -1),
-        ],
-    )
-    def test_equal(self, values):
-        assert_equal_ranges(self.r, InclusiveRange.factory(values), IntRange)
+    def test_equal(self):
+        assert_equal_ranges(self.r, IntRange(10, 19))
+        assert self.r != IntRange(0, 9)
+        assert self.r != IntRange(10, 20)
+        assert self.r != IntRange(11, 19)
+        assert self.r != WeightedRange.from_mapping(dict.fromkeys(self.r.values, 2))
 
-    @pytest.mark.parametrize(
-        "values",
-        [
-            np.array(values, dtype=object),
-            range(0, 10),
-            range(10, 21),
-            range(11, 20),
-            range(10, 20, 2),
-        ],
-    )
-    def test_not_equal(self, values):
-        r = InclusiveRange.factory(values)
-        assert self.r != r
-        assert not self.r.equal_values(r)
+    def test_equal_values(self):
+        assert self.r.equal_values(IntRange(10, 19))
+        assert not self.r.equal_values(IntRange(10, 20))
+        assert not self.r.equal_values(IntRange(11, 19))
+        assert self.r.equal_values(
+            WeightedRange.from_mapping(dict.fromkeys(self.r.values, 2))
+        )
 
     def test_indices(self):
         np.testing.assert_array_equal(
@@ -83,22 +58,22 @@ def test_indices_error(self, values):
     @pytest.mark.parametrize(
         "k,expected_bounds",
         [
-            (1, [(10, 20)]),
-            (2, [(10, 15), (15, 20)]),
-            (3, [(10, 14), (14, 17), (17, 20)]),
-            (4, [(10, 13), (13, 16), (16, 18), (18, 20)]),
-            (5, [(10, 12), (12, 14), (14, 16), (16, 18), (18, 20)]),
-            (6, [(10, 12), (12, 14), (14, 16), (16, 18), (18, 19), (19, 20)]),
-            (7, [(10, 12), (12, 14), (14, 16)] + [(i, i + 1) for i in range(16, 20)]),
-            (8, [(10, 12), (12, 14)] + [(i, i + 1) for i in range(14, 20)]),
-            (9, [(10, 12)] + [(i, i + 1) for i in range(12, 20)]),
-            (10, [(i, i + 1) for i in range(10, 20)]),
+            (1, [(10, 19)]),
+            (2, [(10, 14), (15, 19)]),
+            (3, [(10, 13), (14, 16), (17, 19)]),
+            (4, [(10, 12), (13, 15), (16, 17), (18, 19)]),
+            (5, [(10, 11), (12, 13), (14, 15), (16, 17), (18, 19)]),
+            (6, [(10, 11), (12, 13), (14, 15), (16, 17), (18, 18), (19, 19)]),
+            (7, [(10, 11), (12, 13), (14, 15)] + [(i, i) for i in range(16, 20)]),
+            (8, [(10, 11), (12, 13)] + [(i, i) for i in range(14, 20)]),
+            (9, [(10, 11)] + [(i, i) for i in range(12, 20)]),
+            (10, [(i, i) for i in range(10, 20)]),
         ],
     )
     def test_partition_by_count(self, k, expected_bounds):
         ranges = list(self.r.partition_by_count(k))
         assert len(ranges) == k
-        expected_ranges = [InclusiveRange.factory(range(*bs)) for bs in expected_bounds]
+        expected_ranges = [IntRange(*bounds) for bounds in expected_bounds]
         assert ranges == expected_ranges
 
     def test_partition_by_count_error(self):
@@ -110,33 +85,93 @@ def test_partition_by_count_error(self):
     @pytest.mark.parametrize(
         "max_weight,expected_bounds",
         [
-            (1, [(i, i + 1) for i in range(10, 20)]),
-            (2, [(10, 12), (12, 14), (14, 16), (16, 18), (18, 20)]),
-            (3, [(10, 13), (13, 16), (16, 19), (19, 20)]),
-            (4, [(10, 14), (14, 18), (18, 20)]),
-            (5, [(10, 15), (15, 20)]),
-            (6, [(10, 16), (16, 20)]),
-            (7, [(10, 17), (17, 20)]),
-            (8, [(10, 18), (18, 20)]),
-            (9, [(10, 19), (19, 20)]),
-            (10, [(10, 20)]),
-            (11, [(10, 20)]),
+            (1, [(i, i) for i in range(10, 20)]),
+            (2, [(10, 11), (12, 13), (14, 15), (16, 17), (18, 19)]),
+            (3, [(10, 12), (13, 15), (16, 18), (19, 19)]),
+            (4, [(10, 13), (14, 17), (18, 19)]),
+            (5, [(10, 14), (15, 19)]),
+            (6, [(10, 15), (16, 19)]),
+            (7, [(10, 16), (17, 19)]),
+            (8, [(10, 17), (18, 19)]),
+            (9, [(10, 18), (19, 19)]),
+            (10, [(10, 19)]),
+            (11, [(10, 19)]),
         ],
     )
     def test_partition_by_weight(self, max_weight, expected_bounds):
         ranges = list(self.r.partition_by_weight(max_weight))
         assert max(r.weight for r in ranges) <= max_weight
-        expected_ranges = [InclusiveRange.factory(range(*bs)) for bs in expected_bounds]
+        expected_ranges = [IntRange(*bounds) for bounds in expected_bounds]
         assert ranges == expected_ranges
 
     def test_pickle(self):
         assert pickle.loads(pickle.dumps(self.r)) == self.r
 
 
+class TestConstrainedPartitionsIntRange:
+    r = ConstrainedPartitionsIntRange(10, 29, range(1, 101, 4))
+
+    @pytest.mark.parametrize(
+        "k,expected_bounds",
+        [
+            (1, [(10, 29)]),
+            (2, [(10, 20), (21, 29)]),
+            (3, [(10, 16), (17, 24), (25, 29)]),
+            (4, [(10, 16), (17, 20), (21, 24), (25, 29)]),
+            (5, [(10, 12), (13, 16), (17, 20), (21, 24), (25, 29)]),
+            (6, [(10, 12), (13, 16), (17, 20), (21, 24), (25, 28), (29, 29)]),
+        ],
+    )
+    def test_partition_by_count(self, k, expected_bounds):
+        ranges = list(self.r.partition_by_count(k))
+        assert len(ranges) == k
+        # all partitions after the first must start at a start_offset
+        start_offsets = self.r.start_offsets
+        assert all(r.min in start_offsets for r in ranges[1:])
+        bounds = [(r.min, r.max) for r in ranges]
+        assert bounds == expected_bounds
+
+    @pytest.mark.parametrize("k", [7, 8, 9, 10])
+    def test_partition_by_count_error(self, k):
+        with pytest.raises(ValueError) as excinfo:
+            list(self.r.partition_by_count(k))
+        assert "Cannot partition range" in str(excinfo.value)
+
+    @pytest.mark.parametrize(
+        "max_weight,expected_bounds",
+        [
+            (4, [(10, 12), (13, 16), (17, 20), (21, 24), (25, 28), (29, 29)]),
+            (5, [(10, 12), (13, 16), (17, 20), (21, 24), (25, 29)]),
+            (6, [(10, 12), (13, 16), (17, 20), (21, 24), (25, 29)]),
+            (7, [(10, 16), (17, 20), (21, 24), (25, 29)]),
+            (8, [(10, 16), (17, 24), (25, 29)]),
+            (9, [(10, 16), (17, 24), (25, 29)]),
+            (10, [(10, 16), (17, 24), (25, 29)]),
+            (11, [(10, 20), (21, 29)]),
+        ],
+    )
+    def test_partition_by_weight(self, max_weight, expected_bounds):
+        ranges = list(self.r.partition_by_weight(max_weight))
+        assert max(r.weight for r in ranges) <= max_weight
+        # all partitions after the first must start at a start_offset
+        start_offsets = self.r.start_offsets
+        assert all(r.min in start_offsets for r in ranges[1:])
+        bounds = [(r.min, r.max) for r in ranges]
+        assert bounds == expected_bounds
+
+    @pytest.mark.parametrize("max_weight", [1, 2, 3])
+    def test_partition_by_weight_error(self, max_weight):
+        with pytest.raises(ValueError) as excinfo:
+            list(self.r.partition_by_weight(max_weight))
+        assert "Cannot partition range" in str(excinfo.value)
+
+
 class TestWeightedRange:
     values = ("e", "f", "a", "d", "a", "c", "d", "a", "f", "c", "f", "f", "b", "d")
-    r = InclusiveRange.factory(values)
-    r2 = InclusiveRange.factory({v: timedelta(c) for v, c in Counter(values).items()})
+    r = WeightedRange.from_mapping(Counter(values))
+    r2 = WeightedRange.from_mapping(
+        {v: timedelta(c) for v, c in Counter(values).items()}
+    )
 
     @pytest.mark.parametrize("r", [r, r2])
     def test_basic(self, r):
@@ -145,44 +180,19 @@ def test_basic(self, r):
         assert len(r) == 6
         assert r.weight == 14 if r is self.r else timedelta(14)
 
-    @pytest.mark.parametrize(
-        "values",
-        [
-            values,
-            list(values),
-            iter(values),
-            reversed(values),
-            Counter(values),
-            np.array(values),
-            np.array(values, dtype=object),
-        ],
-    )
-    def test_equal(self, values):
-        assert_equal_ranges(self.r, InclusiveRange.factory(values), WeightedRange)
-
-    def test_not_equal(self):
-        assert self.r != InclusiveRange.factory(set(self.values))
-        assert self.r != InclusiveRange.factory(range(len(set(self.values))))
+    def test_equal(self):
+        assert_equal_ranges(self.r, WeightedRange.from_mapping(Counter(self.values)))
+        assert self.r != WeightedRange.from_mapping(Counter(set(self.values)))
+        assert self.r != IntRange(0, len(set(self.values)) - 1)
 
     def test_equal_values(self):
-        assert self.r.equal_values(InclusiveRange.factory(set(self.values)))
-
-        r = InclusiveRange.factory([1, 2, 3, 3, 4, 5])
-        assert r.equal_values(InclusiveRange.factory(range(1, 6)))
-        assert not r.equal_values(InclusiveRange.factory(range(1, 7)))
-        assert not r.equal_values(InclusiveRange.factory(range(2, 7)))
-
-    def test_strided_range(self):
-        assert_equal_ranges(
-            InclusiveRange.factory(range(10, 20, 3)),
-            InclusiveRange.factory([10, 13, 16, 19]),
-            WeightedRange,
-        )
-        assert_equal_ranges(
-            InclusiveRange.factory(range(20, 10, -3)),
-            InclusiveRange.factory([11, 14, 17, 20]),
-            WeightedRange,
+        assert self.r.equal_values(
+            WeightedRange.from_mapping(Counter(set(self.values)))
         )
+        r = WeightedRange.from_mapping(Counter([1, 2, 3, 3, 4, 5]))
+        assert r.equal_values(IntRange(1, 5))
+        assert not r.equal_values(IntRange(1, 6))
+        assert not r.equal_values(IntRange(2, 5))
 
     def test_indices(self):
         np.testing.assert_array_equal(
@@ -219,15 +229,15 @@ def test_indices_error(self, values):
     def test_partition_by_count(self, k, expected_mappings):
         ranges = list(self.r.partition_by_count(k))
         assert len(ranges) == k
-        expected_ranges = list(map(InclusiveRange.factory, expected_mappings))
+        expected_ranges = list(map(WeightedRange.from_mapping, expected_mappings))
         assert ranges == expected_ranges
 
     @parametrize_by_count
     def test_partition_by_count2(self, k, expected_mappings):
         ranges = list(self.r2.partition_by_count(k))
         assert len(ranges) == k
         expected_ranges = [
-            InclusiveRange.factory({v: timedelta(w) for v, w in mapping.items()})
+            WeightedRange.from_mapping({v: timedelta(w) for v, w in mapping.items()})
             for mapping in expected_mappings
         ]
         assert ranges == expected_ranges
@@ -261,7 +271,7 @@ def test_partition_by_count_error(self, r):
     def test_partition_by_weight(self, max_weight, expected_mappings):
         ranges = list(self.r.partition_by_weight(max_weight))
         assert max(r.weight for r in ranges) <= max_weight
-        expected_ranges = list(map(InclusiveRange.factory, expected_mappings))
+        expected_ranges = list(map(WeightedRange.from_mapping, expected_mappings))
         assert ranges == expected_ranges
 
     @parametrize_by_max_weight
@@ -270,7 +280,7 @@ def test_partition_by_weight2(self, max_weight, expected_mappings):
         ranges = list(self.r2.partition_by_weight(max_weight))
         assert max(r.weight for r in ranges) <= max_weight
         expected_ranges = [
-            InclusiveRange.factory({v: timedelta(w) for v, w in mapping.items()})
+            WeightedRange.from_mapping({v: timedelta(w) for v, w in mapping.items()})
             for mapping in expected_mappings
         ]
         assert ranges == expected_ranges
@@ -280,17 +290,16 @@ def test_partition_by_weight2(self, max_weight, expected_mappings):
     )
     def test_partition_by_weight_error(self, r, max_weights):
         for max_weight in max_weights:
-            with pytest.raises(ValueError):
+            with pytest.raises(ValueError) as excinfo:
                 list(r.partition_by_weight(max_weight))
+            assert "Cannot partition range" in str(excinfo.value)
 
     def test_pickle(self):
         assert pickle.loads(pickle.dumps(self.r)) == self.r
         assert pickle.loads(pickle.dumps(self.r2)) == self.r2
 
 
-def assert_equal_ranges(r1, r2, cls):
-    assert isinstance(r1, cls)
-    assert isinstance(r2, cls)
+def assert_equal_ranges(r1, r2):
     assert r1.min == r2.min
     assert r1.max == r2.max
     assert r1.weight == r2.weight
diff --git a/tiledb/ml/readers/_tensor_schema/base_sparse.py b/tiledb/ml/readers/_tensor_schema/base_sparse.py
@@ -4,7 +4,7 @@
 import numpy as np
 
 from .base import Tensor, TensorSchema
-from .ranges import InclusiveRange
+from .ranges import WeightedRange
 
 
 class BaseSparseTensorSchema(TensorSchema[Tensor]):
@@ -18,8 +18,8 @@ def __init__(self, **kwargs: Any):
             )
 
     @property
-    def key_range(self) -> InclusiveRange[Any, int]:
-        self._key_range: InclusiveRange[Any, int]
+    def key_range(self) -> WeightedRange[Any, int]:
+        self._key_range: WeightedRange[Any, int]
         try:
             return self._key_range
         except AttributeError:
@@ -30,7 +30,7 @@ def key_range(self) -> InclusiveRange[Any, int]:
             assert isinstance(key_dim_slice, slice)
             for result in query[key_dim_slice]:
                 key_counter.update(result[key_dim])
-            self._key_range = InclusiveRange.factory(key_counter)
+            self._key_range = WeightedRange.from_mapping(key_counter)
             return self._key_range
 
     @property
diff --git a/tiledb/ml/readers/_tensor_schema/dense.py b/tiledb/ml/readers/_tensor_schema/dense.py
@@ -5,7 +5,7 @@
 import numpy as np
 
 from .base import TensorSchema
-from .ranges import InclusiveRange
+from .ranges import InclusiveRange, IntRange
 
 
 class DenseTensorSchema(TensorSchema[np.ndarray]):
@@ -21,7 +21,7 @@ def __init__(self, **kwargs: Any):
             )
 
     @property
-    def key_range(self) -> InclusiveRange[int, int]:
+    def key_range(self) -> IntRange:
         try:
             key_dim_slice = self._dim_selectors[0]
         except KeyError:
@@ -33,7 +33,7 @@ def key_range(self) -> InclusiveRange[int, int]:
             raise NotImplementedError(
                 "Key dimension slicing is not yet implemented for dense arrays"
             )
-        return InclusiveRange.factory(range(key_dim_min, key_dim_max + 1))
+        return IntRange(key_dim_min, key_dim_max)
 
     def iter_tensors(
         self, key_ranges: Iterable[InclusiveRange[int, int]]
diff --git a/tiledb/ml/readers/_tensor_schema/ranges.py b/tiledb/ml/readers/_tensor_schema/ranges.py