Skip to content

Commit 1a4cc9d

Browse files
authored
SubspaceDiscrete.from_simplex convenience constructor (#117)
This PR adds a new `SubspaceDiscrete.from_simplex` convenience constructor that is useful for mixture use cases. In particular, it enables such cases when an API is involved, as the two existing alternatives will not work here: * `from_product` in combination with sum constraints suffers from the exponential blowup of parameter configurations * `from_dataframe` requires sending excessively large requests to the API (since the requests need to contain the full enumerated search space)
2 parents 1105bda + 6fc23c2 commit 1a4cc9d

File tree

5 files changed

+228
-3
lines changed

5 files changed

+228
-3
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1010
- `mypy` for campaign, constraints and telemetry
1111
- Top-level example summaries
1212
- `RecommenderProtocol` as common interface for `Strategy` and `Recommender`
13+
- `SubspaceDiscrete.from_simplex` convenience constructor
1314

1415
### Changed
1516
- Order of README sections

baybe/searchspace/discrete.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from __future__ import annotations
44

5+
from itertools import zip_longest
56
from typing import Any, Collection, Iterable, List, Optional, Tuple, cast
67

78
import numpy as np
@@ -230,6 +231,133 @@ def discrete_parameter_factory(
230231

231232
return cls(parameters=parameters, exp_rep=df, empty_encoding=empty_encoding)
232233

234+
@classmethod
def from_simplex(
    cls,
    max_sum: float,
    simplex_parameters: List[NumericalDiscreteParameter],
    product_parameters: Optional[List[DiscreteParameter]] = None,
    boundary_only: bool = False,
    tolerance: float = 1e-6,
) -> SubspaceDiscrete:
    """Efficiently create discrete simplex subspaces.

    The same result can be achieved using
    :meth:`baybe.searchspace.discrete.SubspaceDiscrete.from_product` in combination
    with appropriate sum constraints. However, such an approach is inefficient
    because the Cartesian product involved creates an exponentially large set of
    candidates, most of which do not satisfy the simplex constraints and must
    subsequently be filtered out by the method.

    By contrast, this method uses a shortcut that removes invalid candidates
    already during the creation of parameter combinations, resulting in a
    significantly faster construction.

    Args:
        max_sum: The maximum sum of the parameter values defining the simplex size.
        simplex_parameters: The parameters to be used for the simplex construction.
        product_parameters: Optional parameters that enter in form of a Cartesian
            product.
        boundary_only: Flag determining whether to keep only parameter
            configurations on the simplex boundary.
        tolerance: Numerical tolerance used to validate the simplex constraint.

    Raises:
        ValueError: If the passed parameters are not suitable for a simplex
            construction.

    Returns:
        The created simplex subspace, spanning both the simplex parameters and
        the product parameters.

    Note:
        The achieved efficiency gains can vary depending on the particular order in
        which the parameters are passed to this method, as the configuration space
        is built up incrementally from the parameter sequence.
    """
    if product_parameters is None:
        product_parameters = []

    # Validate parameter types
    if not all(isinstance(p, NumericalDiscreteParameter) for p in simplex_parameters):
        raise ValueError(
            f"All parameters passed via 'simplex_parameters' "
            f"must be of type '{NumericalDiscreteParameter.__name__}'."
        )
    if not all(isinstance(p, DiscreteParameter) for p in product_parameters):
        raise ValueError(
            f"All parameters passed via 'product_parameters' "
            f"must be of subclasses of '{DiscreteParameter.__name__}'."
        )

    # Construct the product part of the space
    product_space = parameter_cartesian_prod_to_df(product_parameters)
    if not simplex_parameters:
        return cls(parameters=product_parameters, exp_rep=product_space)

    # Validate non-negativity
    min_values = [min(p.values) for p in simplex_parameters]
    if not (min(min_values) >= 0.0):
        raise ValueError(
            f"All parameters passed to '{cls.from_simplex.__name__}' "
            f"must have non-negative values only."
        )

    def drop_invalid(df: pd.DataFrame, max_sum: float, boundary_only: bool) -> None:
        """Drop rows that violate a specified simplex constraint.

        The dataframe is modified in place. The enclosing ``tolerance`` is used
        as slack on both sides of the comparison.

        Args:
            df: The dataframe whose rows should satisfy the simplex constraint.
            max_sum: The maximum row sum defining the simplex size.
            boundary_only: Flag to control if the points represented by the rows
                may lie inside the simplex or on its boundary only.
        """
        row_sums = df.sum(axis=1)
        if boundary_only:
            locs_to_drop = row_sums[
                (row_sums < max_sum - tolerance) | (row_sums > max_sum + tolerance)
            ].index
        else:
            locs_to_drop = row_sums[row_sums > max_sum + tolerance].index
        df.drop(locs_to_drop, inplace=True)

    # Get the minimum sum contributions to come in the upcoming joins (the first
    # item is the minimum possible sum of all parameters starting from the
    # second parameter, the second item is the minimum possible sum starting from
    # the third parameter, and so on ...)
    min_upcoming = np.cumsum(min_values[:0:-1])[::-1]

    # Incrementally build up the space, dropping invalid configuration along the
    # way. More specifically: after having cross-joined a new parameter, there must
    # be enough "room" left for the remaining parameters to fit. Hence,
    # configurations of the current parameter subset that exceed the desired
    # total value minus the minimum contribution to come from the yet to be added
    # parameters can be already discarded. `min_upcoming` is one element shorter
    # than the parameter list, so the last parameter is paired with the fill
    # value 0 (nothing left to add after it).
    for i, (param, min_to_go) in enumerate(
        zip_longest(simplex_parameters, min_upcoming, fillvalue=0)
    ):
        if i == 0:
            exp_rep = pd.DataFrame({param.name: param.values})
        else:
            exp_rep = pd.merge(
                exp_rep, pd.DataFrame({param.name: param.values}), how="cross"
            )
        drop_invalid(exp_rep, max_sum - min_to_go, boundary_only=False)

    # If requested, keep only the boundary values
    if boundary_only:
        drop_invalid(exp_rep, max_sum, boundary_only=True)

    # Augment the Cartesian product created from all other parameter types
    if product_parameters:
        exp_rep = pd.merge(exp_rep, product_space, how="cross")

    # Reset the index
    exp_rep.reset_index(drop=True, inplace=True)

    # The experimental representation contains the columns of the simplex AND the
    # product parameters, so both groups must be registered with the subspace
    # (passing only the simplex parameters would leave the product columns
    # without a corresponding parameter object).
    return cls(
        parameters=[*simplex_parameters, *product_parameters],
        exp_rep=exp_rep,
    )
233361
@property
234362
def is_empty(self) -> bool:
235363
"""Return whether this subspace is empty."""

tests/hypothesis_strategies/alternative_creation/test_searchspace.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,21 @@
11
"""Test alternative ways of creation not considered in the strategies."""
22

3+
import hypothesis.strategies as st
4+
import numpy as np
35
import pandas as pd
46
import pytest
7+
from hypothesis import given
58
from pytest import param
69

710
from baybe.parameters import (
811
CategoricalParameter,
912
NumericalContinuousParameter,
1013
NumericalDiscreteParameter,
1114
)
15+
from baybe.parameters.categorical import TaskParameter
1216
from baybe.searchspace import SearchSpace, SubspaceContinuous
1317
from baybe.searchspace.discrete import SubspaceDiscrete
18+
from tests.hypothesis_strategies.parameters import numerical_discrete_parameter
1419

1520
# Discrete inputs for testing
1621
s_x = pd.Series([1, 2, 3], name="x")
@@ -97,3 +102,63 @@ def test_searchspace_creation_from_dataframe(df, parameters, expected):
97102
else:
98103
with pytest.raises(expected):
99104
SearchSpace.from_dataframe(df, parameters)
105+
106+
107+
@pytest.mark.parametrize("boundary_only", (False, True))
108+
@given(
109+
parameters=st.lists(
110+
numerical_discrete_parameter(min_value=0.0, max_value=1.0),
111+
min_size=1,
112+
max_size=5,
113+
unique_by=lambda x: x.name,
114+
)
115+
)
116+
def test_discrete_space_creation_from_simplex_inner(parameters, boundary_only):
117+
"""Candidates from a simplex space satisfy the simplex constraint."""
118+
tolerance = 1e-6
119+
max_possible = sum(max(p.values) for p in parameters)
120+
min_possible = sum(min(p.values) for p in parameters)
121+
122+
if boundary_only:
123+
# Ensure there exists configurations both inside and outside the simplex
124+
max_sum = (max_possible + min_possible) / 2
125+
else:
126+
# We use the maximum parameter sum because it can be exactly achieved (for other
127+
# values, except for the minimum, it's not guaranteed there actually exists
128+
# a parameter combination that can exactly hit it)
129+
max_sum = max_possible
130+
131+
subspace = SubspaceDiscrete.from_simplex(
132+
max_sum, parameters, boundary_only=boundary_only, tolerance=tolerance
133+
)
134+
135+
if boundary_only:
136+
assert np.allclose(subspace.exp_rep.sum(axis=1), max_sum, atol=tolerance)
137+
else:
138+
assert (subspace.exp_rep.sum(axis=1) <= max_sum + tolerance).all()
139+
140+
141+
# Parameters shared by the parametrized cases below: two numerical parameters for
# the simplex part and two task parameters for the Cartesian product part.
p_d1 = NumericalDiscreteParameter(name="d1", values=[0.0, 0.5, 1.0])
p_d2 = NumericalDiscreteParameter(name="d2", values=[0.0, 0.5, 1.0])
p_t1 = TaskParameter(name="t1", values=["A", "B"])
p_t2 = TaskParameter(name="t2", values=["A", "B"])


@pytest.mark.parametrize(
    ("simplex_parameters", "product_parameters", "n_elements"),
    [
        param([p_d1, p_d2], [p_t1, p_t2], 6 * 4, id="both"),
        param([p_d1, p_d2], [], 6, id="simplex-only"),
        param([], [p_t1, p_t2], 4, id="task_only"),
    ],
)
def test_discrete_space_creation_from_simplex_mixed(
    simplex_parameters, product_parameters, n_elements
):
    """Non-simplex parameters are combined with the simplex part as a product."""
    subspace = SubspaceDiscrete.from_simplex(
        1.0, simplex_parameters, product_parameters, boundary_only=False
    )
    candidates = subspace.exp_rep

    # Expected size: (# simplex part) x (# task part)
    assert len(candidates) == n_elements
    assert not candidates.duplicated().any()

tests/hypothesis_strategies/parameters.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""Hypothesis strategies for parameters."""
22

3+
from typing import Optional
4+
35
import hypothesis.strategies as st
46
import numpy as np
57
from hypothesis.extra.pandas import columns, data_frames
@@ -76,14 +78,18 @@ def custom_descriptors(draw: st.DrawFn):
7678
@st.composite
7779
def numerical_discrete_parameter(
7880
draw: st.DrawFn,
81+
min_value: Optional[float] = None,
82+
max_value: Optional[float] = None,
7983
):
8084
"""Generate :class:`baybe.parameters.numerical.NumericalDiscreteParameter`."""
8185
name = draw(parameter_name)
8286
values = draw(
8387
st.lists(
84-
st.one_of(
85-
st.integers(),
86-
st.floats(allow_infinity=False, allow_nan=False),
88+
st.floats(
89+
allow_infinity=False,
90+
allow_nan=False,
91+
min_value=min_value,
92+
max_value=max_value,
8793
),
8894
min_size=2,
8995
unique=True,

tests/serialization/test_searchspace_serialization.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@
44

55
import pytest
66

7+
from baybe.parameters.numerical import NumericalDiscreteParameter
78
from baybe.searchspace import SearchSpace
9+
from baybe.searchspace.discrete import SubspaceDiscrete
10+
from baybe.serialization.core import converter
811

912

1013
@pytest.mark.parametrize(
@@ -40,3 +43,25 @@ def test_from_dataframe_deserialization(searchspace):
4043
)
4144
deserialized = SearchSpace.from_json(config)
4245
assert searchspace == deserialized, (searchspace, deserialized)
46+
47+
48+
def test_from_simplex_deserialization():
    """A subspace built via ``from_simplex`` equals its JSON-config counterpart."""
    max_sum = 1.0
    parameters = [
        NumericalDiscreteParameter("p1", [0, 0.5, 1]),
        NumericalDiscreteParameter("p2", [0, 0.5, 1]),
    ]
    subspace = SubspaceDiscrete.from_simplex(max_sum, simplex_parameters=parameters)

    # Fill the placeholders of the config template one after another (the sum
    # first, then the serialized parameter list).
    substitutions = {
        "__fillin__max_sum__": str(max_sum),
        "__fillin__parameters__": json.dumps(converter.unstructure(parameters)),
    }
    config = """
    {
        "constructor": "from_simplex",
        "max_sum": __fillin__max_sum__,
        "simplex_parameters": __fillin__parameters__
    }
    """
    for placeholder, value in substitutions.items():
        config = config.replace(placeholder, value)

    deserialized = SubspaceDiscrete.from_json(config)
    assert subspace == deserialized, (subspace, deserialized)

0 commit comments

Comments
 (0)