|
2 | 2 |
|
3 | 3 | from __future__ import annotations
|
4 | 4 |
|
| 5 | +from itertools import zip_longest |
5 | 6 | from typing import Any, Collection, Iterable, List, Optional, Tuple, cast
|
6 | 7 |
|
7 | 8 | import numpy as np
|
@@ -230,6 +231,133 @@ def discrete_parameter_factory(
|
230 | 231 |
|
231 | 232 | return cls(parameters=parameters, exp_rep=df, empty_encoding=empty_encoding)
|
232 | 233 |
|
@classmethod
def from_simplex(
    cls,
    max_sum: float,
    simplex_parameters: List[NumericalDiscreteParameter],
    product_parameters: Optional[List[DiscreteParameter]] = None,
    boundary_only: bool = False,
    tolerance: float = 1e-6,
) -> SubspaceDiscrete:
    """Efficiently create discrete simplex subspaces.

    The same result can be achieved using
    :meth:`baybe.searchspace.discrete.SubspaceDiscrete.from_product` in combination
    with appropriate sum constraints. However, such an approach is inefficient
    because the Cartesian product involved creates an exponentially large set of
    candidates, most of which do not satisfy the simplex constraints and must
    subsequently be filtered out by the method.

    By contrast, this method uses a shortcut that removes invalid candidates
    already during the creation of parameter combinations, resulting in a
    significantly faster construction.

    Args:
        max_sum: The maximum sum of the parameter values defining the simplex size.
        simplex_parameters: The parameters to be used for the simplex construction.
        product_parameters: Optional parameters that enter in form of a Cartesian
            product.
        boundary_only: Flag determining whether to keep only parameter
            configurations on the simplex boundary.
        tolerance: Numerical tolerance used to validate the simplex constraint.

    Raises:
        ValueError: If the passed parameters are not suitable for a simplex
            construction.

    Returns:
        The created simplex subspace. Its parameters are the simplex parameters
        followed by the product parameters, matching the columns of its
        experimental representation.

    Note:
        The achieved efficiency gains can vary depending on the particular order in
        which the parameters are passed to this method, as the configuration space
        is built up incrementally from the parameter sequence.
    """
    if product_parameters is None:
        product_parameters = []

    # Validate parameter types
    if not all(
        isinstance(p, NumericalDiscreteParameter) for p in simplex_parameters
    ):
        raise ValueError(
            "All parameters passed via 'simplex_parameters' "
            f"must be of type '{NumericalDiscreteParameter.__name__}'."
        )
    if not all(isinstance(p, DiscreteParameter) for p in product_parameters):
        raise ValueError(
            "All parameters passed via 'product_parameters' "
            f"must be of subclasses of '{DiscreteParameter.__name__}'."
        )

    # Construct the product part of the space
    product_space = parameter_cartesian_prod_to_df(product_parameters)
    if not simplex_parameters:
        return cls(parameters=product_parameters, exp_rep=product_space)

    # Validate non-negativity
    min_values = [min(p.values) for p in simplex_parameters]
    if not (min(min_values) >= 0.0):
        raise ValueError(
            f"All parameters passed to '{cls.from_simplex.__name__}' "
            "must have non-negative values only."
        )

    def drop_invalid(df: pd.DataFrame, max_sum: float, boundary_only: bool) -> None:
        """Drop rows that violate a specified simplex constraint.

        The dataframe is modified in place. The comparison uses the ``tolerance``
        of the enclosing call.

        Args:
            df: The dataframe whose rows should satisfy the simplex constraint.
            max_sum: The maximum row sum defining the simplex size.
            boundary_only: Flag to control if the points represented by the rows
                may lie inside the simplex or on its boundary only.
        """
        row_sums = df.sum(axis=1)
        if boundary_only:
            locs_to_drop = row_sums[
                (row_sums < max_sum - tolerance) | (row_sums > max_sum + tolerance)
            ].index
        else:
            locs_to_drop = row_sums[row_sums > max_sum + tolerance].index
        df.drop(locs_to_drop, inplace=True)

    # Get the minimum sum contributions to come in the upcoming joins (the first
    # item is the minimum possible sum of all parameters starting from the
    # second parameter, the second item is the minimum possible sum starting from
    # the third parameter, and so on ...)
    min_upcoming = np.cumsum(min_values[:0:-1])[::-1]

    # Incrementally build up the space, dropping invalid configurations along the
    # way. More specifically: after having cross-joined a new parameter, there must
    # be enough "room" left for the remaining parameters to fit. Hence,
    # configurations of the current parameter subset that exceed the desired
    # total value minus the minimum contribution to come from the yet to be added
    # parameters can be already discarded.
    # NOTE: `min_upcoming` is one element shorter than `simplex_parameters`, so
    # `zip_longest` pads the last parameter with a remaining contribution of 0.
    for i, (param, min_to_go) in enumerate(
        zip_longest(simplex_parameters, min_upcoming, fillvalue=0)
    ):
        if i == 0:
            exp_rep = pd.DataFrame({param.name: param.values})
        else:
            exp_rep = pd.merge(
                exp_rep, pd.DataFrame({param.name: param.values}), how="cross"
            )
        drop_invalid(exp_rep, max_sum - min_to_go, boundary_only=False)

    # If requested, keep only the boundary values
    if boundary_only:
        drop_invalid(exp_rep, max_sum, boundary_only=True)

    # Augment the Cartesian product created from all other parameter types
    if product_parameters:
        exp_rep = pd.merge(exp_rep, product_space, how="cross")

    # Reset the index
    exp_rep.reset_index(drop=True, inplace=True)

    # The experimental representation holds columns for both parameter groups,
    # so both groups must be registered as parameters of the subspace.
    return cls(
        parameters=[*simplex_parameters, *product_parameters],
        exp_rep=exp_rep,
    )
| 360 | + |
233 | 361 | @property
|
234 | 362 | def is_empty(self) -> bool:
|
235 | 363 | """Return whether this subspace is empty."""
|
|
0 commit comments