From 00ee6116bdf5ea36f59d601c8ca2d5277da64771 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Fri, 3 Jan 2025 16:29:02 +0100 Subject: [PATCH 01/11] Extract experimental input validation utility --- baybe/campaign.py | 46 ++++------------- baybe/telemetry.py | 9 +--- baybe/utils/dataframe.py | 38 ++------------ baybe/utils/validation.py | 106 +++++++++++++++++++++++++++++++++++++- 4 files changed, 120 insertions(+), 79 deletions(-) diff --git a/baybe/campaign.py b/baybe/campaign.py index 6bd36585b..fc5662a54 100644 --- a/baybe/campaign.py +++ b/baybe/campaign.py @@ -43,6 +43,7 @@ from baybe.utils.boolean import eq_dataframe from baybe.utils.dataframe import filter_df, fuzzy_row_match from baybe.utils.plotting import to_string +from baybe.utils.validation import validate_parameter_input, validate_target_input if TYPE_CHECKING: from botorch.posteriors import Posterior @@ -201,48 +202,24 @@ def add_measurements( Each addition of data is considered a new batch. Added results are checked for validity. Categorical values need to have an exact match. For numerical values, a campaign flag determines if values that lie outside a specified tolerance - are accepted. - Note that this modifies the provided data in-place. + are accepted. Possible validation exceptions are documented in + :func:`baybe.utils.validation.validate_target_input` and + :func:`baybe.utils.validation.validate_parameter_input`. Args: data: The data to be added (with filled values for targets). Preferably created via :func:`baybe.campaign.Campaign.recommend`. numerical_measurements_must_be_within_tolerance: Flag indicating if numerical parameters need to be within their tolerances. - - Raises: - ValueError: If one of the targets has missing values or NaNs in the provided - dataframe. - TypeError: If the target has non-numeric entries in the provided dataframe. """ # Invalidate recommendation cache first (in case of uncaught exceptions below) self._cached_recommendation = pd.DataFrame() - # Check if all targets have valid values - for target in self.targets: - if data[target.name].isna().any(): - raise ValueError( - f"The target '{target.name}' has missing values or NaNs in the " - f"provided dataframe. Missing target values are not supported." - ) - if data[target.name].dtype.kind not in "iufb": - raise TypeError( - f"The target '{target.name}' has non-numeric entries in the " - f"provided dataframe. Non-numeric target values are not supported." - ) - - # Check if all targets have valid values - for param in self.parameters: - if data[param.name].isna().any(): - raise ValueError( - f"The parameter '{param.name}' has missing values or NaNs in the " - f"provided dataframe. Missing parameter values are not supported." - ) - if param.is_numerical and (data[param.name].dtype.kind not in "iufb"): - raise TypeError( - f"The numerical parameter '{param.name}' has non-numeric entries in" - f" the provided dataframe." - ) + # Validate target and parameter input values + validate_target_input(data, self.targets) + validate_parameter_input( + data, self.parameters, numerical_measurements_must_be_within_tolerance + ) # Read in measurements and add them to the database self.n_batches_done += 1 @@ -257,10 +234,7 @@ def add_measurements( # Update metadata if self.searchspace.type in (SearchSpaceType.DISCRETE, SearchSpaceType.HYBRID): idxs_matched = fuzzy_row_match( - self.searchspace.discrete.exp_rep, - data, - self.parameters, - numerical_measurements_must_be_within_tolerance, + self.searchspace.discrete.exp_rep, data, self.parameters ) self._searchspace_metadata.loc[idxs_matched, _MEASURED] = True diff --git a/baybe/telemetry.py b/baybe/telemetry.py index f3879cef4..895e351f7 100644 --- a/baybe/telemetry.py +++ b/baybe/telemetry.py @@ -240,14 +240,7 @@ def telemetry_record_recommended_measurement_percentage( if is_enabled(): if len(cached_recommendation) > 0: recommended_measurements_percentage = ( - len( - fuzzy_row_match( - cached_recommendation, - measurements, - parameters, - numerical_measurements_must_be_within_tolerance, - ) - ) + len(fuzzy_row_match(cached_recommendation, measurements, parameters)) / len(cached_recommendation) * 100.0 ) diff --git a/baybe/utils/dataframe.py b/baybe/utils/dataframe.py index 61cc4f051..048b40665 100644 --- a/baybe/utils/dataframe.py +++ b/baybe/utils/dataframe.py @@ -416,15 +416,13 @@ def fuzzy_row_match( left_df: pd.DataFrame, right_df: pd.DataFrame, parameters: Sequence[Parameter], - numerical_measurements_must_be_within_tolerance: bool, ) -> pd.Index: """Match row of the right dataframe to the rows of the left dataframe. - This is useful for validity checks and to automatically match measurements to - entries in the search space, e.g. to detect which ones have been measured. - For categorical parameters, there needs to be an exact match with any of the - allowed values. For numerical parameters, the user can decide via a flag - whether values outside the tolerance should be accepted. + This is useful for matching measurements to entries in the search space, e.g. to + detect which ones have been measured. For categorical parameters, there needs to be + an exact match with any of the allowed values. For numerical parameters, the user + can decide via a flag whether values outside the tolerance should be accepted. Args: left_df: The data that serves as lookup reference. @@ -432,17 +430,12 @@ def fuzzy_row_match( dataframe. parameters: List of baybe parameter objects that are needed to identify potential tolerances. - numerical_measurements_must_be_within_tolerance: If ``True``, numerical - parameters are matched with the search space elements only if there is a - match within the parameter tolerance. If ``False``, the closest match is - considered, irrespective of the distance. Returns: The index of the matching rows in ``left_df``. Raises: ValueError: If some rows are present in the right but not in the left dataframe. - ValueError: If the input data has invalid values. """ # Assert that all parameters appear in the given dataframe if not all(col in right_df.columns for col in left_df.columns): @@ -451,30 +444,9 @@ def fuzzy_row_match( " in the left dataframe." ) - inds_matched = [] - # Iterate over all input rows + inds_matched = [] for ind, row in right_df.iterrows(): - # Check if the row represents a valid input - valid = True - for param in parameters: - if param.is_numerical: - if numerical_measurements_must_be_within_tolerance: - valid &= param.is_in_range(row[param.name]) - else: - valid &= param.is_in_range(row[param.name]) - if not valid: - raise ValueError( - f"Input data on row with the index {row.name} has invalid " - f"values in parameter '{param.name}'. " - f"For categorical parameters, values need to exactly match a " - f"valid choice defined in your config. " - f"For numerical parameters, a match is accepted only if " - f"the input value is within the specified tolerance/range. Set " - f"the flag 'numerical_measurements_must_be_within_tolerance' " - f"to 'False' to disable this behavior." - ) - # Differentiate category-like and discrete numerical parameters cat_cols = [p.name for p in parameters if not p.is_numerical] num_cols = [p.name for p in parameters if (p.is_numerical and p.is_discrete)] diff --git a/baybe/utils/validation.py b/baybe/utils/validation.py index a16d018c4..8c9037f6d 100644 --- a/baybe/utils/validation.py +++ b/baybe/utils/validation.py @@ -3,11 +3,16 @@ from __future__ import annotations import math -from collections.abc import Callable -from typing import Any +from collections.abc import Callable, Sequence +from typing import TYPE_CHECKING, Any +import pandas as pd from attrs import Attribute +if TYPE_CHECKING: + from baybe.parameters.base import Parameter + from baybe.targets.base import Target + def validate_not_nan(self: Any, attribute: Attribute, value: Any) -> None: """Attrs-compatible validator to forbid 'nan' values.""" @@ -68,3 +73,100 @@ def validator(self: Any, attribute: Attribute, value: Any) -> None: non_inf_float = _make_restricted_float_validator(allow_nan=True, allow_inf=False) """Validator for non-infinite floats.""" + + +def validate_target_input(data: pd.DataFrame, targets: Sequence[Target]) -> None: + """Validate input dataframe columns corresponding to targets. + + Args: + data: The input dataframe to be validated. + targets: The allowed targets. + + Raises: + ValueError: If the input dataframe is empty. + ValueError: If any target data contain NaN. + TypeError: If any numerical target data contain non-numeric values. + ValueError: If any binary target data contain values not part of the targets' + allowed values. + """ + from baybe.targets import BinaryTarget, NumericalTarget + + if len(data) < 1: + raise ValueError("The provided input dataframe cannot be emtpy.") + + for t in targets: + if data[t.name].isna().any(): + raise ValueError( + f"The target '{t.name}' has missing values or NaNs in the provided " + f"dataframe. Missing target values are not supported." + ) + + if isinstance(t, NumericalTarget): + if data[t.name].dtype.kind not in "iufb": + raise TypeError( + f"The numerical target '{t.name}' has non-numeric entries in the " + f"provided dataframe. Non-numeric target values are not supported." + ) + elif isinstance(t, BinaryTarget): + if not ( + data[t.name].isin(allowed := [t.failure_value, t.success_value]).all() + ): + raise ValueError( + f"The binary target '{t.name}' has nvalid entries in the provided " + f"dataframe. Allowed values are: {allowed}." + ) + + +def validate_parameter_input( + data: pd.DataFrame, + parameters: Sequence[Parameter], + numerical_measurements_must_be_within_tolerance: bool = False, +) -> None: + """Validate input dataframe columns corresponding to parameters. + + Args: + data: The input dataframe to be validated. + parameters: The allowed parameters. + numerical_measurements_must_be_within_tolerance: If ``True``, numerical + parameter values must match to parameter values within the + parameter-specific tolerance. + + Raises: + ValueError: If the input dataframe is empty. + ValueError: If a parameter contains NaN. + TypeError: If a parameter contains non-numeric values. + """ + if len(data) < 1: + raise ValueError("The provided input dataframe cannot be emtpy.") + + for p in parameters: + if data[p.name].isna().any(): + raise ValueError( + f"The parameter '{p.name}' has missing values or NaNs in the provided " + f"dataframe. Missing parameter values are not supported." + ) + if p.is_numerical and (data[p.name].dtype.kind not in "iufb"): + raise TypeError( + f"The numerical parameter '{p.name}' has non-numeric entries in the " + f"provided dataframe." + ) + + # Check if all rows have valid inputs matching allowed parameter values + for ind, row in data.iterrows(): + valid = True + if p.is_numerical: + if numerical_measurements_must_be_within_tolerance: + valid &= p.is_in_range(row[p.name]) + else: + valid &= p.is_in_range(row[p.name]) + if not valid: + raise ValueError( + f"Input data on row with the index {row.name} has invalid " + f"values in parameter '{p.name}'. " + f"For categorical parameters, values need to exactly match a " + f"valid choice defined in your config. " + f"For numerical parameters, a match is accepted only if " + f"the input value is within the specified tolerance/range. Set " + f"the flag 'numerical_measurements_must_be_within_tolerance' " + f"to 'False' to disable this behavior." + ) From a7728f76a5132db7158b0be9b4bb45baaaf5c1a8 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Fri, 3 Jan 2025 16:44:04 +0100 Subject: [PATCH 02/11] Fix simulation with empty initial data --- baybe/simulation/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baybe/simulation/core.py b/baybe/simulation/core.py index b28c8473a..5b6fa22e3 100644 --- a/baybe/simulation/core.py +++ b/baybe/simulation/core.py @@ -118,7 +118,7 @@ def simulate_experiment( campaign = deepcopy(campaign) # Add the initial data - if initial_data is not None: + if (initial_data is not None) and (len(initial_data) > 0): campaign.add_measurements(initial_data) # For impute_mode 'ignore', do not recommend space entries that are not From c8918017a9aaccf449bc71ef7f46d189222036b2 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Fri, 3 Jan 2025 16:44:20 +0100 Subject: [PATCH 03/11] Expand basic input output tests --- tests/test_input_output.py | 78 +++++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 26 deletions(-) diff --git a/tests/test_input_output.py b/tests/test_input_output.py index bda924b7c..7cfe5fc1c 100644 --- a/tests/test_input_output.py +++ b/tests/test_input_output.py @@ -10,44 +10,70 @@ from baybe.targets import NumericalTarget from baybe.utils.dataframe import add_fake_measurements -# List of tests that are expected to fail (still missing implementation etc) -param_xfails = [] -target_xfails = [] - @pytest.mark.parametrize( - "bad_val", - [1337, np.nan, "asd"], - ids=["not_within_tol", "nan", "string_instead_float"], + "bad_val, parameter_names", + [ + (1337, ["Num_disc_1"]), + (np.nan, ["Num_disc_1"]), + ("asd", ["Num_disc_1"]), + ("asd", ["Categorical_1"]), + (np.nan, ["Categorical_1"]), + (1337, ["Categorical_1"]), + ("asd", ["Custom_1"]), + (np.nan, ["Custom_1"]), + (1337, ["Custom_1"]), + ("asd", ["Task"]), + (np.nan, ["Task"]), + (1337, ["Task"]), + ], + ids=[ + "num_param_outside_tol", + "num_param_nan", + "num_param_str", + "cat_param_invalid_cat", + "cat_param_nan", + "cat_param_num", + "custom_param_invalid_cat", + "custom_param_nan", + "custom_param_num", + "task_param_invalid_cat", + "task_param_nan", + "task_param_num", + ], ) -def test_bad_parameter_input_value(campaign, good_reference_values, bad_val, request): +@pytest.mark.parametrize("n_grid_points", [5], ids=["g5"]) +def test_bad_parameter_input_value(campaign, bad_val): """Test attempting to read in an invalid parameter value.""" - if request.node.callspec.id in param_xfails: - pytest.xfail() - - rec = campaign.recommend(batch_size=3) - add_fake_measurements( - rec, - campaign.targets, - good_reference_values=good_reference_values, - ) + rec = campaign.recommend(batch_size=2) + add_fake_measurements(rec, campaign.targets) # Add an invalid value - rec.Num_disc_1.iloc[0] = bad_val + rec[campaign.parameters[0].name].iloc[0] = bad_val with pytest.raises((ValueError, TypeError)): campaign.add_measurements(rec) @pytest.mark.parametrize( - "bad_val", - [np.nan, "asd"], - ids=["nan", "string_instead_float"], + "bad_val, target_names", + [ + (np.nan, ["Target_max"]), + ("asd", ["Target_max"]), + (np.nan, ["Target_binary"]), + (1337, ["Target_binary"]), + ("asd", ["Target_binary"]), + ], + ids=[ + "num_target_nan", + "num_target_str", + "binary_target_nan", + "binary_target_num", + "binary_target_str", + ], ) -def test_bad_target_input_value(campaign, good_reference_values, bad_val, request): +@pytest.mark.parametrize("n_grid_points", [5], ids=["g5"]) +def test_bad_target_input_value(campaign, good_reference_values, bad_val): """Test attempting to read in an invalid target value.""" - if request.node.callspec.id in target_xfails: - pytest.xfail() - rec = campaign.recommend(batch_size=3) add_fake_measurements( rec, @@ -56,7 +82,7 @@ def test_bad_target_input_value(campaign, good_reference_values, bad_val, reques ) # Add an invalid value - rec.Target_max.iloc[0] = bad_val + rec[campaign.targets[0].name].iloc[0] = bad_val with pytest.raises((ValueError, TypeError)): campaign.add_measurements(rec) From e19b02dc53e0fd919d370f21148ba6a67be168b7 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Fri, 3 Jan 2025 17:33:38 +0100 Subject: [PATCH 04/11] Add test for invalid pending_experiments --- tests/test_pending_experiments.py | 59 +++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/tests/test_pending_experiments.py b/tests/test_pending_experiments.py index 961dd8a39..e05388b3f 100644 --- a/tests/test_pending_experiments.py +++ b/tests/test_pending_experiments.py @@ -2,6 +2,7 @@ import warnings +import numpy as np import pandas as pd import pytest from pytest import param @@ -175,3 +176,61 @@ def test_invalid_acqf(searchspace, recommender, objective, batch_size, acqf): measurements=rec1, pending_experiments=rec2, ) + + +@pytest.mark.parametrize( + "parameter_names, invalid_pending_value", + [ + (["Categorical_1", "Num_disc_1"], "asd"), + (["Categorical_1", "Num_disc_1"], 1337), + (["Categorical_1", "Num_disc_1"], np.nan), + (["Num_disc_1", "Num_disc_2"], "asd"), + (["Num_disc_1", "Num_disc_2"], np.nan), + (["Custom_1", "Num_disc_2"], "asd"), + (["Custom_1", "Num_disc_2"], 1337), + (["Custom_1", "Num_disc_2"], np.nan), + (["Task", "Num_disc_1"], "asd"), + (["Task", "Num_disc_1"], 1337), + (["Task", "Num_disc_1"], np.nan), + ], + ids=[ + "cat_param_invalid_value", + "cat_param_num", + "cat_param_nan", + "num_param_str", + "num_param_nan", + "custom_param_str", + "custom_param_num", + "custom_param_nan", + "task_param_invalid_value", + "task_param_num", + "task_param_nan", + ], +) +@pytest.mark.parametrize("n_grid_points", [5], ids=["g5"]) +@pytest.mark.parametrize("batch_size", [3], ids=["b3"]) +def test_invalid_input( + searchspace, + recommender, + objective, + batch_size, + invalid_pending_value, + parameter_names, +): + """Test exception raised for acqfs that don't support pending experiments.""" + # Get recommendation and add a fake results + rec1 = recommender.recommend(batch_size, searchspace, objective) + add_fake_measurements(rec1, objective.targets) + + # Create fake pending experiments + rec2 = rec1.copy() + rec2[parameter_names[0]] = invalid_pending_value + + with pytest.raises((ValueError, TypeError), match="parameter"): + recommender.recommend( + batch_size, + searchspace, + objective, + measurements=rec1, + pending_experiments=rec2, + ) From f2ff23fe456a3b9f77ece3eff1945de5c4a0ff1a Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Fri, 3 Jan 2025 17:33:52 +0100 Subject: [PATCH 05/11] Add pending_experiments validation --- baybe/recommenders/pure/bayesian/base.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/baybe/recommenders/pure/bayesian/base.py b/baybe/recommenders/pure/bayesian/base.py index bf44b53db..e5520b9eb 100644 --- a/baybe/recommenders/pure/bayesian/base.py +++ b/baybe/recommenders/pure/bayesian/base.py @@ -17,6 +17,7 @@ from baybe.searchspace import SearchSpace from baybe.surrogates import CustomONNXSurrogate, GaussianProcessSurrogate from baybe.surrogates.base import IndependentGaussianSurrogate, SurrogateProtocol +from baybe.utils.validation import validate_parameter_input @define @@ -123,6 +124,9 @@ def recommend( if isinstance(self._surrogate_model, CustomONNXSurrogate): CustomONNXSurrogate.validate_compatibility(searchspace) + if pending_experiments is not None: + validate_parameter_input(pending_experiments, searchspace.parameters) + self._setup_botorch_acqf( searchspace, objective, measurements, pending_experiments ) From 257f6b6d3b00950389c0f6a42250a73d82ab7528 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Mon, 6 Jan 2025 12:00:22 +0100 Subject: [PATCH 06/11] Fix docstring --- tests/test_pending_experiments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_pending_experiments.py b/tests/test_pending_experiments.py index e05388b3f..01004f7c2 100644 --- a/tests/test_pending_experiments.py +++ b/tests/test_pending_experiments.py @@ -217,7 +217,7 @@ def test_invalid_input( invalid_pending_value, parameter_names, ): - """Test exception raised for acqfs that don't support pending experiments.""" + """Test exception raised for invalid pending experiments input.""" # Get recommendation and add a fake results rec1 = recommender.recommend(batch_size, searchspace, objective) add_fake_measurements(rec1, objective.targets) From 70d0b851e79b1aa84419b986ad080be05cb70f05 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Mon, 6 Jan 2025 12:52:22 +0100 Subject: [PATCH 07/11] Add utility for creating fake input --- baybe/utils/dataframe.py | 49 +++++++++++++++++++++++++++++++ tests/test_input_output.py | 7 ++--- tests/test_pending_experiments.py | 24 +++++++-------- 3 files changed, 62 insertions(+), 18 deletions(-) diff --git a/baybe/utils/dataframe.py b/baybe/utils/dataframe.py index 048b40665..8d7142b16 100644 --- a/baybe/utils/dataframe.py +++ b/baybe/utils/dataframe.py @@ -278,6 +278,55 @@ def add_parameter_noise( return data +def create_fake_input( + parameters: Sequence[Parameter], + targets: Sequence[Target], + n_rows: int = 1, + **kwargs: dict, +) -> pd.DataFrame: + """Create fake valid input for :meth:`baybe.campaign.Campaign.add_measurements`. + + If noisy parameter values are desired, it is recommended to apply + :func:`baybe.utils.dataframe.add_parameter_noise` to the output of this function. + + Args: + parameters: The parameters. + targets: The targets. + n_rows: Number of desired rows. + **kwargs: Additional arguments to be passed to + :func:`baybe.utils.dataframe.add_fake_measurements`. + + Returns: + Dataframe corresponding to fake measurement input. + + Raises: + ValueError: If less than one row was requested. + """ + # Assert at least one fake entry is being generated + if n_rows < 1: + raise ValueError( + f"'{create_fake_input.__name__}' must at least create one row, but the " + f"requested number was: {n_rows}." + ) + + # Create fake parameter values from their definitions + content = {} + for p in parameters: + if p.is_discrete: + vals = np.random.choice(p.values, n_rows, replace=True) + else: + vals = np.random.uniform(p.bounds.lower, p.bounds.upper, n_rows) + + content[p.name] = vals + + data = pd.DataFrame.from_dict(content) + + # Add fake target values + add_fake_measurements(data, targets, **kwargs) + + return data + + def df_drop_single_value_columns( df: pd.DataFrame, lst_exclude: list = None ) -> pd.DataFrame: diff --git a/tests/test_input_output.py b/tests/test_input_output.py index 7cfe5fc1c..6fae6d98e 100644 --- a/tests/test_input_output.py +++ b/tests/test_input_output.py @@ -8,7 +8,7 @@ from baybe.recommenders import BotorchRecommender from baybe.searchspace import SearchSpace from baybe.targets import NumericalTarget -from baybe.utils.dataframe import add_fake_measurements +from baybe.utils.dataframe import add_fake_measurements, create_fake_input @pytest.mark.parametrize( @@ -43,10 +43,9 @@ ], ) @pytest.mark.parametrize("n_grid_points", [5], ids=["g5"]) -def test_bad_parameter_input_value(campaign, bad_val): +def test_bad_parameter_input_value(campaign, bad_val, batch_size): """Test attempting to read in an invalid parameter value.""" - rec = campaign.recommend(batch_size=2) - add_fake_measurements(rec, campaign.targets) + rec = create_fake_input(campaign.parameters, campaign.targets, batch_size) # Add an invalid value rec[campaign.parameters[0].name].iloc[0] = bad_val diff --git a/tests/test_pending_experiments.py b/tests/test_pending_experiments.py index 01004f7c2..9048bc559 100644 --- a/tests/test_pending_experiments.py +++ b/tests/test_pending_experiments.py @@ -19,7 +19,10 @@ TwoPhaseMetaRecommender, ) from baybe.utils.basic import get_subclasses -from baybe.utils.dataframe import add_fake_measurements, add_parameter_noise +from baybe.utils.dataframe import ( + add_parameter_noise, + create_fake_input, +) from baybe.utils.random import temporary_seed _discrete_params = ["Categorical_1", "Switch_1", "Num_disc_1"] @@ -116,9 +119,8 @@ def test_pending_points(campaign, batch_size): """Test there is no recommendation overlap if pending experiments are specified.""" warnings.filterwarnings("ignore", category=UnusedObjectWarning) - # Perform a fake first iteration - rec = campaign.recommend(batch_size) - add_fake_measurements(rec, campaign.targets) + # Add some initial measurements + rec = create_fake_input(campaign.parameters, campaign.targets, batch_size) campaign.add_measurements(rec) # Get recommendations and set them as pending experiments while getting another set @@ -160,11 +162,8 @@ def test_invalid_acqf(searchspace, recommender, objective, batch_size, acqf): recommender=BotorchRecommender(acquisition_function=acqf) ) - # Get recommendation and add a fake results - rec1 = recommender.recommend(batch_size, searchspace, objective) - add_fake_measurements(rec1, objective.targets) - - # Create fake pending experiments + # Create fake measurements and pending experiments + rec1 = create_fake_input(searchspace.parameters, objective.targets, batch_size) rec2 = rec1.copy() add_parameter_noise(rec2, searchspace.parameters) @@ -218,11 +217,8 @@ def test_invalid_input( parameter_names, ): """Test exception raised for invalid pending experiments input.""" - # Get recommendation and add a fake results - rec1 = recommender.recommend(batch_size, searchspace, objective) - add_fake_measurements(rec1, objective.targets) - - # Create fake pending experiments + # Create fake measurements and pending experiments + rec1 = create_fake_input(searchspace.parameters, objective.targets, batch_size) rec2 = rec1.copy() rec2[parameter_names[0]] = invalid_pending_value From 8cbd0b7eb26ff656e4f8e0e092ba37b66ca4ba22 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Mon, 6 Jan 2025 13:13:07 +0100 Subject: [PATCH 08/11] Add fixture for fake measurements --- tests/conftest.py | 18 +++++++++++++++++- tests/test_input_output.py | 10 ++++------ tests/test_pending_experiments.py | 27 ++++++++++++--------------- tests/test_surrogate.py | 10 +++------- 4 files changed, 36 insertions(+), 29 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index c82630cd4..c28f790bc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -77,7 +77,11 @@ ) from baybe.utils.basic import hilberts_factory from baybe.utils.boolean import strtobool -from baybe.utils.dataframe import add_fake_measurements, add_parameter_noise +from baybe.utils.dataframe import ( + add_fake_measurements, + add_parameter_noise, + create_fake_input, +) from baybe.utils.random import temporary_seed # Hypothesis settings @@ -164,6 +168,18 @@ def fixture_batch_size(request): return request.param +@pytest.fixture(name="n_fake_measurements") +def fixture_n_fake_measurements(batch_size): + """Number of rows for :func:`baybe.utils.dataframe.create_fake_input`.""" + return batch_size + + +@pytest.fixture(name="fake_measurements") +def fixture_fake_measurements(parameters, targets, batch_size): + """Artificially created valid measurements.""" + return create_fake_input(parameters, targets, batch_size) + + @pytest.fixture( params=[5, pytest.param(8, marks=pytest.mark.slow)], name="n_grid_points", diff --git a/tests/test_input_output.py b/tests/test_input_output.py index 6fae6d98e..584b809c1 100644 --- a/tests/test_input_output.py +++ b/tests/test_input_output.py @@ -8,7 +8,7 @@ from baybe.recommenders import BotorchRecommender from baybe.searchspace import SearchSpace from baybe.targets import NumericalTarget -from baybe.utils.dataframe import add_fake_measurements, create_fake_input +from baybe.utils.dataframe import add_fake_measurements @pytest.mark.parametrize( @@ -43,14 +43,12 @@ ], ) @pytest.mark.parametrize("n_grid_points", [5], ids=["g5"]) -def test_bad_parameter_input_value(campaign, bad_val, batch_size): +def test_bad_parameter_input_value(campaign, bad_val, fake_measurements): """Test attempting to read in an invalid parameter value.""" - rec = create_fake_input(campaign.parameters, campaign.targets, batch_size) - # Add an invalid value - rec[campaign.parameters[0].name].iloc[0] = bad_val + fake_measurements[campaign.parameters[0].name].iloc[0] = bad_val with pytest.raises((ValueError, TypeError)): - campaign.add_measurements(rec) + campaign.add_measurements(fake_measurements) @pytest.mark.parametrize( diff --git a/tests/test_pending_experiments.py b/tests/test_pending_experiments.py index 9048bc559..59b122756 100644 --- a/tests/test_pending_experiments.py +++ b/tests/test_pending_experiments.py @@ -21,7 +21,6 @@ from baybe.utils.basic import get_subclasses from baybe.utils.dataframe import ( add_parameter_noise, - create_fake_input, ) from baybe.utils.random import temporary_seed @@ -115,13 +114,12 @@ ], ) @pytest.mark.parametrize("n_grid_points", [8], ids=["grid8"]) -def test_pending_points(campaign, batch_size): +def test_pending_points(campaign, batch_size, fake_measurements): """Test there is no recommendation overlap if pending experiments are specified.""" warnings.filterwarnings("ignore", category=UnusedObjectWarning) # Add some initial measurements - rec = create_fake_input(campaign.parameters, campaign.targets, batch_size) - campaign.add_measurements(rec) + campaign.add_measurements(fake_measurements) # Get recommendations and set them as pending experiments while getting another set # Fix the random seed for each recommend call to limit influence of randomness in @@ -156,24 +154,23 @@ def test_pending_points(campaign, batch_size): ) @pytest.mark.parametrize("n_grid_points", [5], ids=["g5"]) @pytest.mark.parametrize("batch_size", [3], ids=["b3"]) -def test_invalid_acqf(searchspace, recommender, objective, batch_size, acqf): +def test_invalid_acqf(searchspace, objective, batch_size, acqf, fake_measurements): """Test exception raised for acqfs that don't support pending experiments.""" recommender = TwoPhaseMetaRecommender( recommender=BotorchRecommender(acquisition_function=acqf) ) # Create fake measurements and pending experiments - rec1 = create_fake_input(searchspace.parameters, objective.targets, batch_size) - rec2 = rec1.copy() - add_parameter_noise(rec2, searchspace.parameters) + fake_pending_experiments = fake_measurements.copy() + add_parameter_noise(fake_pending_experiments, searchspace.parameters) with pytest.raises(IncompatibleAcquisitionFunctionError): recommender.recommend( batch_size, searchspace, objective, - measurements=rec1, - pending_experiments=rec2, + measurements=fake_measurements, + pending_experiments=fake_pending_experiments, ) @@ -215,18 +212,18 @@ def test_invalid_input( batch_size, invalid_pending_value, parameter_names, + fake_measurements, ): """Test exception raised for invalid pending experiments input.""" # Create fake measurements and pending experiments - rec1 = create_fake_input(searchspace.parameters, objective.targets, batch_size) - rec2 = rec1.copy() - rec2[parameter_names[0]] = invalid_pending_value + fake_pending_experiments = fake_measurements.copy() + fake_pending_experiments[parameter_names[0]] = invalid_pending_value with pytest.raises((ValueError, TypeError), match="parameter"): recommender.recommend( batch_size, searchspace, objective, - measurements=rec1, - pending_experiments=rec2, + measurements=fake_measurements, + pending_experiments=fake_pending_experiments, ) diff --git a/tests/test_surrogate.py b/tests/test_surrogate.py index 463725443..c0a44a968 100644 --- a/tests/test_surrogate.py +++ b/tests/test_surrogate.py @@ -2,25 +2,21 @@ from unittest.mock import patch -from baybe.recommenders.pure.nonpredictive.sampling import RandomRecommender from baybe.surrogates.gaussian_process.core import GaussianProcessSurrogate -from baybe.utils.dataframe import add_fake_measurements @patch.object(GaussianProcessSurrogate, "_fit") -def test_caching(patched, searchspace, objective): +def test_caching(patched, searchspace, objective, fake_measurements): """A second fit call with the same context does not trigger retraining.""" # Prepare the setting - measurements = RandomRecommender().recommend(3, searchspace, objective) - add_fake_measurements(measurements, objective.targets) surrogate = GaussianProcessSurrogate() # First call - surrogate.fit(searchspace, objective, measurements) + surrogate.fit(searchspace, objective, fake_measurements) patched.assert_called() patched.reset_mock() # Second call - surrogate.fit(searchspace, objective, measurements) + surrogate.fit(searchspace, objective, fake_measurements) patched.assert_not_called() From f038018f2ae318431c5d33d25caaacee47395f88 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Mon, 6 Jan 2025 11:49:04 +0100 Subject: [PATCH 09/11] Update type hints --- baybe/utils/dataframe.py | 8 ++++---- baybe/utils/validation.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/baybe/utils/dataframe.py b/baybe/utils/dataframe.py index 8d7142b16..5cc3cda98 100644 --- a/baybe/utils/dataframe.py +++ b/baybe/utils/dataframe.py @@ -4,7 +4,7 @@ import functools import logging -from collections.abc import Callable, Collection, Iterable, Sequence +from collections.abc import Callable, Iterable, Sequence from typing import TYPE_CHECKING, Literal, TypeVar, overload import numpy as np @@ -70,7 +70,7 @@ def to_tensor(*x: np.ndarray | pd.DataFrame) -> Tensor | tuple[Tensor, ...]: def add_fake_measurements( data: pd.DataFrame, - targets: Collection[Target], + targets: Iterable[Target], good_reference_values: dict[str, list] | None = None, good_intervals: dict[str, tuple[float, float]] | None = None, bad_intervals: dict[str, tuple[float, float]] | None = None, @@ -279,8 +279,8 @@ def add_parameter_noise( def create_fake_input( - parameters: Sequence[Parameter], - targets: Sequence[Target], + parameters: Iterable[Parameter], + targets: Iterable[Target], n_rows: int = 1, **kwargs: dict, ) -> pd.DataFrame: diff --git a/baybe/utils/validation.py b/baybe/utils/validation.py index 8c9037f6d..84b47ad2f 100644 --- a/baybe/utils/validation.py +++ b/baybe/utils/validation.py @@ -3,7 +3,7 @@ from __future__ import annotations import math -from collections.abc import Callable, Sequence +from collections.abc import Callable, Iterable from typing import TYPE_CHECKING, Any import pandas as pd @@ -75,7 +75,7 @@ def validator(self: Any, attribute: Attribute, value: Any) -> None: """Validator for non-infinite floats.""" -def validate_target_input(data: pd.DataFrame, targets: Sequence[Target]) -> None: +def validate_target_input(data: pd.DataFrame, targets: Iterable[Target]) -> None: """Validate input dataframe columns corresponding to targets. Args: @@ -119,7 +119,7 @@ def validate_target_input(data: pd.DataFrame, targets: Sequence[Target]) -> None def validate_parameter_input( data: pd.DataFrame, - parameters: Sequence[Parameter], + parameters: Iterable[Parameter], numerical_measurements_must_be_within_tolerance: bool = False, ) -> None: """Validate input dataframe columns corresponding to parameters. From c71fe09b3cdf8bf6e1a1e6191b420c3d2ed49c7b Mon Sep 17 00:00:00 2001 From: Martin Fitzner <17951239+Scienfitz@users.noreply.github.com> Date: Mon, 6 Jan 2025 11:53:16 +0100 Subject: [PATCH 10/11] Improve text Co-authored-by: AdrianSosic --- baybe/utils/validation.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/baybe/utils/validation.py b/baybe/utils/validation.py index 84b47ad2f..84a2108c1 100644 --- a/baybe/utils/validation.py +++ b/baybe/utils/validation.py @@ -92,28 +92,26 @@ def validate_target_input(data: pd.DataFrame, targets: Iterable[Target]) -> None from baybe.targets import BinaryTarget, NumericalTarget if len(data) < 1: - raise ValueError("The provided input dataframe cannot be emtpy.") + raise ValueError("The provided input dataframe cannot be empty.") for t in targets: if data[t.name].isna().any(): raise ValueError( - f"The target '{t.name}' has missing values or NaNs in the provided " - f"dataframe. Missing target values are not supported." + f"The target '{t.name}' has missing values in the provided dataframe." ) if isinstance(t, NumericalTarget): if data[t.name].dtype.kind not in "iufb": raise TypeError( f"The numerical target '{t.name}' has non-numeric entries in the " - f"provided dataframe. Non-numeric target values are not supported." + f"provided dataframe." ) elif isinstance(t, BinaryTarget): - if not ( - data[t.name].isin(allowed := [t.failure_value, t.success_value]).all() - ): + allowed = {t.failure_value, t.success_value} + if invalid := set(data[t.name].unique()) - allowed: raise ValueError( - f"The binary target '{t.name}' has nvalid entries in the provided " - f"dataframe. Allowed values are: {allowed}." + f"The binary target '{t.name}' has invalid entries {invalid} " + f"in the provided dataframe. Allowed values are: {allowed}." ) @@ -137,13 +135,13 @@ def validate_parameter_input( TypeError: If a parameter contains non-numeric values. """ if len(data) < 1: - raise ValueError("The provided input dataframe cannot be emtpy.") + raise ValueError("The provided input dataframe cannot be empty.") for p in parameters: if data[p.name].isna().any(): raise ValueError( - f"The parameter '{p.name}' has missing values or NaNs in the provided " - f"dataframe. Missing parameter values are not supported." + f"The parameter '{p.name}' has missing values in the provided " + f"dataframe." ) if p.is_numerical and (data[p.name].dtype.kind not in "iufb"): raise TypeError( From 969dea45363057f8c87842d164dd7448d0e5a255 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Fri, 10 Jan 2025 16:32:16 +0100 Subject: [PATCH 11/11] Add note --- baybe/utils/dataframe.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/baybe/utils/dataframe.py b/baybe/utils/dataframe.py index 5cc3cda98..ea08a4317 100644 --- a/baybe/utils/dataframe.py +++ b/baybe/utils/dataframe.py @@ -485,6 +485,11 @@ def fuzzy_row_match( Raises: ValueError: If some rows are present in the right but not in the left dataframe. + + Note: + This function assumes that the dataframes contain only allowed values as + specified in the parameter objects. No further validation to assert this is + done. """ # Assert that all parameters appear in the given dataframe if not all(col in right_df.columns for col in left_df.columns):