From 665d1a404f9b7dcebbf35216e0d543c9d2bc8cf0 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Thu, 16 Jan 2025 13:54:15 +0100 Subject: [PATCH 1/4] Maintain row order after cross join --- CHANGELOG.md | 2 ++ baybe/searchspace/discrete.py | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d535a07ead..1970ecaecb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -52,6 +52,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 `SingleTargetObjective` no longer erroneously maximizes it - Improvement-based Monte Carlo acquisition functions now use the correct reference value in minimization mode +- The `polars` cross join for the Cartesian product now explicitly maintains row order, + yielding an equivalent result to the `pandas` version and fixing tests for `>=1.19.0` ### Removed - `botorch_function_wrapper` utility for creating lookup callables diff --git a/baybe/searchspace/discrete.py b/baybe/searchspace/discrete.py index bddd4461df..984d05c705 100644 --- a/baybe/searchspace/discrete.py +++ b/baybe/searchspace/discrete.py @@ -792,7 +792,12 @@ def parameter_cartesian_prod_polars(parameters: Sequence[Parameter]) -> pl.LazyF # Cross-join parameters res = param_frames[0] for frame in param_frames[1:]: - res = res.join(frame, how="cross", force_parallel=True) + # Note: We enforce row order to be consistent with the pandas output, + # which simplifies testing. If speed ever becomes problematic, this + # restriction could be removed. + res = res.join( + frame, how="cross", force_parallel=True, maintain_order="left_right" + ) return res From fbc5d290143467808e9ee0a4f79f228eb62841dc Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Fri, 24 Jan 2025 08:38:35 +0100 Subject: [PATCH 2/4] Revert "Maintain row order after cross join" For performance reasons, we instead accept the different order and adjust out tests accordingly. --- CHANGELOG.md | 2 -- baybe/searchspace/discrete.py | 7 +------ 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1970ecaecb..d535a07ead 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -52,8 +52,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 `SingleTargetObjective` no longer erroneously maximizes it - Improvement-based Monte Carlo acquisition functions now use the correct reference value in minimization mode -- The `polars` cross join for the Cartesian product now explicitly maintains row order, - yielding an equivalent result to the `pandas` version and fixing tests for `>=1.19.0` ### Removed - `botorch_function_wrapper` utility for creating lookup callables diff --git a/baybe/searchspace/discrete.py b/baybe/searchspace/discrete.py index 984d05c705..bddd4461df 100644 --- a/baybe/searchspace/discrete.py +++ b/baybe/searchspace/discrete.py @@ -792,12 +792,7 @@ def parameter_cartesian_prod_polars(parameters: Sequence[Parameter]) -> pl.LazyF # Cross-join parameters res = param_frames[0] for frame in param_frames[1:]: - # Note: We enforce row order to be consistent with the pandas output, - # which simplifies testing. If speed ever becomes problematic, this - # restriction could be removed. - res = res.join( - frame, how="cross", force_parallel=True, maintain_order="left_right" - ) + res = res.join(frame, how="cross", force_parallel=True) return res From c7e3f4752cfdb92f6884ebb5a2cbad0af390e551 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Fri, 24 Jan 2025 08:47:04 +0100 Subject: [PATCH 3/4] Ignore row order in equality check --- CHANGELOG.md | 2 ++ tests/constraints/test_constraints_polars.py | 10 +++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d535a07ead..fd806e1f1b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 compatible with `strtobool` - All arguments to `MetaRecommender.select_recommender` are now optional - `MetaRecommender`s can now be composed of other `MetaRecommender`s +- For performance reasons, search space manipulation using `polars` is no longer + guaranteed to produce the same row order as the corresponding `pandas` operations ### Fixed - Rare bug arising from degenerate `SubstanceParameter.comp_df` rows that caused diff --git a/tests/constraints/test_constraints_polars.py b/tests/constraints/test_constraints_polars.py index 48a5865f75..62f6cfb03e 100644 --- a/tests/constraints/test_constraints_polars.py +++ b/tests/constraints/test_constraints_polars.py @@ -189,7 +189,7 @@ def test_polars_product(constraints, parameters): # Do Pandas product df_pd = parameter_cartesian_prod_pandas(parameters) - # Assert equality of lengths before filtering + # Assert equality before filtering assert_frame_equal(df_pl.to_pandas(), df_pd) # Apply constraints @@ -198,5 +198,9 @@ def test_polars_product(constraints, parameters): _apply_constraint_filter_polars(ldf, constraints)[0].collect().to_pandas() ) - # Assert strict equality of two dataframes - assert_frame_equal(df_pl_filtered, df_pd_filtered) + # Assert order-agnostic equality of the two dataframes + cols = df_pd_filtered.columns.tolist() + assert_frame_equal( + df_pd_filtered.sort_values(cols).reset_index(drop=True), + df_pl_filtered.sort_values(cols).reset_index(drop=True), + ) From 63fecc915810495f8ead3b14f9dee0ffec3dc859 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Fri, 24 Jan 2025 08:54:20 +0100 Subject: [PATCH 4/4] Add admonition on row order to user guide --- docs/userguide/envvars.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/userguide/envvars.md b/docs/userguide/envvars.md index 836ddf8c72..260290e634 100644 --- a/docs/userguide/envvars.md +++ b/docs/userguide/envvars.md @@ -85,6 +85,12 @@ changing the Python environment. To do so, you can set the environment variable `BAYBE_DEACTIVATE_POLARS` to any truthy value accepted by [`strtobool`](baybe.utils.boolean.strtobool). +```{admonition} Row Order +:class: caution + +For performance reasons, search space manipulation using `polars` is not +guaranteed to produce the same row order as the corresponding `pandas` operations. +``` ## Disk Caching For some components, such as the