Remove columns with duplicated names

Scienfitz · Scienfitz · commit d49ebbf8f80e · 2025-02-01T16:39:57.000+01:00
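This is a safety net for the case where an encoding produces duplicated column names (most likely due to a bug in the encoding computation). When that happens, a warning is emitted and only the first column of each duplicated name is kept.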
diff --git a/baybe/parameters/substance.py b/baybe/parameters/substance.py
@@ -1,6 +1,7 @@
 """Substance parameters."""
 
 import gc
+import warnings
 from functools import cached_property
 from typing import Any, ClassVar
 
@@ -140,8 +141,17 @@ def comp_df(self) -> pd.DataFrame:
             kwargs_fingerprint=self.kwargs_fingerprint,
         )
 
-        # Drop NaN and constant columns
+        # Drop NaN, constant columns and columns with duplicated names
         comp_df = comp_df.loc[:, ~comp_df.isna().any(axis=0)]
+        mask_duplicated_columns = comp_df.columns.duplicated()
+        if any(mask_duplicated_columns):
+            warnings.warn(
+                f"There were duplicated column names for the substance parameter "
+                f"{self.name} with encoding {self.encoding.name}. This could indicate "
+                f"bugs with the encoding computation. The duplicated columns will be "
+                f"dropped."
+            )
+            comp_df = comp_df.loc[:, ~mask_duplicated_columns]
         comp_df = df_drop_single_value_columns(comp_df)
 
         # Label the rows with the molecule names