diff --git a/src/f3dasm/_src/design/domain.py b/src/f3dasm/_src/design/domain.py index 0fe9e747..62d71c3b 100644 --- a/src/f3dasm/_src/design/domain.py +++ b/src/f3dasm/_src/design/domain.py @@ -35,8 +35,16 @@ # ============================================================================= +class _Columns: + names: List[str] + + class _Data: data: pd.DataFrame + columns: _Columns + + def to_dataframe() -> pd.DataFrame: + ... @dataclass @@ -165,7 +173,6 @@ def from_dataframe(cls, df: pd.DataFrame) -> Domain: Domain Domain object """ - # TODO : If lower_bound and upper_bound are similar, then it is a constant parameter space = {} for name, type in df.dtypes.items(): if type == 'float64': @@ -188,7 +195,7 @@ def from_dataframe(cls, df: pd.DataFrame) -> Domain: @classmethod def from_data(cls: Type[Domain], data: _Data) -> Domain: - return cls.from_dataframe(data.data) + return cls.from_dataframe(data.to_dataframe()) # Export # ============================================================================= diff --git a/src/f3dasm/_src/experimentdata/_data.py b/src/f3dasm/_src/experimentdata/_data.py index abc85881..acf6b28a 100644 --- a/src/f3dasm/_src/experimentdata/_data.py +++ b/src/f3dasm/_src/experimentdata/_data.py @@ -4,6 +4,7 @@ from __future__ import annotations # Standard +from copy import deepcopy from pathlib import Path from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Type @@ -25,12 +26,52 @@ # ============================================================================= +class _Columns: + def __init__(self, columns: Optional[Dict[str, bool]] = None): + if columns is None: + columns = {} + + self.columns: Dict[str, bool] = columns + + def __repr__(self) -> str: + return self.columns.__repr__() + + @property + def names(self) -> List[str]: + return list(self.columns.keys()) + + def is_disk(self, name: str) -> bool: + return self.columns[name] + + def add(self, name: str, is_disk: bool = False): + self.columns[name] = is_disk + + def remove(self, name: str): + del self.columns[name] + + def iloc(self, name: str | List[str]) -> List[int]: + if isinstance(name, str): + name = [name] + + _indices = [] + for n in name: + _indices.append(self.names.index(n)) + return _indices + + def replace_key(self, old_name: str, new_name: str): + self.columns[new_name] = self.columns.pop(old_name) + class _Data: - def __init__(self, data: Optional[pd.DataFrame] = None): + def __init__(self, data: Optional[pd.DataFrame] = None, + columns: Optional[_Columns] = None): if data is None: data = pd.DataFrame() - self.data: pd.DataFrame = data + if columns is None: + columns = _Columns({col: False for col in data.columns}) + + self.columns: _Columns = columns + self.data = data.rename(columns={name: i for i, name in enumerate(data.columns)}) def __len__(self): """The len() method returns the number of datapoints""" @@ -84,18 +125,18 @@ def __add__(self, other: _Data | Dict[str, Any]) -> _Data: try: last_index = self.data.index[-1] except IndexError: # Empty DataFrame - return _Data(other.data.copy()) # Make a copy of other.data + return _Data(data=other.data.copy(), columns=other.columns) # Make a copy of other.data # Make a copy of other.data and modify its index other_data_copy = other.data.copy() other_data_copy.index = other_data_copy.index + last_index + 1 - return _Data(pd.concat([self.data, other_data_copy])) + return _Data(pd.concat([self.data, other_data_copy]), columns=self.columns) def __eq__(self, __o: _Data) -> bool: return self.data.equals(__o.data) def _repr_html_(self) -> str: - return self.data._repr_html_() + return self.to_dataframe()._repr_html_() # Properties # ============================================================================= @@ -106,7 +147,7 @@ def indices(self) -> pd.Index: @property def names(self) -> List[str]: - return self.data.columns.to_list() + return self.columns.names # Alternative constructors # ============================================================================= @@ -139,16 +180,17 @@ def from_domain(cls, domain: Domain) -> _Data: ------- _description_ """ - df = pd.DataFrame(columns=domain.names).astype( - domain._cast_types_dataframe() - ) + _dtypes = {index: parameter._type for index, (_, parameter) in enumerate(domain.space.items())} + + df = pd.DataFrame(columns=range(len(domain))).astype(_dtypes) # Set the categories tot the categorical parameters - for name, categorical_input in domain.get_categorical_parameters().items(): - df[name] = pd.Categorical( - df[name], categories=categorical_input.categories) + for index, (name, categorical_input) in enumerate(domain.get_categorical_parameters().items()): + df[index] = pd.Categorical( + df[index], categories=categorical_input.categories) - return cls(df) + _columns = {name: False for name in domain.names} + return cls(df, columns=_Columns(_columns)) @classmethod def from_file(cls, filename: Path | str) -> _Data: @@ -160,7 +202,10 @@ def from_file(cls, filename: Path | str) -> _Data: The filename to load the data from. """ file = Path(filename).with_suffix('.csv') - return cls(pd.read_csv(file, header=0, index_col=0)) + df = pd.read_csv(file, header=0, index_col=0) + _columns = {name: False for name in df.columns.to_list()} + df.columns = range(df.columns.size) # Reset the columns to be consistent + return cls(df, columns=_Columns(_columns)) @classmethod def from_numpy(cls: Type[_Data], array: np.ndarray) -> _Data: @@ -183,7 +228,8 @@ def from_dataframe(cls, dataframe: pd.DataFrame) -> _Data: dataframe : pd.DataFrame The dataframe to load the data from. """ - return cls(dataframe) + _columns = {name: False for name in dataframe.columns.to_list()} + return cls(dataframe, columns=_Columns(_columns)) def reset(self, domain: Optional[Domain] = None): """Resets the data to the initial state. @@ -197,11 +243,13 @@ def reset(self, domain: Optional[Domain] = None): ---- If the domain is None, the data will be reset to an empty dataframe. """ + if domain is None: self.data = pd.DataFrame() - return - - self.data = self.from_domain(domain).data + self.columns = _Columns() + else: + self.data = self.from_domain(domain).data + self.columns = self.from_domain(domain).columns # Export # ============================================================================= @@ -230,7 +278,7 @@ def to_xarray(self, label: str) -> xr.DataArray: xarray DataArray with the data. """ return xr.DataArray(self.data, dims=['iterations', label], coords={ - 'iterations': range(len(self)), label: self.names}) + 'iterations': self.indices, label: self.names}) def to_dataframe(self) -> pd.DataFrame: """Export the _Data object to a pandas DataFrame. @@ -240,7 +288,9 @@ def to_dataframe(self) -> pd.DataFrame: pd.DataFrame pandas dataframe with the data. """ - return self.data + df = deepcopy(self.data) + df.columns = self.names + return df def combine_data_to_multiindex(self, other: _Data, jobs_df: pd.DataFrame) -> pd.DataFrame: """Combine the data to a multiindex dataframe. @@ -262,7 +312,8 @@ def combine_data_to_multiindex(self, other: _Data, jobs_df: pd.DataFrame) -> pd. This function is mainly used to show the combined ExperimentData object in a Jupyter Notebook """ - return pd.concat([jobs_df, self.data, other.data], axis=1, keys=['jobs', 'input', 'output']) + return pd.concat([jobs_df, self.to_dataframe(), + other.to_dataframe()], axis=1, keys=['jobs', 'input', 'output']) def store(self, filename: Path) -> None: """Stores the data to a file. @@ -272,7 +323,7 @@ def store(self, filename: Path) -> None: filename : Path The filename to store the data to. """ - self.data.to_csv(filename.with_suffix('.csv')) + self.to_dataframe().to_csv(filename.with_suffix('.csv')) def n_best_samples(self, nosamples: int, column_name: List[str] | str) -> pd.DataFrame: """Returns the n best samples. We consider to be lower values better. @@ -289,7 +340,7 @@ def n_best_samples(self, nosamples: int, column_name: List[str] | str) -> pd.Dat pd.DataFrame The n best samples. """ - return self.data.nsmallest(n=nosamples, columns=column_name) + return self.data.nsmallest(n=nosamples, columns=self.columns.iloc(column_name)) def select_columns(self, columns: Iterable[str] | str) -> _Data: """Filter the data on the selected columns. @@ -307,7 +358,8 @@ def select_columns(self, columns: Iterable[str] | str) -> _Data: # This is necessary otherwise self.data[columns] will be a Series if isinstance(columns, str): columns = [columns] - return _Data(self.data[columns]) + _selected_columns = _Columns({column: self.columns.columns[column] for column in columns}) + return _Data(self.data[self.columns.iloc(columns)], columns=_selected_columns) # Append and remove data # ============================================================================= @@ -336,7 +388,13 @@ def add_empty_rows(self, number_of_rows: int): self.data = pd.concat([self.data, empty_data], ignore_index=False) def add_column(self, name: str): - self.data[name] = np.nan + if self.data.columns.empty: + new_columns_index = 0 + else: + new_columns_index = self.data.columns[-1] + 1 + + self.columns.add(name) + self.data[new_columns_index] = np.nan def fill_numpy_arrays(self, array: np.ndarray) -> Iterable[int]: # get the indices of the nan values @@ -351,7 +409,7 @@ def remove(self, indices: List[int]): # ============================================================================= def get_data_dict(self, index: int) -> Dict[str, Any]: - return self.data.loc[index].to_dict() + return self.to_dataframe().loc[index].to_dict() def set_data(self, index: int, value: Any, column: Optional[str] = None): # check if the index exists @@ -360,12 +418,18 @@ def set_data(self, index: int, value: Any, column: Optional[str] = None): if column is None: self.data.loc[index] = value - else: - try: - self.data.at[index, column] = value - except ValueError: - self.data = self.data.astype(object) - self.data.at[index, column] = value + return + + elif column not in self.columns.names: + # TODO this is_disk value needs to be provided by set_data call + self.columns.add(column, is_disk=False) + + _column_index = self.columns.iloc(column)[0] + try: + self.data.at[index, _column_index] = value + except ValueError: + self.data = self.data.astype(object) + self.data.at[index, _column_index] = value def reset_index(self) -> None: """Reset the index of the data.""" @@ -379,7 +443,8 @@ def has_columnnames(self, names: Iterable[str]) -> bool: return set(names).issubset(self.names) def set_columnnames(self, names: Iterable[str]) -> None: - self.data.columns = names + for old_name, new_name in zip(self.names, names): + self.columns.replace_key(old_name, new_name) def _convert_dict_to_data(dictionary: Dict[str, Any]) -> _Data: @@ -395,4 +460,6 @@ def _convert_dict_to_data(dictionary: Dict[str, Any]) -> _Data: _Data The data object. """ - return _Data(pd.DataFrame(dictionary, index=[0]).copy()) + _columns = {name: False for name in dictionary.keys()} + df = pd.DataFrame(dictionary, index=[0]).copy() + return _Data(data=df, columns=_Columns(_columns)) diff --git a/src/f3dasm/_src/experimentdata/experimentdata.py b/src/f3dasm/_src/experimentdata/experimentdata.py index 4e5a5f39..0d0c3d7d 100644 --- a/src/f3dasm/_src/experimentdata/experimentdata.py +++ b/src/f3dasm/_src/experimentdata/experimentdata.py @@ -495,8 +495,8 @@ def _add_experiments(self, experiment_sample: ExperimentSample | ExperimentData) # Apparently you need to cast the types again # TODO: Breaks if values are NaN or infinite - self.input_data.data = self.input_data.data.astype( - self.domain._cast_types_dataframe()) + _dtypes = {index: parameter._type for index, (_, parameter) in enumerate(self.domain.space.items())} + self.input_data.data = self.input_data.data.astype(_dtypes) def add_input_parameter(self, name: str, parameter: Parameter) -> None: """Add a new input column to the ExperimentData object. diff --git a/tests/design/test_data.py b/tests/design/test_data.py index 35d8a299..495a536f 100644 --- a/tests/design/test_data.py +++ b/tests/design/test_data.py @@ -70,7 +70,8 @@ def test_data_get_inputdata_dict(sample_data: _Data): def test_data_set_data(sample_data: _Data): index = 0 sample_data.set_data(index, 15, 'output1') - assert sample_data.data.loc[index, 'output1'] == 15 + _column_index = sample_data.columns.iloc('output1')[0] + assert sample_data.data.loc[index, _column_index] == 15 def test_data_to_numpy(sample_data: _Data): diff --git a/tests/experimentdata/test_experimentdata.py b/tests/experimentdata/test_experimentdata.py index 79422f34..364c82b9 100644 --- a/tests/experimentdata/test_experimentdata.py +++ b/tests/experimentdata/test_experimentdata.py @@ -467,7 +467,7 @@ def mock_pd_read_pickle(*args, **kwargs): assert experiment_data == experimentdata_expected -@pytest.mark.parametrize("input_data", [path_input, str_input, pd_input(), data_input(), numpy_input()]) +@pytest.mark.parametrize("input_data", [pd_input(), path_input, str_input, data_input(), numpy_input()]) @pytest.mark.parametrize("output_data", [None]) @pytest.mark.parametrize("domain", [make_nd_continuous_domain(bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), dimensionality=3), None, path_domain, str_domain]) @@ -545,8 +545,8 @@ def mock_pd_read_pickle(*args, **kwargs): assert experiment_data.jobs == experimentdata_expected_no_output.jobs assert experiment_data == experimentdata_expected_no_output - - + + @pytest.mark.parametrize("input_data", [None]) @pytest.mark.parametrize("output_data", [None]) @pytest.mark.parametrize("domain", [make_nd_continuous_domain(bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), @@ -656,6 +656,7 @@ def test_evaluate_mode(mode: str, experimentdata_continuous: ExperimentData, tmp experimentdata_continuous.evaluate("ackley", mode=mode, kwargs={ "scale_bounds": np.array([[0., 1.], [0., 1.], [0., 1.]]), 'seed': SEED}) + def test_get_input_data(experimentdata_expected_no_output: ExperimentData): input_data = experimentdata_expected_no_output.get_input_data() df, _ = input_data.to_pandas() @@ -672,12 +673,14 @@ def test_get_input_data_selection(experimentdata_expected_no_output: ExperimentD selected_pd = pd_input()[selection] pd.testing.assert_frame_equal(df, selected_pd) + def test_get_output_data(experimentdata_expected: ExperimentData): output_data = experimentdata_expected.get_output_data() _, df = output_data.to_pandas() pd.testing.assert_frame_equal(df, pd_output()) assert experimentdata_expected.output_data == output_data.output_data + @pytest.mark.parametrize("selection", ["y", ["y"]]) def test_get_output_data_selection(experimentdata_expected: ExperimentData, selection: Iterable[str] | str): output_data = experimentdata_expected.get_output_data(selection) @@ -687,6 +690,7 @@ def test_get_output_data_selection(experimentdata_expected: ExperimentData, sele selected_pd = pd_output()[selection] pd.testing.assert_frame_equal(df, selected_pd) + def test_iter_behaviour(experimentdata_continuous: ExperimentData): for i in experimentdata_continuous: assert isinstance(i, ExperimentSample) @@ -695,5 +699,6 @@ def test_iter_behaviour(experimentdata_continuous: ExperimentData): for i in selected_experimentdata: assert isinstance(i, ExperimentSample) + if __name__ == "__main__": # pragma: no cover pytest.main() diff --git a/tests/sampling/test_sampling.py b/tests/sampling/test_sampling.py index d02011ab..13a45645 100644 --- a/tests/sampling/test_sampling.py +++ b/tests/sampling/test_sampling.py @@ -35,15 +35,14 @@ def test_correct_sampling_ran(design3: Domain): ] ) - columnnames = ["x1", "x2", "x3", "x4", "x5"] - df_ground_truth = pd.DataFrame(data=ground_truth_samples, columns=columnnames) + df_ground_truth = pd.DataFrame(data=ground_truth_samples) df_ground_truth = df_ground_truth.astype( { - "x1": "float", - "x2": "int", - "x3": "float", - "x4": "category", - "x5": "float", + 0: "float", + 1: "int", + 2: "float", + 3: "category", + 4: "float", } ) @@ -69,15 +68,14 @@ def test_correct_sampling_sobol(design3: Domain): ] ) - columnnames = ["x1", "x2", "x3", "x4", "x5"] - df_ground_truth = pd.DataFrame(data=ground_truth_samples, columns=columnnames) + df_ground_truth = pd.DataFrame(data=ground_truth_samples) df_ground_truth = df_ground_truth.astype( { - "x1": "float", - "x2": "int", - "x3": "float", - "x4": "category", - "x5": "float", + 0: "float", + 1: "int", + 2: "float", + 3: "category", + 4: "float", } ) @@ -102,15 +100,14 @@ def test_correct_sampling_lhs(design3: Domain): ] ) - columnnames = ["x1", "x2", "x3", "x4", "x5"] - df_ground_truth = pd.DataFrame(data=ground_truth_samples, columns=columnnames) + df_ground_truth = pd.DataFrame(data=ground_truth_samples) df_ground_truth = df_ground_truth.astype( { - "x1": "float", - "x2": "int", - "x3": "float", - "x4": "category", - "x5": "float", + 0: "float", + 1: "int", + 2: "float", + 3: "category", + 4: "float", } )