Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mpvanderschelling/issue208 #211

Merged
merged 3 commits into from
Oct 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions src/f3dasm/_src/design/domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,16 @@
# =============================================================================


class _Columns:
    # Structural type stub mirroring the _Columns class defined in
    # experimentdata/_data.py; only the attribute this module's type hints
    # actually use is declared.
    # NOTE(review): presumably kept as a stub to avoid a circular import —
    # confirm against src/f3dasm/_src/experimentdata/_data.py.
    names: List[str]


class _Data:
data: pd.DataFrame
columns: _Columns

def to_dataframe() -> pd.DataFrame:
...


@dataclass
Expand Down Expand Up @@ -165,7 +173,6 @@ def from_dataframe(cls, df: pd.DataFrame) -> Domain:
Domain
Domain object
"""
# TODO : If lower_bound and upper_bound are similar, then it is a constant parameter
space = {}
for name, type in df.dtypes.items():
if type == 'float64':
Expand All @@ -188,7 +195,7 @@ def from_dataframe(cls, df: pd.DataFrame) -> Domain:

@classmethod
def from_data(cls: Type[Domain], data: _Data) -> Domain:
    """Initialize a Domain from a _Data object.

    Parameters
    ----------
    data
        Data object; its ``to_dataframe()`` view is used so that the
        human-readable column names (not the internal integer labels)
        drive the parameter inference in ``from_dataframe``.

    Returns
    -------
    Domain
        Domain object inferred from the dataframe's dtypes.
    """
    # Note: ``data.data`` now holds integer positional column labels, so it
    # must not be passed to from_dataframe directly.
    return cls.from_dataframe(data.to_dataframe())

# Export
# =============================================================================
Expand Down
135 changes: 101 additions & 34 deletions src/f3dasm/_src/experimentdata/_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from __future__ import annotations

# Standard
from copy import deepcopy
from pathlib import Path
from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Type

Expand All @@ -25,12 +26,52 @@
# =============================================================================


class _Columns:
def __init__(self, columns: Optional[Dict[str, bool]] = None):
if columns is None:
columns = {}

self.columns: Dict[str, bool] = columns

def __repr__(self) -> str:
return self.columns.__repr__()

@property
def names(self) -> List[str]:
return list(self.columns.keys())

def is_disk(self, name: str) -> bool:
return self.columns[name]

def add(self, name: str, is_disk: bool = False):
self.columns[name] = is_disk

def remove(self, name: str):
del self.columns[name]

def iloc(self, name: str | List[str]) -> List[int]:
if isinstance(name, str):
name = [name]

_indices = []
for n in name:
_indices.append(self.names.index(n))
return _indices

def replace_key(self, old_name: str, new_name: str):
self.columns[new_name] = self.columns.pop(old_name)

class _Data:
def __init__(self, data: Optional[pd.DataFrame] = None):
def __init__(self, data: Optional[pd.DataFrame] = None,
columns: Optional[_Columns] = None):
if data is None:
data = pd.DataFrame()

self.data: pd.DataFrame = data
if columns is None:
columns = _Columns({col: False for col in data.columns})

self.columns: _Columns = columns
self.data = data.rename(columns={name: i for i, name in enumerate(data.columns)})

def __len__(self):
"""The len() method returns the number of datapoints"""
Expand Down Expand Up @@ -84,18 +125,18 @@ def __add__(self, other: _Data | Dict[str, Any]) -> _Data:
try:
last_index = self.data.index[-1]
except IndexError: # Empty DataFrame
return _Data(other.data.copy()) # Make a copy of other.data
return _Data(data=other.data.copy(), columns=other.columns) # Make a copy of other.data

# Make a copy of other.data and modify its index
other_data_copy = other.data.copy()
other_data_copy.index = other_data_copy.index + last_index + 1
return _Data(pd.concat([self.data, other_data_copy]))
return _Data(pd.concat([self.data, other_data_copy]), columns=self.columns)

def __eq__(self, __o: _Data) -> bool:
    # Equality is based solely on the underlying dataframe contents via
    # pandas' DataFrame.equals; the column-name bookkeeping (self.columns)
    # is not compared.
    # NOTE(review): confirm that two _Data objects with equal frames but
    # different column names are intended to compare equal.
    return self.data.equals(__o.data)

def _repr_html_(self) -> str:
return self.data._repr_html_()
return self.to_dataframe()._repr_html_()

# Properties
# =============================================================================
Expand All @@ -106,7 +147,7 @@ def indices(self) -> pd.Index:

@property
def names(self) -> List[str]:
return self.data.columns.to_list()
return self.columns.names

# Alternative constructors
# =============================================================================
Expand Down Expand Up @@ -139,16 +180,17 @@ def from_domain(cls, domain: Domain) -> _Data:
-------
_description_
"""
df = pd.DataFrame(columns=domain.names).astype(
domain._cast_types_dataframe()
)
_dtypes = {index: parameter._type for index, (_, parameter) in enumerate(domain.space.items())}

df = pd.DataFrame(columns=range(len(domain))).astype(_dtypes)

# Set the categories for the categorical parameters
for name, categorical_input in domain.get_categorical_parameters().items():
df[name] = pd.Categorical(
df[name], categories=categorical_input.categories)
for index, (name, categorical_input) in enumerate(domain.get_categorical_parameters().items()):
df[index] = pd.Categorical(
df[index], categories=categorical_input.categories)

return cls(df)
_columns = {name: False for name in domain.names}
return cls(df, columns=_Columns(_columns))

@classmethod
def from_file(cls, filename: Path | str) -> _Data:
Expand All @@ -160,7 +202,10 @@ def from_file(cls, filename: Path | str) -> _Data:
The filename to load the data from.
"""
file = Path(filename).with_suffix('.csv')
return cls(pd.read_csv(file, header=0, index_col=0))
df = pd.read_csv(file, header=0, index_col=0)
_columns = {name: False for name in df.columns.to_list()}
df.columns = range(df.columns.size) # Reset the columns to be consistent
return cls(df, columns=_Columns(_columns))

@classmethod
def from_numpy(cls: Type[_Data], array: np.ndarray) -> _Data:
Expand All @@ -183,7 +228,8 @@ def from_dataframe(cls, dataframe: pd.DataFrame) -> _Data:
dataframe : pd.DataFrame
The dataframe to load the data from.
"""
return cls(dataframe)
_columns = {name: False for name in dataframe.columns.to_list()}
return cls(dataframe, columns=_Columns(_columns))

def reset(self, domain: Optional[Domain] = None):
"""Resets the data to the initial state.
Expand All @@ -197,11 +243,13 @@ def reset(self, domain: Optional[Domain] = None):
----
If the domain is None, the data will be reset to an empty dataframe.
"""

if domain is None:
self.data = pd.DataFrame()
return

self.data = self.from_domain(domain).data
self.columns = _Columns()
else:
self.data = self.from_domain(domain).data
self.columns = self.from_domain(domain).columns

# Export
# =============================================================================
Expand Down Expand Up @@ -230,7 +278,7 @@ def to_xarray(self, label: str) -> xr.DataArray:
xarray DataArray with the data.
"""
return xr.DataArray(self.data, dims=['iterations', label], coords={
'iterations': range(len(self)), label: self.names})
'iterations': self.indices, label: self.names})

def to_dataframe(self) -> pd.DataFrame:
"""Export the _Data object to a pandas DataFrame.
Expand All @@ -240,7 +288,9 @@ def to_dataframe(self) -> pd.DataFrame:
pd.DataFrame
pandas dataframe with the data.
"""
return self.data
df = deepcopy(self.data)
df.columns = self.names
return df

def combine_data_to_multiindex(self, other: _Data, jobs_df: pd.DataFrame) -> pd.DataFrame:
"""Combine the data to a multiindex dataframe.
Expand All @@ -262,7 +312,8 @@ def combine_data_to_multiindex(self, other: _Data, jobs_df: pd.DataFrame) -> pd.
This function is mainly used to show the combined ExperimentData object in a
Jupyter Notebook
"""
return pd.concat([jobs_df, self.data, other.data], axis=1, keys=['jobs', 'input', 'output'])
return pd.concat([jobs_df, self.to_dataframe(),
other.to_dataframe()], axis=1, keys=['jobs', 'input', 'output'])

def store(self, filename: Path) -> None:
"""Stores the data to a file.
Expand All @@ -272,7 +323,7 @@ def store(self, filename: Path) -> None:
filename : Path
The filename to store the data to.
"""
self.data.to_csv(filename.with_suffix('.csv'))
self.to_dataframe().to_csv(filename.with_suffix('.csv'))

def n_best_samples(self, nosamples: int, column_name: List[str] | str) -> pd.DataFrame:
"""Returns the n best samples. We consider to be lower values better.
Expand All @@ -289,7 +340,7 @@ def n_best_samples(self, nosamples: int, column_name: List[str] | str) -> pd.Dat
pd.DataFrame
The n best samples.
"""
return self.data.nsmallest(n=nosamples, columns=column_name)
return self.data.nsmallest(n=nosamples, columns=self.columns.iloc(column_name))

def select_columns(self, columns: Iterable[str] | str) -> _Data:
"""Filter the data on the selected columns.
Expand All @@ -307,7 +358,8 @@ def select_columns(self, columns: Iterable[str] | str) -> _Data:
# This is necessary otherwise self.data[columns] will be a Series
if isinstance(columns, str):
columns = [columns]
return _Data(self.data[columns])
_selected_columns = _Columns({column: self.columns.columns[column] for column in columns})
return _Data(self.data[self.columns.iloc(columns)], columns=_selected_columns)
# Append and remove data
# =============================================================================

Expand Down Expand Up @@ -336,7 +388,13 @@ def add_empty_rows(self, number_of_rows: int):
self.data = pd.concat([self.data, empty_data], ignore_index=False)

def add_column(self, name: str):
self.data[name] = np.nan
if self.data.columns.empty:
new_columns_index = 0
else:
new_columns_index = self.data.columns[-1] + 1

self.columns.add(name)
self.data[new_columns_index] = np.nan

def fill_numpy_arrays(self, array: np.ndarray) -> Iterable[int]:
# get the indices of the nan values
Expand All @@ -351,7 +409,7 @@ def remove(self, indices: List[int]):
# =============================================================================

def get_data_dict(self, index: int) -> Dict[str, Any]:
return self.data.loc[index].to_dict()
return self.to_dataframe().loc[index].to_dict()

def set_data(self, index: int, value: Any, column: Optional[str] = None):
# check if the index exists
Expand All @@ -360,12 +418,18 @@ def set_data(self, index: int, value: Any, column: Optional[str] = None):

if column is None:
self.data.loc[index] = value
else:
try:
self.data.at[index, column] = value
except ValueError:
self.data = self.data.astype(object)
self.data.at[index, column] = value
return

elif column not in self.columns.names:
# TODO this is_disk value needs to be provided by set_data call
self.columns.add(column, is_disk=False)

_column_index = self.columns.iloc(column)[0]
try:
self.data.at[index, _column_index] = value
except ValueError:
self.data = self.data.astype(object)
self.data.at[index, _column_index] = value

def reset_index(self) -> None:
"""Reset the index of the data."""
Expand All @@ -379,7 +443,8 @@ def has_columnnames(self, names: Iterable[str]) -> bool:
return set(names).issubset(self.names)

def set_columnnames(self, names: Iterable[str]) -> None:
    """Rename all columns positionally, keeping each column's is_disk flag.

    Pairs the current names with *names* in order; surplus entries on
    either side are silently ignored by ``zip``.
    """
    # Only the _Columns bookkeeping is renamed. The stale pre-refactor line
    # ``self.data.columns = names`` must NOT be restored: self.data now uses
    # integer positional labels that other methods (iloc-based access)
    # depend on.
    for old_name, new_name in zip(self.names, names):
        self.columns.replace_key(old_name, new_name)


def _convert_dict_to_data(dictionary: Dict[str, Any]) -> _Data:
Expand All @@ -395,4 +460,6 @@ def _convert_dict_to_data(dictionary: Dict[str, Any]) -> _Data:
_Data
The data object.
"""
return _Data(pd.DataFrame(dictionary, index=[0]).copy())
_columns = {name: False for name in dictionary.keys()}
df = pd.DataFrame(dictionary, index=[0]).copy()
return _Data(data=df, columns=_Columns(_columns))
4 changes: 2 additions & 2 deletions src/f3dasm/_src/experimentdata/experimentdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,8 +495,8 @@ def _add_experiments(self, experiment_sample: ExperimentSample | ExperimentData)

# Apparently you need to cast the types again
# TODO: Breaks if values are NaN or infinite
self.input_data.data = self.input_data.data.astype(
self.domain._cast_types_dataframe())
_dtypes = {index: parameter._type for index, (_, parameter) in enumerate(self.domain.space.items())}
self.input_data.data = self.input_data.data.astype(_dtypes)

def add_input_parameter(self, name: str, parameter: Parameter) -> None:
"""Add a new input column to the ExperimentData object.
Expand Down
3 changes: 2 additions & 1 deletion tests/design/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ def test_data_get_inputdata_dict(sample_data: _Data):
def test_data_set_data(sample_data: _Data):
index = 0
sample_data.set_data(index, 15, 'output1')
assert sample_data.data.loc[index, 'output1'] == 15
_column_index = sample_data.columns.iloc('output1')[0]
assert sample_data.data.loc[index, _column_index] == 15


def test_data_to_numpy(sample_data: _Data):
Expand Down
11 changes: 8 additions & 3 deletions tests/experimentdata/test_experimentdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,7 +467,7 @@ def mock_pd_read_pickle(*args, **kwargs):
assert experiment_data == experimentdata_expected


@pytest.mark.parametrize("input_data", [path_input, str_input, pd_input(), data_input(), numpy_input()])
@pytest.mark.parametrize("input_data", [pd_input(), path_input, str_input, data_input(), numpy_input()])
@pytest.mark.parametrize("output_data", [None])
@pytest.mark.parametrize("domain", [make_nd_continuous_domain(bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]),
dimensionality=3), None, path_domain, str_domain])
Expand Down Expand Up @@ -545,8 +545,8 @@ def mock_pd_read_pickle(*args, **kwargs):
assert experiment_data.jobs == experimentdata_expected_no_output.jobs

assert experiment_data == experimentdata_expected_no_output


@pytest.mark.parametrize("input_data", [None])
@pytest.mark.parametrize("output_data", [None])
@pytest.mark.parametrize("domain", [make_nd_continuous_domain(bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]),
Expand Down Expand Up @@ -656,6 +656,7 @@ def test_evaluate_mode(mode: str, experimentdata_continuous: ExperimentData, tmp
experimentdata_continuous.evaluate("ackley", mode=mode, kwargs={
"scale_bounds": np.array([[0., 1.], [0., 1.], [0., 1.]]), 'seed': SEED})


def test_get_input_data(experimentdata_expected_no_output: ExperimentData):
input_data = experimentdata_expected_no_output.get_input_data()
df, _ = input_data.to_pandas()
Expand All @@ -672,12 +673,14 @@ def test_get_input_data_selection(experimentdata_expected_no_output: ExperimentD
selected_pd = pd_input()[selection]
pd.testing.assert_frame_equal(df, selected_pd)


def test_get_output_data(experimentdata_expected: ExperimentData):
output_data = experimentdata_expected.get_output_data()
_, df = output_data.to_pandas()
pd.testing.assert_frame_equal(df, pd_output())
assert experimentdata_expected.output_data == output_data.output_data


@pytest.mark.parametrize("selection", ["y", ["y"]])
def test_get_output_data_selection(experimentdata_expected: ExperimentData, selection: Iterable[str] | str):
output_data = experimentdata_expected.get_output_data(selection)
Expand All @@ -687,6 +690,7 @@ def test_get_output_data_selection(experimentdata_expected: ExperimentData, sele
selected_pd = pd_output()[selection]
pd.testing.assert_frame_equal(df, selected_pd)


def test_iter_behaviour(experimentdata_continuous: ExperimentData):
for i in experimentdata_continuous:
assert isinstance(i, ExperimentSample)
Expand All @@ -695,5 +699,6 @@ def test_iter_behaviour(experimentdata_continuous: ExperimentData):
for i in selected_experimentdata:
assert isinstance(i, ExperimentSample)


if __name__ == "__main__": # pragma: no cover
pytest.main()
Loading