From 03b861b5e94e12f121c94ee95d57d5b28dc826ff Mon Sep 17 00:00:00 2001 From: Martin van der Schelling <61459087+mpvanderschelling@users.noreply.github.com> Date: Thu, 2 Nov 2023 16:48:50 -0400 Subject: [PATCH 1/3] Remove the naming discrepancy (`path_`) of referenced to_disk=True object Fixes #212 --- src/f3dasm/_src/design/domain.py | 109 ++++++++++++++--- src/f3dasm/_src/design/parameter.py | 5 + src/f3dasm/_src/experimentdata/_columns.py | 110 ++++++++++++++++++ src/f3dasm/_src/experimentdata/_data.py | 52 ++------- .../_src/experimentdata/experimentdata.py | 41 +++++-- .../_src/experimentdata/experimentsample.py | 88 +++++++------- tests/design/conftest.py | 5 +- tests/design/test_data.py | 6 +- tests/design/test_designofexperiments.py | 9 +- tests/design/test_trial.py | 7 +- tests/experimentdata/test_experimentdata.py | 51 +++++--- 11 files changed, 342 insertions(+), 141 deletions(-) create mode 100644 src/f3dasm/_src/experimentdata/_columns.py diff --git a/src/f3dasm/_src/design/domain.py b/src/f3dasm/_src/design/domain.py index 73d98e46..592e7003 100644 --- a/src/f3dasm/_src/design/domain.py +++ b/src/f3dasm/_src/design/domain.py @@ -24,7 +24,7 @@ # Local from .parameter import (CategoricalParameter, CategoricalType, ConstantParameter, ContinuousParameter, - DiscreteParameter, Parameter) + DiscreteParameter, OutputParameter, Parameter) # Authorship & Credits # ============================================================================= @@ -37,7 +37,12 @@ class _Columns: - names: List[str] + @property + def names(self) -> List[str]: + ... + + def is_disk(self, name: str) -> bool: + ... 
class _Data: @@ -59,6 +64,7 @@ class Domain: """ space: Dict[str, Parameter] = field(default_factory=dict) + output_space: Dict[str, OutputParameter] = field(default_factory=dict) def __len__(self) -> int: """The len() method returns the number of parameters""" @@ -164,44 +170,57 @@ def from_yaml(cls: Type[Domain], yaml: DictConfig) -> Domain: for name, param in yaml.items()}) @classmethod - def from_dataframe(cls, df: pd.DataFrame) -> Domain: + def from_dataframe(cls, df_input: pd.DataFrame, + df_output: pd.DataFrame) -> Domain: """Initializes a Domain from a pandas DataFrame. Parameters ---------- - df : pd.DataFrame + df_input : pd.DataFrame DataFrame containing the input parameters. + df_output : pd.DataFrame + DataFrame containing the output parameters. Returns ------- Domain Domain object """ - space = {} - for name, type in df.dtypes.items(): + input_space = {} + for name, type in df_input.dtypes.items(): if type == 'float64': - if float(df[name].min()) == float(df[name].max()): - space[name] = ConstantParameter( - value=float(df[name].min())) + if float(df_input[name].min()) == float(df_input[name].max()): + input_space[name] = ConstantParameter( + value=float(df_input[name].min())) continue - space[name] = ContinuousParameter(lower_bound=float( - df[name].min()), upper_bound=float(df[name].max())) + input_space[name] = ContinuousParameter(lower_bound=float( + df_input[name].min()), + upper_bound=float(df_input[name].max())) elif type == 'int64': - if int(df[name].min()) == int(df[name].max()): - space[name] = ConstantParameter(value=int(df[name].min())) + if int(df_input[name].min()) == int(df_input[name].max()): + input_space[name] = ConstantParameter( + value=int(df_input[name].min())) continue - space[name] = DiscreteParameter(lower_bound=int( - df[name].min()), upper_bound=int(df[name].max())) + input_space[name] = DiscreteParameter(lower_bound=int( + df_input[name].min()), + upper_bound=int(df_input[name].max())) else: - space[name] = 
CategoricalParameter(df[name].unique().tolist()) + input_space[name] = CategoricalParameter( + df_input[name].unique().tolist()) - return cls(space=space) + output_space = {} + for name in df_output.columns: + output_space[name] = OutputParameter(to_disk=False) + + return cls(space=input_space, output_space=output_space) @classmethod - def from_data(cls: Type[Domain], data: _Data) -> Domain: - return cls.from_dataframe(data.to_dataframe()) + def from_data(cls: Type[Domain], input_data: _Data, + output_data: _Data) -> Domain: + return cls.from_dataframe(input_data.to_dataframe(), + output_data.to_dataframe()) # Export # ============================================================================= @@ -369,6 +388,29 @@ def add(self, name: str, space: Parameter): """ self.space[name] = space + def add_output(self, name: str, to_disk: bool): + """Add a new output parameter to the domain. + + Parameters + ---------- + name : str + Name of the output parameter. + to_disk : bool + Whether to store the output parameter on disk. + + Example + ------- + >>> domain = Domain() + >>> domain.add_output('param1', True) + >>> domain.output_space + {'param1': OutputParameter(to_disk=True)} + """ + if name in self.output_space: + raise KeyError( + f"Parameter {name} already exists in the domain! 
\ + Choose a different name.") + + self.output_space[name] = OutputParameter(to_disk) # Getters # ============================================================================= @@ -649,6 +691,35 @@ def _all_input_continuous(self) -> bool: """Check if all input parameters are continuous""" return len(self) == len(self._filter(ContinuousParameter)) + def check_output(self, output_data: _Data): + for output_name in output_data.columns.names: + if not self.is_in_output(output_name): + self.add_output(output_name, to_disk=False) + + def is_in_output(self, output_name: str) -> bool: + """Check if output is in the domain + + Parameters + ---------- + output_name : str + Name of the output + + Returns + ------- + bool + True if output is in the domain, False otherwise + + Example + ------- + >>> domain = Domain() + >>> domain.add_output('output1', to_disk=False) + >>> domain.is_in_output('output1') + True + >>> domain.is_in_output('output2') + False + """ + return output_name in self.output_space + def make_nd_continuous_domain(bounds: np.ndarray | List[List[float]], dimensionality: int) -> Domain: diff --git a/src/f3dasm/_src/design/parameter.py b/src/f3dasm/_src/design/parameter.py index 82902e57..e1ccab65 100644 --- a/src/f3dasm/_src/design/parameter.py +++ b/src/f3dasm/_src/design/parameter.py @@ -34,6 +34,11 @@ class Parameter: _type: ClassVar[str] = field(init=False, default="object") +@dataclass +class OutputParameter(Parameter): + to_disk: bool = field(default=False) + + @dataclass class ConstantParameter(Parameter): """Create a search space parameter that is constant. diff --git a/src/f3dasm/_src/experimentdata/_columns.py b/src/f3dasm/_src/experimentdata/_columns.py new file mode 100644 index 00000000..743ec874 --- /dev/null +++ b/src/f3dasm/_src/experimentdata/_columns.py @@ -0,0 +1,110 @@ +""" +The _Columns class is used to order and track the parameter names of the data +columns. This class is not intended to be used directly by the user. 
+ It is used by the _Data class to provide an interface to datatypes that do not + have a column structure, such as numpy arrays. + +Notes +----- + +For the default back-end of _Data, this class is obsolete since pandas + DataFrames have a column structure. However, this class is intended to be a + uniform interface to data that does not have a column structure. +""" + +# Modules +# ============================================================================= + +from __future__ import annotations + +# Standard +from typing import Dict, List, Optional + +# Authorship & Credits +# ============================================================================= +__author__ = 'Martin van der Schelling (M.P.vanderSchelling@tudelft.nl)' +__credits__ = ['Martin van der Schelling'] +__status__ = 'Stable' +# ============================================================================= +# +# ============================================================================= + + +class _Columns: + def __init__(self, columns: Optional[Dict[str, None]] = None): + """Class that keeps track of the names and order of parameters + in the raw data. + + Parameters + ---------- + columns: Dict[str, None], optional + dictionary with names as column names and None as values + , by default None + + Notes + ----- + The datatype of a dict with nonsensical values is used to prevent + duplicate keys. This is because the dict is used as a set. + """ + if columns is None: + columns = {} + + self.columns: Dict[str, None] = columns + + def __repr__(self) -> str: + """Representation of the _Columns object.""" + return self.columns.keys().__repr__() + + @property + def names(self) -> List[str]: + """List of the names of the columns. + + Returns + ------- + List[str] + list of the names of the columns + """ + return list(self.columns.keys()) + + def add(self, name: str): + """Add a column to the _Columns object. 
+ + Parameters + ---------- + name: str + name of the column to add + """ + self.columns[name] = None + + def iloc(self, name: str | List[str]) -> List[int]: + """Get the index of a column. + + Parameters + ---------- + name: str | List[str] + name of the column(s) to get the index of + + Returns + ------- + List[int] + list of the indices of the columns + """ + if isinstance(name, str): + name = [name] + + _indices = [] + for n in name: + _indices.append(self.names.index(n)) + return _indices + + def rename(self, old_name: str, new_name: str): + """Replace the name of a column. + + Parameters + ---------- + old_name: str + name of the column to replace + new_name: str + name of the column to replace with + """ + self.columns[new_name] = self.columns.pop(old_name) diff --git a/src/f3dasm/_src/experimentdata/_data.py b/src/f3dasm/_src/experimentdata/_data.py index 9425a670..4d6a3d18 100644 --- a/src/f3dasm/_src/experimentdata/_data.py +++ b/src/f3dasm/_src/experimentdata/_data.py @@ -15,6 +15,7 @@ # Local from ..design.domain import Domain +from ._columns import _Columns # Authorship & Credits # ============================================================================= @@ -26,42 +27,6 @@ # ============================================================================= -class _Columns: - def __init__(self, columns: Optional[Dict[str, bool]] = None): - if columns is None: - columns = {} - - self.columns: Dict[str, bool] = columns - - def __repr__(self) -> str: - return self.columns.__repr__() - - @property - def names(self) -> List[str]: - return list(self.columns.keys()) - - def is_disk(self, name: str) -> bool: - return self.columns[name] - - def add(self, name: str, is_disk: bool = False): - self.columns[name] = is_disk - - def remove(self, name: str): - del self.columns[name] - - def iloc(self, name: str | List[str]) -> List[int]: - if isinstance(name, str): - name = [name] - - _indices = [] - for n in name: - _indices.append(self.names.index(n)) - return 
_indices - - def replace_key(self, old_name: str, new_name: str): - self.columns[new_name] = self.columns.pop(old_name) - - class _Data: def __init__(self, data: Optional[pd.DataFrame] = None, columns: Optional[_Columns] = None): @@ -69,7 +34,7 @@ def __init__(self, data: Optional[pd.DataFrame] = None, data = pd.DataFrame() if columns is None: - columns = _Columns({col: False for col in data.columns}) + columns = _Columns({col: None for col in data.columns}) self.columns: _Columns = columns self.data = data.rename( @@ -195,7 +160,7 @@ def from_domain(cls, domain: Domain) -> _Data: df[index] = pd.Categorical( df[index], categories=categorical_input.categories) - _columns = {name: False for name in domain.names} + _columns = {name: None for name in domain.names} return cls(df, columns=_Columns(_columns)) @classmethod @@ -235,7 +200,7 @@ def from_dataframe(cls, dataframe: pd.DataFrame) -> _Data: dataframe : pd.DataFrame The dataframe to load the data from. """ - _columns = {name: False for name in dataframe.columns.to_list()} + _columns = {name: None for name in dataframe.columns.to_list()} return cls(dataframe, columns=_Columns(_columns)) def reset(self, domain: Optional[Domain] = None): @@ -332,6 +297,7 @@ def store(self, filename: Path) -> None: filename : Path The filename to store the data to. """ + # TODO: The column information is not saved in the .csv! 
self.to_dataframe().to_csv(filename.with_suffix('.csv')) def n_best_samples(self, nosamples: int, @@ -433,12 +399,12 @@ def set_data(self, index: int, value: Any, column: Optional[str] = None): raise IndexError(f"Index {index} does not exist in the data.") if column is None: + # Set the entire row to the values self.data.loc[index] = value return elif column not in self.columns.names: - # TODO this is_disk value needs to be provided by set_data call - self.columns.add(column, is_disk=False) + self.add_column(column) _column_index = self.columns.iloc(column)[0] try: @@ -460,7 +426,7 @@ def has_columnnames(self, names: Iterable[str]) -> bool: def set_columnnames(self, names: Iterable[str]) -> None: for old_name, new_name in zip(self.names, names): - self.columns.replace_key(old_name, new_name) + self.columns.rename(old_name, new_name) def _convert_dict_to_data(dictionary: Dict[str, Any]) -> _Data: @@ -477,6 +443,6 @@ def _convert_dict_to_data(dictionary: Dict[str, Any]) -> _Data: _Data The data object. """ - _columns = {name: False for name in dictionary.keys()} + _columns = {name: None for name in dictionary.keys()} df = pd.DataFrame(dictionary, index=[0]).copy() return _Data(data=df, columns=_Columns(_columns)) diff --git a/src/f3dasm/_src/experimentdata/experimentdata.py b/src/f3dasm/_src/experimentdata/experimentdata.py index 3ea33699..f25f2871 100644 --- a/src/f3dasm/_src/experimentdata/experimentdata.py +++ b/src/f3dasm/_src/experimentdata/experimentdata.py @@ -105,7 +105,7 @@ def __init__(self, domain: Optional[Domain] = None, else: job_value = Status.FINISHED - self.domain = domain_factory(domain, self.input_data) + self.domain = domain_factory(domain, self.input_data, self.output_data) # Create empty input_data from domain if input_data is empty if self.input_data.is_empty(): @@ -403,6 +403,8 @@ def get_output_data(self, no input data! """ if parameter_names is None: + # TODO: Make a domain where space is empty + # but it tracks output_space! 
return ExperimentData(output_data=self.output_data, jobs=self.jobs, filename=self.filename, path=self.path) else: @@ -571,15 +573,18 @@ def add_input_parameter(self, name: str, parameter: Parameter) -> None: self.input_data.add_column(name) self.domain.add(name, parameter) - def add_output_parameter(self, name: str) -> None: + def add_output_parameter(self, name: str, is_disk: bool) -> None: """Add a new output column to the ExperimentData object. Parameters ---------- name name of the new output column + is_disk + Whether the output column will be stored on disk or not """ self.output_data.add_column(name) + self.domain.add_output(name, is_disk) def fill_output(self, output: np.ndarray, label: str = "y"): """ @@ -593,7 +598,7 @@ def fill_output(self, output: np.ndarray, label: str = "y"): Label of the output column to add to, by default "y". """ if label not in self.output_data.names: - self.add_output_parameter(label) + self.add_output_parameter(label, is_disk=False) filled_indices: Iterable[int] = self.output_data.fill_numpy_arrays( output) @@ -646,10 +651,15 @@ def get_experiment_sample(self, index: int) -> ExperimentSample: ExperimentSample The ExperimentSample at the given index. """ + + output_experiment_sample_dict = self.output_data.get_data_dict(index) + + dict_output = {k: (v, self.domain.output_space[k].to_disk) + for k, v in output_experiment_sample_dict.items()} + return ExperimentSample(dict_input=self.input_data.get_data_dict( index), - dict_output=self.output_data.get_data_dict( - index), + dict_output=dict_output, jobnumber=index, experimentdata_directory=self.path) @@ -663,9 +673,14 @@ def _set_experiment_sample(self, experiment_sample : ExperimentSample The ExperimentSample to set. 
""" - for column, value in experiment_sample.output_data.items(): + for column, (value, is_disk) in experiment_sample._dict_output.items(): + + if not self.domain.is_in_output(column): + self.domain.add_output(column, to_disk=is_disk) + self.output_data.set_data( - index=experiment_sample.job_number, value=value, column=column) + index=experiment_sample.job_number, value=value, + column=column) self.jobs.mark(experiment_sample._jobnumber, status=Status.FINISHED) @@ -792,8 +807,8 @@ def mark_all_error_open(self) -> None: Mark all the experiments that have the status 'error' open """ self.jobs.mark_all_error_open() - # Datageneration - # ============================================================================= + # Datageneration + # ========================================================================= def evaluate(self, data_generator: DataGenerator, mode: str = 'sequential', kwargs: Optional[dict] = None) -> None: @@ -1160,18 +1175,20 @@ def data_factory(data: DataTypes) -> _Data: f"Path or str, not {type(data)}") -def domain_factory(domain: Domain | None, input_data: _Data) -> Domain: +def domain_factory(domain: Domain | None, + input_data: _Data, output_data: _Data) -> Domain: if isinstance(domain, Domain): + domain.check_output(output_data) return domain elif isinstance(domain, (Path, str)): return Domain.from_file(Path(domain)) - elif input_data.is_empty() and domain is None: + elif (input_data.is_empty() and output_data.is_empty() and domain is None): return Domain() elif domain is None: - return Domain.from_data(input_data) + return Domain.from_data(input_data, output_data) else: raise TypeError( diff --git a/src/f3dasm/_src/experimentdata/experimentsample.py b/src/f3dasm/_src/experimentdata/experimentsample.py index 956491fd..7cb923e3 100644 --- a/src/f3dasm/_src/experimentdata/experimentsample.py +++ b/src/f3dasm/_src/experimentdata/experimentsample.py @@ -30,8 +30,6 @@ # Storing to disk # 
============================================================================= -PATH_PREFIX = 'path_' - class _Store: suffix: int @@ -176,7 +174,8 @@ def save_object(object: Any, path: Path, experimentdata_directory: Path, class ExperimentSample: - def __init__(self, dict_input: Dict[str, Any], dict_output: Dict[str, Any], + def __init__(self, dict_input: Dict[str, Any], + dict_output: Dict[str, Tuple[Any, bool]], jobnumber: int, experimentdata_directory: Optional[Path] = None): """Single realization of a design of experiments. @@ -184,9 +183,13 @@ def __init__(self, dict_input: Dict[str, Any], dict_output: Dict[str, Any], Parameters ---------- dict_input : Dict[str, Any] - Input parameters of one experiment - dict_output : Dict[str, Any] - Output parameters of one experiment + Input parameters of one experiment. + The key is the name of the parameter. + dict_output : Dict[str, Tuple[Any, bool]] + Output parameters of one experiment. + The key is the name of the parameter, + the first value of the tuple is the actual value and the second + indicates whether the value is stored to disk or not jobnumber : int Index of the experiment """ @@ -224,7 +227,7 @@ def from_numpy(cls: Type[ExperimentSample], input_array: np.ndarray, if output_value is None: dict_output = {} else: - dict_output = {"y": output_value} + dict_output = {"y": (output_value, False)} return cls(dict_input=dict_input, dict_output=dict_output, jobnumber=jobnumber) @@ -247,22 +250,20 @@ class of defined type to load the data. 
By default None, Value of the parameter of the sample """ # Load the value literally (even if it is a reference) - value = self._load_from_experimentdata(item) - - if item.startswith(PATH_PREFIX): + value, from_disk = self._load_from_experimentdata(item) - if isinstance(value, float): - # value is NaN - return item + if not from_disk: + return value - # Load the object from the reference - return load_object(Path(value), - self._experimentdata_directory, load_method) - else: - # Return the literal value + if isinstance(value, float): + # value is NaN return value - def _load_from_experimentdata(self, item: str) -> Any: + # Load the object from the reference + return load_object(Path(value), + self._experimentdata_directory, load_method) + + def _load_from_experimentdata(self, item: str) -> Tuple[Any, bool]: """Load the data from the experiment data. Parameters @@ -272,17 +273,18 @@ def _load_from_experimentdata(self, item: str) -> Any: Returns ------- - Any - data + Tuple[Any, bool] + data and if it is stored to disk or not """ value = self._dict_input.get(item, None) - if value is None: - value = self._dict_output.get(item, None) - return value + if value is None: + return self._dict_output.get(item, None) + else: + return value, False def __setitem__(self, key: str, value: Any): - self._dict_output[key] = value + self._dict_output[key] = (value, False) def __repr__(self) -> str: return f"ExperimentSample({self.job_number} : \ @@ -308,21 +310,29 @@ def output_data(self) -> Dict[str, Any]: Dict[str, Any] The output data of the design as a dictionary. """ - # Load all the data from the experiment data - # return {key: self.get(key) for key in self._dict_output.keys()} - return self._dict_output + # This is the loaded data ! 
+ return {key: self.get(key) for key in self._dict_output} + + # create an alias for output_data named output_data_loaded + # this is for backward compatibility + output_data_loaded = output_data @property - def output_data_loaded(self) -> Dict[str, Any]: - """Retrieve the output data of the design as a dictionary. + def output_data_with_references(self) -> Dict[str, Any]: + """Retrieve the output data of the design as a dictionary, but refrain + from loading the data from disk and give the references. + + Notes + ----- + If you want to use the data, you can load it in memory with the + :func:`output_data` property. Returns ------- Dict[str, Any] - The output data of the design as a dictionary. + The output data of the design as a dictionary with references. """ - # Load all the data from the experiment data - return {key: self.get(key) for key in self._dict_output.keys()} + return self._dict_output @property def job_number(self) -> int: @@ -350,8 +360,8 @@ def to_numpy(self) -> Tuple[np.ndarray, np.ndarray]: Tuple[np.ndarray, np.ndarray] A tuple of numpy arrays containing the input and output data. """ - return np.array(list(self._dict_input.values())), np.array( - list(self._dict_output.values())) + return np.array(list(self.input_data.values())), np.array( + list(self.output_data.values())) def to_dict(self) -> Dict[str, Any]: """Converts the design to a dictionary. @@ -361,7 +371,7 @@ def to_dict(self) -> Dict[str, Any]: Dict[str, Any] A dictionary containing the input and output data. 
""" - return {**self.input_data, **self.output_data_loaded, + return {**self.input_data, **self.output_data, 'job_number': self.job_number} def store(self, name: str, object: Any, to_disk: bool = False, @@ -407,10 +417,10 @@ def _store_to_disk(self, object: Any, name: str, store_method=store_method) # Store the path to the object in the output_data - self._dict_output[f"{PATH_PREFIX}{name}"] = str( - file_path.with_suffix(suffix)) + self._dict_output[name] = (str( + file_path.with_suffix(suffix)), True) logger.info(f"Stored {name} to {file_path.with_suffix(suffix)}") def _store_to_experimentdata(self, object: Any, name: str) -> None: - self._dict_output[name] = object + self._dict_output[name] = (object, False) diff --git a/tests/design/conftest.py b/tests/design/conftest.py index f7e54a29..0087f7d4 100644 --- a/tests/design/conftest.py +++ b/tests/design/conftest.py @@ -35,7 +35,8 @@ def domain(): def continuous_parameter(): lower_bound = 3.3 upper_bound = 3.8 - return ContinuousParameter(lower_bound=lower_bound, upper_bound=upper_bound) + return ContinuousParameter(lower_bound=lower_bound, + upper_bound=upper_bound) @pytest.fixture(scope="package") @@ -54,7 +55,7 @@ def categorical_parameter(): @pytest.fixture(scope="package") def design_data(): dict_input = {'input1': 1, 'input2': 2} - dict_output = {'output1': 3, 'output2': 4} + dict_output = {'output1': (3, False), 'output2': (4, False)} job_number = 123 return dict_input, dict_output, job_number diff --git a/tests/design/test_data.py b/tests/design/test_data.py index 495a536f..738f10c6 100644 --- a/tests/design/test_data.py +++ b/tests/design/test_data.py @@ -69,7 +69,8 @@ def test_data_get_inputdata_dict(sample_data: _Data): def test_data_set_data(sample_data: _Data): index = 0 - sample_data.set_data(index, 15, 'output1') + sample_data.set_data(index=index, value=15, + column='output1') _column_index = sample_data.columns.iloc('output1')[0] assert sample_data.data.loc[index, _column_index] == 15 @@ -77,7 
+78,8 @@ def test_data_set_data(sample_data: _Data): def test_data_to_numpy(sample_data: _Data): input_array = sample_data.to_numpy() assert isinstance(input_array, np.ndarray) - assert input_array.shape == (len(sample_data), len(sample_data.data.columns)) + assert input_array.shape == ( + len(sample_data), len(sample_data.data.columns)) def test_data_n_best_samples(sample_data: _Data): diff --git a/tests/design/test_designofexperiments.py b/tests/design/test_designofexperiments.py index 1bb526ec..81cb6528 100644 --- a/tests/design/test_designofexperiments.py +++ b/tests/design/test_designofexperiments.py @@ -68,7 +68,8 @@ def test_add_input_space(): } design = Domain(space=designspace) - design.add('x4', CategoricalParameter(categories=["test1", "test2", "test3"])) + design.add('x4', CategoricalParameter( + categories=["test1", "test2", "test3"])) design.add('x5', DiscreteParameter(lower_bound=2, upper_bound=3)) assert design.space == { @@ -88,7 +89,8 @@ def test_add_space(): } domain = Domain(space=designspace) - domain.add('x4', CategoricalParameter(categories=["test1", "test2", "test3"])) + domain.add('x4', CategoricalParameter( + categories=["test1", "test2", "test3"])) domain.add('x5', DiscreteParameter(lower_bound=2, upper_bound=3)) assert domain.space == { @@ -142,7 +144,8 @@ def test_get_number_of_input_parameters(domain: Domain): def test_domain_from_dataframe(sample_dataframe: pd.DataFrame): - domain = Domain.from_dataframe(sample_dataframe) + domain = Domain.from_dataframe( + df_input=sample_dataframe, df_output=pd.DataFrame()) ground_truth = Domain(space={'feature1': ContinuousParameter(lower_bound=1.0, upper_bound=3.0), 'feature2': DiscreteParameter(lower_bound=4, upper_bound=6), 'feature3': CategoricalParameter(['A', 'B', 'C'])}) diff --git a/tests/design/test_trial.py b/tests/design/test_trial.py index a5fe9175..ccbc7b85 100644 --- a/tests/design/test_trial.py +++ b/tests/design/test_trial.py @@ -10,7 +10,7 @@ def 
test_design_initialization(design_data): dict_input, dict_output, job_number = design_data design = ExperimentSample(dict_input, dict_output, job_number) assert design.input_data == dict_input - assert design.output_data == dict_output + assert design._dict_output == dict_output assert design.job_number == job_number @@ -18,8 +18,9 @@ def test_design_to_numpy(design_data): dict_input, dict_output, job_number = design_data design = ExperimentSample(dict_input, dict_output, job_number) input_array, output_array = design.to_numpy() - assert np.array_equal(input_array, np.array(list(dict_input.values()))) - assert np.array_equal(output_array, np.array(list(dict_output.values()))) + + check_output_array = np.array([v for v, _ in dict_output.values()]) + assert np.array_equal(output_array, check_output_array) def test_design_set(design_data): diff --git a/tests/experimentdata/test_experimentdata.py b/tests/experimentdata/test_experimentdata.py index 364c82b9..94cb68be 100644 --- a/tests/experimentdata/test_experimentdata.py +++ b/tests/experimentdata/test_experimentdata.py @@ -70,13 +70,16 @@ def test_from_file(experimentdata_continuous: ExperimentData, seed: int, tmp_pat # experimentdata_continuous.filename = tmp_path / 'test001' experimentdata_continuous.store(tmp_path / 'experimentdata') - experimentdata_from_file = ExperimentData.from_file(tmp_path / 'experimentdata') + experimentdata_from_file = ExperimentData.from_file( + tmp_path / 'experimentdata') # Check if the input_data attribute of ExperimentData matches the expected_data - pd.testing.assert_frame_equal(experimentdata_continuous.input_data.data, experimentdata_from_file.input_data.data) + pd.testing.assert_frame_equal( + experimentdata_continuous.input_data.data, experimentdata_from_file.input_data.data) pd.testing.assert_frame_equal(experimentdata_continuous.output_data.data, experimentdata_from_file.output_data.data) - pd.testing.assert_series_equal(experimentdata_continuous.jobs.jobs, 
experimentdata_from_file.jobs.jobs) + pd.testing.assert_series_equal( + experimentdata_continuous.jobs.jobs, experimentdata_from_file.jobs.jobs) # assert experimentdata_continuous.input_data == experimentdata_from_file.input_data assert experimentdata_continuous.output_data == experimentdata_from_file.output_data assert experimentdata_continuous.domain == experimentdata_from_file.domain @@ -105,7 +108,8 @@ def sample_csv_inputdata(tmp_path): input_csv_file = tmp_path / 'experimentdata_data.csv' # Create sample input and output dataframes - input_data = pd.DataFrame({'input_col1': [1, 2, 3], 'input_col2': [4, 5, 6]}) + input_data = pd.DataFrame( + {'input_col1': [1, 2, 3], 'input_col2': [4, 5, 6]}) return input_csv_file, input_data @@ -116,7 +120,8 @@ def sample_csv_outputdata(tmp_path): output_csv_file = tmp_path / 'experimentdata_output.csv' # Create sample input and output dataframes - output_data = pd.DataFrame({'output_col1': [7, 8, 9], 'output_col2': [10, 11, 12]}) + output_data = pd.DataFrame( + {'output_col1': [7, 8, 9], 'output_col2': [10, 11, 12]}) return output_csv_file, output_data @@ -126,8 +131,10 @@ def test_from_object(experimentdata_continuous: ExperimentData): output_data = experimentdata_continuous.output_data jobs = experimentdata_continuous.jobs domain = experimentdata_continuous.domain - experiment_data = ExperimentData(input_data=input_data, output_data=output_data, jobs=jobs, domain=domain) - assert experiment_data == ExperimentData(input_data=input_data, output_data=output_data, jobs=jobs, domain=domain) + experiment_data = ExperimentData( + input_data=input_data, output_data=output_data, jobs=jobs, domain=domain) + assert experiment_data == ExperimentData( + input_data=input_data, output_data=output_data, jobs=jobs, domain=domain) assert experiment_data == experimentdata_continuous # Exporters @@ -155,20 +162,21 @@ def test_to_pandas(experimentdata_continuous: ExperimentData, pandas_dataframe: def test_add_new_input_column(experimentdata: 
ExperimentData, continuous_parameter: ContinuousParameter): - experimentdata.add_input_parameter(name='test', parameter=continuous_parameter) + experimentdata.add_input_parameter( + name='test', parameter=continuous_parameter) assert 'test' in experimentdata.input_data.names def test_add_new_output_column(experimentdata: ExperimentData): - experimentdata.add_output_parameter(name='test') + experimentdata.add_output_parameter(name='test', is_disk=False) assert 'test' in experimentdata.output_data.names def test_fill_outputs(experimentdata_continuous: ExperimentData, numpy_output_array: np.ndarray, numpy_array: np.ndarray): exp_data = ExperimentData(experimentdata_continuous.domain) - exp_data.add_output_parameter(name='y') - exp_data.add(domain=exp_data.domain, input_data=numpy_array, output_data=numpy_output_array) + exp_data.add(domain=exp_data.domain, input_data=numpy_array, + output_data=numpy_output_array) experimentdata_continuous.fill_output(numpy_output_array) assert exp_data == experimentdata_continuous @@ -237,7 +245,8 @@ def create_jobs_pickle_finished(filepath): _data_input = _Data(pd_input()) _data_output = _Data(pd_output()) - experimentdata = ExperimentData(domain=domain, input_data=_data_input, output_data=_data_output) + experimentdata = ExperimentData( + domain=domain, input_data=_data_input, output_data=_data_output) experimentdata.jobs.store(filepath) @@ -454,14 +463,16 @@ def mock_pd_read_pickle(*args, **kwargs): if isinstance(input_data, np.ndarray) and domain is None: with pytest.raises(ValueError): - ExperimentData(domain=domain, input_data=input_data, output_data=output_data, jobs=jobs) + ExperimentData(domain=domain, input_data=input_data, + output_data=output_data, jobs=jobs) return # Initialize ExperimentData with the CSV file experiment_data = ExperimentData(domain=domain, input_data=input_data, output_data=output_data, jobs=jobs) # Check if the input_data attribute of ExperimentData matches the expected_data - 
pd.testing.assert_frame_equal(experiment_data.input_data.data, experimentdata_expected.input_data.data) + pd.testing.assert_frame_equal( + experiment_data.input_data.data, experimentdata_expected.input_data.data) pd.testing.assert_frame_equal(experiment_data.output_data.data, experimentdata_expected.output_data.data) assert experiment_data == experimentdata_expected @@ -527,7 +538,8 @@ def mock_pd_read_pickle(*args, **kwargs): if isinstance(input_data, np.ndarray) and domain is None: with pytest.raises(ValueError): - ExperimentData(domain=domain, input_data=input_data, output_data=output_data, jobs=jobs) + ExperimentData(domain=domain, input_data=input_data, + output_data=output_data, jobs=jobs) return # Initialize ExperimentData with the CSV file @@ -535,10 +547,12 @@ def mock_pd_read_pickle(*args, **kwargs): output_data=output_data, jobs=jobs) # Check if the input_data attribute of ExperimentData matches the expected_data - pd.testing.assert_frame_equal(experiment_data.input_data.data, experimentdata_expected_no_output.input_data.data) + pd.testing.assert_frame_equal( + experiment_data.input_data.data, experimentdata_expected_no_output.input_data.data) pd.testing.assert_frame_equal(experiment_data.output_data.data, experimentdata_expected_no_output.output_data.data) - pd.testing.assert_series_equal(experiment_data.jobs.jobs, experimentdata_expected_no_output.jobs.jobs) + pd.testing.assert_series_equal( + experiment_data.jobs.jobs, experimentdata_expected_no_output.jobs.jobs) assert experiment_data.input_data == experimentdata_expected_no_output.input_data assert experiment_data.output_data == experimentdata_expected_no_output.output_data assert experiment_data.domain == experimentdata_expected_no_output.domain @@ -596,7 +610,8 @@ def mock_load_pickle(*args, **kwargs): output_data=output_data) # Check if the input_data attribute of ExperimentData matches the expected_data - pd.testing.assert_frame_equal(experiment_data.input_data.data, 
experimentdata_expected_only_domain.input_data.data) + pd.testing.assert_frame_equal( + experiment_data.input_data.data, experimentdata_expected_only_domain.input_data.data) pd.testing.assert_frame_equal(experiment_data.output_data.data, experimentdata_expected_only_domain.output_data.data) assert experiment_data.input_data == experimentdata_expected_only_domain.input_data From 53d30a6b3c4d2c169dffc42a0f20d6ea3c14b224 Mon Sep 17 00:00:00 2001 From: Martin van der Schelling <61459087+mpvanderschelling@users.noreply.github.com> Date: Thu, 2 Nov 2023 17:33:05 -0400 Subject: [PATCH 2/3] removed dependency of Domain to _Data --- src/f3dasm/_src/design/domain.py | 27 ++----------------- .../_src/experimentdata/experimentdata.py | 5 ++-- 2 files changed, 5 insertions(+), 27 deletions(-) diff --git a/src/f3dasm/_src/design/domain.py b/src/f3dasm/_src/design/domain.py index 592e7003..f3a1323c 100644 --- a/src/f3dasm/_src/design/domain.py +++ b/src/f3dasm/_src/design/domain.py @@ -36,23 +36,6 @@ # ============================================================================= -class _Columns: - @property - def names(self) -> List[str]: - ... - - def is_disk(self, name: str) -> bool: - ... - - -class _Data: - data: pd.DataFrame - columns: _Columns - - def to_dataframe() -> pd.DataFrame: - ... - - @dataclass class Domain: """Main class for defining the domain of the design of experiments. 
@@ -216,12 +199,6 @@ def from_dataframe(cls, df_input: pd.DataFrame, return cls(space=input_space, output_space=output_space) - @classmethod - def from_data(cls: Type[Domain], input_data: _Data, - output_data: _Data) -> Domain: - return cls.from_dataframe(input_data.to_dataframe(), - output_data.to_dataframe()) - # Export # ============================================================================= @@ -691,8 +668,8 @@ def _all_input_continuous(self) -> bool: """Check if all input parameters are continuous""" return len(self) == len(self._filter(ContinuousParameter)) - def check_output(self, output_data: _Data): - for output_name in output_data.columns.names: + def check_output(self, names: List[str]): + for output_name in names: if not self.is_in_output(output_name): self.add_output(output_name, to_disk=False) diff --git a/src/f3dasm/_src/experimentdata/experimentdata.py b/src/f3dasm/_src/experimentdata/experimentdata.py index f25f2871..d8babe7c 100644 --- a/src/f3dasm/_src/experimentdata/experimentdata.py +++ b/src/f3dasm/_src/experimentdata/experimentdata.py @@ -1178,7 +1178,7 @@ def data_factory(data: DataTypes) -> _Data: def domain_factory(domain: Domain | None, input_data: _Data, output_data: _Data) -> Domain: if isinstance(domain, Domain): - domain.check_output(output_data) + domain.check_output(output_data.names) return domain elif isinstance(domain, (Path, str)): @@ -1188,7 +1188,8 @@ def domain_factory(domain: Domain | None, return Domain() elif domain is None: - return Domain.from_data(input_data, output_data) + return Domain.from_dataframe( + input_data.to_dataframe(), output_data.to_dataframe()) else: raise TypeError( From 32d12c9bda13c0dbdbfe59ef5638f693cdd00275 Mon Sep 17 00:00:00 2001 From: Martin van der Schelling <61459087+mpvanderschelling@users.noreply.github.com> Date: Thu, 2 Nov 2023 17:42:12 -0400 Subject: [PATCH 3/3] updated documentation --- .../rst_doc_files/classes/design/experimentsample.rst | 8 +------- 1 file changed, 1 insertion(+), 
7 deletions(-) diff --git a/docs/source/rst_doc_files/classes/design/experimentsample.rst b/docs/source/rst_doc_files/classes/design/experimentsample.rst index 00be3b85..5e8f4a65 100644 --- a/docs/source/rst_doc_files/classes/design/experimentsample.rst +++ b/docs/source/rst_doc_files/classes/design/experimentsample.rst @@ -123,13 +123,7 @@ A reference (:code:`Path`) will be saved to the :attr:`~f3dasm.design.Experiment ├── my_experiment_output.csv └── my_experiment_jobs.pkl -In the :attr:`~f3dasm.design.ExperimentData.output_data`, a reference to the stored object (e.g. :code:`my_project/output_1/0.npy`) will be automatically appended to the `path_` parameter. - -.. code-block:: python - - >>> experiment_sample['output_numpy'] - 'my_project/output_numpy/0.npy' - +In the :attr:`~f3dasm.design.ExperimentData.output_data`, a reference to the stored object (e.g. :code:`my_project/output_1/0.npy`) will be automatically stored under the corresponding output parameter. :mod:`f3dasm` has built-in storing functions for numpy :class:`~numpy.ndarray`, pandas :class:`~pandas.DataFrame` and xarray :class:`~xarray.DataArray` and :class:`~xarray.Dataset`.