Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove the naming discrepancy (path_) of referenced to_disk=True object #216

Merged
merged 3 commits into from
Nov 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -123,13 +123,7 @@ A reference (:code:`Path`) will be saved to the :attr:`~f3dasm.design.Experiment
├── my_experiment_output.csv
└── my_experiment_jobs.pkl

In the :attr:`~f3dasm.design.ExperimentData.output_data`, a reference to the stored object (e.g. :code:`my_project/output_1/0.npy`) will be automatically appended to the `path_<output parameter name>` parameter.

.. code-block:: python

>>> experiment_sample['output_numpy']
'my_project/output_numpy/0.npy'

In the :attr:`~f3dasm.design.ExperimentData.output_data`, a reference to the stored object (e.g. :code:`my_project/output_1/0.npy`) will be automatically stored under the name of the output parameter.


:mod:`f3dasm` has built-in storing functions for numpy :class:`~numpy.ndarray`, pandas :class:`~pandas.DataFrame` and xarray :class:`~xarray.DataArray` and :class:`~xarray.Dataset`.
Expand Down
110 changes: 79 additions & 31 deletions src/f3dasm/_src/design/domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
# Local
from .parameter import (CategoricalParameter, CategoricalType,
ConstantParameter, ContinuousParameter,
DiscreteParameter, Parameter)
DiscreteParameter, OutputParameter, Parameter)

# Authorship & Credits
# =============================================================================
Expand All @@ -36,18 +36,6 @@
# =============================================================================


class _Columns:
names: List[str]


class _Data:
data: pd.DataFrame
columns: _Columns

def to_dataframe() -> pd.DataFrame:
...


@dataclass
class Domain:
"""Main class for defining the domain of the design of experiments.
Expand All @@ -59,6 +47,7 @@ class Domain:
"""

space: Dict[str, Parameter] = field(default_factory=dict)
output_space: Dict[str, OutputParameter] = field(default_factory=dict)

def __len__(self) -> int:
"""The len() method returns the number of parameters"""
Expand Down Expand Up @@ -164,44 +153,51 @@ def from_yaml(cls: Type[Domain], yaml: DictConfig) -> Domain:
for name, param in yaml.items()})

@classmethod
def from_dataframe(cls, df: pd.DataFrame) -> Domain:
def from_dataframe(cls, df_input: pd.DataFrame,
df_output: pd.DataFrame) -> Domain:
"""Initializes a Domain from a pandas DataFrame.

Parameters
----------
df : pd.DataFrame
df_input : pd.DataFrame
DataFrame containing the input parameters.
df_output : pd.DataFrame
DataFrame containing the output parameters.

Returns
-------
Domain
Domain object
"""
space = {}
for name, type in df.dtypes.items():
input_space = {}
for name, type in df_input.dtypes.items():
if type == 'float64':
if float(df[name].min()) == float(df[name].max()):
space[name] = ConstantParameter(
value=float(df[name].min()))
if float(df_input[name].min()) == float(df_input[name].max()):
input_space[name] = ConstantParameter(
value=float(df_input[name].min()))
continue

space[name] = ContinuousParameter(lower_bound=float(
df[name].min()), upper_bound=float(df[name].max()))
input_space[name] = ContinuousParameter(lower_bound=float(
df_input[name].min()),
upper_bound=float(df_input[name].max()))
elif type == 'int64':
if int(df[name].min()) == int(df[name].max()):
space[name] = ConstantParameter(value=int(df[name].min()))
if int(df_input[name].min()) == int(df_input[name].max()):
input_space[name] = ConstantParameter(
value=int(df_input[name].min()))
continue

space[name] = DiscreteParameter(lower_bound=int(
df[name].min()), upper_bound=int(df[name].max()))
input_space[name] = DiscreteParameter(lower_bound=int(
df_input[name].min()),
upper_bound=int(df_input[name].max()))
else:
space[name] = CategoricalParameter(df[name].unique().tolist())
input_space[name] = CategoricalParameter(
df_input[name].unique().tolist())

return cls(space=space)
output_space = {}
for name in df_output.columns:
output_space[name] = OutputParameter(to_disk=False)

@classmethod
def from_data(cls: Type[Domain], data: _Data) -> Domain:
return cls.from_dataframe(data.to_dataframe())
return cls(space=input_space, output_space=output_space)

# Export
# =============================================================================
Expand Down Expand Up @@ -369,6 +365,29 @@ def add(self, name: str, space: Parameter):
"""
self.space[name] = space

def add_output(self, name: str, to_disk: bool):
    """Add a new output parameter to the domain.

    Parameters
    ----------
    name : str
        Name of the output parameter.
    to_disk : bool
        Whether to store the output parameter on disk.

    Raises
    ------
    KeyError
        If an output parameter called ``name`` is already present in
        the output space.

    Example
    -------
    >>> domain = Domain()
    >>> domain.add_output('param1', True)
    >>> domain.output_space
    {'param1': OutputParameter(to_disk=True)}
    """
    if name in self.output_space:
        # Adjacent string literals are used instead of a backslash
        # continuation inside the literal, so that the indentation of the
        # source code does not end up inside the error message.
        raise KeyError(
            f"Parameter {name} already exists in the domain! "
            "Choose a different name.")

    self.output_space[name] = OutputParameter(to_disk)
# Getters
# =============================================================================

Expand Down Expand Up @@ -649,6 +668,35 @@ def _all_input_continuous(self) -> bool:
"""Check if all input parameters are continuous"""
return len(self) == len(self._filter(ContinuousParameter))

def check_output(self, names: List[str]):
    """Make sure every given output name is part of the domain.

    Parameters
    ----------
    names : List[str]
        Output names to look up. Each name that is not yet in the domain
        is added with ``to_disk=False``; names already present are left
        untouched.
    """
    for candidate in names:
        if self.is_in_output(candidate):
            continue
        self.add_output(candidate, to_disk=False)

def is_in_output(self, output_name: str) -> bool:
    """Check if output is in the domain

    Parameters
    ----------
    output_name : str
        Name of the output

    Returns
    -------
    bool
        True if output is in the domain, False otherwise

    Example
    -------
    >>> domain = Domain()
    >>> domain.add_output('output1', to_disk=False)
    >>> domain.is_in_output('output1')
    True
    >>> domain.is_in_output('output2')
    False
    """
    # Membership is decided purely on the keys of the output space.
    return output_name in self.output_space


def make_nd_continuous_domain(bounds: np.ndarray | List[List[float]],
dimensionality: int) -> Domain:
Expand Down
5 changes: 5 additions & 0 deletions src/f3dasm/_src/design/parameter.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ class Parameter:
_type: ClassVar[str] = field(init=False, default="object")


@dataclass
class OutputParameter(Parameter):
    """Parameter that describes an output of the experiment.

    Parameters
    ----------
    to_disk : bool, optional
        Whether the output is stored on disk, by default False
    """
    to_disk: bool = False


@dataclass
class ConstantParameter(Parameter):
"""Create a search space parameter that is constant.
Expand Down
110 changes: 110 additions & 0 deletions src/f3dasm/_src/experimentdata/_columns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
"""
The _Columns class is used to order and track the parameter names of the data
columns. This class is not intended to be used directly by the user.
It is used by the _Data class to provide an interface to datatypes that do not
have a column structure, such as numpy arrays.

Notes
-----

For the default back-end of _Data, this class is obsolete since pandas
DataFrames have a column structure. However, this class is intended to be a
uniform interface to data that does not have a column structure.
"""

# Modules
# =============================================================================

from __future__ import annotations

# Standard
from typing import Dict, List, Optional

# Authorship & Credits
# =============================================================================
__author__ = 'Martin van der Schelling ([email protected])'
__credits__ = ['Martin van der Schelling']
__status__ = 'Stable'
# =============================================================================
#
# =============================================================================


class _Columns:
def __init__(self, columns: Optional[Dict[str, None]] = None):
"""Class that keeps track of the names and order of parameters
in the raw data.

Parameters
----------
columns: Dict[str, None], optional
dictionary with names as column names and None as values
, by default None

Notes
-----
The datatype of a dict with nonsensical values is used to prevent
duplicate keys. This is because the dict is used as a set.
"""
if columns is None:
columns = {}

self.columns: Dict[str, None] = columns

def __repr__(self) -> str:
"""Representation of the _Columns object."""
return self.columns.keys().__repr__()

@property
def names(self) -> List[str]:
"""List of the names of the columns.

Returns
-------
List[str]
list of the names of the columns
"""
return list(self.columns.keys())

def add(self, name: str):
"""Add a column to the _Columns object.

Parameters
----------
name: str
name of the column to add
"""
self.columns[name] = None

def iloc(self, name: str | List[str]) -> List[int]:
"""Get the index of a column.

Parameters
----------
name: str | List[str]
name of the column(s) to get the index of

Returns
-------
List[int]
list of the indices of the columns
"""
if isinstance(name, str):
name = [name]

_indices = []
for n in name:
_indices.append(self.names.index(n))
return _indices

def rename(self, old_name: str, new_name: str):
"""Replace the name of a column.

Parameters
----------
old_name: str
name of the column to replace
new_name: str
name of the column to replace with
"""
self.columns[new_name] = self.columns.pop(old_name)
Loading