From 841657f6f7ba7b726fc11dda0c0678d91551adbb Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Wed, 1 May 2024 20:45:31 +0200 Subject: [PATCH] feat: specify `extras` instead of `features` in `to_tabular_dataset` (#685) Closes #623 ### Summary of Changes When creating a tabular dataset, users can now optionally specify extra columns, i.e. columns that are neither target nor feature. The feature columns are implicitly all columns that are neither target nor extra. Previously, users had to specify the features instead and the extras were implicit. However, the list of features is usually much longer than the list of extras, making the previous approach cumbersome. --------- Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> --- docs/tutorials/classification.ipynb | 21 +- docs/tutorials/regression.ipynb | 11 +- .../labeled/containers/_tabular_dataset.py | 169 ++++----------- src/safeds/data/tabular/containers/_table.py | 13 +- .../data/tabular/containers/_time_series.py | 58 +---- src/safeds/ml/classical/_util_sklearn.py | 9 +- .../containers/_tabular_dataset/test_eq.py | 34 +-- .../_tabular_dataset/test_extras.py | 41 ++++ .../_tabular_dataset/test_features.py | 2 +- .../_tabular_dataset/test_from_table.py | 150 ------------- .../containers/_tabular_dataset/test_hash.py | 30 +-- .../containers/_tabular_dataset/test_init.py | 176 +++++++++++++--- .../_tabular_dataset/test_into_dataloader.py | 8 +- .../_tabular_dataset/test_sizeof.py | 5 +- .../_tabular_dataset/test_to_table.py | 5 +- .../containers/_time_series/test_eq.py | 2 +- .../_time_series/test_from_tagged_table.py | 199 ------------------ .../classification/test_ada_boost.py | 2 +- .../classification/test_classifier.py | 8 +- .../classification/test_gradient_boosting.py | 2 +- .../test_k_nearest_neighbors.py | 2 +- .../classification/test_random_forest.py | 2 +- .../test_support_vector_machine.py | 2 +- .../ml/classical/regression/test_ada_boost.py | 2 +- .../regression/test_elastic_net_regression.py | 2 +- .../regression/test_gradient_boosting.py | 2 +- .../regression/test_k_nearest_neighbors.py | 2 +- .../regression/test_lasso_regression.py | 2 +- .../regression/test_random_forest.py | 2 +- .../ml/classical/regression/test_regressor.py | 8 +- .../regression/test_ridge_regression.py | 2 +- .../regression/test_support_vector_machine.py | 2 +- 32 files changed, 321 insertions(+), 654 deletions(-) create mode 100644 tests/safeds/data/labeled/containers/_tabular_dataset/test_extras.py delete mode 100644 tests/safeds/data/labeled/containers/_tabular_dataset/test_from_table.py delete mode 100644 tests/safeds/data/tabular/containers/_time_series/test_from_tagged_table.py diff --git a/docs/tutorials/classification.ipynb b/docs/tutorials/classification.ipynb index ab2f22eef..52ec2aeb0 100644 --- a/docs/tutorials/classification.ipynb +++ b/docs/tutorials/classification.ipynb @@ -29,7 +29,7 @@ "\n", "titanic = Table.from_csv_file(\"data/titanic.csv\")\n", "#For visualisation purposes we only print out the first 15 rows.\n", - "titanic.slice_rows(0,15)" + "titanic.slice_rows(0, 15)" ], "metadata": { "collapsed": false @@ -77,7 +77,6 @@ "source": [ "from safeds.data.tabular.transformation import OneHotEncoder\n", "\n", - "old_column_names = train_table.column_names\n", "encoder = OneHotEncoder().fit(train_table, [\"sex\"])" ], "metadata": { @@ -97,18 +96,14 @@ "cell_type": "code", "execution_count": null, "outputs": [], - "source": [ - "transformed_table = encoder.transform(train_table)\n", - "new_column_names = 
transformed_table.column_names\n", - "new_columns= set(new_column_names) - set(old_column_names)" - ], + "source": "transformed_table = encoder.transform(train_table)", "metadata": { "collapsed": false } }, { "cell_type": "markdown", - "source": "5. Mark the `survived` `Column` as the target variable to be predicted. Use the new names of the fitted `Column`s as features, which will be used to make predictions based on the target variable.", + "source": "5. Mark the `survived` `Column` as the target variable to be predicted. Include some columns only as extra columns, which are completely ignored by the model:", "metadata": { "collapsed": false } @@ -118,9 +113,9 @@ "execution_count": null, "outputs": [], "source": [ - "train_tabular_dataset = transformed_table.to_tabular_dataset(\"survived\", feature_names=[\n", - " *new_columns\n", - "])" + "extra_names = [\"id\", \"name\", \"ticket\", \"cabin\", \"port_embarked\", \"age\", \"fare\"]\n", + "\n", + "train_tabular_dataset = transformed_table.to_tabular_dataset(\"survived\", extra_names)" ], "metadata": { "collapsed": false @@ -192,9 +187,7 @@ "encoder = OneHotEncoder().fit(test_table, [\"sex\"])\n", "testing_table = encoder.transform(testing_table)\n", "\n", - "test_tabular_dataset = testing_table.to_tabular_dataset(\"survived\", feature_names=[\n", - " *new_columns\n", - "])\n", + "test_tabular_dataset = testing_table.to_tabular_dataset(\"survived\", extra_names)\n", "fitted_model.accuracy(test_tabular_dataset)\n" ], "metadata": { diff --git a/docs/tutorials/regression.ipynb b/docs/tutorials/regression.ipynb index 364147288..2d5041791 100644 --- a/docs/tutorials/regression.ipynb +++ b/docs/tutorials/regression.ipynb @@ -60,7 +60,7 @@ }, { "cell_type": "markdown", - "source": "3. Mark the `price` `Column` as the target variable to be predicted. Use the new names of the fitted `Column`s as features, which will be used to make predictions based on the target variable.\n", + "source": "3. Mark the `price` `Column` as the target variable to be predicted. Include the `id` column only as an extra column, which is completely ignored by the model:", "metadata": { "collapsed": false } @@ -70,10 +70,9 @@ "execution_count": null, "outputs": [], "source": [ - "feature_columns = set(train_table.column_names) - set([\"price\", \"id\"])\n", + "extra_names = [\"id\"]\n", "\n", - "train_tabular_dataset = train_table.to_tabular_dataset(\"price\", feature_names=[\n", - " *feature_columns])\n" + "train_tabular_dataset = train_table.to_tabular_dataset(\"price\", extra_names)\n" ], "metadata": { "collapsed": false @@ -147,9 +146,7 @@ } ], "source": [ - "test_tabular_dataset = testing_table.to_tabular_dataset(\"price\", feature_names=[\n", - " *feature_columns\n", - "])\n", + "test_tabular_dataset = testing_table.to_tabular_dataset(\"price\", extra_names)\n", "\n", "fitted_model.mean_absolute_error(test_tabular_dataset)\n" ], diff --git a/src/safeds/data/labeled/containers/_tabular_dataset.py b/src/safeds/data/labeled/containers/_tabular_dataset.py index 7f28667dc..81e73bf9b 100644 --- a/src/safeds/data/labeled/containers/_tabular_dataset.py +++ b/src/safeds/data/labeled/containers/_tabular_dataset.py @@ -5,9 +5,6 @@ from safeds._utils import _structural_hash from safeds.data.tabular.containers import Column, Table -from safeds.exceptions import ( - UnknownColumnNameError, -) if TYPE_CHECKING: from collections.abc import Mapping, Sequence @@ -22,150 +19,67 @@ class TabularDataset: """ A tabular dataset maps feature columns to a target column. 
+ Create a tabular dataset from a mapping of column names to their values. + Parameters ---------- data: The data. target_name: Name of the target column. - feature_names: - Names of the feature columns. If None, all columns except the target column are used. + extra_names: + Names of the columns that are neither features nor target. If None, no extra columns are used, i.e. all but + the target column are used as features. Raises ------ ColumnLengthMismatchError If columns have different lengths. ValueError - If the target column is also a feature column. + If the target column is also an extra column. ValueError - If no feature columns are specified. + If no feature columns remains. Examples -------- - >>> from safeds.data.tabular.containers import Table - >>> table = Table({"col1": ["a", "b"], "col2": [1, 2]}) - >>> tabular_dataset = table.to_tabular_dataset("col2", ["col1"]) + >>> from safeds.data.labeled.containers import TabularDataset + >>> dataset = TabularDataset( + ... {"id": [1, 2, 3], "feature": [4, 5, 6], "target": [1, 2, 3]}, + ... target_name="target", + ... extra_names=["id"] + ... ) """ - # ------------------------------------------------------------------------------------------------------------------ - # Creation - # ------------------------------------------------------------------------------------------------------------------ - - @staticmethod - def _from_table( - table: Table, - target_name: str, - feature_names: list[str] | None = None, - ) -> TabularDataset: - """ - Create a tabular dataset from a table. - - Parameters - ---------- - table: - The table. - target_name: - Name of the target column. - feature_names: - Names of the feature columns. If None, all columns except the target column are used. - - Returns - ------- - tabular_dataset: - The created tabular dataset. - - Raises - ------ - UnknownColumnNameError - If target_name matches none of the column names. - ValueError - If the target column is also a feature column. - ValueError - If no feature columns are specified. 
- - Examples - -------- - >>> from safeds.data.labeled.containers import TabularDataset - >>> from safeds.data.tabular.containers import Table - >>> table = Table({"col1": ["a", "b", "c", "a"], "col2": [1, 2, 3, 4]}) - >>> tabular_dataset = TabularDataset._from_table(table, "col2", ["col1"]) - """ - table = table._as_table() - if target_name not in table.column_names: - raise UnknownColumnNameError([target_name]) - - # If no feature names are specified, use all columns except the target column - if feature_names is None: - feature_names = table.column_names - feature_names.remove(target_name) - - # Validate inputs - if target_name in feature_names: - raise ValueError(f"Column '{target_name}' cannot be both feature and target.") - if len(feature_names) == 0: - raise ValueError("At least one feature column must be specified.") - - # Create result - result = object.__new__(TabularDataset) - - result._table = table - result._features = table.keep_only_columns(feature_names) - result._target = table.get_column(target_name) - - return result - # ------------------------------------------------------------------------------------------------------------------ # Dunder methods # ------------------------------------------------------------------------------------------------------------------ def __init__( self, - data: Mapping[str, Sequence[Any]], + data: Table | Mapping[str, Sequence[Any]], target_name: str, - feature_names: list[str] | None = None, + extra_names: list[str] | None = None, ): - """ - Create a tabular dataset from a mapping of column names to their values. - - Parameters - ---------- - data: - The data. - target_name: - Name of the target column. - feature_names: - Names of the feature columns. If None, all columns except the target column are used. - - Raises - ------ - ColumnLengthMismatchError - If columns have different lengths. - ValueError - If the target column is also a feature column. - ValueError - If no feature columns are specified. 
- - Examples - -------- - >>> from safeds.data.labeled.containers import TabularDataset - >>> table = TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", ["a"]) - """ - self._table = Table(data) + # Preprocess inputs + if not isinstance(data, Table): + data = Table(data) + if extra_names is None: + extra_names = [] - # If no feature names are specified, use all columns except the target column - if feature_names is None: - feature_names = self._table.column_names - if target_name in feature_names: - feature_names.remove(target_name) + # Derive feature names + feature_names = [name for name in data.column_names if name not in {target_name, *extra_names}] # Validate inputs - if target_name in feature_names: - raise ValueError(f"Column '{target_name}' cannot be both feature and target.") + if target_name in extra_names: + raise ValueError(f"Column '{target_name}' cannot be both target and extra.") if len(feature_names) == 0: - raise ValueError("At least one feature column must be specified.") + raise ValueError("At least one feature column must remain.") - self._features: Table = self._table.keep_only_columns(feature_names) - self._target: Column = self._table.get_column(target_name) + # Set attributes + self._table: Table = data + self._features: Table = data.keep_only_columns(feature_names) + self._target: Column = data.get_column(target_name) + self._extras: Table = data.keep_only_columns(extra_names) def __eq__(self, other: object) -> bool: """ @@ -210,27 +124,22 @@ def __sizeof__(self) -> int: @property def features(self) -> Table: - """ - Get the feature columns of the tabular dataset. - - Returns - ------- - features: - The table containing the feature columns. - """ + """The feature columns of the tabular dataset.""" return self._features @property def target(self) -> Column: + """The target column of the tabular dataset.""" + return self._target + + @property + def extras(self) -> Table: """ - Get the target column of the tabular dataset. + Additional columns of the tabular dataset that are neither features nor target. - Returns - ------- - target: - The target column. + These can be used to store additional information about instances, such as IDs. """ - return self._target + return self._extras # ------------------------------------------------------------------------------------------------------------------ # Conversion diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 3bc04dc64..0d5964d10 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -2412,7 +2412,7 @@ def to_rows(self) -> list[Row]: for (_, series_row) in self._data.iterrows() ] - def to_tabular_dataset(self, target_name: str, feature_names: list[str] | None = None) -> TabularDataset: + def to_tabular_dataset(self, target_name: str, extra_names: list[str] | None = None) -> TabularDataset: """ Return a new `TabularDataset` with columns marked as a target column or feature columns. @@ -2422,12 +2422,13 @@ def to_tabular_dataset(self, target_name: str, feature_names: list[str] | None = ---------- target_name: Name of the target column. - feature_names: - Names of the feature columns. If None, all columns except the target column are used. + extra_names: + Names of the columns that are neither features nor target. If None, no extra columns are used, i.e. all but + the target column are used as features. 
Returns ------- - tabular_dataset: + dataset: A new tabular dataset with the given target and feature names. Raises @@ -2441,11 +2442,11 @@ def to_tabular_dataset(self, target_name: str, feature_names: list[str] | None = -------- >>> from safeds.data.tabular.containers import Table >>> table = Table({"item": ["apple", "milk", "beer"], "price": [1.10, 1.19, 1.79], "amount_bought": [74, 72, 51]}) - >>> tabular_dataset = table.to_tabular_dataset(target_name="amount_bought", feature_names=["item", "price"]) + >>> dataset = table.to_tabular_dataset(target_name="amount_bought", extra_names=["item"]) """ from safeds.data.labeled.containers import TabularDataset - return TabularDataset._from_table(self, target_name, feature_names) + return TabularDataset(self, target_name, extra_names) # ------------------------------------------------------------------------------------------------------------------ # IPython integration diff --git a/src/safeds/data/tabular/containers/_time_series.py b/src/safeds/data/tabular/containers/_time_series.py index db93b0333..bf71e92e6 100644 --- a/src/safeds/data/tabular/containers/_time_series.py +++ b/src/safeds/data/tabular/containers/_time_series.py @@ -20,8 +20,6 @@ from pathlib import Path from typing import Any - from safeds.data.labeled.containers import TabularDataset - class TimeSeries(Table): @@ -76,58 +74,6 @@ def timeseries_from_csv_file( feature_names=feature_names, ) - @staticmethod - def _from_tabular_dataset( - tabular_dataset: TabularDataset, - time_name: str, - ) -> TimeSeries: - """Create a time series from a tabular dataset. - - Parameters - ---------- - tabular_dataset: - The tabular dataset. - time_name: - Name of the time column. - - Returns - ------- - time_series: - the created time series - - Raises - ------ - UnknownColumnNameError - If time_name matches none of the column names. 
- Value Error - If time column is also a feature column - - Examples - -------- - >>> from safeds.data.labeled.containers import TabularDataset - >>> from safeds.data.tabular.containers import Table, TimeSeries - >>> tabular_dataset = TabularDataset({"date": ["01.01", "01.02", "01.03", "01.04"], "col1": ["a", "b", "c", "a"]}, "col1" ) - >>> timeseries = TimeSeries._from_tabular_dataset(tabular_dataset, time_name = "date") - """ - if time_name not in tabular_dataset._table.column_names: - raise UnknownColumnNameError([time_name]) - table = tabular_dataset.to_table() - # make sure that the time_name is not part of the features - result = object.__new__(TimeSeries) - feature_names = tabular_dataset.features.column_names - if time_name in feature_names: - feature_names.remove(time_name) - - if time_name == tabular_dataset.target.name: - raise ValueError(f"Column '{time_name}' cannot be both time column and target.") - - result._data = table._data - result._schema = table.schema - result._time = table.get_column(time_name) - result._features = table.keep_only_columns(feature_names) - result._target = table.get_column(tabular_dataset.target.name) - return result - @staticmethod def _from_table( table: Table, @@ -237,8 +183,8 @@ def __init__( Examples -------- - >>> from safeds.data.labeled.containers import TabularDataset - >>> table = TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", ["a"]) + >>> from safeds.data.tabular.containers import TimeSeries + >>> table = TimeSeries({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", "a") """ import pandas as pd diff --git a/src/safeds/ml/classical/_util_sklearn.py b/src/safeds/ml/classical/_util_sklearn.py index 80056b8ba..2f822f9a1 100644 --- a/src/safeds/ml/classical/_util_sklearn.py +++ b/src/safeds/ml/classical/_util_sklearn.py @@ -161,9 +161,16 @@ def predict(model: Any, dataset: Table, feature_names: list[str] | None, target_ warnings.filterwarnings("ignore", message="X does not have valid feature names") predicted_target_vector = model.predict(dataset_df.values) result_set[target_name] = predicted_target_vector + + extra_names = [ + column_name + for column_name in dataset.column_names + if column_name != target_name and column_name not in feature_names + ] + return Table._from_pandas_dataframe(result_set).to_tabular_dataset( target_name=target_name, - feature_names=feature_names, + extra_names=extra_names, ) except ValueError as exception: raise PredictionError(str(exception)) from exception diff --git a/tests/safeds/data/labeled/containers/_tabular_dataset/test_eq.py b/tests/safeds/data/labeled/containers/_tabular_dataset/test_eq.py index 9743cf874..6c84f2a1a 100644 --- a/tests/safeds/data/labeled/containers/_tabular_dataset/test_eq.py +++ b/tests/safeds/data/labeled/containers/_tabular_dataset/test_eq.py @@ -8,35 +8,39 @@ @pytest.mark.parametrize( ("table1", "table2", "expected"), [ - (TabularDataset({"a": [], "b": []}, "b", ["a"]), TabularDataset({"a": [], "b": []}, "b", ["a"]), True), ( - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", ["a"]), - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", ["a"]), + TabularDataset({"a": [], "b": []}, "b"), + TabularDataset({"a": [], "b": []}, "b"), True, ), ( - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", ["a"]), - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "c", ["a"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b"), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b"), + True, + ), + ( + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": 
[7, 8, 9]}, "b", ["c"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "c", ["b"]), False, ), ( - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", ["a"]), - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "d": [7, 8, 9]}, "b", ["a"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", ["c"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "d": [7, 8, 9]}, "b", ["d"]), False, ), ( - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", ["a"]), - TabularDataset({"a": [1, 1, 3], "b": [4, 5, 6]}, "b", ["a"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b"), + TabularDataset({"a": [1, 1, 3], "b": [4, 5, 6]}, "b"), False, ), ( - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", ["a"]), - TabularDataset({"a": ["1", "2", "3"], "b": [4, 5, 6]}, "b", ["a"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b"), + TabularDataset({"a": ["1", "2", "3"], "b": [4, 5, 6]}, "b"), False, ), ( - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", ["a"]), TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", ["c"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", ["a"]), False, ), ], @@ -61,9 +65,9 @@ def test_should_return_whether_two_tabular_datasets_are_equal( @pytest.mark.parametrize( ("table", "other"), [ - (TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", ["a"]), None), - (TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", ["a"]), Row()), - (TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", ["a"]), Table()), + (TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b"), None), + (TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b"), Row()), + (TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b"), Table()), ], ids=[ "TabularDataset vs. 
None", diff --git a/tests/safeds/data/labeled/containers/_tabular_dataset/test_extras.py b/tests/safeds/data/labeled/containers/_tabular_dataset/test_extras.py new file mode 100644 index 000000000..001b15524 --- /dev/null +++ b/tests/safeds/data/labeled/containers/_tabular_dataset/test_extras.py @@ -0,0 +1,41 @@ +import pytest +from safeds.data.labeled.containers import TabularDataset +from safeds.data.tabular.containers import Table + + +@pytest.mark.parametrize( + ("tabular_dataset", "extras"), + [ + ( + TabularDataset( + { + "A": [1, 4], + "B": [2, 5], + "C": [3, 6], + "T": [0, 1], + }, + target_name="T", + ), + Table(), + ), + ( + TabularDataset( + { + "A": [1, 4], + "B": [2, 5], + "C": [3, 6], + "T": [0, 1], + }, + target_name="T", + extra_names=["A", "C"], + ), + Table({"A": [1, 4], "C": [3, 6]}), + ), + ], + ids=[ + "only_target_and_features", + "target_features_and_extras", + ], +) +def test_should_return_features(tabular_dataset: TabularDataset, extras: Table) -> None: + assert tabular_dataset.extras == extras diff --git a/tests/safeds/data/labeled/containers/_tabular_dataset/test_features.py b/tests/safeds/data/labeled/containers/_tabular_dataset/test_features.py index dd572a198..446664c36 100644 --- a/tests/safeds/data/labeled/containers/_tabular_dataset/test_features.py +++ b/tests/safeds/data/labeled/containers/_tabular_dataset/test_features.py @@ -27,7 +27,7 @@ "T": [0, 1], }, target_name="T", - feature_names=["A", "C"], + extra_names=["B"], ), Table({"A": [1, 4], "C": [3, 6]}), ), diff --git a/tests/safeds/data/labeled/containers/_tabular_dataset/test_from_table.py b/tests/safeds/data/labeled/containers/_tabular_dataset/test_from_table.py deleted file mode 100644 index 10b73579c..000000000 --- a/tests/safeds/data/labeled/containers/_tabular_dataset/test_from_table.py +++ /dev/null @@ -1,150 +0,0 @@ -import pytest -from safeds.data.labeled.containers import TabularDataset -from safeds.data.tabular.containers import Table -from safeds.exceptions import UnknownColumnNameError - - -@pytest.mark.parametrize( - ("table", "target_name", "feature_names", "error", "error_msg"), - [ - ( - Table( - { - "A": [1, 4], - "B": [2, 5], - "C": [3, 6], - "T": [0, 1], - }, - ), - "T", - ["A", "B", "C", "D", "E"], - UnknownColumnNameError, - r"Could not find column\(s\) 'D, E'", - ), - ( - Table( - { - "A": [1, 4], - "B": [2, 5], - "C": [3, 6], - "T": [0, 1], - }, - ), - "D", - ["A", "B", "C"], - UnknownColumnNameError, - r"Could not find column\(s\) 'D'", - ), - ( - Table( - { - "A": [1, 4], - "B": [2, 5], - "C": [3, 6], - "T": [0, 1], - }, - ), - "A", - ["A", "B", "C"], - ValueError, - r"Column 'A' cannot be both feature and target.", - ), - ( - Table( - { - "A": [1, 4], - "B": [2, 5], - "C": [3, 6], - "T": [0, 1], - }, - ), - "A", - [], - ValueError, - r"At least one feature column must be specified.", - ), - ( - Table( - { - "A": [1, 4], - }, - ), - "A", - None, - ValueError, - r"At least one feature column must be specified.", - ), - ], - ids=[ - "feature_does_not_exist", - "target_does_not_exist", - "target_and_feature_overlap", - "features_are_empty-explicitly", - "features_are_empty_implicitly", - ], -) -def test_should_raise_error( - table: Table, - target_name: str, - feature_names: list[str] | None, - error: type[Exception], - error_msg: str, -) -> None: - with pytest.raises(error, match=error_msg): - TabularDataset._from_table(table, target_name=target_name, feature_names=feature_names) - - -@pytest.mark.parametrize( - ("table", "target_name", "feature_names"), - [ - ( - Table( - { - 
"A": [1, 4], - "B": [2, 5], - "C": [3, 6], - "T": [0, 1], - }, - ), - "T", - ["A", "B", "C"], - ), - ( - Table( - { - "A": [1, 4], - "B": [2, 5], - "C": [3, 6], - "T": [0, 1], - }, - ), - "T", - ["A", "C"], - ), - ( - Table( - { - "A": [1, 4], - "B": [2, 5], - "C": [3, 6], - "T": [0, 1], - }, - ), - "T", - None, - ), - ], - ids=[ - "create_tabular_dataset", - "tabular_dataset_not_all_columns_are_features", - "tabular_dataset_with_feature_names_as_None", - ], -) -def test_should_create_a_tabular_dataset(table: Table, target_name: str, feature_names: list[str] | None) -> None: - tabular_dataset = TabularDataset._from_table(table, target_name=target_name, feature_names=feature_names) - feature_names = feature_names if feature_names is not None else table.remove_columns([target_name]).column_names - assert isinstance(tabular_dataset, TabularDataset) - assert tabular_dataset._features.column_names == feature_names - assert tabular_dataset._target.name == target_name - assert tabular_dataset._features == table.keep_only_columns(feature_names) - assert tabular_dataset._target == table.get_column(target_name) diff --git a/tests/safeds/data/labeled/containers/_tabular_dataset/test_hash.py b/tests/safeds/data/labeled/containers/_tabular_dataset/test_hash.py index 918041a47..e86e5197f 100644 --- a/tests/safeds/data/labeled/containers/_tabular_dataset/test_hash.py +++ b/tests/safeds/data/labeled/containers/_tabular_dataset/test_hash.py @@ -5,14 +5,17 @@ @pytest.mark.parametrize( ("table1", "table2"), [ - (TabularDataset({"a": [], "b": []}, "b", ["a"]), TabularDataset({"a": [], "b": []}, "b", ["a"])), ( - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", ["a"]), - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", ["a"]), + TabularDataset({"a": [], "b": []}, "b"), + TabularDataset({"a": [], "b": []}, "b"), ), ( - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", ["a"]), - TabularDataset({"a": [1, 1, 3], "b": [4, 5, 6]}, "b", ["a"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b"), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b"), + ), + ( + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b"), + TabularDataset({"a": [1, 1, 3], "b": [4, 5, 6]}, "b"), ), ], ids=[ @@ -29,20 +32,23 @@ def test_should_return_same_hash_for_equal_tabular_datasets(table1: TabularDatas ("table1", "table2"), [ ( - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", ["a"]), - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "c", ["a"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", ["c"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "c", ["b"]), ), ( - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", ["a"]), - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "d": [7, 8, 9]}, "b", ["a"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", ["c"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "d": [7, 8, 9]}, "b", ["d"]), ), ( - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", ["a"]), - TabularDataset({"a": ["1", "2", "3"], "b": [4, 5, 6]}, "b", ["a"]), + TabularDataset( + {"a": [1, 2, 3], "b": [4, 5, 6]}, + "b", + ), + TabularDataset({"a": ["1", "2", "3"], "b": [4, 5, 6]}, "b"), ), ( - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", ["a"]), TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", ["c"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", ["a"]), ), ], ids=[ diff --git 
a/tests/safeds/data/labeled/containers/_tabular_dataset/test_init.py b/tests/safeds/data/labeled/containers/_tabular_dataset/test_init.py index 9e6116e1a..0ac34111e 100644 --- a/tests/safeds/data/labeled/containers/_tabular_dataset/test_init.py +++ b/tests/safeds/data/labeled/containers/_tabular_dataset/test_init.py @@ -5,7 +5,7 @@ @pytest.mark.parametrize( - ("data", "target_name", "feature_names", "error", "error_msg"), + ("data", "target_name", "extra_names", "error", "error_msg"), [ ( { @@ -15,7 +15,7 @@ "T": [0, 1], }, "T", - ["A", "B", "C", "D", "E"], + ["D", "E"], UnknownColumnNameError, r"Could not find column\(s\) 'D, E'", ), @@ -27,7 +27,7 @@ "T": [0, 1], }, "D", - ["A", "B", "C"], + [], UnknownColumnNameError, r"Could not find column\(s\) 'D'", ), @@ -39,9 +39,9 @@ "T": [0, 1], }, "A", - ["A", "B", "C"], + ["A"], ValueError, - r"Column 'A' cannot be both feature and target.", + r"Column 'A' cannot be both target and extra.", ), ( { @@ -50,42 +50,114 @@ "C": [3, 6], "T": [0, 1], }, - "D", - [], + "T", + ["A", "B", "C"], ValueError, - r"At least one feature column must be specified.", + r"At least one feature column must remain.", ), ( { "A": [1, 4], }, "A", - None, + [], + ValueError, + r"At least one feature column must remain.", + ), + ( + Table( + { + "A": [1, 4], + "B": [2, 5], + "C": [3, 6], + "T": [0, 1], + }, + ), + "T", + ["D", "E"], + UnknownColumnNameError, + r"Could not find column\(s\) 'D, E'", + ), + ( + Table( + { + "A": [1, 4], + "B": [2, 5], + "C": [3, 6], + "T": [0, 1], + }, + ), + "D", + [], + UnknownColumnNameError, + r"Could not find column\(s\) 'D'", + ), + ( + Table( + { + "A": [1, 4], + "B": [2, 5], + "C": [3, 6], + "T": [0, 1], + }, + ), + "A", + ["A"], ValueError, - r"At least one feature column must be specified.", + r"Column 'A' cannot be both target and extra.", + ), + ( + Table( + { + "A": [1, 4], + "B": [2, 5], + "C": [3, 6], + "T": [0, 1], + }, + ), + "T", + ["A", "B", "C"], + ValueError, + r"At least one feature column must remain.", + ), + ( + Table( + { + "A": [1, 4], + }, + ), + "A", + [], + ValueError, + r"At least one feature column must remain.", ), ], ids=[ - "feature_does_not_exist", - "target_does_not_exist", - "target_and_feature_overlap", - "features_are_empty-explicitly", - "features_are_empty_implicitly", + "dict_extra_does_not_exist", + "dict_target_does_not_exist", + "dict_target_and_extra_overlap", + "dict_features_are_empty_explicitly", + "dict_features_are_empty_implicitly", + "table_extra_does_not_exist", + "table_target_does_not_exist", + "table_target_and_extra_overlap", + "table_features_are_empty_explicitly", + "table_features_are_empty_implicitly", ], ) def test_should_raise_error( data: dict[str, list[int]], target_name: str, - feature_names: list[str] | None, + extra_names: list[str] | None, error: type[Exception], error_msg: str, ) -> None: with pytest.raises(error, match=error_msg): - TabularDataset(data, target_name=target_name, feature_names=feature_names) + TabularDataset(data, target_name=target_name, extra_names=extra_names) @pytest.mark.parametrize( - ("data", "target_name", "feature_names"), + ("data", "target_name", "extra_names"), [ ( { @@ -95,7 +167,7 @@ def test_should_raise_error( "T": [0, 1], }, "T", - ["A", "B", "C"], + [], ), ( { @@ -117,24 +189,66 @@ def test_should_raise_error( "T", None, ), + ( + Table( + { + "A": [1, 4], + "B": [2, 5], + "C": [3, 6], + "T": [0, 1], + }, + ), + "T", + [], + ), + ( + Table( + { + "A": [1, 4], + "B": [2, 5], + "C": [3, 6], + "T": [0, 1], + }, + ), + "T", + ["A", "C"], + 
), + ( + Table( + { + "A": [1, 4], + "B": [2, 5], + "C": [3, 6], + "T": [0, 1], + }, + ), + "T", + None, + ), ], ids=[ - "create_tabular_dataset", - "tabular_dataset_not_all_columns_are_features", - "tabular_dataset_with_feature_names_as_None", + "dict_create_tabular_dataset", + "dict_tabular_dataset_not_all_columns_are_features", + "dict_tabular_dataset_with_extra_names_as_None", + "table_create_tabular_dataset", + "table_tabular_dataset_not_all_columns_are_features", + "table_tabular_dataset_with_extra_names_as_None", ], ) def test_should_create_a_tabular_dataset( - data: dict[str, list[int]], + data: Table | dict[str, list[int]], target_name: str, - feature_names: list[str] | None, + extra_names: list[str] | None, ) -> None: - tabular_dataset = TabularDataset(data, target_name=target_name, feature_names=feature_names) - if feature_names is None: - feature_names = list(data.keys()) - feature_names.remove(target_name) + tabular_dataset = TabularDataset(data, target_name=target_name, extra_names=extra_names) + if not isinstance(data, Table): + data = Table(data) + + if extra_names is None: + extra_names = [] + assert isinstance(tabular_dataset, TabularDataset) - assert tabular_dataset._features.column_names == feature_names + assert tabular_dataset._extras.column_names == extra_names assert tabular_dataset._target.name == target_name - assert tabular_dataset._features == Table(data).keep_only_columns(feature_names) - assert tabular_dataset._target == Table(data).get_column(target_name) + assert tabular_dataset._extras == data.keep_only_columns(extra_names) + assert tabular_dataset._target == data.get_column(target_name) diff --git a/tests/safeds/data/labeled/containers/_tabular_dataset/test_into_dataloader.py b/tests/safeds/data/labeled/containers/_tabular_dataset/test_into_dataloader.py index bc35f883c..a2a217401 100644 --- a/tests/safeds/data/labeled/containers/_tabular_dataset/test_into_dataloader.py +++ b/tests/safeds/data/labeled/containers/_tabular_dataset/test_into_dataloader.py @@ -4,7 +4,7 @@ @pytest.mark.parametrize( - ("data", "target_name", "feature_names"), + ("data", "target_name", "extra_names"), [ ( { @@ -14,7 +14,7 @@ "T": [0, 1], }, "T", - ["A", "B", "C"], + [], ), ], ids=[ @@ -24,8 +24,8 @@ def test_should_create_dataloader( data: dict[str, list[int]], target_name: str, - feature_names: list[str] | None, + extra_names: list[str] | None, ) -> None: - tabular_dataset = Table.from_dict(data).to_tabular_dataset(target_name, feature_names) + tabular_dataset = Table.from_dict(data).to_tabular_dataset(target_name, extra_names) data_loader = tabular_dataset._into_dataloader_with_classes(1, 2) assert isinstance(data_loader, DataLoader) diff --git a/tests/safeds/data/labeled/containers/_tabular_dataset/test_sizeof.py b/tests/safeds/data/labeled/containers/_tabular_dataset/test_sizeof.py index aeae315e0..a7097deec 100644 --- a/tests/safeds/data/labeled/containers/_tabular_dataset/test_sizeof.py +++ b/tests/safeds/data/labeled/containers/_tabular_dataset/test_sizeof.py @@ -14,7 +14,6 @@ "target": [1, 3, 2], }, "target", - ["feature_1", "feature_2"], ), TabularDataset( { @@ -24,10 +23,10 @@ "target": [1, 3, 2], }, "target", - ["feature_1", "feature_2"], + ["other"], ), ], - ids=["normal", "table_with_column_as_non_feature"], + ids=["normal", "table_with_extra_column"], ) def test_should_size_be_greater_than_normal_object(tabular_dataset: TabularDataset) -> None: assert sys.getsizeof(tabular_dataset) > sys.getsizeof(object()) diff --git 
a/tests/safeds/data/labeled/containers/_tabular_dataset/test_to_table.py b/tests/safeds/data/labeled/containers/_tabular_dataset/test_to_table.py index accf240b3..71e9d5db8 100644 --- a/tests/safeds/data/labeled/containers/_tabular_dataset/test_to_table.py +++ b/tests/safeds/data/labeled/containers/_tabular_dataset/test_to_table.py @@ -14,7 +14,6 @@ "target": [1, 3, 2], }, "target", - ["feature_1", "feature_2"], ), Table( { @@ -33,7 +32,7 @@ "target": [1, 3, 2], }, "target", - ["feature_1", "feature_2"], + ["other"], ), Table( { @@ -45,7 +44,7 @@ ), ), ], - ids=["normal", "table_with_column_as_non_feature"], + ids=["normal", "table_with_extra_column"], ) def test_should_return_table(tabular_dataset: TabularDataset, expected: Table) -> None: table = tabular_dataset.to_table() diff --git a/tests/safeds/data/tabular/containers/_time_series/test_eq.py b/tests/safeds/data/tabular/containers/_time_series/test_eq.py index bde59d432..0e39f828f 100644 --- a/tests/safeds/data/tabular/containers/_time_series/test_eq.py +++ b/tests/safeds/data/tabular/containers/_time_series/test_eq.py @@ -87,7 +87,7 @@ def test_should_return_true_if_objects_are_identical(table1: TimeSeries) -> None (TimeSeries({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", "c", ["a"]), Table()), ( TimeSeries({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", "c", ["a"]), - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", ["a"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b"), ), ], ids=[ diff --git a/tests/safeds/data/tabular/containers/_time_series/test_from_tagged_table.py b/tests/safeds/data/tabular/containers/_time_series/test_from_tagged_table.py deleted file mode 100644 index 0cc96a575..000000000 --- a/tests/safeds/data/tabular/containers/_time_series/test_from_tagged_table.py +++ /dev/null @@ -1,199 +0,0 @@ -import pytest -from safeds.data.labeled.containers import TabularDataset -from safeds.data.tabular.containers import Table, TimeSeries -from safeds.exceptions import UnknownColumnNameError - - -@pytest.mark.parametrize( - ("table", "target_name", "time_name", "feature_names", "error", "error_msg"), - [ - ( - Table( - { - "time": [0, 1], - "A": [1, 4], - "B": [2, 5], - "C": [3, 6], - "T": [0, 1], - }, - ), - "T", - "time", - ["A", "B", "C", "D", "E"], - UnknownColumnNameError, - r"Could not find column\(s\) 'D, E'", - ), - ( - Table( - { - "time": [0, 1], - "A": [1, 4], - "B": [2, 5], - "C": [3, 6], - "T": [0, 1], - }, - ), - "D", - "time", - ["A", "B", "C"], - UnknownColumnNameError, - r"Could not find column\(s\) 'D'", - ), - ( - Table( - { - "time": [0, 1], - "A": [1, 4], - "B": [2, 5], - "C": [3, 6], - "T": [0, 1], - }, - ), - "A", - "time", - ["A", "B", "C"], - ValueError, - r"Column 'A' cannot be both feature and target.", - ), - ( - Table( - { - "time": [0, 1], - "A": [1, 4], - "B": [2, 5], - "C": [3, 6], - "T": [0, 1], - }, - ), - "A", - "time", - [], - ValueError, - r"At least one feature column must be specified.", - ), - ( - Table( - { - "time": [0, 1], - "A": [1, 4], - "B": [2, 5], - "C": [3, 6], - "T": [0, 1], - }, - ), - "time", - "time", - ["A", "B", "C"], - ValueError, - r"Column 'time' cannot be both time column and target.", - ), - ( - Table( - { - "r": [0, 1], - "A": [1, 4], - "B": [2, 5], - "C": [3, 6], - "T": [0, 1], - }, - ), - "T", - "time", - ["A", "B", "C"], - UnknownColumnNameError, - r"Could not find column\(s\) 'time'", - ), - ], - ids=[ - "feature_does_not_exist", - "target_does_not_exist", - "target_and_feature_overlap", - "features_are_empty-explicitly", - 
"time_name_is_target", - "time_does_not_exist", - ], -) -def test_should_raise_error( - table: Table, - target_name: str, - time_name: str, - feature_names: list[str] | None, - error: type[Exception], - error_msg: str, -) -> None: - with pytest.raises(error, match=error_msg): - TimeSeries._from_tabular_dataset( - TabularDataset._from_table(table, target_name=target_name, feature_names=feature_names), - time_name=time_name, - ) - - -@pytest.mark.parametrize( - ("table", "target_name", "time_name", "feature_names"), - [ - ( - Table( - { - "time": [0, 1], - "A": [1, 4], - "B": [2, 5], - "C": [3, 6], - "T": [0, 1], - }, - ), - "T", - "time", - ["A", "B", "C"], - ), - ( - Table( - { - "time": [0, 1], - "A": [1, 4], - "B": [2, 5], - "C": [3, 6], - "T": [0, 1], - }, - ), - "T", - "time", - ["A", "C"], - ), - ( - Table( - { - "time": [0, 1], - "A": [1, 4], - "B": [2, 5], - "C": [3, 6], - "T": [0, 1], - }, - ), - "T", - "time", - None, - ), - ], - ids=[ - "create_tabular_dataset", - "tabular_dataset_not_all_columns_are_features", - "tabular_dataset_with_feature_names_as_None", - ], -) -def test_should_create_a_time_series( - table: Table, - target_name: str, - time_name: str, - feature_names: list[str] | None, -) -> None: - tabular_dataset = TabularDataset._from_table(table, target_name=target_name, feature_names=feature_names) - time_series = TimeSeries._from_tabular_dataset(tabular_dataset, time_name=time_name) - feature_names = ( - feature_names if feature_names is not None else table.remove_columns([target_name, time_name]).column_names - ) - assert isinstance(time_series, TimeSeries) - assert time_series._features.column_names == feature_names - assert time_series._target.name == target_name - assert time_series._features == table.keep_only_columns(feature_names) - assert time_series._target == table.get_column(target_name) - assert time_series.time == table.get_column(time_name) diff --git a/tests/safeds/ml/classical/classification/test_ada_boost.py b/tests/safeds/ml/classical/classification/test_ada_boost.py index 22bf98567..91fde488b 100644 --- a/tests/safeds/ml/classical/classification/test_ada_boost.py +++ b/tests/safeds/ml/classical/classification/test_ada_boost.py @@ -8,7 +8,7 @@ @pytest.fixture() def training_set() -> TabularDataset: table = Table({"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}) - return table.to_tabular_dataset(target_name="col1", feature_names=["col2"]) + return table.to_tabular_dataset(target_name="col1") class TestLearner: diff --git a/tests/safeds/ml/classical/classification/test_classifier.py b/tests/safeds/ml/classical/classification/test_classifier.py index a08ae7b5f..5d8803c8a 100644 --- a/tests/safeds/ml/classical/classification/test_classifier.py +++ b/tests/safeds/ml/classical/classification/test_classifier.py @@ -63,7 +63,7 @@ def valid_data() -> TabularDataset: "feat2": [3, 6], "target": [0, 1], }, - ).to_tabular_dataset(target_name="target", feature_names=["feat1", "feat2"]) + ).to_tabular_dataset(target_name="target", extra_names=["id"]) @pytest.mark.parametrize("classifier", classifiers(), ids=lambda x: x.__class__.__name__) @@ -93,7 +93,7 @@ def test_should_not_change_input_table(self, classifier: Classifier, request: Fi "feat2": [3, 6], "target": [0, 1], }, - ).to_tabular_dataset(target_name="target", feature_names=["feat1", "feat2"]), + ).to_tabular_dataset(target_name="target", extra_names=["id"]), NonNumericColumnError, ( r"Tried to do a numerical operation on one or multiple non-numerical columns: \n\{'feat1'\}\nYou" @@ -110,7 +110,7 @@ def 
test_should_not_change_input_table(self, classifier: Classifier, request: Fi "feat2": [3, 6], "target": [0, 1], }, - ).to_tabular_dataset(target_name="target", feature_names=["feat1", "feat2"]), + ).to_tabular_dataset(target_name="target", extra_names=["id"]), MissingValuesColumnError, ( r"Tried to do an operation on one or multiple columns containing missing values: \n\{'feat1'\}\nYou" @@ -127,7 +127,7 @@ def test_should_not_change_input_table(self, classifier: Classifier, request: Fi "feat2": [], "target": [], }, - ).to_tabular_dataset(target_name="target", feature_names=["feat1", "feat2"]), + ).to_tabular_dataset(target_name="target", extra_names=["id"]), DatasetMissesDataError, r"Dataset contains no rows", ), diff --git a/tests/safeds/ml/classical/classification/test_gradient_boosting.py b/tests/safeds/ml/classical/classification/test_gradient_boosting.py index 5306daa23..c48ecd15d 100644 --- a/tests/safeds/ml/classical/classification/test_gradient_boosting.py +++ b/tests/safeds/ml/classical/classification/test_gradient_boosting.py @@ -8,7 +8,7 @@ @pytest.fixture() def training_set() -> TabularDataset: table = Table({"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}) - return table.to_tabular_dataset(target_name="col1", feature_names=["col2"]) + return table.to_tabular_dataset(target_name="col1") class TestNumberOfTrees: diff --git a/tests/safeds/ml/classical/classification/test_k_nearest_neighbors.py b/tests/safeds/ml/classical/classification/test_k_nearest_neighbors.py index b2f17dda6..775ceb51a 100644 --- a/tests/safeds/ml/classical/classification/test_k_nearest_neighbors.py +++ b/tests/safeds/ml/classical/classification/test_k_nearest_neighbors.py @@ -8,7 +8,7 @@ @pytest.fixture() def training_set() -> TabularDataset: table = Table({"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}) - return table.to_tabular_dataset(target_name="col1", feature_names=["col2"]) + return table.to_tabular_dataset(target_name="col1") class TestNumberOfNeighbors: diff --git a/tests/safeds/ml/classical/classification/test_random_forest.py b/tests/safeds/ml/classical/classification/test_random_forest.py index 6edee8743..14e87e6a0 100644 --- a/tests/safeds/ml/classical/classification/test_random_forest.py +++ b/tests/safeds/ml/classical/classification/test_random_forest.py @@ -8,7 +8,7 @@ @pytest.fixture() def training_set() -> TabularDataset: table = Table({"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}) - return table.to_tabular_dataset(target_name="col1", feature_names=["col2"]) + return table.to_tabular_dataset(target_name="col1") class TestNumberOfTrees: diff --git a/tests/safeds/ml/classical/classification/test_support_vector_machine.py b/tests/safeds/ml/classical/classification/test_support_vector_machine.py index 21df23650..2a19fd80a 100644 --- a/tests/safeds/ml/classical/classification/test_support_vector_machine.py +++ b/tests/safeds/ml/classical/classification/test_support_vector_machine.py @@ -31,7 +31,7 @@ def kernels() -> list[SupportVectorMachineKernel]: @pytest.fixture() def training_set() -> TabularDataset: table = Table({"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}) - return table.to_tabular_dataset(target_name="col1", feature_names=["col2"]) + return table.to_tabular_dataset(target_name="col1") class TestC: diff --git a/tests/safeds/ml/classical/regression/test_ada_boost.py b/tests/safeds/ml/classical/regression/test_ada_boost.py index fb0b50989..44cfcbd83 100644 --- a/tests/safeds/ml/classical/regression/test_ada_boost.py +++ b/tests/safeds/ml/classical/regression/test_ada_boost.py @@ -8,7 +8,7 @@ 
@pytest.fixture() def training_set() -> TabularDataset: table = Table({"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}) - return table.to_tabular_dataset(target_name="col1", feature_names=["col2"]) + return table.to_tabular_dataset(target_name="col1") class TestLearner: diff --git a/tests/safeds/ml/classical/regression/test_elastic_net_regression.py b/tests/safeds/ml/classical/regression/test_elastic_net_regression.py index 1c8041a32..66f10699d 100644 --- a/tests/safeds/ml/classical/regression/test_elastic_net_regression.py +++ b/tests/safeds/ml/classical/regression/test_elastic_net_regression.py @@ -8,7 +8,7 @@ @pytest.fixture() def training_set() -> TabularDataset: table = Table({"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}) - return table.to_tabular_dataset(target_name="col1", feature_names=["col2"]) + return table.to_tabular_dataset(target_name="col1") class TestAlpha: diff --git a/tests/safeds/ml/classical/regression/test_gradient_boosting.py b/tests/safeds/ml/classical/regression/test_gradient_boosting.py index 931493dc6..f1ef8549d 100644 --- a/tests/safeds/ml/classical/regression/test_gradient_boosting.py +++ b/tests/safeds/ml/classical/regression/test_gradient_boosting.py @@ -8,7 +8,7 @@ @pytest.fixture() def training_set() -> TabularDataset: table = Table({"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}) - return table.to_tabular_dataset(target_name="col1", feature_names=["col2"]) + return table.to_tabular_dataset(target_name="col1") class TestNumberOfTrees: diff --git a/tests/safeds/ml/classical/regression/test_k_nearest_neighbors.py b/tests/safeds/ml/classical/regression/test_k_nearest_neighbors.py index e2578db2a..a01e27f0b 100644 --- a/tests/safeds/ml/classical/regression/test_k_nearest_neighbors.py +++ b/tests/safeds/ml/classical/regression/test_k_nearest_neighbors.py @@ -8,7 +8,7 @@ @pytest.fixture() def training_set() -> TabularDataset: table = Table({"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}) - return table.to_tabular_dataset(target_name="col1", feature_names=["col2"]) + return table.to_tabular_dataset(target_name="col1") class TestNumberOfNeighbors: diff --git a/tests/safeds/ml/classical/regression/test_lasso_regression.py b/tests/safeds/ml/classical/regression/test_lasso_regression.py index 1a9a7d191..90d771b16 100644 --- a/tests/safeds/ml/classical/regression/test_lasso_regression.py +++ b/tests/safeds/ml/classical/regression/test_lasso_regression.py @@ -8,7 +8,7 @@ @pytest.fixture() def training_set() -> TabularDataset: table = Table({"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}) - return table.to_tabular_dataset(target_name="col1", feature_names=["col2"]) + return table.to_tabular_dataset(target_name="col1") class TestAlpha: diff --git a/tests/safeds/ml/classical/regression/test_random_forest.py b/tests/safeds/ml/classical/regression/test_random_forest.py index cb35759a5..2f5f97579 100644 --- a/tests/safeds/ml/classical/regression/test_random_forest.py +++ b/tests/safeds/ml/classical/regression/test_random_forest.py @@ -8,7 +8,7 @@ @pytest.fixture() def training_set() -> TabularDataset: table = Table({"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}) - return table.to_tabular_dataset(target_name="col1", feature_names=["col2"]) + return table.to_tabular_dataset(target_name="col1") class TestNumberOfTrees: diff --git a/tests/safeds/ml/classical/regression/test_regressor.py b/tests/safeds/ml/classical/regression/test_regressor.py index ca0759119..7d2a2f5b4 100644 --- a/tests/safeds/ml/classical/regression/test_regressor.py +++ b/tests/safeds/ml/classical/regression/test_regressor.py 
@@ -74,7 +74,7 @@ def valid_data() -> TabularDataset: "feat2": [3, 6], "target": [0, 1], }, - ).to_tabular_dataset(target_name="target", feature_names=["feat1", "feat2"]) + ).to_tabular_dataset(target_name="target", extra_names=["id"]) @pytest.mark.parametrize("regressor", regressors(), ids=lambda x: x.__class__.__name__) @@ -104,7 +104,7 @@ def test_should_not_change_input_table(self, regressor: Regressor, request: Fixt "feat2": [3, 6], "target": [0, 1], }, - ).to_tabular_dataset(target_name="target", feature_names=["feat1", "feat2"]), + ).to_tabular_dataset(target_name="target", extra_names=["id"]), NonNumericColumnError, r"Tried to do a numerical operation on one or multiple non-numerical columns: \n\{'feat1'\}", ), @@ -116,7 +116,7 @@ def test_should_not_change_input_table(self, regressor: Regressor, request: Fixt "feat2": [3, 6], "target": [0, 1], }, - ).to_tabular_dataset(target_name="target", feature_names=["feat1", "feat2"]), + ).to_tabular_dataset(target_name="target", extra_names=["id"]), MissingValuesColumnError, r"Tried to do an operation on one or multiple columns containing missing values: \n\{'feat1'\}", ), @@ -128,7 +128,7 @@ def test_should_not_change_input_table(self, regressor: Regressor, request: Fixt "feat2": [], "target": [], }, - ).to_tabular_dataset(target_name="target", feature_names=["feat1", "feat2"]), + ).to_tabular_dataset(target_name="target", extra_names=["id"]), DatasetMissesDataError, r"Dataset contains no rows", ), diff --git a/tests/safeds/ml/classical/regression/test_ridge_regression.py b/tests/safeds/ml/classical/regression/test_ridge_regression.py index c30e17e31..3dd2054ce 100644 --- a/tests/safeds/ml/classical/regression/test_ridge_regression.py +++ b/tests/safeds/ml/classical/regression/test_ridge_regression.py @@ -8,7 +8,7 @@ @pytest.fixture() def training_set() -> TabularDataset: table = Table({"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}) - return table.to_tabular_dataset(target_name="col1", feature_names=["col2"]) + return table.to_tabular_dataset(target_name="col1") class TestAlpha: diff --git a/tests/safeds/ml/classical/regression/test_support_vector_machine.py b/tests/safeds/ml/classical/regression/test_support_vector_machine.py index 6ed483b9d..a2015964c 100644 --- a/tests/safeds/ml/classical/regression/test_support_vector_machine.py +++ b/tests/safeds/ml/classical/regression/test_support_vector_machine.py @@ -31,7 +31,7 @@ def kernels() -> list[SupportVectorMachineKernel]: @pytest.fixture() def training_set() -> TabularDataset: table = Table({"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}) - return table.to_tabular_dataset(target_name="col1", feature_names=["col2"]) + return table.to_tabular_dataset(target_name="col1") class TestC:
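
Usage example (minimal sketch, not part of the diff above): the snippet illustrates the new `extra_names` parameter and the `extras` property introduced by this patch. The table contents and column names are made up for illustration.

```python
# Minimal sketch of the `extras` workflow after this patch (illustrative data).
from safeds.data.tabular.containers import Table

table = Table(
    {
        "id": [1, 2, 3],         # neither feature nor target -> listed as an extra
        "feature_1": [4, 5, 6],  # implicitly a feature (not target, not extra)
        "feature_2": [7, 8, 9],  # implicitly a feature
        "target": [0, 1, 0],     # column to predict
    },
)

# Before this patch: table.to_tabular_dataset("target", feature_names=["feature_1", "feature_2"])
dataset = table.to_tabular_dataset("target", extra_names=["id"])

print(dataset.features.column_names)  # ['feature_1', 'feature_2']
print(dataset.target.name)            # 'target'
print(dataset.extras.column_names)    # ['id']
```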