From 8209ffad6f49a6133c07e6c1435ca67027d69ef6 Mon Sep 17 00:00:00 2001
From: nicklamiller
Date: Sun, 11 Feb 2024 22:03:15 -0800
Subject: [PATCH 01/14] expose feature_name_ via sklearn consistent attribute feature_names_in_

---
 python-package/lightgbm/sklearn.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 9f1a62f542ca..1a569fc62afe 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -1119,6 +1119,11 @@ def feature_name_(self) -> List[str]:
         if not self.__sklearn_is_fitted__():
             raise LGBMNotFittedError("No feature_name found. Need to call fit beforehand.")
         return self._Booster.feature_name()  # type: ignore[union-attr]
+
+    @property
+    def feature_names_in_(self) -> List[str]:
+        """:obj:`list` of shape = [n_features]: Sklearn-style property for feature names."""
+        return self.feature_name_
 
 
 class LGBMRegressor(_LGBMRegressorBase, LGBMModel):

From 52835d802d523c30b7914f927079d4c482d58cd1 Mon Sep 17 00:00:00 2001
From: nicklamiller
Date: Mon, 12 Feb 2024 20:52:24 -0800
Subject: [PATCH 02/14] fix docstring

---
 python-package/lightgbm/sklearn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 1a569fc62afe..d3ac917522c0 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -1122,7 +1122,7 @@ def feature_name_(self) -> List[str]:
 
     @property
     def feature_names_in_(self) -> List[str]:
-        """:obj:`list` of shape = [n_features]: Sklearn-style property for feature names."""
+        """:obj:`list` of shape = [n_features]: The names of features."""
         return self.feature_name_
 
 

From adc76834ee0b55b03f1ab6fa60259e7062f2b6f8 Mon Sep 17 00:00:00 2001
From: nicklamiller
Date: Mon, 12 Feb 2024 20:53:21 -0800
Subject: [PATCH 03/14] raise error if estimator not fitted

---
 python-package/lightgbm/sklearn.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index d3ac917522c0..d0200eec5714 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -1123,6 +1123,8 @@ def feature_name_(self) -> List[str]:
     @property
     def feature_names_in_(self) -> List[str]:
         """:obj:`list` of shape = [n_features]: The names of features."""
+        if not self.__sklearn_is_fitted__():
+            raise LGBMNotFittedError('No feature_names_in_ found. Need to call fit beforehand.')
         return self.feature_name_
 
 

From 08e67aaa656c84a03344dd99ee41e8a257bd49c6 Mon Sep 17 00:00:00 2001
From: nicklamiller
Date: Sun, 17 Mar 2024 14:58:24 -0700
Subject: [PATCH 04/14] ensure exact feature match for feature_names_in_ attribute

---
 tests/python_package_test/test_sklearn.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index 2fc127b5232d..40c648446cfe 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -1397,6 +1397,7 @@ def test_validate_features(task):
     else:
         model.fit(df, y)
     assert model.feature_name_ == features
+    assert model.feature_names_in_ == features
 
     # try to predict with a different feature
     df2 = df.rename(columns={"x2": "z"})

From 0ecc3371795764e066c202760a5a1f56e5bb80ed Mon Sep 17 00:00:00 2001
From: nicklamiller
Date: Thu, 28 Mar 2024 11:38:42 -0700
Subject: [PATCH 05/14] add test for numpy input

---
 tests/python_package_test/test_sklearn.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index 40c648446cfe..86975ee42e71 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -1276,6 +1276,17 @@ def test_check_is_fitted():
     check_is_fitted(model)
 
 
+def test_getting_feature_names_in_np_input():
+    X, y = load_digits(n_class=2, return_X_y=True)
+    est = lgb.LGBMModel(n_estimators=5, objective="binary")
+    with pytest.raises(lgb.compat.LGBMNotFittedError):
+        est.feature_names_in_
+    est.fit(X, y)
+    assert est.feature_names_in_ == [
+        f"Column_{i}" for i in range(X.shape[1])
+    ]
+
+
 @parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()])
 def test_sklearn_integration(estimator, check):
     estimator.set_params(min_child_samples=1, min_data_in_bin=1)

From c110c9d7fcb3a5fc935671625c633c311a976708 Mon Sep 17 00:00:00 2001
From: nicklamiller
Date: Thu, 28 Mar 2024 11:50:27 -0700
Subject: [PATCH 06/14] add test for pandas input with feature names

---
 tests/python_package_test/test_sklearn.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index 86975ee42e71..f009acf74b1f 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -1277,6 +1277,8 @@ def test_check_is_fitted():
 
 
 def test_getting_feature_names_in_np_input():
+    # input is a numpy array, which doesn't have feature names. LightGBM adds
+    # feature names to the fitted model, which is inconsistent with sklearn's behavior
     X, y = load_digits(n_class=2, return_X_y=True)
     est = lgb.LGBMModel(n_estimators=5, objective="binary")
     with pytest.raises(lgb.compat.LGBMNotFittedError):
@@ -1287,6 +1289,16 @@ def test_getting_feature_names_in_np_input():
     ]
 
 
+def test_getting_feature_names_in_pd_input():
+    # as_frame=True means input has column names and these should propagate to fitted model
+    X, y = load_digits(n_class=2, return_X_y=True, as_frame=True)
+    est = lgb.LGBMModel(n_estimators=5, objective="binary")
+    with pytest.raises(lgb.compat.LGBMNotFittedError):
+        est.feature_names_in_
+    est.fit(X, y)
+    assert est.feature_names_in_ == list(X.columns)
+
+
 @parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()])
 def test_sklearn_integration(estimator, check):
     estimator.set_params(min_child_samples=1, min_data_in_bin=1)

From a8a56314a0a0fef13c04c76b31da9231feac2c5a Mon Sep 17 00:00:00 2001
From: nicklamiller
Date: Thu, 28 Mar 2024 11:54:33 -0700
Subject: [PATCH 07/14] add documentation for when input data has no feature names

---
 python-package/lightgbm/sklearn.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index d0200eec5714..72da3d2a4fb6 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -1122,7 +1122,12 @@ def feature_name_(self) -> List[str]:
 
     @property
     def feature_names_in_(self) -> List[str]:
-        """:obj:`list` of shape = [n_features]: The names of features."""
+        """:obj:`list` of shape = [n_features]: The names of features.
+
+        .. note::
+
+            If input does not contain feature names, they will be added during fitting in the format ``Column_0``, ``Column_1``, ..., ``Column_N``.
+        """
         if not self.__sklearn_is_fitted__():
             raise LGBMNotFittedError('No feature_names_in_ found. Need to call fit beforehand.')
         return self.feature_name_

From 4e1f1dc5bd7afae274e3ace3130b7e61c317bc0d Mon Sep 17 00:00:00 2001
From: nicklamiller
Date: Thu, 28 Mar 2024 11:56:43 -0700
Subject: [PATCH 08/14] pre-commit fixes

---
 python-package/lightgbm/sklearn.py        | 8 ++++----
 tests/python_package_test/test_sklearn.py | 4 +---
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 72da3d2a4fb6..7212029444a4 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -1119,17 +1119,17 @@ def feature_name_(self) -> List[str]:
         if not self.__sklearn_is_fitted__():
             raise LGBMNotFittedError("No feature_name found. Need to call fit beforehand.")
         return self._Booster.feature_name()  # type: ignore[union-attr]
-
+
     @property
     def feature_names_in_(self) -> List[str]:
         """:obj:`list` of shape = [n_features]: The names of features.
-
+
         .. note::
 
             If input does not contain feature names, they will be added during fitting in the format ``Column_0``, ``Column_1``, ..., ``Column_N``.
         """
-        if not self.__sklearn_is_fitted__():
-            raise LGBMNotFittedError('No feature_names_in_ found. Need to call fit beforehand.')
+        if not self.__sklearn_is_fitted__():
+            raise LGBMNotFittedError("No feature_names_in_ found. Need to call fit beforehand.")
         return self.feature_name_
 
 
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index f009acf74b1f..c25bfb2dd239 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -1284,9 +1284,7 @@ def test_getting_feature_names_in_np_input():
     with pytest.raises(lgb.compat.LGBMNotFittedError):
         est.feature_names_in_
     est.fit(X, y)
-    assert est.feature_names_in_ == [
-        f"Column_{i}" for i in range(X.shape[1])
-    ]
+    assert est.feature_names_in_ == [f"Column_{i}" for i in range(X.shape[1])]
 
 
 def test_getting_feature_names_in_pd_input():

From b826426a4bdfeeebaad9779005365a8c554842c9 Mon Sep 17 00:00:00 2001
From: nicklamiller
Date: Thu, 30 May 2024 19:57:40 -0700
Subject: [PATCH 09/14] feature_names_in_ returns a 1D numpy array

---
 python-package/lightgbm/sklearn.py        | 4 ++--
 tests/python_package_test/test_sklearn.py | 7 +++----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 7212029444a4..d7c00c517c7d 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -1121,7 +1121,7 @@ def feature_name_(self) -> List[str]:
         return self._Booster.feature_name()  # type: ignore[union-attr]
 
     @property
-    def feature_names_in_(self) -> List[str]:
+    def feature_names_in_(self) -> np.ndarray:
         """:obj:`list` of shape = [n_features]: The names of features.
 
         .. note::
@@ -1130,7 +1130,7 @@ def feature_names_in_(self) -> List[str]:
         """
         if not self.__sklearn_is_fitted__():
             raise LGBMNotFittedError("No feature_names_in_ found. Need to call fit beforehand.")
-        return self.feature_name_
+        return np.array(self.feature_name_)
 
 
 class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index c25bfb2dd239..d7b18b25611a 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -1282,9 +1282,9 @@ def test_getting_feature_names_in_np_input():
     X, y = load_digits(n_class=2, return_X_y=True)
     est = lgb.LGBMModel(n_estimators=5, objective="binary")
     with pytest.raises(lgb.compat.LGBMNotFittedError):
-        est.feature_names_in_
+        check_is_fitted(est)
     est.fit(X, y)
-    assert est.feature_names_in_ == [f"Column_{i}" for i in range(X.shape[1])]
+    np.testing.assert_array_equal(est.feature_names_in_, np.array([f"Column_{i}" for i in range(X.shape[1])]))
 
 
 def test_getting_feature_names_in_pd_input():
@@ -1294,7 +1294,7 @@ def test_getting_feature_names_in_pd_input():
     with pytest.raises(lgb.compat.LGBMNotFittedError):
         est.feature_names_in_
     est.fit(X, y)
-    assert est.feature_names_in_ == list(X.columns)
+    np.testing.assert_array_equal(est.feature_names_in_, X.columns)
 
 
 @parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()])
@@ -1418,7 +1418,6 @@ def test_validate_features(task):
     else:
         model.fit(df, y)
     assert model.feature_name_ == features
-    assert model.feature_names_in_ == features
 
     # try to predict with a different feature
     df2 = df.rename(columns={"x2": "z"})

From fd1ce7c25051edffb8ed385cec5fa2831ed448ea Mon Sep 17 00:00:00 2001
From: nicklamiller
Date: Thu, 30 May 2024 20:24:03 -0700
Subject: [PATCH 10/14] test LGBMModel, LGBMClassifier, LGBMRegressor, LGBMRanker

---
 tests/python_package_test/test_sklearn.py | 36 ++++++++++++++++++-----
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index d7b18b25611a..2c807b8e1dcf 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -1281,20 +1281,40 @@ def test_getting_feature_names_in_np_input():
     # feature names to the fitted model, which is inconsistent with sklearn's behavior
     X, y = load_digits(n_class=2, return_X_y=True)
     est = lgb.LGBMModel(n_estimators=5, objective="binary")
-    with pytest.raises(lgb.compat.LGBMNotFittedError):
-        check_is_fitted(est)
-    est.fit(X, y)
-    np.testing.assert_array_equal(est.feature_names_in_, np.array([f"Column_{i}" for i in range(X.shape[1])]))
+    clf = lgb.LGBMClassifier(n_estimators=5)
+    reg = lgb.LGBMRegressor(n_estimators=5)
+    rnk = lgb.LGBMRanker(n_estimators=5)
+    models = (est, clf, reg, rnk)
+    group = np.full(shape=(X.shape[0] // 2,), fill_value=2)  # Just an example group
+
+    for model in models:
+        with pytest.raises(lgb.compat.LGBMNotFittedError):
+            check_is_fitted(model)
+        if isinstance(model, lgb.LGBMRanker):
+            model.fit(X, y, group=group)
+        else:
+            model.fit(X, y)
+        np.testing.assert_array_equal(model.feature_names_in_, np.array([f"Column_{i}" for i in range(X.shape[1])]))
 
 
 def test_getting_feature_names_in_pd_input():
     # as_frame=True means input has column names and these should propagate to fitted model
     X, y = load_digits(n_class=2, return_X_y=True, as_frame=True)
     est = lgb.LGBMModel(n_estimators=5, objective="binary")
-    with pytest.raises(lgb.compat.LGBMNotFittedError):
-        est.feature_names_in_
-    est.fit(X, y)
-    np.testing.assert_array_equal(est.feature_names_in_, X.columns)
+    clf = lgb.LGBMClassifier(n_estimators=5)
+    reg = lgb.LGBMRegressor(n_estimators=5)
+    rnk = lgb.LGBMRanker(n_estimators=5)
+    models = (est, clf, reg, rnk)
+    group = np.full(shape=(X.shape[0] // 2,), fill_value=2)  # Just an example group
+
+    for model in models:
+        with pytest.raises(lgb.compat.LGBMNotFittedError):
+            check_is_fitted(model)
+        if isinstance(model, lgb.LGBMRanker):
+            model.fit(X, y, group=group)
+        else:
+            model.fit(X, y)
+        np.testing.assert_array_equal(est.feature_names_in_, X.columns)
 
 
 @parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()])

From edd951ac3f770a31e4064eb785e45b49f35d8d74 Mon Sep 17 00:00:00 2001
From: nicklamiller
Date: Thu, 30 May 2024 20:59:14 -0700
Subject: [PATCH 11/14] rearrange feature name property docstrings

---
 python-package/lightgbm/sklearn.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index d7c00c517c7d..c0375018fe57 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -1115,19 +1115,19 @@ def feature_importances_(self) -> np.ndarray:
 
     @property
     def feature_name_(self) -> List[str]:
-        """:obj:`list` of shape = [n_features]: The names of features."""
-        if not self.__sklearn_is_fitted__():
-            raise LGBMNotFittedError("No feature_name found. Need to call fit beforehand.")
-        return self._Booster.feature_name()  # type: ignore[union-attr]
-
-    @property
-    def feature_names_in_(self) -> np.ndarray:
         """:obj:`list` of shape = [n_features]: The names of features.
 
         .. note::
 
             If input does not contain feature names, they will be added during fitting in the format ``Column_0``, ``Column_1``, ..., ``Column_N``.
         """
+        if not self.__sklearn_is_fitted__():
+            raise LGBMNotFittedError("No feature_name found. Need to call fit beforehand.")
+        return self._Booster.feature_name()  # type: ignore[union-attr]
+
+    @property
+    def feature_names_in_(self) -> np.ndarray:
+        """:obj:`array` of shape = [n_features]: scikit-learn compatible version of .feature_name_."""
         if not self.__sklearn_is_fitted__():
             raise LGBMNotFittedError("No feature_names_in_ found. Need to call fit beforehand.")
         return np.array(self.feature_name_)

From 25888c6a79569e91af5e3874b4df36c6458d7d6b Mon Sep 17 00:00:00 2001
From: nicklamiller
Date: Fri, 31 May 2024 19:26:36 -0700
Subject: [PATCH 12/14] add get_feature_names_out method

---
 python-package/lightgbm/sklearn.py        |  6 ++++
 tests/python_package_test/test_sklearn.py | 43 +++++++++++++++++++++++
 2 files changed, 49 insertions(+)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index c0375018fe57..eecf80cbb4a4 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -1014,6 +1014,12 @@ def predict(
             **predict_params,
         )
 
+    def get_feature_names_out(self) -> np.ndarray:
+        """:obj:`array` of shape = [n_features]: Get output features of fitted model."""
+        if not self.__sklearn_is_fitted__():
+            raise LGBMNotFittedError("Output features cannot be determined. Need to call fit beforehand.")
+        return self.feature_names_in_
+
     predict.__doc__ = _lgbmmodel_doc_predict.format(
         description="Return the predicted value for each sample.",
         X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame , scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index 2c807b8e1dcf..c05d2e7248a0 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -1317,6 +1317,49 @@ def test_getting_feature_names_in_pd_input():
     np.testing.assert_array_equal(est.feature_names_in_, X.columns)
 
 
+def test_get_feature_names_out_np_input():
+    # input is a numpy array, which doesn't have feature names. LightGBM adds
+    # feature names to the fitted model, which is inconsistent with sklearn's behavior
+    X, y = load_digits(n_class=2, return_X_y=True)
+    est = lgb.LGBMModel(n_estimators=5, objective="binary")
+    clf = lgb.LGBMClassifier(n_estimators=5)
+    reg = lgb.LGBMRegressor(n_estimators=5)
+    rnk = lgb.LGBMRanker(n_estimators=5)
+    models = (est, clf, reg, rnk)
+    group = np.full(shape=(X.shape[0] // 2,), fill_value=2)  # Just an example group
+
+    for model in models:
+        with pytest.raises(lgb.compat.LGBMNotFittedError):
+            check_is_fitted(model)
+        if isinstance(model, lgb.LGBMRanker):
+            model.fit(X, y, group=group)
+        else:
+            model.fit(X, y)
+        np.testing.assert_array_equal(
+            model.get_feature_names_out(), np.array([f"Column_{i}" for i in range(X.shape[1])])
+        )
+
+
+def test_get_feature_names_out_pd_input():
+    # as_frame=True means input has column names and these should propagate to fitted model
+    X, y = load_digits(n_class=2, return_X_y=True, as_frame=True)
+    est = lgb.LGBMModel(n_estimators=5, objective="binary")
+    clf = lgb.LGBMClassifier(n_estimators=5)
+    reg = lgb.LGBMRegressor(n_estimators=5)
+    rnk = lgb.LGBMRanker(n_estimators=5)
+    models = (est, clf, reg, rnk)
+    group = np.full(shape=(X.shape[0] // 2,), fill_value=2)  # Just an example group
+
+    for model in models:
+        with pytest.raises(lgb.compat.LGBMNotFittedError):
+            check_is_fitted(model)
+        if isinstance(model, lgb.LGBMRanker):
+            model.fit(X, y, group=group)
+        else:
+            model.fit(X, y)
+        np.testing.assert_array_equal(model.get_feature_names_out(), X.columns)
+
+
 @parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()])
 def test_sklearn_integration(estimator, check):
     estimator.set_params(min_child_samples=1, min_data_in_bin=1)

From 574d9ce58afdf0ceaa75c1a33594ad7575fcfd56 Mon Sep 17 00:00:00 2001
From: nicklamiller
Date: Fri, 31 May 2024 20:20:38 -0700
Subject: [PATCH 13/14] format reference to .feature_name_ with ticks

---
 python-package/lightgbm/sklearn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index eecf80cbb4a4..4eb606cea392 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -1133,7 +1133,7 @@ def feature_name_(self) -> List[str]:
 
     @property
     def feature_names_in_(self) -> np.ndarray:
-        """:obj:`array` of shape = [n_features]: scikit-learn compatible version of .feature_name_."""
+        """:obj:`array` of shape = [n_features]: scikit-learn compatible version of ``.feature_name_``."""
         if not self.__sklearn_is_fitted__():
             raise LGBMNotFittedError("No feature_names_in_ found. Need to call fit beforehand.")
         return np.array(self.feature_name_)

From 8ac21d32589299fc1fe8089a969dfd5f983aee19 Mon Sep 17 00:00:00 2001
From: nicklamiller
Date: Mon, 10 Jun 2024 17:04:22 -0700
Subject: [PATCH 14/14] remove get_feature_names_out method, tidy up tests

---
 python-package/lightgbm/sklearn.py        |   6 --
 tests/python_package_test/test_sklearn.py | 108 +++++++---------------
 2 files changed, 32 insertions(+), 82 deletions(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index a335fc32a0a7..7f3e91a064c4 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -1043,12 +1043,6 @@ def predict(
             **predict_params,
         )
 
-    def get_feature_names_out(self) -> np.ndarray:
-        """:obj:`array` of shape = [n_features]: Get output features of fitted model."""
-        if not self.__sklearn_is_fitted__():
-            raise LGBMNotFittedError("Output features cannot be determined. Need to call fit beforehand.")
-        return self.feature_names_in_
-
     predict.__doc__ = _lgbmmodel_doc_predict.format(
         description="Return the predicted value for each sample.",
         X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame , scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index b2e23a847715..10af8ba960f3 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -1290,88 +1290,44 @@ def test_max_depth_warning_is_never_raised(capsys, estimator_class, max_depth):
     assert "Provided parameters constrain tree depth" not in capsys.readouterr().out
 
 
-def test_getting_feature_names_in_np_input():
-    # input is a numpy array, which doesn't have feature names. LightGBM adds
-    # feature names to the fitted model, which is inconsistent with sklearn's behavior
-    X, y = load_digits(n_class=2, return_X_y=True)
-    est = lgb.LGBMModel(n_estimators=5, objective="binary")
-    clf = lgb.LGBMClassifier(n_estimators=5)
-    reg = lgb.LGBMRegressor(n_estimators=5)
-    rnk = lgb.LGBMRanker(n_estimators=5)
-    models = (est, clf, reg, rnk)
-    group = np.full(shape=(X.shape[0] // 2,), fill_value=2)  # Just an example group
-
-    for model in models:
-        with pytest.raises(lgb.compat.LGBMNotFittedError):
-            check_is_fitted(model)
-        if isinstance(model, lgb.LGBMRanker):
-            model.fit(X, y, group=group)
-        else:
-            model.fit(X, y)
-        np.testing.assert_array_equal(model.feature_names_in_, np.array([f"Column_{i}" for i in range(X.shape[1])]))
-
-
-def test_getting_feature_names_in_pd_input():
-    # as_frame=True means input has column names and these should propagate to fitted model
-    X, y = load_digits(n_class=2, return_X_y=True, as_frame=True)
-    est = lgb.LGBMModel(n_estimators=5, objective="binary")
-    clf = lgb.LGBMClassifier(n_estimators=5)
-    reg = lgb.LGBMRegressor(n_estimators=5)
-    rnk = lgb.LGBMRanker(n_estimators=5)
-    models = (est, clf, reg, rnk)
-    group = np.full(shape=(X.shape[0] // 2,), fill_value=2)  # Just an example group
-
-    for model in models:
-        with pytest.raises(lgb.compat.LGBMNotFittedError):
-            check_is_fitted(model)
-        if isinstance(model, lgb.LGBMRanker):
-            model.fit(X, y, group=group)
-        else:
-            model.fit(X, y)
-        np.testing.assert_array_equal(est.feature_names_in_, X.columns)
-
-
-def test_get_feature_names_out_np_input():
+@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker])
+def test_getting_feature_names_in_np_input(estimator_class):
     # input is a numpy array, which doesn't have feature names. LightGBM adds
     # feature names to the fitted model, which is inconsistent with sklearn's behavior
     X, y = load_digits(n_class=2, return_X_y=True)
-    est = lgb.LGBMModel(n_estimators=5, objective="binary")
-    clf = lgb.LGBMClassifier(n_estimators=5)
-    reg = lgb.LGBMRegressor(n_estimators=5)
-    rnk = lgb.LGBMRanker(n_estimators=5)
-    models = (est, clf, reg, rnk)
-    group = np.full(shape=(X.shape[0] // 2,), fill_value=2)  # Just an example group
-
-    for model in models:
-        with pytest.raises(lgb.compat.LGBMNotFittedError):
-            check_is_fitted(model)
-        if isinstance(model, lgb.LGBMRanker):
-            model.fit(X, y, group=group)
-        else:
-            model.fit(X, y)
-        np.testing.assert_array_equal(
-            model.get_feature_names_out(), np.array([f"Column_{i}" for i in range(X.shape[1])])
-        )
+    params = {"n_estimators": 2, "num_leaves": 7}
+    if estimator_class is lgb.LGBMModel:
+        model = estimator_class(**{**params, "objective": "binary"})
+    else:
+        model = estimator_class(**params)
+    with pytest.raises(lgb.compat.LGBMNotFittedError):
+        check_is_fitted(model)
+    if isinstance(model, lgb.LGBMRanker):
+        model.fit(X, y, group=[X.shape[0]])
+    else:
+        model.fit(X, y)
+    np.testing.assert_array_equal(model.feature_names_in_, np.array([f"Column_{i}" for i in range(X.shape[1])]))
 
 
-def test_get_feature_names_out_pd_input():
-    # as_frame=True means input has column names and these should propagate to fitted model
+@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker])
+def test_getting_feature_names_in_pd_input(estimator_class):
     X, y = load_digits(n_class=2, return_X_y=True, as_frame=True)
-    est = lgb.LGBMModel(n_estimators=5, objective="binary")
-    clf = lgb.LGBMClassifier(n_estimators=5)
-    reg = lgb.LGBMRegressor(n_estimators=5)
-    rnk = lgb.LGBMRanker(n_estimators=5)
-    models = (est, clf, reg, rnk)
-    group = np.full(shape=(X.shape[0] // 2,), fill_value=2)  # Just an example group
-
-    for model in models:
-        with pytest.raises(lgb.compat.LGBMNotFittedError):
-            check_is_fitted(model)
-        if isinstance(model, lgb.LGBMRanker):
-            model.fit(X, y, group=group)
-        else:
-            model.fit(X, y)
-        np.testing.assert_array_equal(model.get_feature_names_out(), X.columns)
+    col_names = X.columns.to_list()
+    assert isinstance(col_names, list) and all(
+        isinstance(c, str) for c in col_names
+    ), "input data must have feature names for this test to cover the expected functionality"
+    params = {"n_estimators": 2, "num_leaves": 7}
+    if estimator_class is lgb.LGBMModel:
+        model = estimator_class(**{**params, "objective": "binary"})
+    else:
+        model = estimator_class(**params)
+    with pytest.raises(lgb.compat.LGBMNotFittedError):
+        check_is_fitted(model)
+    if isinstance(model, lgb.LGBMRanker):
+        model.fit(X, y, group=[X.shape[0]])
+    else:
+        model.fit(X, y)
+    np.testing.assert_array_equal(model.feature_names_in_, X.columns)
 
 
 @parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()])
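
The snippet below is a minimal usage sketch of the behavior this patch series adds, not part of the patch itself. It assumes a LightGBM build that already includes these commits; the toy DataFrame, its column names, and the hyperparameters are illustrative only.

# Usage sketch (illustrative): feature_names_in_ after fitting on a pandas
# DataFrame vs. a plain numpy array, and the LGBMNotFittedError raised before fit.
import numpy as np
import pandas as pd
import lightgbm as lgb

rng = np.random.default_rng(seed=42)
X = pd.DataFrame({"age": rng.integers(18, 90, size=200), "income": rng.normal(50_000.0, 10_000.0, size=200)})
y = rng.random(200)

# Accessing the attribute before fitting raises LGBMNotFittedError (PATCH 03/14).
unfitted = lgb.LGBMRegressor()
try:
    unfitted.feature_names_in_
except lgb.compat.LGBMNotFittedError as err:
    print(err)

# Fitting on a DataFrame: names come from the columns, e.g. ['age' 'income'].
reg = lgb.LGBMRegressor(n_estimators=2, num_leaves=7, min_child_samples=1)
reg.fit(X, y)
print(reg.feature_names_in_)

# Fitting on a numpy array: LightGBM generates names, e.g. ['Column_0' 'Column_1'].
reg_np = lgb.LGBMRegressor(n_estimators=2, num_leaves=7, min_child_samples=1)
reg_np.fit(X.to_numpy(), y)
print(reg_np.feature_names_in_)

As the test comments in the series note, the numpy case differs from scikit-learn's own estimators, which only set feature_names_in_ when the training data carries feature names; LightGBM instead falls back to its generated Column_N names.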