From 1f561d325b92b5a67de37e1e2a2497dd58e57374 Mon Sep 17 00:00:00 2001 From: Greg Caporaso Date: Tue, 30 Apr 2024 11:37:57 -0700 Subject: [PATCH 1/8] address get_feature_names to get_feature_names_out API change https://github.com/scikit-learn/scikit-learn/pull/18444 --- q2_sample_classifier/tests/test_estimators.py | 2 +- q2_sample_classifier/utilities.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/q2_sample_classifier/tests/test_estimators.py b/q2_sample_classifier/tests/test_estimators.py index b42de3c..8b3964a 100644 --- a/q2_sample_classifier/tests/test_estimators.py +++ b/q2_sample_classifier/tests/test_estimators.py @@ -135,7 +135,7 @@ def test_extract_features(self): dv = DictVectorizer() dv.fit(dicts) features = table.ids('observation') - self.assertEqual(set(dv.get_feature_names()), set(features)) + self.assertEqual(set(dv.get_feature_names_out()), set(features)) self.assertEqual(len(dicts), len(table.ids())) for dict_row, (table_row, _, _) in zip(dicts, table.iter()): for feature, count in zip(features, table_row): diff --git a/q2_sample_classifier/utilities.py b/q2_sample_classifier/utilities.py index b224676..fb014ff 100644 --- a/q2_sample_classifier/utilities.py +++ b/q2_sample_classifier/utilities.py @@ -239,7 +239,7 @@ def _rfecv_feature_selection(feature_data, targets, estimator, # Describe top features n_opt = rfecv.named_steps.est.n_features_ importance = _extract_important_features( - rfecv.named_steps.dv.get_feature_names(), + rfecv.named_steps.dv.get_feature_names_out(), rfecv.named_steps.est.ranking_) importance = sort_importances(importance, ascending=True)[:n_opt] @@ -411,12 +411,12 @@ def _calculate_feature_importances(estimator): # feature_importances_ or coef_ to report feature importance/weights try: importances = _extract_important_features( - estimator.named_steps.dv.get_feature_names(), + estimator.named_steps.dv.get_feature_names_out(), estimator.named_steps.est.feature_importances_) # is there a better way to determine whether estimator has coef_ ? except AttributeError: importances = _extract_important_features( - estimator.named_steps.dv.get_feature_names(), + estimator.named_steps.dv.get_feature_names_out(), estimator.named_steps.est.coef_) return importances @@ -718,7 +718,7 @@ def _mean_feature_importance(importances): def _null_feature_importance(table): feature_extractor = DictVectorizer() feature_extractor.fit(table) - imp = pd.DataFrame(index=feature_extractor.get_feature_names()) + imp = pd.DataFrame(index=feature_extractor.get_feature_names_out()) imp.index.name = "feature" imp["importance"] = 1 return imp From cd784dbf59cbb2d8d22cc83a255fdbc370b4f000 Mon Sep 17 00:00:00 2001 From: Greg Caporaso Date: Tue, 30 Apr 2024 11:44:10 -0700 Subject: [PATCH 2/8] update read_csv calls for "squeeze" API change https://pandas.pydata.org/docs/whatsnew/v1.4.0.html --- q2_sample_classifier/tests/test_estimators.py | 2 +- .../tests/test_types_formats_transformers.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/q2_sample_classifier/tests/test_estimators.py b/q2_sample_classifier/tests/test_estimators.py index 8b3964a..dec0a0b 100644 --- a/q2_sample_classifier/tests/test_estimators.py +++ b/q2_sample_classifier/tests/test_estimators.py @@ -117,7 +117,7 @@ def _load_cmc(md_fp, column): index_col=0, names=['feature', 'importance']) self.exp_pred = pd.read_csv( self.get_data_path('predictions.tsv'), sep='\t', header=0, - index_col=0, squeeze=True) + index_col=0).squeeze('columns') index = pd.Index(['A', 'B', 'C', 'D'], name='id') self.table_percnorm = qiime2.Artifact.import_data( FeatureTable[PercentileNormalized], pd.DataFrame( diff --git a/q2_sample_classifier/tests/test_types_formats_transformers.py b/q2_sample_classifier/tests/test_types_formats_transformers.py index d0ec02a..3833ed1 100644 --- a/q2_sample_classifier/tests/test_types_formats_transformers.py +++ b/q2_sample_classifier/tests/test_types_formats_transformers.py @@ -85,7 +85,7 @@ def test_pd_series_to_boolean_format(self): name='outlier', index=exp_index) obs = transformer(exp) obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0, - squeeze=True) + ).squeeze('columns') self.assertEqual(sorted(exp), sorted(obs)) def test_boolean_format_to_pd_series(self): @@ -152,7 +152,7 @@ def test_pd_series_to_Predictions_format(self): name='prediction', index=['a', 'b', 'c', 'd']) obs = transformer(exp) obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0, - squeeze=True) + ).squeeze('columns') pdt.assert_series_equal(obs, exp) def test_pd_series_to_Predictions_format_allow_nans(self): @@ -161,7 +161,7 @@ def test_pd_series_to_Predictions_format_allow_nans(self): name='prediction', index=['a', 'b', 'c', 'd']) obs = transformer(exp) obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0, - squeeze=True) + ).squeeze('columns') pdt.assert_series_equal(obs, exp) def test_Predictions_format_to_pd_series(self): From 8d9df0347128a0e521d23483dcf34c192fed93f1 Mon Sep 17 00:00:00 2001 From: Greg Caporaso Date: Tue, 30 Apr 2024 12:16:13 -0700 Subject: [PATCH 3/8] replace deprecated pd.Series.append with pd.concat --- q2_sample_classifier/tests/test_actions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/q2_sample_classifier/tests/test_actions.py b/q2_sample_classifier/tests/test_actions.py index e13a86c..23b7ee0 100644 --- a/q2_sample_classifier/tests/test_actions.py +++ b/q2_sample_classifier/tests/test_actions.py @@ -59,7 +59,7 @@ def test_action_split_table(self): self.assertEqual(y_train.name, 'bugs') # test if complete target column is covered - y_all = y_train.append(y_test).sort_index() + y_all = pd.concat([y_train, y_test]).sort_index() y_all.index.name = 'SampleID' pdt.assert_series_equal(y_all, self.md._series) From eb6d6cb2d5b20064a26b6d550d9f9db14ca8214d Mon Sep 17 00:00:00 2001 From: Greg Caporaso Date: Tue, 30 Apr 2024 12:17:08 -0700 Subject: [PATCH 4/8] deprecated grid_scores_ to cv_results_ https://github.com/scikit-learn/scikit-learn/pull/20161 --- q2_sample_classifier/utilities.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/q2_sample_classifier/utilities.py b/q2_sample_classifier/utilities.py index fb014ff..c66f835 100644 --- a/q2_sample_classifier/utilities.py +++ b/q2_sample_classifier/utilities.py @@ -249,16 +249,17 @@ def _rfecv_feature_selection(feature_data, targets, estimator, def _extract_rfe_scores(rfecv): + grid_scores_ = rfecv.cv_results_['mean_test_score'] n_features = len(rfecv.ranking_) # If using fractional step, step = integer of fraction * n_features if rfecv.step < 1: rfecv.step = int(rfecv.step * n_features) - # Need to manually calculate x-axis, as rfecv.grid_scores_ are a 1-d array + # Need to manually calculate x-axis, grid_scores_ is a 1-d array x = [n_features - (n * rfecv.step) - for n in range(len(rfecv.grid_scores_)-1, -1, -1)] + for n in range(len(grid_scores_)-1, -1, -1)] if x[0] < 1: x[0] = 1 - return pd.Series(rfecv.grid_scores_, index=x, name='Accuracy') + return pd.Series(grid_scores_, index=x, name='Accuracy') def nested_cross_validation(table, metadata, cv, random_state, n_jobs, From 2ec10d4b75dda6a1946c094901a50de5fd83c034 Mon Sep 17 00:00:00 2001 From: Greg Caporaso Date: Tue, 30 Apr 2024 12:32:34 -0700 Subject: [PATCH 5/8] index with a list instead of a set the latter was deprecated in pandas --- q2_sample_classifier/classify.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/q2_sample_classifier/classify.py b/q2_sample_classifier/classify.py index ca449e2..b8aac9d 100644 --- a/q2_sample_classifier/classify.py +++ b/q2_sample_classifier/classify.py @@ -83,7 +83,7 @@ def metatable(ctx, raise ValueError('Missing samples in metadata: %r' % table_ids.difference(metadata_ids)) else: - metadata = metadata.loc[sample_ids] + metadata = metadata.loc[list(sample_ids)] if len(sample_ids) < len(table_ids): tab = tab.filter( ids_to_keep=sample_ids, axis='sample', inplace=False) From 7f0ec362242215e5bfba21b3badf7e3a0ad6cc8d Mon Sep 17 00:00:00 2001 From: Greg Caporaso Date: Tue, 30 Apr 2024 14:53:41 -0700 Subject: [PATCH 6/8] AdaBoost API updates --- q2_sample_classifier/tests/test_estimators.py | 4 ++-- q2_sample_classifier/utilities.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/q2_sample_classifier/tests/test_estimators.py b/q2_sample_classifier/tests/test_estimators.py index dec0a0b..8bb81e1 100644 --- a/q2_sample_classifier/tests/test_estimators.py +++ b/q2_sample_classifier/tests/test_estimators.py @@ -398,7 +398,7 @@ def test_train_adaboost_decision_tree(self): parameter_tuning=True, classification=True, missing_samples='ignore', base_estimator="DecisionTree") self.assertEqual(type(abe.named_steps.est), AdaBoostClassifier) - self.assertEqual(type(abe.named_steps.est.base_estimator), + self.assertEqual(type(abe.named_steps.est.estimator), DecisionTreeClassifier) def test_train_adaboost_extra_trees(self): @@ -408,7 +408,7 @@ def test_train_adaboost_extra_trees(self): parameter_tuning=True, classification=True, missing_samples='ignore', base_estimator="ExtraTrees") self.assertEqual(type(abe.named_steps.est), AdaBoostClassifier) - self.assertEqual(type(abe.named_steps.est.base_estimator), + self.assertEqual(type(abe.named_steps.est.estimator), ExtraTreeClassifier) # test some invalid inputs/edge cases diff --git a/q2_sample_classifier/utilities.py b/q2_sample_classifier/utilities.py index c66f835..4e34539 100644 --- a/q2_sample_classifier/utilities.py +++ b/q2_sample_classifier/utilities.py @@ -828,8 +828,9 @@ def _train_adaboost_base_estimator(table, metadata, column, base_estimator, return Pipeline( [('dv', estimator.named_steps.dv), - ('est', adaboost_estimator(estimator.named_steps.est, - n_estimators, random_state=random_state))]) + ('est', adaboost_estimator(estimator=estimator.named_steps.est, + n_estimators=n_estimators, + random_state=random_state))]) def _disable_feature_selection(estimator, optimize_feature_selection): From 41db1bd2b9abb110df2a990e8f550b92905489f1 Mon Sep 17 00:00:00 2001 From: Greg Caporaso Date: Tue, 30 Apr 2024 16:05:38 -0700 Subject: [PATCH 7/8] update for pandas API change https://github.com/pandas-dev/pandas/issues/31532 --- q2_sample_classifier/tests/test_estimators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/q2_sample_classifier/tests/test_estimators.py b/q2_sample_classifier/tests/test_estimators.py index 8bb81e1..cc23b91 100644 --- a/q2_sample_classifier/tests/test_estimators.py +++ b/q2_sample_classifier/tests/test_estimators.py @@ -504,7 +504,7 @@ def test_predict_classifications(self): ls_pred_classes = prob.columns.tolist() ls_correct_range = [col for col in ls_pred_classes if prob[col].between( - 0, 1, inclusive=True).all()] + 0, 1, inclusive="both").all()] self.assertEqual(len(ls_correct_range), prob.shape[1], msg='Predicted probabilities of class {}' 'are not in range [0,1]'.format( From c4e186c79f488446f7989b9408bb2b70c4cae1e1 Mon Sep 17 00:00:00 2001 From: Greg Caporaso Date: Tue, 30 Apr 2024 17:33:47 -0700 Subject: [PATCH 8/8] update to remove chained indexing https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy --- q2_sample_classifier/visuals.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/q2_sample_classifier/visuals.py b/q2_sample_classifier/visuals.py index 1cfced6..28dd9c0 100644 --- a/q2_sample_classifier/visuals.py +++ b/q2_sample_classifier/visuals.py @@ -167,9 +167,9 @@ def _plot_confusion_matrix(y_test, y_pred, classes, normalize, palette, predictions.loc["Overall Accuracy"] = "" predictions.loc["Baseline Accuracy"] = "" predictions.loc["Accuracy Ratio"] = "" - predictions.loc["Overall Accuracy"]["Overall Accuracy"] = accuracy - predictions.loc["Baseline Accuracy"]["Overall Accuracy"] = basline_accuracy - predictions.loc["Accuracy Ratio"]["Overall Accuracy"] = accuracy_ratio + predictions.loc["Overall Accuracy", "Overall Accuracy"] = accuracy + predictions.loc["Baseline Accuracy", "Overall Accuracy"] = basline_accuracy + predictions.loc["Accuracy Ratio", "Overall Accuracy"] = accuracy_ratio return predictions, confusion