diff --git a/q2_sample_classifier/classify.py b/q2_sample_classifier/classify.py
index ca449e2..b8aac9d 100644
--- a/q2_sample_classifier/classify.py
+++ b/q2_sample_classifier/classify.py
@@ -83,7 +83,7 @@ def metatable(ctx,
             raise ValueError('Missing samples in metadata: %r' %
                              table_ids.difference(metadata_ids))
         else:
-            metadata = metadata.loc[sample_ids]
+            metadata = metadata.loc[list(sample_ids)]
             if len(sample_ids) < len(table_ids):
                 tab = tab.filter(
                     ids_to_keep=sample_ids, axis='sample', inplace=False)
diff --git a/q2_sample_classifier/tests/test_actions.py b/q2_sample_classifier/tests/test_actions.py
index e13a86c..23b7ee0 100644
--- a/q2_sample_classifier/tests/test_actions.py
+++ b/q2_sample_classifier/tests/test_actions.py
@@ -59,7 +59,7 @@ def test_action_split_table(self):
         self.assertEqual(y_train.name, 'bugs')

         # test if complete target column is covered
-        y_all = y_train.append(y_test).sort_index()
+        y_all = pd.concat([y_train, y_test]).sort_index()
         y_all.index.name = 'SampleID'
         pdt.assert_series_equal(y_all, self.md._series)

diff --git a/q2_sample_classifier/tests/test_estimators.py b/q2_sample_classifier/tests/test_estimators.py
index b42de3c..cc23b91 100644
--- a/q2_sample_classifier/tests/test_estimators.py
+++ b/q2_sample_classifier/tests/test_estimators.py
@@ -117,7 +117,7 @@ def _load_cmc(md_fp, column):
             index_col=0, names=['feature', 'importance'])
         self.exp_pred = pd.read_csv(
             self.get_data_path('predictions.tsv'), sep='\t', header=0,
-            index_col=0, squeeze=True)
+            index_col=0).squeeze('columns')
         index = pd.Index(['A', 'B', 'C', 'D'], name='id')
         self.table_percnorm = qiime2.Artifact.import_data(
             FeatureTable[PercentileNormalized], pd.DataFrame(
@@ -135,7 +135,7 @@ def test_extract_features(self):
         dv = DictVectorizer()
         dv.fit(dicts)
         features = table.ids('observation')
-        self.assertEqual(set(dv.get_feature_names()), set(features))
+        self.assertEqual(set(dv.get_feature_names_out()), set(features))
         self.assertEqual(len(dicts), len(table.ids()))
         for dict_row, (table_row, _, _) in zip(dicts, table.iter()):
             for feature, count in zip(features, table_row):
@@ -398,7 +398,7 @@ def test_train_adaboost_decision_tree(self):
             parameter_tuning=True, classification=True,
             missing_samples='ignore', base_estimator="DecisionTree")
         self.assertEqual(type(abe.named_steps.est), AdaBoostClassifier)
-        self.assertEqual(type(abe.named_steps.est.base_estimator),
+        self.assertEqual(type(abe.named_steps.est.estimator),
                          DecisionTreeClassifier)

     def test_train_adaboost_extra_trees(self):
@@ -408,7 +408,7 @@ def test_train_adaboost_extra_trees(self):
             parameter_tuning=True, classification=True,
             missing_samples='ignore', base_estimator="ExtraTrees")
         self.assertEqual(type(abe.named_steps.est), AdaBoostClassifier)
-        self.assertEqual(type(abe.named_steps.est.base_estimator),
+        self.assertEqual(type(abe.named_steps.est.estimator),
                          ExtraTreeClassifier)

     # test some invalid inputs/edge cases
@@ -504,7 +504,7 @@ def test_predict_classifications(self):
             ls_pred_classes = prob.columns.tolist()
             ls_correct_range = [col for col in ls_pred_classes
                                 if prob[col].between(
-                                    0, 1, inclusive=True).all()]
+                                    0, 1, inclusive="both").all()]
             self.assertEqual(len(ls_correct_range), prob.shape[1],
                              msg='Predicted probabilities of class {}'
                                  'are not in range [0,1]'.format(
diff --git a/q2_sample_classifier/tests/test_types_formats_transformers.py b/q2_sample_classifier/tests/test_types_formats_transformers.py
index d0ec02a..3833ed1 100644
--- a/q2_sample_classifier/tests/test_types_formats_transformers.py
+++ b/q2_sample_classifier/tests/test_types_formats_transformers.py
@@ -85,7 +85,7 @@ def test_pd_series_to_boolean_format(self):
                         name='outlier', index=exp_index)
         obs = transformer(exp)
         obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0,
-                          squeeze=True)
+                          ).squeeze('columns')
         self.assertEqual(sorted(exp), sorted(obs))

     def test_boolean_format_to_pd_series(self):
@@ -152,7 +152,7 @@ def test_pd_series_to_Predictions_format(self):
                         name='prediction', index=['a', 'b', 'c', 'd'])
         obs = transformer(exp)
         obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0,
-                          squeeze=True)
+                          ).squeeze('columns')
         pdt.assert_series_equal(obs, exp)

     def test_pd_series_to_Predictions_format_allow_nans(self):
@@ -161,7 +161,7 @@ def test_pd_series_to_Predictions_format_allow_nans(self):
                         name='prediction', index=['a', 'b', 'c', 'd'])
         obs = transformer(exp)
         obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0,
-                          squeeze=True)
+                          ).squeeze('columns')
         pdt.assert_series_equal(obs, exp)

     def test_Predictions_format_to_pd_series(self):
diff --git a/q2_sample_classifier/utilities.py b/q2_sample_classifier/utilities.py
index b224676..4e34539 100644
--- a/q2_sample_classifier/utilities.py
+++ b/q2_sample_classifier/utilities.py
@@ -239,7 +239,7 @@ def _rfecv_feature_selection(feature_data, targets, estimator,
     # Describe top features
     n_opt = rfecv.named_steps.est.n_features_
     importance = _extract_important_features(
-        rfecv.named_steps.dv.get_feature_names(),
+        rfecv.named_steps.dv.get_feature_names_out(),
         rfecv.named_steps.est.ranking_)
     importance = sort_importances(importance, ascending=True)[:n_opt]

@@ -249,16 +249,17 @@ def _rfecv_feature_selection(feature_data, targets, estimator,


 def _extract_rfe_scores(rfecv):
+    grid_scores_ = rfecv.cv_results_['mean_test_score']
     n_features = len(rfecv.ranking_)
     # If using fractional step, step = integer of fraction * n_features
     if rfecv.step < 1:
         rfecv.step = int(rfecv.step * n_features)
-    # Need to manually calculate x-axis, as rfecv.grid_scores_ are a 1-d array
+    # Need to manually calculate x-axis, grid_scores_ is a 1-d array
     x = [n_features - (n * rfecv.step)
-         for n in range(len(rfecv.grid_scores_)-1, -1, -1)]
+         for n in range(len(grid_scores_)-1, -1, -1)]
     if x[0] < 1:
         x[0] = 1
-    return pd.Series(rfecv.grid_scores_, index=x, name='Accuracy')
+    return pd.Series(grid_scores_, index=x, name='Accuracy')


 def nested_cross_validation(table, metadata, cv, random_state, n_jobs,
@@ -411,12 +412,12 @@ def _calculate_feature_importances(estimator):
     # feature_importances_ or coef_ to report feature importance/weights
     try:
         importances = _extract_important_features(
-            estimator.named_steps.dv.get_feature_names(),
+            estimator.named_steps.dv.get_feature_names_out(),
             estimator.named_steps.est.feature_importances_)
     # is there a better way to determine whether estimator has coef_ ?
     except AttributeError:
         importances = _extract_important_features(
-            estimator.named_steps.dv.get_feature_names(),
+            estimator.named_steps.dv.get_feature_names_out(),
             estimator.named_steps.est.coef_)
     return importances

@@ -718,7 +719,7 @@ def _mean_feature_importance(importances):
 def _null_feature_importance(table):
     feature_extractor = DictVectorizer()
     feature_extractor.fit(table)
-    imp = pd.DataFrame(index=feature_extractor.get_feature_names())
+    imp = pd.DataFrame(index=feature_extractor.get_feature_names_out())
     imp.index.name = "feature"
     imp["importance"] = 1
     return imp
@@ -827,8 +828,9 @@ def _train_adaboost_base_estimator(table, metadata, column, base_estimator,

     return Pipeline(
         [('dv', estimator.named_steps.dv),
-         ('est', adaboost_estimator(estimator.named_steps.est,
-                                    n_estimators, random_state=random_state))])
+         ('est', adaboost_estimator(estimator=estimator.named_steps.est,
+                                    n_estimators=n_estimators,
+                                    random_state=random_state))])


 def _disable_feature_selection(estimator, optimize_feature_selection):
diff --git a/q2_sample_classifier/visuals.py b/q2_sample_classifier/visuals.py
index 1cfced6..28dd9c0 100644
--- a/q2_sample_classifier/visuals.py
+++ b/q2_sample_classifier/visuals.py
@@ -167,9 +167,9 @@ def _plot_confusion_matrix(y_test, y_pred, classes, normalize, palette,
     predictions.loc["Overall Accuracy"] = ""
     predictions.loc["Baseline Accuracy"] = ""
     predictions.loc["Accuracy Ratio"] = ""
-    predictions.loc["Overall Accuracy"]["Overall Accuracy"] = accuracy
-    predictions.loc["Baseline Accuracy"]["Overall Accuracy"] = basline_accuracy
-    predictions.loc["Accuracy Ratio"]["Overall Accuracy"] = accuracy_ratio
+    predictions.loc["Overall Accuracy", "Overall Accuracy"] = accuracy
+    predictions.loc["Baseline Accuracy", "Overall Accuracy"] = basline_accuracy
+    predictions.loc["Accuracy Ratio", "Overall Accuracy"] = accuracy_ratio

     return predictions, confusion
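For reference, a minimal standalone sketch (not part of the patch) of the replacement pandas/scikit-learn APIs this diff migrates to, assuming scikit-learn >= 1.2 and pandas >= 1.5; the toy feature dicts, sample IDs, and series below are invented for illustration only.

# Sketch of the updated APIs used throughout this patch (toy data, not from the repo).
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier

# DictVectorizer.get_feature_names() was removed; get_feature_names_out()
# returns the same feature names as an array.
dv = DictVectorizer(sparse=False)
X = dv.fit_transform([{'featA': 1.0, 'featB': 2.0}, {'featA': 3.0, 'featB': 1.0}])
print(dv.get_feature_names_out().tolist())  # ['featA', 'featB']

# Series.append() was removed; pd.concat() covers the same use case.
y_train = pd.Series([0, 1], index=['s1', 's2'])
y_test = pd.Series([1], index=['s3'])
y_all = pd.concat([y_train, y_test]).sort_index()

# read_csv(..., squeeze=True) was removed; squeeze the resulting one-column
# DataFrame afterwards instead (shown here on an in-memory frame).
pred = pd.DataFrame({'prediction': [0.2, 0.9]}, index=['s1', 's2']).squeeze('columns')

# between(..., inclusive=True) now spells out the bound handling as a string.
assert pred.between(0, 1, inclusive='both').all()

# AdaBoostClassifier's base_estimator parameter was renamed to estimator.
abe = AdaBoostClassifier(estimator=DecisionTreeClassifier(), n_estimators=10)
abe.fit(X, [0, 1])
print(type(abe.estimator).__name__)  # DecisionTreeClassifier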