From a65d435842fb79cc45b0c7ebbff4f9b5e5061228 Mon Sep 17 00:00:00 2001
From: timovdk <5330531+timovdk@users.noreply.github.com>
Date: Wed, 5 Feb 2025 16:52:25 +0100
Subject: [PATCH] Prep xgboost study

---
 .../ansible/ansible_optuna_playbook.yml |  1 +
 asreview2-optuna/classifiers.py         | 23 +++++++++++++++++++
 asreview2-optuna/main.py                |  6 ++---
 asreview2-optuna/requirements.txt       |  3 ++-
 4 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/asreview2-optuna/ansible/ansible_optuna_playbook.yml b/asreview2-optuna/ansible/ansible_optuna_playbook.yml
index 5d9c251..caa5e0e 100644
--- a/asreview2-optuna/ansible/ansible_optuna_playbook.yml
+++ b/asreview2-optuna/ansible/ansible_optuna_playbook.yml
@@ -36,6 +36,7 @@
       name:
         - python3-venv
         - python3-pip
+        - libgomp1
       state: present

   - name: Create Python virtual environment
diff --git a/asreview2-optuna/classifiers.py b/asreview2-optuna/classifiers.py
index 3cd330b..a6ad176 100644
--- a/asreview2-optuna/classifiers.py
+++ b/asreview2-optuna/classifiers.py
@@ -6,6 +6,8 @@
     SVM,
 )

+from xgboost import XGBClassifier
+

 def naive_bayes_params(trial: optuna.trial.FrozenTrial):
     # Use logarithmic normal distribution for alpha (alpha effect is non-linear)
@@ -34,17 +36,38 @@ def random_forest_params(trial: optuna.trial.FrozenTrial):
     return {"n_estimators": n_estimators, "max_features": max_features}


+def xgboost_params(trial: optuna.trial.FrozenTrial):
+    # Use normal distribution for n_estimators (n_estimators effect is linear)
+    n_estimators = trial.suggest_int("xgboost__n_estimators", 50, 500)
+
+    # Use normal distribution for max_depth (max_depth effect is linear)
+    max_depth = trial.suggest_int("xgboost__max_depth", 2, 20)
+    return {"n_estimators": n_estimators, "max_depth": max_depth}
+
+
 classifier_params = {
     "nb": naive_bayes_params,
     "log": logistic_params,
     "svm": svm_params,
     "rf": random_forest_params,
+    "xgboost": xgboost_params,
 }


+class XGBoost(XGBClassifier):
+    """XGBoost classifier.
+
+    """
+
+    name = "xgboost"
+    label = "XGBoost"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
 classifiers = {
     "nb": NaiveBayes,
     "log": Logistic,
     "svm": SVM,
     "rf": RandomForest,
+    "xgboost": XGBoost,
 }
diff --git a/asreview2-optuna/main.py b/asreview2-optuna/main.py
index 81780bf..1487297 100644
--- a/asreview2-optuna/main.py
+++ b/asreview2-optuna/main.py
@@ -22,8 +22,8 @@
 VERSION = 1
 METRIC = "ndcg"  # Options: "loss", "ndcg"
 STUDY_SET = "full"
-CLASSIFIER_TYPE = "svm"  # Options: "nb", "log", "svm", "rf"
-FEATURE_EXTRACTOR_TYPE = "tfidf"  # Options: "tfidf", "onehot", "labse", "bge-m3", "stella", "mxbai"
+CLASSIFIER_TYPE = "xgboost"  # Options: "nb", "log", "svm", "rf", "xgboost"
+FEATURE_EXTRACTOR_TYPE = "mxbai"  # Options: "tfidf", "onehot", "labse", "bge-m3", "stella", "mxbai"
 PICKLE_FOLDER_PATH = Path("synergy-dataset", f"pickles_{FEATURE_EXTRACTOR_TYPE}")
 PRE_PROCESSED_FMS = True  # False = on the fly
 PARALLELIZE_OBJECTIVE = True
@@ -248,7 +248,7 @@ def download_pickles(report_order):

 if __name__ == "__main__":
     # list of studies
-    studies = pd.read_json(f"synergy_studies_{STUDY_SET}.jsonl", lines=True)
+    studies = pd.read_json(f"synergy_studies_{STUDY_SET}.jsonl", lines=True).head(1)
     report_order = sorted(set(studies["dataset_id"]))

     if PRE_PROCESSED_FMS:
diff --git a/asreview2-optuna/requirements.txt b/asreview2-optuna/requirements.txt
index b7adb9b..4d089c6 100644
--- a/asreview2-optuna/requirements.txt
+++ b/asreview2-optuna/requirements.txt
@@ -5,4 +5,5 @@ synergy_dataset
 numpy
 pandas
 psycopg2-binary
-sentence-transformers
\ No newline at end of file
+sentence-transformers
+xgboost
\ No newline at end of file