From a65d435842fb79cc45b0c7ebbff4f9b5e5061228 Mon Sep 17 00:00:00 2001
From: timovdk <5330531+timovdk@users.noreply.github.com>
Date: Wed, 5 Feb 2025 16:52:25 +0100
Subject: [PATCH] Prep xgboost study

---
 .../ansible/ansible_optuna_playbook.yml |  1 +
 asreview2-optuna/classifiers.py         | 23 +++++++++++++++++++
 asreview2-optuna/main.py                |  6 ++---
 asreview2-optuna/requirements.txt       |  3 ++-
 4 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/asreview2-optuna/ansible/ansible_optuna_playbook.yml b/asreview2-optuna/ansible/ansible_optuna_playbook.yml
index 5d9c251..caa5e0e 100644
--- a/asreview2-optuna/ansible/ansible_optuna_playbook.yml
+++ b/asreview2-optuna/ansible/ansible_optuna_playbook.yml
@@ -36,6 +36,7 @@
       name:
         - python3-venv
         - python3-pip
+        - libgomp1
       state: present

   - name: Create Python virtual environment
diff --git a/asreview2-optuna/classifiers.py b/asreview2-optuna/classifiers.py
index 3cd330b..a6ad176 100644
--- a/asreview2-optuna/classifiers.py
+++ b/asreview2-optuna/classifiers.py
@@ -6,6 +6,8 @@
     SVM,
 )

+from xgboost import XGBClassifier
+

 def naive_bayes_params(trial: optuna.trial.FrozenTrial):
     # Use logarithmic normal distribution for alpha (alpha effect is non-linear)
@@ -34,17 +36,38 @@ def random_forest_params(trial: optuna.trial.FrozenTrial):
     return {"n_estimators": n_estimators, "max_features": max_features}


+def xgboost_params(trial: optuna.trial.FrozenTrial):
+    # Use normal distribution for n_estimators (n_estimators effect is linear)
+    n_estimators = trial.suggest_int("xgboost__n_estimators", 50, 500)
+
+    # Use normal distribution for max_depth (max_depth effect is linear)
+    max_depth = trial.suggest_int("xgboost__max_depth", 2, 20)
+    return {"n_estimators": n_estimators, "max_depth": max_depth}
+
+
 classifier_params = {
     "nb": naive_bayes_params,
     "log": logistic_params,
     "svm": svm_params,
     "rf": random_forest_params,
+    "xgboost": xgboost_params,
 }


+class XGBoost(XGBClassifier):
+    """XGBoost classifier.
+
+    """
+
+    name = "xgboost"
+    label = "XGBoost"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
 classifiers = {
     "nb": NaiveBayes,
     "log": Logistic,
     "svm": SVM,
     "rf": RandomForest,
+    "xgboost": XGBoost,
 }
diff --git a/asreview2-optuna/main.py b/asreview2-optuna/main.py
index 81780bf..1487297 100644
--- a/asreview2-optuna/main.py
+++ b/asreview2-optuna/main.py
@@ -22,8 +22,8 @@
 VERSION = 1
 METRIC = "ndcg"  # Options: "loss", "ndcg"
 STUDY_SET = "full"
-CLASSIFIER_TYPE = "svm"  # Options: "nb", "log", "svm", "rf"
-FEATURE_EXTRACTOR_TYPE = "tfidf"  # Options: "tfidf", "onehot", "labse", "bge-m3", "stella", "mxbai"
+CLASSIFIER_TYPE = "xgboost"  # Options: "nb", "log", "svm", "rf", "xgboost"
+FEATURE_EXTRACTOR_TYPE = "mxbai"  # Options: "tfidf", "onehot", "labse", "bge-m3", "stella", "mxbai"
 PICKLE_FOLDER_PATH = Path("synergy-dataset", f"pickles_{FEATURE_EXTRACTOR_TYPE}")
 PRE_PROCESSED_FMS = True  # False = on the fly
 PARALLELIZE_OBJECTIVE = True
@@ -248,7 +248,7 @@ def download_pickles(report_order):

 if __name__ == "__main__":
     # list of studies
-    studies = pd.read_json(f"synergy_studies_{STUDY_SET}.jsonl", lines=True)
+    studies = pd.read_json(f"synergy_studies_{STUDY_SET}.jsonl", lines=True).head(1)
     report_order = sorted(set(studies["dataset_id"]))

     if PRE_PROCESSED_FMS:
diff --git a/asreview2-optuna/requirements.txt b/asreview2-optuna/requirements.txt
index b7adb9b..4d089c6 100644
--- a/asreview2-optuna/requirements.txt
+++ b/asreview2-optuna/requirements.txt
@@ -5,4 +5,5 @@ synergy_dataset
 numpy
 pandas
 psycopg2-binary
-sentence-transformers
\ No newline at end of file
+sentence-transformers
+xgboost
\ No newline at end of file