Commit

Merge branch 'main' into ASReview2-rf
timovdk committed Jan 28, 2025
2 parents a4e1006 + a16255f commit 82a72a3
Showing 9 changed files with 378 additions and 35 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -163,3 +163,5 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+*.pdf
4 changes: 2 additions & 2 deletions asreview2-optuna/classifiers.py
@@ -1,8 +1,8 @@
 import optuna
 
 from asreview.models.classifiers import (
-    NaiveBayesClassifier,
     LogisticClassifier,
+    NaiveBayesClassifier,
     RandomForestClassifier,
     SVMClassifier,
 )

221 changes: 221 additions & 0 deletions asreview2-optuna/completed_runs/optuna_output_analysis.ipynb
@@ -0,0 +1,221 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import optuna\n",
"import matplotlib.pyplot as plt\n",
"from collections import defaultdict\n",
"import pandas as pd\n",
"import synergy_dataset as sd\n",
"from IPython.display import display\n",
"\n",
"# Path to your SQLite3 database\n",
"db_path = \"sqlite:///svm_db.sqlite3\" # Replace with your database path\n",
"\n",
"# Get all study summaries\n",
"study_summaries = optuna.get_all_study_summaries(storage=db_path)\n",
"\n",
"for summary in study_summaries:\n",
" print(f\"- {summary.study_name}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"study_name = \"ASReview2 2024-12-20 at 14.49.22\"\n",
"study = optuna.load_study(study_name=study_name, storage=db_path)\n",
"print(study.trials[0].params)\n",
"\n",
"dataset_names = []\n",
"for i in sd.iter_datasets():\n",
" if i.name != \"Chou_2004\":\n",
" dataset_names.append(i.name)\n",
"\n",
"dataset_names.sort()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Prepare data for visualization\n",
"data = []\n",
"\n",
"for trial in study.trials:\n",
" if trial.intermediate_values:\n",
" for dataset_id, value in enumerate(trial.intermediate_values.values()):\n",
" params = trial.params # Extract trial parameters\n",
" # Record dataset_id, loss (intermediate value), and parameters\n",
" data.append({\n",
" \"dataset_id\": dataset_id,\n",
" \"loss\": value,\n",
" \"ratio\": params.get(\"ratio\", None),\n",
" \"c\": params.get(\"log__C\", None)\n",
" })\n",
"\n",
"# Convert to pandas DataFrame\n",
"df = pd.DataFrame(data)\n",
"\n",
"# Initialize variables to store the best trial per dataset\n",
"num_datasets = len(study.trials[0].intermediate_values) # Assuming all trials have the same number of datasets\n",
"best_trials_per_dataset = [None] * num_datasets # Store best trial numbers\n",
"best_losses_per_dataset = [float(\"inf\")] * num_datasets # Store best loss values\n",
"best_params_per_dataset = [None] * num_datasets # Store best trial parameters\n",
"\n",
"# Loop through all trials to find the best trial for each dataset\n",
"for trial in study.trials:\n",
" if trial.intermediate_values:\n",
" # Iterate through each dataset (position in the intermediate_values list)\n",
" for dataset_id, loss in enumerate(trial.intermediate_values.values()):\n",
" if loss < best_losses_per_dataset[dataset_id]:\n",
" # Update the best trial info for this dataset\n",
" best_losses_per_dataset[dataset_id] = loss\n",
" best_trials_per_dataset[dataset_id] = trial.number\n",
" best_params_per_dataset[dataset_id] = trial.params"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convert the dictionary to a pandas DataFrame\n",
"df = pd.DataFrame(list(study.best_trial.intermediate_values.items()), columns=[\"Dataset\", \"Mean Loss\"])\n",
"# Rename the rows to indicate the dataset number\n",
"df.index = [dataset_names[i] for i in range(len(best_params_per_dataset))]\n",
"df.drop(\"Dataset\", inplace=True, axis=1)\n",
"\n",
"display(df)\n",
"\n",
"# Plot the values (optional)\n",
"df.plot(kind=\"bar\", figsize=(10, 6), legend=False)\n",
"plt.title(\"Mean Losses per Dataset\")\n",
"plt.xlabel(\"Dataset\")\n",
"plt.ylabel(\"Mean Loss\")\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Create a pandas DataFrame\n",
"df = pd.DataFrame(best_params_per_dataset)\n",
"\n",
"# Rename the rows to indicate the dataset number\n",
"df.index = [dataset_names[i] for i in range(len(best_params_per_dataset))]\n",
"\n",
"display(df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create a pandas DataFrame\n",
"df = pd.DataFrame(best_params_per_dataset)\n",
"\n",
"# Plot each parameter separately\n",
"num_params = len(df.columns)\n",
"fig, axes = plt.subplots(num_params, 1, figsize=(8, num_params * 2.5), sharex=False)\n",
"\n",
"for idx, param in enumerate(df.columns):\n",
" ax = axes[idx]\n",
" ax.plot(dataset_names, df[param], marker='o', linestyle='-', color='b', alpha=0.8, label=param)\n",
" ax.set_title(param, fontsize=10)\n",
" ax.set_ylabel(\"Value\", fontsize=8)\n",
" ax.grid(axis=\"y\", linestyle=\"--\", alpha=0.6)\n",
" ax.tick_params(axis=\"y\", labelsize=8)\n",
" ax.legend(fontsize=8, loc=\"upper left\")\n",
" \n",
" # Set dataset names as x-tick labels for each plot\n",
" ax.set_xticks(dataset_names) # Setting positions explicitly\n",
" ax.set_xticklabels(dataset_names, fontsize=8, rotation=90) # Setting labels\n",
"\n",
"# Add x-axis label only to the bottom subplot\n",
"axes[-1].set_xlabel(\"Datasets\", fontsize=10)\n",
"\n",
"# Adjust layout for better spacing\n",
"plt.tight_layout()\n",
"\n",
"# Save or show the plot\n",
"plt.savefig(\"parameter_comparison_lineplots_all_xticks_fixed.pdf\", bbox_inches=\"tight\", dpi=300)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extract intermediate values grouped by dataset_id\n",
"dataset_intermediate_values = defaultdict(list)\n",
"\n",
"for trial in study.trials:\n",
" if trial.intermediate_values:\n",
" # Distribute intermediate values by dataset_id (index in the list)\n",
" for dataset_id, value in enumerate(trial.intermediate_values.values()):\n",
" dataset_intermediate_values[dataset_id].append(value)\n",
"\n",
"# Prepare data for boxplots\n",
"datasets = list(dataset_intermediate_values.keys())\n",
"boxplot_data = [dataset_intermediate_values[dataset_id] for dataset_id in datasets]\n",
"\n",
"# Plot boxplots\n",
"plt.figure(figsize=(12, 6))\n",
"plt.boxplot(boxplot_data, labels=dataset_names, \n",
" showmeans=True, patch_artist=True)\n",
"plt.xlabel(\"Dataset\")\n",
"plt.ylabel(\"Loss\")\n",
"plt.title(f\"Boxplot of Losses for Each Dataset {study_name}\")\n",
"plt.grid(axis=\"y\", linestyle=\"--\", alpha=0.7)\n",
"plt.xticks(rotation=90) # Rotate dataset names for better readability\n",
"plt.tight_layout()\n",
"plt.ylim((0, 0.3))\n",
"\n",
"# Show the plot\n",
"plt.tight_layout()\n",
"plt.savefig(f\"boxplot_per_dataset_{study_name}.pdf\")\n",
"plt.show()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "asreview-2.0",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
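Not part of the commit: a minimal sketch, assuming the same illustrative SQLite path and study name used in the notebook above, of how the stored Optuna study could also be inspected from a plain Python script.

import optuna

storage = "sqlite:///svm_db.sqlite3"  # illustrative path, as in the notebook
study = optuna.load_study(
    study_name="ASReview2 2024-12-20 at 14.49.22",  # study name taken from the notebook
    storage=storage,
)
print("Best trial:", study.best_trial.number, "with loss:", study.best_value)
print("Best params:", study.best_trial.params)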
5 changes: 1 addition & 4 deletions asreview2-optuna/feature_extractors.py
@@ -1,7 +1,5 @@
 import optuna
 
-from asreview.models.feature_extraction import Tfidf
-from asreview.models.feature_extraction import OneHot
-
+from asreview.models.feature_extraction import OneHot, Tfidf
 
 def tfidf_params(trial: optuna.trial.FrozenTrial):
@@ -46,7 +44,6 @@ def onehot_params(trial: optuna.trial.FrozenTrial):
     "onehot": onehot_params,
 }
 
-
 feature_extractors = {
     "tfidf": Tfidf,
     "onehot": OneHot,
31 changes: 23 additions & 8 deletions asreview2-optuna/feature_matrix_scripts/bge-m3.py
@@ -1,23 +1,38 @@
-from pathlib import Path
 import pickle
-from FlagEmbedding import BGEM3FlagModel
+from pathlib import Path
 
 import pandas as pd
 import synergy_dataset as sd
+from FlagEmbedding import BGEM3FlagModel
 from tqdm import tqdm
 
-folder_pickle_files = Path("synergy-dataset", "pickles")
+FORCE = False
+
+folder_pickle_files = Path("synergy-dataset", "pickles_bge-m3")
 folder_pickle_files.mkdir(parents=True, exist_ok=True)
 
 model = BGEM3FlagModel("BAAI/bge-m3", devices=["cuda:0"])
 
 # Loop through datasets
 for dataset in tqdm(sd.iter_datasets(), total=26):
-    # Convert dataset to a DataFrame and reset index
-    df = dataset.to_frame().reset_index()
+    if dataset.name == "Moran_2021":
+        df = pd.read_csv("../datasets/Moran_2021_corrected_shuffled_raw.csv")
+    else:
+        # Convert dataset to a DataFrame and reset index
+        df = dataset.to_frame().reset_index()
 
     # Combine 'title' and 'abstract' text
     combined_texts = (df["title"].fillna("") + " " + df["abstract"].fillna("")).tolist()
 
+    dataset_name = (
+        dataset.name if dataset.name != "Moran_2021" else "Moran_2021_corrected"
+    )
+    pickle_file_path = folder_pickle_files / f"{dataset_name}.pkl"
+
+    # Check if the pickle file already exists
+    if not FORCE and pickle_file_path.exists():
+        print(f"Skipping {dataset_name}, pickle file already exists.")
+        continue
 
     # Generate embeddings using the LLM embedder
     X = model.encode(
         combined_texts,
@@ -29,8 +44,8 @@
     )
 
     # Save embeddings and labels as a pickle file
-    with open(folder_pickle_files / f"dense-{dataset.name}.pkl", "wb") as f:
+    with open(folder_pickle_files / f"{dataset_name}.pkl", "wb") as f:
         pickle.dump((X["dense_vecs"], df["label_included"].tolist()), f)
 
-    with open(folder_pickle_files / f"sparse-{dataset.name}.pkl", "wb") as f:
+    with open(folder_pickle_files / f"sparse-{dataset_name}.pkl", "wb") as f:
         pickle.dump((X["lexical_weights"], df["label_included"].tolist()), f)
57 changes: 57 additions & 0 deletions asreview2-optuna/feature_matrix_scripts/labse.py
@@ -0,0 +1,57 @@
import pickle
from pathlib import Path

import pandas as pd
import synergy_dataset as sd # Assuming this is your custom dataset handler
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

FORCE = False

# Folder to save embeddings
folder_pickle_files = Path("synergy-dataset", "pickles_labse")
folder_pickle_files.mkdir(parents=True, exist_ok=True)

# Load LaBSE model
model = SentenceTransformer("sentence-transformers/LaBSE")

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Loop through datasets
for dataset in tqdm(sd.iter_datasets(), total=26):
    if dataset.name == "Moran_2021":
        df = pd.read_csv("../datasets/Moran_2021_corrected_shuffled_raw.csv")
    else:
        # Convert dataset to a DataFrame and reset index
        df = dataset.to_frame().reset_index()

    # Combine 'title' and 'abstract' text
    combined_texts = (df["title"].fillna("") + " " + df["abstract"].fillna("")).tolist()

    dataset_name = (
        dataset.name if dataset.name != "Moran_2021" else "Moran_2021_corrected"
    )
    pickle_file_path = folder_pickle_files / f"{dataset_name}.pkl"

    # Check if the pickle file already exists
    if not FORCE and pickle_file_path.exists():
        print(f"Skipping {dataset_name}, pickle file already exists.")
        continue

    # Generate embeddings
    X = model.encode(
        combined_texts, batch_size=64, show_progress_bar=False, device=device
    )

    # Save embeddings and labels as a pickle file
    with open(folder_pickle_files / f"{dataset_name}.pkl", "wb") as f:
        pickle.dump(
            (
                X,
                df["label_included"].tolist(),
            ),
            f,
        )
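Not part of the commit: a minimal sketch of reading back one of the pickles written by the two embedding scripts above, assuming the (embeddings, labels) tuple format they dump. The dataset name in the path is illustrative.

import pickle
from pathlib import Path

# Illustrative path; the scripts above write one pickle per Synergy dataset.
pickle_path = Path("synergy-dataset", "pickles_labse", "Appenzeller-Herzog_2019.pkl")

with open(pickle_path, "rb") as f:
    X, labels = pickle.load(f)  # embedding matrix and list of label_included values

print(len(labels), "records loaded")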