Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions bin/scripts/debug_age_vs_logistic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"""Debug entry point: run the age-model vs logistic-regression comparison."""
import ageml.commands as commands


def main() -> None:
    """Invoke the comparison command from the ageml CLI layer."""
    commands.age_model_vs_logistic_regression()


if __name__ == "__main__":
    main()
4 changes: 4 additions & 0 deletions bin/scripts/debug_feature_influence.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"""Debug entry point: run the model feature-influence analysis."""
import ageml.commands as commands


def main() -> None:
    """Invoke the feature-influence command from the ageml CLI layer."""
    commands.model_feature_influence()


if __name__ == "__main__":
    main()
6 changes: 3 additions & 3 deletions src/ageml/datasets/synthetic_data.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import os
from typing import List
import importlib.resources as pkg_resources
import importlib.resources as resources

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

from ageml import datasets

datasets_path = str(pkg_resources.files(datasets))
datasets_path = str(resources.files(datasets))

# RNG for reproducibility
seed = 107146869338163146621163044826586732901
Expand Down Expand Up @@ -240,7 +240,7 @@ def __init__(self):
Raises:
FileNotFoundError: presumably raised when a required dataset file is missing — confirm against the method body.
"""
datasets_path = str(pkg_resources.files(datasets))
datasets_path = str(resources.files(datasets))
self.data_paths = {
"features": os.path.join(datasets_path, "toy_features.csv"),
"clinical": os.path.join(datasets_path, "toy_clinical.csv"),
Expand Down
124 changes: 89 additions & 35 deletions src/ageml/modelling.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ def __init__(
self.verbose = verbose

# Initialize metrics storage
self.metrics = CVMetricsHandler(task_type='regression')
self.metrics = CVMetricsHandler(task_type="regression")

def set_hyperparameter_grid(self):
"""Build the hyperparameter grid of the selected model upon AgeML object initialization
Expand Down Expand Up @@ -389,7 +389,7 @@ def fit_age(self, X, y):
print(f"\nRunning CV splits with pipeline:\n{cv_pipeline}")
temp_pred_age = np.zeros(y.shape[0])
temp_corr_age = np.zeros(y.shape[0])
split_metrics = CVMetricsHandler(task_type='regression')
split_metrics = CVMetricsHandler(task_type="regression")
kf_hyperopt = model_selection.KFold(n_splits=self.CV_split, random_state=self.seed, shuffle=True)
for i, (train, test) in enumerate(kf_hyperopt.split(X)):
X_train, X_test = X[train], X[test]
Expand Down Expand Up @@ -419,7 +419,7 @@ def fit_age(self, X, y):

# Compute the mean of scores over all CV splits
split_summary = split_metrics.get_summary()
mean_score_test = split_summary['test']['mae']['mean']
mean_score_test = split_summary["test"]["mae"]["mean"]
mae_means_test.append(mean_score_test)

# If the mean MAE is better than the previous best, save the results
Expand All @@ -440,16 +440,32 @@ def fit_age(self, X, y):
# Calculate metrics over all splits
summary_dict = self.metrics.get_summary()
print("Summary metrics over all CV splits")
print("Train: MAE %.2f ± %.2f, RMSE %.2f ± %.2f, R2 %.3f ± %.3f, p %.3f ± %.3f"
% (summary_dict['train']['mae']['mean'], summary_dict['train']['mae']['std'],
summary_dict['train']['rmse']['mean'], summary_dict['train']['rmse']['std'],
summary_dict['train']['r2']['mean'], summary_dict['train']['r2']['std'],
summary_dict['train']['p']['mean'], summary_dict['train']['p']['std']))
print("Test: MAE %.2f ± %.2f, RMSE %.2f ± %.2f, R2 %.3f ± %.3f, p %.3f ± %.3f"
% (summary_dict['test']['mae']['mean'], summary_dict['test']['mae']['std'],
summary_dict['test']['rmse']['mean'], summary_dict['test']['rmse']['std'],
summary_dict['test']['r2']['mean'], summary_dict['test']['r2']['std'],
summary_dict['test']['p']['mean'], summary_dict['test']['p']['std']))
print(
"Train: MAE %.2f ± %.2f, RMSE %.2f ± %.2f, R2 %.3f ± %.3f, p %.3f ± %.3f"
% (
summary_dict["train"]["mae"]["mean"],
summary_dict["train"]["mae"]["std"],
summary_dict["train"]["rmse"]["mean"],
summary_dict["train"]["rmse"]["std"],
summary_dict["train"]["r2"]["mean"],
summary_dict["train"]["r2"]["std"],
summary_dict["train"]["p"]["mean"],
summary_dict["train"]["p"]["std"],
)
)
print(
"Test: MAE %.2f ± %.2f, RMSE %.2f ± %.2f, R2 %.3f ± %.3f, p %.3f ± %.3f"
% (
summary_dict["test"]["mae"]["mean"],
summary_dict["test"]["mae"]["std"],
summary_dict["test"]["rmse"]["mean"],
summary_dict["test"]["rmse"]["std"],
summary_dict["test"]["r2"]["mean"],
summary_dict["test"]["r2"]["std"],
summary_dict["test"]["p"]["mean"],
summary_dict["test"]["p"]["std"],
)
)

elif self.model_type == "hyperopt":
print("Running Hyperparameter optimization with 'hyperopt' model option...")
Expand Down Expand Up @@ -566,7 +582,7 @@ class Classifier:
predict(self, X): Predict class labels with fitted model.
"""

def __init__(self, CV_split: int = 5, seed=None, thr: float = 0.5, ci_val: float = 0.95, verbose: bool = False):
def __init__(self, CV_split: int = 5, seed: int = 1102, thr: float = 0.5, ci_val: float = 0.95, verbose: bool = False):
Comment thread
itellaetxe marked this conversation as resolved.
"""Initialise variables."""

# Set required modelling parts
Expand All @@ -584,7 +600,7 @@ def __init__(self, CV_split: int = 5, seed=None, thr: float = 0.5, ci_val: float
self.verbose = verbose

# Initialize metrics storage
self.metrics = CVMetricsHandler(task_type='classification')
self.metrics = CVMetricsHandler(task_type="classification")

def set_model(self):
"""Sets the model to use in the pipeline."""
Expand Down Expand Up @@ -624,7 +640,7 @@ def set_ci(self, ci_val):
def _calculate_metrics(self, y_pred, y_true):
"""Calculate metrics for classification."""

# Calculate AUC, accuracy, sensitivity and specificity
# Calculate AUC, accuracy, sensitivity and specificity
auc = metrics.roc_auc_score(y_true, y_pred)
acc = metrics.accuracy_score(y_true, y_pred > self.thr)
tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred > self.thr).ravel()
Expand All @@ -634,24 +650,44 @@ def _calculate_metrics(self, y_pred, y_true):
return auc, acc, sensitivity, specificity

@verbose_wrapper
def fit_model(self, X, y, scale=False):
def fit_model(self, X, y, subsample: bool = False, scale=False):
"""Fit the model.

Parameters
----------
X: 2D-Array with features; shape=(n,m)
y: 1D-Array with labbels; shape=n"""
y: 1D-Array with labels; shape=n
subsample: bool to indicate if we subsample the bigger group in the CV"""

# Arrays to store values
y = y.ravel()
y_preds = np.empty(shape=y.shape)

# Find which is the majority class when subsampling is indicated
if subsample:
Comment thread
itellaetxe marked this conversation as resolved.
print("Subsampling majority class inside CV folds.")
unique, counts = np.unique(y, return_counts=True)
maj_class = unique[np.argmax(counts)]

Comment thread
itellaetxe marked this conversation as resolved.
# To avoid data underutilization and biases, first split data into CV splits,
# then subsample majority class inside each split if indicated by flag.
kf = model_selection.KFold(n_splits=self.CV_split, shuffle=True, random_state=self.seed)
for train_index, test_index in kf.split(X):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

# Scale data
# Inside CV split, subsample majority class if indicated by flag
# If both classes are of same size, no subsampling is done
if subsample and (counts[0] != counts[1]):
maj_class_indices = np.where(y_train == maj_class)[0]
min_class_indices = np.where(y_train != maj_class)[0]
n_min = len(min_class_indices)
subsampled_maj_indices = np.random.choice(maj_class_indices, size=n_min, replace=False)
selected_indices = np.concatenate((subsampled_maj_indices, min_class_indices))
X_train = X_train[selected_indices]
y_train = y_train[selected_indices]

# Scale data if indicated by flag
if scale:
self.scaler = StandardScaler()
X_train = self.scaler.fit_transform(X_train)
Expand All @@ -665,7 +701,9 @@ def fit_model(self, X, y, scale=False):
y_preds[test_index] = y_pred

# Calculate metrics
auc_train, acc_train, sensitivity_train, specificity_train = self._calculate_metrics(self.model.predict_proba(X_train)[::, 1], y_train)
auc_train, acc_train, sensitivity_train, specificity_train = self._calculate_metrics(
self.model.predict_proba(X_train)[::, 1], y_train
)
auc, acc, sensitivity, specificity = self._calculate_metrics(y_pred, y_test)
test_metrics = ClassificationFoldMetrics(auc, acc, sensitivity, specificity)
train_metrics = ClassificationFoldMetrics(auc_train, acc_train, sensitivity_train, specificity_train)
Expand All @@ -676,18 +714,34 @@ def fit_model(self, X, y, scale=False):

# Print results
print("Summary metrics over all CV splits (%s CI)" % (self.ci_val))
print("AUC: %.3f [%.3f-%.3f]" % (summary_dict['test']['auc']['mean'],
summary_dict['test']['auc']['95ci'][0],
summary_dict['test']['auc']['95ci'][1]))
print("Accuracy: %.3f [%.3f-%.3f]" % (summary_dict['test']['accuracy']['mean'],
summary_dict['test']['accuracy']['95ci'][0],
summary_dict['test']['accuracy']['95ci'][1]))
print("Sensitivity: %.3f [%.3f-%.3f]" % (summary_dict['test']['sensitivity']['mean'],
summary_dict['test']['sensitivity']['95ci'][0],
summary_dict['test']['sensitivity']['95ci'][1]))
print("Specificity: %.3f [%.3f-%.3f]" % (summary_dict['test']['specificity']['mean'],
summary_dict['test']['specificity']['95ci'][0],
summary_dict['test']['specificity']['95ci'][1]))
print(
"AUC: %.3f [%.3f-%.3f]"
% (summary_dict["test"]["auc"]["mean"], summary_dict["test"]["auc"]["95ci"][0], summary_dict["test"]["auc"]["95ci"][1])
)
print(
"Accuracy: %.3f [%.3f-%.3f]"
% (
summary_dict["test"]["accuracy"]["mean"],
summary_dict["test"]["accuracy"]["95ci"][0],
summary_dict["test"]["accuracy"]["95ci"][1],
)
)
print(
"Sensitivity: %.3f [%.3f-%.3f]"
% (
summary_dict["test"]["sensitivity"]["mean"],
summary_dict["test"]["sensitivity"]["95ci"][0],
summary_dict["test"]["sensitivity"]["95ci"][1],
)
)
print(
"Specificity: %.3f [%.3f-%.3f]"
% (
summary_dict["test"]["specificity"]["mean"],
summary_dict["test"]["specificity"]["95ci"][0],
summary_dict["test"]["specificity"]["95ci"][1],
)
)

# Final model trained on all data
if scale:
Expand All @@ -709,11 +763,11 @@ def predict(self, X, scale=False):
# Check that model has previously been fit
if not self.modelFit:
raise ValueError("Must fit the classifier before calling predict.")

# Scale data
if scale and hasattr(self, 'scaler'):
if scale and hasattr(self, "scaler"):
X = self.scaler.transform(X)
elif scale and not hasattr(self, 'scaler'):
elif scale and not hasattr(self, "scaler"):
raise ValueError("Must fit the model with scaling before calling predict with scaling.")

# Predict class labels
Expand Down
Loading