diff --git a/Survival model/Survival-prediction.py b/Survival model/Survival-prediction.py
index 8b13789..2c070c1 100644
--- a/Survival model/Survival-prediction.py
+++ b/Survival model/Survival-prediction.py
@@ -1 +1,594 @@
+# -*- coding: utf-8 -*-
+"""All_Data.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+    https://colab.research.google.com/drive/1Xe0HDhkxqeR4gEw0Si1-c_R3mLbT3WSd
+"""
+
+# Step 1: Load packages
+
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib
+import matplotlib.pyplot as plt
+from sklearn.model_selection import (KFold, GridSearchCV, train_test_split,
+                                     cross_val_score, cross_val_predict)
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
+from sklearn.pipeline import Pipeline
+from sklearn.decomposition import PCA
+
+from sksurv.metrics import concordance_index_censored
+from sksurv.svm import FastSurvivalSVM
+from sksurv.linear_model import CoxPHSurvivalAnalysis, CoxnetSurvivalAnalysis
+from sksurv.ensemble import RandomSurvivalForest
+from sksurv.ensemble import GradientBoostingSurvivalAnalysis, ComponentwiseGradientBoostingSurvivalAnalysis
+#from lifelines import CoxPHFitter
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+matplotlib.rcParams['pdf.fonttype'] = 42
+matplotlib.rcParams['ps.fonttype'] = 42
+
+# Step 2: Load data
+RNA_32 = pd.read_csv('C:\\Users\\Noushin\\OneDrive - Teesside University\\Noushin-paper\\Feture_vector\\32-size\\patient_RNA_62.csv')
+Meta_32 = pd.read_csv('C:\\Users\\Noushin\\OneDrive - Teesside University\\Noushin-paper\\Feture_vector\\32-size\\patient_Meta_59.csv')
+IMG_32 = pd.read_csv('C:\\Users\\Noushin\\OneDrive - Teesside University\\Noushin-paper\\Feture_vector\\32-size\\patient_IMG_83.csv')
+feature1 = pd.merge(IMG_32, RNA_32, on='sample')
+data = pd.merge(feature1, Meta_32, on='sample')
+#RNA_coxCC_2 = pd.read_csv('/content/drive/MyDrive/Feature-vector/RNA_2_16.csv')
+#Feature_vector = pd.merge(RNA_coxCC, RNA_coxCC_2, on='sample')
+Survival_Data = pd.read_excel('D:\\Second-year\\survival-data\\survival.xlsx', sheet_name='TCGA-CDR', usecols="A,Y,Z")
+Feature_vector_Final = pd.merge(data, Survival_Data, on='sample')
+
+# Step 3: Merge features with overall survival (OS) time and status,
+# aligning column names with the TCGA-CDR conventions
+Feature_vector_Final = Feature_vector_Final.rename(columns={'sample': 'bcr_patient_barcode'})
+Feature_vector_Final = Feature_vector_Final.rename(columns={'OS': 'OS.Status'})
+
+data = Feature_vector_Final.drop(['bcr_patient_barcode', 'OS.Status', 'OS.time'], axis=1)
+print(data.head())
+
+# Step 4: Preprocess features, then re-attach the labels to the frame.
+# Option 1: MinMaxScaler (kept commented for comparison)
+#ss = MinMaxScaler()
+#df = data
+#df = pd.DataFrame(ss.fit_transform(df), columns=df.columns)
+#df['OS.Status'] = Feature_vector_Final['OS.Status']
+#df['OS.time'] = Feature_vector_Final['OS.time']
+
+# Option 2: StandardScaler (used below; compare both)
+ss = StandardScaler()
+df = data
+df = pd.DataFrame(ss.fit_transform(df), columns=df.columns)
+df['OS.Status'] = Feature_vector_Final['OS.Status']
+df['OS.time'] = Feature_vector_Final['OS.time']
+
+# Step 5: Machine learning methods for survival analysis
+
+# 5.1: Set up the seed and cross-validation scheme
+SEED = 50
+CV = KFold(n_splits=3, shuffle=True, random_state=0)
+TEST_SIZE = 0.3
+
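+# Illustrative aside (safe to remove): a toy check of the metric used
+# throughout this script. concordance_index_censored(event, time, risk)
+# returns the C-index as its first element; risk scores that rank every
+# comparable pair correctly give 1.0. The toy_* names are hypothetical and
+# not part of the analysis.
+toy_event = np.array([True, False, True])  # True = event (death) observed
+toy_time = np.array([10.0, 25.0, 40.0])    # follow-up times
+toy_risk = np.array([3.0, 2.0, 1.0])       # higher risk = earlier expected event
+print('Toy C-index:', concordance_index_censored(toy_event, toy_time, toy_risk)[0])
+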
+# 5.2: Split data to prepare for ML (features already normalised above)
+
+X = df.drop(['OS.time', 'OS.Status'], axis=1)
+from sklearn.feature_selection import VarianceThreshold
+sel = VarianceThreshold(threshold=(.6 * (1 - .8)))
+# Keep only the selected columns (previously the fitted result was discarded)
+X = pd.DataFrame(sel.fit_transform(X), columns=X.columns[sel.get_support()])
+df['OS.Status'] = df['OS.Status'] == 1
+# y must be a structured array: (event indicator, observed time)
+y = df[['OS.Status', 'OS.time']].to_records(index=False)
+#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)
+
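+# Note (sketch): sksurv also ships a helper that builds the same structured
+# target and validates the dtypes; an equivalent, commented alternative:
+#
+#   from sksurv.util import Surv
+#   y = Surv.from_dataframe('OS.Status', 'OS.time', df)
+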
+# Define a function that tunes a model with grid search and evaluates the
+# best estimator on the hold-out set. Assumes y_* are structured arrays with
+# fields 'OS.Status' and 'OS.time'.
+def grid_search(estimator, param, X_train, y_train, X_test, y_test, CV):
+
+    # Run the grid search
+    gcv = GridSearchCV(
+        estimator,
+        param_grid=param,
+        cv=CV,
+        n_jobs=-1).fit(X_train, y_train)
+
+    # Keep the best model
+    model = gcv.best_estimator_
+    print(model)
+
+    # Evaluate on the hold-out set
+    prediction = model.predict(X_test)
+    result = concordance_index_censored(y_test["OS.Status"], y_test["OS.time"], prediction)
+    print('C-index for test set (hold-out):', result[0])
+
+    return [model, round(result[0], 3), prediction]
+
+# 5.3: Build models
+
+# Refit and evaluate a tuned model on n random train/test splits to gauge
+# the spread of the C-index.
+def c_index(model, X, y, n=5):
+
+    # Generate n reproducible seeds
+    np.random.seed(1)
+    seeds = np.random.permutation(1000)[:n]
+
+    # Train and evaluate the model n times
+    cindex_score = []
+    predict_list = []
+
+    for s in seeds:
+        X_trn, X_test, y_trn, y_test = train_test_split(X, y, test_size=0.3, random_state=s)
+        model.fit(X_trn, y_trn)
+        prediction = model.predict(X_test)
+        predict_list.append(prediction)
+        result = concordance_index_censored(y_test["OS.Status"], y_test["OS.time"], prediction)
+        cindex_score.append(round(result[0], 3))
+
+    print('Average C-index over repeated splits:', np.mean(cindex_score))
+
+    return [cindex_score, predict_list]
+
+"""Alternative, larger grid for Gradient Boosting Survival (kept for reference)
+pipe_gbs = Pipeline([('model', GradientBoostingSurvivalAnalysis())])
+param_gbs = {
+    'model__random_state': [SEED],
+    'model__learning_rate': [0.01, 0.1, 1],
+    'model__n_estimators': [100, 200, 500, 800, 1000],
+    'model__min_samples_split': [2, 10, 8],
+    'model__min_samples_leaf': [1, 2, 8],
+    'model__max_depth': [3, 4, 8]}
+"""
+
+# Define the pipelines and hyperparameter grids
+# Random Survival Forest
+pipe_rsf = Pipeline([('model', RandomSurvivalForest())])
+param_rsf = {
+    'model__random_state': [SEED],
+    'model__max_features': ['sqrt', 'auto'],
+    'model__max_depth': [8, 16, 32, 64],
+    'model__min_samples_leaf': [8, 15],
+    'model__n_estimators': [200, 500, 800, 1000]}
+
+# Componentwise Gradient Boosting Survival
+pipe_cgbs = Pipeline([('model', ComponentwiseGradientBoostingSurvivalAnalysis())])
+param_cgbs = {
+    'model__random_state': [50, 20, 200],
+    'model__learning_rate': [0.001, 0.01, 0.1, 1],
+    'model__n_estimators': [200, 500, 800, 1000]}
+
+# Gradient Boosting Survival
+pipe_gbs = Pipeline([('model', GradientBoostingSurvivalAnalysis())])
+param_gbs = {
+    'model__random_state': [SEED],
+    'model__learning_rate': [0.01, 0.1, 1],
+    'model__n_estimators': [200, 500, 800, 1000]}
+
+# CoxPHSurvivalAnalysis
+pipe_cox = Pipeline([('model', CoxPHSurvivalAnalysis())])
+param_cox = {
+    'model__alpha': [0, 0.001, 0.01, 0.1, 1, 10, 100]}
+
+# CoxnetSurvivalAnalysis
+pipe_coxnet = Pipeline([('model', CoxnetSurvivalAnalysis())])
+param_coxnet = {
+    'model__alpha_min_ratio': [0.001, 0.01, 0.1, 0.5],
+    'model__l1_ratio': [0.0001, 0.001, 0.01, 0.5, 0.9, 1]}
+
+# Survival SVM
+pipe_svm = Pipeline([('model', FastSurvivalSVM())])
+param_svm = {
+    'model__random_state': [SEED],
+    'model__max_iter': [20, 100, 1000, 100000],
+    'model__optimizer': ['avltree', 'rbtree']}
+
+# Estimator list
+estimator_list = {
+    'Random Forest Survival': [pipe_rsf, param_rsf],
+    'Component Wise Gradient Boosting Survival': [pipe_cgbs, param_cgbs],
+    'Gradient Boosting Survival': [pipe_gbs, param_gbs],
+    'Cox Regression': [pipe_cox, param_cox],
+    'Coxnet Regression': [pipe_coxnet, param_coxnet],
+    'SVM Survival': [pipe_svm, param_svm]}
+
+#%%
+# Nested CV: outer 3-fold split for hold-out evaluation, with the inner CV
+# inside grid_search used for hyperparameter tuning
+
+hold_out_results = []
+pred_list = []
+c_index_list = []
+pred_list_n = []
+
+for model_name, (estimator, param) in estimator_list.items():
+    print('\n', model_name)
+    for train_ix, test_ix in CV.split(X):
+        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
+        y_train, y_test = y[train_ix], y[test_ix]
+
+        outcome = grid_search(estimator, param, X_train, y_train, X_test, y_test, CV)
+        model = outcome[0]
+        hold_out_results.append(outcome[1])
+        pred_list.append(outcome[2])
+
+        # Refit the tuned model n times to check the C-index spread
+        score, pre = c_index(model, X, y, n=5)
+        c_index_list.append(score)
+        pred_list_n.append(pre)
+#%%
+# Nested CV C-index: c_index_list holds 18 entries (6 models x 3 outer folds,
+# in estimator_list order), so each model owns three consecutive entries
+
+print('RSF mean:', np.mean(c_index_list[0:3]))
+print('CGBSurv mean:', np.mean(c_index_list[3:6]))
+print('GBSurv mean:', np.mean(c_index_list[6:9]))
+print('CoxPH mean:', np.mean(c_index_list[9:12]))
+print('Coxnet mean:', np.mean(c_index_list[12:15]))
+print('SurvSVM mean:', np.mean(c_index_list[15:18]))
+
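+# Equivalent aggregation (sketch): with 6 models x 3 outer folds, each entry
+# of c_index_list holding 5 repeat scores, reshaping avoids the hand-written
+# slice bounds above. The nested_* names are for illustration only.
+nested_scores = np.asarray(c_index_list).reshape(6, -1)  # (model, fold*repeat)
+for nested_name, nested_row in zip(['RSF', 'CGBSurv', 'GBSurv', 'CoxPH', 'Coxnet', 'SurvSVM'], nested_scores):
+    print(nested_name, 'mean:', round(nested_row.mean(), 3))
+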
+#%%
+# Nested-CV box plot
+
+my_array = np.array(c_index_list)
+a = np.concatenate([my_array[0, :], my_array[1, :], my_array[2, :]])
+b = np.concatenate([my_array[3, :], my_array[4, :], my_array[5, :]])
+c = np.concatenate([my_array[6, :], my_array[7, :], my_array[8, :]])
+d = np.concatenate([my_array[9, :], my_array[10, :], my_array[11, :]])
+e = np.concatenate([my_array[12, :], my_array[13, :], my_array[14, :]])
+g = np.concatenate([my_array[15, :], my_array[16, :], my_array[17, :]])
+cc = [a, b, c, d, e, g]
+
+name = ['RSF', 'CGBSurv', 'GBSurv', 'CoxPH', 'Coxnet', 'SurvSVM']
+res = []
+for i in range(0, 6):
+    for score in cc[i]:
+        res.append([name[i], score])
+
+c_plot = pd.DataFrame(res, columns=['Learning approach', 'C-Index'])
+sns.set_style("white")
+ax = sns.boxplot(x="Learning approach", y="C-Index", data=c_plot)
+ax = sns.stripplot(x="Learning approach", y="C-Index", data=c_plot, jitter=False, edgecolor="black")
+# A violin plot combines the box plot and a density estimate: denser regions
+# of the data are fatter, sparser regions thinner
+sns.violinplot(x="Learning approach", y="C-Index", data=c_plot, size=6, edgecolor="black")
+#plt.title('All Data')
+#plt.title('Images and Transcriptomics Data')
+plt.title('Metabolomics and Transcriptomics Data')
+#plt.title('Metabolomics and Images Data')
+sns.set_style("whitegrid", {'axes.grid': False})
+plt.ylim(0.3, 1)
+plt.savefig('C:\\Users\\Noushin\\OneDrive - Teesside University\\Noushin-paper\\result\\New_pdf_adobe\\Meta_RNA_plot_nested_1.pdf', transparent=True)
+
+#%%
+# Hold-out evaluation on a single 80/20 split
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
+#%%
+hold_out_results = []
+pred_list = []
+c_index_list = []
+pred_list_n = []
+
+for model_name, (estimator, param) in estimator_list.items():
+    print('\n', model_name)
+    outcome = grid_search(estimator, param, X_train, y_train, X_test, y_test, CV)
+    model = outcome[0]
+    hold_out_results.append(outcome[1])
+    pred_list.append(outcome[2])
+
+    # Refit the tuned model n times to check the C-index spread
+    score, pre = c_index(model, X, y, n=15)
+    c_index_list.append(score)
+    pred_list_n.append(pre)
+
+#%%
+# Visual results
+
+name = ['RSF', 'CGBSurv', 'GBSurv', 'CoxPH', 'Coxnet', 'SurvSVM']
+cv_res = []
+for i in range(0, 6):
+    for score in c_index_list[i]:
+        cv_res.append([name[i], score])
+
+c_plot = pd.DataFrame(cv_res, columns=['Learning approach', 'C-Index'])
+sns.set_style("white")
+#ax = sns.boxplot(x="Learning approach", y="C-Index", data=c_plot)
+ax = sns.stripplot(x="Learning approach", y="C-Index", data=c_plot, jitter=True, edgecolor="black")
+# A violin plot combines the box plot and a density estimate: denser regions
+# of the data are fatter, sparser regions thinner
+sns.violinplot(x="Learning approach", y="C-Index", data=c_plot, size=6, edgecolor="black")
+plt.title('All Data')
+#plt.title('Images and Transcriptomics Data')
+#plt.title('Metabolomics and Transcriptomics Data')
+#plt.title('Metabolomics and Images Data')
+sns.set_style("whitegrid", {'axes.grid': False})
+plt.ylim(0.3, 1)
+#plt.savefig('C:\\Users\\Noushin\\OneDrive - Teesside University\\AllData_plot.pdf', transparent=True)
+
+#%%
+CoxPH = c_plot[c_plot['Learning approach'] == 'CoxPH']['C-Index'].values
+Coxnet = c_plot[c_plot['Learning approach'] == 'Coxnet']['C-Index'].values
+RSF = c_plot[c_plot['Learning approach'] == 'RSF']['C-Index'].values
+GBSurv = c_plot[c_plot['Learning approach'] == 'GBSurv']['C-Index'].values
+CGBSurv = c_plot[c_plot['Learning approach'] == 'CGBSurv']['C-Index'].values
+SurvSVM = c_plot[c_plot['Learning approach'] == 'SurvSVM']['C-Index'].values
+
+print('CoxPH median:', np.median(CoxPH))
+print('Coxnet median:', np.median(Coxnet))
+print('RSF median:', np.median(RSF))
+print('GBSurv median:', np.median(GBSurv))
+print('CGBSurv median:', np.median(CGBSurv))
+print('SurvSVM median:', np.median(SurvSVM))
+#%%
+from scipy import stats
+#t, p = stats.ttest_ind(CGBSurv, GBSurv)
+#t, p = stats.ttest_ind(CoxPH, Coxnet)
+#t, p = stats.ttest_ind(Coxnet, SurvSVM)
+#t, p = stats.ttest_ind(Coxnet, RSF)
+t, p = stats.ttest_ind(RSF, CGBSurv)
+#t, p = stats.ttest_ind(GBSurv, CoxPH)
+#t, p = stats.ttest_ind(Coxnet, CGBSurv)
+#t, p = stats.ttest_ind(CoxPH, RSF)
+print('RSF vs CGBSurv: t =', t, ', p =', p)
+
+#%%
+"""Interpretation:
+The sign of the t-value gives the direction of the difference only: positive
+means the mean of g1 is greater than the mean of g2, negative means it is
+smaller. Whether the difference is significant must be judged from the
+p-value (e.g. p < 0.05), not from the sign of t.
+"""
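+# Generalisation of the commented pairwise tests above (sketch): run the
+# same independent t-test over every pair of approaches in one pass.
+# pair_groups is an illustrative name; the arrays are defined above.
+from itertools import combinations
+pair_groups = [('RSF', RSF), ('CGBSurv', CGBSurv), ('GBSurv', GBSurv),
+               ('CoxPH', CoxPH), ('Coxnet', Coxnet), ('SurvSVM', SurvSVM)]
+for (name_a, scores_a), (name_b, scores_b) in combinations(pair_groups, 2):
+    t_ab, p_ab = stats.ttest_ind(scores_a, scores_b)
+    print('%s vs %s: t = %.3f, p = %.4f' % (name_a, name_b, t_ab, p_ab))
+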
+""" + + + +from lifelines.statistics import logrank_test +from lifelines import KaplanMeierFitter +from lifelines.plotting import add_at_risk_counts + +names = list(estimator_list.keys()) +k=0 +fig, ax = plt.subplots(2,3, figsize=(12,12)) +# fig, ax = plt.subplots() +for pred in pred_list: + df1 = X_test.reset_index(drop=True) + y_pred = pred + med = np.median(y_pred) + + r = np.where(y_pred >= med, 1, 0) + df1['Risk'] = r + df1['pred'] = pred + df3=df1.sort_values(by='pred', ascending=False) + High_risk=df3.head(n=6) + Low_risk=df3.tail(n=6) + print(df1.shape) + ix = df1['Risk'] == 1 + #ix=High_risk.index + #ix1=Low_risk.index + + df_y = pd.DataFrame(y_test) + df_y['OS.Status'] = np.where(df_y['OS.Status'] == True, 1, 0) + df1['OS.Status']= df_y['OS.Status'] + df1['OS.time']= df_y['OS.time'] + T_hr, E_hr = df1.loc[ix]['OS.time'], df1.loc[ix]['OS.Status'] + T_lr, E_lr = df1.loc[~ix]['OS.time'], df1.loc[~ix]['OS.Status'] + #T_lr, E_lr = df1.loc[ix1]['OS.time'], df1.loc[ix1]['OS.Status'] + + + # Set-up plots + k +=1 + plt.subplot(3,2,k) + + # Fit survival curves + kmf_control = KaplanMeierFitter() + ax = kmf_control.fit(T_hr, E_hr, label='HR').plot_survival_function() + + kmf_exp = KaplanMeierFitter() + ax = kmf_exp.fit(T_lr, E_lr, label='LR').plot_survival_function() + + + add_at_risk_counts(kmf_exp, kmf_control) + # Format graph + plt.ylim(0,1) + ax.set_xlabel('Timeline (months)',fontsize='large') + ax.set_ylabel('Percentage of Population Alive',fontsize='large') + + # Calculate p-value + # res = logrank_test(T_hr, T_lr, alpha=.95) + res = logrank_test(T_hr, T_lr, event_observed_A=E_hr, event_observed_B=E_lr, alpha=.95) + print('\nModel', names[k-1]) + res.print_summary() + + + # Location the label at the 1st out of 9 tick marks + xloc = max(np.max(T_hr),np.max(T_lr)) / 10 + ax.text(xloc,.2,res.p_value,fontsize=12) + plt.subplot(3,2,k).set_title('KM Curves {}' .format(names[k-1])) + plt.rcParams['axes.edgecolor'] = 'black' + plt.tight_layout() + #plt.savefig('D:\\Second-year\\Result\\All Data_km_30%.pdf') + + +#HR=pred_list +#np.savetxt("D:\\Second-year\\Result\\GFG1.csv", pred_list,delimiter =", ", fmt ='% s') +#HR_index=pd.read_csv("D:\\Second-year\\Result\\GFG1.csv") +#HR_index=pd.read_csv("D:\\Second-year\\Result\\GFG1.csv",header=None) +#HR_index_t=HR_index.T +#np.savetxt("D:\\Second-year\\Result\\GFG_t.csv", HR_index_t,delimiter =", ", fmt ='% s') +#print(np.median(pred_list[2])) +#df.to_csv('D:\\Second-year\\Result\\index-data.csv') +#%% + +from lifelines.statistics import logrank_test +from lifelines import KaplanMeierFitter +from lifelines.plotting import add_at_risk_counts + +names = list(estimator_list.keys()) +k=0 +#fig, ax = plt.subplots(2,1, figsize=(10,12)) +fig, ax = plt.subplots(3,2, figsize=(12,12)) +# fig, ax = plt.subplots() +for pred in pred_list: + df1 = X_test.reset_index(drop=True) + y_pred = pred + med = np.median(y_pred) + r = np.where(y_pred >= med, 1, 0) + #df1['Risk'] = r + df1['pred'] = pred + print(df1.shape) + df3=df1.sort_values(by='pred', ascending=False) + High_risk=df3.head(n=10) + Low_risk=df3.tail(n=10) + #ix = df1['Risk'] == 1 + ix=High_risk.index + ix1=Low_risk.index + + df_y = pd.DataFrame(y_test) + df_y['OS.Status'] = np.where(df_y['OS.Status'] == True, 1, 0) + df1['OS.Status']= df_y['OS.Status'] + df1['OS.time']= df_y['OS.time'] + T_hr, E_hr = df1.loc[ix]['OS.time'], df1.loc[ix]['OS.Status'] + #T_lr, E_lr = df1.loc[~ix]['OS.time'], df1.loc[~ix]['OS.Status'] + T_lr, E_lr = df1.loc[ix1]['OS.time'], df1.loc[ix1]['OS.Status'] + + + # Set-up plots + + 
+#%%
+# KM curves, second version: compare only the 10 highest- and 10 lowest-risk
+# patients for each model
+
+names = list(estimator_list.keys())
+k = 0
+fig, ax = plt.subplots(3, 2, figsize=(12, 12))
+
+# Mathtext font set-up for the p-value labels; precise font names can be
+# listed with: [f.name for f in matplotlib.font_manager.fontManager.ttflist]
+import matplotlib.font_manager
+plt.rcParams['mathtext.fontset'] = 'custom'
+plt.rcParams['mathtext.it'] = 'Arial:italic'
+plt.rcParams['mathtext.rm'] = 'Arial'
+plt.rcParams['axes.edgecolor'] = 'black'
+plt.rcParams['font.size'] = 14
+
+for pred in pred_list:
+    df1 = X_test.reset_index(drop=True)
+    df1['pred'] = pred
+    print(df1.shape)
+    df3 = df1.sort_values(by='pred', ascending=False)
+    High_risk = df3.head(n=10)
+    Low_risk = df3.tail(n=10)
+    ix = High_risk.index
+    ix1 = Low_risk.index
+
+    df_y = pd.DataFrame(y_test)
+    df_y['OS.Status'] = df_y['OS.Status'].astype(int)
+    df1['OS.Status'] = df_y['OS.Status']
+    df1['OS.time'] = df_y['OS.time']
+    T_hr, E_hr = df1.loc[ix]['OS.time'], df1.loc[ix]['OS.Status']
+    T_lr, E_lr = df1.loc[ix1]['OS.time'], df1.loc[ix1]['OS.Status']
+
+    # Set up plots
+    k += 1
+    plt.subplot(3, 2, k)
+
+    # Fit survival curves
+    kmf_control = KaplanMeierFitter()
+    ax = kmf_control.fit(T_hr, E_hr, label='HR').plot_survival_function()
+
+    kmf_exp = KaplanMeierFitter()
+    ax = kmf_exp.fit(T_lr, E_lr, label='LR').plot_survival_function()
+
+    add_at_risk_counts(kmf_exp, kmf_control)
+    # Format graph
+    plt.ylim(0, 1)
+    ax.set_xlabel('Timeline (months)', fontsize='large')
+    ax.set_ylabel('Percentage of Population Alive', fontsize='large')
+
+    # Calculate p-value
+    res = logrank_test(T_hr, T_lr, event_observed_A=E_hr, event_observed_B=E_lr, alpha=.95)
+    print('\nModel', names[k-1])
+    res.print_summary()
+
+    # Place the p-value label at one tenth of the longest observed time
+    xloc = max(np.max(T_hr), np.max(T_lr)) / 10
+    ax.text(xloc, .2, r'$\mathrm{p}=%.3f$' % (res.p_value, ), fontsize=18)
+    plt.subplot(3, 2, k).set_title('{}'.format(names[k-1]))
+    plt.tight_layout()
+
+    #plt.savefig('D:\\Second-year\\Result\\New\\img_rna_meta_two_3.pdf', transparent=True)
+
+#%%
+
+import shap
+# Use model-agnostic SHAP to explain test-set predictions
+X100 = shap.utils.sample(X_test, 17)  # 17 instances as the background distribution
+# Compute SHAP values for the fitted Coxnet pipeline
+pipe_coxnet.fit(X_train, y_train)
+explainer = shap.Explainer(pipe_coxnet.predict, X100)
+shap_values = explainer(X_test)
+
+# Explain a single prediction: the waterfall plot shows how we get from
+# shap_values.base_values to model.predict(X)[sample_ind]
+sample_ind = 17
+#shap.plots.waterfall(shap_values[sample_ind], max_display=20)
+#plt.savefig('D:\\Second-year\\Result\\New\\shap.pdf', transparent=True)
+shap.plots.force(shap_values[0])
+#%%
+
+import shap
+# Use model-agnostic SHAP to explain test-set predictions
+X100 = shap.utils.sample(X_test, 20)  # 20 instances as the background distribution
+# Compute SHAP values for the fitted Coxnet pipeline
+pipe_coxnet.fit(X_train, y_train)
+explainer = shap.Explainer(pipe_coxnet.predict, X100)
+shap_values = explainer(X_test)
+
+# The waterfall plot shows how we get from shap_values.base_values to
+# model.predict(X)[sample_ind]; show=False keeps the figure open so it can
+# be saved before display
+sample_ind = 16
+shap.plots.waterfall(shap_values[sample_ind], max_display=30, show=False)
+plt.savefig('D:\\Second-year\\Result\\New\\shap.pdf', transparent=True)
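+
+# Global view (sketch): rank features by mean |SHAP value| across the whole
+# test set, complementing the single-patient waterfall above.
+shap.plots.beeswarm(shap_values, max_display=20)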