-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path03_Descriptive.py
118 lines (83 loc) · 3.36 KB
/
03_Descriptive.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# -*- coding: utf-8 -*-
"""
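Descriptive analysis script.

Computes the column-wise mean, standard deviation and coefficient of
variation of ``X``, counts missing values per 'Cancer Type' in
``dati_risposta``, and saves frequency and box plots under ``Presentazione/``.

Created on Sun Mar 19 11:20:17 2017
@author: Antonio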
"""
# Run the shared helper script and the data/module import script inside this
# module's global namespace, so every name they define is visible below.
# NOTE(review): the rest of this script uses names it never defines itself
# (X, dati_risposta, pd, np, sns, plt, stats, savefig) - presumably they are
# created by these two scripts; confirm.
# NOTE(review): exec() runs the file contents verbatim; keep these paths
# pointing at trusted project files only.
for _script_path in ("Utils.py", "01_Importazione_dati_e_moduli.py"):
    # Read through a context manager so the file handle is closed right away
    # instead of being left open until garbage collection.
    with open(_script_path) as _script_file:
        _script_source = _script_file.read()
    exec(_script_source, globals())
###############################################
############# Column descriptives #############
###############################################
###############################################
#################### MEANS ####################
###############################################
# np.mean applied through X.apply (per column, assuming X is a DataFrame).
means = X.apply(np.mean)
# Summary statistics of those means, requesting the 0 and 1 percentiles.
# The one-column DataFrame wrapper is only needed for the describe() call;
# `means` itself stays a Series.
summary_means = pd.DataFrame(means).describe(percentiles=[0, 1])
# Kernel density estimate of the means.
# NOTE(review): `bw` is deprecated in recent seaborn releases in favour of
# `bw_method` / `bw_adjust`; left unchanged to preserve behaviour.
sns.kdeplot(np.array(means), bw=0.5)
# Bare expression: echoes the means in an interactive session only.
means
###############################################
############# Standard deviations #############
###############################################
# np.std applied through X.apply (per column, assuming X is a DataFrame).
std = X.apply(np.std)
# Summary statistics of those standard deviations, requesting the 0 and 1
# percentiles. The one-column DataFrame wrapper is only needed for
# describe(); `std` itself stays a Series.
summary_std = pd.DataFrame(std).describe(percentiles=[0, 1])
# Kernel density estimate of the standard deviations.
# NOTE(review): `bw` is deprecated in recent seaborn releases in favour of
# `bw_method` / `bw_adjust`; left unchanged to preserve behaviour.
sns.kdeplot(np.array(std), bw=0.5)
###############################################
########## Coefficients of variation ##########
###############################################
# stats.variation applied through X.apply (per column, assuming X is a
# DataFrame and `stats` is scipy.stats - TODO confirm against the import
# script).
CV = X.apply(stats.variation)
# Summary statistics of those coefficients, requesting the 0 and 1
# percentiles. The one-column DataFrame wrapper is only needed for
# describe(); `CV` itself stays a Series.
summary_cv = pd.DataFrame(CV).describe(percentiles=[0, 1])
# Kernel density estimate of the coefficients of variation.
# NOTE(review): `bw` is deprecated in recent seaborn releases in favour of
# `bw_method` / `bw_adjust`; left unchanged to preserve behaviour.
sns.kdeplot(np.array(CV), bw=0.5)
###############################################
########### Missing-value analysis ############
###############################################
# Distinct values found in the column at position 2 of dati_risposta.
# The plots further down select position 2 and group by 'Cancer Type', so
# that is presumably the same column that is filtered on below.
# `.iloc` replaces the removed `.ix` indexer; all lookups here are positional.
tipo = list(set(dati_risposta.iloc[:, 2]))
# Bare expression: echoes the list in an interactive session only.
tipo

# One record per cancer type: [type, row count, null count of the columns at
# positions 4, 5, 6 and 7].
lista_null = []
for i in tipo:
    dati_correnti = dati_risposta[dati_risposta['Cancer Type'] == i]
    current_n_rows = dati_correnti.shape[0]
    null_counts = [dati_correnti.iloc[:, position].isnull().sum()
                   for position in (4, 5, 6, 7)]
    lista_null.append([i, current_n_rows] + null_counts)

# Passing the names to from_records keeps the header even when no record
# was collected (assigning .columns afterwards fails on an empty frame).
dataframe_null_type = pd.DataFrame.from_records(
    lista_null,
    columns=['Cancer_type', 'total_n_row',
             'null_BMS_IC_50', 'null_BMS_AUC',
             'null_Z_IC_50', 'null_Z_AUC'],
)
# Column-wise totals across all cancer types (np.sum is applied to every
# column, including the 'Cancer_type' label column).
sum_missing = dataframe_null_type.apply(np.sum)
# Overall extremes of X: reduce once with X.min() / X.max(), then reduce
# that result again to a single value.
column_minima = X.min()
column_maxima = X.max()
MIN = column_minima.min()
MAX = column_maxima.max()
print(MIN, MAX)
# Frequency plot of the 'Cancer Type' column, saved to disk and then shown.
sns.countplot(data=dati_risposta, x="Cancer Type", palette="Greens_d")
plt.title("\n Distribuzione variabile Cancer Type")
# Rotate the tick labels and enlarge the bottom margin to make room for them.
plt.xticks(rotation=60)
plt.subplots_adjust(bottom=0.25)
# Save before show(), as in the original statement order.
savefig("Presentazione/frequenze.png")
plt.show()
import matplotlib.pyplot as plt
# One box plot per column at positions 4-7 of dati_risposta, grouped by
# 'Cancer Type'. Position 2 is selected alongside each of them so that
# `by='Cancer Type'` can find its grouping column.
# `.iloc` replaces the removed `.ix` indexer; the selection is positional.
# Each entry: (column position, output file, bottom margin, extra savefig
# keyword arguments). The first figure keeps its own margin and dpi.
_boxplot_specs = [
    (4, "Presentazione/boxplot1.png", 0.15, {"dpi": 900}),
    (5, "Presentazione/boxplot2.png", 0.25, {}),
    (6, "Presentazione/boxplot3.png", 0.25, {}),
    (7, "Presentazione/boxplot4.png", 0.25, {}),
]
for _position, _png_path, _bottom_margin, _savefig_kwargs in _boxplot_specs:
    dati_risposta.iloc[:, [2, _position]].boxplot(by='Cancer Type')
    # Leave room for the rotated tick labels.
    plt.subplots_adjust(bottom=_bottom_margin)
    plt.xticks(rotation=60)
    # Save before show(), as in the original statement order.
    savefig(_png_path, **_savefig_kwargs)
    plt.show()