-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patheda_count_feature.py
216 lines (162 loc) · 9.18 KB
/
eda_count_feature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
import matplotlib.pyplot as plt
from matplotlib.ticker import OldScalarFormatter
import seaborn as sns
import numpy as np
import pandas as pd
from scipy.stats import binom_test
#np.set_printoptions(threshold=np.nan)
#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)
# Load both splits; cells equal to -1 / -1.0 are parsed as missing (NaN).
train, test = (
    pd.read_csv(csv_path, na_values=[-1, -1.0])
    for csv_path in ("./train.csv", "./test.csv")
)
#bins part 1
# One count plot per binary ps_ind_* column, laid out on a 2x4 grid.
fig, axis = plt.subplots(2, 4)
ind_bin_features = (
    'ps_ind_06_bin',
    'ps_ind_07_bin',
    'ps_ind_08_bin',
    'ps_ind_09_bin',
    'ps_ind_10_bin',
    'ps_ind_11_bin',
    'ps_ind_12_bin',
    'ps_ind_13_bin',
)
# axis.flat walks the grid row by row, which matches the listing order above.
for panel, feature in zip(axis.flat, ind_bin_features):
    sns.countplot(x=feature, data=train, ax=panel)
# Figure margins and spacing between the panels.
plt.subplots_adjust(
    top=0.92,
    bottom=0.16,
    left=0.10,
    right=0.95,
    hspace=0.4,
    wspace=0.7,
)
plt.show()
#bins part 2
# Remaining binary columns (ps_ind_16-18 and ps_calc_15-20) on a 2x5 grid.
fig, axis = plt.subplots(2, 5)
# (row, column) -> feature. ps_calc_20_bin sits at the end of the first row,
# and the grid cell [1, 4] is intentionally left without a plot.
panel_layout = [
    ((0, 0), 'ps_ind_16_bin'),
    ((0, 1), 'ps_ind_17_bin'),
    ((0, 2), 'ps_ind_18_bin'),
    ((0, 3), 'ps_calc_15_bin'),
    ((1, 0), 'ps_calc_16_bin'),
    ((1, 1), 'ps_calc_17_bin'),
    ((1, 2), 'ps_calc_18_bin'),
    ((1, 3), 'ps_calc_19_bin'),
    ((0, 4), 'ps_calc_20_bin'),
]
for position, feature in panel_layout:
    sns.countplot(x=feature, data=train, ax=axis[position])
# Figure margins and spacing between the panels.
plt.subplots_adjust(
    top=0.92,
    bottom=0.16,
    left=0.10,
    right=0.95,
    hspace=0.4,
    wspace=0.9,
)
plt.show()
#categorical part 1
# Count plots for the first six categorical columns on a 3x2 grid.
fig, axis = plt.subplots(3, 2)
# Replace every NaN with the literal label "NA" so missing entries are counted
# as their own category. Later sections of the script reuse this frame.
temp_train = train.fillna("NA")
categorical_features = (
    'ps_ind_02_cat',
    'ps_ind_04_cat',
    'ps_ind_05_cat',
    'ps_car_01_cat',
    'ps_car_02_cat',
    'ps_car_03_cat',
)
# axis.flat walks the grid row by row, which matches the listing order above.
for panel, feature in zip(axis.flat, categorical_features):
    # A column that had missing values now mixes numbers with the string "NA";
    # sorting such values directly raises TypeError on Python 3. The key sorts
    # numbers first (ascending) and puts string labels after them.
    levels = sorted(
        temp_train[feature].value_counts().index,
        key=lambda level: (isinstance(level, str), level),
    )
    sns.countplot(x=feature, data=temp_train, ax=panel, order=levels)
# Figure margins and spacing between the panels.
plt.subplots_adjust(top=0.92, bottom=0.16, left=0.10, right=0.95, hspace=0.65,
                    wspace=0.7)
plt.show()
#categorical part 2
# Count plots for six more categorical ps_car_* columns on a 3x2 grid, drawn
# from temp_train (the frame whose NaN values were replaced by "NA").
fig, axis = plt.subplots(3, 2)
categorical_features = (
    'ps_car_07_cat',
    'ps_car_08_cat',
    'ps_car_09_cat',
    'ps_car_04_cat',
    'ps_car_05_cat',
    'ps_car_06_cat',
)
# axis.flat walks the grid row by row, which matches the listing order above.
for panel, feature in zip(axis.flat, categorical_features):
    # A column that had missing values mixes numbers with the string "NA";
    # sorting such values directly raises TypeError on Python 3. The key sorts
    # numbers first (ascending) and puts string labels after them.
    levels = sorted(
        temp_train[feature].value_counts().index,
        key=lambda level: (isinstance(level, str), level),
    )
    sns.countplot(x=feature, data=temp_train, ax=panel, order=levels)
# Figure margins and spacing between the panels.
plt.subplots_adjust(top=0.92, bottom=0.16, left=0.10, right=0.95, hspace=0.65,
                    wspace=0.7)
plt.show()
#categorical part 3
# Count plots for the last two categorical columns, stacked vertically and
# drawn from temp_train (the frame whose NaN values were replaced by "NA").
fig, axis = plt.subplots(2, 1)
for panel in axis:
    # NOTE(review): OldScalarFormatter is deprecated/removed in recent
    # Matplotlib releases - confirm the version this script is run against.
    panel.yaxis.set_major_formatter(OldScalarFormatter())
for panel, feature in zip(axis, ('ps_car_10_cat', 'ps_car_11_cat')):
    # A column that had missing values mixes numbers with the string "NA";
    # sorting such values directly raises TypeError on Python 3. The key sorts
    # numbers first (ascending) and puts string labels after them.
    levels = sorted(
        temp_train[feature].value_counts().index,
        key=lambda level: (isinstance(level, str), level),
    )
    sns.countplot(x=feature, data=temp_train, ax=panel, order=levels)
# Figure margins and spacing between the panels.
plt.subplots_adjust(top=0.92, bottom=0.16, left=0.10, right=0.95, hspace=0.65,
                    wspace=0.7)
plt.show()
#integer part 1
# Count plots for five integer-valued columns on a 3x2 grid (the last cell is
# left without a plot), drawn from temp_train (NaN replaced by "NA").
fig, axis = plt.subplots(3, 2)
integer_features = (
    'ps_ind_01',
    'ps_ind_03',
    'ps_ind_14',
    'ps_ind_15',
    'ps_car_11',
)
# axis.flat walks the grid row by row; zip stops after the five listed columns.
for panel, feature in zip(axis.flat, integer_features):
    # A column that had missing values mixes numbers with the string "NA";
    # sorting such values directly raises TypeError on Python 3. The key sorts
    # numbers first (ascending) and puts string labels after them.
    levels = sorted(
        temp_train[feature].value_counts().index,
        key=lambda level: (isinstance(level, str), level),
    )
    sns.countplot(x=feature, data=temp_train, ax=panel, order=levels)
# Figure margins and spacing between the panels.
plt.subplots_adjust(top=0.92, bottom=0.16, left=0.10, right=0.95, hspace=0.65,
                    wspace=0.7)
plt.show()
#integer part 2
# Count plots for ps_calc_04 .. ps_calc_14 on a 3x4 grid (the last cell is
# left without a plot), drawn from temp_train (NaN replaced by "NA").
fig, axis = plt.subplots(3, 4)
calc_features = (
    'ps_calc_04',
    'ps_calc_05',
    'ps_calc_06',
    'ps_calc_07',
    'ps_calc_08',
    'ps_calc_09',
    'ps_calc_10',
    'ps_calc_11',
    'ps_calc_12',
    'ps_calc_13',
    'ps_calc_14',
)
# The bottom-row panels are drawn in a single colour instead of the default
# per-bar palette.
single_colour_features = {'ps_calc_12', 'ps_calc_13', 'ps_calc_14'}
# axis.flat walks the grid row by row; zip stops after the eleven listed columns.
for panel, feature in zip(axis.flat, calc_features):
    # A column that had missing values mixes numbers with the string "NA";
    # sorting such values directly raises TypeError on Python 3. The key sorts
    # numbers first (ascending) and puts string labels after them.
    levels = sorted(
        temp_train[feature].value_counts().index,
        key=lambda level: (isinstance(level, str), level),
    )
    extra_style = {"color": "blue"} if feature in single_colour_features else {}
    sns.countplot(x=feature, data=temp_train, ax=panel, order=levels,
                  **extra_style)
# Figure margins and spacing between the panels.
plt.subplots_adjust(top=0.92, bottom=0.16, left=0.10, right=0.95, hspace=0.65,
                    wspace=0.7)
plt.show()
#float features part 1
# Histograms of six continuous columns on a 3x2 grid.
fig, axis = plt.subplots(3, 2)
# Rebind temp_train to the raw frame (NaN kept, no "NA" labels); missing
# values are dropped column by column just before plotting.
temp_train = train
float_features = (
    'ps_reg_01',
    'ps_reg_02',
    'ps_reg_03',
    'ps_calc_01',
    'ps_calc_02',
    'ps_calc_03',
)
# axis.flat walks the grid row by row, which matches the listing order above.
for panel, feature in zip(axis.flat, float_features):
    observed = temp_train[feature].dropna()
    panel.hist(observed)
    panel.set_xlabel(feature)
# Figure margins and spacing between the panels.
plt.subplots_adjust(
    top=0.92,
    bottom=0.16,
    left=0.10,
    right=0.95,
    hspace=0.65,
    wspace=0.7,
)
plt.show()
#float features part 2
# Histograms of the four continuous ps_car_* columns on a 2x2 grid. Missing
# values are dropped column by column just before plotting.
fig, axis = plt.subplots(2, 2)
car_float_features = ('ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15')
# axis.flat walks the grid row by row, which matches the listing order above.
for panel, feature in zip(axis.flat, car_float_features):
    observed = temp_train[feature].dropna()
    panel.hist(observed)
    panel.set_xlabel(feature)
# Figure margins and spacing between the panels.
plt.subplots_adjust(
    top=0.92,
    bottom=0.16,
    left=0.10,
    right=0.95,
    hspace=0.65,
    wspace=0.7,
)
plt.show()
#target values
# Single-panel count plot of the 'target' column in the training frame.
fig, axis = plt.subplots(nrows=1, ncols=1)
sns.countplot(data=train, x='target', ax=axis)
plt.show()