-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathRuleCoverDatasets.py
More file actions
286 lines (264 loc) · 9.56 KB
/
Copy pathRuleCoverDatasets.py
File metadata and controls
286 lines (264 loc) · 9.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
# -*- coding: utf-8 -*-
import pandas as pd
def banknote(wd):  # Two classes
    """Load the banknote authentication data set.

    Attribute information (as listed in the data set description):
        1. variance of Wavelet Transformed image (continuous)
        2. skewness of Wavelet Transformed image (continuous)
        3. kurtosis of Wavelet Transformed image (continuous)
        4. entropy of image (continuous)
        5. class (integer)

    Parameters
    ----------
    wd : str
        Directory prefix. The file name is appended by plain string
        concatenation, so the prefix must already end with a path separator.

    Returns
    -------
    pandas.DataFrame
        The headerless CSV with every column but the last renamed
        ``X_0, X_1, ...`` and the last column renamed ``y``.
    """
    table = pd.read_csv(wd + 'data_banknote_authentication.csv', header=None)
    feature_count = table.shape[1] - 1
    table.columns = ['X_%d' % idx for idx in range(feature_count)] + ['y']
    return table
def ILPD(wd):  # Two classes
    """Load the Indian Liver Patient data set (ILPD) with numeric encodings.

    Attribute information (as listed in the data set description):
        1. Age        Age of the patient
        2. Gender     Gender of the patient
        3. TB         Total Bilirubin
        4. DB         Direct Bilirubin
        5. Alkphos    Alkaline Phosphatase
        6. Sgpt       Alanine Aminotransferase
        7. Sgot       Aspartate Aminotransferase
        8. TP         Total Proteins
        9. ALB        Albumin
        10. A/G Ratio Albumin and Globulin Ratio
        11. Selector  field used to split the data into two sets
                      (labeled by the experts)

    Parameters
    ----------
    wd : str
        Directory prefix. The file name is appended by plain string
        concatenation, so the prefix must already end with a path separator.

    Returns
    -------
    pandas.DataFrame
        Columns ``X_0, X_1, ...`` followed by ``y``. The second column is
        encoded as 1 for ``'Female'`` and 0 otherwise; ``y`` is 1 where the
        selector equals 2 and 0 otherwise. Rows containing any missing value
        are dropped, and the original row index is kept (not reset).
    """
    df = pd.read_csv(wd + 'ILPD.csv', header=None)
    gender_col = df.columns[1]
    selector_col = df.columns[-1]
    # Replace whole columns by label instead of writing through ``iloc``:
    # an ``iloc`` write is performed in place on recent pandas, which leaves
    # the string gender column with a non-numeric dtype (or rejects the
    # integer values outright). Label assignment swaps in a real integer
    # column on every pandas version.
    df[gender_col] = (df[gender_col] == 'Female').astype('int64')
    df[selector_col] = (df[selector_col] == 2).astype('int64')
    df.columns = ['X_' + str(i) for i in range(len(df.columns) - 1)] + ['y']
    # Missing values are removed only after encoding, as before.
    df.dropna(inplace=True)
    return df
def ionosphere(wd):  # Two classes
    """Load the ionosphere data set with a numeric 0/1 label.

    Attribute information (as listed in the data set description):
        -- All 34 attributes are continuous.
        -- The 35th attribute is either "good" or "bad" according to the
           definition given in the data set description.
        This is a binary classification task.

    Parameters
    ----------
    wd : str
        Directory prefix. The file name is appended by plain string
        concatenation, so the prefix must already end with a path separator.

    Returns
    -------
    pandas.DataFrame
        Columns ``X_0, X_1, ...`` followed by ``y``, where ``y`` is 1 for
        rows whose last field is ``'g'`` and 0 otherwise.
    """
    df = pd.read_csv(wd + 'ionosphere.csv', header=None)
    label_col = df.columns[-1]
    # Replace the label column by name instead of writing through ``iloc``:
    # an ``iloc`` write is performed in place on recent pandas, so the
    # string column would keep a non-numeric dtype (or reject the integers).
    # Label assignment yields a genuine integer ``y`` on every version.
    df[label_col] = (df[label_col] == 'g').astype('int64')
    df.columns = ['X_' + str(i) for i in range(len(df.columns) - 1)] + ['y']
    return df
def transfusion(wd):  # Two classes
    """Load the Blood Transfusion Service Center data set.

    Attribute information (as listed in the data set description; the order
    matches the order of the values in each row of the file):
        R (Recency   - months since last donation),
        F (Frequency - total number of donations),
        M (Monetary  - total blood donated in c.c.),
        T (Time      - months since first donation), and
        a binary variable representing whether he/she donated blood in
        March 2007 (1 stands for donating blood; 0 stands for not donating).

    Parameters
    ----------
    wd : str
        Directory prefix. The file name is appended by plain string
        concatenation, so the prefix must already end with a path separator.

    Returns
    -------
    pandas.DataFrame
        The CSV contents (its first row is consumed as a header and then
        discarded) with columns renamed ``X_0, X_1, ...`` and the last
        column renamed ``y``.
    """
    donors = pd.read_csv(wd + 'transfusion.csv', header=0)
    new_names = ['X_{}'.format(pos) for pos in range(donors.shape[1] - 1)]
    new_names.append('y')
    donors.columns = new_names
    return donors
def liver(wd):  # Two classes
    """Load the BUPA liver disorders data set with a 0/1 label.

    Attribute information (as listed in the data set description):
        1. mcv      mean corpuscular volume
        2. alkphos  alkaline phosphatase
        3. sgpt     alanine aminotransferase
        4. sgot     aspartate aminotransferase
        5. gammagt  gamma-glutamyl transpeptidase
        6. drinks   number of half-pint equivalents of alcoholic beverages
                    drunk per day
        7. selector field used to split data into two sets

    Parameters
    ----------
    wd : str
        Directory prefix. The file name (``bupa.csv``) is appended by plain
        string concatenation, so the prefix must end with a path separator.

    Returns
    -------
    pandas.DataFrame
        Columns ``X_0, X_1, ...`` followed by ``y``, where ``y`` is 1 for
        rows whose selector equals 2 and 0 otherwise.
    """
    bupa = pd.read_csv(wd + 'bupa.csv', header=None)
    width = len(bupa.columns)
    # Binarise the selector (last column) before the columns are renamed.
    in_second_set = bupa.iloc[:, -1] == 2
    bupa.iloc[:, -1] = in_second_set * 1
    bupa.columns = ['X_{}'.format(pos) for pos in range(width - 1)] + ['y']
    return bupa
def tictactoe(wd):  # Two classes
    """Load the tic-tac-toe endgame data set, one-hot encoding the squares.

    Attribute information (as listed in the data set description):
        1. top-left-square:      {x,o,b}
        2. top-middle-square:    {x,o,b}
        3. top-right-square:     {x,o,b}
        4. middle-left-square:   {x,o,b}
        5. middle-middle-square: {x,o,b}
        6. middle-right-square:  {x,o,b}
        7. bottom-left-square:   {x,o,b}
        8. bottom-middle-square: {x,o,b}
        9. bottom-right-square:  {x,o,b}
        10. Class: {positive,negative}

    Parameters
    ----------
    wd : str
        Directory prefix. The file name is appended by plain string
        concatenation, so the prefix must already end with a path separator.

    Returns
    -------
    pandas.DataFrame
        Dummy-encoded feature columns (first level of each feature dropped)
        followed by ``y``, which is 1 for ``'positive'`` rows and 0 otherwise.
    """
    board = pd.read_csv(wd + 'tictactoe.csv', header=None)
    square_names = ['X_%d' % pos for pos in range(board.shape[1] - 1)]
    board.columns = square_names + ['y']
    # Encode every column except the trailing class column.
    squares = board.iloc[:, :-1]
    encoded = pd.get_dummies(squares, drop_first=True)
    is_win = board['y'] == 'positive'
    encoded['y'] = is_win * 1
    return encoded
def wdbc(wd):  # Two classes
    """Load the Wisconsin Diagnostic Breast Cancer data set.

    File layout (as listed in the data set description):
        1) ID number
        2) Diagnosis (M = malignant, B = benign)
        3-32) Ten real-valued features are computed for each cell nucleus:
            a) radius (mean of distances from center to points on the
               perimeter)
            b) texture (standard deviation of gray-scale values)
            c) perimeter
            d) area
            e) smoothness (local variation in radius lengths)
            f) compactness (perimeter^2 / area - 1.0)
            g) concavity (severity of concave portions of the contour)
            h) concave points (number of concave portions of the contour)
            i) symmetry
            j) fractal dimension ("coastline approximation" - 1)

    Parameters
    ----------
    wd : str
        Directory prefix. The file name is appended by plain string
        concatenation, so the prefix must already end with a path separator.

    Returns
    -------
    pandas.DataFrame
        Indexed by the first file column, with features ``X_0, X_1, ...``
        and a trailing ``y`` that is 1 for diagnosis ``'M'`` and 0 otherwise.
    """
    cells = pd.read_csv(wd + 'wdbc.csv', header=None, index_col=0)
    feature_names = ['X_{}'.format(pos) for pos in range(cells.shape[1] - 1)]
    cells.columns = ['y'] + feature_names
    # Pull the diagnosis out of the leading position and re-append it, encoded,
    # so the label ends up as the last column.
    diagnosis = cells.pop('y')
    cells['y'] = (diagnosis == 'M') * 1
    return cells
def mammography(wd):  # Two classes - Imbalanced
    """Load the mammography data set.

    Attribute information (as listed in the data set description):
        7. Class (-1 or 1)

    The class values are returned exactly as stored in the file; no
    re-encoding is applied here.

    Parameters
    ----------
    wd : str
        Directory prefix. The file name is appended by plain string
        concatenation, so the prefix must already end with a path separator.

    Returns
    -------
    pandas.DataFrame
        The headerless CSV with columns renamed ``X_0, X_1, ...`` and the
        last column renamed ``y``.
    """
    # Uses the module-level ``pd`` import, like the other loaders in this
    # file; the former function-local re-import was redundant.
    df = pd.read_csv(wd + 'mammography.csv', header=None)
    df.columns = ['X_' + str(i) for i in range(len(df.columns) - 1)] + ['y']
    return df
def diabetes(wd):  # Two classes - Imbalanced
    """Load the diabetes data set.

    Attribute information (as listed in the data set description):
        1. Number of times pregnant
        2. Plasma glucose concentration at 2 hours in an oral glucose
           tolerance test
        3. Diastolic blood pressure (mm Hg)
        4. Triceps skin fold thickness (mm)
        5. 2-Hour serum insulin (mu U/ml)
        6. Body mass index (weight in kg/(height in m)^2)
        7. Diabetes pedigree function
        8. Age (years)
        9. Class variable (0 or 1)

    Parameters
    ----------
    wd : str
        Directory prefix. The file name is appended by plain string
        concatenation, so the prefix must already end with a path separator.

    Returns
    -------
    pandas.DataFrame
        The headerless CSV with columns renamed ``X_0, X_1, ...`` and the
        last column renamed ``y``.
    """
    # Uses the module-level ``pd`` import, like the other loaders in this
    # file; the former function-local re-import was redundant.
    df = pd.read_csv(wd + 'diabetes.csv', header=None)
    df.columns = ['X_' + str(i) for i in range(len(df.columns) - 1)] + ['y']
    return df
def oilspill(wd):  # Two classes - Imbalanced
    """Load the oil spill data set, discarding its first column.

    Attribute information (as listed in the data set description):
        x. Class (0 or 1)  -- stored in the last column

    Parameters
    ----------
    wd : str
        Directory prefix. The file name is appended by plain string
        concatenation, so the prefix must already end with a path separator.

    Returns
    -------
    pandas.DataFrame
        The headerless CSV without its first column. Columns are named
        before that column is removed, so the remaining features are
        ``X_1, X_2, ...`` (there is no ``X_0``), followed by ``y``.
    """
    # Uses the module-level ``pd`` import, like the other loaders in this
    # file; the former function-local re-import was redundant.
    df = pd.read_csv(wd + 'oilspill.csv', header=None)
    df.columns = ['X_' + str(i) for i in range(len(df.columns) - 1)] + ['y']
    # Drop the leading column by position; naming happened first on purpose
    # so existing column names seen by callers stay the same.
    df = df.drop(df.columns[[0]], axis=1)
    return df
def phoneme(wd):  # Two classes - Imbalanced
    """Load the phoneme data set.

    Attribute information (as listed in the data set description):
        Five different attributes were chosen to characterize each vowel:
        they are the amplitudes of the five first harmonics AHi, normalised
        by the total energy Ene (integrated on all the frequencies):
        AHi/Ene. Each harmonic is signed: positive when it corresponds to a
        local maximum of the spectrum and negative otherwise.
        6. Class (0 and 1)

    Parameters
    ----------
    wd : str
        Directory prefix. The file name is appended by plain string
        concatenation, so the prefix must already end with a path separator.

    Returns
    -------
    pandas.DataFrame
        The headerless CSV with columns renamed ``X_0, X_1, ...`` and the
        last column renamed ``y``.
    """
    # Uses the module-level ``pd`` import, like the other loaders in this
    # file; the former function-local re-import was redundant.
    df = pd.read_csv(wd + 'phoneme.csv', header=None)
    df.columns = ['X_' + str(i) for i in range(len(df.columns) - 1)] + ['y']
    return df
def seeds(wd):  # Three classes
    """Load the wheat seeds data set from a tab-separated file.

    Attribute information (as listed in the data set description):
        To construct the data, seven geometric parameters of wheat kernels
        were measured:
        1. area A,
        2. perimeter P,
        3. compactness C = 4*pi*A/P^2,
        4. length of kernel,
        5. width of kernel,
        6. asymmetry coefficient,
        7. length of kernel groove.
        All of these parameters are real-valued and continuous.

    Parameters
    ----------
    wd : str
        Directory prefix. The file name is appended by plain string
        concatenation, so the prefix must already end with a path separator.

    Returns
    -------
    pandas.DataFrame
        The headerless file with columns renamed ``X_0, X_1, ...`` and the
        last column renamed ``y``; values are returned as read.
    """
    kernels = pd.read_csv(
        wd + 'seeds.csv',
        header=None,
        sep='\t',
        engine='python',
    )
    column_names = ['X_%d' % pos for pos in range(kernels.shape[1] - 1)]
    column_names.append('y')
    kernels.columns = column_names
    return kernels
def wine(wd):  # Three classes
    """Load the wine data set, moving the class from the first column to last.

    The attributes were donated by Riccardo Leardi (riclea@anchem.unige.it):
        1) Alcohol
        2) Malic acid
        3) Ash
        4) Alcalinity of ash
        5) Magnesium
        6) Total phenols
        7) Flavanoids
        8) Nonflavanoid phenols
        9) Proanthocyanins
        10) Color intensity
        11) Hue
        12) OD280/OD315 of diluted wines
        13) Proline

    Number of instances (from the data set description):
        class 1: 59
        class 2: 71
        class 3: 48

    Parameters
    ----------
    wd : str
        Directory prefix. The file name is appended by plain string
        concatenation, so the prefix must already end with a path separator.

    Returns
    -------
    pandas.DataFrame
        Features ``X_0, X_1, ...`` followed by ``y``, which holds the values
        of the file's first column unchanged.
    """
    wines = pd.read_csv(wd + 'wine.csv', header=None)
    feature_names = ['X_' + str(pos) for pos in range(wines.shape[1] - 1)]
    # The class label is stored first in the file.
    wines.columns = ['y'] + feature_names
    # Remove the label and re-insert it so it becomes the trailing column.
    wines['y'] = wines.pop('y')
    return wines
def glass(wd):  # Six classes - Imbalanced
    """Load the glass identification data set with labels shifted down by one.

    Attribute information (as listed in the data set description):
        RI: refractive index
        Na: Sodium
        Mg: Magnesium
        Al: Aluminum
        Si: Silicon
        K:  Potassium
        Ca: Calcium
        Ba: Barium
        Fe: Iron

    Classes (as numbered in the data set description):
        Class 1: building windows (float processed)
        Class 2: building windows (non-float processed)
        Class 3: vehicle windows (float processed)
        Class 4: containers
        Class 5: tableware
        Class 6: headlamps

    Parameters
    ----------
    wd : str
        Directory prefix. The file name is appended by plain string
        concatenation, so the prefix must already end with a path separator.

    Returns
    -------
    pandas.DataFrame
        Features ``X_0, X_1, ...`` followed by ``y``, where ``y`` is the
        file's last column minus 1.
    """
    samples = pd.read_csv(wd + 'glass.csv', header=None)
    names = ['X_{}'.format(pos) for pos in range(len(samples.columns) - 1)]
    samples.columns = names + ['y']
    # Shift every stored class label down by one.
    samples['y'] = samples['y'] - 1
    return samples
def ecoli(wd):  # Eight classes - Imbalanced
    """Load the E. coli protein localization data set.

    Attribute information (as listed in the data set description):
        0: McGeoch's method for signal sequence recognition
        1: von Heijne's method for signal sequence recognition
        2: von Heijne's Signal Peptidase II consensus sequence score
        3: Presence of charge on N-terminus of predicted lipoproteins
        4: Score of discriminant analysis of the amino acid content of
           outer membrane and periplasmic proteins
        5: Score of the ALOM membrane-spanning region prediction program
        6: Score of ALOM program after excluding putative cleavable signal
           regions from the sequence

    Eight classes:
        0: cytoplasm
        1: inner membrane without signal sequence
        2: inner membrane lipoprotein
        3: inner membrane, cleavable signal sequence
        4: inner membrane, non cleavable signal sequence
        5: outer membrane
        6: outer membrane lipoprotein
        7: periplasm

    Parameters
    ----------
    wd : str
        Directory prefix. The file name is appended by plain string
        concatenation, so the prefix must already end with a path separator.

    Returns
    -------
    pandas.DataFrame
        The headerless CSV with columns renamed ``X_0, X_1, ...`` and the
        last column renamed ``y``; values are returned as read.
    """
    proteins = pd.read_csv(wd + 'ecoli.csv', header=None)
    last = proteins.shape[1] - 1
    renamed = []
    for pos in range(last):
        renamed.append('X_' + str(pos))
    renamed.append('y')
    proteins.columns = renamed
    return proteins