# -*- coding: utf-8 -*-
# Source: RuleCovering/RuleCoverDatasets.py (sibirbil/RuleCovering, master branch)

import pandas as pd

def banknote(wd): # Two classes
    """
    Load the banknote authentication data set.

    Reads ``wd + 'data_banknote_authentication.csv'`` (no header row) and
    names the columns ``X_0 ... X_{k-1}`` for the features and ``y`` for
    the last column.

    Args:
        wd: Directory prefix; it is concatenated directly with the file
            name, so it must end with a path separator.

    Returns:
        pandas.DataFrame with the renamed columns.

    Attribute Information:
    1. variance of Wavelet Transformed image (continuous)
    2. skewness of Wavelet Transformed image (continuous)
    3. curtosis of Wavelet Transformed image (continuous)
    4. entropy of image (continuous)
    5. class (integer)
    """
    data = pd.read_csv(wd + 'data_banknote_authentication.csv', header=None)
    n_features = data.shape[1] - 1
    feature_names = ['X_' + str(k) for k in range(n_features)]
    data.columns = feature_names + ['y']
    return data

def ILPD(wd): # Two classes
    """
    Load the Indian Liver Patient Dataset (ILPD).

    Reads ``wd + 'ILPD.csv'`` (no header row), names the columns
    ``X_0 ... X_{k-1}`` plus ``y``, encodes the second column as
    1 for ``'Female'`` and 0 otherwise, encodes the last column as
    1 where the stored value equals 2 and 0 otherwise, and finally drops
    every row that still contains a missing value.

    Args:
        wd: Directory prefix; it is concatenated directly with the file
            name, so it must end with a path separator.

    Returns:
        pandas.DataFrame with integer-encoded gender and label columns.
        The original row index is kept (it is not reset after dropping
        incomplete rows).

    Attribute Information:
    1. Age Age of the patient
    2. Gender Gender of the patient
    3. TB Total Bilirubin
    4. DB Direct Bilirubin
    5. Alkphos Alkaline Phosphatase
    6. Sgpt Alanine Aminotransferase
    7. Sgot Aspartate Aminotransferase
    8. TP Total Proteins
    9. ALB Albumin
    10. A/G Ratio Albumin and Globulin Ratio
    11. Selector field used to split the data into
        two sets (labeled by the experts)
    """
    df = pd.read_csv(wd + 'ILPD.csv', header=None)
    df.columns = ['X_' + str(i) for i in range(len(df.columns) - 1)] + ['y']

    # Replace the columns instead of writing through ``iloc``: a positional
    # in-place write into the text-valued gender column keeps its original
    # (non-numeric) dtype on recent pandas versions.
    gender_col = df.columns[1]
    df[gender_col] = (df[gender_col] == 'Female').astype(int)
    df['y'] = (df['y'] == 2).astype(int)

    df.dropna(inplace=True)
    return df

def ionosphere(wd): # Two classes
    """
    Load the ionosphere data set.

    Reads ``wd + 'ionosphere.csv'`` (no header row), names the columns
    ``X_0 ... X_{k-1}`` plus ``y``, and encodes the label as 1 where the
    stored value is ``'g'`` and 0 otherwise.

    Args:
        wd: Directory prefix; it is concatenated directly with the file
            name, so it must end with a path separator.

    Returns:
        pandas.DataFrame whose ``y`` column holds integer 0/1 labels.

    Attribute Information:

    -- All 34 are continuous
    -- The 35th attribute is either "good" or "bad"
       according to the definition summarized above.
       This is a binary classification task.
    """
    df = pd.read_csv(wd + 'ionosphere.csv', header=None)
    df.columns = ['X_' + str(i) for i in range(len(df.columns) - 1)] + ['y']

    # Replace the column instead of writing through ``iloc``: a positional
    # in-place write into the text-valued label column keeps its original
    # (non-numeric) dtype on recent pandas versions.
    df['y'] = (df['y'] == 'g').astype(int)
    return df

def transfusion(wd): # Two classes
    """
    Load the Blood Transfusion Service Center data set.

    Reads ``wd + 'transfusion.csv'``, treating the first line as a header
    row, and then overwrites the column names with ``X_0 ... X_{k-1}``
    for the features and ``y`` for the last column.

    Args:
        wd: Directory prefix; it is concatenated directly with the file
            name, so it must end with a path separator.

    Returns:
        pandas.DataFrame with the renamed columns.

    Attribute Information:
    Given is the variable name, variable type, the measurement unit and a
    brief description. The "Blood Transfusion Service Center" is a
    classification problem. The order of this listing corresponds to the
    order of numerals along the rows of the database.

    R (Recency - months since last donation),
    F (Frequency - total number of donations),
    M (Monetary - total blood donated in c.c.),
    T (Time - months since first donation), and
    a binary variable representing whether he/she donated blood in March 2007
    (1 stands for donating blood; 0 stands for not donating blood).
    """
    data = pd.read_csv(wd + 'transfusion.csv', header=0)
    n_features = data.shape[1] - 1
    new_names = ['X_' + str(k) for k in range(n_features)]
    new_names.append('y')
    data.columns = new_names
    return data

def liver(wd): # Two classes
    """
    Load the BUPA liver disorders data set.

    Reads ``wd + 'bupa.csv'`` (no header row), rewrites the last column
    as 1 where the stored value equals 2 and 0 otherwise, and names the
    columns ``X_0 ... X_{k-1}`` plus ``y``.

    Args:
        wd: Directory prefix; it is concatenated directly with the file
            name, so it must end with a path separator.

    Returns:
        pandas.DataFrame with the binary label in column ``y``.

    Attribute information:
    1. mcv       mean corpuscular volume
    2. alkphos   alkaline phosphatase
    3. sgpt      alanine aminotransferase
    4. sgot      aspartate aminotransferase
    5. gammagt   gamma-glutamyl transpeptidase
    6. drinks    number of half-pint equivalents of
                 alcoholic beverages drunk per day
    7. selector  field used to split data into two sets
    """
    frame = pd.read_csv(wd + 'bupa.csv', header=None)

    # Binarise the selector field in place (value 2 -> 1, anything else -> 0).
    selector = frame.iloc[:, -1]
    frame.iloc[:, -1] = (selector == 2) * 1

    n_features = frame.shape[1] - 1
    frame.columns = ['X_' + str(k) for k in range(n_features)] + ['y']
    return frame

def tictactoe(wd): # Two classes
    """
    Load the tic-tac-toe endgame data set with one-hot encoded squares.

    Reads ``wd + 'tictactoe.csv'`` (no header row), names the columns
    ``X_0 ... X_{k-1}`` plus ``y``, expands every feature column into
    dummy indicator columns (dropping the first level of each), and
    appends a label column that is 1 where the stored class is
    ``'positive'`` and 0 otherwise.

    Args:
        wd: Directory prefix; it is concatenated directly with the file
            name, so it must end with a path separator.

    Returns:
        pandas.DataFrame of dummy-encoded features followed by ``y``.

    Attribute Information:
    1. top-left-square: {x,o,b}
    2. top-middle-square: {x,o,b}
    3. top-right-square: {x,o,b}
    4. middle-left-square: {x,o,b}
    5. middle-middle-square: {x,o,b}
    6. middle-right-square: {x,o,b}
    7. bottom-left-square: {x,o,b}
    8. bottom-middle-square: {x,o,b}
    9. bottom-right-square: {x,o,b}
    10. Class: {positive,negative}
    """
    raw = pd.read_csv(wd + 'tictactoe.csv', header=None)
    n_features = raw.shape[1] - 1
    raw.columns = ['X_' + str(k) for k in range(n_features)] + ['y']

    # Encode only the board squares; the class column is handled separately.
    squares = raw.iloc[:, :-1]
    encoded = pd.get_dummies(squares, drop_first=True)

    is_positive = raw['y'] == 'positive'
    encoded['y'] = is_positive * 1
    return encoded

def wdbc(wd): # Two classes
    """
    Load the Wisconsin Diagnostic Breast Cancer (WDBC) data set.

    Reads ``wd + 'wdbc.csv'`` (no header row) using the first column as
    the index. The first remaining column is the diagnosis; it is turned
    into a label that is 1 where the stored value is ``'M'`` and 0
    otherwise, and moved to the end as ``y``. The other columns are
    named ``X_0 ... X_{k-1}``.

    Args:
        wd: Directory prefix; it is concatenated directly with the file
            name, so it must end with a path separator.

    Returns:
        pandas.DataFrame indexed by the first file column, with the
        features first and ``y`` last.

    Attribute Information:
    1) ID number
    2) Diagnosis (M = malignant, B = benign)
    3-32)
    Ten real-valued features are computed for each cell nucleus:
        a) radius (mean of distances from center to points on the perimeter)
        b) texture (standard deviation of gray-scale values)
        c) perimeter
        d) area
        e) smoothness (local variation in radius lengths)
        f) compactness (perimeter^2 / area - 1.0)
        g) concavity (severity of concave portions of the contour)
        h) concave points (number of concave portions of the contour)
        i) symmetry
        j) fractal dimension ("coastline approximation" - 1)
    """
    data = pd.read_csv(wd + 'wdbc.csv', header=None, index_col=0)
    n_features = data.shape[1] - 1
    data.columns = ['y'] + ['X_' + str(k) for k in range(n_features)]

    # Take the diagnosis out of the leading position and re-append it,
    # binarised, as the final column.
    diagnosis = data.pop('y')
    data['y'] = (diagnosis == 'M') * 1
    return data

def mammography(wd): # Two classes - Imbalanced
    """
    Load the mammography data set.

    Reads ``wd + 'mammography.csv'`` (no header row) and names the
    columns ``X_0 ... X_{k-1}`` for the features and ``y`` for the last
    column. The label values are returned exactly as stored in the file;
    no remapping is applied here.

    Args:
        wd: Directory prefix; it is concatenated directly with the file
            name, so it must end with a path separator.

    Returns:
        pandas.DataFrame with the renamed columns.

    Attribute Information:
    7. Class (-1 or 1)
    """
    # Relies on the module-level ``pd`` import, like the sibling loaders.
    df = pd.read_csv(wd + 'mammography.csv', header=None)
    df.columns = ['X_' + str(i) for i in range(len(df.columns) - 1)] + ['y']
    return df

def diabetes(wd): # Two classes - Imbalanced
    """
    Load the diabetes data set.

    Reads ``wd + 'diabetes.csv'`` (no header row) and names the columns
    ``X_0 ... X_{k-1}`` for the features and ``y`` for the last column.

    Args:
        wd: Directory prefix; it is concatenated directly with the file
            name, so it must end with a path separator.

    Returns:
        pandas.DataFrame with the renamed columns.

    Attribute Information:
    1. Number of times pregnant
    2. Plasma glucose concentration at 2 hours in an oral glucose tolerance test
    3. Diastolic blood pressure (mm Hg)
    4. Triceps skin fold thickness (mm)
    5. 2-Hour serum insulin (mu U/ml)
    6. Body mass index (weight in kg/(height in m)^2)
    7. Diabetes pedigree function
    8. Age (years)
    9. Class variable (0 or 1)
    """
    # Relies on the module-level ``pd`` import, like the sibling loaders.
    df = pd.read_csv(wd + 'diabetes.csv', header=None)
    df.columns = ['X_' + str(i) for i in range(len(df.columns) - 1)] + ['y']
    return df

def oilspill(wd): # Two classes - Imbalanced
    """
    Load the oil spill data set.

    Reads ``wd + 'oilspill.csv'`` (no header row), names the columns
    ``X_0 ... X_{k-1}`` plus ``y``, and then discards the first column.
    Because the names are assigned before the drop, the returned feature
    columns start at ``X_1``.

    Args:
        wd: Directory prefix; it is concatenated directly with the file
            name, so it must end with a path separator.

    Returns:
        pandas.DataFrame without the first file column, label in ``y``.

    Attribute Information:
    x. Class (0 or 1)
    """
    # Relies on the module-level ``pd`` import, like the sibling loaders.
    df = pd.read_csv(wd + 'oilspill.csv', header=None)
    df.columns = ['X_' + str(i) for i in range(len(df.columns) - 1)] + ['y']
    # Remove the leading column; the remaining names are left untouched.
    df = df.drop(df.columns[[0]], axis=1)
    return df

def phoneme(wd): # Two classes - Imbalanced
    """
    Load the phoneme data set.

    Reads ``wd + 'phoneme.csv'`` (no header row) and names the columns
    ``X_0 ... X_{k-1}`` for the features and ``y`` for the last column.

    Args:
        wd: Directory prefix; it is concatenated directly with the file
            name, so it must end with a path separator.

    Returns:
        pandas.DataFrame with the renamed columns.

    Attribute Information:
    Five different attributes were chosen to
    characterize each vowel: they are the amplitudes of the five first
    harmonics AHi, normalised by the total energy Ene (integrated on all the
    frequencies): AHi/Ene. Each harmonic is signed: positive when it
    corresponds to a local maximum of the spectrum and negative otherwise.
    6. Class (0 and 1)
    """
    # Relies on the module-level ``pd`` import, like the sibling loaders.
    df = pd.read_csv(wd + 'phoneme.csv', header=None)
    df.columns = ['X_' + str(i) for i in range(len(df.columns) - 1)] + ['y']
    return df

def seeds(wd): # Three classes
    """
    Load the wheat seeds data set.

    Reads the tab-separated file ``wd + 'seeds.csv'`` (no header row)
    with the Python parsing engine and names the columns
    ``X_0 ... X_{k-1}`` for the features and ``y`` for the last column.

    Args:
        wd: Directory prefix; it is concatenated directly with the file
            name, so it must end with a path separator.

    Returns:
        pandas.DataFrame with the renamed columns.

    Attribute Information:

    To construct the data, seven geometric parameters of wheat kernels were measured:
    1. area A,
    2. perimeter P,
    3. compactness C = 4*pi*A/P^2,
    4. length of kernel,
    5. width of kernel,
    6. asymmetry coefficient
    7. length of kernel groove.
    All of these parameters were real-valued continuous.
    """
    data = pd.read_csv(
        wd + 'seeds.csv',
        header=None,
        sep='\t',
        engine='python',
    )
    n_features = data.shape[1] - 1
    feature_names = ['X_' + str(k) for k in range(n_features)]
    data.columns = feature_names + ['y']
    return data

def wine(wd): # Three classes
    """
    Load the wine data set.

    Reads ``wd + 'wine.csv'`` (no header row). The first column is the
    class label; it is moved to the end as ``y`` with its values left
    unchanged, and the remaining columns are named ``X_0 ... X_{k-1}``.

    Args:
        wd: Directory prefix; it is concatenated directly with the file
            name, so it must end with a path separator.

    Returns:
        pandas.DataFrame with the features first and ``y`` last.

    The attributes are donated by Riccardo Leardi (riclea@anchem.unige.it)
         1) Alcohol
         2) Malic acid
         3) Ash
         4) Alcalinity of ash
         5) Magnesium
         6) Total phenols
         7) Flavanoids
         8) Nonflavanoid phenols
         9) Proanthocyanins
        10) Color intensity
        11) Hue
        12) OD280/OD315 of diluted wines
        13) Proline
    Number of Instances
        class 1 59
        class 2 71
        class 3 48
    """
    data = pd.read_csv(wd + 'wine.csv', header=None)
    n_features = data.shape[1] - 1
    data.columns = ['y'] + ['X_' + str(k) for k in range(n_features)]

    # Pull the label out of the leading position and append it at the end.
    labels = data.pop('y')
    data['y'] = labels
    return data

def glass(wd): # Six classes - Imbalanced
    """
    Load the glass identification data set.

    Reads ``wd + 'glass.csv'`` (no header row), names the columns
    ``X_0 ... X_{k-1}`` plus ``y``, and decrements every stored label by
    one before returning (so a stored class 1 becomes 0, and so on).

    Args:
        wd: Directory prefix; it is concatenated directly with the file
            name, so it must end with a path separator.

    Returns:
        pandas.DataFrame with the shifted label in column ``y``.

    Attribute Information:
    RI: refractive index
    Na: Sodium
    Mg: Magnesium
    Al: Aluminum
    Si: Silicon
    K: Potassium
    Ca: Calcium
    Ba: Barium
    Fe: Iron

    Class 1: building windows (float processed)
    Class 2: building windows (non-float processed)
    Class 3: vehicle windows (float processed)
    Class 4: containers
    Class 5: tableware
    Class 6: headlamps

    """
    data = pd.read_csv(wd + 'glass.csv', header=None)
    n_features = data.shape[1] - 1
    data.columns = ['X_' + str(k) for k in range(n_features)] + ['y']
    # Shift the class ids down by one.
    data['y'] = data['y'] - 1
    return data

def ecoli(wd): # Eight classes - Imbalanced
    """
    Load the E. coli protein localization data set.

    Reads ``wd + 'ecoli.csv'`` (no header row) and names the columns
    ``X_0 ... X_{k-1}`` for the features and ``y`` for the last column.

    Args:
        wd: Directory prefix; it is concatenated directly with the file
            name, so it must end with a path separator.

    Returns:
        pandas.DataFrame with the renamed columns.

    Attribute Information:
    0: McGeoch's method for signal sequence recognition
    1: von Heijne's method for signal sequence recognition
    2: von Heijne's Signal Peptidase II consensus sequence score
    3: Presence of charge on N-terminus of predicted lipoproteins
    4: Score of discriminant analysis of the amino acid content
       of outer membrane and periplasmic proteins.
    5: score of the ALOM membrane-spanning region prediction program
    6: score of ALOM program after excluding putative cleavable
       signal regions from the sequence.

    Eight Classes:
    0: cytoplasm
    1: inner membrane without signal sequence
    2: inner membrane lipoprotein
    3: inner membrane, cleavable signal sequence
    4: inner membrane, non cleavable signal sequence
    5: outer membrane
    6: outer membrane lipoprotein
    7: periplasm
    """
    data = pd.read_csv(wd + 'ecoli.csv', header=None)
    n_features = data.shape[1] - 1
    column_names = ['X_' + str(k) for k in range(n_features)]
    column_names.append('y')
    data.columns = column_names
    return data