-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinitData.py
70 lines (53 loc) · 1.51 KB
/
initData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# -*- coding: utf-8 -*-
import numpy as np
import xlrd
import pandas as pd
import math
from matplotlib.pyplot import figure, axes, legend, subplot, plot, hist, title, imshow, yticks, cm, xlabel, ylabel, show, grid, boxplot
from scipy.linalg import svd
from scipy.io import loadmat
import sklearn.linear_model as lm
from sklearn import preprocessing
# Load the first sheet of the workbook through two readers: pandas supplies
# the target column, xlrd supplies the header cells and the feature columns.
# header=None makes pandas treat sheet row 0 as ordinary data; it is
# presumably the header row, which is why row 0 is skipped when slicing y.
df = pd.read_excel('modified.xls', header=None)
doc = xlrd.open_workbook('modified.xls').sheet_by_index(0)

# Cells 1..7 of sheet row 0, used as the feature names.
attributeNames = doc.row_values(0, 1, 8)

# Row count of the frame (sheet row 0 included), taken before rows are dropped.
n = len(df.index)

# NOTE(review): n is measured before dropna and X below is filled from the
# xlrd sheet, not from df. If dropna removes any row, y ends up shorter than
# X -- TODO confirm the workbook contains no empty cells.
df.dropna(inplace=True)

# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# ndarray and is available in every pandas version.
dfMatrix = df.values

# Target: column 0 of every row after the first.
y = dfMatrix[1:, 0]
# np.mat was removed in NumPy 2.0; np.asmatrix is the same function.
yMatrix = np.asmatrix(y)

# Feature matrix: sheet columns 1..7, rows 1..n-1, stored as floats.
X = np.asmatrix(np.empty((n - 1, 7)))
for i, col_id in enumerate(range(1, 8)):
    X[:, i] = np.matrix(doc.col_values(col_id, 1, n)).T

# Plain ndarray copy of the features, and a standardized version produced
# by sklearn's preprocessing.scale.
classX = np.asarray(X)
stdX = preprocessing.scale(classX)
# Names of the four target classes; the position in this list is the class id.
classNames = ['Poor', 'Lower', 'Middle', 'Upper']

# Fixed feature names, one per column of X. These replace whatever was
# assigned to attributeNames earlier in the script.
attributeNames = ['hours', 'iq', 'educ', 'exper', 'tenure', 'age', 'black']

# N = number of rows of X (observations), M = number of columns (attributes).
N, M = X.shape
# Bin the target into four classes by quartile:
#   0 -> value <= 25th percentile
#   1 -> value <= 50th percentile
#   2 -> value <= 75th percentile
#   3 -> everything above
# The three cut points are computed once and all values are binned in a
# single vectorized call, instead of recomputing the percentiles for every
# row inside a Python loop.
yValues = np.asarray(y, dtype=float)
quartileCuts = np.percentile(yValues, [25, 50, 75])

# side='left' places a value equal to a cut point in the lower class, which
# is exactly what a chain of `<=` comparisons does. The labels are stored as
# floats, one per entry of y.
classY = np.searchsorted(quartileCuts, yValues, side='left').astype(float)

# Number of classes.
C = len(classNames)
#boxplot(preprocessing.scale(classX))