
Commit fcdf8ff

add DT
1 parent 1a8324f commit fcdf8ff

File tree

2 files changed: +333 -0 lines changed

Decision_Tree/decision_tree_python.py

+273
@@ -0,0 +1,273 @@
#coding:utf-8
#Author:codewithzichao

# mnist_train: 60000 samples
# mnist_test: 10000 samples
# acc: 0.8636
# time: 583.6889300346375

import pandas as pd
import numpy as np
import time
from collections import Counter

def loadData(fileName):
    # read the data from the file
    data = pd.read_csv(fileName, header=None)
    # convert the data from a DataFrame to an ndarray
    data = data.values
    # the first column holds the class label
    y_label = data[:, 0]
    x_label = data[:, 1:]

    # binarize the data before returning it: each pixel xi ranges over 0-255,
    # so estimating p(X=xi|Y=y) directly would involve far too many cases;
    # binarization keeps the computation tractable
    x_label[x_label < 128] = 0
    x_label[x_label >= 128] = 1

    # both x_label and y_label are np.ndarray
    return x_label, y_label
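# A quick check of the binarization (toy values, not part of the original
# script): pixel intensities are thresholded at 128, so a row such as
# [0, 127, 128, 255] becomes [0, 0, 1, 1].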
# compute the entropy of a column
def calcul_H_D(column):
    '''
    :param column: the column whose entropy is needed
    :return: the entropy H(D)
    '''
    # collect the distinct values in this column
    types = set([i for i in column])  # a set holds no duplicate elements

    type_dic = {}  # counts how many samples fall in each Di
    HD = 0
    # initialize type_dic
    for i in types:
        type_dic[i] = 0
    # H(D) = -sum(|Di|/|D| * log2(|Di|/|D|))
    for i in range(len(column)):
        type_dic[column[i]] += 1
    for i in type_dic:
        HD = HD + (-1) * type_dic[i] / len(column) * np.log2(type_dic[i] / len(column))
    return HD
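# Sanity check on toy data (hypothetical values): a perfectly balanced binary
# column carries exactly 1 bit of entropy, while a pure column carries none:
#   calcul_H_D(np.array([0, 0, 1, 1]))  # -> 1.0
#   calcul_H_D(np.array([1, 1, 1, 1]))  # -> 0.0 (log2(1) = 0)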

# compute the conditional entropy
# H(D|A) = sum(|Di|/|D| * H(Di))
def calcul_H_D_A(column, y_label):
    '''
    :param column: the column of feature A, as an np.array
    :param y_label: the class labels D, as an np.array
    :return: the conditional entropy H(D|A)
    '''
    # collect the distinct values feature A can take
    types = set([i for i in column])

    # entropy of D conditioned on each value Ai
    H_D_Ai = {}

    type_dic = {}  # counts how many samples fall in each Di
    for i in types:
        # initialize type_dic
        type_dic[i] = 0

        # entropy of the subset where A takes the value Ai;
        # y_label[column == i] selects the labels of the samples whose
        # feature A equals Ai
        H_D_Ai[i] = calcul_H_D(y_label[column == i])

    # count each Di so that |Di|/|D| can be computed
    for i in range(len(column)):
        type_dic[column[i]] += 1

    # combine into the conditional entropy
    H_D_A = 0
    for i in types:
        H_D_A += type_dic[i] / len(column) * H_D_Ai[i]
    return H_D_A
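# Toy check (hypothetical values): a feature that isolates the mixed half of
# the labels halves the remaining uncertainty:
#   calcul_H_D_A(np.array([0, 0, 1, 1]), np.array([0, 1, 1, 1]))  # -> 0.5
# since H(D|A) = 1/2 * H([0, 1]) + 1/2 * H([1, 1]) = 1/2 * 1 + 1/2 * 0.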

# find the column with the largest information gain
def findMaxFeature(X_train, y_train):
    '''
    :param X_train: the training set D
    :param y_train: the training labels
    :return: the best column and its information gain
    '''
    features = X_train.shape[1]

    max_Gain = -10000   # largest information gain seen so far
    max_feature = -1    # column with the largest information gain

    # entropy of the whole sample
    H_D = calcul_H_D(y_train)

    for feature in range(features):  # iterate over the columns
        # note it is X_train[:, feature]; don't forget the ':' that selects all rows
        H_D_A = calcul_H_D_A(X_train[:, feature], y_train)

        # information gain g(D, A) = H(D) - H(D|A)
        if H_D - H_D_A > max_Gain:
            max_Gain = H_D - H_D_A
            max_feature = feature
    return max_feature, max_Gain
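# Toy check (hypothetical values): with
#   X = np.array([[0, 0], [0, 0], [1, 0], [1, 1]])
#   y = np.array([0, 0, 1, 1])
# column 0 separates the two classes perfectly, so
#   findMaxFeature(X, y)  # -> (0, 1.0)
# i.e. the gain g(D, A0) = H(D) - H(D|A0) = 1 - 0 = 1 bit.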

# for a column of labels, find the most frequent class and use it as the label
def findCluster(column):
    # use Counter to count every value that appears
    ans = Counter(column)
    # take the single most frequent value
    cluster = ans.most_common(1)[0][0]
    return cluster
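# e.g. findCluster(np.array([1, 1, 2])) returns 1 (toy values): Counter counts
# {1: 2, 2: 1}, most_common(1) yields [(1, 2)], and the first entry's first
# field is the majority class.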

# split the samples on a feature value
def cutData(X_train, y_train, Ag, ai):
    '''
    :param X_train: the training samples
    :param y_train: the sample labels
    :param Ag: the column of the feature to split on
    :param ai: the feature value to split on
    :return: the training samples and labels after the split
    '''
    rest_train_data = []   # training set after the split
    rest_train_label = []  # labels after the split

    for i in range(len(X_train)):
        if X_train[i][Ag] == ai:
            # keep only the rows whose value in column Ag equals ai,
            # and drop column Ag itself from the result; the split set
            # therefore has as many rows as there are samples with A = ai
            rest_train_data.append(list(X_train[i][0:Ag]) + list(X_train[i][Ag + 1:]))
            rest_train_label.append(y_train[i])
    return np.array(rest_train_data), np.array(rest_train_label)
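# Toy check (hypothetical values): splitting on column 0 == 0 keeps rows 0 and
# 2 and drops the split column itself:
#   cutData(np.array([[0, 1], [1, 1], [0, 0]]), np.array([0, 1, 0]), 0, 0)
#   # -> (array([[1], [0]]), array([0, 0]))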

def creTree(X_train, y_train):
    # when the information gain falls below epsilon, make T a single-node tree
    epsilon = 0.1

    print(f'create tree, data_length={len(X_train)}')

    # how many classes remain in this subset
    clusters = set([i for i in y_train])

    # if every sample in the subset belongs to the same class, T is a
    # single-node tree; return that class as the node's label
    if len(clusters) == 1:
        # all labels in y_train are identical, so just return the first one
        return y_train[0]

    # if the feature set A is empty, return the majority class directly;
    # len(X_train[0]) == 0 means no columns are left
    if len(X_train[0]) == 0:
        return findCluster(y_train)

    # find the column with the largest information gain
    feature, gain = findMaxFeature(X_train, y_train)

    # if the gain is below epsilon, T is a single-node tree;
    # return the majority class as its label
    if gain < epsilon:
        return findCluster(y_train)

    # otherwise split the samples on this feature and build subtrees recursively

    # distinct values of the chosen column
    types = set([i for i in X_train[:, feature]])

    tree_dic = {feature: {}}
    # the tree is a nested dict, e.g. tree = {123: {0: 7, 1: {...}}}
    # means: on column 123, value 0 classifies as 7, while value 1 descends
    # into a further subtree

    for i in types:
        # cutData returns a tuple (samples, labels)
        rest_X_train, rest_y_train = cutData(X_train, y_train, feature, i)
        tree_dic[feature][i] = creTree(rest_X_train, rest_y_train)

    return tree_dic
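# Minimal end-to-end sketch (hypothetical values): with
#   X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
#   y = np.array([0, 0, 1, 1])
# column 0 alone separates the classes, so one split suffices:
#   creTree(X, y)  # -> {0: {0: 0, 1: 1}}
# (the leaf values are numpy int64 labels taken from y).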

def predict(x_test, tree):
    while True:  # keep descending until a leaf of the tree is reached
        # read the split feature stored at this node
        (key, value), = tree.items()
        if type(value).__name__ == 'dict':
            # the value is still a dict, so keep descending; while traversing
            # we must delete the split feature (key) from the test sample,
            # because cutData dropped columns while building the tree, so the
            # stored feature indices are relative, not absolute
            feature = x_test[key]

            # x_test must be a list for del to work
            del x_test[key]
            # descend into the subtree
            # note it is value[feature], not tree[feature]
            tree = value[feature]
            # if the subtree is a single node, return its value directly;
            # type(tree).__name__ is 'int64' for a numpy label
            if type(tree).__name__ == 'int64':
                return tree
        else:
            # value is not a dict, so it is already a leaf label
            return value
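# Continuing the sketch above (hypothetical values): for
# tree = {0: {0: 0, 1: 1}} with numpy int64 leaves, predict([1, 0], tree)
# reads the split key 0, looks up x_test[0] == 1, deletes that entry, and
# descends to the leaf 1, which the 'int64' check returns. Note the check
# relies on the leaves being numpy integers (as creTree produces from ndarray
# inputs); plain Python int leaves would never match 'int64'.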

def test(X_test, y_test, tree):
    acc_num = 0
    for i in range(len(X_test)):
        y_pred = predict(list(X_test[i]), tree)
        if y_pred == y_test[i]:
            acc_num += 1
        print(f'find {i}th data cluster: y_pred={y_pred}, y={y_test[i]}')
        print('now_acc=', acc_num / (i + 1))


if __name__ == "__main__":
    # record the start time
    start = time.time()

    # load the training file
    print("load train data")
    X_train, y_train = loadData('../MnistData/mnist_train.csv')

    # load the test file
    print('load test data')
    X_test, y_test = loadData('../MnistData/mnist_test.csv')

    tree = creTree(X_train, y_train)

    test(X_test, y_test, tree)

    # record the end time
    end = time.time()

    print('run time:', end - start)
+60
@@ -0,0 +1,60 @@
#coding:utf-8
#Author:codewithzichao

'''
dataset: mnist
accuracy: 0.8659.
time: 14.435183763504028.
'''

import pandas as pd
import numpy as np
from sklearn import tree
import time

def loadData(fileName):
    # read the data from the file
    data = pd.read_csv(fileName, header=None)
    # convert the data from a DataFrame to an ndarray
    data = data.values
    # the first column holds the class label
    y_label = data[:, 0]
    x_label = data[:, 1:]
    y_label = np.array(y_label).reshape(-1)
    x_label = np.array(x_label)

    # binarize the data before returning it: each pixel xi ranges over 0-255,
    # so estimating p(X=xi|Y=y) directly would involve far too many cases;
    # binarization keeps the computation tractable
    x_label[x_label < 128] = 0
    x_label[x_label >= 128] = 1

    # both x_label and y_label are np.ndarray
    return x_label, y_label


if __name__ == "__main__":
    # record the start time
    start = time.time()

    # load the training file
    print("load train data")
    X_train, y_train = loadData('../MnistData/mnist_train.csv')

    # load the test file
    print('load test data')
    X_test, y_test = loadData('../MnistData/mnist_test.csv')

    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    test_accuracy = clf.score(X_test, y_test)
    print(f"the test_accuracy is {test_accuracy}.")

    end = time.time()

    print(f"the total time is {end-start}.")
