Commit 6948ac1

optimize code
2 parents 764c3ec + e3df5a5 commit 6948ac1

File tree

4 files changed: +108 -87 lines changed


decision_tree_id3.py

+78-78
@@ -1,79 +1,79 @@
 import numpy as np
 import treelib
 import scipy.stats
 
 class ID3():
     def __init__(self):
         self.__tree = treelib.Tree()
 
     def __get_entropy(self, y):
         _, counts = np.unique(y, return_counts=True)
         prob_classes = counts / np.sum(counts)
         return scipy.stats.entropy(prob_classes)
 
     def __create_tree(self, parent, X, y):
         data_number, feature_number = X.shape
 
         if data_number == 0:
             return
 
         if len(np.unique(y)) == 1 or (X == X[0]).all():
             self.__tree.update_node(parent.identifier, data=max(set(y), key=y.tolist().count))
             return
 
         info_gain_max = -np.inf
         for i in range(feature_number):
             if len(np.unique(X[:, i])) == 1:
                 continue
 
             y_subs = [y[np.flatnonzero(X[:, i] == feature_label)] for feature_label in np.unique(X[:, i])]
 
             info_gain = self.__get_info_gain(y_subs, y)
 
             if info_gain > info_gain_max:
                 info_gain_max = info_gain
                 feature_split = i
 
         self.__tree.update_node(parent.identifier, data=feature_split)
         for feature_label in np.unique(X[:, feature_split]):
             node = self.__tree.create_node(feature_label, parent=parent)
             self.__create_tree(node, X[np.flatnonzero(X[:, feature_split] == feature_label)], y[np.flatnonzero(X[:, feature_split] == feature_label)])
 
     def __get_info_gain(self, y_subs, y):
         return self.__get_entropy(y) - sum([self.__get_entropy(y_sub) * len(y_sub) for y_sub in y_subs]) / len(y)
 
     def fit(self, X, y):
         '''
         Parameters
         ----------
         X : shape (data_number, feature_number)
-            Training data
+            Training data, must be discrete value
         y : shape (data_number)
-            Target values, discrete value
+            Target values
         '''
         root = self.__tree.create_node('root')
         self.__create_tree(root, X, y)
         self.__tree.show()
 
     def __query(self, x, node):
         if node.is_leaf():
             return node.data
 
         feature_split = node.data
         for child in self.__tree.children(node.identifier):
             if x[feature_split] == child.tag:
                 return self.__query(x, child)
 
     def predict(self, X):
         '''
         Parameters
         ----------
         X : shape (data_number, feature_number)
-            Predicting data
+            Predicting data, must be discrete value
 
         Returns
         -------
         y : shape (data_number,)
             Predicted class label per sample
         '''
         return np.apply_along_axis(self.__query, 1, X, self.__tree.get_node(self.__tree.root))
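
For orientation, here is a minimal usage sketch of the ID3 class in this file. The toy dataset and the import path are illustrative assumptions, not part of this commit:

import numpy as np
from decision_tree_id3 import ID3  # assumed import path

# Toy discrete dataset (invented for illustration): two categorical features.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [2, 0], [2, 1]])
y = np.array([0, 0, 1, 1, 1, 0])

model = ID3()
model.fit(X, y)          # also prints the learned tree via treelib's show()
print(model.predict(X))  # predicted class label per sample, shape (6,)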

metrics.py

+2-2
@@ -177,9 +177,9 @@ def r2_score(y_true, y_pred):
     Parameters
     ----------
     y_true : shape (data_number, 1)
-        True label
+        True value
     y_pred : shape (data_number, 1)
-        Predicting label
+        Predicting value
 
     Returns
     -------
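
The wording fix is apt: r2_score is a regression metric, so it compares continuous values rather than class labels. For reference, the standard definition it points at (a sketch of the usual formula, not the repository's exact code):

import numpy as np

def r2_score_sketch(y_true, y_pred):
    # R^2 = 1 - SS_res / SS_tot, with SS_tot taken around the mean of y_true
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1 - ss_res / ss_tot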

preprocess.py

+1-1
@@ -62,7 +62,7 @@ def transform(self, X):
         X : shape (data_number, feature_number)
             The Predicting data standard scaler encoded.
         '''
-        return (X - self.__mean) / self.__std
+        return (X - self.__mean) / (self.__std + 1e-8)
 
 class OneHot:
     @property
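
The added 1e-8 guards against zero variance: a constant feature has standard deviation 0, so the old expression divided by zero and produced nan. A minimal sketch of the failure case (data invented for illustration):

import numpy as np

X = np.array([[1.0, 5.0], [1.0, 7.0], [1.0, 9.0]])  # first column is constant
mean, std = X.mean(axis=0), X.std(axis=0)            # std[0] == 0.0
print((X - mean) / (std + 1e-8))  # finite everywhere; the old form gave nan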

random_forest.py

+27-6
@@ -10,15 +10,25 @@ def __init__(self, mode='classification', debug=True):
         self.__mode = mode
         self.__debug = debug
 
-    def fit(self, X, y, trees_number, pick_feature_number):
+    def fit(self, X, y, n_trees, pick_feature_number):
+        '''
+        Parameters
+        ----------
+        X : shape (data_number, feature_number)
+            Training data
+        y : shape (data_number, 1)
+            Target values, 1 or 0
+        n_trees : The number of trees in the forest.
+        pick_feature_number : The number of features picked randomly
+        '''
         data_number, feature_number = X.shape
 
-        self.__indexs, self.__indexs_oob = preprocess.bagging(data_number, trees_number)
+        self.__indexs, self.__indexs_oob = preprocess.bagging(data_number, n_trees)
 
         if self.__debug:
             accuracy = []
 
-        for i in range(trees_number):
+        for i in range(n_trees):
             features = np.random.choice(feature_number, pick_feature_number, replace=False)
 
             X_bag = X[self.__indexs[i]][:, features]
@@ -38,10 +48,10 @@ def fit(self, X, y, trees_number, pick_feature_number):
 
     def __oob_verification(self, X, y):
         data_number = X.shape[0]
-        trees_number = len(self.__trees)
+        n_trees = len(self.__trees)
 
-        results = np.full((data_number, trees_number), None)
-        for i in range(trees_number):
+        results = np.full((data_number, n_trees), None)
+        for i in range(n_trees):
             tree = self.__trees[i]['model']
             features = self.__trees[i]['features']
             X_bag_oob = X[self.__indexs_oob[i]][:, features]
@@ -63,6 +73,17 @@ def __oob_verification(self, X, y):
         return metrics.accuracy(y, y_pred)
 
     def predict(self, X):
+        '''
+        Parameters
+        ----------
+        X : shape (data_number, feature_number)
+            Predicting data
+
+        Returns
+        -------
+        y : shape (data_number, 1)
+            Predicted value per sample
+        '''
         data_number = X.shape[0]
 
         results = np.empty((data_number, 0))
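
For context on the fit() loop above: preprocess.bagging is not shown in this diff, but the way its two return values are indexed suggests it returns, per tree, a bootstrap sample of row indices plus the left-out (out-of-bag) rows that __oob_verification scores against. A hedged sketch of that contract (the body below is inferred, not taken from the commit):

import numpy as np

def bagging_sketch(data_number, n_trees):
    indexs, indexs_oob = [], []
    for _ in range(n_trees):
        picked = np.random.choice(data_number, data_number, replace=True)  # bootstrap sample
        indexs.append(picked)
        indexs_oob.append(np.setdiff1d(np.arange(data_number), picked))    # out-of-bag rows
    return indexs, indexs_oob

idx, idx_oob = bagging_sketch(data_number=100, n_trees=5)
print(len(idx[0]), len(idx_oob[0]))  # 100 bootstrap rows; ~37 OOB rows on average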
