import numpy as np
import treelib
import scipy.stats
class ID3():
    '''ID3 decision tree classifier for categorical (discrete) features.

    The model is held in a treelib.Tree: every internal node stores
    ``(split_feature_index, majority_class)`` and every leaf stores the
    predicted class label.
    '''

    def __init__(self):
        self.__tree = treelib.Tree()

    def __get_entropy(self, y):
        # Shannon entropy (natural log) of the empirical class distribution of y.
        _, counts = np.unique(y, return_counts=True)
        prob_classes = counts / np.sum(counts)
        return scipy.stats.entropy(prob_classes)

    def __get_info_gain(self, y_subs, y):
        # Information gain = H(y) minus the size-weighted mean entropy of the subsets.
        weighted_entropy = sum(self.__get_entropy(y_sub) * len(y_sub) for y_sub in y_subs)
        return self.__get_entropy(y) - weighted_entropy / len(y)

    def __majority(self, y):
        # Most frequent label in y; ties resolved deterministically
        # (np.unique returns sorted values, argmax keeps the first maximum).
        values, counts = np.unique(y, return_counts=True)
        return values[np.argmax(counts)]

    def __create_tree(self, parent, X, y):
        # Recursively grow the subtree rooted at `parent` from the samples (X, y).
        data_number, feature_number = X.shape

        if data_number == 0:
            return

        # Leaf: the node is pure, or no feature can split the data any further.
        if len(np.unique(y)) == 1 or (X == X[0]).all():
            self.__tree.update_node(parent.identifier, data=self.__majority(y))
            return

        # Choose the feature with the highest information gain.
        info_gain_max = -np.inf
        feature_split = None
        for i in range(feature_number):
            feature_labels = np.unique(X[:, i])
            if len(feature_labels) == 1:
                continue  # constant feature: zero information gain

            y_subs = [y[np.flatnonzero(X[:, i] == feature_label)]
                      for feature_label in feature_labels]
            info_gain = self.__get_info_gain(y_subs, y)
            if info_gain > info_gain_max:
                info_gain_max = info_gain
                feature_split = i

        # Internal node: remember the split feature plus a majority-class
        # fallback used at query time for feature values never seen here.
        self.__tree.update_node(parent.identifier,
                                data=(feature_split, self.__majority(y)))
        for feature_label in np.unique(X[:, feature_split]):
            # Hoist the mask so each subset is computed only once.
            mask = np.flatnonzero(X[:, feature_split] == feature_label)
            node = self.__tree.create_node(feature_label, parent=parent)
            self.__create_tree(node, X[mask], y[mask])

    def fit(self, X, y):
        '''
        Parameters
        ----------
        X : shape (data_number, feature_number)
            Training data, must be discrete value
        y : shape (data_number)
            Target values
        '''
        # Start from a fresh tree so that fit() can be called more than once
        # (treelib rejects creating a second root in an existing tree).
        self.__tree = treelib.Tree()
        root = self.__tree.create_node('root')
        self.__create_tree(root, X, y)
        self.__tree.show()

    def __query(self, x, node):
        # Walk the tree from `node` following the feature values of sample x.
        if node.is_leaf():
            return node.data

        feature_split, majority = node.data
        for child in self.__tree.children(node.identifier):
            if x[feature_split] == child.tag:
                return self.__query(x, child)

        # Feature value never seen during training: fall back to this node's
        # majority class instead of silently returning None.
        return majority

    def predict(self, X):
        '''
        Parameters
        ----------
        X : shape (data_number, feature_number)
            Predicting data, must be discrete value

        Returns
        -------
        y : shape (data_number,)
            Predicted class label per sample
        '''
        return np.apply_along_axis(self.__query, 1, X,
                                   self.__tree.get_node(self.__tree.root))
import numpy as np
import treelib
import scipy.stats
class ID3():
    '''ID3 decision tree classifier for categorical (discrete) features.

    The model is held in a treelib.Tree: every internal node stores
    ``(split_feature_index, majority_class)`` and every leaf stores the
    predicted class label.
    '''

    def __init__(self):
        self.__tree = treelib.Tree()

    def __get_entropy(self, y):
        # Shannon entropy (natural log) of the empirical class distribution of y.
        _, counts = np.unique(y, return_counts=True)
        prob_classes = counts / np.sum(counts)
        return scipy.stats.entropy(prob_classes)

    def __get_info_gain(self, y_subs, y):
        # Information gain = H(y) minus the size-weighted mean entropy of the subsets.
        weighted_entropy = sum(self.__get_entropy(y_sub) * len(y_sub) for y_sub in y_subs)
        return self.__get_entropy(y) - weighted_entropy / len(y)

    def __majority(self, y):
        # Most frequent label in y; ties resolved deterministically
        # (np.unique returns sorted values, argmax keeps the first maximum).
        values, counts = np.unique(y, return_counts=True)
        return values[np.argmax(counts)]

    def __create_tree(self, parent, X, y):
        # Recursively grow the subtree rooted at `parent` from the samples (X, y).
        data_number, feature_number = X.shape

        if data_number == 0:
            return

        # Leaf: the node is pure, or no feature can split the data any further.
        if len(np.unique(y)) == 1 or (X == X[0]).all():
            self.__tree.update_node(parent.identifier, data=self.__majority(y))
            return

        # Choose the feature with the highest information gain.
        info_gain_max = -np.inf
        feature_split = None
        for i in range(feature_number):
            feature_labels = np.unique(X[:, i])
            if len(feature_labels) == 1:
                continue  # constant feature: zero information gain

            y_subs = [y[np.flatnonzero(X[:, i] == feature_label)]
                      for feature_label in feature_labels]
            info_gain = self.__get_info_gain(y_subs, y)
            if info_gain > info_gain_max:
                info_gain_max = info_gain
                feature_split = i

        # Internal node: remember the split feature plus a majority-class
        # fallback used at query time for feature values never seen here.
        self.__tree.update_node(parent.identifier,
                                data=(feature_split, self.__majority(y)))
        for feature_label in np.unique(X[:, feature_split]):
            # Hoist the mask so each subset is computed only once.
            mask = np.flatnonzero(X[:, feature_split] == feature_label)
            node = self.__tree.create_node(feature_label, parent=parent)
            self.__create_tree(node, X[mask], y[mask])

    def fit(self, X, y):
        '''
        Parameters
        ----------
        X : shape (data_number, feature_number)
            Training data, must be discrete value
        y : shape (data_number)
            Target values
        '''
        # Start from a fresh tree so that fit() can be called more than once
        # (treelib rejects creating a second root in an existing tree).
        self.__tree = treelib.Tree()
        root = self.__tree.create_node('root')
        self.__create_tree(root, X, y)
        self.__tree.show()

    def __query(self, x, node):
        # Walk the tree from `node` following the feature values of sample x.
        if node.is_leaf():
            return node.data

        feature_split, majority = node.data
        for child in self.__tree.children(node.identifier):
            if x[feature_split] == child.tag:
                return self.__query(x, child)

        # Feature value never seen during training: fall back to this node's
        # majority class instead of silently returning None.
        return majority

    def predict(self, X):
        '''
        Parameters
        ----------
        X : shape (data_number, feature_number)
            Predicting data, must be discrete value

        Returns
        -------
        y : shape (data_number,)
            Predicted class label per sample
        '''
        return np.apply_along_axis(self.__query, 1, X,
                                   self.__tree.get_node(self.__tree.root))