-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathTree.cpp
300 lines (286 loc) · 10.9 KB
/
Tree.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
#include "Tree.h"

#include <algorithm>
#include <random>
/* Prototypes of helper functions that are used
in the member functions of the class.
*/
void printTree(Node *root, int depth);
feature chooseBestFeature(const vector<feature> &feats, vector<example> &E,
vector<example> &upSet, vector<example> &downSet, float &sP, bool useName);
double H(const vector<example> &E);
inline double infoGain(const vector<example> &E, const vector<example> &upSet,
const vector<example> &downSet);
vector<feature> randomChooseFeatures();
example averageLabelValueOfExamples(const vector<example> &E);
/*
Definitions of the member functions.
*/
// By Tony, Sep. 20. The constructor of the class.
// By Tony, Sep. 20. The constructor of the class.
// Starts with an empty tree: no root node until beginLearning() is called.
Tree::Tree() {
    root = nullptr;  // nullptr instead of the C macro NULL
}
// By Tony, Sep. 20. The destructor of the Tree; releases the root node's space.
// Before the root node is released, it will automatically release its sub-nodes.
// Destructor: release the root node (which recursively releases its
// sub-nodes).
// BUG FIX: the original called root->~Node() directly, which runs the
// destructor but never frees the storage allocated with 'new' in
// learning() — a memory leak. 'delete' both destroys and deallocates,
// and is a safe no-op on a null pointer.
Tree::~Tree() {
    delete root;
}
// By Tony, Sep20. Below are some class parameter setting methods.
void Tree::setThreshold(float exampleSize, float depth, float infoGain) {
infoGainThreshold = infoGain;
exampleSizeThreshold = exampleSize;
depthThreshold = depth;
}
void Tree::setTrainData(const vector<example> &a) {
trainData = a;
}
void Tree::setTestData(const vector<example> &a) {
testData = a;
}
void Tree::setFeatures(const vector<feature> &a) {
features = a;
}
int Tree::getTrainDataSize() {return this->trainData.size();}
// By Tony, Sep 20. Begin split the tree from the root node.
// By Tony, Sep 20. Begin splitting the tree from the root node.
// BUG FIX: the original printed a warning for empty trainData but then
// still fell through into learning() whenever 'features' was non-empty.
// Return early on either missing input so learning() never runs on
// empty data.
void Tree::beginLearning() {
    if (trainData.empty()) {
        cout << "Tree's trainData size is empty." << endl;
        return;
    }
    if (features.empty()) {
        cout << "Tree's training features is empty." << endl;
        return;
    }
    root = learning(this->trainData, this->features, 0);
}
// By Tony, Sep 20. Get the result of prediction of this tree.
// By Tony, Sep 20. Get this tree's predictions for every test example.
// Returns an empty vector (with a diagnostic) when there is no test data
// or the tree has not been built.
// BUG FIX: the original printed "root = NULL" inside the loop and then
// dereferenced the null root anyway, crashing; bail out instead.
vector<example> Tree::getResult() {
    result.clear();
    if (testData.empty()) {
        cout << "Tree's test data is empty. Can not get result." << endl;
        return result;
    }
    if (root == nullptr) {
        cout << "root = NULL" << endl;
        return result;
    }
    for (size_t i = 0; i < testData.size(); i++)
        result.push_back(root->makeDecision(testData[i]));
    return result;
}
// By Tony, Sep 20. Print this tree to the console by calling a recursive function.
void Tree::print() {
printTree(root, 0);
}
// By Tony, Sep 20. Core algorithm of decision tree.
// By Tony, Sep 20. Core decision-tree induction (recursive).
// Make a leaf when: the example set is small enough, the depth limit is
// reached, or H(E) is -inf (the labels are essentially identical, so
// further splitting is useless). Otherwise pick the best feature/split
// point over a fresh random feature subset and recurse on both halves.
//
// NOTE(review): the 'feats' parameter is effectively unused — a new
// random subset is drawn at every node (random-forest style). The
// original shadowed the parameter with a local of the same name; the
// signature is kept unchanged for existing callers. The unused 'int n'
// local was removed.
Node* Tree::learning(vector<example> &E, const vector<feature> &feats, int depth) {
    if (E.size() <= exampleSizeThreshold || depth > depthThreshold
            || H(E) == -1 * numeric_limits<double>::infinity()) {
        return new Node(E, depth);  // leaf node holding the examples
    }
    float splitPoint;
    vector<example> upSet, downSet;
    // Fresh random feature subset for this node.
    vector<feature> nodeFeats = randomChooseFeatures();
    // Use name-based info gain only for reasonably large example sets.
    feature best = chooseBestFeature(nodeFeats, E, upSet, downSet, splitPoint,
                                     E.size() > 10);
    Node *newTree = new Node(best, depth);
    newTree->addNewBranch(learning(upSet, nodeFeats, depth + 1), splitPoint);
    newTree->addNewBranch(learning(downSet, nodeFeats, depth + 1), splitPoint);
    return newTree;
}
/* Below are some static functions that do not belong to the Tree class,
but they are used by the member functions of the Tree class. */
// By Tony, Sep 20. Print this tree to the console. For debug use.
// By Tony, Sep 20. Recursively print 'root' and its subtrees, indented
// one tab per depth level. The first (left) branch is prefixed with
// "<split: " and the other with ">split: ". For debug use.
// Robustness: guard against a null node (the original dereferenced it
// unconditionally).
void printTree(Node *root, int depth) {
    if (root == nullptr) return;
    root->print();
    vector<Node*> subTrees = root->getSubNodes();
    for (size_t i = 0; i < subTrees.size(); i++) {
        for (int j = 0; j <= depth; j++) cout << "\t";
        if (i == 0) cout << "<" << root->getSplitPoint() << ": ";
        else cout << ">" << root->getSplitPoint() << ": ";
        printTree(subTrees[i], depth + 1);
    }
}
// By Tony, Sep 20. This function returns the average value of the labels
// in E.
// In the returned 'example', only '.y' will be used.
// Recall that struct example { float x[feaNum]; float y[labelNum]; };
// Component-wise mean of the '.y' label vectors in E. Only '.y' of the
// returned example is meaningful; '.x' is left untouched.
// BUG FIX: the original divided by E.size() unconditionally, which is a
// division by zero when E is empty; return the zeroed example instead.
example averageLabelValueOfExamples(const vector<example> &E) {
    example ave;
    memset(ave.y, 0, sizeof(ave.y));
    if (E.empty()) return ave;
    for (size_t i = 0; i < E.size(); i++) {
        for (int j = 0; j < labelNum; j++) {
            ave.y[j] += E[i].y[j];
        }
    }
    for (int j = 0; j < labelNum; j++) ave.y[j] /= E.size();
    return ave;
}
// By Tony, Sep 20.
// By Tony, Sep 20. Randomly pick a subset of the feature indices
// [0, feaNum) — random-forest style per-node feature bagging.
// BUG FIX: 'num' was a hard-coded 200; if feaNum < 200 the resize() would
// GROW the vector, padding it with value-initialized (index 0) features.
// Clamp the subset size to feaNum.
// Also: std::random_shuffle was deprecated in C++14 and removed in
// C++17; use std::shuffle with an explicit engine.
vector<feature> randomChooseFeatures() {
    const int num = std::min(200, feaNum);
    vector<feature> feats;
    feats.reserve(feaNum);
    for (int i = 0; i < feaNum; i++) {
        feats.push_back({i});
    }
    static std::mt19937 rng{std::random_device{}()};
    std::shuffle(feats.begin(), feats.end(), rng);
    feats.resize(num);
    return feats;
}
// By Tony, Sep 21.
// The H(S) function used by infoGain().
// Third party library Armadillo is used for matrix calculation.
// See at: http://arma.sourceforge.net/
double H(const vector<example> &E) {
// Using Armadillo linear algebra lib.
// Declare a double number matrix S with size of E.size() * labelNum.
arma::mat S(E.size(), labelNum);
// Convert our data structure to the armadillo matrix so we can calculate
// the 'cov(S)' and the 'det(cov(S))' as in the paper page 5 formular (2).
//cout << E.size() << endl;
if (E.size() == 1) return 0;
for (int i = 0; i < S.n_rows; i++) {
for (int j = 0; j < S.n_cols; j++) {
S(i, j) = E[i].y[j];
// FOR DEBUG ONLY:
// cout << E[i].y[j] << ",";
}
// FOR DEBUG ONLY:
// cout << endl;
}
//Calculate the covariance matrix of S.
arma::mat covS = arma::cov(S);
// FOR DEBUG ONLY:
// cout << endl << "covS = " << covS << endl;
for (int i = 0; i < covS.n_rows; i++) covS(i, i) += 0.1;
//Calculate the determinant of covS.
double detCS = arma::det(covS);
// FOR DEBUG ONLY:
// cout << "detcs = " << detCS << endl;
// log() is the function 'ln()'
// M_PI = 3.14159265358979323846
double c = (labelNum / 2) * (1 + log(2 * M_PI));
double lg = log(detCS);
/* FOR DEBUG ONLY
if (lg == -1 * numeric_limits<double>::infinity()) {
// if detCS -> 0, then ln(detCS) -> -inf
// which means the examples labels value are very close. (I guess)
cout << "detCS = 0 " << endl;
}
*/
// FOR DEBUG ONLY:
//cout << lg / 2+ c << endl;
//exit (EXIT_FAILURE);
return lg / 2 + c;
}
// By Tony. Sep 20.
// By Tony. Sep 20.
// Information gain of splitting E into upSet/downSet (differential
// entropy version).
// PERF FIX: each H() call builds a covariance matrix; the original
// evaluated H(upSet)/H(downSet) up to three times each. Compute each
// entropy exactly once and reuse it.
inline double infoGain(const vector<example> &E, const vector<example> &upSet, const vector<example> &downSet) {
    const double negInf = -1 * numeric_limits<double>::infinity();
    const double hUp = H(upSet);
    const double hDown = H(downSet);
    // -inf means that side's labels are essentially identical — a
    // perfect split on that side.
    if (hUp == negInf && hDown == negInf)
        return numeric_limits<double>::infinity();
    // INF is a fake inf (INF = 2e9): ranks "one side pure" splits above
    // every ordinary gain but below "both sides pure".
    if (hUp == negInf)
        return INF - downSet.size() * hDown;
    if (hDown == negInf)
        return INF - upSet.size() * hUp;
    return H(E) - (((upSet.size() + 0.0) / E.size()) * hUp
                   + ((downSet.size() + 0.0) / E.size()) * hDown);
}
// Map the last character of an example's name to an integer class label:
// '1'..'9' -> 1..9, '0' -> 10, 'A'..'Z' -> 11..36.
// BUG FIX: the original used chained comparisons ('1' <= c <= '9'),
// which C++ parses as ('1' <= c) <= '9' — a bool (0 or 1) compared with
// '9', which is always true — so EVERY name fell into the first branch.
// Use explicit && range checks instead. (Error message also corrected:
// '0' is a valid last character.)
int label(const example &e) {
    const char c = e.name[e.name.size() - 1];
    if (c >= '1' && c <= '9')
        return c - '1' + 1;
    if (c == '0')
        return 10;
    if (c >= 'A' && c <= 'Z')
        return c - 'A' + 11;
    cout << "example name[last] not [0-9] or [A-Z]" << endl;
    exit(0);
}
double H2(const vector<example> &E) {
int count[50] = {0};
for (int i = 0; i < E.size(); i++) {
count[label(E[i])]++;
}
double ret = 0;
for (int i = 0; i < 50; i++) {
double x = float(count[i])/E.size();
if (count[i])
ret += x * log(x);
}
return -ret;
}
// Name-based information gain: entropy drop from splitting E into
// upSet/downSet, weighting each child's entropy by its share of E.
double infoGain2(const vector<example> &E, const vector<example> &upSet, const vector<example> &downSet) {
    const double upShare = (upSet.size() + 0.0) / E.size();
    const double downShare = (downSet.size() + 0.0) / E.size();
    return H2(E) - (upShare * H2(upSet) + downShare * H2(downSet));
}
// By Tony. Sep 20.
// useName == true: calculate infoGain by name.
// useName == false: calculate infoGain by points.
// Exhaustively try each candidate feature with splitPointsNum evenly
// spaced interior split points, and keep the (feature, split point) pair
// with the highest information gain.
//   useName == true : gain from the class-name labels (infoGain2)
//   useName == false: gain from the point labels (infoGain)
// Outputs: upSet/downSet receive the winning partition of E, sP the
// winning split value. Exits the program if no feature can be split
// (every candidate is near-constant over E).
// Cleanups: removed the dead 'using namespace std::chrono;' (only used
// by commented-out profiling), fixed the "erorr" typo in the diagnostic,
// and hoisted feats[i].index out of the inner loops.
feature chooseBestFeature(const vector<feature> &feats, vector<example> &E, vector<example> &upSet, vector<example> &downSet, float &sP, bool useName) {
    double maxInfoGain = -1 * numeric_limits<double>::infinity();
    int bestFeature = -1;
    for (size_t i = 0; i < feats.size(); i++) {
        const int fi = feats[i].index;
        // Range of this feature's values over E.
        float min = INF, max = -INF;
        for (size_t j = 0; j < E.size(); j++) {
            if (E[j].x[fi] > max) max = E[j].x[fi];
            if (E[j].x[fi] < min) min = E[j].x[fi];
        }
        // Skip (near-)constant features: |max - min| < 0.01.
        if (0.01 > max - min && 0.01 > min - max) continue;
        // splitPointsNum evenly spaced interior split points.
        double split[splitPointsNum], step = (max - min) / (splitPointsNum + 1);
        for (int j = 0; j < splitPointsNum; j++) split[j] = min + step * (j + 1);
        for (int j = 0; j < splitPointsNum; j++) {
            upSet.clear();
            downSet.clear();
            for (size_t k = 0; k < E.size(); k++) {
                if (E[k].x[fi] <= split[j]) upSet.push_back(E[k]);
                else downSet.push_back(E[k]);
            }
            double nu = useName ? infoGain2(E, upSet, downSet)
                                : infoGain(E, upSet, downSet);
            // Remember the best feature index and split value seen so far.
            if (maxInfoGain < nu) {
                maxInfoGain = nu;
                sP = split[j];
                bestFeature = fi;
            }
        }
    }
    if (bestFeature < 0 || bestFeature >= feaNum) {
        cout << "choose feature error: best feature index = " << bestFeature << endl;
        cout << "Exit program.." << endl;
        exit(0);
    }
    // Rebuild the winning partition for the caller (upSet/downSet
    // currently hold whatever the last candidate produced).
    upSet.clear();
    downSet.clear();
    for (size_t k = 0; k < E.size(); k++) {
        if (E[k].x[bestFeature] <= sP) upSet.push_back(E[k]);
        else downSet.push_back(E[k]);
    }
    feature a;
    a.index = bestFeature;
    return a;
}