Skip to content

Commit 5d5ab0b

Browse files
committed
* implemented random forest with analysis
1 parent 90c4fc6 commit 5d5ab0b

File tree

4 files changed

+91
-4
lines changed

4 files changed

+91
-4
lines changed

analysis.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ def analyzeAndWriteToFile(classifier, predictions, answerKey, foldsEvaluations,
3535
elif classifier.startswith("KNearestNeighbor Classifier_"):
3636
idx = classifier.index("_")
3737
clf = "{k}knn".format(k=classifier[idx+1:])
38+
elif classifier == "Random Forest Classifier":
39+
clf = "rf"
3840

3941

4042
fileName = clf + "_" + date + "_" + t + ".txt"

knn_RNASeq.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from sklearn.neighbors import KNeighborsClassifier
2+
import sys
23
import numpy as np
34

45

main.py

Lines changed: 62 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,9 @@
77
import rbfSVC_RNASeq
88
import neuralNetwork_RNASeq
99
import knn_RNASeq
10+
import randomForest_RNASeq
1011
import analysis
1112

12-
13-
# Resource: http://machinelearningmastery.com/get-your-hands-dirty-with-scikit-learn-now/
14-
# Python for Java Programmers: http://python4java.necaiseweb.org/Fundamentals/TheBasics
15-
1613
# run with down sampling and cross validation: python main.py GSE60361C13005Expression.txt expressionmRNAAnnotations.txt 1 1 1
1714
# run with down sampling and without cross validation: python main.py GSE60361C13005Expression.txt expressionmRNAAnnotations.txt 1 1 0
1815
# run without downsampling and with cross validation: python main.py GSE60361C13005Expression.txt expressionmRNAAnnotations.txt 1 0 1
@@ -54,6 +51,15 @@ def knn(trainingData, testingData, trainingDataTargets, testingDataTargets):
5451

5552
return knn_predictionResults
5653

54+
def rf(trainingData, testingData, trainingDataTargets, testingDataTargets):
    """Fit the random forest on the training set and predict the test set.

    trainingData / trainingDataTargets are forwarded to
    randomForest_RNASeq.fitTrainingData; testingData is forwarded to
    randomForest_RNASeq.predictTestData, whose result is returned as-is.
    testingDataTargets is accepted for signature parity with the call
    sites but is not used inside this function.
    """
    # train first so the prediction below runs against the fitted model
    randomForest_RNASeq.fitTrainingData(trainingData, trainingDataTargets)

    # hand the predictions straight back to the caller
    return randomForest_RNASeq.predictTestData(testingData)
62+
5763
if __name__ == '__main__':
5864
t0 = time.clock()
5965
print "start"
@@ -96,6 +102,11 @@ def knn(trainingData, testingData, trainingDataTargets, testingDataTargets):
96102
print " - Using Multi-Layer Perceptron (Neural Network)"
97103
elif classifier == 3:
98104
print " - Using K Nearest Neighbor Classifier with k = {k}".format(k=n_neighbors)
105+
elif classifier == 4:
106+
print " - Using Random Forest Classifier"
107+
else:
108+
print "** ERROR: invalid classifier selection"
109+
sys.exit(0)
99110

100111
if downSampleFlag:
101112
print "** Down sampling enabled **"
@@ -197,6 +208,15 @@ def knn(trainingData, testingData, trainingDataTargets, testingDataTargets):
197208
foldsEvaluations.append(analysis.calculateEvaluations(knn_predictionResults, testingDataKey))
198209
# ***************** END KNN *****************
199210

211+
elif classifier == 4:
212+
# ***************** RF *****************
213+
# fit and make predictions
214+
rf_predictionResults = rf(trainingFolds, testingData, trainingKeys, testingDataKey)
215+
216+
# add the accuracies for this fold to the accuracies list
217+
foldsEvaluations.append(analysis.calculateEvaluations(rf_predictionResults, testingDataKey))
218+
# ***************** END RF *****************
219+
200220

201221
# increment iterator to process the next fold as testing data
202222
iterator += 1
@@ -218,6 +238,10 @@ def knn(trainingData, testingData, trainingDataTargets, testingDataTargets):
218238
# ***************** KNN *****************
219239
analysis.analyzeAndWriteToFile("KNearestNeighbor Classifier_{k}".format(k=n_neighbors), knn_predictionResults, testingDataKey, foldsEvaluations, 10, 0)
220240
# ***************** END KNN *****************
241+
elif classifier == 4:
242+
# ***************** RF *****************
243+
analysis.analyzeAndWriteToFile("Random Forest Classifier", rf_predictionResults, testingDataKey, foldsEvaluations, 10, 0)
244+
# ***************** END RF *****************
221245

222246
else:
223247
# partition the down sampled data set into 70% training and 30% testing
@@ -258,7 +282,16 @@ def knn(trainingData, testingData, trainingDataTargets, testingDataTargets):
258282

259283
analysis.analyzeAndWriteToFile("KNearestNeighbor Classifier_{k}".format(k=n_neighbors), knn_predictionResults, data.getDSTestingDataTargetValues(), foldsEvaluations, 1, 1)
260284
# ***************** END KNN *****************
285+
elif classifier == 4:
286+
# ***************** RF *****************
287+
rf_predictionResults = rf(data.getDSTrainingData(), data.getDSTestingData(), data.getDSTargetValues(),
288+
data.getDSTestingDataTargetValues())
261289

290+
foldsEvaluations = [] # single fold list but we still need to use a 3D list
291+
foldsEvaluations.append(analysis.calculateEvaluations(rf_predictionResults, data.getDSTestingDataTargetValues()))
292+
293+
analysis.analyzeAndWriteToFile("Random Forest Classifier", rf_predictionResults, data.getDSTestingDataTargetValues(), foldsEvaluations, 1, 1)
294+
# ***************** END RF *****************
262295

263296
else:
264297
if crossValidateFlag:
@@ -319,6 +352,14 @@ def knn(trainingData, testingData, trainingDataTargets, testingDataTargets):
319352
# add the accuracies for this fold to accuracies list
320353
foldsEvaluations.append(analysis.calculateEvaluations(knn_predictionResults, testingDataKey))
321354
# ***************** END KNN *****************
355+
elif classifier == 4:
356+
# ***************** RF *****************
357+
# fit and make predictions
358+
rf_predictionResults = rf(trainingFolds, testingData, trainingKeys, testingDataKey)
359+
360+
# add the accuracies for this fold to accuracies list
361+
foldsEvaluations.append(analysis.calculateEvaluations(rf_predictionResults, testingDataKey))
362+
# ***************** END RF *****************
322363

323364

324365
# increment iterator to process the next fold as testing data
@@ -341,6 +382,10 @@ def knn(trainingData, testingData, trainingDataTargets, testingDataTargets):
341382
# ***************** KNN *****************
342383
analysis.analyzeAndWriteToFile("KNearestNeighbor Classifier_{k}".format(k=n_neighbors), knn_predictionResults, testingDataKey, foldsEvaluations, 10, 2)
343384
# ***************** END KNN *****************
385+
elif classifier == 4:
386+
# ***************** RF *****************
387+
analysis.analyzeAndWriteToFile("Random Forest Classifier", rf_predictionResults, testingDataKey, foldsEvaluations, 10, 2)
388+
# ***************** END RF *****************
344389

345390
else:
346391
# partition the data set into 70% training and 30% testing
@@ -386,5 +431,18 @@ def knn(trainingData, testingData, trainingDataTargets, testingDataTargets):
386431
analysis.analyzeAndWriteToFile("KNearestNeighbor Classifier_{k}".format(k=n_neighbors), knn_predictionResults, data.getTestingDataTargetValues(), foldsEvaluations, 1, 3)
387432
# ***************** END KNN *****************
388433

434+
elif classifier == 4:
435+
# ***************** RF *****************
436+
rf_predictionResults = rf(data.getTrainingData(), data.getTestingData(), data.getTrainingDataTargetValues(),
437+
data.getTestingDataTargetValues())
438+
439+
# analyze results using robust evaluations
440+
foldsEvaluations = []
441+
442+
foldsEvaluations.append(analysis.calculateEvaluations(rf_predictionResults, data.getTestingDataTargetValues()))
443+
444+
analysis.analyzeAndWriteToFile("Random Forest Classifier", rf_predictionResults, data.getTestingDataTargetValues(), foldsEvaluations, 1, 3)
445+
# ***************** END RF *****************
446+
389447
print "\nprogram execution: {t} seconds".format(t=time.clock()-t0)
390448
print "exiting"

randomForest_RNASeq.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from sklearn.ensemble import RandomForestClassifier
2+
import sys
3+
import numpy as np
4+
5+
rfClf = RandomForestClassifier(n_estimators=10)
6+
7+
# Source: http://scikit-learn.org/stable/modules/ensemble.html
8+
def fitTrainingData(training_data, nSamples):
    """Fit the module-level random forest (rfClf) on the given data.

    training_data -- the X argument of fit(X, y); the original notes
                     describe it as a 2D array of cells.
    nSamples      -- the y argument of fit(X, y); despite the name, the
                     original notes describe it as a 1D list of group
                     labels, e.g. [group1, group1, group2, group2].

    Both arguments are converted with np.array before being passed to
    rfClf.fit. Nothing is returned.
    """
    # convert the inputs to numpy arrays and fit in a single step
    rfClf.fit(np.array(training_data), np.array(nSamples))
18+
19+
def predictTestData(testing_data):
    """Return the module-level random forest's predictions for testing_data.

    testing_data is converted with np.array and passed to rfClf.predict;
    the result of that call is returned unchanged.
    """
    # convert to a numpy array and predict in one expression
    return rfClf.predict(np.array(testing_data))

0 commit comments

Comments
 (0)