-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathk_fold_cv.py
66 lines (61 loc) · 2.88 KB
/
k_fold_cv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
"""
CS501 Group 16
Fall 2018
Author: Jeff Xie
Credit: KFold implementation from scikit-learn (sklearn.model_selection.KFold)
Inputs:
k_list: a list of k values for knn
train: training data to be folded for cross validation
label: labels for the training data
folds: number of folds
Outputs:
Bar graph of k values and their accuracy
Recommended k value
"""
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import copy #for deepcopy,
from knndtw import KnnDtw
from sklearn.model_selection import KFold
#Import scoring metrics, import more as needed
from sklearn.metrics import classification_report, confusion_matrix, precision_score, accuracy_score
def k_fold_cross_val(k_list, train, label, folds):
    """Choose the best k for the KnnDtw classifier by k-fold cross-validation.

    The samples and their labels are shuffled together once, then the same
    fold layout is reused for every candidate k. For each k the accuracy is
    measured on every held-out fold and averaged. The averaged accuracies are
    drawn as a bar chart on the current matplotlib figure (``plt.show()`` is
    not called here; the caller decides when/how to display or save it).

    Args:
        k_list: Non-empty sequence of candidate ``n_neighbors`` values.
        train: Training samples, indexable along the first axis (converted
            with ``np.asarray``).
        label: Labels aligned with ``train`` along the first axis.
        folds: Number of cross-validation folds passed to ``KFold``.

    Returns:
        The element of ``k_list`` with the highest mean accuracy. On ties the
        earliest candidate in ``k_list`` wins.

    Raises:
        ValueError: If ``k_list`` is empty or ``train`` and ``label`` do not
            contain the same number of samples.
    """
    # Fail fast on bad input, before any expensive DTW computation happens.
    if len(k_list) == 0:
        raise ValueError("k_list must contain at least one k value")

    # Accept plain lists as well as arrays; index-array selection below
    # requires numpy arrays.
    train = np.asarray(train)
    label = np.asarray(label)
    if train.shape[0] != label.shape[0]:
        raise ValueError(
            "train and label must have the same number of samples "
            "(got %d and %d)" % (train.shape[0], label.shape[0])
        )

    # Shuffle data and labels with the same permutation so pairs stay aligned.
    order = np.random.permutation(train.shape[0])
    train = train[order]
    label = label[order]

    kf = KFold(n_splits=folds)
    k_scores = []  # mean accuracy per candidate k, aligned with k_list

    for K in k_list:
        fold_scores = []  # accuracy of each fold for this k
        for train_index, test_index in kf.split(train):
            print("TRAIN:", train_index, "TEST:", test_index)
            X_train, X_test = train[train_index], train[test_index]
            y_train, y_test = label[train_index], label[test_index]

            # A fresh classifier per fold guarantees no state leaks from a
            # previous fit.
            clf = KnnDtw(n_neighbors=K, max_warping_window=100)
            clf.fit(X_train, y_train)
            labels, proba = clf.predict(X_test)

            acc = accuracy_score(y_test, labels)
            print('Accuracy for this fold is:', acc)
            fold_scores.append(acc)

        # Collapse the per-fold accuracies into a single score for this k.
        k_scores.append(np.mean(fold_scores))

    # Plot the average accuracy for each k and recommend the best
    # (highest-accuracy) one.
    k_best = k_list[np.argmax(k_scores)]
    plt.bar(k_list, k_scores, width=0.2)
    plt.xlabel('k (nearest neighbors)')
    plt.ylabel('Accuracy (average)')
    plt.xticks(k_list)
    print('Best k value from list is:', k_best)
    return k_best