# LDA is used as a dimensionality reduction technique.
# It is used in the preprocessing step for pattern classification.
# The goal of LDA is to project a dataset onto a lower-dimensional space.
# LDA differs from PCA because, in addition to finding the component axes, with LDA we are
# interested in the axes that maximize the separation between multiple classes.
# In other words, LDA projects a feature space (a dataset of n-dimensional samples) onto a smaller
# subspace of dimension k (where k <= c - 1, with c the number of classes) while maintaining the
# class-discriminatory information.
# Both PCA and LDA are linear transformation techniques used for dimensionality reduction. PCA is
# described as unsupervised, but LDA is supervised because of its relation to the dependent variable.
# Additional Information : https://sebastianraschka.com/Articles/2014_python_lda.html
# The main 5 steps for LDA (a sketch of these steps follows the list):
# - Compute the d-dimensional mean vectors for the different classes from the dataset.
# - Compute the scatter matrices (between-class and within-class scatter matrix).
# - Compute the eigenvectors and corresponding eigenvalues of the scatter matrices.
# - Sort the eigenvectors by decreasing eigenvalues and choose the k eigenvectors with the largest
#   eigenvalues to form a (d x k) matrix W (where every column represents an eigenvector).
# - Use this (d x k) eigenvector matrix to transform the samples onto the new subspace, summarized
#   by the matrix multiplication Y = X * W (where X is the (n x d) matrix of the n samples and Y
#   holds the transformed samples in the new (n x k) subspace).
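# A minimal sketch of those five steps (an illustrative helper, not part of the original
# pipeline; it assumes `X` is a numeric sample matrix with invertible within-class scatter
# and `y` is a vector of class labels):
manual_lda = function(X, y, k = 2) {
  classes = unique(y)
  d = ncol(X)
  overall_mean = colMeans(X)
  Sw = matrix(0, d, d)                            # within-class scatter
  Sb = matrix(0, d, d)                            # between-class scatter
  for (cl in classes) {
    Xc = X[y == cl, , drop = FALSE]
    mc = colMeans(Xc)                             # step 1: class mean vector
    centered = sweep(Xc, 2, mc)
    Sw = Sw + t(centered) %*% centered            # step 2: within-class scatter
    diff = mc - overall_mean
    Sb = Sb + nrow(Xc) * (diff %*% t(diff))       # step 2: between-class scatter
  }
  eig = eigen(solve(Sw) %*% Sb)                   # step 3: eigenvectors / eigenvalues
  W = Re(eig$vectors[, 1:k, drop = FALSE])        # step 4: top-k eigenvectors as (d x k) W
  X %*% W                                         # step 5: Y = X * W
}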
# ---------------------------------------------------- Importing Data ----------------------------------------- #
dataset = read.csv("Wine.csv")
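# Note: the indexing below assumes Wine.csv holds 13 numeric features with the class label,
# Customer_Segment, in column 14.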
# ---------------------------------- Splitting the dataset into Training and Test Set ------------------------- #
library(caTools)
set.seed(123)
split = sample.split(dataset$Customer_Segment, SplitRatio = 0.8)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)
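# sample.split() stratifies on the label, so the class proportions should be roughly
# preserved in both subsets; this optional check (an added line) makes that visible:
prop.table(table(training_set$Customer_Segment))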
# --------------------------------------------------- Feature Scaling ------------------------------------------ #
training_set[-14] = scale(training_set[-14])
test_set[-14] = scale(test_set[-14])
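# An alternative (common practice, though not what this script does) is to reuse the
# training-set statistics for the test set instead of re-estimating them, e.g. replacing
# the two lines above with:
# sc = scale(training_set[-14])
# training_set[-14] = sc
# test_set[-14] = scale(test_set[-14],
#                       center = attr(sc, "scaled:center"),
#                       scale = attr(sc, "scaled:scale"))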
# ---------------------------------------------------- Applying LDA ------------------------------------------- #
library(MASS)
lda_model = lda(formula = Customer_Segment ~ ., data = training_set)
# predict() on an lda model returns the predicted class, the posterior probabilities and the
# projected coordinates; keep only the two discriminants (x.LD1, x.LD2) plus the class column
training_set = as.data.frame(predict(lda_model, training_set))
training_set = training_set[c(5, 6, 1)]
test_set = as.data.frame(predict(lda_model, test_set))
test_set = test_set[c(5, 6, 1)]
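# Optional inspection (not in the original script): the proportion of between-class variance
# captured by each linear discriminant, computed from the model's singular values
lda_model$svd^2 / sum(lda_model$svd^2)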
# ------------------------------------------- Fitting SVM to the Training Set --------------------------------- #
library(e1071)  # provides svm()
classifier = svm(formula = class ~ .,
                 data = training_set,
                 type = 'C-classification',
                 kernel = 'linear')
summary(classifier)
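# The cost parameter could optionally be tuned with e1071's tune(), e.g.
# (illustrative values, not part of the original script):
# tune_res = tune(svm, class ~ ., data = training_set,
#                 type = 'C-classification', kernel = 'linear',
#                 ranges = list(cost = c(0.1, 1, 10)))
# summary(tune_res)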
# -------------------------------------------- Predicting the Test Set result ---------------------------------- #
# Predicting the test-set classes (a factor of class labels, not probabilities)
y_pred = predict(classifier, newdata = test_set[-3])
y_pred
# ------------------------------------------------- Confusion Matrix ------------------------------------------- #
cm = table(test_set[, 3], y_pred)
cm
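# Overall accuracy from the confusion matrix (an added convenience check):
sum(diag(cm)) / sum(cm)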
# --------------------------------------- Visualising the Training Set results --------------------------------- #
# ElemStatLearn has been archived on CRAN; download the package archive from the site and install it manually.
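# If installing from a downloaded archive file (the path below is hypothetical):
# install.packages("path/to/ElemStatLearn.tar.gz", repos = NULL, type = "source")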
library(ElemStatLearn)
set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('x.LD1', 'x.LD2')
y_grid = predict(classifier, newdata = grid_set)
# Plotting
plot(set[, -3],
main = 'SVM (Training set)',
xlab = 'LD1', ylab = 'LD2',
xlim = range(X1), ylim = range(X2))
# Decision boundary (contour of the classifier's predictions over the grid)
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
# Colour the background grid by predicted class and the points by their actual class
points(grid_set, pch = '.', col = ifelse(y_grid == 2, 'deepskyblue', ifelse(y_grid == 1, 'springgreen3', 'tomato')))
points(set, pch = 21, bg = ifelse(set[, 3] == 2, 'blue3', ifelse(set[, 3] == 1, 'green4', 'red3')))
# ------------------------------------------- Visualising the Test Set results --------------------------------- #
library(ElemStatLearn)
set = test_set
test1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
test2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(test1, test2)
colnames(grid_set) = c('x.LD1', 'x.LD2')
y_grid = predict(classifier, newdata = grid_set)
# Plotting
plot(set[, -3], main = 'SVM (Test set)',
xlab = 'LD1', ylab = 'LD2',
xlim = range(test1), ylim = range(test2))
# Decision boundary (contour of the classifier's predictions over the grid)
contour(test1, test2, matrix(as.numeric(y_grid), length(test1), length(test2)), add = TRUE)
# Colour the background grid by predicted class and the points by their actual class
points(grid_set, pch = '.', col = ifelse(y_grid == 2, 'deepskyblue', ifelse(y_grid == 1, 'springgreen3', 'tomato')))
points(set, pch = 21, bg = ifelse(set[, 3] == 2, 'blue3', ifelse(set[, 3] == 1, 'green4', 'red3')))