Logistic Regression vs RandomForest v4.R
#############################################
##
## Logistic Regression vs Random Forest
##
## ORatWork
##
## Date 20150301
##
## Used as part of the "Something to discuss over a good glass of wine:
## Accuracy vs Interpretability" blog post
##
##############################################
# empty the R workspace
rm(list=ls())
# load required libraries
require(caret)
require(arm)
require(corrplot)
require(randomForest)
require(gridExtra)
# get the data
# data can be found at https://archive.ics.uci.edu/ml/
red <- read.csv('http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', header = TRUE, sep = ';')
white <- read.csv('http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv', header = TRUE, sep = ';')
# prepare to merge the two data sets
red[, 'color'] <- 'red'
white[, 'color'] <- 'white'
df <- rbind(red, white)
df$color <- as.factor(df$color)
# classify wines with a quality score >= 6 as good, below 6 as bad
good_ones <- df$quality >= 6
bad_ones <- df$quality < 6
df[good_ones, 'quality'] <- 'good'
df[bad_ones, 'quality'] <- 'bad'
df$quality <- as.factor(df$quality)
dim(df)
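# optional sanity check (an added convenience, not part of the original
# script): class balance and the red/white split before modelling
table(df$quality, df$color)
prop.table(table(df$quality))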
# one-hot encode the color factor; dummyVars drops the quality outcome,
# so it is re-attached afterwards
copies <- dummyVars(quality ~ ., data = df)
df_copy<- data.frame(predict(copies, newdata = df))
df_copy[, 'quality'] <- df$quality
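# optional: confirm the dummy encoding produced color.red / color.white columns
str(df_copy[, c('color.red', 'color.white', 'quality')])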
# split data set in train and test set
# set the seed for reproducibility
set.seed(1234)
trainIndices <- createDataPartition(df_copy$quality, p = 0.7, list = FALSE)
train <- df_copy[trainIndices, ]
test <- df_copy[-trainIndices, ]
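# optional check: createDataPartition samples within each class, so the
# good/bad proportions should be nearly identical in train and test
prop.table(table(train$quality))
prop.table(table(test$quality))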
numericColumns <- !colnames(train) %in% c('quality', 'color.red', 'color.white')
# feature selection via correlation matrix
# only numeric features
correlationMatrix <- cor(train[, numericColumns])
highlyCorrelated <- findCorrelation(correlationMatrix, cutoff = 0.6)
colnames(correlationMatrix)[highlyCorrelated]
corrplot(correlationMatrix, order = "FPC")
# take out the columns that are highly correlated, keeping only numeric features
wanted <- !colnames(train) %in% colnames(correlationMatrix)[highlyCorrelated]
wanted <- wanted & numericColumns
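# optional: inspect which features survive the correlation filter
colnames(train)[wanted]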
# set fit control to 10 fold cross validation
fitControl <- trainControl(method = 'cv', number = 10)
# fit a logistic regression; preProcess = 'range' rescales each feature to [0, 1]
fit_glm <- train(x = train[, wanted], y = train$quality,
                 method = 'glm',
                 preProcess = 'range',
                 family = binomial,
                 trControl = fitControl)
predict_glm <- predict(fit_glm, newdata = test[,wanted])
confMat_glm <- confusionMatrix(predict_glm, test$quality, positive = 'good')
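# optional: inspect the held-out performance of the logistic regression
print(confMat_glm)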
importance_glm <- varImp(fit_glm, scale = TRUE)
plot(importance_glm, main = 'Feature importance for Logistic Regression')
# fit a random forest, tuning mtry from 2 to 6 with 1000 trees
# (randomForest's argument is ntree, not n.tree)
fit_rf <- train(x = train[, wanted], y = train$quality,
                method = 'rf',
                trControl = fitControl,
                tuneGrid = expand.grid(mtry = 2:6),
                ntree = 1000)
predict_rf <- predict(fit_rf, newdata = test[, wanted])
confMat_rf <- confusionMatrix(predict_rf, test$quality, positive = 'good')
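# optional: inspect the held-out performance of the random forest
print(confMat_rf)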
importance_rf <- varImp(fit_rf, scale = TRUE)
plot(importance_rf, main = 'Feature importance for Random Forest')
# compare the cross-validation resampling performance of the two models
models <- resamples(list(GLM = fit_glm,
                         RF = fit_rf))
dotplot(models)
results <- summary(models)
grid.table(results$statistics$Accuracy)
grid::grid.newpage() # start a fresh page so the second table does not overplot the first
grid.table(results$statistics$Kappa)
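# a minimal side-by-side test-set comparison (added sketch), pulled from
# the two confusion matrices computed above
data.frame(model = c('GLM', 'RF'),
           Accuracy = c(confMat_glm$overall['Accuracy'],
                        confMat_rf$overall['Accuracy']),
           Kappa = c(confMat_glm$overall['Kappa'],
                     confMat_rf$overall['Kappa']))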