-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCourse-3.R
304 lines (232 loc) · 8.51 KB
/
Course-3.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
# Install "manip" only when it is missing, so re-running the script does not
# reach out to the network and reinstall the package on every execution.
# NOTE(review): no call into "manip" is visible in this script -- confirm the
# package is actually required.
if (!requireNamespace("manip", quietly = TRUE)) {
  install.packages("manip")
}

# Import the data ----
# `survey` contains all the completed responses.
survey <- read.csv("CompleteResponses.csv")
# `incomplete` contains the responses whose purchased model was left incomplete.
incomplete <- read.csv("SurveyIncomplete.csv")
#####
# Inspect the completed survey ----
# Attributes (names, class, row names) of the data frame.
attributes(survey)
# 7 columns: salary, age, elevel, car, zipcode, credit, brand
# Min, max, mean, median and quartiles of every column.
summary(survey)
# Structure (type of every column).
str(survey)
# Column names.
names(survey)

# Convert the categorical columns to factors in both data frames ----
factor_cols <- c("elevel", "car", "brand", "zipcode")
for (col in factor_cols) {
  survey[[col]] <- as.factor(survey[[col]])
  incomplete[[col]] <- as.factor(incomplete[[col]])
}
# Confirm the conversion took effect on the completed survey.
is.factor(survey[["elevel"]])
is.factor(survey[["car"]])
is.factor(survey[["brand"]])
summary(incomplete)

# Check that the factor levels make sense and are in the expected range ----
levels(survey[["elevel"]])
levels(incomplete[["elevel"]])
levels(survey[["car"]])
levels(incomplete[["car"]])
levels(survey[["brand"]])
levels(incomplete[["brand"]])
levels(survey[["zipcode"]])

# Row positions of missing values, column by column ----
which(is.na(survey[["salary"]]))
which(is.na(survey[["age"]]))
which(is.na(survey[["elevel"]]))
which(is.na(survey[["car"]]))
which(is.na(survey[["zipcode"]]))
which(is.na(survey[["credit"]]))
which(is.na(survey[["brand"]]))

# Missing values across the whole data frame ----
# Full logical matrix first, then the NA row positions per column.
is.na(survey)
apply(is.na(survey), MARGIN = 2, FUN = which)
#####
# Distribution of the categorical attributes ----
# Education level, car model and zip code are spread very evenly;
# slightly more respondents prefer brand 1.
par(mfrow = c(2, 2))
plot(survey$elevel, main = "Education Level")
plot(survey$car, main = "Main Car Driven")
plot(survey$zipcode, main = 'Zip Code')
plot(survey$brand, main = "Brand Preferred")
# ggplot2 is not attached at this point of the script (the library() calls that
# would attach it come later), so the functions are reached via the namespace.
ggplot2::ggplot(data = survey, ggplot2::aes(x = brand)) +
  ggplot2::geom_bar()

# Value counts of each category ----
# The commented numbers are the counts recorded by the author.
table(survey$elevel)
#0 1 2 3 4
#2052 1948 1983 1947 1968
table(survey$car)
#1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
#492 509 488 479 505 477 495 511 487 500 473 498 473 494 542 470 508 524 489 484
table(survey$zipcode)
#0 1 2 3 4 5 6 7 8
#1085 1053 1112 1080 1087 1108 1155 1083 1135
table(survey$brand)
#0 1
#3744 6154

# Histograms and boxplots to look for outliers ----
# boxplot.stats()$out may hold zero, one or many values; they are collapsed
# into one comma-separated string so `sub` is always a single subtitle.
par(mfrow = c(1, 2))
hist(survey$salary)
boxplot(
  survey$salary,
  main = 'salary',
  sub = paste(
    "outlier",
    paste(boxplot.stats(survey$salary)$out, collapse = ", ")
  )
)
par(mfrow = c(1, 2))
hist(survey$age)
boxplot(
  survey$age,
  main = 'age',
  sub = paste(
    "outlier",
    paste(boxplot.stats(survey$age)$out, collapse = ", ")
  )
)
par(mfrow = c(1, 2))
hist(survey$credit)
boxplot(
  survey$credit,
  main = 'credit',
  sub = paste(
    "outlier",
    paste(boxplot.stats(survey$credit)$out, collapse = ", ")
  )
)

# Density plots to see how the numeric data is spread ----
# Each subtitle shows the skewness rounded to two decimals.
par(mfrow = c(1, 3))
plot(
  density(survey$salary),
  main = "density plot salary",
  ylab = "frequency",
  sub = paste("skewness", round(e1071::skewness(survey$salary), 2))
)
plot(
  density(survey$age),
  main = "density plot age",
  ylab = "frequency",
  sub = paste("skewness", round(e1071::skewness(survey$age), 2))
)
plot(
  density(survey$credit),
  main = "density plot credit",
  ylab = "frequency",
  sub = paste("skewness", round(e1071::skewness(survey$credit), 2))
)
#####
# Modelling setup ----
# Attach caret and fix the RNG seed before any random partitioning happens.
library(caret)
set.seed(123)

# 75% / 25% train/test split of the completed survey, partitioned on brand.
inTraining <- createDataPartition(y = survey$brand, p = 0.75, list = FALSE)
training <- survey[inTraining, ]
testing <- survey[-inTraining, ]
head(testing)

# Resampling scheme shared by the models below: 10-fold cross validation,
# run once (repeats = 1).
fitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 1
)
#####
# Random forest, tuneLength = 1 ----
# Author's recorded result: Accuracy is 92.066 Mtry=11
system.time({
  rfFit1 <- train(
    brand ~ .,
    data = training,
    method = "rf",
    trControl = fitControl,
    tuneLength = 1
  )
})
rfFit1
print(rfFit1)

# Feature importance (unscaled); author's note: car and zipcode do not matter.
importance1 <- varImp(rfFit1, scale = FALSE)
print(importance1)
plot(importance1)

# Score the held-out test set and summarise the fit.
brand.predict <- predict(rfFit1, newdata = testing)
postResample(pred = brand.predict, obs = testing$brand) # 0.925
C1 <- confusionMatrix(data = brand.predict, reference = testing$brand)
C1

# Actual vs. predicted brand, one row per test observation.
final <- data.frame(
  Actual_Brand = testing$brand,
  Predicted_Brand = brand.predict
)
final
#####
# Random forest, tuneLength = 2 ----
# Author's recorded result: Accuracy =91.51 mtry=34
rfFit2 <- train(
  brand ~ .,
  data = training,
  method = "rf",
  trControl = fitControl,
  tuneLength = 2
)
rfFit2

# Unscaled feature importance for this fit.
importance2 <- varImp(rfFit2, scale = FALSE)
print(importance2)
plot(importance2)

# Score the held-out test set and summarise the fit.
brand.predict2 <- predict(rfFit2, newdata = testing)
postResample(pred = brand.predict2, obs = testing$brand)
C2 <- confusionMatrix(data = brand.predict2, reference = testing$brand)
C2
#####
# Random forest with an explicit tuning grid, mtry in {1, 2, 3} ----
# Author's recorded result: Accuracy = 92.21464 with mtry=18
# NOTE(review): mtry=18 is not in the grid below -- the recorded figure
# presumably belongs to a different run; confirm.
rfGrid <- expand.grid(mtry = c(1, 2, 3))
rfitm1 <- train(
  brand ~ .,
  data = training,
  method = "rf",
  trControl = fitControl,
  tuneGrid = rfGrid
)
rfitm1
brand.predict3 <- predict(rfitm1, newdata = testing)
postResample(pred = brand.predict3, obs = testing$brand) #72.63
C3 <- confusionMatrix(data = brand.predict3, reference = testing$brand)
C3

# Random forest with caret's default tuning ----
# Author's recorded result: 92.28 Best One
rfFit3 <- train(
  brand ~ .,
  data = training,
  method = "rf",
  trControl = fitControl
)
rfFit3
brand.predict4 <- predict(rfFit3, newdata = testing)
postResample(pred = brand.predict4, obs = testing$brand)
C4 <- confusionMatrix(data = brand.predict4, reference = testing$brand)
C4
#####
# Feature selection: remove car and zipcode ----
survey2 <- within(survey, rm("car", "zipcode"))
head(survey2)

# Use the data after feature selection: fresh 75% / 25% split of survey2.
inTraining2 <- createDataPartition(survey2$brand, p = .75, list = FALSE)
training2 <- survey2[inTraining2, ]
testing2 <- survey2[-inTraining2, ]
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 1)

# Random forest on the reduced data, tuneLength = 1 ----
# Author's recorded result: Accuracy is 96.1 mtry=2
system.time(
  rfFit1.1 <- train(
    brand ~ .,
    data = training2,
    method = 'rf',
    trControl = fitControl,
    tuneLength = 1
  )
)
rfFit1.1
# Evaluate on testing2, the hold-out of THIS split. `testing` comes from a
# different random partition, so most of its rows are also in training2 and
# scoring on it would leak training data into the accuracy estimate.
# NOTE(review): the recorded accuracies in this section may have been obtained
# with the overlapping `testing` set -- re-check them.
brand.predict5 <- predict(rfFit1.1, testing2)
postResample(brand.predict5, testing2$brand)
C5 <- confusionMatrix(brand.predict5, testing2$brand)
C5

# Random forest on the reduced data, tuneLength = 11 ----
# Author's recorded result: Accuracy is 92.13359 with mtry=3
system.time(
  rfFit1.12 <- train(
    brand ~ .,
    data = training2,
    method = 'rf',
    trControl = fitControl,
    tuneLength = 11
  )
)
#98
rfFit1.12
# Same hold-out as above: testing2 shares no rows with training2.
brand.predict6 <- predict(rfFit1.12, testing2)
postResample(brand.predict6, testing2$brand)
C6 <- confusionMatrix(brand.predict6, testing2$brand)
C6
#####
# Use the best model to predict brands for the incomplete data ----
# Score once and reuse the result, so the printed counts describe exactly the
# predictions that are stored on `incomplete` below.
incomplete.predict <- predict(rfFit1.12, incomplete)
table(incomplete.predict)
incomplete$brand.predictedRF <- incomplete.predict
head(incomplete)
str(incomplete)
#write.csv(incomplete, "C:\\Users\\Sherri's Laptop\\Documents\\Course3\\Task2\\incomplete_predicted2.csv",row.names=FALSE)
# C5.0 classification model ----
library(modeldata)
str(survey)
# Predictors used by the tree.
vars <- c("salary", "age")

# 70% / 30% random split of the feature-selected data.
# in_train / in_test are row COUNTS, not row indices: indexing survey2 with
# in_train directly would pick a single row, so the split is built from a
# random sample of row positions instead.
in_train <- round(nrow(survey2) * 0.7)
in_test <- nrow(survey2) - in_train
training_indices <- sample(seq_len(nrow(survey2)), size = in_train)
trainSet <- survey2[training_indices, ]
testSet <- survey2[-training_indices, ]

# Install the modelling packages only when they are missing, so the script
# does not reinstall them on every run.
if (!requireNamespace("C50", quietly = TRUE)) {
  install.packages("C50")
}
library("C50")
if (!requireNamespace("inum", quietly = TRUE)) {
  install.packages("inum")
}

# Fit and inspect the tree.
str(trainSet[, vars])
tree_mod <- C5.0(x = trainSet[, vars], y = trainSet$brand)
tree_mod
summary(tree_mod)
plot(tree_mod)

# Predictions on the held-out rows: computed once, then printed, tabulated and
# later compared against the true brands.
brand.predict7 <- predict(tree_mod, testSet[, vars])
brand.predict7
table(brand.predict7)

# Predictions on the incomplete data, next to the first random forest's
# predictions for comparison.
incomplete.predict2 <- predict(tree_mod, incomplete[, vars])
table(incomplete.predict2)
table(predict(rfFit1, incomplete))

# Confusion matrix of the tree on its own test set.
C7 <- confusionMatrix(brand.predict7, testSet$brand)
C7

# Store the C5.0 predictions and export the enriched data frame.
incomplete$brand.predictedC50 <- incomplete.predict2
head(incomplete)
str(incomplete)
# NOTE(review): absolute, machine-specific output path -- the directory must
# exist on the machine running the script.
write.csv(incomplete, "C:\\Users\\Sherri's Laptop\\Documents\\Course3\\Task2\\incomplete_predicted.csv", row.names = FALSE)
head(incomplete)
# Visualise the random-forest predictions on the incomplete data ----
library(ggplot2)
library(cowplot)

# Age vs. salary, coloured by the predicted brand.
ggplot(incomplete, aes(x = age, y = salary, color = brand.predictedRF)) +
  geom_point()
#ggplot(survey, aes(x=age, y=salary, color=brand)) +geom_point()
ggplot(incomplete, aes(x = age)) + geom_bar()

# Scatterplots of the respondents predicted to buy brand 0 and brand 1 ----
# dplyr is never attached in this script, so a bare filter() would resolve to
# stats::filter() and fail; base row subsetting is used instead. which() drops
# any NA comparison results rather than producing all-NA rows.
filtered_data <- incomplete[which(incomplete$brand.predictedRF == 0), , drop = FALSE]
filtered_data2 <- incomplete[which(incomplete$brand.predictedRF == 1), , drop = FALSE]
Acer <- ggplot(filtered_data, aes(x = age, y = salary)) + geom_point()
Sony <- ggplot(filtered_data2, aes(x = age, y = salary)) + geom_point()
plot_grid(Acer, Sony, labels = c("Acer", "Sony"))

# Age and salary distributions per predicted brand.
Agebox <- ggplot(incomplete, aes(x = brand.predictedRF, y = age)) +
  geom_boxplot() +
  theme_bw()
Salarybox <- ggplot(incomplete, aes(x = brand.predictedRF, y = salary)) +
  geom_boxplot() +
  theme_bw()
plot_grid(Agebox, Salarybox, labels = c("Age", "Salary"))
ggplot(incomplete, aes(x = salary, fill = brand.predictedRF)) +
  geom_density(alpha = 0.7)

# Final look at both data sets.
summary(incomplete)
summary(survey)
str(survey)
head(incomplete, n = 20)