-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRnotebook.Rmd
572 lines (435 loc) · 24.2 KB
/
Rnotebook.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
---
title: "R Notebook"
output: html_notebook
---
This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Ctrl+Shift+Enter*.
```{r}
##using all data, without removing row level duplicates
##Remove all environment variables, to relinguis the RAM space and also to avoid overlaps...
rm(list=ls(all=TRUE))
#Uncomment the installer part if the packages are not available
#pkgs <- c("readr",
# "spam","LICORS",
# "cluster","vegan","sqldf","C50","ROCR","rpart","e1071","ggplot2","ggthemes","scales","h2o")
#install.packages(pkgs)
library(readr)
# library(LICORS)
# library(spam)
library(vegan)
library(cluster)
#Try usig this later all in the end
library(sqldf)
library(RSQLite)
library(tcltk)
library(ROCR)
library(rpart) # classification algorithm
#library(C50) # classification algorithm
library(e1071)# classification algorithm
library(ggplot2) # visualization
library(ggthemes) # visualization
library(scales) # visualization
library(randomForest) # classification algorithm
library(h2o) # Neural Network
setwd("E:/Workspace/Project")
getwd()
#Loading data into a DataFrame using read_csv, as to apply format to the date variable while loading itself.
#as I have abserved that the date is having diffrent format in the data set provided.
PriorAuth_Data <- read_csv("PriorAuth_Data.csv",
col_types = cols(TransDate = col_date(format = "%m/%d/%Y")))
sum(is.na(PriorAuth_Data)) # no NA 's found
PriorAuth_Data <- PriorAuth_Data[setdiff(names(PriorAuth_Data),'UserID')]
head(PriorAuth_Data)
str(PriorAuth_Data)
# By Definition GPI Generic Product Identifier :
# Combination of Drug group + Drug class + Drug sub-class + Drug name + Drug name extension
PriorAuth_Data$comb1 <- paste(PriorAuth_Data$Drug
,PriorAuth_Data$DrugClass
,PriorAuth_Data$DrugSubClass
,PriorAuth_Data$Drug_Chemical_Name
,PriorAuth_Data$DrugGroup,sep = '')
# Seeing into the data, another Combination of
#Drug group + Drug class + Drug sub-class + Drug name + Drug name extension + GPI gives the same NDC Code.
#So forming another combination attribute...!
PriorAuth_Data$comb2 <- paste(PriorAuth_Data$Drug
,PriorAuth_Data$DrugClass
,PriorAuth_Data$DrugSubClass
,PriorAuth_Data$Drug_Chemical_Name
,PriorAuth_Data$DrugGroup
,PriorAuth_Data$GPI,sep = '')
PriorAuth_Data$comb3 <- paste(PriorAuth_Data$RxGroupId
,PriorAuth_Data$Bin
,PriorAuth_Data$PCN
,sep = '')
##Create frequency table for each individual attribute against the Target, and standardising the freque
##to find the clusters, to reduce the huge levels in the data.
########################################################################################
# Whys SQLDF, why not table() for getting frequency ?
# Table is the easy way to get frequency, but then while converting it to dartaframe,
# the data is transformed undesired manner.
#eg: samle table(PriorAuth_Data$Drug,PriorAuth_Data$Target) out put
# FALSE TRUE
# Drug001 6 10
# Drug002 2 1
# Drug003 8 28
# Drug004 2 1
# Drug005 3 10
# Drug006 2 4
# Drug007 32 100
# after converting it to dataframe it looks below
# Data.frame(table(PriorAuth_Data$Drug,PriorAuth_Data$Target))
# Var1 Var2 Freq
# Drug001 FALSE 6
# Drug002 FALSE 2
# Drug003 FALSE 8
# Drug004 FALSE 2
# Drug005 FALSE 3
# Drug006 FALSE 2
# Drug007 FALSE 32
########################################################################################
attributes <- setdiff(names(PriorAuth_Data),c('Target','TransDate'))
rm(list= ls()[!(ls() %in% c('PriorAuth_Data','attributes'))])
# Taking the count of True and total count for each attribute and its levels and further taking
# "True Responce Rate" to handle the high leveled categorical data.
select <- c('Select ')
case <- ', sum(case when Target = 1 then 1 else 0 end) as True, sum(case when Target = 1 then 1 else 0 end)+(sum(case when Target = 0 then 1 else 0 end)) as Total from PriorAuth_Data group by '
for (i in 1:length(attributes)){
##Creating Frequency tables for all Individual attributes against the Class variable dynamically
assign(attributes[i],sqldf(paste(select,attributes[i],case,attributes[i],sep = '')))
}
## Calculating True responce rate for each attribute.
Bin <-cbind(Bin[1],Bin[2]/Bin[3])
Drug <-cbind(Drug[1],Drug[2]/Drug[3])
DrugSubClass <-cbind(DrugSubClass[1],DrugSubClass[2]/DrugSubClass[3])
DrugClass <-cbind(DrugClass[1],DrugClass[2]/DrugClass[3])
Drug_Chemical_Name <-cbind(Drug_Chemical_Name[1],Drug_Chemical_Name[2]/Drug_Chemical_Name[3])
GPI <-cbind(GPI[1],GPI[2]/GPI[3])
NDC <-cbind(NDC[1],NDC[2]/NDC[3])
DrugGroup <-cbind(DrugGroup[1],DrugGroup[2]/DrugGroup[3])
DoctorID <-cbind(DoctorID[1],DoctorID[2]/DoctorID[3])
RxGroupId <-cbind(RxGroupId[1],RxGroupId[2]/RxGroupId[3])
PCN <-cbind(PCN[1],PCN[2]/PCN[3])
State <-cbind(State[1],State[2]/State[3])
comb1 <-cbind(comb1[1],comb1[2]/comb1[3])
comb2 <-cbind(comb2[1],comb2[2]/comb2[3])
comb3 <-cbind(comb3[1],comb3[2]/comb3[3])
rm(select,case,i)
PriorAuth_num_Data <- sqldf('select d.True DrugId,sc.True DrugSubClassId , dc.True DrugClassId ,c.True Drug_ChemicalId, g.True GpiId , n.True NdcId , dg.True DrugGroupId,
Dr.True DoctorID , gi.True RxGroupId , b.True BinId, p.True PcnId, s.True StateId, c1.True com1Id, c2.True com2Id ,c3.True com3Id , PA.Target,PA.TransDate
from PriorAuth_Data PA
inner join Drug d on PA.Drug = d.Drug
inner join DrugSubClass sc on PA.DrugSubClass = sc.DrugSubClass
inner join DrugClass dc on PA.DrugClass = dc.DrugClass
inner join Drug_Chemical_Name c on PA.Drug_Chemical_Name = c.Drug_Chemical_Name
inner join GPI g on PA.GPI = g.GPI
inner join NDC n on PA.NDC = n.NDC
inner join DrugGroup dg on PA.DrugGroup = dg.DrugGroup
inner join DoctorID Dr on PA.DoctorID = Dr.DoctorID
inner join RxGroupId gi on PA.RxGroupId = gi.RxGroupId
inner join Bin b on PA.Bin = b.Bin
inner join PCN p on PA.PCN = p.PCN
inner join State s on PA.State = s.State
inner join comb1 c1 on PA.comb1 = c1.comb1
inner join comb2 c2 on PA.comb2 = c2.comb2
inner join comb3 c3 on PA.comb3 = c3.comb3
')
#Removing unwanted/unused variables and relinquishing memory.
rm(list= ls()[!(ls() %in% c('PriorAuth_num_Data'))])
# as all other attributes are now integer for except Trandate, converting Target to Integer
PriorAuth_num_Data$Target <- ifelse(PriorAuth_num_Data$Target == TRUE, 1, 0)
str(PriorAuth_num_Data)
#for some reasons the below two attributes are chr, so converting it to num
PriorAuth_num_Data$DoctorID<- as.numeric(PriorAuth_num_Data$DoctorID)
PriorAuth_num_Data$RxGroupId <- as.numeric(PriorAuth_num_Data$RxGroupId)
str(PriorAuth_num_Data)
cor(PriorAuth_num_Data[setdiff(names(PriorAuth_num_Data),c('TransDate'))])
#########################################################################################
#Data is Ready for Model building
#########################################################################################
#Split Train and test using Time-based sampling approach as suggested
# Total 18 days data
# data grouped by date info
data.frame(table(PriorAuth_num_Data$TransDate))
traindata <- subset(PriorAuth_num_Data, TransDate<=as.Date("2013-08-04"))
nrow(traindata) # 7255 rows for training i.e 77%
testdata <- subset(PriorAuth_num_Data, TransDate>as.Date("2013-08-04"))
nrow(testdata)#2138 rows for testing teh model i.e 23%
str(traindata)
rm(list= ls()[!(ls() %in% c('traindata','testdata'))])
#########################################################################################
### Building Modelz
#########################################################################################
### Logistic Regression
#########################################################################################
# LogReg <- glm(Target ~ ., data=traindata, family=binomial)
# summary(LogReg)
#
step(glm(Target ~ .,data=traindata),direction = 'backward')
# step gave me NdcId + DoctorID + com3Id +Transdate, But removing Transdate gave a slight improvement.
LogReg <- glm(formula = Target ~ NdcId + DoctorID + com3Id ,
data = traindata)
# train results
prob<-predict(LogReg, type="response")
pred_class <- ifelse(prob> 0.5, 1, 0)
table(traindata$Target,pred_class)
# Error Metric
conf.mat = table(traindata$Target,pred_class)
cat("Accuracy : ",sum(diag(conf.mat))/sum(conf.mat))
cat("Recall : ",conf.mat[2,2]/sum(conf.mat[2,]))
cat("precision : ", conf.mat[2,2]/sum(conf.mat[,2]))
cat("F1 Score : ", 2*(conf.mat[2,2]/sum(conf.mat[,2])*conf.mat[2,2]/sum(conf.mat[2,]))/((conf.mat[2,2]/sum(conf.mat[,2])+conf.mat[2,2]/sum(conf.mat[2,]))))
# Test results
fitted.results <- predict(LogReg,testdata,type='response')
fitted.class <- ifelse(fitted.results > 0.5,1,0)
table(testdata$Target,fitted.class)
# Error Metric
conf.mat = table(testdata$Target,fitted.class)
cat("Accuracy : ",sum(diag(conf.mat))/sum(conf.mat))
cat("Recall : ",conf.mat[2,2]/sum(conf.mat[2,]))
cat("precision : ", conf.mat[2,2]/sum(conf.mat[,2]))
cat("F1 Score : ", 2*(conf.mat[2,2]/sum(conf.mat[,2])*conf.mat[2,2]/sum(conf.mat[2,]))/((conf.mat[2,2]/sum(conf.mat[,2])+conf.mat[2,2]/sum(conf.mat[2,]))))
#Ploting the ROC curve and calculate the AUC
#(area under the curve) which are typical performance measurements
#for a binary classifier.
#The ROC (Receiver Operating Characteristic curve) is a curve generated by plotting the true positive rate (TPR = sensitivity) against
# the false positive rate (FPR= specificity) at various threshold settings while the AUC is
# the area under the ROC curve. As a rule of thumb, a model with good
#predictive ability should have an AUC closer to 1 (1 is ideal) than to 0.5.
p <- predict(LogReg,testdata, type="response")
pr <- prediction(p, testdata$Target)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf,colorize = TRUE, print.cutoffs.at=seq(0,1,by=0.1), text.adj=c(-0.2,1.7))
abline(a=0, b= 1)
auc <- performance(pr, measure = "auc")
auc <- [email protected][[1]]
auc
#########################################################################################
### Decision Tree
#########################################################################################
rm(list= ls()[!(ls() %in% c('traindata','testdata'))])
PA_rpart <- rpart(Target~ ., data=traindata, method="class")
#By Variable Importance got below as top
PA_rpart <- rpart(Target~DoctorID + NdcId + com3Id, data=traindata, method="class")
plot(PA_rpart,main="Classification Tree for loan Class",margin=0.15,uniform=TRUE)
text(PA_rpart,use.n=T)
summary(PA_rpart)
conf.mat = table(traindata$Target, predict(PA_rpart, newdata=traindata, type="class"))
cat("Accuracy : ",sum(diag(conf.mat))/sum(conf.mat))
cat("Recall : ",conf.mat[2,2]/sum(conf.mat[2,]))
cat("precision : ", conf.mat[2,2]/sum(conf.mat[,2]))
cat("F1 Score : ", 2*(conf.mat[2,2]/sum(conf.mat[,2])*conf.mat[2,2]/sum(conf.mat[2,]))/((conf.mat[2,2]/sum(conf.mat[,2])+conf.mat[2,2]/sum(conf.mat[2,]))))
conf.mat = table(testdata$Target, predict(PA_rpart, newdata=testdata, type="class"))
cat("Accuracy : ",sum(diag(conf.mat))/sum(conf.mat))
cat("Recall : ",conf.mat[2,2]/sum(conf.mat[2,]))
cat("precision : ", conf.mat[2,2]/sum(conf.mat[,2]))
cat("F1 Score : ", 2*(conf.mat[2,2]/sum(conf.mat[,2])*conf.mat[2,2]/sum(conf.mat[2,]))/((conf.mat[2,2]/sum(conf.mat[,2])+conf.mat[2,2]/sum(conf.mat[2,]))))
# Compared the results of glm and rpart, it is almost the same
#write.csv(cbind(testdata$Target, testresult),'desition_testresult.csv')
#write.csv(cbind(testdata$Target,fitted.class),'glm_testresult.csv')
#########################################################################################
### SVM
#########################################################################################
rm(list= ls()[!(ls() %in% c('traindata','testdata'))])
# Build best SVM model
#PA_SVM <- svm(Target~ NdcId + DoctorID + com3Id, data=traindata, kernel = "linear")
#SVM with linear kernal , the prediction has given the most of 97% Recall which was the highest amongst all other Algorithms
# When the kernal is changed from linear to polinomial the prediction was taken to the top and has achieved 100% Recall
PA_SVM <- svm(Target~ NdcId + DoctorID + com3Id, data=traindata, kernel = "polinomial")
# Look at the model summary
summary(PA_SVM)
plot(PA_SVM$index)
# Predict on train data
pred_Train = predict(PA_SVM, traindata)
plot(pred_Train) # Plot shows more than 0.5
conf.mat = table(traindata$Target, ifelse(pred_Train> 0.5, 1, 0))
cat("Accuracy : ",sum(diag(conf.mat))/sum(conf.mat))
cat("Recall : ",conf.mat[2,2]/sum(conf.mat[2,]))
cat("precision : ", conf.mat[2,2]/sum(conf.mat[,2]))
cat("F1 Score : ", 2*(conf.mat[2,2]/sum(conf.mat[,2])*conf.mat[2,2]/sum(conf.mat[2,]))/((conf.mat[2,2]/sum(conf.mat[,2])+conf.mat[2,2]/sum(conf.mat[2,]))))
# Predict on test data
pred_Test = predict(PA_SVM, testdata[setdiff(names(testdata),c('Target'))])
conf.mat = table(testdata$Target, ifelse(pred_Test> 0.5, 1, 0))
cat("Accuracy : ",sum(diag(conf.mat))/sum(conf.mat))
cat("Recall : ",conf.mat[2,2]/sum(conf.mat[2,]))
cat("precision : ", conf.mat[2,2]/sum(conf.mat[,2]))
cat("F1 Score : ", 2*(conf.mat[2,2]/sum(conf.mat[,2])*conf.mat[2,2]/sum(conf.mat[2,]))/((conf.mat[2,2]/sum(conf.mat[,2])+conf.mat[2,2]/sum(conf.mat[2,]))))
#write.csv(cbind(testdata$Target, ifelse(pred_Test> 0.5, 1, 0)),'SVM_results.csv')
######################################################################################################
# Random Forest
######################################################################################################
rm(list= ls()[!(ls() %in% c('traindata','testdata'))])
pa_rf <- randomForest(factor(Target) ~ ., data = traindata)
str(traindata)
pa_rf$importance
round(importance(pa_rf), 2)
# Extract and store important variables obtained from the random forest model
rf_Imp_Attr = data.frame(pa_rf$importance)
rf_Imp_Attr = data.frame(row.names(rf_Imp_Attr),rf_Imp_Attr[,1])
colnames(rf_Imp_Attr) = c('Attributes', 'Importance')
rf_Imp_Attr = rf_Imp_Attr[order(rf_Imp_Attr$Importance, decreasing = TRUE),]
# plot (directly prints the important attributes)
varImpPlot(pa_rf)
# Importance is teh same as my other models, just that the Trans date is having higher importance, still
# I am not including it in my model, because it is giving very poor performance in test as espected...
pa_rf <- randomForest(factor(Target) ~ NdcId + DoctorID + com3Id, data = traindata)
# Predict on Train data
pred_Train = predict(pa_rf, traindata[,setdiff(names(traindata),"Target")],
type="response", norm.votes=TRUE)
conf.mat = table(traindata$Target, pred_Train)
cat("Accuracy : ",sum(diag(conf.mat))/sum(conf.mat))
cat("Recall : ",conf.mat[2,2]/sum(conf.mat[2,]))
cat("precision : ", conf.mat[2,2]/sum(conf.mat[,2]))
cat("F1 Score : ", 2*(conf.mat[2,2]/sum(conf.mat[,2])*conf.mat[2,2]/sum(conf.mat[2,]))/((conf.mat[2,2]/sum(conf.mat[,2])+conf.mat[2,2]/sum(conf.mat[2,]))))
# Predict on test data
pred_Test = predict(pa_rf, testdata[,setdiff(names(testdata),"Target")],
type="response", norm.votes=TRUE)
conf.mat = table(testdata$Target, pred_Test)
cat("Accuracy : ",sum(diag(conf.mat))/sum(conf.mat))
cat("Recall : ",conf.mat[2,2]/sum(conf.mat[2,]))
cat("precision : ", conf.mat[2,2]/sum(conf.mat[,2]))
cat("F1 Score : ", 2*(conf.mat[2,2]/sum(conf.mat[,2])*conf.mat[2,2]/sum(conf.mat[2,]))/((conf.mat[2,2]/sum(conf.mat[,2])+conf.mat[2,2]/sum(conf.mat[2,]))))
######################################################################################################
### H2o Autoencoder
######################################################################################################
rm(list= ls()[!(ls() %in% c('traindata','testdata'))])
h2o.init(ip='localhost', port = 54321, max_mem_size = '2g')
train.hex <- as.h2o(x = traindata, destination_frame = "train.hex")
test.hex <- as.h2o(x = testdata, destination_frame = "test.hex")
y = "Target"
x = setdiff(colnames(train.hex), y)
aec <- h2o.deeplearning(x = x, autoencoder = T,
training_frame=train.hex,
activation = "Tanh",
hidden = c(20),
epochs = 100)
# Extract features from train data
features_train <- as.data.frame(h2o.deepfeatures(data = train.hex[,x], object = aec))
# Extract features from test data
features_test <- as.data.frame(h2o.deepfeatures(data = test.hex[,x], object = aec))
train_new <-data.frame(traindata,features_train)
test_new <-data.frame(testdata,features_test)
rf_DL <- randomForest(Target ~ ., data=train_new, keep.forest=TRUE, ntree=50)
# importance of attributes
round(importance(rf_DL), 2)
importanceValues = data.frame(attribute=rownames(round(importance(rf_DL), 2)),MeanDecreaseGini = round(importance(rf_DL), 2))
row.names(importanceValues)=NULL
importanceValues = importanceValues[order(-importanceValues$IncNodePurity),]
# Top 10 Important attributes
Top10ImpAttrs = as.character(importanceValues$attribute[1:16])
Top10ImpAttrs
train_Imp = subset(train_new,select = c(Top10ImpAttrs,"Target"))
test_Imp = subset(test_new,select = c(Top10ImpAttrs,"Target"))
rm(train_new,test_new)
# Build the classification model
model_Imp = glm(Target ~ ., data = train_Imp,family = 'binomial')
# train results
prob<-predict(model_Imp, type="response")
pred_class <- ifelse(prob> 0.5, 1, 0)
table(train_Imp$Target,pred_class)
# Error Metric
conf.mat = table(train_Imp$Target,pred_class)
cat("Accuracy : ",sum(diag(conf.mat))/sum(conf.mat))
cat("Recall : ",conf.mat[2,2]/sum(conf.mat[2,]))
cat("precision : ", conf.mat[2,2]/sum(conf.mat[,2]))
cat("F1 Score : ", 2*(conf.mat[2,2]/sum(conf.mat[,2])*conf.mat[2,2]/sum(conf.mat[2,]))/((conf.mat[2,2]/sum(conf.mat[,2])+conf.mat[2,2]/sum(conf.mat[2,]))))
# Test results
fitted.results <- predict(model_Imp,test_Imp,type='response')
fitted.class <- ifelse(fitted.results > 0.5,1,0)
table(test_Imp$Target,fitted.class)
# Error Metric
conf.mat = table(test_Imp$Target,fitted.class)
cat("Accuracy : ",sum(diag(conf.mat))/sum(conf.mat))
cat("Recall : ",conf.mat[2,2]/sum(conf.mat[2,]))
cat("precision : ", conf.mat[2,2]/sum(conf.mat[,2]))
cat("F1 Score : ", 2*(conf.mat[2,2]/sum(conf.mat[,2])*conf.mat[2,2]/sum(conf.mat[2,]))/((conf.mat[2,2]/sum(conf.mat[,2])+conf.mat[2,2]/sum(conf.mat[2,]))))
######################################################################################
## Deep learning with two hidden nodes
######################################################################################
rm(list= ls()[!(ls() %in% c('traindata','testdata'))])
h2o.init(ip='localhost', port = 54321, max_mem_size = '2g')
y = "Target"
x = c('NdcId' , 'DoctorID' , 'com3Id')
train.hex <- as.h2o(x = traindata, destination_frame = "train.hex")
test.hex <- as.h2o(x = testdata, destination_frame = "test.hex")
dlModel <- h2o.deeplearning(x = x, y = y,
training_frame=train.hex,
activation = "TanhWithDropout",
hidden = c(150,150),
input_dropout_ratio = 0.001,
l1 = 1e-10,
epochs = 100)
# View specified parameters of the deep learning model
dlModel@parameters
# Examine the performance of the trained model
dlModel # display all performance metrics
# Metrics
h2o.performance(dlModel)
# Get MSE only
h2o.mse(dlModel)
pred = h2o.predict(dlModel, train.hex[x])
preddf <- as.data.frame(pred)
conf.mat <- table(traindata$Target,ifelse(preddf$predict > 0.5,1,0))
conf.mat
cat("Accuracy : ",sum(diag(conf.mat))/sum(conf.mat))
cat("Recall : ",conf.mat[2,2]/sum(conf.mat[2,]))
cat("precision : ", conf.mat[2,2]/sum(conf.mat[,2]))
cat("F1 Score : ", 2*(conf.mat[2,2]/sum(conf.mat[,2])*conf.mat[2,2]/sum(conf.mat[2,]))/((conf.mat[2,2]/sum(conf.mat[,2])+conf.mat[2,2]/sum(conf.mat[2,]))))
#----Prediction on test data
pred1 = h2o.predict(dlModel, test.hex[x])
pred_test_df <- as.data.frame(pred1)
conf.mat <- table(testdata$Target,ifelse(pred_test_df$predict > 0.5,1,0))
conf.mat
cat("Accuracy : ",sum(diag(conf.mat))/sum(conf.mat))
cat("Recall : ",conf.mat[2,2]/sum(conf.mat[2,]))
cat("precision : ", conf.mat[2,2]/sum(conf.mat[,2]))
cat("F1 Score : ", 2*(conf.mat[2,2]/sum(conf.mat[,2])*conf.mat[2,2]/sum(conf.mat[2,]))/((conf.mat[2,2]/sum(conf.mat[,2])+conf.mat[2,2]/sum(conf.mat[2,]))))
write.csv(x = ifelse(pred_test_df$predict > 0.5,1,0),file = "DL.csv")
#######################################################################
#Stagging
#######################################################################
#Glm
LogReg <- glm(formula = Target ~ NdcId + DoctorID + com3Id ,
data = traindata)
#SVM
PA_SVM <- svm(Target~ NdcId + DoctorID + com3Id, data=traindata, kernel = "linear")
#Decision Tree
PA_rpart <- rpart(Target~ DoctorID + NdcId + com3Id, data=traindata, method="class")
# Prediction on Train
# Train results
glm_train<-predict(LogReg, type="response")
glm_Train <- ifelse(glm_train> 0.5, 1, 0)
table(traindata$Target,glm_Train)
rpart_train <- predict(PA_rpart, newdata=traindata, type="class")
as.vector(rpart_train)
SVM_Train = predict(PA_SVM, traindata)
SVM_Train <- ifelse(SVM_Train> 0.5, 1, 0)
#combine
train_Pred_All_Models = data.frame(GLM = glm_Train,
rpart = rpart_train,
SVM = SVM_Train)
train_Pred_All_Models = data.frame(sapply(train_Pred_All_Models, as.factor))
str(train_Pred_All_Models)
summary(train_Pred_All_Models)
rm(rpart_Train, glm_Train, SVM_Train)
table(train_Pred_All_Models$GLM) #GLM
table(train_Pred_All_Models$rpart) #rpart
table(train_Pred_All_Models$SVM) #SVm
table(traindata$Target) #Original Dataset DV
train_Pred_All_Models = cbind(train_Pred_All_Models, Target = traindata$Target)
# Ensemble Model with GLM as Meta Model
ensemble_Model = glm(Target ~ ., train_Pred_All_Models, family = binomial)
summary(ensemble_Model)
# Check the "ensemble_Model model" on the train data
ensemble_Train = predict(ensemble_Model, train_Pred_All_Models,
type = "response")
ensemble_Train = ifelse(ensemble_Train > 0.5, 1, 0)
table(ensemble_Train)
cm_Ensemble = table(ensemble_Train, train_Pred_All_Models$Target)
sum(diag(cm_Ensemble))/sum(cm_Ensemble)
cat("Accuracy : ",sum(diag(cm_Ensemble))/sum(cm_Ensemble))
cat("Recall : ",cm_Ensemble[2,2]/sum(cm_Ensemble[2,]))
cat("precision : ", cm_Ensemble[2,2]/sum(cm_Ensemble[,2]))
cat("F1 Score : ", 2*(cm_Ensemble[2,2]/sum(cm_Ensemble[,2])*cm_Ensemble[2,2]/sum(cm_Ensemble[2,]))/((cm_Ensemble[2,2]/sum(cm_Ensemble[,2])+cm_Ensemble[2,2]/sum(cm_Ensemble[2,]))))
```
Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Ctrl+Alt+I*.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Ctrl+Shift+K* to preview the HTML file).