#The dependent variable here is Revenue.Grid; the goal is to separate
#profitable from non-profitable customers for the bank. The bank has manually
#labelled high net worth customers as 1 and low net worth customers as 2.
#Loading libraries
library("dplyr")
R <- read.csv("C:\\Users\\Admin\\Desktop\\Existing Base.csv", stringsAsFactors = F)
#The line above reads the data into R
glimpse(R) #Looking at data
#Let us prepare the data as per our needs
#We need to run logistic regression here, therefore the dependent variable
#Revenue.Grid should be coded as 0 and 1
R$Revenue.Grid <- ifelse(R$Revenue.Grid==1, 1, 0)
table(R$Revenue.Grid)
#Let us see how many categories there are in children
table(R$children)
R <- R %>% mutate(children = as.numeric(substr(gsub("Zero", "0", children), 1, 1)))
glimpse(R) #Looking at how the children column has changed
table(R$age_band) #Checking how many categories there are in age_band
#We will try to reduce the number of categories by looking at the
#percentage distribution of Revenue.Grid within each category
round(prop.table(table(R$age_band,R$Revenue.Grid),1),2)
#Let us make the categories now
# 1 : 61-65 0.09
# 2 : 45-50, 51-55, 65-70, 71+ 0.10
# 3 : 22-25, 26-30, 31-35, 41-45, 55-60 0.11
# 4 : 36-40 0.13
# 5 : 18-21 0.17
R<- R %>%
mutate(Age_band_1= as.numeric(age_band %in% c("61-65")),
Age_band_2= as.numeric(age_band %in% c("45-50","51-55","65-70","71+")),
Age_band_3=as.numeric(age_band %in% c("22-25","26-30","31-35",
"41-45","55-60")),
Age_band_4=as.numeric(age_band %in% c("36-40")),
Age_band_5=as.numeric(age_band %in% c("18-21"))) %>%
select(-age_band)
#Next we will be working on status
table(R$status)
unique(R$status) #unique() gives us the category names clearly
round(prop.table(table(R$status,R$Revenue.Grid),1),2)
#Assigning categories
#1 : Divorced/Separated, Partner, Single/Never Married 0.11
R <-R %>% mutate(Status_1 = as.numeric(status %in%
c("Divorced/Separated", "Partner",
"Single/Never Married"))) %>%
select(-status)
glimpse(R) #Checking how the data looks now
#Next we will work on occupation
table(R$occupation)
#Percentage proportion
round(prop.table(table(R$occupation,R$Revenue.Grid),1),2)
#Bucketing of categories
#1 Retired : 0.10
#2 Secretarial/Admin, Student, Unknown, Manual Worker, Other : 0.11
#3 Housewife : 0.09
R <- R %>% mutate(occupation_1 = as.numeric(occupation %in% c("Retired")),
occupation_2 = as.numeric(occupation %in%
c("Secretarial/Admin", "Student",
"Unknown", "Manual Worker", "Other")),
occupation_3 = as.numeric(occupation %in% c("Housewife"))) %>%
select(-occupation)
#Next Occupation partner
table(R$occupation_partner)
round(prop.table(table(R$occupation_partner, R$Revenue.Grid),1),2)
#Bucketing of categories
#1 Business Manager, Housewife, Manual Worker, Professional : 0.11
#2 Retired, Other, Unknown 0.12
R <- R %>% mutate(occupation_partner_1 = as.numeric(occupation_partner %in%
c("Business Manager",
"Housewife",
"Manual Worker",
"Professional")),
                  occupation_partner_2 = as.numeric(occupation_partner %in%
                                                      c("Retired","Other","Unknown"))) %>%
select(-occupation_partner)
glimpse(R)
#Next home_status
table(R$home_status)
round(prop.table(table(R$home_status,R$Revenue.Grid),1),2)
#Bucketing categories
#1 Rent Privately, Live in Parental Hom, Own Home : 0.11
#2 Rent from Council/HA
R<-R %>% mutate(home_status_1= as.numeric(home_status %in%
c("Rent Privately",
"Live in Parental Hom", "Own Home")),
home_status_2 = as.numeric(home_status %in%
c("Rent from Council/HA"))) %>%
select(-home_status)
glimpse(R)
#Next is family_income
table(R$family_income)
round(prop.table(table(R$family_income,R$Revenue.Grid),1),2)
#Bucketing the categories
#1 "<4,000", "<8,000, >= 4,000" : 0.08
#2 "<12,500, >=10,000", "<25,000, >=22,500","<27,500 >=25,000" : 0.10
#3 "<10,000, >=8,000", "<15,000, >=12,500", "<20,000, >=17,500", " >=35,000" : 0.11
#4 "<17,500, >=15,000", "<22,500, >=20,000", "<30,000, >=27,500" : 0.12
R <- R %>% mutate(family_income_1 = as.numeric(family_income %in%
                                                 c("Unknown")),
                  family_income_2 = as.numeric(family_income %in%
                                                 c("<12,500, >=10,000", "<25,000, >=22,500",
                                                   "<27,500 >=25,000")),
                  family_income_3 = as.numeric(family_income %in%
                                                 c("<10,000, >=8,000", "<15,000, >=12,500",
                                                   "<20,000, >=17,500", " >=35,000")),
                  family_income_4 = as.numeric(family_income %in%
                                                 c("<17,500, >=15,000", "<22,500, >=20,000",
                                                   "<30,000, >=27,500"))) %>%
  select(-family_income)
glimpse(R)
#Next is self_employed
table(R$self_employed)
R<-R %>% mutate(self_employed_yes = as.numeric(self_employed == "Yes")) %>%
select(-self_employed)
#Next is self_employed_partner
table(R$self_employed_partner)
R<-R %>% mutate(self_employed_partner_yes = as.numeric(self_employed_partner ==
"Yes")) %>%
select(-self_employed_partner)
glimpse(R)
#Next I will take gender
table(R$gender)
#Creating gender_male, the dummy used in the model formulas below
R <- R %>% mutate(gender_male = as.numeric(gender == "Male")) %>%
  select(-gender)
glimpse(R)
#Next I will look at region
table(R$region)
round(prop.table(table(R$region,R$Revenue.Grid),1),2)
#Bucketing categories
#1 "South West" : 0.09
#2 "North","East Midlands","South East" : 0.10
#3 "North West","Scotland","Wales","West Midlands" : 0.11
#4 "Northern Ireland","Unknown" : 0.12
#5 "East Anglia" : 0.13
#6 "Channel Islands" : 0.20
R <- R %>% mutate(region_1 = as.numeric(region %in% c("North",
"East Midlands","South East")),
region_2 = as.numeric(region %in% c("North West","Scotland"
,"Wales","West Midlands")),
region_3 = as.numeric(region %in% c("Northern Ireland",
"Unknown")),
region_4 = as.numeric(region %in% c("East Anglia", "Channel Islands"))) %>%
select(-region)
glimpse(R)
#Next I will see year_last_moved
table(R$year_last_moved)
#Here we have year_last_moved as 0, which means someone forgot to enter the year
#We will impute this value or club it with some other year using bucketing
round(prop.table(table(R$year_last_moved,R$Revenue.Grid),1),2)
#Since 1938 and 0 both have a proportion of 0.05, I will club them together
R<-R %>% mutate(year_last_moved=ifelse(year_last_moved==0,1938,year_last_moved))
table(R$year_last_moved) #Check if it happened
#Next we will take TVarea
table(R$TVarea)
round(prop.table(table(R$TVarea,R$Revenue.Grid),1),2)
#Bucketing categories
#1 "Grampian", "HTV" : 0.10
#2 "Anglia","Carlton"," Central","Scottish TV" : 0.11
#3 "Granada","Ulster","Unknown", "Yorkshire" : 0.12
#4 "Border" : 0.14
R <- R %>% mutate(TVarea.1 = as.numeric(TVarea %in% c("Grampian","HTV")),
TVarea.2 = as.numeric(TVarea %in% c("Anglia","Carlton","Central",
"Scottish TV")),
TVarea.3 = as.numeric(TVarea %in% c("Granada","Ulster",
"Unknown","Yorkshire")),
TVarea.4 = as.numeric(TVarea %in% c("Border"))) %>%
select(-TVarea)
glimpse(R)
#Next I will go for post_code
table(R$post_code)
#As post_code has too many categories, I will drop this variable
R<-R %>% select(-post_code)
#Next I will go for post_area
table(R$post_area)
#As post_area has too many categories, I will drop this variable
R<-R %>% select(-post_area)
#Finally we are done with data cleaning, now we will do modelling
#Before starting with modelling, please check that no variable has NA's;
#we can check this with summary(R)
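#A minimal additional check (not in the original script): count NA's per column
#directly; every entry should be 0 before modelling
colSums(is.na(R))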
#We will start with sampling for validation
set.seed(1712)
R_=sample(1:nrow(R),0.75*nrow(R)) #This will select 75% of rows from R
R_train=R[R_,]
R_test=R[-R_,]
nrow(R_train) #Checking if we have train data of 75% rows
nrow(R_test) #Checking if we have test data of 25% rows
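#An optional check (an addition, not part of the original flow): the event rate
#should be roughly similar in the train and test splits produced by the random sample
round(prop.table(table(R_train$Revenue.Grid)), 3)
round(prop.table(table(R_test$Revenue.Grid)), 3)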
library(car) #This is used for checking VIF (variance inflation factor)
R_fit <- lm(Revenue.Grid ~ . -REF_NO, data=R_train) #Linear fit used only to screen multicollinearity with VIF
summary(R_fit)
R_vif <- vif(R_fit) #Calculating VIF
sort(R_vif, decreasing = T)[1:3] #Arranging as per VIF only looking at top 3
#Dropping the feature with the highest VIF
R_fit <- lm(Revenue.Grid ~ . -REF_NO -Investment.in.Commudity, data=R_train)
R_vif <- vif(R_fit) #Calculating VIF
sort(R_vif, decreasing = T)[1:3] #Arranging as per VIF only looking at top 3
#Dropping the feature with the highest VIF
R_fit <- lm(Revenue.Grid ~ . -REF_NO -Investment.in.Commudity
-Investment.in.Derivative, data=R_train)
R_vif <- vif(R_fit) #Calculating VIF
sort(R_vif, decreasing = T)[1:3] #Arranging as per VIF only looking at top 3
#Dropping the feature with the highest VIF
R_fit <- lm(Revenue.Grid ~ . -REF_NO -Investment.in.Commudity
-Investment.in.Derivative
-Investment.in.Equity
, data=R_train)
R_vif <- vif(R_fit) #Calculating VIF
sort(R_vif, decreasing = T)[1:3] #Arranging as per VIF only looking at top 3
#Dropping the feature with the highest VIF
R_fit <- lm(Revenue.Grid ~ . -REF_NO -Investment.in.Commudity
-Investment.in.Derivative
-Investment.in.Equity -Age_band_3
, data=R_train)
R_vif <- vif(R_fit) #Calculating VIF
sort(R_vif, decreasing = T)[1:3] #Arranging as per VIF only looking at top 3
#Dropping the feature with the highest VIF
R_fit <- lm(Revenue.Grid ~ . -REF_NO -Investment.in.Commudity
-Investment.in.Derivative
-Investment.in.Equity -Age_band_3 -Portfolio.Balance
, data=R_train)
R_vif <- vif(R_fit) #Calculating VIF
sort(R_vif, decreasing = T)[1:3] #Arranging as per VIF only looking at top 3
#Dropping the feature with the highest VIF
R_fit <- lm(Revenue.Grid ~ . -REF_NO -Investment.in.Commudity
-Investment.in.Derivative
-Investment.in.Equity -Age_band_3 -Portfolio.Balance - home_status_1
, data=R_train)
R_vif <- vif(R_fit) #Calculating VIF
sort(R_vif, decreasing = T)[1:3] #Arranging as per VIF only looking at top 3
#For logistic regression we generally keep features with VIF below 10; all the
#remaining features are under that threshold, so we will keep them
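#A sketch of how the same drop-the-highest-VIF procedure could be automated
#(an illustration, not part of the original workflow); it assumes R_train and the
#car package loaded above and stops once every VIF is below 10
drop_vars <- c("REF_NO")
repeat {
  fml  <- as.formula(paste("Revenue.Grid ~ . -", paste(drop_vars, collapse = " - ")))
  fit  <- lm(fml, data = R_train)
  vifs <- vif(fit)
  if (max(vifs) < 10) break                          #All remaining features are acceptable
  drop_vars <- c(drop_vars, names(which.max(vifs)))  #Drop the worst offender and refit
}
drop_vars #Variables removed because of multicollinearity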
#We create a new data set here so the dropped variables do not have to be
#written out in every formula
R_train_new <- R_train %>% select(-REF_NO, -Investment.in.Commudity,
-Investment.in.Derivative,
-Investment.in.Equity, -Age_band_3,
-Portfolio.Balance, - home_status_1)
R_fit_new <- glm(Revenue.Grid ~., family = "binomial",data=R_train_new)
summary(R_fit_new)
#Using stepwise selection based on AIC for a better model; we can do this with the step() function
R_fit_new <- step(R_fit_new)
formula(R_fit_new) #Gives the formula of the model selected by stepwise AIC
R_fit_new <- glm(Revenue.Grid ~ children + Average.Credit.Card.Transaction +
Balance.Transfer +
Term.Deposit + Life.Insurance + Medical.Insurance +
Average.A.C.Balance +
Personal.Loan + Investment.in.Mutual.Fund +
Investment.Tax.Saving.Bond +
Home.Loan + Online.Purchase.Amount + family_income_1 +
self_employed_yes +
self_employed_partner_yes + gender_male + TVarea.2
+ TVarea.3,family = "binomial", data = R_train_new)
summary(R_fit_new)
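#An optional aside (not in the original script): list the model's p-values directly,
#largest first, to see which variables are least significant
sort(summary(R_fit_new)$coefficients[, "Pr(>|z|)"], decreasing = TRUE)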
#We will now remove insignificant variables based on p-value > 0.05
#Starting with family_income_1
R_fit_new <- glm(Revenue.Grid ~ children + Average.Credit.Card.Transaction +
Balance.Transfer +
Term.Deposit + Life.Insurance + Medical.Insurance +
Average.A.C.Balance +
Personal.Loan + Investment.in.Mutual.Fund +
Investment.Tax.Saving.Bond +
Home.Loan + Online.Purchase.Amount +
self_employed_yes +
self_employed_partner_yes + gender_male + TVarea.2
+ TVarea.3,family = "binomial", data = R_train_new)
summary(R_fit_new)
#Next we will drop self_employed_yes
R_fit_new <- glm(Revenue.Grid ~ children + Average.Credit.Card.Transaction +
Balance.Transfer +
Term.Deposit + Life.Insurance + Medical.Insurance +
Average.A.C.Balance +
Personal.Loan + Investment.in.Mutual.Fund +
Investment.Tax.Saving.Bond +
Home.Loan + Online.Purchase.Amount +
self_employed_partner_yes + gender_male + TVarea.2
+ TVarea.3,family = "binomial", data = R_train_new)
summary(R_fit_new)
#Next we will drop Investment.in.Mutual.Fund
R_fit_new <- glm(Revenue.Grid ~ children + Average.Credit.Card.Transaction +
Balance.Transfer +
Term.Deposit + Life.Insurance + Medical.Insurance +
Average.A.C.Balance +
Personal.Loan +
Investment.Tax.Saving.Bond +
Home.Loan + Online.Purchase.Amount +
self_employed_partner_yes + gender_male + TVarea.2
+ TVarea.3,family = "binomial", data = R_train_new)
summary(R_fit_new)
#Next we will drop TVarea.2
R_fit_new <- glm(Revenue.Grid ~ children + Average.Credit.Card.Transaction +
Balance.Transfer +
Term.Deposit + Life.Insurance + Medical.Insurance +
Average.A.C.Balance +
Personal.Loan +
Investment.Tax.Saving.Bond +
Home.Loan + Online.Purchase.Amount +
self_employed_partner_yes + gender_male
+ TVarea.3,family = "binomial", data = R_train_new)
summary(R_fit_new)
#Next we will drop TVarea.3
R_fit_new <- glm(Revenue.Grid ~ children + Average.Credit.Card.Transaction +
Balance.Transfer +
Term.Deposit + Life.Insurance + Medical.Insurance +
Average.A.C.Balance +
Personal.Loan +
Investment.Tax.Saving.Bond +
Home.Loan + Online.Purchase.Amount +
self_employed_partner_yes + gender_male
,family = "binomial", data = R_train_new)
summary(R_fit_new)
#Now all the remaining variables look significant
#The code below gives predicted probabilities for the training data
R_train_new$score <- predict(R_fit_new, newdata = R_train_new, type = "response")
#Let us plot the result
library(ggplot2)
ggplot(R_train_new,aes(y=Revenue.Grid,x=score,color=factor(Revenue.Grid)))+
geom_point() + geom_jitter()
#The x axis shows the predicted probabilities
#True Negatives appear on the left of the bottom strip (Revenue.Grid = 0)
#True Positives appear on the right of the top strip (Revenue.Grid = 1)
#Let us assume cut off is 0.2
cutoff = 0.2
predicted<- as.numeric(R_train_new$score>cutoff)
glimpse(predicted)
#Calculating TP,TN,FP,FN
glimpse(R_train_new$Revenue.Grid)
glimpse(predicted)
TP = sum(predicted==1 & R_train_new$Revenue.Grid==1)
FP <- sum(predicted==1 & R_train_new$Revenue.Grid==0)
FN <- sum(predicted==0 & R_train_new$Revenue.Grid==1)
TN <- sum(predicted==0 & R_train_new$Revenue.Grid==0)
#Now we need the total number of positives (actual 1's) and negatives (actual 0's)
P=TP+FN # Total number of 1's in reality
N=TN+FP # Total number of 0's in reality
#Total number of points
Total <- P + N
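#A brief illustrative aside (not part of the original script): a few metrics at this
#single cutoff, before we sweep over a whole grid of cutoffs below
c(Accuracy = (TP + TN) / Total, Sensitivity = TP / P, Specificity = TN / N)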
#We should do this for many cutoff values, since looking at only one cutoff is of little use
#Creating table with different cutoff values
cutoff_data = NULL
cutoffs = round(seq(0,1,length=100),3)
cutoffs # A vector of 100 cutoff values
for (cutoff in cutoffs)
{
predicted=as.numeric(R_train_new$score>cutoff)
TP=sum(predicted==1 & R_train_new$Revenue.Grid==1)
FP=sum(predicted==1 & R_train_new$Revenue.Grid==0)
FN=sum(predicted==0 & R_train_new$Revenue.Grid==1)
TN=sum(predicted==0 & R_train_new$Revenue.Grid==0)
cutoff_data=rbind.data.frame(cutoff_data,c(cutoff,TP,FP,FN,TN))
}
colnames(cutoff_data) = c("Cutoff", "TP", "FP", "FN", "TN")
View(cutoff_data)
#Now we can compute the metrics used to find the best cutoff
cutoff_data=cutoff_data %>%
mutate(Sn=TP/P,
Sp=TN/N,
dist=sqrt((1-Sn)**2+(1-Sp)**2),
P=FN+TP,
N=TN+FP) %>%
mutate(KS=abs((TP/P)-(FP/N))) %>%
mutate(Accuracy=(TP+TN)/(P+N)) %>%
mutate(Lift=(TP/P)/((TP+FP)/(P+N))) %>%
mutate(M=(8*FN+2*FP)/(P+N)) %>%
select(-P,-N)
View(cutoff_data)
#It is easier to determine the best cutoff visually, so we will plot the metrics;
#before that, we must convert the data from "wide format" to "long format"
library(tidyr) #For reshaping the data from wide to long
cutoff_viz=cutoff_data %>%
select(Cutoff,Sn,Sp,dist,KS,Accuracy,Lift,M) %>%
gather(Criterion,Value,Sn:M)
View(cutoff_viz) # Data is in long format
# Visualization
ggplot(filter(cutoff_viz,Criterion!="Lift"),aes(x=Cutoff,y=Value,color=Criterion))+
geom_line()
# We'll visualise lift separately because of its scale
cutoff_viz %>%
filter(Criterion=="Lift") %>%
ggplot(aes(x=Cutoff,y=Value,color=Criterion))+geom_line()
# Predict on testset
R_test$score=predict(R_fit_new,newdata = R_test,type = "response")
# Cutoff with max KS
KS_cutoff=cutoff_data$Cutoff[which.max(cutoff_data$KS)][1]
KS_cutoff
table(R_test$Revenue.Grid,as.numeric(R_test$score>KS_cutoff)) # Confusion matrix
#Cutoff with minimum distance
dist_cutoff=cutoff_data$Cutoff[which.min(cutoff_data$dist)][1]
dist_cutoff
table(R_test$Revenue.Grid,as.numeric(R_test$score>dist_cutoff))
# Cutoff with max Accuracy
Acc_cutoff=cutoff_data$Cutoff[which.max(cutoff_data$Accuracy)][1]
Acc_cutoff
table(R_test$Revenue.Grid,as.numeric(R_test$score>Acc_cutoff))
# Cutoff with minimum M ( The hypothetical business criterion)
M_cutoff=cutoff_data$Cutoff[which.min(cutoff_data$M)][1]
M_cutoff
table(R_test$Revenue.Grid,as.numeric(R_test$score>M_cutoff))
# ROC Curve
library(pROC)
roccurve=roc(R_train$Revenue.Grid,R_train_new$score)
plot(roccurve)
auc(roccurve)
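#A short addition (not in the original script): the same ROC/AUC on the held-out
#test set, using the scores created with predict() above
roccurve_test=roc(R_test$Revenue.Grid,R_test$score)
plot(roccurve_test)
auc(roccurve_test)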
roc_data=cutoff_data %>% select(Cutoff,Sn,Sp) %>%
  mutate(TPR=Sn,FPR=1-Sp) %>% select(Cutoff,TPR,FPR)
View(roc_data) #Once you find the desired (FPR, TPR) point, the corresponding cutoff is in this table
ggplot(roc_data,aes(x=FPR,y=TPR))+geom_line()+ggtitle("My ROC Curve")