forked from nursnaaz/Batch-1-Beseant-Tech
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDay_3_Preprocessing_Subsetting_Imputation_July13.r
89 lines (65 loc) · 2.74 KB
/
Day_3_Preprocessing_Subsetting_Imputation_July13.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
######################################### Subsetting ###############################
## Subsetting is an important part of data analysis: we often want to work on
## only a part of the data.

## Subset on vectors
v <- c(1, 2, 3, 4, 5)
v[v > 3]  # Output all elements greater than 3

## Work on a copy of mtcars. Columns are always referenced explicitly as
## data$<column> rather than through attach(): a same-named object in the
## workspace can then never shadow a column, and nothing has to be detached.
data <- mtcars
str(data)

## Subset on matrices and data frames
# a. Calling by cell positions
data1 <- data[, 2:11]  # all rows, columns 2 to 11
data1
data1 <- data[1:10, 2:11]  # first 10 rows, columns 2 to 11
data1
data[, -1]  # a negative index drops that column
data[setdiff(names(data), "mpg")]  # drop a column by its name
class(data1[, c(2, 3)])  # several columns selected: still a data.frame
data1[, 1, drop = FALSE]  # drop = FALSE keeps a single column as a data.frame

# b. By using column names - two methods
data1 <- data[, c("mpg", "cyl")]
name <- c("mpg", "cyl", "disp", "hp")
data1 <- data[names(data) %in% name]  ## %in% comes in handy for subsetting

# c. Using the subset function (see ?subset for its arguments)
# From data, extract all the records whose mpg > 25, keeping columns mpg to carb
data1 <- subset(data, mpg > 25, select = mpg:carb)

# d. The same data frame can be obtained in another way
data1 <- data[data$mpg > 25, ]

## Multiple conditions can be given using "&" or "|"
data2 <- data[data$mpg > 25 & data$hp > 75, ]
data2 <- subset(data, mpg > 25 | gear == 5, select = mpg:carb)

## Using which.max: the (first) row with the largest mpg
data[which.max(data$mpg), ]
## Using which.min: the (first) row with the smallest mpg
data[which.min(data$mpg), ]
## Using which: every row matching the condition
data[which(data$mpg == max(data$mpg)), ]
data[which(row.names(data) %in% c("Mazda RX4", "Datsun 710")), ]
################################ Data Exploration and Data Aggregation Methods #######################
## These form an important aspect of data exploration, data understanding and
## of preparing the data for model building.
## A data frame can hold multiple data types, e.g. numeric, factor and logical.
library(plyr)
## baseball is presumably the example data set shipped with plyr (it is not
## defined in this script) - confirm with ?baseball.
## No attach() here: every column is reached through dfBB$<column>, so nothing
## is left behind on the search path.
dfBB <- baseball
str(dfBB)  ## shows the type of each variable
summary(dfBB)  ## overall summary: statistics for numerical attributes;
## for character columns only length, class and mode are reported.

## Conversion of variable types if necessary
## We can treat "team" as a factor so that we can compare runs batted and
## home runs across teams.
dfBB$team <- as.factor(dfBB$team)
str(dfBB$team)
## We do these appropriate conversions first
## Missing Values
## To count the number of missing values
sum(is.na(dfBB))  ## number of NA cells in the data. What to do with them?

# Option 1. Omit all records with NA values
data1 <- na.omit(dfBB)  ## omits every record that has at least one NA value
data2 <- dfBB[complete.cases(dfBB), ]  ## another way

# Option 2. If the missing values are few, we can impute them instead
# NOTE(review): DMwR appears to have been archived on CRAN - confirm it is
# still installable in the target environment.
library(DMwR)
data3 <- centralImputation(dfBB)  # Central Imputation
sum(is.na(data3))
# Columns 1, 4 and 5 are dropped before KNN imputation - presumably the
# non-numeric columns; verify the positions against str(dfBB).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
data4 <- knnImputation(dfBB[, -c(1, 4, 5)], scale = TRUE, k = 5)  # KNN Imputation
sum(is.na(data4))
# Persist the centrally imputed data to the working directory
write.csv(data3, "data_imputed.csv", row.names = FALSE)