randomforest.Rmd

---
title: "Random Forest and Examples"
author: "Mu He"
date: "9/18/2021"
output: 
  html_document:
    toc: true
    toc_float: true
    toc_depth: 4
---
### Random Forest

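Definition of a random forest: an ensemble of decision trees, each grown on a bootstrap sample of the training data with only a random subset of the predictors considered at every split. The trees' predictions are averaged (regression) or combined by majority vote (classification).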

### Flowchart of a Random Forest Algorithm


### Examples in R
```{r}
library(randomForest)
library(MASS)
library(tree)
library(gbm)
library(e1071)

# Use a single-panel plotting layout.
par(mfrow = c(1, 1))
# Plot for M/3 explanation: ((n - 1) / n)^n is the probability that a given
# observation is absent from a bootstrap sample of size n. The curve levels
# off near exp(-1) (about 0.37), i.e. roughly a third of the observations are
# out-of-bag for each tree.
n <- 10:1000
# `type` is spelled out in full instead of relying on partial matching of
# `typ`. The y expression is kept inline because plot() uses it as axis label.
plot(n, ((n - 1) / n)^n, type = "l")
# Copy the current plot to an EPS file in the working directory.
dev.print(device = postscript, file = "M3.eps", onefile = FALSE,
          horizontal = FALSE)

### Boston example, based on example from James et al.(2013)
library(randomForest)
library(MASS)
library(tree)

# Step 1: a single regression tree for medv.
# ?Boston
set.seed(1)
# A random half of the rows is used for training; the rest is held out.
train <- sample(seq_len(nrow(Boston)), nrow(Boston) / 2)
tree.boston <- tree(medv ~ ., data = Boston, subset = train)
summary(tree.boston)

# Draw the fitted tree, then look at the cross-validated deviance by tree
# size to judge whether pruning would be worthwhile.
plot(tree.boston)
text(tree.boston, pretty = 0)
cv.boston <- cv.tree(tree.boston)
plot(cv.boston$size, cv.boston$dev, type = "b")

# Pruning would look like this (left disabled; the unpruned tree is used):
# prune.boston <- prune.tree(tree.boston, best = 5)
# yhat <- predict(prune.boston, newdata = Boston[-train, ])

# Held-out predictions versus observed medv, with the y = x reference line,
# followed by the test mean squared error.
yhat <- predict(tree.boston, newdata = Boston[-train, ])
boston.test <- Boston[-train, "medv"]
plot(yhat, boston.test)
abline(0, 1)
mean((yhat - boston.test)^2)
# Now, bagging and random forests
# Same seed and same call as for the single tree, so `train` is the same
# split and `boston.test` (computed for the tree above) still matches it.
set.seed(1)
train <- sample(seq_len(nrow(Boston)), nrow(Boston) / 2)

# Bagging: mtry = 13 is intended to offer every predictor at each split
# (NOTE(review): confirm 13 == ncol(Boston) - 1 for the Boston data in use).
bag.boston <- randomForest(medv ~ ., data = Boston, subset = train,
                           mtry = 13, importance = TRUE)
bag.boston
yhat.bag <- predict(bag.boston, newdata = Boston[-train, ])
mean((yhat.bag - boston.test)^2)
plot(yhat.bag, boston.test)
abline(0, 1)
importance(bag.boston)
varImpPlot(bag.boston)

# Random forest with 4 candidate predictors per split.
set.seed(1)
rf.boston <- randomForest(medv ~ ., data = Boston, subset = train,
                          mtry = 4, importance = TRUE)
rf.boston
yhat.rf <- predict(rf.boston, newdata = Boston[-train, ])
mean((yhat.rf - boston.test)^2)
importance(rf.boston)
varImpPlot(rf.boston)

# Random forest with 6 candidate predictors per split. The RNG is not
# re-seeded here; it continues from the state left by the previous fit.
rf.boston2 <- randomForest(medv ~ ., data = Boston, subset = train,
                           mtry = 6, importance = TRUE, ntree = 500)
rf.boston2
yhat.rf <- predict(rf.boston2, newdata = Boston[-train, ])
mean((yhat.rf - boston.test)^2)
# Report importance for the model just fitted (rf.boston2). Previously these
# two calls repeated the output for rf.boston.
importance(rf.boston2)
varImpPlot(rf.boston2)
# Boosting -- this is just the little bit from class, see wine.R for a fuller example of boosting
set.seed(1)
# Gradient boosting with squared-error loss, fitted on the training rows only.
boost.boston <- gbm(medv ~ ., data = Boston[train, ],
                    distribution = "gaussian", n.trees = 5000,
                    interaction.depth = 4)
summary(boost.boston)

# Plots for rm and lstat side by side. The previous layout is saved and
# restored so later plots are not squeezed into a 1 x 2 grid.
old.par <- par(mfrow = c(1, 2))
# `i.var` is spelled out in full instead of relying on partial matching of `i`.
plot(boost.boston, i.var = "rm")
plot(boost.boston, i.var = "lstat")
par(old.par)

# Test mean squared error using all 5000 trees.
yhat.boost <- predict(boost.boston, newdata = Boston[-train, ], n.trees = 5000)
boston.test <- Boston[-train, "medv"]
mean((yhat.boost - boston.test)^2)

### Hitters data
data(Hitters, package = "ISLR")
head(Hitters)
# Drop rows with missing values, then model salary on the log scale.
Hitters <- na.omit(Hitters)
Hitters$logSalary <- log(Hitters$Salary)
names(Hitters)
str(Hitters)
# Remove the raw Salary column so it cannot be used to predict logSalary.
# It is dropped by name rather than by position (previously column 19), so
# the code does not depend on the column order of the data set.
Hitters$Salary <- NULL
names(Hitters)

set.seed(1)
# About half of the rows, drawn at random, are used for training.
train <- sample(seq_len(nrow(Hitters)), nrow(Hitters) / 2)
tree.hitters <- tree(logSalary ~ ., data = Hitters, subset = train)
summary(tree.hitters)
plot(tree.hitters)
text(tree.hitters)

# Held-out predictions versus observed logSalary, with the y = x reference
# line, followed by the test mean squared error.
yhat <- predict(tree.hitters, newdata = Hitters[-train, ])
hitters.test <- Hitters[-train, "logSalary"]
plot(yhat, hitters.test)
abline(0, 1)
mean((yhat - hitters.test)^2)
# Now, bagging and random forests

# Bagged forest: mtry = 19 (labelled "bagging" in the figure legend below).
set.seed(1)
bag.hitters <- randomForest(
  logSalary ~ .,
  data = Hitters,
  subset = train,
  mtry = 19,
  importance = TRUE
)
bag.hitters
yhat.bag <- predict(bag.hitters, newdata = Hitters[-train, ])
mean((yhat.bag - hitters.test)^2)
importance(bag.hitters)
varImpPlot(bag.hitters)
# Predicted versus observed on the held-out rows, with the y = x line.
plot(yhat.bag, hitters.test)
abline(0, 1)

# Random forest: 5 candidate predictors per split.
set.seed(1)
rf.hitters <- randomForest(
  logSalary ~ .,
  data = Hitters,
  subset = train,
  mtry = 5,
  importance = TRUE
)
rf.hitters
yhat.rf <- predict(rf.hitters, newdata = Hitters[-train, ])
mean((yhat.rf - hitters.test)^2)
importance(rf.hitters)
varImpPlot(rf.hitters)
# A figure: test mean squared error against the number of trees, for three
# values of mtry.
M <- (1:20) * 50  # forest sizes: 50, 100, ..., 1000 trees

# Fit one forest per size in M with the given mtry and return the test MSE of
# each fit, in the same order as M. Forests are grown in increasing order of
# size, and importance = TRUE is kept from the original calls, so the sequence
# of random draws is the same as with the original explicit loops.
forest.test.mse <- function(mtry) {
  vapply(M, function(m) {
    fit <- randomForest(logSalary ~ ., data = Hitters, subset = train,
                        mtry = mtry, importance = TRUE, ntree = m)
    mean((predict(fit, newdata = Hitters[-train, ]) - hitters.test)^2)
  }, numeric(1))
}

bag <- forest.test.mse(19)  # bagging
rf4 <- forest.test.mse(9)   # historical name; this is the mtry = 9 curve
rf6 <- forest.test.mse(5)   # historical name; this is the mtry = 5 curve

plot(M, bag, type = "l", col = 2, xlim = c(0, 1000), ylab = "test error",
     ylim = c(0.145, 0.2))
lines(M, rf4, type = "l", col = 1)
lines(M, rf6, type = "l", col = 4)
# The curves are drawn as lines, so the legend keys are line samples (lty)
# rather than point symbols that never appear in the plot.
legend(1, 0.165, c("m=19 (bagging)", "m=9", "m=5"), col = c(2, 1, 4), lty = 1)
# Boosting for Hitters data --- this is the little bit from class; see wine.R data for a fuller example of boosting
set.seed(1)
# Boosted regression on the training rows: squared-error loss, 5000 trees,
# interaction depth 4.
boost.hitters <- gbm(
  logSalary ~ .,
  data = Hitters[train, ],
  distribution = "gaussian",
  n.trees = 5000,
  interaction.depth = 4
)
summary(boost.hitters)
# Test mean squared error on the held-out rows, using all 5000 trees.
yhat.boost <- predict(boost.hitters, newdata = Hitters[-train, ], n.trees = 5000)
hitters.test <- Hitters[-train, "logSalary"]
mean((yhat.boost - hitters.test)^2)
```

```{r}
### Iris data
names(iris)
set.seed(1)
# Stratified split: 25 rows drawn from each block of 50 rows. iris is ordered
# by species, so this takes 25 flowers per species for training.
train <- c(sample(1:50, 25), sample(51:100, 25), sample(101:150, 25))
# tree() fits a classification tree because the response is a factor. The
# rpart-style `method = "class"` argument was dropped: tree() does not use it
# to choose the tree type.
tree.iris <- tree(Species ~ ., data = iris, subset = train)
summary(tree.iris)
plot(tree.iris)
text(tree.iris)

# Confusion table on the held-out rows (rows: truth, columns: prediction).
iris.test <- iris[-train, "Species"]
iris.pred <- predict(tree.iris, iris[-train, ], type = "class")
tab <- table(iris.test, iris.pred)
tab
# Misclassification rate (1 - share of the table on the diagonal) and the
# corrected Rand index reported by classAgreement().
1 - classAgreement(tab)$diag
classAgreement(tab)$crand
# Now, bagging and random forests
# randomForest() grows a classification forest because Species is a factor.
# The `type = "class"` argument previously passed to the fitting calls is not
# a fitting argument (it was silently absorbed by `...`) and has been removed.

# Bagging: all four predictors are candidates at every split.
set.seed(1)
bag.iris <- randomForest(Species ~ ., data = iris, subset = train,
                         mtry = 4, importance = TRUE)
bag.iris
# type = "response" returns predicted classes.
iris.pred <- predict(bag.iris, iris[-train, ], type = "response")
tab <- table(iris.test, iris.pred)
tab
1 - classAgreement(tab)$diag
classAgreement(tab)$crand

# Random forest: three candidate predictors per split.
set.seed(1)
rf.iris <- randomForest(Species ~ ., data = iris, subset = train,
                        mtry = 3, importance = TRUE)
rf.iris
iris.pred.rf <- predict(rf.iris, iris[-train, ], type = "response")
tab <- table(iris.test, iris.pred.rf)
tab
1 - classAgreement(tab)$diag
classAgreement(tab)$crand
importance(rf.iris)
varImpPlot(rf.iris)
```