Used Cars with Trees, Random Forests, Boosting, and Neural Nets

Let’s try a single tree, random forests, boosting, and neural nets on the cars data with y = price and x = (mileage, year).

Let’s do a simple train/test split and compare the methods based on rmse on the test data.

For each method I’ll show you code to implement the method in R and then ask you to try a simple modification.

Read in the data

We use the same data and train/test split for each method.

## read in data and do simple train/test split

cd = read.csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv")
cds = cd[,c(1,4,5)]              # keep price, mileage, year
cds$price = cds$price/1000       # price in thousands of dollars
cds$mileage = cds$mileage/1000   # mileage in thousands of miles

n = nrow(cds)
set.seed(34)
testid = sample(1:n,300)

cdstr = cds[-testid,]
cdste = cds[testid,]

So, cdstr is the training data and cdste is the test data.

Single Tree

Below we find that a tree of size (number of bottom nodes) 21 might work well.
The results also suggest that we could use a smaller tree and get decent results.

Try getting a smaller tree by using a cp value bigger than bestcp and see how it does out of sample (a sketch is given after the code below).

### libraries
library(rpart)
library(rpart.plot)
## simple tree
# see the link "simple tree in R" on the webpage

## fit on train
set.seed(14)
big.tree = rpart(price~.,method="anova",data=cdstr,control=rpart.control(minsplit=5,cp=.0001))
nbig = length(unique(big.tree$where))  # where gives a bottom node id for each train observation
cat("size of big tree: ",nbig,"\n")
## size of big tree:  122
##################################################
###look at CV results
plotcp(big.tree)

##################################################
### get nice tree from CV results
iibest = which.min(big.tree$cptable[,"xerror"]) #which has the lowest error
bestcp=big.tree$cptable[iibest,"CP"]
bestsize = big.tree$cptable[iibest,"nsplit"]+1
cat('size of best tree is: ',bestsize,"\n") 
## size of best tree is:  21
cat('best cp is: ',bestcp,'\n')
## best cp is:  0.001213563
# note: in cptable:
# CP is the complexity parameter
# rel error is the average deviance of the current tree
# divided by the average deviance of the null tree (in-sample fit).
# xerror is based on a 10-fold cross-validation and is again measured
# relative to the deviance of the null model (out-of-sample fit).
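# (added illustration, not in the original notes) printcp() from rpart prints
# the cptable, so you can see CP, nsplit, rel error, xerror, and xstd together:
printcp(big.tree)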
#prune to good tree
best.tree = prune(big.tree,cp=bestcp)
nbest = length(unique(best.tree$where))
cat("size of best tree: ", nbest,"\n")
## size of best tree:  21
##################################################
###get predictions
ypredT = predict(best.tree,newdata=cdste)
plot(cdste$price,ypredT)
abline(0,1,col="red",lwd=3)

rmseT = sqrt(mean((cdste$price - ypredT)^2))
cat('rmse for a single tree is:',rmseT,'\n')
## rmse for a single tree is: 6.050673
## using rpart.plot
rpart.plot(best.tree,split.cex=0.8,cex=1.0,type=3,extra=0)
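Here is one way the smaller-tree exercise might go. This is just a sketch (the names cpbig, small.tree, ypredS, and the factor of 10 are my choices, not from the notes); any cp value bigger than bestcp will prune to a tree with fewer bottom nodes.

## exercise sketch: prune with a cp larger than bestcp to get a smaller tree
cpbig = 10*bestcp   # arbitrary choice of a bigger cp
small.tree = prune(big.tree,cp=cpbig)
cat("size of smaller tree: ",length(unique(small.tree$where)),"\n")
ypredS = predict(small.tree,newdata=cdste)
cat('rmse for the smaller tree is:',sqrt(mean((cdste$price - ypredS)^2)),'\n')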

Random Forests

Below are the results for random forests.
Very simple !!

Try changing ntree to 50. What should happen? (a sketch follows the code below)

##################################################
## random forests
## see slide 64 of the trees notes

#load libraries
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
set.seed(88)
rffit = randomForest(price~.,data=cdstr,mtry=2,ntree=500)
ypredRF = predict(rffit,newdata=cdste)
rmseRF = sqrt(mean((cdste$price - ypredRF)^2))

plot(rffit)

print(rffit$importance)
##         IncNodePurity
## mileage      45689.55
## year        189995.99
cat('rmse for random forests is: ',rmseRF,'\n')
## rmse for random forests is:  5.523562
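A sketch of the ntree = 50 exercise (my code, not the notes'; rffit50 and ypredRF50 are new names). With only 50 trees there is less averaging, so the test rmse should typically come out a bit higher and be more sensitive to the seed.

## exercise sketch: random forest with only 50 trees
set.seed(88)
rffit50 = randomForest(price~.,data=cdstr,mtry=2,ntree=50)
ypredRF50 = predict(rffit50,newdata=cdste)
cat('rmse for random forests, ntree=50: ',sqrt(mean((cdste$price - ypredRF50)^2)),'\n')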

Boosting

Below is boosting with gbm.

Try setting interaction.depth = 2 (a sketch follows the code below). After all, we only have two features.

##################################################
## gbm
## see slide 64 of the trees notes

library(gbm) #boosting
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
#fit using boosting
nB = 500
lam = .05
set.seed(16)
boostfit = gbm(price~.,data=cdstr,distribution="gaussian",
   interaction.depth=4,n.trees=nB,shrinkage=lam)
ypredB = predict(boostfit,newdata=cdste,n.trees=nB)
rmseB = sqrt(mean((cdste$price - ypredB)^2))

cat('rmse boost: ',rmseB,'\n')
## rmse boost:  5.248297
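A sketch of the interaction.depth = 2 exercise (my variant of the call above; boostfit2 and ypredB2 are new names). Depth-2 trees still allow a mileage-by-year interaction, so the fit may change only a little.

## exercise sketch: boosting with depth-2 trees
set.seed(16)
boostfit2 = gbm(price~.,data=cdstr,distribution="gaussian",
   interaction.depth=2,n.trees=nB,shrinkage=lam)
ypredB2 = predict(boostfit2,newdata=cdste,n.trees=nB)
cat('rmse boost, interaction.depth=2: ',sqrt(mean((cdste$price - ypredB2)^2)),'\n')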

Neural Networks

Below is torch code adapted from the Hitters example in the ISLR chapter 10 lab.
It looks like just one epoch does the trick.

Try setting the number of epochs to a low number like 2 or 5 (a sketch follows the code below).

##################################################
## nnet with torch
## see hitters lab

library(torch)
library(luz) # high-level interface for torch
library(torchvision) # for datasets and image transformation
library(torchdatasets) # for datasets we are going to use
library(zeallot)
torch_manual_seed(13)
##################################################
## scale

xtr = as.matrix(cdstr[,2:3])
xte = as.matrix(cdste[,2:3])
xtrs = scale(xtr)   # scale the training x's
xtes = xte          # will hold the scaled test x's
for(i in 1:ncol(xtr)) {
   mtemp = mean(xtr[,i])   # training mean
   stemp = sd(xtr[,i])     # training sd
   temp = (xtr[,i]-mtemp)/stemp
   print(summary(temp - xtrs[,i]))   # check: all zeros, so this matches scale()
   xtes[,i] = (xte[,i]-mtemp)/stemp  # scale test x's with the TRAINING mean and sd
}
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
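## (added note, not in the original) the same test-set scaling can be done with the
## attributes that scale() stores on xtrs, e.g.
##   xtes = scale(xte, center=attr(xtrs,"scaled:center"), scale=attr(xtrs,"scaled:scale"))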
## model
modnn <- nn_module(
  initialize = function(input_size) {
    self$hidden <- nn_linear(input_size, 50)
    self$activation <- nn_relu()
    self$dropout <- nn_dropout(0.4)
    self$output <- nn_linear(50, 1)
  },
  forward = function(x) {
    x %>%
      self$hidden() %>%
      self$activation() %>%
      self$dropout() %>%
      self$output()
  }
)

modnn <- modnn %>%
  setup(
    loss = nn_mse_loss(),
    optimizer = optim_rmsprop,
    metrics = list(luz_metric_mae())
  ) %>%
  set_hparams(input_size = ncol(xtrs))
fitted <- modnn %>%
  fit(
    data = list(xtrs, matrix(cdstr$price, ncol = 1)),
    valid_data = list(xtes, matrix(cdste$price, ncol = 1)),
    epochs = 20 # 50
  )

plot(fitted)

ypredN <- predict(fitted, xtes)
ypredN = as.matrix(ypredN)
plot(cdste$price,ypredN)

mean(abs(cdste$price - as.matrix(ypredN)))  # mean absolute error on the test data
## [1] 3.746837
rmseN = sqrt(mean((cdste$price - ypredN)^2))
cat('rmse for neural net: ',rmseN,'\n')
## rmse for neural net:  5.300348
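A sketch of the epochs exercise (my code reusing the setup above; fitted5 and ypredN5 are new names): refit with a small number of epochs and compare the test rmse to the 20-epoch fit.

## exercise sketch: refit the same model with only 5 epochs
fitted5 <- modnn %>%
  fit(
    data = list(xtrs, matrix(cdstr$price, ncol = 1)),
    valid_data = list(xtes, matrix(cdste$price, ncol = 1)),
    epochs = 5
  )
ypredN5 = as.matrix(predict(fitted5, xtes))
cat('rmse for neural net, 5 epochs: ',sqrt(mean((cdste$price - ypredN5)^2)),'\n')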

Let’s Compare !!

##################################################
## compare predictions across the methods
fMat = cbind(cdste$price,ypredT,ypredRF,ypredB,ypredN)
colnames(fMat) = c('price','tree','RF','Boost','Nnet')
print(cor(fMat))
##           price      tree        RF     Boost      Nnet
## price 1.0000000 0.9466224 0.9546860 0.9588456 0.9568344
## tree  0.9466224 1.0000000 0.9826257 0.9823523 0.9705356
## RF    0.9546860 0.9826257 1.0000000 0.9939847 0.9807047
## Boost 0.9588456 0.9823523 0.9939847 1.0000000 0.9885384
## Nnet  0.9568344 0.9705356 0.9807047 0.9885384 1.0000000
pairs(fMat)

rmsef = function(yp) {
    sqrt(mean((cdste$price - yp)^2))
}
apply(fMat[,2:5],2,rmsef)
##     tree       RF    Boost     Nnet 
## 6.050673 5.523562 5.248297 5.300348