Get Data

##################################################
## read in data
cd = read.csv("http://www.rob-mcculloch.org/data/usedcars.csv")
print(dim(cd))
## [1] 20063    11
##################################################
##  pick off y=price, x=mileage
y = cd$price
x = cd$mileage
## for convenience, as a data frame
ddf = data.frame(x,y)
n = nrow(ddf)
##################################################
### simple train/test split
set.seed(34)

ntr = floor(.75*n)
nte = n - ntr
iitr = sample(1:n,ntr)

Tree with mileage

##################################################
## single tree
library(rpart)

set.seed(99)
big.tree = rpart(y~.,method="anova",data=ddf[iitr,],control=rpart.control(minsplit=5,cp=.0001))
nbig = length(unique(big.tree$where))
cat("size of big tree: ",nbig,"\n")
## size of big tree:  108
##################################################
###look at CV results
##https://stats.stackexchange.com/questions/275652/rpart-cross-validation
##   rpart does 10-fold cross validation by default (the xval argument of rpart.control)
plotcp(big.tree)
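
If you want a different number of folds, the xval argument controls it; a minimal sketch (tree20 is my name for it, and this was not part of the original run):

tree20 = rpart(y~.,method="anova",data=ddf[iitr,],
   control=rpart.control(minsplit=5,cp=.0001,xval=20))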

##################################################
### get nice tree from CV results
iibest = which.min(big.tree$cptable[,"xerror"]) #which has the lowest error
bestcp=big.tree$cptable[iibest,"CP"]
bestsize = big.tree$cptable[iibest,"nsplit"]+1
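
To see what CV picked, it can help to print the chosen values (output not echoed here):

cat("best cp: ",bestcp," best size: ",bestsize,"\n")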

##################################################
#prune to good tree
best.tree = prune(big.tree,cp=bestcp)
nbest = length(unique(best.tree$where))
cat("size of best tree: ", nbest,"\n")
## size of best tree:  26
##################################################
## in-sample fits
yhat = predict(best.tree)
plot(x[iitr],y[iitr],xlab='mileage',ylab='price')
points(x[iitr],yhat,col='blue')
title(main='in sample fit using a single tree')

##################################################
## prediction
ypred = predict(best.tree,ddf[-iitr,])
plot(x[-iitr],y[-iitr],xlab='mileage',ylab='price')
points(x[-iitr],ypred,col='blue')
title('out of sample fit using a single tree')

rmsetree = sqrt(mean((y[-iitr] - ypred)^2))
cat('test rmse for tree: ',rmsetree,'\n')
## test rmse for tree:  9784.738

Random Forests with mileage

Note that I actually had to tune the random forest!

After not liking the fit at the default settings, I used the maxnodes parameter to keep the individual "big trees" from growing too big.

### random forests
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
rffit = randomForest(y~.,data=ddf[iitr,])
yhatrf = predict(rffit) ## with no newdata, randomForest returns out-of-bag predictions
plot(x[iitr],y[iitr],xlab='mileage',ylab='price')
points(x[iitr],yhatrf,col='blue')
title(main='in sample fit for default random forest')

rffit1 = randomForest(y~.,data=ddf[iitr,],mtry=1,ntree=1000,maxnodes=50)
yhatrf1 = predict(rffit1)
plot(x[iitr],y[iitr],xlab='mileage',ylab='price')
points(x[iitr],yhatrf1,col='blue')
title(main='in sample fit for random forests with maxnodes=50')

plot(rffit1) ## OOB MSE as a function of the number of trees
title(main='plot of random forests object with maxnodes=50')

ypredrf1 = predict(rffit1,newdata=ddf[-iitr,])
rmserf = sqrt(mean((y[-iitr] - ypredrf1)^2))
cat('rmse for random forests: ',rmserf,'\n')
## rmse for random forests:  9653.093
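
The maxnodes=50 above came from trial and error. One way to make that search systematic is to compare out-of-bag error over a small grid; a sketch (the grid values are my assumption, not what was actually run):

for(mn in c(25,50,100,200)) {
  fitmn = randomForest(y~.,data=ddf[iitr,],mtry=1,ntree=500,maxnodes=mn)
  cat("maxnodes:",mn," OOB rmse:",sqrt(fitmn$mse[fitmn$ntree]),"\n") ## OOB error after the last tree
}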

Boosting with mileage

### boosting
library(gbm)
## Loaded gbm 2.1.8
#fit using boosting
boostfit = gbm(y~.,data=ddf[iitr,],distribution="gaussian",
   interaction.depth=4,n.trees=500,shrinkage=.2)
yhatbst = predict(boostfit,n.trees=500)
plot(x[iitr],y[iitr],xlab='mileage',ylab='price')
points(x[iitr],yhatbst,col='blue')
title(main='in sample boosting 500 trees, shrinkage=.2')

boostfit1 = gbm(y~.,data=ddf[iitr,],distribution="gaussian",
   interaction.depth=4,n.trees=100,shrinkage=.2)
yhatbst1 = predict(boostfit1,n.trees=100)
plot(x[iitr],y[iitr],xlab='mileage',ylab='price')
points(x[iitr],yhatbst1,col='blue')
title(main='in sample boosting 100 trees, shrinkage=.2')

ypredbst1 = predict(boostfit1,newdata=ddf[-iitr,],n.trees=100)
rmsebst = sqrt(mean((y[-iitr] - ypredbst1)^2))
cat('boosting rmse with mileage: ',rmsebst,'\n')
## boosting rmse with mileage:  9675.402
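
Rather than eyeballing 500 versus 100 trees, gbm can choose the number of trees by cross validation; a sketch (cv.folds=5 is my choice, not what was actually run):

boostcv = gbm(y~.,data=ddf[iitr,],distribution="gaussian",
   interaction.depth=4,n.trees=500,shrinkage=.2,cv.folds=5)
bestiter = gbm.perf(boostcv,method="cv") ## CV-optimal number of trees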

Neural nets with mileage

R Markdown would not use my keras install. See the R script solutions for the neural net fit.

Interestingly, I had to scale y to get it to work!

I saved the predictions from all the models in a csv file.
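
For reference, the neural net fit in the R script looks roughly like the sketch below; the layer sizes, epochs, and batch size are my assumptions, not the exact settings used:

library(keras)
xs = scale(x[iitr]); ys = scale(y[iitr]) ## scale both x and y using the training data
nnmod = keras_model_sequential() %>%
  layer_dense(units=10,activation="relu",input_shape=1) %>%
  layer_dense(units=1)
nnmod %>% compile(loss="mse",optimizer="adam")
nnmod %>% fit(xs,ys,epochs=30,batch_size=100,verbose=0)
xts = (x[-iitr]-attr(xs,'scaled:center'))/attr(xs,'scaled:scale') ## scale test x with training stats
ypnn = predict(nnmod,as.matrix(xts))*attr(ys,'scaled:scale')+attr(ys,'scaled:center') ## undo the y scaling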

pred1 = read.csv("predMat1.csv")
head(pred1)
##   ytest     tree        rf     boost      nnet
## 1  5995 14644.72 13683.875 13509.267 13581.360
## 2 10850 14644.72 16784.924 16041.661 16598.259
## 3 11459 10693.15 10805.517 10811.567 10712.978
## 4 10765 28508.47 28305.843 28745.603 29770.363
## 5  6995 10693.15  9479.976  9028.219  8999.773
## 6 21695 26294.12 26299.427 26782.224 27590.335
plot(x[-iitr],y[-iitr],xlab='mileage',ylab='price')
points(x[-iitr],pred1[,5],col='blue')
title(main='out of sample neural net fit')

rmsenn = sqrt(mean((pred1[,1]-pred1[,5])^2))
cat('out of sample rmse for nnet with mileage: ',rmsenn,'\n')
## out of sample rmse for nnet with mileage:  9679.372
pairs(pred1)

cor(pred1)
##           ytest      tree        rf     boost      nnet
## ytest 1.0000000 0.8447010 0.8490366 0.8482563 0.8503597
## tree  0.8447010 1.0000000 0.9967256 0.9937262 0.9926976
## rf    0.8490366 0.9967256 1.0000000 0.9982879 0.9979401
## boost 0.8482563 0.9937262 0.9982879 1.0000000 0.9970785
## nnet  0.8503597 0.9926976 0.9979401 0.9970785 1.0000000
plot(x,y)
lines(rep(mean(x),2),c(mean(y)-2*rmsetree,mean(y)+2*rmsetree),col='magenta',lwd=3)
lines(rep(mean(x)+3000,2),c(mean(y)-2*rmserf,mean(y)+2*rmserf),col='cyan',lwd=3)
lines(rep(mean(x)+6000,2),c(mean(y)-2*rmsebst,mean(y)+2*rmsebst),col='red',lwd=3)
lines(rep(mean(x)+9000,2),c(mean(y)-2*rmsenn,mean(y)+2*rmsenn),col='blue',lwd=3)
title(main="plus and minus 2rmse")
legend("topright",legend=c('tree','rf','boost','nnet'),col=c('magenta','cyan','red','blue'),lwd=3)

Get data and scale x and y for mileage and year

I start by scaling x = [mileage, year] and y = price.
I actually only need the scaling for the neural nets, which are not in this document but are in the R script.

ytrs = scale(y[iitr])
ytes = (y[-iitr]-attr(ytrs,'scaled:center'))/attr(ytrs,'scaled:scale')

x = cbind(cd$mileage,cd$year)

## train/test
xtr = x[iitr,]
xte = x[-iitr,]

## scale x
xtrs = scale(xtr)
xtes = scale(xte,center=attr(xtrs,'scaled:center'), scale = attr(xtrs,'scaled:scale'))

## let's do everything with the scaled values
ddftr = data.frame(x=xtrs,y=ytrs)
ddfte = data.frame(x=xtes,y=ytes)

Tree with mileage and year

library(rpart)

set.seed(99)
big.tree = rpart(y~.,method="anova",data=ddftr,control=rpart.control(minsplit=5,cp=.0001))
nbig = length(unique(big.tree$where))
cat("size of big tree: ",nbig,"\n")
## size of big tree:  67
plotcp(big.tree)

##################################################
### get nice tree from CV results
iibest = which.min(big.tree$cptable[,"xerror"]) #which has the lowest error
bestcp=big.tree$cptable[iibest,"CP"]
bestsize = big.tree$cptable[iibest,"nsplit"]+1

##################################################
#prune to good tree
best.tree = prune(big.tree,cp=bestcp)
nbest = length(unique(best.tree$where))
cat("size of best tree: ", nbest,"\n")
## size of best tree:  61
##################################################
## in-sample fits
yhat = predict(best.tree)
plot(ytrs,yhat)
abline(0,1,col='red')
title(main='in sample y vs yhat, tree, mileage and year')

#library(rgl)
#plot3d(xtrs[,1],xtrs[,2],yhat)
##################################################
## prediction
ypred = predict(best.tree,ddfte)
ypred = ypred * sd(y[iitr]) + mean(y[iitr])
rmsetree = sqrt(mean((y[-iitr] - ypred)^2))
cat("rmsetree: ",rmsetree,"\n")
## rmsetree:  5547.464
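
To see how the splits divide between mileage and year, rpart stores a variable importance measure (not echoed in the original run):

best.tree$variable.importance ## total improvement in fit attributed to each variable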

Random forests with mileage and year

##################################################
### rf
library(randomForest)

#rffit = randomForest(y~.,data=ddftr,mtry=2,ntree=1000,maxnodes=50)
rffit = randomForest(y~.,data=ddftr,mtry=2,ntree=1000,maxnodes=200)
yhatrf = predict(rffit)
plot(yhatrf,ytrs)
abline(0,1,col='red')
title(main='in sample rf with mileage and year')

## prediction
ypredrf = predict(rffit,ddfte)
ypredrf = ypredrf * sd(y[iitr]) + mean(y[iitr])
rmserf = sqrt(mean((y[-iitr] - ypredrf)^2))
cat("rmserf: ",rmserf,"\n")
## rmserf:  5414.796
temp = cbind(ytest=y[-iitr], ypred, ypredrf)
pairs(temp)

cor(temp)
##             ytest     ypred   ypredrf
## ytest   1.0000000 0.9527960 0.9550571
## ypred   0.9527960 1.0000000 0.9975098
## ypredrf 0.9550571 0.9975098 1.0000000
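
With two predictors it is worth asking which one the forest leans on; randomForest reports a node-purity importance measure (not echoed in the original run):

importance(rffit) ## increase in node purity from splits on each variable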

Boosting with mileage and year

##################################################
#fit using boosting

boostfit = gbm(y~.,data=ddftr,distribution="gaussian",
   interaction.depth=4,n.trees=200,shrinkage=.2)
yhatbst = predict(boostfit,n.trees=200)
plot(yhatbst,ytrs)
abline(0,1,col='red')
title(main='in sample boosting with mileage and year')

ypredbst = predict(boostfit,newdata=ddfte,n.trees=200)
ypredbst = ypredbst * sd(y[iitr]) + mean(y[iitr])
rmsebst = sqrt(mean((y[-iitr] - ypredbst)^2))

cat('out of sample rmse, boosting, mileage and year: ',rmsebst,'\n')
## out of sample rmse, boosting, mileage and year:  5452.455
temp = cbind(ytest=y[-iitr], ypred, ypredrf,ypredbst)
pairs(temp)

cor(temp)
##              ytest     ypred   ypredrf  ypredbst
## ytest    1.0000000 0.9527960 0.9550571 0.9544161
## ypred    0.9527960 1.0000000 0.9975098 0.9960648
## ypredrf  0.9550571 0.9975098 1.0000000 0.9983542
## ypredbst 0.9544161 0.9960648 0.9983542 1.0000000
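
gbm's summary reports the relative influence of each predictor; a quick check (not echoed in the original run):

summary(boostfit,n.trees=200,plotit=FALSE) ## relative influence of mileage and year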

Neural nets with mileage and year

pred2 = read.csv('predDf2.csv')
head(pred2)
##   ytest      tree        rf     boost      nnet
## 1  5995  5179.734  6284.906  6575.473  6953.948
## 2 10850  8522.711  7978.151  8832.464  7899.981
## 3 11459  9010.047  9723.690 10039.285  8095.971
## 4 10765 11793.145 11692.629 11359.929 11072.952
## 5  6995  6744.240  7575.573  7570.186  6852.731
## 6 21695 16873.232 19064.096 18392.332 21828.294
cat('out of sample rmse for nnet with mileage and year: ',sqrt(mean((pred2[,1]-pred2[,5])^2)),'\n')
## out of sample rmse for nnet with mileage and year:  5515.483
pairs(pred2)

cor(pred2)
##           ytest      tree        rf     boost      nnet
## ytest 1.0000000 0.9527960 0.9550571 0.9544161 0.9537173
## tree  0.9527960 1.0000000 0.9975098 0.9960648 0.9937575
## rf    0.9550571 0.9975098 1.0000000 0.9983542 0.9964258
## boost 0.9544161 0.9960648 0.9983542 1.0000000 0.9955748
## nnet  0.9537173 0.9937575 0.9964258 0.9955748 1.0000000
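
The four sets of predictions track each other closely but not perfectly, so a simple average of them is worth a look; a sketch (this ensemble was not part of the original solutions):

predavg = rowMeans(pred2[,2:5]) ## average the tree, rf, boosting, and nnet predictions
cat('out of sample rmse for the average of the four models: ',sqrt(mean((pred2[,1]-predavg)^2)),'\n')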