##################################################
## read in data
cd = read.csv("http://www.rob-mcculloch.org/data/usedcars.csv")
print(dim(cd))
## [1] 20063 11
##################################################
## pick off y=price, x=mileage
y = cd$price
x = cd$mileage
## for convenience, collect x and y into a data frame
ddf = data.frame(x,y)
n = nrow(ddf)
##################################################
### simple train/test split
set.seed(34)
ntr = floor(.75*n)
nte = n - ntr
iitr = sample(1:n,ntr)
##################################################
## single tree
library(rpart)
set.seed(99)
big.tree = rpart(y~.,method="anova",data=ddf[iitr,],control=rpart.control(minsplit=5,cp=.0001))
nbig = length(unique(big.tree$where))
cat("size of big tree: ",nbig,"\n")
## size of big tree: 108
##################################################
### look at CV results
## https://stats.stackexchange.com/questions/275652/rpart-cross-validation
## rpart cross-validates automatically; the fold count is not hard coded but
## set by the xval argument of rpart.control, which defaults to 10-fold
plotcp(big.tree)
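## the fold count is the xval argument of rpart.control; a sketch (not run
## here) of the same big tree with 5-fold CV:
# big.tree5 = rpart(y~.,method="anova",data=ddf[iitr,],
#    control=rpart.control(minsplit=5,cp=.0001,xval=5))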
##################################################
### get nice tree from CV results
iibest = which.min(big.tree$cptable[,"xerror"]) # index of the smallest cross-validated error
bestcp=big.tree$cptable[iibest,"CP"]
bestsize = big.tree$cptable[iibest,"nsplit"]+1
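## an alternative to taking the minimum xerror is the 1-SE rule: the smallest
## tree whose xerror is within one standard error of the minimum (computed
## here for comparison, not used below)
thresh = big.tree$cptable[iibest,"xerror"] + big.tree$cptable[iibest,"xstd"]
ii1se = min(which(big.tree$cptable[,"xerror"] <= thresh))
cp1se = big.tree$cptable[ii1se,"CP"]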
##################################################
## prune back to the CV-chosen tree
best.tree = prune(big.tree,cp=bestcp)
nbest = length(unique(best.tree$where))
cat("size of best tree: ", nbest,"\n")
## size of best tree: 26
##################################################
## in-sample fits
yhat = predict(best.tree)
plot(x[iitr],y[iitr],xlab='mileage',ylab='price')
points(x[iitr],yhat,col='blue')
title(main='in sample fit using a single tree')
##################################################
## prediction
ypred = predict(best.tree,ddf[-iitr,])
plot(x[-iitr],y[-iitr],xlab='mileage',ylab='price')
points(x[-iitr],ypred,col='blue')
title('out of sample fit using a single tree')
rmsetree = sqrt(mean((y[-iitr] - ypred)^2))
cat('test rmse for tree: ',rmsetree,'\n')
## test rmse for tree: 9784.738
Note that I actually had to tune the random forest! I did not like the fit at
the default settings, so I used the maxnodes parameter to make sure the
individual "big trees" were not too big (see the OOB sketch after the
maxnodes=50 fit below).
### random forests
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
rffit = randomForest(y~.,data=ddf[iitr,])
yhatrf = predict(rffit)
plot(x[iitr],y[iitr],xlab='mileage',ylab='price')
points(x[iitr],yhatrf,col='blue')
title(main='in sample fit for default random forest')
rffit1 = randomForest(y~.,data=ddf[iitr,],mtry=1,ntree=1000,maxnodes=50)
yhatrf1 = predict(rffit1)
plot(x[iitr],y[iitr],xlab='mileage',ylab='price')
points(x[iitr],yhatrf1,col='blue')
title(main='in sample fit for random forests with maxnodes=50')
plot(rffit1)
title(main='plot of random forests object with maxnodes=50')
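## The plot above shows the OOB error falling as trees are added; rffit1$mse
## is that running OOB estimate, so its last entry is the OOB MSE of the full
## forest. A sketch of the kind of maxnodes comparison that motivated
## maxnodes=50 (the grid below is illustrative, not my exact one):
# sqrt(rffit1$mse[rffit1$ntree])  # OOB rmse of the maxnodes=50 forest
# for(mn in c(25,50,100,200)) {
#    fit = randomForest(y~.,data=ddf[iitr,],mtry=1,ntree=500,maxnodes=mn)
#    cat("maxnodes:",mn," OOB rmse:",sqrt(fit$mse[fit$ntree]),"\n")
# }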
ypredrf1 = predict(rffit1,newdata=ddf[-iitr,])
rmserf = sqrt(mean((y[-iitr] - ypredrf1)^2))
cat('rmse for random forests: ',rmserf,'\n')
## rmse for random forests: 9653.093
### boosting
library(gbm)
## Loaded gbm 2.1.8
#fit using boosting
boostfit = gbm(y~.,data=ddf[iitr,],distribution="gaussian",
interaction.depth=4,n.trees=500,shrinkage=.2)
yhatbst = predict(boostfit,n.trees=500)
plot(x[iitr],y[iitr],xlab='mileage',ylab='price')
points(x[iitr],yhatbst,col='blue')
title(main='in sample boosting 500 trees, shrinkage=.2')
boostfit1 = gbm(y~.,data=ddf[iitr,],distribution="gaussian",
interaction.depth=4,n.trees=100,shrinkage=.2)
yhatbst1 = predict(boostfit1,n.trees=100)
plot(x[iitr],y[iitr],xlab='mileage',ylab='price')
points(x[iitr],yhatbst1,col='blue')
title(main='in sample boosting 100 trees, shrinkage=.2')
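## rather than eyeballing 500 vs 100 trees, gbm can cross-validate the tree
## count; a sketch (the cv.folds value is my choice):
# boostcv = gbm(y~.,data=ddf[iitr,],distribution="gaussian",
#    interaction.depth=4,n.trees=500,shrinkage=.2,cv.folds=5)
# bestiter = gbm.perf(boostcv,method="cv")  # CV-optimal number of trees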
## note: predicting from the 500-tree fit with n.trees=100 uses just its first 100 trees
ypredbst1 = predict(boostfit,newdata=ddf[-iitr,],n.trees=100)
rmsebst = sqrt(mean((y[-iitr] - ypredbst1)^2))
cat('boosting rmse with mileage: ',rmsebst,'\n')
## boosting rmse with mileage: 9675.402
R Markdown would not use my keras install, so see the R script solutions for
the neural net fit. Interestingly, I had to scale y to get it to work!
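For reference, a minimal sketch of a keras fit on scaled data; the
architecture, epochs, and batch size below are illustrative assumptions, not
my exact settings (those are in the R script).
# library(keras)
# ys = scale(y[iitr])   # scaling y was the key step
# xs = scale(x[iitr])
# xtes = scale(matrix(x[-iitr],ncol=1),center=attr(xs,"scaled:center"),
#    scale=attr(xs,"scaled:scale"))
# nn = keras_model_sequential() %>%
#    layer_dense(units=10,activation="relu",input_shape=1) %>%  # assumed width
#    layer_dense(units=1)
# nn %>% compile(loss="mse",optimizer="adam")
# nn %>% fit(xs,ys,epochs=50,batch_size=100,verbose=0)  # assumed values
# yprednn = predict(nn,xtes)*attr(ys,"scaled:scale") + attr(ys,"scaled:center")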
I saved the predictions from all the models in a csv file.
pred1 = read.csv("predMat1.csv")
head(pred1)
##   ytest     tree        rf     boost      nnet
## 1  5995 14644.72 13683.875 13509.267 13581.360
## 2 10850 14644.72 16784.924 16041.661 16598.259
## 3 11459 10693.15 10805.517 10811.567 10712.978
## 4 10765 28508.47 28305.843 28745.603 29770.363
## 5  6995 10693.15  9479.976  9028.219  8999.773
## 6 21695 26294.12 26299.427 26782.224 27590.335
plot(x[-iitr],y[-iitr],xlab='mileage',ylab='price')
points(x[-iitr],pred1[,5],col='blue')
title(main='out of sample neural net fit')
rmsenn = sqrt(mean((pred1[,1]-pred1[,5])^2))
cat('out of sample rmse for nnet with mileage: ',rmsenn,'\n')
## out of sample rmse for nnet with mileage: 9679.372
pairs(pred1)
cor(pred1)
##           ytest      tree        rf     boost      nnet
## ytest 1.0000000 0.8447010 0.8490366 0.8482563 0.8503597
## tree  0.8447010 1.0000000 0.9967256 0.9937262 0.9926976
## rf    0.8490366 0.9967256 1.0000000 0.9982879 0.9979401
## boost 0.8482563 0.9937262 0.9982879 1.0000000 0.9970785
## nnet  0.8503597 0.9926976 0.9979401 0.9970785 1.0000000
plot(x,y)
lines(rep(mean(x),2),c(mean(y)-2*rmsetree,mean(y)+2*rmsetree),col='magenta',lwd=3)
lines(rep(mean(x)+3000,2),c(mean(y)-2*rmserf,mean(y)+2*rmserf),col='cyan',lwd=3)
lines(rep(mean(x)+6000,2),c(mean(y)-2*rmsebst,mean(y)+2*rmsebst),col='red',lwd=3)
lines(rep(mean(x)+9000,2),c(mean(y)-2*rmsenn,mean(y)+2*rmsenn),col='blue',lwd=3)
title(main="plus and minus 2rmse")
legend("topright",legend=c('tree','rf','boost','nnet'),col=c('magenta','cyan','red','blue'),lwd=rep(4,3))
I start by scaling x = [mileage, year] and y = price.
I actually only need the scaling for the neural nets, which are not in this
document but are in the R script.
ytrs = scale(y[iitr])
ytes = (y[-iitr]-attr(ytrs,'scaled:center'))/attr(ytrs,'scaled:scale')
## x is now a matrix with two columns: mileage and year
x = cbind(cd$mileage,cd$year)
## train/test
xtr = x[iitr,]
xte = x[-iitr,]
## scale x
xtrs = scale(xtr)
xtes = scale(xte,center=attr(xtrs,'scaled:center'), scale = attr(xtrs,'scaled:scale'))
## let's do everything with the scaled values
ddftr = data.frame(x=xtrs,y=ytrs)
ddfte = data.frame(x=xtes,y=ytes)
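## predictions from models fit on the scaled y come back on that scale; a
## small helper (my own convenience function, equivalent to the inline
## unscaling used below) to map them back to dollars:
unscale_y = function(ys) ys*attr(ytrs,'scaled:scale') + attr(ytrs,'scaled:center')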
library(rpart)
set.seed(99)
big.tree = rpart(y~.,method="anova",data=ddftr,control=rpart.control(minsplit=5,cp=.0001))
nbig = length(unique(big.tree$where))
cat("size of big tree: ",nbig,"\n")
## size of big tree: 67
plotcp(big.tree)
##################################################
### get nice tree from CV results
iibest = which.min(big.tree$cptable[,"xerror"]) # index of the smallest cross-validated error
bestcp=big.tree$cptable[iibest,"CP"]
bestsize = big.tree$cptable[iibest,"nsplit"]+1
##################################################
## prune back to the CV-chosen tree
best.tree = prune(big.tree,cp=bestcp)
nbest = length(unique(best.tree$where))
cat("size of best tree: ", nbest,"\n")
## size of best tree: 61
##################################################
## in-sample fits
yhat = predict(best.tree)
plot(ytrs,yhat)
abline(0,1,col='red')
title(main='in sample yhat vs y, tree, mileage and year')
#library(rgl)
#plot3d(xtrs[,1],xtrs[,2],yhat)
##################################################
## prediction
ypred = predict(best.tree,ddfte)
ypred = ypred * sd(y[iitr]) + mean(y[iitr])
rmsetree = sqrt(mean((y[-iitr] - ypred)^2))
cat("rmsetree: ",rmsetree,"\n")
## rmsetree: 5547.464
##################################################
### rf
library(randomForest)
#rffit = randomForest(y~.,data=ddftr,mtry=2,ntree=1000,maxnodes=50)
rffit = randomForest(y~.,data=ddftr,mtry=2,ntree=1000,maxnodes=200)
yhatrf = predict(rffit)
plot(yhatrf,ytrs)
abline(0,1,col='red')
title(main='in sample rf with mileage and year')
## prediction
ypredrf = predict(rffit,ddfte)
ypredrf = ypredrf * sd(y[iitr]) + mean(y[iitr])
rmserf = sqrt(mean((y[-iitr] - ypredrf)^2))
cat("rmserf: ",rmserf,"\n")
## rmserf: 5414.796
temp = cbind(ytest=y[-iitr], ypred, ypredrf)
pairs(temp)
cor(temp)
##             ytest     ypred   ypredrf
## ytest   1.0000000 0.9527960 0.9550571
## ypred   0.9527960 1.0000000 0.9975098
## ypredrf 0.9550571 0.9975098 1.0000000
##################################################
#fit using boosting
boostfit = gbm(y~.,data=ddftr,distribution="gaussian",
interaction.depth=4,n.trees=200,shrinkage=.2)
yhatbst = predict(boostfit,n.trees=200)
plot(yhatbst,ytrs)
abline(0,1,col='red')
title(main='in sample boosting with mileage and year')
ypredbst = predict(boostfit,newdata=ddfte,n.trees=200)
ypredbst = ypredbst * sd(y[iitr]) + mean(y[iitr])
rmsebst = sqrt(mean((y[-iitr] - ypredbst)^2))
cat('out of sample rmse, boosting, mileage and year: ',rmsebst,'\n')
## out of sample rmse, boosting, mileage and year: 5452.455
temp = cbind(ytest=y[-iitr], ypred, ypredrf, ypredbst)
pairs(temp)
cor(temp)
##              ytest     ypred   ypredrf  ypredbst
## ytest    1.0000000 0.9527960 0.9550571 0.9544161
## ypred    0.9527960 1.0000000 0.9975098 0.9960648
## ypredrf  0.9550571 0.9975098 1.0000000 0.9983542
## ypredbst 0.9544161 0.9960648 0.9983542 1.0000000
pred2 = read.csv('predDf2.csv')
head(pred2)
##   ytest      tree        rf     boost      nnet
## 1  5995  5179.734  6284.906  6575.473  6953.948
## 2 10850  8522.711  7978.151  8832.464  7899.981
## 3 11459  9010.047  9723.690 10039.285  8095.971
## 4 10765 11793.145 11692.629 11359.929 11072.952
## 5  6995  6744.240  7575.573  7570.186  6852.731
## 6 21695 16873.232 19064.096 18392.332 21828.294
cat('out of sample rmse for nnet with mileage and year: ',sqrt(mean((pred2[,1]-pred2[,5])^2)))
## out of sample rmse for nnet with mileage and year: 5515.483
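## with ytest in column 1 and the model predictions in columns 2-5, all four
## test rmse values can be computed at once; a one-line sketch:
# sapply(pred2[,-1],function(p) sqrt(mean((pred2$ytest-p)^2)))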
pairs(pred2)
cor(pred2)
##           ytest      tree        rf     boost      nnet
## ytest 1.0000000 0.9527960 0.9550571 0.9544161 0.9537173
## tree  0.9527960 1.0000000 0.9975098 0.9960648 0.9937575
## rf    0.9550571 0.9975098 1.0000000 0.9983542 0.9964258
## boost 0.9544161 0.9960648 0.9983542 1.0000000 0.9955748
## nnet  0.9537173 0.9937575 0.9964258 0.9955748 1.0000000