# Load the used-cars data and keep only price, mileage, and color.
cd <- read.csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv")
cd <- cd[, c("price", "mileage", "color")]
cd$price <- cd$price / 1000     # price in thousands of dollars
cd$mileage <- cd$mileage / 1000 # mileage in thousands of miles
cd$color <- as.factor(cd$color)
cd$mileagesq <- cd$mileage^2    # quadratic term for the mileage effect
head(cd)
## price mileage color mileagesq
## 1 43.995 36.858 Silver 1358.512
## 2 44.995 46.883 Black 2198.016
## 3 25.999 108.759 White 11828.520
## 4 33.880 35.187 Black 1238.125
## 5 34.895 48.153 Black 2318.711
## 6 5.995 121.748 other 14822.576
# Scatter plot of price vs mileage, points colored by car color.
plot(cd$mileage, cd$price, col = cd$color,
     xlab = 'mileage', ylab = 'price', pch = 16, cex = .8)
legend('topright', legend = levels(cd$color), col = 1:4, pch = rep(16, 4))
# Split the data into a 75% training set and a 25% test set, then fit
# the quadratic model price ~ mileage + color + mileagesq on the train set.
set.seed(34)
n <- nrow(cd)
pin <- .75 # fraction of observations used for training (in-sample)
ii <- sample(seq_len(n), floor(pin * n))
cdtr <- cd[ii, ]  # training data
cdte <- cd[-ii, ] # test data
# NOTE(review): the original echoed two contradictory dimension outputs
# ("750 4" and "750 3"); the "3"-column lines were stale output from
# before mileagesq was added, so only the correct ones are kept.
cat("dimension of train data:", dim(cdtr), "\n")
## dimension of train data: 750 4
cat("dimension of test data:", dim(cdte), "\n")
## dimension of test data: 250 4
lmfit <- lm(price ~ ., cdtr)  # quadratic model (uses all columns of cdtr)
yhat <- lmfit$fitted.values   # in-sample fitted values
ypred <- predict(lmfit, cdte) # out-of-sample predictions
Plot in-sample fit.
# In-sample fit: fitted values vs observed training prices.
# The red 45-degree line marks perfect fit.
plot(cdtr$price, yhat,
     xlab = 'train price', ylab = 'yhat=fitted values', col = 'blue')
abline(0, 1, col = 'red')
title(main = 'quadratic model')
Plot out-of-sample fit.
# Out-of-sample fit: predictions vs observed test prices.
# The red 45-degree line marks perfect prediction.
plot(cdte$price, ypred,
     xlab = 'test price', ylab = 'ypred=predicted values', col = 'blue')
abline(0, 1, col = 'red')
title(main = 'quadratic model')
Compute the RMSE on the test data.
# Root mean squared error between observed values y and predictions yhat.
rmsef <- function(y, yhat) {
  sqrt(mean((y - yhat)^2))
}
# Report the quadratic model's test-set RMSE.
cat("rmse for quadratic model is ", rmsef(cdte$price, ypred))
## rmse for quadratic model is 9.22106
Plot the fitted values against mileage, colored by car color.
# Fitted values from the quadratic model vs mileage, colored by car color.
# pch = 16 so the plotted points use the same filled-circle symbol shown in
# the legend (the original used the default open circles, which did not
# match the legend's pch = 16).
plot(cdtr$mileage, yhat, col = cdtr$color, pch = 16)
legend('topright', legend = levels(cd$color), col = 1:4, pch = rep(16, 4), bty = 'n')
lmfit$coefficients
## (Intercept) mileage colorother colorSilver colorWhite mileagesq
## 69.340512321 -0.699519446 -2.715050493 -3.483103303 0.929254968 0.001940603
From the coefficients we can see that other and Silver cars have lower prices than Black cars, while White is about the same — which is consistent with what we see in the plot.
# Alternative model: regress log(price) on mileage and color, then
# exponentiate to get fitted values and predictions back on the price scale.
cdtr$lprice <- log(cdtr$price)
lmfit0 <- lm(lprice ~ mileage + color, cdtr)
yhat0 <- exp(lmfit0$fitted.values)   # in-sample, back on the price scale
ypred0 <- exp(predict(lmfit0, cdte)) # out-of-sample, back on the price scale
Plot in-sample fit.
# In-sample fit for the log-price model, on the price scale.
plot(cdtr$price, yhat0,
     xlab = 'train price', ylab = 'yhat=fitted values', col = 'blue')
abline(0, 1, col = 'red')
title(main = "log price model")
Plot out-of-sample fit.
# Out-of-sample fit for the log-price model, then compare test RMSE
# of the quadratic and log models.
plot(cdte$price, ypred0,
     xlab = 'test price', ylab = 'ypred=predicted values', col = 'blue')
abline(0, 1, col = 'red')
title(main = 'log price model')
cat("rmse for quadratic model is ", rmsef(cdte$price, ypred), "\n")
## rmse for quadratic model is 9.22106
cat("rmse for log model is ", rmsef(cdte$price, ypred0), "\n")
## rmse for log model is 9.32466
Not much difference, let’s compare the predictions.
# Compare the two models' test-set predictions against each other;
# points on the red 45-degree line are predicted identically.
plot(ypred, ypred0,
     xlab = 'quadratic predictions', ylab = 'log y predictions', col = 'blue')
abline(0, 1, col = 'red')
Let’s compare with linear.
# Baseline: a simple linear model of price on mileage only.
lmfit1 <- lm(price ~ mileage, cdtr)
yhat1 <- lmfit1$fitted.values
ypred1 <- predict(lmfit1, cdte)
# Test-set RMSE for all three models.
cat("rmse for quadratic model is ", rmsef(cdte$price, ypred), "\n")
## rmse for quadratic model is 9.22106
cat("rmse for log model is ", rmsef(cdte$price, ypred0), "\n")
## rmse for log model is 9.32466
cat("rmse for linear model is ", rmsef(cdte$price, ypred1), "\n")
## rmse for linear model is 11.06928
# Overlay quadratic (blue triangles) and linear (red crosses) predictions
# against actual test prices; dashed gray line marks perfect prediction.
plot(cdte$price, ypred, col = 'blue', pch = 2,
     ylim = range(c(ypred, ypred1)),
     xlab = 'test price', ylab = 'predictions')
points(cdte$price, ypred1, col = 'red', pch = 4)
legend('topleft', legend = c('quadratic', 'linear'),
       pch = c(2, 4), col = c('blue', 'red'), bty = 'n')
abline(0, 1, col = 'gray', lty = 2)