Read in the data and process it a bit

# Download the used-car data and keep only the columns used below.
cd <- read.csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv")
cd <- cd[, c('price', 'mileage', 'color')]

# Rescale price and mileage to thousands, encode color as a factor,
# and add a squared-mileage column for the quadratic model later on.
cd$price     <- cd$price / 1000
cd$mileage   <- cd$mileage / 1000
cd$color     <- as.factor(cd$color)
cd$mileagesq <- cd$mileage^2
head(cd)
##    price mileage  color mileagesq
## 1 43.995  36.858 Silver  1358.512
## 2 44.995  46.883  Black  2198.016
## 3 25.999 108.759  White 11828.520
## 4 33.880  35.187  Black  1238.125
## 5 34.895  48.153  Black  2318.711
## 6  5.995 121.748  other 14822.576

Plot the data

# Price against mileage, with points colored by the car's color factor.
plot(cd$mileage, cd$price,
     col = cd$color, pch = 16, cex = .8,
     xlab = 'mileage', ylab = 'price')
# Factor levels map to palette colors 1:4, so the legend uses col = 1:4.
legend('topright', legend = levels(cd$color), col = 1:4, pch = rep(16, 4))

Train/Test split

# Reproducible 75/25 train/test split.
set.seed(34)
n = nrow(cd)
pin = .75 #percent train (or percent in-sample)
# seq_len(n) instead of 1:n: identical here, but safe if n were ever 0.
ii = sample(seq_len(n), floor(pin * n))
cdtr = cd[ii,]
cdte = cd[-ii,]
# cd has 4 columns (price, mileage, color, mileagesq) at this point,
# so the expected dimensions are 750 x 4 and 250 x 4.
cat("dimension of train data:",dim(cdtr),"\n")
## dimension of train data: 750 4
cat("dimension of test data:",dim(cdte),"\n")
## dimension of test data: 250 4

Quadratic Model

# Quadratic model: price on mileage, mileagesq, and color (all columns of cdtr).
lmfit <- lm(price ~ ., data = cdtr)
yhat  <- fitted(lmfit)                  # in-sample fitted values
ypred <- predict(lmfit, newdata = cdte) # out-of-sample predictions

Plot in-sample fit.

# In-sample check: fitted values against the observed training prices.
plot(cdtr$price, yhat, col = 'blue',
     xlab = 'train price', ylab = 'yhat=fitted values')
abline(0, 1, col = 'red') # 45-degree line: a perfect fit would sit on it
title(main = 'quadratic model')

Plot out-of-sample fit.

# Out-of-sample check: predictions against the observed test prices.
plot(cdte$price, ypred, col = 'blue',
     xlab = 'test price', ylab = 'ypred=predicted values')
abline(0, 1, col = 'red') # 45-degree reference line
title(main = 'quadratic model')

RMSE on the test set

# Root mean squared error of predictions yhat against observations y.
rmsef <- function(y, yhat) sqrt(mean((y - yhat)^2))
# Report the quadratic model's test-set RMSE.
rmse_quad <- rmsef(cdte$price, ypred)
cat("rmse for quadratic model is ", rmse_quad)
## rmse for quadratic model is  9.22106

Plot fit.

# Fitted values versus mileage, colored by car color.
# pch = 16 (filled circles) so the plotted symbols match the legend,
# which already advertises pch = 16.
plot(cdtr$mileage, yhat, col = cdtr$color, pch = 16)
legend('topright', legend = levels(cd$color), col = 1:4, pch = rep(16, 4), bty = 'n')

# Coefficients: color effects are offsets relative to the baseline level (Black).
lmfit$coefficients
##  (Intercept)      mileage   colorother  colorSilver   colorWhite    mileagesq 
## 69.340512321 -0.699519446 -2.715050493 -3.483103303  0.929254968  0.001940603

From the coefficients we can see that other and Silver have lower prices than Black, while White is about the same as Black — which matches what we see in the plot.

log price model

# Log-price model: regress log(price) on mileage and color (no quadratic term),
# then map fitted values and predictions back to the price scale with exp().
cdtr$lprice <- log(cdtr$price)
lmfit0 <- lm(lprice ~ mileage + color, data = cdtr)
# NOTE(review): plain exp() ignores the lognormal retransformation (smearing)
# bias — presumably intentional for this comparison; confirm if reused.
yhat0  <- exp(fitted(lmfit0))
ypred0 <- exp(predict(lmfit0, newdata = cdte))

Plot in-sample fit.

# In-sample check for the log-price model, back on the price scale.
plot(cdtr$price, yhat0, col = 'blue',
     xlab = 'train price', ylab = 'yhat=fitted values')
abline(0, 1, col = 'red') # 45-degree reference line
title(main = "log price model")

Plot out-of-sample fit.

# Out-of-sample check for the log-price model, back on the price scale.
plot(cdte$price, ypred0, col = 'blue',
     xlab = 'test price', ylab = 'ypred=predicted values')
abline(0, 1, col = 'red') # 45-degree reference line
title(main = 'log price model')

# Test-set RMSE for the two models, side by side.
rmse_quad <- rmsef(cdte$price, ypred)
rmse_log  <- rmsef(cdte$price, ypred0)
cat("rmse for quadratic model is ", rmse_quad, "\n")
## rmse for quadratic model is  9.22106
cat("rmse for log model is ", rmse_log, "\n")
## rmse for log model is  9.32466

Not much difference, let’s compare the predictions.

# Quadratic vs log-model predictions; points on the red line agree exactly.
plot(ypred, ypred0, col = 'blue',
     xlab = 'quadratic predictions', ylab = 'log y predictions')
abline(0, 1, col = 'red')

Let’s compare with linear.

# Baseline: a simple linear model of price on mileage alone.
lmfit1 <- lm(price ~ mileage, data = cdtr)
yhat1  <- fitted(lmfit1)
ypred1 <- predict(lmfit1, newdata = cdte)

# Test-set RMSE for all three models.
cat("rmse for quadratic model is ", rmsef(cdte$price, ypred), "\n")
## rmse for quadratic model is  9.22106
cat("rmse for log model is ", rmsef(cdte$price, ypred0), "\n")
## rmse for log model is  9.32466
cat("rmse for linear model is ", rmsef(cdte$price, ypred1), "\n")
## rmse for linear model is  11.06928

# Overlay quadratic (blue triangles) and linear (red crosses) predictions
# against the observed test prices.
plot(cdte$price, ypred, col = 'blue', pch = 2,
     ylim = range(c(ypred, ypred1)),
     xlab = 'test price', ylab = 'predictions')
points(cdte$price, ypred1, col = 'red', pch = 4)
legend('topleft', legend = c('quadratic', 'linear'),
       pch = c(2, 4), col = c('blue', 'red'), bty = 'n')
abline(0, 1, col = 'gray', lty = 2) # 45-degree reference line