##################################################
### get data
cd = read.csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv")
scd = cd[,c(1,4,5)]
scd$price = scd$price/1000
scd$mileage = scd$mileage/1000
head(scd)
## price mileage year
## 1 43.995 36.858 2008
## 2 44.995 46.883 2012
## 3 25.999 108.759 2007
## 4 33.880 35.187 2007
## 5 34.895 48.153 2007
## 6 5.995 121.748 2002
##################################################
## add polynomial terms
x = as.matrix(scd[,c(2,3)])
xpoly5 = poly(x,degree=5,raw=FALSE)
dfpol5 = data.frame(y = scd$price, xpoly5)
xpoly10 = poly(x,degree=10,raw=FALSE)
dfpol10 = data.frame(y = scd$price, xpoly10)
print(dim(xpoly5))
## [1] 1000 20
print(dim(xpoly10))
## [1] 1000 65
print(names(dfpol5))
## [1] "y" "X1.0" "X2.0" "X3.0" "X4.0" "X5.0" "X0.1" "X1.1" "X2.1" "X3.1"
## [11] "X4.1" "X0.2" "X1.2" "X2.2" "X3.2" "X0.3" "X1.3" "X2.3" "X0.4" "X1.4"
## [21] "X0.5"
print(names(dfpol10))
## [1] "y" "X1.0" "X2.0" "X3.0" "X4.0" "X5.0" "X6.0" "X7.0" "X8.0"
## [10] "X9.0" "X10.0" "X0.1" "X1.1" "X2.1" "X3.1" "X4.1" "X5.1" "X6.1"
## [19] "X7.1" "X8.1" "X9.1" "X0.2" "X1.2" "X2.2" "X3.2" "X4.2" "X5.2"
## [28] "X6.2" "X7.2" "X8.2" "X0.3" "X1.3" "X2.3" "X3.3" "X4.3" "X5.3"
## [37] "X6.3" "X7.3" "X0.4" "X1.4" "X2.4" "X3.4" "X4.4" "X5.4" "X6.4"
## [46] "X0.5" "X1.5" "X2.5" "X3.5" "X4.5" "X5.5" "X0.6" "X1.6" "X2.6"
## [55] "X3.6" "X4.6" "X0.7" "X1.7" "X2.7" "X3.7" "X0.8" "X1.8" "X2.8"
## [64] "X0.9" "X1.9" "X0.10"
##################################################
## simple train/test split
n = nrow(scd)
set.seed(34)
testid = sample(1:n,300)
dfpol5tr = dfpol5[-testid,]
dfpol5te = dfpol5[testid,]
dfpol10tr = dfpol10[-testid,]
dfpol10te = dfpol10[testid,]
print(dim(dfpol5tr))
## [1] 700 21
print(dim(dfpol5te))
## [1] 300 21
print(dim(dfpol10tr))
## [1] 700 66
print(dim(dfpol10te))
## [1] 300 66
##################################################
## try plain least squares first; with this many polynomial terms we expect overfitting
lmf5 = lm(y~.,dfpol5tr)
ypred5 = predict(lmf5,dfpol5te)
lmf10 = lm(y~.,dfpol10tr)
ypred10 = predict(lmf10,dfpol10te)
linMte = cbind(dfpol5te$y,ypred5,ypred10)
colnames(linMte) = c('y','ypred5','ypred10')
linMtr = cbind(dfpol5tr$y,lmf5$fitted,lmf10$fitted)
colnames(linMtr) = c('y','yhat5','yhat10')
pairs(linMtr)
pairs(linMte)
myrmse = function(y,yp) {
sqrt(mean((y-yp)^2))
}
cat('lin5 rmse, test: ',myrmse(dfpol5te$y,ypred5),'\n')
## lin5 rmse, test: 5.404617
cat('lin5 rmse, train: ',myrmse(dfpol5tr$y,lmf5$fitted),'\n')
## lin5 rmse, train: 5.443113
cat('lin10 rmse, test: ',myrmse(dfpol10te$y,ypred10),'\n')
## lin10 rmse, test: 23.20774
cat('lin10 rmse, train: ',myrmse(dfpol10tr$y,lmf10$fitted),'\n')
## lin10 rmse, train: 5.150665
## degree 5 actually looks ok, degree 10 does not!
The degree-5 polynomial works reasonably well, with similar RMSE on train and test.
The degree-10 polynomial is a disaster on test: the train RMSE is fine, but the test RMSE blows up, which is the classic signature of overfitting.
Let's see if we can improve the performance by using the LASSO instead of a plain multiple regression.
We will run 10-fold cross-validation on the training data and then predict on test using lambda.min.
Check the help for cv.glmnet and glmnet; the glmnet help says the training features are standardized by default.
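As a sketch of the pattern we will follow (the helper name lassoCVpred and the placeholder arguments xtr, ytr, xte are ours, not part of the actual fits below, which call cv.glmnet directly), the whole workflow is: cross-validate on train, pull out lambda.min, predict on test; standardize = TRUE just spells out the glmnet default mentioned above.
## illustrative helper only, not used below
lassoCVpred = function(xtr, ytr, xte, seed = 14) {
  set.seed(seed)
  cvfit = cv.glmnet(xtr, ytr, alpha = 1, nfolds = 10, type.measure = "mse", standardize = TRUE)
  as.numeric(predict(cvfit$glmnet.fit, newx = xte, s = cvfit$lambda.min))
}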
##################################################
### glmnet on train, predict on test
### can regularization fix the degree-10 version?
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-4
####################
### poly5
## fit on train
set.seed(14) # Dave Keon
cars.gcv5 = cv.glmnet(xpoly5[-testid,], dfpol5tr$y, type.measure = "mse", nfolds = 10,alpha=1) #alpha = 1 is Lasso
plot(cars.gcv5)
cars.las5 = cars.gcv5$glmnet.fit
plot(cars.las5)
## predictions on test, fitted values on train
ypredL5 = predict(cars.las5,newx=xpoly5[testid,],s=cars.gcv5$lambda.min)
yhatL5 = predict(cars.las5,newx=xpoly5[-testid,],s=cars.gcv5$lambda.min)
### plot y vs yhat (ypred) and lasso vs linear
par(mfrow=c(1,1))
plot(dfpol5te$y,ypredL5)
abline(0,1,col='red')
title(main='poly5, test y vs. lasso predictions')
####################
### poly10
## fit on train
set.seed(14) # Dave Keon
cars.gcv10 = cv.glmnet(xpoly10[-testid,], dfpol10tr$y, type.measure = "mse", nfolds = 10,alpha=1)
plot(cars.gcv10)
cars.las10 = cars.gcv10$glmnet.fit
plot(cars.las10)
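## the default glmnet path plot uses the L1 norm on the x-axis; plotting against
## log(lambda) (the xvar option of plot.glmnet) can be easier to line up with the CV curve
plot(cars.las10, xvar = "lambda")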
## predictions on test, fitted values on train
ypredL10 = predict(cars.las10,newx=xpoly10[testid,],s=cars.gcv10$lambda.min)
yhatL10 = predict(cars.las10,newx=xpoly10[-testid,],s=cars.gcv10$lambda.min)
plot(dfpol10te$y,ypredL10)
abline(0,1,col='red')
title(main='poly10, test y vs lasso predictions')
cat('test rmse, lin5, lasso5: ',myrmse(dfpol5te$y,ypred5),', ',myrmse(dfpol5te$y,ypredL5),'\n')
## test rmse, lin5, lasso5: 5.404617 , 5.356217
cat('test rmse, lin10, lasso10: ',myrmse(dfpol10te$y,ypred10),', ',myrmse(dfpol10te$y,ypredL10),'\n')
## test rmse, lin10, lasso10: 23.20774 , 5.1409
So on test the degree-5 polynomial with the LASSO gives an RMSE of 5.36, while the degree-10 polynomial gives 5.14.
The degree-10 polynomial with the LASSO looks quite good: regularization has fixed the overfitting we saw with plain least squares.
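One way to see why the LASSO rescues the degree-10 fit is to count how many of the 65 coefficients survive at lambda.min; the sketch below reuses the cv objects already fit, so the exact counts depend on the CV folds.
cfL5 = as.matrix(coef(cars.gcv5, s = "lambda.min"))
cfL10 = as.matrix(coef(cars.gcv10, s = "lambda.min"))
cat('nonzero coefficients at lambda.min, poly5 and poly10: ', sum(cfL5 != 0), ', ', sum(cfL10 != 0), '\n')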
##################################################
### binary case
y = scd$price
qval = quantile(y,probs=c(.75))
ybin = as.integer(y>qval) # a car is "expensive" if its price is above the 75th percentile
ybintr = ybin[-testid]
ybinte = ybin[testid]
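## sanity check on the class balance: by construction roughly a quarter of the cars
## should be labeled expensive, in both train and test
cat('fraction expensive, overall/train/test: ', mean(ybin), mean(ybintr), mean(ybinte), '\n')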
##################################################
## fit on train
set.seed(14) # Dave Keon
lcars.gcv = cv.glmnet(xpoly10[-testid,], ybintr, family = 'binomial', nfolds = 10,alpha=1)
par(mfrow=c(1,1))
plot(lcars.gcv)
lcars.las = lcars.gcv$glmnet.fit
plot(lcars.las)
##################################################
## coefs
coefL = as.matrix(coef(lcars.las,s=lcars.gcv$lambda.min))
cat('number of positive coefficients:\n')
## number of positive coefficients:
print(sum(coefL > 0)) # note: sum(coefL != 0) would also count negative coefficients
## [1] 8
##################################################
### fits and preds
yhatBL = predict(lcars.las,newx=xpoly10[-testid,], type="class",s=lcars.gcv$lambda.min)
yhatBLP = predict(lcars.las,newx=xpoly10[-testid,], type="response",s=lcars.gcv$lambda.min)[,1]
ypredBL = predict(lcars.las,newx=xpoly10[testid,], type="class",s=lcars.gcv$lambda.min)
ypredBLP = predict(lcars.las,newx=xpoly10[testid,], type="response",s=lcars.gcv$lambda.min)[,1]
## type='class' gives 0 or 1
cat('\n first 15 predictions\n')
##
## first 15 predictions
print(ypredBL[1:15])
## [1] "0" "0" "0" "0" "0" "0" "0" "0" "1" "0" "0" "1" "0" "0" "0"
cat('\n first 15 probabilities\n')
##
## first 15 probabilities
## type='response' gives p(y=1|x)
ypredBLP[1:15]
## [1] 0.0041981287 0.0032182730 0.0022946452 0.0009010616 0.2832168316
## [6] 0.0004787383 0.0002208959 0.0054574411 0.9459707486 0.1307419868
## [11] 0.0013231129 0.9912064940 0.0009351356 0.0006378746 0.2757769290
## check that predicting 1 corresponds to P(y=1|x) > .5
print(table(ypredBL,(ypredBLP>.5)))
##
## ypredBL FALSE TRUE
## 0 232 0
## 1 0 68
print(table(yhatBL,(yhatBLP>.5)))
##
## yhatBL FALSE TRUE
## 0 526 0
## 1 0 174
## confusion
#test
confte = table(ybinte,ypredBL)
print(confte)
## ypredBL
## ybinte 0 1
## 0 219 8
## 1 13 60
sum(diag(confte))/length(ybinte) # test accuracy
## [1] 0.93
#train
conftr = table(ybintr,yhatBL)
print(conftr)
## yhatBL
## ybintr 0 1
## 0 501 22
## 1 25 152
sum(diag(conftr))/length(ybintr) # train accuracy
## [1] 0.9328571
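Accuracy alone can be misleading here since about 75% of the cars are cheap, so always predicting 0 already gets roughly 0.76 on test. Sensitivity and specificity are easy to read off the test confusion matrix above (rows are true labels, columns are predictions): about 0.82 and 0.96.
## test sensitivity: fraction of truly expensive cars flagged as expensive
cat('test sensitivity: ', confte[2,2]/sum(confte[2,]), '\n')
## test specificity: fraction of truly cheap cars correctly left unflagged
cat('test specificity: ', confte[1,1]/sum(confte[1,]), '\n')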
## roc
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
## test roc
rocP = roc(ybinte, ypredBLP)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(rocP, main="test ROC Curve", col="blue")
cat('test auc:\n')
## test auc:
print(rocP$auc)
## Area under the curve: 0.9696
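For comparison, the same calculation on the training fits (probabilities yhatBLP against ybintr) gives the train AUC; as elsewhere in this script, train and test should be close if we are not overfitting.
rocPtr = roc(ybintr, yhatBLP)
cat('train auc: ', rocPtr$auc, '\n')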
## lift
mylift = function(y,p) {
  ## rank cases by predicted probability, highest first
  ii = order(p,decreasing=TRUE)
  ## cumulative fraction of the 1's captured as we move down the ranked list
  ps = cumsum(y[ii])/sum(y)
  return(ps)
}
## test lift
nte = length(ybinte)
plot((1:nte)/nte,mylift(ybinte,ypredBLP),type='l',col='blue')
abline(0,1,col='red',lty=3)
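For reference, the ideal lift curve (rank the test cars by their true labels, so all the expensive ones come first) gives an upper bound; the model's curve should sit between it and the diagonal, which corresponds to random ordering.
## ideal lift: what we would capture if we could rank the test cars perfectly
lines((1:nte)/nte, mylift(ybinte, ybinte), col='green', lty=2)
legend('bottomright', legend=c('model','random','ideal'), col=c('blue','red','green'), lty=c(1,3,2))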