Problem 1

##################################################
### get data
cd = read.csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv")

scd = cd[,c(1,4,5)]
scd$price = scd$price/1000
scd$mileage = scd$mileage/1000
head(scd)
##    price mileage year
## 1 43.995  36.858 2008
## 2 44.995  46.883 2012
## 3 25.999 108.759 2007
## 4 33.880  35.187 2007
## 5 34.895  48.153 2007
## 6  5.995 121.748 2002
##################################################
## add polynomial terms 

x = as.matrix(scd[,c(2,3)])

xpoly5 = poly(x,degree=5,raw=FALSE) 
dfpol5 = data.frame(y = scd$price, xpoly5)

xpoly10 = poly(x,degree=10,raw=FALSE) 
dfpol10 = data.frame(y = scd$price, xpoly10)

print(dim(xpoly5))
## [1] 1000   20
print(dim(xpoly10))
## [1] 1000   65
print(names(dfpol5))
##  [1] "y"    "X1.0" "X2.0" "X3.0" "X4.0" "X5.0" "X0.1" "X1.1" "X2.1" "X3.1"
## [11] "X4.1" "X0.2" "X1.2" "X2.2" "X3.2" "X0.3" "X1.3" "X2.3" "X0.4" "X1.4"
## [21] "X0.5"
print(names(dfpol10))
##  [1] "y"     "X1.0"  "X2.0"  "X3.0"  "X4.0"  "X5.0"  "X6.0"  "X7.0"  "X8.0" 
## [10] "X9.0"  "X10.0" "X0.1"  "X1.1"  "X2.1"  "X3.1"  "X4.1"  "X5.1"  "X6.1" 
## [19] "X7.1"  "X8.1"  "X9.1"  "X0.2"  "X1.2"  "X2.2"  "X3.2"  "X4.2"  "X5.2" 
## [28] "X6.2"  "X7.2"  "X8.2"  "X0.3"  "X1.3"  "X2.3"  "X3.3"  "X4.3"  "X5.3" 
## [37] "X6.3"  "X7.3"  "X0.4"  "X1.4"  "X2.4"  "X3.4"  "X4.4"  "X5.4"  "X6.4" 
## [46] "X0.5"  "X1.5"  "X2.5"  "X3.5"  "X4.5"  "X5.5"  "X0.6"  "X1.6"  "X2.6" 
## [55] "X3.6"  "X4.6"  "X0.7"  "X1.7"  "X2.7"  "X3.7"  "X0.8"  "X1.8"  "X2.8" 
## [64] "X0.9"  "X1.9"  "X0.10"
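## the names encode the degree in each input: X<i>.<j> is the basis term of
## degree i in mileage and degree j in year. with 2 inputs and all terms of
## total degree <= d, poly() returns choose(d+2,2)-1 columns (the constant is
## dropped), which matches the dimensions above:
nterms = function(d) choose(d+2,2) - 1
print(nterms(5))
## [1] 20
print(nterms(10))
## [1] 65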
##################################################
## simple train/test split

n = nrow(cd)
set.seed(34)
testid = sample(1:n,300)

dfpol5tr = dfpol5[-testid,]
dfpol5te = dfpol5[testid,]

dfpol10tr = dfpol10[-testid,]
dfpol10te = dfpol10[testid,]

print(dim(dfpol5tr))
## [1] 700  21
print(dim(dfpol5te))
## [1] 300  21
print(dim(dfpol10tr))
## [1] 700  66
print(dim(dfpol10te))
## [1] 300  66
##################################################
## try plain linear regression; we suspect the high-degree fit will overfit

lmf5 = lm(y~.,dfpol5tr)
ypred5 = predict(lmf5,dfpol5te)

lmf10 = lm(y~.,dfpol10tr)
ypred10 = predict(lmf10,dfpol10te)

linMte = cbind(dfpol5te$y,ypred5,ypred10)
colnames(linMte) = c('y','ypred5','ypred10')
linMtr = cbind(dfpol5tr$y,lmf5$fitted,lmf10$fitted)
colnames(linMtr) = c('y','yhat5','yhat10')

pairs(linMtr)

pairs(linMte)

## root mean squared error
myrmse = function(y,yp) {
   sqrt(mean((y-yp)^2))
}

cat('lin5 rmse, test: ',myrmse(dfpol5te$y,ypred5),'\n')
## lin5 rmse, test:  5.404617
cat('lin5 rmse, train: ',myrmse(dfpol5tr$y,lmf5$fitted),'\n')
## lin5 rmse, train:  5.443113
cat('lin10 rmse, test: ',myrmse(dfpol10te$y,ypred10),'\n')
## lin10 rmse, test:  23.20774
cat('lin10 rmse, train: ',myrmse(dfpol10tr$y,lmf10$fitted),'\n')
## lin10 rmse, train:  5.150665
## degree 5 actually looks ok, degree 10 is not!

The degree-5 polynomial works well, with similar RMSE on train and test.
The degree-10 polynomial fits train a bit better but is a disaster on test: a classic sign of overfitting.
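To see where overfitting sets in, here is a quick sketch (same split, same myrmse; rmseM, dfd, and fit are just scratch names) that sweeps the polynomial degree from 1 to 10 and records train and test RMSE:

## sketch: train and test RMSE as the polynomial degree grows
rmseM = matrix(NA,10,2,dimnames=list(paste0('deg',1:10),c('train','test')))
for(d in 1:10) {
   dfd = data.frame(y = scd$price, poly(x,degree=d,raw=FALSE))
   fit = lm(y~.,dfd[-testid,])
   rmseM[d,'train'] = myrmse(dfd$y[-testid],fit$fitted)
   rmseM[d,'test']  = myrmse(dfd$y[testid],predict(fit,dfd[testid,]))
}
print(round(rmseM,2))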

Let’s see if we can improve the performance by using the LASSO instead of just running a multiple regression.

We will do 10-fold cross-validation on train and then predict on test using lambda.min.

Check the help on cv.glmnet and glmnet.

The help on glmnet says that by default the training features are standardized (standardize = TRUE) and the coefficients are reported back on the original scale.
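Before running it on the car data, here is a minimal self-contained sketch of that workflow on simulated data (xsim and ysim are made up purely to illustrate the calls; the real fits follow below):

set.seed(1)
library(glmnet)
xsim = matrix(rnorm(100*5),100,5)
ysim = xsim[,1] + rnorm(100)
cvsim = cv.glmnet(xsim,ysim,type.measure="mse",nfolds=10,alpha=1) # standardize=TRUE is the default
psim = predict(cvsim$glmnet.fit,newx=xsim,s=cvsim$lambda.min)     # predict at the CV-chosen lambda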

##################################################
### glmnet on train, predict on test
### can regularization fix the degree-10 version?

library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-4
####################
### poly5

## fit on train
set.seed(14) # Dave Keon
cars.gcv5 = cv.glmnet(xpoly5[-testid,], dfpol5tr$y, type.measure = "mse", nfolds = 10,alpha=1) #alpha = 1 is Lasso
plot(cars.gcv5)

cars.las5 = cars.gcv5$glmnet.fit
plot(cars.las5)

## predict on test
ypredL5 = predict(cars.las5,newx=xpoly5[testid,],s=cars.gcv5$lambda.min)
yhatL5 = predict(cars.las5,newx=xpoly5[-testid,],s=cars.gcv5$lambda.min)


### plot test y vs. the lasso predictions (lasso vs. linear comparison follows below)
par(mfrow=c(1,1))
plot(dfpol5te$y,ypredL5)
abline(0,1,col='red')
title(main='poly5, test y vs. lasso predictions')
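
## it is also worth plotting the lasso test predictions against the plain
## linear ones for degree 5, to see how much the shrinkage actually changed them
plot(ypred5,ypredL5,xlab='linear poly5',ylab='lasso poly5')
abline(0,1,col='red')
title(main='poly5 test predictions: lasso vs. linear')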

####################
### poly10

## fit on train
set.seed(14) # Dave Keon
cars.gcv10 = cv.glmnet(xpoly10[-testid,], dfpol10tr$y, type.measure = "mse", nfolds = 10,alpha=1)
plot(cars.gcv10)

cars.las10 = cars.gcv10$glmnet.fit
plot(cars.las10)

## predictions on test, fitted values on train
ypredL10 = predict(cars.las10,newx=xpoly10[testid,],s=cars.gcv10$lambda.min)
yhatL10 = predict(cars.las10,newx=xpoly10[-testid,],s=cars.gcv10$lambda.min)
plot(dfpol10te$y,ypredL10)
abline(0,1,col='red')
title(main='poly10, test y vs lasso predictions')

cat('test rmse, lin5, lasso5: ',myrmse(dfpol5te$y,ypred5),', ',myrmse(dfpol5te$y,ypredL5),'\n')
## test rmse, lin5, lasso5:  5.404617 ,  5.356217
cat('test rmse, lin10, lasso10: ',myrmse(dfpol10te$y,ypred10),', ',myrmse(dfpol10te$y,ypredL10),'\n')
## test rmse, lin10, lasso10:  23.20774 ,  5.1409

So the degree-5 polynomial with the LASSO gives a test RMSE of 5.36 while the degree-10 polynomial gives 5.14.
Regularization fixed the degree-10 fit: with the LASSO it is now the best model!
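A more conservative choice is lambda.1se, the largest lambda whose CV error is within one standard error of the minimum; it gives a sparser model and is easy to check here (ypredL10.1se is just a scratch name):

## more conservative lambda: lambda.1se instead of lambda.min
ypredL10.1se = predict(cars.las10,newx=xpoly10[testid,],s=cars.gcv10$lambda.1se)
cat('test rmse, lasso10 with lambda.1se: ',myrmse(dfpol10te$y,ypredL10.1se),'\n')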

Problem 2

##################################################
### binary case

y = scd$price
qval = quantile(y,probs=c(.75))
ybin = as.integer(y>qval)  # car is "expensive" if its price exceeds the 75th percentile

ybintr = ybin[-testid]
ybinte = ybin[testid]
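
## by construction about 25% of the cars get label 1; a quick check that the
## train/test split kept roughly that balance
print(table(ybintr))
print(table(ybinte))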
##################################################
## fit on train
set.seed(14) # Dave Keon
lcars.gcv = cv.glmnet(xpoly10[-testid,], ybintr, family = 'binomial', nfolds = 10,alpha=1)

par(mfrow=c(1,1))
plot(lcars.gcv)

lcars.las = lcars.gcv$glmnet.fit
plot(lcars.las)

##################################################
## coefs

coefL = as.matrix(coef(lcars.las,s=lcars.gcv$lambda.min))
cat('number of nonzero coefficients:\n')
## number of nonzero coefficients:
print(sum(coefL != 0))  # count the nonzero coefficients, intercept included
## [1] 8
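## to see which of the basis terms the lasso kept, pull the row names of the
## nonzero entries of the coefficient vector
print(rownames(coefL)[coefL != 0])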
##################################################
### fits and preds

yhatBL = predict(lcars.las,newx=xpoly10[-testid,], type="class",s=lcars.gcv$lambda.min)
yhatBLP = predict(lcars.las,newx=xpoly10[-testid,], type="response",s=lcars.gcv$lambda.min)[,1]
ypredBL = predict(lcars.las,newx=xpoly10[testid,], type="class",s=lcars.gcv$lambda.min)
ypredBLP = predict(lcars.las,newx=xpoly10[testid,], type="response",s=lcars.gcv$lambda.min)[,1]

## type='class' gives 0 or 1
cat('\n first 15 predictions\n')
## 
##  first 15 predictions
print(ypredBL[1:15])
##  [1] "0" "0" "0" "0" "0" "0" "0" "0" "1" "0" "0" "1" "0" "0" "0"
cat('\n first 15 probabilities\n')
## 
##  first 15 probabilities
## type='response' gives p(y=1|x)
ypredBLP[1:15]
##  [1] 0.0041981287 0.0032182730 0.0022946452 0.0009010616 0.2832168316
##  [6] 0.0004787383 0.0002208959 0.0054574411 0.9459707486 0.1307419868
## [11] 0.0013231129 0.9912064940 0.0009351356 0.0006378746 0.2757769290
## check that a prediction of 1 corresponds to P(y=1|x) > .5
print(table(ypredBL,(ypredBLP>.5)))
##        
## ypredBL FALSE TRUE
##       0   232    0
##       1     0   68
print(table(yhatBL,(yhatBLP>.5)))
##       
## yhatBL FALSE TRUE
##      0   526    0
##      1     0  174
## confusion matrices
## test
confte = table(ybinte,ypredBL)
print(confte)
##       ypredBL
## ybinte   0   1
##      0 219   8
##      1  13  60
sum(diag(confte))/length(ybinte)  # test accuracy
## [1] 0.93
## train
conftr = table(ybintr,yhatBL)
print(conftr)
##       yhatBL
## ybintr   0   1
##      0 501  22
##      1  25 152
sum(diag(conftr))/length(ybintr)  # train accuracy
## [1] 0.9328571
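## overall accuracy hides the per-class error rates; from the test confusion
## matrix we can also read off sensitivity and specificity
sens = confte[2,2]/sum(confte[2,])  # true positive rate: 60/73
spec = confte[1,1]/sum(confte[1,])  # true negative rate: 219/227
cat('test sensitivity: ',sens,', specificity: ',spec,'\n')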
## roc
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
## test roc
rocP = roc(ybinte, ypredBLP)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(rocP, main="test ROC Curve", col="blue")

cat('test auc:\n')
## test auc:
print(rocP$auc)
## Area under the curve: 0.9696
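## coords() from pROC can also pull the point on the curve that maximizes
## sensitivity + specificity (Youden's J is pROC's default criterion for "best")
print(coords(rocP,"best",ret=c("threshold","sensitivity","specificity")))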
## lift
mylift = function(y,p) {
   ii = order(p,decreasing=TRUE)  # sort cases by predicted probability, highest first
   ps = cumsum(y[ii])/sum(y)      # cumulative fraction of all 1s captured so far
   return(ps)
}

## test lift
nte = length(ybinte)
plot((1:nte)/nte,mylift(ybinte,ypredBLP),type='l',col='blue',
   xlab='fraction of test cases (sorted by phat)',ylab='fraction of 1s captured')
abline(0,1,col='red',lty=3)
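
## for example, reading the lift curve at 10% gives the fraction of the
## expensive cars captured by the 10% of test cars with the highest phat
liftte = mylift(ybinte,ypredBLP)
cat('fraction of 1s captured in top 10%: ',liftte[ceiling(.1*nte)],'\n')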