Note that in addition to our usual imports we have Ridge, RidgeCV, Lasso, and LassoCV. You can tell by the names what they are for!!
### imports
import numpy as np
import pandas as pd
import math
import scipy as sp
import matplotlib.pyplot as plt
plt.style.use('seaborn')
#ipython terminal
#%matplotlib
#jupyter notebook
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
Let's use the Boston housing data, but we will use all the features!
We can just pull it out of sklearn.
## boston data
from sklearn.datasets import load_boston
boston = load_boston()
features = boston.data
y = boston.target
Let's scale our x variables.
We typically do this when we use L1 or L2 regularization since the penalty treats all
the coefficients for the different features (the X's) the same.
Note that I am using all the data to do the scaling.
In principle this is incorrect, since when I do CV (cross-validation) below it means
I have already used all the data to pick a part of my modeling process, so nothing
is truly out of sample.
I am assuming that the scaling I would get from 9/10 of the data is about the same as
the scaling I get from all the data, since I am going to do 10-fold CV.
##################################################
## scale
scl = StandardScaler()
X = scl.fit_transform(features)
print("means should be 0, sds should be 1")
print(X.mean(axis=0))
print(X.std(axis=0))
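If we wanted to avoid that leakage entirely, we could put the scaler inside a pipeline so it is re-fit on the training folds only. A minimal sketch, assuming sklearn's Pipeline and cross_val_score (the alpha value here is just an illustrative choice, not a tuned one):
## sketch: keep the scaling inside each CV fold (no leakage)
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
pipe = Pipeline([('scale', StandardScaler()), ('ridge', Ridge(alpha=10.0))])  # alpha chosen only for illustration
cvmse = -cross_val_score(pipe, features, y, cv=10, scoring='neg_mean_squared_error')
print("10-fold CV mse with in-fold scaling:", cvmse.mean())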
##################################################
## simple regression
lmod = LinearRegression()
lmod.fit(X,y)
yhatl = lmod.predict(X)
### plot y vs yhat
plt.scatter(y,yhatl)
plt.xlabel('y'); plt.ylabel('yhat')
plt.plot(y,y,c='red',linestyle='dotted')
plt.title('y vs. yhat for linear regression, in-sample, all data')
Let's try ridge regression with 10-fold cross validation.
The "alphas" are the $\lambda$ values.
That is, they control the size of the L2 complexity penalty.
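For reference, in sklearn's parameterization ridge regression chooses the coefficients to minimize the penalized sum of squares, with alpha playing the role of $\lambda$:

$$\min_{\beta}\; \|y - X\beta\|_2^2 + \alpha \|\beta\|_2^2$$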
##################################################
## ridge
alphas = np.linspace(start=1,stop=200,num=100)
rcv = RidgeCV(alphas,cv=10)
rcv.fit(X,y)
print(rcv.alpha_)
print(rcv.coef_)
yhatr = rcv.predict(X)
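Before moving on, a quick sketch (not part of the fits above) comparing the ridge coefficients to the plain least squares coefficients shows the shrinkage from the L2 penalty:
## sketch: compare OLS and ridge coefficients to see the shrinkage
ii = np.arange(X.shape[1])
plt.scatter(ii, lmod.coef_, label='linear regression')
plt.scatter(ii, rcv.coef_, label='ridge')
plt.axhline(0.0, c='red', linestyle='dotted')
plt.xlabel('feature index'); plt.ylabel('coefficient')
plt.legend()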
Now let's try the LASSO.
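In sklearn's parameterization the LASSO minimizes (note the $1/(2n)$ scaling on the squared-error term, so the LASSO alphas are not on the same scale as the ridge ones):

$$\min_{\beta}\; \frac{1}{2n} \|y - X\beta\|_2^2 + \alpha \|\beta\|_1$$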
##################################################
## lasso
lcv = LassoCV(cv=10)
lcv.fit(X,y)
# look at alphas used
print("number of alphas used:",lcv.n_alphas)
pd.Series(lcv.alphas_).describe()
plt.plot(lcv.alphas_)
#best alpha and coefficients
print("best alpha: ",lcv.alpha_)
#coefficients
print("coefficients at best alpha: ",lcv.coef_)
print("number of 0 coefficients: ",np.sum(lcv.coef_ == 0))
#fitted values
yhatL = lcv.predict(X)
# cv mse: mse_path_ has one row per alpha and one column per fold
msep = lcv.mse_path_
mses = msep.sum(axis=1)  # total mse across the 10 folds for each alpha
plt.scatter(lcv.alphas_,mses)
plt.xlabel('alpha'); plt.ylabel('mse')
Let's compare the fits from the three different approaches: linear regression, ridge, and LASSO.
##################################################
## look
pyhat = pd.DataFrame({'y':y,'yhatl':yhatl,'yhatr':yhatr,'yhatLasso':yhatL})
pyhat.corr()
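Beyond the correlations, a quick sketch of the in-sample root mean squared error for each fit (still in-sample on all the data, so this measures fit, not prediction):
## sketch: in-sample rmse for each fit
for nm, yh in [('linear', yhatl), ('ridge', yhatr), ('lasso', yhatL)]:
    print(nm, "in-sample rmse:", math.sqrt(np.mean((y - yh)**2)))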
Let's look at the "path" of a particular coefficient as alpha (the amount of shrinkage) varies for the LASSO fit.
# Lasso path
Lpath = lcv.path(X,y)  # returns (alphas, coefficient paths, dual gaps)
for i in range(3):
    print(Lpath[i].shape)
kk = 0 ## choose which coefficient to plot
plt.scatter(Lpath[0],Lpath[1][kk,:])
plt.xlabel('alpha'); plt.ylabel('coefficient')
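To see all the coefficient paths at once, a sketch that simply plots every row of the coefficient matrix returned by path:
## sketch: plot every lasso coefficient path
for k in range(Lpath[1].shape[0]):
    plt.plot(Lpath[0], Lpath[1][k,:])
plt.xlabel('alpha'); plt.ylabel('coefficient')
plt.title('lasso coefficient paths')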