Simple Ridge, Lasso
In [42]:
##################################################
### imports
import numpy as np
import pandas as pd
import math
import scipy as sp
import matplotlib.pyplot as plt
#ipython terminal
#%matplotlib
#jupyter notebook
#%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
In [43]:
##################################################
## boston data
bd = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/Boston.csv")
print("*** the type of bd is:")
print(type(bd))
print(bd.head())
## to numpy
bdnp = bd.to_numpy()
y = bdnp[:,-1]
X = bdnp[:,0:-1]
##check
plt.scatter(X[:,-1],y)
*** the type of bd is:
<class 'pandas.core.frame.DataFrame'>
      crim    zn  indus  chas    nox     rm   age     dis  rad  tax  ptratio  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242     17.8
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222     18.7

    black  lstat  medv
0  396.90   4.98  24.0
1  396.90   9.14  21.6
2  392.83   4.03  34.7
3  394.63   2.94  33.4
4  396.90   5.33  36.2
Out[43]:
<matplotlib.collections.PathCollection at 0x7e3ac634fd90>
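The same y/X split can also be done by column name, which is a little less error-prone than positional slicing. A minimal sketch (y_named and X_named are illustrative names; this assumes medv is the response, as in the head() output above):

In [ ]:
## same y/X split by column name (sketch)
y_named = bd['medv'].to_numpy()
X_named = bd.drop(columns='medv').to_numpy()
print(np.allclose(y, y_named), np.allclose(X, X_named))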
In [44]:
##################################################
## scale the features; NOTE: fitting the scaler on the full data set is DATA LEAKAGE
## when the model is later cross-validated -- see the sketch below and the pipeline versions further down.
scl = StandardScaler()
Xs = scl.fit_transform(X)
print("means should be 0, sds should be 1")
print(Xs.mean(axis=0))
print(Xs.std(axis=0))
means should be 0, sds should be 1
[-1.12338772e-16  7.89881994e-17  2.10635198e-16 -3.51058664e-17
 -1.96592852e-16 -1.08828186e-16 -1.47444639e-16 -8.42540793e-17
 -1.12338772e-16  0.00000000e+00 -4.21270397e-16 -7.44244367e-16
 -3.08931624e-16]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
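For contrast, here is a minimal sketch of leak-free scaling done by hand with a train/test split (the split itself is illustrative; the pipeline versions below are the cleaner way):

In [ ]:
## fit the scaler on training data only, then apply it to the test data (sketch)
from sklearn.model_selection import train_test_split
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.25, random_state=34)
scl2 = StandardScaler()
Xtr_s = scl2.fit_transform(Xtr)   # means/sds estimated from the training rows only
Xte_s = scl2.transform(Xte)       # test rows scaled with the training means/sds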
In [45]:
##################################################
## simple regression
lmod = LinearRegression()
lmod.fit(Xs,y)
yhatl = lmod.predict(Xs)
### plot y vs yhat
plt.scatter(y,yhatl)
plt.xlabel('y'); plt.ylabel('yhat')
plt.plot(y,y,c='red',linestyle='dotted')
Out[45]:
[<matplotlib.lines.Line2D at 0x7e3ac63e9360>]
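A quick in-sample fit summary for the linear model is easy to add with sklearn.metrics (in-sample, so optimistic) -- a sketch:

In [ ]:
## in-sample R^2 and mse for the linear fit (sketch)
from sklearn.metrics import mean_squared_error, r2_score
print("in-sample R^2:", r2_score(y, yhatl))
print("in-sample mse:", mean_squared_error(y, yhatl))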
In [46]:
##################################################
## ridge
alphas = np.linspace(start=1,stop=200,num=100)
rcv = RidgeCV(alphas,cv=10)
rcv.fit(Xs,y)
print(rcv.alpha_)
print(rcv.coef_)
yhatr = rcv.predict(Xs)
111.55555555555554
[-0.64141799  0.56122367 -0.41426219  0.73757026 -0.87662851  2.75867167
 -0.17942354 -1.60522101  0.63543557 -0.58289769 -1.63532367  0.76951297
 -2.90233088]
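RidgeCV with cv=10 does not keep the per-alpha CV errors, so to see the CV curve one option (a sketch, not part of the fit above) is to loop over the same alpha grid with cross_val_score:

In [ ]:
## cv mse as a function of alpha for ridge (sketch)
from sklearn.model_selection import cross_val_score
cv_mse = [-cross_val_score(Ridge(alpha=a), Xs, y, cv=10,
                           scoring='neg_mean_squared_error').mean()
          for a in alphas]
plt.scatter(np.log(alphas), cv_mse)
plt.xlabel('log alpha'); plt.ylabel('cv mse')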
In [47]:
##################################################
## lasso
lcv = LassoCV(cv=5)
lcv.fit(Xs,y)
# look at alphas used
print("number of alphas used:",lcv.n_alphas)
pd.Series(lcv.alphas_).describe()
#best alpha and coefficients
print("best alpha: ",lcv.alpha_)
# coefficients
print("coefficients at best alpha: ",lcv.coef_)
print("number of 0 coefficients: ",np.sum(lcv.coef_ == 0))
#fitted values
yhatL = lcv.predict(Xs)
# cv mse: mse_path_ is (n_alphas, n_folds); average over the folds at each alpha
msep = lcv.mse_path_
mses = msep.mean(axis=1)
plt.scatter(np.log(lcv.alphas_),mses)
plt.xlabel('log alpha'); plt.ylabel('mse')
number of alphas used: 100
best alpha:  0.15657258981286368
coefficients at best alpha:  [-0.46536551  0.49862271 -0.07400274  0.6433277  -1.30758222  2.9109174
 -0.         -2.03160307  0.37204988 -0.15577827 -1.84603018  0.7126043
 -3.71977274]
number of 0 coefficients:  1
Out[47]:
Text(0, 0.5, 'mse')
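To see how the lasso coefficients shrink toward zero as alpha grows, a coefficient-path sketch using lasso_path on the same scaled X:

In [ ]:
## lasso coefficient paths (sketch)
from sklearn.linear_model import lasso_path
alphas_path, coefs_path, _ = lasso_path(Xs, y)
for j in range(coefs_path.shape[0]):
    plt.plot(np.log(alphas_path), coefs_path[j, :])
plt.xlabel('log alpha'); plt.ylabel('coefficient')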
In [48]:
##################################################
## look
pyhat = pd.DataFrame({'y':y,'yhatlin':yhatl,'yhatridge':yhatr,'yhatLasso':yhatL})
pyhat.corr()
Out[48]:
|           | y        | yhatlin  | yhatridge | yhatLasso |
|-----------|----------|----------|-----------|-----------|
| y         | 1.000000 | 0.860606 | 0.851472  | 0.853643  |
| yhatlin   | 0.860606 | 1.000000 | 0.989386  | 0.991909  |
| yhatridge | 0.851472 | 0.989386 | 1.000000  | 0.996136  |
| yhatLasso | 0.853643 | 0.991909 | 0.996136  | 1.000000  |
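The correlations above are all in-sample; a quick (and equally optimistic) in-sample RMSE comparison, as a sketch:

In [ ]:
## in-sample rmse for each fit (sketch)
for name, yh in [('linear', yhatl), ('ridge', yhatr), ('lasso', yhatL)]:
    print(name, np.sqrt(np.mean((y - yh)**2)))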
In [49]:
##################################################
### use pipeline to avoid data leakage
# bundle scaling and the lasso fit in one estimator; when the whole pipeline is
# cross-validated (or fit on training data and applied to test data), the scaler
# is refit on the training data only
Pln = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso', LassoCV(
        cv=10,            # 10-fold cross-validation
        random_state=34
    ))
])
## fit/predict
Pln.fit(X, y)
yhatPln = Pln.predict(X)
plt.scatter(y,yhatPln)
plt.plot(y,y,c='red')
Out[49]:
[<matplotlib.lines.Line2D at 0x7e3ac39d93c0>]
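The leakage protection from the pipeline kicks in when the whole pipeline is cross-validated, since the scaler is then refit inside each training fold. A sketch of that outer evaluation:

In [ ]:
## cross-validate the whole (scaler + lasso) pipeline (sketch)
from sklearn.model_selection import cross_val_score
scores = cross_val_score(Pln, X, y, cv=10, scoring='neg_mean_squared_error')
print("cv rmse estimate:", np.sqrt(-scores.mean()))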
In [50]:
##################################################
### use pipeline to avoid data leakage
# same idea for ridge: scaling and RidgeCV bundled in one pipeline
PlnR = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', RidgeCV(
        cv=10             # 10-fold cross-validation
    ))
])
## fit/predict
PlnR.fit(X, y)
yhatPlnR = PlnR.predict(X)
plt.scatter(y,yhatPlnR)
plt.plot(y,y,c='red')
Out[50]:
[<matplotlib.lines.Line2D at 0x7e3abef27d90>]
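make_pipeline (imported above but not used) builds the same thing with auto-generated step names; a minimal sketch that should reproduce the fit:

In [ ]:
## equivalent construction with make_pipeline (sketch); steps get named 'standardscaler', 'ridgecv'
PlnR2 = make_pipeline(StandardScaler(), RidgeCV(cv=10))
PlnR2.fit(X, y)
print(np.allclose(PlnR2.predict(X), yhatPlnR))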
In [51]:
## look
pyhat = pd.DataFrame({'y':y,'yhatlin':yhatl,'yhatridge':yhatr,'yhatLasso':yhatL,'yhatPln':yhatPln,'yhatPlnR':yhatPlnR})
pyhat.corr()
Out[51]:
|           | y        | yhatlin  | yhatridge | yhatLasso | yhatPln  | yhatPlnR |
|-----------|----------|----------|-----------|-----------|----------|----------|
| y         | 1.000000 | 0.860606 | 0.851472  | 0.853643  | 0.854583 | 0.860198 |
| yhatlin   | 0.860606 | 1.000000 | 0.989386  | 0.991909  | 0.993001 | 0.999526 |
| yhatridge | 0.851472 | 0.989386 | 1.000000  | 0.996136  | 0.996190 | 0.993080 |
| yhatLasso | 0.853643 | 0.991909 | 0.996136  | 1.000000  | 0.999960 | 0.994970 |
| yhatPln   | 0.854583 | 0.993001 | 0.996190  | 0.999960  | 1.000000 | 0.995816 |
| yhatPlnR  | 0.860198 | 0.999526 | 0.993080  | 0.994970  | 0.995816 | 1.000000 |
In [52]:
print(Pln.named_steps.keys())
junk = Pln.named_steps['lasso']
print(junk.alpha_)
print(junk.coef_)
dict_keys(['scaler', 'lasso'])
0.14602012128965022
[-0.49642878  0.53758696 -0.05979688  0.64601683 -1.35784426  2.89535011
 -0.         -2.10431451  0.52531783 -0.28422681 -1.86040633  0.72184225
 -3.72077045]
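The same named_steps lookup works for the ridge pipeline; a sketch:

In [ ]:
## selected ridge alpha and coefficients from the pipeline (sketch)
print(PlnR.named_steps.keys())
print(PlnR.named_steps['ridge'].alpha_)
print(PlnR.named_steps['ridge'].coef_)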
In [53]:
##################################################
### compare coefficients
plt.scatter(lmod.coef_,Pln.named_steps['lasso'].coef_,c='green',label='lasso')
plt.scatter(lmod.coef_,PlnR.named_steps['ridge'].coef_,c='blue',label='ridge')
plt.plot(lmod.coef_,lmod.coef_,c='red')
plt.xlabel('linear coefficients'); plt.ylabel('shrunk coefficients')
plt.legend()
Out[53]:
<matplotlib.legend.Legend at 0x7e3abef5f910>
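A quick numerical companion to the plot: the overall size (L2 norm) of each coefficient vector, which should show the shrinkage directly. A sketch:

In [ ]:
## L2 norm of each coefficient vector (sketch)
for name, cf in [('linear', lmod.coef_),
                 ('lasso',  Pln.named_steps['lasso'].coef_),
                 ('ridge',  PlnR.named_steps['ridge'].coef_)]:
    print(name, np.sqrt(np.sum(cf**2)))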