Simple Ridge, Lasso
In [42]:
##################################################
### imports
import numpy as np
import pandas as pd
import math
import scipy as sp
import matplotlib.pyplot as plt
#ipython terminal
#%matplotlib
#jupyter notebook
#%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
In [43]:
##################################################
## boston data
bd = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/Boston.csv")
print("*** the type of bd is:")
print(type(bd))
print(bd.head())
## to numpy
bdnp = bd.to_numpy()
y = bdnp[:,-1]
X = bdnp[:,0:-1]
##check
plt.scatter(X[:,-1],y)
*** the type of bd is:
<class 'pandas.core.frame.DataFrame'>
      crim    zn  indus  chas    nox     rm   age     dis  rad  tax  ptratio  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242     17.8
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222     18.7

    black  lstat  medv
0  396.90   4.98  24.0
1  396.90   9.14  21.6
2  392.83   4.03  34.7
3  394.63   2.94  33.4
4  396.90   5.33  36.2
Out[43]:
<matplotlib.collections.PathCollection at 0x7e3ac634fd90>
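The same y/X split can also be done by column name, which is a little less error-prone than positional slicing. A minimal sketch (y_named and X_named are illustrative names; this assumes medv is the response, as in the head() output above):

In [ ]:
## same y/X split by column name (sketch)
y_named = bd['medv'].to_numpy()
X_named = bd.drop(columns='medv').to_numpy()
print(np.allclose(y, y_named), np.allclose(X, X_named))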
In [44]:
##################################################
## scale the features; NOTE: fitting the scaler on the full data set is DATA LEAKAGE
## when the model is later cross-validated -- see the sketch below and the pipeline versions further down.
scl = StandardScaler()
Xs = scl.fit_transform(X)
print("means should be 0, sds should be 1")
print(Xs.mean(axis=0))
print(Xs.std(axis=0))
means should be 0, sds should be 1
[-1.12338772e-16  7.89881994e-17  2.10635198e-16 -3.51058664e-17
 -1.96592852e-16 -1.08828186e-16 -1.47444639e-16 -8.42540793e-17
 -1.12338772e-16  0.00000000e+00 -4.21270397e-16 -7.44244367e-16
 -3.08931624e-16]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
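For contrast, here is a minimal sketch of leak-free scaling done by hand with a train/test split (the split itself is illustrative; the pipeline versions below are the cleaner way):

In [ ]:
## fit the scaler on training data only, then apply it to the test data (sketch)
from sklearn.model_selection import train_test_split
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.25, random_state=34)
scl2 = StandardScaler()
Xtr_s = scl2.fit_transform(Xtr)   # means/sds estimated from the training rows only
Xte_s = scl2.transform(Xte)       # test rows scaled with the training means/sds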
In [45]:
##################################################
## simple regression
lmod = LinearRegression()
lmod.fit(Xs,y)
yhatl = lmod.predict(Xs)
### plot y vs yhat
plt.scatter(y,yhatl)
plt.xlabel('y'); plt.ylabel('yhat')
plt.plot(y,y,c='red',linestyle='dotted')
Out[45]:
[<matplotlib.lines.Line2D at 0x7e3ac63e9360>]
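A quick in-sample fit summary for the linear model is easy to add with sklearn.metrics (in-sample, so optimistic) -- a sketch:

In [ ]:
## in-sample R^2 and mse for the linear fit (sketch)
from sklearn.metrics import mean_squared_error, r2_score
print("in-sample R^2:", r2_score(y, yhatl))
print("in-sample mse:", mean_squared_error(y, yhatl))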
In [46]:
##################################################
## ridge
alphas = np.linspace(start=1,stop=200,num=100)
rcv = RidgeCV(alphas,cv=10)
rcv.fit(Xs,y)
print(rcv.alpha_)
print(rcv.coef_)
yhatr = rcv.predict(Xs)
111.55555555555554
[-0.64141799  0.56122367 -0.41426219  0.73757026 -0.87662851  2.75867167
 -0.17942354 -1.60522101  0.63543557 -0.58289769 -1.63532367  0.76951297
 -2.90233088]
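RidgeCV with cv=10 does not keep the per-alpha CV errors, so to see the CV curve one option (a sketch, not part of the fit above) is to loop over the same alpha grid with cross_val_score:

In [ ]:
## cv mse as a function of alpha for ridge (sketch)
from sklearn.model_selection import cross_val_score
cv_mse = [-cross_val_score(Ridge(alpha=a), Xs, y, cv=10,
                           scoring='neg_mean_squared_error').mean()
          for a in alphas]
plt.scatter(np.log(alphas), cv_mse)
plt.xlabel('log alpha'); plt.ylabel('cv mse')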
In [47]:
##################################################
## lasso
lcv = LassoCV(cv=5)
lcv.fit(Xs,y)
# look at alphas used
print("number of alphas used:",lcv.n_alphas)
pd.Series(lcv.alphas_).describe()
#best alpha and coefficients
print("best alpha: ",lcv.alpha_)
# coefficients
print("coefficients at best alpha: ",lcv.coef_)
print("number of 0 coefficients: ",np.sum(lcv.coef_ == 0))
#fitted values
yhatL = lcv.predict(Xs)
# cv mse: mse_path_ is (n_alphas, n_folds); average over the folds at each alpha
msep = lcv.mse_path_
mses = msep.mean(axis=1)
plt.scatter(np.log(lcv.alphas_),mses)
plt.xlabel('log alpha'); plt.ylabel('mse')
number of alphas used: 100
best alpha:  0.15657258981286368
coefficients at best alpha:  [-0.46536551  0.49862271 -0.07400274  0.6433277  -1.30758222  2.9109174
 -0.         -2.03160307  0.37204988 -0.15577827 -1.84603018  0.7126043
 -3.71977274]
number of 0 coefficients:  1
Out[47]:
Text(0, 0.5, 'mse')
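To see how the lasso coefficients shrink toward zero as alpha grows, a coefficient-path sketch using lasso_path on the same scaled X:

In [ ]:
## lasso coefficient paths (sketch)
from sklearn.linear_model import lasso_path
alphas_path, coefs_path, _ = lasso_path(Xs, y)
for j in range(coefs_path.shape[0]):
    plt.plot(np.log(alphas_path), coefs_path[j, :])
plt.xlabel('log alpha'); plt.ylabel('coefficient')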
In [48]:
##################################################
## look
pyhat = pd.DataFrame({'y':y,'yhatlin':yhatl,'yhatridge':yhatr,'yhatLasso':yhatL})
pyhat.corr()
Out[48]:
|           | y        | yhatlin  | yhatridge | yhatLasso |
|-----------|----------|----------|-----------|-----------|
| y         | 1.000000 | 0.860606 | 0.851472  | 0.853643  |
| yhatlin   | 0.860606 | 1.000000 | 0.989386  | 0.991909  |
| yhatridge | 0.851472 | 0.989386 | 1.000000  | 0.996136  |
| yhatLasso | 0.853643 | 0.991909 | 0.996136  | 1.000000  |
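The correlations above are all in-sample; a quick (and equally optimistic) in-sample RMSE comparison, as a sketch:

In [ ]:
## in-sample rmse for each fit (sketch)
for name, yh in [('linear', yhatl), ('ridge', yhatr), ('lasso', yhatL)]:
    print(name, np.sqrt(np.mean((y - yh)**2)))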
In [49]:
##################################################
### use pipeline to avoid data leakage
# bundle scaling and the lasso fit in one estimator; when the whole pipeline is
# cross-validated (or fit on training data and applied to test data), the scaler
# is refit on the training data only
Pln = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso', LassoCV(
        cv=10,            # 10-fold cross-validation
        random_state=34
    ))
])
## fit/predict
Pln.fit(X, y)
yhatPln = Pln.predict(X)
plt.scatter(y,yhatPln)
plt.plot(y,y,c='red')
Out[49]:
[<matplotlib.lines.Line2D at 0x7e3ac39d93c0>]
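The leakage protection from the pipeline kicks in when the whole pipeline is cross-validated, since the scaler is then refit inside each training fold. A sketch of that outer evaluation:

In [ ]:
## cross-validate the whole (scaler + lasso) pipeline (sketch)
from sklearn.model_selection import cross_val_score
scores = cross_val_score(Pln, X, y, cv=10, scoring='neg_mean_squared_error')
print("cv rmse estimate:", np.sqrt(-scores.mean()))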
In [50]:
##################################################
### use pipeline to avoid data leakage
# same idea for ridge: scaling and RidgeCV bundled in one pipeline
PlnR = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', RidgeCV(
        cv=10             # 10-fold cross-validation
    ))
])
## fit/predict
PlnR.fit(X, y)
yhatPlnR = PlnR.predict(X)
plt.scatter(y,yhatPlnR)
plt.plot(y,y,c='red')
Out[50]:
[<matplotlib.lines.Line2D at 0x7e3abef27d90>]
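make_pipeline (imported above but not used) builds the same thing with auto-generated step names; a minimal sketch that should reproduce the fit:

In [ ]:
## equivalent construction with make_pipeline (sketch); steps get named 'standardscaler', 'ridgecv'
PlnR2 = make_pipeline(StandardScaler(), RidgeCV(cv=10))
PlnR2.fit(X, y)
print(np.allclose(PlnR2.predict(X), yhatPlnR))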
In [51]:
## look
pyhat = pd.DataFrame({'y':y,'yhatlin':yhatl,'yhatridge':yhatr,'yhatLasso':yhatL,'yhatPln':yhatPln,'yhatPlnR':yhatPlnR})
pyhat.corr()
Out[51]:
|           | y        | yhatlin  | yhatridge | yhatLasso | yhatPln  | yhatPlnR |
|-----------|----------|----------|-----------|-----------|----------|----------|
| y         | 1.000000 | 0.860606 | 0.851472  | 0.853643  | 0.854583 | 0.860198 |
| yhatlin   | 0.860606 | 1.000000 | 0.989386  | 0.991909  | 0.993001 | 0.999526 |
| yhatridge | 0.851472 | 0.989386 | 1.000000  | 0.996136  | 0.996190 | 0.993080 |
| yhatLasso | 0.853643 | 0.991909 | 0.996136  | 1.000000  | 0.999960 | 0.994970 |
| yhatPln   | 0.854583 | 0.993001 | 0.996190  | 0.999960  | 1.000000 | 0.995816 |
| yhatPlnR  | 0.860198 | 0.999526 | 0.993080  | 0.994970  | 0.995816 | 1.000000 |
In [52]:
print(Pln.named_steps.keys())
junk = Pln.named_steps['lasso']
print(junk.alpha_)
print(junk.coef_)
dict_keys(['scaler', 'lasso'])
0.14602012128965022
[-0.49642878  0.53758696 -0.05979688  0.64601683 -1.35784426  2.89535011
 -0.         -2.10431451  0.52531783 -0.28422681 -1.86040633  0.72184225
 -3.72077045]
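The same named_steps lookup works for the ridge pipeline; a sketch:

In [ ]:
## selected ridge alpha and coefficients from the pipeline (sketch)
print(PlnR.named_steps.keys())
print(PlnR.named_steps['ridge'].alpha_)
print(PlnR.named_steps['ridge'].coef_)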
In [53]:
##################################################
### compare coefficients
plt.scatter(lmod.coef_,Pln.named_steps['lasso'].coef_,c='green',label='lasso')
plt.scatter(lmod.coef_,PlnR.named_steps['ridge'].coef_,c='blue',label='ridge')
plt.plot(lmod.coef_,lmod.coef_,c='red')
plt.xlabel('linear coefficients'); plt.ylabel('shrunk coefficients')
plt.legend()
Out[53]:
<matplotlib.legend.Legend at 0x7e3abef5f910>
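A quick numerical companion to the plot: the overall size (L2 norm) of each coefficient vector, which should show the shrinkage directly. A sketch:

In [ ]:
## L2 norm of each coefficient vector (sketch)
for name, cf in [('linear', lmod.coef_),
                 ('lasso',  Pln.named_steps['lasso'].coef_),
                 ('ridge',  PlnR.named_steps['ridge'].coef_)]:
    print(name, np.sqrt(np.sum(cf**2)))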