Hw3 Solutions, Spring 25¶

For both problem 1 and 2, we will do a train/test split and then do 10-fold cross-validation on the train.
Then predict on test.

In [1]:
## imports

##################################################
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LogisticRegressionCV

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

#mean_squared_error(y_true, y_pred, squared=False) # gets rmse

## lift curve
def mylift(y,p):
   """lift"""
   ii = np.argsort(p)[::-1]
   ps = np.cumsum(y[ii])/np.sum(y)
   return(ps)

Problem 1¶

In [2]:
### read in data and get y=price x=(mileage,year)

cd = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv")

cd = cd[['price','mileage','year']]
cd['price']  /= 1000
cd['mileage']  /= 1000
print(cd.head())

X = cd[['mileage','year']].to_numpy()  #mileage and year columns as a numpy array
print("*** type of X is",type(X))
print(X.shape) #number of rows and columns
print(X[0:4,:]) #first 4 rows
y = cd['price'].values #price as a numpy vector
print(f'length of y is {len(y)}')
print(y[:4]) #implicit start at 0
    price  mileage  year
0  43.995   36.858  2008
1  44.995   46.883  2012
2  25.999  108.759  2007
3  33.880   35.187  2007
4  34.895   48.153  2007
*** type of X is <class 'numpy.ndarray'>
(1000, 2)
[[  36.858 2008.   ]
 [  46.883 2012.   ]
 [ 108.759 2007.   ]
 [  35.187 2007.   ]]
length of y is 1000
[43.995 44.995 25.999 33.88 ]
In [3]:
##################################################
## train/test split

myseed = 88 #Nylander
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,random_state=myseed, test_size=.3)
In [4]:
##################################################
## demean using means from train x
Xm = Xtrain.mean(0)
XtrDm = Xtrain - Xm
XteDm = Xtest - Xm
In [5]:
##################################################
### get polynomial terms
#We'll try degree 5 and 10

p5 = PolynomialFeatures(5,include_bias=False)
Xtr5 = p5.fit_transform(XtrDm)
Xte5 = p5.transform(XteDm)
print('shape of Xtr5:')
print(Xtr5.shape)

p10 = PolynomialFeatures(10,include_bias=False)
Xtr10 = p10.fit_transform(XtrDm)
Xte10 = p10.transform(XteDm)
print('shape of Xtr10:')
print(Xtr10.shape)
shape of Xtr5:
(700, 20)
shape of Xtr10:
(700, 65)
In [6]:
##################################################
### linear models with p5 and p10

lmmod5 = LinearRegression(fit_intercept=True)
lmmod5.fit(Xtr5,ytrain)
ypred5 = lmmod5.predict(Xte5)
yhat5 = lmmod5.predict(Xtr5)

lmmod10 = LinearRegression(fit_intercept=True)
lmmod10.fit(Xtr10,ytrain)
ypred10 = lmmod10.predict(Xte10)
yhat10 = lmmod10.predict(Xtr10)


print('Simple regression, rmse in sample')
print(f'5 poly reg, 10 poly reg: \
      {mean_squared_error(ytrain, yhat5, squared=False)}, {mean_squared_error(ytrain,yhat10,squared=False)}\n')
print('Simple regression, rmse out of sample')
print(f'5 poly reg, 10 poly reg: \
      {mean_squared_error(ytest, ypred5, squared=False)}, {mean_squared_error(ytest,ypred10,squared=False)}\n')
Simple regression, rmse in sample
5 poly reg, 10 poly reg:       5.499887188137272, 52.665990160466215

Simple regression, rmse out of sample
5 poly reg, 10 poly reg:       5.214709600605703, 716.3276742059619

In [7]:
## plot fit vs ytrain and prediction vs ytest
plt.figure(figsize=(16, 10))
plt.subplot(2,2,1)
plt.scatter(ytrain,yhat5,s=5)
plt.xlabel('train y'); plt.ylabel('yhat poly 5')
plt.plot(ytrain,ytrain,c='red')

plt.subplot(2,2,2)
plt.scatter(ytrain,yhat10,s=5)
plt.xlabel('train y'); plt.ylabel('yhat poly 10')

plt.subplot(2,2,3)
plt.scatter(ytest,ypred5,s=5)
plt.xlabel('test y'); plt.ylabel('ypred poly 5')
plt.plot(ytest,ytest,c='red')

plt.subplot(2,2,4)
plt.scatter(ytest,ypred10,s=5)
plt.xlabel('test y'); plt.ylabel('ypred poly 10')
Out[7]:
Text(0, 0.5, 'ypred poly 10')
No description has been provided for this image

Degree 5 looks good both in and out of sample.
Degree 10 is a disaster. This is probably due to the numerical instability of the degree 10 polynomials.

Let's see if the LASSO can save the degree 10 polys!

In [8]:
# Create a pipeline that scales ONLY within the cross-validation folds
Pln = Pipeline([
    ('scaler', StandardScaler()),  # Scales data within each cross-validation fold
    ('lasso', LassoCV(
        cv=10,            # 10-fold cross-validation
        max_iter = 2000, # default is 1000, failed to converge for poly 10
        random_state=34
    ))
])

print(Pln)
Pipeline(steps=[('scaler', StandardScaler()),
                ('lasso', LassoCV(cv=10, max_iter=2000, random_state=34))])

Check the help on LassoCV.
Look at the max_iter and selection parameters.

In [9]:
## fit/predict
Pln.fit(Xtr5, ytrain)
yhatL5 = Pln.predict(Xtr5)
ypredL5 = Pln.predict(Xte5)

Pln.fit(Xtr10, ytrain)
yhatL10 = Pln.predict(Xtr10)
ypredL10 = Pln.predict(Xte10)
In [10]:
print('Lasso, rmse in sample')
print(f'5 poly reg, 10 poly reg: \
   {mean_squared_error(ytrain, yhatL5, squared=False)}, {mean_squared_error(ytrain,yhatL10,squared=False)}\n')
print('Lasso, rmse out of sample')
print(f'5 poly reg, 10 poly reg: \
   {mean_squared_error(ytest, ypredL5, squared=False)}, {mean_squared_error(ytest,ypredL10,squared=False)}\n')
Lasso, rmse in sample
5 poly reg, 10 poly reg:    5.526153442341063, 5.524602009209272

Lasso, rmse out of sample
5 poly reg, 10 poly reg:    5.206741119381457, 5.210090156718012

Very nice.
If we use the LASSO, we get similar results from poly5 and poly10.

In [11]:
plt.figure(figsize=(14, 6))
plt.subplot(1,2,1)
plt.scatter(ytest,ypredL5,s=3.0,c='blue')
plt.plot(ytest,ytest,c='red')
plt.xlabel('test y'); plt.ylabel('ypred poly 5, Lasso')
plt.title('test y vs predictions, Lasso from poly 5')

plt.subplot(1,2,2)
plt.scatter(ytest,ypredL10,s=3.0,c='blue')
plt.plot(ytest,ytest,c='red')
plt.xlabel('test y'); plt.ylabel('ypred poly 10, Lasso')
plt.title('test y vs predictions, Lasso from poly 10')
Out[11]:
Text(0.5, 1.0, 'test y vs predictions, Lasso from poly 10')
No description has been provided for this image

Turns out degree 5 is a nice choice.
We get an out of sample rmse of 5.2 even without the LASSO regularization.

Degree 10 only works with the regularization.
By doing 10 and getting results similar to 5, we know 5 is enough.
Maybe we should tree degree = 2,3,4 ?

Problem 2¶

I'll just do degree 5.

In [12]:
## ybin will be 1 if y is greater thant .75 percentile, 0 else
y75 = np.percentile(ytrain,75)
print(f'\n*** 75 percentile of y=price is: {y75}\n')

ybintr = np.array(ytrain>y75,dtype='int')
print(pd.Series(ybintr).value_counts())
ybinte = np.array(ytest>y75,dtype='int')
print(pd.Series(ybinte).value_counts())
*** 75 percentile of y=price is: 42.995

0    526
1    174
Name: count, dtype: int64
0    220
1     80
Name: count, dtype: int64
In [13]:
## logit lasso

PlnL = Pipeline([
    ('scaler', StandardScaler()),  # Scales data within each cross-validation fold
    ('lasso', LogisticRegressionCV(
        Cs = 100,     # 100 different regularization strengths
        cv=10,            # 10-fold cross-validation
        penalty ='l1',
        solver='saga',
        max_iter = 2000, # default is 1000, failed to converge for poly 10
        random_state=34
    ))
])

print(PlnL)
Pipeline(steps=[('scaler', StandardScaler()),
                ('lasso',
                 LogisticRegressionCV(Cs=100, cv=10, max_iter=2000,
                                      penalty='l1', random_state=34,
                                      solver='saga'))])

Look at the help for LogisticRegressionCV and check out options for the solver parameter.
What does solver='saga' mean ?

In [14]:
## poly 5
PlnL.fit(Xtr5,ybintr)
yhatbinL5 = PlnL.predict(Xtr5) # fitted values, 1 if p(y=1|x) > .5
phatbinL5 = PlnL.predict_proba(Xtr5) # probs , two columns, first is prob y=0, second is prob y=1
ypredbinL5 = PlnL.predict(Xte5)
ppredbinL5 = PlnL.predict_proba(Xte5)

# predict 1 if p(y=1|x) > .5
print(phatbinL5[:5])
print(yhatbinL5[:5])
[[0.11120003 0.88879997]
 [0.45056848 0.54943152]
 [0.11828347 0.88171653]
 [0.53952555 0.46047445]
 [0.08214173 0.91785827]]
[1 1 1 0 1]

You can see that you get a 1 for yhat when the second column of phat is > .5.

In [15]:
print('out of sample, poly 5, confusion and accuracy:')
print('confusion:')
print(confusion_matrix(ybinte,ypredbinL5))
print('accuracy:')
print(accuracy_score(ybinte,ypredbinL5))
print('pandas crosstab:')
print(pd.crosstab(ybinte,ypredbinL5))
out of sample, poly 5, confusion and accuracy:
confusion:
[[216   4]
 [ 10  70]]
accuracy:
0.9533333333333334
pandas crosstab:
col_0    0   1
row_0         
0      216   4
1       10  70
In [16]:
## out of sample lift
nte = len(ybinte)
pvec = np.linspace(1,nte,nte)/nte
plt.scatter(pvec,mylift(ybinte,ppredbinL5[:,1]),s=.5,c='blue')
plt.xlabel('percent sample'); plt.ylabel('percent y=1')
plt.plot(pvec,pvec,c='red')
plt.title('out of sample lift')
Out[16]:
Text(0.5, 1.0, 'out of sample lift')
No description has been provided for this image
In [17]:
##auc
aucteBL5 = roc_auc_score(ybinte,ppredbinL5[:,1])
print('test auc for logit, p5: ',aucteBL5)
test auc for logit, p5:  0.9842613636363636
In [18]:
##roc
rocteBL5 = roc_curve(ybinte,ppredbinL5[:,1])
plt.plot(rocteBL5[0],rocteBL5[1])
plt.xlabel('false positive rate'); plt.ylabel('true postive rate')
plt.title('logit auc ' + str(np.round(aucteBL5,2)))
Out[18]:
Text(0.5, 1.0, 'logit auc 0.98')
No description has been provided for this image

We got 95 percent accuracy for degree 5 polynomials and the LASSO.
Would be interesting to see if this is the same with simple logistic regression!!
Would be interesting to see which coefficent the LASSO sets to 0 !!