HW3 Solutions, Spring 25
For both problems 1 and 2, we do a train/test split and then 10-fold cross-validation on the training data.
Then we predict on the test data.
## imports
##################################################
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
#mean_squared_error(y_true, y_pred, squared=False) # gets rmse (newer sklearn versions use root_mean_squared_error instead)
## lift curve
def mylift(y, p):
    """lift: cumulative fraction of all y=1 cases captured when
    observations are taken in order of decreasing predicted probability"""
    ii = np.argsort(p)[::-1]           # indices that sort p from largest to smallest
    ps = np.cumsum(y[ii]) / np.sum(y)  # running share of the 1's captured so far
    return ps
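A tiny toy check of what mylift returns (not part of the assignment): with a perfect ranking, the curve reaches 1 as soon as all the 1's have been seen.
ytoy = np.array([1, 0, 1, 0])
ptoy = np.array([.9, .1, .8, .2])  # ranks the two 1's first
print(mylift(ytoy, ptoy))          # [0.5 1.  1.  1. ]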
Problem 1
### read in data and get y=price x=(mileage,year)
cd = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv")
cd = cd[['price','mileage','year']]
cd['price'] /= 1000
cd['mileage'] /= 1000
print(cd.head())
X = cd[['mileage','year']].to_numpy() #mileage and year columns as a numpy array
print("*** type of X is",type(X))
print(X.shape) #number of rows and columns
print(X[0:4,:]) #first 4 rows
y = cd['price'].values #price as a numpy vector
print(f'length of y is {len(y)}')
print(y[:4]) #implicit start at 0
    price  mileage  year
0  43.995   36.858  2008
1  44.995   46.883  2012
2  25.999  108.759  2007
3  33.880   35.187  2007
4  34.895   48.153  2007
*** type of X is <class 'numpy.ndarray'>
(1000, 2)
[[  36.858 2008.   ]
 [  46.883 2012.   ]
 [ 108.759 2007.   ]
 [  35.187 2007.   ]]
length of y is 1000
[43.995 44.995 25.999 33.88 ]
##################################################
## train/test split
myseed = 88 #Nylander
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,random_state=myseed, test_size=.3)
##################################################
## demean using means from train x
Xm = Xtrain.mean(0)
XtrDm = Xtrain - Xm
XteDm = Xtest - Xm
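Why demean before taking powers? A quick side check (a sketch, assuming the arrays above are in scope): the raw year column is around 2000, so its 10th power is astronomically far from the scale of the linear term, while the demeaned version stays manageable.
## scale of the year column before and after demeaning
print(Xtrain[:,1].max())             # around 2000
print(np.abs(XtrDm[:,1]).max())      # a small offset from the mean
print(Xtrain[:,1].max()**10)         # astronomically large
print(np.abs(XtrDm[:,1]).max()**10)  # far more manageable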
##################################################
### get polynomial terms
#We'll try degree 5 and 10
p5 = PolynomialFeatures(5,include_bias=False)
Xtr5 = p5.fit_transform(XtrDm)
Xte5 = p5.transform(XteDm)
print('shape of Xtr5:')
print(Xtr5.shape)
p10 = PolynomialFeatures(10,include_bias=False)
Xtr10 = p10.fit_transform(XtrDm)
Xte10 = p10.transform(XteDm)
print('shape of Xtr10:')
print(Xtr10.shape)
shape of Xtr5:
(700, 20)
shape of Xtr10:
(700, 65)
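The column counts check out: with 2 features and degree d, PolynomialFeatures produces C(d+2, 2) - 1 monomials (the -1 because include_bias=False drops the constant term).
from math import comb
for d in (5, 10):
    print(d, comb(d + 2, 2) - 1)  # 5 -> 20, 10 -> 65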
##################################################
### linear models with p5 and p10
lmmod5 = LinearRegression(fit_intercept=True)
lmmod5.fit(Xtr5,ytrain)
ypred5 = lmmod5.predict(Xte5)
yhat5 = lmmod5.predict(Xtr5)
lmmod10 = LinearRegression(fit_intercept=True)
lmmod10.fit(Xtr10,ytrain)
ypred10 = lmmod10.predict(Xte10)
yhat10 = lmmod10.predict(Xtr10)
print('Simple regression, rmse in sample')
print(f'5 poly reg, 10 poly reg: \
{mean_squared_error(ytrain, yhat5, squared=False)}, {mean_squared_error(ytrain,yhat10,squared=False)}\n')
print('Simple regression, rmse out of sample')
print(f'5 poly reg, 10 poly reg: \
{mean_squared_error(ytest, ypred5, squared=False)}, {mean_squared_error(ytest,ypred10,squared=False)}\n')
Simple regression, rmse in sample
5 poly reg, 10 poly reg: 5.499887188137272, 52.665990160466215

Simple regression, rmse out of sample
5 poly reg, 10 poly reg: 5.214709600605703, 716.3276742059619
## plot fit vs ytrain and prediction vs ytest
plt.figure(figsize=(16, 10))
plt.subplot(2,2,1)
plt.scatter(ytrain,yhat5,s=5)
plt.xlabel('train y'); plt.ylabel('yhat poly 5')
plt.plot(ytrain,ytrain,c='red')
plt.subplot(2,2,2)
plt.scatter(ytrain,yhat10,s=5)
plt.xlabel('train y'); plt.ylabel('yhat poly 10')
plt.subplot(2,2,3)
plt.scatter(ytest,ypred5,s=5)
plt.xlabel('test y'); plt.ylabel('ypred poly 5')
plt.plot(ytest,ytest,c='red')
plt.subplot(2,2,4)
plt.scatter(ytest,ypred10,s=5)
plt.xlabel('test y'); plt.ylabel('ypred poly 10')
[Figure: fitted/predicted vs actual; train (top row) and test (bottom row), degree 5 (left) and degree 10 (right)]
Degree 5 looks good both in and out of sample.
Degree 10 is a disaster. This is probably due to numerical instability: the degree 10 monomials span wildly different scales, so the design matrix is badly ill-conditioned and the least squares coefficients blow up.
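One quick way to see the instability (a side check, assuming Xtr5 and Xtr10 from above): compare the condition numbers of the two design matrices; a huge value means least squares is numerically fragile.
print(f'cond(Xtr5):  {np.linalg.cond(Xtr5):.3e}')
print(f'cond(Xtr10): {np.linalg.cond(Xtr10):.3e}')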
Let's see if the LASSO can save the degree 10 polys!
# Create a pipeline that scales ONLY within the cross-validation folds
Pln = Pipeline([
    ('scaler', StandardScaler()),  # scales data within each cross-validation fold
    ('lasso', LassoCV(
        cv=10,            # 10-fold cross-validation
        max_iter=2000,    # default is 1000, failed to converge for poly 10
        random_state=34
    ))
])
print(Pln)
Pipeline(steps=[('scaler', StandardScaler()), ('lasso', LassoCV(cv=10, max_iter=2000, random_state=34))])
Check the help on LassoCV.
Look at the max_iter and selection parameters.
## fit/predict
Pln.fit(Xtr5, ytrain)
yhatL5 = Pln.predict(Xtr5)
ypredL5 = Pln.predict(Xte5)
Pln.fit(Xtr10, ytrain)
yhatL10 = Pln.predict(Xtr10)
ypredL10 = Pln.predict(Xte10)
print('Lasso, rmse in sample')
print(f'5 poly reg, 10 poly reg: \
{mean_squared_error(ytrain, yhatL5, squared=False)}, {mean_squared_error(ytrain,yhatL10,squared=False)}\n')
print('Lasso, rmse out of sample')
print(f'5 poly reg, 10 poly reg: \
{mean_squared_error(ytest, ypredL5, squared=False)}, {mean_squared_error(ytest,ypredL10,squared=False)}\n')
Lasso, rmse in sample
5 poly reg, 10 poly reg: 5.526153442341063, 5.524602009209272

Lasso, rmse out of sample
5 poly reg, 10 poly reg: 5.206741119381457, 5.210090156718012
Very nice.
If we use the LASSO, we get similar results from poly5 and poly10.
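We can also peek at what LassoCV chose (a sketch; note Pln currently holds the poly 10 fit, since that was the last fit call).
lasso10 = Pln.named_steps['lasso']
print('chosen alpha:', lasso10.alpha_)  # CV-selected penalty strength
print(f'nonzero coefficients: {np.sum(lasso10.coef_ != 0)} of {len(lasso10.coef_)}')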
plt.figure(figsize=(14, 6))
plt.subplot(1,2,1)
plt.scatter(ytest,ypredL5,s=3.0,c='blue')
plt.plot(ytest,ytest,c='red')
plt.xlabel('test y'); plt.ylabel('ypred poly 5, Lasso')
plt.title('test y vs predictions, Lasso from poly 5')
plt.subplot(1,2,2)
plt.scatter(ytest,ypredL10,s=3.0,c='blue')
plt.plot(ytest,ytest,c='red')
plt.xlabel('test y'); plt.ylabel('ypred poly 10, Lasso')
plt.title('test y vs predictions, Lasso from poly 10')
[Figure: test y vs predictions, Lasso from poly 5 (left) and poly 10 (right)]
Turns out degree 5 is a nice choice.
We get an out of sample rmse of 5.2 even without the LASSO regularization.
Degree 10 only works with the regularization.
By doing 10 and getting results similar to 5, we know 5 is enough.
Maybe we should try degree = 2, 3, 4?
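Here is a hedged sketch of that check, reusing the Lasso pipeline from above (note it refits Pln in place):
## out of sample rmse across degrees, Lasso pipeline
for d in (2, 3, 4, 5):
    pf = PolynomialFeatures(d, include_bias=False)
    Pln.fit(pf.fit_transform(XtrDm), ytrain)
    rmse = mean_squared_error(ytest, Pln.predict(pf.transform(XteDm)), squared=False)
    print(f'degree {d}: test rmse {rmse:.3f}')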
Problem 2
I'll just do degree 5.
## ybin will be 1 if y is greater than the 75th percentile, 0 otherwise
y75 = np.percentile(ytrain,75)
print(f'\n*** 75 percentile of y=price is: {y75}\n')
ybintr = np.array(ytrain>y75,dtype='int')
print(pd.Series(ybintr).value_counts())
ybinte = np.array(ytest>y75,dtype='int')
print(pd.Series(ybinte).value_counts())
*** 75 percentile of y=price is: 42.995

0    526
1    174
Name: count, dtype: int64
0    220
1     80
Name: count, dtype: int64
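By construction about a quarter of the training cases are 1's (174/700 ≈ .25); on test it is 80/300 ≈ .27, close but not exact since the cutoff comes from the training data.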
## logit lasso
PlnL = Pipeline([
    ('scaler', StandardScaler()),  # scales data within each cross-validation fold
    ('lasso', LogisticRegressionCV(
        Cs=100,           # 100 different regularization strengths
        cv=10,            # 10-fold cross-validation
        penalty='l1',
        solver='saga',
        max_iter=2000,    # default is 1000, failed to converge for poly 10
        random_state=34
    ))
])
print(PlnL)
Pipeline(steps=[('scaler', StandardScaler()), ('lasso', LogisticRegressionCV(Cs=100, cv=10, max_iter=2000, penalty='l1', random_state=34, solver='saga'))])
Look at the help for LogisticRegressionCV and check out options for the solver parameter.
What does solver='saga' mean ?
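(Briefly: 'saga' is a stochastic-gradient style solver; unlike the default 'lbfgs', it supports the l1 penalty, which is why we need it here.)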
## poly 5
PlnL.fit(Xtr5,ybintr)
yhatbinL5 = PlnL.predict(Xtr5) # fitted values, 1 if p(y=1|x) > .5
phatbinL5 = PlnL.predict_proba(Xtr5) # probs , two columns, first is prob y=0, second is prob y=1
ypredbinL5 = PlnL.predict(Xte5)
ppredbinL5 = PlnL.predict_proba(Xte5)
# predict 1 if p(y=1|x) > .5
print(phatbinL5[:5])
print(yhatbinL5[:5])
[[0.11120003 0.88879997]
 [0.45056848 0.54943152]
 [0.11828347 0.88171653]
 [0.53952555 0.46047445]
 [0.08214173 0.91785827]]
[1 1 1 0 1]
You can see that you get a 1 for yhat when the second column of phat is > .5.
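A quick sanity check of that claim:
## predict() should agree with thresholding the second predict_proba column at .5
print(np.all(yhatbinL5 == (phatbinL5[:,1] > .5).astype(int)))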
print('out of sample, poly 5, confusion and accuracy:')
print('confusion:')
print(confusion_matrix(ybinte,ypredbinL5))
print('accuracy:')
print(accuracy_score(ybinte,ypredbinL5))
print('pandas crosstab:')
print(pd.crosstab(ybinte,ypredbinL5))
out of sample, poly 5, confusion and accuracy:
confusion:
[[216   4]
 [ 10  70]]
accuracy:
0.9533333333333334
pandas crosstab:
col_0    0   1
row_0
0      216   4
1       10  70
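As a check, the accuracy can be read straight off the confusion matrix: (216 + 70) correct out of 300 test cases.
cm = confusion_matrix(ybinte, ypredbinL5)
print((cm[0,0] + cm[1,1]) / cm.sum())  # (216 + 70)/300 = 0.9533...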
## out of sample lift
nte = len(ybinte)
pvec = np.linspace(1,nte,nte)/nte
plt.scatter(pvec,mylift(ybinte,ppredbinL5[:,1]),s=.5,c='blue')
plt.xlabel('percent sample'); plt.ylabel('percent y=1')
plt.plot(pvec,pvec,c='red')
plt.title('out of sample lift')
[Figure: out of sample lift, model (blue) vs random ordering (red)]
##auc
aucteBL5 = roc_auc_score(ybinte,ppredbinL5[:,1])
print('test auc for logit, p5: ',aucteBL5)
test auc for logit, p5: 0.9842613636363636
##roc
rocteBL5 = roc_curve(ybinte,ppredbinL5[:,1])
plt.plot(rocteBL5[0],rocteBL5[1])
plt.xlabel('false positive rate'); plt.ylabel('true positive rate')
plt.title('logit auc ' + str(np.round(aucteBL5,2)))
[Figure: ROC curve, logit auc 0.98]
We got 95 percent accuracy for degree 5 polynomials and the LASSO.
It would be interesting to see if we do as well with plain logistic regression!!
It would also be interesting to see which coefficients the LASSO sets to 0 !!
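A hedged sketch of both checks (the plain logit gets its own scaler since it sits outside the CV pipeline; penalty=None assumes a recent sklearn, older versions spell it penalty='none'):
from sklearn.linear_model import LogisticRegression

## plain (unpenalized) logit on the same poly 5 features
plain = make_pipeline(StandardScaler(), LogisticRegression(penalty=None, max_iter=5000))
plain.fit(Xtr5, ybintr)
print('plain logit accuracy:', accuracy_score(ybinte, plain.predict(Xte5)))

## which poly 5 terms did the LASSO zero out?
coefs = PlnL.named_steps['lasso'].coef_[0]
names = p5.get_feature_names_out(['mileage', 'year'])
print('zeroed terms:', names[coefs == 0])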