## imports

##################################################
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LogisticRegressionCV

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

#mean_squared_error(y_true, y_pred, squared=False) # gets rmse

## lift curve
def mylift(y,p):
   """lift"""
   ii = np.argsort(p)[::-1]
   ps = np.cumsum(y[ii])/np.sum(y)
   return(ps)

### read in data and get y=price x=(mileage,year)

cd = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv")

cd = cd[['price','mileage','year']]
cd['price']  /= 1000
cd['mileage']  /= 1000
print(cd.head())

X = cd[['mileage','year']].to_numpy()  #mileage and year columns as a numpy array
print("*** type of X is",type(X))
print(X.shape) #number of rows and columns
print(X[0:4,:]) #first 4 rows
y = cd['price'].values #price as a numpy vector
print(f'length of y is {len(y)}')
print(y[:4]) #implicit start at 0

    price  mileage  year
0  43.995   36.858  2008
1  44.995   46.883  2012
2  25.999  108.759  2007
3  33.880   35.187  2007
4  34.895   48.153  2007
*** type of X is <class 'numpy.ndarray'>
(1000, 2)
[[  36.858 2008.   ]
 [  46.883 2012.   ]
 [ 108.759 2007.   ]
 [  35.187 2007.   ]]
length of y is 1000
[43.995 44.995 25.999 33.88 ]

##################################################
## train/test split

myseed = 88 #Nylander
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,random_state=myseed, test_size=.3)

##################################################
## demean using means from train x
Xm = Xtrain.mean(0)
XtrDm = Xtrain - Xm
XteDm = Xtest - Xm

##################################################
### get polynomial terms
#We'll try degree 5 and 10

p5 = PolynomialFeatures(5,include_bias=False)
Xtr5 = p5.fit_transform(XtrDm)
Xte5 = p5.transform(XteDm)
print('shape of Xtr5:')
print(Xtr5.shape)

p10 = PolynomialFeatures(10,include_bias=False)
Xtr10 = p10.fit_transform(XtrDm)
Xte10 = p10.transform(XteDm)
print('shape of Xtr10:')
print(Xtr10.shape)

shape of Xtr5:
(700, 20)
shape of Xtr10:
(700, 65)

##################################################
### linear models with p5 and p10

lmmod5 = LinearRegression(fit_intercept=True)
lmmod5.fit(Xtr5,ytrain)
ypred5 = lmmod5.predict(Xte5)
yhat5 = lmmod5.predict(Xtr5)

lmmod10 = LinearRegression(fit_intercept=True)
lmmod10.fit(Xtr10,ytrain)
ypred10 = lmmod10.predict(Xte10)
yhat10 = lmmod10.predict(Xtr10)


print('Simple regression, rmse in sample')
print(f'5 poly reg, 10 poly reg: \
      {mean_squared_error(ytrain, yhat5, squared=False)}, {mean_squared_error(ytrain,yhat10,squared=False)}\n')
print('Simple regression, rmse out of sample')
print(f'5 poly reg, 10 poly reg: \
      {mean_squared_error(ytest, ypred5, squared=False)}, {mean_squared_error(ytest,ypred10,squared=False)}\n')

Simple regression, rmse in sample
5 poly reg, 10 poly reg:       5.499887188137272, 52.665990160466215

Simple regression, rmse out of sample
5 poly reg, 10 poly reg:       5.214709600605703, 716.3276742059619

## plot fit vs ytrain and prediction vs ytest
plt.figure(figsize=(16, 10))
plt.subplot(2,2,1)
plt.scatter(ytrain,yhat5,s=5)
plt.xlabel('train y'); plt.ylabel('yhat poly 5')
plt.plot(ytrain,ytrain,c='red')

plt.subplot(2,2,2)
plt.scatter(ytrain,yhat10,s=5)
plt.xlabel('train y'); plt.ylabel('yhat poly 10')

plt.subplot(2,2,3)
plt.scatter(ytest,ypred5,s=5)
plt.xlabel('test y'); plt.ylabel('ypred poly 5')
plt.plot(ytest,ytest,c='red')

plt.subplot(2,2,4)
plt.scatter(ytest,ypred10,s=5)
plt.xlabel('test y'); plt.ylabel('ypred poly 10')

Text(0, 0.5, 'ypred poly 10')

# Create a pipeline that scales ONLY within the cross-validation folds
Pln = Pipeline([
    ('scaler', StandardScaler()),  # Scales data within each cross-validation fold
    ('lasso', LassoCV(
        cv=10,            # 10-fold cross-validation
        max_iter = 2000, # default is 1000, failed to converge for poly 10
        random_state=34
    ))
])

print(Pln)

Pipeline(steps=[('scaler', StandardScaler()),
                ('lasso', LassoCV(cv=10, max_iter=2000, random_state=34))])

## fit/predict
Pln.fit(Xtr5, ytrain)
yhatL5 = Pln.predict(Xtr5)
ypredL5 = Pln.predict(Xte5)

Pln.fit(Xtr10, ytrain)
yhatL10 = Pln.predict(Xtr10)
ypredL10 = Pln.predict(Xte10)

print('Lasso, rmse in sample')
print(f'5 poly reg, 10 poly reg: \
   {mean_squared_error(ytrain, yhatL5, squared=False)}, {mean_squared_error(ytrain,yhatL10,squared=False)}\n')
print('Lasso, rmse out of sample')
print(f'5 poly reg, 10 poly reg: \
   {mean_squared_error(ytest, ypredL5, squared=False)}, {mean_squared_error(ytest,ypredL10,squared=False)}\n')

Lasso, rmse in sample
5 poly reg, 10 poly reg:    5.526153442341063, 5.524602009209272

Lasso, rmse out of sample
5 poly reg, 10 poly reg:    5.206741119381457, 5.210090156718012

plt.figure(figsize=(14, 6))
plt.subplot(1,2,1)
plt.scatter(ytest,ypredL5,s=3.0,c='blue')
plt.plot(ytest,ytest,c='red')
plt.xlabel('test y'); plt.ylabel('ypred poly 5, Lasso')
plt.title('test y vs predictions, Lasso from poly 5')

plt.subplot(1,2,2)
plt.scatter(ytest,ypredL10,s=3.0,c='blue')
plt.plot(ytest,ytest,c='red')
plt.xlabel('test y'); plt.ylabel('ypred poly 10, Lasso')
plt.title('test y vs predictions, Lasso from poly 10')

Text(0.5, 1.0, 'test y vs predictions, Lasso from poly 10')

## ybin will be 1 if y is greater thant .75 percentile, 0 else
y75 = np.percentile(ytrain,75)
print(f'\n*** 75 percentile of y=price is: {y75}\n')

ybintr = np.array(ytrain>y75,dtype='int')
print(pd.Series(ybintr).value_counts())
ybinte = np.array(ytest>y75,dtype='int')
print(pd.Series(ybinte).value_counts())

*** 75 percentile of y=price is: 42.995

0    526
1    174
Name: count, dtype: int64
0    220
1     80
Name: count, dtype: int64

## logit lasso

PlnL = Pipeline([
    ('scaler', StandardScaler()),  # Scales data within each cross-validation fold
    ('lasso', LogisticRegressionCV(
        Cs = 100,     # 100 different regularization strengths
        cv=10,            # 10-fold cross-validation
        penalty ='l1',
        solver='saga',
        max_iter = 2000, # default is 1000, failed to converge for poly 10
        random_state=34
    ))
])

print(PlnL)

Pipeline(steps=[('scaler', StandardScaler()),
                ('lasso',
                 LogisticRegressionCV(Cs=100, cv=10, max_iter=2000,
                                      penalty='l1', random_state=34,
                                      solver='saga'))])

## poly 5
PlnL.fit(Xtr5,ybintr)
yhatbinL5 = PlnL.predict(Xtr5) # fitted values, 1 if p(y=1|x) > .5
phatbinL5 = PlnL.predict_proba(Xtr5) # probs , two columns, first is prob y=0, second is prob y=1
ypredbinL5 = PlnL.predict(Xte5)
ppredbinL5 = PlnL.predict_proba(Xte5)

# predict 1 if p(y=1|x) > .5
print(phatbinL5[:5])
print(yhatbinL5[:5])

[[0.11120003 0.88879997]
 [0.45056848 0.54943152]
 [0.11828347 0.88171653]
 [0.53952555 0.46047445]
 [0.08214173 0.91785827]]
[1 1 1 0 1]

print('out of sample, poly 5, confusion and accuracy:')
print('confusion:')
print(confusion_matrix(ybinte,ypredbinL5))
print('accuracy:')
print(accuracy_score(ybinte,ypredbinL5))
print('pandas crosstab:')
print(pd.crosstab(ybinte,ypredbinL5))

out of sample, poly 5, confusion and accuracy:
confusion:
[[216   4]
 [ 10  70]]
accuracy:
0.9533333333333334
pandas crosstab:
col_0    0   1
row_0         
0      216   4
1       10  70

## out of sample lift
nte = len(ybinte)
pvec = np.linspace(1,nte,nte)/nte
plt.scatter(pvec,mylift(ybinte,ppredbinL5[:,1]),s=.5,c='blue')
plt.xlabel('percent sample'); plt.ylabel('percent y=1')
plt.plot(pvec,pvec,c='red')
plt.title('out of sample lift')

Text(0.5, 1.0, 'out of sample lift')

##auc
aucteBL5 = roc_auc_score(ybinte,ppredbinL5[:,1])
print('test auc for logit, p5: ',aucteBL5)

test auc for logit, p5:  0.9842613636363636

##roc
rocteBL5 = roc_curve(ybinte,ppredbinL5[:,1])
plt.plot(rocteBL5[0],rocteBL5[1])
plt.xlabel('false positive rate'); plt.ylabel('true postive rate')
plt.title('logit auc ' + str(np.round(aucteBL5,2)))

Text(0.5, 1.0, 'logit auc 0.98')

Hw3 Solutions, Spring 25¶

Problem 1¶

Problem 2¶