simple-logit.py

In [1]:
##################################################
### imports
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn; seaborn.set()

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss

##sklearn model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV

import statsmodels.api as sm
In [2]:
##################################################
### read in data and get x and y
w8d = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/swe8there.csv")

y = w8d['y'].to_numpy()
x = w8d.iloc[:,0:-1].to_numpy() # y is the last column
n = len(y)
p = x.shape[1]

print("n,p: ",n,p)
n,p:  6166 200

Note that the logit fit below has not converged after 500 iterations. We will see that it has nevertheless found a reasonable fit; a quick way to check convergence via mle_retvals is sketched after the cell.

In [3]:
##################################################
### mle
#statsmodels
# https://www.statsmodels.org/dev/generated/statsmodels.discrete.discrete_model.Logit.fit.html
# note that the iterative algorithm may not converge even after many iterations.
XX = sm.add_constant(x)
lfitM = sm.Logit(y, XX).fit(maxiter=500) # maxiter: maximum number of iterations
#print(lfitM.summary())
bhatM = lfitM.params[1:] #drop intercept
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.496945
         Iterations: 500
/home/robert/.local/lib/python3.10/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  warnings.warn("Maximum Likelihood optimization failed to "
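The warning says to check mle_retvals. A minimal sketch of that check, assuming the fit object lfitM from the cell above (method='bfgs' is just one alternative optimizer, not something this notebook actually ran):

print(lfitM.mle_retvals['converged'])   # False here: the Newton iterations ran out
# one option is more iterations or a different optimizer, e.g.
# lfitM = sm.Logit(y, XX).fit(method='bfgs', maxiter=2000)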
In [4]:
#################################################
## scale

## for this data all the x's are naturally on the same scale, so we don't need to standardize
## recall that scaling with the full data set (rather than the training data alone) is a form of "data leakage";
## a leak-free version is sketched after this cell

doscale=False

if doscale:
   scl = StandardScaler()
   X = scl.fit_transform(x)
   print("means should be 0, sds should be 1")
   print(X.mean(axis=0))
   print(X.std(axis=0))
else:
   X = x.copy()
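A leak-free scaling sketch (the names Xtr, Xte and the split parameters are just illustrative): fit the scaler on the training rows only, then apply the learned means and sds to the test rows.

Xtr, Xte, ytr, yte = train_test_split(x, y, test_size=.2, random_state=34)
scl = StandardScaler().fit(Xtr)   # learn means and sds from the training data only
Xtr_s = scl.transform(Xtr)
Xte_s = scl.transform(Xte)        # transform the test data with the training means/sds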
In [5]:
##################################################
### fit logit  with regularization

## choose grid for regularization parameter
llv = np.linspace(-1,5,100) # grid of log(lambda) values
lv = np.exp(llv)            # lambda

## LogisticRegression wants 1/lambda, here is the help
#  C : float, default=1.0
#      Inverse of regularization strength; must be a positive float.
#      Like in support vector machines, smaller values specify stronger
#      regularization.
Cv = 1/lv # vector of 1/lambda
nC = len(Cv)

## fit regularized logit on grid
bmat1 = np.zeros((nC,p))
bmat2 = np.zeros((nC,p))
lfit1 = LogisticRegression(penalty='l1',solver='liblinear')
lfit2 = LogisticRegression(penalty='l2')

lLoss1 = np.zeros(nC)
lLoss2 = np.zeros(nC)
for i in range(nC):
   ## L1
   lfit1.C = Cv[i]
   lfit1.fit(X,y)
   bmat1[i,:] = lfit1.coef_
   lLoss1[i] = log_loss(y,lfit1.predict_proba(X))
   ## L2
   lfit2.C = Cv[i]
   lfit2.fit(X,y)
   bmat2[i,:] = lfit2.coef_
   lLoss2[i] = log_loss(y,lfit2.predict_proba(X))
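Since the L1 penalty zeroes coefficients exactly, we can count the nonzero entries along the path to see the fit get sparser as lambda grows (a sketch; the 1e-8 threshold is an arbitrary choice):

nnz1 = (np.abs(bmat1) > 1e-8).sum(axis=1)   # nonzero L1 coefficients at each lambda
print("nonzero coefs at smallest and largest lambda:", nnz1[0], nnz1[-1])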
In [6]:
## plot regularization path of coefficients
Bmat1 = pd.DataFrame(bmat1)
Bmat1.plot(kind='line',legend=False)
Out[6]:
<Axes: >
[figure: L1 coefficient paths across the lambda grid]
In [7]:
Bmat2 = pd.DataFrame(bmat2)
Bmat2.plot(kind='line',legend=False)
Out[7]:
<Axes: >
[figure: L2 coefficient paths across the lambda grid]
In [8]:
## plot MLE beta vs least regularized beta
plt.scatter(bhatM,bmat1[0,:],c='red')
plt.scatter(bhatM,bmat2[0,:],c='green')
plt.plot(bhatM,bhatM,c='blue')
plt.xlabel('bhat mle'); plt.ylabel('bhat min regularization')
Out[8]:
Text(0, 0.5, 'bhat min regularization')
[figure: MLE coefficients vs. least-regularized coefficients (L1 red, L2 green), with the 45-degree line]
In [9]:
## plot in sample loss
plt.scatter(llv,lLoss1,c='red',label='L1')
plt.scatter(llv,lLoss2,c='green',label='L2')
plt.xlabel('log(lambda)'); plt.ylabel('in-sample deviance loss')
plt.legend()
Out[9]:
<matplotlib.legend.Legend at 0x7d7268c538e0>
[figure: in-sample deviance loss vs. log(lambda), L1 and L2]
In [10]:
##################################################
### do simple train/test split with log_loss 

#train/test split
rng = np.random.RandomState(34)
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,random_state=rng, test_size=.2)

iLoss = np.zeros(nC)
oLoss = np.zeros(nC)
for i in range(nC):
   lfit1.C = Cv[i]
   lfit1.fit(Xtrain,ytrain)
   iLoss[i] = log_loss(ytrain,lfit1.predict_proba(Xtrain))
   oLoss[i] = log_loss(ytest,lfit1.predict_proba(Xtest))
In [11]:
## plot in and out of sample loss
plt.scatter(llv,oLoss,c='blue',label='test loss')
plt.scatter(llv,iLoss,c='red',label='train loss')
plt.xlabel('log(lambda)'); plt.ylabel('deviance loss')
plt.legend()
Out[11]:
<matplotlib.legend.Legend at 0x7d7268cbf070>
[figure: train and test deviance loss vs. log(lambda), L1 fit]
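cross_val_score was imported above but not used; a sketch of choosing C by 5-fold cross-validation on the same grid (scoring='neg_log_loss' is sklearn's built-in negative log loss) could look like:

cvLoss = np.zeros(nC)
for i in range(nC):
   lfit1.C = Cv[i]
   cvLoss[i] = -cross_val_score(lfit1, X, y, cv=5, scoring='neg_log_loss').mean()
print("log(lambda) minimizing CV loss:", llv[np.argmin(cvLoss)])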
In [12]:
##################################################
### check log_loss by hand: it is the average negative log-likelihood (what we have been calling the deviance loss)

#from sklearn.metrics import log_loss
y_true = [0, 0, 1, 1]
y_pred = [[.9, .1], [.8, .2], [.3, .7], [.01, .99]]
print(log_loss(y_true, y_pred))

temp = -np.log(.9) -np.log(.8) - np.log(.7) - np.log(.99)
print(temp/4.0)
0.1738073366910675
0.1738073366910675
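A sketch tying this back to the statsmodels fit above (assuming lfitM is still the unconverged MLE fit): log_loss is the average negative log-likelihood, so it should match -llf/n, and the deviance is 2*n times it.

phat = lfitM.predict(XX)           # fitted P(y=1) from the statsmodels fit
print(log_loss(y, phat))           # should match the "function value" 0.496945 reported above
print(-lfitM.llf / n)              # average negative log-likelihood, same quantity
print(2 * n * log_loss(y, phat))   # the deviance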