simple-logit.py
In [1]:
##################################################
### imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn; seaborn.set()
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
##sklearn model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV
import statsmodels.api as sm
In [2]:
##################################################
### read in data and get x and y
w8d = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/swe8there.csv")
y = w8d['y'].to_numpy()
x = w8d.iloc[:,0:-1].to_numpy() # y is the last column
n = len(y)
p = x.shape[1]
print("n,p: ",n,p)
n,p: 6166 200
Note that the logit fit below may not have converged after 500 iterations!
We will see that it has nevertheless found a reasonable fit.
In [3]:
##################################################
### mle
#statsmodels
# https://www.statsmodels.org/dev/generated/statsmodels.discrete.discrete_model.Logit.fit.html
# note that the iterative algorithm may not converge even after many iterations.
XX = sm.add_constant(x)
lfitM = sm.Logit(y, XX).fit(maxiter=500) # maxiter: maximum number of iterations
#print(lfitM.summary())
bhatM = lfitM.params[1:] #drop intercept
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.496945
         Iterations: 500
/home/robert/.local/lib/python3.10/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  warnings.warn("Maximum Likelihood optimization failed to "
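The warning says the optimizer stopped because it hit maxiter. A quick way to confirm is to inspect the diagnostics statsmodels points to in the warning (a minimal sketch, not run here; mle_retvals is the dict of optimizer diagnostics stored with the fit):

## sketch: check the convergence diagnostics stored with the fit
print(lfitM.mle_retvals)                # optimizer diagnostics, including iteration count
print(lfitM.mle_retvals['converged'])   # False here: maxiter was hit before convergence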
In [4]:
#################################################
## scale
## for this data all the x are on the same scale naturally, so we don't need this
## recall, scaling with statistics computed from all the data (train and test) is a form of "data leakage"
doscale=False
if doscale:
    scl = StandardScaler()
    X = scl.fit_transform(x)
    print("means should be 0, sds should be 1")
    print(X.mean(axis=0))
    print(X.std(axis=0))
else:
    X = x.copy()
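If scaling were needed, one leakage-free pattern is to fit the scaler on the training rows only and reuse those means/sds on the test rows. A minimal sketch (the split and the xtr/xte names here are illustrative only; the analysis below keeps X = x):

## sketch: leakage-free scaling, fit the scaler on training rows only
xtr, xte, ytr, yte = train_test_split(x, y, test_size=.2, random_state=0)
scl = StandardScaler().fit(xtr)   # means/sds estimated from training rows only
xtr_s = scl.transform(xtr)
xte_s = scl.transform(xte)        # test rows scaled with the training means/sds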
In [5]:
##################################################
### fit logit with regularization
## choose grid for regularization parameter
llv = np.linspace(-1,5,100) # log(lambda) grid
lv = np.exp(llv) # lambda
## LogisticRegression wants 1/lambda, here is the help
# C : float, default=1.0
#     Inverse of regularization strength; must be a positive float.
#     Like in support vector machines, smaller values specify stronger
#     regularization.
Cv = 1/lv # vector of 1/lambda
nC = len(Cv)
## fit regularized logit on grid
bmat1 = np.zeros((nC,p))
bmat2 = np.zeros((nC,p))
lfit1 = LogisticRegression(penalty='l1',solver='liblinear')
lfit2 = LogisticRegression(penalty='l2')
lLoss1 = np.zeros(nC)
lLoss2 = np.zeros(nC)
for i in range(nC):
    ## L1
    lfit1.C = Cv[i]
    lfit1.fit(X,y)
    bmat1[i,:] = lfit1.coef_
    lLoss1[i] = log_loss(y,lfit1.predict_proba(X))
    ## L2
    lfit2.C = Cv[i]
    lfit2.fit(X,y)
    bmat2[i,:] = lfit2.coef_
    lLoss2[i] = log_loss(y,lfit2.predict_proba(X))
In [6]:
## plot regularization path of coefficients
Bmat1 = pd.DataFrame(bmat1)
Bmat1.plot(kind='line',legend=False)
Out[6]:
<Axes: >
In [7]:
Bmat2 = pd.DataFrame(bmat2)
Bmat2.plot(kind='line',legend=False)
Out[7]:
<Axes: >
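The pandas line plots above put the grid index (0,...,nC-1) on the x-axis. A variant that plots the coefficient paths directly against log(lambda) might look like this (a minimal sketch, not run here):

## sketch: L1 coefficient paths against log(lambda) instead of the grid index
plt.plot(llv, bmat1)   # one line per coefficient
plt.xlabel('log(lambda)'); plt.ylabel('coefficient')
plt.title('L1 regularization path')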
In [8]:
## plot MLE beta vs least regularized beta
plt.scatter(bhatM,bmat1[0,:],c='red')
plt.scatter(bhatM,bmat2[0,:],c='green')
plt.plot(bhatM,bhatM,c='blue')
plt.xlabel('bhat mle'); plt.ylabel('bhat min regularization')
Out[8]:
Text(0, 0.5, 'bhat min regularization')
In [9]:
## plot in sample loss
plt.scatter(llv,lLoss1,c='red',label='L1')
plt.scatter(llv,lLoss2,c='green',label='L2')
plt.xlabel('log(lambda)'); plt.ylabel('in-sample deviance loss')
plt.legend()
Out[9]:
<matplotlib.legend.Legend at 0x7d7268c538e0>
In [10]:
##################################################
### do simple train/test split with log_loss
#train/test split
rng = np.random.RandomState(34)
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,random_state=rng, test_size=.2)
iLoss = np.zeros(nC)
oLoss = np.zeros(nC)
for i in range(nC):
    lfit1.C = Cv[i]
    lfit1.fit(Xtrain,ytrain)
    iLoss[i] = log_loss(ytrain,lfit1.predict_proba(Xtrain))
    oLoss[i] = log_loss(ytest,lfit1.predict_proba(Xtest))
In [11]:
## plot in and out of sample loss
plt.scatter(llv,oLoss,c='blue',label='test loss')
plt.scatter(llv,iLoss,c='red',label='train loss')
plt.xlabel('log(lambda)'); plt.ylabel('deviance loss')
plt.legend()
Out[11]:
<matplotlib.legend.Legend at 0x7d7268cbf070>
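GridSearchCV was imported above but not used; an alternative to the single train/test split is to pick C by cross-validation over the same grid. A minimal sketch, assuming 5-fold CV scored by negative log loss (this refits nC*5 logits, so it is slow):

## sketch: choose C by 5-fold cross-validation over the same grid
gcv = GridSearchCV(LogisticRegression(penalty='l1', solver='liblinear'),
                   {'C': Cv}, cv=5, scoring='neg_log_loss')
gcv.fit(X, y)
print('best C:', gcv.best_params_['C'])
print('best log(lambda):', np.log(1/gcv.best_params_['C']))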
In [12]:
##################################################
### check log_loss to make sure it is the deviance
#from sklearn.metrics import log_loss
y_true = [0, 0, 1, 1]
y_pred = [[.9, .1], [.8, .2], [.3, .7], [.01, .99]]
print(log_loss(y_true, y_pred))
temp = -np.log(.9) -np.log(.8) - np.log(.7) - np.log(.99)
print(temp/4.0)
0.1738073366910675
0.1738073366910675
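The check works because log_loss is the average negative log-likelihood, -(1/n) * sum_i [y_i*log(p_i) + (1-y_i)*log(1-p_i)], which is the binomial deviance divided by 2n. A sketch of the same check against the statsmodels fit above (both numbers should be about 0.4969, the "current function value" reported in the convergence warning):

## sketch: log_loss equals the average negative log-likelihood of the fit
phat = lfitM.predict(XX)   # statsmodels predict() returns P(y=1)
print(log_loss(y, phat))   # average negative log-likelihood
print(-lfitM.llf/n)        # same quantity from the statsmodels log-likelihood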