Gradient Descent for the Linear Logit MLE

First let’s source some utility functions that Rob wrote for working with the logit model.

source("logit-funs.R")
ls()
## [1] "lgGrad"  "lgH"     "mLL"     "phat"    "simData"

For example, mLL computes the minus log likelihood (averaged over the n observations) and lgGrad computes its gradient; note from the printed code below that both functions read the sample size n from the global environment.

print(mLL)
## function (X, y, beta) 
## {
##     eta = X %*% matrix(beta, ncol = 1)
##     py1 = 1/(1 + exp(-eta))
##     return(-sum(ifelse(y == 1, log(py1), log(1 - py1)))/n)
## }
print(lgGrad)
## function (X, y, beta) 
## {
##     eta = X %*% matrix(beta, ncol = 1)
##     pvec = 1/(1 + exp(-eta))
##     gvec = t(X) %*% matrix(y - pvec, ncol = 1)
##     return(-gvec/n)
## }
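
In formulas (these match the printed code, which averages over the n observations),

$$
\mathrm{mLL}(\beta) = -\frac{1}{n}\sum_{i=1}^{n}\Big[\, y_i \log p_i + (1-y_i)\log(1-p_i) \,\Big],
\qquad p_i = \frac{1}{1+e^{-x_i^\top\beta}},
$$

$$
\nabla\,\mathrm{mLL}(\beta) = -\frac{1}{n}\, X^\top (y - p),
$$

which is exactly what lgGrad returns.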

Now let’s simulate some data.

n=200
beta=c(1,2)
p = length(beta)

set.seed(17)
x1 = rnorm(n)
wht = .7
x2 = wht*x1 + (1-wht)*rnorm(n)
print(cor(x1,x2))
## [1] 0.938045

X = cbind(x1,x2)
y = simData(X,beta)

plot(x1,x2)
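
The body of simData is not printed above, but for the logit model it should draw each y_i as Bernoulli with success probability 1/(1+exp(-x_i'beta)). Here is a minimal sketch of such a simulator (an assumption about what simData does, not the actual code in logit-funs.R; simDataSketch is a hypothetical name):

## hypothetical logit-model simulator, consistent with the model used by mLL/lgGrad
simDataSketch = function(X,beta) {
   eta = X %*% matrix(beta,ncol=1)              # linear predictor
   py1 = 1/(1+exp(-eta))                        # P(y = 1 | x)
   rbinom(nrow(X),size=1,prob=as.vector(py1))   # Bernoulli draws
}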

Let’s get the mle from R’s glm function and check that the gradient of the minus log likelihood is close to 0 at the mle.

##  logit mle
ddf = data.frame(X,y)
lgm = glm(y~.-1,ddf,family='binomial')
print(summary(lgm))
## 
## Call:
## glm(formula = y ~ . - 1, family = "binomial", data = ddf)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.45597  -0.52606   0.04267   0.57679   2.66536  
## 
## Coefficients:
##    Estimate Std. Error z value Pr(>|z|)  
## x1   1.2602     0.5781   2.180   0.0293 *
## x2   1.9116     0.7703   2.482   0.0131 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 277.26  on 200  degrees of freedom
## Residual deviance: 147.49  on 198  degrees of freedom
## AIC: 151.49
## 
## Number of Fisher Scoring iterations: 6

## mle from R
bhat = lgm$coef

#check
cat("grad a mle: ",lgGrad(X,y,bhat),"\n")
## grad a mle:  -7.884589e-14 -6.322526e-14
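
As an extra sanity check (this snippet is a sketch, not part of logit-funs.R; eps and fdGrad are illustrative names), we can compare lgGrad at the mle to a central finite-difference approximation of the gradient of mLL; the two should agree to several digits.

## finite-difference check that lgGrad matches the numerical gradient of mLL
eps = 1e-6
fdGrad = rep(0,p)
for(j in 1:p) {
   bp = bhat; bp[j] = bp[j] + eps   # perturb coordinate j up
   bm = bhat; bm[j] = bm[j] - eps   # perturb coordinate j down
   fdGrad[j] = (mLL(X,y,bp) - mLL(X,y,bm))/(2*eps)
}
cat("finite-difference grad at mle: ",fdGrad,"\n")
cat("lgGrad at mle:                 ",lgGrad(X,y,bhat),"\n")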

Let’s compute the minus log likelihood on a two-dimensional grid of (beta1, beta2) values.

gs = 50 #one d grid size
alga1 = seq(from=0,to=3.0,length.out=gs) # one d grid
alga2 = seq(from=0,to=3.0,length.out=gs) # one d grid
alg2 = expand.grid(alga1,alga2) # two d grid


llv = rep(0,nrow(alg2))   # minus log likelihood at each grid point
quadv = rep(0,nrow(alg2)) # allocated but not used below
linv = rep(0,nrow(alg2))  # allocated but not used below
for(i in 1:length(llv)) {
   llv[i] = mLL(X,y,as.double(alg2[i,]))
}
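
Equivalently (a stylistic alternative, not from the original notes), the grid loop can be written in one line with apply:

## same evaluation of mLL over the rows of the grid, without an explicit loop
llv = apply(alg2, 1, function(b) mLL(X, y, as.double(b)))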


## contour 
llmat = matrix(as.numeric(llv),nrow=length(alga1),ncol=length(alga2))
contour(alga1,alga2,llmat,nlevels=100,drawlabels=FALSE,
     xlab=expression(beta[1]),ylab=expression(beta[2]),col="blue",
           cex.lab=1.8,cex.axis=1.2,lwd=1)
points(beta[1],beta[2],cex=1.5,pch=16,col="red")
points(bhat[1],bhat[2],cex=1.5,pch=16,col="magenta")
title(main="contours of -log likelihood, red dot at true value, magenta at mle", cex.main=.8)

Now let’s run gradient descent on the minus log likelihood, starting from a point away from the mle.
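
In symbols, the update used below is

$$
\beta^{(t+1)} \;=\; \beta^{(t)} \;-\; \eta_t \Big( \nabla\,\mathrm{mLL}\big(\beta^{(t)}\big) + 2\lambda\,\beta^{(t)} \Big),
$$

where eta_t is the learning rate lrat[i] and the 2*lambda*beta term is the gradient of a ridge penalty lambda*||beta||^2; the code sets lambda = 0, so the update reduces to plain gradient descent on the minus log likelihood.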

### gradient descent

### iteration parameters
setup=1
if(setup==1) {
   niter = 100 # number of iterations
   lrat = rep(5.0,niter) # learning rate
}
if(setup==2) {
   niter = 100 # number of iterations
   lrat = rep(40.0,niter) # learning rate
}
if(setup==3) {
   niter = 50 # number of iterations
   lrat = rep(35.0,niter) # learning rate
}

biter = c(1.2,.2) #starting value

## store iterations of beta values
bM = matrix(0.0,niter+1,length(biter))
bM[1,] = biter
lvv = rep(0,niter+1)
lambda = 0.0 # ridge penalty coefficient; lambda = 0 gives plain gradient descent
lvv[1] = mLL(X,y,biter)

## iterate
for(i in 1:niter) {
   gv = lgGrad(X,y,biter)
   biter = biter - lrat[i]*(gv + 2*lambda*biter)
   lvv[i+1] = mLL(X,y,biter)
   bM[i+1,] = biter
}
plot(lvv)

## contour 
llmat = matrix(as.numeric(llv),nrow=length(alga1),ncol=length(alga2))
contour(alga1,alga2,llmat,nlevels=100,drawlabels=FALSE,
     xlab=expression(beta[1]),ylab=expression(beta[2]),col="blue",
           cex.lab=1.8,cex.axis=1.2,lwd=1)
points(beta[1],beta[2],cex=1.5,pch=16,col="black")
points(bhat[1],bhat[2],cex=1.5,pch=16,col="magenta")
tnm=paste0("Gradient descent, learning rate= ",lrat[1],", niter= ",niter)
title(main=tnm, cex.main=1.2)

for(i in 2:(niter+1)) {
   arrows(bM[i-1,1],bM[i-1,2],bM[i,1],bM[i,2],length=.05,col="red")
}
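
A quick numerical check of convergence (bgd below is just an illustrative name): compare the final iterate stored in bM with the glm mle and look at the size of the gradient there; a small gradient norm means the iterate is near a stationary point of the minus log likelihood.

## how close did gradient descent get to the glm mle?
bgd = bM[niter+1,]   # final gradient-descent iterate
cat("final GD iterate: ",bgd,"\n")
cat("glm mle:          ",as.double(bhat),"\n")
cat("gradient norm at final iterate: ",sqrt(sum(lgGrad(X,y,bgd)^2)),"\n")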

Homework Problems