First let’s source some utility functions that Rob wrote for working with the logit model.
source("logit-funs.R")
ls()
## [1] "lgGrad" "lgH" "mLL" "phat" "simData"
For example, mLL(X,y,beta) is the minus log likelihood, -(1/n)*sum(y*log(p) + (1-y)*log(1-p)) with p = 1/(1+exp(-X%*%beta)), and lgGrad(X,y,beta) is its gradient, -(1/n)*t(X)%*%(y-p). Note that both functions pick up the sample size n from the global environment rather than from their arguments.
print(mLL)
## function (X, y, beta)
## {
## eta = X %*% matrix(beta, ncol = 1)
## py1 = 1/(1 + exp(-eta))
## return(-sum(ifelse(y == 1, log(py1), log(1 - py1)))/n)
## }
print(lgGrad)
## function (X, y, beta)
## {
## eta = X %*% matrix(beta, ncol = 1)
## pvec = 1/(1 + exp(-eta))
## gvec = t(X) %*% matrix(y - pvec, ncol = 1)
## return(-gvec/n)
## }
Now let’s simulate some data.
n=200
beta=c(1,2)
p = length(beta)
set.seed(17)
x1 = rnorm(n)
wht = .7
x2 = wht*x1 + (1-wht)*rnorm(n)
print(cor(x1,x2))
## [1] 0.938045
X = cbind(x1,x2)
y = simData(X,beta)
plot(x1,x2)
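simData is one of the sourced functions, so its details are in logit-funs.R; presumably it draws each y as a Bernoulli with success probability 1/(1+exp(-X%*%beta)). Here is a minimal sketch of such a simulator under that assumption (not necessarily Rob’s implementation):
## sketch of a logit simulator with the assumed behavior of simData
simDataSketch = function(X,beta) {
  eta = X %*% matrix(beta,ncol=1)      # linear predictor
  pvec = 1/(1+exp(-eta))               # P(y=1 | x)
  rbinom(nrow(X),size=1,prob=pvec)     # Bernoulli draws
}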
Let’s get the mle from the R glm function and check that the gradient is close to 0 at the mle.
## logit mle
ddf = data.frame(X,y)
lgm = glm(y~.-1,ddf,family='binomial')
print(summary(lgm))
##
## Call:
## glm(formula = y ~ . - 1, family = "binomial", data = ddf)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.45597 -0.52606 0.04267 0.57679 2.66536
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## x1 1.2602 0.5781 2.180 0.0293 *
## x2 1.9116 0.7703 2.482 0.0131 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 277.26 on 200 degrees of freedom
## Residual deviance: 147.49 on 198 degrees of freedom
## AIC: 151.49
##
## Number of Fisher Scoring iterations: 6
## mle from R
bhat = lgm$coef
#check
cat("grad a mle: ",lgGrad(X,y,bhat),"\n")
## grad a mle: -7.884589e-14 -6.322526e-14
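As an extra check on the sourced code itself, we can compare lgGrad to a finite-difference approximation of mLL at the mle; and since the responses are 0/1, the residual deviance reported by glm should equal 2*n times mLL at the mle. A quick sketch of both checks:
## finite-difference approximation of the gradient of mLL at the mle
eps = 1e-6
numg = rep(0,p)
for(j in 1:p) {
  bp = bhat; bp[j] = bp[j] + eps
  bm = bhat; bm[j] = bm[j] - eps
  numg[j] = (mLL(X,y,bp) - mLL(X,y,bm))/(2*eps)
}
cat("numerical grad at mle: ",numg,"\n")
## for 0/1 data, residual deviance = 2*n*mLL at the mle
cat("mLL at mle: ",mLL(X,y,bhat)," deviance/(2n): ",lgm$deviance/(2*n),"\n")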
Let’s compute the minus log likelihood on a two-dimensional grid of (beta1, beta2) values.
gs = 50 #one d grid size
alga1 = seq(from=0.0,to=3.0,length.out=gs) #one d grid
alga2 = seq(from=0.0,to=3.0,length.out=gs) #one d grid
alg2 = expand.grid(alga1,alga2) # two d grid
llv = rep(0,nrow(alg2))
quadv = rep(0,nrow(alg2))
linv = rep(0,nrow(alg2))
for(i in 1:length(llv)) {
llv[i] = mLL(X,y,as.double(alg2[i,]))
}
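The same values can be computed without the explicit loop, e.g. with apply (an equivalent one-liner):
## equivalent vectorized form of the loop above
llv = apply(alg2,1,function(b) mLL(X,y,as.double(b)))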
## contour
llmat = matrix(as.numeric(llv),nrow=length(alga1),ncol=length(alga2))
contour(alga1,alga2,llmat,nlevels=100,drawlabels=FALSE,
xlab=expression(beta[1]),ylab=expression(beta[2]),col="blue",
cex.lab=1.8,cex.axis=1.2,lwd=1)
points(beta[1],beta[2],cex=1.5,pch=16,col="red")
points(bhat[1],bhat[2],cex=1.5,pch=16,col="magenta")
title(main="contours of -log likelihood, red dot at true value, magenta at mle", cex.main=.8)
Now let’s try gradient descent. Starting from an initial value, each iteration updates beta to beta - learning_rate*(gradient + 2*lambda*beta), where lambda is an optional ridge penalty; with lambda = 0 (as below) we are simply following the negative gradient of the minus log likelihood. Three setups with different learning rates and iteration counts are provided; setup 1 is used here.
### gradient descent
### iteration parameters
setup=1
if(setup==1) {
niter = 100 # number of iterations
lrat = rep(5.0,niter) #learning rate
}
if(setup==2) {
niter = 100 # number of iterations
lrat = rep(40.0,niter) #learning rate
}
if(setup==3) {
niter = 50 # number of iterations
lrat = rep(35.0,niter) #learning rate
}
biter = c(1.2,.2) #starting value
## store iterations of beta values
bM = matrix(0.0,niter+1,length(biter))
bM[1,] = biter
lvv = rep(0,niter+1)
lambda = 0.0
lvv[1] = mLL(X,y,biter)
## iterate
for(i in 1:niter) {
gv = lgGrad(X,y,biter)
biter = biter - lrat[i]*(gv + 2*lambda*biter)
lvv[i+1] = mLL(X,y,biter)
bM[i+1,] = biter
}
plot(lvv)
## contour
llmat = matrix(as.numeric(llv),nrow=length(alga1),ncol=length(alga2))
contour(alga1,alga2,llmat,nlevels=100,drawlabels=FALSE,
xlab=expression(beta[1]),ylab=expression(beta[2]),col="blue",
cex.lab=1.8,cex.axis=1.2,lwd=1)
points(beta[1],beta[2],cex=1.5,pch=16,col="black")
points(bhat[1],bhat[2],cex=1.5,pch=16,col="magenta")
tnm=paste0("Gradient descent, learning rate= ",lrat[1],", niter= ",niter)
title(main=tnm, cex.main=1.2)
for(i in 2:(niter+1)) {
arrows(bM[i-1,1],bM[i-1,2],bM[i,1],bM[i,2],length=.05,col="red")
}
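Finally, compare the last gradient-descent iterate with the glm mle (a quick check; depending on the learning rate and number of iterations the iterates may not have fully converged):
## final gradient-descent iterate vs the glm mle
cat("gd estimate: ",bM[niter+1,],"\n")
cat("glm mle:     ",bhat,"\n")
cat("mLL at gd estimate: ",mLL(X,y,bM[niter+1,])," at mle: ",mLL(X,y,bhat),"\n")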