Look at these two links on the webpage.
simple example of svd in python
simple example of svd in python, html
The first is Python code with comments and the second is a Jupyter notebook version.
These files just go through the SVD as implemented in numpy.linalg. I’m just trying to make sure I understand how numpy.linalg.svd works and to check the basic properties.
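If you just want the flavor of the checks, here is a minimal sketch (not the file itself) with a small random X standing in for the one in the file:
import numpy as np

n, p = 6, 3
rng = np.random.default_rng(0)
X = rng.normal(size=(n, p))

## economy-size SVD: U is n x p, s holds the p singular values, Vt is p x p
U, s, Vt = np.linalg.svd(X, full_matrices=False)

print(np.allclose(X, U @ np.diag(s) @ Vt))  # X = U S V'
print(np.allclose(U.T @ U, np.eye(p)))      # columns of U are orthonormal
print(np.allclose(Vt @ Vt.T, np.eye(p)))    # V is orthogonal
print(np.all(np.diff(s) <= 0))              # singular values come sorted, largest first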
At the end of the file it gets interesting: I try an X that is not of full rank.
## let's see what happens in the non-full-rank case
## we'll append the sum of the columns of X to X
Xbar = X.sum(axis=1)
## check: Xp @ Xbar should be all ones
print(Xp @ Xbar)
## try one of these two
X1 = np.hstack([X,Xbar.reshape((n,1))]) # add on the sum of the columns
#X1 = np.hstack([X,X[:,p-1].reshape(n,1)]) # repeat the last column
In the notebook I did the version where I append Xbar (the sum of the columns) to X to make it not of full rank. (I should have called it Xsum, not Xbar.) Try the second version where I just repeat the last column. What should be different? What should be the same?
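As a hint if you are checking this in Python: whichever of the two versions you use, the appended column is a linear combination of the columns of X, so the rank deficiency shows up directly in the singular values. A minimal sketch, assuming X1 has been built by one of the two lines above and the original X has full column rank p:
U1, s1, Vt1 = np.linalg.svd(X1, full_matrices=False)
print(s1)                         # the smallest singular value should be (numerically) zero
print(np.linalg.matrix_rank(X1))  # still p, even though X1 has p+1 columns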
Of course, if you are not using Python you will have to do more work to figure out how SVD works!
First let’s source some utility functions that Rob wrote for working
with the logit model.
There is a link to the file logit-funs.R on the webpage.
source("logit-funs.R")
ls()
## [1] "lgGrad" "lgH" "mLL" "phat" "simData"
For example, mLL is the minus log-likelihood and lgGrad is its gradient.
print(mLL)
## function (X, y, beta)
## {
## eta = X %*% matrix(beta, ncol = 1)
## py1 = 1/(1 + exp(-eta))
## return(-sum(ifelse(y == 1, log(py1), log(1 - py1)))/n)
## }
print(lgGrad)
## function (X, y, beta)
## {
## eta = X %*% matrix(beta, ncol = 1)
## pvec = 1/(1 + exp(-eta))
## gvec = t(X) %*% matrix(y - pvec, ncol = 1)
## return(-gvec/n)
## }
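Written out, the two printed functions compute, with p_i = P(y_i = 1 | x_i) under the logit model,
\[
\mathrm{mLL}(\beta) \;=\; -\frac{1}{n}\sum_{i=1}^{n}\Big[\,y_i\log p_i + (1-y_i)\log(1-p_i)\Big],
\qquad
p_i \;=\; \frac{1}{1+e^{-x_i^\top\beta}},
\]
\[
\mathrm{lgGrad}(\beta) \;=\; \nabla_\beta\,\mathrm{mLL}(\beta) \;=\; -\frac{1}{n}\,X^\top(y-p),
\]
where p is the vector of the p_i. (Note that both functions pick up n from the global environment rather than from their arguments.)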
Now let’s simulate some data.
n=200
beta=c(1,2)
p = length(beta)
set.seed(17)
x1 = rnorm(n)
wht = .7 # mixing weight: x2 is wht*x1 plus (1-wht) times independent noise, so x1 and x2 are highly correlated
x2 = wht*x1 + (1-wht)*rnorm(n)
print(cor(x1,x2))
## [1] 0.938045
X = cbind(x1,x2)
y = simData(X,beta)
plot(x1,x2)
Let’s get the mle from the R glm function and check that the gradient is close to 0 at the mle.
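(Recall the first-order condition: at the mle, \(\nabla\,\mathrm{mLL}(\hat\beta) = 0\), so lgGrad(X,y,bhat) should be numerically zero.)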
## logit mle
ddf = data.frame(X,y)
lgm = glm(y~.-1,ddf,family='binomial')
print(summary(lgm))
##
## Call:
## glm(formula = y ~ . - 1, family = "binomial", data = ddf)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.45597 -0.52606 0.04267 0.57679 2.66536
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## x1 1.2602 0.5781 2.180 0.0293 *
## x2 1.9116 0.7703 2.482 0.0131 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 277.26 on 200 degrees of freedom
## Residual deviance: 147.49 on 198 degrees of freedom
## AIC: 151.49
##
## Number of Fisher Scoring iterations: 6
## mle from R
bhat = lgm$coef
#check
cat("grad a mle: ",lgGrad(X,y,bhat),"\n")
## grad a mle: -7.884589e-14 -6.322526e-14
Let’s compute the -log likelihood on a grid in R^2.
gs = 50 #one d grid size
alga1 = seq(from=-0,to=3.0,length.out=gs) #one d grid
alga2 = seq(from=-0,to=3.0,length.out=gs) #one d grid
alg2 = expand.grid(alga1,alga2) # two d grid
llv = rep(0,nrow(alg2))
quadv = rep(0,nrow(alg2))
linv = rep(0,nrow(alg2))
for(i in 1:length(llv)) {
llv[i] = mLL(X,y,as.double(alg2[i,]))
}
## contour
llmat = matrix(as.numeric(llv),nrow=length(alga1),ncol=length(alga2))
contour(alga1,alga2,llmat,nlevels=100,drawlabels=FALSE,
xlab=expression(beta[1]),ylab=expression(beta[2]),col="blue",
cex.lab=1.8,cex.axis=1.2,lwd=1)
points(beta[1],beta[2],cex=1.5,pch=16,col="red")
points(bhat[1],bhat[2],cex=1.5,pch=16,col="magenta")
title(main="contours of -log likelihood, red dot at true value, magenta at mle", cex.main=.8)
Now let’s try gradient descent on the minus log-likelihood.
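Each iteration below takes a step against the gradient plus an optional ridge term (the 2*lambda*biter in the code); with lambda = 0, as used here, this is plain gradient descent on mLL:
\[
\beta^{(t+1)} \;=\; \beta^{(t)} \;-\; \eta_t\,\Big(\nabla\,\mathrm{mLL}(\beta^{(t)}) + 2\lambda\,\beta^{(t)}\Big),
\]
where the learning rate \(\eta_t\) is lrat[t].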
### gradient descent
### iteration parameters
setup=1
if(setup==1) {
niter = 100 # number of iterations
lrat = rep(5.0,niter) #learning rate
}
if(setup==2) {
niter = 100 # number of iterations
lrat = rep(40.0,niter) #learning rate
}
if(setup==3) {
niter = 50 # number of iterations
lrat = rep(35.0,niter) #learning rate
}
biter = c(1.2,.2) #starting value
## store iterations of beta values
bM = matrix(0.0,niter+1,length(biter))
bM[1,] = biter
lvv = rep(0,niter+1) # store -log lik at each iterate
lambda = 0.0 # L2 (ridge) penalty weight; 0 gives plain gradient descent
lvv[1] = mLL(X,y,biter)
## iterate
for(i in 1:niter) {
gv = lgGrad(X,y,biter)
biter = biter - lrat[i]*(gv + 2*lambda*biter)
lvv[i+1] = mLL(X,y,biter)
bM[i+1,] = biter
}
plot(lvv)
## contour
llmat = matrix(as.numeric(llv),nrow=length(alga1),ncol=length(alga2))
contour(alga1,alga2,llmat,nlevels=100,drawlabels=FALSE,
xlab=expression(beta[1]),ylab=expression(beta[2]),col="blue",
cex.lab=1.8,cex.axis=1.2,lwd=1)
points(beta[1],beta[2],cex=1.5,pch=16,col="black")
points(bhat[1],bhat[2],cex=1.5,pch=16,col="magenta")
tnm=paste0("Gradient descent, learning rate= ",lrat[1],", niter= ",niter)
title(main=tnm, cex.main=1.2)
for(i in 2:(niter+1)) {
arrows(bM[i-1,1],bM[i-1,2],bM[i,1],bM[i,2],length=.05,col="red")
}