##################################################
## libraries
library(ggplot2)
##################################################
## simulate data
n = 200 #sample size
beta = c(1,.5) # true (intercept, slope)
## storage for simulation results
## py1 = F(beta[1] + beta[2]x), y ~ Bern(py1)
x = rep(0,n)
y = rep(0,n)
py1 = rep(0,n)
## simulate
set.seed(14) #dave keon
for(i in 1:n) {
x[i] = rnorm(1)
py1[i] = 1/(1 + exp(-(beta[1] + beta[2]*x[i])))
y[i] = rbinom(1,1,py1[i])
}
## simulated data as a data frame
ddf = data.frame(x,y= as.factor(y),py1)
head(ddf)
## x y py1
## 1 -0.66184983 0 0.6612960
## 2 0.13209832 1 0.7438448
## 3 1.49715368 1 0.8517732
## 4 -0.30114431 1 0.7004471
## 5 -0.06488077 0 0.7246328
## 6 0.64237371 1 0.7893791
summary(ddf)
## x y py1
## Min. :-2.14635 0: 56 Min. :0.4817
## 1st Qu.:-0.65945 1:144 1st Qu.:0.6616
## Median : 0.10398 Median :0.7412
## Mean : 0.03383 Mean :0.7253
## 3rd Qu.: 0.71152 3rd Qu.:0.7951
## Max. : 2.09226 Max. :0.8856
## write to csv file
write.csv(ddf,file="rsimdata.csv",row.names=FALSE)
## read in with temp = read.csv("rsimdata.csv")
##################################################
## plot
# x vs p(y=1|x)
plot(x,py1)

## marginal of y
table(ddf$y)/n
##
## 0 1
## 0.28 0.72
## x|y
boxplot(x~y,ddf)

p = ggplot(ddf,aes(x=x,y=y)) + geom_boxplot()
p

##################################################
### fit logit in R
lfit = glm(y~x,ddf,family=binomial())
summary(lfit)
##
## Call:
## glm(formula = y ~ x, family = binomial(), data = ddf)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9549 -1.2132 0.6598 0.8082 1.3048
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.9874 0.1651 5.980 2.23e-09 ***
## x 0.5973 0.1815 3.291 0.000998 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 237.18 on 199 degrees of freedom
## Residual deviance: 225.59 on 198 degrees of freedom
## AIC: 229.59
##
## Number of Fisher Scoring iterations: 4
## plot fit
phat = predict(lfit,type="response")
oo = order(x) # helps with plotting
plot(x[oo],py1[oo],col="blue",xlab="x",ylab="P(Y=1|x)",ylim=range(c(phat,py1)),type='l',lwd=2)
lines(x[oo],phat[oo],col="red",lwd=2)
legend('topleft',legend=c("true","estimated"),col=c("blue","red"),lwd=c(1,1),bty="n")
title("x vs. P(Y=1|x)")

##################################################
### - log lik
mLL1 = function(x,y,beta) {
n = length(y)
mll = 0.0
for(i in 1:n) {
py1 = 1/(1 + exp(-(beta[1] + beta[2]*x[i])))
if(y[i] == 1) {
mll = mll - log(py1)
} else {
mll = mll - log(1-py1)
}
}
return(mll)
}
##check against R
deviance = 2*mLL1(x,y,lfit$coef)
aic = deviance + 2*length(lfit$coef)
cat("deviance and aic should be ",deviance,", ", aic,"\n")
## deviance and aic should be 225.5914 , 229.5914
##################################################
### get -LL on beta1 grid
nval = 1000
p=2
## evaluate -LL at each row of bMat
bMat = matrix(0.0,nval,p)
bMat[,1] = lfit$coeff[1]
bMat[,2] = seq(from=0,to=1,length.out=nval)
## -LL evals
llv = rep(0,nval)
for(i in 1:nval) {
llv[i] = mLL1(x,y,bMat[i,])
}
## plot llv
plot(bMat[,2],llv,xlab="beta1 values",ylab="- log likelihood")
abline(v=beta[2],col="blue")
ii = which.min(llv)
abline(v=bMat[ii,2],col="red")
title(main="blue at true, red at mle",cex.main=2.2)

## dev.copy2pdf(file="mLL-on-a-beta1-grid.pdf",width=10,height=8)
## row of bMat at min:
bMat[ii,]
## [1] 0.9874014 0.5975976
## check
lfit$coef
## (Intercept) x
## 0.9874014 0.5972598
##################################################
### get -LL on bivariate grid
nval=50
b1g = seq(from=0,to=2,length.out=nval)
b2g = seq(from=0,to=1,length.out=nval)
## bg will be bivariate grid, all vals of b1g x all vals of b2g
bg = expand.grid(b1g,b2g)
nn = nrow(bg)
## loop over rows of bg and compute -LL
llv1 = rep(0,nn)
tm1 = system.time({
for(i in 1:nn) {
if( (i %% 500) == 0) cat("i: ",i,"\n")
llv1[i] = mLL1(x,y,bg[i,])
}
})
## i: 500
## i: 1000
## i: 1500
## i: 2000
## i: 2500
print(tm1)
## user system elapsed
## 321.587 0.160 321.837
## get bivariate grid as a matrix, and try loop
bgM = as.matrix(bg)
llv2 = rep(0,nn)
tm2 = system.time({
for(i in 1:nn) {
if( (i %% 500) == 0) cat("i: ",i,"\n")
llv2[i] = mLL1(x,y,bgM[i,])
}
})
## i: 500
## i: 1000
## i: 1500
## i: 2000
## i: 2500
print(tm2)
## user system elapsed
## 0.682 0.004 0.685
## check
ii = which.min(llv2)
cat("min -LL over grid: ",bgM[ii,],"\n")
## min -LL over grid: 0.9795918 0.5918367
cat("mle: ",lfit$coef,"\n")
## mle: 0.9874014 0.5972598
cat("true: ",beta,"\n")
## true: 1 0.5
temp = rbind(bgM[ii,],lfit$coef,beta)
rownames(temp) = c("grid mle","rmle","true")
colnames(temp) = c("intercept","slope")
print(temp)
## intercept slope
## grid mle 0.9795918 0.5918367
## rmle 0.9874014 0.5972598
## true 1.0000000 0.5000000
## try a simpler matrix, do str(bgM) str(bgMM)
bgMM = cbind(bgM[,1],bgM[,2])
llv3 = rep(0,nn)
tm3 = system.time({
for(i in 1:nn) {
if( (i %% 500) == 0) cat("i: ",i,"\n")
llv3[i] = mLL1(x,y,bgMM[i,])
}
})
## i: 500
## i: 1000
## i: 1500
## i: 2000
## i: 2500
print(tm3)
## user system elapsed
## 0.076 0.001 0.076
## check the three minus log likelihoods are the same
summary(llv2-llv3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
summary(as.double(llv1) - llv2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
##################################################
### write vectorized version of mLL, no loops!!
mLL = function(x,y,beta) {
py1 = 1/(1+exp(-(beta[1] + beta[2]*x)))
return(-sum(ifelse(y==1,log(py1),log(1-py1))))
}
llv4 = rep(0,nn)
tm4 = system.time({
for(i in 1:nn) {
llv4[i] = mLL(x,y,bgMM[i,])
}
})
print(tm4)
## user system elapsed
## 0.037 0.000 0.036
##check
summary(llv3-llv4)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5.542e-13 -2.842e-14 0.000e+00 8.530e-17 2.842e-14 4.690e-13
mllM = matrix(llv4,nrow=length(b1g))
contour(b1g,b2g,mllM,nlevels=50,col='blue',drawlabels=FALSE)
points(beta[1],beta[2],col='blue',pch=16)
points(lfit$coef[1],lfit$coef[2],col='red',pch=15)

persp(b1g,b2g,mllM)

##################################################
## 2d plot
library(rgl)
plot3d(bgMM[,1],bgMM[,2],llv4)
## Warning in par3d(userMatrix = structure(c(1, 0, 0, 0, 0, 0.342020143325668, :
## font family "sans" not found, using "bitmap"
##vHello Professor,
##v
##vI found out RGL package is not working in MacOS because of new update in Xquartz. https://stackoverflow.com/questions/66011929/package-rgl-in-r-not-loading-in-mac-os/66127391#66127391. But I found out new package called “plotly” for plotting 3d plots in R. I even tried with the mileage-year vs price.
##v
##vplot_ly(x=cd1$mileage, y=cd1$year, z=cd1$price, type="scatter3d", mode="markers", color = cd1$price) %>% layout(title = 'Mileage-Year vs Price', scene = list(xaxis = list(title = "Mileage"), yaxis = list(title = "Year"), zaxis = list(title = "Price")))
##v