##################################################
## libraries
library(ggplot2)
##################################################
## simulate data
n = 200 #sample size
beta = c(1,.5) # true (intercept, slope)

## storage for simulation results
## py1 = F(beta[1] + beta[2]x), y ~ Bern(py1)
x = rep(0,n)
y = rep(0,n)
py1 = rep(0,n)

## simulate
set.seed(14) #dave keon
for(i in 1:n) {
   x[i] = rnorm(1)
   py1[i] = 1/(1 + exp(-(beta[1] + beta[2]*x[i])))
   y[i] = rbinom(1,1,py1[i])
}
## simulated data as a data frame
ddf = data.frame(x,y= as.factor(y),py1)
head(ddf)
##             x y       py1
## 1 -0.66184983 0 0.6612960
## 2  0.13209832 1 0.7438448
## 3  1.49715368 1 0.8517732
## 4 -0.30114431 1 0.7004471
## 5 -0.06488077 0 0.7246328
## 6  0.64237371 1 0.7893791
summary(ddf)
##        x            y            py1        
##  Min.   :-2.14635   0: 56   Min.   :0.4817  
##  1st Qu.:-0.65945   1:144   1st Qu.:0.6616  
##  Median : 0.10398           Median :0.7412  
##  Mean   : 0.03383           Mean   :0.7253  
##  3rd Qu.: 0.71152           3rd Qu.:0.7951  
##  Max.   : 2.09226           Max.   :0.8856
## write to csv file
write.csv(ddf,file="rsimdata.csv",row.names=FALSE)
  ## read in with temp = read.csv("rsimdata.csv")
##################################################
## plot

# x vs p(y=1|x)
plot(x,py1)

## marginal of y
table(ddf$y)/n
## 
##    0    1 
## 0.28 0.72
## x|y
boxplot(x~y,ddf)

p = ggplot(ddf,aes(x=x,y=y)) + geom_boxplot()
p

##################################################
### fit logit in R
lfit = glm(y~x,ddf,family=binomial())
summary(lfit)
## 
## Call:
## glm(formula = y ~ x, family = binomial(), data = ddf)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9549  -1.2132   0.6598   0.8082   1.3048  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.9874     0.1651   5.980 2.23e-09 ***
## x             0.5973     0.1815   3.291 0.000998 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 237.18  on 199  degrees of freedom
## Residual deviance: 225.59  on 198  degrees of freedom
## AIC: 229.59
## 
## Number of Fisher Scoring iterations: 4
## plot fit
phat = predict(lfit,type="response")
oo = order(x) # helps with plotting
plot(x[oo],py1[oo],col="blue",xlab="x",ylab="P(Y=1|x)",ylim=range(c(phat,py1)),type='l',lwd=2)
lines(x[oo],phat[oo],col="red",lwd=2)
legend('topleft',legend=c("true","estimated"),col=c("blue","red"),lwd=c(1,1),bty="n")
title("x vs. P(Y=1|x)")

##################################################
### - log lik
mLL1 = function(x,y,beta) {
   n = length(y)
   mll = 0.0
   for(i in 1:n) {
      py1 = 1/(1 + exp(-(beta[1] + beta[2]*x[i])))
      if(y[i] == 1) {
         mll = mll - log(py1)
      } else {
         mll = mll - log(1-py1)
      }
   }
   return(mll)
}
##check against R
deviance = 2*mLL1(x,y,lfit$coef)
aic = deviance + 2*length(lfit$coef)
cat("deviance and aic should be ",deviance,", ", aic,"\n")
## deviance and aic should be  225.5914 ,  229.5914
##################################################
### get -LL on beta1 grid
nval = 1000
p=2
## evaluate -LL at each row of bMat
bMat = matrix(0.0,nval,p)
bMat[,1] = lfit$coeff[1]
bMat[,2] = seq(from=0,to=1,length.out=nval)

## -LL evals
llv = rep(0,nval)

for(i in 1:nval) {
   llv[i] = mLL1(x,y,bMat[i,])
}
## plot llv
plot(bMat[,2],llv,xlab="beta1 values",ylab="- log likelihood")
abline(v=beta[2],col="blue")
ii = which.min(llv)
abline(v=bMat[ii,2],col="red")
title(main="blue at true, red at mle",cex.main=2.2)

## dev.copy2pdf(file="mLL-on-a-beta1-grid.pdf",width=10,height=8)
## row of bMat at min:
bMat[ii,]
## [1] 0.9874014 0.5975976
## check
lfit$coef
## (Intercept)           x 
##   0.9874014   0.5972598
##################################################
### get -LL on bivariate grid
nval=50
b1g = seq(from=0,to=2,length.out=nval) 
b2g = seq(from=0,to=1,length.out=nval) 
## bg will be bivariate grid, all vals of b1g x all vals of b2g
bg = expand.grid(b1g,b2g)
nn = nrow(bg)
## loop over rows of bg and compute -LL
llv1 = rep(0,nn)
tm1 = system.time({
for(i in 1:nn) {
   if( (i %% 500) == 0) cat("i: ",i,"\n")
   llv1[i] = mLL1(x,y,bg[i,])
}
})
## i:  500 
## i:  1000 
## i:  1500 
## i:  2000 
## i:  2500
print(tm1)
##    user  system elapsed 
## 321.587   0.160 321.837
## get bivariate grid as a matrix, and try loop
bgM = as.matrix(bg)
llv2 = rep(0,nn)
tm2 = system.time({
for(i in 1:nn) {
   if( (i %% 500) == 0) cat("i: ",i,"\n")
   llv2[i] = mLL1(x,y,bgM[i,])
}
})
## i:  500 
## i:  1000 
## i:  1500 
## i:  2000 
## i:  2500
print(tm2)
##    user  system elapsed 
##   0.682   0.004   0.685
## check
ii = which.min(llv2)
cat("min -LL over grid: ",bgM[ii,],"\n")
## min -LL over grid:  0.9795918 0.5918367
cat("mle: ",lfit$coef,"\n")
## mle:  0.9874014 0.5972598
cat("true: ",beta,"\n")
## true:  1 0.5
temp  = rbind(bgM[ii,],lfit$coef,beta)
rownames(temp) = c("grid mle","rmle","true")
colnames(temp) = c("intercept","slope")
print(temp)
##          intercept     slope
## grid mle 0.9795918 0.5918367
## rmle     0.9874014 0.5972598
## true     1.0000000 0.5000000
## try a simpler matrix, do str(bgM) str(bgMM)
bgMM = cbind(bgM[,1],bgM[,2])
llv3 = rep(0,nn)
tm3 = system.time({ 
for(i in 1:nn) {
   if( (i %% 500) == 0) cat("i: ",i,"\n")
   llv3[i] = mLL1(x,y,bgMM[i,])
}
})
## i:  500 
## i:  1000 
## i:  1500 
## i:  2000 
## i:  2500
print(tm3)
##    user  system elapsed 
##   0.076   0.001   0.076
## check the three minus log likelihoods are the same
summary(llv2-llv3)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
summary(as.double(llv1) - llv2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
##################################################
### write vectorized version of mLL, no loops!!
mLL = function(x,y,beta) {
   py1 = 1/(1+exp(-(beta[1] + beta[2]*x)))
   return(-sum(ifelse(y==1,log(py1),log(1-py1))))
}

llv4 = rep(0,nn)
tm4 = system.time({
for(i in 1:nn) {
   llv4[i] = mLL(x,y,bgMM[i,])
}
})
print(tm4)
##    user  system elapsed 
##   0.037   0.000   0.036
##check
summary(llv3-llv4)
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -5.542e-13 -2.842e-14  0.000e+00  8.530e-17  2.842e-14  4.690e-13
mllM = matrix(llv4,nrow=length(b1g))

contour(b1g,b2g,mllM,nlevels=50,col='blue',drawlabels=FALSE)
points(beta[1],beta[2],col='blue',pch=16)
points(lfit$coef[1],lfit$coef[2],col='red',pch=15)

persp(b1g,b2g,mllM)

##################################################
## 2d plot
library(rgl)
plot3d(bgMM[,1],bgMM[,2],llv4)
## Warning in par3d(userMatrix = structure(c(1, 0, 0, 0, 0, 0.342020143325668, :
## font family "sans" not found, using "bitmap"
##vHello Professor,
##v
##vI found out RGL package is not working in MacOS because of new update in Xquartz. https://stackoverflow.com/questions/66011929/package-rgl-in-r-not-loading-in-mac-os/66127391#66127391. But I found out new package called “plotly” for plotting 3d plots in R. I even tried with the mileage-year vs price. 
##v
##vplot_ly(x=cd1$mileage, y=cd1$year, z=cd1$price, type="scatter3d", mode="markers", color = cd1$price) %>% layout(title = 'Mileage-Year vs Price', scene = list(xaxis = list(title = "Mileage"), yaxis = list(title = "Year"), zaxis = list(title = "Price")))
##v