Hitters example from the ISLR Chapter 10 lab: linear regression, the lasso, and a single-hidden-layer neural network fit with torch.

##################################################
### data, train/test
library(ISLR2)
Gitters <- na.omit(Hitters) ## drop the 59 players with missing Salary; "Gitters" is the lab's name for the cleaned data
n <- nrow(Gitters)
set.seed(13)
ntest <- trunc(n / 3)
testid <- sample(1:n, ntest)
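## na.omit() leaves 263 complete cases, so ntest = trunc(263 / 3) = 87
## observations are held out for testing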
##################################################
### linear regression
lfit <- lm(Salary ~ ., data = Gitters[-testid, ])
lpred <- predict(lfit, Gitters[testid, ])
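## 'mad' in the labels below = mean absolute deviation of the predictions,
## i.e. mean(|yhat - Salary|) on the test set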
print('test mad for linear:')
## [1] "test mad for linear:"
with(Gitters[testid, ], mean(abs(lpred - Salary)))
## [1] 254.6687
##################################################
### get y and x, with dummies for the factor predictors
x <- scale(model.matrix(Salary ~ . - 1, data = Gitters))
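## note: "- 1" drops the intercept column, and scale() standardizes each
## column of the design matrix (dummies included) to mean 0, variance 1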
y <- Gitters$Salary
##################################################
### lasso

library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
cvfit <- cv.glmnet(x[-testid, ], y[-testid],
    type.measure = "mae") ## select lambda by cross-validated mean absolute error
cpred <- predict(cvfit, x[testid, ], s = "lambda.min") ## predict at the CV-minimizing lambda
print('test mad for Lasso')
## [1] "test mad for Lasso"
print(mean(abs(y[testid] - cpred)))
## [1] 252.2994
plot(cvfit) ## CV mean absolute error versus log(lambda)

plot(cvfit$glmnet.fit) ## coefficient paths along the regularization path
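## Optional (an addition, not in the lab): inspect which coefficients the
## lasso keeps at the CV-selected lambda; dots mark dropped predictors.
print(coef(cvfit, s = "lambda.min"))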

##################################################
### torch

library(torch)
library(luz) # high-level interface for torch
library(torchvision) # for image datasets and transforms (not needed for this example)
library(torchdatasets) # for additional datasets (not needed for this example)
library(zeallot) # for the %<-% multi-assignment operator
torch_manual_seed(13) ## make weight initialization and dropout reproducible
##################################################
### nn model

modnn <- nn_module(
  initialize = function(input_size) {
    self$hidden <- nn_linear(input_size, 50)
    self$activation <- nn_relu()
    self$dropout <- nn_dropout(0.4)
    self$output <- nn_linear(50, 1)
  },
  forward = function(x) {
    x %>% 
      self$hidden() %>% 
      self$activation() %>% 
      self$dropout() %>% 
      self$output()
  }
)

## The object modnn has a single hidden layer with 50 hidden units, 
## and a ReLU activation function. 
## It then has a dropout layer, in which a random 40% of the 50 activations 
## from the previous layer are set to zero during each iteration of the 
## stochastic gradient descent algorithm. 
## Finally, the output layer has just one unit with no activation function, 
## indicating that the model provides a single quantitative output.
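## A quick sanity check (an addition, not in the lab): instantiating the
## module and printing it lists the layers and their parameter counts.
print(modnn(input_size = ncol(x)))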
##################################################
### make x again using pipe notation
x1 <- x ## keep a copy to verify the pipe version matches
x <- model.matrix(Salary ~ . - 1, data = Gitters) %>% scale()
summary(as.double(x1) - as.double(x))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
##################################################
### loss and optimizer

modnn <- modnn %>% 
  setup(
    loss = nn_mse_loss(),
    optimizer = optim_rmsprop,
    metrics = list(luz_metric_mae())
  ) %>% 
  set_hparams(input_size = ncol(x))
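## Optional (an assumption, not shown in the lab): optimizer arguments such
## as the learning rate can be passed with luz's set_opt_hparams(), e.g.
## modnn <- modnn %>% set_opt_hparams(lr = 0.001)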
##################################################
### fit, i.e. run SGD

fitted <- modnn %>% 
  fit(
    data = list(x[-testid, ], matrix(y[-testid], ncol = 1)),
    valid_data = list(x[testid, ], matrix(y[testid], ncol = 1)),
    epochs = 20
  )
## plot the training and validation loss and MAE by epoch
plot(fitted)

##################################################
### predict and loss

## as in the lab: as.matrix() converts the torch tensor for arithmetic with y
npred <- predict(fitted, x[testid, ])
print('nnet mad:')
## [1] "nnet mad:"
print(mean(abs(y[testid] - as.matrix(npred))))
## [1] 274.0331
## first convert npred (a torch tensor) to a plain R matrix, then recompute the same MAE
nnpred <- as_array(npred$cpu())

print('nnet mad:')
## [1] "nnet mad:"
print(mean(abs(y[testid] - nnpred)))
## [1] 274.0331
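## For scale (an addition, not in the lab): the test MAE from predicting the
## training-set mean salary for everyone; all three models should beat it.
print(mean(abs(y[testid] - mean(y[-testid]))))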
## compare the predictions from linear, lasso, and neural net
pMat <- cbind(y[testid], lpred, cpred, nnpred)
colnames(pMat) <- c('ytest', 'linear', 'lasso', 'nnet')
pairs(pMat) ## scatterplot matrix of actual vs. predicted salaries

print(cor(pMat))
##            ytest    linear     lasso      nnet
## ytest  1.0000000 0.7636504 0.7380713 0.7390636
## linear 0.7636504 1.0000000 0.9662894 0.8475180
## lasso  0.7380713 0.9662894 1.0000000 0.8610044
## nnet   0.7390636 0.8475180 0.8610044 1.0000000
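## The linear and lasso predictions are nearly interchangeable (correlation
## 0.97); the neural net correlates less with both (~0.85) and has the
## highest test MAE here, so no method clearly dominates on this data set.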