Get Library and Rob’s Code

library(kknn)
source("http://www.rob-mcculloch.org/2022_uml/webpage/R/docv.R")
library(MASS)

The R package is kknn and docv.R is code Rob wrote to do cross-validation using kknn.
The MASS package has the Boston data.

# Peek at the Boston housing data; medv (median home value) is the response
# and lstat (% lower-status population) will be our first feature.
head(Boston)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
# Print docvknn to see that it is a thin wrapper around docv(): kNN fits
# (doknn) scored by mean squared error (mse) over nfold CV folds.
docvknn
## function (x, y, k, nfold = 10, doran = TRUE, verbose = TRUE) 
## {
##     return(docv(x, y, matrix(k, ncol = 1), doknn, mse, nfold = nfold, 
##         doran = doran, verbose = verbose))
## }

k is the vector of values for the number of neighbors you want to try, nfold is the number of folds, and doran indicates whether or not to randomly shuffle the rows of the training data (X,y).

Just lstat

Let’s first fit with just one x=lstat so we can see the knn fit.
Let’s use all the data to train and the sorted x=lstat values to get our fits at.
Sorting will make it easier to plot the results.

# Training data holds both lstat and medv; the "test" data is just the
# sorted lstat values so the fitted curve can be drawn left-to-right.
ddftr <- with(Boston, data.frame(lstat = lstat, medv = medv))
ddfte <- data.frame(lstat = sort(Boston$lstat))

Now we fit kknn. Note that we give train and test data in the call rather than calling a predict function afterwards.
Let’s use k=50 neighbors.

# Fit kNN with k = 50 and a rectangular (unweighted) kernel. kknn takes the
# train and test data in one call, so fitted.values already holds the
# predictions at the ddfte points.
kf50 = kknn(medv~lstat,ddftr,ddfte,k=50,kernel = "rectangular")
names(kf50)
##  [1] "fitted.values" "CL"            "W"             "D"            
##  [5] "C"             "prob"          "response"      "distance"     
##  [9] "call"          "terms"

Let’s plot the fit.

# Scatter the training data (blue) and overlay the k = 50 fitted curve (red);
# ddfte$lstat is sorted, so lines() draws a clean curve.
plot(ddftr$lstat,ddftr$medv,xlab='lstat',ylab='medv',cex=.5,col='blue')
lines(ddfte$lstat,kf50$fitted.values,col="red",lwd=2)
title(main='knn fit with 50 neighbors',cex.main=1.5)

Let’s predict at lstat=30.

# Predict medv at a single new point, lstat = 30, with the same k = 50 fit.
dfpred = data.frame(lstat=c(30))
knnpred = kknn(medv~lstat,ddftr,dfpred,k=50,kernel = "rectangular")
knnpred$fitted.values
## [1] 12.058
# Redraw the data and the fitted curve, then mark the new prediction in green.
plot(ddftr$lstat,ddftr$medv,xlab='lstat',ylab='medv',cex=.5,col='blue')
lines(ddfte$lstat,kf50$fitted.values,col="red",lwd=2)
title(main='knn fit with 50 neighbors, predict at 30',cex.main=.9)
points(dfpred$lstat,knnpred$fitted.values,col='green',cex=2,pch=16)

Just lstat, cross-validation

Let’s use cross-validation to pick a good k.

set.seed(99) #always set the seed! (Gretzky was number 99)
kv = 2:100 #these are the k values (k as in kNN) we will try
# docvknn wants x as a matrix, hence the matrix() wrap. It returns, for each
# k in kv, the 10-fold CV sum of squared prediction errors.
cvres = docvknn(matrix(ddftr$lstat,ncol=1),ddftr$medv,kv,nfold=10) ## sum of squared prediction errors
## in docv: nset,n,nfold:  99 506 10 
## on fold:  1 , range:  1 : 51 
## on fold:  2 , range:  52 : 102 
## on fold:  3 , range:  103 : 153 
## on fold:  4 , range:  154 : 204 
## on fold:  5 , range:  205 : 255 
## on fold:  6 , range:  256 : 306 
## on fold:  7 , range:  307 : 357 
## on fold:  8 , range:  358 : 408 
## on fold:  9 , range:  409 : 459 
## on fold:  10 , range:  460 : 506

Let’s plot the out of sample rmse as estimated by cross-validation against k.

# Convert the CV sum of squared errors to a per-observation RMSE, plot it
# against k, and flag the k that minimizes the CV error in the title
# (which.min on cvres and on rmseres pick the same k since sqrt is monotone).
rmseres <- sqrt(cvres / nrow(ddftr))
plot(kv, rmseres, xlab = 'k = num neighbors', ylab = 'rmse', type = 'b', col = 'blue')
title(main = paste0("best k is ", kv[which.min(cvres)]))

knn with more than one feature

Let’s try knn with more than one x.
Let’s use x1=lstat,x2=indus.

Recall that we have to scale the x variables.
Let’s use x -> (x-min)/(max-min).

# Assemble the two-feature design matrix and the response.
x <- cbind(Boston$lstat, Boston$indus)
colnames(x) <- c("lstat", "indus")
y <- Boston$medv
# Min-max scale a numeric vector onto [0, 1].
mmsc <- function(x) (x - min(x)) / (max(x) - min(x))
# Scale each column of x separately (kNN distances need comparable scales).
xs <- apply(x, 2, mmsc)

Note that I used all the data to choose the scaling which is wrong in principle since now nothing is out of sample. In this case I’m assuming this makes no difference.

Plot vs each x.

#plot y vs each x
# Side-by-side marginal plots of medv against each (unscaled) feature.
par(mfrow=c(1,2)) #two plot frames
plot(x[,1],y,xlab="lstat",ylab="medv")
plot(x[,2],y,xlab="indus",ylab="medv")

Run cross-validation once.

#run cross val once
par(mfrow=c(1,1)) # back to a single plot frame
set.seed(99)
kv = 2:20 #k values to try
n = length(y)
# 10-fold CV sum of squared errors for each k, now using both scaled features.
cvtemp = docvknn(xs,y,kv,nfold=10)
## in docv: nset,n,nfold:  19 506 10 
## on fold:  1 , range:  1 : 51 
## on fold:  2 , range:  52 : 102 
## on fold:  3 , range:  103 : 153 
## on fold:  4 , range:  154 : 204 
## on fold:  5 , range:  205 : 255 
## on fold:  6 , range:  256 : 306 
## on fold:  7 , range:  307 : 357 
## on fold:  8 , range:  358 : 408 
## on fold:  9 , range:  409 : 459 
## on fold:  10 , range:  460 : 506
cvtemp = sqrt(cvtemp/n) #docvknn returns sum of squares
plot(kv,cvtemp)

Run cross val several times.

#run cross val several times
# Because the fold assignment is random, repeat the whole CV ndocv times to
# average out split-to-split noise in the estimated out-of-sample error.
set.seed(99)
cvmean = rep(0,length(kv)) #accumulates the CV sum of squares; averaged and converted to rmse after the loop
ndocv = 50 #number of CV splits to try
n=length(y)
cvmat = matrix(0,length(kv),ndocv) #keep results for each split
for(i in 1:ndocv) {
   cvtemp = docvknn(xs,y,kv,nfold=10,verbose=FALSE)
   cvmean = cvmean + cvtemp
   cvmat[,i] = sqrt(cvtemp/n) #this split's rmse curve over kv
}
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10
cvmean = cvmean/ndocv #average sum of squares across the ndocv splits
cvmean = sqrt(cvmean/n) #convert to rmse
# Draw each split's rmse curve (dotted), then the averaged curve in black.
plot(kv,cvmean,type="n",ylim=range(cvmat),xlab="k",cex.lab=1.5)
for(i in 1:ndocv) lines(kv,cvmat[,i],col=i,lty=3) #plot each result
lines(kv,cvmean,type="b",col="black",lwd=5) #plot average result

What is the best k?

# Report the k that minimizes the averaged cross-validated error.
cat('best k is',kv[which.min(cvmean)],'\n')
## best k is 5

Refit using all the data and k=5

#refit using all the data and k=5
# Fit kNN with the CV-chosen k = 5 on the full (scaled) data, and a linear
# model on the same features for comparison.
ddf <- data.frame(y, xs)
near5 <- kknn(y ~ ., ddf, ddf, k = 5, kernel = "rectangular")
lmf <- lm(y ~ ., ddf)
# Use the full component name: `$fitted` relied on R's `$` partial matching
# of "fitted.values", which is fragile and flagged by lintr.
fmat <- cbind(y, near5$fitted.values, lmf$fitted.values)
colnames(fmat) <- c("y", "kNN5", "linear")
pairs(fmat)

# Correlations with y: the k = 5 kNN fit tracks medv more closely than the
# linear model on these two features.
print(cor(fmat))
##                y      kNN5    linear
## y      1.0000000 0.8971328 0.7392278
## kNN5   0.8971328 1.0000000 0.8336039
## linear 0.7392278 0.8336039 1.0000000

Predict price of house in place with lstat=10, indus=11.

#predict price of house in place with lstat=10, indus=11.
# Scale the new point with the TRAINING min/max of each feature so it lives
# on the same [0, 1] scale as xs, then predict with k = 5.
x1 <- 10; x2 <- 11
x1s <- (x1 - min(x[, 1])) / (max(x[, 1]) - min(x[, 1]))
x2s <- (x2 - min(x[, 2])) / (max(x[, 2]) - min(x[, 2]))
near <- kknn(y ~ ., ddf, data.frame(lstat = x1s, indus = x2s), k = 5, kernel = "rectangular")
# Full component name instead of `$fitted` (partial matching of fitted.values).
cat("knn predicted value: ", near$fitted.values, "\n")
## knn predicted value:  23.96
