Get Library and Rob’s Code

library(kknn)   # kknn(): k-nearest-neighbor fitting (train and test given up front)
source("http://www.rob-mcculloch.org/2025_uml/webpage/R/docvMMS.R") # Rob's CV helpers: docv, docvknn, docvknnMMS
library(MASS)   # provides the Boston housing data used throughout

The R package is kknn and docvMMS.R is code Rob wrote to do cross-validation using kknn.
The MASS package has the Boston data.

# First six rows of the Boston housing data; medv (median home value) is the
# response we will predict from features such as lstat and indus.
head(Boston)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7

Rob’s Code

You can ignore this section if you want and just see how the code is used below.
The code is not complicated.

Here is the cross validation code:

# Print the source of docv, the generic K-fold CV driver.
# It optionally shuffles the rows, splits 1:n into nfold contiguous chunks,
# and for each row k of `set` accumulates loss(yout, predfun(xin, yin, xout,
# set[k, ])) across folds, returning the vector of summed losses (one per
# row of `set`). Note the last fold absorbs any remainder rows (top = n).
docv
## function (x, y, set, predfun, loss, nfold = 10, doran = TRUE, 
##     verbose = TRUE, ...) 
## {
##     if (!(is.matrix(x) | is.data.frame(x))) {
##         cat("error in docv: x is not a matrix or data frame\n")
##         return(0)
##     }
##     if (!(is.vector(y))) {
##         cat("error in docv: y is not a vector\n")
##         return(0)
##     }
##     if (!(length(y) == nrow(x))) {
##         cat("error in docv: length(y) != nrow(x)\n")
##         return(0)
##     }
##     nset = nrow(set)
##     n = length(y)
##     if (n == nfold) 
##         doran = FALSE
##     cat("in docv: nset,n,nfold: ", nset, n, nfold, "\n")
##     lossv = rep(0, nset)
##     if (doran) {
##         ii = sample(1:n, n)
##         y = y[ii]
##         x = x[ii, , drop = FALSE]
##     }
##     fs = round(n/nfold)
##     for (i in 1:nfold) {
##         bot = (i - 1) * fs + 1
##         top = ifelse(i == nfold, n, i * fs)
##         ii = bot:top
##         if (verbose) 
##             cat("on fold: ", i, ", range: ", bot, ":", top, "\n")
##         xin = x[-ii, , drop = FALSE]
##         yin = y[-ii]
##         xout = x[ii, , drop = FALSE]
##         yout = y[ii]
##         for (k in 1:nset) {
##             yhat = predfun(xin, yin, xout, set[k, ], ...)
##             lossv[k] = lossv[k] + loss(yout, yhat)
##         }
##     }
##     return(lossv)
## }

Here are the functions that give you a version of CV which does not scale the x using train.

# docvknn: CV for kNN WITHOUT any scaling of x; it just wraps docv with
# doknn as the prediction function and mse as the loss.
docvknn
## function (x, y, k, nfold = 10, doran = TRUE, verbose = TRUE) 
## {
##     return(docv(x, y, matrix(k, ncol = 1), doknn, mse, nfold = nfold, 
##         doran = doran, verbose = verbose))
## }
# doknn: fit kknn on (x, y) and return the fitted values at xp; k[1] is the
# number of neighbors and kernel = "rectangular" weights neighbors equally.
doknn
## function (x, y, xp, k) 
## {
##     kdo = k[1]
##     train = data.frame(x, y = y)
##     test = data.frame(xp)
##     names(test) = names(train)[1:(ncol(train) - 1)]
##     near = kknn(y ~ ., train, test, k = kdo, kernel = "rectangular")
##     return(near$fitted)
## }

This version min/max scales on train and then uses that scaling on test.

# docvknnMMS: CV for kNN WITH min/max scaling; wraps docv with doknnMMS.
docvknnMMS
## function (x, y, k, nfold = 10, doran = TRUE, verbose = TRUE) 
## {
##     return(docv(x, y, matrix(k, ncol = 1), doknnMMS, mse, nfold = nfold, 
##         doran = doran, verbose = verbose))
## }
# doknnMMS: compute each column's min/max from the TRAINING x only, apply
# the same (x - min)/(max - min) transform to both x and xp, then fit kknn
# and return the fitted values at xp. This avoids leaking test information
# into the scaling.
doknnMMS
## function (x, y, xp, k) 
## {
##     p = ncol(x)
##     for (i in 1:p) {
##         minsc = min(x[, i])
##         maxsc = max(x[, i])
##         x[, i] = (x[, i] - minsc)/(maxsc - minsc)
##         xp[, i] = (xp[, i] - minsc)/(maxsc - minsc)
##     }
##     kdo = k[1]
##     train = data.frame(x, y = y)
##     test = data.frame(xp)
##     names(test) = names(train)[1:(ncol(train) - 1)]
##     near = kknn(y ~ ., train, test, k = kdo, kernel = "rectangular")
##     return(near$fitted)
## }

k is the vector of values for the number of neighbors you want to try, nfold is the number of folds, and doran indicates whether or not to randomly shuffle the rows of the training data (X,y).

Just lstat

Let’s first fit with just one x=lstat so we can see the knn fit.
Let’s use all the data to train, and the sorted x=lstat values as the points at which to get our fits.
Sorting will make it easier to plot the results.

# Training data: every observation; test grid: lstat sorted ascending, so
# the fitted curve can later be drawn left-to-right with lines().
ddftr <- with(Boston, data.frame(lstat = lstat, medv = medv))
ddfte <- data.frame(lstat = sort(Boston$lstat))

Now we fit kknn. Note that we give train and test data in the call rather than calling a predict function afterwards.
Let’s use k=50 neighbors.

# k-nearest-neighbor fit with k = 50. kknn takes the train and test data in
# one call, so fitted values at ddfte come back directly (no predict() step).
kf50 <- kknn(medv ~ lstat, train = ddftr, test = ddfte,
             k = 50, kernel = "rectangular")
names(kf50)
##  [1] "fitted.values" "CL"            "W"             "D"            
##  [5] "C"             "prob"          "response"      "distance"     
##  [9] "call"          "terms"

Let’s plot the fit.

# Scatter the raw data in blue and overlay the k = 50 fitted curve in red.
plot(ddftr$lstat, ddftr$medv,
     xlab = "lstat", ylab = "medv", cex = 0.5, col = "blue")
lines(ddfte$lstat, kf50$fitted.values, col = "red", lwd = 2)
title(main = "knn fit with 50 neighbors", cex.main = 1.5)

Let’s predict at lstat=20.

# Predict medv at lstat = 20 with the same k = 50 neighbors.
dfpred <- data.frame(lstat = c(20))
knnpred <- kknn(medv ~ lstat, ddftr, dfpred, k = 50, kernel = "rectangular")
knnpred$fitted.values
## [1] 15.304
plot(ddftr$lstat, ddftr$medv, xlab = 'lstat', ylab = 'medv', cex = .5, col = 'blue')
lines(ddfte$lstat, kf50$fitted.values, col = "red", lwd = 2)
# Fix: the prediction is at lstat = 20 (see dfpred above); the original
# title incorrectly said "predict at 30".
title(main = 'knn fit with 50 neighbors, predict at 20', cex.main = .9)
points(dfpred$lstat, knnpred$fitted.values, col = 'green', cex = 2, pch = 16)

Just lstat, cross-validation

Let’s use cross-validation to pick a good k.

set.seed(99) # always set the seed so the CV fold split is reproducible (Gretzky was number 99)
kv <- 2:70   # candidate values of k (number of neighbors) to try
# docvknn returns, for each k in kv, the 10-fold CV sum of squared prediction errors
cvres <- docvknn(matrix(ddftr$lstat, ncol = 1), ddftr$medv, kv, nfold = 10)
## in docv: nset,n,nfold:  69 506 10 
## on fold:  1 , range:  1 : 51 
## on fold:  2 , range:  52 : 102 
## on fold:  3 , range:  103 : 153 
## on fold:  4 , range:  154 : 204 
## on fold:  5 , range:  205 : 255 
## on fold:  6 , range:  256 : 306 
## on fold:  7 , range:  307 : 357 
## on fold:  8 , range:  358 : 408 
## on fold:  9 , range:  409 : 459 
## on fold:  10 , range:  460 : 506

Let’s plot the out of sample rmse as estimated by cross-validation against k.

# Convert CV sums of squared errors to rmse: divide by n, then square root.
rmseres <- sqrt(cvres / nrow(ddftr))
plot(kv, rmseres, xlab = 'k = num neighbors', ylab = 'rmse', type = 'b', col = 'blue')
title(main = paste0("best k is ", kv[which.min(cvres)]))

knn with more than one feature

Let’s try knn with more than one x.
Let’s use x1=indus, x2=lstat (that is the column order used in the cbind below).

# Feature matrix: column 1 = indus, column 2 = lstat; response y = medv.
x <- cbind(Boston$indus, Boston$lstat)
y <- Boston$medv
cat("is x a matrix: ", is.matrix(x), '\n')
## is x a matrix:  TRUE

Run cross-validation once.
Use docvknnMMS, it will min/max scale getting the min and max from
the training data and then scaling both train x and test x with (x-min)/(max-min).

# Run cross-validation once. docvknnMMS min/max scales using each training
# fold's min and max, then applies that same scaling to the held-out fold.
par(mfrow = c(1, 1))
set.seed(99)
kv <- 2:20          # k values to try
n <- length(y)
cvtemp <- docvknnMMS(x, y, kv, nfold = 10)
## in docv: nset,n,nfold:  19 506 10 
## on fold:  1 , range:  1 : 51 
## on fold:  2 , range:  52 : 102 
## on fold:  3 , range:  103 : 153 
## on fold:  4 , range:  154 : 204 
## on fold:  5 , range:  205 : 255 
## on fold:  6 , range:  256 : 306 
## on fold:  7 , range:  307 : 357 
## on fold:  8 , range:  358 : 408 
## on fold:  9 , range:  409 : 459 
## on fold:  10 , range:  460 : 506
cvtemp = sqrt(cvtemp/n) #docvknnMMS returns the CV sum of squared errors; divide by n and take sqrt for rmse
plot(kv,cvtemp) # rmse vs number of neighbors

Run cross val several times.

# Repeat 10-fold CV over many random fold assignments and average, to
# smooth out the split-to-split noise in the rmse estimate.
set.seed(99)
ndocv <- 50                           # number of CV splits to try
n <- length(y)
cvmean <- rep(0, length(kv))          # running sum of CV losses; averaged below
cvmat <- matrix(0, length(kv), ndocv) # one rmse curve per split
for (i in seq_len(ndocv)) {
  cvtemp <- docvknnMMS(x, y, kv, nfold = 10, verbose = FALSE)
  cvmean <- cvmean + cvtemp
  cvmat[, i] <- sqrt(cvtemp / n)
}
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10 
## in docv: nset,n,nfold:  19 506 10
# Average the summed losses over splits and convert to rmse, then plot:
# dotted colored curves are individual CV splits, heavy black is the mean.
cvmean <- sqrt((cvmean / ndocv) / n)
plot(kv, cvmean, type = "n", ylim = range(cvmat), xlab = "k", cex.lab = 1.5)
for (i in seq_len(ndocv)) {
  lines(kv, cvmat[, i], col = i, lty = 3) # one curve per CV split
}
lines(kv, cvmean, type = "b", col = "black", lwd = 5) # average over splits

What is the best k?

# report the k that minimizes the average CV rmse
cat('best k is',kv[which.min(cvmean)],'\n')
## best k is 5

Refit using all the data and k=5.

Let’s scale using all the data.

# Min/max scale each column of x to [0, 1], using the full-data min and max
# (we are refitting on all the data now, so there is no train/test split).
x <- cbind(Boston$indus, Boston$lstat)
y <- Boston$medv
minv <- apply(x, 2, min)
maxv <- apply(x, 2, max)
xs <- sweep(sweep(x, 2, minv, "-"), 2, maxv - minv, "/")
# check: each scaled column should run exactly from 0 to 1
print(apply(xs, 2, min))
## [1] 0 0
print(apply(xs, 2, max))
## [1] 1 1
# Refit on all the data with the CV-chosen k = 5, and compare the kNN fit
# to an ordinary linear regression on the same scaled features.
ddf <- data.frame(y, xs)
near5 <- kknn(y ~ ., train = ddf, test = ddf, k = 5, kernel = "rectangular")
lmf <- lm(y ~ ., data = ddf)
fmat <- cbind(y, near5$fitted, lmf$fitted)
colnames(fmat) <- c("y", "kNN5", "linear")
pairs(fmat) # pairwise scatter plots: y vs the two sets of fitted values

# Correlations of y with each fit: the kNN (k=5) fitted values track y more
# closely (0.897) than the linear fit (0.739) on these two features.
print(cor(fmat))
##                y      kNN5    linear
## y      1.0000000 0.8971328 0.7392278
## kNN5   0.8971328 1.0000000 0.8336039
## linear 0.7392278 0.8336039 1.0000000

Predict price of house in place with lstat=10, indus=11.

print(names(ddf)) # have to use the same names in making ddfp for prediction
## [1] "y"  "X1" "X2"
# Predict the price of a house in a place with lstat = 10 and indus = 11.
# FIX: x was built as cbind(Boston$indus, Boston$lstat), so column X1 is
# indus and column X2 is lstat. The original code put the lstat value (10)
# in X1 and the indus value (11) in X2, scaling each by the wrong column's
# min/max. The assignments below match the actual column order. (The echoed
# prediction of 20.96 below came from the original, swapped version.)
indus_val <- 11
lstat_val <- 10
x1s <- (indus_val - minv[1]) / (maxv[1] - minv[1]) # scale indus by column-1 range
x2s <- (lstat_val - minv[2]) / (maxv[2] - minv[2]) # scale lstat by column-2 range
ddfp <- data.frame(X1 = x1s, X2 = x2s)
near <- kknn(y ~ ., ddf, ddfp, k = 5, kernel = "rectangular")
cat("knn predicted value: ", near$fitted, "\n")
## knn predicted value:  20.96