library(kknn)
source("http://www.rob-mcculloch.org/2025_uml/webpage/R/docvMMS.R")
library(MASS)
The R package is kknn, and docvMMS.R (sourced above) is code Rob wrote to do
cross-validation using kknn.
The MASS package has the Boston data.
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
You can ignore this section if you want and just see how the code is
used below.
The code is not complicated.
Here is the cross validation code:
docv
## function (x, y, set, predfun, loss, nfold = 10, doran = TRUE,
## verbose = TRUE, ...)
## {
## if (!(is.matrix(x) | is.data.frame(x))) {
## cat("error in docv: x is not a matrix or data frame\n")
## return(0)
## }
## if (!(is.vector(y))) {
## cat("error in docv: y is not a vector\n")
## return(0)
## }
## if (!(length(y) == nrow(x))) {
## cat("error in docv: length(y) != nrow(x)\n")
## return(0)
## }
## nset = nrow(set)
## n = length(y)
## if (n == nfold)
## doran = FALSE
## cat("in docv: nset,n,nfold: ", nset, n, nfold, "\n")
## lossv = rep(0, nset)
## if (doran) {
## ii = sample(1:n, n)
## y = y[ii]
## x = x[ii, , drop = FALSE]
## }
## fs = round(n/nfold)
## for (i in 1:nfold) {
## bot = (i - 1) * fs + 1
## top = ifelse(i == nfold, n, i * fs)
## ii = bot:top
## if (verbose)
## cat("on fold: ", i, ", range: ", bot, ":", top, "\n")
## xin = x[-ii, , drop = FALSE]
## yin = y[-ii]
## xout = x[ii, , drop = FALSE]
## yout = y[ii]
## for (k in 1:nset) {
## yhat = predfun(xin, yin, xout, set[k, ], ...)
## lossv[k] = lossv[k] + loss(yout, yhat)
## }
## }
## return(lossv)
## }
Here are the functions that give you a version of CV which does not scale the x's using the training data.
docvknn
## function (x, y, k, nfold = 10, doran = TRUE, verbose = TRUE)
## {
## return(docv(x, y, matrix(k, ncol = 1), doknn, mse, nfold = nfold,
## doran = doran, verbose = verbose))
## }
doknn
## function (x, y, xp, k)
## {
## kdo = k[1]
## train = data.frame(x, y = y)
## test = data.frame(xp)
## names(test) = names(train)[1:(ncol(train) - 1)]
## near = kknn(y ~ ., train, test, k = kdo, kernel = "rectangular")
## return(near$fitted)
## }
This version min/max scales on train and then uses that scaling on test.
docvknnMMS
## function (x, y, k, nfold = 10, doran = TRUE, verbose = TRUE)
## {
## return(docv(x, y, matrix(k, ncol = 1), doknnMMS, mse, nfold = nfold,
## doran = doran, verbose = verbose))
## }
doknnMMS
## function (x, y, xp, k)
## {
## p = ncol(x)
## for (i in 1:p) {
## minsc = min(x[, i])
## maxsc = max(x[, i])
## x[, i] = (x[, i] - minsc)/(maxsc - minsc)
## xp[, i] = (xp[, i] - minsc)/(maxsc - minsc)
## }
## kdo = k[1]
## train = data.frame(x, y = y)
## test = data.frame(xp)
## names(test) = names(train)[1:(ncol(train) - 1)]
## near = kknn(y ~ ., train, test, k = kdo, kernel = "rectangular")
## return(near$fitted)
## }
k is the vector of values for the number of neighbors you want to try, nfold is the number of folds, and doran indicates whether or not to randomly shuffle the rows of the training data (X,y).
Let’s first fit with just one x=lstat so we can see the knn
fit.
Let’s use all the data to train and the sorted x=lstat values to get our
fits at.
Sorting will make it easier to plot the results.
# Training frame (all rows) and a prediction grid of sorted lstat values;
# sorting makes the fitted curve easy to draw with lines().
ddftr = with(Boston, data.frame(lstat = lstat, medv = medv))
ddfte = data.frame(lstat = Boston$lstat[order(Boston$lstat)])
Now we fit kknn. Note that we give train and test data in the call
rather than calling a predict function afterwards.
Let’s use k=50 neighbors.
# Fit kNN with k = 50 unweighted ("rectangular" kernel) neighbors.
# Note kknn takes train AND test data in one call; there is no separate
# predict() step — fitted.values are the predictions at ddfte's rows.
kf50 = kknn(medv~lstat,ddftr,ddfte,k=50,kernel = "rectangular")
# Components of the fitted kknn object; we only use fitted.values below.
names(kf50)
## [1] "fitted.values" "CL" "W" "D"
## [5] "C" "prob" "response" "distance"
## [9] "call" "terms"
Let’s plot the fit.
# Scatter the training data (blue) and overlay the k = 50 fit (red line).
with(ddftr, plot(lstat, medv, xlab = 'lstat', ylab = 'medv', cex = .5, col = 'blue'))
with(ddfte, lines(lstat, kf50$fitted.values, col = "red", lwd = 2))
title(main = 'knn fit with 50 neighbors', cex.main = 1.5)
Let’s predict at lstat=20.
# Predict medv at a single new point, lstat = 20.
dfpred = data.frame(lstat=c(20))
knnpred = kknn(medv~lstat,ddftr,dfpred,k=50,kernel = "rectangular")
# With the rectangular kernel this is the plain average medv of the
# 50 training points nearest to lstat = 20.
knnpred$fitted.values
## [1] 15.304
# Re-draw the data and the k = 50 fit, then mark the prediction point.
plot(ddftr$lstat,ddftr$medv,xlab='lstat',ylab='medv',cex=.5,col='blue')
lines(ddfte$lstat,kf50$fitted.values,col="red",lwd=2)
# FIX: the prediction made above (dfpred) is at lstat = 20, not 30;
# the original title said "predict at 30".
title(main='knn fit with 50 neighbors, predict at 20',cex.main=.9)
points(dfpred$lstat,knnpred$fitted.values,col='green',cex=2,pch=16)
Let’s use cross-validation to pick a good k.
set.seed(99) #always set the seed! (Gretzky was number 99)
kv = 2:70 #these are the k values (k as in kNN) we will try
# 10-fold CV over every k in kv; for each k, docvknn returns the loss
# accumulated across the 10 held-out folds (smaller is better).
cvres = docvknn(matrix(ddftr$lstat,ncol=1),ddftr$medv,kv,nfold=10) ## sum of squared prediction errors
## in docv: nset,n,nfold: 69 506 10
## on fold: 1 , range: 1 : 51
## on fold: 2 , range: 52 : 102
## on fold: 3 , range: 103 : 153
## on fold: 4 , range: 154 : 204
## on fold: 5 , range: 205 : 255
## on fold: 6 , range: 256 : 306
## on fold: 7 , range: 307 : 357
## on fold: 8 , range: 358 : 408
## on fold: 9 , range: 409 : 459
## on fold: 10 , range: 460 : 506
Let’s plot the out of sample rmse as estimated by cross-validation against k.
# Convert the accumulated squared prediction errors to out-of-sample RMSE
# estimates: divide by the number of observations, then take the square root.
n_tr = nrow(ddftr)
rmseres = sqrt(cvres / n_tr)
best_k = kv[which.min(cvres)]
plot(kv, rmseres, xlab = 'k = num neighbors', ylab = 'rmse', type = 'b', col = 'blue')
title(main = paste0("best k is ", best_k))
Let’s try knn with more than one x.
Let’s use x1=lstat,x2=indus.
# Two predictors now: column 1 = indus, column 2 = lstat (order matters
# later when we build prediction rows and min/max scale by column).
x = cbind(Boston$indus,Boston$lstat)
y = Boston$medv
# docv requires x to be a matrix or data frame; cbind of vectors is a matrix.
cat("is x a matrix: ",is.matrix(x),'\n')
## is x a matrix: TRUE
Run cross-validation once.
Use docvknnMMS, it will min/max scale getting the min and max from
the training data and then scaling both train x and test x with
(x-min)/(max-min).
#run cross val once
par(mfrow=c(1,1)) # one plot per figure
set.seed(99)
kv = 2:20 #k values to try
n = length(y)
# Min/max-scaled 10-fold CV (scaling fit on train folds, applied to the
# held-out fold); returns, per k, the loss accumulated across folds.
cvtemp = docvknnMMS(x,y,kv,nfold=10)
## in docv: nset,n,nfold: 19 506 10
## on fold: 1 , range: 1 : 51
## on fold: 2 , range: 52 : 102
## on fold: 3 , range: 103 : 153
## on fold: 4 , range: 154 : 204
## on fold: 5 , range: 205 : 255
## on fold: 6 , range: 256 : 306
## on fold: 7 , range: 307 : 357
## on fold: 8 , range: 358 : 408
## on fold: 9 , range: 409 : 459
## on fold: 10 , range: 460 : 506
# Convert accumulated squared error to RMSE: divide by n, take square root.
cvtemp = sqrt(cvtemp/n) #docvknn returns sum of squares
plot(kv,cvtemp) # RMSE vs number of neighbors for this single CV split
Run cross val several times.
#run cross val several times
set.seed(99)
cvmean = rep(0, length(kv))           # accumulates loss totals across splits
ndocv = 50                            # number of CV splits to try
n = length(y)
cvmat = matrix(0, length(kv), ndocv)  # one RMSE curve per split
# Each iteration draws a fresh random fold assignment inside docvknnMMS.
for (split in seq_len(ndocv)) {
  cvtemp = docvknnMMS(x, y, kv, nfold = 10, verbose = FALSE)
  cvmean = cvmean + cvtemp
  cvmat[, split] = sqrt(cvtemp / n)
}
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
# cvmean currently holds the sum over the ndocv splits of the per-k
# accumulated squared errors: average over splits, then convert to RMSE.
cvmean = cvmean/ndocv
cvmean = sqrt(cvmean/n)
# Empty frame sized to fit every split's curve, then overlay all of them.
plot(kv,cvmean,type="n",ylim=range(cvmat),xlab="k",cex.lab=1.5)
for(i in 1:ndocv) lines(kv,cvmat[,i],col=i,lty=3) #plot each result
lines(kv,cvmean,type="b",col="black",lwd=5) #plot average result
What is the best k?
# Report the k with the smallest average cross-validation error.
best_k = kv[which.min(cvmean)]
cat('best k is', best_k, '\n')
## best k is 5
Refit using all the data and k=5.
Let’s scale using all the data.
# Rebuild the design matrix: column 1 = indus, column 2 = lstat.
x = cbind(Boston$indus,Boston$lstat)
y = Boston$medv
# Per-column min and max, used for min/max scaling to [0, 1]
# (kept around because new prediction points must use the SAME scaling).
minv = apply(x,2,min)
maxv = apply(x,2,max)
# Min/max scale each column, (x - min)/(max - min), via sweep() rather
# than an explicit loop over columns.
xs = sweep(x, 2, minv, "-")
xs = sweep(xs, 2, maxv - minv, "/")
#check: every scaled column should run exactly from 0 to 1
print(apply(xs,2,min))
## [1] 0 0
print(apply(xs,2,max))
## [1] 1 1
#refit using all the data and k=5; also fit a linear model for comparison
ddf = data.frame(y,xs)
near5 = kknn(y~.,ddf,ddf,k=5,kernel = "rectangular")
lmf = lm(y~.,ddf)
# In-sample fits side by side: observed y, kNN(5) fit, linear fit.
fmat = cbind(y,near5$fitted,lmf$fitted)
colnames(fmat)=c("y","kNN5","linear")
pairs(fmat)
print(cor(fmat))
## y kNN5 linear
## y 1.0000000 0.8971328 0.7392278
## kNN5 0.8971328 1.0000000 0.8336039
## linear 0.7392278 0.8336039 1.0000000
Predict price of house in place with lstat=10, indus=11.
print(names(ddf)) # have to use the same names in making ddfp for prediction
## [1] "y" "X1" "X2"
#predict price of house in place with lstat=10, indus=11.
# x's columns were cbind(indus, lstat), so X1 = indus and X2 = lstat, and
# minv[1]/maxv[1] are indus' min/max while minv[2]/maxv[2] are lstat's.
# FIX: the original set x1=10; x2=11, i.e. it scaled the lstat value (10)
# with indus' min/max and the indus value (11) with lstat's — swapped.
x1 = 11  # indus value, goes in column X1
x2 = 10  # lstat value, goes in column X2
x1s = (x1-minv[1])/(maxv[1] - minv[1])
x2s = (x2-minv[2])/(maxv[2] - minv[2])
ddfp = data.frame(X1 = x1s, X2 = x2s)
near = kknn(y~.,ddf,ddfp,k=5,kernel = "rectangular")
cat("knn predicted value: ",near$fitted,"\n")
## NOTE: the 20.96 recorded previously was produced with the swapped
## inputs; the value printed here will differ after the fix.