library(kknn)
source("http://www.rob-mcculloch.org/2022_uml/webpage/R/docv.R")
library(MASS)
The R package is kknn and docv.R is code Rob wrote to do cross-validation using kknn.
The MASS package has the Boston data.
head(Boston) # first six rows of the Boston housing data (response is medv)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
docvknn # print the source of Rob's cross-validation wrapper around kknn
## function (x, y, k, nfold = 10, doran = TRUE, verbose = TRUE)
## {
## return(docv(x, y, matrix(k, ncol = 1), doknn, mse, nfold = nfold,
## doran = doran, verbose = verbose))
## }
k is the vector of values for the number of neighbors you want to try, nfold is the number of folds, and doran indicates whether or not to randomly shuffle the rows of the training data (X,y).
Let’s first fit with just one x=lstat so we can see the knn fit.
Let’s train on all of the data, and use the sorted x=lstat values as the points at which we evaluate the fit.
Sorting will make it easier to plot the results.
# Training frame: the single predictor lstat plus the response medv.
ddftr <- with(Boston, data.frame(lstat = lstat, medv = medv))
# Evaluation points: lstat sorted so the fitted curve plots cleanly left to right.
ddfte <- data.frame(lstat = sort(Boston$lstat))
Now we fit kknn. Note that we give train and test data in the call rather than calling a predict function afterwards.
Let’s use k=50 neighbors.
# Fit kNN with k = 50 neighbors; kknn takes train and test data together
# in one call, so there is no separate predict step afterwards.
kf50 <- kknn(medv ~ lstat, train = ddftr, test = ddfte, k = 50, kernel = "rectangular")
names(kf50) # components of the fitted object
## [1] "fitted.values" "CL" "W" "D"
## [5] "C" "prob" "response" "distance"
## [9] "call" "terms"
Let’s plot the fit.
# Scatter the training data, then overlay the kNN fit in red.
# The test lstat values were sorted, so lines() draws a clean curve.
plot(medv ~ lstat, data = ddftr, xlab = "lstat", ylab = "medv", cex = 0.5, col = "blue")
lines(ddfte$lstat, kf50$fitted.values, col = "red", lwd = 2)
title(main = "knn fit with 50 neighbors", cex.main = 1.5)
Let’s predict at lstat=30.
# Predict at a single new value, lstat = 30, by passing it as the test data.
dfpred <- data.frame(lstat = 30)
knnpred <- kknn(medv ~ lstat, ddftr, dfpred, k = 50, kernel = "rectangular")
knnpred$fitted.values # the kNN prediction at lstat = 30
## [1] 12.058
# Redraw the data and fit, then mark the prediction at lstat = 30 in green.
plot(medv ~ lstat, data = ddftr, xlab = "lstat", ylab = "medv", cex = 0.5, col = "blue")
lines(ddfte$lstat, kf50$fitted.values, col = "red", lwd = 2)
title(main = "knn fit with 50 neighbors, predict at 30", cex.main = 0.9)
points(dfpred$lstat, knnpred$fitted.values, col = "green", cex = 2, pch = 16)
Let’s use cross-validation to pick a good k.
# Fix the RNG seed so the CV fold assignments are reproducible.
set.seed(99) # always set the seed! (Gretzky was number 99)
kv <- 2:100 # candidate k values (k as in kNN) to try
# docvknn wants x as a matrix; it returns the CV sum of squared prediction errors.
cvres <- docvknn(matrix(ddftr$lstat, ncol = 1), ddftr$medv, kv, nfold = 10)
## in docv: nset,n,nfold: 99 506 10
## on fold: 1 , range: 1 : 51
## on fold: 2 , range: 52 : 102
## on fold: 3 , range: 103 : 153
## on fold: 4 , range: 154 : 204
## on fold: 5 , range: 205 : 255
## on fold: 6 , range: 256 : 306
## on fold: 7 , range: 307 : 357
## on fold: 8 , range: 358 : 408
## on fold: 9 , range: 409 : 459
## on fold: 10 , range: 460 : 506
Let’s plot the out of sample rmse as estimated by cross-validation against k.
# Convert the CV sums of squared errors to root-mean-squared error.
ntrain <- nrow(ddftr)
rmseres <- sqrt(cvres / ntrain)
plot(kv, rmseres, xlab = "k = num neighbors", ylab = "rmse", type = "b", col = "blue")
title(main = paste0("best k is ", kv[which.min(cvres)]))
Let’s try knn with more than one x.
Let’s use x1=lstat,x2=indus.
Recall that we have to scale the x variables.
Let’s use x -> (x-min)/(max-min).
# get the two predictors we want
x <- cbind(lstat = Boston$lstat, indus = Boston$indus)
y <- Boston$medv
# min-max scaling: maps a numeric vector onto [0, 1]
mmsc <- function(x) {
  r <- range(x)
  (x - r[1]) / (r[2] - r[1])
}
xs <- apply(x, 2, mmsc) # scale each column of x
Note that I used all the data to choose the scaling which is wrong in principle since now nothing is out of sample. In this case I’m assuming this makes no difference.
Plot vs each x.
# plot y against each predictor separately
par(mfrow = c(1, 2)) # two plot panels side by side
plot(x[, "lstat"], y, xlab = "lstat", ylab = "medv")
plot(x[, "indus"], y, xlab = "indus", ylab = "medv")
Run cross-validation once.
# run cross-validation once with both (scaled) predictors
par(mfrow = c(1, 1)) # back to a single plot frame
set.seed(99)
kv <- 2:20 # k values to try
n <- length(y)
cvtemp <- docvknn(xs, y, kv, nfold = 10)
## in docv: nset,n,nfold: 19 506 10
## on fold: 1 , range: 1 : 51
## on fold: 2 , range: 52 : 102
## on fold: 3 , range: 103 : 153
## on fold: 4 , range: 154 : 204
## on fold: 5 , range: 205 : 255
## on fold: 6 , range: 256 : 306
## on fold: 7 , range: 307 : 357
## on fold: 8 , range: 358 : 408
## on fold: 9 , range: 409 : 459
## on fold: 10 , range: 460 : 506
cvtemp <- sqrt(cvtemp / n) # docvknn returns a sum of squares; convert to rmse
plot(kv, cvtemp) # rmse versus number of neighbors
Run cross val several times.
# run cross-validation ndocv times to average over the random fold splits
set.seed(99)
ndocv <- 50 # number of CV splits to try
n <- length(y)
cvmean <- rep(0, length(kv)) # accumulates summed SSE across splits (rmse computed later)
cvmat <- matrix(0, length(kv), ndocv) # rmse for each split, one column per run
for (i in seq_len(ndocv)) {
  cvtemp <- docvknn(xs, y, kv, nfold = 10, verbose = FALSE)
  cvmean <- cvmean + cvtemp
  cvmat[, i] <- sqrt(cvtemp / n)
}
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
## in docv: nset,n,nfold: 19 506 10
# average the accumulated SSE over the splits, then convert to rmse
cvmean <- cvmean / ndocv
cvmean <- sqrt(cvmean / n)
# empty frame sized to cover every individual run, then overlay all the runs
plot(kv, cvmean, type = "n", ylim = range(cvmat), xlab = "k", cex.lab = 1.5)
for (i in seq_len(ndocv)) {
  lines(kv, cvmat[, i], col = i, lty = 3) # one dotted curve per CV split
}
lines(kv, cvmean, type = "b", col = "black", lwd = 5) # average across splits
What is the best k?
best_k <- kv[which.min(cvmean)] # k minimizing the averaged CV rmse
cat("best k is", best_k, "\n")
## best k is 5
Refit using all the data and k=5
#refit using all the data and k=5
ddf <- data.frame(y, xs)
near5 <- kknn(y ~ ., ddf, ddf, k = 5, kernel = "rectangular")
lmf <- lm(y ~ ., ddf)
# use the full component name: `$fitted` relies on fragile partial matching
# of `fitted.values` and silently returns NULL if an exact `fitted` is added
fmat <- cbind(y, near5$fitted.values, lmf$fitted.values)
colnames(fmat) <- c("y", "kNN5", "linear")
pairs(fmat) # pairwise scatter of response vs the two sets of fitted values
print(cor(fmat))
## y kNN5 linear
## y 1.0000000 0.8971328 0.7392278
## kNN5 0.8971328 1.0000000 0.8336039
## linear 0.7392278 0.8336039 1.0000000
Predict price of house in place with lstat=10, indus=11.
#predict price of house in place with lstat=10, indus=11.
x1 <- 10
x2 <- 11
# apply the same min-max scaling (computed from the training x) to the new point
x1s <- (x1 - min(x[, 1])) / (max(x[, 1]) - min(x[, 1]))
x2s <- (x2 - min(x[, 2])) / (max(x[, 2]) - min(x[, 2]))
near <- kknn(y ~ ., ddf, data.frame(lstat = x1s, indus = x2s), k = 5, kernel = "rectangular")
# full component name: `$fitted` only works via partial matching of fitted.values
cat("knn predicted value: ", near$fitted.values, "\n")
## knn predicted value: 23.96
```