1. Using Cross Validation

library(kknn)
source("docv.R")
cd = read.csv("http://www.rob-mcculloch.org/data/susedcars.csv")
x = matrix(cd$mileage/1000,ncol=1)
y = cd$price/1000
set.seed(99)
kv = 2:50 #k values to try
n = length(y)
cvtemp = docvknn(x,y,kv,nfold=5)
## in docv: nset,n,nfold:  49 1000 5 
## on fold:  1 , range:  1 : 200 
## on fold:  2 , range:  201 : 400 
## on fold:  3 , range:  401 : 600 
## on fold:  4 , range:  601 : 800 
## on fold:  5 , range:  801 : 1000
cvtemp = sqrt(cvtemp/n) #docvknn returns sum of squares
plot(kv,cvtemp)

#run cross val several times
set.seed(99)
cvmean = rep(0,length(kv)) #will keep average rmse here
ndocv = 5 #number of CV splits to try
n=length(y)
cvmat = matrix(0,length(kv),ndocv) #keep results for each split
for(i in 1:ndocv) {
   cvtemp = docvknn(x,y,kv,nfold=5)
   cvmean = cvmean + cvtemp
   cvmat[,i] = sqrt(cvtemp/n)
}
## in docv: nset,n,nfold:  49 1000 5 
## on fold:  1 , range:  1 : 200 
## on fold:  2 , range:  201 : 400 
## on fold:  3 , range:  401 : 600 
## on fold:  4 , range:  601 : 800 
## on fold:  5 , range:  801 : 1000 
## in docv: nset,n,nfold:  49 1000 5 
## on fold:  1 , range:  1 : 200 
## on fold:  2 , range:  201 : 400 
## on fold:  3 , range:  401 : 600 
## on fold:  4 , range:  601 : 800 
## on fold:  5 , range:  801 : 1000 
## in docv: nset,n,nfold:  49 1000 5 
## on fold:  1 , range:  1 : 200 
## on fold:  2 , range:  201 : 400 
## on fold:  3 , range:  401 : 600 
## on fold:  4 , range:  601 : 800 
## on fold:  5 , range:  801 : 1000 
## in docv: nset,n,nfold:  49 1000 5 
## on fold:  1 , range:  1 : 200 
## on fold:  2 , range:  201 : 400 
## on fold:  3 , range:  401 : 600 
## on fold:  4 , range:  601 : 800 
## on fold:  5 , range:  801 : 1000 
## in docv: nset,n,nfold:  49 1000 5 
## on fold:  1 , range:  1 : 200 
## on fold:  2 , range:  201 : 400 
## on fold:  3 , range:  401 : 600 
## on fold:  4 , range:  601 : 800 
## on fold:  5 , range:  801 : 1000
cvmean = cvmean/ndocv
cvmean = sqrt(cvmean/n)
plot(kv,cvmean,type="n",ylim=range(cvmat),xlab="k",cex.lab=1.5)
for(i in 1:ndocv) lines(kv,cvmat[,i],col=i,lty=3) #plot each result
lines(kv,cvmean,type="b",col="black",lwd=2) #plot average result

cat("the min rmse is: ",min(cvmean),", at k = ",kv[which.min(cvmean)])
## the min rmse is:  9.252771 , at k =  20
#refit using all the data and k= min rmse, also did k=40 since it gives a simpler model with ok rmse
kmin = kv[which.min(cvmean)]
ddf = data.frame(y,x)
kfit = kknn(y~.,ddf,ddf,k=kmin,kernel = "rectangular")
plot(x,y)

oo = order(x)
lines(x[oo],kfit$fitted.values[oo],col='red',lwd=3)

kfits = kknn(y~.,ddf,ddf,k=40,kernel = "rectangular")
lines(x[oo],kfits$fitted.values[oo],col='blue',lwd=2)

# predict at x=100
xn = data.frame(x=c(100))
kfps = kknn(y~.,ddf,xn,k=40,kernel = "rectangular")
kfp = kknn(y~.,ddf,xn,k=kmin,kernel = "rectangular")

plot(x,y,xlab="mileage",ylab="price")
lines(x[oo],kfits$fitted.values[oo],col='blue',lwd=2)
points(100,kfps$fitted.values,pch=16,cex=2,col="green")
points(100,kfp$fitted.values,pch=3,cex=2,col="magenta")
title(main=paste0("predicted price is ",round(kfps$fitted.values,2), " thousand dollars"))

2. kNN, Cars Data with Mileage and Year

## librairies
library(kknn)
source("docv.R")

## data
cd = read.csv("http://www.rob-mcculloch.org/data/susedcars.csv")
x = cbind(cd$mileage/1000,cd$year)
y = cd$price/1000

xs = scale(x) # scale to mean 0, sd 1
mv = attr(xs,'scaled:center') #means
sv = attr(xs,'scaled:scale')  #standard deviations
set.seed(99)
kv = 2:50 #k values to try
n = length(y)
cvtemp = docvknn(xs,y,kv,nfold=5)
## in docv: nset,n,nfold:  49 1000 5 
## on fold:  1 , range:  1 : 200 
## on fold:  2 , range:  201 : 400 
## on fold:  3 , range:  401 : 600 
## on fold:  4 , range:  601 : 800 
## on fold:  5 , range:  801 : 1000
cvtemp = sqrt(cvtemp/n) #docvknn returns sum of squares
plot(kv,cvtemp)

kmin = kv[which.min(cvtemp)]
cat("the min rmse is: ",min(cvtemp),", at k = ",kmin)
## the min rmse is:  5.507357 , at k =  28

Wow, the min rmse is quite a bit smaller when we add year!!

ddf = data.frame(y,xs)
scmileage = (75-mv[1])/sv[1]
scyear = ((1995:2010)-mv[2])/sv[2]
ddfp = data.frame(X1 = rep(scmileage,length(scyear)), X2= scyear)
kfp = kknn(y~.,ddf,ddfp,k=kmin,kernel = "rectangular")

cat("predicted price: ",kfp$fitted.values)
## predicted price:  7.516286 7.694571 8.622964 9.227071 10.21004 10.77743 10.93571 11.99236 13.20211 14.41561 16.33375 21.37068 27.56979 31.59643 34.66604 36.95043
plot(1995:2010,kfp$fitted.values)
abline(h=17.37,col='green',lty=3)
abline(v=2008,col='blue',lty=4)
title(main = paste0("prediction for 2008: ",round(kfp$fitted.values[14],2)))

Let’s check to make sure that looking at years 1995:2010 makes sense at 75 (thousand) miles.

plot(x)

3. Choice of Kernel

I’m going to just change the definition of the function doknn which is used by docvknn to use the default optimal weighting.

To do it this way, I had to understand my code (yuck).

A simple train test split with a loop over k would be the most straightforward way to do it.

doknn = function(x,y,xp,k) {
   kdo=k[1]
   train = data.frame(x,y=y)
   test = data.frame(xp); names(test) = names(train)[1:(ncol(train)-1)]
   near  = kknn(y~.,train,test,k=kdo)
   return(near$fitted)
}

cvtempo = docvknn(xs,y,kv,nfold=5)
## in docv: nset,n,nfold:  49 1000 5 
## on fold:  1 , range:  1 : 200 
## on fold:  2 , range:  201 : 400 
## on fold:  3 , range:  401 : 600 
## on fold:  4 , range:  601 : 800 
## on fold:  5 , range:  801 : 1000
cvtempo = sqrt(cvtempo/n) #docvknn returns sum of squares

plot(range(kv),range(c(cvtemp,cvtempo)),xlab="k values",ylab="rmse",type="n")
lines(kv,cvtemp,col='red')
lines(kv,cvtempo,col='blue')

This is interesting in that it suggests you could use a much bigger k (simpler model!!) with the optimal weighting.