##################################################
### import

### basic 
import numpy as np
import pandas as pd
import math

import matplotlib.pyplot as plt
#%matplotlib

##sklearn learners
from sklearn.neighbors import KNeighborsRegressor

## scale the x variables when there is more than one
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

##sklearn metrics
from sklearn.metrics import mean_squared_error

##sklearn model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV

## random number generation
from numpy.random import default_rng

##################################################
### read in boston data
bd = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/Boston.csv")

#pull off y=medv and x = lstat
y = bd['medv']
X = bd['lstat'].to_numpy()[:,np.newaxis]

#plot x vs. y
plt.scatter(X,y)
plt.xlabel('lstat')
plt.ylabel('mdev')

Text(0, 0.5, 'mdev')

##################################################
### fit one knn

# create model object setting the hyperparameter n_neighbors to 50
knnmod = KNeighborsRegressor(n_neighbors=50)

# fit with training data
knnmod.fit(X,y)

#predict on sorted x training values
Xtest = np.sort(X[:,0])[:,np.newaxis]
yhat = knnmod.predict(Xtest)

#plot fit
plt.scatter(X,y,s=10,c='k',marker='.')
plt.plot(Xtest,yhat,c='red')
plt.xlabel('lstat')
plt.ylabel('medv')

Text(0, 0.5, 'medv')

##################################################
### train/test split

#train/test split
myseed = 34
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,random_state=myseed, test_size=.2)

# model object
kmod = KNeighborsRegressor(n_neighbors=50)

# fit on train
kmod.fit(Xtrain,ytrain)

# predict on test
ypred = kmod.predict(Xtest)

#plot to check predictions
plt.scatter(ytest,ypred)
plt.plot(ypred,ypred,c='red')

#rmse
k50mse = mean_squared_error(ytest,ypred)


#check rmse
check  = np.sum((ypred-ytest)**2)/len(ytest)
print('val from fun:',k50mse,' and check val: ',check)

val from fun: 32.063680313725484  and check val:  32.063680313725484

##################################################
###  loop over k using simple train/test split

Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,random_state=myseed, test_size=.2)

kvec = np.arange(50) + 2 #values of k to try
ormsev = np.zeros(len(kvec)) # storage for oos rsmse
irmsev = np.zeros(len(kvec)) # storage for in-sample rsmse

for i in range(len(kvec)):
   tmod = KNeighborsRegressor(n_neighbors=kvec[i])
   tmod.fit(Xtrain,ytrain)
   yhat = tmod.predict(Xtest)
   ormsev[i] = math.sqrt(mean_squared_error(ytest,yhat))
   yhat = tmod.predict(Xtrain)
   irmsev[i] = math.sqrt(mean_squared_error(ytrain,yhat))

# plot rmse vs k
plt.scatter(kvec,ormsev,c='blue',label='out-of-sample')
plt.plot(kvec,irmsev,c='red',label='in-sample')
plt.xlabel('k'); plt.ylabel('rmse')
plt.legend()

<matplotlib.legend.Legend at 0x714118f2ad10>

# plot rmse vs model complexity
mcmp = np.log(1/kvec) #model complexity
plt.scatter(mcmp,ormsev,c='blue',label='out-of-sample')
plt.plot(mcmp,irmsev,c='red',label='in-sample')
plt.xlabel('model complexity = log(1/k)',size='x-large')
plt.ylabel('rmse',size='x-large')
plt.title('rmse: blue: out of sample, red: in sample',size='x-large')
plt.legend()

<matplotlib.legend.Legend at 0x714118f96020>

##################################################
###  cross validation using sklearn cross_val_score


#model object
tempmod = KNeighborsRegressor(n_neighbors=40) #knn with k=40

## rmse from cross validation
cvres = cross_val_score(tempmod,X,y,cv=5,scoring='neg_mean_squared_error') #cross val with 5 folds

# tranform to rmse
rmse = math.sqrt(np.mean(-cvres))
print('the rmse for k=40 based on 5-fold is:', rmse)

the rmse for k=40 based on 5-fold is: 5.54000938117711

## do it again but shuffle the data

# how do you shuffle the data?
rng = default_rng(34) # 34 for Auston Matthews
# let's try a simple example first
temp = rng.choice(10,10,replace=False)
print(temp) # temp is 0 to 9 randomly shuffled by sampling without replacement

[7 8 9 3 1 5 0 6 2 4]

#np.random.seed(34)  #old way
rng = default_rng(34) # 34 for Auston Matthews
indices = rng.choice(X.shape[0],X.shape[0],replace=False)
ys = y[indices]
Xs = X[indices,:]
cvres = cross_val_score(tempmod,Xs,ys,cv=5,scoring='neg_mean_squared_error')
rmse = math.sqrt(np.mean(-cvres))
print('the rmse for k=40 based on 5-fold is:', rmse)

the rmse for k=40 based on 5-fold is: 5.268138137040043

##check shuffle
print(indices[:3])
print(y[indices[0]],y[indices[1]],y[indices[2]])
print(ys[:3])

[387 233 336]
7.4 48.3 19.5
387     7.4
233    48.3
336    19.5
Name: medv, dtype: float64

##################################################
###  cross validation on a grid of k values using sklearn validation_curve function

# create the knn model
model = KNeighborsRegressor() # create the knn model

# do cv at every value of k in kvec
# each row of (trains,test)S will correspond to a value of k
# each column has the cv=10 neg_mean_squared_error in-sample (trainS) and out of sample (testS)
trainS, testS = validation_curve(model,X,y,param_name = 'n_neighbors',param_range = kvec,cv=10,scoring='neg_mean_squared_error')

# transform neg_mean_squared_error to rmse
trrmse = np.sqrt(-trainS.mean(axis=1))
termse = np.sqrt(-testS.mean(axis=1))

#plot in and out of sample rmse
plt.scatter(mcmp,termse,label='in-sample')
plt.plot(mcmp,trrmse,c='red',label='out-of-sample')
plt.xlabel('model complexity = log(1/k)',size='x-large')
plt.ylabel('rmse',size='x-large')
plt.legend()

<matplotlib.legend.Legend at 0x71411852bdc0>

print(np.argmin(termse))

29

print(kvec[29])

31

#################################################
### cross val on a grid using sklearn GridSearchCV

# hyperparamter values to try in the gid search
param_grid={'n_neighbors' : kvec} # same as above

# grid  is the grid searh object
grid = GridSearchCV(model,param_grid,cv=10,scoring='neg_mean_squared_error')

# now run the grid search
grid.fit(X,y)

print(grid.best_params_) #best value from grid
print(grid.best_index_) # index of best value from grid
#check
print(kvec[grid.best_index_])


temp = grid.cv_results_ # results from the grid search (a dictionary)
print(temp.keys()) # what is in temp
temp['mean_test_score'] # this is the average score over folds at the values in param_grid

#transform to rmse
rmsevals = np.sqrt(-temp['mean_test_score'])

# plot
plt.plot(mcmp,rmsevals) # plot model complexity vs. rmse
plt.xlabel('model complexity = log(1/k)',size='x-large')
plt.ylabel('rmse',size='x-large')
plt.title('rmse from GridSearch')

{'n_neighbors': 31}
29
31
dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_n_neighbors', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'split5_test_score', 'split6_test_score', 'split7_test_score', 'split8_test_score', 'split9_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

Text(0.5, 1.0, 'rmse from GridSearch')

##################################################
### simple train/test split with  4 features ['lstat','nox','rm','ptratio']

### read in boston data
bd = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/Boston.csv")

#pull off y=medv and x = lstat
y = bd['medv']
X = bd[['lstat','nox','rm','ptratio']].to_numpy()

#train/test split
myseed = 34
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,random_state=myseed, test_size=.2)

## min,max scale the columns of X (store in Xs for X scaled)
sc = MinMaxScaler()
# get scaling using train
Xtrs = sc.fit_transform(Xtrain)
# scale test X
Xtes = sc.transform(Xtest)

## check scaling
print(np.min(Xtrs,axis=0))
print(np.max(Xtrs,axis=0))
print(np.min(Xtes,axis=0))
print(np.max(Xtes,axis=0))

kvec = np.arange(50) + 2 #values of k to try
ormsev = np.zeros(len(kvec)) # storage for oos rsmse
irmsev = np.zeros(len(kvec)) # storage for in-sample rsmse
for i in range(len(kvec)):
   tmod = KNeighborsRegressor(n_neighbors=kvec[i])
   tmod.fit(Xtrs,ytrain)
   yhat = tmod.predict(Xtes)
   ormsev[i] = math.sqrt(mean_squared_error(ytest,yhat))
   yhat = tmod.predict(Xtrs)
   irmsev[i] = math.sqrt(mean_squared_error(ytrain,yhat))

ii = np.argmin(ormsev)
print(f'best k is {kvec[ii]}')

# plot rmse vs k
plt.scatter(kvec,ormsev,c='blue',label='out-of-sample')
plt.plot(kvec,irmsev,c='red',label='in-sample')
plt.xlabel('k'); plt.ylabel('rmse')
plt.legend()

[0. 0. 0. 0.]
[1. 1. 1. 1.]
[ 0.03489362 -0.00829876  0.11055758  0.        ]
[1.02808511 1.         0.9268059  1.        ]
best k is 4

<matplotlib.legend.Legend at 0x714115b13310>

# plot rmse vs model complexity
mcmp = np.log(1/kvec) #model complexity
plt.scatter(mcmp,ormsev,c='blue',label='out-of-sample')
plt.plot(mcmp,irmsev,c='red',label='in-sample')
plt.xlabel('model complexity = log(1/k)',size='x-large')
plt.ylabel('rmse',size='x-large')
plt.title(f'best k is {kvec[ii]}')
plt.legend()

<matplotlib.legend.Legend at 0x714115baeec0>

Simple Boston with KNN in python¶