Simple Boston with KNN in pythonĀ¶
InĀ [33]:
##################################################
### import
### basic
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
#%matplotlib
##sklearn learners
from sklearn.neighbors import KNeighborsRegressor
## scale the x variables when there is more than one
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
##sklearn metrics
from sklearn.metrics import mean_squared_error
##sklearn model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV
## random number generation
from numpy.random import default_rng
InĀ [34]:
##################################################
### read in boston data
bd = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/Boston.csv")
#pull off y=medv and x = lstat
y = bd['medv']
X = bd['lstat'].to_numpy()[:,np.newaxis]
#plot x vs. y
plt.scatter(X,y)
plt.xlabel('lstat')
plt.ylabel('mdev')
Out[34]:
Text(0, 0.5, 'mdev')
InĀ [35]:
##################################################
### fit one knn
# create model object setting the hyperparameter n_neighbors to 50
knnmod = KNeighborsRegressor(n_neighbors=50)
# fit with training data
knnmod.fit(X,y)
#predict on sorted x training values
Xtest = np.sort(X[:,0])[:,np.newaxis]
yhat = knnmod.predict(Xtest)
#plot fit
plt.scatter(X,y,s=10,c='k',marker='.')
plt.plot(Xtest,yhat,c='red')
plt.xlabel('lstat')
plt.ylabel('medv')
Out[35]:
Text(0, 0.5, 'medv')
InĀ [36]:
##################################################
### train/test split
#train/test split
myseed = 34
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,random_state=myseed, test_size=.2)
# model object
kmod = KNeighborsRegressor(n_neighbors=50)
# fit on train
kmod.fit(Xtrain,ytrain)
# predict on test
ypred = kmod.predict(Xtest)
#plot to check predictions
plt.scatter(ytest,ypred)
plt.plot(ypred,ypred,c='red')
#rmse
k50mse = mean_squared_error(ytest,ypred)
#check rmse
check = np.sum((ypred-ytest)**2)/len(ytest)
print('val from fun:',k50mse,' and check val: ',check)
val from fun: 32.063680313725484 and check val: 32.063680313725484
InĀ [37]:
##################################################
### loop over k using simple train/test split
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,random_state=myseed, test_size=.2)
kvec = np.arange(50) + 2 #values of k to try
ormsev = np.zeros(len(kvec)) # storage for oos rsmse
irmsev = np.zeros(len(kvec)) # storage for in-sample rsmse
for i in range(len(kvec)):
tmod = KNeighborsRegressor(n_neighbors=kvec[i])
tmod.fit(Xtrain,ytrain)
yhat = tmod.predict(Xtest)
ormsev[i] = math.sqrt(mean_squared_error(ytest,yhat))
yhat = tmod.predict(Xtrain)
irmsev[i] = math.sqrt(mean_squared_error(ytrain,yhat))
# plot rmse vs k
plt.scatter(kvec,ormsev,c='blue',label='out-of-sample')
plt.plot(kvec,irmsev,c='red',label='in-sample')
plt.xlabel('k'); plt.ylabel('rmse')
plt.legend()
Out[37]:
<matplotlib.legend.Legend at 0x714118f2ad10>
InĀ [38]:
# plot rmse vs model complexity
mcmp = np.log(1/kvec) #model complexity
plt.scatter(mcmp,ormsev,c='blue',label='out-of-sample')
plt.plot(mcmp,irmsev,c='red',label='in-sample')
plt.xlabel('model complexity = log(1/k)',size='x-large')
plt.ylabel('rmse',size='x-large')
plt.title('rmse: blue: out of sample, red: in sample',size='x-large')
plt.legend()
Out[38]:
<matplotlib.legend.Legend at 0x714118f96020>
InĀ [39]:
##################################################
### cross validation using sklearn cross_val_score
#model object
tempmod = KNeighborsRegressor(n_neighbors=40) #knn with k=40
## rmse from cross validation
cvres = cross_val_score(tempmod,X,y,cv=5,scoring='neg_mean_squared_error') #cross val with 5 folds
# tranform to rmse
rmse = math.sqrt(np.mean(-cvres))
print('the rmse for k=40 based on 5-fold is:', rmse)
the rmse for k=40 based on 5-fold is: 5.54000938117711
InĀ [40]:
## do it again but shuffle the data
# how do you shuffle the data?
rng = default_rng(34) # 34 for Auston Matthews
# let's try a simple example first
temp = rng.choice(10,10,replace=False)
print(temp) # temp is 0 to 9 randomly shuffled by sampling without replacement
[7 8 9 3 1 5 0 6 2 4]
InĀ [41]:
#np.random.seed(34) #old way
rng = default_rng(34) # 34 for Auston Matthews
indices = rng.choice(X.shape[0],X.shape[0],replace=False)
ys = y[indices]
Xs = X[indices,:]
cvres = cross_val_score(tempmod,Xs,ys,cv=5,scoring='neg_mean_squared_error')
rmse = math.sqrt(np.mean(-cvres))
print('the rmse for k=40 based on 5-fold is:', rmse)
the rmse for k=40 based on 5-fold is: 5.268138137040043
InĀ [42]:
##check shuffle
print(indices[:3])
print(y[indices[0]],y[indices[1]],y[indices[2]])
print(ys[:3])
[387 233 336] 7.4 48.3 19.5 387 7.4 233 48.3 336 19.5 Name: medv, dtype: float64
InĀ [43]:
##################################################
### cross validation on a grid of k values using sklearn validation_curve function
# create the knn model
model = KNeighborsRegressor() # create the knn model
# do cv at every value of k in kvec
# each row of (trains,test)S will correspond to a value of k
# each column has the cv=10 neg_mean_squared_error in-sample (trainS) and out of sample (testS)
trainS, testS = validation_curve(model,X,y,param_name = 'n_neighbors',param_range = kvec,cv=10,scoring='neg_mean_squared_error')
# transform neg_mean_squared_error to rmse
trrmse = np.sqrt(-trainS.mean(axis=1))
termse = np.sqrt(-testS.mean(axis=1))
#plot in and out of sample rmse
plt.scatter(mcmp,termse,label='in-sample')
plt.plot(mcmp,trrmse,c='red',label='out-of-sample')
plt.xlabel('model complexity = log(1/k)',size='x-large')
plt.ylabel('rmse',size='x-large')
plt.legend()
Out[43]:
<matplotlib.legend.Legend at 0x71411852bdc0>
InĀ [44]:
print(np.argmin(termse))
29
InĀ [45]:
print(kvec[29])
31
InĀ [46]:
#################################################
### cross val on a grid using sklearn GridSearchCV
# hyperparamter values to try in the gid search
param_grid={'n_neighbors' : kvec} # same as above
# grid is the grid searh object
grid = GridSearchCV(model,param_grid,cv=10,scoring='neg_mean_squared_error')
# now run the grid search
grid.fit(X,y)
print(grid.best_params_) #best value from grid
print(grid.best_index_) # index of best value from grid
#check
print(kvec[grid.best_index_])
temp = grid.cv_results_ # results from the grid search (a dictionary)
print(temp.keys()) # what is in temp
temp['mean_test_score'] # this is the average score over folds at the values in param_grid
#transform to rmse
rmsevals = np.sqrt(-temp['mean_test_score'])
# plot
plt.plot(mcmp,rmsevals) # plot model complexity vs. rmse
plt.xlabel('model complexity = log(1/k)',size='x-large')
plt.ylabel('rmse',size='x-large')
plt.title('rmse from GridSearch')
{'n_neighbors': 31} 29 31 dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_n_neighbors', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'split5_test_score', 'split6_test_score', 'split7_test_score', 'split8_test_score', 'split9_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])
Out[46]:
Text(0.5, 1.0, 'rmse from GridSearch')
InĀ [47]:
##################################################
### simple train/test split with 4 features ['lstat','nox','rm','ptratio']
### read in boston data
bd = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/Boston.csv")
#pull off y=medv and x = lstat
y = bd['medv']
X = bd[['lstat','nox','rm','ptratio']].to_numpy()
#train/test split
myseed = 34
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,random_state=myseed, test_size=.2)
## min,max scale the columns of X (store in Xs for X scaled)
sc = MinMaxScaler()
# get scaling using train
Xtrs = sc.fit_transform(Xtrain)
# scale test X
Xtes = sc.transform(Xtest)
## check scaling
print(np.min(Xtrs,axis=0))
print(np.max(Xtrs,axis=0))
print(np.min(Xtes,axis=0))
print(np.max(Xtes,axis=0))
kvec = np.arange(50) + 2 #values of k to try
ormsev = np.zeros(len(kvec)) # storage for oos rsmse
irmsev = np.zeros(len(kvec)) # storage for in-sample rsmse
for i in range(len(kvec)):
tmod = KNeighborsRegressor(n_neighbors=kvec[i])
tmod.fit(Xtrs,ytrain)
yhat = tmod.predict(Xtes)
ormsev[i] = math.sqrt(mean_squared_error(ytest,yhat))
yhat = tmod.predict(Xtrs)
irmsev[i] = math.sqrt(mean_squared_error(ytrain,yhat))
ii = np.argmin(ormsev)
print(f'best k is {kvec[ii]}')
# plot rmse vs k
plt.scatter(kvec,ormsev,c='blue',label='out-of-sample')
plt.plot(kvec,irmsev,c='red',label='in-sample')
plt.xlabel('k'); plt.ylabel('rmse')
plt.legend()
[0. 0. 0. 0.] [1. 1. 1. 1.] [ 0.03489362 -0.00829876 0.11055758 0. ] [1.02808511 1. 0.9268059 1. ] best k is 4
Out[47]:
<matplotlib.legend.Legend at 0x714115b13310>
InĀ [48]:
# plot rmse vs model complexity
mcmp = np.log(1/kvec) #model complexity
plt.scatter(mcmp,ormsev,c='blue',label='out-of-sample')
plt.plot(mcmp,irmsev,c='red',label='in-sample')
plt.xlabel('model complexity = log(1/k)',size='x-large')
plt.ylabel('rmse',size='x-large')
plt.title(f'best k is {kvec[ii]}')
plt.legend()
Out[48]:
<matplotlib.legend.Legend at 0x714115baeec0>