Use a pipeline in sklearn
For things like cross-validation, steps like scaling should be learned on the training data
and then applied to the test data.
The pipeline feature is very convenient for this!
Notice the import of make_pipeline in the next code cell.
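Before the notebook proper, here is a minimal sketch of the pattern the pipeline automates (the names Xtrain, Xtest, ytrain are placeholders for a train/test split like the one made below):

from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
sc = MinMaxScaler()
Xtrain_s = sc.fit_transform(Xtrain)  # learn the min/max from train and scale train
Xtest_s = sc.transform(Xtest)        # apply the train min/max to test, no refitting
knmod = KNeighborsRegressor().fit(Xtrain_s, ytrain)
yhat = knmod.predict(Xtest_s)

A pipeline bundles the scaler and the learner into a single estimator whose fit and predict do these steps in the right order.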
In [19]:
##################################################
### basic imports
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
##sklearn learners
from sklearn.neighbors import KNeighborsRegressor
## scale the x variables when there is more than one
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
##sklearn metrics
from sklearn.metrics import mean_squared_error
##sklearn model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve
##pipeline
from sklearn.pipeline import make_pipeline
In [20]:
######################################################
### read in boston data
bd = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/Boston.csv")
y = bd['medv'].to_numpy()
## pull off 4 of the features
X = bd[['lstat','nox','rm','ptratio']].to_numpy()
## have a look at the data: plot lstat (the first column of X) vs. y = medv
plt.scatter(X[:,0],y)
plt.xlabel('lstat')
plt.ylabel('medv')
Out[20]:
Text(0, 0.5, 'medv')
In [21]:
##################################################
### use get_params/set_params; we will need these to set parameters inside the pipeline
## check that we can change k=n_neighbors
knmod = KNeighborsRegressor()
print(knmod.get_params())
knmod.set_params(n_neighbors = 50)
print(knmod.get_params())
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 50, 'p': 2, 'weights': 'uniform'}
In [22]:
##################################################
## make a pipeline and check how to change its parameters
## pipeline first scales and then uses KNN
Bp = make_pipeline(MinMaxScaler(),KNeighborsRegressor())
print(Bp.get_params()) #notice how the names reflect both the Scaler and the KNeighborsRegressor !!
## looks like the name for k in Bp is 'kneighborsregressor__n_neighbors'
## check we can change it
Bp.set_params(kneighborsregressor__n_neighbors = 7)
print('\n\n',Bp.get_params())
{'memory': None, 'steps': [('minmaxscaler', MinMaxScaler()), ('kneighborsregressor', KNeighborsRegressor())], 'verbose': False, 'minmaxscaler': MinMaxScaler(), 'kneighborsregressor': KNeighborsRegressor(), 'minmaxscaler__clip': False, 'minmaxscaler__copy': True, 'minmaxscaler__feature_range': (0, 1), 'kneighborsregressor__algorithm': 'auto', 'kneighborsregressor__leaf_size': 30, 'kneighborsregressor__metric': 'minkowski', 'kneighborsregressor__metric_params': None, 'kneighborsregressor__n_jobs': None, 'kneighborsregressor__n_neighbors': 5, 'kneighborsregressor__p': 2, 'kneighborsregressor__weights': 'uniform'}

{'memory': None, 'steps': [('minmaxscaler', MinMaxScaler()), ('kneighborsregressor', KNeighborsRegressor(n_neighbors=7))], 'verbose': False, 'minmaxscaler': MinMaxScaler(), 'kneighborsregressor': KNeighborsRegressor(n_neighbors=7), 'minmaxscaler__clip': False, 'minmaxscaler__copy': True, 'minmaxscaler__feature_range': (0, 1), 'kneighborsregressor__algorithm': 'auto', 'kneighborsregressor__leaf_size': 30, 'kneighborsregressor__metric': 'minkowski', 'kneighborsregressor__metric_params': None, 'kneighborsregressor__n_jobs': None, 'kneighborsregressor__n_neighbors': 7, 'kneighborsregressor__p': 2, 'kneighborsregressor__weights': 'uniform'}
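A note on those parameter names: make_pipeline builds the step names ('minmaxscaler', 'kneighborsregressor') from the class names. If you want to choose the names yourself you can use the Pipeline class directly; a small sketch (the step names 'scale' and 'knn' are just illustrative choices):

from sklearn.pipeline import Pipeline
Bp2 = Pipeline([('scale', MinMaxScaler()), ('knn', KNeighborsRegressor())])
Bp2.set_params(knn__n_neighbors = 7)  # the k parameter is now knn__n_neighbors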
In [23]:
##################################################
### simple train/test split
#train/test split
myseed = 88 # William Nylander (a Toronto Maple Leaf)
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,random_state=myseed, test_size=.2)
#set k, fit on train, predict on test
Bp.set_params(kneighborsregressor__n_neighbors = 5)
Bp.fit(Xtrain,ytrain)
yp = Bp.predict(Xtest)
## change k and do it again
Bp.set_params(kneighborsregressor__n_neighbors = 50)
Bp.fit(Xtrain,ytrain)
yp1 = Bp.predict(Xtest)
## plot
plt.scatter(ytest,yp,c='red',label='k=5 predictions',s=40)
plt.scatter(ytest,yp1,c='green',label='k=50 predictions',s=20)
plt.plot(ytest,ytest,c='blue',label='y=x line')
plt.xlabel('y test'); plt.ylabel('y predictions')
plt.legend()
Out[23]:
<matplotlib.legend.Legend at 0x792408f51cf0>
In [24]:
##################################################
### let's check Bp by doing it by hand: scale on train, then apply that scaling to test
## min/max scale the columns of X (Xtrs: scaled train, Xtes: scaled test)
sc = MinMaxScaler()
# get scaling using train
Xtrs = sc.fit_transform(Xtrain)
# scale test X
Xtes = sc.transform(Xtest)
## check scaling
print(np.min(Xtrs,axis=0))
print(np.max(Xtrs,axis=0))
print(np.min(Xtes,axis=0))
print(np.max(Xtes,axis=0))
knmod = KNeighborsRegressor()
knmod.set_params(n_neighbors = 5)
print(knmod.get_params())
knmod.fit(Xtrs,ytrain)
ypred = knmod.predict(Xtes)
## are yp and ypred the same (should be)
plt.scatter(yp,ypred,c='red')
plt.plot(ypred,ypred,c='blue')
plt.xlabel('y pred from pipeline'); plt.ylabel('y pred by hand')
pd.Series(yp-ypred).describe()
[0. 0. 0. 0.]
[1. 1. 1. 1.]
[0.03173289 0.01440329 0.20904388 0.        ]
[0.83498896 1.         0.90745354 1.        ]
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
Out[24]:
count    102.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
dtype: float64
In [25]:
##################################################
### cross_val_score
## do it at a good k (5) and a bad k (50)
Bp.set_params(kneighborsregressor__n_neighbors = 5)
cvres5 = cross_val_score(Bp,X,y,cv=10,scoring='neg_mean_squared_error') #cross val with 10 folds
rmse5 = math.sqrt(np.mean(-cvres5))
Bp.set_params(kneighborsregressor__n_neighbors = 50)
cvres50 = cross_val_score(Bp,X,y,cv=10,scoring='neg_mean_squared_error') #cross val with 10 folds
rmse50 = math.sqrt(np.mean(-cvres50))
print(f'rmse at k = 5 is {rmse5}, and at 50 is {rmse50}')
rmse at k = 5 is 4.563113464102608, and at 50 is 5.498710768356453
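Rather than calling cross_val_score once per k, GridSearchCV can run the same 10-fold cross-validation over a whole grid of k values using the pipeline parameter name from above; a sketch (the grid 2 to 50 is just an illustrative choice):

from sklearn.model_selection import GridSearchCV
pgrid = {'kneighborsregressor__n_neighbors': np.arange(2,51)}
gcv = GridSearchCV(Bp, pgrid, cv=10, scoring='neg_mean_squared_error')
gcv.fit(X,y)
print(gcv.best_params_, math.sqrt(-gcv.best_score_))  # best k and its cv rmse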
In [26]:
##################################################
### validation curve with Bp pipeline
kvec = np.arange(20) + 2 #values of k to try
mcmp = np.log(1/kvec) #model complexity
trainS, testS = validation_curve(Bp,X,y,param_name = 'kneighborsregressor__n_neighbors',param_range = kvec,cv=10,scoring='neg_mean_squared_error')
# transform neg_mean_squared_error to rmse
trrmse = np.sqrt(-trainS.mean(axis=1))
termse = np.sqrt(-testS.mean(axis=1))
ii = np.argmin(termse)
print(f'the k with min test rmse is {kvec[ii]}')
#plot in and out of sample rmse with complexity
plt.scatter(mcmp,termse,label='out-of-sample')
plt.plot(mcmp,trrmse,c='red',label='in-sample')
plt.xlabel('model complexity = log(1/k)',size='x-large')
plt.ylabel('rmse',size='x-large')
plt.title(f'the k with min test rmse is {kvec[ii]}')
plt.legend()
the k with min test rmse is 5
Out[26]:
<matplotlib.legend.Legend at 0x792408ff1300>
In [27]:
#plot in and out of sample rmse with kvec
plt.scatter(kvec,termse,label='out-of-sample')
plt.plot(kvec,trrmse,c='red',label='in-sample')
plt.xlabel('kvec',size='x-large')
plt.ylabel('rmse',size='x-large')
plt.title(f'the k with min test rmse is {kvec[ii]}')
plt.legend()
Out[27]:
<matplotlib.legend.Legend at 0x7924098e4d00>
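Having picked k from the validation curve, a natural final check is to refit the pipeline at that k on the training data and compute rmse on the held-out test set; a sketch using the objects already defined above:

Bp.set_params(kneighborsregressor__n_neighbors = kvec[ii])  # the cross-validated choice of k
Bp.fit(Xtrain,ytrain)
rmse_test = math.sqrt(mean_squared_error(ytest, Bp.predict(Xtest)))
print(f'test rmse at k = {kvec[ii]} is {rmse_test}')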