Use a pipeline in sklearn

When doing things like cross-validation, preprocessing steps such as scaling should be
learned on the training data only and then applied to the test data.

The pipeline feature is very convenient for this!

Notice the import of make_pipeline in the next code cell.

In [22]:
##################################################
### basic  imports
import numpy as np
import pandas as pd
import math

import matplotlib.pyplot as plt

##sklearn learners
from sklearn.neighbors import KNeighborsRegressor

## scale the x variables when there is more than one
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

##sklearn metrics
from sklearn.metrics import mean_squared_error

##sklearn model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve

##pipeline
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
In [23]:
######################################################
### read in boston data
## NOTE: the CSV is fetched over the network each time this cell runs
bd = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/Boston.csv")

## response: the medv column as a 1-d numpy array
y = bd['medv'].to_numpy()

## pull off 4 of the features as a 2-d numpy array (columns in the order listed)
X = bd[['lstat','nox','rm','ptratio']].to_numpy()

## check our lstat vs. y medv plot (lstat is column 0 of X)
plt.scatter(X[:,0],y)
plt.xlabel('lstat')
plt.ylabel('medv')
## show the figure explicitly so the cell does not echo the Text(...) repr of the last call
plt.show()
Out[23]:
Text(0, 0.5, 'medv')
No description has been provided for this image
In [24]:
##################################################
### get_params / set_params on a single estimator
### (the pipeline below is configured through the same two methods)

## start from a default KNN regressor and show its parameters
knmod = KNeighborsRegressor()
default_params = knmod.get_params()
print(default_params)

## change k (= n_neighbors) and show the parameters again
knmod.set_params(n_neighbors = 50)
updated_params = knmod.get_params()
print(updated_params)
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 50, 'p': 2, 'weights': 'uniform'}
In [25]:
##################################################
## build a pipeline and see how its parameters are named and changed

## two steps: min-max scale the features, then run KNN on the scaled features
Bp = make_pipeline(MinMaxScaler(),KNeighborsRegressor())
params_initial = Bp.get_params()
print(params_initial)  # keys carry the step name as a prefix: 'minmaxscaler__...' / 'kneighborsregressor__...'

## k is stored under 'kneighborsregressor__n_neighbors'
## change it and print the parameters again to confirm
Bp.set_params(kneighborsregressor__n_neighbors = 7)
params_changed = Bp.get_params()
print('\n\n',params_changed)
{'memory': None, 'steps': [('minmaxscaler', MinMaxScaler()), ('kneighborsregressor', KNeighborsRegressor())], 'verbose': False, 'minmaxscaler': MinMaxScaler(), 'kneighborsregressor': KNeighborsRegressor(), 'minmaxscaler__clip': False, 'minmaxscaler__copy': True, 'minmaxscaler__feature_range': (0, 1), 'kneighborsregressor__algorithm': 'auto', 'kneighborsregressor__leaf_size': 30, 'kneighborsregressor__metric': 'minkowski', 'kneighborsregressor__metric_params': None, 'kneighborsregressor__n_jobs': None, 'kneighborsregressor__n_neighbors': 5, 'kneighborsregressor__p': 2, 'kneighborsregressor__weights': 'uniform'}


 {'memory': None, 'steps': [('minmaxscaler', MinMaxScaler()), ('kneighborsregressor', KNeighborsRegressor(n_neighbors=7))], 'verbose': False, 'minmaxscaler': MinMaxScaler(), 'kneighborsregressor': KNeighborsRegressor(n_neighbors=7), 'minmaxscaler__clip': False, 'minmaxscaler__copy': True, 'minmaxscaler__feature_range': (0, 1), 'kneighborsregressor__algorithm': 'auto', 'kneighborsregressor__leaf_size': 30, 'kneighborsregressor__metric': 'minkowski', 'kneighborsregressor__metric_params': None, 'kneighborsregressor__n_jobs': None, 'kneighborsregressor__n_neighbors': 7, 'kneighborsregressor__p': 2, 'kneighborsregressor__weights': 'uniform'}
In [26]:
##################################################
### simple train/test split

def fit_predict_at_k(pipe, k, Xtr, ytr, Xte):
    """Set the pipeline's KNN k, refit on the training data, and predict the test rows.

    pipe is modified in place: after the call it holds n_neighbors=k and is
    fitted on (Xtr, ytr). Returns the predictions for Xte.
    """
    pipe.set_params(kneighborsregressor__n_neighbors = k)
    pipe.fit(Xtr,ytr)
    return pipe.predict(Xte)

#train/test split: 20% of the rows are held out as test
myseed = 88 # William Nylander (a Toronto Maple Leaf)
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,random_state=myseed, test_size=.2)

## predictions at k=5, then at k=50 (Bp is left fitted with k=50 afterwards)
yp = fit_predict_at_k(Bp, 5, Xtrain, ytrain, Xtest)
yp1 = fit_predict_at_k(Bp, 50, Xtrain, ytrain, Xtest)

## plot predictions against the actual test y; points on the blue line are perfect predictions
plt.scatter(ytest,yp,c='red',label='predictions, k = 5',s=40)
plt.scatter(ytest,yp1,c='green',label='predictions, k = 50',s=20)
plt.plot(ytest,ytest,c='blue',label='y=x line')
plt.xlabel('y test'); plt.ylabel('y predictions')
plt.legend()
## show the figure explicitly so the cell does not echo the Legend repr
plt.show()
Out[26]:
<matplotlib.legend.Legend at 0x77824ff73790>
No description has been provided for this image
In [27]:
##################################################
### let's check Bp by doing it by hand, this is scale on train and then apply to test

## min,max scale the columns of X: Xtrs = scaled train X, Xtes = scaled test X
sc = MinMaxScaler()
# get scaling using train
Xtrs = sc.fit_transform(Xtrain)
# scale test X with the min/max learned on train
Xtes = sc.transform(Xtest)

## check scaling: the train columns should run from 0 to 1, the test columns need not
print(np.min(Xtrs,axis=0))
print(np.max(Xtrs,axis=0))
print(np.min(Xtes,axis=0))
print(np.max(Xtes,axis=0))

## same learner as the last pipeline step, at the k (5) that produced yp
knmod = KNeighborsRegressor()
knmod.set_params(n_neighbors = 5)
print(knmod.get_params())

knmod.fit(Xtrs,ytrain)
ypred = knmod.predict(Xtes)

## are yp and ypred the same (should be): fail loudly if not, instead of relying on the plot
np.testing.assert_allclose(yp, ypred)

## visual check: points on the blue y=x line mean pipeline and by-hand predictions agree
plt.scatter(yp,ypred,c='red')
plt.plot(ypred,ypred,c='blue')
plt.xlabel('y pred from pipeline'); plt.ylabel('y pred by hand')
## summary of the differences is the cell's displayed output
pd.Series(yp-ypred).describe()
[0. 0. 0. 0.]
[1. 1. 1. 1.]
[0.03173289 0.01440329 0.20904388 0.        ]
[0.83498896 1.         0.90745354 1.        ]
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
Out[27]:
count    102.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
dtype: float64
No description has been provided for this image
In [28]:
##################################################
###  accessing components of the pipeline

## the step names are the same prefixes that show up in get_params()
step_names = Bp.named_steps.keys()
print(step_names)

## pull one step out of the pipeline by its name
temp = Bp.named_steps['kneighborsregressor']
temp_type = type(temp)
print(temp_type)

## attributes/methods of that step can then be used directly
n_in = temp.n_features_in_
print(n_in)
dict_keys(['minmaxscaler', 'kneighborsregressor'])
<class 'sklearn.neighbors._regression.KNeighborsRegressor'>
4
In [29]:
##################################################
### another way to make a pipeline where you can pick the names of the components

## Pipeline takes (name, estimator) pairs, so the steps are called 'mms' and 'knn'
Bp2 = Pipeline([
         ('mms',MinMaxScaler()),
         ('knn',KNeighborsRegressor(n_neighbors=5))
])

Bp2.fit(Xtrain,ytrain)
yp2 = Bp2.predict(Xtest)

## compare with the by-hand k=5 predictions; points on the red y=x line agree
plt.scatter(ypred,yp2,c='blue')
plt.plot(ypred,ypred,c='red')
plt.xlabel('y pred by hand'); plt.ylabel('y pred from named pipeline')

## step names and parameter prefixes now use the chosen names
print(Bp2.named_steps.keys())
junk = Bp2.named_steps['knn']
print(junk.n_features_in_)

print(Bp2.get_params())
dict_keys(['mms', 'knn'])
4
{'memory': None, 'steps': [('mms', MinMaxScaler()), ('knn', KNeighborsRegressor())], 'verbose': False, 'mms': MinMaxScaler(), 'knn': KNeighborsRegressor(), 'mms__clip': False, 'mms__copy': True, 'mms__feature_range': (0, 1), 'knn__algorithm': 'auto', 'knn__leaf_size': 30, 'knn__metric': 'minkowski', 'knn__metric_params': None, 'knn__n_jobs': None, 'knn__n_neighbors': 5, 'knn__p': 2, 'knn__weights': 'uniform'}
No description has been provided for this image
In [30]:
##################################################
###  cross_val_score

def _cv_neg_mse(k):
    """Point Bp at n_neighbors=k and return its 10-fold neg-MSE scores on (X, y)."""
    Bp.set_params(kneighborsregressor__n_neighbors = k)
    return cross_val_score(Bp,X,y,cv=10,scoring='neg_mean_squared_error')

## compare a good k (5) against a bad k (50); Bp is left at k=50 afterwards
cvres5 = _cv_neg_mse(5)
cvres50 = _cv_neg_mse(50)

## scores are negated MSE per fold: flip the sign, average over the folds, take the root
rmse5 = math.sqrt(np.mean(-cvres5))
rmse50 = math.sqrt(np.mean(-cvres50))

print(f'rmse at k = 5 is {rmse5}, and at 50 is {rmse50}')
rmse at k = 5 is 4.563113464102608, and at 50 is 5.498710768356453
In [31]:
##################################################
### validation curve with Bp pipeline

## candidate k values 2,...,21 and the matching complexity measure log(1/k)
kvec = np.arange(2, 22)
mcmp = np.log(1/kvec)

## 10-fold train and test neg-MSE scores for every k in kvec
trainS, testS = validation_curve(
    Bp, X, y,
    param_name = 'kneighborsregressor__n_neighbors',
    param_range = kvec,
    cv=10,
    scoring='neg_mean_squared_error',
)

## neg_mean_squared_error -> rmse: average along axis 1, flip the sign, take the root
trrmse = np.sqrt(-trainS.mean(axis=1))
termse = np.sqrt(-testS.mean(axis=1))

## position in kvec of the smallest out-of-sample rmse
ii = np.argmin(termse)
best_k_msg = f'the k with min test rmse is {kvec[ii]}'
print(best_k_msg)

## out-of-sample (points) and in-sample (line) rmse against model complexity
fig, ax = plt.subplots()
ax.scatter(mcmp,termse,label='out-of-sample')
ax.plot(mcmp,trrmse,c='red',label='in-sample')
ax.set_xlabel('model complexity = log(1/k)',size='x-large')
ax.set_ylabel('rmse',size='x-large')
ax.set_title(best_k_msg)
ax.legend()
the k with min test rmse is 5
Out[31]:
<matplotlib.legend.Legend at 0x77824ff420b0>
No description has been provided for this image
In [32]:
## out-of-sample (points) and in-sample (line) rmse, this time with k itself on the x axis
fig, ax = plt.subplots()
ax.scatter(kvec,termse,label='out-of-sample')
ax.plot(kvec,trrmse,c='red',label='in-sample')
ax.set_xlabel('kvec',size='x-large')
ax.set_ylabel('rmse',size='x-large')
ax.set_title(f'the k with min test rmse is {kvec[ii]}')
ax.legend()
Out[32]:
<matplotlib.legend.Legend at 0x77824ff72a10>
No description has been provided for this image