Cars Example, x=(mileage,year), pytorch¶

In [1]:
####################
###: basic imports
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

##sklearn model selection
from sklearn.model_selection import train_test_split

## scale the x variables when there is more than one
from sklearn.preprocessing import StandardScaler
In [2]:
####################
###: pytorch
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn

def myrmse(y,yhat):
   """Return the root mean squared error between y and yhat, rounded to 3 digits.

   Parameters
   ----------
   y, yhat : array-like numeric vectors of equal length.

   Returns
   -------
   sqrt(mean((y - yhat)**2)), rounded to 3 decimal places.
   """
   # np.mean is clearer than the equivalent np.sum(...)/len(y)
   rmse = np.sqrt(np.mean((y - yhat)**2))
   return np.round(rmse, 3)
In [3]:
####################
###: read in data
## used-car listings; we keep price (target) and mileage/year (features)
cd = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv")
cds = cd[['price','mileage','year']]  # only the columns used below
cds = cds.astype('float64')  # cd read the data in as integer
cds['price'] = cds['price']/1000.0  # price in thousands of dollars
cds['mileage'] = cds['mileage']/1000.0  # mileage in thousands of miles
print(cds.head())

X = cds[['mileage','year']].to_numpy()  #mileage and year columns as a numpy array
print("*** type of X is",type(X))
print(X.shape) #number of rows and columns
print(X[0:4,:]) #first 4 rows
y = cds['price'].values #price as a numpy vector
print(f'length of y is {len(y)}')
print(y[:4]) #implicit start at 0
    price  mileage    year
0  43.995   36.858  2008.0
1  44.995   46.883  2012.0
2  25.999  108.759  2007.0
3  33.880   35.187  2007.0
4  34.895   48.153  2007.0
*** type of X is <class 'numpy.ndarray'>
(1000, 2)
[[  36.858 2008.   ]
 [  46.883 2012.   ]
 [ 108.759 2007.   ]
 [  35.187 2007.   ]]
length of y is 1000
[43.995 44.995 25.999 33.88 ]
In [4]:
####################
###: train/test split

## fixed seed so the split is reproducible across runs
myseed = 88 #Nylander
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,random_state=myseed, test_size=.3)  # 30% held out for testing

print(f'train sample size is {ytrain.shape[0]}\n')
print(f'test sample size is {ytest.shape[0]}\n')
train sample size is 700

test sample size is 300

In [5]:
##################################################
###: scale both x and y, in principle you don't have to scale y
## StandardScaler: subtract the training mean, divide by the training std
sc = StandardScaler()
xtr = sc.fit_transform(Xtrain)  # fit on train only
xte = sc.transform(Xtest)  # reuse training mean/std on test — no leakage

scy = StandardScaler()
ytr = scy.fit_transform(ytrain.reshape(-1,1))  # reshape: sklearn expects a 2-d (n,1) array
yte = scy.transform(ytest.reshape(-1,1))

print(xtr.shape)
print(ytr.shape)
print(xte.shape)
print(yte.shape)
(700, 2)
(700, 1)
(300, 2)
(300, 1)
In [6]:
##################################################
###: move to Tensors
## torch defaults to float32, so cast the numpy arrays before converting

xtr = xtr.astype('float32')
xxtr = torch.from_numpy(xtr)  # from_numpy shares memory with the numpy array
ytr = ytr.astype('float32')
yytr = torch.from_numpy(ytr)

xte = xte.astype('float32')
xxte = torch.from_numpy(xte)
yte = yte.astype('float32')
yyte = torch.from_numpy(yte)

## quick sanity plot: scaled mileage vs scaled price on the training data
plt.scatter(xxtr[:,0],yytr)
plt.xlabel('scaled mileage'); plt.ylabel('scaled price')

print(xxtr.shape)
print(xxtr.dtype)
print(yytr.shape)
print(yytr.dtype)
print(yyte.dtype)
torch.Size([700, 2])
torch.float32
torch.Size([700, 1])
torch.float32
torch.float32
In [7]:
##################################################
###: set seed(s)
## seed torch, numpy, and the stdlib random module so the run is reproducible

theseed = 14 # Dave Keon
torch.manual_seed(theseed)
np.random.seed(theseed)
random.seed(theseed)
## if gpu
#torch.cuda.manual_seed_all(theseed)
#torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = False
In [8]:
##################################################
###: Dataset

class DF(Dataset):
   """Minimal map-style Dataset wrapping paired feature/target containers.

   Parameters
   ----------
   x : indexable features (e.g. a 2-d tensor), same length as y
   y : indexable targets
   transform : optional callable applied to each feature sample
   target_transform : optional callable applied to each target sample
   """
   def __init__(self,x,y,transform=None, target_transform=None):
      self.x = x
      self.y = y
      self.transform = transform
      self.target_transform = target_transform

   def __len__(self):
      return len(self.y)

   def __getitem__(self,idx):
      # Fix: the original stored transform/target_transform but never applied
      # them.  Defaults of None keep the original behavior unchanged.
      xi = self.x[idx]
      yi = self.y[idx]
      if self.transform is not None:
         xi = self.transform(xi)
      if self.target_transform is not None:
         yi = self.target_transform(yi)
      return xi, yi


tDF = DF(xxtr,yytr)  # wrap the training tensors in a Dataset

##################################################
###: DataLoader
## mini-batches of 50 training rows, reshuffled every epoch
tdl = DataLoader(tDF,batch_size=50,shuffle=True)
In [9]:
##################################################
###: model

class SLNN(nn.Module):
   """Single-hidden-layer network: 2 inputs -> nunits ReLU units -> 1 output."""
   def __init__(self,nunits=5):
      super().__init__()
      # Build the layers individually, then stack them; same architecture
      # (and same parameter-initialization order) as a one-shot Sequential.
      hidden_layer = nn.Linear(2, nunits)
      activation = nn.ReLU()
      output_layer = nn.Linear(nunits, 1)
      self.SSM = nn.Sequential(hidden_layer, activation, output_layer)
   def forward(self,x):
      # Run the batch through the sequential stack and return predictions.
      return self.SSM(x)

## 50 units in the single hidden layer
nunits= 50
model = SLNN(nunits)

## see model
print(model)
SLNN(
  (SSM): Sequential(
    (0): Linear(in_features=2, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=1, bias=True)
  )
)
In [10]:
##################################################
###: do it
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one epoch of mini-batch gradient descent and return the train loss.

    Parameters
    ----------
    dataloader : DataLoader whose .dataset exposes .x and .y (full train data)
    model : nn.Module mapping feature batches to predictions
    loss_fn : loss callable, e.g. nn.MSELoss()
    optimizer : torch optimizer bound to model.parameters()

    Returns
    -------
    float : loss over the entire training set after this epoch's updates.
    """
    # Removed the unused `size` local and the unused enumerate() counter.
    for X, y in dataloader:
        # Forward pass and loss on this mini-batch.
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation: clear old grads, compute new ones, take a step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Full-dataset loss for reporting; no_grad avoids building an autograd
    # graph for this evaluation-only forward pass (same value, less memory).
    with torch.no_grad():
        return loss_fn(model(dataloader.dataset.x), dataloader.dataset.y).item()



learning_rate = .05
l2par = .0  # weight_decay (L2 regularization) strength; 0 = off

loss_fn = nn.MSELoss()  # squared-error loss for the regression target
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay = l2par)

epochs = 5000
lv = np.zeros((epochs,))  # training loss recorded after each epoch
printskip = 1000  # report progress every printskip epochs
for t in range(epochs):
   lv[t] = train_loop(tdl, model, loss_fn, optimizer)
   if ((t+1) % printskip == 0):
      print(f"Epoch {t+1}  -------------------------------")
      print(f'train loss: {lv[t]}\n')


print("Done!")
Epoch 1000  -------------------------------
train loss: 0.08587513864040375

Epoch 2000  -------------------------------
train loss: 0.0790461078286171

Epoch 3000  -------------------------------
train loss: 0.07889129966497421

Epoch 4000  -------------------------------
train loss: 0.07782591879367828

Epoch 5000  -------------------------------
train loss: 0.07735740393400192

Done!
In [11]:
###: plot loss over epochs
## plot on the rmse scale (sqrt of the MSE loss) — easier to read
plt.plot(np.sqrt(lv))
plt.xlabel('epoch')
Out[11]:
Text(0.5, 0, 'epoch')
In [12]:
###: out of sample
## predict on the test tensors; detach from the autograd graph before numpy
ypredN = model(xxte).detach().numpy()
ypredN = ypredN.astype('float64')
ypredN = scy.inverse_transform(ypredN)  # undo the y scaling back to $1000s
ypredN = ypredN.flatten()  # (n,1) -> (n,) vector for plotting/rmse
print(type(ypredN))
print(ypredN.shape)
print(ypredN.dtype)


## predictions vs actual test prices; red line = perfect prediction
plt.scatter(ypredN,ytest,s=10)
plt.plot(ytest,ytest,c='red')
plt.xlabel('nn prediction'); plt.ylabel('test y')

print(f'neural net rmse: {myrmse(ypredN,ytest)}')
<class 'numpy.ndarray'>
(300,)
float64
neural net rmse: 5.077