### imports
# numpy and pandas
import numpy as np
import pandas as pd
import math
#graphics with matplotlib
import matplotlib.pyplot as plt
plt.style.use('seaborn') ## note: matplotlib >= 3.6 renamed this style to 'seaborn-v0_8'
#ipython terminal
#%matplotlib
#jupyter notebook
%matplotlib inline
# model, train/test split, dummies (one-hot-encoding), rmse metric from sklearn.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import mean_squared_error
### get the data
cd = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv")
cd = cd[['price','mileage','color']]
cd['price'] = cd['price']/1000 ## price in thousands of dollars
cd['mileage'] = cd['mileage']/1000 ## mileage in thousands of miles
cd.head()
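## optional sanity checks on the data we just read in:
## size, column types, and a count of missing values
print(cd.shape)
print(cd.dtypes)
print(cd.isna().sum())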
### make y=price and X=[mileage, mileage**2, dSilver, dWhite, dother]
## as numpy arrays
yprice = cd['price'].to_numpy()
one_hot = LabelBinarizer()
cdum = one_hot.fit_transform(cd['color'])
X = np.zeros((cd.shape[0],5))
X[:,0] = cd['mileage'] #first x is mileage
X[:,1] = cd['mileage']**2 #second x is mileage**2
X[:,2:] = cdum[:,1:] # next three x's are the dummies for color
## so X has columns mileage, mileage**2, and then three dummies for color.
## let's check the dummies
print('color counts in numpy from dummies:')
print(np.sum(cdum,axis=0))
print('color counts in pandas:')
print(cd['color'].value_counts())
## LabelBinarizer sorts the levels (Black, Silver, White, other), so in cdum the
## second dummy is Silver, the third White, and the fourth other; dropping the
## first (Black) column gives the three dummies that go into X.
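## we can also confirm the dummy column order directly from the fitted LabelBinarizer;
## classes_ is the sklearn attribute holding the levels in column order
print('dummy column order:',one_hot.classes_)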
### let's plot the data
## we can give a vector of colors to matplotlib.pyplot.scatter
## https://matplotlib.org/3.1.0/api/_as_gen/matplotlib.pyplot.scatter.html
## mileage vs. price
plt.scatter(X[:,0],yprice,s=10)
plt.xlabel('mileage'); plt.ylabel('price')
## we can also color code the points to include the categorical variable color
## let's create a sequence of colors
cols = cd['color'].values ## colors
## dict to convert colors to matplotlib plot symbols
cdict = {'Black':'k','Silver':'gray','White':'b','other':'r'}
mcols = [cdict[acol] for acol in cols] #list of plot symbols corresponding to the colors
## plot mileage vs price with the plot symbol color corresponding to the levels of the categorical variable color
plt.scatter(X[:,0],yprice,c=mcols,s=10)
plt.xlabel('mileage'); plt.ylabel('price')
plt.title('Black:Black, Silver:gray, White:blue, other:red')
## we could also specify the colors as integers
idict = {'Black':1,'Silver':2,'White':3,'other':4}
icols = [idict[acol] for acol in cols] #list of integers corresponding to the colors
## now plot using icols
plt.scatter(X[:,0],yprice,c=icols,cmap='gnuplot')
plt.xlabel('mileage'); plt.ylabel('price')
cbar = plt.colorbar()
plt.title('Black:1, Silver:2, White:3, other:4')
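## optionally relabel the colorbar ticks with the color names
## (set_ticks and set_ticklabels are standard matplotlib Colorbar methods)
cbar.set_ticks([1,2,3,4])
cbar.set_ticklabels(['Black','Silver','White','other'])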
ldict = {1:'Black', 2:'Silver', 3:'White', 4:'other'} ## integer codes back to color names
plt.cla() ## clear the axes and redo the plot with a legend
plt.xlim(0,1.05*X[:,0].max())
plt.ylim(0,1.05*yprice.max())
plt.xlabel('mileage'); plt.ylabel('price')
for i in range(1,5):
    ii = np.array(icols) == i ## boolean mask picking out color i
    plt.scatter(X[ii,0],yprice[ii],c=cdict[ldict[i]],s=12,label=ldict[i])
plt.legend()
fig = plt.gcf()
fig.set_size_inches(12, 8)
### train/test split
myseed = 34
Xtr, Xte, ytr, yte = train_test_split(X,yprice,random_state=myseed, test_size=.25)
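## quick check that the 75/25 split came out as expected
print('train/test sizes:',Xtr.shape[0],Xte.shape[0])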
## drop the second column (mileage**2) since the log-price model uses mileage linearly
Xtr0 = np.delete(Xtr,1,axis=1)
Xte0 = np.delete(Xte,1,axis=1)
### fit the quadratic model: price on mileage, mileage**2, and the color dummies
lmod = LinearRegression(fit_intercept=True)
lmod.fit(Xtr,ytr)
ypred = lmod.predict(Xte) ## out-of-sample predictions
yhat = lmod.predict(Xtr) ## in-sample fits
## plot in-sample fit
plt.scatter(ytr,yhat,s=8,c='b')
plt.plot(ytr,ytr,c='red')
plt.xlabel('train y'); plt.ylabel('yhat fits')
plt.title('quadratic model: in sample fit on train')
## plot out-of-sample prediction
plt.scatter(yte,ypred,s=8,c='b')
plt.plot(yte,yte,c='red')
plt.xlabel('test y'); plt.ylabel('ypred predictions')
plt.title('quadratic model: out of sample predictions on test')
## rmse on test
rmse = math.sqrt(mean_squared_error(yte,ypred))
print(f'rmse for model with mileage, mileage**2 and color is {rmse:0.2f}')
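## equivalently, rmse is just the square root of the mean squared residual
print(f'numpy check: {np.sqrt(np.mean((yte-ypred)**2)):0.2f}')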
## plot mileage vs fitted values, color coded by the car's color
cinttr = Xtr[:,2:] @ np.arange(1,4) + 1 ## integer color codes on train; Black (all-zero dummies) maps to 1
ccdict = {1:'k',2:'gray',3:'b',4:'r'} # convert integers to matplotlib colors
cclist = [ccdict[i] for i in cinttr]
plt.scatter(Xtr[:,0],yhat,c=cclist,s=8)
plt.title('Black:Black, Silver:gray, White:blue, other:red')
plt.xlabel('mileage'); plt.ylabel('price')
fig = plt.gcf()
fig.set_size_inches(12, 8)
# coefficients
print(lmod.coef_)
## The Silver and other dummy coefficients are negative while the White dummy is slightly positive.
## This makes sense given the color-coded plot above.
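## the coefficients are easier to read with the X column names attached
## (the xnames list below is ours, matching the columns of X built earlier)
xnames = ['mileage','mileage**2','dSilver','dWhite','dother']
for nm,cf in zip(xnames,lmod.coef_):
    print(f'{nm}: {cf:0.4f}')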
### fit the log-price model: log(price) on mileage and color
lmod0 = LinearRegression(fit_intercept=True)
lytr = np.log(ytr) ## log of price on train
lmod0.fit(Xtr0,lytr)
ypred0 = np.exp(lmod0.predict(Xte0)) ## exponentiate to get back to the price scale
yhat0 = np.exp(lmod0.predict(Xtr0))
## plot in-sample fit
plt.scatter(ytr,yhat0,s=8,c='b')
plt.plot(ytr,ytr,c='red')
plt.xlabel('train y'); plt.ylabel('yhat fits')
plt.title('log model: in sample fit on train')
## plot out-of-sample prediction
plt.scatter(yte,ypred0,s=8,c='b')
plt.plot(yte,yte,c='red')
plt.xlabel('test y'); plt.ylabel('ypred predictions')
plt.title('log model: out of sample predictions on test')
## rmse on test
rmse0 = math.sqrt(mean_squared_error(yte,ypred0))
print(f'rmse for model with log price on mileage and color is {rmse0:0.2f}')
rmse = math.sqrt(mean_squared_error(yte,ypred))
print(f'rmse for model with mileage, mileage**2 and color is {rmse:0.2f}')
##################################################
### not much difference in rmse; let's compare the predictions directly
plt.scatter(ypred,ypred0,c='b',s=8)
plt.plot(ypred,ypred,c='r')
plt.xlabel('quadratic predictions'); plt.ylabel('log price predictions')
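## a quick numeric check: the correlation between the two sets of predictions
print(f'corr(quadratic, log) predictions: {np.corrcoef(ypred,ypred0)[0,1]:0.4f}')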
##################################################
### how does price on mileage alone compare?
lmod1 = LinearRegression(fit_intercept=True)
Xtr1 = Xtr[:,0].reshape(-1,1)
Xte1 = Xte[:,0].reshape(-1,1)
lmod1.fit(Xtr1,ytr)
yhat1 = lmod1.predict(Xtr1)
ypred1 = lmod1.predict(Xte1)
## rmse for the three models
rmse1 = math.sqrt(mean_squared_error(yte,ypred1))
print(f'rmse for model with price on mileage is {rmse1:0.2f}')
rmse0 = math.sqrt(mean_squared_error(yte,ypred0))
print(f'rmse for model with log price on mileage and color is {rmse0:0.2f}')
rmse = math.sqrt(mean_squared_error(yte,ypred))
print(f'rmse for model with mileage, mileage**2 and color is {rmse:0.2f}')
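## a compact side-by-side summary of the three out-of-sample rmses
for lab,r in [('mileage only',rmse1),('log price + color',rmse0),('quadratic + color',rmse)]:
    print(f'{lab:20s}: {r:0.2f}')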
##################################################
## let's plot to compare linear with quadratic
plt.scatter(yte,ypred,c='blue',label='quadratic',s=10,marker='o')
plt.scatter(yte,ypred1,c='red',label='linear',s=10,marker='x')
plt.plot(yte,yte,c='g',linewidth=.5)
plt.xlabel('test y=price'); plt.ylabel('quadratic and linear predictions')
plt.legend()
fig = plt.gcf()
fig.set_size_inches(12, 8)