### imports
# numpy and pandas
import numpy as np
import pandas as pd
import math
#graphics with matplotlib
import matplotlib.pyplot as plt
plt.style.use('seaborn') ## note: matplotlib >= 3.6 renamed this style to 'seaborn-v0_8'
#ipython terminal
#%matplotlib
#jupyter notebook
%matplotlib inline
# model, train/test split, dummies (one-hot-encoding), rmse metric from sklearn.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import mean_squared_error
### get the data
cd = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv")
cd = cd[['price','mileage','color']]
cd['price'] = cd['price']/1000 ## price in thousands of dollars
cd['mileage'] = cd['mileage']/1000 ## mileage in thousands of miles
cd.head()
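## optional sanity checks on the data we just read in:
## size, column types, and a count of missing values
print(cd.shape)
print(cd.dtypes)
print(cd.isna().sum())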
### make y=price and X=[mileage, mileage**2, dSilver, dWhite, dother]
## as numpy arrays
yprice = cd['price'].to_numpy()
one_hot = LabelBinarizer()
cdum = one_hot.fit_transform(cd['color'])
X = np.zeros((cd.shape[0],5))
X[:,0] = cd['mileage'] #first x is mileage
X[:,1] = cd['mileage']**2 #second x is mileage**2
X[:,2:] = cdum[:,1:] # next three x's are the dummies for color
## so X has columns mileage, mileage**2, and then three dummies for color.
## let's check the dummies
print('color counts in numpy from dummies:')
print(np.sum(cdum,axis=0))
print('color counts in pandas:')
print(cd['color'].value_counts())
## LabelBinarizer sorts the levels (Black, Silver, White, other), so in cdum the
## second dummy is Silver, the third White, and the fourth other; dropping the
## first (Black) column gives the three dummies that go into X.
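## we can also confirm the dummy column order directly from the fitted LabelBinarizer;
## classes_ is the sklearn attribute holding the levels in column order
print('dummy column order:',one_hot.classes_)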
### let's plot the data
## we can give a vector of colors to matplotlib.pyplot.scatter
## https://matplotlib.org/3.1.0/api/_as_gen/matplotlib.pyplot.scatter.html
## mileage vs. price
plt.scatter(X[:,0],yprice,s=10)
plt.xlabel('mileage'); plt.ylabel('price')
## we can also color code the points to include the categorical variable color
## let's create a sequence of colors
cols = cd['color'].values ## colors
## dict to convert colors to matplotlib plot symbols
cdict = {'Black':'k','Silver':'gray','White':'b','other':'r'}
mcols = [cdict[acol] for acol in cols] #list of plot symbols corresponding to the colors
## plot mileage vs price with the plot symbol color corresponding to the levels of the categorical variable color
plt.scatter(X[:,0],yprice,c=mcols,s=10)
plt.xlabel('mileage'); plt.ylabel('price')
plt.title('Black:Black, Silver:gray, White:blue, other:red')
## we could also specify the colors as integers
idict = {'Black':1,'Silver':2,'White':3,'other':4}
icols = [idict[acol] for acol in cols] #list of integers corresponding to the colors
## now plot using icols
plt.scatter(X[:,0],yprice,c=icols,cmap='gnuplot')
plt.xlabel('mileage'); plt.ylabel('price')
cbar = plt.colorbar()
plt.title('Black:1, Silver:2, White:3, other:4')
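## optionally relabel the colorbar ticks with the color names
## (set_ticks and set_ticklabels are standard matplotlib Colorbar methods)
cbar.set_ticks([1,2,3,4])
cbar.set_ticklabels(['Black','Silver','White','other'])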
ldict = {1:'Black', 2:'Silver', 3:'White', 4:'other'} ## integer codes back to color names
plt.cla() ## clear the axes and redo the plot with a legend
plt.xlim(0,1.05*X[:,0].max())
plt.ylim(0,1.05*yprice.max())
plt.xlabel('mileage'); plt.ylabel('price')
for i in range(1,5):
    ii = np.array(icols) == i ## boolean mask picking out color i
    plt.scatter(X[ii,0],yprice[ii],c=cdict[ldict[i]],s=12,label=ldict[i])
plt.legend()
fig = plt.gcf()
fig.set_size_inches(12, 8)
### train/test split
myseed = 34
Xtr, Xte, ytr, yte = train_test_split(X,yprice,random_state=myseed, test_size=.25)
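## quick check that the 75/25 split came out as expected
print('train/test sizes:',Xtr.shape[0],Xte.shape[0])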
## drop the second column (mileage**2) since the log-price model uses mileage linearly
Xtr0 = np.delete(Xtr,1,axis=1)
Xte0 = np.delete(Xte,1,axis=1)
### fit the quadratic model: price on mileage, mileage**2, and the color dummies
lmod = LinearRegression(fit_intercept=True)
lmod.fit(Xtr,ytr)
ypred = lmod.predict(Xte) ## out-of-sample predictions
yhat = lmod.predict(Xtr) ## in-sample fits
## plot in-sample fit
plt.scatter(ytr,yhat,s=8,c='b')
plt.plot(ytr,ytr,c='red')
plt.xlabel('train y'); plt.ylabel('yhat fits')
plt.title('quadratic model: in sample fit on train')
## plot out-of-sample prediction
plt.scatter(yte,ypred,s=8,c='b')
plt.plot(yte,yte,c='red')
plt.xlabel('test y'); plt.ylabel('ypred predictions')
plt.title('quadratic model: out of sample predictions on test')
## rmse on test
rmse = math.sqrt(mean_squared_error(yte,ypred))
print(f'rmse for model with mileage, mileage**2 and color is {rmse:0.2f}')
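## equivalently, rmse is just the square root of the mean squared residual
print(f'numpy check: {np.sqrt(np.mean((yte-ypred)**2)):0.2f}')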
## plot mileage vs fitted values, color coded by the car's color
cinttr = Xtr[:,2:] @ np.arange(1,4) + 1 ## integer color codes on train; Black (all-zero dummies) maps to 1
ccdict = {1:'k',2:'gray',3:'b',4:'r'} # convert integers to matplotlib colors
cclist = [ccdict[i] for i in cinttr]
plt.scatter(Xtr[:,0],yhat,c=cclist,s=8)
plt.title('Black:Black, Silver:gray, White:blue, other:red')
plt.xlabel('mileage'); plt.ylabel('price')
fig = plt.gcf()
fig.set_size_inches(12, 8)
# coefficients
print(lmod.coef_)
## The Silver and other dummy coefficients are negative while the White dummy is slightly positive.
## This makes sense given the color-coded plot above.
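## the coefficients are easier to read with the X column names attached
## (the xnames list below is ours, matching the columns of X built earlier)
xnames = ['mileage','mileage**2','dSilver','dWhite','dother']
for nm,cf in zip(xnames,lmod.coef_):
    print(f'{nm}: {cf:0.4f}')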
### fit the log-price model: log(price) on mileage and color
lmod0 = LinearRegression(fit_intercept=True)
lytr = np.log(ytr) ## log of price on train
lmod0.fit(Xtr0,lytr)
ypred0 = np.exp(lmod0.predict(Xte0)) ## exponentiate to get back to the price scale
yhat0 = np.exp(lmod0.predict(Xtr0))
## plot in-sample fit
plt.scatter(ytr,yhat0,s=8,c='b')
plt.plot(ytr,ytr,c='red')
plt.xlabel('train y'); plt.ylabel('yhat fits')
plt.title('log model: in sample fit on train')
## plot out-of-sample prediction
plt.scatter(yte,ypred0,s=8,c='b')
plt.plot(yte,yte,c='red')
plt.xlabel('test y'); plt.ylabel('ypred predictions')
plt.title('log model: out of sample predictions on test')
## rmse on test
rmse0 = math.sqrt(mean_squared_error(yte,ypred0))
print(f'rmse for model with log price on mileage and color is {rmse0:0.2f}')
rmse = math.sqrt(mean_squared_error(yte,ypred))
print(f'rmse for model with mileage, mileage**2 and color is {rmse:0.2f}')
##################################################
### not much difference in rmse; let's compare the predictions directly
plt.scatter(ypred,ypred0,c='b',s=8)
plt.plot(ypred,ypred,c='r')
plt.xlabel('quadratic predictions'); plt.ylabel('log price predictions')
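## a quick numeric check: the correlation between the two sets of predictions
print(f'corr(quadratic, log) predictions: {np.corrcoef(ypred,ypred0)[0,1]:0.4f}')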
##################################################
### how does price on mileage alone compare?
lmod1 = LinearRegression(fit_intercept=True)
Xtr1 = Xtr[:,0].reshape(-1,1)
Xte1 = Xte[:,0].reshape(-1,1)
lmod1.fit(Xtr1,ytr)
yhat1 = lmod1.predict(Xtr1)
ypred1 = lmod1.predict(Xte1)
## rmse for the three models
rmse1 = math.sqrt(mean_squared_error(yte,ypred1))
print(f'rmse for model with price on mileage is {rmse1:0.2f}')
rmse0 = math.sqrt(mean_squared_error(yte,ypred0))
print(f'rmse for model with log price on mileage and color is {rmse0:0.2f}')
rmse = math.sqrt(mean_squared_error(yte,ypred))
print(f'rmse for model with mileage, mileage**2 and color is {rmse:0.2f}')
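## a compact side-by-side summary of the three out-of-sample rmses
for lab,r in [('mileage only',rmse1),('log price + color',rmse0),('quadratic + color',rmse)]:
    print(f'{lab:20s}: {r:0.2f}')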
##################################################
## let's plot to compare linear with quadratic
plt.scatter(yte,ypred,c='blue',label='quadratic',s=10,marker='o')
plt.scatter(yte,ypred1,c='red',label='linear',s=10,marker='x')
plt.plot(yte,yte,c='g',linewidth=.5)
plt.xlabel('test y=price'); plt.ylabel('quadratic and linear predictions')
plt.legend()
fig = plt.gcf()
fig.set_size_inches(12, 8)