# numpy and pandas
import numpy as np
import pandas as pd
import math
#graphics with matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# model, train/test split, dummies (one-hot-encoding), rmse metric from scikit learn.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import mean_squared_error
Read in the data, rescale and get mileage squared and log price.
We will just use price, mileage. Later we will get trim as one-hot encoded dummies.
# Read the used-cars data, keep price and mileage, and rescale both to
# thousands. Mileage is mean-centered so the quadratic term is less
# collinear with the linear term; also add mileage^2 and log(price).
cdr = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv")
cd = cdr[['price', 'mileage']].copy()
cd['price'] /= 1000
cd['mileage'] /= 1000
cd['mileage'] = cd['mileage'] - cd['mileage'].mean()
cd['ml2'] = cd['mileage'] ** 2
cd['lprice'] = np.log(cd['price'])
cd.head()
| price | mileage | ml2 | lprice | |
|---|---|---|---|---|
| 0 | 43.995 | -36.794408 | 1353.828460 | 3.784076 |
| 1 | 44.995 | -26.769408 | 716.601205 | 3.806551 |
| 2 | 25.999 | 35.106592 | 1232.472802 | 3.258058 |
| 3 | 33.880 | -38.465408 | 1479.587613 | 3.522825 |
| 4 | 34.895 | -25.499408 | 650.219808 | 3.552344 |
Now let's get the dummies for trim.
# One-hot encode the trim column (LabelBinarizer is a simpler OneHotEncoder
# for a single feature) and drop the first dummy to avoid collinearity
# with the intercept, then join the dummies onto the numeric data.
one_hot = LabelBinarizer()
tdum = one_hot.fit_transform(cdr["trim"])[:, 1:]  # drop the first dummy
dftemp = pd.DataFrame(tdum)
print(dftemp.shape)  # should be 3 columns
cdd = pd.concat([cd, dftemp], axis=1)
print(cdd.shape)  # should be 4+3 columns
(1000, 3) (1000, 7)
cdd.head()
| price | mileage | ml2 | lprice | 0 | 1 | 2 | |
|---|---|---|---|---|---|---|---|
| 0 | 43.995 | -36.794408 | 1353.828460 | 3.784076 | 0 | 1 | 0 |
| 1 | 44.995 | -26.769408 | 716.601205 | 3.806551 | 0 | 1 | 0 |
| 2 | 25.999 | 35.106592 | 1232.472802 | 3.258058 | 0 | 1 | 0 |
| 3 | 33.880 | -38.465408 | 1479.587613 | 3.522825 | 0 | 1 | 0 |
| 4 | 34.895 | -25.499408 | 650.219808 | 3.552344 | 0 | 1 | 0 |
We don't really need it, but let's make sure we know which trim categories the dummies correspond to.
# Frequency of each trim category in the full data; used below to
# match the unnamed dummy columns to their trim labels.
cdr['trim'].value_counts()
trim 550 591 430 143 other 139 500 127 Name: count, dtype: int64
# Column sums of the three dummy columns; comparing these counts with the
# value_counts output identifies which trim each dummy represents.
cdd.iloc[:,[4,5,6]].sum(axis=0)
0 127 1 591 2 139 dtype: int64
So the dummies are for 500, 550, other.
Now that we know, let's rename the trim dummies.
# Now that we know which category each dummy encodes, give the
# dummy columns descriptive names.
cdd = cdd.rename(columns={0: 'd500', 1: 'd550', 2: 'dother'})
cdd.head()
| price | mileage | ml2 | lprice | d500 | d550 | dother | |
|---|---|---|---|---|---|---|---|
| 0 | 43.995 | -36.794408 | 1353.828460 | 3.784076 | 0 | 1 | 0 |
| 1 | 44.995 | -26.769408 | 716.601205 | 3.806551 | 0 | 1 | 0 |
| 2 | 25.999 | 35.106592 | 1232.472802 | 3.258058 | 0 | 1 | 0 |
| 3 | 33.880 | -38.465408 | 1479.587613 | 3.522825 | 0 | 1 | 0 |
| 4 | 34.895 | -25.499408 | 650.219808 | 3.552344 | 0 | 1 | 0 |
Let's get a train/test split.
# 75/25 train/test split; the seed is fixed so results are reproducible.
cdtr, cdte = train_test_split(cdd, test_size=0.25, random_state=88)
print(cdtr.shape)
print(cdte.shape)
(750, 7) (250, 7)
Let's first fit a baseline regression with just mileage and trim.
# Baseline model: regress price on mileage and the trim dummies,
# fit on the training rows and score (rmse) on the held-out test rows.
feat_b = ['mileage', 'd500', 'd550', 'dother']
lmb = LinearRegression(fit_intercept=True)
lmb.fit(cdtr[feat_b].to_numpy(), cdtr['price'])        # fit on train
ypredb = lmb.predict(cdte[feat_b].to_numpy())          # predict on test
rmseb = math.sqrt(mean_squared_error(cdte['price'], ypredb))
print(f'**** base model, price on mileage and trim rmse: {rmseb:0.4f}')
**** base model, price on mileage and trim rmse: 9.7099
Ok, now let's try logging y.
# Log model: regress log(price) on the same features, then exponentiate
# the predictions back to the price scale before computing rmse so it is
# comparable with the other models.
feat_l = ['mileage', 'd500', 'd550', 'dother']
lml = LinearRegression(fit_intercept=True)
lml.fit(cdtr[feat_l].to_numpy(), cdtr['lprice'])               # fit on train
ypredl = np.exp(lml.predict(cdte[feat_l].to_numpy()))          # predict on test
rmsel = math.sqrt(mean_squared_error(cdte['price'], ypredl))
print(f'**** log model, log(price) on mileage and trim rmse: {rmsel:0.4f}')
**** log model, log(price) on mileage and trim rmse: 8.5137
Now let's try the quadratic model.
# Quadratic model: add the mileage^2 term to the regressors.
lmq = LinearRegression(fit_intercept=True)
lmq.fit(cdtr[['mileage','ml2','d500','d550','dother']].to_numpy(),cdtr['price']) # fit on train
ypredq = lmq.predict(cdte[['mileage','ml2','d500','d550','dother']].to_numpy()) # predict on test
rmseq = math.sqrt(mean_squared_error(cdte['price'],ypredq))
# fix: the print label said "base model" (copy-paste from the baseline cell)
print(f'**** quadratic model, price on mileage, mileage^2 and trim rmse: {rmseq:0.4f}')
**** base model, price on mileage and trim rmse: 8.2499
# Summary: out-of-sample rmse for all three models side by side.
print('\n****************************************************')
print(f'\n******* rmse: base,log,quad: {rmseb:0.4f}, {rmsel:0.4f}, {rmseq:0.4f}')
print('\n****************************************************')
**************************************************** ******* rmse: base,log,quad: 9.7099, 8.5137, 8.2499 ****************************************************
Quadratic has the best out-of-sample rmse.
# Quadratic-model coefficients, in fit order: mileage, ml2, d500, d550, dother.
lmq.coef_
array([-3.40476904e-01, 1.81533757e-03, 4.72984609e-01, 1.27388189e+01,
7.92259999e+00])
Looks like 550 is the most expensive trim, then other, last 500.
Let's plot y vs y predictions to see the different models.
# Collect the test-set y and the three models' predictions in one frame
# and look at their pairwise correlations.
predDF = pd.DataFrame({'ytest':cdte['price'],'baseP':ypredb,'logP':ypredl,'quadP':ypredq})
predDF.corr()
| ytest | baseP | logP | quadP | |
|---|---|---|---|---|
| ytest | 1.000000 | 0.861215 | 0.897652 | 0.902359 |
| baseP | 0.861215 | 1.000000 | 0.943717 | 0.957204 |
| logP | 0.897652 | 0.943717 | 1.000000 | 0.980388 |
| quadP | 0.902359 | 0.957204 | 0.980388 | 1.000000 |
# Pairwise scatter plots of y and predictions, with kernel density on the diagonal.
sns.pairplot(predDF,diag_kind='kde',height=2)
<seaborn.axisgrid.PairGrid at 0x73fede95e410>
# Test-set price vs. predictions for the log and quadratic models,
# with a 45-degree line (perfect prediction) for reference.
plt.scatter(cdte['price'],ypredl,c='blue',s=5.0,marker='D',label='log model')
plt.scatter(cdte['price'],ypredq,c='red',s=3.0,label='quadratic model')
plt.axline(xy1=(0, 0), slope=1,c='black',ls='dashed',linewidth=1)
plt.xlabel('test y'); plt.ylabel('predictions')
plt.legend()
<matplotlib.legend.Legend at 0x73fee6e4a860>
Both models miss on the more expensive cars.
Let's plot the mileage vs. the predictions for both the log and quadratic model.
# Mileage vs. actual price (black) and the two models' predictions,
# to see how each model bends with mileage.
plt.scatter(cdte['mileage'],cdte['price'],c='black',s=3)
plt.scatter(cdte['mileage'],ypredl,c='blue',s=5,marker='D',label='log model')
plt.scatter(cdte['mileage'],ypredq,c='red',s=3,label='quadratic model')
plt.xlabel('mileage'); plt.ylabel('y=price')
plt.legend()
<matplotlib.legend.Legend at 0x73fede15d1b0>
Although both models capture some of the nonlinearity, they are quite different and seem to use trim differently.
Let's see how the prediction errors for the two models relate to trim.
# Test-set prediction errors for each model, boxplotted by trim category.
# NOTE(review): cdr['trim'] covers all rows while the errors only cover the
# test rows; pandas aligns on index, so non-test rows get NaN errors and
# should be dropped by boxplot — verify the plot uses only test rows.
ecd = pd.DataFrame({'qerr':cdte['price'] - ypredq,'lerr':cdte['price']-ypredl,'trim':cdr['trim']})
axq = ecd.boxplot(column='qerr',by='trim')
axq.set_ylim(-20,50)  # same y-scale on both plots so they are comparable
axl = ecd.boxplot(column='lerr',by='trim')
axl.set_ylim(-20,50)
(-20.0, 50.0)
Looks like log model is really off for trim=other.
Let's plot mileage vs. the quadratic predictions with the trim category color coded.
# Encode trim as an integer: 0=430 (all dummies zero), 1=500, 2=550, 3=other.
cind = 1*cdte['d500'] + 2 * cdte['d550'] + 3*cdte['dother']
coldir = ['magenta','red','blue','green']
labl = ['trim 430','trim 500','trim 550','trim other']
cvec = [coldir[i] for i in cind]  # one color per test observation
plt.scatter(cdte['mileage'],cdte['price'],c=cvec,s=4,marker='D')
plt.xlabel('mileage'); plt.ylabel('price')
xx = cdte['mileage'].to_numpy()
# Draw the quadratic model's fitted curve separately for each trim category,
# sorting by mileage so the line segments connect left to right.
for i in range(4):
    xxx = xx[cind==i]
    yyy = ypredq[cind==i]
    ii = np.argsort(xxx)
    plt.plot(xxx[ii],yyy[ii],c=coldir[i],label=labl[i],linewidth=2)
plt.legend()
<matplotlib.legend.Legend at 0x73fedeacd750>
Let's plot the quadratic and log model fits side by side to see the difference.
# Side-by-side comparison of the quadratic and log model fits, colored by trim.
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
ax[0].scatter(cdte['mileage'], ypredq, c=cvec, s=3)
ax[0].set_xlabel('x=mileage'); ax[0].set_ylabel('y=price')
ax[0].set_title('quadratic model')
ax[1].scatter(cdte['mileage'], ypredl, c=cvec, s=3)
# fix: the original duplicated this line and labeled ax[0] both times,
# leaving the right-hand panel without a y-label
ax[1].set_xlabel('x=mileage'); ax[1].set_ylabel('y=price')
ax[1].set_title('log model')
Text(0.5, 1.0, 'log model')
They look pretty different.
2. Naive Bayes
Read in the data.
# Read the SMS training data: trainB holds binary word-indicator features,
# trainyB the binary response (spam indicator).
trainB = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/smsTrainBS.csv")
trainyB = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/smsTrainyB.csv")['smsTrainyB']
n = len(trainyB)
print(f'length of y is: {n}')
print('dim of x in trainB')
print(trainB.shape)
length of y is: 4169 dim of x in trainB (4169, 4)
Estimate marginal probs for y (1 if spam, 0 else).
# Marginal probability of spam: P(Y=1) = (# of y=1 messages) / n.
py1 = trainyB.sum()/n
print(f'P(Y=1): {py1:0.4f}')
P(Y=1): 0.1353
Looks right (same as in the notes).
Let’s write a simple little function which will get us estimates of the joint
of (y,x) and the conditionals of x|y for y = 0 and 1.
# Sanity check: the 2x2 joint of (y, age) as crosstab counts divided by n.
temp = pd.crosstab(trainyB,trainB['age']).to_numpy()
temp = temp/n
temp
array([[0.86351643, 0.00119933],
[0.13240585, 0.00287839]])
def bivJC(y,x):
    """Estimate the joint distribution of binary (0,1) y and x, and the
    conditional distributions of x given each value of y.

    Returns a dict with 'jnt' (the 2x2 joint as a numpy array),
    'gy0' (P(x | y=0)) and 'gy1' (P(x | y=1)).
    """
    joint = pd.crosstab(y, x).to_numpy() / len(y)
    row0 = joint[0, :]
    row1 = joint[1, :]
    return {
        'jnt': joint,
        'gy0': row0 / row0.sum(),  # condition on y=0 by renormalizing the row
        'gy1': row1 / row1.sum(),  # condition on y=1
    }
# Joint and conditionals for the 'age' indicator; gy0/gy1 should match
# the table in the notes.
ageJ = bivJC(trainyB,trainB['age'])
ageJ
{'jnt': array([[0.86351643, 0.00119933],
[0.13240585, 0.00287839]]),
'gy0': array([0.99861304, 0.00138696]),
'gy1': array([0.9787234, 0.0212766])}
gy0 and gy1 are exactly the same as the table for age obtained in the notes.
Ok, now let’s do part (a).
First let’s do y|age=1 and compare with the notes.
We can get P(Y=1|age = 1) directly from the joint.
# P(Y=1 | age=1) directly from the joint: jnt[1,1] / P(age=1).
jnt = ageJ['jnt']
temp = jnt[1,1]/(jnt[1,1]+ jnt[0,1])
# fix: this is P(Y=1 | age in), and y=1 is spam (not ham) per the setup above
print(f'\n**** p spam given age is in: {temp:0.4f}')
**** p ham given age is in: 0.7059
Same as notes up to two digits.
Or we can use Bayes theorem (as in notes).
# Same probability via Bayes theorem: P(Y=1)P(age|Y=1) / P(age).
py1ga = py1*ageJ['gy1'][1]/(py1*ageJ['gy1'][1] + (1-py1)*ageJ['gy0'][1])
# fix: y=1 is spam, so this is the probability of spam, not ham
print(f'\n**** prob spam given age is in: {py1ga:0.4f}')
**** prob ham given age is in: 0.7059
Now let's do the same for call.
# Joint and conditionals for the 'call' indicator.
callJ = bivJC(trainyB,trainB['call'])
print(callJ)
{'jnt': array([[0.81506356, 0.04965219],
[0.07459822, 0.06068602]]), 'gy0': array([0.94257975, 0.05742025]), 'gy1': array([0.55141844, 0.44858156])}
# P(Y=1 | call=1) directly from the joint.
jnt = callJ['jnt']
temp = jnt[1,1]/(jnt[1,1]+ jnt[0,1])
# fix: y=1 is spam, not ham
print(f'\n**** p spam given call is in: {temp:0.4f}')
**** p ham given call is in: 0.5500
# Same probability via Bayes theorem.
py1gc = py1*callJ['gy1'][1]/(py1*callJ['gy1'][1] + (1-py1)*callJ['gy0'][1])
# fix: y=1 is spam, not ham
print(f'\n**** prob spam given call is in: {py1gc:0.4f}')
**** prob ham given call is in: 0.5500
Now let's do part (b).
# Joint and conditionals for the 'know' indicator.
knowJ = bivJC(trainyB,trainB['know'])
print(knowJ)
{'jnt': array([[0.82345886, 0.0412569 ],
[0.13120652, 0.00407772]]), 'gy0': array([0.95228849, 0.04771151]), 'gy1': array([0.96985816, 0.03014184])}
The probability that know is in is not that different when y is 0 or 1.
It is even a little higher when y=0. Very different from call.
# Naive Bayes with two features: under the NB independence assumption the
# class-conditional probabilities for call and know multiply.
py1gck = (py1*callJ['gy1'][1]*knowJ['gy1'][1])/( (py1*callJ['gy1'][1]*knowJ['gy1'][1]) + ((1-py1)*callJ['gy0'][1]*knowJ['gy0'][1]))
# fix: y=1 is spam, not ham
print(f'\n**** prob spam given call and know is in: {py1gck:0.4f}')
**** prob ham given call and know is in: 0.4357
Finally, lets do part (c).
Now we need the joint distribution of (x1=call,x2=know) given y = 0 (ham) and y=1 (spam).
We will split the trainB x data.frame into the subset with y=0 and the subset with y=1.
# Split the feature rows by class: trainB0 holds the y=0 (ham) rows,
# trainB1 the y=1 (spam) rows.
is_spam = trainyB == 1
trainB0 = trainB[~is_spam]
print(trainB0.shape)
trainB1 = trainB[is_spam]
print(trainB1.shape)
(3605, 4) (564, 4)
Dimensions look right.
Now can estimate the joint distribution of (call,know) given y = 0 or 1.
# Joint distribution of (call, know) within each class: counts divided
# by the class size give P(call, know | y=0) and P(call, know | y=1).
jnt0 = pd.crosstab(trainB0['call'],trainB0['know'])/trainB0.shape[0]
print(jnt0)
jnt1 = pd.crosstab(trainB1['call'],trainB1['know'])/trainB1.shape[0]
print(jnt1)
know 0 1 call 0 0.897642 0.044938 1 0.054646 0.002774 know 0 1 call 0 0.537234 0.014184 1 0.432624 0.015957
# Exact (non-naive) posterior: use the true joint P(call=1, know=1 | y)
# instead of the NB product of marginals.
p11y1 = jnt1.iloc[1,1]; p11y0 = jnt0.iloc[1,1]
py1g11 = (py1 * p11y1)/((py1 * p11y1) + ((1-py1)*p11y0))
print(f'prob y=1(spam) given call and know =1 (are in document) without NB: {py1g11:0.4f}')
prob y=1(spam) given call and know =1 (are in document) without NB: 0.4737
Quite similar to what we got using Naive Bayes.
Let’s investigate the Naive Bayes assumption.
Conditional on y = 0 or 1, is the joint from independence similar to the joint?
def jFromMar(jnt):
    """Return the joint implied by independence: the outer product of the
    row and column marginals of `jnt`."""
    row_marg = jnt.sum(axis=1)  # marginal of the row variable
    col_marg = jnt.sum(axis=0)  # marginal of the column variable
    return np.outer(row_marg, col_marg)
# Compare the true class-conditional joints with the joints implied by the
# NB independence assumption, for y=0 and y=1.
print('\n******************************** given y=0:\n')
print('\n ***** jnt of call know given y=0:')
print(jnt0)
print('\n ***** jnt of call know given y=0, using NB assumption of independence:')
print(jFromMar(jnt0))
print('\n******************************** given y=1:\n')
print('\n ***** jnt of call know given y=1:')
print(jnt1)
print('\n ***** jnt of call know given y=1, using NB assumption of independence:')
print(jFromMar(jnt1))
******************************** given y=0: ***** jnt of call know given y=0: know 0 1 call 0 0.897642 0.044938 1 0.054646 0.002774 ***** jnt of call know given y=0, using NB assumption of independence: [[0.89760785 0.0449719 ] [0.05468064 0.00273961]] ******************************** given y=1: ***** jnt of call know given y=1: know 0 1 call 0 0.537234 0.014184 1 0.432624 0.015957 ***** jnt of call know given y=1, using NB assumption of independence: [[0.53479767 0.01662077] [0.43506048 0.01352108]]
Not exactly the same, but pretty similar.