hw1-sp26-Python_sol

Robert McCulloch
2026-02-14

Problem 1. Train/test split, categorical variables, and transformations

Imports:

In [826]:
# numpy and pandas
import numpy as np
import pandas as pd
import math

#graphics with matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# model, train/test split, dummies (one-hot-encoding), rmse metric from scikit learn.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import mean_squared_error

Read in the data, rescale and get mileage squared and log price.
We will just use price, mileage. Later we will get trim as one-hot encoded dummies.

In [827]:
cdr = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv") 
cd = cdr[['price','mileage']].copy()
cd['price'] = cd['price']/1000
cd['mileage'] = cd['mileage']/1000
cd['mileage'] -= cd['mileage'].mean()
cd['ml2'] = cd['mileage']**2
cd['lprice'] = np.log(cd['price'])
cd.head()
Out[827]:
price mileage ml2 lprice
0 43.995 -36.794408 1353.828460 3.784076
1 44.995 -26.769408 716.601205 3.806551
2 25.999 35.106592 1232.472802 3.258058
3 33.880 -38.465408 1479.587613 3.522825
4 34.895 -25.499408 650.219808 3.552344
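The rescaling above can be checked on a tiny made-up frame (the toy values are ours, not from the data); centering mileage at 0 makes the quadratic term easier to interpret:

```python
import numpy as np
import pandas as pd

toy = pd.DataFrame({'price': [20000.0, 40000.0], 'mileage': [30000.0, 50000.0]})
toy['price']   = toy['price'] / 1000      # price in $1000s
toy['mileage'] = toy['mileage'] / 1000    # mileage in 1000s of miles
toy['mileage'] -= toy['mileage'].mean()   # center mileage at 0
toy['ml2']    = toy['mileage'] ** 2       # quadratic term
toy['lprice'] = np.log(toy['price'])      # log price
print(toy)
```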

Now let's get the dummies for trim.

In [828]:
one_hot = LabelBinarizer()  #does the same thing as OneHotEncoder but a little simpler
tdum = one_hot.fit_transform(cdr["trim"])
temp = tdum[:,1:]  # drop the first dummy
dftemp = pd.DataFrame(temp)
print(dftemp.shape)  #should be 3 columns
cdd = pd.concat([cd, dftemp], axis=1)
print(cdd.shape) # should be 4+3 columns
(1000, 3)
(1000, 7)
In [829]:
cdd.head()
Out[829]:
price mileage ml2 lprice 0 1 2
0 43.995 -36.794408 1353.828460 3.784076 0 1 0
1 44.995 -26.769408 716.601205 3.806551 0 1 0
2 25.999 35.106592 1232.472802 3.258058 0 1 0
3 33.880 -38.465408 1479.587613 3.522825 0 1 0
4 34.895 -25.499408 650.219808 3.552344 0 1 0

We don't really need it, but let's make sure we know which trim categories the dummies correspond to.

In [830]:
cdr['trim'].value_counts()
Out[830]:
trim
550      591
430      143
other    139
500      127
Name: count, dtype: int64
In [831]:
cdd.iloc[:,[4,5,6]].sum(axis=0)
Out[831]:
0    127
1    591
2    139
dtype: int64

So the dummies are for 500, 550, other.
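We could also read the mapping straight off the fitted binarizer: LabelBinarizer stores the category order it used in classes_, so after dropping the first column the kept dummies line up with classes_[1:]. A self-contained sketch with made-up trim values (assuming trim is read as strings, as it is here since the column contains 'other'):

```python
import pandas as pd
from sklearn.preprocessing import LabelBinarizer

trim = pd.Series(['550', '430', 'other', '500', '550'])
lb = LabelBinarizer()
dummies = lb.fit_transform(trim)[:, 1:]   # drop the first dummy, as above
print(lb.classes_[1:])                    # categories the kept columns encode
```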

Now that we know, let's rename the trim dummies.

In [832]:
cdd.rename(columns={0:'d500',1:'d550',2:'dother'},inplace=True)
cdd.head()
Out[832]:
price mileage ml2 lprice d500 d550 dother
0 43.995 -36.794408 1353.828460 3.784076 0 1 0
1 44.995 -26.769408 716.601205 3.806551 0 1 0
2 25.999 35.106592 1232.472802 3.258058 0 1 0
3 33.880 -38.465408 1479.587613 3.522825 0 1 0
4 34.895 -25.499408 650.219808 3.552344 0 1 0

Let's get a train/test split.

In [833]:
cdtr, cdte = train_test_split(cdd,random_state=88, test_size=.25)  #always set the seed
In [834]:
print(cdtr.shape)
print(cdte.shape)
(750, 7)
(250, 7)
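Fixing random_state is what makes the split reproducible: the same seed always yields the same partition. A minimal sketch on made-up data:

```python
import numpy as np
from sklearn.model_selection import train_test_split

data = np.arange(20).reshape(10, 2)
tr1, te1 = train_test_split(data, random_state=88, test_size=0.25)
tr2, te2 = train_test_split(data, random_state=88, test_size=0.25)
# same seed -> identical train and test sets
print((tr1 == tr2).all(), (te1 == te2).all())
```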

Let's first fit a baseline regression with just mileage and trim.

In [835]:
lmb = LinearRegression(fit_intercept=True)
lmb.fit(cdtr[['mileage','d500','d550','dother']].to_numpy(),cdtr['price'])  # fit on train
ypredb = lmb.predict(cdte[['mileage','d500','d550','dother']].to_numpy()) # predict on test
In [836]:
rmseb = math.sqrt(mean_squared_error(cdte['price'],ypredb))
print(f'**** base model, price on mileage and trim rmse: {rmseb:0.4f}')
**** base model, price on mileage and trim rmse: 9.7099
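mean_squared_error returns the mean of the squared errors, so taking the square root by hand recovers the RMSE. A quick self-contained check on made-up numbers:

```python
import math
import numpy as np
from sklearn.metrics import mean_squared_error

y    = np.array([10.0, 20.0, 30.0])
yhat = np.array([12.0, 18.0, 33.0])
rmse_sklearn = math.sqrt(mean_squared_error(y, yhat))  # as in the cells above
rmse_numpy   = np.sqrt(np.mean((y - yhat) ** 2))       # same thing by hand
print(rmse_sklearn, rmse_numpy)
```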

Ok, now let's try logging y.

In [837]:
lml = LinearRegression(fit_intercept=True)
lml.fit(cdtr[['mileage','d500','d550','dother']].to_numpy(),cdtr['lprice'])  # fit on train
ypredl = np.exp(lml.predict(cdte[['mileage','d500','d550','dother']].to_numpy())) # predict on test
In [838]:
rmsel = math.sqrt(mean_squared_error(cdte['price'],ypredl))
print(f'**** log model, log(price) on mileage and trim rmse: {rmsel:0.4f}')
**** log model, log(price) on mileage and trim rmse: 8.5137

Now let's try the quadratic model.

In [839]:
lmq = LinearRegression(fit_intercept=True)
lmq.fit(cdtr[['mileage','ml2','d500','d550','dother']].to_numpy(),cdtr['price'])  # fit on train
ypredq = lmq.predict(cdte[['mileage','ml2','d500','d550','dother']].to_numpy()) # predict on test
In [840]:
rmseq = math.sqrt(mean_squared_error(cdte['price'],ypredq))
print(f'**** quad model, price on mileage, mileage^2 and trim rmse: {rmseq:0.4f}')
**** quad model, price on mileage, mileage^2 and trim rmse: 8.2499
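We built the ml2 column by hand above; scikit-learn's PolynomialFeatures can generate the same quadratic term. A small sketch:

```python
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

x = np.array([[1.0], [2.0], [3.0]])
poly = PolynomialFeatures(degree=2, include_bias=False)
X = poly.fit_transform(x)   # columns: x, x**2
print(X)
```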
In [841]:
print('\n****************************************************')
print(f'\n******* rmse: base,log,quad: {rmseb:0.4f}, {rmsel:0.4f}, {rmseq:0.4f}')
print('\n****************************************************')
****************************************************

******* rmse: base,log,quad: 9.7099, 8.5137, 8.2499

****************************************************

Quadratic has the best out-of-sample rmse.

In [842]:
lmq.coef_
Out[842]:
array([-3.40476904e-01,  1.81533757e-03,  4.72984609e-01,  1.27388189e+01,
        7.92259999e+00])

Looks like 550 is the most expensive trim, then other, last 500.
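One easy way to keep track of which coefficient goes with which column is to zip the column names with lmq.coef_. A sketch using the (rounded) values printed above:

```python
# pair each coefficient with its column name; values are copied
# (rounded) from the lmq.coef_ output above
names = ['mileage', 'ml2', 'd500', 'd550', 'dother']
coefs = [-0.3405, 0.0018, 0.4730, 12.7388, 7.9226]
coef_by_name = dict(zip(names, coefs))
print(coef_by_name)
```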

Let's plot y vs y predictions to see the different models.

In [843]:
predDF = pd.DataFrame({'ytest':cdte['price'],'baseP':ypredb,'logP':ypredl,'quadP':ypredq})
predDF.corr()
Out[843]:
ytest baseP logP quadP
ytest 1.000000 0.861215 0.897652 0.902359
baseP 0.861215 1.000000 0.943717 0.957204
logP 0.897652 0.943717 1.000000 0.980388
quadP 0.902359 0.957204 0.980388 1.000000
In [844]:
sns.pairplot(predDF,diag_kind='kde',height=2)
Out[844]:
<seaborn.axisgrid.PairGrid at 0x73fede95e410>
[pairplot of ytest and the three model predictions]
In [845]:
plt.scatter(cdte['price'],ypredl,c='blue',s=5.0,marker='D',label='log model')
plt.scatter(cdte['price'],ypredq,c='red',s=3.0,label='quadratic model')
plt.axline(xy1=(0, 0), slope=1,c='black',ls='dashed',linewidth=1)
plt.xlabel('test y'); plt.ylabel('predictions')
plt.legend()
Out[845]:
<matplotlib.legend.Legend at 0x73fee6e4a860>
[test price vs. predictions for the log and quadratic models, with the 45-degree line]

Both models miss on the more expensive cars.

Let's plot the mileage vs. the predictions for both the log and quadratic model.

In [846]:
plt.scatter(cdte['mileage'],cdte['price'],c='black',s=3)
plt.scatter(cdte['mileage'],ypredl,c='blue',s=5,marker='D',label='log model')
plt.scatter(cdte['mileage'],ypredq,c='red',s=3,label='quadratic model')
plt.xlabel('mileage'); plt.ylabel('y=price')
plt.legend()
Out[846]:
<matplotlib.legend.Legend at 0x73fede15d1b0>
[test mileage vs. price, with the log- and quadratic-model predictions overlaid]

Although both models capture some of the nonlinearity, they are quite different and seem to use trim differently.

Let's see how the prediction errors for the two models relate to trim.

In [847]:
ecd = pd.DataFrame({'qerr':cdte['price'] - ypredq,'lerr':cdte['price']-ypredl,'trim':cdr.loc[cdte.index,'trim']})  # align trim to the test rows by index
In [848]:
axq = ecd.boxplot(column='qerr',by='trim')
axq.set_ylim(-20,50)
axl = ecd.boxplot(column='lerr',by='trim')
axl.set_ylim(-20,50)
Out[848]:
(-20.0, 50.0)
[boxplots of quadratic-model and log-model prediction errors by trim]

Looks like the log model is really off for trim=other.

Let's plot mileage vs. the quadratic predictions with the trim category color coded.

In [849]:
cind = 1*cdte['d500'] + 2 * cdte['d550'] + 3*cdte['dother']
coldir = ['magenta','red','blue','green']
labl = ['trim 430','trim 500','trim 550','trim other']
cvec = [coldir[i] for i in cind]
plt.scatter(cdte['mileage'],cdte['price'],c=cvec,s=4,marker='D')
plt.xlabel('mileage'); plt.ylabel('price')
xx = cdte['mileage'].to_numpy()
for i in range(4):
    xxx = xx[cind==i]
    yyy = ypredq[cind==i]
    ii = np.argsort(xxx) 
    plt.plot(xxx[ii],yyy[ii],c=coldir[i],label=labl[i],linewidth=2)

plt.legend()
Out[849]:
<matplotlib.legend.Legend at 0x73fedeacd750>
[mileage vs. price with the quadratic fit drawn separately for each trim category]

Let's plot the quadratic and log model fits side by side to see the difference.

In [850]:
fig,ax = plt.subplots(1,2,figsize=(10, 4))

ax[0].scatter(cdte['mileage'],ypredq,c=cvec,s=3)
ax[0].set_xlabel('x=mileage'); ax[0].set_ylabel('y=price')
ax[0].set_title('quadratic model')

ax[1].scatter(cdte['mileage'],ypredl,c=cvec,s=3)
ax[1].set_xlabel('x=mileage'); ax[1].set_ylabel('y=price')
ax[1].set_title('log model')
Out[850]:
Text(0.5, 1.0, 'log model')
[side by side: mileage vs. quadratic predictions and mileage vs. log predictions, colored by trim]

They look pretty different.

Problem 2. Naive Bayes

Read in the data.

In [851]:
trainB = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/smsTrainBS.csv")
trainyB = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/smsTrainyB.csv")['smsTrainyB']
n = len(trainyB)
In [852]:
print(f'length of y is: {n}')
print('dim of x in trainB')
print(trainB.shape)
length of y is: 4169
dim of x in trainB
(4169, 4)

Estimate marginal probs for y (1 if spam, 0 else).

In [853]:
py1 = trainyB.sum()/n
print(f'P(Y=1): {py1:0.4f}')
P(Y=1): 0.1353

Looks right (same as in the notes).

Let’s write a simple little function which will get us estimates of the joint
of (y,x) and the conditionals of x|y for y = 0 and 1.

In [854]:
temp = pd.crosstab(trainyB,trainB['age']).to_numpy()
temp = temp/n
temp
Out[854]:
array([[0.86351643, 0.00119933],
       [0.13240585, 0.00287839]])
In [855]:
def bivJC(y,x):
    '''estimate the joint from the binary (0,1) y and x and conditionals given y = 0 or 1.'''
    n = len(y)
    jnt = pd.crosstab(y,x).to_numpy()/n
    gy0 = jnt[0,:]/sum(jnt[0,:])
    gy1 = jnt[1,:]/sum(jnt[1,:])
    return({'jnt':jnt,'gy0':gy0,'gy1':gy1})

ageJ = bivJC(trainyB,trainB['age'])
ageJ
Out[855]:
{'jnt': array([[0.86351643, 0.00119933],
        [0.13240585, 0.00287839]]),
 'gy0': array([0.99861304, 0.00138696]),
 'gy1': array([0.9787234, 0.0212766])}

gy0 and gy1 are exactly the same as the table for age obtained in the notes.
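As a quick sanity check on bivJC, the joint should sum to 1 and each conditional should sum to 1. A self-contained check on a tiny made-up sample (the function is repeated here so the check runs on its own):

```python
import pandas as pd

def bivJC(y, x):
    '''estimate the joint from the binary (0,1) y and x and conditionals given y = 0 or 1.'''
    n = len(y)
    jnt = pd.crosstab(y, x).to_numpy() / n
    gy0 = jnt[0, :] / jnt[0, :].sum()
    gy1 = jnt[1, :] / jnt[1, :].sum()
    return {'jnt': jnt, 'gy0': gy0, 'gy1': gy1}

y = pd.Series([0, 0, 0, 1, 1, 0, 1, 0])
x = pd.Series([0, 1, 0, 1, 0, 0, 1, 1])
out = bivJC(y, x)
print(out['jnt'].sum(), out['gy0'].sum(), out['gy1'].sum())
```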

Ok, now let’s do part (a).

First let’s do y|age=1 and compare with the notes.
We can get P(Y=1|age = 1) directly from the joint.

In [856]:
jnt = ageJ['jnt']
temp = jnt[1,1]/(jnt[1,1]+ jnt[0,1])
print(f'\n****  p spam given age is in: {temp:0.4f}')
****  p spam given age is in: 0.7059

Same as notes up to two digits.

Or we can use Bayes theorem (as in notes).

In [857]:
py1ga = py1*ageJ['gy1'][1]/(py1*ageJ['gy1'][1] + (1-py1)*ageJ['gy0'][1])
print(f'\n**** prob spam given age is in: {py1ga:0.4f}')
**** prob spam given age is in: 0.7059

Now let's do the same thing for call.

In [858]:
callJ = bivJC(trainyB,trainB['call'])
print(callJ)
{'jnt': array([[0.81506356, 0.04965219],
       [0.07459822, 0.06068602]]), 'gy0': array([0.94257975, 0.05742025]), 'gy1': array([0.55141844, 0.44858156])}
In [859]:
jnt = callJ['jnt']
temp = jnt[1,1]/(jnt[1,1]+ jnt[0,1])
print(f'\n****  p spam given call is in: {temp:0.4f}')
****  p spam given call is in: 0.5500
In [860]:
py1gc = py1*callJ['gy1'][1]/(py1*callJ['gy1'][1] + (1-py1)*callJ['gy0'][1])
print(f'\n**** prob spam given call is in: {py1gc:0.4f}')
**** prob spam given call is in: 0.5500

Now let's do part (b).

In [861]:
knowJ = bivJC(trainyB,trainB['know'])
print(knowJ)
{'jnt': array([[0.82345886, 0.0412569 ],
       [0.13120652, 0.00407772]]), 'gy0': array([0.95228849, 0.04771151]), 'gy1': array([0.96985816, 0.03014184])}

The probability that know is in is not that different when y is 0 or 1;
it is even a little higher when y=0. Very different from call.

In [862]:
py1gck = (py1*callJ['gy1'][1]*knowJ['gy1'][1])/( (py1*callJ['gy1'][1]*knowJ['gy1'][1]) + ((1-py1)*callJ['gy0'][1]*knowJ['gy0'][1]))
print(f'\n**** prob spam given call and know is in: {py1gck:0.4f}')
**** prob spam given call and know is in: 0.4357
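The two-feature computation above generalizes: under the NB assumption the posterior just multiplies in one conditional per feature. A small helper sketch (nb_posterior is our name, not from the notes), checked against the numbers printed above:

```python
def nb_posterior(py1, gy1_list, gy0_list):
    """P(Y=1 | all listed features = 1) under the Naive Bayes
    independence assumption: multiply one conditional per feature."""
    num = py1
    den0 = 1.0 - py1
    for g1, g0 in zip(gy1_list, gy0_list):
        num *= g1    # P(feature in | Y=1) factors
        den0 *= g0   # P(feature in | Y=0) factors
    return num / (num + den0)

# conditionals for call and know copied from the outputs above
p = nb_posterior(0.1353, [0.44858156, 0.03014184], [0.05742025, 0.04771151])
print(round(p, 4))   # -> 0.4357, matching the cell above
```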

Finally, let's do part (c).
Now we need the joint distribution of (x1=call,x2=know) given y = 0 (ham) and y=1 (spam).

We will split the trainB x DataFrame into the subset with y=0 and the subset with y=1.

In [863]:
trainB0 = trainB[trainyB == 0]
print(trainB0.shape)
trainB1 = trainB[trainyB == 1]
print(trainB1.shape)
(3605, 4)
(564, 4)

Dimensions look right.

Now we can estimate the joint distribution of (call,know) given y = 0 or 1.

In [864]:
jnt0 = pd.crosstab(trainB0['call'],trainB0['know'])/trainB0.shape[0]
print(jnt0)
jnt1 = pd.crosstab(trainB1['call'],trainB1['know'])/trainB1.shape[0]
print(jnt1)
know         0         1
call                    
0     0.897642  0.044938
1     0.054646  0.002774
know         0         1
call                    
0     0.537234  0.014184
1     0.432624  0.015957
In [865]:
p11y1 = jnt1.iloc[1,1]; p11y0 = jnt0.iloc[1,1]
py1g11 = (py1 * p11y1)/((py1 * p11y1) + ((1-py1)*p11y0))
print(f'prob y=1(spam) given call and know =1 (are in document) without NB: {py1g11:0.4f}')
prob y=1(spam) given call and know =1 (are in document) without NB: 0.4737

Quite similar to what we got using Naive Bayes.

Let’s investigate the Naive Bayes assumption.
Conditional on y = 0 or 1, is the joint implied by independence (the product of the marginals) similar to the actual joint?

In [866]:
def jFromMar(jnt):
    '''get a joint from the marginals of jnt assuming independence'''
    mar0 = jnt.sum(axis=1) # marginal for row variable
    mar1 = jnt.sum(axis=0) # marginal for column variable
    return(np.outer(mar0,mar1))  #the outer product

print('\n******************************** given y=0:\n')
print('\n ***** jnt of call know given y=0:')
print(jnt0)
print('\n ***** jnt of call know given y=0, using NB assumption of independence:')
print(jFromMar(jnt0))

print('\n******************************** given y=1:\n')
print('\n ***** jnt of call know given y=1:')
print(jnt1)
print('\n ***** jnt of call know given y=1, using NB assumption of independence:')
print(jFromMar(jnt1))  
******************************** given y=0:


 ***** jnt of call know given y=0:
know         0         1
call                    
0     0.897642  0.044938
1     0.054646  0.002774

 ***** jnt of call know given y=0, using NB assumption of independence:
[[0.89760785 0.0449719 ]
 [0.05468064 0.00273961]]

******************************** given y=1:


 ***** jnt of call know given y=1:
know         0         1
call                    
0     0.537234  0.014184
1     0.432624  0.015957

 ***** jnt of call know given y=1, using NB assumption of independence:
[[0.53479767 0.01662077]
 [0.43506048 0.01352108]]

Not exactly the same, but pretty similar.
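One numeric summary of how well the independence assumption holds is the largest absolute gap between the actual joint and the product of its marginals. A sketch using the y=1 joint printed above:

```python
import numpy as np

# joint of (call, know) given y=1, copied from the output above
jnt1 = np.array([[0.537234, 0.014184],
                 [0.432624, 0.015957]])
indep = np.outer(jnt1.sum(axis=1), jnt1.sum(axis=0))  # joint under independence
max_gap = np.abs(jnt1 - indep).max()                  # worst cell disagreement
print(f'largest cell gap: {max_gap:0.4f}')
```

The gap is a fraction of a percent per cell, which is why the Naive Bayes answer (0.4357) lands so close to the exact-joint answer (0.4737 without NB used 0.4737's joint directly).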