Read in the data:
cd = read.csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv") ## used-car data set
names(cd) ## list the available variables
## [1] "price" "trim" "isOneOwner" "mileage" "year"
## [6] "color" "displacement"
Pull off price, mileage and trim. Rescale Price and mileage. Make
trim a factor.
Add in mileage squared. Add in log y.
cds = cd[,c(1,2,4)] ## keep price, trim, mileage
cds$mileage = cds$mileage/1000 ## mileage in thousands of miles
cds$price = cds$price/1000 ## price in thousands of dollars
cds$trim = factor(cds$trim) ## trim is categorical
cds$mileage = cds$mileage-mean(cds$mileage) ## demean, better for squaring
cds$ml2 = cds$mileage^2 ## quadratic term in mileage
cds$lprice = log(cds$price) ## log-price response for the log model
head(cds)
## price trim mileage ml2 lprice
## 1 43.995 550 -36.79441 1353.8285 3.784076
## 2 44.995 550 -26.76941 716.6012 3.806551
## 3 25.999 550 35.10659 1232.4728 3.258058
## 4 33.880 550 -38.46541 1479.5876 3.522825
## 5 34.895 550 -25.49941 650.2198 3.552344
## 6 5.995 500 48.09559 2313.1860 1.790926
Let’s get a train/test split.
n = nrow(cds)
tef = .25 ## fraction of the data held out for testing
nte = floor(tef * n) ## test sample size
ntr = n - nte ## train sample size
set.seed(88) #Nylandeer
ii = sample(n,ntr) ## random row indices for the training split
cdstr = cds[ii,] ## train split
cdste = cds[-ii,] ## test split
print(dim(cdstr))
print(dim(cdste))
## check
print(summary(sort(cds$price) - sort(c(cdstr$price,cdste$price)))) ## all zeros => the two splits partition the data
## [1] 750 5
## [1] 250 5
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
Let’s just use mileage and trim to get a benchmark.
lmb = lm(price~mileage+trim,cdstr) # fit on train
ypredb = predict(lmb,cdste) # predict on test
## Root mean squared error between actual values y and predictions yp.
myrmse = function(y,yp) {
  err = y - yp
  sqrt(mean(err * err))
}
rmseb = myrmse(cdste$price,ypredb) ## test RMSE for the benchmark linear model
cat('\n**** test rmse with mileage and trim: ',rmseb,'\n')
##
## **** test rmse with mileage and trim: 9.826758
Ok, now let’s try logging y.
lml = lm(lprice~mileage+trim,cdstr) ## same model, but with log price as the response
ypredl = exp(predict(lml,cdste)) ## back-transform predictions to the price scale
rmsel = myrmse(cdste$price,ypredl) ## RMSE on the original price scale for comparability
cat('\n**** test rmse with log price: ',rmsel,'\n')
##
## **** test rmse with log price: 8.755658
Now let’s try the quadratic model.
lmq = lm(price~mileage + ml2 + trim,cdstr) ## quadratic in mileage (ml2 = mileage^2)
ypredq = predict(lmq,cdste)
rmseq = myrmse(cdste$price,ypredq)
cat('\n**** test rmse: base, log, quad: ',rmseb,', ',rmsel,', ',rmseq,'\n')
##
## **** test rmse: base, log, quad: 9.826758 , 8.755658 , 8.22983
Quadratic looks best.
Let’s plot y vs y predictions to see the different models.
pMat = cbind(cdste$price,ypredb,ypredl,ypredq) ## test y alongside the three prediction vectors
colnames(pMat) = c('y=price','ypredb','ypredl','ypredq')
pairs(pMat) ## scatterplot matrix of y vs. each model's predictions
print(cor(pMat)) ## higher correlation with y=price means better predictions
## y=price ypredb ypredl ypredq
## y=price 1.0000000 0.8479297 0.8837119 0.8960343
## ypredb 0.8479297 1.0000000 0.9498192 0.9629503
## ypredl 0.8837119 0.9498192 1.0000000 0.9813096
## ypredq 0.8960343 0.9629503 0.9813096 1.0000000
Let’s plot test y=price vs. the predictions from both the log model and the quadratic model.
## test price vs. predictions; points on the 45-degree line are perfect predictions
plot(cdste$price,ypredl,col='blue',cex=.8,pch=16,xlab='test y=price',ylab='predictions',ylim=range(c(cdste$price,ypredl,ypredq)))
points(cdste$price,ypredq,col='red',cex=.8,pch=17)
abline(0,1,col='black',lty=3) ## 45-degree reference line
legend('topleft',legend=c('log price','quadratic mileage'),col=c('blue','red'),pch=c(16,17),bty='n')
The errors are bigger for the more expensive cars, especially for the log model.
Now let’s plot y=price and the predictions vs. mileage.
mlg = cdste$mileage
## empty frame sized to hold both the prices and all the predictions, then layer points
plot(range(mlg),range(c(cdste$price,ypredl,ypredq)),type='n',xlab='x=mileage',ylab='y=price',cex.lab=.9,cex.axis=.7)
points(mlg,cdste$price,col='black',pch=1,cex=.5) ## observed test prices
points(mlg,ypredl,col='blue',pch=16,cex=.8) ## log-model predictions
points(mlg,ypredq,col='red',pch=17,cex=.8) ## quadratic-model predictions
legend('topright',legend=c('log','quadratic'),col=c('blue','red'),pch=c(16,17),bty='n',cex=1.0)
Both models miss the highest prices.
Are the misses related to trim ?
ecd = data.frame(qerr = cdste$price - ypredq, lerr = cdste$price - ypredl,trim = cdste$trim) ## test-set residuals for both models
ylm = range(c(ecd$lerr,ecd$qerr)) ## common y-axis so the two panels are comparable
par(mfrow=c(1,2))
boxplot(qerr~trim,ecd,ylim = ylm)
abline(h=0,lty=2) ## zero-error reference
title(main='quadratic errors by trim',cex.main=.8)
boxplot(lerr~trim,ecd, ylim = ylm)
abline(h=0,lty=2)
title(main='log errors by trim',cex.main=.8)
Looks like the errors are bigger for the log model with trim=other.
Let’s see the coefficients for the quadratic model.
print(lmq$coef) ## quadratic-model coefficients; trim 430 is the baseline level
## (Intercept) mileage ml2 trim500 trim550 trimother
## 18.5165343 -0.3343016 0.0017594 -0.3029222 13.1870263 7.8420424
So, for example, trim=550 is the most expensive one.
Let’s plot mileage vs. the quadratic predictions with the trim category color coded.
## plot quadratic fitted price curves, one per trim category, over the test data
cind = as.integer(cdste$trim) ## trim level as 1..4 for plot colors
labl = c('trim 430','trim 500','trim 550','trim other')
#plt.scatter(cdte['mileage'],cdte['price'],c=cvec,s=4,marker='D')
#plt.xlabel('mileage'); plt.ylabel('price')
plot(cdste$mileage,cdste$price,col=cind,cex=.8,pch=16,xlab='x=mileage',ylab='y=price')
for(i in 1:4) {
ii = cind==i ## test rows in trim category i (shadows the earlier train index ii)
xx = cdste$mileage[ii]
yy = ypredq[ii]
oo = order(xx) ## sort by mileage so lines() traces a smooth curve
lines(xx[oo],yy[oo],col=i,lwd=3)
}
legend('topright',legend=labl,col=1:4,lty=rep(1,4),lwd=rep(3,4),bty='n')
title(main='mileage vs. predictions, quadratic model')
Let’s plot the quadratic and log model fits side by side to see the difference.
par(mfrow=c(1,2)) ## side-by-side panels: quadratic vs. log predictions
plot(cdste$mileage,ypredq,col=cdste$trim,pch=16,cex=.8,xlab='x=mileage',ylab='y= predictions')
title(main='quadratic model')
plot(cdste$mileage,ypredl,col=cdste$trim,pch=16,cex=.8,xlab='x=mileage',ylab='y= predictions')
title(main='log model')
They look pretty different.
Read in the data:
trainB = read.csv("https://bitbucket.org/remcc/rob-data-sets/downloads/smsTrainBS.csv") ## binary word-presence features
trainyB = read.csv("https://bitbucket.org/remcc/rob-data-sets/downloads/smsTrainyB.csv")[,1] ## response vector (first column)
n = length(trainyB)
cat("\n*** there are ",n,' observations\n\n')
## make all the variables categorical
trainyB = factor(trainyB)
for(j in 1:ncol(trainB)) trainB[[j]] = factor(trainB[[j]])
##
## *** there are 4169 observations
Estimate marginal probs for y (1 if spam, 0 else).
py1 = sum(trainyB == 1)/n ## marginal probability that y = 1 (spam)
cat('\n *** p(Y=1) = ',round(py1,3),'\n\n')
##
## *** p(Y=1) = 0.135
Let’s write a simple little function which will get us estimates of
the joint
of (y,x) and the conditionals of x|y for y = 0 and 1.
## Estimate, from paired samples, the joint distribution of (y, x) and the
## conditional distributions of x given each of the two y levels.
## Returns a list: jnt (joint table), gy0 (x | first y level),
## gy1 (x | second y level).
bivJC = function(y,x) {
  n = length(y)
  jnt = table(y, x) / n
  gy0 = prop.table(jnt[1, ])  # renormalize row 1: x | y at first level
  gy1 = prop.table(jnt[2, ])  # renormalize row 2: x | y at second level
  list(jnt = jnt, gy0 = gy0, gy1 = gy1)
}
Let’s get joint conditional for (y,age) and check it is the same as in notes.
ageJ = bivJC(trainyB,trainB$age) ## joint and conditionals for the word age
print(ageJ)
## $jnt
## x
## y 0 1
## 0 0.863516431 0.001199328
## 1 0.132405853 0.002878388
##
## $gy0
## 0 1
## 0.998613037 0.001386963
##
## $gy1
## 0 1
## 0.9787234 0.0212766
gy0 and gy1 are the conditionals for age in the doc given y equals 0
or 1.
These match the ones in the notes.
We can also get them by calling the same function from the R package
e1071
that we used in the notes.
library(e1071)
NB = naiveBayes(trainB, trainyB) ## Naive Bayes fit from e1071
tbls = NB$tables ## per-feature conditional probability tables
tbls$age ## matches the bivJC gy0/gy1 for age
## age
## trainyB 0 1
## 0 0.998613037 0.001386963
## 1 0.978723404 0.021276596
Ok, now let’s do part (a).
First let’s do y|age=1 and compare with the notes.
We can get P(Y=1|age = 1) directly from the joint.
jnt = ageJ$jnt ## joint of (y, age)
## P(Y=1 | age = 1) directly from the joint: p(y=1, age=1) / p(age=1)
temp = jnt[2,2]/(jnt[2,2]+ jnt[1,2])
cat("\n**** prob of spam given age is in: ",temp,'\n') ## y=1 is spam (see marginal probs above), not ham
##
## **** prob of spam given age is in: 0.7058824
Same as notes up to two digits.
Or we can use Bayes theorem (as in notes).
## Bayes theorem: P(Y=1|age=1) = P(Y=1) g1(1) / [P(Y=1) g1(1) + P(Y=0) g0(1)]
py1ga = py1*ageJ$gy1[2]/(py1*ageJ$gy1[2] + (1-py1)*ageJ$gy0[2])
cat("\n**** p spam given age is in: ",py1ga,'\n') ## y=1 is spam, not ham
##
## **** p spam given age is in: 0.7058824
Which is the same.
Now let’s do the call variable the same way.
callJ = bivJC(trainyB,trainB$call) ## joint and conditionals for the word call
print(callJ$gy0)
print(callJ$gy1)
print(NB$tables$call) ## e1071's table should match gy0/gy1
## 0 1
## 0.94257975 0.05742025
## 0 1
## 0.5514184 0.4485816
## call
## trainyB 0 1
## 0 0.94257975 0.05742025
## 1 0.55141844 0.44858156
jnt = callJ$jnt
## P(Y=1 | call = 1) directly from the joint
temp = jnt[2,2]/(jnt[2,2]+ jnt[1,2])
cat("\n**** p spam given call is in: ",temp,'\n') ## y=1 is spam, not ham
##
## **** p spam given call is in: 0.55
Not as dramatic as age, but still a lot bigger than the prior probability of spam = .135.
Now let’s do it with Bayes theorem.
## Bayes theorem again, now for the word call
py1gc = py1*callJ$gy1[2]/(py1*callJ$gy1[2] + (1-py1)*callJ$gy0[2])
cat("\n**** p spam given call is in: ",py1gc,'\n') ## y=1 is spam, not ham
##
## **** p spam given call is in: 0.55
Let’s do part (b).
knowJ = bivJC(trainyB,trainB$know) ## conditionals for the word know
print(rbind(knowJ$gy0,knowJ$gy1))
## Naive Bayes: multiply the per-word conditionals within each class
py1gck = (py1*callJ$gy1[2]*knowJ$gy1[2])/((py1*callJ$gy1[2]*knowJ$gy1[2]) + ((1-py1)*callJ$gy0[2]*knowJ$gy0[2]))
cat("\n**** p spam given call and know are in: ",py1gck,'\n') ## y=1 is spam, not ham
## 0 1
## [1,] 0.9522885 0.04771151
## [2,] 0.9698582 0.03014184
##
## **** p spam given call and know are in: 0.4357109
Check against the NB$tables.
print(NB$tables$know) ## e1071's conditionals for know, to check against bivJC
## know
## trainyB 0 1
## 0 0.95228849 0.04771151
## 1 0.96985816 0.03014184
Same as above.
Let’s do it very explicitly with all the numbers as in the notes.
## Same Naive Bayes computation with the conditional probabilities plugged in explicitly
temp = py1*0.44858156*0.03014184/(py1*0.44858156*0.03014184 + (1-py1)*0.05742025*0.04771151)
cat("\n**** p spam given call and know are in: ",temp,'\n') ## y=1 is spam, not ham
##
## **** p spam given call and know are in: 0.4357109
know is actually more likely to be in if y=0 (ham), so the probability is lower than with just call in.
Finally, let’s do part (c).
Now we need the joint distribution of (x1=call,x2=know) given y = 0
(ham) and y=1 (spam).
We will split the trainB x data.frame into the subset with y=0 and the subset with y=1.
trainB0 = trainB[trainyB == 0,] ## feature rows with y = 0 (ham)
trainB1 = trainB[trainyB == 1,] ## feature rows with y = 1 (spam)
print(table(trainyB))
print(dim(trainB0)) ## row counts should match the table above
print(dim(trainB1))
## trainyB
## 0 1
## 3605 564
## [1] 3605 4
## [1] 564 4
Dimensions look right.
Now can estimate the joint distribution of (call,know) given y = 0 or 1.
jnt0 = table(trainB0$call,trainB0$know)/nrow(trainB0) ## joint of (call, know) given y = 0
jnt1 = table(trainB1$call,trainB1$know)/nrow(trainB1) ## joint of (call, know) given y = 1
p11y0 = jnt0[2,2] ## P(call=1, know=1 | y=0)
p11y1 = jnt1[2,2] ## P(call=1, know=1 | y=1)
ck1 = sum(trainB0$call ==1 & trainB0$know==1)/nrow(trainB0) ## same quantity computed directly, as a check
cat("\n%% should be the same: ",p11y0,', ',ck1)
ck2 = sum(trainB1$call ==1 & trainB1$know==1)/nrow(trainB1)
cat("\n%% should be the same: ",p11y1,', ',ck2)
##
## %% should be the same: 0.002773925 , 0.002773925
## %% should be the same: 0.01595745 , 0.01595745
Bayes theorem:
## Bayes theorem using the full joint of (call, know) -- no independence assumption
py1g11 = (py1 * p11y1)/((py1 * p11y1) + ((1-py1)*p11y0))
cat('\n\n *****p y=1(spam) given call,know=1 (both in document) without NB: ',py1g11) ## =1 means the word IS in the document
##
##
## *****p y=1(spam) given call,know=1 (both in document) without NB: 0.4736842
Quite similar to what we got using Naive Bayes.
Let’s investigate the Naive Bayes assumption.
Conditional on y = 0 or 1, is the joint from independence similar to the
joint?
## Build the joint table implied by independence: under independence the
## joint is the outer product of the row and column marginals of jnt.
jFromMar = function(jnt) {
  rowMar = rowSums(jnt)  # marginal over the first variable
  colMar = colSums(jnt)  # marginal over the second variable
  outer(rowMar, colMar)
}
I0 = jFromMar(jnt0) ## joint given y=0 implied by independence
cat('\n** p11 given y=0 from independence:',I0[2,2])
cat('\n** p11 given y=0 :',jnt0[2,2]) ## compare with the actual joint
I1 = jFromMar(jnt1) ## joint given y=1 implied by independence
cat('\n** p11 given y=1 from independence:',I1[2,2])
cat('\n** p11 given y=1 :',jnt1[2,2])
##
## ** p11 given y=0 from independence: 0.002739607
## ** p11 given y=0 : 0.002773925
## ** p11 given y=1 from independence: 0.01352108
## ** p11 given y=1 : 0.01595745
Pretty similar! Maybe Naive Bayes makes sense here.
Let’s look at the full 2x2 tables given the joints with and without independence.
## print the full 2x2 joints, with and without the independence assumption
cat('\n#############\n')
cat('\n%%joint given y=0, with independence:\n')
print(I0)
cat('\n%% joint given y=0\n')
print(jnt0)
cat('\n#############\n')
cat('\n%%joint given y=1 with independence:\n')
print(I1)
cat('\n%% joint given y=1\n')
print(jnt1)
##
## #############
##
## %%joint given y=0, with independence:
## 0 1
## 0 0.89760785 0.044971905
## 1 0.05468064 0.002739607
##
## %% joint given y=0
##
## 0 1
## 0 0.897642164 0.044937587
## 1 0.054646325 0.002773925
##
## #############
##
## %%joint given y=1 with independence:
## 0 1
## 0 0.5347977 0.01662077
## 1 0.4350605 0.01352108
##
## %% joint given y=1
##
## 0 1
## 0 0.53723404 0.01418440
## 1 0.43262411 0.01595745
Not exactly the same, but pretty similar.