1. Train/test split, categorical variables, and transformations

Read in the data:

# Load the used-cars data set directly from the remote CSV.
cd = read.csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv")
names(cd)
## [1] "price"        "trim"         "isOneOwner"   "mileage"      "year"        
## [6] "color"        "displacement"

Pull off price, mileage and trim. Rescale Price and mileage. Make trim a factor.
Add in mileage squared. Add in log y.

# Keep price, trim, mileage (columns 1, 2, 4) and build the derived variables.
cds = cd[,c(1,2,4)]
cds$mileage = cds$mileage/1000   # mileage in thousands of miles
cds$price = cds$price/1000       # price in thousands of dollars
cds$trim = factor(cds$trim)      # trim is categorical
cds$mileage = cds$mileage-mean(cds$mileage)  ## demean, better for squaring
cds$ml2 = cds$mileage^2          # squared (demeaned) mileage, for the quadratic model
cds$lprice = log(cds$price)      # log price, for the log-scale model
head(cds)
##    price trim   mileage       ml2   lprice
## 1 43.995  550 -36.79441 1353.8285 3.784076
## 2 44.995  550 -26.76941  716.6012 3.806551
## 3 25.999  550  35.10659 1232.4728 3.258058
## 4 33.880  550 -38.46541 1479.5876 3.522825
## 5 34.895  550 -25.49941  650.2198 3.552344
## 6  5.995  500  48.09559 2313.1860 1.790926

Let’s get a train/test split.

# Random train/test split: 25% test, 75% train.
n = nrow(cds)
tef = .25            # test fraction
nte = floor(tef * n) # number of test observations
ntr = n - nte        # number of training observations
set.seed(88) #Nylandeer
ii = sample(n,ntr)   # row indices of the training set
cdstr = cds[ii,]
cdste = cds[-ii,]
print(dim(cdstr))
print(dim(cdste))
## check: train and test together should reproduce every price exactly
print(summary(sort(cds$price) - sort(c(cdstr$price,cdste$price))))
## [1] 750   5
## [1] 250   5
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0

Let’s just use mileage and trim to get a benchmark.

# Benchmark model: price linear in mileage plus trim dummies.
lmb = lm(price~mileage+trim,cdstr)  # fit on train
ypredb = predict(lmb,cdste) # predict on test

#rmse function

myrmse = function(y,yp) {
  ## Root mean squared error between observed values y and predictions yp.
  sqrt(mean((y - yp)^2))
}
rmseb = myrmse(cdste$price,ypredb)  # out-of-sample RMSE for the benchmark model
cat('\n**** test rmse with mileage and trim: ',rmseb,'\n')
## 
## **** test rmse with mileage and trim:  9.826758

Ok, now let’s try logging y.

# Fit on log(price), then exponentiate predictions back to the price scale.
lml = lm(lprice~mileage+trim,cdstr)
# NOTE(review): exp(predict()) is the naive back-transform; it ignores the
# lognormal retransformation bias (no smearing correction). Fine for a quick
# benchmark comparison.
ypredl = exp(predict(lml,cdste))
rmsel = myrmse(cdste$price,ypredl)
cat('\n**** test rmse with log price: ',rmsel,'\n')
## 
## **** test rmse with log price:  8.755658

Now let’s try the quadratic model.

# Quadratic model: price on mileage, mileage squared (ml2), and trim.
lmq = lm(price~mileage + ml2 + trim,cdstr)
ypredq = predict(lmq,cdste)
rmseq = myrmse(cdste$price,ypredq)
cat('\n**** test rmse: base, log, quad: ',rmseb,', ',rmsel,', ',rmseq,'\n')
## 
## **** test rmse: base, log, quad:  9.826758 ,  8.755658 ,  8.22983

Quadratic looks best.

Let’s plot y vs y predictions to see the different models.

# Collect test y and the three models' predictions, then look at the
# pairwise scatter plots and correlations.
pMat = cbind(cdste$price,ypredb,ypredl,ypredq)
colnames(pMat) = c('y=price','ypredb','ypredl','ypredq')
pairs(pMat)  # scatter-plot matrix: actual price vs. each set of predictions

print(cor(pMat))
##           y=price    ypredb    ypredl    ypredq
## y=price 1.0000000 0.8479297 0.8837119 0.8960343
## ypredb  0.8479297 1.0000000 0.9498192 0.9629503
## ypredl  0.8837119 0.9498192 1.0000000 0.9813096
## ypredq  0.8960343 0.9629503 0.9813096 1.0000000

Let’s plot test y=price vs. the predictions from both the log model and the quadratic model.

# Actual test price vs. predictions from the log (blue) and quadratic (red)
# models; points on the dotted 45-degree line would be perfect predictions.
plot(cdste$price,ypredl,col='blue',cex=.8,pch=16,xlab='test y=price',ylab='predictions',ylim=range(c(cdste$price,ypredl,ypredq)))
points(cdste$price,ypredq,col='red',cex=.8,pch=17)
abline(0,1,col='black',lty=3)  # 45-degree reference line
legend('topleft',legend=c('log price','quadratic mileage'),col=c('blue','red'),pch=c(16,17),bty='n')

The errors are bigger for the more expensive cars, especially for the log model.

Now let’s plot y=price and the predictions vs. mileage.

# Empty plot with the right ranges, then overlay actual prices and both
# models' predictions against mileage.
mlg = cdste$mileage
plot(range(mlg),range(c(cdste$price,ypredl,ypredq)),type='n',xlab='x=mileage',ylab='y=price',cex.lab=.9,cex.axis=.7)
points(mlg,cdste$price,col='black',pch=1,cex=.5)  # actual prices
points(mlg,ypredl,col='blue',pch=16,cex=.8)       # log-model predictions
points(mlg,ypredq,col='red',pch=17,cex=.8)        # quadratic-model predictions
legend('topright',legend=c('log','quadratic'),col=c('blue','red'),pch=c(16,17),bty='n',cex=1.0)

Both models miss the highest prices.

Are the misses related to trim ?

# Test-set errors (actual - predicted) for the quadratic and log models,
# shown by trim as side-by-side boxplots on a common y scale.
ecd = data.frame(qerr = cdste$price - ypredq, lerr = cdste$price - ypredl,trim = cdste$trim)
ylm = range(c(ecd$lerr,ecd$qerr))  # common y limits so the panels are comparable
par(mfrow=c(1,2))  # two plots side by side
boxplot(qerr~trim,ecd,ylim = ylm)
abline(h=0,lty=2)  # zero-error reference line
title(main='quadratic errors by trim',cex.main=.8)
boxplot(lerr~trim,ecd, ylim = ylm)
abline(h=0,lty=2)
title(main='log errors by trim',cex.main=.8)

Looks like the errors are bigger for the log model with trim=other.

Let’s see the coefficients for the quadratic model.

# Coefficients of the quadratic model; the omitted trim level (430) is the baseline.
print(lmq$coef)
## (Intercept)     mileage         ml2     trim500     trim550   trimother 
##  18.5165343  -0.3343016   0.0017594  -0.3029222  13.1870263   7.8420424

So, for example, trim=550 is the most expensive one.

Let’s plot mileage vs. the quadratic predictions with the trim category color coded.

# Color-code test observations by trim and overlay the fitted quadratic
# price curve for each trim level.
cind = as.integer(cdste$trim)  # trim as integer code 1..4 (430, 500, 550, other)
labl = c('trim 430','trim 500','trim 550','trim other')
plot(cdste$mileage,cdste$price,col=cind,cex=.8,pch=16,xlab='x=mileage',ylab='y=price')
for(i in 1:4) {
  sel = cind==i  # fixed: was `ii`, which clobbered the train-index vector from the split
  xx = cdste$mileage[sel]
  yy = ypredq[sel]
  oo = order(xx)  # sort by mileage so lines() draws a connected curve
  lines(xx[oo],yy[oo],col=i,lwd=3)
}
legend('topright',legend=labl,col=1:4,lty=rep(1,4),lwd=rep(3,4),bty='n')
title(main='mileage vs. predictions, quadratic model')

Let’s plot the quadratic and log model fits side by side to see the difference.

# Side-by-side: predictions vs. mileage for the quadratic and log models,
# colored by trim.
par(mfrow=c(1,2))
plot(cdste$mileage,ypredq,col=cdste$trim,pch=16,cex=.8,xlab='x=mileage',ylab='y= predictions')
title(main='quadratic model')
plot(cdste$mileage,ypredl,col=cdste$trim,pch=16,cex=.8,xlab='x=mileage',ylab='y= predictions')
title(main='log model')

They look pretty different.

2. Naive Bayes

Read in the data:

# Read the SMS training data: trainB holds binary word-presence features,
# trainyB the labels (1 = spam, 0 = ham).
trainB = read.csv("https://bitbucket.org/remcc/rob-data-sets/downloads/smsTrainBS.csv")
trainyB = read.csv("https://bitbucket.org/remcc/rob-data-sets/downloads/smsTrainyB.csv")[,1]
n = length(trainyB)
cat("\n*** there are ",n,' observations\n\n')
## make all the variables categorical (factors)
trainyB = factor(trainyB)
for(j in seq_len(ncol(trainB))) trainB[[j]] = factor(trainB[[j]])  # seq_len avoids the 1:0 footgun
## 
## *** there are  4169  observations

Estimate marginal probs for y (1 if spam, 0 else).

# Marginal (prior) probability that a message is spam (y = 1).
py1 = sum(trainyB == 1)/n
cat('\n *** p(Y=1) = ',round(py1,3),'\n\n')
## 
##  *** p(Y=1) =  0.135

Let’s write a simple little function which will get us estimates of the joint
of (y,x) and the conditionals of x|y for y = 0 and 1.

bivJC = function(y,x) {
  ## Estimate, from two categorical vectors, the joint distribution of (y, x)
  ## and the conditional distributions of x given each of the two y levels.
  ## Returns a list: jnt (joint table), gy0 = p(x | y = first level),
  ## gy1 = p(x | y = second level).
  jnt = table(y,x)/length(y)
  row0 = jnt[1,]
  row1 = jnt[2,]
  list(jnt=jnt, gy0=row0/sum(row0), gy1=row1/sum(row1))
}

Let’s get joint conditional for (y,age) and check it is the same as in notes.

# Joint and conditionals for (y, age); gy0/gy1 should match the notes.
ageJ = bivJC(trainyB,trainB$age)
print(ageJ)
## $jnt
##    x
## y             0           1
##   0 0.863516431 0.001199328
##   1 0.132405853 0.002878388
## 
## $gy0
##           0           1 
## 0.998613037 0.001386963 
## 
## $gy1
##         0         1 
## 0.9787234 0.0212766

gy0 and gy1 are the conditionals for age in the doc given y equals 0 or 1.
These match the ones in the notes.

We can also get them by calling the same function from the R package e1071
that we used in the notes.

library(e1071)
# naiveBayes estimates the same per-feature conditionals p(x | y).
NB = naiveBayes(trainB, trainyB)
tbls = NB$tables
tbls$age  # conditional distribution of age given y; should match gy0/gy1 above
##        age
## trainyB           0           1
##       0 0.998613037 0.001386963
##       1 0.978723404 0.021276596

Ok, now let’s do part (a).

First let’s do y|age=1 and compare with the notes.

We can get P(Y=1|age = 1) directly from the joint.

# P(Y=1 | age in document), read directly off the joint table:
# p(y=1, age=1) / p(age=1).
jnt = ageJ$jnt
temp = jnt[2,2]/(jnt[2,2]+ jnt[1,2])
cat("\n**** prob of spam given age is in: ",temp,'\n')  # fixed label: y=1 is spam, not ham
## 
## **** prob of spam given age is in:  0.7058824

Same as notes up to two digits.

Or we can use Bayes theorem (as in notes).

# Same probability via Bayes theorem: p(y=1) p(age=1|y=1) / p(age=1).
py1ga = py1*ageJ$gy1[2]/(py1*ageJ$gy1[2] + (1-py1)*ageJ$gy0[2])
cat("\n**** p spam given age is in: ",py1ga,'\n')  # fixed label: y=1 is spam
## 
## **** p spam given age is in:  0.7058824

Which is the same.

Now let’s do the same thing for the word call.

# Conditionals of call given y, cross-checked against the e1071 tables,
# then P(Y=1 | call in document) from the joint.
callJ = bivJC(trainyB,trainB$call)
print(callJ$gy0)
print(callJ$gy1)
print(NB$tables$call)
##          0          1 
## 0.94257975 0.05742025 
##         0         1 
## 0.5514184 0.4485816 
##        call
## trainyB          0          1
##       0 0.94257975 0.05742025
##       1 0.55141844 0.44858156
jnt = callJ$jnt
temp = jnt[2,2]/(jnt[2,2]+ jnt[1,2])
cat("\n**** p spam given call is in: ",temp,'\n')  # fixed label: y=1 is spam
## 
## **** p spam given call is in:  0.55

Not as dramatic as age, but still a lot bigger than the prior probability of spam = .135.

Now let’s do it with Bayes theorem.

# Same probability via Bayes theorem.
py1gc = py1*callJ$gy1[2]/(py1*callJ$gy1[2] + (1-py1)*callJ$gy0[2])
cat("\n**** p spam given call is in: ",py1gc,'\n')  # fixed label: y=1 is spam
## 
## **** p spam given call is in:  0.55

Let’s do part (b).

# Part (b): Naive Bayes with two features -- multiply the per-feature
# conditionals for call and know inside Bayes theorem.
knowJ = bivJC(trainyB,trainB$know)
print(rbind(knowJ$gy0,knowJ$gy1))
py1gck = (py1*callJ$gy1[2]*knowJ$gy1[2])/((py1*callJ$gy1[2]*knowJ$gy1[2]) + ((1-py1)*callJ$gy0[2]*knowJ$gy0[2]))
cat("\n**** p spam given call and know are in: ",py1gck,'\n')  # fixed label: y=1 is spam
##              0          1
## [1,] 0.9522885 0.04771151
## [2,] 0.9698582 0.03014184
## 
## **** p spam given call and know are in:  0.4357109

Check against the NB$tables.

# Cross-check the hand-computed conditionals for know against e1071.
print(NB$tables$know)
##        know
## trainyB          0          1
##       0 0.95228849 0.04771151
##       1 0.96985816 0.03014184

Same as above.

Let’s do it very explicitly with all the numbers as in the notes.

# Same computation with the numbers written out explicitly (as in the notes).
temp = py1*0.44858156*0.03014184/(py1*0.44858156*0.03014184 + (1-py1)*0.05742025*0.04771151)
cat("\n**** p spam given call and know are in: ",temp,'\n')  # fixed label: y=1 is spam
## 
## **** p spam given call and know are in:  0.4357109

know is actually more likely to be in if y=0 (ham), so the probability is lower than with just call in.

Finally, let’s do part (c).
Now we need the joint distribution of (x1=call,x2=know) given y = 0 (ham) and y=1 (spam).

We will split the trainB x data.frame into the subset with y=0 and the subset with y=1.

# Split the feature data frame by class label.
trainB0 = trainB[trainyB == 0,]  # ham rows (y = 0)
trainB1 = trainB[trainyB == 1,]  # spam rows (y = 1)
print(table(trainyB))
print(dim(trainB0))
print(dim(trainB1))
## trainyB
##    0    1 
## 3605  564 
## [1] 3605    4
## [1] 564   4

Dimensions look right.

Now can estimate the joint distribution of (call,know) given y = 0 or 1.

# Joint distribution of (call, know) within each class, estimated from the
# class-specific subsets.
jnt0 = table(trainB0$call,trainB0$know)/nrow(trainB0)
jnt1 = table(trainB1$call,trainB1$know)/nrow(trainB1)
p11y0 = jnt0[2,2]  # p(call=1, know=1 | y=0)
p11y1 = jnt1[2,2]  # p(call=1, know=1 | y=1)
# sanity checks: recompute the same cell probabilities directly
ck1 = sum(trainB0$call ==1 & trainB0$know==1)/nrow(trainB0)
cat("\n%% should be the same: ",p11y0,', ',ck1)
ck2 = sum(trainB1$call ==1 & trainB1$know==1)/nrow(trainB1)
cat("\n%% should be the same: ",p11y1,', ',ck2)
## 
## %% should be the same:  0.002773925 ,  0.002773925
## %% should be the same:  0.01595745 ,  0.01595745

Bayes theorem:

# Bayes theorem using the full joint of (call, know) given y -- the exact
# posterior, with no Naive Bayes independence assumption.
py1g11 = (py1 * p11y1)/((py1 * p11y1) + ((1-py1)*p11y0))
cat('\n\n *****p y=1(spam) given call,know=1 (both in document) without NB: ',py1g11)  # fixed label: =1 means the word IS present
## 
## 
##  *****p y=1(spam) given call,know=1 (both in document) without NB:  0.4736842

Quite similar to what we got using Naive Bayes.

Let’s investigate the Naive Bayes assumption.
Conditional on y = 0 or 1, is the joint from independence similar to the joint?

## Build the joint table implied by independence from an observed joint.
jFromMar = function(jnt) {
  ## Under independence the joint is the outer product of the two marginals.
  outer(rowSums(jnt), colSums(jnt))
}
I0 = jFromMar(jnt0)  # joint under independence, given y=0
cat('\n** p11 given y=0 from independence:',I0[2,2])
cat('\n** p11 given y=0 :',jnt0[2,2])
I1 = jFromMar(jnt1)  # joint under independence, given y=1
cat('\n** p11 given y=1 from independence:',I1[2,2])
cat('\n** p11 given y=1 :',jnt1[2,2])
## 
## ** p11 given y=0 from independence: 0.002739607
## ** p11 given y=0 : 0.002773925
## ** p11 given y=1 from independence: 0.01352108
## ** p11 given y=1 : 0.01595745

Pretty similar! Maybe Naive Bayes makes sense here.

Let’s look at the full 2x2 tables given the joints with and without independence.

# Print the full 2x2 joints, with and without the independence assumption,
# for each class.
cat('\n#############\n')
cat('\n%%joint given y=0, with independence:\n')
print(I0)
cat('\n%% joint given y=0\n')
print(jnt0)
cat('\n#############\n')
cat('\n%%joint given y=1 with independence:\n')
print(I1)
cat('\n%% joint given y=1\n')
print(jnt1)
## 
## #############
## 
## %%joint given y=0, with independence:
##            0           1
## 0 0.89760785 0.044971905
## 1 0.05468064 0.002739607
## 
## %% joint given y=0
##    
##               0           1
##   0 0.897642164 0.044937587
##   1 0.054646325 0.002773925
## 
## #############
## 
## %%joint given y=1 with independence:
##           0          1
## 0 0.5347977 0.01662077
## 1 0.4350605 0.01352108
## 
## %% joint given y=1
##    
##              0          1
##   0 0.53723404 0.01418440
##   1 0.43262411 0.01595745

Not exactly the same, but pretty similar.