import numpy as np
ptabH = np.array([[3598,2],[5,0]]) #counts for (age,adult) among ham emails; rows index age, columns index adult
ptabH = ptabH/ptabH.sum() #normalize counts to relative frequencies
ptabS = np.array([[549,3],[12,0]]) #counts for (age,adult) among spam emails
ptabS = ptabS/ptabS.sum()
ptabH is our estimate of the joint distribution of (age,adult) given y=ham:
print(ptabH)
ptabS is our estimate of the joint distribution of (age,adult) given y=spam:
print(ptabS)
print(5/3605) #relative frequency of the (age=1,adult=0) cell given ham
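As a quick sanity check (a small sketch added here, not in the original notes), each table should sum to 1, and the (age=1,adult=0) cell of ptabH should match the 5/3605 printed above:
print(ptabH.sum(), ptabS.sum()) #both should be 1.0
print(np.isclose(ptabH[1,0], 5/3605)) #should be True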
We can also get the marginals from the probability tables:
pagH = ptabH.sum(axis=1) #age marginal for ham
padH = ptabH.sum(axis=0) #adult marginal for ham
print("age marginal give ham is:",pagH)
Given spam, the marginals are:
pagS = ptabS.sum(axis=1)
padS = ptabS.sum(axis=0)
pxH = pagH[1] * padH[0] #P(age=1,adult=0|y=ham), with NB
pxS = pagS[1] * padS[0] #P(age=1,adult=0|y=spam), with NB
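To see what the independence assumption does, we can compare the Naive Bayes product of marginals to the empirical joint cell (a quick check added here, not in the original notes):
print("NB approx given ham: ", pxH, " vs empirical:", ptabH[1,0])
print("NB approx given spam:", pxS, " vs empirical:", ptabS[1,0])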
nH = np.array([[3598,2],[5,0]]).sum() #total number of ham emails
nS = np.array([[549,3],[12,0]]).sum() #total number of spam emails
pH = nH/(nH+nS)
pS = 1 - pH
print("prior probs for Ham and Spam:",pH,pS)
pHgx = pH*pxH/(pH*pxH + pS*pxS) #P(ham|age=1,adult=0) by Bayes' rule, with NB
print("P(ham|age=1,adult=0)",pHgx)
Without the Naive Bayes assumption, we just read p(x|y) directly off the ham table and the spam table.
PPxH = ptabH[1,0] #empirical P(age=1,adult=0|y=ham)
PPxS = ptabS[1,0] #empirical P(age=1,adult=0|y=spam)
PPHgx = pH*PPxH/(pH*PPxH + pS*PPxS)
print("P(ham|age=1,adult=0)",PPHgx)
print(ptabH[1,1]) #estimated P(age=1,adult=1|y=ham), which is 0
print(ptabS[1,1]) #estimated P(age=1,adult=1|y=spam), also 0
Since the combination (age=1,adult=1) never occurred in either class, there are no relative frequencies to work with.
It might make sense to say the likelihood ratio is 1 in that case!
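A standard remedy for empty cells (not used above, but worth knowing) is Laplace (add-one) smoothing: add a pseudo-count to every cell before normalizing, so no estimated probability is exactly zero. A minimal sketch:
alpha = 1 #pseudo-count; alpha=1 is classic add-one smoothing
ctabH = np.array([[3598,2],[5,0]])
ctabS = np.array([[549,3],[12,0]])
ptabH_sm = (ctabH + alpha)/(ctabH + alpha).sum()
ptabS_sm = (ctabS + alpha)/(ctabS + alpha).sum()
print(ptabH_sm[1,1], ptabS_sm[1,1]) #both now small but positive
print("smoothed likelihood ratio:", ptabH_sm[1,1]/ptabS_sm[1,1])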
Now a second example: linear regression versus kNN for predicting used car prices. First the imports:
####################
## imports
import matplotlib.pyplot as plt
import seaborn; seaborn.set()
import math
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from numpy.random import default_rng
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
#ipython magic function, helps display of plots in a notebook
%matplotlib inline
####################
## get data
cd = pd.read_csv("http://www.rob-mcculloch.org/data/susedcars.csv")
cd = cd[['price','mileage']]
cd['price'] = cd['price']/1000
cd['mileage'] = cd['mileage']/1000
print(cd.head()) # head just prints out the first few rows
ii = np.argsort(cd['mileage']) #makes it easier to plot if sorted on x.
cd = cd.iloc[ii,:]
## run regression and get in sample fits
lmm = LinearRegression(fit_intercept=True)
x = cd['mileage'].to_numpy().reshape((cd.shape[0],1))
lmm.fit(x,cd['price'])
yhat = lmm.predict(x)
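The fitted intercept and slope are available from the sklearn model; printing them is a quick way to see the line (values depend on the data):
print("intercept:", lmm.intercept_, " slope:", lmm.coef_[0])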
## do knn and get in-sample fits; I played around and settled on k=75.
knnm = KNeighborsRegressor(n_neighbors=75)
knnm.fit(x,cd['price'])
yhat1 = knnm.predict(x)
## predict at 100
xp = np.array([100]).reshape((1,1))
ypred = lmm.predict(xp)
ypred1 = knnm.predict(xp)
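The two predictions are computed but never shown; printing them makes the plot easier to read:
print("linear prediction at mileage=100:", ypred[0])
print("knn prediction at mileage=100:", ypred1[0])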
##plot
plt.scatter(cd['mileage'],cd['price'],s=.5)
plt.plot(x,yhat,c='red',lw=2)
plt.plot(x,yhat1,c='green',lw=2)
plt.xlabel('mileage'); plt.ylabel('price')
plt.scatter(np.array([100]),ypred,c='red',s=50)
plt.scatter(np.array([100]),ypred1,c='green',s=50)
plt.show()
#train/test split
rng = np.random.RandomState(34)
y = cd['price'].to_numpy()
Xtrain, Xtest, ytrain, ytest = train_test_split(x,y,random_state=rng, test_size=.2)
##reg
regm = LinearRegression(fit_intercept=True)
regm.fit(Xtrain,ytrain)
yhat = regm.predict(Xtest)
##knn
knnm = KNeighborsRegressor(n_neighbors=75)
knnm.fit(Xtrain,ytrain)
yhat1 = knnm.predict(Xtest)
def rmse(y, yh):
    return math.sqrt(np.mean((y - yh)**2))
print("reg rsme: ",rmse(ytest,yhat))
print("knn rmse: ",rmse(ytest,yhat1))
## plot
plt.scatter(Xtest,ytest,c='blue',s=.5)
plt.xlabel('mileage'); plt.ylabel('price')
plt.scatter(Xtest,yhat,c='red',s=1)
plt.scatter(Xtest,yhat1,c='green',s=2)
plt.show()