1. Train/test split, categorical variables, and transformations¶
In this problem we will explore some of the basic tools in applied statistics.
However, we will take a Machine Learning viewpoint in that our approach to model selection will use a train/test split.
We will:
- learn about the train/test split
- see how to use a categorical regressor in a linear model
- try the log transform on y
- try a polynomial transformation of an x
Let's do the imports we will need.
# numpy and pandas
import numpy as np
import pandas as pd
import math
#graphics with matplotlib
import matplotlib.pyplot as plt
# model, train/test split, dummies (one-hot-encoding), rmse metric from scikit learn.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import mean_squared_error
Get the Data¶
Let’s read in the used cars data.
We will just use the features mileage and trim.
cd = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv")
print("***the column names are:")
print(cd.columns.values)
# we will just use price, mileage, and trim
cd = cd[['price','mileage','trim']]
cd['price'] = cd['price']/1000
cd['mileage'] = cd['mileage']/1000
cd.head()
***the column names are:
['price' 'trim' 'isOneOwner' 'mileage' 'year' 'color' 'displacement']
| | price | mileage | trim |
|---|---|---|---|
| 0 | 43.995 | 36.858 | 550 |
| 1 | 44.995 | 46.883 | 550 |
| 2 | 25.999 | 108.759 | 550 |
| 3 | 33.880 | 35.187 | 550 |
| 4 | 34.895 | 48.153 | 550 |
Train/Test Split¶
In our brief introduction to statistics in R and Python we learned how to run a multiple regression.
One of the issues that often comes up in modeling is which features (or x variables) are important.
Many people use the t statistics and associated p-values to guide them in their choice.
In Machine Learning, we keep our focus on prediction since that is often the goal in practice.
We want to choose the modeling approach that predicts the best.
But how do we assess this?
As we will see, the basic way is to simply randomly split the data into two subsets.
Estimate the model using one subset called the training data.
Then see how well you predict on the other subset, the test data.
Because you actually have the y values in the test data you can see how well your predictions worked.
We will randomly split the data into train and test subsets.
We will put 75% of the data in train and the remaining 25% in test.
n = cd.shape[0]
ntr = np.floor(.75*n).astype(int)
nte = n-ntr
print(f'\n ***n, n train, n test: {n}, {ntr}, {nte}\n')
***n, n train, n test: 1000, 750, 250
We can now use scikit-learn to get the train/test split.
cdtr, cdte = train_test_split(cd,random_state=99, test_size=.25) #always set the seed, Gretzky
print(cdtr.shape)
print(cdte.shape)
(750, 3)
(250, 3)
Let's see how we could do a train/test split in numpy just for fun.
## random number generator
rng = np.random.default_rng(seed=14) #Dave Keon
## sample ntr from 0:(n-1) without replacement
ii = rng.choice(n,ntr,replace=False) #train indices
cdtr1 = cd.iloc[ii,:] #train
# get the complement of the train indices
all_indices = np.arange(n)
comp_ii = np.setdiff1d(all_indices, ii) #test indices
cdte1 = cd.iloc[comp_ii,:] #test
print(cdtr1.shape)
print(cdte1.shape)
(750, 3)
(250, 3)
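Another common numpy idiom for the same split shuffles all the indices once with `rng.permutation` and then slices. Here is a sketch on a small toy DataFrame standing in for `cd`:

```python
import numpy as np
import pandas as pd

# toy frame standing in for cd (the real data has 1000 rows)
df = pd.DataFrame({'price': np.arange(20.0), 'mileage': 2.0 * np.arange(20)})

n = df.shape[0]                     # 20
ntr = int(np.floor(0.75 * n))       # 15

rng = np.random.default_rng(seed=14)
perm = rng.permutation(n)           # a random ordering of 0..n-1
tr_ii, te_ii = perm[:ntr], perm[ntr:]

df_tr = df.iloc[tr_ii]              # train rows
df_te = df.iloc[te_ii]              # test rows
print(df_tr.shape, df_te.shape)     # (15, 2) (5, 2)
```

Because `perm` is a permutation of all the indices, the head and tail slices are automatically complements of each other, so no `setdiff1d` call is needed.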
Let's plot mileage vs. price using the training data.
plt.scatter(cdtr['mileage'],cdtr['price'],s=.85)
plt.xlabel('mileage'); plt.ylabel('price')
Text(0, 0.5, 'price')
Another very common train/test split pattern you will often see applies when
we already have numpy arrays for y and X.
y = cd["price"].to_numpy()
y.shape
(1000,)
X = cd[["mileage"]].to_numpy()
X.shape
(1000, 1)
Xtr, Xte, ytr, yte = train_test_split(X,y,random_state=99, test_size=.25)
print(Xtr.shape)
print(Xte.shape)
print(ytr.shape)
print(yte.shape)
(750, 1)
(250, 1)
(750,)
(250,)
plt.scatter(Xtr,ytr,s=.85,c='green')
plt.xlabel("train data mileage"); plt.ylabel("train data price")
Text(0, 0.5, 'train data price')
Looks ok.
Linear regression of price on mileage with test rmse¶
Now we fit our model using the training data.
We will start by only using the feature mileage.
lm1 = LinearRegression(fit_intercept=True)
lm1.fit(Xtr,ytr) # fit on train
ypred = lm1.predict(Xte) # predict on test
plt.scatter(ypred,yte)
plt.plot(ypred,ypred,c='red')
[<matplotlib.lines.Line2D at 0x7cc050069a20>]
Not so great. In particular, you will lose a lot of credibility if you predict a negative price.
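A quick sanity check on any fitted line is to count how many predicted prices fall below zero. The sketch below uses synthetic data (made-up coefficients, not the cars data) since the exact count depends on the fit:

```python
import numpy as np
from sklearn.linear_model import LinearRegression

# synthetic stand-in for the cars data: mileage in thousands
rng = np.random.default_rng(0)
X = rng.uniform(0.0, 250.0, size=(300, 1))
y = 60.0 - 0.3 * X[:, 0] + rng.normal(0.0, 8.0, 300)  # high mileage pushes price below 0

lm = LinearRegression().fit(X, y)
yhat = lm.predict(X)
n_neg = int(np.sum(yhat < 0))   # how many predicted prices are negative?
print(n_neg, yhat.min())
```

A single straight line has no way to flatten out near zero, which is one motivation for the log transform tried below: predictions of exp(anything) are always positive.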
We can use a variety of numerical measures to summarize our predictive performance.
The most popular measure is root mean squared error.
def myrmse(y,yh):
    return math.sqrt(np.mean((y-yh)**2))
print(myrmse(yte,ypred))
print(math.sqrt(mean_squared_error(yte,ypred)))
11.647566500771871
11.647566500771871
The mean absolute error (often abbreviated MAE, or MAD for mean absolute deviation) is also used.
def mymad(y,yh):
    return np.mean(np.abs(y-yh))
mymad(yte,ypred)
9.212157680769527
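On a tiny hand-checkable example (an illustration, not the cars data) you can see how the two measures treat a single large error differently:

```python
import math
import numpy as np

def myrmse(y, yh):
    return math.sqrt(np.mean((y - yh)**2))

def mymad(y, yh):
    return np.mean(np.abs(y - yh))

y  = np.array([1.0, 2.0, 3.0, 4.0])
yh = np.array([1.0, 2.0, 3.0, 8.0])   # a single error of size 4

print(myrmse(y, yh))  # sqrt((0+0+0+16)/4) = 2.0
print(mymad(y, yh))   # (0+0+0+4)/4 = 1.0
```

Squaring before averaging makes rmse penalize the one big miss much more heavily than MAD does, which is worth keeping in mind when outliers are a concern.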
Is trim any good ?¶
Let's try using the categorical variable trim in addition to mileage.
We have to one hot encode trim.
one_hot = LabelBinarizer() #does the same thing as OneHotEncoder but a little simpler
tdumtr = one_hot.fit_transform(cdtr["trim"])
tdumte = one_hot.transform(cdte["trim"])
print(cdtr["trim"][:10])
print(tdumtr[:10,:])
644      550
389      550
999    other
125      550
792      550
464      550
676      550
606      550
319      500
466      550
Name: trim, dtype: object
[[0 0 1 0]
 [0 0 1 0]
 [0 0 0 1]
 [0 0 1 0]
 [0 0 1 0]
 [0 0 1 0]
 [0 0 1 0]
 [0 0 1 0]
 [0 1 0 0]
 [0 0 1 0]]
So the columns of tdumtr are dummies for trim equal 430, 500, 550, and other respectively.
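pandas can build the same kind of dummies directly with `get_dummies`; a sketch on a toy trim column (made-up values):

```python
import pandas as pd

trim = pd.Series(['550', '430', 'other', '500', '550'], name='trim')
dums = pd.get_dummies(trim).astype(int)   # one 0/1 column per level, in sorted order
print(dums.columns.tolist())              # ['430', '500', '550', 'other']
print(dums.to_numpy())
```

One caution: `get_dummies` builds columns from whatever levels it sees, so calling it separately on train and test can produce mismatched columns if a level is missing from one of them. Fitting `LabelBinarizer` (or `OneHotEncoder`) on train and then transforming test, as above, avoids that.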
Let's add the dummies to our mileage x corresponding to the model:
$$ price = \beta_0 + \beta_1 \, mileage + \beta_2 \, D_{500} + \beta_3 \, D_{550} + \beta_4 \, D_{other} + \epsilon $$
where, for example, $D_{500}$ is 1 if the car has trim 500 and 0 otherwise. The trim 430 level is the baseline: its dummy is dropped and its effect is absorbed by the intercept.
Xttr = np.hstack([cdtr.iloc[:,[1]].to_numpy(),tdumtr[:,1:4]])
Xtte = np.hstack([cdte.iloc[:,[1]].to_numpy(),tdumte[:,1:4]])
print(Xttr[:5])
print(cdte['mileage'].iloc[:5])
print(cdte['trim'].iloc[:5])
[[ 47.894   0.      1.      0.   ]
 [ 27.295   0.      1.      0.   ]
 [131.121   0.      0.      1.   ]
 [ 12.097   0.      1.      0.   ]
 [109.609   0.      1.      0.   ]]
890     90.451
983    116.299
107     29.047
609     77.556
113     96.530
Name: mileage, dtype: float64
890      550
983      500
107    other
609      550
113      500
Name: trim, dtype: object
Now let's run the regression of price on mileage and trim and then predict on test.
lm2 = LinearRegression(fit_intercept=True)
lm2.fit(Xttr,cdtr['price']) # fit on train !
ypredt = lm2.predict(Xtte) # predict on test !
## coefficients
lm2.coef_
array([-0.2670384 , -0.93794863, 13.62817025, 8.50519223])
Let's plot the predictions vs mileage with trim color coded.
convd = {'430':'black','500':'red','550':'green','other':'blue'}
cvec = [convd[cval] for cval in cdte['trim']]
plt.scatter(cdte['mileage'],ypredt,s=2.0,c=cvec)
plt.xlabel('mileage'); plt.ylabel('predictions using mileage and trim')
Text(0, 0.5, 'predictions using mileage and trim')
How about the test rmse and MAD?
print(myrmse(cdte["price"],ypredt))
print(mymad(cdte["price"],ypredt))
10.263742974901055
7.89483891583861
A little better than with just mileage !!
Log transform¶
Let's try logging price.
lprice = np.log(cdtr['price'])
plt.scatter(cdtr['mileage'],lprice,s=.85)
plt.xlabel('mileage'); plt.ylabel('log of price')
Text(0, 0.5, 'log of price')
Does this look any better ?
To predict price with this model we predict log price and then take exp.
lm3 = LinearRegression(fit_intercept=True)
lm3.fit(cdtr[['mileage']],lprice)
lypred = lm3.predict(cdte[['mileage']])
ypredl = np.exp(lypred)
plt.scatter(cdte['price'],ypredl)
plt.plot(ypredl,ypredl,c='red')
plt.xlabel('test price'); plt.ylabel('predictions')
Text(0, 0.5, 'predictions')
What do you think ?
Quadratic term¶
We can add mileage squared as a predictor !!
Xq = np.column_stack([cdtr['mileage'],cdtr['mileage']**2])
lm4 = LinearRegression(fit_intercept=True)
lm4.fit(Xq,cdtr['price'])
yhatq = lm4.predict(Xq)
plt.scatter(cdtr['mileage'],cdtr['price'],marker='o',c='black',s=.9)
plt.scatter(cdtr['mileage'],yhatq,marker=5,c='red',s=.9)
plt.xlabel('mileage'); plt.ylabel('price')
plt.title('train fit, quadratic model')
Text(0.5, 1.0, 'train fit, quadratic model')
What do you think ?
$$ price = \beta_0 + \beta_1 \, mileage + \beta_2 \, mileage^2 + \epsilon $$
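Instead of stacking the squared column by hand, scikit-learn's `PolynomialFeatures` can build it for us. A sketch on toy mileage values (made up, not the cars data):

```python
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

mileage = np.array([[10.0], [50.0], [100.0], [150.0]])   # toy mileages (thousands)
poly = PolynomialFeatures(degree=2, include_bias=False)
Xq = poly.fit_transform(mileage)          # columns: mileage, mileage**2
print(Xq[0])                              # [ 10. 100.]

y = np.array([50.0, 35.0, 22.0, 18.0])    # made-up prices
lm = LinearRegression(fit_intercept=True).fit(Xq, y)
print(lm.coef_.shape)                     # (2,): one slope per polynomial column
```

With `include_bias=False` the constant column is left out, since `LinearRegression(fit_intercept=True)` already supplies the intercept.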
Homework:
Use out of sample performance (a train/test split) to decide which of these two models is best:
- linear model of log(y) on mileage and trim
- linear model of y on mileage, mileage squared, and trim
Use out of sample rmse and graphics to compare the two models.
Naive Bayes with the spam data¶
Now let's turn to the binarized SMS spam data used in the naive Bayes notes. We read in the training features and the labels.
trainB = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/smsTrainBS.csv")
trainyB = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/smsTrainyB.csv")['smsTrainyB']
Let's check the proportions of y (ham=0, spam=1).
trainyB.value_counts()/len(trainyB)
smsTrainyB
0    0.864716
1    0.135284
Name: count, dtype: float64
Same as in the notes.
Let's check the P(age|y).
pd.crosstab(trainyB,trainB['age'])
| age | 0 | 1 |
|---|---|---|
| smsTrainyB | | |
| 0 | 3600 | 5 |
| 1 | 552 | 12 |
print(5/(3600+5))
print(12/(552+12))
0.0013869625520110957
0.02127659574468085
which is also the same as the $age table in the notes from the naiveBayes function.
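The per-class proportions computed by hand above can also come straight from `pd.crosstab` with `normalize='index'`; a sketch on toy labels (standing in for `trainyB` and `trainB['age']`):

```python
import pandas as pd

# toy labels: 3 ham (0) and 2 spam (1) messages
y   = pd.Series([0, 0, 0, 1, 1], name='smsTrainyB')
age = pd.Series([0, 1, 1, 0, 1], name='age')

tab = pd.crosstab(y, age, normalize='index')   # each row sums to 1: P(age | y)
print(tab)
```

Each row of `tab` is the conditional distribution of `age` given that class, which is exactly the quantity the hand division above produces.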
trainB does not have all the columns (terms) we used in the notes.
It just has the columns for four of the terms.
[x for x in trainB.columns]
['age', 'adult', 'call', 'know']
Homework
(a)
Using the same approach as in the notes, what is an estimate of P(y=spam | call = yes) ?
trainB['call'] is 1 if call=yes and 0 if call=no.
(b)
Using the same approach as in the notes and a naive Bayes assumption that whether or not call is in a message and whether or not know is in the message are independent given y, what is an estimate of P(y=spam | call=yes, know=yes) ?
(c)
What is P(y=spam | call=yes,know=yes) if you don’t assume call and know are conditionally independent?
Hint: You would use the first equation on the third slide of 9. Naive Bayes Classification instead of the second one.