LASSO, Polynomials and the cars data

Let’s use the LASSO and polynomial terms to fit the cars data (susedcars.csv).
We’ll use just mileage and year.

I'll show some basic results I got, but see if you can get a nice model for
y = price given x = (mileage, year) using polynomials of the features and regularization.

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
In [27]:
## read in the data and pull off price, mileage, and year.
cd = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv")
cd = cd[['price','mileage','year']]
cd['price']  /= 1000   # price in thousands of dollars
cd['mileage']  /= 1000 # mileage in thousands of miles
print(cd.head())
    price  mileage  year
0  43.995   36.858  2008
1  44.995   46.883  2012
2  25.999  108.759  2007
3  33.880   35.187  2007
4  34.895   48.153  2007
In [28]:
## convert to numpy
X = cd[['mileage','year']].to_numpy()  #mileage and year columns as a numpy array
print("*** type of X is",type(X))
print(X.shape) #number of rows and columns
print(X[0:4,:]) #first 4 rows
y = cd['price'].values #price as a numpy vector
print(f'length of y is {len(y)}')
print(y[:4]) #implicit start at 0
*** type of X is <class 'numpy.ndarray'>
(1000, 2)
[[  36.858 2008.   ]
 [  46.883 2012.   ]
 [ 108.759 2007.   ]
 [  35.187 2007.   ]]
length of y is 1000
[43.995 44.995 25.999 33.88 ]
In [29]:
## run the linear model (which is not so great)

lmmod = LinearRegression(fit_intercept=True)
lmmod.fit(X,y) # (X,y) is the training data
print("Model Slopes:    ",lmmod.coef_)
print("Model Intercept:",lmmod.intercept_)
Model Slopes:     [-0.1537219   2.69434954]
Model Intercept: -5365.489872256993
In [30]:
yhat = lmmod.predict(X)

plt.scatter(y,yhat,s=.8)
plt.plot(y,y,c='red') #add the line
plt.xlabel("y"); plt.ylabel("yhat")
Out[30]:
Text(0, 0.5, 'yhat')
[Figure: scatter of yhat against y with the red y = y line]
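One way to quantify "not so great" is the in-sample RMSE; a quick check I'm adding here (the rmse name is my own helper):

## in-sample root mean squared error for the plain linear fit
rmse = np.sqrt(np.mean((y - yhat)**2))
print(f'in-sample rmse: {rmse:.2f}')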

Previously we saw that adding mileage squared helped the model.

But if you look at the predictions for large values of mileage,
the predicted price actually went up as the mileage went up. That does not sound right!!

Let’s try throwing in a lot of polynomial terms using both mileage and year.
We can easily create the polynomial terms by hand, but both R and sklearn have ways to create polynomials.

Let's look at how the sklearn PolynomialFeatures works with just degree 2.

In [31]:
from sklearn.preprocessing import PolynomialFeatures
p2 = PolynomialFeatures(2,include_bias=False)
X2 = p2.fit_transform(X)
print(X2.shape)
X2[:5,:]
(1000, 5)
Out[31]:
array([[3.68580000e+01, 2.00800000e+03, 1.35851216e+03, 7.40108640e+04,
        4.03206400e+06],
       [4.68830000e+01, 2.01200000e+03, 2.19801569e+03, 9.43285960e+04,
        4.04814400e+06],
       [1.08759000e+02, 2.00700000e+03, 1.18285201e+04, 2.18279313e+05,
        4.02804900e+06],
       [3.51870000e+01, 2.00700000e+03, 1.23812497e+03, 7.06203090e+04,
        4.02804900e+06],
       [4.81530000e+01, 2.00700000e+03, 2.31871141e+03, 9.66430710e+04,
        4.02804900e+06]])
In [32]:
plt.scatter(X2[:,0],X2[:,2]) # mileage against mileage squared: almost a deterministic curve
Out[32]:
<matplotlib.collections.PathCollection at 0x774477e55d20>
[Figure: scatter of mileage^2 (column 2) against mileage (column 0)]

The five columns are x1, x2, x1^2, x1*x2, and x2^2.
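You can confirm that ordering from the transformer itself; a quick check (assuming sklearn 1.0+, where get_feature_names_out is available):

## column order of the degree-2 expansion
print(p2.get_feature_names_out(['mileage', 'year']))
## ['mileage' 'year' 'mileage^2' 'mileage year' 'year^2']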

In [33]:
## pretty high correlations
pd.DataFrame(X2).corr()
Out[33]:
          0         1         2         3         4
0  1.000000 -0.744729  0.946540  0.999994 -0.744929
1 -0.744729  1.000000 -0.660500 -0.743045  0.999999
2  0.946540 -0.660500  1.000000  0.946143 -0.660511
3  0.999994 -0.743045  0.946143  1.000000 -0.743247
4 -0.744929  0.999999 -0.660511 -0.743247  1.000000
In [34]:
# maybe better with demeaned features: for a positive, skewed variable like mileage,
# x and x^2 are nearly perfectly correlated, and centering breaks that up
Xm = X.mean(0)
Xdm = X - Xm
Xdm2 = p2.fit_transform(Xdm)
pd.DataFrame(Xdm2).corr()
Out[34]:
          0         1         2         3         4
0  1.000000 -0.744729  0.500553 -0.365394  0.241734
1 -0.744729  1.000000 -0.253579  0.303090 -0.500058
2  0.500553 -0.253579  1.000000 -0.786721  0.373183
3 -0.365394  0.303090 -0.786721  1.000000 -0.716638
4  0.241734 -0.500058  0.373183 -0.716638  1.000000
In [35]:
plt.scatter(Xdm2[:,0],Xdm2[:,2]) # after demeaning the parabola is visible
Out[35]:
<matplotlib.collections.PathCollection at 0x7744764ec490>
[Figure: scatter of demeaned mileage squared against demeaned mileage]
In [36]:
# I'm going to try degree 5!!
poly = PolynomialFeatures(5,include_bias=False)  ## gives the C(5+2,2)-1 = 20 monomials up to degree 5 in 2 variables (bias excluded)
Xp = poly.fit_transform(Xdm)
print(Xp.shape)
(1000, 20)

If you want to create the polynomial terms by hand, you might try just
subtracting the mean from the x variables before you transform them.
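For example, a by-hand degree-3 version could look like the sketch below (the column names m, yr, and Xhand are my own; PolynomialFeatures(3, include_bias=False) on Xdm generates these same terms in this order):

## by-hand polynomial terms from the demeaned features (sketch)
m = Xdm[:,0]   # demeaned mileage
yr = Xdm[:,1]  # demeaned year
Xhand = np.column_stack([m, yr, m**2, m*yr, yr**2, m**3, m**2*yr, m*yr**2, yr**3])
print(Xhand.shape)  # (1000, 9): all terms up to degree 3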

Try putting in "a lot" of polynomial terms,
use the LASSO to regularize the model, and see how good an out-of-sample rmse you can get.
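Here is a minimal sketch of one way to set that up (the split fraction, degree, CV folds, and names like lasso_pipe are my choices, not the original run): polynomial terms on the demeaned features, standardized, with LassoCV choosing the penalty by cross-validation.

## sketch: out-of-sample rmse for a cross-validated LASSO on polynomial features
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.pipeline import make_pipeline

Xtr, Xte, ytr, yte = train_test_split(Xdm, y, test_size=0.25, random_state=34)
lasso_pipe = make_pipeline(
    PolynomialFeatures(5, include_bias=False),  # lots of polynomial terms
    StandardScaler(),                           # put the terms on one scale before shrinking
    LassoCV(cv=10, max_iter=100000),            # cross-validation picks the penalty
)
lasso_pipe.fit(Xtr, ytr)
rmse = np.sqrt(np.mean((yte - lasso_pipe.predict(Xte))**2))
print(f'out-of-sample rmse: {rmse:.2f}')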

Using demeaned features and degrees bigger than 2, I got the in-sample fit depicted below.
I used cross-validation to get good LASSO shrinkage and selection and then fit using all the data.

Maybe you should leave out some test data!! (optional, your call)

Below, yhatP is what I got from just a linear regression on all the polynomial terms and yhatL is from the LASSO.
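Continuing from the sketch above (so StandardScaler, LassoCV, and make_pipeline are already imported), here is one way the two in-sample fits might be produced; the names and plotting layout are mine:

## sketch: refit on all the data, least squares vs LASSO on the degree-5 terms
yhatP = LinearRegression().fit(Xp, y).predict(Xp)
yhatL = make_pipeline(StandardScaler(), LassoCV(cv=10, max_iter=100000)).fit(Xp, y).predict(Xp)

fig, ax = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
for a, yh, ttl in zip(ax, [yhatP, yhatL], ['yhatP: linear regression', 'yhatL: LASSO']):
    a.scatter(y, yh, s=.8); a.plot(y, y, c='red'); a.set_xlabel('y'); a.set_title(ttl)
ax[0].set_ylabel('yhat')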

In [37]:
from IPython.display import Image
Image(filename='Figure_1.png')
Out[37]:
[Image: Figure_1.png, the in-sample fit described above]
In [38]:
Image(filename='./py_price-lasso.png')
Out[38]:
[Image: py_price-lasso.png]