In this notebook we will do a basic data analysis in python.
Our goal is to see how the price of a used car depends on characteristics of the car (features).
We will read in the data, summarize and plot it, and then fit linear regressions of price on mileage and year.
We need to import numpy, pandas, and matplotlib.pyplot (as np, pd, and plt).
numpy gives us vector/matrix/array operations, pandas gives us the "data frame" data structure,
and matplotlib.pyplot gives us graphics.
We also import LinearRegression from sklearn.linear_model to run the multiple regression.
We also import statsmodels.api (as sm) to get inference and summaries (e.g. R-squared, t-stats, p-values) for multiple regression.
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn')
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
#ipython magic function, helps display of plots in a notebook
%matplotlib inline
First we will read in the data from the file susedcars.csv on Rob's data page on bitbucket.
You could also read directly from a local file.
Use %pwd, %cd, and %ls to see your working directory, change your working directory, and list the files in your working directory.
%pwd
'/Users/robertmcculloch/Documents/play'
But, let's read directly from bitbucket.
cd = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv")
print("*** the type of cd is:")
print(type(cd))
print("***number number of rows and columns is: ",cd.shape)
print("***the column names are:")
print(cd.columns.values)
*** the type of cd is:
<class 'pandas.core.frame.DataFrame'>
***number of rows and columns is:  (1000, 7)
***the column names are:
['price' 'trim' 'isOneOwner' 'mileage' 'year' 'color' 'displacement']
Each of the 1,000 rows corresponds to a used car. price is what the car sold for. The other variables are features describing the car. Our goal is to relate the price to the other features.
We can pull one column (variable) out of the data frame by name.
temp = cd['mileage'] # pull out the variable mileage
temp[0:5] # print out the mileage of the first 5 cars, note the indexing!! [a,b)
0     36858.0
1     46883.0
2    108759.0
3     35187.0
4     48153.0
Name: mileage, dtype: float64
The feature mileage is a numeric variable with units miles. We can summarize it using the usual descriptive summaries:
cd['mileage'].describe() # summary statistics of variable mileage
count      1000.000000
mean      73652.408000
std       42887.422189
min        1997.000000
25%       40132.750000
50%       67919.500000
75%      100138.250000
max      255419.000000
Name: mileage, dtype: float64
The feature color is a categorical variable. Each car is in one of the color categories. We can't summarize a categorical variable the same way that we summarize a numeric variable. There is no "average" color. To summarize a categorical variable we simply count how many observations are in each category.
print(cd['color'][0:5]) # colors of first 5 cars
cd['color'].value_counts() # how many cars have each color
0    Silver
1     Black
2     White
3     Black
4     Black
Name: color, dtype: object

Black     415
other     227
Silver    213
White     145
Name: color, dtype: int64
Let's focus on the two numeric features mileage and year.
Our goal will be to see how price relates to mileage and year.
We will divide both price and mileage by 1,000 to make the results easier to understand.
cd = cd[['price','mileage','year']]
cd['price'] = cd['price']/1000
cd['mileage'] = cd['mileage']/1000
print(cd.head()) # head just prints out the first few rows
    price  mileage  year
0  43.995   36.858  2008
1  44.995   46.883  2012
2  25.999  108.759  2007
3  33.880   35.187  2007
4  34.895   48.153  2007
print(cd.describe()) #summarize each column
             price      mileage         year
count  1000.000000  1000.000000  1000.000000
mean     30.583318    73.652408  2006.939000
std      18.411018    42.887422     4.194624
min       0.995000     1.997000  1994.000000
25%      12.995000    40.132750  2004.000000
50%      29.800000    67.919500  2007.000000
75%      43.992000   100.138250  2010.000000
max      79.995000   255.419000  2013.000000
print(cd.corr()) #compute the correlation between each column
            price   mileage      year
price    1.000000 -0.815246  0.880537
mileage -0.815246  1.000000 -0.744729
year     0.880537 -0.744729  1.000000
Remember, a correlation is between -1 and 1.
The closer the correlation is to 1, the stronger the linear relationship between the variables, with a positive slope.
The closer the correlation is to -1, the stronger the linear relationship between the variables, with a negative slope.
So it looks like the bigger the mileage is, the lower the price of the car.
The bigger the year is, the higher the price of the car.
Makes sense!!
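For example, the price-mileage correlation in the table can also be computed directly with numpy's corrcoef (just a quick check):
print(np.corrcoef(cd['price'],cd['mileage'])[0,1]) # about -0.815, matching the table above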
We often use "y" to generically denote the variable we are trying to predict and "x" to denote the variables we can use to predict y.
In our example y=price and x=(mileage,year).
x=(mileage,year) is what we know about the car. Given this knowledge, what is our guess for the price of the car?
As we have done above, x is also often called the features.
x is also often called the independent variables.
Y (or y) is often called the dependent variable or the target.
Let's get a numpy array X whose 2 columns are the explanatory features mileage and year.
Let's also get a numpy array with just the target variable y = price.
X = cd[['mileage','year']].to_numpy() #mileage and year columns as a numpy array
print("*** type of X is",type(X))
print(X.shape) #number of rows and columns
print(X[0:4,:]) #first 4 rows
y = cd['price'].values #price as a numpy vector
print(f'length of y is {len(y)}')
print(y[:4]) #implicit start at 0
*** type of X is <class 'numpy.ndarray'>
(1000, 2)
[[  36.858 2008.   ]
 [  46.883 2012.   ]
 [ 108.759 2007.   ]
 [  35.187 2007.   ]]
length of y is 1000
[43.995 44.995 25.999 33.88 ]
Now let's plot year vs. price.
plt.scatter(X[:,1],y)
plt.xlabel("year")
plt.ylabel("price")
plt.title("year vs. price")
Text(0.5, 1.0, 'year vs. price')
And mileage vs. price.
Let's change the plot symbol, the size of the plotted symbol and the color of the plotted symbol.
We will also change the size of the figure.
plt.scatter(X[:,0],y,s=20,c="red",marker='o')
plt.xlabel("mileage")
plt.ylabel("price")
plt.title("mileage vs. price")
figure = plt.gcf()
figure.set_size_inches(10,8)
Clearly, price is related to both year and mileage.
Clearly, the relationship is not linear !!!
What we really want to learn is the joint relationship between price and the pair of variables (mileage,year) !!!
Essentially, the modern statistical tools of Machine Learning enable us to learn such relationships from data without making strong assumptions.
In the expression
$$ price = f(mileage, year) + \epsilon $$
we would like to know the function $f$.
Let's look at the histogram of price.
I seem to get a nicer histogram using seaborn.
p = sns.histplot(y,bins=20)
p.set_xlabel('price')
Text(0.5, 0, 'price')
You can do a lot of the plotting directly in pandas.
Xdf = cd[['mileage','year','price']]
Xdf.head()
|   | mileage | year | price |
|---|---------|------|-------|
| 0 | 36.858 | 2008 | 43.995 |
| 1 | 46.883 | 2012 | 44.995 |
| 2 | 108.759 | 2007 | 25.999 |
| 3 | 35.187 | 2007 | 33.880 |
| 4 | 48.153 | 2007 | 34.895 |
Xdf.plot.scatter(0,2,c="blue") #access columns 0 and 2 = mileage and price
<AxesSubplot:xlabel='mileage', ylabel='price'>
Xdf.plot.scatter('mileage','price',c="red",s=.5) # access columns using names
<AxesSubplot:xlabel='mileage', ylabel='price'>
You can also use integers to pick off rows and columns using iloc.
cd.columns.values
array(['price', 'mileage', 'year'], dtype=object)
XXdf = cd.iloc[:,[2,0]] #year and price
XXdf.head()
|   | year | price |
|---|------|-------|
| 0 | 2008 | 43.995 |
| 1 | 2012 | 44.995 |
| 2 | 2007 | 25.999 |
| 3 | 2007 | 33.880 |
| 4 | 2007 | 34.895 |
cd.iloc[0:3,[2,0]] #pick off rows and columns
|   | year | price |
|---|------|-------|
| 0 | 2008 | 43.995 |
| 1 | 2012 | 44.995 |
| 2 | 2007 | 25.999 |
Our goal is to relate y=price to x=(mileage,year).
Let's start simple and just use mileage.
Our simple linear regression model is
$$ price = \beta_0 + \beta_1 \, mileage + \epsilon $$
$\epsilon$ represents the part of price we cannot learn from mileage.
Let's get a numpy array with just one column for mileage.
I'll reshape it so that it is a two dimensional array instead of a vector.
X1 = X[:,0] # first column of X is mileage
print(X1.shape)
X1 = X1.reshape((X.shape[0],1))
print(X1.shape)
print(X1[:4])
(1000,)
(1000, 1)
[[ 36.858]
 [ 46.883]
 [108.759]
 [ 35.187]]
Ok, now let's fit our simple linear regression model using mileage.
We will use LinearRegression from sklearn.
It will be a two step process.
We first create the model object and then we call the fit method with the training data.
lmmod1 = LinearRegression(fit_intercept=True) #model object
lmmod1.fit(X1,y) # (X1,y) is the training data
print("Model Slope: ",lmmod1.coef_)
print("Model Intercept:",lmmod1.intercept_)
Model Slope:  [-0.34997452]
Model Intercept: 56.35978447593078
Our fitted model is $$ \hat{price} = 56.36 - 0.35 \, mileage $$
Let's plot the training data with the fitted line.
yhat = lmmod1.intercept_ + lmmod1.coef_ * X1
plt.scatter(X1,y,c='blue',s=.7,label='data')
plt.plot(X1,yhat,c='red',label='linear fit')
plt.legend()
plt.xlabel('x=mileage'); plt.ylabel('y=price')
plt.title("mileage vs price with linear fit")
Text(0.5, 1.0, 'mileage vs price with linear fit')
Pretty bad!!
Now let's run a linear regression of price on mileage and year.
Our model is:
$$ price = \beta_0 + \beta_1 \, mileage + \beta_2 \, year + \epsilon $$
This model assumes a linear relationship.
We already suspect this may be a bad idea !!!
Let's go ahead and fit the model.
Fitting the model to data will give us estimates of the parameters $(\beta_0,\beta_1,\beta_2)$.
The error term $\epsilon$ represents the part of price we cannot know from (mileage,year).
lmmod = LinearRegression(fit_intercept=True)
lmmod.fit(X,y) # (X,y) is the training data
print("Model Slopes: ",lmmod.coef_)
print("Model Intercept:",lmmod.intercept_)
Model Slopes:  [-0.1537219   2.69434954]
Model Intercept: -5365.489872256993
Note that there does not seem to be a simple regression summary in sklearn. Maybe that is a good thing !!!!.
So, the fitted relationship is
$$
price = -5365.49 - 0.154 \, mileage + 2.7 \, year
$$
If we just use one feature, we can visualize the regression fit in a very simple way.
Let's have a quick look at the lmmod object.
print(lmmod)
LinearRegression()
Above we see the basic attributes you can set when using LinearRegression.
Some are obvious; for example, fit_intercept controls whether or not an intercept is included in the regression.
For others you could (i) try ?lmmod, (ii) read the sklearn documentation, (iii) google it, or (iv) read a book.
dir(lmmod) #you can always find out a lot about an object with dir() !!
['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_check_feature_names', '_check_n_features', '_decision_function', '_estimator_type', '_get_param_names', '_get_tags', '_more_tags', '_preprocess_data', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_residues', '_set_intercept', '_validate_data', 'coef_', 'copy_X', 'fit', 'fit_intercept', 'get_params', 'intercept_', 'n_features_in_', 'n_jobs', 'normalize', 'positive', 'predict', 'rank_', 'score', 'set_params', 'singular_']
Some of the things in lmmod are attributes (data structures) and some are methods (functions).
print(type(lmmod.coef_))
type(lmmod.set_params)
<class 'numpy.ndarray'>
method
So, you could do ?lmmod.set_params at a python prompt.
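Outside of IPython, help() does the same job, and get_params() (part of the standard sklearn estimator interface) returns the current hyperparameter settings as a dictionary; a quick sketch:
print(lmmod.get_params()) # e.g. {'copy_X': True, 'fit_intercept': True, ...} depending on the sklearn version
help(lmmod.set_params)    # plain-python equivalent of ?lmmod.set_params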
Let's get the fitted values.
For each observation in our data set the fits are $$ \hat{price}_i = -5365.49 - 0.154 \, mileage_i + 2.7 \, year_i, \;\; i=1,2,\ldots,n. $$
You can think of the fit as the predicted price given the values of mileage and year, according to the model.
Notice that the coefficient for mileage is different from the one we got in the simple linear regression of price on mileage alone !!
yhat = lmmod.predict(X)
print("the length of yhat is",len(yhat))
print("the type of yhat is:")
print(type(yhat))
the length of yhat is 1000
the type of yhat is:
<class 'numpy.ndarray'>
plt.scatter(y,yhat,s=.8)
plt.plot(y,y,c='red') #add the line
plt.xlabel("y"); plt.ylabel("yhat")
Text(0, 0.5, 'yhat')
Clearly, the linear fit is really bad !!!
Machine Learning will enable us to get it right fairly automatically.
Let's get predictions for x not in our training data.
We will make a numpy array whose rows have the x values we want to predict at.
Xp = np.array([[40,2010],[100,2004]],dtype=float)
print(Xp)
print(type(Xp))
print(Xp.dtype)
[[  40. 2010.]
 [ 100. 2004.]]
<class 'numpy.ndarray'>
float64
So, the first car has 40 (thousand) miles on it and is a 2010, while
the second car has 100 (thousand) miles on it and is a 2004.
Clearly, we expect the second car to sell for less!
ypred = lmmod.predict(Xp)
print(ypred)
[44.00383414 18.61442272]
So we predict (based on the linear model) that the first car will sell for 44 (thousand) and the second car will sell for 18.6 (thousand).
Let's check the first one "by hand".
Model Slopes: [-0.1537219 2.69434954]
Model Intercept: -5365.489872256993
So the prediction for the first car in Xp should be:
-5365.49 - .1537*40 + 2.69434954*2010
44.00457540000025
which matches (up to the rounding of the coefficients).
The data we used to "fit" our model is called the training data.
When we look at predictions for observations in the training data (as we did for yhat) we say we are looking at in-sample fits.
When we predict at observations not in the training data (as we did for ypred), then we are predicting out of sample.
Out-of-sample prediction is always the more interesting test, since the model has not seen those observations.
When you predict in-sample, the training data has already shown the model an example of what can happen at those feature values.
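As a sketch of how you could actually test a model out of sample (this split is not done in this notebook; the 25% split, random_state=99, and the names Xtr, Xte, ytr, yte, lm_oos are just illustrative choices), sklearn's train_test_split can hold some cars out of the training data:
from sklearn.model_selection import train_test_split
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.25, random_state=99) # hold out 25% of the cars
lm_oos = LinearRegression().fit(Xtr, ytr)                # fit using only the training part
rmse = np.sqrt(np.mean((yte - lm_oos.predict(Xte))**2))  # out-of-sample root mean squared error
print(rmse)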
Linear Regression is a basic model.
There are many modeling approaches in Machine Learning !!
scikit-learn has a nice general approach to working with models:
* a model will have a set of hyperparameters (e.g. lmmod.fit_intercept)
* given the hyperparameters, the model can learn from training data (e.g. lmmod.fit(X,y))
* once a model has learned, it can make predictions (e.g. lmmod.predict(Xp))
All the predictive models in scikit-learn use this basic setup.
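For example, a completely different model follows the same create / fit / predict pattern (KNeighborsRegressor and n_neighbors=10 are just illustrative choices here, not part of the analysis above):
from sklearn.neighbors import KNeighborsRegressor
knnmod = KNeighborsRegressor(n_neighbors=10) # hyperparameter: the number of neighbors
knnmod.fit(X,y)                              # learn from the training data
print(knnmod.predict(Xp))                    # predict at the two new cars in Xp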
From our linear regression fit using sklearn, we got estimates for the parameters.
Often we want to know a lot more about the model fit.
In particular, we might want to know the standard errors associated with the parameter estimates.
To get the usual regression output we can use the python package statsmodels, imported above as sm.
X = sm.add_constant(X) #appends 1 to beginning of each row for the intercept
print(X[0:3,:]) # you can see the 1's
results = sm.OLS(y, X).fit() #run the regression
print(results.summary()) # print out the usual summaries
[[1.00000e+00 3.68580e+01 2.00800e+03]
 [1.00000e+00 4.68830e+01 2.01200e+03]
 [1.00000e+00 1.08759e+02 2.00700e+03]]
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.832
Model:                            OLS   Adj. R-squared:                  0.832
Method:                 Least Squares   F-statistic:                     2477.
Date:                Wed, 18 Jan 2023   Prob (F-statistic):               0.00
Time:                        10:59:38   Log-Likelihood:                -3438.1
No. Observations:                1000   AIC:                             6882.
Df Residuals:                     997   BIC:                             6897.
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -5365.4899    171.567    -31.273      0.000   -5702.164   -5028.816
x1            -0.1537      0.008    -18.435      0.000      -0.170      -0.137
x2             2.6943      0.085     31.602      0.000       2.527       2.862
==============================================================================
Omnibus:                      171.937   Durbin-Watson:                   2.021
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              294.618
Skew:                           1.076   Prob(JB):                     1.06e-64
Kurtosis:                       4.562   Cond. No.                     1.44e+06
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.44e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
Lots of junk!!
In particular, the standard error associated with the estimate of the slope for mileage is .008.
The (approximate 95%) confidence interval for $\beta_1$, the mileage slope, is the estimate plus or minus two standard errors:
-0.1537 + np.array([-2,2])*0.008
array([-0.1697, -0.1377])
Recall that $R^2$ is the square of the correlation between $y$ and $\hat{y}$:
np.corrcoef(y,yhat)
array([[1. , 0.91238967], [0.91238967, 1. ]])
.91239**2
0.8324555121
This is the same as the R-squared in the regression output.
Let's compare to the R-squared with just mileage.
yhat1 = lmmod1.predict(X1)
np.corrcoef(y,yhat1)
array([[1. , 0.81524579], [0.81524579, 1. ]])
.815**2
0.664225
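Another way to get R-squared is the score method that every sklearn regressor has (a quick check, assuming lmmod1 and lmmod are still the fitted objects from above):
print(lmmod1.score(X1,y))     # R-squared using mileage only, about 0.66
print(lmmod.score(X[:,1:],y)) # about 0.83; note X now starts with the constant column added above, so we drop it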
Let's write our multiple regression model using vector/matrix notation and use basic matrix operations to check the predicted and fitted values.
The general multiple regression model is written:
$$ Y_i = \beta_0 + \beta_1 x_{i1} + \beta_2 x_{i2} + \ldots + \beta_p x_{ip} + \epsilon_i, \; i=1,2,\ldots,n, $$
where $i$ indexes observations and $x_{ij}$ is the value of the $j^{th}$ $x$ in the $i^{th}$ observation.
If we let
\begin{equation} x_i = \left[ \begin{array}{c} 1 \\ x_{i1} \\ x_{i2} \\ \vdots \\ x_{ip} \end{array} \right], \; X= \left[ \begin{array}{c} x_1' \\ x_2' \\ \vdots \\ x_n' \end{array} \right], \;\; y = \left[ \begin{array}{c} y_1 \\ y_2 \\ \vdots \\ y_n \end{array} \right], \;\; \epsilon = \left[ \begin{array}{c} \epsilon_1 \\ \epsilon_2 \\ \vdots \\ \epsilon_n \end{array} \right], \;\; \beta = \left[ \begin{array}{c} \beta_0 \\ \beta_1 \\ \beta_2 \\ \vdots \\ \beta_p \end{array} \right] \end{equation},
then we can write the model in matrix form:
$$ y = X \beta + \epsilon. $$
In our data, the first three rows of $X$ are
X[0:3,:]
array([[1.00000e+00, 3.68580e+01, 2.00800e+03],
       [1.00000e+00, 4.68830e+01, 2.01200e+03],
       [1.00000e+00, 1.08759e+02, 2.00700e+03]])
These correspond to the first three rows of our data frame cd:
cd.iloc[0:3,1:3]
|   | mileage | year |
|---|---------|------|
| 0 | 36.858 | 2008 |
| 1 | 46.883 | 2012 |
| 2 | 108.759 | 2007 |
Given our estimates:
\begin{equation} \hat{\beta} = \left[ \begin{array}{c} \hat{\beta}_0 \\ \hat{\beta}_1 \\ \vdots \\ \hat{\beta}_p \end{array} \right] \end{equation}
we can get fitted values or predictions by matrix multiplication:
$$ \hat{y} = X \, \hat{\beta}, \;\; \mbox{or}, \;\; \hat{y}_p = X_p \, \hat{\beta}. $$
In our example,
bhat = np.hstack([lmmod.intercept_,lmmod.coef_])[:,np.newaxis]
print(bhat.shape)
bhat
(3, 1)
array([[-5.36548987e+03],
       [-1.53721903e-01],
       [ 2.69434954e+00]])
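As an aside, the same numbers come straight from the least squares normal equations, $\hat{\beta} = (X'X)^{-1} X'y$. A quick check (assuming X is still the version with the column of ones prepended by sm.add_constant above):
bhat_check = np.linalg.solve(X.T @ X, X.T @ y) # solve (X'X) b = X'y
print(bhat_check)                              # should match the intercept and slopes in bhat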
So we can get our predictions by multiplying $X_p$ times $\hat{\beta}$.
But first we have to add the column of ones:
Xpp = np.hstack([np.ones((2,1)),Xp])
print("Xp:\n",Xp)
print("Xpp:\n",Xpp)
Xp:
 [[  40. 2010.]
 [ 100. 2004.]]
Xpp:
 [[1.000e+00 4.000e+01 2.010e+03]
 [1.000e+00 1.000e+02 2.004e+03]]
Now we can matrix multiply $X_{pp}$ times $\hat{\beta}$:
yhatp = Xpp @ bhat # Xpp * bhat, matrix multiplication
yhatp
array([[44.00383414],
       [18.61442272]])
This is the same as what we got using the predict method on the lmmod object.
Let's get the in-sample fitted values by multiplying $X \hat{\beta}$:
yhatm = X @ bhat
print(yhatm[0:3,:])
print(yhat[0:3]) #got these ones using the predict method
[[39.09812927]
 [48.33446537]
 [25.3510212 ]]
[39.09812927 48.33446537 25.3510212 ]
dyhat = yhatm.flatten() - yhat
dyhat.shape
(1000,)
dyhat.mean()
6.192824031359123e-15
dyhat.var()
1.5184132517201138e-25
dyhat has mean and variance that are essentially zero (just floating point error), so it must be all zeros.
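A more direct check is numpy's allclose, which tests equality up to floating point tolerance:
print(np.allclose(yhatm.flatten(),yhat)) # True: the two sets of fitted values agree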
Just for fun we can plot yhat vs yhatm:
plt.scatter(yhat,yhatm)
plt.scatter(yhat,yhat,color='red',s=.5)
plt.show()