Hello World Data Analysis in Python, Short version¶

imports¶

In [37]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn')  # note: matplotlib >= 3.6 renames this style to 'seaborn-v0_8'

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# IPython magic: display matplotlib plots inline in the notebook
%matplotlib inline

Read in data and get the variables you want¶

In [38]:
cd = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv")
cd = cd[['price','mileage','year']]
cd['price'] = cd['price']/1000
cd['mileage'] = cd['mileage']/1000
print(cd.head())  # head() prints the first few rows
    price  mileage  year
0  43.995   36.858  2008
1  44.995   46.883  2012
2  25.999  108.759  2007
3  33.880   35.187  2007
4  34.895   48.153  2007
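
A quick optional sanity check on the rescaling: both price and mileage should now be in thousands, and describe() summarizes each column.

print(cd.describe())  # count, mean, std, min/max for price, mileage, year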

Get y = price and X = (mileage, year) as NumPy ndarrays¶

In [39]:
X = cd[['mileage','year']].to_numpy()  #mileage and year columns as a numpy array
y = cd['price'].values #price as a numpy vector
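
A quick shape check, since sklearn expects a 2-D feature matrix and a 1-D target:

print(X.shape)  # (1000, 2): one row per car, columns mileage and year
print(y.shape)  # (1000,): price in $1000s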

plot¶

In [40]:
plt.scatter(X[:,0],y,s=20,c="blue",marker='o')
plt.xlabel("mileage")
plt.ylabel("price")
plt.title("mileage vs. price")
figure = plt.gcf()
figure.set_size_inches(10,8)
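
If you prefer to set the figure size up front, matplotlib's object-oriented interface does the same plot in one pass; an equivalent sketch:

fig, ax = plt.subplots(figsize=(10,8))  # size fixed at creation time
ax.scatter(X[:,0], y, s=20, c="blue", marker='o')
ax.set_xlabel("mileage")
ax.set_ylabel("price")
ax.set_title("mileage vs. price")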

Regress price on mileage¶

In [41]:
X1 = X[:,0] # first column of X is mileage
print(X1.shape)
X1 = X1.reshape((X.shape[0],1))
print(X1.shape)
(1000,)
(1000, 1)
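
Two equivalent idioms for the same reshape, if you prefer: reshape(-1,1) lets numpy infer the number of rows, and indexing with a list of columns keeps the array 2-D from the start.

X1 = X[:,0].reshape(-1,1)  # -1: numpy infers the row count
X1 = X[:,[0]]              # list index preserves the column axis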
In [42]:
lmmod1 = LinearRegression(fit_intercept=True) #model object
lmmod1.fit(X1,y) # (X1,y) is the training data
print("Model Slope:    ",lmmod1.coef_)
print("Model Intercept:",lmmod1.intercept_)
Model Slope:     [-0.34997452]
Model Intercept: 56.359784475930795
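
To see what this fit looks like, one way (a sketch using the model above) is to predict on a grid of mileage values and overlay the line on the scatter:

mgrid = np.linspace(X1.min(), X1.max(), 100).reshape(-1,1)  # mileage grid
plt.scatter(X1, y, s=20, c="blue")
plt.plot(mgrid, lmmod1.predict(mgrid), c="red", linewidth=3)  # fitted line
plt.xlabel("mileage")
plt.ylabel("price")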

Regress price on mileage and year¶

In [43]:
lmmod = LinearRegression(fit_intercept=True)
lmmod.fit(X,y) # (X,y) is the training data
print("Model Slopes:    ",lmmod.coef_)
print("Model Intercept:",lmmod.intercept_)
Model Slopes:     [-0.1537219   2.69434954]
Model Intercept: -5365.489872256993
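
To predict from the two-feature model, pass a 2-D array of (mileage, year) rows; the values below are made-up illustrative inputs:

xnew = np.array([[50.0, 2010]])  # hypothetical car: 50k miles, year 2010
print(lmmod.predict(xnew))       # predicted price, in $1000s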

Standard Regression Output¶

In [44]:
X2 = sm.add_constant(X)  # prepends a column of 1's to X for the intercept
print(X2[0:3,:]) # you can see the 1's
results = sm.OLS(y, X2).fit() #run the regression
print(results.summary()) # print out the usual summaries
[[1.00000e+00 3.68580e+01 2.00800e+03]
 [1.00000e+00 4.68830e+01 2.01200e+03]
 [1.00000e+00 1.08759e+02 2.00700e+03]]
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.832
Model:                            OLS   Adj. R-squared:                  0.832
Method:                 Least Squares   F-statistic:                     2477.
Date:                Mon, 16 Jan 2023   Prob (F-statistic):               0.00
Time:                        06:16:05   Log-Likelihood:                -3438.1
No. Observations:                1000   AIC:                             6882.
Df Residuals:                     997   BIC:                             6897.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -5365.4899    171.567    -31.273      0.000   -5702.164   -5028.816
x1            -0.1537      0.008    -18.435      0.000      -0.170      -0.137
x2             2.6943      0.085     31.602      0.000       2.527       2.862
==============================================================================
Omnibus:                      171.937   Durbin-Watson:                   2.021
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              294.618
Skew:                           1.076   Prob(JB):                     1.06e-64
Kurtosis:                       4.562   Cond. No.                     1.44e+06
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.44e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
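
If you want the regression output labeled by column name instead of x1/x2, statsmodels also has a formula interface that works directly on the DataFrame; a minimal equivalent sketch:

import statsmodels.formula.api as smf
results2 = smf.ols('price ~ mileage + year', data=cd).fit()  # intercept added automatically
print(results2.params)  # same estimates, labeled mileage and year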