import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn')  # note: matplotlib >= 3.6 renames this style 'seaborn-v0_8'
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
# IPython magic: renders matplotlib plots inline in the notebook
%matplotlib inline
cd = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv")
cd = cd[['price','mileage','year']]
cd['price'] = cd['price']/1000
cd['mileage'] = cd['mileage']/1000
print(cd.head()) # head() returns the first few rows
    price  mileage  year
0  43.995   36.858  2008
1  44.995   46.883  2012
2  25.999  108.759  2007
3  33.880   35.187  2007
4  34.895   48.153  2007
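# Before modeling, a quick sanity check is cheap (a suggested extra step using
# standard pandas calls; not part of the original analysis):
print(cd.describe())    # summary stats for price, mileage, year
print(cd.isna().sum())  # confirm there are no missing values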
X = cd[['mileage','year']].to_numpy() #mileage and year columns as a numpy array
y = cd['price'].values #price as a numpy vector
plt.scatter(X[:,0],y,s=20,c="blue",marker='o')
plt.xlabel("mileage")
plt.ylabel("price")
plt.title("mileage vs. price")
figure = plt.gcf()
figure.set_size_inches(10,8)
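# An equivalent, arguably more idiomatic way to size the figure is to create
# it up front (a sketch using matplotlib's object-oriented API):
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(X[:, 0], y, s=20, c="blue", marker='o')
ax.set(xlabel="mileage", ylabel="price", title="mileage vs. price")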
X1 = X[:,0] # first column of X is mileage
print(X1.shape)
X1 = X1.reshape((X.shape[0],1))
print(X1.shape)
(1000,)
(1000, 1)
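# The same reshape, written so numpy infers the row count; sklearn wants a
# 2-D feature matrix, so (1000,) must become (1000, 1):
X1 = X[:, 0].reshape(-1, 1)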
lmmod1 = LinearRegression(fit_intercept=True) #model object
lmmod1.fit(X1,y) # (X1,y) is the training data
print("Model Slope: ",lmmod1.coef_)
print("Model Intercept:",lmmod1.intercept_)
Model Slope:  [-0.34997452]
Model Intercept: 56.359784475930795
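# With the model fit, predict() returns fitted prices. The 50 (thousand
# miles) below is an illustrative input, not a value from the data:
print(lmmod1.predict(np.array([[50.0]])))  # 56.36 - 0.35*50, about 38.9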
lmmod = LinearRegression(fit_intercept=True)
lmmod.fit(X,y) # (X,y) is the training data
print("Model Slopes: ",lmmod.coef_)
print("Model Intercept:",lmmod.intercept_)
Model Slopes:  [-0.1537219   2.69434954]
Model Intercept: -5365.489872256993
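# Prediction now needs both features, in training order (mileage, year);
# the input row here is illustrative:
print(lmmod.predict(np.array([[50.0, 2008]])))  # about 37.1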
X2 = sm.add_constant(X) # prepends a column of 1's to each row for the intercept
print(X2[0:3,:]) # you can see the 1's
results = sm.OLS(y, X2).fit() #run the regression
print(results.summary()) # print out the usual summaries
[[1.00000e+00 3.68580e+01 2.00800e+03]
 [1.00000e+00 4.68830e+01 2.01200e+03]
 [1.00000e+00 1.08759e+02 2.00700e+03]]
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.832
Model:                            OLS   Adj. R-squared:                  0.832
Method:                 Least Squares   F-statistic:                     2477.
Date:                Mon, 16 Jan 2023   Prob (F-statistic):               0.00
Time:                        06:16:05   Log-Likelihood:                -3438.1
No. Observations:                1000   AIC:                             6882.
Df Residuals:                     997   BIC:                             6897.
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -5365.4899    171.567    -31.273      0.000   -5702.164   -5028.816
x1            -0.1537      0.008    -18.435      0.000      -0.170      -0.137
x2             2.6943      0.085     31.602      0.000       2.527       2.862
==============================================================================
Omnibus:                      171.937   Durbin-Watson:                   2.021
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              294.618
Skew:                           1.076   Prob(JB):                     1.06e-64
Kurtosis:                       4.562   Cond. No.                     1.44e+06
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.44e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
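# The pieces of the summary are also available programmatically via standard
# statsmodels attributes, which is handier than parsing the printed table:
print(results.params)      # const, mileage, year coefficients
print(results.rsquared)    # 0.832, matching the table above
print(results.conf_int())  # 95% intervals, one row per coefficient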