# Read the midcity housing data from its URL into a data frame.
hd <- read.csv(file = "http://www.rob-mcculloch.org/data/midcity.csv")
# Report the number of rows and columns.
print(dim(hd))
## [1] 128 8
# Peek at the first few rows.
head(hd)
## Home Nbhd Offers SqFt Brick Bedrooms Bathrooms Price
## 1 1 2 2 1790 No 2 2 114300
## 2 2 2 3 2030 No 4 2 114200
## 3 3 2 1 1740 No 3 2 114800
## 4 4 2 3 1980 No 3 2 94700
## 5 5 2 3 2130 No 3 3 119800
## 6 6 1 2 1780 No 3 2 114600
Let’s transform price and size to be in thousands.
# Add rescaled copies of Price and SqFt (each divided by 1000) as new columns.
hd[["price"]] <- hd[["Price"]] / 1000
hd[["size"]] <- hd[["SqFt"]] / 1000
# Confirm the two new columns were appended.
names(hd)
## [1] "Home" "Nbhd" "Offers" "SqFt" "Brick" "Bedrooms"
## [7] "Bathrooms" "Price" "price" "size"
Ok, now let’s regress price on size, Bedrooms, and Bathrooms.
# Least-squares fit of price on the three numeric predictors.
lrfit <- lm(price ~ size + Bedrooms + Bathrooms, data = hd)
# Coefficient table, residual summary, and fit statistics.
summary(lrfit)
##
## Call:
## lm(formula = price ~ size + Bedrooms + Bathrooms, data = hd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53.71 -15.63 -0.24 13.85 49.36
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.641 17.200 -0.328 0.743504
## size 35.643 10.667 3.341 0.001102 **
## Bedrooms 10.460 2.912 3.592 0.000472 ***
## Bathrooms 13.546 4.219 3.211 0.001685 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.36 on 124 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.426
## F-statistic: 32.42 on 3 and 124 DF, p-value: 1.535e-15
Let’s add Brick and Nbhd.
First we make them categorical variables.
# Convert Brick and Nbhd to factors so they are treated as categorical.
hd[c("Brick", "Nbhd")] <- lapply(hd[c("Brick", "Nbhd")], as.factor)
# Factor columns are now summarised as level counts.
summary(hd)
## Home Nbhd Offers SqFt Brick
## Min. : 1.00 1:44 Min. :1.000 Min. :1450 No :86
## 1st Qu.: 32.75 2:45 1st Qu.:2.000 1st Qu.:1880 Yes:42
## Median : 64.50 3:39 Median :3.000 Median :2000
## Mean : 64.50 Mean :2.578 Mean :2001
## 3rd Qu.: 96.25 3rd Qu.:3.000 3rd Qu.:2140
## Max. :128.00 Max. :6.000 Max. :2590
## Bedrooms Bathrooms Price price
## Min. :2.000 Min. :2.000 Min. : 69100 Min. : 69.1
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:111325 1st Qu.:111.3
## Median :3.000 Median :2.000 Median :125950 Median :126.0
## Mean :3.023 Mean :2.445 Mean :130427 Mean :130.4
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:148250 3rd Qu.:148.2
## Max. :5.000 Max. :4.000 Max. :211200 Max. :211.2
## size
## Min. :1.450
## 1st Qu.:1.880
## Median :2.000
## Mean :2.001
## 3rd Qu.:2.140
## Max. :2.590
Notice how the summary now makes sense for Brick and Nbhd: it reports counts for each level.
Now let’s run the regression.
R will automatically create dummy (0/1 indicator) variables for the categorical variables.
# Refit with the two factor predictors added; lm() expands each factor
# into indicator columns (BrickYes, Nbhd2, Nbhd3 in the summary below).
lrfit1 <- lm(
  price ~ size + Bedrooms + Bathrooms + Brick + Nbhd,
  data = hd
)
summary(lrfit1)
##
## Call:
## lm(formula = price ~ size + Bedrooms + Bathrooms + Brick + Nbhd,
## data = hd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.008 -7.323 -0.119 7.819 33.392
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.919 10.474 1.711 0.08967 .
## size 35.930 6.404 5.610 1.30e-07 ***
## Bedrooms 1.902 1.902 1.000 0.31933
## Bathrooms 6.827 2.563 2.664 0.00878 **
## BrickYes 18.508 2.396 7.723 3.65e-12 ***
## Nbhd2 4.866 2.722 1.788 0.07633 .
## Nbhd3 34.084 3.169 10.755 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.15 on 121 degrees of freedom
## Multiple R-squared: 0.805, Adjusted R-squared: 0.7954
## F-statistic: 83.27 on 6 and 121 DF, p-value: < 2.2e-16
# Scatter plot of residuals against fitted values for lrfit1.
plot(
  x = fitted(lrfit1),
  y = residuals(lrfit1),
  xlab = "fits",
  ylab = "resids",
  col = "blue"
)
Suppose you have a data frame with just y and all the x’s you want to regress y on.
Let’s make a data frame like that.
# List the available columns before choosing which ones to keep.
names(hd)
## [1] "Home" "Nbhd" "Offers" "SqFt" "Brick" "Bedrooms"
## [7] "Bathrooms" "Price" "price" "size"
# Keep only the response (price) and the predictors. Columns are selected by
# name rather than by position, so the subset stays correct if the column
# order of hd ever changes.
hds <- hd[, c("Nbhd", "Brick", "Bedrooms", "Bathrooms", "price", "size")]
head(hds)
## Nbhd Brick Bedrooms Bathrooms price size
## 1 2 No 2 2 114.3 1.79
## 2 2 No 4 2 114.2 2.03
## 3 2 No 3 2 114.8 1.74
## 4 2 No 3 2 94.7 1.98
## 5 2 No 3 3 119.8 2.13
## 6 1 No 3 2 114.6 1.78
Now I can run the regression above without having to spell out all the x variables: the `.` in the formula stands for every column of the data frame other than the response.
# Same model as before, written with the `.` shorthand: every column of hds
# other than price is used as a predictor.
lregfit2 <- lm(price ~ ., data = hds)
summary(lregfit2)
##
## Call:
## lm(formula = price ~ ., data = hds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.008 -7.323 -0.119 7.819 33.392
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.919 10.474 1.711 0.08967 .
## Nbhd2 4.866 2.722 1.788 0.07633 .
## Nbhd3 34.084 3.169 10.755 < 2e-16 ***
## BrickYes 18.508 2.396 7.723 3.65e-12 ***
## Bedrooms 1.902 1.902 1.000 0.31933
## Bathrooms 6.827 2.563 2.664 0.00878 **
## size 35.930 6.404 5.610 1.30e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.15 on 121 degrees of freedom
## Multiple R-squared: 0.805, Adjusted R-squared: 0.7954
## F-statistic: 83.27 on 6 and 121 DF, p-value: < 2.2e-16