Intended Learning Outcome

This session introduces basis expansion and non-parametric regression fittings

Boston Housing data

The Boston data frame has 506 rows and 14 columns. medv is the response variable \(y\).

This data frame contains the following columns:

library(ggplot2)
library(MASS)
library(splines)
head(Boston)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat medv
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98 24.0
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14 21.6
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03 34.7
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94 33.4
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33 36.2
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21 28.7
colnames(Boston)
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"     "dis"     "rad"    
## [10] "tax"     "ptratio" "black"   "lstat"   "medv"

1 Polynomial Basis Fitting

1.1 Ordinary Polynomial Basis Fitting

The elements of ordinary polynomial basis may be correlated

Example 1: Ordinary quadratic basis

ggplot(Boston, aes(x=lstat, y=medv)) +
geom_point()+
geom_smooth(method = "lm", formula = y ~ x+I(x^2))

model_1 <- lm(Boston$medv ~ Boston$lstat + I(Boston$lstat^2) + I(Boston$lstat^3))
summary(model_1)
## 
## Call:
## lm(formula = Boston$medv ~ Boston$lstat + I(Boston$lstat^2) + 
##     I(Boston$lstat^3))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.5441  -3.7122  -0.5145   2.4846  26.4153 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       48.6496253  1.4347240  33.909  < 2e-16 ***
## Boston$lstat      -3.8655928  0.3287861 -11.757  < 2e-16 ***
## I(Boston$lstat^2)  0.1487385  0.0212987   6.983 9.18e-12 ***
## I(Boston$lstat^3) -0.0020039  0.0003997  -5.013 7.43e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.396 on 502 degrees of freedom
## Multiple R-squared:  0.6578, Adjusted R-squared:  0.6558 
## F-statistic: 321.7 on 3 and 502 DF,  p-value: < 2.2e-16

Example 2: Ordinary cubic basis

ggplot(Boston, aes(x=lstat, y=medv)) +
geom_point()+
geom_smooth(method = "lm", formula = y ~ x+I(x^2)+I(x^3))

model_2 <- lm(Boston$medv ~ Boston$lstat + I(Boston$lstat^2) + I(Boston$lstat^3))
summary(model_2)
## 
## Call:
## lm(formula = Boston$medv ~ Boston$lstat + I(Boston$lstat^2) + 
##     I(Boston$lstat^3))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.5441  -3.7122  -0.5145   2.4846  26.4153 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       48.6496253  1.4347240  33.909  < 2e-16 ***
## Boston$lstat      -3.8655928  0.3287861 -11.757  < 2e-16 ***
## I(Boston$lstat^2)  0.1487385  0.0212987   6.983 9.18e-12 ***
## I(Boston$lstat^3) -0.0020039  0.0003997  -5.013 7.43e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.396 on 502 degrees of freedom
## Multiple R-squared:  0.6578, Adjusted R-squared:  0.6558 
## F-statistic: 321.7 on 3 and 502 DF,  p-value: < 2.2e-16

1.2 Orthogonal Polynomial Basis Fitting

An advantage of orthogonal polynomial basis is that elements in the basis are orthogonal to each other, that is their dot product is zero.

Example 3: Orthogonal quadratic basis

ggplot(Boston, aes(x=lstat, y=medv)) +
geom_point()+
geom_smooth(method = "lm", formula = y ~ poly(x,2))

model_3 <- lm(Boston$medv ~ poly(Boston$lstat,2))
summary(model_3)
## 
## Call:
## lm(formula = Boston$medv ~ poly(Boston$lstat, 2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2834  -3.8313  -0.5295   2.3095  25.4148 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              22.5328     0.2456   91.76   <2e-16 ***
## poly(Boston$lstat, 2)1 -152.4595     5.5237  -27.60   <2e-16 ***
## poly(Boston$lstat, 2)2   64.2272     5.5237   11.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.524 on 503 degrees of freedom
## Multiple R-squared:  0.6407, Adjusted R-squared:  0.6393 
## F-statistic: 448.5 on 2 and 503 DF,  p-value: < 2.2e-16

Example 4: Orthogonal cubic basis

ggplot(Boston, aes(x=lstat, y=medv)) +
geom_point()+
geom_smooth(method = "lm", formula = y ~ poly(x,3))

model_4 <- lm(Boston$medv ~ poly(Boston$lstat,3))
summary(model_4)
## 
## Call:
## lm(formula = Boston$medv ~ poly(Boston$lstat, 3))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.5441  -3.7122  -0.5145   2.4846  26.4153 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              22.5328     0.2399  93.937  < 2e-16 ***
## poly(Boston$lstat, 3)1 -152.4595     5.3958 -28.255  < 2e-16 ***
## poly(Boston$lstat, 3)2   64.2272     5.3958  11.903  < 2e-16 ***
## poly(Boston$lstat, 3)3  -27.0511     5.3958  -5.013 7.43e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.396 on 502 degrees of freedom
## Multiple R-squared:  0.6578, Adjusted R-squared:  0.6558 
## F-statistic: 321.7 on 3 and 502 DF,  p-value: < 2.2e-16

1.3 Fitted Values and Confidence Intervals

Add to the original scatter plot the predicted values and the two confidence bands

ord = order(Boston$lstat) # TO make the plot more clear
predicted.intervals <- predict(model_4,data.frame(x=Boston$lstat[ord]),interval='confidence',level=0.95)
plot(Boston$lstat[ord],Boston$medv[ord],col='deepskyblue4',xlab='Boston$lstat',main='Observed data')
lines(Boston$lstat[ord],predicted.intervals[,1][ord],col='green',lwd=3) # The fitted value
lines(Boston$lstat[ord],predicted.intervals[,2][ord],col='black',lwd=1) # The lower
lines(Boston$lstat[ord],predicted.intervals[,3][ord],col='black',lwd=1) # The upper
# The position of legend can be controlled the following parameters, like: "bottomright", "bottom", "bottomleft", "left", "topleft", "top", "topright", "right" and "center" 
legend("topright",c("Observ.", "Predicted"), 
       col=c("deepskyblue4","green"), lwd=3)