Intended Learning Outcome

To get you started with basic data mining procedures, including data preprocessing, variable selection, model building, model checking and model evaluation in R.

Boston Housing data

The Boston data frame has 506 rows and 14 columns. medv is the response variable \(y\).

This data frame contains the following columns:

library(MASS)
# Preview the first six observations to see the variables and their scales.
head(Boston, n = 6L)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat medv
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98 24.0
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14 21.6
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03 34.7
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94 33.4
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33 36.2
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21 28.7
# List the variable names; for a data frame names() gives the column names.
names(Boston)
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"     "dis"     "rad"    
## [10] "tax"     "ptratio" "black"   "lstat"   "medv"

1 Data Preprocessing

1.1 Exploratory Data Analysis

# Scatterplot matrix of the response (medv) and a handful of predictors.
# The formula interface is used: `~` separates the left- and right-hand sides,
# and every variable named in the formula is looked up in `Boston`.
pairs(
  medv ~ zn + indus + lstat + dis + rm + crim,
  data = Boston,
  main = "Boston Data"
)

1.2 Standardization

If we want our results to be invariant to the units and the parameter estimates \(\beta_i\) to be comparable, we can standardize the variables. Essentially we are replacing the original values with their z-scores.

# Standardize every predictor to mean 0 and standard deviation 1 (z-scores).
# The response `medv` is excluded by name rather than by column position, so
# the code keeps working if the column order changes; it stays on its
# original scale.
predictors <- setdiff(names(Boston), "medv")

# scale() returns a one-column matrix carrying centering/scaling attributes;
# as.numeric() strips those so each column remains a plain numeric vector.
Boston[predictors] <- lapply(
  Boston[predictors],
  function(column) as.numeric(scale(column))
)

2 Variable Selection & Model Building

2.1 Linear Regression

In linear regression, we choose \(\beta\) to minimize the sum of squared errors: \[SSE(\beta) = \sum e^2_i = \sum(y_i-\hat{y}_i)^2={\bf (y-X\beta)^T(y-X\beta)}\] Differentiating with respect to \(\beta\): \[\frac{\partial SSE(\beta)}{\partial\beta} = -2 {\bf X^T(y-X\beta)}\] Setting this to zero leads to the normal equation: \[{\bf X^TX\hat{\beta}=X^Ty}\] Provided \({\bf X^TX}\) is invertible (i.e., the columns of \({\bf X}\) are linearly independent): \[{\bf \hat{\beta}=(X^TX)^{-1}X^Ty}\] \[{\bf \hat{y}=X\hat{\beta}}\]

The following model includes all \(x\) variables.

# Fit the full model: regress medv on every other column of Boston
# (`.` on the right-hand side stands for all remaining variables),
# then print the coefficient table and fit statistics.
model_1 <- lm(medv ~ ., data = Boston)
summary(model_1)
## 
## Call:
## lm(formula = medv ~ ., data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.595  -2.730  -0.518   1.777  26.199 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 22.53281    0.21095 106.814  < 2e-16 ***
## crim        -0.92906    0.28269  -3.287 0.001087 ** 
## zn           1.08264    0.32016   3.382 0.000778 ***
## indus        0.14104    0.42188   0.334 0.738288    
## chas         0.68241    0.21884   3.118 0.001925 ** 
## nox         -2.05875    0.44262  -4.651 4.25e-06 ***
## rm           2.67688    0.29364   9.116  < 2e-16 ***
## age          0.01949    0.37184   0.052 0.958229    
## dis         -3.10712    0.41999  -7.398 6.01e-13 ***
## rad          2.66485    0.57770   4.613 5.07e-06 ***
## tax         -2.07884    0.63379  -3.280 0.001112 ** 
## ptratio     -2.06265    0.28323  -7.283 1.31e-12 ***
## black        0.85011    0.24521   3.467 0.000573 ***
## lstat       -3.74733    0.36216 -10.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7338 
## F-statistic: 108.1 on 13 and 492 DF,  p-value: < 2.2e-16

More details of the model

coef(model_1)  # estimated regression coefficients
## (Intercept)        crim          zn       indus        chas         nox          rm         age 
## 22.53280632 -0.92906457  1.08263896  0.14103943  0.68241438 -2.05875361  2.67687661  0.01948534 
##         dis         rad         tax     ptratio       black       lstat 
## -3.10711605  2.66485220 -2.07883689 -2.06264585  0.85010886 -3.74733185
confint(object = model_1, level = 0.95)  # 95% confidence intervals for the coefficients
##                  2.5 %     97.5 %
## (Intercept) 22.1183240 22.9472886
## crim        -1.4844926 -0.3736365
## zn           0.4535932  1.7116847
## indus       -0.6878735  0.9699523
## chas         0.2524457  1.1123830
## nox         -2.9284182 -1.1890891
## rm           2.0999307  3.2538225
## age         -0.7111055  0.7500762
## dis         -3.9323189 -2.2819132
## rad          1.5297973  3.7999071
## tax         -3.3241057 -0.8335680
## ptratio     -2.6191409 -1.5061508
## black        0.3683115  1.3319063
## lstat       -4.4589047 -3.0357590
head(fitted(model_1), n = 6L)  # fitted (predicted) values for the first six observations
##        1        2        3        4        5        6 
## 30.00384 25.02556 30.56760 28.60704 27.94352 25.25628
head(residuals(model_1), n = 6L)  # residuals (observed minus fitted) for the first six observations
##         1         2         3         4         5         6 
## -6.003843 -3.425562  4.132403  4.792964  8.256476  3.443716
anova(model_1) # sequential ANOVA table: each term is tested after the terms listed above it
## Analysis of Variance Table
## 
## Response: medv
##            Df  Sum Sq Mean Sq  F value    Pr(>F)    
## crim        1  6440.8  6440.8 286.0300 < 2.2e-16 ***
## zn          1  3554.3  3554.3 157.8452 < 2.2e-16 ***
## indus       1  2551.2  2551.2 113.2984 < 2.2e-16 ***
## chas        1  1529.8  1529.8  67.9393 1.543e-15 ***
## nox         1    76.2    76.2   3.3861 0.0663505 .  
## rm          1 10938.1 10938.1 485.7530 < 2.2e-16 ***
## age         1    90.3    90.3   4.0087 0.0458137 *  
## dis         1  1779.5  1779.5  79.0262 < 2.2e-16 ***
## rad         1    34.1    34.1   1.5159 0.2188325    
## tax         1   329.6   329.6  14.6352 0.0001472 ***
## ptratio     1  1309.3  1309.3  58.1454 1.266e-13 ***
## black       1   593.3   593.3  26.3496 4.109e-07 ***
## lstat       1  2410.8  2410.8 107.0634 < 2.2e-16 ***
## Residuals 492 11078.8    22.5                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vcov(model_1) # estimated variance-covariance matrix of the coefficient estimates
##               (Intercept)          crim            zn         indus          chas           nox
## (Intercept)  4.450169e-02  1.675196e-17  9.699317e-17  4.136711e-17 -1.836590e-17  1.666076e-17
## crim         1.675196e-17  7.991348e-02 -7.888184e-03  4.425580e-03  3.095214e-03  7.459395e-03
## zn           9.699317e-17 -7.888184e-03  1.025012e-01  1.436376e-02 -1.034382e-03  5.071521e-03
## indus        4.136711e-17  4.425580e-03  1.436376e-02  1.779845e-01 -9.509673e-03 -4.929246e-02
## chas        -1.836590e-17  3.095214e-03 -1.034382e-03 -9.509673e-03  4.788925e-02 -3.249586e-03
## nox          1.666076e-17  7.459395e-03  5.071521e-03 -4.929246e-02 -3.249586e-03  1.959151e-01
## rm           1.636006e-18  2.185442e-03 -1.464686e-02  1.138864e-02 -2.003170e-03  1.214793e-02
## age          3.189466e-17 -3.511727e-04  1.436440e-02 -1.728628e-04 -4.131571e-03 -4.363803e-02
## dis         -5.509446e-17  1.386910e-02 -5.414553e-02  3.740426e-02  1.449195e-03  5.241110e-02
## rad          6.401592e-17 -4.359598e-02  1.957132e-02  6.784676e-02 -1.360274e-02 -3.634965e-02
## tax         -2.005672e-16  2.123171e-03 -4.528975e-02 -1.171005e-01  1.680936e-02 -2.092649e-02
## ptratio      2.702419e-17  1.677134e-03  2.821693e-02 -1.552309e-02  5.805277e-03  4.132172e-02
## black       -2.072888e-17  8.138840e-03 -8.957707e-04  3.440873e-03 -2.753404e-03  8.125914e-03
## lstat        1.643294e-17 -1.563353e-02 -6.160535e-03 -1.164111e-02  4.710267e-03 -1.102357e-02
##                        rm           age           dis           rad           tax       ptratio
## (Intercept)  1.636006e-18  3.189466e-17 -5.509446e-17  6.401592e-17 -2.005672e-16  2.702419e-17
## crim         2.185442e-03 -3.511727e-04  1.386910e-02 -4.359598e-02  2.123171e-03  1.677134e-03
## zn          -1.464686e-02  1.436440e-02 -5.414553e-02  1.957132e-02 -4.528975e-02  2.821693e-02
## indus        1.138864e-02 -1.728628e-04  3.740426e-02  6.784676e-02 -1.171005e-01 -1.552309e-02
## chas        -2.003170e-03 -4.131571e-03  1.449195e-03 -1.360274e-02  1.680936e-02  5.805277e-03
## nox          1.214793e-02 -4.363803e-02  5.241110e-02 -3.634965e-02 -2.092649e-02  4.132172e-02
## rm           8.622530e-02 -2.257691e-02  1.647167e-02 -2.681512e-02  1.382431e-02  1.326338e-02
## age         -2.257691e-02  1.382652e-01  4.549351e-02  1.628292e-02 -7.058001e-03 -8.235011e-03
## dis          1.647167e-02  4.549351e-02  1.763948e-01  3.846589e-03 -5.181990e-03 -1.113593e-02
## rad         -2.681512e-02  1.628292e-02  3.846589e-03  3.337323e-01 -2.885025e-01 -3.091158e-02
## tax          1.382431e-02 -7.058001e-03 -5.181990e-03 -2.885025e-01  4.016897e-01 -1.354604e-02
## ptratio      1.326338e-02 -8.235011e-03 -1.113593e-02 -3.091158e-02 -1.354604e-02  8.022082e-02
## black        7.541458e-03 -5.810818e-03  2.261954e-03  1.025140e-02  4.141221e-03 -2.329800e-03
## lstat        5.688716e-02 -4.593608e-02 -5.847922e-03 -7.458910e-03  3.517037e-03 -4.658335e-03
##                     black         lstat
## (Intercept) -2.072888e-17  1.643294e-17
## crim         8.138840e-03 -1.563353e-02
## zn          -8.957707e-04 -6.160535e-03
## indus        3.440873e-03 -1.164111e-02
## chas        -2.753404e-03  4.710267e-03
## nox          8.125914e-03 -1.102357e-02
## rm           7.541458e-03  5.688716e-02
## age         -5.810818e-03 -4.593608e-02
## dis          2.261954e-03 -5.847922e-03
## rad          1.025140e-02 -7.458910e-03
## tax          4.141221e-03  3.517037e-03
## ptratio     -2.329800e-03 -4.658335e-03
## black        6.013030e-02  1.376953e-02
## lstat        1.376953e-02  1.311605e-01

Interaction terms in the model

# In a formula, `crim * zn` is shorthand for crim + zn + crim:zn, i.e. both
# main effects plus their two-way interaction (see the coefficient names in
# the output below).
lm(medv ~ crim * zn, data = Boston)
## 
## Call:
## lm(formula = medv ~ crim * zn, data = Boston)
## 
## Coefficients:
## (Intercept)         crim           zn      crim:zn  
##       54.91        75.94        69.88       161.82
# `crim * zn * rad` expands to all main effects, all two-way interactions and
# the three-way interaction of the three variables; indus enters as a main
# effect only.
lm(medv ~ crim * zn * rad + indus, data = Boston)
## 
## Call:
## lm(formula = medv ~ crim * zn * rad + indus, data = Boston)
## 
## Coefficients:
## (Intercept)         crim           zn          rad        indus      crim:zn     crim:rad  
##      45.769       55.627       47.371       -6.281       -2.641      112.244      -13.935  
##      zn:rad  crim:zn:rad  
##     -11.405      -24.846

Using Nonlinear variables

# Add a nonlinear (quadratic) term: work on a copy of Boston so the original
# data frame is left untouched, store the square of rm as a new column rm_2,
# and regress medv on that squared term alone.
Boston_new <- Boston
Boston_new$rm_2 <- Boston_new$rm^2
lm(medv ~ rm_2, data = Boston_new)
## 
## Call:
## lm(formula = medv ~ rm_2, data = Boston_new)
## 
## Coefficients:
## (Intercept)         rm_2  
##      20.701        1.835

2.2 Variable Selection via AIC

library(leaps)

Compare Model Fit Manually

  • AIC

Akaike information criterion (AIC) offers a relative estimate of the information lost when a given model is used to represent the process that generates the data. In doing so, it deals with the trade-off between the goodness of fit of the model and the complexity of the model. \[ AIC = -2\log(L) + 2p\] where \(L\) is the maximized likelihood of the model and \(p\) is the number of estimated parameters.

  • BIC

Bayesian information criterion (BIC) is a criterion for model selection among a finite set of models; the model with the lowest BIC is preferred.

\[ BIC = -2\log(L) + p\log(n)\] where \(n\) is the number of observations.

When fitting models, it is possible to increase the likelihood by adding parameters, but doing so may result in overfitting. Both BIC and AIC attempt to resolve this problem by introducing a penalty term for the number of parameters in the model; the penalty term of BIC, \(p\log(n)\), is larger than that of AIC, \(2p\), whenever \(\log(n) > 2\) (i.e., \(n \ge 8\)), so BIC tends to favor smaller models.

# Two candidate models to compare: all predictors vs. only crim and zn.
model_1 <- lm(medv ~ ., data = Boston)
model_2 <- lm(medv ~ crim + zn, data = Boston)

# Coefficient tables and fit statistics for both candidates.
summary(model_1)
summary(model_2)

# Information criteria: lower values indicate the preferred model.
AIC(model_1)
## [1] 3027.609
# AIC of the two-predictor model (crim + zn), for comparison with model_1
AIC(model_2)
## [1] 3553.602
# BIC of the model with all predictors
BIC(model_1)
## [1] 3091.007
# BIC of the two-predictor model (crim + zn)
BIC(model_2)
## [1] 3570.508

2.3 Variable Selection via Stepwise Regression

To perform the Forward/Backward/Stepwise Regression in R, we need to define the starting points:

# Smallest candidate: intercept only (`~ 1`), no predictors.
nullmodel <- lm(medv ~ 1, data = Boston)
# Largest candidate: every other column of Boston as a predictor.
fullmodel <- lm(medv ~ ., data = Boston)

nullmodel is the model with no variable in it (intercept only), while fullmodel is the model with every variable in it.

Backward Elimination (Output Omitted)

# Backward elimination: start from the full model and drop one term at a time.
model.step <- step(fullmodel, direction = "backward")

Forward Selection (Output Omitted)

# Forward selection: start from the intercept-only model and add one term at a
# time; `scope` bounds the search between nullmodel and fullmodel.
model.step <- step(
  nullmodel,
  scope = list(lower = nullmodel, upper = fullmodel),
  direction = "forward"
)

Stepwise Selection

# Stepwise selection: start from the intercept-only model; at every step both
# adding (+) and dropping (-) a term are considered, as the trace below shows.
model.step <- step(
  nullmodel,
  scope = list(lower = nullmodel, upper = fullmodel),
  direction = "both"
)
## Start:  AIC=2246.51
## medv ~ 1
## 
##           Df Sum of Sq   RSS    AIC
## + lstat    1   23243.9 19472 1851.0
## + rm       1   20654.4 22062 1914.2
## + ptratio  1   11014.3 31702 2097.6
## + indus    1    9995.2 32721 2113.6
## + tax      1    9377.3 33339 2123.1
## + nox      1    7800.1 34916 2146.5
## + crim     1    6440.8 36276 2165.8
## + rad      1    6221.1 36495 2168.9
## + age      1    6069.8 36647 2171.0
## + zn       1    5549.7 37167 2178.1
## + black    1    4749.9 37966 2188.9
## + dis      1    2668.2 40048 2215.9
## + chas     1    1312.1 41404 2232.7
## <none>                 42716 2246.5
## 
## Step:  AIC=1851.01
## medv ~ lstat
## 
##           Df Sum of Sq   RSS    AIC
## + rm       1    4033.1 15439 1735.6
## + ptratio  1    2670.1 16802 1778.4
## + chas     1     786.3 18686 1832.2
## + dis      1     772.4 18700 1832.5
## + age      1     304.3 19168 1845.0
## + tax      1     274.4 19198 1845.8
## + black    1     198.3 19274 1847.8
## + zn       1     160.3 19312 1848.8
## + crim     1     146.9 19325 1849.2
## + indus    1      98.7 19374 1850.4
## <none>                 19472 1851.0
## + rad      1      25.1 19447 1852.4
## + nox      1       4.8 19468 1852.9
## - lstat    1   23243.9 42716 2246.5
## 
## Step:  AIC=1735.58
## medv ~ lstat + rm
## 
##           Df Sum of Sq   RSS    AIC
## + ptratio  1    1711.3 13728 1678.1
## + chas     1     548.5 14891 1719.3
## + black    1     512.3 14927 1720.5
## + tax      1     425.2 15014 1723.5
## + dis      1     351.2 15088 1725.9
## + crim     1     311.4 15128 1727.3
## + rad      1     180.5 15259 1731.6
## + indus    1      61.1 15378 1735.6
## <none>                 15439 1735.6
## + zn       1      56.6 15383 1735.7
## + age      1      20.2 15419 1736.9
## + nox      1      14.9 15424 1737.1
## - rm       1    4033.1 19472 1851.0
## - lstat    1    6622.6 22062 1914.2
## 
## Step:  AIC=1678.13
## medv ~ lstat + rm + ptratio
## 
##           Df Sum of Sq   RSS    AIC
## + dis      1     499.1 13229 1661.4
## + black    1     389.7 13338 1665.6
## + chas     1     378.0 13350 1666.0
## + crim     1     122.5 13606 1675.6
## + age      1      66.2 13662 1677.7
## <none>                 13728 1678.1
## + tax      1      44.4 13684 1678.5
## + nox      1      24.8 13703 1679.2
## + zn       1      15.0 13713 1679.6
## + rad      1       6.1 13722 1679.9
## + indus    1       0.8 13727 1680.1
## - ptratio  1    1711.3 15439 1735.6
## - rm       1    3074.3 16802 1778.4
## - lstat    1    5013.6 18742 1833.7
## 
## Step:  AIC=1661.39
## medv ~ lstat + rm + ptratio + dis
## 
##           Df Sum of Sq   RSS    AIC
## + nox      1     759.6 12469 1633.5
## + black    1     502.6 12726 1643.8
## + chas     1     267.4 12962 1653.1
## + indus    1     242.6 12986 1654.0
## + tax      1     240.3 12989 1654.1
## + crim     1     233.5 12995 1654.4
## + zn       1     144.8 13084 1657.8
## + age      1      61.4 13168 1661.0
## <none>                 13229 1661.4
## + rad      1      22.4 13206 1662.5
## - dis      1     499.1 13728 1678.1
## - ptratio  1    1859.3 15088 1725.9
## - rm       1    2622.6 15852 1750.9
## - lstat    1    5349.2 18578 1831.2
## 
## Step:  AIC=1633.47
## medv ~ lstat + rm + ptratio + dis + nox
## 
##           Df Sum of Sq   RSS    AIC
## + chas     1     328.3 12141 1622.0
## + black    1     311.8 12158 1622.7
## + zn       1     151.7 12318 1629.3
## + crim     1     141.4 12328 1629.7
## + rad      1      53.5 12416 1633.3
## <none>                 12469 1633.5
## + indus    1      17.1 12452 1634.8
## + tax      1      10.5 12459 1635.0
## + age      1       0.2 12469 1635.5
## - nox      1     759.6 13229 1661.4
## - dis      1    1233.8 13703 1679.2
## - ptratio  1    2116.5 14586 1710.8
## - rm       1    2546.2 15016 1725.5
## - lstat    1    3664.3 16134 1761.8
## 
## Step:  AIC=1621.97
## medv ~ lstat + rm + ptratio + dis + nox + chas
## 
##           Df Sum of Sq   RSS    AIC
## + black    1     272.8 11868 1612.5
## + zn       1     164.4 11977 1617.1
## + crim     1     116.3 12025 1619.1
## + rad      1      58.6 12082 1621.5
## <none>                 12141 1622.0
## + indus    1      26.3 12115 1622.9
## + tax      1       4.2 12137 1623.8
## + age      1       2.3 12139 1623.9
## - chas     1     328.3 12469 1633.5
## - nox      1     820.4 12962 1653.1
## - dis      1    1146.8 13288 1665.6
## - ptratio  1    1924.9 14066 1694.4
## - rm       1    2480.7 14622 1714.0
## - lstat    1    3509.3 15650 1748.5
## 
## Step:  AIC=1612.47
## medv ~ lstat + rm + ptratio + dis + nox + chas + black
## 
##           Df Sum of Sq   RSS    AIC
## + zn       1    189.94 11678 1606.3
## + rad      1    144.32 11724 1608.3
## + crim     1     55.63 11813 1612.1
## <none>                 11868 1612.5
## + indus    1     15.58 11853 1613.8
## + age      1      9.45 11859 1614.1
## + tax      1      2.70 11866 1614.4
## - black    1    272.84 12141 1622.0
## - chas     1    289.27 12158 1622.7
## - nox      1    626.85 12495 1636.5
## - dis      1   1103.33 12972 1655.5
## - ptratio  1   1804.30 13672 1682.1
## - rm       1   2658.21 14526 1712.7
## - lstat    1   2991.55 14860 1724.2
## 
## Step:  AIC=1606.31
## medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn
## 
##           Df Sum of Sq   RSS    AIC
## + crim     1     94.71 11584 1604.2
## + rad      1     93.61 11585 1604.2
## <none>                 11678 1606.3
## + indus    1     16.05 11662 1607.6
## + tax      1      3.95 11674 1608.1
## + age      1      1.49 11677 1608.2
## - zn       1    189.94 11868 1612.5
## - black    1    298.37 11977 1617.1
## - chas     1    300.42 11979 1617.2
## - nox      1    627.62 12306 1630.8
## - dis      1   1276.45 12955 1656.8
## - ptratio  1   1364.63 13043 1660.2
## - rm       1   2384.55 14063 1698.3
## - lstat    1   3052.50 14731 1721.8
## 
## Step:  AIC=1604.19
## medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn + 
##     crim
## 
##           Df Sum of Sq   RSS    AIC
## + rad      1    228.60 11355 1596.1
## <none>                 11584 1604.2
## + indus    1     15.77 11568 1605.5
## + age      1      2.47 11581 1606.1
## + tax      1      1.31 11582 1606.1
## - crim     1     94.71 11678 1606.3
## - black    1    222.18 11806 1611.8
## - zn       1    229.02 11813 1612.1
## - chas     1    284.34 11868 1614.5
## - nox      1    578.44 12162 1626.8
## - ptratio  1   1192.90 12776 1651.8
## - dis      1   1345.70 12929 1657.8
## - rm       1   2419.57 14003 1698.2
## - lstat    1   2753.42 14337 1710.1
## 
## Step:  AIC=1596.1
## medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn + 
##     crim + rad
## 
##           Df Sum of Sq   RSS    AIC
## + tax      1    273.62 11081 1585.8
## <none>                 11355 1596.1
## + indus    1     33.89 11321 1596.6
## + age      1      0.10 11355 1598.1
## - zn       1    171.14 11526 1601.7
## - rad      1    228.60 11584 1604.2
## - crim     1    229.70 11585 1604.2
## - chas     1    272.67 11628 1606.1
## - black    1    295.78 11651 1607.1
## - nox      1    785.16 12140 1627.9
## - dis      1   1341.37 12696 1650.6
## - ptratio  1   1419.77 12775 1653.7
## - rm       1   2182.57 13538 1683.1
## - lstat    1   2785.28 14140 1705.1
## 
## Step:  AIC=1585.76
## medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn + 
##     crim + rad + tax
## 
##           Df Sum of Sq   RSS    AIC
## <none>                 11081 1585.8
## + indus    1      2.52 11079 1587.7
## + age      1      0.06 11081 1587.8
## - chas     1    227.21 11309 1594.0
## - crim     1    245.37 11327 1594.8
## - zn       1    257.82 11339 1595.4
## - black    1    270.82 11352 1596.0
## - tax      1    273.62 11355 1596.1
## - rad      1    500.92 11582 1606.1
## - nox      1    541.91 11623 1607.9
## - ptratio  1   1206.45 12288 1636.0
## - dis      1   1448.94 12530 1645.9
## - rm       1   1963.66 13045 1666.3
## - lstat    1   2723.48 13805 1695.0

Finally, we use the selected variables to build the model.

# Refit using the eleven predictors retained by the stepwise search
# (indus and age are left out), then inspect the fit.
model_1 <- lm(
  medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn + crim + rad +
    tax,
  data = Boston
)
summary(model_1)
## 
## Call:
## lm(formula = medv ~ lstat + rm + ptratio + dis + nox + chas + 
##     black + zn + crim + rad + tax, data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.5984  -2.7386  -0.5046   1.7273  26.2373 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  22.5328     0.2106 107.018  < 2e-16 ***
## lstat        -3.7316     0.3387 -11.019  < 2e-16 ***
## rm            2.6711     0.2855   9.356  < 2e-16 ***
## ptratio      -2.0492     0.2794  -7.334 9.24e-13 ***
## dis          -3.1432     0.3911  -8.037 6.84e-15 ***
## nox          -2.0135     0.4097  -4.915 1.21e-06 ***
## chas          0.6905     0.2170   3.183 0.001551 ** 
## black         0.8482     0.2441   3.475 0.000557 ***
## zn            1.0692     0.3154   3.390 0.000754 ***
## crim         -0.9325     0.2820  -3.307 0.001010 ** 
## rad           2.6088     0.5521   4.726 3.00e-06 ***
## tax          -1.9850     0.5684  -3.493 0.000521 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.736 on 494 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7348 
## F-statistic: 128.2 on 11 and 494 DF,  p-value: < 2.2e-16

3 Model Checking

3.1 Residual Plots

Residual plot, residual histogram, and normal Q-Q plot

# Residuals vs. fitted values, with a horizontal reference line at zero.
# The fitted()/residuals() accessors replace `$fitted.values`/`$res`: `$res`
# only worked because `$` partially matches the `residuals` component.
plot(fitted(model_1), residuals(model_1),
     xlab = "Fitted", ylab = "Residuals", main = "Residual Plot")
abline(h = 0)

# Histogram of the residuals; xlab is set explicitly so the axis is not
# labelled with the deparsed R expression.
hist(residuals(model_1), xlab = "Residuals", main = "Residual Histogram")

# Normal Q-Q plot of the residuals with its reference line.
qqnorm(residuals(model_1), ylab = "Residuals")
qqline(residuals(model_1))

3.2 Box-Cox Transformation

Find the optimal \(\lambda\) and the corresponding 95% confidence interval

# Profile the Box-Cox log-likelihood of model_1 for lambda between -1 and 1.
# The result holds the lambda values in `x` and the matching log-likelihoods
# in `y`. TRUE is spelled out because `T` is an ordinary, reassignable variable.
trans <- boxcox(model_1, plotit = TRUE, lambda = seq(-1, 1, by = 0.05))

# which.max() returns exactly one index (the first maximum), so lambda is
# always a single number even if several grid points tie for the maximum.
lambda <- trans$x[which.max(trans$y)]
# Lambda values whose log-likelihood lies within qchisq(0.95, 1) / 2 of the
# maximum; their range is the approximate 95% confidence interval.
tmp <- trans$x[trans$y > max(trans$y) - qchisq(0.95, 1) / 2]
# Print the lambda with the highest profile log-likelihood
lambda
## [1] 0.1111111
# Smallest and largest lambda kept in tmp, i.e. the ends of the interval
range(tmp)
## [1] 0.01010101 0.21212121
# Refit the selected model with the response raised to the chosen power
# lambda, then redraw the residuals-vs-fitted plot for the transformed model.
model_2 <- lm(
  medv^lambda ~ lstat + rm + ptratio + dis + nox + chas + black + zn + crim +
    rad + tax,
  data = Boston
)
plot(
  fitted(model_2), residuals(model_2),
  xlab = "Fitted",
  ylab = "Residuals",
  main = "Residual Plot"
)
abline(h = 0)