Boston Housing data

The Boston data frame has 506 rows and 14 columns. This data frame contains the following columns:

library(MASS)
# Preview the first six rows of the Boston data frame.
head(Boston, n = 6L)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat medv
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98 24.0
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14 21.6
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03 34.7
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94 33.4
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33 36.2
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21 28.7

1 Data Visualization with ggplot2

1.1 Getting started with ggplot2

  • install.packages("package_name")
  • library(package_name)
  • help(package = "ggplot2")
library(ggplot2)

1.2 Box plot

Basic box plot

# Box plot of the median house value (medv) for each level of the Charles
# River dummy variable (chas); chas is converted to a factor for the x-axis.
p <- ggplot(data = Boston, aes(x = factor(chas), y = medv))
# Add the box plot (outliers drawn as large red stars), the title, the axis
# labels, and a point (shape 23) marking the mean of each group.
# `fun` replaces `fun.y`, which was deprecated in ggplot2 3.3.0.
p +
  geom_boxplot(outlier.colour = "red", outlier.shape = 8, outlier.size = 4) +
  ggtitle("House price VS Charles River dummy variable") +
  xlab("Charles River dummy variable") +
  ylab("House price") +
  stat_summary(fun = mean, geom = "point", shape = 23, size = 3)

Boxplot with multiple groups

# Grouped box plot: mapping `fill` to factor(rad) splits each chas level into
# one box per rad value. A `color` mapping can be added for further grouping.
p <- ggplot(
  data = Boston,
  mapping = aes(x = factor(chas), y = medv, fill = factor(rad))
)
# Draw the boxes (outliers as large red stars) and set the title and axis
# labels in a single labs() call.
p +
  geom_boxplot(outlier.colour = "red", outlier.shape = 8, outlier.size = 4) +
  labs(
    title = "House price VS Charles River dummy variable & rad",
    x = "Charles River dummy variable",
    y = "House price"
  )

1.3 Bar plot

Stacked bar plot

# Stacked bar plot of medv by rad, with bars filled by chas.
# geom_col() is the shorthand for geom_bar(stat = "identity"): the y values
# are used as given rather than counted.
# NOTE(review): every row contributes its own medv, so each bar's height is
# the stacked total of medv within that rad level, not an average.
p <- ggplot(
  data = Boston,
  mapping = aes(x = factor(rad), y = medv, fill = factor(chas))
)
p +
  geom_col() +
  labs(
    title = "House price under different Charles River dummy variable & rad",
    x = "Index of accessibility to radial highways.",
    y = "House price"
  )

Dodge bar plot

# Dodged bar plot of medv by rad: the chas groups are placed side by side
# instead of being stacked on top of each other.
# NOTE(review): rows sharing the same rad and chas are drawn on top of one
# another, so the visible bar height is presumably the largest medv in that
# group -- confirm this is the intended summary.
p <- ggplot(
  data = Boston,
  mapping = aes(x = factor(rad), y = medv, fill = factor(chas))
)
p +
  geom_col(position = "dodge") +
  labs(
    title = "House price under different Charles River dummy variable & rad",
    x = "Index of accessibility to radial highways.",
    y = "House price"
  )

1.4 Histogram

Histogram with a transparent kernel density overlay

# Density-scaled histogram of medv with a kernel density curve on top.
# after_stat(density) replaces the `..density..` notation, and `linewidth`
# replaces `size` for lines; both old forms were deprecated in ggplot2 3.4.0,
# so this chunk expects ggplot2 >= 3.4.0.
# geom_density draws the density curve; alpha sets the transparency of its
# fill. geom_vline draws a dashed line at the mean of medv.
ggplot(Boston, aes(x = medv)) +
  geom_histogram(
    aes(y = after_stat(density)),
    colour = "black", fill = "white", binwidth = 2
  ) +
  geom_density(alpha = 0.2, fill = "#FF6666") +
  ggtitle("Histogram of House Price") +
  xlab("House Price") +
  ylab("Density") +
  theme_bw() +
  geom_vline(
    aes(xintercept = mean(medv)),
    color = "blue", linetype = "dashed", linewidth = 1
  )

Histogram with an overlaid normal density curve

# Density-scaled histogram of medv with a normal curve overlaid.
# stat_function draws a normal density with the same mean and standard
# deviation as medv.
# after_stat(density) replaces the `..density..` notation, which was
# deprecated in ggplot2 3.4.0.
ggplot(Boston, aes(x = medv)) +
  geom_histogram(
    aes(y = after_stat(density)),
    color = "royalblue", fill = "white", alpha = 0.5, binwidth = 1.5
  ) +
  stat_function(
    fun = dnorm,
    colour = "red",
    args = list(
      mean = mean(Boston$medv, na.rm = TRUE),
      sd = sd(Boston$medv, na.rm = TRUE)
    )
  ) +
  ggtitle("Normal Curve for overall House Price")