1 Principal Component Analysis

# Two-variable PCA on iris: sepal length (x1) against petal width (x2).
DataX <- data.frame(
  x1 = iris$Sepal.Length,
  x2 = iris$Petal.Width
)
# Wrapping the assignment in parentheses both stores the fit in `tmp` and
# auto-prints it (standard deviations and rotation matrix).
(tmp <- prcomp(DataX))
## Standard deviations:
## [1] 1.0734371 0.3382787
## 
## Rotation:
##          PC1        PC2
## x1 0.7419133 -0.6704958
## x2 0.6704958  0.7419133

2 K-means Clustering

# K-means on the two sepal measurements, drawn once for each k in 2..5.
DataX <- data.frame(
  x1 = iris$Sepal.Length,
  x2 = iris$Sepal.Width
)
set.seed(9999)        # seed set before the loop, where kmeans() is called
par(mfrow = c(1, 2))  # 1 x 2 panel layout for the plots that follow
for (kk in 2:5) {
  yfit_kmeans <- kmeans(DataX, kk)

  # One colour per cluster: colour indices 2, 3, ..., kk + 1.
  cluster_cols <- seq(2, 1 + kk)

  # Observations, coloured by the cluster they were assigned to.
  plot(
    DataX$x1, DataX$x2,
    pch = 19, cex = 1,
    col = cluster_cols[yfit_kmeans$cluster]
  )
  # Optional overlay of the true species as open circles (disabled):
  # points(DataX$x1, DataX$x2, pch=21, cex=1.6, col=seq(2,1+kk)[iris$Species])

  # Cluster centers: a star (pch 8) with a larger open circle (pch 21) on top.
  points(yfit_kmeans$centers, pch = 8, cex = 1.4, col = cluster_cols)
  points(yfit_kmeans$centers, pch = 21, cex = 2.5, col = cluster_cols)

  title(main = paste("K-Means Clustering: k =", kk))
}

K-Means on PC scores: a surprising result appears!

# K-means (k = 3) on the projections of the four iris measurements onto the
# first two principal directions, plotted beside the true species labels.
DataX <- iris[, -5]  # drop column 5 (Species); keep the numeric measurements
pr <- prcomp(DataX)

# Project the raw data onto the first two loading vectors in a single matrix
# product; the columns inherit the names PC1 and PC2 from pr$rotation.
# NOTE: the data are not centered here, whereas prcomp() centers by default,
# so these values differ from pr$x[, 1:2] by a constant offset per column.
# A constant offset does not change the distances between observations.
PCScore <- as.data.frame(as.matrix(DataX) %*% pr$rotation[, 1:2])

kk <- 3

# Seed immediately before kmeans(): its starting centers are drawn at random,
# so the seed has to be set first for the clustering to be reproducible.
set.seed(0)
fit <- kmeans(PCScore, kk)

par(mfrow = c(1, 2))  # the two scatter plots share a 1 x 2 layout
cluster_cols <- seq(2, 1 + kk)  # colour indices 2, 3, 4

# Left panel: colours come from the k-means assignment only.
plot(
  PCScore$PC1, PCScore$PC2,
  pch = 19, cex = 1,
  col = cluster_cols[fit$cluster],
  main = "K-means on PC scores (Purely Unsupervised)"
)

# Right panel: colours come from the true species; the factor is indexed
# through its integer codes.
plot(
  PCScore$PC1, PCScore$PC2,
  pch = 19, cex = 1,
  col = cluster_cols[as.integer(iris$Species)],
  main = "Iris Species (True Labels)"
)