#
# Introduction to Clustering demo
#
# k-means clustering
#
# History:
#
# 2018/07/11 Initial code (copied from other example documents)
#            walter johnston
#
###
#
# k-means algorithm description/explanation:
#
# user-supplied input parameters:
#
# k: number of clusters
# nstart: number of initializing trials
# iter.max: maximum number of iterations (repetitions)
#
# (1) randomize observations into "k" initial groups (keep best of
#     "nstart" trials)
# (2) calculate centroid (vector of arithmetic means) for each cluster
# (3) calculate within-cluster sum of squares (ESS) for each cluster
#     (retain value)
# (4) re-assign observations into closest cluster
# (distance from centroid; retain count of re-assignments)
# (5) if iterations <= "iter.max" and count of re-assignments > 0, go to
#     step (2)
# (6) finished
#
###
# code to fetch the package if it is not present
###
#
# filter for complete cases (no missing data)
#
###
# Keep only the rows of Boston that have a value in every column, then show
# the dimensions before and after so any dropped rows are visible.
# NOTE(review): Boston is not defined in the visible part of this script --
# presumably it is loaded by the package-fetch code above; confirm.
myBoston <- Boston[which(complete.cases(Boston)), ]
dim(Boston)
dim(myBoston)
###
#
# kmeans: fixed number of clusters
#
# cluster data based on error-sum-of-squares
# random starting points
# user specified number of clusters
# user specified number of starting trials (best one is automatically selected)
# user specified maximum number of iterations
#
###
# user choices
#
# reproducibility
seed <- 1            # seed for the random number generator

# range of cluster counts to consider (see code)
minClusters <- 1     # smallest number of clusters
maxClusters <- 20    # largest number of clusters

# k-means controls
km.iter.max <- 20    # iteration limit
km.nstart <- 10      # number of starting trials
###
# start skip of code
###
###
# skip to here
###
# measures of interest
# NOTE(review): t, t2 and wssf() come from the skipped code above --
# presumably t is the kmeans() fit and t2 the data with a `cluster` column;
# confirm against that code.
table(t$cluster)   # number of observations per cluster label
t$tot.withinss     # overall
t$withinss         # by cluster

# recompute the within-cluster sum of squares independently:
# wssf() is applied to the rows of t2 belonging to each cluster
t3 <- t2 %>%
  group_by(cluster) %>%
  do(data.frame(wss = wssf(.)))

# overall: add up the wss column only. sum(t3) would sum every column of the
# data frame, so the cluster labels would be included in the total (or the
# call would fail if the labels are not numeric).
sum(t3$wss)
t3                 # by cluster

# individual divergences
t$withinss - t3$wss

# overall divergence
t$tot.withinss - sum(t3$wss)
###
#
# k-means WSS calculations are working correctly
#
# now, apply it to hclust() to select a number of clusters
#
# use: squared euclidean distance as metric for clustering
# method="complete"
#
###