# Page 1 of 3 (navigation text left over from the web-page export; not part of the script)

# ---- Session setup ----
# Report the memory limit and the maximum amount of memory obtained from the
# OS. The call had been split across two lines ("memory." / "limit()"), which
# fails at run time; it is rejoined here.
# NOTE(review): memory.limit()/memory.size() are Windows-only helpers and are
# reduced to stubs in recent R releases -- confirm they are still wanted.
memory.limit()
memory.size(max = TRUE)

# Start from an empty workspace.
# NOTE(review): this wipes every object in the calling session; prefer running
# the script in a fresh R process instead.
rm(list = ls())

# Install only the packages that are missing, so re-running the script does
# not download and reinstall everything each time. fpc is included because the
# script later loads it for plotcluster().
required_pkgs <- c("tidyverse", "cluster", "factoextra", "fpc")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

library(tidyverse)   # data manipulation
library(cluster)     # clustering algorithms
library(factoextra)  # clustering algorithms & visualization
library(dplyr)

# The input CSV is read relative to this directory.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
setwd("C:/users/JOSEPH/Documents/codes")
getwd()
# ---- Import and prepare the data ----
# Import test data
data <- read.csv("malware_dataset1.csv")
print(summary(data))

# Student: I added the command below. I was unable to scale the data at the
# scale() step, so all columns are converted to numeric first.
# Assigning into data[] keeps the data frame shape for any number of rows
# (sapply() would collapse a single-row input to a plain vector).
data[] <- lapply(data, as.numeric)

# Keep columns 13 to 23 and drop every row that contains a missing value.
data.1 <- select(data, 13:23)
data1 <- na.omit(data.1)
print(summary(data1))

# As we don't want the clustering algorithm to depend on an arbitrary
# variable unit, we start by scaling the data using the R function scale().
# Note that scale() returns a matrix, so data1 is a matrix from here on.
data1 <- scale(data1)
head(data1)

# Pairwise distances between the rows of the scaled data (factoextra).
distance <- get_dist(data1)
print(distance)

# ---- K-Means Cluster Analysis ----
# The cluster package provides clusplot().
library(cluster)

# Simplest example: just the dataset and the number of clusters.
# data1 is passed directly (incomplete rows were already removed before
# scaling), which keeps fit$cluster aligned row-for-row with data1 below.
fit <- kmeans(data1, centers = 5)  # 5 cluster solution
# get cluster means
aggregate(data1, by = list(fit$cluster), FUN = mean)
# append cluster assignment
mydata <- data.frame(data1, fit$cluster)
# Plot the scaled features only: mydata also holds the cluster-label column,
# and feeding the labels to clusplot() would let them shape the principal
# components that the clusters are drawn against.
clusplot(data1, fit$cluster,
         color = TRUE, shade = TRUE, labels = 2, lines = 0)

fit <- kmeans(data1, 8)  # 8 cluster solution
# get cluster means
aggregate(data1, by = list(fit$cluster), FUN = mean)
# append cluster assignment
mydata <- data.frame(data1, fit$cluster)
# Same as above: plot the features, not the features plus the labels.
clusplot(data1, fit$cluster,
         color = TRUE, shade = TRUE, labels = 2, lines = 0)
# K-Means Clustering with 5 clusters
# Fit on the scaled features (data1). mydata additionally carries the cluster
# labels of the previous fit, which must not be used as an input feature.
fit <- kmeans(data1, 5)

# Total within-cluster sum of squares for k = 1, ..., max_k clusters.
#
# x:     numeric matrix or data frame, one observation per row.
# max_k: largest number of clusters to try (default 15).
# Returns a numeric vector of length max_k; element k belongs to k clusters.
wss_by_k <- function(x, max_k = 15) {
  # k = 1: total sum of squares around the column means.
  wss_one <- (nrow(x) - 1) * sum(apply(x, 2, var))
  if (max_k < 2) {
    return(wss_one)
  }
  wss_rest <- vapply(
    seq_len(max_k)[-1],
    function(k) kmeans(x, centers = k)$tot.withinss,
    numeric(1)
  )
  c(wss_one, wss_rest)
}

# Determine number of clusters
wss <- wss_by_k(data1)
# A plot of the within groups sum of squares by number of clusters extracted
# can help determine the appropriate number of clusters.
# The analyst looks for a bend in the plot similar to a scree test in factor
# analysis.
# The total within-cluster variation generally keeps shrinking as clusters are
# added, so look for the bend rather than for the lowest value.
plot(seq_along(wss), wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

# Determine number of clusters (repeat run)
# kmeans() picks random starting centres, so a second curve shows how stable
# the bend is from run to run.
wss <- wss_by_k(data1)
plot(seq_along(wss), wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

## Cluster Plot against 1st 2 principal components
# vary parameters for most readable graph
library(cluster)
# Plot the scaled features only; mydata additionally holds a cluster-label
# column, which is not a measured feature and would distort the projection.
clusplot(data1, fit$cluster,
         color = TRUE, shade = TRUE, labels = 2, lines = 0)

# Centroid Plot against 1st 2 discriminant functions
# fpc is installed on demand here, mirroring how the other packages are
# installed before being loaded.
if (!requireNamespace("fpc", quietly = TRUE)) {
  install.packages("fpc")
}
library(fpc)
plotcluster(data1, fit$cluster)

# Visualise the distance matrix computed earlier.
fviz_dist(
  distance,
  gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")
)
# try with 25 attempts, 2 clusters
km <- kmeans(data1, centers = 2, nstart = 25)
str(km)
# The output of kmeans is a list with several bits of information. The most
# important being:
#   cluster:      A vector of integers (from 1:k) indicating the cluster to
#                 which each point is allocated.
#   centers:      A matrix of cluster centers.
#   totss:        The total sum of squares.
#   withinss:     Vector of within-cluster sum of squares, one component per
#                 cluster.
#   tot.withinss: Total within-cluster sum of squares, i.e. sum(withinss).
#   betweenss:    The between-cluster sum of squares, i.e.
#                 totss - tot.withinss.
#   size:         The number of points in each cluster.

# print the clusters
print(km)

# Plot clusters
fviz_cluster(km, data = data1)

# 8-cluster fit; the outer parentheses print the result as well as storing it.
(cl <- kmeans(data1, 8))

# Scatter of the first two columns of data1, coloured by cluster.
plot(data1, col = cl$cluster)
# Mark the centres with one colour index per centre so that each centre gets
# the same colour as the points of its cluster (a fixed 1:3 would be recycled
# across the 8 centres and no longer match).
points(cl$centers, col = seq_len(nrow(cl$centers)), pch = 8, cex = 2)
# Sum of squares: centre each column of x on its mean (no rescaling), then add
# up the squared deviations.
ss <- function(x) {
  centred <- scale(x, scale = FALSE)
  sum(centred^2)
}

## For every observation, the centre of the cluster it was assigned to.
fitted.data1 <- fitted(cl)
head(fitted.data1)
## Residuals: each observation minus its own cluster centre.
resid.data1 <- data1 - fitted.data1

## Sum-of-squares identities ------------------------------------------------
## First column: the components stored on the kmeans fit; second column: the
## same quantities recomputed with ss(). The two columns should agree.
cbind(
  cl[c("betweenss", "tot.withinss", "totss")],
  c(ss(fitted.data1), ss(resid.data1), ss(data1))
)
stopifnot(
  all.equal(cl$totss, ss(data1)),
  all.equal(cl$tot.withinss, ss(resid.data1)),
  ## the next three describe the same quantity:
  all.equal(cl$betweenss, ss(fitted.data1)),
  all.equal(cl$betweenss, cl$totss - cl$tot.withinss),
  ## which in turn gives the decomposition of the total:
  all.equal(ss(data1), ss(fitted.data1) + ss(resid.data1))
)

## Trivial single-cluster fit: its within-cluster SS equals ss(data1).
kmeans(data1, 1)$withinss

## random starts do help here with too many clusters
## (and are often recommended anyway!):
## The scaled data in this script is data1; no object named x is ever created,
## so the calls below use data1.
(cl <- kmeans(data1, 5, nstart = 25))
plot(data1, col = cl$cluster)
points(cl$centers, col = 1:5, pch = 8)

# "You might also like" (navigation text left over from the web-page export; not part of the script)