# RRRR
# Environment setup for the k-means clustering walkthrough.

# Report the memory limit and the peak memory obtained so far.
# NOTE(review): these helpers are Windows-only and recent R releases reduce
# them to stubs that only warn; confirm they are still wanted.
memory.limit()
memory.size(max = TRUE)

# Start from an empty workspace so objects from earlier runs cannot leak in.
rm(list = ls())

# Install only the packages that are missing instead of downloading all of
# them again on every run. fpc is included because the script loads it later
# for plotcluster().
needed_pkgs <- c("tidyverse", "cluster", "factoextra", "fpc")
missing_pkgs <- needed_pkgs[
  !vapply(needed_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

library(tidyverse)  # data manipulation
library(cluster)    # clustering algorithms
library(factoextra) # clustering algorithms & visualization
library(dplyr)

# Work from the folder that holds the data file.
setwd("C:/users/JOSEPH/Documents/codes")
getwd()
# Import test data
data <- read.csv("malware_dataset1.csv")
print(summary(data))

# Student: I added the command below. Scaling the data failed, so I converted
# all the data to numeric.
# Assigning through data[] keeps the data-frame shape and the column names for
# any number of rows; sapply() would collapse a single-row input to a vector.
data[] <- lapply(data, as.numeric)

# Keep columns 13 to 23 and drop every row that has a missing value.
data.1 <- select(data, 13:23)
data1 <- na.omit(data.1)
print(summary(data1))

# As we don't want the clustering algorithm to depend on an arbitrary
# variable unit, we start by scaling the data using the R function scale:
data1 <- scale(data1)

head(data1)

# Distance matrix between the scaled observations.
distance <- get_dist(data1)
print(distance)
# plot cluster library
library(cluster)

# K-Means Cluster Analysis

# simplest example, just the dataset and number of clusters
# (rows with missing values were already dropped before scaling, so data1 is
# passed as-is and the cluster vector lines up with its rows)
fit <- kmeans(data1, centers = 5) # 5 cluster solution
# get cluster means
aggregate(data1, by = list(fit$cluster), FUN = mean)
# append cluster assignment
mydata <- data.frame(data1, fit$cluster)
clusplot(mydata, fit$cluster, color = TRUE, shade = TRUE,
         labels = 2, lines = 0)

fit <- kmeans(data1, centers = 8) # 8 cluster solution
# get cluster means
aggregate(data1, by = list(fit$cluster), FUN = mean)
# append cluster assignment
mydata <- data.frame(data1, fit$cluster)
# The opening line of this call had been lost, leaving only its tail behind.
clusplot(mydata, fit$cluster, color = TRUE, shade = TRUE,
         labels = 2, lines = 0)

# K-Means Clustering with 5 clusters
# Fit on the scaled features only: mydata also carries the 8-cluster labels,
# which must not be fed back in as a clustering feature.
fit <- kmeans(data1, centers = 5)
# Determine number of clusters
# A plot of the within-groups sum of squares by number of clusters extracted
# can help determine the appropriate number of clusters. The analyst looks for
# a bend in the plot, similar to a scree test in factor analysis.
# We want the total within-cluster variation to be the lowest.
# (This computation and its plot appeared twice back to back; it is done once
# here.)

# Never ask for more clusters than the data can support.
max_k <- min(15, nrow(data1) - 1)

# First entry: total sum of squares, i.e. the one-cluster solution.
# Remaining entries: within-cluster sum of squares for k = 2 .. max_k.
wss <- c(
  (nrow(data1) - 1) * sum(apply(data1, 2, var)),
  vapply(
    seq_len(max_k)[-1],
    function(k) sum(kmeans(data1, centers = k)$withinss),
    numeric(1)
  )
)

plot(seq_along(wss), wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")
## Cluster Plot against 1st 2 principal components
# vary parameters for most readable graph

library(cluster)
# The opening line of this call was missing; only "labels=2, lines=0)" was
# left, which does not parse on its own.
clusplot(mydata, fit$cluster, color = TRUE, shade = TRUE,
         labels = 2, lines = 0)

# Centroid Plot against 1st 2 discriminant functions
library(fpc)
plotcluster(mydata, fit$cluster)

# Visualise the distance matrix with a three-colour gradient.
fviz_dist(distance,
          gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
# try with 25 attempts, 2 clusters
km <- kmeans(data1, centers = 2, nstart = 25)
str(km)
# The output of kmeans is a list with several bits of information. The most
# important being:
# cluster: A vector of integers (from 1:k) indicating the cluster to which
#   each point is allocated.
# centers: A matrix of cluster centers.
# totss: The total sum of squares.
# withinss: Vector of within-cluster sum of squares, one component per
#   cluster.
# tot.withinss: Total within-cluster sum of squares, i.e. sum(withinss).
# betweenss: The between-cluster sum of squares, i.e. totss - tot.withinss.
# size: The number of points in each cluster.

# print the clusters
print(km)

# Plot clusters
fviz_cluster(km, data = data1)
# Fit an 8-cluster solution; the outer parentheses also print it.
(cl <- kmeans(data1, centers = 8))

# Plot the observations coloured by cluster, then overlay the cluster centres
# as stars. One colour per centre, so each star matches its own cluster
# (the previous 1:3 was recycled across the 8 centres).
plot(data1, col = cl$cluster)
points(cl$centers, col = seq_len(nrow(cl$centers)), pch = 8, cex = 2)
# Sum of squares of x about its column means: each column is centred (but not
# rescaled) and the squared deviations are added up.
ss <- function(x) {
  centred <- scale(x, center = TRUE, scale = FALSE)
  sum(centred^2)
}
## Cluster centre assigned to each observation, and what is left over once
## that centre is subtracted.
fitted.data1 <- fitted(cl)
head(fitted.data1)
resid.data1 <- data1 - fitted.data1

## Equalities : ----------------------------------
## Sums of squares reported by kmeans side by side with the ones recomputed
## through ss(); the two columns should agree.
cbind(
  cl[c("betweenss", "tot.withinss", "totss")],
  c(ss(fitted.data1), ss(resid.data1), ss(data1))
)

## Each identity is checked in turn; the first failure stops the script.
stopifnot(all.equal(cl$totss, ss(data1)))
stopifnot(all.equal(cl$tot.withinss, ss(resid.data1)))
## these three are the same:
stopifnot(all.equal(cl$betweenss, ss(fitted.data1)))
stopifnot(all.equal(cl$betweenss, cl$totss - cl$tot.withinss))
## and hence also
stopifnot(all.equal(ss(data1), ss(fitted.data1) + ss(resid.data1)))

# trivial one-cluster, (its W.SS == ss(x))
kmeans(data1, centers = 1)$withinss
## random starts do help here with too many clusters
## (and are often recommended anyway!):
# The calls below used `x`, which this script never defines (it looks like a
# leftover from a copied example); the scaled data set data1 is what is
# clustered everywhere else, so it is used here too.
(cl <- kmeans(data1, centers = 5, nstart = 25))
plot(data1, col = cl$cluster)
points(cl$centers, col = 1:5, pch = 8)

RRRR

Diunggah oleh

Informasi Dokumen

Judul Asli

Hak Cipta

Format Tersedia

Bagikan dokumen Ini

Bagikan atau Tanam Dokumen

Opsi Berbagi

Apakah menurut Anda dokumen ini bermanfaat?

Apakah konten ini tidak pantas?

Hak Cipta:

Format Tersedia

RRRR

Diunggah oleh

Hak Cipta:

Format Tersedia

memory.

data1 <- scale(data1)

# plot cluster library

# K-Means Cluster Analysis

clusplot(mydata, fit$cluster, color=TRUE, shade=TRUE,

clusplot(mydata, fit$cluster, color=TRUE, shade=TRUE,

# Determine number of clusters

## Cluster Plot against 1st 2 principal components

# vary parameters for most readable graph

# Centroid Plot against 1st 2 discriminant functions

fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high =

# print the clusters

(cl <- kmeans(data1, 8))

## cluster centers "fitted" to each obs.:

kmeans(data1,1)$withinss # trivial one-cluster, (its W.SS == ss(x))

## random starts do help here with too many clusters

Anda mungkin juga menyukai