# cluster.R

# This function finds the medoids, given a distance matrix
# and a vector of cluster assignments
find_representatives <- function(D, cl) {

  D <- as.matrix(D)
  n_clusters <- max(cl)

  res <- vector(mode = "integer", length = n_clusters)

  for (i in seq(n_clusters)) {
    idx <- which(cl == i)
    if (length(idx) > 1) {
      # The medoid is the member with the lowest mean
      # dissimilarity to the rest of its cluster
      tmp <- colMeans(D[idx, idx])
      res[i] <- idx[which.min(tmp)[1]]
    } else {
      # A singleton cluster is its own representative
      res[i] <- idx
    }
  }

  return(res)
}
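
# A minimal usage sketch, assuming the distance matrix comes from
# stats::dist() and the assignment vector uses labels 1..k (the toy
# objects below are illustrative):
#
#   D_toy  <- dist(matrix(rnorm(20), ncol = 2))
#   cl_toy <- rep(1:2, each = 5)
#   find_representatives(D_toy, cl_toy)  # one medoid index per cluster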
# This function clusters implications, given the
# dissimilarity matrix and the number k of clusters.
# All methods are equivalent; the only difference is
# their performance on larger datasets.
cluster_implications <- function(diss, k,
                                 method = "diana") {

  res <- list(clustering = NULL,
              representatives = NULL,
              mean_diss = NULL,
              max_diss = NULL,
              inter = NULL)

  # Several methods have been implemented, all of them
  # equivalent to PAM and based on the same partitioning
  # idea, but performing slightly better on large datasets.
if (method == "pam") {
cl <- cluster::pam(diss, k = k)
res$clustering <- cl$clustering
res$representatives <- cl$id.med
}
if (method == "fanny") {
if (k > 1) {
cl <- cluster::fanny(x = diss,
k = k,
memb.exp = 1.1,
maxit = 1000)
res$clustering <- cl$clustering
} else {
res$clustering <- rep(1, attr(diss, "Size"))
}
res$representatives <- find_representatives(diss,
res$clustering)
}
if (method == "agnes") {
cl <- cluster::agnes(diss, method = "ward")
res$clustering <- cutree(cl, k = k)
res$representatives <- find_representatives(diss,
res$clustering)
}
if (method == "diana") {
cl <- cluster::diana(diss)
res$clustering <- cutree(cl, k = k)
res$representatives <- find_representatives(diss,
res$clustering)
}
  D <- as.matrix(diss)

  # Mean and maximum dissimilarity between each cluster's
  # representative and the members of that cluster
  for (i in seq(k)) {
    ii <- which(res$clustering == i)
    Dtmp <- D[res$representatives[i], ii]
    res$mean_diss[i] <- mean(Dtmp)
    res$max_diss[i] <- max(Dtmp)
  }
  # And, for each pair of clusters, the minimum, trimmed mean
  # and maximum inter-cluster dissimilarity; when both clusters
  # have several elements, each member of cluster i is first
  # reduced to its minimum dissimilarity to cluster j
  res$inter <- tibble::tibble(i = NULL, j = NULL,
                              min_d = NULL, mean_d = NULL,
                              max_d = NULL)

  if (k > 1) {
    for (i in seq(k - 1)) {
      for (j in seq(i + 1, k)) {
        foo <- D[which(res$clustering == i), which(res$clustering == j)]
        if (!is.vector(foo)) {
          foo <- apply(foo, 1, min)
        }
        res$inter <- rbind(res$inter,
                           tibble::tibble(i = i, j = j,
                                          min_d = min(foo),
                                          mean_d = mean(foo, trim = 0.1),
                                          max_d = max(foo)))
      }
    }
  } else {
    # With a single cluster, there are no inter-cluster pairs
    res$inter <- tibble::tibble(i = 1, j = 1,
                                min_d = 0, mean_d = 0,
                                max_d = 0)
  }

  return(res)
}
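
# A minimal usage sketch, assuming `diss` is a "dist" object over the
# implications (e.g. from stats::dist() or a custom dissimilarity);
# the toy objects and k = 3 below are illustrative:
#
#   diss_toy <- dist(matrix(rnorm(40), ncol = 2))
#   out <- cluster_implications(diss_toy, k = 3, method = "pam")
#   out$representatives  # one medoid index per cluster
#   out$inter            # pairwise inter-cluster dissimilarity summary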