Skip to content

Commit ba4855c

Browse files
authored
N clusters medoids fix (#129)
* Don't sample with replacement for random placement. You'll always have an inertia at least as good as you would with replacement; i.e. picking distinct medoids only helps the optimization.
* Add test to ensure n_clusters distinct medoids. It appears that only the "random" initialization has the bug, but it's cheap to test all initializations + methods.
* The initial medoids changed for determinism. We're now sampling differently, so the initial deterministic sample is different. This updates the test to use the new deterministic medoids.
1 parent 3876fbd commit ba4855c

File tree

2 files changed

+20
-2
lines changed

2 files changed

+20
-2
lines changed

sklearn_extra/cluster/_k_medoids.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -412,7 +412,7 @@ def _initialize_medoids(self, D, n_clusters, random_state_):
412412

413413
if self.init == "random": # Random initialization
414414
# Pick random k medoids as the initial ones.
415-
medoids = random_state_.choice(len(D), n_clusters)
415+
medoids = random_state_.choice(len(D), n_clusters, replace=False)
416416
elif self.init == "k-medoids++":
417417
medoids = self._kpp_init(D, n_clusters, random_state_)
418418
elif self.init == "heuristic": # Initialization by heuristic

sklearn_extra/cluster/tests/test_k_medoids.py

+19-1
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,24 @@ def test_kmedoid_results(method, init, dtype):
4848
assert dtype is np.dtype(km.transform(X_cc.astype(dtype)).dtype).type
4949

5050

51+
@pytest.mark.parametrize("method", ["alternate", "pam"])
52+
@pytest.mark.parametrize(
53+
"init", ["random", "heuristic", "build", "k-medoids++"]
54+
)
55+
def test_kmedoid_nclusters(method, init):
56+
n_clusters = 50
57+
58+
km = KMedoids(
59+
n_clusters=n_clusters,
60+
init=init,
61+
method=method,
62+
max_iter=1,
63+
random_state=rng,
64+
)
65+
km.fit(X_cc)
66+
assert len(np.unique(km.medoid_indices_)) == n_clusters
67+
68+
5169
def test_clara_results():
5270
expected = np.hstack([np.zeros(50), np.ones(50)])
5371
km = CLARA(n_clusters=2)
@@ -113,7 +131,7 @@ def test_random_deterministic():
113131
D = euclidean_distances(X)
114132

115133
medoids = KMedoids(init="random")._initialize_medoids(D, 4, rng)
116-
assert_array_equal(medoids, [47, 117, 67, 103])
134+
assert_array_equal(medoids, [114, 62, 33, 107])
117135

118136

119137
def test_heuristic_deterministic():

0 commit comments

Comments
 (0)