From 83b05163e4f2b33bfd504df2735836324e125fd9 Mon Sep 17 00:00:00 2001
From: Javier Alvarez
Date: Thu, 29 Nov 2018 15:26:21 +0100
Subject: [PATCH 1/2] Renamed Square to Region

---
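Notes: Square named the cells of DBSCAN's grid decomposition, but the grid
is indexed with np.ndindex(grid.shape) and can have any number of
dimensions, so each cell is an n-dimensional region rather than a 2-D
square; presumably that is the motivation for the rename. A minimal sketch
of the indexing pattern the renamed class serves (plain NumPy; the grid
shape here is hypothetical):

    import numpy as np

    # A 3-D grid of cells: np.ndindex yields every coordinate tuple, which
    # is why "Region" fits the per-cell class better than "Square".
    grid = np.empty((2, 2, 2), dtype=object)

    for idx in np.ndindex(grid.shape):
        grid[idx] = idx  # stand-in for Region(idx, eps, grid.shape, sizes)
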
 dislib/cluster/dbscan/base.py    | 4 ++--
 dislib/cluster/dbscan/classes.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/dislib/cluster/dbscan/base.py b/dislib/cluster/dbscan/base.py
index 4cedcfab..02e7cac8 100644
--- a/dislib/cluster/dbscan/base.py
+++ b/dislib/cluster/dbscan/base.py
@@ -4,7 +4,7 @@
 from pycompss.api.api import compss_wait_on
 from pycompss.api.task import task
 
-from dislib.cluster.dbscan.classes import Square
+from dislib.cluster.dbscan.classes import Region
 from dislib.data import Dataset
 
 
@@ -101,7 +101,7 @@ def fit(self, dataset):
 
         # Compute dbscan in each region of the grid
         for idx in np.ndindex(grid.shape):
-            grid[idx] = Square(idx, self._eps, grid.shape, region_sizes)
+            grid[idx] = Region(idx, self._eps, grid.shape, region_sizes)
             grid[idx].init_data(sorted_data, grid.shape)
             grid[idx].partial_scan(self._min_samples, self._max_samples)
 
diff --git a/dislib/cluster/dbscan/classes.py b/dislib/cluster/dbscan/classes.py
index 771c4504..5c246443 100644
--- a/dislib/cluster/dbscan/classes.py
+++ b/dislib/cluster/dbscan/classes.py
@@ -9,7 +9,7 @@
 _NO_CP = -3
 
 
-class Square(object):
+class Region(object):
     def __init__(self, coord, epsilon, grid_shape, region_sizes):
         self.coord = coord
         self.epsilon = epsilon

From 2ba6778b7590f90fa966715b39cb344dd45bdc52 Mon Sep 17 00:00:00 2001
From: Javier Alvarez
Date: Thu, 29 Nov 2018 15:51:17 +0100
Subject: [PATCH 2/2] [K-means] Fixed bug in fit_predict function

---
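Notes: the central fix is in fit_predict. It used to wrap the list of
per-subset label arrays with np.array, which preserves the chunk structure
(one entry per Subset, or a 2-D matrix when all subsets happen to be the
same size) instead of returning one label per sample; np.concatenate
flattens the chunks correctly. A minimal sketch of the difference in plain
NumPy, outside the PyCOMPSs runtime, with hypothetical chunk sizes of 300
and 10 samples:

    import numpy as np

    # Hypothetical per-subset label chunks, mirroring unevenly sized Subsets.
    chunks = [np.zeros(300, dtype=int), np.ones(10, dtype=int)]

    # np.array keeps one entry per chunk: an object array of shape (2,)
    # here, or a (2, n) matrix when the chunks are equally sized.
    wrong = np.array(chunks, dtype=object)
    assert wrong.shape == (2,)

    # np.concatenate yields the intended flat vector, one label per sample.
    right = np.concatenate(chunks)
    assert right.shape == (310,)

The patch also makes _init_centers take n_features directly instead of a
whole Subset (no data partition needs to be fetched just to size the random
centers), and renames the loop variable in predict so it no longer shadows
the input array x.
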
 dislib/cluster/kmeans/base.py |  12 +-
 examples/kmeans.py            |   1 +
 tests/tests.py                | 320 +++++++++++++++++-----------------
 3 files changed, 169 insertions(+), 164 deletions(-)

diff --git a/dislib/cluster/kmeans/base.py b/dislib/cluster/kmeans/base.py
index 4d2f5740..ca28b618 100644
--- a/dislib/cluster/kmeans/base.py
+++ b/dislib/cluster/kmeans/base.py
@@ -62,7 +62,7 @@ def fit(self, dataset):
         -----
         This method modifies the input Dataset by setting the cluster labels.
         """
-        centers = _init_centers(dataset[0], self._n_clusters,
+        centers = _init_centers(dataset.n_features, self._n_clusters,
                                 self._random_state)
         self.centers = compss_wait_on(centers)
 
@@ -105,7 +105,7 @@ def fit_predict(self, dataset):
         for subset in dataset:
             labels.append(_get_label(subset))
 
-        return np.array(compss_wait_on(labels))
+        return np.concatenate(compss_wait_on(labels))
 
     def predict(self, x):
         """ Predict the closest cluster each sample in x belongs to.
@@ -122,9 +122,10 @@ def predict(self, x):
         """
         labels = []
 
-        for x in x:
-            dist = np.linalg.norm(x - self.centers, axis=1)
+        for sample in x:
+            dist = np.linalg.norm(sample - self.centers, axis=1)
             labels.append(np.argmin(dist))
+
         return np.array(labels)
 
     def _converged(self, old_centers, iter):
@@ -154,9 +155,8 @@ def _get_label(subset):
 
 
 @task(returns=np.array)
-def _init_centers(subset, n_clusters, random_state):
+def _init_centers(n_features, n_clusters, random_state):
     np.random.seed(random_state)
-    n_features = subset.samples.shape[1]
     centers = np.random.random((n_clusters, n_features))
     return centers
 
diff --git a/examples/kmeans.py b/examples/kmeans.py
index a4f6384e..c002e088 100644
--- a/examples/kmeans.py
+++ b/examples/kmeans.py
@@ -67,6 +67,7 @@ def main():
 
     kmeans = KMeans(n_clusters=3, random_state=random_state)
     y_pred = kmeans.fit_predict(dataset)
+
     plt.subplot(224)
     plt.scatter(x_filtered[:, 0], x_filtered[:, 1], c=y_pred)
     centers = kmeans.centers
 
diff --git a/tests/tests.py b/tests/tests.py
index 55b96a70..62ad8120 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -10,16 +10,15 @@
 from sklearn.datasets import make_moons
 from sklearn.preprocessing import StandardScaler
 
-from dislib.cluster import DBSCAN
+from dislib.cluster import DBSCAN, KMeans
 from dislib.data import Subset
 from dislib.data import load_csv_file
 from dislib.data import load_csv_files
 from dislib.data import load_data
-from dislib.data import load_libsvm_file
-from dislib.data import load_libsvm_files
+from dislib.data import load_libsvm_file, load_libsvm_files
 
-sys.path.insert(0,
-                os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+# sys.path.insert(0,
+#                 os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 
 # the import tests should be removed; import should be tests in class specific
 
@@ -48,6 +47,25 @@ def test_cascadecsvm(self):
         self.assertIsNotNone(CascadeSVM)
 
 
+class KMeansTest(unittest.TestCase):
+    def test_fit_predict(self):
+        x, y = make_blobs(n_samples=1500, random_state=170)
+        x_filtered = np.vstack(
+            (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]))
+
+        dataset = load_data(x_filtered, subset_size=300)
+
+        kmeans = KMeans(n_clusters=3, random_state=170)
+        labels = kmeans.fit_predict(dataset)
+
+        centers = np.array([[-8.941375656533449, -5.481371322614891],
+                            [-4.524023204953875, 0.06235042593214654],
+                            [2.332994701667008, 0.37681003933082696]])
+
+        self.assertTrue((centers == kmeans.centers).all())
+        self.assertEqual(labels.size, 610)
+
+
 class DBSCANTest(unittest.TestCase):
     def test_n_clusters_blobs(self):
         n_samples = 1500
@@ -104,12 +122,6 @@ def test_n_clusters_aniso(self):
         self.assertEqual(n_clusters, 4)
 
     def test_n_clusters_blobs_max_samples(self):
-        from sklearn.datasets import make_blobs
-        from dislib.cluster import DBSCAN
-        from sklearn.preprocessing import StandardScaler
-        from dislib.data import load_data
-        import numpy as np
-
         n_samples = 1500
 
         # Test blobs
@@ -123,12 +135,6 @@ def test_n_clusters_blobs_max_samples(self):
         self.assertEqual(n_clusters, 3)
 
     def test_n_clusters_circles_max_samples(self):
-        from sklearn.datasets import make_circles
-        from dislib.cluster import DBSCAN
-        from sklearn.preprocessing import StandardScaler
-        from dislib.data import load_data
-        import numpy as np
-
         n_samples = 1500
 
         # Test circles
@@ -142,12 +148,6 @@ def test_n_clusters_circles_max_samples(self):
         self.assertEqual(n_clusters, 2)
 
     def test_n_clusters_moons_max_samples(self):
-        from sklearn.datasets import make_moons
-        from dislib.cluster import DBSCAN
-        from sklearn.preprocessing import StandardScaler
-        from dislib.data import load_data
-        import numpy as np
-
         n_samples = 1500
 
         # Test moons
@@ -161,12 +161,6 @@ def test_n_clusters_moons_max_samples(self):
         self.assertEqual(n_clusters, 2)
 
     def test_n_clusters_aniso_max_samples(self):
-        from sklearn.datasets import make_blobs
-        from dislib.cluster import DBSCAN
-        from sklearn.preprocessing import StandardScaler
-        from dislib.data import load_data
-        import numpy as np
-
         n_samples = 1500
 
         # Test aniso
@@ -184,7 +178,6 @@ class DataLoadingTest(unittest.TestCase):
     def test_load_data_with_labels(self):
-
         x, y = make_blobs(n_samples=1500)
         data = load_data(x=x, y=y, subset_size=100)
 
         read_x = np.empty((0, x.shape[1]))
         read_y = np.empty(0)
@@ -199,189 +192,199 @@ def test_load_data_with_labels(self):
         self.assertTrue((read_y == y).all())
         self.assertEqual(len(data), 15)
 
-    def test_load_data_without_labels(self):
-        x, y = make_blobs(n_samples=1500)
-        data = load_data(x=x, subset_size=100)
+
+def test_load_data_without_labels(self):
+    x, y = make_blobs(n_samples=1500)
+    data = load_data(x=x, subset_size=100)
 
-        read_x = np.empty((0, x.shape[1]))
+    read_x = np.empty((0, x.shape[1]))
 
-        for subset in data:
-            read_x = np.concatenate((read_x, subset.samples))
+    for subset in data:
+        read_x = np.concatenate((read_x, subset.samples))
 
-        self.assertTrue((read_x == x).all())
-        self.assertEqual(len(data), 15)
+    self.assertTrue((read_x == x).all())
+    self.assertEqual(len(data), 15)
 
-    def test_load_libsvm_file_sparse(self):
-        file_ = "tests/files/libsvm/2"
-        data = load_libsvm_file(file_, 10, 780)
-        data.collect()
-        x, y = load_svmlight_file(file_, n_features=780)
+
+def test_load_libsvm_file_sparse(self):
+    file_ = "tests/files/libsvm/2"
+
+    data = load_libsvm_file(file_, 10, 780)
+    data.collect()
+    x, y = load_svmlight_file(file_, n_features=780)
 
-        read_x = np.empty((0, x.shape[1]))
-        read_y = np.empty(0)
+    read_x = np.empty((0, x.shape[1]))
+    read_y = np.empty(0)
 
-        for subset in data:
-            read_x = np.concatenate((read_x, subset.samples.toarray()))
-            read_y = np.concatenate((read_y, subset.labels))
+    for subset in data:
+        read_x = np.concatenate((read_x, subset.samples.toarray()))
+        read_y = np.concatenate((read_y, subset.labels))
 
-        self.assertTrue((read_x == x.toarray()).all())
-        self.assertTrue((read_y == y).all())
-        self.assertEqual(len(data), 6)
+    self.assertTrue((read_x == x.toarray()).all())
+    self.assertTrue((read_y == y).all())
+    self.assertEqual(len(data), 6)
 
-    def test_load_libsvm_file_dense(self):
-        file_ = "tests/files/libsvm/1"
-        data = load_libsvm_file(file_, 20, 780, False)
-        data.collect()
-        x, y = load_svmlight_file(file_, n_features=780)
+
+def test_load_libsvm_file_dense(self):
+    file_ = "tests/files/libsvm/1"
+
+    data = load_libsvm_file(file_, 20, 780, False)
+    data.collect()
+    x, y = load_svmlight_file(file_, n_features=780)
 
-        read_x = np.empty((0, x.shape[1]))
-        read_y = np.empty(0)
+    read_x = np.empty((0, x.shape[1]))
+    read_y = np.empty(0)
 
-        for subset in data:
-            read_x = np.concatenate((read_x, subset.samples))
-            read_y = np.concatenate((read_y, subset.labels))
+    for subset in data:
+        read_x = np.concatenate((read_x, subset.samples))
+        read_y = np.concatenate((read_y, subset.labels))
 
-        self.assertTrue((read_x == x.toarray()).all())
-        self.assertTrue((read_y == y).all())
-        self.assertEqual(len(data), 4)
+    self.assertTrue((read_x == x.toarray()).all())
+    self.assertTrue((read_y == y).all())
+    self.assertEqual(len(data), 4)
 
-    def test_load_libsvm_files_sparse(self):
-        dir_ = "tests/files/libsvm"
-        file_list = os.listdir(dir_)
-        data = load_libsvm_files(dir_, 780)
-        data.collect()
+
+def test_load_libsvm_files_sparse(self):
+    dir_ = "tests/files/libsvm"
+
+    file_list = os.listdir(dir_)
+    data = load_libsvm_files(dir_, 780)
+    data.collect()
 
-        for i, subset in enumerate(data):
-            samples = subset.samples.toarray()
-            file_ = os.path.join(dir_, file_list[i])
-            x, y = load_svmlight_file(file_, n_features=780)
+    for i, subset in enumerate(data):
+        samples = subset.samples.toarray()
+        file_ = os.path.join(dir_, file_list[i])
+        x, y = load_svmlight_file(file_, n_features=780)
 
-            self.assertTrue((samples == x).all())
-            self.assertTrue((subset.labels == y).all())
+        self.assertTrue((samples == x).all())
+        self.assertTrue((subset.labels == y).all())
 
-        self.assertEqual(len(data), 3)
+    self.assertEqual(len(data), 3)
 
-    def test_load_libsvm_files_dense(self):
-        dir_ = "tests/files/libsvm"
-        file_list = os.listdir(dir_)
-        data = load_libsvm_files(dir_, 780, False)
-        data.collect()
+
+def test_load_libsvm_files_dense(self):
+    dir_ = "tests/files/libsvm"
+
+    file_list = os.listdir(dir_)
+    data = load_libsvm_files(dir_, 780, False)
+    data.collect()
 
-        for i, subset in enumerate(data):
-            samples = subset.samples
-            file_ = os.path.join(dir_, file_list[i])
-            x, y = load_svmlight_file(file_, n_features=780)
+    for i, subset in enumerate(data):
+        samples = subset.samples
+        file_ = os.path.join(dir_, file_list[i])
+        x, y = load_svmlight_file(file_, n_features=780)
 
-            self.assertTrue((samples == x).all())
-            self.assertTrue((subset.labels == y).all())
+        self.assertTrue((samples == x).all())
+        self.assertTrue((subset.labels == y).all())
 
-        self.assertEqual(len(data), 3)
+    self.assertEqual(len(data), 3)
 
-    def test_load_csv_file(self):
-        csv_file = "tests/files/csv/1"
-        data = load_csv_file(csv_file, subset_size=300, n_features=122)
-        data.collect()
-        csv = np.loadtxt(csv_file, delimiter=",")
+
+def test_load_csv_file(self):
+    csv_file = "tests/files/csv/1"
+
+    data = load_csv_file(csv_file, subset_size=300, n_features=122)
+    data.collect()
+    csv = np.loadtxt(csv_file, delimiter=",")
 
-        read_x = np.empty((0, csv.shape[1]))
+    read_x = np.empty((0, csv.shape[1]))
 
-        for subset in data:
-            read_x = np.concatenate((read_x, subset.samples))
+    for subset in data:
+        read_x = np.concatenate((read_x, subset.samples))
 
-        self.assertTrue((read_x == csv).all())
-        self.assertEqual(len(data), 15)
-        self.assertIsNone(subset.labels)
+    self.assertTrue((read_x == csv).all())
+    self.assertEqual(len(data), 15)
+    self.assertIsNone(subset.labels)
 
-    def test_load_csv_file_labels_last(self):
-        csv_file = "tests/files/csv/1"
-        data = load_csv_file(csv_file, subset_size=1000, n_features=121,
-                             label_col="last")
-        data.collect()
-        csv = np.loadtxt(csv_file, delimiter=",")
+
+def test_load_csv_file_labels_last(self):
+    csv_file = "tests/files/csv/1"
+
+    data = load_csv_file(csv_file, subset_size=1000, n_features=121,
+                         label_col="last")
+    data.collect()
+    csv = np.loadtxt(csv_file, delimiter=",")
 
-        read_x = np.empty((0, csv.shape[1] - 1))
-        read_y = np.empty(0)
+    read_x = np.empty((0, csv.shape[1] - 1))
+    read_y = np.empty(0)
 
-        for subset in data:
-            read_x = np.concatenate((read_x, subset.samples))
-            read_y = np.concatenate((read_y, subset.labels))
+    for subset in data:
+        read_x = np.concatenate((read_x, subset.samples))
+        read_y = np.concatenate((read_y, subset.labels))
 
-        self.assertTrue((read_x == csv[:, :-1]).all())
-        self.assertTrue((read_y == csv[:, -1]).all())
-        self.assertEqual(len(data), 5)
+    self.assertTrue((read_x == csv[:, :-1]).all())
+    self.assertTrue((read_y == csv[:, -1]).all())
+    self.assertEqual(len(data), 5)
 
-    def test_load_csv_file_labels_first(self):
-        csv_file = "tests/files/csv/2"
-        data = load_csv_file(csv_file, subset_size=100, n_features=121,
-                             label_col="first")
-        data.collect()
-        csv = np.loadtxt(csv_file, delimiter=",")
+
+def test_load_csv_file_labels_first(self):
+    csv_file = "tests/files/csv/2"
+
+    data = load_csv_file(csv_file, subset_size=100, n_features=121,
+                         label_col="first")
+    data.collect()
+    csv = np.loadtxt(csv_file, delimiter=",")
 
-        read_x = np.empty((0, csv.shape[1] - 1))
-        read_y = np.empty(0)
+    read_x = np.empty((0, csv.shape[1] - 1))
+    read_y = np.empty(0)
 
-        for subset in data:
-            read_x = np.concatenate((read_x, subset.samples))
-            read_y = np.concatenate((read_y, subset.labels))
+    for subset in data:
+        read_x = np.concatenate((read_x, subset.samples))
+        read_y = np.concatenate((read_y, subset.labels))
 
-        self.assertTrue((read_x == csv[:, 1:]).all())
-        self.assertTrue((read_y == csv[:, 0]).all())
-        self.assertEqual(len(data), 44)
+    self.assertTrue((read_x == csv[:, 1:]).all())
+    self.assertTrue((read_y == csv[:, 0]).all())
+    self.assertEqual(len(data), 44)
 
-    def test_load_csv_files(self):
-        csv_dir = "tests/files/csv"
-        file_list = os.listdir(csv_dir)
-        data = load_csv_files(csv_dir, n_features=122)
-        data.collect()
+
+def test_load_csv_files(self):
+    csv_dir = "tests/files/csv"
+
+    file_list = os.listdir(csv_dir)
+    data = load_csv_files(csv_dir, n_features=122)
+    data.collect()
 
-        for i, subset in enumerate(data):
-            csv_file = os.path.join(csv_dir, file_list[i])
-            csv = np.loadtxt(csv_file, delimiter=",")
+    for i, subset in enumerate(data):
+        csv_file = os.path.join(csv_dir, file_list[i])
+        csv = np.loadtxt(csv_file, delimiter=",")
 
-            self.assertTrue((subset.samples == csv).all())
+        self.assertTrue((subset.samples == csv).all())
 
-        self.assertEqual(len(data), 3)
+    self.assertEqual(len(data), 3)
 
-    def test_load_csv_files_labels_last(self):
-        csv_dir = "tests/files/csv"
-        file_list = os.listdir(csv_dir)
-        data = load_csv_files(csv_dir, n_features=122, label_col="last")
-        data.collect()
+
+def test_load_csv_files_labels_last(self):
+    csv_dir = "tests/files/csv"
+
+    file_list = os.listdir(csv_dir)
+    data = load_csv_files(csv_dir, n_features=122, label_col="last")
+    data.collect()
 
-        for i, subset in enumerate(data):
-            csv_file = os.path.join(csv_dir, file_list[i])
-            csv = np.loadtxt(csv_file, delimiter=",")
+    for i, subset in enumerate(data):
+        csv_file = os.path.join(csv_dir, file_list[i])
+        csv = np.loadtxt(csv_file, delimiter=",")
 
-            self.assertTrue((subset.samples == csv[:, :-1]).all())
-            self.assertTrue((subset.labels == csv[:, -1]).all())
+        self.assertTrue((subset.samples == csv[:, :-1]).all())
+        self.assertTrue((subset.labels == csv[:, -1]).all())
 
-        self.assertEqual(len(data), 3)
+    self.assertEqual(len(data), 3)
 
-    def test_load_csv_files_labels_first(self):
-        csv_dir = "tests/files/csv"
-        file_list = os.listdir(csv_dir)
-        data = load_csv_files(csv_dir, n_features=122, label_col="first")
-        data.collect()
+
+def test_load_csv_files_labels_first(self):
+    csv_dir = "tests/files/csv"
+
+    file_list = os.listdir(csv_dir)
+    data = load_csv_files(csv_dir, n_features=122, label_col="first")
+    data.collect()
 
-        for i, subset in enumerate(data):
-            csv_file = os.path.join(csv_dir, file_list[i])
-            csv = np.loadtxt(csv_file, delimiter=",")
+    for i, subset in enumerate(data):
+        csv_file = os.path.join(csv_dir, file_list[i])
+        csv = np.loadtxt(csv_file, delimiter=",")
 
-            self.assertTrue((subset.samples == csv[:, 1:]).all())
-            self.assertTrue((subset.labels == csv[:, 0]).all())
+        self.assertTrue((subset.samples == csv[:, 1:]).all())
+        self.assertTrue((subset.labels == csv[:, 0]).all())
 
-        self.assertEqual(len(data), 3)
+    self.assertEqual(len(data), 3)
 
 
 class DataClassesTest(unittest.TestCase):
@@ -417,6 +420,7 @@ def test_dataset_extend(self):
 
     def test_dataset_collect(self):
         csv_file = "tests/files/csv/3"
+
        dataset = load_csv_file(csv_file, subset_size=300, n_features=122)
        dataset.collect()
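
Review note: as reconstructed above, the tests.py hunk moves the loading
tests (test_load_data_without_labels through test_load_csv_files_labels_first)
from DataLoadingTest methods to module-level functions while keeping their
self parameter; if that reading is right, unittest will no longer collect
them, and they likely belong back inside a TestCase class.

For reference, a usage sketch of the fixed fit_predict, written against the
same dislib API the new KMeansTest exercises (load_data, KMeans); executing
it requires a PyCOMPSs runtime:

    from sklearn.datasets import make_blobs

    from dislib.cluster import KMeans
    from dislib.data import load_data

    # 610 samples split across subsets of 300, 300 and 10.
    x, _ = make_blobs(n_samples=610, random_state=170)
    dataset = load_data(x, subset_size=300)

    kmeans = KMeans(n_clusters=3, random_state=170)
    labels = kmeans.fit_predict(dataset)

    # With the np.concatenate fix, fit_predict returns one flat label per
    # sample instead of one entry per subset.
    assert labels.shape == (610,)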