Add set data test cases (#61)

stevenhua0320 · pre-commit-ci[bot] · sbillinge · web-flow · commit d1bf3d4af3cc · 2024-08-13T12:00:27.000-04:00
* add test cases to the test files and edit the code so that the tests pass.

* [pre-commit.ci] auto fixes from pre-commit hooks

* change case in test__eq__ to be compatible with the behavior of setdata

* delete text and redundant tests

* tweaking error message in DataClusters

* [pre-commit.ci] auto fixes from pre-commit hooks

* update test for checking implicit attributes for setdata function

* [pre-commit.ci] auto fixes from pre-commit hooks

* update test for setdata function

* update setdata test to right format.

* update to constructor test & make setdata clear function private

* final tweaks to tests by Simon

* fix actual_attribute typo

* final refactor of actual_attr

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Simon Billinge <sbillinge@users.noreply.github.com>
diff --git a/src/diffpy/srmise/dataclusters.py b/src/diffpy/srmise/dataclusters.py
@@ -22,43 +22,57 @@
 
 
 class DataClusters:
-    """Find clusters corresponding to peaks in numerical x-, y-value arrays.
+    """Find clusters corresponding to peaks in the PDF (y-array)
 
-    DataClusters determines which points, given a pair of x- and y-value
-    sequences, roughly correspond to which visible peaks in that data.  This
-    division is contiguous, with borders between clusters near relative
+    DataClusters determines which points in inter-atomic distance, r,
+    correspond to peaks in the PDF.  The division between clusters
+    is contiguous, with borders between clusters likely near relative
     minima in the data.
 
     Clusters are iteratively formed around points with the largest
-    y-coordinates.  New clusters are added only when the unclustered data
+    PDF values.  New clusters are added only when the unclustered data
     point under consideration is greater than a given distance (the
     'resolution') from the nearest existing cluster.
 
     Data members
-    x - sequence of x coordinates.
-    y - sequence of y values
-    res - clustering 'resolution'
-    data_order - array of x, y indices ordered by decreasing y
-    clusters - array of cluster ranges
-    current_idx - index of data_order currently considered
+    ------------
+    x : array
+      The array of r values.
+    y : sequence of y values
+      The array of PDF values, G(r)
+    res : int
+      The clustering resolution, i.e., the number of points another point has to
+      be away from the center of an existing cluster before a new cluster is
+      formed.  A value of zero allows every point to be a cluster.
+    data_order : array
+      The array of x, y indices ordered by decreasing y
+    clusters : array
+      The array of cluster ranges
+    current_idx : int
+      The index of data_order currently considered
     """
 
     def __init__(self, x, y, res):
-        """Initializes the data to be clustered, and the 'resolution' to use.
+        """Constructor
 
         Parameters
-        x - numeric sequence of x-value sorted in ascending order
-        y - corresponding sequence of y-values
-        res - clustering 'resolution'
+        ----------
+        x : array
+          The array of r values.
+        y : sequence of y values
+          The array of PDF values, G(r)
+        res : int
+          The clustering resolution, i.e., the number of points another point has to
+          be away from the center of an existing cluster before a new cluster is
+          formed.  A value of zero allows every point to be a cluster.
         """
         # Track internal state of clustering.
         self.INIT = 0
         self.READY = 1
         self.CLUSTERING = 2
         self.DONE = 3
-
-        self.clear()
-        self.setdata(x, y, res)
+        self._clear()
+        self._setdata(x, y, res)
 
         return
 
@@ -87,7 +101,7 @@ def __eq__(self, other):
             and self.DONE == other.DONE
         )
 
-    def clear(self):
+    def _clear(self):
         """
         Clear all data and reset the cluster object to a transient initial state.
 
@@ -120,35 +134,37 @@ def reset_clusters(self):
             self.status = self.READY
         return
 
-    def setdata(self, x, y, res):
+    def _setdata(self, x, y, res):
         """Assign data members for x- and y-coordinates, and resolution.
 
         Parameters
-        x - numeric sequence of x-value sorted in ascending order
-        y - corresponding sequence of y-values
-        res - clustering 'resolution'
+        ----------
+        x : array
+          The array of r values.
+        y : sequence of y values
+          The array of PDF values, G(r)
+        res : int
+          The clustering resolution, i.e., the number of points another point has to
+          be away from the center of an existing cluster before a new cluster is
+          formed.  A value of zero allows every point to be a cluster.
         """
-        # Test for error conditions
-        # 1) Length mismatch
-        # 2) Bound errors for res
-        # 3) r isn't sorted?
         if len(x) != len(y):
             raise ValueError("Sequences x and y must have the same length.")
         if res < 0:
-            raise ValueError("Resolution res must be non-negative.")
-        # Test for sorting?
+            raise ValueError(
+                "Value of resolution parameter is less than zero.  Please rerun specifying a non-negative res"
+            )
         self.x = x
         self.y = y
         self.res = res
-        # If x sequence size is empty, set the object into Initialized state.
-        if x.size == 0 and res == 0:
+        if x.size == 0:
             self.data_order = np.array([])
             self.clusters = np.array([[]])
             self.current_idx = 0
             self.lastpoint_idx = None
             self.status = self.INIT
         else:
-            self.data_order = self.y.argsort()  # Defines order of clustering
+            self.data_order = self.y.argsort()
             self.clusters = np.array([[self.data_order[-1], self.data_order[-1]]])
             self.current_idx = len(self.data_order) - 1
             self.lastpoint_idx = self.data_order[-1]
diff --git a/src/diffpy/srmise/tests/test_dataclusters.py b/src/diffpy/srmise/tests/test_dataclusters.py
@@ -1,22 +1,14 @@
 from copy import copy
 
 import numpy as np
+import pytest
 
 from diffpy.srmise.dataclusters import DataClusters
 
 
-def test_clear():
-    # Initialize DataClusters with input parameters
-    actual = DataClusters(x=np.array([1, 2, 3]), y=np.array([3, 2, 1]), res=4)
-    expected = DataClusters(x=np.array([]), y=np.array([]), res=0)
-    # Perform the clear operation
-    actual.clear()
-    assert actual == expected
-
-
 def test___eq__():
-    actual = DataClusters(np.array([1, 2, 3]), np.array([3, 2, 1]), 0)
-    expected = DataClusters(np.array([1, 2, 3]), np.array([3, 2, 1]), 0)
+    actual = DataClusters(np.array([1, 2, 3]), np.array([3, 2, 1]), 1)
+    expected = DataClusters(np.array([1, 2, 3]), np.array([3, 2, 1]), 1)
     assert expected == actual
     attributes = vars(actual)
     for attr_key, attr_val in attributes.items():
@@ -32,3 +24,66 @@ def test___eq__():
             print(f"not-equal test failed on {attr_key}")
             assert not expected == actual
         attributes.update({attr_key: reset})
+
+
+@pytest.mark.parametrize(
+    "inputs, expected",
+    [
+        (
+            {
+                "x": np.array([1, 2, 3]),
+                "y": np.array([3, 2, 1]),
+                "res": 4,
+            },
+            {
+                "x": np.array([1, 2, 3]),
+                "y": np.array([3, 2, 1]),
+                "res": 4,
+                "data_order": [2, 1, 0],
+                "clusters": np.array([[0, 0]]),
+                "current_idx": 2,
+                "lastpoint_idx": 0,
+                "INIT": 0,
+                "READY": 1,
+                "CLUSTERING": 2,
+                "DONE": 3,
+                "lastcluster_idx": None,
+                "status": 1,
+            },
+        ),
+    ],
+)
+def test_DataClusters_constructor(inputs, expected):
+    actual = DataClusters(x=inputs["x"], y=inputs["y"], res=inputs["res"])
+    actual_attributes = vars(actual)
+    for attr_key, actual_attr_val in actual_attributes.items():
+        if isinstance(actual_attr_val, np.ndarray):
+            assert np.array_equal(actual_attr_val, expected[attr_key])
+        else:
+            assert actual_attr_val == expected[attr_key]
+
+
+@pytest.mark.parametrize(
+    "inputs, msg",
+    [
+        (
+            {
+                "x": np.array([1]),
+                "y": np.array([3, 2]),
+                "res": 4,
+            },
+            "Sequences x and y must have the same length.",
+        ),
+        (
+            {
+                "x": np.array([1]),
+                "y": np.array([3]),
+                "res": -1,
+            },
+            "Value of resolution parameter is less than zero.  Please rerun specifying a non-negative res",
+        ),
+    ],
+)
+def test_set_data_order_bad(inputs, msg):
+    with pytest.raises(ValueError, match=msg):
+        DataClusters(x=inputs["x"], y=inputs["y"], res=inputs["res"])