Skip to content

Add set data test cases #61

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Aug 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 48 additions & 32 deletions src/diffpy/srmise/dataclusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,43 +22,57 @@


class DataClusters:
"""Find clusters corresponding to peaks in numerical x-, y-value arrays.
"""Find clusters corresponding to peaks in the PDF (y-array)

DataClusters determines which points, given a pair of x- and y-value
sequences, roughly correspond to which visible peaks in that data. This
division is contiguous, with borders between clusters near relative
DataClusters determines which points in inter-atomic distance, r,
correspond to peaks in the PDF. The division between clusters
is contiguous, with borders between clusters likely near relative
minima in the data.

Clusters are iteratively formed around points with the largest
y-coordinates. New clusters are added only when the unclustered data
PDF values. New clusters are added only when the unclustered data
point under consideration is greater than a given distance (the
'resolution') from the nearest existing cluster.

Data members
x - sequence of x coordinates.
y - sequence of y values
res - clustering 'resolution'
data_order - array of x, y indices ordered by decreasing y
clusters - array of cluster ranges
current_idx - index of data_order currently considered
------------
x : array
The array of r values.
y : sequence of y values
The array of PDF values, G(r)
res : int
The clustering resolution, i.e., the number of points another point has to
be away from the center of an existing cluster before a new cluster is
formed. A value of zero allows every point to be a cluster.
data_order : array
The array of x, y indices ordered by decreasing y
clusters :
The array of cluster ranges
current_idx : int
The index of data_order currently considered
"""

def __init__(self, x, y, res):
"""Initializes the data to be clustered, and the 'resolution' to use.
"""Constructor

Parameters
x - numeric sequence of x-value sorted in ascending order
y - corresponding sequence of y-values
res - clustering 'resolution'
----------
x : array
The array of r values.
y : sequence of y values
The array of PDF values, G(r)
res : int
The clustering resolution, i.e., the number of points another point has to
be away from the center of an existing cluster before a new cluster is
formed. A value of zero allows every point to be a cluster.
"""
# Track internal state of clustering.
self.INIT = 0
self.READY = 1
self.CLUSTERING = 2
self.DONE = 3

self.clear()
self.setdata(x, y, res)
self._clear()
self._setdata(x, y, res)

return

Expand Down Expand Up @@ -87,7 +101,7 @@ def __eq__(self, other):
and self.DONE == other.DONE
)

def clear(self):
def _clear(self):
"""
Clear all data and reset the cluster object to a transient initial state.

Expand Down Expand Up @@ -120,35 +134,37 @@ def reset_clusters(self):
self.status = self.READY
return

def setdata(self, x, y, res):
def _setdata(self, x, y, res):
"""Assign data members for x- and y-coordinates, and resolution.

Parameters
x - numeric sequence of x-value sorted in ascending order
y - corresponding sequence of y-values
res - clustering 'resolution'
----------
x : array
The array of r values.
y : sequence of y values
The array of PDF values, G(r)
res : int
The clustering resolution, i.e., the number of points another point has to
be away from the center of an existing cluster before a new cluster is
formed. A value of zero allows every point to be a cluster.
"""
# Test for error conditions
# 1) Length mismatch
# 2) Bound errors for res
# 3) r isn't sorted?
if len(x) != len(y):
raise ValueError("Sequences x and y must have the same length.")
if res < 0:
raise ValueError("Resolution res must be non-negative.")
# Test for sorting?
raise ValueError(
"Value of resolution parameter is less than zero. Please rerun specifying a non-negative res"
)
self.x = x
self.y = y
self.res = res
# If x sequence size is empty, set the object into Initialized state.
if x.size == 0 and res == 0:
if x.size == 0:
self.data_order = np.array([])
self.clusters = np.array([[]])
self.current_idx = 0
self.lastpoint_idx = None
self.status = self.INIT
else:
self.data_order = self.y.argsort() # Defines order of clustering
self.data_order = self.y.argsort()
self.clusters = np.array([[self.data_order[-1], self.data_order[-1]]])
self.current_idx = len(self.data_order) - 1
self.lastpoint_idx = self.data_order[-1]
Expand Down
77 changes: 66 additions & 11 deletions src/diffpy/srmise/tests/test_dataclusters.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,14 @@
from copy import copy

import numpy as np
import pytest

from diffpy.srmise.dataclusters import DataClusters


def test_clear():
# Initialize DataClusters with input parameters
actual = DataClusters(x=np.array([1, 2, 3]), y=np.array([3, 2, 1]), res=4)
expected = DataClusters(x=np.array([]), y=np.array([]), res=0)
# Perform the clear operation
actual.clear()
assert actual == expected


def test___eq__():
actual = DataClusters(np.array([1, 2, 3]), np.array([3, 2, 1]), 0)
expected = DataClusters(np.array([1, 2, 3]), np.array([3, 2, 1]), 0)
actual = DataClusters(np.array([1, 2, 3]), np.array([3, 2, 1]), 1)
expected = DataClusters(np.array([1, 2, 3]), np.array([3, 2, 1]), 1)
assert expected == actual
attributes = vars(actual)
for attr_key, attr_val in attributes.items():
Expand All @@ -32,3 +24,66 @@ def test___eq__():
print(f"not-equal test failed on {attr_key}")
assert not expected == actual
attributes.update({attr_key: reset})


@pytest.mark.parametrize(
"inputs, expected",
[
(
{
"x": np.array([1, 2, 3]),
"y": np.array([3, 2, 1]),
"res": 4,
},
{
"x": np.array([1, 2, 3]),
"y": np.array([3, 2, 1]),
"res": 4,
"data_order": [2, 1, 0],
"clusters": np.array([[0, 0]]),
"current_idx": 2,
"lastpoint_idx": 0,
"INIT": 0,
"READY": 1,
"CLUSTERING": 2,
"DONE": 3,
"lastcluster_idx": None,
"status": 1,
},
),
],
)
def test_DataClusters_constructor(inputs, expected):
actual = DataClusters(x=inputs["x"], y=inputs["y"], res=inputs["res"])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is better. Strictly, this still doesn't test set_data alone. It tests the object constructor. I think this is ok, but we may want to make clear and set_data private functions. Then we don't need tests (or docstrings in principle) for them and we just test the constructor (the __init__).

Whether or not to do this depends on where else these functions are used. Do we want to make them available to users to use, or are they just being used in init alone or in init and a few other places?

These are small things, but once we touch the code we want to leave it better than when we arrived, and it is also a good learning experience.....

Could you look how these two functions are used and we can decide. If we make them private functions we can leave this test as it is but just change its name.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is better. Strictly, this still doesn't test set_data alone. It tests the object constructor. I think this is ok, but we may want to make clear and set_data private functions. Then we don't need tests (or docstrings in principle) for them and we just test the constructor (the __init__).

Whether or not to do this depends on where else these functions are used. Do we want to make them available to users to use, or are they just being used in init alone or in init and a few other places?

These are small things, but once we touch the code we want to leave it better than when we arrived, and it is also a good learning experience.....

Could you look how these two functions are used and we can decide. If we make them private functions we can leave this test as it is but just change its name.

I'm certain that these two functions are only used in the constructor. It should be OK to change them into private functions.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

super, let's do this. Just

  • change the name of this test to something like test_DataClusters_constructor since this is what it does anyway.
  • add an underscore to the beginning of the clear and set_data functions. You can leave the docstring since we already wrote it.
  • revisit the test_clear tests. We want to remove this test, but let's make sure that this behavior is being tested in the constructor test, so move over anything we need from there.

actual_attributes = vars(actual)
for attr_key, actual_attr_val in actual_attributes.items():
if isinstance(actual_attr_val, np.ndarray):
assert np.array_equal(actual_attr_val, expected[attr_key])
else:
assert actual_attr_val == expected[attr_key]


@pytest.mark.parametrize(
"inputs, msg",
[
(
{
"x": np.array([1]),
"y": np.array([3, 2]),
"res": 4,
},
"Sequences x and y must have the same length.",
),
(
{
"x": np.array([1]),
"y": np.array([3]),
"res": -1,
},
"Value of resolution parameter is less than zero. Please rerun specifying a non-negative res",
),
],
)
def test_set_data_order_bad(inputs, msg):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a good test, nice job.

with pytest.raises(ValueError, match=msg):
DataClusters(x=inputs["x"], y=inputs["y"], res=inputs["res"])
Loading