feat: move muD calculator from labpdfproc to here

yucongalicechen · yucongalicechen · commit bbb6151ebb2d · 2024-12-29T17:11:58.000-05:00
diff --git a/src/diffpy/utils/tools.py b/src/diffpy/utils/tools.py
@@ -3,25 +3,11 @@
 from copy import copy
 from pathlib import Path
 
+import numpy as np
+from scipy.optimize import dual_annealing
+from scipy.signal import convolve
 
-def clean_dict(obj):
-    """Remove keys from the dictionary where the corresponding value is None.
-
-    Parameters
-    ----------
-    obj: dict
-        The dictionary to clean. If None, initialize as an empty dictionary.
-
-    Returns
-    -------
-    dict:
-        The cleaned dictionary with keys removed where the value is None.
-    """
-    obj = obj if obj is not None else {}
-    for key, value in copy(obj).items():
-        if not value:
-            del obj[key]
-    return obj
+from diffpy.utils.parsers.loaddata import loadData
 
 
 def _stringify(obj):
@@ -206,3 +192,103 @@ def get_package_info(package_names, metadata=None):
         pkg_info.update({package: importlib.metadata.version(package)})
     metadata["package_info"] = pkg_info
     return metadata
+
+
+def _top_hat(z, half_slit_width):
+    """Create a top-hat function, return 1.0 for values within the specified
+    slit width and 0 otherwise."""
+    return np.where((z >= -half_slit_width) & (z <= half_slit_width), 1.0, 0.0)
+
+
+def _model_function(z, diameter, z0, I0, mud, slope):
+    """
+    Compute the model function with the following steps:
+    1. Let dz = z-z0, so that dz is centered at 0
+    2. Compute length l that is the effective length for computing intensity I = I0 * e^{-mu * l}:
+    - For dz within the capillary diameter, l is the chord length of the circle at position dz
+    - For dz outside this range, l = 0
+    3. Apply a linear adjustment to I0 by taking I0 as I0 - slope * z
+    """
+    min_radius = -diameter / 2
+    max_radius = diameter / 2
+    dz = z - z0
+    length = np.piecewise(
+        dz,
+        [dz < min_radius, (min_radius <= dz) & (dz <= max_radius), dz > max_radius],
+        [0, lambda dz: 2 * np.sqrt((diameter / 2) ** 2 - dz**2), 0],
+    )
+    return (I0 - slope * z) * np.exp(-mud / diameter * length)
+
+
+def _extend_z_and_convolve(z, diameter, half_slit_width, z0, I0, mud, slope):
+    """Extend z values and I values for padding (so that we don't have tails in
+    convolution), then perform convolution (note that the convolved I values
+    are the same as modeled I values if slit width is close to 0)"""
+    n_points = len(z)
+    z_left_pad = np.linspace(z.min() - n_points * (z[1] - z[0]), z.min(), n_points)
+    z_right_pad = np.linspace(z.max(), z.max() + n_points * (z[1] - z[0]), n_points)
+    z_extended = np.concatenate([z_left_pad, z, z_right_pad])
+    I_extended = _model_function(z_extended, diameter, z0, I0, mud, slope)
+    kernel = _top_hat(z_extended - z_extended.mean(), half_slit_width)
+    I_convolved = I_extended  # this takes care of the case where slit width is close to 0
+    if kernel.sum() != 0:
+        kernel /= kernel.sum()
+        I_convolved = convolve(I_extended, kernel, mode="same")
+    padding_length = len(z_left_pad)
+    return I_convolved[padding_length:-padding_length]
+
+
+def _objective_function(params, z, observed_data):
+    """Compute the objective function for fitting a model to the
+    observed/experimental data by minimizing the sum of squared residuals
+    between the observed data and the convolved model data."""
+    diameter, half_slit_width, z0, I0, mud, slope = params
+    convolved_model_data = _extend_z_and_convolve(z, diameter, half_slit_width, z0, I0, mud, slope)
+    residuals = observed_data - convolved_model_data
+    return np.sum(residuals**2)
+
+
+def _compute_single_mud(z_data, I_data):
+    """Perform dual annealing optimization and extract the parameters."""
+    bounds = [
+        (1e-5, z_data.max() - z_data.min()),  # diameter: [small positive value, upper bound]
+        (0, (z_data.max() - z_data.min()) / 2),  # half slit width: [0, upper bound]
+        (z_data.min(), z_data.max()),  # z0: [min z, max z]
+        (1e-5, I_data.max()),  # I0: [small positive value, max observed intensity]
+        (1e-5, 20),  # muD: [small positive value, upper bound]
+        (-100000, 100000),  # slope: [lower bound, upper bound]
+    ]
+    result = dual_annealing(_objective_function, bounds, args=(z_data, I_data))
+    diameter, half_slit_width, z0, I0, mud, slope = result.x
+    convolved_fitted_signal = _extend_z_and_convolve(z_data, diameter, half_slit_width, z0, I0, mud, slope)
+    residuals = I_data - convolved_fitted_signal
+    rmse = np.sqrt(np.mean(residuals**2))
+    return mud, rmse
+
+
+def compute_mud(filepath):
+    """Compute the best-fit mu*D value from a z-scan file, removing the sample
+    holder effect.
+
+    This function loads z-scan data and fits it to a model
+    that convolves a top-hat function with I = I0 * e^{-mu * l}.
+    The fitting procedure is run multiple times, and we return the best-fit parameters based on the lowest rmse.
+
+    The full mathematical details are described in the paper:
+    An ad hoc Absorption Correction for Reliable Pair-Distribution Functions from Low Energy x-ray Sources,
+    Yucong Chen, Till Schertenleib, Andrew Yang, Pascal Schouwink, Wendy L. Queen and Simon J. L. Billinge,
+    in preparation.
+
+    Parameters
+    ----------
+    filepath : str
+        The path to the z-scan file.
+
+    Returns
+    -------
+    mu*D : float
+        The best-fit mu*D value.
+    """
+    z_data, I_data = loadData(filepath, unpack=True)
+    best_mud, _ = min((_compute_single_mud(z_data, I_data) for _ in range(20)), key=lambda pair: pair[1])
+    return best_mud
diff --git a/tests/test_tools.py b/tests/test_tools.py
@@ -3,9 +3,16 @@
 import os
 from pathlib import Path
 
+import numpy as np
 import pytest
 
-from diffpy.utils.tools import check_and_build_global_config, get_package_info, get_user_info
+from diffpy.utils.tools import (
+    _extend_z_and_convolve,
+    check_and_build_global_config,
+    compute_mud,
+    get_package_info,
+    get_user_info,
+)
 
 
 @pytest.mark.parametrize(
@@ -160,3 +167,19 @@ def test_get_package_info(monkeypatch, inputs, expected):
     )
     actual_metadata = get_package_info(inputs[0], metadata=inputs[1])
     assert actual_metadata == expected
+
+
+def test_compute_mud(tmp_path):
+    diameter, slit_width, z0, I0, mud, slope = 1, 0.1, 0, 1e5, 3, 0
+    z_data = np.linspace(-1, 1, 50)
+    convolved_I_data = _extend_z_and_convolve(z_data, diameter, slit_width, z0, I0, mud, slope)
+
+    directory = Path(tmp_path)
+    file = directory / "testfile"
+    with open(file, "w") as f:
+        for x, I in zip(z_data, convolved_I_data):
+            f.write(f"{x}\t{I}\n")
+
+    expected_mud = 3
+    actual_mud = compute_mud(file)
+    assert actual_mud == pytest.approx(expected_mud, rel=1e-4, abs=1e-3)