Merge pull request #57 from nipreps/enh/generalize-lovo-splitter

oesteban · web-flow · commit 0fefaf34d0db · 2025-01-20T17:40:21.000+01:00
ENH: Update LOVO splitter to new dataset indexed access
diff --git a/src/nifreeze/data/splitting.py b/src/nifreeze/data/splitting.py
@@ -22,65 +22,36 @@
 #
 """Data splitting helpers."""
 
-from pathlib import Path
+from __future__ import annotations
+
+from typing import Any
 
-import h5py
 import numpy as np
 
+from nifreeze.data.base import BaseDataset
+
 
-def lovo_split(dataset, index, with_b0=False):
+def lovo_split(dataset: BaseDataset, index: int) -> tuple[Any, Any]:
     """
     Produce one fold of LOVO (leave-one-volume-out).
 
     Parameters
     ----------
-    dataset : :obj:`nifreeze.data.dmri.DWI`
-        DWI object
+    dataset : :obj:`nifreeze.data.base.BaseDataset`
+        Dataset object.
     index : :obj:`int`
-        Index of the DWI orientation to be left out in this fold.
+        Index of the volume to be left out in this fold.
 
     Returns
     -------
-    (train_data, train_gradients) : :obj:`tuple`
-        Training DWI and corresponding gradients.
-        Training data/gradients come **from the updated dataset**.
-    (test_data, test_gradients) :obj:`tuple`
-        Test 3D map (one DWI orientation) and corresponding b-vector/value.
-        The test data/gradient come **from the original dataset**.
+    :obj:`tuple` of :obj:`tuple`
+        A tuple of two elements, the first element being the components
+        of the *train* data (including the data themselves and other metadata
+        such as gradients for dMRI, or frame times for PET), and the second
+        element being the *test* data.
 
     """
-
-    if not Path(dataset.get_filename()).exists():
-        dataset.to_filename(dataset.get_filename())
-
-    # read original DWI data & b-vector
-    with h5py.File(dataset.get_filename(), "r") as in_file:
-        root = in_file["/0"]
-        data = np.asanyarray(root["dataobj"])
-        gradients = np.asanyarray(root["gradients"])
-
-    # if the size of the mask does not match data, cache is stale
-    mask = np.zeros(data.shape[-1], dtype=bool)
+    mask = np.zeros(len(dataset), dtype=bool)
     mask[index] = True
 
-    train_data = data[..., ~mask]
-    train_gradients = gradients[..., ~mask]
-    test_data = data[..., mask]
-    test_gradients = gradients[..., mask]
-
-    if with_b0:
-        train_data = np.concatenate(
-            (np.asanyarray(dataset.bzero)[..., np.newaxis], train_data),
-            axis=-1,
-        )
-        b0vec = np.zeros((4, 1))
-        b0vec[0, 0] = 1
-        train_gradients = np.concatenate(
-            (b0vec, train_gradients),
-            axis=-1,
-        )
-
-    return (
-        (train_data, train_gradients),
-        (test_data, test_gradients),
-    )
+    return (dataset[~mask], dataset[mask])
diff --git a/src/nifreeze/estimator.py b/src/nifreeze/estimator.py
@@ -141,13 +141,13 @@ def estimate(
                         pbar.set_description_str(
                             f"Pass {i_iter + 1}/{n_iter} | Fit and predict b-index <{i}>"
                         )
-                        data_train, data_test = lovo_split(data, i, with_b0=True)
-                        grad_str = f"{i}, {data_test[1][:3]}, b={int(data_test[1][3])}"
+                        data_train, data_test = lovo_split(data, i)
+                        grad_str = f"{i}, {data_test[-1][:3]}, b={int(data_test[-1][3])}"
                         pbar.set_description_str(f"[{grad_str}], {n_jobs} jobs")
 
                         if not single_model:  # A true LOGO estimator
                             if hasattr(data, "gradients"):
-                                kwargs["gtab"] = data_train[1]
+                                kwargs["gtab"] = data_train[-1]
                             # Factory creates the appropriate model and pipes arguments
                             dwmodel = ModelFactory.init(
                                 model=model,
@@ -162,7 +162,7 @@ def estimate(
                             )
 
                         # generate a synthetic dw volume for the test gradient
-                        predicted = dwmodel.predict(data_test[1])
+                        predicted = dwmodel.predict(data_test[-1])
 
                         # prepare data for running ANTs
                         fixed, moving = _prepare_registration_data(
@@ -180,7 +180,7 @@ def estimate(
                             data.motion_affines,
                             data.affine,
                             data.dataobj.shape[:3],
-                            data_test[1][3],
+                            data_test[-1][3],
                             i_iter,
                             i,
                             ptmp_dir,
diff --git a/test/test_integration.py b/test/test_integration.py
@@ -96,5 +96,5 @@ def test_proximity_estimator_trivial_model(datadir, tmp_path):
                 nt.linear.Affine(est),
                 xfms[i],
             ).max()
-            < 0.2
+            < 0.25
         )
diff --git a/test/test_model.py b/test/test_model.py
@@ -154,19 +154,19 @@ def test_two_initialisations(datadir):
 
     # Direct initialisation
     model1 = model.AverageDWIModel(
-        gtab=data_train[1],
+        gtab=data_train[-1],
         S0=dmri_dataset.bzero,
         th_low=100,
         th_high=1000,
         bias=False,
         stat="mean",
     )
-    model1.fit(data_train[0], gtab=data_train[1])
-    predicted1 = model1.predict(data_test[1])
+    model1.fit(data_train[0], gtab=data_train[-1])
+    predicted1 = model1.predict(data_test[-1])
 
     # Initialisation via ModelFactory
     model2 = model.ModelFactory.init(
-        gtab=data_train[1],
+        gtab=data_train[-1],
         model="avgdwi",
         S0=dmri_dataset.bzero,
         th_low=100,
@@ -176,9 +176,9 @@ def test_two_initialisations(datadir):
     )
 
     with pytest.raises(ModelNotFittedError):
-        model2.predict(data_test[1])
+        model2.predict(data_test[-1])
 
-    model2.fit(data_train[0], gtab=data_train[1])
-    predicted2 = model2.predict(data_test[1])
+    model2.fit(data_train[0], gtab=data_train[-1])
+    predicted2 = model2.predict(data_test[-1])
 
     assert np.all(predicted1 == predicted2)
diff --git a/test/test_splitting.py b/test/test_splitting.py
@@ -37,6 +37,7 @@ def test_lovo_split(datadir):
 
     Returns:
     None
+
     """
     data = DWI.from_filename(datadir / "dwi.h5")
 
@@ -52,11 +53,11 @@ def test_lovo_split(datadir):
     data.gradients[..., index] = 1
 
     # Apply the lovo_split function at the specified index
-    (train_data, train_gradients), (test_data, test_gradients) = lovo_split(data, index)
+    train_data, test_data = lovo_split(data, index)
 
     # Check if the test data contains only 1s
     # and the train data contains only 0s after the split
-    assert np.all(test_data == 1)
-    assert np.all(train_data == 0)
-    assert np.all(test_gradients == 1)
-    assert np.all(train_gradients == 0)
+    assert np.all(test_data[0] == 1)
+    assert np.all(train_data[0] == 0)
+    assert np.all(test_data[-1] == 1)
+    assert np.all(train_data[-1] == 0)

Original file line number	Diff line number	Diff line change
`@@ -96,5 +96,5 @@ def test_proximity_estimator_trivial_model(datadir, tmp_path):`
`96`	`96`	`nt.linear.Affine(est),`
`97`	`97`	`xfms[i],`
`98`	`98`	`).max()`
`99`		`- < 0.2`
	`99`	`+ < 0.25`
`100`	`100`	`)`