Skip to content

Commit 50a77ff

Browse files
feat(evaluate): 🧪 Refactor save_numpy_class_arrays_to_zarr to support append mode and improve Hausdorff distance normalization
chore(tests): ✏️ Update test cases for new save mode and adjust voxel size handling; remove obsolete crop manifest
1 parent 4c15734 commit 50a77ff

File tree

3 files changed

+41
-32
lines changed

3 files changed

+41
-32
lines changed

src/cellmap_segmentation_challenge/evaluate.py

+11-6
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from upath import UPath
1616

1717
from cellmap_data import CellMapImage
18+
import zarr.errors
1819

1920
from .config import PROCESSED_PATH, SUBMISSION_PATH, BASE_DATA_PATH
2021
from .utils import TEST_CROPS, TEST_CROPS_DICT
@@ -105,7 +106,7 @@ def save_numpy_class_labels_to_zarr(
105106

106107

107108
def save_numpy_class_arrays_to_zarr(
108-
save_path, test_volume_name, label_names, labels, overwrite=False, attrs=None
109+
save_path, test_volume_name, label_names, labels, mode="append", attrs=None
109110
):
110111
"""
111112
Save a list of 3D numpy arrays of binary or instance labels to a
@@ -116,7 +117,7 @@ def save_numpy_class_arrays_to_zarr(
116117
test_volume_name (str): The name of the test volume.
117118
label_names (list): A list of label names corresponding to the list of 3D numpy arrays.
118119
labels (list): A list of 3D numpy arrays of binary labels.
119-
overwrite (bool): Whether to overwrite the Zarr-2 file if it already exists.
120+
mode (str): The mode to use when saving the Zarr-2 file. Options are 'append' or 'overwrite'.
120121
attrs (dict): A dictionary of attributes to save with the Zarr-2 file.
121122

122123
Example usage:
@@ -134,7 +135,10 @@ def save_numpy_class_arrays_to_zarr(
134135
zarr_group = zarr.group(store)
135136

136137
# Save the test volume group
137-
zarr_group.create_group(test_volume_name, overwrite=overwrite)
138+
try:
139+
zarr_group.create_group(test_volume_name, overwrite=(mode == "overwrite"))
140+
except zarr.errors.ContainsGroupError:
141+
print(f"Appending to existing group {test_volume_name}")
138142

139143
# Save the labels
140144
for i, label_name in enumerate(label_names):
@@ -349,9 +353,9 @@ def score_instance(
349353
# Compute the scores
350354
accuracy = accuracy_score(truth_label.flatten(), matched_pred_label.flatten())
351355
hausdorff_dist = np.mean(hausdorff_distances) if hausdorff_distances else 0
352-
normalized_hausdorff_dist = 32 ** (
353-
-hausdorff_dist
354-
) # normalize Hausdorff distance to [0, 1]. 32 is abritrary chosen to have a reasonable range
356+
normalized_hausdorff_dist = 1.01 ** (
357+
-hausdorff_dist / np.linalg.norm(voxel_size)
358+
) # normalize Hausdorff distance to [0, 1] using the maximum distance represented by a voxel. 1.01 is arbitrarily chosen to have a reasonable range
355359
combined_score = (accuracy * normalized_hausdorff_dist) ** 0.5
356360
print(f"Accuracy: {accuracy:.4f}")
357361
print(f"Hausdorff Distance: {hausdorff_dist:.4f}")
@@ -956,6 +960,7 @@ def match_crop_space(path, class_label, voxel_size, shape, translation) -> np.nd
956960
np.divide(input_voxel_size, voxel_size),
957961
order=1,
958962
mode="constant",
963+
preserve_range=True,
959964
)
960965
image = image > 0.5
961966

tests/__test_crop_manifest.csv

-3
This file was deleted.

tests/test_all.py

+30-23
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,14 @@
2020
save_numpy_class_arrays_to_zarr,
2121
)
2222

23-
ERROR_TOLERANCE = 1e-4
23+
ERROR_TOLERANCE = 0.1
2424

2525

2626
# %%
2727
@pytest.fixture(scope="session")
2828
def setup_temp_path(tmp_path_factory):
29-
# temp_dir = tmp_path_factory.mktemp("shared_test_dir")
30-
temp_dir = (REPO_ROOT / "tests" / "tmp").absolute() # For debugging
29+
temp_dir = tmp_path_factory.mktemp("shared_test_dir")
30+
# temp_dir = (REPO_ROOT / "tests" / "tmp").absolute() # For debugging
3131
os.environ["TEST_TMP_DIR"] = str(temp_dir)
3232
yield temp_dir
3333
# Cleanup: Unset the environment variable after tests are done
@@ -57,7 +57,7 @@ def test_fetch_data(setup_temp_path):
5757

5858
os.makedirs(setup_temp_path / "data", exist_ok=True)
5959
fetch_data_cli.callback(
60-
crops="116,234",
60+
crops="116,118",
6161
raw_padding=0,
6262
dest=setup_temp_path / "data",
6363
access_mode="append",
@@ -197,8 +197,6 @@ def test_evaluate(setup_temp_path, scale, iou, accuracy):
197197
for crop in truth_zarr.keys():
198198
crop_zarr = truth_zarr[crop]
199199
submission_zarr.create_group(crop)
200-
labels = []
201-
preds = []
202200
for label in crop_zarr.keys():
203201
label_zarr = crop_zarr[label]
204202
attrs = label_zarr.attrs.asdict()
@@ -212,19 +210,24 @@ def test_evaluate(setup_temp_path, scale, iou, accuracy):
212210

213211
if scale:
214212
pred = rescale(pred, scale, order=0, preserve_range=True)
215-
attrs["voxel_size"] = [s / scale for s in attrs["voxel_size"]]
216-
217-
labels.append(label)
218-
preds.append(pred)
219-
220-
save_numpy_class_arrays_to_zarr(
221-
SUBMISSION_PATH,
222-
crop,
223-
labels,
224-
preds,
225-
overwrite=True,
226-
attrs=attrs,
227-
)
213+
old_voxel_size = attrs["voxel_size"]
214+
new_voxel_size = [s / scale for s in attrs["voxel_size"]]
215+
attrs["voxel_size"] = new_voxel_size
216+
# Adjust the translation
217+
attrs["translation"] = [
218+
t + (n - o) / 2
219+
for t, o, n in zip(
220+
attrs["translation"], old_voxel_size, new_voxel_size
221+
)
222+
]
223+
224+
save_numpy_class_arrays_to_zarr(
225+
SUBMISSION_PATH,
226+
crop,
227+
[label],
228+
[pred],
229+
attrs=attrs,
230+
)
228231
else:
229232
SUBMISSION_PATH = TRUTH_PATH
230233
zip_submission(SUBMISSION_PATH)
@@ -245,20 +248,24 @@ def test_evaluate(setup_temp_path, scale, iou, accuracy):
245248
1 - results["overall_score"] < ERROR_TOLERANCE
246249
), f"Overall score should be 1 but is: {results['overall_score']}"
247250
else:
248-
assert (
249-
np.abs((iou or 1) - results["overall_semantic_score"]) < ERROR_TOLERANCE
250-
), f"Semantic score should be {(iou or 1)} but is: {results['overall_semantic_score']}"
251-
# Check all accuracy scores
251+
# Check all accuracy scores and ious
252252
for label, scores in results["label_scores"].items():
253253
if label in INSTANCE_CLASSES:
254254
assert (
255255
np.abs((accuracy or 1) - scores["accuracy"]) < ERROR_TOLERANCE
256256
), f"Accuracy score for {label} should be {(accuracy or 1)} but is: {scores['accuracy']}"
257+
else:
258+
assert (
259+
np.abs((iou or 1) - scores["iou"]) < ERROR_TOLERANCE
260+
), f"IoU score for {label} should be {(iou or 1)} but is: {scores['iou']}"
257261

258262

259263
# %%
260264

261265

266+
def get_scaled_test_label(): ...
267+
268+
262269
def simulate_predictions_iou(true_labels, iou):
263270
# TODO: Add false positives (only makes false negatives currently)
264271

0 commit comments

Comments (0)