
Commit ce27d8e

Removing duplicates and preserving order in Accumulator
1 parent 9a1305e commit ce27d8e

9 files changed: +280 -95 lines changed

oml/inference/abstract.py (+2 -6)

@@ -7,11 +7,7 @@
 
 from oml.ddp.patching import patch_dataloader_to_ddp
 from oml.ddp.utils import get_world_size_safe, is_ddp, sync_dicts_ddp
-from oml.utils.misc_torch import (
-    drop_duplicates_by_ids,
-    get_device,
-    temporary_setting_model_mode,
-)
+from oml.utils.misc_torch import get_device, temporary_setting_model_mode, unique_by_ids
 
 
 @torch.no_grad()
@@ -53,7 +49,7 @@ def _inference(
     data_synced = sync_dicts_ddp(data_to_sync, world_size=get_world_size_safe())
     outputs, ids = data_synced["outputs"], data_synced["ids"]
 
-    ids, outputs = drop_duplicates_by_ids(ids=ids, data=outputs, sort=True)
+    ids, outputs = unique_by_ids(ids=ids, data=outputs)
 
     assert len(outputs) == len(dataset), "Data was not collected correctly after DDP sync."
     assert list(range(len(dataset))) == ids, "Data was not collected correctly after DDP sync."

oml/lightning/callbacks/metric.py (+2 -2)

@@ -8,7 +8,7 @@
 from pytorch_lightning.utilities.types import STEP_OUTPUT
 from torch.utils.data import DataLoader
 
-from oml.const import LOG_IMAGE_FOLDER
+from oml.const import INDEX_KEY, LOG_IMAGE_FOLDER
 from oml.ddp.patching import check_loaders_is_patched, patch_dataloader_to_ddp
 from oml.interfaces.loggers import IFigureLogger
 from oml.interfaces.metrics import IBasicMetric, IMetricDDP, IMetricVisualisable
@@ -83,7 +83,7 @@ def on_validation_batch_end(
         if dataloader_idx == self.loader_idx:
             assert self._ready_to_accumulate
 
-            self.metric.update_data(outputs)
+            self.metric.update_data(outputs, indices=outputs[INDEX_KEY].tolist())
 
             self._collected_samples += len(outputs[list(outputs.keys())[0]])
             if self._collected_samples > self._expected_samples:
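
The callback change above simply forwards the batch's global dataset positions, stored under INDEX_KEY, into the metric. A minimal sketch of that data flow; the batch keys other than INDEX_KEY and the constant's value are illustrative assumptions, not OML's actual defaults:

    import torch

    INDEX_KEY = "idx"  # assumed value for illustration; OML defines the real constant in oml.const

    # what a validation batch output might look like
    outputs = {
        "embeddings": torch.randn(4, 16),
        "labels": torch.tensor([0, 0, 1, 1]),
        INDEX_KEY: torch.tensor([12, 5, 40, 3]),  # global dataset positions of this batch
    }

    # equivalent to the new call in on_validation_batch_end()
    indices = outputs[INDEX_KEY].tolist()  # -> [12, 5, 40, 3]
    # self.metric.update_data(outputs, indices=indices)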

oml/metrics/accumulation.py (+64 -25)

@@ -1,16 +1,17 @@
-from typing import Any, Dict, List, Sequence, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
-from torch.distributed import get_world_size
+from torch import Tensor
 
-from oml.ddp.utils import is_ddp, sync_dicts_ddp
+from oml.ddp.utils import get_world_size_safe, sync_dicts_ddp
+from oml.utils.misc_torch import unique_by_ids
 
-TStorage = Dict[str, Union[torch.Tensor, np.ndarray, List[Any]]]
+TStorage = Dict[str, Union[Tensor, np.ndarray, List[Any]]]
 
 
 class Accumulator:
-    def __init__(self, keys_to_accumulate: Sequence[str]):
+    def __init__(self, keys_to_accumulate: Tuple[str, ...]):
         """
         Class for accumulating values of different types, for instance,
         torch.Tensor and numpy.array.
@@ -27,12 +28,15 @@ def __init__(self, keys_to_accumulate: Sequence[str]):
         self._collected_samples = 0
         self._storage: TStorage = dict()
 
+        self._indices_key = "__element_indices"  # internal key to keep track of elements order if provided
+
     def refresh(self, num_samples: int) -> None:
         """
         This method refreshes the state.
 
         Args:
-            num_samples: The total number of elements you are going to collect (for memory allocation)
+            num_samples: The total number of elements you are going to collect (for memory allocation).
+
         """
         assert isinstance(num_samples, int) and num_samples > 0
         self.num_samples = num_samples  # type: ignore
@@ -75,20 +79,39 @@ def _put_in_storage(self, key: str, batch_value: Any) -> None:
         else:
             raise TypeError(f"Type '{type(batch_value)}' is not available for accumulating")
 
-    def update_data(self, data_dict: Dict[str, Any]) -> None:
+    def update_data(self, data_dict: Dict[str, Any], indices: Optional[List[int]] = None) -> None:
         """
         Args:
-            data_dict: We will accumulate data getting values via ``self.keys_to_accumulate``.
+            data_dict: We will accumulate data getting values via ``self.keys_to_accumulate``. All elements
+                of the dictionary have to have the same size.
+            indices: Global indices of the elements in your batch of data. If provided, the accumulator
+                will remove accumulated duplicates and return the elements in the sorted order after ``.sync()``.
+                Indices may be useful in DDP (because data is gathered shuffled, additionally you may also get
+                some duplicates due to padding). In the single device regime it's also useful if you accumulate
+                data in shuffled order.
 
         """
-        bs_values = [len(data_dict[k]) for k in self.keys_to_accumulate]
+        keys = list(self.keys_to_accumulate)
+
+        if indices is None:
+            assert self._indices_key not in self.storage, "We are tracking ids, but they are not currently provided."
+        else:
+            assert isinstance(indices, List)
+            if (self.collected_samples > 0) and (self._indices_key not in self.storage):
+                raise RuntimeError("You provided ids, but seems like you had not done it before.")
+
+            keys += [self._indices_key]
+            data_dict[self._indices_key] = indices
+
+        bs_values = [len(data_dict[k]) for k in keys]
         bs = bs_values[0]
         assert all(bs == bs_value for bs_value in bs_values), f"Lengths of data are not equal, lengths: {bs_values}"
 
-        for k in self.keys_to_accumulate:
+        for k in keys:
            v = data_dict[k]
            self._allocate_memory_if_need(k, v)
            self._put_in_storage(k, v)
+
        self._collected_samples += bs
 
     @property
@@ -103,31 +126,47 @@ def is_storage_full(self) -> bool:
         return self.num_samples == self.collected_samples
 
     def sync(self) -> "Accumulator":
+        """
+        The method drops duplicates and sort elements by indices if they have been provided in ``self.update_data()``.
+        In DDP it also gathers data collected on several devices.
+
+        """
         # TODO: add option to broadcast instead of sync to avoid duplicating data
         if not self.is_storage_full():
             raise ValueError("Only full storages could be synced")
 
-        if is_ddp():
-            world_size = get_world_size()
-            if world_size == 1:
-                return self
-            else:
-                params = {"num_samples": [self.num_samples], "keys_to_accumulate": self.keys_to_accumulate}
+        params = {"num_samples": [self.num_samples], "keys_to_accumulate": self.keys_to_accumulate}
+        storage = self._storage
 
-                gathered_params = sync_dicts_ddp(params, world_size=world_size, device="cpu")
-                gathered_storage = sync_dicts_ddp(self._storage, world_size=world_size, device="cpu")
+        world_size = get_world_size_safe()
+        need_rebuilding = False
 
-                assert set(gathered_params["keys_to_accumulate"]) == set(
-                    self.keys_to_accumulate
-                ), "Keys of accumulators should be the same on each device"
+        if world_size > 1:
+            params = sync_dicts_ddp(params, world_size=world_size, device="cpu")
+            storage = sync_dicts_ddp(self._storage, world_size=world_size, device="cpu")
+            need_rebuilding = True
 
-                synced_accum = Accumulator(list(set(gathered_params["keys_to_accumulate"])))
-                synced_accum.refresh(sum(gathered_params["num_samples"]))
-                synced_accum.update_data(gathered_storage)
+        assert set(params["keys_to_accumulate"]) == set(
+            self.keys_to_accumulate
+        ), "Keys of accumulators should be the same on each device"
 
-                return synced_accum
+        if self._indices_key in storage:
+            for key, data in storage.items():
+                storage[key] = unique_by_ids(storage[self._indices_key], data)[1]  # type: ignore
+            indices = storage[self._indices_key]
+            need_rebuilding = True
         else:
+            indices = None
+
+        if not need_rebuilding:
+            # If indices were not provided & it's not DDP we may save time & memory avoiding re-building accumulator
             return self
 
+        synced_accum = Accumulator(tuple(set(params["keys_to_accumulate"])))
+        synced_accum.refresh(num_samples=len(storage[list(storage.keys())[0]]))
+        synced_accum.update_data(storage, indices=indices)
+
+        return synced_accum
+
 
 __all__ = ["TStorage", "Accumulator"]
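
Taken together, the new ``indices`` argument and the reworked ``sync()`` mean a storage built from shuffled, partially duplicated batches comes back deduplicated and in dataset order, even without DDP. A minimal single-process sketch, assuming only the constructor and method signatures shown in this diff (tensor values are illustrative) and mirroring what the new test_fake_ddp_accumulator below checks:

    import torch

    from oml.metrics.accumulation import Accumulator

    acc = Accumulator(keys_to_accumulate=("embeddings",))
    acc.refresh(num_samples=5)  # 4 unique records plus 1 duplicate

    # batches arrive shuffled; index 2 shows up twice, as it would after DDP padding
    acc.update_data({"embeddings": torch.zeros(3, 8)}, indices=[2, 0, 3])
    acc.update_data({"embeddings": torch.ones(2, 8)}, indices=[1, 2])

    synced = acc.sync()  # single process, but duplicates are still dropped and order restored

    indices_synced = synced.storage[acc._indices_key]
    assert sorted(indices_synced) == [0, 1, 2, 3]    # duplicates gone, dataset order recoverable
    assert len(synced.storage["embeddings"]) == 4    # one row per unique index

In DDP the same ``sync()`` call additionally gathers the per-device storages first (the ``world_size > 1`` branch above) and then applies the same deduplication.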

oml/metrics/embeddings.py (+16 -2)

@@ -161,8 +161,18 @@ def setup(self, num_samples: int) -> None:  # type: ignore
 
         self.acc.refresh(num_samples=num_samples)
 
-    def update_data(self, data_dict: Dict[str, Any]) -> None:  # type: ignore
-        self.acc.update_data(data_dict=data_dict)
+    def update_data(self, data_dict: Dict[str, Any], indices: Optional[List[int]] = None) -> None:  # type: ignore
+        """
+        Args:
+            data_dict: Batch of data containing records of the same size: ``bs``.
+            indices: Global indices of the elements in your records within the range of ``(0, dataset_size - 1)``.
+                Indices are needed in DDP (because data is gathered shuffled, additionally you may also get
+                some duplicates due to padding). In the single device regime it's may be useful if you accumulate
+                data in shuffled order.
+
+        """
+        # todo 522: make indices non optional and add the test
+        self.acc.update_data(data_dict=data_dict, indices=indices)
 
     def _calc_matrices(self) -> None:
         embeddings = self.acc.storage[self.embeddings_key]
@@ -382,5 +392,9 @@ class EmbeddingMetricsDDP(EmbeddingMetrics, IMetricDDP):
     def sync(self) -> None:
         self.acc = self.acc.sync()
 
+    def update_data(self, data_dict: Dict[str, Any], indices: List[int]) -> None:  # type: ignore
+        # indices are obligatory in DDP
+        return super().update_data(data_dict, indices)
+
 
 __all__ = ["TMetricsDict_ByLabels", "EmbeddingMetrics", "EmbeddingMetricsDDP"]

oml/utils/misc_torch.py (+19 -19)

@@ -7,8 +7,6 @@
 import torch
 from torch import Tensor, cdist
 
-from oml.utils.misc import find_first_occurrences
-
 TSingleValues = Union[int, float, np.float_, np.int_, torch.Tensor]
 TSequenceValues = Union[List[float], Tuple[float, ...], np.ndarray, torch.Tensor]
 TOnlineValues = Union[TSingleValues, TSequenceValues]
@@ -126,31 +124,33 @@ def _check_is_sequence(val: Any) -> bool:
     return False
 
 
-def drop_duplicates_by_ids(ids: List[Hashable], data: Tensor, sort: bool = True) -> Tuple[List[Hashable], Tensor]:
+TData = Tuple[List[Any], Tensor, np.ndarray]
+
+
+def unique_by_ids(ids: List[int], data: TData) -> Tuple[List[int], TData]:
     """
-    The function returns rows of data that have unique ids.
-    Thus, if there are multiple occurrences of some id, it leaves the first one.
+    The function sort data by the corresponding indices and drops duplicates.
+    Thus, if there are multiple occurrences of the same id, it takes the first one.
 
     Args:
-        ids: Identifiers of data records with the length of ``N``
-        data: Tensor of data records in the shape of ``[N, *]``
-        sort: Set ``True`` to return unique records sorted by their ids
+        ids: Indices of data with the length of ``N``
+        data: Data with the length of ``N``
 
     Returns:
-        Unique data records with their ids
+        Unique data records with their ids in the sorted order without duplicates
 
     """
-    assert isinstance(ids, list)
-    ids_first = find_first_occurrences(ids)
-    ids = [ids[i] for i in ids_first]
-    data = data[ids_first]
+    assert len(ids) == len(data)
+    assert isinstance(ids, list) and len(ids) >= 1
+
+    ids_unq, positions_unq = np.unique(ids, return_index=True)
 
-    if sort:
-        ii_permute = torch.argsort(torch.tensor(ids))
-        ids = [ids[i] for i in ii_permute]
-        data = data[ii_permute]
+    if isinstance(data, (list, tuple)):
+        data = [data[i] for i in positions_unq]  # type: ignore
+    else:
+        data = data[positions_unq]
 
-    return ids, data
+    return ids_unq.tolist(), data
 
 
 @contextmanager
@@ -465,6 +465,6 @@ def _check_dimensions(self, n_components: int) -> None:
     "take_2d",
     "assign_2d",
     "PCA",
-    "drop_duplicates_by_ids",
+    "unique_by_ids",
     "normalise",
 ]
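
The semantics of the new helper in a nutshell: ids come back sorted, only the first occurrence of a duplicated id survives, and the rows of data are reordered accordingly (it relies on numpy.unique with return_index=True). A small check mirroring the implementation above; the values are made up:

    import torch

    from oml.utils.misc_torch import unique_by_ids

    ids = [3, 0, 3, 1]                                   # id 3 occurs twice
    data = torch.tensor([[30.0], [0.0], [31.0], [10.0]])

    ids_unq, data_unq = unique_by_ids(ids=ids, data=data)

    assert ids_unq == [0, 1, 3]                          # sorted, duplicates dropped
    assert data_unq.tolist() == [[0.0], [10.0], [30.0]]  # row for id=3 is its first occurrence

Unlike the old drop_duplicates_by_ids, sorting is no longer optional, which is exactly the behaviour the Accumulator relies on.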

tests/test_integrations/test_lightning/test_pipeline.py (+8 -1)

@@ -12,6 +12,7 @@
 
 from oml.const import (
     EMBEDDINGS_KEY,
+    INDEX_KEY,
     INPUT_TENSORS_KEY,
     IS_GALLERY_KEY,
     IS_QUERY_KEY,
@@ -31,7 +32,13 @@ def __init__(self, labels: List[int], im_size: int):
     def __getitem__(self, item: int) -> Dict[str, Any]:
         input_tensors = torch.rand((3, self.im_size, self.im_size))
         label = torch.tensor(self.labels[item]).long()
-        return {INPUT_TENSORS_KEY: input_tensors, LABELS_KEY: label, IS_QUERY_KEY: True, IS_GALLERY_KEY: True}
+        return {
+            INPUT_TENSORS_KEY: input_tensors,
+            LABELS_KEY: label,
+            IS_QUERY_KEY: True,
+            IS_GALLERY_KEY: True,
+            INDEX_KEY: item,
+        }
 
     def __len__(self) -> int:
         return len(self.labels)

tests/test_oml/test_ddp/test_accumulator.py (+41 -11)

@@ -12,31 +12,61 @@
 @pytest.mark.long
 @pytest.mark.parametrize("world_size", [1, 2, 3])
 @pytest.mark.parametrize("device", ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"])
-def test_ddp_accumulator(world_size: int, device: str) -> None:
-    run_in_ddp(world_size=world_size, fn=check_ddp_accumulator, args=(device,))
+@pytest.mark.parametrize("create_duplicate", [True, False])
+def test_ddp_accumulator(world_size: int, device: str, create_duplicate: bool) -> None:
+    run_in_ddp(world_size=world_size, fn=check_ddp_accumulator, args=(device, create_duplicate))
 
 
-def check_ddp_accumulator(rank: int, world_size: int, device: str) -> None:
+@pytest.mark.parametrize("device", ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"])
+@pytest.mark.parametrize("create_duplicate", [True, False])
+def test_fake_ddp_accumulator(device: str, create_duplicate: bool) -> None:
+    # we expect the same duplicate removing behaviour without initializing DDP
+    check_accumulator(rank=0, world_size=1, device=device, create_duplicate=create_duplicate)
+
+
+def check_ddp_accumulator(rank: int, world_size: int, device: str, create_duplicate: bool) -> None:
     init_ddp(rank, world_size)
+    check_accumulator(rank, world_size, device, create_duplicate)
 
+
+def check_accumulator(rank: int, world_size: int, device: str, create_duplicate: bool) -> None:
     value = rank + 1
+    size = value
+
+    indices = {0: [0], 1: [1, 2], 2: [3, 4, 5]}[rank]
+
+    if create_duplicate and (rank == 0):
+        # let's pretend we doubled our single record at the rank 0
+        size = 2
+        indices = [0, 0]
 
     data = {
-        "list": [value] * value,
-        "tensor_1d": value * torch.ones(value, device=device),
-        "tensor_3d": value * torch.ones((value, 2, 3), device=device),
-        "numpy_1d": value * np.ones(value),
-        "numpy_3d": value * np.ones((value, 4, 5)),
+        "list": [value] * size,
+        "tensor_1d": value * torch.ones(size, device=device),
+        "tensor_3d": value * torch.ones((size, 2, 3), device=device),
+        "numpy_1d": value * np.ones(size),
+        "numpy_3d": value * np.ones((size, 4, 5)),
     }
 
-    acc = Accumulator(keys_to_accumulate=list(data.keys()))
+    acc = Accumulator(keys_to_accumulate=tuple(data.keys()))
     acc.refresh(len(data["list"]))
-    acc.update_data(data)
+    acc.update_data(data, indices=indices)
+
+    acc_synced = acc.sync()
+    synced_data = acc_synced.storage
+    synced_num_samples = acc_synced.num_samples
 
-    synced_data = acc.sync().storage
+    assert acc_synced.is_storage_full()
 
     len_after_sync = sum(range(1, world_size + 1))
 
+    indices_synced = synced_data[acc._indices_key]
+
+    assert len_after_sync == synced_num_samples
+
+    assert len(indices_synced) == len(set(indices_synced))
+    assert sorted(indices_synced) == list(range(len_after_sync))
+
     assert len(synced_data["list"]) == len_after_sync
 
     assert synced_data["tensor_1d"].ndim == 1  # type: ignore
