
Commit b6e403b

minor upd

1 parent b7a3fdd

6 files changed: +12 −22 lines

ml-runs/0/meta.yaml (−6)

This file was deleted.

oml/inference/abstract.py (+1 −5)

```diff
@@ -52,11 +52,7 @@ def _inference(
     ids, outputs = unique_by_ids(ids=ids, data=outputs)
 
     assert len(outputs) == len(dataset), "Data was not collected correctly after DDP sync."
-    assert list(range(len(dataset))) == ids, (
-        list(range(len(dataset))),
-        ids,
-        "zzz",
-    )  # , "Data was not collected correctly after DDP sync."
+    assert list(range(len(dataset))) == ids, "Data was not collected correctly after DDP sync."
 
     return outputs
 
```
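The surviving assert captures the whole contract in one line: after `unique_by_ids`, the collected ids must be exactly `0 .. len(dataset) - 1`. A minimal hedged sketch of the failure mode it guards against (the per-rank ids below are made up for illustration):

```python
# Hedged illustration: a DDP-style gather returns ids shuffled and, due to
# sampler padding, possibly duplicated; deduplication must restore full coverage.
rank0_ids = [0, 2, 4]
rank1_ids = [1, 3, 3]                  # rank 1 padded by repeating id 3
gathered = rank0_ids + rank1_ids       # order after all_gather is arbitrary

deduped = sorted(set(gathered))        # stand-in for what unique_by_ids does to ids
assert deduped == list(range(5)), "Data was not collected correctly after DDP sync."
```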

oml/metrics/accumulation.py (+3 −3)

```diff
@@ -86,7 +86,7 @@ def update_data(self, data_dict: Dict[str, Any], indices: Optional[List[int]] =
             data_dict: We will accumulate data getting values via ``self.keys_to_accumulate``. All elements
                 of the dictionary have to have the same size.
             indices: Global indices of the elements in your batch of data. If provided, the accumulator
-                will remove all the accumulated duplicates and return the elements in sorted order.
+                will remove accumulated duplicates and return the elements in the sorted order after ``.sync()``.
                 Indices may be useful in DDP (because data is gathered shuffled, additionally you may also get
                 some duplicates due to padding). In the single device regime it's also useful if you accumulate
                 data in shuffled order.
@@ -125,7 +125,7 @@ def is_storage_full(self) -> bool:
 
     def sync(self) -> "Accumulator":
         """
-        The method drops duplicates if ids have been provided in ``self.update_data``.
+        The method drops duplicates and sort elements by indices if they have been provided in ``self.update_data()``.
         In DDP it also gathers data collected on several devices.
 
         """
@@ -157,7 +157,7 @@ def sync(self) -> "Accumulator":
             indices = None
 
         if not need_rebuilding:
-            # If we found no duplicates and there are no multiple devices, we may save time & memory on re-creating
+            # If indices were not provided & it's not DDP we may save time & memory avoiding re-building accumulator
             return self
 
         synced_accum = Accumulator(tuple(set(params["keys_to_accumulate"])))
```
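To make the documented `sync()` behaviour concrete, here is a self-contained sketch (not the library code) of dropping duplicates and reordering a storage dict by the provided indices; with no indices and no DDP, the storage is returned untouched, matching the early-return branch above:

```python
from typing import Any, Dict, List, Optional

def sync_sketch(storage: Dict[str, List[Any]],
                indices: Optional[List[int]]) -> Dict[str, List[Any]]:
    # No indices and a single device: nothing to rebuild, return as-is.
    if indices is None:
        return storage
    first_pos: Dict[int, int] = {}
    for pos, idx in enumerate(indices):
        first_pos.setdefault(idx, pos)          # keep the first copy of each index
    order = [first_pos[idx] for idx in sorted(first_pos)]
    return {key: [values[p] for p in order] for key, values in storage.items()}

storage = {"embeddings": ["e2", "e0", "e1", "e0"], "labels": [2, 0, 1, 0]}
print(sync_sketch(storage, indices=[2, 0, 1, 0]))
# -> {'embeddings': ['e0', 'e1', 'e2'], 'labels': [0, 1, 2]}
```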

oml/metrics/embeddings.py (+4 −4)

```diff
@@ -164,10 +164,10 @@ def setup(self, num_samples: int) -> None:  # type: ignore
     def update_data(self, data_dict: Dict[str, Any], indices: Optional[List[int]] = None) -> None:  # type: ignore
         """
         Args:
-            data_dict: Batch of data containing elements of the same size: ``bs``.
-            indices: Global indices of the elements in your batch of data withing the range ``(0, dataset_size - 1)``.
+            data_dict: Batch of data containing records of the same size: ``bs``.
+            indices: Global indices of the elements in your records withing the range of ``(0, dataset_size - 1)``.
             Indices are needed in DDP (because data is gathered shuffled, additionally you may also get
-            some duplicates due to padding). In the single device regime it's also useful if you accumulate
+            some duplicates due to padding). In the single device regime it's may be useful if you accumulate
             data in shuffled order.
 
         """
@@ -392,7 +392,7 @@ def sync(self) -> None:
         self.acc = self.acc.sync()
 
     def update_data(self, data_dict: Dict[str, Any], indices: List[int]) -> None:  # type: ignore
-        # indices are obligatory in DDP
+        # indices are obligatory in DDP, so we don't accumulate shuffled data
         return super().update_data(data_dict, indices)
 
 
```
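Since indices are obligatory in the second hunk, the dataset or loader must expose each record's global position. A hedged sketch of one common way to do that (the class and key names below are illustrative, not the library's API):

```python
import torch
from torch.utils.data import DataLoader, Dataset

class DatasetWithIndices(Dataset):
    """Toy dataset that ships every record's global index along with the data."""

    def __init__(self, n: int):
        self.data = torch.arange(n, dtype=torch.float32)

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx: int):
        return {"input": self.data[idx], "idx": idx}

loader = DataLoader(DatasetWithIndices(5), batch_size=2, shuffle=True)
for batch in loader:
    indices = batch["idx"].tolist()     # these would be passed to update_data(...)
    print(indices, batch["input"].tolist())
```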

oml/utils/misc_torch.py (+3 −3)

```diff
@@ -129,12 +129,12 @@ def _check_is_sequence(val: Any) -> bool:
 
 def unique_by_ids(ids: List[int], data: TData) -> Tuple[List[int], TData]:
     """
-    The function sort data by the corresponding ids and drops duplicates.
+    The function sort data by the corresponding indices and drops duplicates.
     Thus, if there are multiple occurrences of the same id, it takes the first one.
 
     Args:
-        ids: Indices of data records with the length of ``N``
-        data: Data records with the lengths of ``N``
+        ids: Indices of data with the length of ``N``
+        data: Data with the length of ``N``
 
     Returns:
         Unique data records with their ids in the sorted order without duplicates
```
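For reference, a self-contained sketch of the documented semantics (first occurrence wins, output sorted by id); this is an illustration, not the actual implementation:

```python
from typing import Dict, List, Tuple

import torch

def unique_by_ids_sketch(ids: List[int], data: torch.Tensor) -> Tuple[List[int], torch.Tensor]:
    first_pos: Dict[int, int] = {}
    for pos, idx in enumerate(ids):
        first_pos.setdefault(idx, pos)      # remember where each id first appeared
    sorted_ids = sorted(first_pos)
    return sorted_ids, data[[first_pos[i] for i in sorted_ids]]

ids, data = unique_by_ids_sketch([3, 1, 1, 2], torch.tensor([30.0, 10.0, 99.0, 20.0]))
print(ids, data)  # [1, 2, 3] tensor([10., 20., 30.])
```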

tests/test_oml/test_ddp/test_accumulator.py (+1 −1)

```diff
@@ -20,7 +20,7 @@ def test_ddp_accumulator(world_size: int, device: str, create_duplicate: bool) -
 @pytest.mark.parametrize("device", ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"])
 @pytest.mark.parametrize("create_duplicate", [True, False])
 def test_fake_ddp_accumulator(device: str, create_duplicate: bool) -> None:
-    # we expect the same behaviour outside DDP
+    # we expect the same behaviour without initializing DDP
     check_accumulator(rank=0, world_size=1, device=device, create_duplicate=create_duplicate)
 
 
```
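The `create_duplicate` flag targets exactly the padding scenario described in the docstrings above. A trivial hedged sketch of the property being exercised (`check_accumulator`'s internals are not shown in this diff):

```python
import pytest

@pytest.mark.parametrize("create_duplicate", [True, False])
def test_dedup_property(create_duplicate: bool) -> None:
    indices = [0, 1, 2] + ([2] if create_duplicate else [])
    assert sorted(set(indices)) == [0, 1, 2]   # duplicates must collapse to one copy
```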
