Skip to content

Commit d975425

Browse files
marcenacp authored and The TensorFlow Datasets Authors committed
Read the length of the datasource from the FileInstructions to limit I/O.
PiperOrigin-RevId: 737687954
1 parent 27547b2 commit d975425

File tree

4 files changed

+16
-13
lines changed

4 files changed

+16
-13
lines changed

tensorflow_datasets/core/data_sources/array_record.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ class ArrayRecordDataSource(base.BaseDataSource):
5656
length: int = dataclasses.field(init=False)
5757

5858
def __post_init__(self):
59-
file_instructions = base.file_instructions(self.dataset_info, self.split)
59+
file_instructions = self.split_info.file_instructions
6060
self.data_source = array_record_data_source.ArrayRecordDataSource(
6161
file_instructions
6262
)

tensorflow_datasets/core/data_sources/base.py

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -45,16 +45,6 @@ def __getitems__(self, keys: Iterable[int]) -> T:
4545
"""Returns the value for the given `keys`."""
4646

4747

48-
def file_instructions(
49-
dataset_info: dataset_info_lib.DatasetInfo,
50-
split: splits_lib.Split | None = None,
51-
) -> list[shard_utils.FileInstruction]:
52-
"""Retrieves the file instructions from the DatasetInfo."""
53-
split_infos = dataset_info.splits.values()
54-
split_dict = splits_lib.SplitDict(split_infos=split_infos)
55-
return split_dict[split].file_instructions
56-
57-
5848
@dataclasses.dataclass
5949
class BaseDataSource(MappingView, Sequence):
6050
"""Base DataSource to override all dunder methods with the deserialization.
@@ -94,6 +84,13 @@ def _deserialize(self, record: Any) -> Any:
9484
return features.deserialize_example_np(record, decoders=self.decoders) # pylint: disable=attribute-error
9585
raise ValueError('No features set, cannot decode example!')
9686

87+
@property
88+
def split_info(self) -> splits_lib.SplitInfo | splits_lib.SubSplitInfo:
89+
"""Returns the SplitInfo for the split."""
90+
split_infos = self.dataset_info.splits.values()
91+
splits_dict = splits_lib.SplitDict(split_infos=split_infos)
92+
return splits_dict[self.split] # will raise an error if split is not found
93+
9794
def __getitem__(self, key: SupportsIndex) -> Any:
9895
record = self.data_source[key.__index__()]
9996
return self._deserialize(record)
@@ -133,7 +130,7 @@ def __repr__(self) -> str:
133130
)
134131

135132
def __len__(self) -> int:
136-
return self.data_source.__len__()
133+
return sum(fi.take for fi in self.split_info.file_instructions)
137134

138135
def __iter__(self):
139136
for i in range(self.__len__()):

tensorflow_datasets/core/data_sources/base_test.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,12 @@ def test_read_write(
9494
for i, element in enumerate(data_source):
9595
assert element == {'id': i}
9696

97+
# Also works on sliced splits.
98+
data_source = builder.as_data_source(split='train[0:2]')
99+
assert len(data_source) == 2
100+
data_source = builder.as_data_source(split='train[:50%]')
101+
assert len(data_source) == 2
102+
97103

98104
_FILE_INSTRUCTIONS = [
99105
shard_utils.FileInstruction(

tensorflow_datasets/core/data_sources/parquet.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ class ParquetDataSource(base.BaseDataSource):
5757
"""ParquetDataSource to read from a ParquetDataset."""
5858

5959
def __post_init__(self):
60-
file_instructions = base.file_instructions(self.dataset_info, self.split)
60+
file_instructions = self.split_info.file_instructions
6161
filenames = [
6262
file_instruction.filename for file_instruction in file_instructions
6363
]

0 commit comments

Comments (0)