Commit eb7ae91

marcenacp authored and The TensorFlow Datasets Authors committed
Stream from Hugging Face instead of downloading and preparing everything.
PiperOrigin-RevId: 657212303
1 parent 2123db7 commit eb7ae91

1 file changed: +36 -18 lines


tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder.py

@@ -108,9 +108,22 @@ class _ShardInfo:
   num_exceptions: int
 
 
+def _load_dataset(
+    hf_builder: hf_datasets.DatasetBuilder,
+    split: str,
+) -> hf_datasets.Dataset:
+  """Efficiently loads a HuggingFace iterable dataset from its builder."""
+  return hf_datasets.load_dataset(
+      hf_builder.repo_id,
+      hf_builder.config_id,
+      split=split,
+      streaming=True,
+  )
+
+
 def _write_shard(
     shard_spec: _ShardSpec,
-    hf_builder,
+    hf_builder: hf_datasets.DatasetBuilder,
     example_writer,
     features: feature_lib.FeaturesDict,
     ignore_hf_errors: bool,
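Note on the new helper: with streaming=True, load_dataset returns an iterable dataset whose examples are fetched lazily from the Hub, which is what lets the builder skip the download-and-prepare step entirely. A minimal standalone sketch of that pattern against the public datasets package (the 'mnist' repo id and split are illustrative, not taken from this commit):

import datasets  # imported as `hf_datasets` in the file above

# Streaming returns an iterable dataset: nothing is materialized on disk,
# examples arrive lazily as the iterator is consumed.
ds = datasets.load_dataset('mnist', split='train', streaming=True)
first = next(iter(ds))  # a plain dict mapping feature names to values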
@@ -136,12 +149,19 @@ def _write_shard(
   def get_serialized_examples_iter():
     nonlocal num_bytes
     nonlocal num_exceptions
-    dataset = hf_builder.as_dataset(
-        split=shard_spec.shard_split, run_post_process=False
+    dataset = _load_dataset(
+        hf_builder,
+        shard_spec.hf_split,
     )
+    dataset = iter(dataset)
     for i in range(shard_spec.num_examples):
+      if i < shard_spec.start_index:
+        next(dataset)
+        continue
+      if i >= shard_spec.end_index:
+        break
       try:
-        hf_value = dataset[i]
+        hf_value = next(dataset)
       except Exception:  # pylint: disable=broad-exception-caught
         num_exceptions += 1
         if ignore_hf_errors:
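The rewritten loop replaces random access (dataset[i]) with a window over a forward-only iterator: examples before shard_spec.start_index are drained and discarded, and iteration stops at shard_spec.end_index. A self-contained sketch of the equivalent windowing with itertools.islice, where a plain range stands in for the streamed examples (the helper name shard_window and the bounds are illustrative):

import itertools

def shard_window(examples, start_index, end_index):
  """Yields only the examples in [start_index, end_index)."""
  # Same effect as the skip/continue/break logic above: islice advances
  # the underlying iterator past the prefix, then stops at the end bound.
  return itertools.islice(iter(examples), start_index, end_index)

assert list(shard_window(range(100), 10, 13)) == [10, 11, 12]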
@@ -257,14 +277,6 @@ def _create_builder_config(
   ) -> Optional[dataset_builder.BuilderConfig]:
     return self._converted_builder_config
 
-  @functools.lru_cache(maxsize=1)
-  def _hf_download_and_prepare(self):
-    login_to_hf(self._hf_hub_token)
-    self._hf_builder.download_and_prepare(
-        num_proc=self._hf_num_proc,
-        verification_mode=self._verification_mode,
-    )
-
   @property
   def _hf_info(self) -> hf_datasets.DatasetInfo:
     """Retrieves the dataset info from the HuggingFace Datasets."""
@@ -278,11 +290,18 @@ def _hf_hub_info(self) -> huggingface_hub.hf_api.DatasetInfo:
     )
 
   def _hf_features(self) -> hf_datasets.Features:
-    if not self._hf_info.features:
-      # We need to download and prepare the data to know its features.
-      self._hf_download_and_prepare()
-
-    return self._hf_info.features
+    # Return the features from the builder info.
+    if self._hf_info.features:
+      return self._hf_info.features
+    # Return the features from the first split.
+    for split in self._hf_info.splits:
+      ds = _load_dataset(
+          self._hf_builder,
+          split,
+      )
+      if hasattr(ds, 'info') and ds.info.features:
+        return ds.info.features
+    raise ValueError('No features found in the dataset.')
 
   def _info(self) -> dataset_info_lib.DatasetInfo:
     return dataset_info_lib.DatasetInfo(
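The new _hf_features resolves features without preparing any data: it trusts the builder-level DatasetInfo when it is populated, and otherwise peeks at the info carried by a streamed split. A hedged sketch of the same fallback using only the public datasets API (the function name resolve_features is hypothetical, and this assumes split metadata is available even when features are not):

import datasets

def resolve_features(repo_id: str) -> datasets.Features:
  # Builder-level metadata is fetched without downloading any data.
  info = datasets.load_dataset_builder(repo_id).info
  if info.features:
    return info.features
  # Fall back to streaming each split and reading its attached info.
  for split in info.splits:
    ds = datasets.load_dataset(repo_id, split=split, streaming=True)
    if hasattr(ds, 'info') and ds.info.features:
      return ds.info.features
  raise ValueError('No features found in the dataset.')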
@@ -309,7 +328,6 @@ def _generate_splits(
   ) -> Sequence[splits_lib.SplitInfo]:
     """Prepares the dataset by writing to shards directly."""
     del dl_manager, download_config  # Unused.
-    self._hf_download_and_prepare()
 
     shard_specs_by_split: dict[str, Sequence[_ShardSpec]] = {}
     for hf_split, hf_split_info in self._hf_info.splits.items():
