@@ -108,9 +108,22 @@ class _ShardInfo:
   num_exceptions: int


+def _load_dataset(
+    hf_builder: hf_datasets.DatasetBuilder,
+    split: str,
+) -> hf_datasets.Dataset:
+  """Efficiently loads a HuggingFace iterable dataset from its builder."""
+  return hf_datasets.load_dataset(
+      hf_builder.repo_id,
+      hf_builder.config_id,
+      split=split,
+      streaming=True,
+  )
+
+
 def _write_shard(
     shard_spec: _ShardSpec,
-    hf_builder,
+    hf_builder: hf_datasets.DatasetBuilder,
     example_writer,
     features: feature_lib.FeaturesDict,
     ignore_hf_errors: bool,
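The new `_load_dataset` helper swaps local preparation for streaming, so shard writing no longer needs the full dataset materialized on disk. A minimal sketch of the behavior it relies on, assuming the `datasets` package and using 'mnist' as a stand-in repo id (not part of this change):

import datasets as hf_datasets

# streaming=True returns an IterableDataset: examples arrive lazily from
# the Hub and no local Arrow cache is built.
ds = hf_datasets.load_dataset('mnist', split='train', streaming=True)
first_example = next(iter(ds))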
@@ -136,12 +149,19 @@ def _write_shard(
   def get_serialized_examples_iter():
     nonlocal num_bytes
     nonlocal num_exceptions
-    dataset = hf_builder.as_dataset(
-        split=shard_spec.shard_split, run_post_process=False
+    dataset = _load_dataset(
+        hf_builder,
+        shard_spec.hf_split,
     )
+    dataset = iter(dataset)
     for i in range(shard_spec.num_examples):
+      if i < shard_spec.start_index:
+        next(dataset)
+        continue
+      if i >= shard_spec.end_index:
+        break
       try:
-        hf_value = dataset[i]
+        hf_value = next(dataset)
       except Exception:  # pylint: disable=broad-exception-caught
         num_exceptions += 1
         if ignore_hf_errors:
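A streaming dataset has no random access, so the writer now advances the iterator past `start_index` and stops at `end_index` instead of indexing `dataset[i]`. The same window can be expressed with itertools.islice; a sketch of the equivalent logic, not the code above:

import itertools

def iter_shard_window(dataset, start_index, end_index):
  # Yields examples in [start_index, end_index) from an iterable dataset,
  # matching the skip/continue/break loop in the diff.
  yield from itertools.islice(iter(dataset), start_index, end_index)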
@@ -257,14 +277,6 @@ def _create_builder_config(
   ) -> Optional[dataset_builder.BuilderConfig]:
     return self._converted_builder_config

-  @functools.lru_cache(maxsize=1)
-  def _hf_download_and_prepare(self):
-    login_to_hf(self._hf_hub_token)
-    self._hf_builder.download_and_prepare(
-        num_proc=self._hf_num_proc,
-        verification_mode=self._verification_mode,
-    )
-
   @property
   def _hf_info(self) -> hf_datasets.DatasetInfo:
     """Retrieves the dataset info from the HuggingFace Datasets."""
@@ -278,11 +290,18 @@ def _hf_hub_info(self) -> huggingface_hub.hf_api.DatasetInfo:
     )

   def _hf_features(self) -> hf_datasets.Features:
-    if not self._hf_info.features:
-      # We need to download and prepare the data to know its features.
-      self._hf_download_and_prepare()
-
-    return self._hf_info.features
+    # Return the features from the builder info.
+    if self._hf_info.features:
+      return self._hf_info.features
+    # Return the features from the first split.
+    for split in self._hf_info.splits:
+      ds = _load_dataset(
+          self._hf_builder,
+          split,
+      )
+      if hasattr(ds, 'info') and ds.info.features:
+        return ds.info.features
+    raise ValueError('No features found in the dataset.')

   def _info(self) -> dataset_info_lib.DatasetInfo:
     return dataset_info_lib.DatasetInfo(
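With `_hf_download_and_prepare` removed, `_hf_features` can no longer trigger a full download to learn the schema; it now falls back to the per-split info that streaming datasets carry. An illustrative check against a public dataset ('imdb' is a stand-in, not from this change):

import datasets as hf_datasets

ds = hf_datasets.load_dataset('imdb', split='train', streaming=True)
# An IterableDataset exposes the split's metadata, including its features.
print(ds.info.features)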
@@ -309,7 +328,6 @@ def _generate_splits(
   ) -> Sequence[splits_lib.SplitInfo]:
     """Prepares the dataset by writing to shards directly."""
     del dl_manager, download_config  # Unused.
-    self._hf_download_and_prepare()

     shard_specs_by_split: dict[str, Sequence[_ShardSpec]] = {}
     for hf_split, hf_split_info in self._hf_info.splits.items():