File tree: 4 files changed, +13 -5 lines changed
Original file line number | Diff line number | Diff line change
@@ -155,10 +155,11 @@ def setup_source_data(hyperparameters: RuntimeHyperparameters) -> SourceDataset:
155
155
156
156
if hyperparameters ["source_data" ]["pre_tokenized" ]:
157
157
return PreTokenizedDataset (
158
- dataset_path = hyperparameters ["source_data" ]["dataset_path" ],
159
158
context_size = hyperparameters ["source_data" ]["context_size" ],
160
159
dataset_dir = dataset_dir ,
161
160
dataset_files = dataset_files ,
161
+ dataset_path = hyperparameters ["source_data" ]["dataset_path" ],
162
+ pre_download = hyperparameters ["source_data" ]["pre_download" ],
162
163
)
163
164
164
165
if hyperparameters ["source_data" ]["tokenizer_name" ] is None :
@@ -171,12 +172,13 @@ def setup_source_data(hyperparameters: RuntimeHyperparameters) -> SourceDataset:
171
172
tokenizer = AutoTokenizer .from_pretrained (hyperparameters ["source_data" ]["tokenizer_name" ])
172
173
173
174
return TextDataset (
174
- dataset_path = hyperparameters ["source_data" ]["dataset_path" ],
175
175
context_size = hyperparameters ["source_data" ]["context_size" ],
176
- tokenizer = tokenizer ,
177
176
dataset_dir = dataset_dir ,
178
177
dataset_files = dataset_files ,
178
+ dataset_path = hyperparameters ["source_data" ]["dataset_path" ],
179
179
n_processes_preprocessing = 4 ,
180
+ pre_download = hyperparameters ["source_data" ]["pre_download" ],
181
+ tokenizer = tokenizer ,
180
182
)
181
183
182
184
Original file line number | Diff line number | Diff line change
@@ -177,9 +177,12 @@ class SourceDataHyperparameters(NestedParameter):
177
177
dataset_dir : Parameter [str ] | None = field (default = None )
178
178
"""Dataset directory (within the HF dataset)"""
179
179
180
- dataset_files : Parameter [str ] | None = field (default = None )
180
+ dataset_files : Parameter [list [ str ] ] | None = field (default = None )
181
181
"""Dataset files (within the HF dataset)."""
182
182
183
+ pre_download : Parameter [bool ] = field (default = Parameter (value = False ))
184
+ """Whether to pre-download the dataset."""
185
+
183
186
pre_tokenized : Parameter [bool ] = field (default = Parameter (value = True ))
184
187
"""If the dataset is pre-tokenized."""
185
188
@@ -209,8 +212,9 @@ class SourceDataRuntimeHyperparameters(TypedDict):
209
212
210
213
context_size : int
211
214
dataset_dir : str | None
212
- dataset_files : str | None
215
+ dataset_files : list [ str ] | None
213
216
dataset_path : str
217
+ pre_download : bool
214
218
pre_tokenized : bool
215
219
tokenizer_name : str | None
216
220
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,7 @@ def dummy_hyperparameters() -> RuntimeHyperparameters:
53
53
"dataset_path" : "NeelNanda/c4-code-tokenized-2b" ,
54
54
"pre_tokenized" : True ,
55
55
"tokenizer_name" : None ,
56
+ "pre_download" : False ,
56
57
},
57
58
"source_model" : {
58
59
"dtype" : "float32" ,
Original file line number | Diff line number | Diff line change
@@ -269,6 +269,7 @@ def __repr__(self) -> str:
269
269
float ,
270
270
int ,
271
271
str ,
272
+ list [str ],
272
273
)
273
274
274
275
You can’t perform that action at this time.
0 commit comments