22
22
import sys
23
23
from typing import Any , Dict , List , Union
24
24
25
- import datasets .make_datasets as make_datasets
26
25
import utils
27
26
from pathlib import Path
28
27
@@ -84,8 +83,16 @@ def get_configs(path: Path) -> List[str]:
84
83
stream = sys .stdout , format = '%(levelname)s: %(message)s' , level = args .verbose )
85
84
hostname = socket .gethostname ()
86
85
87
- # make directory for data if it doesn't exist
88
- os .makedirs ('data' , exist_ok = True )
86
+ env = os .environ .copy ()
87
+ if 'DATASETSROOT' in env :
88
+ datasets_root = env ['DATASETSROOT' ]
89
+ logging .info (f'Datasets folder at { datasets_root } ' )
90
+ elif 'DAAL_DATASETS' in env :
91
+ datasets_root = env ['DAAL_DATASETS' ]
92
+ logging .info (f'Datasets folder at { datasets_root } ' )
93
+ else :
94
+ datasets_root = ''
95
+ logging .info ('Datasets folder is not set, using local folder' )
89
96
90
97
json_result : Dict [str , Union [Dict [str , Any ], List [Any ]]] = {
91
98
'hardware' : utils .get_hw_parameters (),
@@ -155,23 +162,41 @@ def get_configs(path: Path) -> List[str]:
155
162
for dataset in params_set ['dataset' ]:
156
163
if dataset ['source' ] in ['csv' , 'npy' ]:
157
164
dataset_name = dataset ['name' ] if 'name' in dataset else 'unknown'
158
- if 'training' not in dataset or \
159
- 'x' not in dataset ['training' ] or \
160
- not utils .find_the_dataset (dataset_name ,
161
- dataset ['training' ]['x' ]):
165
+ if 'training' not in dataset or 'x' not in dataset ['training' ]:
162
166
logging .warning (
163
167
f'Dataset { dataset_name } could not be loaded. \n '
164
- 'Check the correct name or expand the download in '
165
- 'the folder dataset.' )
168
+ 'Training data for algorithm is not specified '
169
+ )
166
170
continue
167
- paths = '--file-X-train ' + dataset ['training' ]["x" ]
171
+
172
+ files = {}
173
+
174
+ files ['file-X-train' ] = dataset ['training' ]["x" ]
168
175
if 'y' in dataset ['training' ]:
169
- paths += ' -- file-y-train ' + dataset ['training' ]["y" ]
176
+ files [ ' file-y-train' ] = dataset ['training' ]["y" ]
170
177
if 'testing' in dataset :
171
- paths += ' -- file-X-test ' + dataset ["testing" ]["x" ]
178
+ files [ ' file-X-test' ] = dataset ["testing" ]["x" ]
172
179
if 'y' in dataset ['testing' ]:
173
- paths += ' --file-y-test ' + \
174
- dataset ["testing" ]["y" ]
180
+ files ['file-y-test' ] = dataset ["testing" ]["y" ]
181
+
182
+ dataset_path = utils .find_the_dataset (dataset_name , datasets_root ,
183
+ files .values ())
184
+ if dataset_path is None :
185
+ logging .warning (
186
+ f'Dataset { dataset_name } could not be loaded. \n '
187
+ 'Check the correct name or expand the download in '
188
+ 'the folder dataset.'
189
+ )
190
+ continue
191
+ elif not dataset_path and datasets_root :
192
+ logging .info (
193
+ f'{ dataset_name } is taken from local folder'
194
+ )
195
+
196
+ paths = ''
197
+ for data_path , data_file in files .items ():
198
+ paths += f'--{ data_path } { os .path .join (dataset_path , data_file )} '
199
+
175
200
elif dataset ['source' ] == 'synthetic' :
176
201
class GenerationArgs :
177
202
classes : int
@@ -186,7 +211,6 @@ class GenerationArgs:
186
211
test_samples : int
187
212
type : str
188
213
gen_args = GenerationArgs ()
189
- paths = ''
190
214
191
215
if 'seed' in params_set :
192
216
gen_args .seed = params_set ['seed' ]
@@ -210,38 +234,42 @@ class GenerationArgs:
210
234
file_prefix = f'data/synthetic-{ gen_args .type } { cls_num_for_file } -'
211
235
file_postfix = f'-{ gen_args .samples } x{ gen_args .features } .npy'
212
236
213
- isfiles = True
237
+ files = {}
214
238
gen_args .filex = f'{ file_prefix } X-train{ file_postfix } '
215
- paths += f' --file-X-train { gen_args .filex } '
216
- isfiles = isfiles and os .path .isfile (gen_args .filex )
239
+ files ['file-X-train' ] = gen_args .filex
217
240
if gen_args .type not in ['blobs' ]:
218
241
gen_args .filey = f'{ file_prefix } y-train{ file_postfix } '
219
- paths += f' --file-y-train { gen_args .filey } '
220
- isfiles = isfiles and os .path .isfile (gen_args .filey )
242
+ files ['file-y-train' ] = gen_args .filey
221
243
222
244
if 'testing' in dataset :
223
245
gen_args .test_samples = dataset ['testing' ]['n_samples' ]
224
246
gen_args .filextest = f'{ file_prefix } X-test{ file_postfix } '
225
- paths += f' --file-X-test { gen_args .filextest } '
226
- isfiles = isfiles and os .path .isfile (gen_args .filextest )
247
+ files ['file-X-test' ] = gen_args .filextest
227
248
if gen_args .type not in ['blobs' ]:
228
249
gen_args .fileytest = f'{ file_prefix } y-test{ file_postfix } '
229
- paths += f' --file-y-test { gen_args .fileytest } '
230
- isfiles = isfiles and os .path .isfile (gen_args .fileytest )
250
+ files ['file-y-test' ] = gen_args .fileytest
231
251
else :
232
252
gen_args .test_samples = 0
233
253
gen_args .filextest = gen_args .filex
254
+ files ['file-X-test' ] = gen_args .filextest
234
255
if gen_args .type not in ['blobs' ]:
235
256
gen_args .fileytest = gen_args .filey
257
+ files ['file-y-test' ] = gen_args .filey
236
258
237
- if not args .dummy_run and not isfiles :
238
- if gen_args .type == 'regression' :
239
- make_datasets .gen_regression (gen_args )
240
- elif gen_args .type == 'classification' :
241
- make_datasets .gen_classification (gen_args )
242
- elif gen_args .type == 'blobs' :
243
- make_datasets .gen_blobs (gen_args )
244
259
dataset_name = f'synthetic_{ gen_args .type } '
260
+
261
+ if not args .dummy_run :
262
+ dataset_path = utils .find_or_gen_dataset (gen_args ,
263
+ datasets_root , files .values ())
264
+ if dataset_path is None :
265
+ logging .warning (
266
+ f'Dataset { dataset_name } could not be generated. \n '
267
+ )
268
+ continue
269
+
270
+ paths = ''
271
+ for data_path , data_file in files .items ():
272
+ paths += f'--{ data_path } { os .path .join (dataset_path , data_file )} '
245
273
else :
246
274
logging .warning ('Unknown dataset source. Only synthetics datasets '
247
275
'and csv/npy files are supported now' )
0 commit comments