Commit 7e0fc7f

loading datasets from DATASETSROOT directory (#110)
* knn_svm
* Revert "knn_svm" (this reverts commit 6272ea5)
* datasets root
* renaming
* pep8
* process case with read only datasets root
* remove logging from utils
* syntetic
* enable DAAL_DATASETS
* fix
* fix
* pep8
* pep8
* pep8
* try fix
* download to data
* codefactor
* exception for unknown type
* fix logging
* fix pep8
1 parent 7f8c08b commit 7e0fc7f

File tree (3 files changed: +125 -48 lines)

  datasets/make_datasets.py
  runner.py
  utils.py
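In short, the commit replaces the hard-coded local `data/` folder with a datasets root resolved from the environment: `DATASETSROOT` takes precedence, the legacy `DAAL_DATASETS` variable is honored as a fallback, and an empty string means "use the local folder". A minimal sketch of launching the runner against a shared root; the directory path, config file, and CLI flag name here are assumptions for illustration, not taken from this commit:

    import os
    import subprocess

    # Point the runner at a shared (possibly read-only) datasets directory.
    env = os.environ.copy()
    env['DATASETSROOT'] = '/mnt/shared/datasets'  # hypothetical path

    # runner.py reads the root from the environment at startup.
    subprocess.run(['python', 'runner.py',
                    '--configs', 'configs/config_example.json'],  # assumed flag
                   env=env, check=True)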

Diff for: datasets/make_datasets.py

+33 -12

@@ -15,54 +15,75 @@
 # ===============================================================================

 import argparse
+import logging
+import os
 import numpy as np
 from sklearn.datasets import make_classification, make_regression, make_blobs
 from sklearn.utils import check_random_state
 import sys


-def gen_blobs(args):
+def try_gen_dataset(args, folder):
+    try:
+        if args.type == 'regression':
+            gen_regression(args, folder)
+        elif args.type == 'classification':
+            gen_classification(args, folder)
+        elif args.type == 'blobs':
+            gen_blobs(args, folder)
+        else:
+            raise ValueError(f'{args.type} is unknown dataset type')
+        return True
+    except BaseException as ex:
+        logging.warning(f"Internal error generating dataset:\n{ex}")
+        return False
+
+
+def gen_blobs(args, folder):
+    os.makedirs(os.path.join(folder, "data"), exist_ok=True)
     X, y = make_blobs(n_samples=args.samples + args.test_samples,
                       n_features=args.features,
                       centers=args.clusters,
                       center_box=(-32, 32),
                       shuffle=True,
                       random_state=args.seed)
-    np.save(args.filex, X[:args.samples])
+    np.save(os.path.join(folder, args.filex), X[:args.samples])
     if args.test_samples != 0:
-        np.save(args.filextest, X[args.samples:])
+        np.save(os.path.join(folder, args.filextest), X[args.samples:])
     return 0


-def gen_regression(args):
+def gen_regression(args, folder):
+    os.makedirs(os.path.join(folder, "data"), exist_ok=True)
     rs = check_random_state(args.seed)
     X, y = make_regression(n_targets=1,
                            n_samples=args.samples + args.test_samples,
                            n_features=args.features,
                            n_informative=args.features,
                            bias=rs.normal(0, 3),
                            random_state=rs)
-    np.save(args.filex, X[:args.samples])
-    np.save(args.filey, y[:args.samples])
+    np.save(os.path.join(folder, args.filex), X[:args.samples])
+    np.save(os.path.join(folder, args.filey), y[:args.samples])
     if args.test_samples != 0:
-        np.save(args.filextest, X[args.samples:])
-        np.save(args.fileytest, y[args.samples:])
+        np.save(os.path.join(folder, args.filextest), X[args.samples:])
+        np.save(os.path.join(folder, args.fileytest), y[args.samples:])
     return 0


-def gen_classification(args):
+def gen_classification(args, folder):
+    os.makedirs(os.path.join(folder, "data"), exist_ok=True)
     X, y = make_classification(n_samples=args.samples + args.test_samples,
                                n_features=args.features,
                                n_informative=args.features,
                                n_repeated=0,
                                n_redundant=0,
                                n_classes=args.classes,
                                random_state=args.seed)
-    np.save(args.filex, X[:args.samples])
+    np.save(os.path.join(folder, args.filex), X[:args.samples])
     np.save(args.filey, y[:args.samples])
     if args.test_samples != 0:
-        np.save(args.filextest, X[args.samples:])
-        np.save(args.fileytest, y[args.samples:])
+        np.save(os.path.join(folder, args.filextest), X[args.samples:])
+        np.save(os.path.join(folder, args.fileytest), y[args.samples:])
     return 0

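The new `try_gen_dataset` wrapper dispatches on `args.type` and reports failure instead of raising, so the caller can fall back gracefully. A minimal sketch of calling it directly with a plain namespace object, run from the repository root; the field values are illustrative, not taken from the commit:

    import argparse

    from datasets.make_datasets import try_gen_dataset

    # Hypothetical argument object; the attributes mirror what
    # gen_classification reads (samples, features, classes, seed, file paths).
    gen_args = argparse.Namespace(
        type='classification',  # or 'regression' / 'blobs'
        samples=1000, test_samples=200,
        features=16, classes=2, seed=42,
        filex='data/synthetic-X-train.npy',
        filey='data/synthetic-y-train.npy',
        filextest='data/synthetic-X-test.npy',
        fileytest='data/synthetic-y-test.npy',
    )

    # folder='' writes under the local ./data directory;
    # pass a datasets root to write there instead.
    if try_gen_dataset(gen_args, folder=''):
        print('dataset generated')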

Diff for: runner.py

+59 -31

@@ -22,7 +22,6 @@
 import sys
 from typing import Any, Dict, List, Union

-import datasets.make_datasets as make_datasets
 import utils
 from pathlib import Path

@@ -84,8 +83,16 @@ def get_configs(path: Path) -> List[str]:
         stream=sys.stdout, format='%(levelname)s: %(message)s', level=args.verbose)
     hostname = socket.gethostname()

-    # make directory for data if it doesn't exist
-    os.makedirs('data', exist_ok=True)
+    env = os.environ.copy()
+    if 'DATASETSROOT' in env:
+        datasets_root = env['DATASETSROOT']
+        logging.info(f'Datasets folder at {datasets_root}')
+    elif 'DAAL_DATASETS' in env:
+        datasets_root = env['DAAL_DATASETS']
+        logging.info(f'Datasets folder at {datasets_root}')
+    else:
+        datasets_root = ''
+        logging.info('Datasets folder is not set, using local folder')

     json_result: Dict[str, Union[Dict[str, Any], List[Any]]] = {
         'hardware': utils.get_hw_parameters(),
@@ -155,23 +162,41 @@ def get_configs(path: Path) -> List[str]:
             for dataset in params_set['dataset']:
                 if dataset['source'] in ['csv', 'npy']:
                     dataset_name = dataset['name'] if 'name' in dataset else 'unknown'
-                    if 'training' not in dataset or \
-                            'x' not in dataset['training'] or \
-                            not utils.find_the_dataset(dataset_name,
-                                                       dataset['training']['x']):
+                    if 'training' not in dataset or 'x' not in dataset['training']:
                         logging.warning(
                             f'Dataset {dataset_name} could not be loaded. \n'
-                            'Check the correct name or expand the download in '
-                            'the folder dataset.')
+                            'Training data for algorithm is not specified'
+                        )
                         continue
-                    paths = '--file-X-train ' + dataset['training']["x"]
+
+                    files = {}
+
+                    files['file-X-train'] = dataset['training']["x"]
                     if 'y' in dataset['training']:
-                        paths += ' --file-y-train ' + dataset['training']["y"]
+                        files['file-y-train'] = dataset['training']["y"]
                     if 'testing' in dataset:
-                        paths += ' --file-X-test ' + dataset["testing"]["x"]
+                        files['file-X-test'] = dataset["testing"]["x"]
                         if 'y' in dataset['testing']:
-                            paths += ' --file-y-test ' + \
-                                dataset["testing"]["y"]
+                            files['file-y-test'] = dataset["testing"]["y"]
+
+                    dataset_path = utils.find_the_dataset(dataset_name, datasets_root,
+                                                          files.values())
+                    if dataset_path is None:
+                        logging.warning(
+                            f'Dataset {dataset_name} could not be loaded. \n'
+                            'Check the correct name or expand the download in '
+                            'the folder dataset.'
+                        )
+                        continue
+                    elif not dataset_path and datasets_root:
+                        logging.info(
+                            f'{dataset_name} is taken from local folder'
+                        )
+
+                    paths = ''
+                    for data_path, data_file in files.items():
+                        paths += f'--{data_path} {os.path.join(dataset_path, data_file)} '
+
                 elif dataset['source'] == 'synthetic':
                     class GenerationArgs:
                         classes: int
@@ -186,7 +211,6 @@ class GenerationArgs:
                         test_samples: int
                         type: str
                     gen_args = GenerationArgs()
-                    paths = ''

                     if 'seed' in params_set:
                         gen_args.seed = params_set['seed']
@@ -210,38 +234,42 @@ class GenerationArgs:
                     file_prefix = f'data/synthetic-{gen_args.type}{cls_num_for_file}-'
                     file_postfix = f'-{gen_args.samples}x{gen_args.features}.npy'

-                    isfiles = True
+                    files = {}
                     gen_args.filex = f'{file_prefix}X-train{file_postfix}'
-                    paths += f' --file-X-train {gen_args.filex}'
-                    isfiles = isfiles and os.path.isfile(gen_args.filex)
+                    files['file-X-train'] = gen_args.filex
                     if gen_args.type not in ['blobs']:
                         gen_args.filey = f'{file_prefix}y-train{file_postfix}'
-                        paths += f' --file-y-train {gen_args.filey}'
-                        isfiles = isfiles and os.path.isfile(gen_args.filey)
+                        files['file-y-train'] = gen_args.filey

                     if 'testing' in dataset:
                         gen_args.test_samples = dataset['testing']['n_samples']
                         gen_args.filextest = f'{file_prefix}X-test{file_postfix}'
-                        paths += f' --file-X-test {gen_args.filextest}'
-                        isfiles = isfiles and os.path.isfile(gen_args.filextest)
+                        files['file-X-test'] = gen_args.filextest
                         if gen_args.type not in ['blobs']:
                             gen_args.fileytest = f'{file_prefix}y-test{file_postfix}'
-                            paths += f' --file-y-test {gen_args.fileytest}'
-                            isfiles = isfiles and os.path.isfile(gen_args.fileytest)
+                            files['file-y-test'] = gen_args.fileytest
                     else:
                         gen_args.test_samples = 0
                         gen_args.filextest = gen_args.filex
+                        files['file-X-test'] = gen_args.filextest
                         if gen_args.type not in ['blobs']:
                             gen_args.fileytest = gen_args.filey
+                            files['file-y-test'] = gen_args.filey

-                    if not args.dummy_run and not isfiles:
-                        if gen_args.type == 'regression':
-                            make_datasets.gen_regression(gen_args)
-                        elif gen_args.type == 'classification':
-                            make_datasets.gen_classification(gen_args)
-                        elif gen_args.type == 'blobs':
-                            make_datasets.gen_blobs(gen_args)
                     dataset_name = f'synthetic_{gen_args.type}'
+
+                    if not args.dummy_run:
+                        dataset_path = utils.find_or_gen_dataset(gen_args,
+                                                                 datasets_root, files.values())
+                        if dataset_path is None:
+                            logging.warning(
+                                f'Dataset {dataset_name} could not be generated. \n'
+                            )
+                            continue
+
+                        paths = ''
+                        for data_path, data_file in files.items():
+                            paths += f'--{data_path} {os.path.join(dataset_path, data_file)} '
                 else:
                     logging.warning('Unknown dataset source. Only synthetics datasets '
                                     'and csv/npy files are supported now')
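On the runner side, the ad-hoc concatenation of `--file-*` flags is replaced by a `files` dict that is first resolved (found, downloaded, or generated) against the datasets root, then rendered into CLI arguments. A standalone sketch of that rendering step; the file names and root path are illustrative:

    import os

    # Value as returned by utils.find_the_dataset / find_or_gen_dataset:
    # '' keeps paths relative (local folder), a non-empty root prefixes them.
    dataset_path = '/mnt/shared/datasets'  # hypothetical DATASETSROOT
    files = {
        'file-X-train': 'data/abalone_X_train.npy',  # illustrative entries
        'file-y-train': 'data/abalone_y_train.npy',
    }

    paths = ''
    for data_path, data_file in files.items():
        paths += f'--{data_path} {os.path.join(dataset_path, data_file)} '

    print(paths)
    # --file-X-train /mnt/shared/datasets/data/abalone_X_train.npy
    # --file-y-train /mnt/shared/datasets/data/abalone_y_train.npy
    # (one flag/value pair per entry, with a trailing space)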

Diff for: utils.py

+33 -5

@@ -16,12 +16,13 @@

 import json
 import os
-import pathlib
 import platform
 import subprocess
 import sys
-from typing import Any, Dict, List, Tuple, Union, cast
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Tuple, Union, cast

+from datasets.make_datasets import try_gen_dataset
 from datasets.load_datasets import try_load_dataset


@@ -51,9 +52,36 @@ def filter_stdout(text: str) -> Tuple[str, str]:
     return filtered, extra


-def find_the_dataset(name: str, fullpath: str) -> bool:
-    return os.path.isfile(fullpath) or try_load_dataset(
-        dataset_name=name, output_directory=pathlib.Path(fullpath).parent)
+def files_in_folder(folder: str, files: Iterable[str]) -> bool:
+    for file in files:
+        if not os.path.isfile(os.path.join(folder, file)):
+            return False
+    return True
+
+
+def find_or_gen_dataset(args: Any, folder: str, files: Iterable[str]):
+    if files_in_folder("", files):
+        return ""
+    if folder:
+        if files_in_folder(folder, files) or \
+           try_gen_dataset(args, folder):
+            return folder
+    if try_gen_dataset(args, ""):
+        return ""
+    return None
+
+
+def find_the_dataset(name: str, folder: str, files: Iterable[str]):
+    if files_in_folder("", files):
+        return ""
+    if folder:
+        if files_in_folder(folder, files) or \
+           try_load_dataset(dataset_name=name,
+                            output_directory=Path(os.path.join(folder, "data"))):
+            return folder
+    if try_load_dataset(dataset_name=name, output_directory=Path("data")):
+        return ""
+    return None


 def read_output_from_command(command: str,
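Both new helpers share a tri-state contract: they return `''` when the files are already (or become) available in the local folder, the root folder when the files are found or fetched there, and `None` on failure, which the runner turns into a warning and a `continue`. A hedged usage sketch, run from the repository root; the dataset name and file list are illustrative:

    import os

    import utils

    datasets_root = os.environ.get('DATASETSROOT', '')
    dataset_path = utils.find_the_dataset(
        'abalone',                     # hypothetical dataset name
        datasets_root,
        ['data/abalone_X_train.npy'],  # files the benchmark needs
    )

    if dataset_path is None:
        print('dataset could not be found or downloaded')
    else:
        # '' -> local folder; non-empty -> under the datasets root
        full = os.path.join(dataset_path, 'data/abalone_X_train.npy')
        print(f'loading {full}')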
