
Commit 72dfdd2

Merge branch 'main' into large-scale

2 parents: d419a01 + e76e463

File tree: 11 files changed, +189 -34 lines


.github/CODEOWNERS (+15 -7)

```diff
@@ -1,7 +1,15 @@
-# owners and reviewers
-configs @Alexsandruss
-configs/spmd* @Alexsandruss @ethanglaser
-sklbench @Alexsandruss
-*.md @Alexsandruss @samir-nasibli
-requirements*.txt @Alexsandruss @ethanglaser
-conda-env-*.yml @Alexsandruss @ethanglaser
+# benchmarking config files
+configs @Alexsandruss
+configs/spmd/* @ethanglaser
+configs/regular/xgboost* @razdoburdin
+# sklbench implementation and associated docs
+sklbench @Alexsandruss
+# all documentation files
+*.md @Alexsandruss
+# dependencies files
+requirements*.txt @Alexsandruss @ethanglaser
+conda-env-*.yml @Alexsandruss @ethanglaser
+# repository utilities
+/.* @Alexsandruss
+/*.yml @Alexsandruss
+/*.toml @Alexsandruss
```
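Since GitHub applies the *last* matching CODEOWNERS pattern, the more specific rules placed later in the file take precedence: for example, a file under `configs/spmd/` now requests review from @ethanglaser alone, even though the broader `configs` rule above it names @Alexsandruss.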

.github/pull_request_template.md (+31)

```diff
@@ -0,0 +1,31 @@
+## Description
+
+_Add a comprehensive description of the proposed changes_
+
+_List associated issue number(s) if any exist: #6 (for example)_
+
+_Documentation PR (if needed): #1340 (for example)_
+
+---
+
+A PR should start as a draft, then move to the ready-for-review state after CI passes and all applicable checkboxes are closed.
+This approach ensures that reviewers don't spend extra time asking for regular requirements.
+
+You can remove a checkbox as not applicable only if it doesn't relate to this PR in any way.
+
+Checklist to comply with **before moving the PR from draft**:
+
+**PR completeness and readability**
+
+- [ ] I have reviewed my changes thoroughly before submitting this pull request.
+- [ ] I have commented my code, particularly in hard-to-understand areas.
+- [ ] I have updated the documentation to reflect the changes, or created a separate documentation PR and provided its number in the description, if necessary.
+- [ ] The git commit message contains an appropriate signed-off-by string _(see [CONTRIBUTING.md](https://github.com/intel/scikit-learn-intelex/blob/main/CONTRIBUTING.md#pull-requests) for details)_.
+- [ ] I have added the respective label(s) to the PR if I have permission to do so.
+- [ ] I have resolved any merge conflicts that might occur with the base branch.
+
+**Testing**
+
+- [ ] I have run the changes locally and tested them extensively.
+- [ ] All CI jobs are green, or I have provided a justification for why they aren't.
+- [ ] I have extended the testing suite if new functionality was introduced in this PR.
```

.github/workflows/pr-checklist.yml (+62)

```diff
@@ -0,0 +1,62 @@
+#===============================================================================
+# Copyright 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+name: Check PR Checklist
+
+on:
+  pull_request:
+    types: [opened, edited, synchronize, ready_for_review, converted_to_draft]
+
+permissions:
+  contents: read
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref_name }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  checklist:
+    name: Close all checkboxes before moving from draft
+    timeout-minutes: 5
+    runs-on: ubuntu-24.04
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Get pull request details
+        id: pr
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const pr_desc = await github.rest.pulls.get({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              pull_number: context.payload.pull_request.number
+            });
+            core.setOutput('body', pr_desc.data.body)
+            core.setOutput('draft', pr_desc.data.draft)
+            core.setOutput('author_type', pr_desc.data.user.type)
+      - name: Check if all checkboxes are checked
+        id: checkboxes
+        env:
+          DESCRIPTION: ${{ steps.pr.outputs.body }}
+        run: |
+          UNCHECKED=$(echo "$DESCRIPTION" | grep -c '\[ \]' || true)
+          echo "unchecked=$UNCHECKED" >> $GITHUB_OUTPUT
+      - name: Fail if not all checkboxes are checked, PR is not draft and author is not a bot
+        if: ${{ (steps.pr.outputs.draft == 'false') && (steps.checkboxes.outputs.unchecked != '0') && (steps.pr.outputs.author_type != 'Bot') }}
+        run: |
+          echo "Unchecked checkboxes: ${{ steps.checkboxes.outputs.unchecked }}"
+          exit 1
```
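For contributors who want to sanity-check a PR body before marking it ready, the counting step above is easy to reproduce locally. A minimal sketch, assuming the PR description has been saved to a local file `pr_body.md` (a hypothetical name; the workflow itself fetches the body via the GitHub API):

```python
# Local approximation of the workflow's unchecked-checkbox count.
# `grep -c '\[ \]'` counts matching *lines*, so we likewise count lines
# that contain at least one literal "[ ]" marker.
from pathlib import Path

body = Path("pr_body.md").read_text()  # hypothetical local copy of the PR body
unchecked = sum("[ ]" in line for line in body.splitlines())
print(f"unchecked={unchecked}")  # the job fails when this is non-zero on a non-draft PR
```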

.gitignore (+11 -9)

```diff
@@ -1,18 +1,20 @@
-# Release and work directories
+# Python cache
 __pycache__*
-__work*
 
 # Visual Studio related files, e.g., ".vscode"
 .vs*
 
-# Dataset files
-data_cache
-*.csv
-*.npy
-*.npz
 
-# Results at repo root
-vtune_results
+# Misc. files at repository root:
+# - default data cache directory
+/data_cache
+# - results
+/_*results*
 /*.json
 /*.xlsx
+# - scripts
 /*.ipynb
+/*.py
+/*.sh
+# - archives with results or data
+/*.tgz
```

README.md (+1 -1)

```diff
@@ -100,6 +100,6 @@ flowchart TB
 - [Benchmarks Runner](sklbench/runner/README.md)
 - [Report Generator](sklbench/report/README.md)
 - [Benchmarks](sklbench/benchmarks/README.md)
-- [Data Processing](sklbench/datasets/README.md)
+- [Data Processing and Storage](sklbench/datasets/README.md)
 - [Emulators](sklbench/emulators/README.md)
 - [Developer Guide](docs/README.md)
```

configs/README.md (+1 -1)

```diff
@@ -85,7 +85,7 @@ Configs have the three highest parameter keys:
 |<h3>Benchmark workflow parameters</h3>||||
 | `bench`:`taskset` | None | | Value for `-c` argument of `taskset` utility used over benchmark subcommand. |
 | `bench`:`vtune_profiling` | None | | Analysis type for `collect` argument of Intel(R) VTune* Profiler tool. Linux* OS only. |
-| `bench`:`vtune_results_directory` | `vtune_results` | | Directory path to store Intel(R) VTune* Profiler results. |
+| `bench`:`vtune_results_directory` | `_vtune_results` | | Directory path to store Intel(R) VTune* Profiler results. |
 | `bench`:`n_runs` | `10` | | Number of runs for measured entity. |
 | `bench`:`time_limit` | `3600` | | Time limit in seconds before the benchmark early stop. |
 | `bench`:`distributor` | None | None, `mpi` | Library used to handle distributed algorithm. |
```

sklbench/datasets/README.md (+41 -10)

````diff
@@ -1,4 +1,4 @@
-# Data Handling in Benchmarks
+# Data Processing and Storage in Benchmarks
 
 Data handling steps:
 1. Load data:
@@ -7,6 +7,14 @@ Data handling steps:
 2. Split data into subsets if requested
 3. Convert to requested form (data type, format, order, etc.)
 
+Existing data sources:
+- Synthetic data from sklearn
+- OpenML datasets
+- Custom loaders for named datasets
+- User-provided datasets in a compatible format
+
+## Data Caching
+
 There are two levels of caching with corresponding directories: `raw cache` for files downloaded from external sources, and just `cache` for files applicable for fast loading in benchmarks.
 
 Each dataset has a few associated files in the usual `cache`: data component files (`x`, `y`, `weights`, etc.) and a JSON file with dataset properties (number of classes, clusters, default split arguments).
@@ -21,16 +29,39 @@ data_cache/
 ```
 
 Cached file formats:
-| Format | File extension | Associated Python types |
-| --- | --- | --- |
-| [Parquet](https://parquet.apache.org) | `.parq` | pandas.DataFrame |
-| Numpy uncompressed binary dense data | `.npz` | numpy.ndarray, pandas.Series |
-| Numpy uncompressed binary CSR data | `.csr.npz` | scipy.sparse.csr_matrix |
+| Format | File extension | Associated Python types | Comment |
+| --- | --- | --- | --- |
+| [Parquet](https://parquet.apache.org) | `.parq` | pandas.DataFrame | |
+| Numpy uncompressed binary dense data | `.npz` | numpy.ndarray, pandas.Series | Data is stored under the `arr_0` name |
+| Numpy uncompressed binary CSR data | `.csr.npz` | scipy.sparse.csr_matrix | Data is stored under the `data`, `indices` and `indptr` names |
 
-Existing data sources:
-- Synthetic data from sklearn
-- OpenML datasets
-- Custom loaders for named datasets
+## How to Modify a Dataset for Compatibility with Scikit-learn_bench
+
+In order to reuse an existing dataset in scikit-learn_bench, you need to convert its file(s) into a format compatible with the dataset cache loader.
+
+A cached dataset consists of a few files:
+- a `{dataset name}.json` file, which stores required and optional dataset information
+- `{dataset name}_{data component name}.{data component extension}` files, which store the dataset components (data, labels, etc.)
+
+Example of `{dataset name}.json`:
+```json
+{"n_classes": 2, "default_split": {"test_size": 0.2, "random_state": 11}}
+```
+
+The `n_classes` property in a dataset info file is *required* for classification datasets.
+
+Currently, `x` (data) and `y` (labels) are the only supported and *required* data components.
+
+A scikit-learn_bench-compatible dataset should be stored in `data:cache_directory` (`${PWD}/data_cache` or `{repository root}/data_cache` by default).
+
+You can refer to the created dataset in config files by its name, the same way as datasets explicitly registered in scikit-learn_bench:
+```json
+{
+    "data": {
+        "dataset": "{dataset name}"
+    }
+}
+```
 
 ---
 [Documentation tree](../../README.md#-documentation)
````
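As a concrete illustration of the cache layout described in the diff above, here is a short sketch that hand-writes a compatible classification dataset. This is an example under stated assumptions, not part of the commit: the name `my_dataset` is a placeholder and the default `data_cache` directory is assumed.

```python
# Hypothetical example: hand-crafting a scikit-learn_bench-compatible
# cached dataset named "my_dataset" with dense x/y components.
import json
import os

import numpy as np

os.makedirs("data_cache", exist_ok=True)

rng = np.random.default_rng(11)
x = rng.random((100, 10))
y = (x.sum(axis=1) > 5).astype(int)

# Dense components: uncompressed .npz with data under the default "arr_0" key.
np.savez("data_cache/my_dataset_x.npz", x)
np.savez("data_cache/my_dataset_y.npz", y)

# A sparse "x" would instead be stored as CSR components, e.g.:
# np.savez("data_cache/my_dataset_x.csr.npz",
#          data=x_csr.data, indices=x_csr.indices, indptr=x_csr.indptr)

# Dataset info file; "n_classes" is required for classification datasets.
with open("data_cache/my_dataset.json", "w") as f:
    json.dump(
        {"n_classes": 2, "default_split": {"test_size": 0.2, "random_state": 11}},
        f,
    )
```

A config entry of `{"data": {"dataset": "my_dataset"}}` would then fall through to the user-provided loading branch added in `sklbench/datasets/__init__.py` below.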

sklbench/datasets/__init__.py (+12 -3)

```diff
@@ -22,6 +22,7 @@
 from ..utils.custom_types import BenchCase
 from .loaders import (
     dataset_loading_functions,
+    load_custom_data,
     load_openml_data,
     load_sklearn_synthetic_data,
 )
@@ -47,9 +48,17 @@ def load_data(bench_case: BenchCase) -> Tuple[Dict, Dict]:
     dataset = get_bench_case_value(bench_case, "data:dataset")
     if dataset is not None:
         dataset_params = get_bench_case_value(bench_case, "data:dataset_kwargs", dict())
-        return dataset_loading_functions[dataset](
-            **common_kwargs, preproc_kwargs=preproc_kwargs, dataset_params=dataset_params
-        )
+        if dataset in dataset_loading_functions:
+            # registered dataset loading branch
+            return dataset_loading_functions[dataset](
+                **common_kwargs,
+                preproc_kwargs=preproc_kwargs,
+                dataset_params=dataset_params,
+            )
+        else:
+            # user-provided dataset loading branch
+            return load_custom_data(**common_kwargs, preproc_kwargs=preproc_kwargs)
+
     # load by source
     source = get_bench_case_value(bench_case, "data:source")
     if source is not None:
```

sklbench/datasets/loaders.py (+13 -1)

```diff
@@ -29,7 +29,7 @@
     make_regression,
 )
 
-from .common import cache, preprocess
+from .common import cache, load_data_description, load_data_from_cache, preprocess
 from .downloaders import (
     download_and_read_csv,
     download_kaggle_files,
@@ -84,6 +84,18 @@ def load_sklearn_synthetic_data(
     return {"x": x, "y": y}, data_desc
 
 
+@preprocess
+def load_custom_data(
+    data_name: str,
+    data_cache: str,
+    raw_data_cache: str,
+):
+    """Load data specified by the user and stored in a format compatible with the scikit-learn_bench cache"""
+    return load_data_from_cache(data_cache, data_name), load_data_description(
+        data_cache, data_name
+    )
+
+
 """
 Classification datasets
 """
```

sklbench/report/implementation.py (+1 -1)

```diff
@@ -360,7 +360,7 @@ def generate_report(args: argparse.Namespace):
     summary_df = summary_df[summary_df.columns.sortlevel(level=0, ascending=False)[0]]
     logger.info(f"{custom_format('Report summary', bcolor='HEADER')}\n{summary_df}")
     if summary_df.size > 0:
-        summary_ws = wb.create_sheet("Summary")
+        summary_ws = wb.create_sheet(title="Summary", index=0)
         write_df_to_sheet(summary_df, summary_ws)
         apply_rules_for_sheet(summary_ws, args.perf_color_scale, args.quality_color_scale)
     # write environment info
```
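As a standalone sketch of why the `index=0` argument matters (not from this commit): openpyxl appends new sheets at the end of the workbook unless an index is given, so the summary sheet previously landed last.

```python
# Demonstration of openpyxl sheet ordering with create_sheet's index parameter.
from openpyxl import Workbook

wb = Workbook()                            # starts with a default "Sheet"
wb.create_sheet("Appended")                # no index: appended last
wb.create_sheet(title="Summary", index=0)  # index=0: inserted first
print(wb.sheetnames)                       # ['Summary', 'Sheet', 'Appended']
```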

sklbench/runner/commands_helper.py (+1 -1)

```diff
@@ -55,7 +55,7 @@ def generate_benchmark_command(
     if vtune_profiling is not None:
         if sys.platform == "linux":
             vtune_result_dir = get_bench_case_value(
-                bench_case, "bench:vtune_results_directory", "vtune_results"
+                bench_case, "bench:vtune_results_directory", "_vtune_results"
             )
             os.makedirs(vtune_result_dir, exist_ok=True)
             vtune_result_path = os.path.join(
```
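The leading underscore is deliberate: the new `_vtune_results` default matches the `/_*results*` pattern added to `.gitignore` in this same merge, so VTune output written at the repository root stays untracked.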
