Commit 66d977d

Merge branch 'main' into inc-support
2 parents 5a9be80 + 88fd9c8

File tree: 15 files changed (+201 −46 lines)


.github/CODEOWNERS

Lines changed: 15 additions & 7 deletions
@@ -1,7 +1,15 @@
-# owners and reviewers
-configs @Alexsandruss
-configs/spmd* @Alexsandruss @ethanglaser
-sklbench @Alexsandruss
-*.md @Alexsandruss @samir-nasibli
-requirements*.txt @Alexsandruss @ethanglaser
-conda-env-*.yml @Alexsandruss @ethanglaser
+# benchmarking config files
+configs @Alexsandruss
+configs/spmd/* @ethanglaser
+configs/regular/xgboost* @razdoburdin
+# sklbench implementation and associated docs
+sklbench @Alexsandruss
+# all documentation files
+*.md @Alexsandruss
+# dependencies files
+requirements*.txt @Alexsandruss @ethanglaser
+conda-env-*.yml @Alexsandruss @ethanglaser
+# repository utilities
+/.* @Alexsandruss
+/*.yml @Alexsandruss
+/*.toml @Alexsandruss

.github/pull_request_template.md

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+## Description
+
+_Add a comprehensive description of proposed changes_
+
+_List associated issue number(s) if exist(s): #6 (for example)_
+
+_Documentation PR (if needed): #1340 (for example)_
+
+---
+
+PR should start as a draft, then move to ready for review state after CI is passed and all applicable checkboxes are closed.
+This approach ensures that reviewers don't spend extra time asking for regular requirements.
+
+You can remove a checkbox as not applicable only if it doesn't relate to this PR in any way.
+
+Checklist to comply with **before moving PR from draft**:
+
+**PR completeness and readability**
+
+- [ ] I have reviewed my changes thoroughly before submitting this pull request.
+- [ ] I have commented my code, particularly in hard-to-understand areas.
+- [ ] I have updated the documentation to reflect the changes or created a separate PR with update and provided its number in the description, if necessary.
+- [ ] Git commit message contains an appropriate signed-off-by string _(see [CONTRIBUTING.md](https://github.com/intel/scikit-learn-intelex/blob/main/CONTRIBUTING.md#pull-requests) for details)_.
+- [ ] I have added a respective label(s) to PR if I have a permission for that.
+- [ ] I have resolved any merge conflicts that might occur with the base branch.
+
+**Testing**
+
+- [ ] I have run it locally and tested the changes extensively.
+- [ ] All CI jobs are green or I have provided justification why they aren't.
+- [ ] I have extended testing suite if new functionality was introduced in this PR.

.github/workflows/pr-checklist.yml

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+#===============================================================================
+# Copyright 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+name: Check PR Checklist
+
+on:
+  pull_request:
+    types: [opened, edited, synchronize, ready_for_review, converted_to_draft]
+
+permissions:
+  contents: read
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref_name }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  checklist:
+    name: Close all checkboxes before moving from draft
+    timeout-minutes: 5
+    runs-on: ubuntu-24.04
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Get pull request details
+        id: pr
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const pr_desc = await github.rest.pulls.get({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              pull_number: context.payload.pull_request.number
+            });
+            core.setOutput('body', pr_desc.data.body)
+            core.setOutput('draft', pr_desc.data.draft)
+            core.setOutput('author_type', pr_desc.data.user.type)
+      - name: Check if all checkboxes are checked
+        id: checkboxes
+        env:
+          DESCRIPTION: ${{ steps.pr.outputs.body }}
+        run: |
+          UNCHECKED=$(echo "$DESCRIPTION" | grep -c '\[ \]' || true)
+          echo "unchecked=$UNCHECKED" >> $GITHUB_OUTPUT
+      - name: Fail if not all checkboxes are checked, PR is not draft and author is not a bot
+        if: ${{ (steps.pr.outputs.draft == 'false') && (steps.checkboxes.outputs.unchecked != '0') && (steps.pr.outputs.author_type != 'Bot') }}
+        run: |
+          echo "Unchecked checkboxes: ${{ steps.checkboxes.outputs.unchecked }}"
+          exit 1

.gitignore

Lines changed: 11 additions & 9 deletions
@@ -1,18 +1,20 @@
-# Release and work directories
+# Python cache
 __pycache__*
-__work*
 
 # Visual Studio related files, e.g., ".vscode"
 .vs*
 
-# Dataset files
-data_cache
-*.csv
-*.npy
-*.npz
 
-# Results at repo root
-vtune_results
+# Misc. files at repository root:
+# - default data cache directory
+/data_cache
+# - results
+/_*results*
 /*.json
 /*.xlsx
+# - scripts
 /*.ipynb
+/*.py
+/*.sh
+# - archives with results or data
+/*.tgz
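Note the switch from bare names (`data_cache`, `vtune_results`) to root-anchored patterns (`/data_cache`, `/*.json`): a leading slash pins the pattern to the repository top level. A quick sketch with `git check-ignore` (the throwaway repository and paths are purely illustrative):

```shell
# Create a throwaway repository with one root-anchored ignore pattern
cd "$(mktemp -d)"
git init -q .
printf '/data_cache\n' > .gitignore

# A root-level path matches the anchored pattern...
git check-ignore -q data_cache && echo "data_cache: ignored"
# ...but the same name nested in a subdirectory does not.
git check-ignore -q nested/data_cache || echo "nested/data_cache: not ignored"
```

So only the default cache directory at the repository root is ignored, not every directory named `data_cache`.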

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
@@ -16,12 +16,12 @@
 
 repos:
   - repo: https://github.com/psf/black
-    rev: 23.7.0
+    rev: 24.1.1
     hooks:
       - id: black
         language_version: python3.10
   - repo: https://github.com/PyCQA/isort
-    rev: 5.12.0
+    rev: 5.13.2
     hooks:
       - id: isort
         language_version: python3.10

README.md

Lines changed: 1 addition & 1 deletion
@@ -100,6 +100,6 @@ flowchart TB
 - [Benchmarks Runner](sklbench/runner/README.md)
 - [Report Generator](sklbench/report/README.md)
 - [Benchmarks](sklbench/benchmarks/README.md)
-- [Data Processing](sklbench/datasets/README.md)
+- [Data Processing and Storage](sklbench/datasets/README.md)
 - [Emulators](sklbench/emulators/README.md)
 - [Developer Guide](docs/README.md)

azure-pipelines.yml

Lines changed: 3 additions & 7 deletions
@@ -19,14 +19,10 @@ jobs:
   pool:
     vmImage: 'ubuntu-22.04'
   steps:
-  - task: UsePythonVersion@0
-    inputs:
-      versionSpec: '3.10'
-      addToPath: true
   - script: |
-      python -m pip install --upgrade pip setuptools
-      pip install isort black
-      isort --check . && black --check .
+      python -m pip install --upgrade pip pre-commit==4.0.1
+      pre-commit install
+      pre-commit run --all-files --show-diff-on-failure
     displayName: 'Linting'
 - job: Linux
   dependsOn: Lint

configs/README.md

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ Configs have the three highest parameter keys:
 |<h3>Benchmark workflow parameters</h3>||||
 | `bench`:`taskset` | None | | Value for `-c` argument of `taskset` utility used over benchmark subcommand. |
 | `bench`:`vtune_profiling` | None | | Analysis type for `collect` argument of Intel(R) VTune* Profiler tool. Linux* OS only. |
-| `bench`:`vtune_results_directory` | `vtune_results` | | Directory path to store Intel(R) VTune* Profiler results. |
+| `bench`:`vtune_results_directory` | `_vtune_results` | | Directory path to store Intel(R) VTune* Profiler results. |
 | `bench`:`n_runs` | `10` | | Number of runs for measured entity. |
 | `bench`:`time_limit` | `3600` | | Time limit in seconds before the benchmark early stop. |
 | `bench`:`distributor` | None | None, `mpi` | Library used to handle distributed algorithm. |

configs/regular/kmeans.json

Lines changed: 1 addition & 1 deletion
@@ -70,7 +70,7 @@
     "TEMPLATES": {
         "sklearn kmeans": {
             "SETS": [
-                "sklearn-ex[preview] implementations",
+                "sklearn-ex[cpu,gpu] implementations",
                 "common kmeans parameters",
                 "sklearn kmeans parameters",
                 "kmeans datasets"

sklbench/datasets/README.md

Lines changed: 41 additions & 10 deletions
@@ -1,4 +1,4 @@
-# Data Handling in Benchmarks
+# Data Processing and Storage in Benchmarks
 
 Data handling steps:
 1. Load data:
@@ -7,6 +7,14 @@ Data handling steps:
 2. Split data into subsets if requested
 3. Convert to requested form (data type, format, order, etc.)
 
+Existing data sources:
+- Synthetic data from sklearn
+- OpenML datasets
+- Custom loaders for named datasets
+- User-provided datasets in compatible format
+
+## Data Caching
+
 There are two levels of caching with corresponding directories: `raw cache` for files downloaded from external sources, and just `cache` for files applicable for fast-loading in benchmarks.
 
 Each dataset has few associated files in usual `cache`: data component files (`x`, `y`, `weights`, etc.) and JSON file with dataset properties (number of classes, clusters, default split arguments).
@@ -21,16 +29,39 @@ data_cache/
 ```
 
 Cached file formats:
-| Format | File extension | Associated Python types |
-| --- | --- | --- |
-| [Parquet](https://parquet.apache.org) | `.parq` | pandas.DataFrame |
-| Numpy uncompressed binary dense data | `.npz` | numpy.ndarray, pandas.Series |
-| Numpy uncompressed binary CSR data | `.csr.npz` | scipy.sparse.csr_matrix |
+| Format | File extension | Associated Python types | Comment |
+| --- | --- | --- | --- |
+| [Parquet](https://parquet.apache.org) | `.parq` | pandas.DataFrame | |
+| Numpy uncompressed binary dense data | `.npz` | numpy.ndarray, pandas.Series | Data is stored under `arr_0` name |
+| Numpy uncompressed binary CSR data | `.csr.npz` | scipy.sparse.csr_matrix | Data is stored under `data`, `indices` and `indptr` names |
 
-Existing data sources:
-- Synthetic data from sklearn
-- OpenML datasets
-- Custom loaders for named datasets
+## How to Modify Dataset for Compatibility with Scikit-learn_bench
+
+In order to reuse an existing dataset in scikit-learn_bench, you need to convert its file(s) into compatible format for dataset cache loader.
+
+Cached dataset consist of few files:
+- `{dataset name}.json` file which store required and optional dataset information
+- `{dataset name}_{data component name}.{data component extension}` files which store dataset components (data, labels, etc.)
+
+Example of `{dataset name}.json`:
+```json
+{"n_classes": 2, "default_split": {"test_size": 0.2, "random_state": 11}}
+```
+
+`n_classes` property in a dataset info file is *required* for classification datasets.
+
+Currently, `x` (data) and `y` (labels) are the only supported and *required* data components.
+
+Scikit-learn_bench-compatible dataset should be stored in `data:cache_directory` (`${PWD}/data_cache` or `{repository root}/data_cache` by default).
+
+You can specify created compatible dataset in config files the same way as datasets explicitly registered in scikit-learn_bench using its name:
+```json
+{
+    "data": {
+        "dataset": "{dataset name}"
+    }
+}
+```
 
 ---
 [Documentation tree](../../README.md#-documentation)
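Following the cached-format table and component naming described in this README, a compatible dataset can be written with plain NumPy. A minimal sketch (the `my_dataset` name and the generated data are hypothetical; `np.savez` stores a single positional array under the `arr_0` key, matching the table's comment):

```python
import json
import os

import numpy as np

cache_dir = "data_cache"  # default data:cache_directory
name = "my_dataset"       # hypothetical dataset name
os.makedirs(cache_dir, exist_ok=True)

# Hypothetical binary classification data: 100 samples, 8 features
rng = np.random.default_rng(11)
x = rng.standard_normal((100, 8))
y = rng.integers(0, 2, size=100)

# Dense components go into uncompressed .npz files under "arr_0"
np.savez(os.path.join(cache_dir, f"{name}_x.npz"), x)
np.savez(os.path.join(cache_dir, f"{name}_y.npz"), y)

# Dataset info file; "n_classes" is required for classification datasets
info = {"n_classes": 2, "default_split": {"test_size": 0.2, "random_state": 11}}
with open(os.path.join(cache_dir, f"{name}.json"), "w") as f:
    json.dump(info, f)
```

The dataset can then be referenced in a benchmark config as `"dataset": "my_dataset"`.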

0 commit comments
