Support incremental benchmarking of datasets larger than memory

ethanglaser · ethanglaser · commit 7aa42a3c92fe · 2025-03-24T07:31:04.000-07:00
diff --git a/configs/README.md b/configs/README.md
@@ -117,7 +117,7 @@ Configs have the three highest parameter keys:
 |:---------------|:--------------|:--------|:------------|
 | `algorithm`:`estimator` | None |  | Name of measured estimator. |
 | `algorithm`:`estimator_params` | Empty `dict` |  | Parameters for estimator constructor. |
-| `algorithm`:`training`:`num_batches` | 5 |  | Number of batches to benchmark `partial_fit` function, using batches the size of number of samples specified (not samples divided by `num_batches`). For incremental estimators only. |
+| `algorithm`:`num_batches`:`training` | 5 |  | Number of batches used to benchmark the `partial_fit` function. Each batch contains the full number of samples specified (not the number of samples divided by `num_batches`). For incremental estimators only. |
 | `algorithm`:`online_inference_mode` | False |  | Enables online mode for inference methods of estimator (separate call for each sample). |
 | `algorithm`:`sklearn_context` | None |  | Parameters for sklearn `config_context` used over estimator. |
 | `algorithm`:`sklearnex_context` | None |  | Parameters for sklearnex `config_context` used over estimator. Updated by `sklearn_context` if set. |
diff --git a/configs/sklearnex_incremental_example.json b/configs/sklearnex_incremental_example.json
diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py
@@ -260,10 +260,7 @@ def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame:
             # only relative improvements are included in summary currently
             if len(column) > 1 and column[1] == f"{metric_name} relative improvement":
                 metric_columns.append(column)
-    if metric_columns:
-        summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T
-    else:
-        summary = pd.DataFrame()
+    summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T
     summary.index = pd.Index([df_name])
     return summary
 
diff --git a/test-configuration-linux.yml b/test-configuration-linux.yml
@@ -45,11 +45,6 @@ steps:
       conda activate bench-env
       python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json
     displayName: Sklearn example run
-  - script: |
-      source /usr/share/miniconda/etc/profile.d/conda.sh
-      conda activate bench-env
-      python -m sklbench --report -l DEBUG --report -c configs/sklearnex_incremental_example.json
-    displayName: Incremental algorithms example run
   - script: |
       source /usr/share/miniconda/etc/profile.d/conda.sh
       conda activate bench-env
diff --git a/test-configuration-win.yml b/test-configuration-win.yml
@@ -43,10 +43,6 @@ steps:
       call activate bench-env
       python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json
     displayName: Sklearn example run
-  - script: |
-      call activate bench-env
-      python -m sklbench --report -l DEBUG --report -c configs/incremental.json
-    displayName: Incremental algorithms example run
   - script: |
       call activate bench-env
       python -m sklbench --report -l DEBUG --report -c configs/xgboost_example.json