feat: Add additional data descriptors to benchmark results

marcialouis · marcialouis · commit 6bad102d9e7d · 2025-03-07T16:35:15.000-08:00
- Updated `sklbench/benchmarks/custom_function.py` to include additional information (`memory_usage`, `feature_names`, and `class_distribution`) in the benchmark results.
- Ensured that the `result` dictionary is updated with the new fields and metrics.
- Modified `sklbench/report/implementation.py` to handle and process the new data descriptors in the benchmark results.

These changes enhance the benchmark results by providing more detailed information about the dataset and its characteristics.
diff --git a/sklbench/benchmarks/custom_function.py b/sklbench/benchmarks/custom_function.py
@@ -104,11 +104,14 @@ def main(bench_case: BenchCase, filters: List[BenchCase]):
         "function": function_name,
     }
     result = enrich_result(result, bench_case)
-    # TODO: replace `x_train` data_desc with more informative values
-    result.update(data_description["x_train"])
+    # Replace `x_train` data_desc with more informative values
+    result.update({
+        "memory_usage": x_train.nbytes,
+        "feature_names": list(x_train.columns) if isinstance(x_train, pd.DataFrame) else None,
+        "class_distribution": dict(pd.Series(y_train).value_counts()) if y_train is not None else None
+    })
     result.update(metrics)
     return [result]
 
-
 if __name__ == "__main__":
     main_template(main)
diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py
@@ -89,12 +89,15 @@
     "dataset",
     "samples",
     "features",
+    "feature_names",
     "format",
     "dtype",
     "order",
     "n_classes",
+    "class_distribution",
     "n_clusters",
     "batch_size",
+    "memory_usage",
 ]
 
 DIFFBY_COLUMNS = ["environment_name", "library", "format", "device"]