[test] Synthesize the California housing dataset. (#11672)

trivialfis · web-flow · commit f47b02fc17a0 · 2025-09-04T03:06:34.000+08:00
diff --git a/demo/guide-python/gpu_tree_shap.py b/demo/guide-python/gpu_tree_shap.py
@@ -5,16 +5,24 @@
 Demonstrates using GPU acceleration to compute SHAP values for feature importance.
 
 """
+from urllib.error import HTTPError
+
 import shap
-from sklearn.datasets import fetch_california_housing
+from sklearn.datasets import fetch_california_housing, make_regression
 
 import xgboost as xgb
 
 # Fetch dataset using sklearn
-data = fetch_california_housing()
-print(data.DESCR)
-X = data.data
-y = data.target
+try:
+    _data = fetch_california_housing()  # need the Bunch for feature_names/DESCR
+    X = _data.data
+    y = _data.target
+    feature_names = _data.feature_names
+    print(_data.DESCR)
+except HTTPError:
+    # Fall back to a synthetic regression dataset if the download fails.
+    X, y = make_regression(n_samples=20640, n_features=8, random_state=1234)
+    feature_names = [f"f{i}" for i in range(8)]
 
 num_round = 500
 
@@ -26,7 +34,7 @@
 }
 
 # GPU accelerated training
-dtrain = xgb.DMatrix(X, label=y, feature_names=data.feature_names)
+dtrain = xgb.DMatrix(X, label=y, feature_names=feature_names)
 model = xgb.train(param, dtrain, num_round)
 
 # Compute shap values using GPU with xgboost
@@ -47,9 +55,9 @@
     explainer.expected_value,
     shap_values[0, :],
     X[0, :],
-    feature_names=data.feature_names,
+    feature_names=feature_names,
     matplotlib=True,
 )
 
 # Show a summary of feature importance
-shap.summary_plot(shap_values, X, plot_type="bar", feature_names=data.feature_names)
+shap.summary_plot(shap_values, X, plot_type="bar", feature_names=feature_names)
diff --git a/demo/guide-python/sklearn_examples.py b/demo/guide-python/sklearn_examples.py
@@ -11,9 +11,15 @@
 """
 
 import pickle
+from urllib.error import HTTPError
 
 import numpy as np
-from sklearn.datasets import fetch_california_housing, load_digits, load_iris
+from sklearn.datasets import (
+    fetch_california_housing,
+    load_digits,
+    load_iris,
+    make_regression,
+)
 from sklearn.metrics import confusion_matrix, mean_squared_error
 from sklearn.model_selection import GridSearchCV, KFold, train_test_split
 
@@ -44,7 +50,13 @@
     print(confusion_matrix(actuals, predictions))
 
 print("California Housing: regression")
-X, y = fetch_california_housing(return_X_y=True)
+
+try:
+    X, y = fetch_california_housing(return_X_y=True)
+except HTTPError:
+    # Use a synthetic dataset instead if we couldn't fetch the real one.
+    X, y = make_regression(n_samples=20640, n_features=8, random_state=1234)
+
 kf = KFold(n_splits=2, shuffle=True, random_state=rng)
 for train_index, test_index in kf.split(X):
     xgb_model = xgb.XGBRegressor(n_jobs=1).fit(X[train_index], y[train_index])
diff --git a/demo/guide-python/sklearn_parallel.py b/demo/guide-python/sklearn_parallel.py
@@ -4,15 +4,20 @@
 """
 
 import multiprocessing
+from urllib.error import HTTPError
 
-from sklearn.datasets import fetch_california_housing
+from sklearn.datasets import fetch_california_housing, make_regression
 from sklearn.model_selection import GridSearchCV
 
 import xgboost as xgb
 
 if __name__ == "__main__":
     print("Parallel Parameter optimization")
-    X, y = fetch_california_housing(return_X_y=True)
+    try:
+        X, y = fetch_california_housing(return_X_y=True)
+    except HTTPError:
+        # Use a synthetic dataset instead if we couldn't fetch the real one.
+        X, y = make_regression(n_samples=20640, n_features=8, random_state=1234)
     # Make sure the number of threads is balanced.
     xgb_model = xgb.XGBRegressor(
         n_jobs=multiprocessing.cpu_count() // 2, tree_method="hist"
diff --git a/demo/guide-python/update_process.py b/demo/guide-python/update_process.py
@@ -7,16 +7,22 @@
 
 """
 
+from urllib.error import HTTPError
+
 import numpy as np
-from sklearn.datasets import fetch_california_housing
+from sklearn.datasets import fetch_california_housing, make_regression
 
 import xgboost as xgb
 
 
-def main():
+def main() -> None:
     n_rounds = 32
 
-    X, y = fetch_california_housing(return_X_y=True)
+    try:
+        X, y = fetch_california_housing(return_X_y=True)
+    except HTTPError:
+        # Use a synthetic dataset instead if we couldn't fetch the real one.
+        X, y = make_regression(n_samples=20640, n_features=8, random_state=1234)
 
     # Train a model first
     X_train = X[: X.shape[0] // 2]
@@ -50,7 +56,7 @@ def main():
 
     # Refresh the model without changing the leaf value, but tree statistic including
     # cover and weight are refreshed.
-    refresh_result: xgb.callback.EvaluationMonitor.EvalsLog = {}
+    refresh_result: xgb.callback.EvaluationMonitor.EvalsLog = {}
     refreshed = xgb.train(
         {"process_type": "update", "updater": "refresh", "refresh_leaf": False},
         Xy_refresh,
diff --git a/ops/script/lint_python.py b/ops/script/lint_python.py
@@ -131,6 +131,7 @@ class LintersPaths:
         "demo/guide-python/sklearn_examples.py",
         "demo/guide-python/continuation.py",
         "demo/guide-python/callbacks.py",
+        "demo/guide-python/update_process.py",
         "demo/guide-python/cat_in_the_dat.py",
         "demo/guide-python/categorical.py",
         "demo/guide-python/cat_pipeline.py",
diff --git a/python-package/xgboost/plotting.py b/python-package/xgboost/plotting.py
@@ -120,7 +120,7 @@ def plot_importance(
 
     if show_values is True:
         for x, y in zip(values, ylocs):
-            ax.text(x + 1, y, values_format.format(v=x), va="center")
+            ax.text(x + 1, float(y), values_format.format(v=x), va="center")
 
     ax.set_yticks(ylocs)
     ax.set_yticklabels(labels)
diff --git a/python-package/xgboost/testing/data.py b/python-package/xgboost/testing/data.py
@@ -241,10 +241,58 @@ def check_inf(rng: RNG) -> None:
 
 @memory.cache
 def get_california_housing() -> Tuple[np.ndarray, np.ndarray]:
-    """Fetch the California housing dataset from sklearn."""
-    datasets = pytest.importorskip("sklearn.datasets")
-    data = datasets.fetch_california_housing()
-    return data.data, data.target
+    """Synthesize a dataset similar to the sklearn California housing dataset.
+
+    All columns are drawn independently from a fixed-seed RNG, so the target carries
+    no signal from the features. The real one can be obtained via:
+
+    .. code-block::
+
+        import sklearn.datasets
+
+        X, y = sklearn.datasets.fetch_california_housing(return_X_y=True)
+    """
+    n_samples = 20640
+    rng = np.random.default_rng(2025)  # fixed seed keeps the output reproducible
+
+    pd = pytest.importorskip("pandas")
+
+    def mixture_2comp(
+        means: List[float], sigmas: List[float], weights: List[float]
+    ) -> np.ndarray:
+        l0 = rng.normal(
+            size=(int(n_samples * weights[0])), loc=means[0], scale=sigmas[0]
+        )
+        l1 = rng.normal(size=(n_samples - l0.shape[0]), loc=means[1], scale=sigmas[1])
+        return np.concatenate([l0, l1], axis=0)  # component 0 rows first, unshuffled
+
+    def norm(mean: float, std: float) -> np.ndarray:
+        return rng.normal(loc=mean, scale=std, size=(n_samples,))
+
+    df = pd.DataFrame(
+        {
+            "Longitude": mixture_2comp(
+                [-118.0703597, -121.85682825],
+                [0.7897320650373969, 0.7248398629412008],
+                [0.60402556, 0.39597444],
+            ),
+            "Latitude": mixture_2comp(
+                [37.84266317, 33.86030848],
+                [1.0643911549736087, 0.5049274656834589],
+                [0.44485062, 0.55514938],
+            ),
+            "MedInc": norm(mean=3.8706710029069766, std=1.8997756945748738),
+            "HouseAge": norm(mean=28.639486434108527, std=12.585252725724606),
+            "AveRooms": norm(mean=5.428999742190376, std=2.474113202333516),
+            "AveBedrms": norm(mean=1.096675149606208, std=0.47389937625774475),
+            "Population": norm(mean=1425.4767441860465, std=1132.434687757615),
+            "AveOccup": norm(mean=3.0706551594363742, std=10.385797959128219),
+            "MedHouseVal": norm(mean=2.068558169089147, std=1.1539282040412253),
+        }
+    )
+    X = df[df.columns.difference(["MedHouseVal"])].to_numpy()
+    y = df["MedHouseVal"].to_numpy()
+    return X, y
 
 
 @memory.cache
diff --git a/python-package/xgboost/testing/with_skl.py b/python-package/xgboost/testing/with_skl.py
@@ -8,6 +8,7 @@
 
 from ..core import DMatrix
 from ..sklearn import XGBClassifier, XGBRegressor, XGBRFRegressor
+from .data import get_california_housing
 from .ordinal import make_recoded
 from .utils import Device
 
@@ -114,11 +115,10 @@ def run_boost_from_prediction_multi_clasas(
 
 def run_housing_rf_regression(tree_method: str, device: Device) -> None:
     """Testwith the cali housing dataset."""
-    from sklearn.datasets import fetch_california_housing
     from sklearn.metrics import mean_squared_error
     from sklearn.model_selection import KFold
 
-    X, y = fetch_california_housing(return_X_y=True)
+    X, y = get_california_housing()
     rng = np.random.RandomState(1994)
     kf = KFold(n_splits=2, shuffle=True, random_state=rng)
     for train_index, test_index in kf.split(X, y):
diff --git a/tests/python/test_predict.py b/tests/python/test_predict.py
@@ -10,7 +10,7 @@
 
 import xgboost as xgb
 from xgboost import testing as tm
-from xgboost.testing.data import np_dtypes, pd_dtypes
+from xgboost.testing.data import get_california_housing, np_dtypes, pd_dtypes
 from xgboost.testing.predict import run_base_margin_vs_base_score, run_predict_leaf
 
 
@@ -36,9 +36,7 @@ def test_predict_leaf(DMatrixT: Type[xgb.DMatrix]) -> None:
 
 
 def test_predict_shape():
-    from sklearn.datasets import fetch_california_housing
-
-    X, y = fetch_california_housing(return_X_y=True)
+    X, y = get_california_housing()
     reg = xgb.XGBRegressor(n_estimators=1)
     reg.fit(X, y)
     predt = reg.get_booster().predict(xgb.DMatrix(X), strict_shape=True)
diff --git a/tests/python/test_with_shap.py b/tests/python/test_with_shap.py
@@ -2,6 +2,7 @@
 import pytest
 
 import xgboost as xgb
+from xgboost.testing.data import get_california_housing
 
 try:
     import shap
@@ -16,9 +17,7 @@
 # xgboost removed ntree_limit in 2.0, which breaks the SHAP package.
 @pytest.mark.xfail
 def test_with_shap() -> None:
-    from sklearn.datasets import fetch_california_housing
-
-    X, y = fetch_california_housing(return_X_y=True)
+    X, y = get_california_housing()
     dtrain = xgb.DMatrix(X, label=y)
     model = xgb.train({"learning_rate": 0.01}, dtrain, 10)
     explainer = shap.TreeExplainer(model)
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
@@ -12,6 +12,7 @@
 
 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.testing.data import get_california_housing
 from xgboost.testing.ranking import run_ranking_categorical, run_ranking_qid_df
 from xgboost.testing.shared import get_feature_weights, validate_data_initialization
 from xgboost.testing.updater import get_basescore
@@ -464,11 +465,10 @@ def test_num_parallel_tree():
 
 
 def test_regression():
-    from sklearn.datasets import fetch_california_housing
     from sklearn.metrics import mean_squared_error
     from sklearn.model_selection import KFold
 
-    X, y = fetch_california_housing(return_X_y=True)
+    X, y = get_california_housing()
     kf = KFold(n_splits=2, shuffle=True, random_state=rng)
     for train_index, test_index in kf.split(X, y):
         xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])
@@ -501,28 +501,26 @@ def test_rf_regression():
 
 @pytest.mark.parametrize("tree_method", ["exact", "hist", "approx"])
 def test_parameter_tuning(tree_method: str) -> None:
-    from sklearn.datasets import fetch_california_housing
     from sklearn.model_selection import GridSearchCV
 
-    X, y = fetch_california_housing(return_X_y=True)
+    X, y = get_california_housing()
     reg = xgb.XGBRegressor(learning_rate=0.1, tree_method=tree_method)
     grid_cv = GridSearchCV(
         reg, {"max_depth": [2, 4], "n_estimators": [50, 200]}, cv=2, verbose=1
     )
     grid_cv.fit(X, y)
     assert grid_cv.best_score_ < 0.7
     assert grid_cv.best_params_ == {
-        "n_estimators": 200,
-        "max_depth": 4 if tree_method == "exact" else 2,
+        "n_estimators": 50,
+        "max_depth": 2,
     }
 
 
 def test_regression_with_custom_objective():
-    from sklearn.datasets import fetch_california_housing
     from sklearn.metrics import mean_squared_error
     from sklearn.model_selection import KFold
 
-    X, y = fetch_california_housing(return_X_y=True)
+    X, y = get_california_housing()
     kf = KFold(n_splits=2, shuffle=True, random_state=rng)
     for train_index, test_index in kf.split(X, y):
         xgb_model = xgb.XGBRegressor(objective=tm.ls_obj).fit(
diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py
@@ -38,6 +38,7 @@
     make_categorical,
     run_recode,
 )
+from xgboost.testing.data import get_california_housing
 from xgboost.testing.params import hist_cache_strategy, hist_parameter_strategy
 from xgboost.testing.shared import (
     get_feature_weights,
@@ -1629,9 +1630,7 @@ def test_feature_weights(self, client: "Client") -> None:
     @pytest.mark.skipif(**tm.no_dask())
     @pytest.mark.skipif(**tm.no_sklearn())
     def test_custom_objective(self, client: "Client") -> None:
-        from sklearn.datasets import fetch_california_housing
-
-        X, y = fetch_california_housing(return_X_y=True)
+        X, y = get_california_housing()
         X, y = da.from_array(X), da.from_array(y)
         rounds = 20