Skip to content

Commit f47b02f

Browse files
authored
[test] Synthesize the California housing dataset. (#11672)
1 parent 79eef6a commit f47b02f

File tree

12 files changed

+115
-41
lines changed

12 files changed

+115
-41
lines changed

demo/guide-python/gpu_tree_shap.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,24 @@
55
Demonstrates using GPU acceleration to compute SHAP values for feature importance.
66
77
"""
8+
from urllib.error import HTTPError
9+
810
import shap
9-
from sklearn.datasets import fetch_california_housing
11+
from sklearn.datasets import fetch_california_housing, make_regression
1012

1113
import xgboost as xgb
1214

1315
# Fetch dataset using sklearn
14-
data = fetch_california_housing()
15-
print(data.DESCR)
16-
X = data.data
17-
y = data.target
16+
try:
17+
_data = fetch_california_housing(return_X_y=True)
18+
X = _data.data
19+
y = _data.target
20+
feature_names = _data.feature_names
21+
print(_data.DESCR)
22+
except HTTPError:
23+
# Use a synthetic dataset instead if we couldn't fetch the real one.
24+
X, y = make_regression(n_samples=20640, n_features=8, random_state=1234)
25+
feature_names = [f"f{i}" for i in range(8)]
1826

1927
num_round = 500
2028

@@ -26,7 +34,7 @@
2634
}
2735

2836
# GPU accelerated training
29-
dtrain = xgb.DMatrix(X, label=y, feature_names=data.feature_names)
37+
dtrain = xgb.DMatrix(X, label=y, feature_names=feature_names)
3038
model = xgb.train(param, dtrain, num_round)
3139

3240
# Compute shap values using GPU with xgboost
@@ -47,9 +55,9 @@
4755
explainer.expected_value,
4856
shap_values[0, :],
4957
X[0, :],
50-
feature_names=data.feature_names,
58+
feature_names=feature_names,
5159
matplotlib=True,
5260
)
5361

5462
# Show a summary of feature importance
55-
shap.summary_plot(shap_values, X, plot_type="bar", feature_names=data.feature_names)
63+
shap.summary_plot(shap_values, X, plot_type="bar", feature_names=feature_names)

demo/guide-python/sklearn_examples.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,15 @@
1111
"""
1212

1313
import pickle
14+
from urllib.error import HTTPError
1415

1516
import numpy as np
16-
from sklearn.datasets import fetch_california_housing, load_digits, load_iris
17+
from sklearn.datasets import (
18+
fetch_california_housing,
19+
load_digits,
20+
load_iris,
21+
make_regression,
22+
)
1723
from sklearn.metrics import confusion_matrix, mean_squared_error
1824
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
1925

@@ -44,7 +50,13 @@
4450
print(confusion_matrix(actuals, predictions))
4551

4652
print("California Housing: regression")
47-
X, y = fetch_california_housing(return_X_y=True)
53+
54+
try:
55+
X, y = fetch_california_housing(return_X_y=True)
56+
except HTTPError:
57+
# Use a synthetic dataset instead if we couldn't fetch the real one.
58+
X, y = make_regression(n_samples=20640, n_features=8, random_state=1234)
59+
4860
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
4961
for train_index, test_index in kf.split(X):
5062
xgb_model = xgb.XGBRegressor(n_jobs=1).fit(X[train_index], y[train_index])

demo/guide-python/sklearn_parallel.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,20 @@
44
"""
55

66
import multiprocessing
7+
from urllib.error import HTTPError
78

8-
from sklearn.datasets import fetch_california_housing
9+
from sklearn.datasets import fetch_california_housing, make_regression
910
from sklearn.model_selection import GridSearchCV
1011

1112
import xgboost as xgb
1213

1314
if __name__ == "__main__":
1415
print("Parallel Parameter optimization")
15-
X, y = fetch_california_housing(return_X_y=True)
16+
try:
17+
X, y = fetch_california_housing(return_X_y=True)
18+
except HTTPError:
19+
# Use a synthetic dataset instead if we couldn't fetch the real one.
20+
X, y = make_regression(n_samples=20640, n_features=8, random_state=1234)
1621
# Make sure the number of threads is balanced.
1722
xgb_model = xgb.XGBRegressor(
1823
n_jobs=multiprocessing.cpu_count() // 2, tree_method="hist"

demo/guide-python/update_process.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,22 @@
77
88
"""
99

10+
from urllib.error import HTTPError
11+
1012
import numpy as np
11-
from sklearn.datasets import fetch_california_housing
13+
from sklearn.datasets import fetch_california_housing, make_regression
1214

1315
import xgboost as xgb
1416

1517

16-
def main():
18+
def main() -> None:
1719
n_rounds = 32
1820

19-
X, y = fetch_california_housing(return_X_y=True)
21+
try:
22+
X, y = fetch_california_housing(return_X_y=True)
23+
except HTTPError:
24+
# Use a synthetic dataset instead if we couldn't fetch the real one.
25+
X, y = make_regression(n_samples=20640, n_features=8, random_state=1234)
2026

2127
# Train a model first
2228
X_train = X[: X.shape[0] // 2]
@@ -50,7 +56,7 @@ def main():
5056

5157
# Refresh the model without changing the leaf values; tree statistics, including
5258
# cover and weight, are refreshed.
53-
refresh_result: xgb.callback.EvaluationMonitor.EvalsLog = {}
59+
refresh_result = {}
5460
refreshed = xgb.train(
5561
{"process_type": "update", "updater": "refresh", "refresh_leaf": False},
5662
Xy_refresh,

ops/script/lint_python.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ class LintersPaths:
131131
"demo/guide-python/sklearn_examples.py",
132132
"demo/guide-python/continuation.py",
133133
"demo/guide-python/callbacks.py",
134+
"demo/guide-python/update_process.py",
134135
"demo/guide-python/cat_in_the_dat.py",
135136
"demo/guide-python/categorical.py",
136137
"demo/guide-python/cat_pipeline.py",

python-package/xgboost/plotting.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ def plot_importance(
120120

121121
if show_values is True:
122122
for x, y in zip(values, ylocs):
123-
ax.text(x + 1, y, values_format.format(v=x), va="center")
123+
ax.text(x + 1, float(y), values_format.format(v=x), va="center")
124124

125125
ax.set_yticks(ylocs)
126126
ax.set_yticklabels(labels)

python-package/xgboost/testing/data.py

Lines changed: 52 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -241,10 +241,58 @@ def check_inf(rng: RNG) -> None:
241241

242242
@memory.cache
243243
def get_california_housing() -> Tuple[np.ndarray, np.ndarray]:
244-
"""Fetch the California housing dataset from sklearn."""
245-
datasets = pytest.importorskip("sklearn.datasets")
246-
data = datasets.fetch_california_housing()
247-
return data.data, data.target
244+
"""Synthesize a dataset similar to the sklearn California housing dataset.
245+
246+
The real one can be obtained via:
247+
248+
.. code-block::
249+
250+
import sklearn.datasets
251+
252+
X, y = sklearn.datasets.fetch_california_housing(return_X_y=True)
253+
254+
"""
255+
n_samples = 20640
256+
rng = np.random.default_rng(2025)
257+
258+
pd = pytest.importorskip("pandas")
259+
260+
def mixture_2comp(
261+
means: List[float], sigmas: List[float], weights: List[float]
262+
) -> np.ndarray:
263+
l0 = rng.normal(
264+
size=(int(n_samples * weights[0])), loc=means[0], scale=sigmas[0]
265+
)
266+
l1 = rng.normal(size=(n_samples - l0.shape[0]), loc=means[1], scale=sigmas[1])
267+
return np.concatenate([l0, l1], axis=0)
268+
269+
def norm(mean: float, std: float) -> np.ndarray:
270+
return rng.normal(loc=mean, scale=std, size=(n_samples,))
271+
272+
df = pd.DataFrame(
273+
{
274+
"Longitude": mixture_2comp(
275+
[-118.0703597, -121.85682825],
276+
[0.7897320650373969, 0.7248398629412008],
277+
[0.60402556, 0.39597444],
278+
),
279+
"Latitude": mixture_2comp(
280+
[37.84266317, 33.86030848],
281+
[1.0643911549736087, 0.5049274656834589],
282+
[0.44485062, 0.55514938],
283+
),
284+
"MedInc": norm(mean=3.8706710029069766, std=1.8997756945748738),
285+
"HouseAge": norm(mean=28.639486434108527, std=12.585252725724606),
286+
"AveRooms": norm(mean=5.428999742190376, std=2.474113202333516),
287+
"AveBedrms": norm(mean=1.096675149606208, std=0.47389937625774475),
288+
"Population": norm(mean=1425.4767441860465, std=1132.434687757615),
289+
"AveOccup": norm(mean=3.0706551594363742, std=10.385797959128219),
290+
"MedHouseVal": norm(mean=2.068558169089147, std=1.1539282040412253),
291+
}
292+
)
293+
X = df[df.columns.difference(["MedHouseVal"])].to_numpy()
294+
y = df["MedHouseVal"].to_numpy()
295+
return X, y
248296

249297

250298
@memory.cache

python-package/xgboost/testing/with_skl.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from ..core import DMatrix
1010
from ..sklearn import XGBClassifier, XGBRegressor, XGBRFRegressor
11+
from .data import get_california_housing
1112
from .ordinal import make_recoded
1213
from .utils import Device
1314

@@ -114,11 +115,10 @@ def run_boost_from_prediction_multi_clasas(
114115

115116
def run_housing_rf_regression(tree_method: str, device: Device) -> None:
116117
"""Test with the California housing dataset."""
117-
from sklearn.datasets import fetch_california_housing
118118
from sklearn.metrics import mean_squared_error
119119
from sklearn.model_selection import KFold
120120

121-
X, y = fetch_california_housing(return_X_y=True)
121+
X, y = get_california_housing()
122122
rng = np.random.RandomState(1994)
123123
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
124124
for train_index, test_index in kf.split(X, y):

tests/python/test_predict.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
import xgboost as xgb
1212
from xgboost import testing as tm
13-
from xgboost.testing.data import np_dtypes, pd_dtypes
13+
from xgboost.testing.data import get_california_housing, np_dtypes, pd_dtypes
1414
from xgboost.testing.predict import run_base_margin_vs_base_score, run_predict_leaf
1515

1616

@@ -36,9 +36,7 @@ def test_predict_leaf(DMatrixT: Type[xgb.DMatrix]) -> None:
3636

3737

3838
def test_predict_shape():
39-
from sklearn.datasets import fetch_california_housing
40-
41-
X, y = fetch_california_housing(return_X_y=True)
39+
X, y = get_california_housing()
4240
reg = xgb.XGBRegressor(n_estimators=1)
4341
reg.fit(X, y)
4442
predt = reg.get_booster().predict(xgb.DMatrix(X), strict_shape=True)

tests/python/test_with_shap.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import pytest
33

44
import xgboost as xgb
5+
from xgboost.testing.data import get_california_housing
56

67
try:
78
import shap
@@ -16,9 +17,7 @@
1617
# xgboost removed ntree_limit in 2.0, which breaks the SHAP package.
1718
@pytest.mark.xfail
1819
def test_with_shap() -> None:
19-
from sklearn.datasets import fetch_california_housing
20-
21-
X, y = fetch_california_housing(return_X_y=True)
20+
X, y = get_california_housing()
2221
dtrain = xgb.DMatrix(X, label=y)
2322
model = xgb.train({"learning_rate": 0.01}, dtrain, 10)
2423
explainer = shap.TreeExplainer(model)

0 commit comments

Comments
 (0)