@@ -241,10 +241,58 @@ def check_inf(rng: RNG) -> None:
241241
@memory.cache
def get_california_housing() -> Tuple[np.ndarray, np.ndarray]:
    """Generate a synthetic stand-in for sklearn's California housing data.

    No data is downloaded.  Every column is drawn from a generator seeded
    with a fixed value, so repeated calls produce the same arrays.  The real
    dataset can be fetched with:

    .. code-block::

        import sklearn.datasets

        X, y = sklearn.datasets.fetch_california_housing(return_X_y=True)

    Returns
    -------
    X :
        The feature matrix with one row per sample and one column for every
        generated column except ``MedHouseVal``.  Columns follow the order
        returned by ``Index.difference`` (which sorts the names by default),
        not the column order of the real dataset.
    y :
        The ``MedHouseVal`` column.

    """
    # pandas is optional for the test suite; skip the requesting test when it
    # is not installed.
    pd = pytest.importorskip("pandas")

    n_samples = 20640
    rng = np.random.default_rng(2025)

    def two_gaussians(
        means: List[float], sigmas: List[float], weights: List[float]
    ) -> np.ndarray:
        # Only the first weight is consulted: the second component simply
        # fills whatever is left so the total is always ``n_samples``.  The
        # two draws are concatenated without shuffling.
        n_first = int(n_samples * weights[0])
        first = rng.normal(loc=means[0], scale=sigmas[0], size=n_first)
        second = rng.normal(
            loc=means[1], scale=sigmas[1], size=n_samples - n_first
        )
        return np.concatenate((first, second))

    # NOTE(review): the constants below look like statistics fitted to the
    # real dataset -- confirm against the source of these numbers.
    # The generator is shared, so columns must be drawn in exactly this order
    # to keep the output reproducible.
    columns = {
        "Longitude": two_gaussians(
            [-118.0703597, -121.85682825],
            [0.7897320650373969, 0.7248398629412008],
            [0.60402556, 0.39597444],
        ),
    }
    columns["Latitude"] = two_gaussians(
        [37.84266317, 33.86030848],
        [1.0643911549736087, 0.5049274656834589],
        [0.44485062, 0.55514938],
    )

    # (column name, mean, standard deviation) for the single-Gaussian columns.
    gaussian_columns = (
        ("MedInc", 3.8706710029069766, 1.8997756945748738),
        ("HouseAge", 28.639486434108527, 12.585252725724606),
        ("AveRooms", 5.428999742190376, 2.474113202333516),
        ("AveBedrms", 1.096675149606208, 0.47389937625774475),
        ("Population", 1425.4767441860465, 1132.434687757615),
        ("AveOccup", 3.0706551594363742, 10.385797959128219),
        ("MedHouseVal", 2.068558169089147, 1.1539282040412253),
    )
    for name, mean, std in gaussian_columns:
        columns[name] = rng.normal(loc=mean, scale=std, size=(n_samples,))

    frame = pd.DataFrame(columns)
    feature_names = frame.columns.difference(["MedHouseVal"])
    X = frame[feature_names].to_numpy()
    y = frame["MedHouseVal"].to_numpy()
    return X, y
248296
249297
250298@memory .cache
0 commit comments