
Merge branch 'main' into 1.3.X
reidjohnson committed Feb 17, 2024
2 parents 634b9cd + e5c7169 commit 114f066
Showing 10 changed files with 39 additions and 40 deletions.
Binary file modified docs/_static/favicon.ico
Binary file not shown.
Binary file modified docs/_static/quantile-forest-logo.png
Binary file not shown.
9 changes: 4 additions & 5 deletions docs/user_guide.rst
@@ -119,17 +119,16 @@ Multi-target quantile regression is also supported. If the target values are mul
Quantile Weighting
~~~~~~~~~~~~~~~~~~

-By default, the predict method calculates quantiles by weighting each sample inversely according to the size of its leaf node (`weighted_leaves = True`). If `weighted_leaves = False`, each sample in a leaf (including repeated bootstrap samples) will be given equal weight. Note that this leaf-based weighting can only be used with weighted quantiles.
-
By default, the predict method calculates quantiles using a weighted quantile method (`weighted_quantile = True`), which assigns a weight to each sample in the training set based on the number of times that it co-occurs in the same leaves as the test sample. When the number of samples in the training set is larger than the expected size of this list (i.e., :math:`n_{train} \gg n_{trees} \cdot n_{leaves} \cdot n_{leafsamples}`), it can be more efficient to calculate an unweighted quantile (`weighted_quantile = False`), which aggregates the list of training `y` values for each leaf node to which the test sample belongs across all trees. For a given input, both methods can return the same output values::

    >>> import numpy as np
-    >>> kwargs = {"weighted_leaves": False}
-    >>> y_pred_weighted = reg.predict(X_test, weighted_quantile=True, **kwargs)
-    >>> y_pred_unweighted = reg.predict(X_test, weighted_quantile=False, **kwargs)
+    >>> y_pred_weighted = reg.predict(X_test, weighted_quantile=True)
+    >>> y_pred_unweighted = reg.predict(X_test, weighted_quantile=False)
    >>> np.allclose(y_pred_weighted, y_pred_unweighted)
    True

+By default, the predict method calculates quantiles by giving each sample in a leaf (including repeated bootstrap samples) equal weight (`weighted_leaves = False`). If `weighted_leaves = True`, each sample will be weighted inversely according to the size of its leaf node. Note that this leaf-based weighting can only be used with weighted quantiles.
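As a minimal sketch of toggling this option (reusing `reg` and `X_test` from the example above; leaf weighting is only applied when `weighted_quantile = True`)::

    >>> y_pred_equal = reg.predict(X_test, weighted_leaves=False)  # default: equal weight per sample
    >>> y_pred_inverse = reg.predict(X_test, weighted_leaves=True)  # weight inversely by leaf size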

Out-of-Bag Estimation
~~~~~~~~~~~~~~~~~~~~~

4 changes: 2 additions & 2 deletions quantile_forest/_quantile_forest.py
@@ -449,7 +449,7 @@ def predict(
        quantiles=None,
        interpolation="linear",
        weighted_quantile=True,
-        weighted_leaves=True,
+        weighted_leaves=False,
        aggregate_leaves_first=True,
        oob_score=False,
        indices=None,
@@ -490,7 +490,7 @@
            number of training samples relative to siblings is small, weighted
            quantiles can be more efficient to compute than unweighted ones.
-        weighted_leaves : bool, default=True
+        weighted_leaves : bool, default=False
            Weight samples inversely to the size of their leaf node.
            Only used if `weighted_quantile=True` and `max_samples_leaf!=1`.
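A hedged usage sketch of these flags at the public API (assuming a fitted `RandomForestQuantileRegressor` named `qrf` and a hypothetical feature matrix `X_test`; not part of this diff):

    # weighted_leaves now defaults to False; it must be enabled explicitly and
    # is only honored when weighted_quantile=True and max_samples_leaf != 1.
    y_pred = qrf.predict(
        X_test,
        quantiles=[0.025, 0.5, 0.975],
        weighted_quantile=True,
        weighted_leaves=True,
    )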
4 changes: 2 additions & 2 deletions quantile_forest/_quantile_forest_fast.pyx
@@ -617,7 +617,7 @@ cdef class QuantileForest:
        UINT8_t[:, :] X_indices=None,
        char* interpolation=b"linear",
        bint weighted_quantile=<bint>True,
-        bint weighted_leaves=<bint>True,
+        bint weighted_leaves=<bint>False,
        bint aggregate_leaves_first=<bint>True,
    ):
"""Return predictions for ``est.apply`` outputs.
@@ -644,7 +644,7 @@
        weighted_quantile : bool, default=True
            Calculate weighted quantiles.
-        weighted_leaves : bool, default=True
+        weighted_leaves : bool, default=False
            Weight samples inversely to the size of their leaf node.
        aggregate_leaves_first : bool, default=True
10 changes: 5 additions & 5 deletions quantile_forest/tests/examples/plot_quantile_extrapolation.py
@@ -60,7 +60,7 @@ def get_test_X(X):
)
qrf.fit(np.expand_dims(X_train, axis=-1), y_train)

-y_pred = qrf.predict(X_test, quantiles=[0.025, 0.5, 0.975])  # extrapolate
+y_pred = qrf.predict(X_test, quantiles=[0.025, 0.5, 0.975])


df = pd.DataFrame(
@@ -156,7 +156,7 @@ def plot_extrapolations(df, title="", legend=False, x_domain=None, y_domain=None
        tooltip=tooltip_pred,
    )

-    base1 = bar_pred + points_true + line_true + line_pred
+    chart = bar_pred + points_true + line_true + line_pred

if legend:
# For desired legend ordering.
@@ -175,10 +175,10 @@
            blank = blank.encode(
                color=alt.Color(f"{k}:N", scale=alt.Scale(range=[v["color"]]), title=None)
            )
-            base1 += blank
-        base1 = base1.resolve_scale(color="independent")
+            chart += blank
+        chart = chart.resolve_scale(color="independent")

-    chart = base1.properties(height=200, width=300, title=title)
+    chart = chart.properties(height=200, width=300, title=title)

return chart

25 changes: 11 additions & 14 deletions quantile_forest/tests/examples/plot_quantile_interpolation.py
@@ -68,6 +68,7 @@
data["y_med"].extend(y_medians[idx])
data["y_low"].extend(y_medians[idx] - y_errs[idx][0])
data["y_upp"].extend(y_medians[idx] + y_errs[idx][1])
+
df = pd.DataFrame(data)


@@ -80,6 +81,14 @@ def plot_interpolations(df, legend):
alt.value("lightgray"),
)

+    tooltip = [
+        alt.Tooltip("method:N", title="Method"),
+        alt.Tooltip("x:N", title="X Values"),
+        alt.Tooltip("y_med:N", format=".3f", title="Median Y Value"),
+        alt.Tooltip("y_low:N", format=".3f", title="Lower Y Value"),
+        alt.Tooltip("y_upp:N", format=".3f", title="Upper Y Value"),
+    ]
+
point = (
alt.Chart(df, width=alt.Step(20))
.mark_circle(opacity=1, size=75)
@@ -92,13 +101,7 @@
            ),
            y=alt.Y("y_med:Q", title="Actual and Predicted Values"),
            color=color,
-            tooltip=[
-                alt.Tooltip("method:N", title="Method"),
-                alt.Tooltip("x:N", title="X Values"),
-                alt.Tooltip("y_med:N", format=".3f", title="Median Y Value"),
-                alt.Tooltip("y_low:N", format=".3f", title="Lower Y Value"),
-                alt.Tooltip("y_upp:N", format=".3f", title="Upper Y Value"),
-            ],
+            tooltip=tooltip,
)
)

@@ -115,13 +118,7 @@
            y=alt.Y("y_low:Q", title=""),
            y2=alt.Y2("y_upp:Q", title=None),
            color=color,
-            tooltip=[
-                alt.Tooltip("method:N", title="Method"),
-                alt.Tooltip("x:N", title="X Values"),
-                alt.Tooltip("y_med:N", format=".3f", title="Median Y Value"),
-                alt.Tooltip("y_low:N", format=".3f", title="Lower Y Value"),
-                alt.Tooltip("y_upp:N", format=".3f", title="Upper Y Value"),
-            ],
+            tooltip=tooltip,
)
)

10 changes: 6 additions & 4 deletions quantile_forest/tests/examples/plot_quantile_intervals.py
@@ -2,8 +2,10 @@
Quantile Regression Forests Prediction Intervals
================================================
-An example of how to use a quantile regression forest to plot prediction
-intervals on the California Housing dataset.
+An example of how to use quantile regression forests to generate prediction
+intervals on the California Housing dataset. Inspired by Figure 3 of
+"Quantile Regression Forests" by Meinshausen:
+https://jmlr.org/papers/v7/meinshausen06a.html.
"""

import altair as alt
@@ -61,8 +63,8 @@
"y_pred_upp": np.concatenate(y_pred_upp),
}
).pipe(
-    lambda x: x * 100_000
-)  # convert to dollars
+    lambda x: x * 100_000  # convert to dollars
+)


def plot_calibration_and_intervals(df):
12 changes: 6 additions & 6 deletions quantile_forest/tests/examples/plot_quantile_multioutput.py
@@ -21,11 +21,11 @@

funcs = [
    {
-        "truth": lambda x: np.log1p(x + 1),
-        "noise": lambda x: np.log1p(x + 1) * np.random.uniform(size=len(x)),
+        "signal": lambda x: np.log1p(x + 1),
+        "noise": lambda x: np.log1p(x) * np.random.uniform(size=len(x)),
    },
    {
-        "truth": lambda x: np.log1p(np.sqrt(x)),
+        "signal": lambda x: np.log1p(np.sqrt(x)),
        "noise": lambda x: np.log1p(x / 2) * np.random.uniform(size=len(x)),
    },
]
@@ -40,7 +40,7 @@ def make_func_Xy(funcs, bounds, n_samples):
    x = np.linspace(*bounds, n_samples)
    y = np.empty((len(x), len(funcs)))
    for i, func in enumerate(funcs):
-        y[:, i] = func["truth"](x) + func["noise"](x)
+        y[:, i] = func["signal"](x) + func["noise"](x)
    return np.atleast_2d(x).T, y


@@ -51,14 +51,14 @@ def make_func_Xy(funcs, bounds, n_samples):
qrf = RandomForestQuantileRegressor(max_samples_leaf=None, max_depth=4, random_state=0)
qrf.fit(X_train, y_train)

-y_pred = qrf.predict(X, quantiles=[0.025, 0.5, 0.975], weighted_leaves=False)
+y_pred = qrf.predict(X, quantiles=[0.025, 0.5, 0.975], weighted_quantile=False)
y_pred = y_pred.reshape(-1, 3, len(funcs))

df = pd.DataFrame(
    {
        "x": np.tile(X.squeeze(), len(funcs)),
        "y": y.reshape(-1, order="F"),
-        "y_true": np.concatenate([f["truth"](X.squeeze()) for f in funcs]),
+        "y_true": np.concatenate([f["signal"](X.squeeze()) for f in funcs]),
"y_pred": np.concatenate([y_pred[:, 1, i] for i in range(len(funcs))]),
"y_pred_low": np.concatenate([y_pred[:, 0, i] for i in range(len(funcs))]),
"y_pred_upp": np.concatenate([y_pred[:, 2, i] for i in range(len(funcs))]),
5 changes: 3 additions & 2 deletions quantile_forest/tests/examples/plot_quantile_weighting.py
@@ -28,7 +28,7 @@ def timing():
t1 = time.time()


-X, y = datasets.make_regression(n_samples=500, n_features=4, random_state=0)
+X, y = datasets.make_regression(n_samples=250, n_features=4, n_targets=5, random_state=0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

@@ -69,6 +69,7 @@ def timing():
        timings[i, j, :] = [rf_time(), qrf_weighted_time(), qrf_unweighted_time()]
        timings[i, j, :] *= 1000  # convert from seconds to milliseconds

+timings /= timings.min()  # normalize by minimum runtime
timings = np.transpose(timings, axes=[2, 0, 1])  # put the estimator name first

data = {"name": [], "n_estimators": [], "iteration": [], "runtime": []}
@@ -115,7 +116,7 @@ def plot_timings_by_size(df, legend):
    .mark_line()
    .encode(
        x=alt.X("n_estimators:Q", title="Number of Estimators"),
-        y=alt.Y("mean:Q", title="Prediction Runtime (seconds)"),
+        y=alt.Y("mean:Q", title="Prediction Runtime (normalized)"),
        color=color,
    )
)
Expand Down
