
Merge branch 'main' into 1.3.X
reidjohnson committed Feb 17, 2024
2 parents 634b9cd + e5c7169 commit 114f066
Showing 10 changed files with 39 additions and 40 deletions.
Binary file modified docs/_static/favicon.ico
Binary file not shown.
Binary file modified docs/_static/quantile-forest-logo.png
Binary file not shown.
9 changes: 4 additions & 5 deletions docs/user_guide.rst
@@ -119,17 +119,16 @@ Multi-target quantile regression is also supported. If the target values are mul
Quantile Weighting
~~~~~~~~~~~~~~~~~~

-By default, the predict method calculates quantiles by weighting each sample inversely according to the size of its leaf node (`weighted_leaves = True`). If `weighted_leaves = False`, each sample in a leaf (including repeated bootstrap samples) will be given equal weight. Note that this leaf-based weighting can only be used with weighted quantiles.
-
By default, the predict method calculates quantiles using a weighted quantile method (`weighted_quantile = True`), which assigns a weight to each sample in the training set based on the number of times that it co-occurs in the same leaves as the test sample. When the number of samples in the training set is larger than the expected size of this list (i.e., :math:`n_{train} \gg n_{trees} \cdot n_{leaves} \cdot n_{leafsamples}`), it can be more efficient to calculate an unweighted quantile (`weighted_quantile = False`), which aggregates the list of training `y` values for each leaf node to which the test sample belongs across all trees. For a given input, both methods can return the same output values::

    >>> import numpy as np
-    >>> kwargs = {"weighted_leaves": False}
-    >>> y_pred_weighted = reg.predict(X_test, weighted_quantile=True, **kwargs)
-    >>> y_pred_unweighted = reg.predict(X_test, weighted_quantile=False, **kwargs)
+    >>> y_pred_weighted = reg.predict(X_test, weighted_quantile=True)
+    >>> y_pred_unweighted = reg.predict(X_test, weighted_quantile=False)
    >>> np.allclose(y_pred_weighted, y_pred_unweighted)
    True

+By default, the predict method calculates quantiles by giving each sample in a leaf (including repeated bootstrap samples) equal weight (`weighted_leaves = False`). If `weighted_leaves = True`, each sample will be weighted inversely according to the size of its leaf node. Note that this leaf-based weighting can only be used with weighted quantiles.
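As a minimal sketch of toggling this option (reusing `reg` and `X_test` from the example above; leaf weighting is only applied when `weighted_quantile = True`)::

    >>> y_pred_equal = reg.predict(X_test, weighted_leaves=False)  # default: equal weight per sample
    >>> y_pred_inverse = reg.predict(X_test, weighted_leaves=True)  # weight inversely by leaf size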

Out-of-Bag Estimation
~~~~~~~~~~~~~~~~~~~~~

4 changes: 2 additions & 2 deletions quantile_forest/_quantile_forest.py
@@ -449,7 +449,7 @@ def predict(
        quantiles=None,
        interpolation="linear",
        weighted_quantile=True,
-        weighted_leaves=True,
+        weighted_leaves=False,
        aggregate_leaves_first=True,
        oob_score=False,
        indices=None,
@@ -490,7 +490,7 @@
            number of training samples relative to siblings is small, weighted
            quantiles can be more efficient to compute than unweighted ones.
-        weighted_leaves : bool, default=True
+        weighted_leaves : bool, default=False
            Weight samples inversely to the size of their leaf node.
            Only used if `weighted_quantile=True` and `max_samples_leaf!=1`.
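A hedged usage sketch of these flags at the public API (assuming a fitted `RandomForestQuantileRegressor` named `qrf` and a hypothetical feature matrix `X_test`; not part of this diff):

    # weighted_leaves now defaults to False; it must be enabled explicitly and
    # is only honored when weighted_quantile=True and max_samples_leaf != 1.
    y_pred = qrf.predict(
        X_test,
        quantiles=[0.025, 0.5, 0.975],
        weighted_quantile=True,
        weighted_leaves=True,
    )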
4 changes: 2 additions & 2 deletions quantile_forest/_quantile_forest_fast.pyx
@@ -617,7 +617,7 @@ cdef class QuantileForest:
        UINT8_t[:, :] X_indices=None,
        char* interpolation=b"linear",
        bint weighted_quantile=<bint>True,
-        bint weighted_leaves=<bint>True,
+        bint weighted_leaves=<bint>False,
        bint aggregate_leaves_first=<bint>True,
    ):
"""Return predictions for ``est.apply`` outputs.
@@ -644,7 +644,7 @@
        weighted_quantile : bool, default=True
            Calculate weighted quantiles.
-        weighted_leaves : bool, default=True
+        weighted_leaves : bool, default=False
            Weight samples inversely to the size of their leaf node.
        aggregate_leaves_first : bool, default=True
10 changes: 5 additions & 5 deletions quantile_forest/tests/examples/plot_quantile_extrapolation.py
@@ -60,7 +60,7 @@ def get_test_X(X):
)
qrf.fit(np.expand_dims(X_train, axis=-1), y_train)

-y_pred = qrf.predict(X_test, quantiles=[0.025, 0.5, 0.975])  # extrapolate
+y_pred = qrf.predict(X_test, quantiles=[0.025, 0.5, 0.975])


df = pd.DataFrame(
@@ -156,7 +156,7 @@ def plot_extrapolations(df, title="", legend=False, x_domain=None, y_domain=None
        tooltip=tooltip_pred,
    )

-    base1 = bar_pred + points_true + line_true + line_pred
+    chart = bar_pred + points_true + line_true + line_pred

if legend:
# For desired legend ordering.
@@ -175,10 +175,10 @@
            blank = blank.encode(
                color=alt.Color(f"{k}:N", scale=alt.Scale(range=[v["color"]]), title=None)
            )
-            base1 += blank
-        base1 = base1.resolve_scale(color="independent")
+            chart += blank
+        chart = chart.resolve_scale(color="independent")

-    chart = base1.properties(height=200, width=300, title=title)
+    chart = chart.properties(height=200, width=300, title=title)

return chart

25 changes: 11 additions & 14 deletions quantile_forest/tests/examples/plot_quantile_interpolation.py
@@ -68,6 +68,7 @@
data["y_med"].extend(y_medians[idx])
data["y_low"].extend(y_medians[idx] - y_errs[idx][0])
data["y_upp"].extend(y_medians[idx] + y_errs[idx][1])
+
df = pd.DataFrame(data)


@@ -80,6 +81,14 @@ def plot_interpolations(df, legend):
alt.value("lightgray"),
)

+    tooltip = [
+        alt.Tooltip("method:N", title="Method"),
+        alt.Tooltip("x:N", title="X Values"),
+        alt.Tooltip("y_med:N", format=".3f", title="Median Y Value"),
+        alt.Tooltip("y_low:N", format=".3f", title="Lower Y Value"),
+        alt.Tooltip("y_upp:N", format=".3f", title="Upper Y Value"),
+    ]
+
point = (
alt.Chart(df, width=alt.Step(20))
.mark_circle(opacity=1, size=75)
@@ -92,13 +101,7 @@
            ),
            y=alt.Y("y_med:Q", title="Actual and Predicted Values"),
            color=color,
-            tooltip=[
-                alt.Tooltip("method:N", title="Method"),
-                alt.Tooltip("x:N", title="X Values"),
-                alt.Tooltip("y_med:N", format=".3f", title="Median Y Value"),
-                alt.Tooltip("y_low:N", format=".3f", title="Lower Y Value"),
-                alt.Tooltip("y_upp:N", format=".3f", title="Upper Y Value"),
-            ],
+            tooltip=tooltip,
)
)

@@ -115,13 +118,7 @@
            y=alt.Y("y_low:Q", title=""),
            y2=alt.Y2("y_upp:Q", title=None),
            color=color,
-            tooltip=[
-                alt.Tooltip("method:N", title="Method"),
-                alt.Tooltip("x:N", title="X Values"),
-                alt.Tooltip("y_med:N", format=".3f", title="Median Y Value"),
-                alt.Tooltip("y_low:N", format=".3f", title="Lower Y Value"),
-                alt.Tooltip("y_upp:N", format=".3f", title="Upper Y Value"),
-            ],
+            tooltip=tooltip,
)
)

10 changes: 6 additions & 4 deletions quantile_forest/tests/examples/plot_quantile_intervals.py
@@ -2,8 +2,10 @@
Quantile Regression Forests Prediction Intervals
================================================
-An example of how to use a quantile regression forest to plot prediction
-intervals on the California Housing dataset.
+An example of how to use quantile regression forests to generate prediction
+intervals on the California Housing dataset. Inspired by Figure 3 of
+"Quantile Regression Forests" by Meinshausen:
+https://jmlr.org/papers/v7/meinshausen06a.html.
"""

import altair as alt
@@ -61,8 +63,8 @@
"y_pred_upp": np.concatenate(y_pred_upp),
}
).pipe(
-    lambda x: x * 100_000
-)  # convert to dollars
+    lambda x: x * 100_000  # convert to dollars
+)


def plot_calibration_and_intervals(df):
12 changes: 6 additions & 6 deletions quantile_forest/tests/examples/plot_quantile_multioutput.py
@@ -21,11 +21,11 @@

funcs = [
    {
-        "truth": lambda x: np.log1p(x + 1),
-        "noise": lambda x: np.log1p(x + 1) * np.random.uniform(size=len(x)),
+        "signal": lambda x: np.log1p(x + 1),
+        "noise": lambda x: np.log1p(x) * np.random.uniform(size=len(x)),
    },
    {
-        "truth": lambda x: np.log1p(np.sqrt(x)),
+        "signal": lambda x: np.log1p(np.sqrt(x)),
        "noise": lambda x: np.log1p(x / 2) * np.random.uniform(size=len(x)),
    },
]
@@ -40,7 +40,7 @@ def make_func_Xy(funcs, bounds, n_samples):
    x = np.linspace(*bounds, n_samples)
    y = np.empty((len(x), len(funcs)))
    for i, func in enumerate(funcs):
-        y[:, i] = func["truth"](x) + func["noise"](x)
+        y[:, i] = func["signal"](x) + func["noise"](x)
    return np.atleast_2d(x).T, y


@@ -51,14 +51,14 @@ def make_func_Xy(funcs, bounds, n_samples):
qrf = RandomForestQuantileRegressor(max_samples_leaf=None, max_depth=4, random_state=0)
qrf.fit(X_train, y_train)

-y_pred = qrf.predict(X, quantiles=[0.025, 0.5, 0.975], weighted_leaves=False)
+y_pred = qrf.predict(X, quantiles=[0.025, 0.5, 0.975], weighted_quantile=False)
y_pred = y_pred.reshape(-1, 3, len(funcs))

df = pd.DataFrame(
    {
        "x": np.tile(X.squeeze(), len(funcs)),
        "y": y.reshape(-1, order="F"),
-        "y_true": np.concatenate([f["truth"](X.squeeze()) for f in funcs]),
+        "y_true": np.concatenate([f["signal"](X.squeeze()) for f in funcs]),
"y_pred": np.concatenate([y_pred[:, 1, i] for i in range(len(funcs))]),
"y_pred_low": np.concatenate([y_pred[:, 0, i] for i in range(len(funcs))]),
"y_pred_upp": np.concatenate([y_pred[:, 2, i] for i in range(len(funcs))]),
5 changes: 3 additions & 2 deletions quantile_forest/tests/examples/plot_quantile_weighting.py
@@ -28,7 +28,7 @@ def timing():
t1 = time.time()


-X, y = datasets.make_regression(n_samples=500, n_features=4, random_state=0)
+X, y = datasets.make_regression(n_samples=250, n_features=4, n_targets=5, random_state=0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

@@ -69,6 +69,7 @@ def timing():
        timings[i, j, :] = [rf_time(), qrf_weighted_time(), qrf_unweighted_time()]
        timings[i, j, :] *= 1000  # convert from seconds to milliseconds

+timings /= timings.min()  # normalize by minimum runtime
timings = np.transpose(timings, axes=[2, 0, 1])  # put the estimator name first

data = {"name": [], "n_estimators": [], "iteration": [], "runtime": []}
@@ -115,7 +116,7 @@ def plot_timings_by_size(df, legend):
    .mark_line()
    .encode(
        x=alt.X("n_estimators:Q", title="Number of Estimators"),
-        y=alt.Y("mean:Q", title="Prediction Runtime (seconds)"),
+        y=alt.Y("mean:Q", title="Prediction Runtime (normalized)"),
        color=color,
    )
)
Expand Down
