From 1245e3e8befb6dcbf15b78d4ac00bedc89dab5ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Thu, 14 Dec 2023 17:09:28 -0600 Subject: [PATCH] add fourier example --- .../how-to-guides/exogenous_features.ipynb | 530 +++++++++++++++++- 1 file changed, 529 insertions(+), 1 deletion(-) diff --git a/nbs/docs/how-to-guides/exogenous_features.ipynb b/nbs/docs/how-to-guides/exogenous_features.ipynb index 2f91c90d..3d6fb630 100644 --- a/nbs/docs/how-to-guides/exogenous_features.ipynb +++ b/nbs/docs/how-to-guides/exogenous_features.ipynb @@ -144,6 +144,14 @@ "series.head()" ] }, + { + "cell_type": "markdown", + "id": "782a6bf1-b360-4aff-b37f-b9ed3ee131e5", + "metadata": {}, + "source": [ + "## Use existing exogenous features" + ] + }, { "cell_type": "markdown", "id": "96fe6893-70c5-4ef5-b318-686c69660a1d", @@ -369,7 +377,7 @@ { "data": { "text/plain": [ - "MLForecast(models=[LGBMRegressor], freq=, lag_features=['lag7', 'expanding_mean_lag1', 'rolling_mean_lag7_window_size14'], date_features=['dayofweek', 'month'], num_threads=2)" + "MLForecast(models=[LGBMRegressor], freq=D, lag_features=['lag7', 'expanding_mean_lag1', 'rolling_mean_lag7_window_size14'], date_features=['dayofweek', 'month'], num_threads=2)" ] }, "execution_count": null, @@ -522,6 +530,526 @@ "preds.head()" ] }, + { + "cell_type": "markdown", + "id": "39f21994-af9a-4376-9e96-1d67d87e6336", + "metadata": {}, + "source": [ + "## Generating exogenous features" + ] + }, + { + "cell_type": "markdown", + "id": "f9bb4f27-0a3d-43a8-9f48-57e1983e67d7", + "metadata": {}, + "source": [ + "Nixtla provides some utilities to generate exogenous features for both training and forecasting such as [statsforecast's mstl_decomposition](https://nixtlaverse.nixtla.io/statsforecast/docs/how-to-guides/generating_features.html) or the [transform_exog function](transforming_exog.ipynb). 
We also have [utilsforecast's fourier function](https://nixtlaverse.nixtla.io/utilsforecast/feature_engineering.html#fourier), which we'll demonstrate here." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "625b300c-9a92-4cf4-af09-d8759e1cef85", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "from utilsforecast.feature_engineering import fourier" + ] + }, + { + "cell_type": "markdown", + "id": "f482e987-401c-4a75-bcc6-ed2c77021baa", + "metadata": {}, + "source": [ + "Suppose you start with data like the series above, which includes a couple of static features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90b0f9eb-6c07-417f-a5d3-2dc3d16b1d24", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique_iddsystatic_0product_id
0id_002000-10-0539.8119837945
1id_002000-10-06103.2740137945
2id_002000-10-07176.5747447945
3id_002000-10-08258.9879007945
4id_002000-10-09344.9404047945
\n", + "
" + ], + "text/plain": [ + " unique_id ds y static_0 product_id\n", + "0 id_00 2000-10-05 39.811983 79 45\n", + "1 id_00 2000-10-06 103.274013 79 45\n", + "2 id_00 2000-10-07 176.574744 79 45\n", + "3 id_00 2000-10-08 258.987900 79 45\n", + "4 id_00 2000-10-09 344.940404 79 45" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "series.head()" + ] + }, + { + "cell_type": "markdown", + "id": "7855322f-1d14-4c9d-9b5f-c8d464896c6f", + "metadata": {}, + "source": [ + "Now we'd like to add some fourier terms to model the seasonality. We can do that with the following:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "535ae0fd-d16a-49b6-8bdb-c750fdd9a5e1", + "metadata": {}, + "outputs": [], + "source": [ + "transformed_df, future_df = fourier(series, freq='D', season_length=7, k=2, h=7)" + ] + }, + { + "cell_type": "markdown", + "id": "7e0c42c7-5d9e-45b4-b7db-19e72086f5d3", + "metadata": {}, + "source": [ + "This provides an extended training dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c4c662e-1a35-46b0-a5f1-19e52584b0f5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique_iddsystatic_0product_idsin1_7sin2_7cos1_7cos2_7
0id_002000-10-0539.81198379450.7818320.9749280.623490-0.222521
1id_002000-10-06103.27401379450.974928-0.433884-0.222521-0.900969
2id_002000-10-07176.57474479450.433884-0.781831-0.9009690.623490
3id_002000-10-08258.9879007945-0.4338840.781832-0.9009690.623490
4id_002000-10-09344.9404047945-0.9749280.433884-0.222521-0.900969
\n", + "
" + ], + "text/plain": [ + " unique_id ds y static_0 product_id sin1_7 sin2_7 \\\n", + "0 id_00 2000-10-05 39.811983 79 45 0.781832 0.974928 \n", + "1 id_00 2000-10-06 103.274013 79 45 0.974928 -0.433884 \n", + "2 id_00 2000-10-07 176.574744 79 45 0.433884 -0.781831 \n", + "3 id_00 2000-10-08 258.987900 79 45 -0.433884 0.781832 \n", + "4 id_00 2000-10-09 344.940404 79 45 -0.974928 0.433884 \n", + "\n", + " cos1_7 cos2_7 \n", + "0 0.623490 -0.222521 \n", + "1 -0.222521 -0.900969 \n", + "2 -0.900969 0.623490 \n", + "3 -0.900969 0.623490 \n", + "4 -0.222521 -0.900969 " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transformed_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "8e311c3c-392e-4f96-90b8-c1095fe2c21e", + "metadata": {}, + "source": [ + "Along with the future values of the features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53b1e42a-9d13-4469-882b-269b21a00964", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique_iddssin1_7sin2_7cos1_7cos2_7
0id_002001-05-15-0.781828-0.9749300.623494-0.222511
1id_002001-05-160.0000060.0000111.0000001.000000
2id_002001-05-170.7818350.9749250.623485-0.222533
3id_002001-05-180.974927-0.433895-0.222527-0.900963
4id_002001-05-190.433878-0.781823-0.9009720.623500
\n", + "
" + ], + "text/plain": [ + " unique_id ds sin1_7 sin2_7 cos1_7 cos2_7\n", + "0 id_00 2001-05-15 -0.781828 -0.974930 0.623494 -0.222511\n", + "1 id_00 2001-05-16 0.000006 0.000011 1.000000 1.000000\n", + "2 id_00 2001-05-17 0.781835 0.974925 0.623485 -0.222533\n", + "3 id_00 2001-05-18 0.974927 -0.433895 -0.222527 -0.900963\n", + "4 id_00 2001-05-19 0.433878 -0.781823 -0.900972 0.623500" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "future_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "8df48b27-7765-4107-aa7d-09c8737669f6", + "metadata": {}, + "source": [ + "We can now train using only these features (and the static ones)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53874524-d2d7-40bc-8d82-93575c8c35f0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MLForecast(models=[LinearRegression], freq=D, lag_features=[], date_features=[], num_threads=1)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fcst2 = MLForecast(models=LinearRegression(), freq='D')\n", + "fcst2.fit(transformed_df, static_features=['static_0', 'product_id'])" + ] + }, + { + "cell_type": "markdown", + "id": "a16a719d-dfc0-4800-bac5-345ad2dc2681", + "metadata": {}, + "source": [ + "And provide the future values to the predict method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "666c78ff-3814-44ee-acd3-ae61bd9e771b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique_iddsLinearRegression
0id_002001-05-15275.822342
1id_002001-05-16262.258117
2id_002001-05-17238.195850
3id_002001-05-18240.997814
4id_002001-05-19262.247123
\n", + "
" + ], + "text/plain": [ + " unique_id ds LinearRegression\n", + "0 id_00 2001-05-15 275.822342\n", + "1 id_00 2001-05-16 262.258117\n", + "2 id_00 2001-05-17 238.195850\n", + "3 id_00 2001-05-18 240.997814\n", + "4 id_00 2001-05-19 262.247123" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fcst2.predict(h=7, X_df=future_df).head()" + ] + }, { "cell_type": "code", "execution_count": null,