
Feature branch to update to 1.6 #813


Merged
10 commits merged on May 27, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
# exclude datasets and externals
notebooks/datasets
notebooks/joblib/
wrap-up/

# jupyter-book
jupyter-book/_build
5 changes: 5 additions & 0 deletions Makefile
@@ -1,6 +1,7 @@
PYTHON_SCRIPTS_DIR = python_scripts
NOTEBOOKS_DIR = notebooks
JUPYTER_BOOK_DIR = jupyter-book
WRAP_UP_DIR = wrap-up
JUPYTER_KERNEL := python3
MINIMAL_NOTEBOOK_FILES = $(shell ls $(PYTHON_SCRIPTS_DIR)/*.py | perl -pe "s@$(PYTHON_SCRIPTS_DIR)@$(NOTEBOOKS_DIR)@" | perl -pe "s@\.py@.ipynb@")

@@ -37,6 +38,10 @@ quizzes:
full-index:
python build_tools/generate-index.py

run-code-in-wrap-up-quizzes:
python build_tools/generate-wrap-up.py $(GITLAB_REPO_JUPYTERBOOK_DIR) $(WRAP_UP_DIR)
jupytext --execute --to notebook $(WRAP_UP_DIR)/*.py

$(JUPYTER_BOOK_DIR):
jupyter-book build $(JUPYTER_BOOK_DIR)
rm -rf $(JUPYTER_BOOK_DIR)/_build/html/{slides,figures} && cp -r slides figures $(JUPYTER_BOOK_DIR)/_build/html
107 changes: 107 additions & 0 deletions build_tools/generate-wrap-up.py
@@ -0,0 +1,107 @@
import sys
import os
import glob


def extract_python_code_blocks(md_file_path):
"""
Extract Python code blocks from a markdown file.

Args:
md_file_path (str): Path to the markdown file

Returns:
list: List of extracted Python code blocks
"""
code_blocks = []
in_python_block = False
current_block = []

with open(md_file_path, "r", encoding="utf-8") as file:
for line in file:
line = line.rstrip("\n")

if line.strip() == "```python":
in_python_block = True
current_block = []
elif line.strip() == "```" and in_python_block:
in_python_block = False
code_blocks.append("\n".join(current_block))
elif in_python_block:
current_block.append(line)

return code_blocks


def write_jupyter_notebook_file(
code_blocks, output_file="notebook_from_md.py"
):
"""
Writes extracted code blocks to a Python file formatted as Jupyter notebook cells.

Args:
code_blocks (list): List of code blocks to write
output_file (str): Path to the output file
"""
with open(output_file, "w", encoding="utf-8") as file:
file.write(
"# %% [markdown] \n # ## Notebook generated from Markdown file\n\n"
)

for i, block in enumerate(code_blocks, 1):
file.write(f"# %% [markdown]\n# ## Cell {i}\n\n# %%\n{block}\n\n")

print(
f"Successfully wrote {len(code_blocks)} code cells to"
f" {output_file}"
)


def process_quiz_files(input_path, output_dir):
"""
Process all wrap_up_quiz files in the input path and convert them to notebooks.

Args:
input_path (str): Path to look for wrap_up_quiz files in subfolders
output_dir (str): Directory to write the generated notebooks
"""
# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
os.makedirs(output_dir)
print(f"Created output directory: {output_dir}")

# Find all files containing "wrap_up_quiz" in their name in the input path subfolders
quiz_files = glob.glob(
f"{input_path}/**/*wrap_up_quiz*.md", recursive=True
)

if not quiz_files:
print(f"No wrap_up_quiz.md files found in {input_path} subfolders.")
return

print(f"Found {len(quiz_files)} wrap_up_quiz files to process.")

# Process each file
for md_file_path in quiz_files:
print(f"\nProcessing: {md_file_path}")

# Extract code blocks
code_blocks = extract_python_code_blocks(md_file_path)

# Generate output filename
subfolder = md_file_path.split(os.sep)[3]  # subfolder name (assumes a fixed input path depth)
output_file = os.path.join(output_dir, f"{subfolder}_wrap_up_quiz.py")

# Display results and write notebook file
if code_blocks:
print(f"Found {len(code_blocks)} Python code blocks")
write_jupyter_notebook_file(code_blocks, output_file=output_file)
else:
print(f"No Python code blocks found in {md_file_path}.")


if __name__ == "__main__":
input_path = sys.argv[1]
output_dir = sys.argv[2]

process_quiz_files(input_path, output_dir)
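
For context, the Makefile target above invokes this script as `python
build_tools/generate-wrap-up.py <input_path> <output_dir>`. As a quick sanity
check (not part of the PR; the hyphenated filename cannot be imported as a
module, so the two helpers are easiest to exercise by pasting them into a
session), the following illustrative snippet round-trips one code block:

import os
import tempfile

# Assumes extract_python_code_blocks and write_jupyter_notebook_file
# (defined above) are available in the session.
md_text = "\n".join([
    "# Wrap-up quiz",
    "Some question text.",
    "```python",
    "result = 1 + 1",
    "```",
])
with tempfile.TemporaryDirectory() as tmp_dir:
    md_path = os.path.join(tmp_dir, "wrap_up_quiz.md")
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(md_text)

    blocks = extract_python_code_blocks(md_path)
    assert blocks == ["result = 1 + 1"]

    # Emit the extracted blocks in jupytext percent format, as the script does.
    write_jupyter_notebook_file(
        blocks, output_file=os.path.join(tmp_dir, "quiz.py")
    )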
21 changes: 1 addition & 20 deletions notebooks/03_categorical_pipeline_ex_02.ipynb
@@ -160,26 +160,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### Analysis\n",
"\n",
"From an accuracy point of view, the result is almost exactly the same. The\n",
"reason is that `HistGradientBoostingClassifier` is expressive and robust\n",
"enough to deal with misleading ordering of integer coded categories (which was\n",
"not the case for linear models).\n",
"\n",
"However from a computation point of view, the training time is much longer:\n",
"this is caused by the fact that `OneHotEncoder` generates more features than\n",
"`OrdinalEncoder`; for each unique categorical value a column is created.\n",
"\n",
"Note that the current implementation `HistGradientBoostingClassifier` is still\n",
"incomplete, and once sparse representation are handled correctly, training\n",
"time might improve with such kinds of encodings.\n",
"\n",
"The main take away message is that arbitrary integer coding of categories is\n",
"perfectly fine for `HistGradientBoostingClassifier` and yields fast training\n",
"times.\n",
"\n",
"Which encoder should I use?\n",
"## Which encoder should I use?\n",
"\n",
"| | Meaningful order | Non-meaningful order |\n",
"| ---------------- | ----------------------------- | -------------------- |\n",
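
For reference, the notebook's point that arbitrary integer coding of
categories is fine for `HistGradientBoostingClassifier`, while one-hot
encoding mainly adds features and training time, can be sketched as follows
(the toy dataset is illustrative only):

import pandas as pd
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

X = pd.DataFrame({"color": ["red", "blue", "green", "blue"] * 25})
y = [0, 1, 0, 1] * 25

# Ordinal codes: one column; arbitrary order is harmless for tree ensembles.
ordinal_model = make_pipeline(
    OrdinalEncoder(), HistGradientBoostingClassifier(random_state=0)
)
# One-hot codes: one column per category, hence a wider matrix to fit on.
onehot_model = make_pipeline(
    OneHotEncoder(sparse_output=False),
    HistGradientBoostingClassifier(random_state=0),
)
for model in (ordinal_model, onehot_model):
    print(model.fit(X, y).score(X, y))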
7 changes: 4 additions & 3 deletions notebooks/cross_validation_grouping.ipynb
@@ -189,9 +189,10 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"If we read carefully, 13 writers wrote the digits of our dataset, accounting\n",
"for a total amount of 1797 samples. Thus, a writer wrote several times the\n",
"same numbers. Let's suppose that the writer samples are grouped. Subsequently,\n",
"If we read carefully, `load_digits` loads a copy of the **test set** of the\n",
"UCI ML hand-written digits dataset, which consists of 1797 images by\n",
"**13 different writers**. Thus, each writer wrote several times the same\n",
"numbers. Let's suppose the dataset is ordered by writer. Subsequently,\n",
"not shuffling the data will keep all writer samples together either in the\n",
"training or the testing sets. Mixing the data will break this structure, and\n",
"therefore digits written by the same writer will be available in both the\n",
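
A minimal sketch of the grouping idea described above; `load_digits` does not
expose the actual writer ids, so the 13 contiguous group labels below are
fabricated purely to illustrate the API:

import numpy as np
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold, cross_val_score

X, y = load_digits(return_X_y=True)
# Stand-in writer labels: 13 contiguous blocks over the 1797 samples.
groups = (np.arange(len(y)) * 13) // len(y)

# GroupKFold keeps all samples sharing a group label on the same side of
# each train/test split.
scores = cross_val_score(
    LogisticRegression(max_iter=1_000),
    X,
    y,
    groups=groups,
    cv=GroupKFold(n_splits=5),
)
print(scores.mean())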
2 changes: 1 addition & 1 deletion notebooks/datasets_bike_rides.ipynb
@@ -271,7 +271,7 @@
"metadata": {},
"outputs": [],
"source": [
"data_ride.resample(\"60S\").mean().plot()\n",
"data_ride.resample(\"60s\").mean().plot()\n",
"plt.legend(bbox_to_anchor=(1.05, 1), loc=\"upper left\")\n",
"_ = plt.title(\"Sensor values for different cyclist measurements\")"
]
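
For context, this one-character change tracks a pandas deprecation: uppercase
offset aliases such as "60S" now emit a FutureWarning, and lowercase "60s" is
the forward-compatible spelling. A self-contained sketch:

import pandas as pd

index = pd.date_range("2025-01-01", periods=180, freq="s")
series = pd.Series(range(180), index=index)
# "60s" buckets the one-second samples into one-minute means; the uppercase
# "60S" spelling is deprecated in recent pandas versions.
print(series.resample("60s").mean())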
2 changes: 1 addition & 1 deletion notebooks/ensemble_adaboost.ipynb
@@ -271,7 +271,7 @@
"\n",
"estimator = DecisionTreeClassifier(max_depth=3, random_state=0)\n",
"adaboost = AdaBoostClassifier(\n",
" estimator=estimator, n_estimators=3, algorithm=\"SAMME\", random_state=0\n",
" estimator=estimator, n_estimators=3, random_state=0\n",
")\n",
"adaboost.fit(data, target)"
]
24 changes: 21 additions & 3 deletions notebooks/ensemble_ex_03.ipynb
@@ -107,6 +107,24 @@
"ensemble. However, the scores reach a plateau where adding new trees just\n",
"makes fitting and scoring slower.\n",
"\n",
"Now repeat the analysis for the gradient boosting model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"lines_to_next_cell": 2
},
"outputs": [],
"source": [
"# Write your code here."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Gradient boosting models overfit when the number of trees is too large. To\n",
"avoid adding a new unnecessary tree, unlike random-forest gradient-boosting\n",
"offers an early-stopping option. Internally, the algorithm uses an\n",
@@ -115,9 +133,9 @@
"improving for several iterations, it stops adding trees.\n",
"\n",
"Now, create a gradient-boosting model with `n_estimators=1_000`. This number\n",
"of trees is certainly too large. Change the parameter `n_iter_no_change`\n",
"such that the gradient boosting fitting stops after adding 5 trees to avoid\n",
"deterioration of the overall generalization performance."
"of trees is certainly too large as we have seen above. Change the parameter\n",
"`n_iter_no_change` such that the gradient boosting fitting stops after adding\n",
"5 trees to avoid deterioration of the overall generalization performance."
]
},
{
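
As a side note, the early stopping that the exercise asks for can be sketched
as follows (the synthetic dataset is illustrative; the notebook leaves the
actual code to the reader):

from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_regression(n_samples=1_000, noise=10, random_state=0)

# n_estimators is deliberately too large; with n_iter_no_change set, fitting
# stops once the internal validation score has not improved for 5 iterations.
model = GradientBoostingRegressor(
    n_estimators=1_000, n_iter_no_change=5, random_state=0
)
model.fit(X, y)
print(model.n_estimators_)  # trees actually fitted, usually far below 1_000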
6 changes: 3 additions & 3 deletions notebooks/linear_models_regularization.ipynb
@@ -618,7 +618,7 @@
"ridge = make_pipeline(\n",
" MinMaxScaler(),\n",
" PolynomialFeatures(degree=2, include_bias=False),\n",
" RidgeCV(alphas=alphas, store_cv_values=True),\n",
" RidgeCV(alphas=alphas, store_cv_results=True),\n",
")"
]
},
@@ -677,7 +677,7 @@
"It indicates that our model is not overfitting.\n",
"\n",
"When fitting the ridge regressor, we also requested to store the error found\n",
"during cross-validation (by setting the parameter `store_cv_values=True`). We\n",
"during cross-validation (by setting the parameter `store_cv_results=True`). We\n",
"can plot the mean squared error for the different `alphas` regularization\n",
"strengths that we tried. The error bars represent one standard deviation of the\n",
"average mean square error across folds for a given value of `alpha`."
@@ -690,7 +690,7 @@
"outputs": [],
"source": [
"mse_alphas = [\n",
" est[-1].cv_values_.mean(axis=0) for est in cv_results[\"estimator\"]\n",
" est[-1].cv_results_.mean(axis=0) for est in cv_results[\"estimator\"]\n",
"]\n",
"cv_alphas = pd.DataFrame(mse_alphas, columns=alphas)\n",
"cv_alphas = cv_alphas.aggregate([\"mean\", \"std\"]).T\n",
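
For context, this rename tracks scikit-learn deprecating `store_cv_values` and
`cv_values_` on `RidgeCV` in favor of `store_cv_results` and `cv_results_`. A
standalone sketch (the data is illustrative):

import numpy as np
from sklearn.linear_model import RidgeCV

rng = np.random.RandomState(0)
X = rng.normal(size=(50, 3))
y = X @ np.array([1.0, 2.0, 3.0]) + rng.normal(scale=0.1, size=50)

ridge = RidgeCV(alphas=np.logspace(-2, 2, 5), store_cv_results=True).fit(X, y)
# With the default leave-one-out CV, cv_results_ holds one row per sample and
# one column per candidate alpha.
print(ridge.cv_results_.shape)  # (50, 5)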
3 changes: 3 additions & 0 deletions notebooks/parameter_tuning_grid_search.ipynb
@@ -157,6 +157,9 @@
"preprocessor = ColumnTransformer(\n",
" [(\"cat_preprocessor\", categorical_preprocessor, categorical_columns)],\n",
" remainder=\"passthrough\",\n",
" # Silence a deprecation warning in scikit-learn v1.6 related to how the\n",
" # ColumnTransformer stores an attribute that we do not use in this notebook\n",
" force_int_remainder_cols=False,\n",
")"
]
},
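
The same one-line fix appears in the two notebooks below. A standalone sketch
of what the flag does (the column names are illustrative):

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

df = pd.DataFrame({"cat": ["a", "b", "a"], "num": [1.0, 2.0, 3.0]})
preprocessor = ColumnTransformer(
    [("cat_preprocessor", OrdinalEncoder(), ["cat"])],
    remainder="passthrough",
    # Setting the flag explicitly avoids the scikit-learn 1.6 FutureWarning
    # about how remainder columns are stored internally.
    force_int_remainder_cols=False,
)
print(preprocessor.fit_transform(df))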
1 change: 1 addition & 0 deletions notebooks/parameter_tuning_nested.ipynb
@@ -70,6 +70,7 @@
" (\"cat_preprocessor\", categorical_preprocessor, categorical_columns),\n",
" ],\n",
" remainder=\"passthrough\",\n",
" force_int_remainder_cols=False, # Silence a warning in scikit-learn v1.6.\n",
")"
]
},
1 change: 1 addition & 0 deletions notebooks/parameter_tuning_randomized_search.ipynb
@@ -121,6 +121,7 @@
"preprocessor = ColumnTransformer(\n",
" [(\"cat_preprocessor\", categorical_preprocessor, categorical_columns)],\n",
" remainder=\"passthrough\",\n",
" force_int_remainder_cols=False, # Silence a warning in scikit-learn v1.6.\n",
")"
]
},
6 changes: 3 additions & 3 deletions notebooks/trees_ex_01.ipynb
@@ -83,9 +83,9 @@
"<div class=\"admonition warning alert alert-danger\">\n",
"<p class=\"first admonition-title\" style=\"font-weight: bold;\">Warning</p>\n",
"<p class=\"last\">At this time, it is not possible to use <tt class=\"docutils literal\"><span class=\"pre\">response_method=\"predict_proba\"</span></tt> for\n",
"multiclass problems. This is a planned feature for a future version of\n",
"scikit-learn. In the mean time, you can use <tt class=\"docutils literal\"><span class=\"pre\">response_method=\"predict\"</span></tt>\n",
"instead.</p>\n",
"multiclass problems on a single plot. This is a planned feature for a future\n",
"version of scikit-learn. In the mean time, you can use\n",
"<tt class=\"docutils literal\"><span class=\"pre\">response_method=\"predict\"</span></tt> instead.</p>\n",
"</div>"
]
},
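
Finally, a minimal sketch of what the amended warning refers to (the iris
setup is illustrative): a single multiclass decision-boundary plot works with
response_method="predict", while "predict_proba" would require one plot per
class.

from sklearn.datasets import load_iris
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
X = X[:, :2]  # keep two features so the boundary can be drawn in 2D
tree = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y)

# "predict" colors each region by the predicted class on a single plot;
# "predict_proba" is not supported for multiclass problems on one plot.
display = DecisionBoundaryDisplay.from_estimator(
    tree, X, response_method="predict", cmap="tab10"
)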