
Feature branch to update to 1.6 #813


Merged
10 commits merged on May 27, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
# exclude datasets and externals
notebooks/datasets
notebooks/joblib/
wrap-up/

# jupyter-book
jupyter-book/_build
5 changes: 5 additions & 0 deletions Makefile
@@ -1,6 +1,7 @@
PYTHON_SCRIPTS_DIR = python_scripts
NOTEBOOKS_DIR = notebooks
JUPYTER_BOOK_DIR = jupyter-book
WRAP_UP_DIR = wrap-up
JUPYTER_KERNEL := python3
MINIMAL_NOTEBOOK_FILES = $(shell ls $(PYTHON_SCRIPTS_DIR)/*.py | perl -pe "s@$(PYTHON_SCRIPTS_DIR)@$(NOTEBOOKS_DIR)@" | perl -pe "s@\.py@.ipynb@")

@@ -37,6 +38,10 @@ quizzes:
full-index:
python build_tools/generate-index.py

run-code-in-wrap-up-quizzes:
python build_tools/generate-wrap-up.py $(GITLAB_REPO_JUPYTERBOOK_DIR) $(WRAP_UP_DIR)
jupytext --execute --to notebook $(WRAP_UP_DIR)/*.py

$(JUPYTER_BOOK_DIR):
jupyter-book build $(JUPYTER_BOOK_DIR)
rm -rf $(JUPYTER_BOOK_DIR)/_build/html/{slides,figures} && cp -r slides figures $(JUPYTER_BOOK_DIR)/_build/html
107 changes: 107 additions & 0 deletions build_tools/generate-wrap-up.py
@@ -0,0 +1,107 @@
import sys
import os
import glob


def extract_python_code_blocks(md_file_path):
"""
Extract Python code blocks from a markdown file.

Args:
md_file_path (str): Path to the markdown file

Returns:
list: List of extracted Python code blocks
"""
code_blocks = []
in_python_block = False
current_block = []

with open(md_file_path, "r", encoding="utf-8") as file:
for line in file:
line = line.rstrip("\n")

if line.strip() == "```python":
in_python_block = True
current_block = []
elif line.strip() == "```" and in_python_block:
in_python_block = False
code_blocks.append("\n".join(current_block))
elif in_python_block:
current_block.append(line)

return code_blocks


def write_jupyter_notebook_file(
code_blocks, output_file="notebook_from_md.py"
):
"""
Writes extracted code blocks to a Python file formatted as Jupyter notebook cells.

Args:
code_blocks (list): List of code blocks to write
output_file (str): Path to the output file
"""
with open(output_file, "w", encoding="utf-8") as file:
file.write(
"# %% [markdown] \n # ## Notebook generated from Markdown file\n\n"
)

for i, block in enumerate(code_blocks, 1):
file.write(f"# %% [markdown]\n# ## Cell {i}\n\n# %%\n{block}\n\n")

print(
f"Successfully wrote {len(code_blocks)} code cells to"
f" {output_file}"
)


def process_quiz_files(input_path, output_dir):
"""
Process all wrap_up_quiz files in the input path and convert them to notebooks.

Args:
input_path (str): Path to look for wrap_up_quiz files in subfolders
output_dir (str): Directory to write the generated notebooks
"""
# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
os.makedirs(output_dir)
print(f"Created output directory: {output_dir}")

# Find all files containing "wrap_up_quiz" in their name in the input path subfolders
quiz_files = glob.glob(
f"{input_path}/**/*wrap_up_quiz*.md", recursive=True
)

if not quiz_files:
print(f"No wrap_up_quiz.md files found in {input_path} subfolders.")
return

print(f"Found {len(quiz_files)} wrap_up_quiz files to process.")

# Process each file
for md_file_path in quiz_files:
print(f"\nProcessing: {md_file_path}")

# Extract code blocks
code_blocks = extract_python_code_blocks(md_file_path)

# Generate output filename
subfolder = md_file_path.split(os.sep)[3]  # subfolder name (assumes a fixed input path depth)
output_file = os.path.join(output_dir, f"{subfolder}_wrap_up_quiz.py")

# Display results and write notebook file
if code_blocks:
print(f"Found {len(code_blocks)} Python code blocks")
write_jupyter_notebook_file(code_blocks, output_file=output_file)
else:
print(f"No Python code blocks found in {md_file_path}.")


if __name__ == "__main__":
input_path = sys.argv[1]
output_dir = sys.argv[2]

process_quiz_files(input_path, output_dir)
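
For context, the Makefile target above invokes this script as `python
build_tools/generate-wrap-up.py <input_path> <output_dir>`. As a quick sanity
check (not part of the PR; the hyphenated filename cannot be imported as a
module, so the two helpers are easiest to exercise by pasting them into a
session), the following illustrative snippet round-trips one code block:

import os
import tempfile

# Assumes extract_python_code_blocks and write_jupyter_notebook_file
# (defined above) are available in the session.
md_text = "\n".join([
    "# Wrap-up quiz",
    "Some question text.",
    "```python",
    "result = 1 + 1",
    "```",
])
with tempfile.TemporaryDirectory() as tmp_dir:
    md_path = os.path.join(tmp_dir, "wrap_up_quiz.md")
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(md_text)

    blocks = extract_python_code_blocks(md_path)
    assert blocks == ["result = 1 + 1"]

    # Emit the extracted blocks in jupytext percent format, as the script does.
    write_jupyter_notebook_file(
        blocks, output_file=os.path.join(tmp_dir, "quiz.py")
    )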
21 changes: 1 addition & 20 deletions notebooks/03_categorical_pipeline_ex_02.ipynb
@@ -160,26 +160,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### Analysis\n",
"\n",
"From an accuracy point of view, the result is almost exactly the same. The\n",
"reason is that `HistGradientBoostingClassifier` is expressive and robust\n",
"enough to deal with misleading ordering of integer coded categories (which was\n",
"not the case for linear models).\n",
"\n",
"However from a computation point of view, the training time is much longer:\n",
"this is caused by the fact that `OneHotEncoder` generates more features than\n",
"`OrdinalEncoder`; for each unique categorical value a column is created.\n",
"\n",
"Note that the current implementation `HistGradientBoostingClassifier` is still\n",
"incomplete, and once sparse representation are handled correctly, training\n",
"time might improve with such kinds of encodings.\n",
"\n",
"The main take away message is that arbitrary integer coding of categories is\n",
"perfectly fine for `HistGradientBoostingClassifier` and yields fast training\n",
"times.\n",
"\n",
"Which encoder should I use?\n",
"## Which encoder should I use?\n",
"\n",
"| | Meaningful order | Non-meaningful order |\n",
"| ---------------- | ----------------------------- | -------------------- |\n",
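
For reference, the notebook's point that arbitrary integer coding of
categories is fine for `HistGradientBoostingClassifier`, while one-hot
encoding mainly adds features and training time, can be sketched as follows
(the toy dataset is illustrative only):

import pandas as pd
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

X = pd.DataFrame({"color": ["red", "blue", "green", "blue"] * 25})
y = [0, 1, 0, 1] * 25

# Ordinal codes: one column; arbitrary order is harmless for tree ensembles.
ordinal_model = make_pipeline(
    OrdinalEncoder(), HistGradientBoostingClassifier(random_state=0)
)
# One-hot codes: one column per category, hence a wider matrix to fit on.
onehot_model = make_pipeline(
    OneHotEncoder(sparse_output=False),
    HistGradientBoostingClassifier(random_state=0),
)
for model in (ordinal_model, onehot_model):
    print(model.fit(X, y).score(X, y))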
7 changes: 4 additions & 3 deletions notebooks/cross_validation_grouping.ipynb
@@ -189,9 +189,10 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"If we read carefully, 13 writers wrote the digits of our dataset, accounting\n",
"for a total amount of 1797 samples. Thus, a writer wrote several times the\n",
"same numbers. Let's suppose that the writer samples are grouped. Subsequently,\n",
"If we read carefully, `load_digits` loads a copy of the **test set** of the\n",
"UCI ML hand-written digits dataset, which consists of 1797 images by\n",
"**13 different writers**. Thus, each writer wrote several times the same\n",
"numbers. Let's suppose the dataset is ordered by writer. Subsequently,\n",
"not shuffling the data will keep all writer samples together either in the\n",
"training or the testing sets. Mixing the data will break this structure, and\n",
"therefore digits written by the same writer will be available in both the\n",
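
A minimal sketch of the grouping idea described above; `load_digits` does not
expose the actual writer ids, so the 13 contiguous group labels below are
fabricated purely to illustrate the API:

import numpy as np
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold, cross_val_score

X, y = load_digits(return_X_y=True)
# Stand-in writer labels: 13 contiguous blocks over the 1797 samples.
groups = (np.arange(len(y)) * 13) // len(y)

# GroupKFold keeps all samples sharing a group label on the same side of
# each train/test split.
scores = cross_val_score(
    LogisticRegression(max_iter=1_000),
    X,
    y,
    groups=groups,
    cv=GroupKFold(n_splits=5),
)
print(scores.mean())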
2 changes: 1 addition & 1 deletion notebooks/datasets_bike_rides.ipynb
@@ -271,7 +271,7 @@
"metadata": {},
"outputs": [],
"source": [
"data_ride.resample(\"60S\").mean().plot()\n",
"data_ride.resample(\"60s\").mean().plot()\n",
"plt.legend(bbox_to_anchor=(1.05, 1), loc=\"upper left\")\n",
"_ = plt.title(\"Sensor values for different cyclist measurements\")"
]
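
For context, this one-character change tracks a pandas deprecation: uppercase
offset aliases such as "60S" now emit a FutureWarning, and lowercase "60s" is
the forward-compatible spelling. A self-contained sketch:

import pandas as pd

index = pd.date_range("2025-01-01", periods=180, freq="s")
series = pd.Series(range(180), index=index)
# "60s" buckets the one-second samples into one-minute means; the uppercase
# "60S" spelling is deprecated in recent pandas versions.
print(series.resample("60s").mean())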
2 changes: 1 addition & 1 deletion notebooks/ensemble_adaboost.ipynb
@@ -271,7 +271,7 @@
"\n",
"estimator = DecisionTreeClassifier(max_depth=3, random_state=0)\n",
"adaboost = AdaBoostClassifier(\n",
" estimator=estimator, n_estimators=3, algorithm=\"SAMME\", random_state=0\n",
" estimator=estimator, n_estimators=3, random_state=0\n",
")\n",
"adaboost.fit(data, target)"
]
24 changes: 21 additions & 3 deletions notebooks/ensemble_ex_03.ipynb
@@ -107,6 +107,24 @@
"ensemble. However, the scores reach a plateau where adding new trees just\n",
"makes fitting and scoring slower.\n",
"\n",
"Now repeat the analysis for the gradient boosting model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"lines_to_next_cell": 2
},
"outputs": [],
"source": [
"# Write your code here."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Gradient boosting models overfit when the number of trees is too large. To\n",
"avoid adding a new unnecessary tree, unlike random-forest gradient-boosting\n",
"offers an early-stopping option. Internally, the algorithm uses an\n",
@@ -115,9 +133,9 @@
"improving for several iterations, it stops adding trees.\n",
"\n",
"Now, create a gradient-boosting model with `n_estimators=1_000`. This number\n",
"of trees is certainly too large. Change the parameter `n_iter_no_change`\n",
"such that the gradient boosting fitting stops after adding 5 trees to avoid\n",
"deterioration of the overall generalization performance."
"of trees is certainly too large as we have seen above. Change the parameter\n",
"`n_iter_no_change` such that the gradient boosting fitting stops after adding\n",
"5 trees to avoid deterioration of the overall generalization performance."
]
},
{
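
As a side note, the early stopping that the exercise asks for can be sketched
as follows (the synthetic dataset is illustrative; the notebook leaves the
actual code to the reader):

from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_regression(n_samples=1_000, noise=10, random_state=0)

# n_estimators is deliberately too large; with n_iter_no_change set, fitting
# stops once the internal validation score has not improved for 5 iterations.
model = GradientBoostingRegressor(
    n_estimators=1_000, n_iter_no_change=5, random_state=0
)
model.fit(X, y)
print(model.n_estimators_)  # trees actually fitted, usually far below 1_000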
6 changes: 3 additions & 3 deletions notebooks/linear_models_regularization.ipynb
@@ -618,7 +618,7 @@
"ridge = make_pipeline(\n",
" MinMaxScaler(),\n",
" PolynomialFeatures(degree=2, include_bias=False),\n",
" RidgeCV(alphas=alphas, store_cv_values=True),\n",
" RidgeCV(alphas=alphas, store_cv_results=True),\n",
")"
]
},
@@ -677,7 +677,7 @@
"It indicates that our model is not overfitting.\n",
"\n",
"When fitting the ridge regressor, we also requested to store the error found\n",
"during cross-validation (by setting the parameter `store_cv_values=True`). We\n",
"during cross-validation (by setting the parameter `store_cv_results=True`). We\n",
"can plot the mean squared error for the different `alphas` regularization\n",
"strengths that we tried. The error bars represent one standard deviation of the\n",
"average mean square error across folds for a given value of `alpha`."
@@ -690,7 +690,7 @@
"outputs": [],
"source": [
"mse_alphas = [\n",
" est[-1].cv_values_.mean(axis=0) for est in cv_results[\"estimator\"]\n",
" est[-1].cv_results_.mean(axis=0) for est in cv_results[\"estimator\"]\n",
"]\n",
"cv_alphas = pd.DataFrame(mse_alphas, columns=alphas)\n",
"cv_alphas = cv_alphas.aggregate([\"mean\", \"std\"]).T\n",
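
For context, this rename tracks scikit-learn deprecating `store_cv_values` and
`cv_values_` on `RidgeCV` in favor of `store_cv_results` and `cv_results_`. A
standalone sketch (the data is illustrative):

import numpy as np
from sklearn.linear_model import RidgeCV

rng = np.random.RandomState(0)
X = rng.normal(size=(50, 3))
y = X @ np.array([1.0, 2.0, 3.0]) + rng.normal(scale=0.1, size=50)

ridge = RidgeCV(alphas=np.logspace(-2, 2, 5), store_cv_results=True).fit(X, y)
# With the default leave-one-out CV, cv_results_ holds one row per sample and
# one column per candidate alpha.
print(ridge.cv_results_.shape)  # (50, 5)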
3 changes: 3 additions & 0 deletions notebooks/parameter_tuning_grid_search.ipynb
@@ -157,6 +157,9 @@
"preprocessor = ColumnTransformer(\n",
" [(\"cat_preprocessor\", categorical_preprocessor, categorical_columns)],\n",
" remainder=\"passthrough\",\n",
" # Silence a deprecation warning in scikit-learn v1.6 related to how the\n",
" # ColumnTransformer stores an attribute that we do not use in this notebook\n",
" force_int_remainder_cols=False,\n",
")"
]
},
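
The same one-line fix appears in the two notebooks below. A standalone sketch
of what the flag does (the column names are illustrative):

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

df = pd.DataFrame({"cat": ["a", "b", "a"], "num": [1.0, 2.0, 3.0]})
preprocessor = ColumnTransformer(
    [("cat_preprocessor", OrdinalEncoder(), ["cat"])],
    remainder="passthrough",
    # Setting the flag explicitly avoids the scikit-learn 1.6 FutureWarning
    # about how remainder columns are stored internally.
    force_int_remainder_cols=False,
)
print(preprocessor.fit_transform(df))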
1 change: 1 addition & 0 deletions notebooks/parameter_tuning_nested.ipynb
@@ -70,6 +70,7 @@
" (\"cat_preprocessor\", categorical_preprocessor, categorical_columns),\n",
" ],\n",
" remainder=\"passthrough\",\n",
" force_int_remainder_cols=False, # Silence a warning in scikit-learn v1.6.\n",
")"
]
},
1 change: 1 addition & 0 deletions notebooks/parameter_tuning_randomized_search.ipynb
@@ -121,6 +121,7 @@
"preprocessor = ColumnTransformer(\n",
" [(\"cat_preprocessor\", categorical_preprocessor, categorical_columns)],\n",
" remainder=\"passthrough\",\n",
" force_int_remainder_cols=False, # Silence a warning in scikit-learn v1.6.\n",
")"
]
},
6 changes: 3 additions & 3 deletions notebooks/trees_ex_01.ipynb
@@ -83,9 +83,9 @@
"<div class=\"admonition warning alert alert-danger\">\n",
"<p class=\"first admonition-title\" style=\"font-weight: bold;\">Warning</p>\n",
"<p class=\"last\">At this time, it is not possible to use <tt class=\"docutils literal\"><span class=\"pre\">response_method=\"predict_proba\"</span></tt> for\n",
"multiclass problems. This is a planned feature for a future version of\n",
"scikit-learn. In the mean time, you can use <tt class=\"docutils literal\"><span class=\"pre\">response_method=\"predict\"</span></tt>\n",
"instead.</p>\n",
"multiclass problems on a single plot. This is a planned feature for a future\n",
"version of scikit-learn. In the mean time, you can use\n",
"<tt class=\"docutils literal\"><span class=\"pre\">response_method=\"predict\"</span></tt> instead.</p>\n",
"</div>"
]
},
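
Finally, a minimal sketch of what the amended warning refers to (the iris
setup is illustrative): a single multiclass decision-boundary plot works with
response_method="predict", while "predict_proba" would require one plot per
class.

from sklearn.datasets import load_iris
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
X = X[:, :2]  # keep two features so the boundary can be drawn in 2D
tree = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y)

# "predict" colors each region by the predicted class on a single plot;
# "predict_proba" is not supported for multiclass problems on one plot.
display = DecisionBoundaryDisplay.from_estimator(
    tree, X, response_method="predict", cmap="tab10"
)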