
Feature branch to update to 1.6 #813


Status: Draft. Wants to merge 8 commits into base: main
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
# exclude datasets and externals
notebooks/datasets
notebooks/joblib/
wrap-up/

# jupyter-book
jupyter-book/_build
5 changes: 5 additions & 0 deletions Makefile
@@ -1,6 +1,7 @@
PYTHON_SCRIPTS_DIR = python_scripts
NOTEBOOKS_DIR = notebooks
JUPYTER_BOOK_DIR = jupyter-book
WRAP_UP_DIR = wrap-up
JUPYTER_KERNEL := python3
MINIMAL_NOTEBOOK_FILES = $(shell ls $(PYTHON_SCRIPTS_DIR)/*.py | perl -pe "s@$(PYTHON_SCRIPTS_DIR)@$(NOTEBOOKS_DIR)@" | perl -pe "s@\.py@.ipynb@")

@@ -37,6 +38,10 @@ quizzes:
full-index:
	python build_tools/generate-index.py

run-code-in-wrap-up-quizzes:
	python build_tools/generate-wrap-up.py $(GITLAB_REPO_JUPYTERBOOK_DIR) $(WRAP_UP_DIR)
	jupytext --execute --to notebook $(WRAP_UP_DIR)/*.py

$(JUPYTER_BOOK_DIR):
	jupyter-book build $(JUPYTER_BOOK_DIR)
	rm -rf $(JUPYTER_BOOK_DIR)/_build/html/{slides,figures} && cp -r slides figures $(JUPYTER_BOOK_DIR)/_build/html
107 changes: 107 additions & 0 deletions build_tools/generate-wrap-up.py
@@ -0,0 +1,107 @@
import sys
import os
import glob


def extract_python_code_blocks(md_file_path):
    """
    Extract Python code blocks from a markdown file.

    Args:
        md_file_path (str): Path to the markdown file

    Returns:
        list: List of extracted Python code blocks
    """
    code_blocks = []
    in_python_block = False
    current_block = []

    with open(md_file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.rstrip("\n")

            if line.strip() == "```python":
                in_python_block = True
                current_block = []
            elif line.strip() == "```" and in_python_block:
                in_python_block = False
                code_blocks.append("\n".join(current_block))
            elif in_python_block:
                current_block.append(line)

    return code_blocks


def write_jupyter_notebook_file(
    code_blocks, output_file="notebook_from_md.py"
):
    """
    Write extracted code blocks to a Python file formatted as Jupyter
    notebook cells.

    Args:
        code_blocks (list): List of code blocks to write
        output_file (str): Path to the output file
    """
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(
            "# %% [markdown]\n# ## Notebook generated from Markdown file\n\n"
        )

        for i, block in enumerate(code_blocks, 1):
            file.write(f"# %% [markdown]\n# ## Cell {i}\n\n# %%\n{block}\n\n")

    print(
        f"Successfully wrote {len(code_blocks)} code cells to"
        f" {output_file}"
    )


def process_quiz_files(input_path, output_dir):
    """
    Process all wrap_up_quiz files in the input path and convert them to
    notebooks.

    Args:
        input_path (str): Path to look for wrap_up_quiz files in subfolders
        output_dir (str): Directory to write the generated notebooks
    """
    # Create the output directory if it does not exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created output directory: {output_dir}")

    # Find all files whose name contains "wrap_up_quiz" in the subfolders
    # of the input path
    quiz_files = glob.glob(
        f"{input_path}/**/*wrap_up_quiz*.md", recursive=True
    )

    if not quiz_files:
        print(f"No wrap_up_quiz files found in {input_path} subfolders.")
        return

    print(f"Found {len(quiz_files)} wrap_up_quiz files to process.")

    # Process each file
    for md_file_path in quiz_files:
        print(f"\nProcessing: {md_file_path}")

        # Extract code blocks
        code_blocks = extract_python_code_blocks(md_file_path)

        # Generate the output filename; note this assumes a fixed directory
        # depth and takes the fourth path component as the module name
        subfolder = md_file_path.split(os.sep)[3]
        output_file = os.path.join(output_dir, f"{subfolder}_wrap_up_quiz.py")

        # Display results and write the notebook file
        if code_blocks:
            print(f"Found {len(code_blocks)} Python code blocks")
            write_jupyter_notebook_file(code_blocks, output_file=output_file)
        else:
            print(f"No Python code blocks found in {md_file_path}.")


if __name__ == "__main__":
    input_path = sys.argv[1]
    output_dir = sys.argv[2]

    process_quiz_files(input_path, output_dir)
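
Note: a minimal usage sketch of the new script, assuming the directory names wired into the Makefile target above; the quiz path below is illustrative, not a file guaranteed to exist in the repo.

# Usage sketch for build_tools/generate-wrap-up.py; the quiz path is
# illustrative and assumes a jupyter-book/<module>/<quiz>.md layout.
blocks = extract_python_code_blocks("jupyter-book/trees/trees_wrap_up_quiz.md")
write_jupyter_notebook_file(blocks, output_file="wrap-up/trees_wrap_up_quiz.py")
# The generated file uses the jupytext "percent" cell format, so the
# Makefile target can then execute it with:
#   jupytext --execute --to notebook wrap-up/*.py
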
2 changes: 1 addition & 1 deletion notebooks/datasets_bike_rides.ipynb
@@ -271,7 +271,7 @@
"metadata": {},
"outputs": [],
"source": [
"data_ride.resample(\"60S\").mean().plot()\n",
"data_ride.resample(\"60s\").mean().plot()\n",
"plt.legend(bbox_to_anchor=(1.05, 1), loc=\"upper left\")\n",
"_ = plt.title(\"Sensor values for different cyclist measurements\")"
]
2 changes: 1 addition & 1 deletion notebooks/ensemble_adaboost.ipynb
@@ -271,7 +271,7 @@
"\n",
"estimator = DecisionTreeClassifier(max_depth=3, random_state=0)\n",
"adaboost = AdaBoostClassifier(\n",
" estimator=estimator, n_estimators=3, algorithm=\"SAMME\", random_state=0\n",
" estimator=estimator, n_estimators=3, random_state=0\n",
")\n",
"adaboost.fit(data, target)"
]
6 changes: 3 additions & 3 deletions notebooks/linear_models_regularization.ipynb
@@ -618,7 +618,7 @@
"ridge = make_pipeline(\n",
" MinMaxScaler(),\n",
" PolynomialFeatures(degree=2, include_bias=False),\n",
" RidgeCV(alphas=alphas, store_cv_values=True),\n",
" RidgeCV(alphas=alphas, store_cv_results=True),\n",
")"
]
},
@@ -677,7 +677,7 @@
"It indicates that our model is not overfitting.\n",
"\n",
"When fitting the ridge regressor, we also requested to store the error found\n",
"during cross-validation (by setting the parameter `store_cv_values=True`). We\n",
"during cross-validation (by setting the parameter `store_cv_results=True`). We\n",
"can plot the mean squared error for the different `alphas` regularization\n",
"strengths that we tried. The error bars represent one standard deviation of the\n",
"average mean square error across folds for a given value of `alpha`."
@@ -690,7 +690,7 @@
"outputs": [],
"source": [
"mse_alphas = [\n",
" est[-1].cv_values_.mean(axis=0) for est in cv_results[\"estimator\"]\n",
" est[-1].cv_results_.mean(axis=0) for est in cv_results[\"estimator\"]\n",
"]\n",
"cv_alphas = pd.DataFrame(mse_alphas, columns=alphas)\n",
"cv_alphas = cv_alphas.aggregate([\"mean\", \"std\"]).T\n",
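
Note: the RidgeCV edits above track a rename shipped in scikit-learn 1.5: `store_cv_values` became `store_cv_results`, and the fitted attribute `cv_values_` became `cv_results_`. A minimal, self-contained sketch of the renamed API on synthetic data:

import numpy as np
from sklearn.linear_model import RidgeCV

# Synthetic data, purely illustrative
rng = np.random.RandomState(0)
X = rng.randn(50, 3)
y = X @ np.array([1.0, 2.0, -1.0]) + rng.randn(50)

ridge = RidgeCV(alphas=np.logspace(-2, 0, num=21), store_cv_results=True)
ridge.fit(X, y)
# Leave-one-out errors per sample and per alpha, shape (n_samples, n_alphas);
# this attribute was named `cv_values_` before the rename.
print(ridge.cv_results_.shape)
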
1 change: 1 addition & 0 deletions notebooks/parameter_tuning_nested.ipynb
@@ -70,6 +70,7 @@
" (\"cat_preprocessor\", categorical_preprocessor, categorical_columns),\n",
" ],\n",
" remainder=\"passthrough\",\n",
" force_int_remainder_cols=False, # Silence a warning in scikit-learn v1.6.\n",
")"
]
},
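
Note: `force_int_remainder_cols=False` (added here and in the notebooks below) opts in to the future ColumnTransformer behavior, so scikit-learn 1.6 no longer warns about it. A standalone sketch with toy data; the column names are illustrative:

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

X = pd.DataFrame({"workclass": ["Private", "State-gov"], "age": [25, 40]})
preprocessor = ColumnTransformer(
    [("cat_preprocessor", OrdinalEncoder(), ["workclass"])],
    remainder="passthrough",
    # With False, the fitted transformers_ attribute refers to the remainder
    # columns by name rather than by integer position (the future default).
    force_int_remainder_cols=False,
)
preprocessor.fit(X)
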
1 change: 1 addition & 0 deletions notebooks/parameter_tuning_randomized_search.ipynb
@@ -121,6 +121,7 @@
"preprocessor = ColumnTransformer(\n",
" [(\"cat_preprocessor\", categorical_preprocessor, categorical_columns)],\n",
" remainder=\"passthrough\",\n",
" force_int_remainder_cols=False, # Silence a warning in scikit-learn v1.6.\n",
")"
]
},
6 changes: 3 additions & 3 deletions notebooks/trees_ex_01.ipynb
@@ -83,9 +83,9 @@
"<div class=\"admonition warning alert alert-danger\">\n",
"<p class=\"first admonition-title\" style=\"font-weight: bold;\">Warning</p>\n",
"<p class=\"last\">At this time, it is not possible to use <tt class=\"docutils literal\"><span class=\"pre\">response_method=\"predict_proba\"</span></tt> for\n",
"multiclass problems. This is a planned feature for a future version of\n",
"scikit-learn. In the mean time, you can use <tt class=\"docutils literal\"><span class=\"pre\">response_method=\"predict\"</span></tt>\n",
"instead.</p>\n",
"multiclass problems on a single plot. This is a planned feature for a future\n",
"version of scikit-learn. In the mean time, you can use\n",
"<tt class=\"docutils literal\"><span class=\"pre\">response_method=\"predict\"</span></tt> instead.</p>\n",
"</div>"
]
},
97 changes: 42 additions & 55 deletions notebooks/trees_sol_01.ipynb
@@ -87,9 +87,9 @@
"<div class=\"admonition warning alert alert-danger\">\n",
"<p class=\"first admonition-title\" style=\"font-weight: bold;\">Warning</p>\n",
"<p class=\"last\">At this time, it is not possible to use <tt class=\"docutils literal\"><span class=\"pre\">response_method=\"predict_proba\"</span></tt> for\n",
"multiclass problems. This is a planned feature for a future version of\n",
"scikit-learn. In the mean time, you can use <tt class=\"docutils literal\"><span class=\"pre\">response_method=\"predict\"</span></tt>\n",
"instead.</p>\n",
"multiclass problems on a single plot. This is a planned feature for a future\n",
"version of scikit-learn. In the mean time, you can use\n",
"<tt class=\"docutils literal\"><span class=\"pre\">response_method=\"predict\"</span></tt> instead.</p>\n",
"</div>"
]
},
@@ -212,12 +212,14 @@
"except that for a K-class problem you have K probability outputs for each\n",
"data point. Visualizing all these on a single plot can quickly become tricky\n",
"to interpret. It is then common to instead produce K separate plots, one for\n",
"each class, in a one-vs-rest (or one-vs-all) fashion.\n",
"each class, in a one-vs-rest (or one-vs-all) fashion. This can be achieved by\n",
"calling `DecisionBoundaryDisplay` several times, once for each class, and\n",
"passing the `class_of_interest` parameter to the function.\n",
"\n",
"For example, in the plot below, the first plot on the left shows in yellow the\n",
"certainty on classifying a data point as belonging to the \"Adelie\" class. In\n",
"the same plot, the spectre from green to purple represents the certainty of\n",
"**not** belonging to the \"Adelie\" class. The same logic applies to the other\n",
"For example, in the plot below, the first plot on the left shows the\n",
"certainty of classifying a data point as belonging to the \"Adelie\" class. The\n",
"darker the color, the more certain the model is that a given point in the\n",
"feature space belongs to a given class. The same logic applies to the other\n",
"plots in the figure."
]
},
@@ -231,48 +233,38 @@
},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"xx = np.linspace(30, 60, 100)\n",
"yy = np.linspace(10, 23, 100)\n",
"xx, yy = np.meshgrid(xx, yy)\n",
"Xfull = pd.DataFrame(\n",
" {\"Culmen Length (mm)\": xx.ravel(), \"Culmen Depth (mm)\": yy.ravel()}\n",
")\n",
"\n",
"probas = tree.predict_proba(Xfull)\n",
"n_classes = len(np.unique(tree.classes_))\n",
"from matplotlib import cm\n",
"\n",
"_, axs = plt.subplots(ncols=3, nrows=1, sharey=True, figsize=(12, 5))\n",
"plt.suptitle(\"Predicted probabilities for decision tree model\", y=0.8)\n",
"plt.suptitle(\"Predicted probabilities for decision tree model\", y=1.05)\n",
"plt.subplots_adjust(bottom=0.45)\n",
"\n",
"for class_of_interest in range(n_classes):\n",
" axs[class_of_interest].set_title(\n",
" f\"Class {tree.classes_[class_of_interest]}\"\n",
" )\n",
" imshow_handle = axs[class_of_interest].imshow(\n",
" probas[:, class_of_interest].reshape((100, 100)),\n",
" extent=(30, 60, 10, 23),\n",
" vmin=0.0,\n",
" vmax=1.0,\n",
" origin=\"lower\",\n",
" cmap=\"viridis\",\n",
"for idx, (class_of_interest, ax) in enumerate(zip(tree.classes_, axs)):\n",
" ax.set_title(f\"Class {class_of_interest}\")\n",
" DecisionBoundaryDisplay.from_estimator(\n",
" tree,\n",
" data_test,\n",
" response_method=\"predict_proba\",\n",
" class_of_interest=class_of_interest,\n",
" ax=ax,\n",
" vmin=0,\n",
" vmax=1,\n",
" cmap=\"Blues\",\n",
" )\n",
" axs[class_of_interest].set_xlabel(\"Culmen Length (mm)\")\n",
" if class_of_interest == 0:\n",
" axs[class_of_interest].set_ylabel(\"Culmen Depth (mm)\")\n",
" idx = target_test == tree.classes_[class_of_interest]\n",
" axs[class_of_interest].scatter(\n",
" data_test[\"Culmen Length (mm)\"].loc[idx],\n",
" data_test[\"Culmen Depth (mm)\"].loc[idx],\n",
" ax.scatter(\n",
" data_test[\"Culmen Length (mm)\"].loc[target_test == class_of_interest],\n",
" data_test[\"Culmen Depth (mm)\"].loc[target_test == class_of_interest],\n",
" marker=\"o\",\n",
" c=\"w\",\n",
" edgecolor=\"k\",\n",
" )\n",
" ax.set_xlabel(\"Culmen Length (mm)\")\n",
" if idx == 0:\n",
" ax.set_ylabel(\"Culmen Depth (mm)\")\n",
"\n",
"ax = plt.axes([0.15, 0.04, 0.7, 0.05])\n",
"plt.colorbar(imshow_handle, cax=ax, orientation=\"horizontal\")\n",
"_ = plt.title(\"Probability\")"
"ax = plt.axes([0.15, 0.14, 0.7, 0.05])\n",
"plt.colorbar(cm.ScalarMappable(cmap=\"Blues\"), cax=ax, orientation=\"horizontal\")\n",
"_ = ax.set_title(\"Predicted class membership probability\")"
]
},
{
@@ -283,22 +275,17 @@
]
},
"source": [
"\n",
"<div class=\"admonition note alert alert-info\">\n",
"<p class=\"first admonition-title\" style=\"font-weight: bold;\">Note</p>\n",
"<p class=\"last\">You may have noticed that we are no longer using a diverging colormap. Indeed,\n",
"the chance level for a one-vs-rest binarization of the multi-class\n",
"classification problem is almost never at predicted probability of 0.5. So\n",
"using a colormap with a neutral white at 0.5 might give a false impression on\n",
"the certainty.</p>\n",
"</div>\n",
"\n",
"In future versions of scikit-learn `DecisionBoundaryDisplay` will support a\n",
"`class_of_interest` parameter that will allow in particular for a\n",
"visualization of `predict_proba` in multi-class settings.\n",
"\n",
"We also plan to make it possible to visualize the `predict_proba` values for\n",
"the class with the maximum predicted probability (without having to pass a\n",
"given a fixed `class_of_interest` value)."
"<p class=\"last\">You may notice that we do not use a diverging colormap (2 color gradients with\n",
"white in the middle). Indeed, in a multiclass setting, 0.5 is not a\n",
"meaningful value, hence using white as the center of the colormap is not\n",
"appropriate. Instead, we use a sequential colormap, where the color intensity\n",
"indicates the certainty of the classification. The darker the color, the more\n",
"certain the model is that a given point in the feature space belongs to a\n",
"given class.</p>\n",
"</div>"
]
}
],
2 changes: 1 addition & 1 deletion python_scripts/datasets_bike_rides.py
@@ -155,7 +155,7 @@
# smoother visualization.

# %%
data_ride.resample("60S").mean().plot()
data_ride.resample("60s").mean().plot()
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
_ = plt.title("Sensor values for different cyclist measurements")

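
Note: the "60S" to "60s" edit follows pandas, which deprecates the uppercase second alias as of version 2.2. A tiny self-contained sketch; the frame below is synthetic and stands in for the time-indexed data_ride:

import pandas as pd

index = pd.date_range("2020-08-18 14:00", periods=6, freq="20s")
frame = pd.DataFrame({"speed": [3.0, 3.2, 3.1, 2.9, 3.3, 3.0]}, index=index)
frame.resample("60s").mean()  # resample("60S") now emits a FutureWarning
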
2 changes: 1 addition & 1 deletion python_scripts/ensemble_adaboost.py
@@ -190,7 +190,7 @@

estimator = DecisionTreeClassifier(max_depth=3, random_state=0)
adaboost = AdaBoostClassifier(
    estimator=estimator, n_estimators=3, algorithm="SAMME", random_state=0
    estimator=estimator, n_estimators=3, random_state=0
)
adaboost.fit(data, target)

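
Note: dropping `algorithm="SAMME"` matches scikit-learn 1.6, where the `algorithm` parameter of AdaBoostClassifier is deprecated because SAMME is the only remaining implementation. A short sketch on toy data (illustrative only):

from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=100, random_state=0)
estimator = DecisionTreeClassifier(max_depth=3, random_state=0)
# Passing algorithm="SAMME" here would emit a FutureWarning in 1.6.
adaboost = AdaBoostClassifier(estimator=estimator, n_estimators=3, random_state=0)
adaboost.fit(X, y)
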
6 changes: 3 additions & 3 deletions python_scripts/linear_models_regularization.py
@@ -421,7 +421,7 @@
ridge = make_pipeline(
    MinMaxScaler(),
    PolynomialFeatures(degree=2, include_bias=False),
    RidgeCV(alphas=alphas, store_cv_values=True),
    RidgeCV(alphas=alphas, store_cv_results=True),
)

# %%
@@ -458,14 +458,14 @@
# It indicates that our model is not overfitting.
#
# When fitting the ridge regressor, we also requested to store the error found
# during cross-validation (by setting the parameter `store_cv_values=True`). We
# during cross-validation (by setting the parameter `store_cv_results=True`). We
# can plot the mean squared error for the different `alphas` regularization
# strengths that we tried. The error bars represent one standard deviation of the
# average mean square error across folds for a given value of `alpha`.

# %%
mse_alphas = [
    est[-1].cv_values_.mean(axis=0) for est in cv_results["estimator"]
    est[-1].cv_results_.mean(axis=0) for est in cv_results["estimator"]
]
cv_alphas = pd.DataFrame(mse_alphas, columns=alphas)
cv_alphas = cv_alphas.aggregate(["mean", "std"]).T
3 changes: 3 additions & 0 deletions python_scripts/parameter_tuning_grid_search.py
@@ -89,6 +89,9 @@
preprocessor = ColumnTransformer(
    [("cat_preprocessor", categorical_preprocessor, categorical_columns)],
    remainder="passthrough",
    # Silence a deprecation warning in scikit-learn v1.6 related to how the
    # ColumnTransformer stores an attribute that we do not use in this notebook
    force_int_remainder_cols=False,
)

# %% [markdown]