59 |
60 | # #######################################################################################
61 | # Interact link settings
62 | notebook_interface : "notebook"
63 | # notebook_interface: "classic" # The interface interactive links will activate ["classic", "jupyterlab"]
64 |
65 | sphinx:
66 | config:
67 | nb_custom_formats:
68 | .py:
69 | - jupytext.reads
70 | - fmt: py:percent
71 | # Needed for plotly rendering:
72 | # https://jupyterbook.org/interactive/interactive.html#plotly
73 | html_js_files:
74 | - https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js
75 |
76 | #######################################################################################
77 | # Launch button settings
78 | repository:
79 | url : https://github.com/INRIA/scikit-learn-mooc
80 | branch: main
81 |
82 | launch_buttons:
83 | binderhub_url: "https://mybinder.org"
84 | # colab_url: "https://colab.research.google.com" # Not working for now,
85 | # because it needs .ipynb
86 | # Disable thebe support since it does not start in the right folder, see
87 | # https://github.com/INRIA/scikit-learn-mooc/issues/669 for more details
88 | # thebe: true
89 |
90 | binder:
91 | binderhub_url : "https://mybinder.org"
92 | text : "Launch binder"
93 |
94 |
95 | latex:
96 | latex_engine : "xelatex"
97 | latex_documents:
98 | targetname: book.tex
99 |
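100 | # For reference, the nb_custom_formats entry above makes jupytext parse .py
101 | # files written in the "py:percent" format as notebooks. A minimal sketch of
102 | # such a script (hypothetical content) looks like:
103 | #
104 | #   # %% [markdown]
105 | #   # Some narrative text rendered as a Markdown cell.
106 | #
107 | #   # %%
108 | #   print("a code cell")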
--------------------------------------------------------------------------------
/jupyter-book/_static/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/jupyter-book/_static/favicon.ico
--------------------------------------------------------------------------------
/jupyter-book/_static/matomo.js:
--------------------------------------------------------------------------------
1 | var _paq = window._paq = window._paq || [];
2 | /* tracker methods like "setCustomDimension" should be called before "trackPageView" */
3 | _paq.push(['trackPageView']);
4 | _paq.push(['enableLinkTracking']);
5 | (function() {
6 | var u = "https://piwik.inria.fr/";
7 | _paq.push(['setTrackerUrl', u + 'piwik.php']);
8 | _paq.push(['setSiteId', '127']);
9 | var d = document,
10 | g = d.createElement('script'),
11 | s = d.getElementsByTagName('script')[0];
12 | g.async = true; // load the tracker script without blocking page rendering
13 | g.src = u + 'piwik.js';
14 | s.parentNode.insertBefore(g, s); // inject before the first script element
15 | })();
16 |
--------------------------------------------------------------------------------
/jupyter-book/_static/sklearn_mooc.css:
--------------------------------------------------------------------------------
1 |
2 | /*
3 | Note: the video and slides iframes currently use the same CSS styles but use
4 | different classes to get future-proof flexibility.
5 | */
6 |
7 | iframe.video {
8 | width: 100%;
9 | aspect-ratio: 4/3;
10 | margin-bottom: 1em;
11 | }
12 |
13 | iframe.slides {
14 | width: 100%;
15 | aspect-ratio: 4/3;
16 | margin-bottom: 1em;
17 | }
18 |
19 | /*
20 | Better highlighting of modules in toc.html; for some reason modules
21 | have aria-level="2" rather than being h2 elements
22 | */
23 | p[aria-level="2"] {
24 | font-size: 1.2em;
25 | margin-top: 2em;
26 | margin-bottom: 0.5em;
27 | font-weight: bold;
28 | }
29 |
30 | /* The ads on the landing page */
31 |
32 | div.mooc_add {
33 | display: table;
34 | }
35 |
36 | div.mooc_add a {
37 | color: #000000;
38 | display: block;
39 | border-radius: .4em;
40 | background-color: #F7931E;
41 | border: 1px solid #7b5a46;
42 | box-shadow: 1px 1px 1px #CA9875;
43 | padding: 5pt;
44 | }
45 |
46 | @media screen and (min-width: 900px) {
47 | div.mooc_add {
48 | width: 25ex;
49 | position: fixed;
50 | right: calc(5pt + .15 * (100vw - 900px));
51 | bottom: calc(5pt + max(0pt, .05*(100vh - 200px)));
52 | }
53 |
54 |
55 | div.footer {
56 | max-width: 60vw;
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/jupyter-book/_static/sklearn_mooc.js:
--------------------------------------------------------------------------------
1 | (function() {
2 | function inIframe() {
3 | try {
4 | return window.self !== window.top;
5 | } catch (e) {
6 | return true;
7 | }
8 | }
9 |
10 | function contentOnly() {
11 | var urlParams = new URLSearchParams(window.location.search);
12 | return urlParams.get('content_only') !== null;
13 | }
14 |
15 | function removeIfExists(el) {
16 | if (el) {
17 | el.remove();
18 |     }
19 | }
20 |
21 | function adjustBinderLink() {
22 |     // Rewrite Binder links to point to the .ipynb notebooks rather than
23 |     // the .py scripts. In an ideal world, there would be a way to do this
24 |     // in _config.yml, or you could tell Jupyter to open the .py with the
25 |     // Notebook interface, but ?factory=Notebook does not work on the
26 |     // mybinder.org URL, only on the hub.2i2c.mybinder.org URL.
27 | var elements = document.querySelectorAll('.dropdown-launch-buttons a');
28 | elements.forEach(
29 | function(el) {
30 | el.href = el.href.replace(/python_scripts\/(.+)\.py/, "notebooks/$1.ipynb");
31 | }
32 | );
33 | }
34 |
35 | function displayContentOnly() {
36 | removeIfExists(document.querySelector('#site-navigation'));
37 | removeIfExists(document.querySelector('.topbar'));
38 | removeIfExists(document.querySelector('.footer'));
39 | // the prev/next buttons at the bottom of the page may have a different
40 | // class (depending on the theme version maybe?), removing both to be
41 | // safe.
42 | removeIfExists(document.querySelector('.prev-next-bottom'));
43 | removeIfExists(document.querySelector('.prev-next-area'));
44 | var elementsToRemove = document.querySelectorAll('.remove-from-content-only');
45 | elementsToRemove.forEach(
46 | function(el) {
47 | removeIfExists(el);
48 | }
49 | );
50 | document.querySelector('#main-content').querySelector('.col-md-9').className = 'col-12';
51 |
52 | var style = document.createElement('style');
53 | style.appendChild(
54 | document.createTextNode(
55 | 'hypothesis-sidebar, hypothesis-notebook, hypothesis-adder{display:none!important;}'));
56 | document.getElementsByTagName('head')[0].appendChild(style);
57 | }
58 |
59 | document.addEventListener("DOMContentLoaded", function() {
60 | if (inIframe() || contentOnly()) {
61 | displayContentOnly();
62 | }
63 | adjustBinderLink();
64 | });
65 | }());
66 |
--------------------------------------------------------------------------------
/jupyter-book/appendix/acknowledgement.md:
--------------------------------------------------------------------------------
1 | # Acknowledgement
2 |
3 | ## Figure attributions
4 |
5 | The diagram presenting the API design in the module "The predictive modeling
6 | pipeline" used the following figures:
7 |
8 | - The "Parameters Free Icon" is licensed under CC-BY 3.0 -
9 | [source](https://www.onlinewebfonts.com/icon/512285)
10 | - The "Settings Gears SVG Vector" is licensed under CC0 -
11 | [source](https://www.svgrepo.com/svg/57066/settings-gears)
12 | - The "Close icon" is licensed under MIT -
13 | [source](https://www.iconfinder.com/icons/211652/close_icon)
14 |
--------------------------------------------------------------------------------
/jupyter-book/appendix/datasets_intro.md:
--------------------------------------------------------------------------------
1 | # Datasets description
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/appendix/notebook_timings.md:
--------------------------------------------------------------------------------
1 | # Notebook timings
2 |
3 | ```{nb-exec-table}
4 | ```
5 |
--------------------------------------------------------------------------------
/jupyter-book/appendix/toc_redirect.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Table of contents
4 |
--------------------------------------------------------------------------------
/jupyter-book/concluding_remarks_video.md:
--------------------------------------------------------------------------------
1 | # 🎥 Concluding remarks
2 |
3 | <!-- embedded video iframe -->
6 |
--------------------------------------------------------------------------------
/jupyter-book/datasets:
--------------------------------------------------------------------------------
1 | ../datasets
--------------------------------------------------------------------------------
/jupyter-book/ensemble/bagging_slides.md:
--------------------------------------------------------------------------------
1 | # 🎥 Intuitions on ensemble models: bagging
2 |
3 | <!-- embedded video iframe -->
6 |
7 | <!-- embedded slides iframe -->
9 |
10 | To navigate in the slides, **first click on the slides**, then:
11 | - press the **arrow keys** to go to the next/previous slide;
12 | - press **"P"** to toggle presenter mode to see the notes;
13 | - press **"F"** to toggle full-screen mode.
14 |
--------------------------------------------------------------------------------
/jupyter-book/ensemble/boosting_slides.md:
--------------------------------------------------------------------------------
1 | # 🎥 Intuitions on ensemble models: boosting
2 |
3 | <!-- embedded video iframe -->
6 |
7 | <!-- embedded slides iframe -->
9 |
10 | To navigate in the slides, **first click on the slides**, then:
11 | - press the **arrow keys** to go to the next/previous slide;
12 | - press **"P"** to toggle presenter mode to see the notes;
13 | - press **"F"** to toggle full-screen mode.
14 |
--------------------------------------------------------------------------------
/jupyter-book/ensemble/ensemble_boosting_index.md:
--------------------------------------------------------------------------------
1 | # Ensemble based on boosting
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/ensemble/ensemble_bootstrap_index.md:
--------------------------------------------------------------------------------
1 | # Ensemble method using bootstrapping
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/ensemble/ensemble_hyperparameters_index.md:
--------------------------------------------------------------------------------
1 | # Hyperparameter tuning with ensemble methods
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/ensemble/ensemble_module_intro.md:
--------------------------------------------------------------------------------
1 | # Module overview
2 |
3 | ## What you will learn
4 |
5 |
6 |
7 | This module goes into detail regarding algorithms that combine several
8 | models together, also known as ensembles of models. We will present two
9 | families of such techniques: (i) those based on bootstrapping and (ii) those
10 | based on boosting. We will present bagging and random forest, which belong to
11 | the former strategy, and AdaBoost and gradient-boosting decision trees, which
12 | belong to the latter strategy. Finally, we will go into detail regarding the
13 | hyperparameters that allow tuning these models, and compare them across models.
14 |
15 | ## Before getting started
16 |
17 |
18 |
19 | The required technical skills to carry on with this module are:
20 |
21 | - skills acquired during the "The Predictive Modeling Pipeline" module with
22 | basic usage of scikit-learn;
23 | - skills acquired during the "Selecting The Best Model" module, mainly around
24 | the concept of underfit/overfit and the usage of cross-validation in
25 | scikit-learn;
26 | - skills acquired during the modules "Linear Models" and
27 | "Decision Tree Models".
28 |
29 |
30 |
31 | ## Objectives and time schedule
32 |
33 |
34 |
35 | The objectives of this module are the following:
36 |
37 | - understand the principles behind bootstrapping and boosting;
38 | - get intuitions with specific models such as random forest
39 | and gradient boosting;
40 | - identify the important hyperparameters of random forest and gradient boosting
41 | decision trees as well as their typical values.
42 |
43 |
44 |
45 | The estimated time to go through this module is about 6 hours.
46 |
--------------------------------------------------------------------------------
/jupyter-book/ensemble/ensemble_module_take_away.md:
--------------------------------------------------------------------------------
1 | # Main take-away
2 |
3 | ## Wrap-up
4 |
5 |
6 |
7 | In this module, we discussed ensemble learners, which combine several
8 | simpler learners together. We saw two strategies:
9 |
10 | - one based on bootstrap samples, which allows learners to be fit in
11 | parallel;
12 | - the other, called boosting, which fits learners sequentially.
13 |
14 | From these two families, we mainly focused on giving intuitions regarding the
15 | internal machinery of the random forest and gradient-boosting models, which
16 | are state-of-the-art methods.
17 |
18 | ## To go further
19 |
20 |
21 |
22 | You can refer to the following scikit-learn examples which are related to
23 | the concepts approached in this module:
24 |
25 | - [Early-stopping in gradient-boosting](https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_early_stopping.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-early-stopping-py)
26 | - [Combining predictors using stacking](https://scikit-learn.org/stable/auto_examples/ensemble/plot_stack_predictors.html#sphx-glr-auto-examples-ensemble-plot-stack-predictors-py)
27 |
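28 | As a minimal sketch of these two families (the synthetic dataset and the
29 | hyperparameters are only illustrative), one can cross-validate a
30 | bagging-style and a boosting-style ensemble side by side:
31 |
32 | ```python
33 | from sklearn.datasets import make_classification
34 | from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
35 | from sklearn.model_selection import cross_val_score
36 |
37 | X, y = make_classification(n_samples=1_000, random_state=0)
38 |
39 | # Bagging family: deep trees fit independently on bootstrap samples.
40 | forest = RandomForestClassifier(n_estimators=100, random_state=0)
41 | # Boosting family: shallow trees fit sequentially to correct previous errors.
42 | boosting = HistGradientBoostingClassifier(random_state=0)
43 |
44 | for name, model in [("random forest", forest), ("gradient boosting", boosting)]:
45 |     scores = cross_val_score(model, X, y, cv=5)
46 |     print(f"{name}: {scores.mean():.3f} +/- {scores.std():.3f}")
47 | ```
48 |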
--------------------------------------------------------------------------------
/jupyter-book/ensemble/ensemble_quiz_m6_01.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M6.01
2 |
3 | ```{admonition} Question
4 | By default, a
5 | [`BaggingClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html)
6 | or [`BaggingRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html)
7 | draws:
8 |
9 | - a) random samples with replacement over training points
10 | - b) random samples with replacement over features
11 | - c) random samples without replacement over training points
12 | - d) random samples without replacement over features
13 |
14 | _Select all answers that apply_
15 |
16 | Hint: it is possible to access the documentation for those classes by
17 | clicking on the links on their names.
18 | ```
19 |
20 | +++
21 |
22 | ```{admonition} Question
23 | In a
24 | [`BaggingClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html)
25 | or [`BaggingRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html),
26 | the parameter `base_estimator` can be:
27 |
28 | - a) any predictor
29 | - b) a decision tree predictor
30 | - c) a linear model predictor
31 |
32 | _Select a single answer_
33 | ```
34 |
35 | +++
36 |
37 | ```{admonition} Question
38 |
39 | In the context of a classification problem, what are the differences between a
40 | bagging classifier and a random forest classifier:
41 |
42 | - a) in a random forest, the base model is always a decision tree
43 | - b) in a random forest, the split threshold values are decided completely at
44 | random
45 | - c) in a random forest, a random resampling is performed both over features
46 | as well as over samples
47 |
48 | _Select all answers that apply_
49 | ```
50 |
--------------------------------------------------------------------------------
/jupyter-book/ensemble/ensemble_quiz_m6_02.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M6.02
2 |
3 | ```{admonition} Question
4 | Select the correct statements:
5 |
6 | - a) Both bagging and boosting combine several predictors
7 | - b) Both bagging and boosting are based on decision trees
8 | - c) Boosting combines predictors sequentially
9 | - d) Bagging combines predictors simultaneously
10 |
11 | _Select all answers that apply_
12 | ```
13 |
14 | +++
15 |
16 | ```{admonition} Question
17 | Boosting algorithms learn their predictor:
18 |
19 | - a) by training predictors in parallel on slightly different datasets
20 | - b) by training predictors sequentially which correct previous prediction errors
21 | - c) by taking a linear combination of weak predictors
22 |
23 | _Select all answers that apply_
24 | ```
25 |
26 | +++
27 |
28 | ```{admonition} Question
29 | Histogram gradient boosting is an accelerated gradient boosting algorithm that:
30 |
31 | - a) takes a subsample of the original samples
32 | - b) bins the numerical features
33 | - c) takes a subsample of the original features
34 |
35 | _Select a single answer_
36 | ```
37 |
38 | +++
39 |
40 | ```{admonition} Question
41 | Boosting tends to overfit when increasing the number of predictors:
42 |
43 | - a) true
44 | - b) false
45 |
46 | _Select a single answer_
47 | ```
48 |
--------------------------------------------------------------------------------
/jupyter-book/ensemble/ensemble_quiz_m6_03.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M6.03
2 |
3 | ```{admonition} Question
4 | When compared to random forests, gradient boosting is usually trained using:
5 |
6 | - a) shallower trees
7 | - b) deeper trees
8 | - c) a subset of features
9 | - d) all features
10 |
11 | _Select all answers that apply_
12 | ```
13 |
14 | +++
15 |
16 | ```{admonition} Question
17 | Which hyperparameter(s) do not exist in random forest but exist in gradient boosting?
18 |
19 | - a) number of estimators
20 | - b) maximum depth
21 | - c) learning rate
22 |
23 | _Select all answers that apply_
24 | ```
25 |
26 | +++
27 |
28 | ```{admonition} Question
29 | Which of the following options are correct about the benefits of ensemble models?
30 |
31 | - a) Better generalization performance
32 | - b) Reduced sensitivity to hyperparameter tuning of individual predictors
33 | - c) Better interpretability
34 |
35 | _Select all answers that apply_
36 | ```
37 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/cross_validation_baseline_index.md:
--------------------------------------------------------------------------------
1 | # Comparing a model with simple baselines
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/cross_validation_choices_index.md:
--------------------------------------------------------------------------------
1 | # Choice of cross-validation
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/cross_validation_nested_index.md:
--------------------------------------------------------------------------------
1 | # Nested cross-validation
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/evaluation_module_intro.md:
--------------------------------------------------------------------------------
1 | # Module overview
2 |
3 | ## What you will learn
4 |
5 |
6 |
7 | In the previous module, we presented the general cross-validation framework
8 | and used it to evaluate models' performance. However, it is important to
9 | keep in mind that some elements of the cross-validation need to be decided
10 | depending on the nature of the problem: (i) the cross-validation strategy and
11 | (ii) the evaluation metrics. Besides, it is always good to compare a model's
12 | performance against some baseline model.
13 |
14 | In this module, we present both aspects and give insights on when to use a
15 | specific cross-validation strategy and which metric to use. In addition, we
16 | also give some insights regarding how to compare a model with a baseline.
17 |
18 | ## Before getting started
19 |
20 |
21 |
22 | The required technical skills to carry on with this module are:
23 |
24 | - skills acquired during the "The Predictive Modeling Pipeline" module with
25 | basic usage of scikit-learn;
26 | - skills acquired during the "Selecting The Best Model" module, mainly around
27 | the concept of underfit/overfit and the usage of cross-validation in
28 | scikit-learn.
29 |
30 |
31 |
32 | ## Objectives and time schedule
33 |
34 |
35 |
36 | The objectives of this module are the following:
37 |
38 | - understand the necessity of using an appropriate cross-validation strategy
39 | depending on the data;
40 | - get the intuitions behind comparing a model with some basic models that
41 | can be used as baseline;
42 | - understand the principles behind using nested cross-validation when the model
43 | needs to be evaluated as well as optimized;
44 | - understand the differences between regression and classification metrics;
45 | - understand the differences between metrics.
46 |
47 |
48 |
49 | The estimated time to go through this module is about 6 hours.
50 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/evaluation_module_take_away.md:
--------------------------------------------------------------------------------
1 | # Main take-away
2 |
3 | ## Wrap-up
4 |
5 |
6 |
7 | In this module, we presented the framework used in machine learning to
8 | evaluate a predictive model's performance: cross-validation.
9 |
10 | Besides, we presented several splitting strategies that can be used in the
11 | general cross-validation framework. These strategies should be used wisely
12 | when encountering some specific patterns or types of data.
13 |
14 | Finally, we showed how to perform nested cross-validation to select an optimal
15 | model and evaluate its generalization performance.
16 |
17 | ## To go further
18 |
19 |
20 |
21 | You can refer to the following scikit-learn examples which are related to
22 | the concepts approached in this module:
23 |
24 | - [Comparison of cross-validation strategies](https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py)
25 |
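26 | As a minimal sketch of nested cross-validation (the model and parameter grid
27 | are only illustrative), an inner loop tunes a hyperparameter while an outer
28 | loop evaluates the tuned model:
29 |
30 | ```python
31 | from sklearn.datasets import load_breast_cancer
32 | from sklearn.model_selection import GridSearchCV, cross_val_score
33 | from sklearn.svm import SVC
34 |
35 | X, y = load_breast_cancer(return_X_y=True)
36 |
37 | # Inner cross-validation: select the best C on each training split.
38 | tuned_model = GridSearchCV(SVC(), param_grid={"C": [0.1, 1, 10]}, cv=5)
39 | # Outer cross-validation: evaluate the tuned model on held-out folds.
40 | scores = cross_val_score(tuned_model, X, y, cv=5)
41 | print(f"{scores.mean():.3f} +/- {scores.std():.3f}")
42 | ```
43 |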
--------------------------------------------------------------------------------
/jupyter-book/evaluation/evaluation_quiz_m7_01.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M7.01
2 |
3 | ```{admonition} Question
4 | What is the benefit of using cross-validation?
5 |
6 | - a) Give information about performance variability
7 | - b) Remove the need to use a baseline algorithm
8 | - c) Give information regarding under- or over-fitting of a model
9 |
10 | _Select all answers that apply_
11 | ```
12 |
13 | +++
14 |
15 | ```{admonition} Question
16 | Does a dummy classifier or regressor rely on the feature values in the
17 | input data `X` to make its predictions?
18 |
19 | - a) Yes
20 | - b) No
21 |
22 | _Select a single answer_
23 | ```
24 |
25 | +++
26 |
27 | ```{admonition} Question
28 | Does a dummy classifier from scikit-learn always make constant predictions
29 | whatever the chosen strategy?
30 |
31 | - a) Yes
32 | - b) No
33 |
34 | _Select a single answer_
35 | ```
36 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/evaluation_quiz_m7_02.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M7.02
2 |
3 | ```{admonition} Question
4 | We have a dataset with patient records from 10 different hospitals, and our goal
5 | is to predict whether a patient has a disease or not. Let's also suppose that
6 | the classes ("disease" and "no-disease") are imbalanced. Additionally, we suspect
7 | that each hospital's data may have systematic biases due to factors like
8 | medical devices, policies, socioeconomic status of the patients, etc.
9 |
10 | Which cross-validation strategy is the most suitable for assessing the model's
11 | ability to make good predictions on patients from hospitals not seen during
12 | training?
13 |
14 | - a) Group stratified k-fold cross-validation
15 | - b) Group k-fold
16 | - c) Stratified k-fold cross-validation
17 | - d) Leave-one-out cross-validation
18 |
19 | _Select a single answer_
20 | ```
21 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/evaluation_quiz_m7_03.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M7.03
2 |
3 | ```{admonition} Question
4 | How to evaluate and tune the hyperparameters of a model?
5 |
6 | - a) Fit the model on the train set, set the parameters using the test set, and
7 | evaluate the model on the same test set
8 | - b) Fit the model on the train set, set the parameters using a validation set,
9 | and evaluate the model on the test set
10 | - c) Use nested cross-validation, with an inner cross-validation to tune the
11 | parameters of the model and an outer cross-validation to evaluate the model's
12 | performance
13 |
14 | _Select all answers that apply_
15 | ```
16 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/evaluation_quiz_m7_04.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M7.04
2 |
3 | ```{admonition} Question
4 | What is the default score in scikit-learn when using a classifier?
5 |
6 | - a) balanced accuracy
7 | - b) ROC-AUC
8 | - c) accuracy
9 |
10 | _Select a single answer_
11 | ```
12 |
13 | +++
14 |
15 | ```{admonition} Question
16 | Other than the decision threshold, metrics such as recall and precision also
17 | depend on the regularization parameters. Assuming that class "1" (in red) is the
18 | positive class, use the following figures to select which statements are true in
19 | this particular logistic regression model:
20 |
21 | 
22 | 
23 |
24 | - a) stronger regularization leads to higher precision
25 | - b) stronger regularization leads to lower precision
26 | - c) stronger regularization leads to higher recall
27 | - d) stronger regularization leads to lower recall
28 |
29 | _Select all answers that apply_
30 | ```
31 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/evaluation_quiz_m7_05.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M7.05
2 |
3 | ```{admonition} Question
4 | What is the default score in scikit-learn when using a regressor?
5 |
6 | - a) $R^2$
7 | - b) mean absolute error
8 | - c) median absolute error
9 |
10 | _Select a single answer_
11 | ```
12 |
13 | +++
14 |
15 | ```{admonition} Question
16 | If we observe that the values returned by
17 | `cross_val_score(model, X, y, scoring="r2")` increase after changing the model
18 | parameters, it means that the latest model:
19 |
20 | - a) generalizes better
21 | - b) generalizes worse
22 |
23 | _Select a single answer_
24 | ```
25 |
26 | +++
27 |
28 | ```{admonition} Question
29 | If all the values returned by
30 | `cross_val_score(model_A, X, y, scoring="neg_mean_squared_error")`
31 | are strictly lower than those returned by
32 | `cross_val_score(model_B, X, y, scoring="neg_mean_squared_error")`
33 | it means that `model_B` generalizes:
34 |
35 | - a) better than `model_A`
36 | - b) worse than `model_A`
37 |
38 | Hint: Remember that `"neg_mean_squared_error"` is an alias for the negative of
39 | the Mean Squared Error.
40 |
41 | _Select a single answer_
42 | ```
43 |
44 | +++
45 |
46 | ```{admonition} Question
47 | Values returned by `cross_val_score(model, X, y, scoring="neg_mean_squared_error")`
48 | are:
49 |
50 | - a) guaranteed to be positive or zero
51 | - b) guaranteed to be negative or zero
52 | - c) can be either positive or negative depending on the data
53 |
54 | _Select a single answer_
55 | ```
56 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/metrics_classification_index.md:
--------------------------------------------------------------------------------
1 | # Classification metrics
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/metrics_regression_index.md:
--------------------------------------------------------------------------------
1 | # Regression metrics
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/feature_selection/feature_selection_limitation_index.md:
--------------------------------------------------------------------------------
1 | # Caveats of feature selection
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/feature_selection/feature_selection_module_intro.md:
--------------------------------------------------------------------------------
1 | # Module overview
2 |
3 | ## What you will learn
4 |
5 |
6 |
7 | This module gives some insights regarding feature selection. Besides motivating
8 | the benefit of using feature selection, we also illustrate some of the known
9 | caveats.
10 |
11 | ## Before getting started
12 |
13 |
14 |
15 | The required technical skills to carry on with this module are:
16 |
17 | - skills acquired during the "The Predictive Modeling Pipeline" module with
18 | basic usage of scikit-learn;
19 | - skills acquired during the "Selecting The Best Model" module, mainly around
20 | the concept of underfit/overfit and the usage of cross-validation in
21 | scikit-learn.
22 |
23 |
24 |
25 | ## Objectives and time schedule
26 |
27 |
28 |
29 | The objectives of this module are the following:
30 |
31 | - understand in which cases feature selection is beneficial;
32 | - be aware of the caveats and how to put feature selection techniques
33 | into practice.
34 |
35 |
36 |
37 | The estimated time to go through this module is about 50 minutes.
38 |
--------------------------------------------------------------------------------
/jupyter-book/feature_selection/feature_selection_module_take_away.md:
--------------------------------------------------------------------------------
1 | # Main take-away
2 |
3 | ## Wrap-up
4 |
5 |
6 |
7 | In this module, we presented the principle of feature selection. In short,
8 | feature selection is not a magical tool to get marginal gains. We tackled
9 | the following aspects:
10 |
11 | - you should use feature selection to speed up training and testing rather
12 | than to seek marginal performance gains;
13 | - you should be careful regarding the evaluation framework and how to include
14 | a feature selector within your pipeline;
15 | - you should be aware of the limitations of feature selectors based on
16 | machine-learning models.
17 |
18 | ## To go further
19 |
20 |
21 |
22 | You can refer to the following scikit-learn examples which are related to
23 | the concepts approached during this module:
24 |
25 | - [Recursive feature selection using cross-validation](https://scikit-learn.org/stable/auto_examples/feature_selection/plot_rfe_with_cross_validation.html#sphx-glr-auto-examples-feature-selection-plot-rfe-with-cross-validation-py)
26 |
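27 | As a minimal sketch of including the selector within the pipeline (the
28 | selector and its parameters are only illustrative), the selection step is
29 | then re-fit on each training split, which avoids leaking information from
30 | the test folds:
31 |
32 | ```python
33 | from sklearn.datasets import make_classification
34 | from sklearn.feature_selection import SelectKBest, f_classif
35 | from sklearn.linear_model import LogisticRegression
36 | from sklearn.model_selection import cross_val_score
37 | from sklearn.pipeline import make_pipeline
38 |
39 | X, y = make_classification(n_samples=500, n_features=50, random_state=0)
40 |
41 | # The selector is a pipeline step, so it only ever sees the training folds.
42 | model = make_pipeline(SelectKBest(f_classif, k=10), LogisticRegression())
43 | scores = cross_val_score(model, X, y, cv=5)
44 | print(f"{scores.mean():.3f} +/- {scores.std():.3f}")
45 | ```
46 |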
--------------------------------------------------------------------------------
/jupyter-book/feature_selection/feature_selection_quiz.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz
2 |
3 | ```{admonition} Question
4 | What is the main advantage of using feature selection?
5 |
6 | - a) speeding-up the training of an algorithm
7 | - b) fine tuning the model's performance
8 | - c) remove noisy features
9 |
10 | _Select a single answer_
11 | ```
12 |
13 | +++
14 |
15 | ```{admonition} Question
16 | When selecting features, the decision should be made using:
17 |
18 | - a) the entire dataset
19 | - b) the training set
20 | - c) the testing set
21 |
22 | _Select a single answer_
23 | ```
24 |
--------------------------------------------------------------------------------
/jupyter-book/figures:
--------------------------------------------------------------------------------
1 | ../figures
--------------------------------------------------------------------------------
/jupyter-book/interpretation/interpretation_quiz.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz
2 |
3 | ```{admonition} Question
4 | With the same dataset, feature importance might differ if:
5 |
6 | - a) we use two different models
7 | - b) we use two different train/test splits with the same model
8 | - c) we use the same model with a different set of hyper-parameters
9 | - d) we use the same model with the same set of hyper-parameters but a
10 | different `random_state`
11 | ```
12 |
13 | +++
14 |
15 | ```{admonition} Question
16 | In linear models, the feature importance:
17 |
18 | - a) might be inferred from the coefficients
19 | - b) might be inferred with `permutation_importance`
20 | - c) needs regularization to be inferred
21 | - d) is a built-in attribute
22 | ```
23 |
24 | +++
25 |
26 | ```{admonition} Question
27 | If two features are identical (and thus correlated):
28 |
29 | - a) their feature importances will be the same
30 | - b) their feature importances will be divided by 2
31 | - c) only one will receive all the feature importance, and the other will be 0
32 | - d) it depends
33 | ```
34 |
35 | +++
36 |
37 | ```{admonition} Question
38 | The feature importance provided by the scikit-learn random forest:
39 |
40 | - a) is biased towards categorical features
41 | - b) is biased towards continuous (high-cardinality) features
42 | - c) is independent of the train/test split
43 | - d) is independent of the hyper-parameters
44 | ```
45 |
46 | +++
47 |
48 | ```{admonition} Question
49 | To evaluate the feature importance for a specific model, one could:
50 |
51 | - a) drop a column and compare the scores
52 | - b) shuffle a column and compare the scores
53 | - c) set a whole column to 0 and compare the scores
54 | - d) replace a column's values with random numbers and compare the scores
55 | ```
56 |
--------------------------------------------------------------------------------
/jupyter-book/linear_models/linear_models_intuitions_index.md:
--------------------------------------------------------------------------------
1 | # Intuitions on linear models
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/linear_models/linear_models_module_intro.md:
--------------------------------------------------------------------------------
1 | # Module overview
2 |
3 | ## What you will learn
4 |
5 |
6 |
7 | In this module, we will go further into detail regarding models that use
8 | linear parametrization.
9 | We will see how to use this family of models for both classification and
10 | regression problems. Besides, we will explain how to fight over-fitting using
11 | regularization.
12 | Finally, we will show how linear models can be used with
13 | data presenting non-linearity.
14 |
15 | ## Before getting started
16 |
17 |
18 |
19 | The required technical skills to carry on with this module are:
20 |
21 | - skills acquired during the "The Predictive Modeling Pipeline" module with
22 | basic usage of scikit-learn;
23 | - skills acquired during the "Selecting The Best Model" module, mainly around
24 | the concept of underfit/overfit and the usage of cross-validation in
25 | scikit-learn.
26 |
27 |
28 |
29 | ## Objectives and time schedule
30 |
31 |
32 |
33 | In this module, your objectives are to:
34 |
35 | - understand the parametrization of linear models;
36 | - understand what linear models imply for both
37 | regression and classification;
38 | - get intuitions on linear models applied to higher-dimensional datasets;
39 | - understand the effect of regularization and how to set it;
40 | - understand how linear models can be used even with data showing a non-linear
41 | relationship with the target to be predicted.
42 |
43 |
44 |
45 | The estimated time to go through this module is about 6 hours.
46 |
--------------------------------------------------------------------------------
/jupyter-book/linear_models/linear_models_module_take_away.md:
--------------------------------------------------------------------------------
1 | # Main take-away
2 |
3 | ## Wrap-up
4 |
5 |
6 |
7 | In this module, we saw that:
8 |
9 | - the predictions of a linear model depend on a weighted sum of the values of
10 | the input features added to an intercept parameter;
11 | - fitting a linear model consists in adjusting both the weight coefficients and
12 | the intercept to minimize the prediction errors on the training set;
13 | - to train linear models successfully it is often required to scale the input
14 | features approximately to the same dynamic range;
15 | - regularization can be used to reduce over-fitting: weight coefficients are
16 | constrained to stay small when fitting;
17 | - the regularization hyperparameter needs to be fine-tuned by cross-validation
18 | for each new machine learning problem and dataset;
19 | - linear models can be used on problems where the target variable is not
20 | linearly related to the input features but this requires extra feature
21 | engineering work to transform the data in order to avoid under-fitting.
22 |
23 | ## To go further
24 |
25 |
26 |
27 | You can refer to the following scikit-learn examples which are related to
28 | the concepts approached during this module:
29 |
30 | - [Example of linear regression](https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html#sphx-glr-auto-examples-linear-model-plot-ols-py)
31 | - [Comparison between a linear regression and a ridge regressor](https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols_ridge_variance.html#sphx-glr-auto-examples-linear-model-plot-ols-ridge-variance-py)
32 |
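33 | As a minimal sketch of these take-aways (the dataset and the `alphas` grid
34 | are only illustrative), one can scale the features and tune the
35 | regularization strength by cross-validation in a single pipeline:
36 |
37 | ```python
38 | from sklearn.datasets import make_regression
39 | from sklearn.linear_model import RidgeCV
40 | from sklearn.pipeline import make_pipeline
41 | from sklearn.preprocessing import StandardScaler
42 |
43 | X, y = make_regression(n_samples=200, n_features=10, noise=10, random_state=0)
44 |
45 | # Scale features to comparable ranges, then tune alpha by internal CV.
46 | model = make_pipeline(StandardScaler(), RidgeCV(alphas=[0.01, 0.1, 1, 10]))
47 | model.fit(X, y)
48 | print(model[-1].alpha_)  # regularization strength selected by cross-validation
49 | ```
50 |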
--------------------------------------------------------------------------------
/jupyter-book/linear_models/linear_models_non_linear_index.md:
--------------------------------------------------------------------------------
1 | # Non-linear feature engineering for linear models
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/linear_models/linear_models_quiz_m4_01.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M4.01
2 |
3 | ```{admonition} Question
4 | What is a linear regression?
5 |
6 | - a) a model that outputs a continuous prediction as the sum of the values of a
7 | **limited** subset of the input features
8 | - b) a model that outputs a binary prediction based on a linear combination
9 | of the values of the input features
10 | - c) a model that outputs a continuous prediction as a weighted sum of the input
11 | features
12 |
13 | _Select a single answer_
14 | ```
15 |
16 | +++
17 |
18 | ```{admonition} Question
19 | Is it possible to get a perfect fit (zero prediction error on the training set)
20 | with a linear classifier **by itself** on a non-linearly separable dataset?
21 |
22 | - a) yes
23 | - b) no
24 |
25 | _Select a single answer_
26 | ```
27 |
28 | +++
29 |
30 | ```{admonition} Question
31 | If we fit a linear regression where `X` is a single column vector, how many
32 | parameters will our model be made of?
33 |
34 | - a) 1
35 | - b) 2
36 | - c) 3
37 |
38 | _Select a single answer_
39 | ```
40 |
41 | +++
42 |
43 | ```{admonition} Question
44 | If we train a scikit-learn `LinearRegression` with `X` being a single column
45 | vector and `y` a vector, `coef_` and `intercept_` will be respectively:
46 |
47 | - a) an array of shape (1, 1) and a number
48 | - b) an array of shape (1,) and an array of shape (1,)
49 | - c) an array of shape (1, 1) and an array of shape (1,)
50 | - d) an array of shape (1,) and a number
51 |
52 | _Select a single answer_
53 | ```
54 |
55 | +++
56 |
57 | ```{admonition} Question
58 | The decision boundaries of a logistic regression model:
59 |
60 | - a) split classes using only one of the input features
61 | - b) split classes using a combination of the input features
62 | - c) often have curved shapes
63 |
64 | _Select a single answer_
65 | ```
66 |
67 | +++
68 |
69 | ```{admonition} Question
70 | For a binary classification task, what is the shape of the array returned by the
71 | `predict_proba` method for 10 input samples?
72 |
73 | - a) (10,)
74 | - b) (10, 2)
75 | - c) (2, 10)
76 |
77 | _Select a single answer_
78 | ```
79 |
80 | +++
81 |
82 | ```{admonition} Question
83 | In logistic regression's `predict_proba` method in scikit-learn, which of the
84 | following statements is true regarding the predicted probabilities?
85 |
86 | - a) The sum of probabilities across different classes for a given sample is always equal to 1.0.
87 | - b) The sum of probabilities across all samples for a given class is always equal to 1.0.
88 | - c) The sum of probabilities across all features for a given class is always equal to 1.0.
89 |
90 | _Select a single answer_
91 | ```
92 |
--------------------------------------------------------------------------------
/jupyter-book/linear_models/linear_models_quiz_m4_02.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M4.02
2 |
3 | ```{admonition} Question
4 |
5 | Let us consider a pipeline that combines a polynomial feature extraction of
6 | degree 2 and a linear regression model. Let us assume that the linear regression
7 | coefficients are all non-zero and that the dataset contains a single feature.
8 | Is the prediction function of this pipeline a straight line?
9 |
10 | - a) yes
11 | - b) no
12 |
13 | _Select a single answer_
14 | ```
15 |
16 | +++
17 |
18 | ```{admonition} Question
19 | When fitting a linear regression where `X` has `n_features` columns and the
20 | target is a single continuous vector, what is the respective type/shape of
21 | `coef_` and `intercept_`?
22 |
23 | - a) it is not possible to fit a linear regression in dimension higher than 2
24 | - b) array of shape (`n_features`,) and a float
25 | - c) array of shape (1, `n_features`) and an array of shape (1,)
26 |
27 | _Select a single answer_
28 | ```
29 |
30 | +++
31 |
32 | ```{admonition} Question
33 | Combining (one or more) feature engineering transformers in a single pipeline:
34 |
35 | - a) increases the expressivity of the model
36 | - b) ensures that models extrapolate accurately regardless of the distribution of the data
37 | - c) may require tuning additional hyperparameters
38 | - d) inherently prevents any underfitting
39 |
40 | _Select all answers that apply_
41 | ```
42 |
--------------------------------------------------------------------------------
/jupyter-book/linear_models/linear_models_quiz_m4_03.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M4.03
2 |
3 | ```{admonition} Question
4 | Which of the following estimators can solve linear regression problems?
5 |
6 | - a) sklearn.linear_model.LinearRegression
7 | - b) sklearn.linear_model.LogisticRegression
8 | - c) sklearn.linear_model.Ridge
9 |
10 | _Select all answers that apply_
11 | ```
12 |
13 | +++
14 |
15 | ```{admonition} Question
16 | Regularization allows:
17 |
18 | - a) to create a model robust to outliers (samples that differ widely from
19 | other observations)
20 | - b) to reduce overfitting by forcing the weights to stay close to zero
21 | - c) to reduce underfitting by making the problem linearly separable
22 |
23 | _Select a single answer_
24 | ```
25 |
26 | +++
27 |
28 | ```{admonition} Question
29 | A ridge model is:
30 |
31 | - a) the same as linear regression with penalized weights
32 | - b) the same as logistic regression with penalized weights
33 | - c) a linear model
34 | - d) a non linear model
35 |
36 | _Select all answers that apply_
37 | ```
38 |
39 | +++
40 |
41 | ```{admonition} Question
42 | Assume that a data scientist has prepared a train/test split and plans to use
43 | the test for the final evaluation of a `Ridge` model. The parameter `alpha` of
44 | the `Ridge` model:
45 |
46 | - a) is internally tuned when calling `fit` on the train set
47 | - b) should be tuned by running cross-validation on a **train set**
48 | - c) should be tuned by running cross-validation on a **test set**
49 | - d) must be a positive number
50 |
51 | _Select all answers that apply_
52 | ```
53 |
54 | +++
55 |
56 | ```{admonition} Question
57 | Scaling the data before fitting a model:
58 |
59 | - a) is often useful for regularized linear models
60 | - b) is always necessary for regularized linear models
61 | - c) may speed-up fitting
62 | - d) has no impact on the optimal choice of the value of a regularization parameter
63 |
64 | _Select all answers that apply_
65 | ```
66 |
67 | +++
68 |
69 | ```{admonition} Question
70 | The effect of increasing the regularization strength in a ridge model is to:
71 |
72 | - a) shrink all weights towards zero
73 | - b) make all weights equal
74 | - c) set a subset of the weights to exactly zero
75 | - d) constrain all the weights to be positive
76 |
77 | _Select all answers that apply_
78 | ```
79 |
80 | +++
81 |
82 | ```{admonition} Question
83 | By default, a [`LogisticRegression`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) in scikit-learn applies:
84 |
85 | - a) no penalty
86 | - b) a penalty that shrinks the magnitude of the weights towards zero (also called "l2 penalty")
87 | - c) a penalty that ensures all weights are equal
88 |
89 | _Select a single answer_
90 | ```
91 |
92 | +++
93 |
94 | ```{admonition} Question
95 | The parameter `C` in a logistic regression is:
96 |
97 | - a) similar to the parameter `alpha` in a ridge regressor
98 | - b) similar to `1 / alpha` where `alpha` is the parameter of a ridge regressor
99 | - c) not controlling the regularization
100 |
101 | _Select a single answer_
102 | ```
103 |
104 | +++
105 |
106 | ```{admonition} Question
107 | In logistic regression, increasing the regularization strength (by
108 | decreasing the value of `C`) makes the model:
109 |
110 | - a) more likely to overfit to the training data
111 | - b) more confident: the values returned by `predict_proba` are closer to 0 or 1
112 | - c) less complex, potentially underfitting the training data
113 |
114 | _Select a single answer_
115 | ```
116 |
--------------------------------------------------------------------------------
/jupyter-book/linear_models/linear_models_regularization_index.md:
--------------------------------------------------------------------------------
1 | # Regularization in linear model
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/linear_models/linear_models_slides.md:
--------------------------------------------------------------------------------
1 | # 🎥 Intuitions on linear models
2 |
3 | <!-- embedded video iframe -->
6 |
7 | <!-- embedded slides iframe -->
9 |
10 | To navigate in the slides, **first click on the slides**, then:
11 | - press the **arrow keys** to go to the next/previous slide;
12 | - press **"P"** to toggle presenter mode to see the notes;
13 | - press **"F"** to toggle full-screen mode.
14 |
--------------------------------------------------------------------------------
/jupyter-book/linear_models/regularized_linear_models_slides.md:
--------------------------------------------------------------------------------
1 | # 🎥 Intuitions on regularized linear models
2 |
3 | <!-- embedded video iframe -->
6 |
7 | <!-- embedded slides iframe -->
9 |
10 | To navigate in the slides, **first click on the slides**, then:
11 | - press the **arrow keys** to go to the next/previous slide;
12 | - press **"P"** to toggle presenter mode to see the notes;
13 | - press **"F"** to toggle full-screen mode.
14 |
--------------------------------------------------------------------------------
/jupyter-book/ml_concepts/quiz_intro_01.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz Intro.01
2 |
3 | Consider the following case study: pricing apartments based on a real estate website. We have
4 | thousands of house descriptions with their price. Typically, an example of a
5 | house description is the following:
6 |
7 | "Great for entertaining: spacious, updated 2 bedroom, 1 bathroom apartment in
8 | Lakeview, 97630. The house will be available from May 1st. Close to nightlife
9 | with private backyard. Price ~$1,000,000."
10 |
11 | We are interested in predicting house prices from their description. One
12 | potential use case for this would be, as a buyer, to find houses that are cheap
13 | compared to their market value.
14 |
15 | ```{admonition} Question
16 | What kind of problem is it?
17 |
18 | - a) a supervised problem
19 | - b) an unsupervised problem
20 | - c) a classification problem
21 | - d) a regression problem
22 |
23 | _Select all answers that apply_
24 | ```
25 |
26 | +++
27 |
28 | ```{admonition} Question
29 | What are the features?
30 |
31 | - a) the number of rooms might be a feature
32 | - b) the post code of the house might be a feature
33 | - c) the price of the house might be a feature
34 |
35 | _Select all answers that apply_
36 | ```
37 |
38 | +++
39 |
40 | ```{admonition} Question
41 | What is the target variable?
42 |
43 | - a) the full text description is the target
44 | - b) the price of the house is the target
45 | - c) only house descriptions with no price mentioned are the target
46 |
47 | _Select a single answer_
48 | ```
49 |
50 | +++
51 |
52 | ```{admonition} Question
53 | What is a record (a sample)?
54 |
55 | - a) each house description is a record
56 | - b) each house price is a record
57 | - c) each kind of description (such as the house size) is a record
58 |
59 | _Select a single answer_
60 | ```
61 |
--------------------------------------------------------------------------------
/jupyter-book/ml_concepts/slides.md:
--------------------------------------------------------------------------------
1 | # 🎥 Introducing machine-learning concepts
2 |
3 | This presentation will teach you the basic concepts: what machine learning is,
4 | the types of sub-problems it covers, the vocabulary, and the general
5 | pipeline.
6 |
7 | <!-- embedded video iframe -->
10 |
11 | <!-- embedded slides iframe -->
13 |
14 | To navigate in the slides, **first click on the slides**, then:
15 | - press the **arrow keys** to go to the next/previous slide;
16 | - press **"P"** to toggle presenter mode to see the notes;
17 | - press **"F"** to toggle full-screen mode.
18 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/bias_vs_variance_quiz_m2_03.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M2.03
2 |
3 | ```{admonition} Question
4 | Fitting a model with a high bias:
5 |
6 | - a) causes an underfitted model?
7 | - b) causes an overfitted model?
8 | - c) increases the sensitivity of the learned prediction function to a random resampling of the training set observations?
9 | - d) causes the learned prediction function to make systematic errors?
10 |
11 | _Select all answers that apply_
12 | ```
13 |
14 | +++
15 |
16 | ```{admonition} Question
17 | Fitting a high variance model:
18 |
19 | - a) causes an underfitted model?
20 | - b) causes an overfitted model?
21 | - c) increases the sensitivity of the learned prediction function to a random resampling of the training set observations?
22 | - d) causes the learned prediction function to make systematic errors?
23 |
24 | _Select all answers that apply_
25 | ```
26 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/bias_vs_variance_slides.md:
--------------------------------------------------------------------------------
1 | # 🎥 Bias versus Variance
2 |
3 | <!-- embedded video iframe -->
6 |
7 | <!-- embedded slides iframe -->
9 |
10 | To navigate in the slides, **first click on the slides**, then:
11 | - press the **arrow keys** to go to the next/previous slide;
12 | - press **"P"** to toggle presenter mode to see the notes;
13 | - press **"F"** to toggle full-screen mode.
14 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/learning_validation_curves_quiz_m2_02.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M2.02
2 |
3 | ```{admonition} Question
4 | A model is overfitting when:
5 |
6 | - a) both the train and test errors are high
7 | - b) train error is low but test error is high
8 | - c) train error is high but the test error is low
9 | - d) both train and test errors are low
10 |
11 | _Select a single answer_
12 | ```
13 |
14 | +++
15 |
16 | ```{admonition} Question
17 | Assuming that we have a dataset with little noise, a model is underfitting when:
18 |
19 | - a) both the train and test errors are high
20 | - b) train error is low but test error is high
21 | - c) train error is high but the test error is low
22 | - d) both train and test errors are low
23 |
24 | _Select a single answer_
25 | ```
26 |
27 | +++
28 |
29 | ```{admonition} Question
30 | For a fixed training set, by sequentially adding parameters to give more
31 | flexibility to the model, we are more likely to observe:
32 |
33 | - a) a wider difference between train and test errors
34 | - b) a reduction in the difference between train and test errors
35 | - c) an increased or steady train error
36 | - d) a decrease in the train error
37 |
38 | _Select all answers that apply_
39 | ```
40 |
41 | +++
42 |
43 | ```{admonition} Question
44 | For a fixed choice of model parameters, if we increase the number of labeled
45 | observations in the training set, we are more likely to observe:
46 |
47 | - a) a wider difference between train and test errors
48 | - b) a reduction in the difference between train and test errors
49 | - c) an increased or steady train error
50 | - d) a decrease in the train error
51 |
52 | _Select all answers that apply_
53 | ```
54 |
55 | +++
56 |
57 | ```{admonition} Question
58 | Polynomial models with a high degree parameter:
59 |
60 | - a) always have the best test error (but can be slow to train)
61 | - b) underfit more than linear regression models
62 | - c) get lower training error than lower degree polynomial models
63 | - d) are more likely to overfit than lower degree polynomial models
64 |
65 | _Select all answers that apply_
66 | ```
67 |
68 | +++
69 |
70 | ```{admonition} Question
71 | If we choose the parameters of a model to get the best overfitting/underfitting
72 | tradeoff, we will always get a zero test error.
73 |
74 | - a) True
75 | - b) False
76 |
77 | _Select a single answer_
78 | ```
79 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/learning_validation_curves_slides.md:
--------------------------------------------------------------------------------
1 | # 🎥 Comparing train and test errors
2 |
3 | <!-- embedded video iframe -->
6 |
7 | <!-- embedded slides iframe -->
9 |
10 | To navigate in the slides, **first click on the slides**, then:
11 | - press the **arrow keys** to go to the next/previous slide;
12 | - press **"P"** to toggle presenter mode to see the notes;
13 | - press **"F"** to toggle full-screen mode.
14 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/overfit_bias_variance_index.md:
--------------------------------------------------------------------------------
1 | # Bias versus variance trade-off
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/overfit_module_intro.md:
--------------------------------------------------------------------------------
1 | # Module overview
2 |
3 | ## What you will learn
4 |
5 |
6 |
7 | This module gives an intuitive introduction to the very **fundamental
8 | concepts** of overfitting and underfitting in machine learning.
9 |
10 | Machine learning models can never make perfect predictions: the test error is
11 | never exactly zero. This failure comes from a **fundamental trade-off** between
12 | **modeling flexibility** and the **limited size of the training dataset**.
13 |
14 | The first presentation will define those problems and characterize how and why
15 | they arise.
16 |
17 | Then we will present a methodology to quantify those problems by **contrasting
18 | the train error with the test error** for various choices of model family and
19 | model parameters. More importantly, we will emphasize the **impact of the size
20 | of the training set on this trade-off**.
21 |
22 | Finally we will relate overfitting and underfitting to the concepts of
23 | statistical variance and bias.
24 |
25 | ## Before getting started
26 |
27 |
28 |
29 | The required technical skills to carry on with this module are:
30 |
31 | - skills acquired during the "The Predictive Modeling Pipeline" module with
32 | basic usage of scikit-learn.
33 |
34 |
35 |
36 | ## Objectives and time schedule
37 |
38 |
39 |
40 | The objectives of this module are the following:
41 |
42 | - understand the concept of overfitting and underfitting;
43 | - understand the concept of generalization;
44 | - understand the general cross-validation framework used to evaluate a model.
45 |
46 |
47 |
48 | The estimated time to go through this module is about 3 hours.
49 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/overfit_overfitting_underfitting_index.md:
--------------------------------------------------------------------------------
1 | # Overfitting and underfitting
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/overfit_take_away.md:
--------------------------------------------------------------------------------
1 | # Main take-away
2 |
3 | ## Wrap-up
4 |
5 | - **Overfitting** is caused by the **limited size of the training set**, the
6 | **noise** in the data, and the **high flexibility** of common machine learning
7 | models.
8 |
9 | - **Underfitting** happens when the learnt prediction functions suffer from
10 | **systematic errors**. This can be caused by a choice of model family and
11 | parameters, which leads to a **lack of flexibility** to capture the repeatable
12 | structure of the true data generating process.
13 |
14 | - For a fixed training set, the objective is to **minimize the test error** by
15 | adjusting the model family and its parameters to find the
16 | **best trade-off between overfitting and underfitting** (see the sketch below).
17 |
18 | - For a given choice of model family and parameters, **increasing the
19 | training set size will decrease overfitting** but can also cause an increase
20 | of underfitting.
21 |
22 | - The test error of a model that is neither overfitting nor underfitting can
23 | still be high if the variations of the target variable cannot be fully
24 | determined by the input features. This irreducible error is caused by what we
25 | sometimes call label noise. In practice, this often happens when we do not
26 | have access to important features for one reason or another.
27 |
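As a concrete illustration of these trade-offs, here is a minimal sketch (using
a synthetic dataset from `make_regression`, purely for illustration) that
contrasts train and test scores with scikit-learn's `validation_curve` as the
flexibility of a decision tree grows with `max_depth`:

```python
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import validation_curve
from sklearn.tree import DecisionTreeRegressor

# A small, noisy synthetic dataset, so that overfitting is easy to provoke.
X, y = make_regression(n_samples=200, n_features=5, noise=10.0, random_state=0)

# Cross-validated train/test scores for increasing model flexibility.
max_depths = np.arange(1, 11)
train_scores, test_scores = validation_curve(
    DecisionTreeRegressor(random_state=0),
    X,
    y,
    param_name="max_depth",
    param_range=max_depths,
    cv=5,
)

for depth, train, test in zip(
    max_depths, train_scores.mean(axis=1), test_scores.mean(axis=1)
):
    # A widening gap between train and test scores signals overfitting;
    # low scores on both sides signal underfitting.
    print(f"max_depth={depth:2d}  train R2={train:.2f}  test R2={test:.2f}")
```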
28 | ## To go further
29 |
30 | It is possible to give a precise mathematical treatment of the bias and the
31 | variance of a regression model. The Wikipedia article on the [Bias-variance
32 | tradeoff](https://en.wikipedia.org/wiki/Bias%E2%80%93variance_tradeoff) explains
33 | how the **squared test error can be decomposed as the sum of the squared bias,
34 | the variance and the irreducible error** for a given regression problem.
35 |
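Concretely, for the squared error at a fixed input $x$, with $f$ the true data
generating function, $\hat{f}$ the learnt prediction function and $\sigma^2$ the
variance of the label noise (the expectations being taken over resampled
training sets and noise), this standard decomposition reads:

$$
\mathbb{E}\big[(y - \hat{f}(x))^2\big]
= \underbrace{\big(\mathbb{E}[\hat{f}(x)] - f(x)\big)^2}_{\text{squared bias}}
+ \underbrace{\mathbb{E}\big[(\hat{f}(x) - \mathbb{E}[\hat{f}(x)])^2\big]}_{\text{variance}}
+ \underbrace{\sigma^2}_{\text{irreducible error}}
$$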
36 | The next chapters on linear models, decision trees and ensembles will give
37 | concrete examples on how to diagnose and how to tackle overfitting and
38 | underfitting.
39 |
40 | You can refer to the following scikit-learn examples which are related to
41 | the concepts approached during this module:
42 |
43 | - [Illustration of underfitting and overfitting concepts](https://scikit-learn.org/stable/auto_examples/model_selection/plot_underfitting_overfitting.html#sphx-glr-auto-examples-model-selection-plot-underfitting-overfitting-py)
44 | - [Difference between train and test scores](https://scikit-learn.org/stable/auto_examples/model_selection/plot_train_error_vs_test_error.html#sphx-glr-auto-examples-model-selection-plot-train-error-vs-test-error-py)
45 | - [Example of a validation curve](https://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html#sphx-glr-auto-examples-model-selection-plot-validation-curve-py)
46 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/overfit_validation_learning_curves_index.md:
--------------------------------------------------------------------------------
1 | # Validation and learning curves
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/overfitting_vs_under_fitting_quiz_m2_01.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M2.01
2 |
3 | ```{admonition} Question
4 | A model that is underfitting:
5 |
6 | - a) is too complex and thus highly flexible
7 | - b) is too constrained and thus limited by its expressivity
8 | - c) often makes prediction errors, even on training samples
9 | - d) focuses too much on noisy details of the training set
10 |
11 | _Select all answers that apply_
12 | ```
13 |
14 | +++
15 |
16 | ```{admonition} Question
17 | A model that is overfitting:
18 |
19 | - a) is too complex and thus highly flexible
20 | - b) is too constrained and thus limited by its expressivity
21 | - c) often makes prediction errors, even on training samples
22 | - d) focuses too much on noisy details of the training set
23 |
24 | _Select all answers that apply_
25 | ```
26 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/overfitting_vs_under_fitting_slides.md:
--------------------------------------------------------------------------------
1 | # 🎥 Overfitting and Underfitting
2 |
10 | To navigate in the slides, **first click on the slides**, then:
11 | - press the **arrow keys** to go to the next/previous slide;
12 | - press **"P"** to toggle presenter mode to see the notes;
13 | - press **"F"** to toggle full-screen mode.
14 |
--------------------------------------------------------------------------------
/jupyter-book/predictive_modeling_pipeline/01_tabular_data_exploration_index.md:
--------------------------------------------------------------------------------
1 | # Tabular data exploration
2 |
3 | ```{tableofcontents}
4 | ```
5 |
--------------------------------------------------------------------------------
/jupyter-book/predictive_modeling_pipeline/01_tabular_data_exploration_quiz_m1_01.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M1.01
2 |
3 | ```{admonition} Question
4 | In the notebook "First look at our dataset", we used pandas and specifically
5 | `adult_census = pd.read_csv("../datasets/adult-census.csv")` to:
6 |
7 | - a) load a comma-separated values file
8 | - b) load a dataset already included in the pandas package
9 | - c) load a file only containing the survey features
10 | - d) load a file only containing the target of our classification problem:
11 | whether a person has a low or high income
12 | - e) load a file containing both the features and the target for our classification
13 | problem
14 |
15 | _Select all answers that apply_
16 | ```
17 |
18 | +++
19 |
20 | ```{admonition} Question
21 |
22 | In the previous notebook, we used:
23 |
24 | - a) pandas to gain insights about the dataset
25 | - b) pandas and seaborn to visually inspect the dataset
26 | - c) numpy and scipy to perform numerical inspection (for instance using
27 | `scipy.optimize.minimize`)
28 | - d) scikit-learn to fit some machine learning models
29 |
30 | _Select all answers that apply_
31 | ```
32 |
33 | +++
34 |
35 | ```{admonition} Question
36 | How is a tabular dataset organized?
37 |
38 | - a) a column represents a sample and a row represents a feature
39 | - b) a column represents a feature and a row represents a sample
40 | - c) the target variable is represented by a row
41 | - d) the target variable is represented by a column
42 |
43 | _Select all answers that apply_
44 | ```
45 |
46 | +++
47 |
48 | ```{admonition} Question
49 | A categorical variable is:
50 |
51 | - a) a variable with **only two** different possible values
52 | - b) a variable with continuous numerical values
53 | - c) a variable with a finite set of possible values
54 |
55 | _Select a single answer_
56 | ```
57 |
--------------------------------------------------------------------------------
/jupyter-book/predictive_modeling_pipeline/02_numerical_pipeline_index.md:
--------------------------------------------------------------------------------
1 | # Fitting a scikit-learn model on numerical data
2 |
3 | ```{tableofcontents}
4 | ```
5 |
--------------------------------------------------------------------------------
/jupyter-book/predictive_modeling_pipeline/02_numerical_pipeline_quiz_m1_02.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M1.02
2 |
3 | ```{admonition} Question
4 | Why do we need two sets: a train set and a test set?
5 |
6 | - a) to train the model faster
7 | - b) to validate the model on unseen data
8 | - c) to improve the accuracy of the model
9 |
10 | _Select all answers that apply_
11 | ```
12 |
13 | +++
14 |
15 | ```{admonition} Question
16 | The generalization performance of a scikit-learn model can be evaluated by:
17 |
18 | - a) calling `fit` to train the model on the **training set**, `predict` on the
19 | **test set** to get the predictions, and compute the score by passing the
20 | predictions and the true target values to some metric function
21 | - b) calling `fit` to train the model on the **training set** and `score` to compute
22 | the score on the **test set**
23 | - c) calling `cross_validate` by passing the model, the data and the target
24 | - d) calling `fit_transform` on the data and then `score` to compute
25 | the score on the **test set**
26 |
27 | _Select all answers that apply_
28 | ```
29 |
30 | +++
31 |
32 | ```{admonition} Question
33 | When calling `cross_validate(estimator, X, y, cv=5)`, the following happens:
34 |
35 | - a) `X` and `y` are internally split five times with non-overlapping test sets
36 | - b) `estimator.fit` is called 5 times on the full `X` and `y`
37 | - c) `estimator.fit` is called 5 times, each time on a different training set
38 | - d) a Python dictionary is returned containing a key/value containing a NumPy
39 | array with 5 scores computed on the **train sets**
40 | - e) a Python dictionary is returned containing a key/value containing a NumPy
41 | array with 5 scores computed on the **test sets**
42 |
43 | _Select all answers that apply_
44 | ```
45 |
46 | +++
47 |
48 | We define a 2-dimensional dataset represented graphically as follows:
49 |
50 | 
51 |
52 | ```{admonition} Question
53 | If we process the dataset using a `StandardScaler` with the default parameters,
54 | which of the following results do you expect:
55 |
56 | 
57 |
58 | - a) Preprocessing A
59 | - b) Preprocessing B
60 | - c) Preprocessing C
61 | - d) Preprocessing D
62 |
63 | _Select a single answer_
64 | ```
65 |
66 | +++
67 |
68 | ```{admonition} Question
69 | Look at the plots and the answers of the previous question. A `StandardScaler`
70 | transformer with the default parameters:
71 |
72 | - a) transforms the features so that they have similar ranges
73 | - b) transforms the features to lie in the [0.0, 1.0] range
74 | - c) transforms feature values that were originally positive-only into values that can
75 | be negative or positive
76 | - d) can help logistic regression converge faster (fewer iterations)
77 |
78 | _Select all answers that apply_
79 | ```
80 |
81 | +++
82 |
83 | ```{admonition} Question
84 | Cross-validation allows us to:
85 |
86 | - a) train the model faster
87 | - b) measure the generalization performance of the model
88 | - c) estimate the variability of the generalization score
89 |
90 | _Select all answers that apply_
91 | ```
92 |
93 | +++
94 |
95 | ```{admonition} Question
96 | `make_pipeline` (as well as `Pipeline`):
97 |
98 | - a) runs a cross-validation using the transformers and predictor given as
99 | parameters
100 | - b) combines one or several transformers and a predictor
101 | - c) tries several models at the same time
102 | - d) plots feature histograms automatically
103 |
104 | _Select all answers that apply_
105 | ```
106 |
--------------------------------------------------------------------------------
/jupyter-book/predictive_modeling_pipeline/02_numerical_pipeline_video_cross_validation.md:
--------------------------------------------------------------------------------
1 | # 🎥 Validation of a model
2 |
10 | To navigate in the slides, **first click on the slides**, then:
11 | - press the **arrow keys** to go to the next/previous slide;
12 | - press **"P"** to toggle presenter mode to see the notes;
13 | - press **"F"** to toggle full-screen mode.
14 |
--------------------------------------------------------------------------------
/jupyter-book/predictive_modeling_pipeline/03_categorical_pipeline_index.md:
--------------------------------------------------------------------------------
1 | # Handling categorical data
2 |
3 | ```{tableofcontents}
4 | ```
5 |
--------------------------------------------------------------------------------
/jupyter-book/predictive_modeling_pipeline/03_categorical_pipeline_quiz_m1_03.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M1.03
2 |
3 | ```{admonition} Question
4 | How are categorical variables represented?
5 |
6 | - a) a categorical feature is only represented by non-numerical data
7 | - b) a categorical feature represents a finite number of values called categories
8 | - c) a categorical feature can be represented by either numerical or non-numerical values
9 |
10 | _Select all answers that apply_
11 | ```
12 |
13 | +++
14 |
15 | ```{admonition} Question
16 | An ordinal variable:
17 |
18 | - a) is a categorical variable with a large number of different categories;
19 | - b) can be represented by integers or string labels;
20 | - c) is a categorical variable with a meaningful order.
21 |
22 | _Select all answers that apply_
23 | ```
24 |
25 | +++
26 |
27 | ```{admonition} Question
28 | One-hot encoding:
29 |
30 | - a) encodes each column with string-labeled values into a single integer-coded column
31 | - b) transforms a numerical variable into a categorical variable
32 | - c) creates one additional column for each possible category
33 | - d) transforms string-labeled variables using a numerical representation
34 |
35 | _Select all answers that apply_
36 | ```
37 |
38 | +++
39 |
40 | ```{admonition} Question
41 |
42 | Assume we have a dataset where each line describes a company. Which of the
43 | following columns should be considered as a meaningful **numerical feature** to
44 | train a machine learning model to classify companies:
45 |
46 | - a) the sector of activity ("construction", "retail", "energy", "insurance"...)
47 | - b) the phone number of the sales department
48 | - c) the number of employees
49 | - d) the profits of the last quarter
50 | - e) the post code of the headquarters
51 |
52 | _Select all answers that apply_
53 | ```
54 |
--------------------------------------------------------------------------------
/jupyter-book/predictive_modeling_pipeline/03_categorical_pipeline_visualization_video.md:
--------------------------------------------------------------------------------
1 | # 🎥 Visualizing scikit-learn pipelines in Jupyter
2 |
--------------------------------------------------------------------------------
/jupyter-book/predictive_modeling_pipeline/predictive_modeling_module_intro.md:
--------------------------------------------------------------------------------
1 | # Module overview
2 |
3 | ## What you will learn
4 |
5 |
6 |
7 | This module will give an example of a typical predictive modeling pipeline
8 | developed using tabular data (data that can be structured in a 2-dimensional
9 | table). We will present this pipeline in a progressive way. First, we will
10 | analyze the dataset used. Subsequently, we will train our first
11 | predictive pipeline with a subset of the dataset. Then, we will give particular
12 | attention to the type of data, numerical and categorical, that our model has to
13 | handle. Finally, we will extend our pipeline to use mixed types of data, i.e.
14 | numerical and categorical data.
15 |
16 | ## Before getting started
17 |
18 |
19 |
20 | The technical skills required to follow this module are:
21 |
22 | - basic knowledge of Python programming
23 | - some prior experience with the NumPy, pandas and Matplotlib libraries is
24 | recommended but not required
25 |
26 |
27 |
28 | For a quick introduction on these requirements, you can use the following resources:
29 | - [Introduction to Python](https://scipy-lectures.org/intro/language/python_language.html)
30 | - [Introduction to NumPy](https://sebastianraschka.com/blog/2020/numpy-intro.html)
31 | - [Introduction to Pandas](https://pandas.pydata.org/docs/user_guide/10min.html)
32 | - [Introduction to Matplotlib](https://sebastianraschka.com/blog/2020/numpy-intro.html#410-matplotlib)
33 |
34 | ## Objectives and time schedule
35 |
36 |
37 |
38 | The objectives of this module are the following:
39 |
40 | - build intuitions regarding an unknown dataset;
41 | - identify and differentiate numerical and categorical features;
42 | - create an advanced predictive pipeline with scikit-learn.
43 |
44 |
45 |
46 | The estimated time to go through this module is about 6 hours.
47 |
--------------------------------------------------------------------------------
/jupyter-book/predictive_modeling_pipeline/predictive_modeling_module_take_away.md:
--------------------------------------------------------------------------------
1 | # Main take-away
2 |
3 | ## Wrap-up
4 |
5 |
6 |
7 | In this module, you learned:
8 |
9 | - to create a scikit-learn predictive model;
10 | - about the scikit-learn API to train and test a predictive model;
11 | - to process numerical data, notably using a `Pipeline`;
12 | - to process categorical data, notably using a `OneHotEncoder` and an
13 | `OrdinalEncoder`;
14 | - to handle and process mixed data types (i.e. numerical and
15 | categorical data), notably using a `ColumnTransformer`; see the sketch below.
16 |
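The sketch below puts these pieces together end to end. It assumes the adult
census dataset is available at the path used throughout the notebooks, and the
column selection by dtype is one possible choice among others:

```python
import pandas as pd
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

adult_census = pd.read_csv("../datasets/adult-census.csv")
target = adult_census["class"]
data = adult_census.drop(columns="class")

# Scale the numerical columns and one-hot encode the categorical ones.
preprocessor = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include="number")),
    (
        OneHotEncoder(handle_unknown="ignore"),
        make_column_selector(dtype_exclude="number"),
    ),
)

# Chain the preprocessing and the predictor, then evaluate by cross-validation.
model = make_pipeline(preprocessor, LogisticRegression(max_iter=1000))
cv_results = cross_validate(model, data, target, cv=5)
print(cv_results["test_score"].mean())
```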
17 | ## To go further
18 |
19 |
20 |
21 | You can refer to the following scikit-learn examples which are related to
22 | the concepts approached during this module:
23 |
24 | - [Predictive machine learning pipeline with mixed data types](https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#sphx-glr-auto-examples-compose-plot-column-transformer-mixed-types-py)
25 | - [Importance of feature scaling](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html#sphx-glr-auto-examples-preprocessing-plot-scaling-importance-py)
26 |
--------------------------------------------------------------------------------
/jupyter-book/python_scripts:
--------------------------------------------------------------------------------
1 | ../python_scripts
--------------------------------------------------------------------------------
/jupyter-book/scikit-learn-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/jupyter-book/scikit-learn-logo.png
--------------------------------------------------------------------------------
/jupyter-book/toc.md:
--------------------------------------------------------------------------------
1 | # Table of contents
2 |
3 | ```{tableofcontents}
4 | ```
5 |
--------------------------------------------------------------------------------
/jupyter-book/trees/slides.md:
--------------------------------------------------------------------------------
1 | # 🎥 Intuitions on tree-based models
2 |
10 | To navigate in the slides, **first click on the slides**, then:
11 | - press the **arrow keys** to go to the next/previous slide;
12 | - press **"P"** to toggle presenter mode to see the notes;
13 | - press **"F"** to toggle full-screen mode.
14 |
--------------------------------------------------------------------------------
/jupyter-book/trees/trees_classification_index.md:
--------------------------------------------------------------------------------
1 | # Decision tree in classification
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/trees/trees_hyperparameters_index.md:
--------------------------------------------------------------------------------
1 | # Hyperparameters of decision tree
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/trees/trees_intuitions_index.md:
--------------------------------------------------------------------------------
1 | # Intuitions on tree-based models
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/trees/trees_module_intro.md:
--------------------------------------------------------------------------------
1 | # Module overview
2 |
3 | ## What you will learn
4 |
5 |
6 |
7 | This module presents decision tree models in detail. These models will be
8 | explained for both classification and regression problems. Besides, we will
9 | show which hyperparameters of decision trees influence their
10 | performance, allowing us to find the best trade-off between underfitting and overfitting.
11 |
12 | ## Before getting started
13 |
14 |
15 |
16 | The technical skills required to follow this module are:
17 |
18 | - skills acquired during the "The Predictive Modeling Pipeline" module with
19 | basic usage of scikit-learn;
20 | - skills acquired during the "Selecting The Best Model" module, mainly around
21 | the concept of underfit/overfit and the usage of cross-validation in
22 | scikit-learn.
23 |
24 |
25 |
26 | ## Objectives and time schedule
27 |
28 |
29 |
30 | The objectives of this module are the following:
31 |
32 | - understand how decision trees work in classification and regression;
33 | - check which tree parameters are important and how they influence the model.
34 |
35 |
36 |
37 | The estimated time to go through this module is about 3 hours.
38 |
--------------------------------------------------------------------------------
/jupyter-book/trees/trees_module_take_away.md:
--------------------------------------------------------------------------------
1 | # Main take-away
2 |
3 | ## Wrap-up
4 |
5 |
6 |
7 | In this module, we presented decision trees in detail. We saw that they:
8 |
9 | - are suited for both regression and classification problems;
10 | - are non-parametric models;
11 | - are not able to extrapolate (see the sketch below);
12 | - are sensitive to hyperparameter tuning.
13 |
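As a minimal sketch of the extrapolation point (on made-up 1D data, purely for
illustration), note how the prediction freezes at the value of a boundary leaf
outside the range seen during `fit`:

```python
import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(0)
X_train = rng.uniform(0, 10, size=(100, 1))
y_train = 3.0 * X_train.ravel() + rng.normal(scale=1.0, size=100)

tree = DecisionTreeRegressor(max_depth=3, random_state=0)
tree.fit(X_train, y_train)

# Inside [0, 10] the piecewise-constant prediction follows the trend, but at
# x=20 and x=-5 the tree simply repeats its right/left boundary leaf values.
print(tree.predict([[5.0], [20.0], [-5.0]]))
```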
14 | ## To go further
15 |
16 |
17 |
18 | You can refer to the following scikit-learn examples which are related to
19 | the concepts approached during this module:
20 |
21 | - [Example of decision tree regressor](https://scikit-learn.org/stable/auto_examples/tree/plot_tree_regression.html#sphx-glr-auto-examples-tree-plot-tree-regression-py)
22 | - [Example of decision tree classifier](https://scikit-learn.org/stable/auto_examples/tree/plot_iris_dtc.html#sphx-glr-auto-examples-tree-plot-iris-dtc-py)
23 | - [Understanding the tree structure in scikit-learn](https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html#sphx-glr-auto-examples-tree-plot-unveil-tree-structure-py)
24 | - [Post-pruning decision trees](https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py)
25 |
--------------------------------------------------------------------------------
/jupyter-book/trees/trees_quiz_m5_01.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M5.01
2 |
3 | ```{admonition} Question
4 | From the presentation given in the video, to which kinds of supervised learning
5 | tasks can decision trees be applied:
6 |
7 | - a) classification tasks
8 | - b) regression tasks
9 | - c) clustering tasks
10 |
11 | _Select all answers that apply_
12 | ```
13 |
14 | +++
15 |
16 | ```{admonition} Question
17 | A given split node in a decision tree classifier makes:
18 |
19 | - a) a binary decision considering a single feature at a time
20 | - b) a binary decision considering a combination of all the input features
21 | - c) multiple binary decisions considering a single feature
22 | - d) a binary decision considering a non-linear combination of all input
23 | features
24 |
25 | _Select a single answer_
26 | ```
27 |
28 | +++
29 |
30 | ```{admonition} Question
31 | Which aspect of the decision tree learning procedure is most typically used to
32 | control the underfitting/overfitting trade-off?
33 |
34 | - a) The number of children of a split node
35 | - b) The magnitude of the weight coefficients
36 | - c) The maximum depth of the decision tree
37 |
38 | _Select a single answer_
39 | ```
40 |
--------------------------------------------------------------------------------
/jupyter-book/trees/trees_quiz_m5_02.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M5.02
2 |
3 | ```{admonition} Question
4 | For a decision tree built in scikit-learn, a split:
5 |
6 | - a) will use a single feature to create a rule
7 | - b) will use a combination of the features to create a rule
8 | - c) will create multiple separations, one for each class
9 |
10 | _Select a single answer_
11 | ```
12 |
13 | +++
14 |
15 | ```{admonition} Question
16 | Trees are built incrementally:
17 |
18 | - a) by splitting data over and over
19 | - b) by refining the rules of each node
20 | - c) by refining the rules of each leaf
21 |
22 | _Select a single answer_
23 | ```
24 |
25 | +++
26 |
27 | ```{admonition} Question
28 | A decision tree split is built:
29 |
30 | - a) using a random threshold
31 | - b) using the median value of a single feature as a threshold
32 | - c) using a threshold that minimizes an error
33 |
34 | _Select all answers that apply_
35 | ```
36 |
--------------------------------------------------------------------------------
/jupyter-book/trees/trees_quiz_m5_03.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M5.03
2 |
3 | ```{admonition} Question
4 | When fitting a decision tree regressor in scikit-learn, the predicted values on
5 | a leaf correspond to:
6 |
7 | - a) the median of the training samples at this node
8 | - b) the mean of the training samples at this node
9 | - c) the most frequent value of the training samples at this node
10 |
11 | _Select a single answer_
12 | ```
13 |
14 | +++
15 |
16 | ```{admonition} Question
17 | Decision tree regressors can predict:
18 |
19 | - a) any values, including values larger or smaller than those observed in `y_train`;
20 | - b) only values in the range from `np.min(y_train)` to `np.max(y_train)`.
21 |
22 | _Select a single answer_
23 | ```
24 |
25 | +++
26 |
27 | ```{admonition} Question
28 | The predictions of a tree regressor correspond to:
29 |
30 | - a) a piecewise-linear function
31 | - b) a piecewise-constant function
32 | - c) a piecewise-cubic function
33 |
34 | _Select a single answer_
35 | ```
36 |
--------------------------------------------------------------------------------
/jupyter-book/trees/trees_quiz_m5_04.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M5.04
2 |
3 | ```{admonition} Question
4 | If a decision tree is overfitting, you need to increase the maximum depth.
5 |
6 | - a) True
7 | - b) False
8 |
9 | _Select a single answer_
10 | ```
11 |
12 | +++
13 |
14 | ```{admonition} Question
15 | How should you choose the maximum depth of a decision tree?
16 |
17 | - a) choosing the depth maximizing the score on a validation set with a
18 | cross-validation, with a grid-search for instance
19 | - b) choosing the depth maximizing the score on the train set
20 | - c) choosing the depth maximizing the score on the test set
21 |
22 | _Select all answers that apply_
23 | ```
24 |
--------------------------------------------------------------------------------
/jupyter-book/trees/trees_regression_index.md:
--------------------------------------------------------------------------------
1 | # Decision tree in regression
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/tuning/parameter_tuning_automated_index.md:
--------------------------------------------------------------------------------
1 | # Automated tuning
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/tuning/parameter_tuning_manual_index.md:
--------------------------------------------------------------------------------
1 | # Manual tuning
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/tuning/parameter_tuning_manual_quiz_m3_01.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M3.01
2 |
3 | ```{admonition} Question
4 | Which parameters below are hyperparameters of `HistGradientBoostingClassifier`?
5 | Remember we only consider hyperparameters to be those that potentially impact
6 | the result of the learning procedure and subsequent predictions.
7 |
8 | - a) `C`
9 | - b) `max_leaf_nodes`
10 | - c) `verbose`
11 | - d) `classes_`
12 | - e) `learning_rate`
13 |
14 | _Select all answers that apply_
15 | ```
16 |
17 | +++
18 |
19 | ````{admonition} Question
20 | Given an instance named `model` as defined by:
21 | ```python
22 | from sklearn.linear_model import LogisticRegression
23 | model = LogisticRegression()
24 | ```
25 |
26 | how do you get the value of the `C` parameter?
27 | - a) `model.get_parameters()['C']`
28 | - b) `model.get_params()['C']`
29 | - c) `model.get_params('C')`
30 | - d) `model.get_params['C']`
31 |
32 | _Select a single answer_
33 | ````
34 |
35 | +++
36 |
37 | ````{admonition} Question
38 | Given `model` defined by:
39 | ```python
40 | from sklearn.linear_model import LogisticRegression
41 |
42 | model = LogisticRegression()
43 | ```
44 |
45 | how do you set the value of the `C` parameter to `5`?
46 | - a) `model.set_params('C', 5)`
47 | - b) `model.set_params({'C': 5})`
48 | - c) `model.set_params()['C'] = 5`
49 | - d) `model.set_params(C=5)`
50 |
51 | _Select a single answer_
52 | ````
53 |
54 | +++
55 |
56 | ````{admonition} Question
57 | Given `model` defined by:
58 | ```python
59 | from sklearn.preprocessing import StandardScaler
60 | from sklearn.linear_model import LogisticRegression
61 | from sklearn.pipeline import Pipeline
62 |
63 | model = Pipeline([
64 | ('scaler', StandardScaler()),
65 | ('classifier', LogisticRegression())
66 | ])
67 | ```
68 |
69 | how do you set the value of the `C` parameter of the `LogisticRegression` component to 5:
70 | - a) `model.set_params(C=5) `
71 | - b) `model.set_params(logisticregression__C=5)`
72 | - c) `model.set_params(classifier__C=5) `
73 | - d) `model.set_params(classifier--C=5)`
74 |
75 | _Select a single answer_
76 | ````
77 |
--------------------------------------------------------------------------------
/jupyter-book/tuning/parameter_tuning_module_intro.md:
--------------------------------------------------------------------------------
1 | # Module overview
2 |
3 | ## What you will learn
4 |
5 |
6 |
7 | In the previous modules, we showed how to create, train, predict, and even
8 | evaluate a predictive model. However, we did not change the models'
9 | parameters that can be given when creating an instance. For example,
10 | for k-nearest neighbors, we initially used this default parameter:
11 | `n_neighbors=5` before trying other model parameters.
12 |
13 | These parameters are called **hyperparameters**: they are parameters
14 | used to control the learning process, for instance the parameter `k`
15 | of the k-nearest neighbors. Hyperparameters are specified by the user,
16 | often manually tuned (or by an exhaustive automatic search), and
17 | cannot be estimated from the data. They should not be confused with
18 | the other parameters that are inferred during the training
19 | process. These parameters define the model itself, for instance
20 | `coef_` for the linear models.
21 |
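As a minimal sketch of this distinction (on a toy dataset, purely for
illustration):

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=100, random_state=0)

model = LogisticRegression(C=0.5)  # hyperparameter: chosen by the user
print(model.get_params()["C"])     # readable even before fitting -> 0.5

model.fit(X, y)
print(model.coef_)                 # fitted parameter: only exists after `fit`

model.set_params(C=2.0)            # hyperparameters can be changed...
model.fit(X, y)                    # ...and the model refitted
```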
22 | In this module, we will first show that the hyperparameters have an impact on
23 | the performance of the model and that default values are not necessarily the
24 | best option. Subsequently, we will show how to set hyperparameters in a
25 | scikit-learn model. Finally, we will show strategies that allow picking a
26 | combination of hyperparameters that maximizes the model's performance.
27 |
28 | ## Before getting started
29 |
30 |
31 |
32 | The technical skills required to follow this module are:
33 |
34 | - skills acquired during the "The Predictive Modeling Pipeline" module with
35 | basic usage of scikit-learn;
36 | - skills related to using the cross-validation framework to evaluate a model.
37 |
38 |
39 |
40 | ## Objectives and time schedule
41 |
42 |
43 |
44 | The objectives of this module are the following:
45 |
46 | - understand what a model hyperparameter is;
47 | - understand how to get and set the value of a hyperparameter in a scikit-learn
48 | model;
49 | - be able to fine-tune a full predictive modeling pipeline;
50 | - understand and visualize the combination of parameters that improves the
51 | performance of a model.
52 |
53 |
54 |
55 | The estimated time to go through this module is about 3 hours.
56 |
--------------------------------------------------------------------------------
/jupyter-book/tuning/parameter_tuning_module_take_away.md:
--------------------------------------------------------------------------------
1 | # Main take-away
2 |
3 | ## Wrap-up
4 |
5 |
6 |
7 | - Hyperparameters have an impact on the models' performance and should be
8 | wisely chosen;
9 | - The search for the best hyperparameters can be automated with a grid-search
10 | approach or a randomized-search approach;
11 | - A grid-search can be computationally expensive and becomes less attractive as
12 | the number of hyperparameters to explore increases. Moreover, the combinations
13 | are sampled on a fixed, regular grid.
14 | - A randomized-search allows exploring within a fixed budget, even as the number
15 | of hyperparameters increases. In this case, combinations can be sampled either
16 | on a regular grid or from a given distribution. Both strategies are sketched below.
17 |
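Here is a minimal sketch of both strategies (the toy dataset, the parameter
names and the ranges are illustrative, not the course's reference solution):

```python
from scipy.stats import loguniform
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

X, y = make_classification(n_samples=200, random_state=0)
model = LogisticRegression()

# Grid-search: every combination of a fixed, regular grid is evaluated.
grid = GridSearchCV(model, param_grid={"C": [0.01, 0.1, 1, 10]}, cv=5)
grid.fit(X, y)
print(grid.best_params_)

# Randomized-search: a fixed budget of `n_iter` candidates is sampled, here
# from a log-uniform distribution over C.
rand = RandomizedSearchCV(
    model,
    param_distributions={"C": loguniform(1e-3, 1e3)},
    n_iter=10,
    cv=5,
    random_state=0,
)
rand.fit(X, y)
print(rand.best_params_)
```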
18 | ## To go further
19 |
20 |
21 |
22 | You can refer to the following scikit-learn examples which are related to
23 | the concepts approached during this module:
24 |
25 | - [Example of a grid-search](https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py)
26 | - [Example of a randomized-search](https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html#sphx-glr-auto-examples-model-selection-plot-randomized-search-py)
27 | - [Example of a nested cross-validation](https://scikit-learn.org/stable/auto_examples/model_selection/plot_nested_cross_validation_iris.html#sphx-glr-auto-examples-model-selection-plot-nested-cross-validation-iris-py)
28 |
--------------------------------------------------------------------------------
/jupyter-book/tuning/parameter_tuning_parallel_plot_video.md:
--------------------------------------------------------------------------------
1 | # 🎥 Analysis of hyperparameter search results
2 |
--------------------------------------------------------------------------------
/local-install-instructions.md:
--------------------------------------------------------------------------------
1 | # Local install instructions
2 |
3 | The course uses Python 3 and some data analysis packages such as NumPy, pandas,
4 | scikit-learn, and Matplotlib.
5 |
6 | ## Install Miniconda
7 |
8 | **This step is only necessary if you don't have conda installed already**:
9 |
10 | - download the Miniconda installer for your operating system (Windows, macOS
11 | or Linux) [here](https://docs.conda.io/en/latest/miniconda.html)
12 | - run the installer following the instructions
13 | [here](https://conda.io/projects/conda/en/latest/user-guide/install/index.html#regular-installation)
14 | depending on your operating system.
15 |
16 | ## Create conda environment
17 |
18 | ```sh
19 | # Clone this repo
20 | git clone https://github.com/INRIA/scikit-learn-mooc
21 | cd scikit-learn-mooc
22 | # Create a conda environment with the required packages for this tutorial:
23 | conda env create -f environment.yml
24 | ```
25 |
26 | ## Check your install
27 |
28 | To make sure you have all the necessary packages installed, we **strongly
29 | recommend** you to execute the `check_env.py` script located at the root of
30 | this repository:
31 |
32 | ```sh
33 | # Activate your conda environment
34 | conda activate scikit-learn-course
35 | python check_env.py
36 | ```
37 |
38 | Make sure that there is no `FAIL` in the output when running the `check_env.py`
39 | script, i.e. that its output looks similar to this:
40 |
41 | ```
42 | Using python in /home/lesteve/miniconda3/envs/scikit-learn-course
43 | 3.9.1 | packaged by conda-forge | (default, Jan 10 2021, 02:55:42)
44 | [GCC 9.3.0]
45 |
46 | [ OK ] numpy version 1.19.5
47 | [ OK ] scipy version 1.6.0
48 | [ OK ] matplotlib version 3.3.3
49 | [ OK ] sklearn version 1.6
50 | [ OK ] pandas version 2.0
51 | [ OK ] seaborn version 0.13
52 | [ OK ] notebook version 6.2.0
53 | [ OK ] plotly version 5.10.0
54 | ```
55 |
56 | ## Run Jupyter notebooks locally
57 |
58 | ```sh
59 | # Activate your conda environment
60 | conda activate scikit-learn-course
61 | jupyter notebook full-index.ipynb
62 | ```
63 |
64 | `full-index.ipynb` is an index file helping to navigate the notebooks.
65 | All the Jupyter notebooks are located in the `notebooks` folder.
66 |
--------------------------------------------------------------------------------
/notebooks/01_tabular_data_exploration_ex_01.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# \ud83d\udcdd Exercise M1.01"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Imagine we are interested in predicting penguins species based on two of their\n",
15 | "body measurements: culmen length and culmen depth. First we want to do some\n",
16 | "data exploration to get a feel for the data.\n",
17 | "\n",
18 | "What are the features? What is the target?"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "The data is located in `../datasets/penguins_classification.csv`, load it with\n",
26 | "`pandas` into a `DataFrame`."
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "# Write your code here."
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Show a few samples of the data.\n",
43 | "\n",
44 | "How many features are numerical? How many features are categorical?"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "# Write your code here."
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "What are the different penguins species available in the dataset and how many\n",
61 | "samples of each species are there? Hint: select the right column and use the\n",
62 | "[`value_counts`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.value_counts.html)\n",
63 | "method."
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "# Write your code here."
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {},
78 | "source": [
79 | "Plot histograms for the numerical features"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "# Write your code here."
89 | ]
90 | },
91 | {
92 | "cell_type": "markdown",
93 | "metadata": {},
94 | "source": [
95 | "Show features distribution for each class. Hint: use\n",
96 | "[`seaborn.pairplot`](https://seaborn.pydata.org/generated/seaborn.pairplot.html)"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "# Write your code here."
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "Looking at these distributions, how hard do you think it would be to classify\n",
113 | "the penguins only using `\"culmen depth\"` and `\"culmen length\"`?"
114 | ]
115 | }
116 | ],
117 | "metadata": {
118 | "jupytext": {
119 | "main_language": "python"
120 | },
121 | "kernelspec": {
122 | "display_name": "Python 3",
123 | "name": "python3"
124 | }
125 | },
126 | "nbformat": 4,
127 | "nbformat_minor": 5
128 | }
--------------------------------------------------------------------------------
/notebooks/ensemble_ex_01.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# \ud83d\udcdd Exercise M6.01\n",
8 | "\n",
9 | "The aim of this notebook is to investigate if we can tune the hyperparameters\n",
10 | "of a bagging regressor and evaluate the gain obtained.\n",
11 | "\n",
12 | "We will load the California housing dataset and split it into a training and a\n",
13 | "testing set."
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "from sklearn.datasets import fetch_california_housing\n",
23 | "from sklearn.model_selection import train_test_split\n",
24 | "\n",
25 | "data, target = fetch_california_housing(as_frame=True, return_X_y=True)\n",
26 | "target *= 100 # rescale the target in k$\n",
27 | "data_train, data_test, target_train, target_test = train_test_split(\n",
28 | " data, target, random_state=0, test_size=0.5\n",
29 | ")"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "<div class=\"admonition note alert alert-info\">\n",
37 | "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Note</p>\n",
38 | "<p class=\"last\">If you want a deeper overview regarding this dataset, you can refer to the\n",
39 | "Appendix - Datasets description section at the end of this MOOC.</p>\n",
40 | "</div>"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {},
46 | "source": [
47 | "Create a `BaggingRegressor` and provide a `DecisionTreeRegressor` to its\n",
48 | "parameter `estimator`. Train the regressor and evaluate its generalization\n",
49 | "performance on the testing set using the mean absolute error."
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "# Write your code here."
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "Now, create a `RandomizedSearchCV` instance using the previous model and tune\n",
66 | "the important parameters of the bagging regressor. Find the best parameters\n",
67 | "and check if you are able to find a set of parameters that improve the default\n",
68 | "regressor still using the mean absolute error as a metric.\n",
69 | "\n",
70 | "<div class=\"admonition tip alert alert-warning\">\n",
71 | "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Tip</p>\n",
72 | "<p class=\"last\">You can list the bagging regressor's parameters using the <tt>get_params</tt> method.</p>\n",
39 | "<p class=\"last\">If you want a deeper overview regarding this dataset, you can refer to the\n",
40 | "Appendix - Datasets description section at the end of this MOOC.</p>\n",
41 | "</div>"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "The first step will be to create a linear regression model."
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "# Write your code here."
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "Then, use the `cross_val_score` to estimate the generalization performance of\n",
65 | "the model. Use a `KFold` cross-validation with 10 folds. Make the use of the\n",
66 | "$R^2$ score explicit by assigning the parameter `scoring` (even though it is\n",
67 | "the default score)."
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "# Write your code here."
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "Then, instead of using the $R^2$ score, use the mean absolute error (MAE). You\n",
84 | "may need to refer to the documentation for the `scoring` parameter."
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "# Write your code here."
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "Finally, use the `cross_validate` function and compute multiple scores/errors\n",
101 | "at once by passing a list of scorers to the `scoring` parameter. You can\n",
102 | "compute the $R^2$ score and the mean absolute error for instance."
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "# Write your code here."
112 | ]
113 | }
114 | ],
115 | "metadata": {
116 | "jupytext": {
117 | "main_language": "python"
118 | },
119 | "kernelspec": {
120 | "display_name": "Python 3",
121 | "name": "python3"
122 | }
123 | },
124 | "nbformat": 4,
125 | "nbformat_minor": 5
126 | }
--------------------------------------------------------------------------------
/one-day-course-index.md:
--------------------------------------------------------------------------------
1 | # The predictive modeling pipeline
2 |
3 | ## Tabular data exploration
4 |
5 | - [First look at our dataset](./notebooks/01_tabular_data_exploration.ipynb)
6 | - [Exercise 01](./notebooks/01_tabular_data_exploration_ex_01.ipynb)
7 |
8 | ## Fitting a scikit-learn model on numerical data
9 |
10 | - [First model with scikit-learn](./notebooks/02_numerical_pipeline_introduction.ipynb)
11 | - [Exercise 01](./notebooks/02_numerical_pipeline_ex_00.ipynb)
12 | - [Working with numerical data](./notebooks/02_numerical_pipeline_hands_on.ipynb)
13 | - [Exercise 02](./notebooks/02_numerical_pipeline_ex_01.ipynb)
14 | - [Preprocessing for numerical features](./notebooks/02_numerical_pipeline_scaling.ipynb)
15 |
16 | ## Handling categorical data
17 |
18 | - [Encoding of categorical variables](./notebooks/03_categorical_pipeline.ipynb)
19 | - [Exercise 01](./notebooks/03_categorical_pipeline_ex_01.ipynb)
20 | - [Using numerical and categorical variables together](./notebooks/03_categorical_pipeline_column_transformer.ipynb)
21 | - [Exercise 02](./notebooks/03_categorical_pipeline_ex_02.ipynb)
22 |
23 | # Hyperparameter tuning
24 |
25 | ## Manual tuning
26 |
27 | - [Set and get hyperparameters in scikit-learn](./notebooks/parameter_tuning_manual.ipynb)
28 | - [Exercise 01](./notebooks/parameter_tuning_ex_02.ipynb)
29 |
30 | ## Automated tuning
31 |
32 | - [Hyperparameter tuning by grid-search](./notebooks/parameter_tuning_grid_search.ipynb)
33 | - [Hyperparameter tuning by randomized-search](./notebooks/parameter_tuning_randomized_search.ipynb)
34 | - [Exercise 02](./notebooks/parameter_tuning_ex_03.ipynb)
35 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.black]
2 | line-length = 79
3 | target_version = ['py38', 'py39', 'py310', 'py311']
4 | preview = true
5 | exclude = '''
6 | /(
7 | \.eggs # exclude a few common directories in the
8 | | \.git # root of the project
9 | | \.mypy_cache
10 | | \.vscode
11 | | build
12 | | dist
13 | )/
14 | '''
15 |
16 | [tool.ruff.lint]
17 | ignore = [
18 | 'E402', # module level import not at top of file
19 | 'F401', # imported but unused
20 | 'E501', # line too long
21 | 'E203', # whitespace before ':'
22 | ]
23 |
--------------------------------------------------------------------------------
/python_scripts/01_tabular_data_exploration_ex_01.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M1.01
16 |
17 | # %% [markdown]
18 | # Imagine we are interested in predicting penguin species based on two of their
19 | # body measurements: culmen length and culmen depth. First we want to do some
20 | # data exploration to get a feel for the data.
21 | #
22 | # What are the features? What is the target?
23 |
24 | # %% [markdown]
25 | # The data is located in `../datasets/penguins_classification.csv`, load it with
26 | # `pandas` into a `DataFrame`.
27 |
28 | # %%
29 | # Write your code here.
30 |
31 | # %% [markdown]
32 | # Show a few samples of the data.
33 | #
34 | # How many features are numerical? How many features are categorical?
35 |
36 | # %%
37 | # Write your code here.
38 |
39 | # %% [markdown]
40 | # What are the different penguin species available in the dataset and how many
41 | # samples of each species are there? Hint: select the right column and use the
42 | # [`value_counts`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.value_counts.html)
43 | # method.
44 |
45 | # %%
46 | # Write your code here.
47 |
48 | # %% [markdown]
49 | # Plot histograms for the numerical features
50 |
51 | # %%
52 | # Write your code here.
53 |
54 | # %% [markdown]
55 | # Show features distribution for each class. Hint: use
56 | # [`seaborn.pairplot`](https://seaborn.pydata.org/generated/seaborn.pairplot.html)
57 |
58 | # %%
59 | # Write your code here.
60 |
61 | # %% [markdown]
62 | # Looking at these distributions, how hard do you think it would be to classify
63 | # the penguins only using `"culmen depth"` and `"culmen length"`?
64 |
--------------------------------------------------------------------------------
/python_scripts/01_tabular_data_exploration_sol_01.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # kernelspec:
4 | # display_name: Python 3
5 | # name: python3
6 | # ---
7 |
8 | # %% [markdown]
9 | # # 📃 Solution for Exercise M1.01
10 |
11 | # %% [markdown]
12 | # Imagine we are interested in predicting penguin species based on two of their
13 | # body measurements: culmen length and culmen depth. First we want to do some
14 | # data exploration to get a feel for the data.
15 | #
16 | # What are the features? What is the target?
17 |
18 | # %% [markdown] tags=["solution"]
19 | # The features are `"culmen length"` and `"culmen depth"`. The target is the
20 | # penguin species.
21 |
22 | # %% [markdown]
23 | # The data is located in `../datasets/penguins_classification.csv`, load it with
24 | # `pandas` into a `DataFrame`.
25 |
26 | # %%
27 | # solution
28 | import pandas as pd
29 |
30 | penguins = pd.read_csv("../datasets/penguins_classification.csv")
31 |
32 | # %% [markdown]
33 | # Show a few samples of the data.
34 | #
35 | # How many features are numerical? How many features are categorical?
36 |
37 | # %% [markdown] tags=["solution"]
38 | # Both features, `"culmen length"` and `"culmen depth"` are numerical. There are
39 | # no categorical features in this dataset.
40 |
41 | # %%
42 | # solution
43 | penguins.head()
44 |
45 | # %% [markdown]
46 | # What are the different penguin species available in the dataset and how many
47 | # samples of each species are there? Hint: select the right column and use the
48 | # [`value_counts`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.value_counts.html)
49 | # method.
50 |
51 | # %%
52 | # solution
53 | penguins["Species"].value_counts()
54 |
55 | # %% [markdown]
56 | # Plot histograms for the numerical features
57 |
58 | # %%
59 | # solution
60 | _ = penguins.hist(figsize=(8, 4))
61 |
62 | # %% [markdown]
63 | # Show features distribution for each class. Hint: use
64 | # [`seaborn.pairplot`](https://seaborn.pydata.org/generated/seaborn.pairplot.html)
65 |
66 | # %%
67 | # solution
68 | import seaborn
69 |
70 | pairplot_figure = seaborn.pairplot(penguins, hue="Species")
71 |
72 | # %% [markdown] tags=["solution"]
73 | # We observe that the labels on the axis are overlapping. Even if it is not the
74 | # priority of this notebook, one can tweak them by increasing the height of each
75 | # subfigure.
76 |
77 | # %% tags=["solution"]
78 | pairplot_figure = seaborn.pairplot(penguins, hue="Species", height=4)
79 |
80 | # %% [markdown]
81 | # Looking at these distributions, how hard do you think it would be to classify
82 | # the penguins only using `"culmen depth"` and `"culmen length"`?
83 |
84 | # %% [markdown] tags=["solution"]
85 | # Looking at the previous scatter-plot showing `"culmen length"` and `"culmen
86 | # depth"`, the species are reasonably well separated:
87 | # - low culmen length -> Adelie
88 | # - low culmen depth -> Gentoo
89 | # - high culmen depth and high culmen length -> Chinstrap
90 | #
91 | # There is some small overlap between the species, so we can expect a
92 | # statistical model to perform well on this dataset but not perfectly.
93 |
--------------------------------------------------------------------------------
/python_scripts/02_numerical_pipeline_ex_00.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M1.02
16 | #
17 | # The goal of this exercise is to fit a similar model as in the previous
18 | # notebook to get familiar with manipulating scikit-learn objects and in
19 | # particular the `.fit/.predict/.score` API.
20 |
21 | # %% [markdown]
22 | # Let's load the adult census dataset with only numerical variables
23 |
24 | # %%
25 | import pandas as pd
26 |
27 | adult_census = pd.read_csv("../datasets/adult-census-numeric.csv")
28 | data = adult_census.drop(columns="class")
29 | target = adult_census["class"]
30 |
31 | # %% [markdown]
32 | # In the previous notebook we used `model = KNeighborsClassifier()`. All
33 | # scikit-learn models can be created without arguments. This is convenient
34 | # because it means that you don't need to understand the full details of a model
35 | # before starting to use it.
36 | #
37 | # One of the `KNeighborsClassifier` parameters is `n_neighbors`. It controls the
38 | # number of neighbors we are going to use to make a prediction for a new data
39 | # point.
40 | #
41 | # What is the default value of the `n_neighbors` parameter?
42 | #
43 | # **Hint**: Look at the documentation on the [scikit-learn
44 | # website](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)
45 | # or directly access the description inside your notebook by running the
46 | # following cell. This opens a pager pointing to the documentation.
47 |
48 | # %%
49 | from sklearn.neighbors import KNeighborsClassifier
50 |
51 | # KNeighborsClassifier?
52 |
53 | # %% [markdown]
54 | # Create a `KNeighborsClassifier` model with `n_neighbors=50`
55 |
56 | # %%
57 | # Write your code here.
58 |
59 | # %% [markdown]
60 | # Fit this model on the data and target loaded above
61 |
62 | # %%
63 | # Write your code here.
64 |
65 | # %% [markdown]
66 | # Use your model to make predictions on the first 10 data points inside the
67 | # data. Do they match the actual target values?
68 |
69 | # %%
70 | # Write your code here.
71 |
72 | # %% [markdown]
73 | # Compute the accuracy on the training data.
74 |
75 | # %%
76 | # Write your code here.
77 |
78 | # %% [markdown]
79 | # Now load the test data from `"../datasets/adult-census-numeric-test.csv"` and
80 | # compute the accuracy on the test data.
81 |
82 | # %%
83 | # Write your code here.
84 |
--------------------------------------------------------------------------------
/python_scripts/02_numerical_pipeline_ex_01.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M1.03
16 | #
17 | # The goal of this exercise is to compare the performance of our classifier in
18 | # the previous notebook (roughly 81% accuracy with `LogisticRegression`) to some
19 | # simple baseline classifiers. The simplest baseline classifier is one that
20 | # always predicts the same class, irrespective of the input data.
21 | #
22 | # - What would be the score of a model that always predicts `' >50K'`?
23 | # - What would be the score of a model that always predicts `' <=50K'`?
24 | # - Is 81% or 82% accuracy a good score for this problem?
25 | #
26 | # Use a `DummyClassifier` and do a train-test split to evaluate its accuracy on
27 | # the test set. This
28 | # [link](https://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators)
29 | # shows a few examples of how to evaluate the generalization performance of
30 | # these baseline models.
31 |
32 | # %%
33 | import pandas as pd
34 |
35 | adult_census = pd.read_csv("../datasets/adult-census.csv")
36 |
37 | # %% [markdown]
38 | # We first split our dataset to have the target separated from the data used to
39 | # train our predictive model.
40 |
41 | # %%
42 | target_name = "class"
43 | target = adult_census[target_name]
44 | data = adult_census.drop(columns=target_name)
45 |
46 | # %% [markdown]
47 | # We start by selecting only the numerical columns as seen in the previous
48 | # notebook.
49 |
50 | # %%
51 | numerical_columns = ["age", "capital-gain", "capital-loss", "hours-per-week"]
52 |
53 | data_numeric = data[numerical_columns]
54 |
55 | # %% [markdown]
56 | # Split the data and target into a train and test set.
57 |
58 | # %%
59 | from sklearn.model_selection import train_test_split
60 |
61 | # Write your code here.
62 |
63 | # %% [markdown]
64 | # Use a `DummyClassifier` such that the resulting classifier always predicts
65 | # the class `' >50K'`. What is the accuracy score on the test set? Repeat the
66 | # experiment by always predicting the class `' <=50K'`.
67 | #
68 | # Hint: you can set the `strategy` parameter of the `DummyClassifier` to achieve
69 | # the desired behavior.
70 |
71 | # %%
72 | from sklearn.dummy import DummyClassifier
73 |
74 | # Write your code here.
75 |
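76 | # %% [markdown]
77 | # Below is a minimal sketch of one possible approach, not the official
78 | # solution. It assumes the class labels keep their leading space, as stored in
79 | # the raw CSV file.
80 |
81 | # %%
82 | data_train, data_test, target_train, target_test = train_test_split(
83 |     data_numeric, target, random_state=42
84 | )
85 | # A baseline that constantly predicts the high-revenue class.
86 | high_revenue_clf = DummyClassifier(strategy="constant", constant=" >50K")
87 | high_revenue_clf.fit(data_train, target_train)
88 | high_revenue_clf.score(data_test, target_test)
89 |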
--------------------------------------------------------------------------------
/python_scripts/02_numerical_pipeline_sol_00.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # kernelspec:
4 | # display_name: Python 3
5 | # name: python3
6 | # ---
7 |
8 | # %% [markdown]
9 | # # 📃 Solution for Exercise M1.02
10 | #
11 | # The goal of this exercise is to fit a similar model as in the previous
12 | # notebook to get familiar with manipulating scikit-learn objects and in
13 | # particular the `.fit/.predict/.score` API.
14 |
15 | # %% [markdown]
16 | # Let's load the adult census dataset with only numerical variables.
17 |
18 | # %%
19 | import pandas as pd
20 |
21 | adult_census = pd.read_csv("../datasets/adult-census-numeric.csv")
22 | data = adult_census.drop(columns="class")
23 | target = adult_census["class"]
24 |
25 | # %% [markdown]
26 | # In the previous notebook we used `model = KNeighborsClassifier()`. All
27 | # scikit-learn models can be created without arguments. This is convenient
28 | # because it means that you don't need to understand the full details of a model
29 | # before starting to use it.
30 | #
31 | # One of the `KNeighborsClassifier` parameters is `n_neighbors`. It controls the
32 | # number of neighbors we are going to use to make a prediction for a new data
33 | # point.
34 | #
35 | # What is the default value of the `n_neighbors` parameter?
36 | #
37 | # **Hint**: Look at the documentation on the [scikit-learn
38 | # website](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)
39 | # or directly access the description inside your notebook by running the
40 | # following cell. This opens a pager pointing to the documentation.
41 |
42 | # %%
43 | from sklearn.neighbors import KNeighborsClassifier
44 |
45 | # KNeighborsClassifier?
46 |
47 | # %% [markdown] tags=["solution"]
48 | # We can see that the default value for `n_neighbors` is 5.
49 |
50 | # %% [markdown]
51 | # Create a `KNeighborsClassifier` model with `n_neighbors=50`
52 |
53 | # %%
54 | # solution
55 | model = KNeighborsClassifier(n_neighbors=50)
56 |
57 | # %% [markdown]
58 | # Fit this model on the data and target loaded above
59 |
60 | # %%
61 | # solution
62 | model.fit(data, target)
63 |
64 | # %% [markdown]
65 | # Use your model to make predictions on the first 10 data points inside the
66 | # data. Do they match the actual target values?
67 |
68 | # %%
69 | # solution
70 | first_data_values = data.iloc[:10]
71 | first_predictions = model.predict(first_data_values)
72 | first_predictions
73 |
74 | # %% tags=["solution"]
75 | first_target_values = target.iloc[:10]
76 | first_target_values
77 |
78 | # %% tags=["solution"]
79 | number_of_correct_predictions = (
80 | first_predictions == first_target_values
81 | ).sum()
82 | number_of_predictions = len(first_predictions)
83 | print(
84 | f"{number_of_correct_predictions}/{number_of_predictions} "
85 | "of predictions are correct"
86 | )
87 |
88 | # %% [markdown]
89 | # Compute the accuracy on the training data.
90 |
91 | # %%
92 | # solution
93 | model.score(data, target)
94 |
95 | # %% [markdown]
96 | # Now load the test data from `"../datasets/adult-census-numeric-test.csv"` and
97 | # compute the accuracy on the test data.
98 |
99 | # %%
100 | # solution
101 | adult_census_test = pd.read_csv("../datasets/adult-census-numeric-test.csv")
102 |
103 | data_test = adult_census_test.drop(columns="class")
104 | target_test = adult_census_test["class"]
105 |
106 | model.score(data_test, target_test)
107 |
108 | # %% [markdown] tags=["solution"]
109 | # Looking at the previous notebook, the accuracy seems slightly higher with
110 | # `n_neighbors=50` than with `n_neighbors=5` (the default value).
111 |
--------------------------------------------------------------------------------
/python_scripts/cross_validation_ex_02.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M7.01
16 | #
17 | # In this exercise we will define dummy classification baselines and use them as
18 | # reference to assess the relative predictive performance of a given model of
19 | # interest.
20 | #
21 | # We illustrate those baselines with the help of the Adult Census dataset, using
22 | # only the numerical features for the sake of simplicity.
23 |
24 | # %%
25 | import pandas as pd
26 |
27 | adult_census = pd.read_csv("../datasets/adult-census-numeric-all.csv")
28 | data, target = adult_census.drop(columns="class"), adult_census["class"]
29 |
30 | # %% [markdown]
31 | # First, define a `ShuffleSplit` cross-validation strategy taking half of the
32 | # samples as a testing set at each round. Let us use 10 cross-validation rounds.
33 |
34 | # %%
35 | # Write your code here.
36 |
37 | # %% [markdown]
38 | # Next, create a machine learning pipeline composed of a transformer to
39 | # standardize the data followed by a logistic regression classifier.
40 |
41 | # %%
42 | # Write your code here.
43 |
44 | # %% [markdown]
45 | # Compute the cross-validation (test) scores for the classifier on this dataset.
46 | # Store the results in a pandas Series as we did in the previous notebook.
47 |
48 | # %%
49 | # Write your code here.
50 |
51 | # %% [markdown]
52 | # Now, compute the cross-validation scores of a dummy classifier that constantly
53 | # predicts the most frequent class observed in the training set. Please refer to
54 | # the online documentation for the
55 | # [sklearn.dummy.DummyClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html)
56 | # class.
57 | #
58 | # Store the results in a second pandas Series.
59 |
60 | # %%
61 | # Write your code here.
62 |
63 | # %% [markdown]
64 | # Now that we collected the results from the baseline and the model, concatenate
65 | # the test scores as columns of a single pandas dataframe.
66 |
67 | # %%
68 | # Write your code here.
69 |
70 | # %% [markdown]
71 | #
72 | # Next, plot the histogram of the cross-validation test scores for both models
73 | # with the help of [pandas built-in plotting
74 | # function](https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html#histograms).
75 | #
76 | # What conclusions do you draw from the results?
77 |
78 | # %%
79 | # Write your code here.
80 |
81 | # %% [markdown]
82 | # Change the `strategy` of the dummy classifier to `"stratified"` and compute
83 | # the results. Similarly, compute the scores for `strategy="uniform"` and then
84 | # plot the distributions together with the other results.
85 | #
86 | # Are those new baselines better than the previous one? Why is this the case?
87 | #
88 | # Please refer to the scikit-learn documentation on
89 | # [sklearn.dummy.DummyClassifier](
90 | # https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html)
91 | # to find out about the meaning of the `"stratified"` and `"uniform"`
92 | # strategies.
93 |
94 | # %%
95 | # Write your code here.
96 |
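97 | # %% [markdown]
98 | # Below is a minimal sketch of the main steps above (one possible solution;
99 | # variable names are arbitrary): the cross-validation strategy, the pipeline,
100 | # a "most frequent" baseline, and the comparison of their test scores.
101 |
102 | # %%
103 | from sklearn.model_selection import ShuffleSplit, cross_validate
104 | from sklearn.pipeline import make_pipeline
105 | from sklearn.preprocessing import StandardScaler
106 | from sklearn.linear_model import LogisticRegression
107 | from sklearn.dummy import DummyClassifier
108 |
109 | cv = ShuffleSplit(n_splits=10, test_size=0.5, random_state=0)
110 | model = make_pipeline(StandardScaler(), LogisticRegression())
111 | model_scores = pd.Series(
112 |     cross_validate(model, data, target, cv=cv)["test_score"], name="model"
113 | )
114 | dummy = DummyClassifier(strategy="most_frequent")
115 | dummy_scores = pd.Series(
116 |     cross_validate(dummy, data, target, cv=cv)["test_score"],
117 |     name="most frequent baseline",
118 | )
119 | all_scores = pd.concat([model_scores, dummy_scores], axis=1)
120 | _ = all_scores.plot.hist(bins=30, alpha=0.7)
121 |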
--------------------------------------------------------------------------------
/python_scripts/ensemble_ex_01.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M6.01
16 | #
17 | # The aim of this notebook is to investigate if we can tune the hyperparameters
18 | # of a bagging regressor and evaluate the gain obtained.
19 | #
20 | # We will load the California housing dataset and split it into a training and a
21 | # testing set.
22 |
23 | # %%
24 | from sklearn.datasets import fetch_california_housing
25 | from sklearn.model_selection import train_test_split
26 |
27 | data, target = fetch_california_housing(as_frame=True, return_X_y=True)
28 | target *= 100 # rescale the target in k$
29 | data_train, data_test, target_train, target_test = train_test_split(
30 | data, target, random_state=0, test_size=0.5
31 | )
32 |
33 | # %% [markdown]
34 | # ```{note}
35 | # If you want a deeper overview regarding this dataset, you can refer to the
36 | # Appendix - Datasets description section at the end of this MOOC.
37 | # ```
38 |
39 | # %% [markdown]
40 | # Create a `BaggingRegressor` and provide a `DecisionTreeRegressor` to its
41 | # parameter `estimator`. Train the regressor and evaluate its generalization
42 | # performance on the testing set using the mean absolute error.
43 |
44 | # %%
45 | # Write your code here.
46 |
47 | # %% [markdown]
48 | # Now, create a `RandomizedSearchCV` instance using the previous model and tune
49 | # the important parameters of the bagging regressor. Find the best parameters
50 | # and check if you are able to find a set of parameters that improves on the
51 | # default regressor, still using the mean absolute error as a metric.
52 | #
53 | # ```{tip}
54 | # You can list the bagging regressor's parameters using the `get_params` method.
55 | # ```
56 |
57 | # %%
58 | # Write your code here.
59 |
--------------------------------------------------------------------------------
/python_scripts/ensemble_ex_02.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M6.02
16 | #
17 | # The aim of this exercise is to explore some attributes available in
18 | # scikit-learn's random forest.
19 | #
20 | # First, we load and split the penguins regression dataset.
21 |
22 | # %%
23 | import pandas as pd
24 | from sklearn.model_selection import train_test_split
25 |
26 | penguins = pd.read_csv("../datasets/penguins_regression.csv")
27 | feature_name = "Flipper Length (mm)"
28 | target_name = "Body Mass (g)"
29 | data, target = penguins[[feature_name]], penguins[target_name]
30 | data_train, data_test, target_train, target_test = train_test_split(
31 | data, target, random_state=0
32 | )
33 |
34 | # %% [markdown]
35 | # ```{note}
36 | # If you want a deeper overview regarding this dataset, you can refer to the
37 | # Appendix - Datasets description section at the end of this MOOC.
38 | # ```
39 |
40 | # %% [markdown]
41 | # Create a random forest containing three trees. Train the forest and check the
42 | # generalization performance on the testing set in terms of mean absolute error.
43 |
44 | # %%
45 | # Write your code here.
46 |
47 | # %% [markdown]
48 | # We now aim to plot the predictions from the individual trees in the forest.
49 | # For that purpose, you first have to create a new dataset containing evenly
50 | # spaced values for the flipper length over the interval between 170 mm and 230
51 | # mm.
52 |
53 | # %%
54 | # Write your code here.
55 |
56 | # %% [markdown]
57 | # The trees contained in the forest that you created can be accessed with the
58 | # attribute `estimators_`. Use them to predict the body mass corresponding to
59 | # the values in this newly created dataset. Similarly, find the predictions of
60 | # the random forest on this dataset.
61 |
62 | # %%
63 | # Write your code here.
64 |
65 | # %% [markdown]
66 | # Now make a plot that displays:
67 | # - the whole `data` using a scatter plot;
68 | # - the decision of each individual tree;
69 | # - the decision of the random forest.
70 |
71 | # %%
72 | # Write your code here.
73 |
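74 | # %% [markdown]
75 | # A minimal sketch of the full exercise (one possible solution):
76 |
77 | # %%
78 | import numpy as np
79 | import matplotlib.pyplot as plt
80 | from sklearn.ensemble import RandomForestRegressor
81 | from sklearn.metrics import mean_absolute_error
82 |
83 | forest = RandomForestRegressor(n_estimators=3, random_state=0)
84 | forest.fit(data_train, target_train)
85 | print(
86 |     "Mean absolute error: "
87 |     f"{mean_absolute_error(target_test, forest.predict(data_test)):.2f} g"
88 | )
89 |
90 | # %%
91 | # Evenly spaced flipper lengths between 170 mm and 230 mm.
92 | data_range = pd.DataFrame(
93 |     np.linspace(170, 230, num=100), columns=[feature_name]
94 | )
95 | plt.scatter(data, target, color="black", alpha=0.5)
96 | for tree_idx, tree in enumerate(forest.estimators_):
97 |     # The inner trees were fitted on a NumPy array, hence `.to_numpy()`.
98 |     tree_predictions = tree.predict(data_range.to_numpy())
99 |     plt.plot(data_range, tree_predictions, label=f"Tree #{tree_idx}")
100 | plt.plot(data_range, forest.predict(data_range), label="Random forest")
101 | _ = plt.legend()
102 |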
--------------------------------------------------------------------------------
/python_scripts/ensemble_ex_03.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M6.03
16 | #
17 | # The aim of this exercise is to:
18 | #
19 | # * verify if a random forest or a gradient-boosting decision tree overfits
20 | #   when the number of estimators is not properly chosen;
21 | # * use the early-stopping strategy to avoid adding unnecessary trees and get
22 | #   the best generalization performance.
23 | #
24 | # We use the California housing dataset to conduct our experiments.
25 |
26 | # %%
27 | from sklearn.datasets import fetch_california_housing
28 | from sklearn.model_selection import train_test_split
29 |
30 | data, target = fetch_california_housing(return_X_y=True, as_frame=True)
31 | target *= 100 # rescale the target in k$
32 | data_train, data_test, target_train, target_test = train_test_split(
33 | data, target, random_state=0, test_size=0.5
34 | )
35 |
36 | # %% [markdown]
37 | # ```{note}
38 | # If you want a deeper overview regarding this dataset, you can refer to the
39 | # Appendix - Datasets description section at the end of this MOOC.
40 | # ```
41 |
42 | # %% [markdown]
43 | # Create a gradient boosting decision tree with `max_depth=5` and
44 | # `learning_rate=0.5`.
45 |
46 | # %%
47 | # Write your code here.
48 |
49 | # %% [markdown]
50 | #
51 | # Also create a random forest with fully grown trees by setting `max_depth=None`.
52 |
53 | # %%
54 | # Write your code here.
55 |
56 | # %% [markdown]
57 | #
58 | # For both the gradient-boosting and random forest models, create a validation
59 | # curve using the training set to assess the impact of the number of trees on
60 | # the performance of each model. Evaluate the list of parameters `param_range =
61 | # np.array([1, 2, 5, 10, 20, 50, 100, 200])` and score it using
62 | # `neg_mean_absolute_error`. Remember to set `negate_score=True` to recover the
63 | # right sign of the Mean Absolute Error.
64 |
65 | # %%
66 | # Write your code here.
67 |
68 | # %% [markdown]
69 | # Random forest models improve when increasing the number of trees in the
70 | # ensemble. However, the scores reach a plateau where adding new trees just
71 | # makes fitting and scoring slower.
72 | #
73 | # Now repeat the analysis for the gradient boosting model.
74 |
75 | # %%
76 | # Write your code here.
77 |
78 |
79 | # %% [markdown]
80 | # Gradient boosting models overfit when the number of trees is too large.
81 | # Unlike random forests, gradient boosting offers an early-stopping option to
82 | # avoid adding unnecessary trees. Internally, the algorithm uses an
83 | # out-of-sample set to compute the generalization performance of the model at
84 | # each addition of a tree. Thus, if the generalization performance is not
85 | # improving for several iterations, it stops adding trees.
86 | #
87 | # Now, create a gradient-boosting model with `n_estimators=1_000`. This number
88 | # of trees is certainly too large as we have seen above. Change the parameter
89 | # `n_iter_no_change` such that the gradient boosting fitting stops after adding
90 | # 5 trees that do not improve the overall generalization performance.
91 |
92 | # %%
93 | # Write your code here.
94 |
95 | # %% [markdown]
96 | # Estimate the generalization performance of this model again using the
97 | # `sklearn.metrics.mean_absolute_error` metric but this time using the test set
98 | # that we held out at the beginning of the notebook. Compare the resulting value
99 | # with the values observed in the validation curve.
100 |
101 | # %%
102 | # Write your code here.
103 |
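104 | # %% [markdown]
105 | # A hedged sketch of the early-stopping step (one possible solution): fitting
106 | # stops once the internal validation score has not improved for 5 consecutive
107 | # iterations, and the held-out test set gives the final error estimate.
108 |
109 | # %%
110 | from sklearn.ensemble import GradientBoostingRegressor
111 | from sklearn.metrics import mean_absolute_error
112 |
113 | gbdt = GradientBoostingRegressor(
114 |     max_depth=5, learning_rate=0.5, n_estimators=1_000, n_iter_no_change=5
115 | )
116 | gbdt.fit(data_train, target_train)
117 | print(f"Number of trees actually used: {gbdt.n_estimators_}")
118 | print(
119 |     "Test mean absolute error: "
120 |     f"{mean_absolute_error(target_test, gbdt.predict(data_test)):.2f} k$"
121 | )
122 |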
--------------------------------------------------------------------------------
/python_scripts/ensemble_ex_04.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M6.04
16 | #
17 | # The aim of the exercise is to get familiar with the histogram
18 | # gradient-boosting in scikit-learn. Besides, we will use this model within a
19 | # cross-validation framework in order to inspect internal parameters found via
20 | # grid-search.
21 | #
22 | # We will use the California housing dataset.
23 |
24 | # %%
25 | from sklearn.datasets import fetch_california_housing
26 |
27 | data, target = fetch_california_housing(return_X_y=True, as_frame=True)
28 | target *= 100 # rescale the target in k$
29 |
30 | # %% [markdown]
31 | # First, create a histogram gradient boosting regressor. You can set the number
32 | # of trees to be large, and configure the model to use early-stopping.
33 |
34 | # %%
35 | # Write your code here.
36 |
37 | # %% [markdown]
38 | # We will use a grid-search to find some optimal parameters for this model. In
39 | # this grid-search, you should search for the following parameters:
40 | #
41 | # * `max_depth: [3, 8]`;
42 | # * `max_leaf_nodes: [15, 31]`;
43 | # * `learning_rate: [0.1, 1]`.
44 | #
45 | # Feel free to explore the space with additional values. Create the grid-search
46 | # providing the previous gradient boosting instance as the model.
47 |
48 | # %%
49 | # Write your code here.
50 |
51 | # %% [markdown]
52 | # Finally, we will run our experiment through cross-validation. In this regard,
53 | # define a 5-fold cross-validation. Besides, be sure to shuffle the data.
54 | # Subsequently, use the function `sklearn.model_selection.cross_validate` to run
55 | # the cross-validation. You should also set `return_estimator=True`, so that we
56 | # can investigate the inner model trained via cross-validation.
57 |
58 | # %%
59 | # Write your code here.
60 |
61 | # %% [markdown]
62 | # Now that we got the cross-validation results, print out the mean and standard
63 | # deviation score.
64 |
65 | # %%
66 | # Write your code here.
67 |
68 | # %% [markdown]
69 | # Then inspect the `estimator` entry of the results and check the best
70 | # parameters values. Besides, check the number of trees used by the model.
71 |
72 | # %%
73 | # Write your code here.
74 |
75 | # %% [markdown]
76 | # Inspect the results of the inner CV for each estimator of the outer CV.
77 | # Aggregate the mean test score for each parameter combination and make a box
78 | # plot of these scores.
79 |
80 | # %%
81 | # Write your code here.
82 |
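83 | # %% [markdown]
84 | # A minimal sketch of the model, grid-search and cross-validation steps above
85 | # (one possible solution):
86 |
87 | # %%
88 | from sklearn.ensemble import HistGradientBoostingRegressor
89 | from sklearn.model_selection import GridSearchCV, KFold, cross_validate
90 |
91 | hist_gbdt = HistGradientBoostingRegressor(
92 |     max_iter=1_000, early_stopping=True, random_state=0
93 | )
94 | params = {
95 |     "max_depth": [3, 8],
96 |     "max_leaf_nodes": [15, 31],
97 |     "learning_rate": [0.1, 1],
98 | }
99 | search = GridSearchCV(hist_gbdt, params)
100 | cv = KFold(n_splits=5, shuffle=True, random_state=0)
101 | results = cross_validate(search, data, target, cv=cv, return_estimator=True)
102 | scores = results["test_score"]
103 | print(f"R2 score: {scores.mean():.3f} +/- {scores.std():.3f}")
104 |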
--------------------------------------------------------------------------------
/python_scripts/ensemble_sol_01.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # kernelspec:
4 | # display_name: Python 3
5 | # name: python3
6 | # ---
7 |
8 | # %% [markdown]
9 | # # 📃 Solution for Exercise M6.01
10 | #
11 | # The aim of this notebook is to investigate if we can tune the hyperparameters
12 | # of a bagging regressor and evaluate the gain obtained.
13 | #
14 | # We will load the California housing dataset and split it into a training and a
15 | # testing set.
16 |
17 | # %%
18 | from sklearn.datasets import fetch_california_housing
19 | from sklearn.model_selection import train_test_split
20 |
21 | data, target = fetch_california_housing(as_frame=True, return_X_y=True)
22 | target *= 100 # rescale the target in k$
23 | data_train, data_test, target_train, target_test = train_test_split(
24 | data, target, random_state=0, test_size=0.5
25 | )
26 |
27 | # %% [markdown]
28 | # ```{note}
29 | # If you want a deeper overview regarding this dataset, you can refer to the
30 | # Appendix - Datasets description section at the end of this MOOC.
31 | # ```
32 |
33 | # %% [markdown]
34 | # Create a `BaggingRegressor` and provide a `DecisionTreeRegressor` to its
35 | # parameter `estimator`. Train the regressor and evaluate its generalization
36 | # performance on the testing set using the mean absolute error.
37 |
38 | # %%
39 | # solution
40 | from sklearn.metrics import mean_absolute_error
41 | from sklearn.tree import DecisionTreeRegressor
42 | from sklearn.ensemble import BaggingRegressor
43 |
44 | tree = DecisionTreeRegressor()
45 | bagging = BaggingRegressor(estimator=tree, n_jobs=2)
46 | bagging.fit(data_train, target_train)
47 | target_predicted = bagging.predict(data_test)
48 | print(
49 | "Basic mean absolute error of the bagging regressor:\n"
50 | f"{mean_absolute_error(target_test, target_predicted):.2f} k$"
51 | )
52 |
53 | # %% [markdown]
54 | # Now, create a `RandomizedSearchCV` instance using the previous model and tune
55 | # the important parameters of the bagging regressor. Find the best parameters
56 | # and check if you are able to find a set of parameters that improves on the
57 | # default regressor, still using the mean absolute error as a metric.
58 |
59 | # ```{tip}
60 | # You can list the bagging regressor's parameters using the `get_params` method.
61 | # ```
62 |
63 | # %%
64 | # solution
65 | for param in bagging.get_params().keys():
66 | print(param)
67 |
68 | # %% tags=["solution"]
69 | from scipy.stats import randint
70 | from sklearn.model_selection import RandomizedSearchCV
71 |
72 | param_grid = {
73 | "n_estimators": randint(10, 30),
74 | "max_samples": [0.5, 0.8, 1.0],
75 | "max_features": [0.5, 0.8, 1.0],
76 | "estimator__max_depth": randint(3, 10),
77 | }
78 | search = RandomizedSearchCV(
79 | bagging, param_grid, n_iter=20, scoring="neg_mean_absolute_error"
80 | )
81 | _ = search.fit(data_train, target_train)
82 |
83 | # %% tags=["solution"]
84 | import pandas as pd
85 |
86 | columns = [f"param_{name}" for name in param_grid.keys()]
87 | columns += ["mean_test_error", "std_test_error"]
88 | cv_results = pd.DataFrame(search.cv_results_)
89 | cv_results["mean_test_error"] = -cv_results["mean_test_score"]
90 | cv_results["std_test_error"] = cv_results["std_test_score"]
91 | cv_results[columns].sort_values(by="mean_test_error")
92 |
93 | # %% tags=["solution"]
94 | target_predicted = search.predict(data_test)
95 | print(
96 | "Mean absolute error after tuning of the bagging regressor:\n"
97 | f"{mean_absolute_error(target_test, target_predicted):.2f} k$"
98 | )
99 |
100 | # %% [markdown] tags=["solution"]
101 | # We see that the predictor provided by the bagging regressor does not need much
102 | # hyperparameter tuning compared to a single decision tree.
103 |
--------------------------------------------------------------------------------
/python_scripts/feature_selection_ex_01.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise 01
16 | #
17 | # The aim of this exercise is to highlight caveats to have in mind when using
18 | # feature selection. You have to be extremely careful regarding the set of data
19 | # on which you will compute the statistic that helps your feature selection
20 | # algorithm to decide which feature to select.
21 | #
22 | # On purpose, we will make you program the wrong way of doing feature selection
23 | # to gain insights.
24 | #
25 | # First, you will create a completely random dataset using NumPy. Using the
26 | # function `np.random.randn`, generate a matrix `data` containing 100 samples
27 | # and 100,000 features. Then, using the function `np.random.randint`, generate a
28 | # vector `target` with 100 samples containing either 0 or 1.
29 | #
30 | # This type of dimensionality is typical in bioinformatics when dealing with
31 | # RNA-seq. However, we will use completely randomized features such that we
32 | # don't have a link between the data and the target. Thus, no machine-learning
33 | # model should be able to perform better than chance level in terms of
34 | # generalization performance.
35 |
36 | # %%
37 | import numpy as np
38 |
39 | # Write your code here.
40 |
41 | # %% [markdown]
42 | # Now, create a logistic regression model and use cross-validation to check the
43 | # score of such a model. It will allow us to confirm that our model cannot
44 | # predict anything meaningful from random data.
45 |
46 | # %%
47 | # Write your code here.
48 |
49 | # %% [markdown]
50 | # Now, we will ask you to program the **wrong** pattern to select features.
51 | # Select the features by using the entire dataset. We will choose ten features
52 | # with the highest ANOVA F-score computed on the full dataset. Subsequently,
53 | # subsample the dataset `data` by selecting the features' subset. Finally, train
54 | # and test a logistic regression model.
55 | #
56 | # You should get some surprising results.
57 |
58 | # %%
59 | from sklearn.feature_selection import SelectKBest, f_classif
60 |
61 | # Write your code here.
62 |
63 | # %% [markdown]
64 | # Now, we will make you program the **right** way to do the feature selection.
65 | # First, split the dataset into a training and testing set. Then, fit the
66 | # feature selector on the training set. Then, transform both the training and
67 | # testing sets before you train and test the logistic regression.
68 |
69 | # %%
70 | from sklearn.model_selection import train_test_split
71 |
72 | # Write your code here.
73 |
74 | # %% [markdown]
75 | # However, the previous case is not perfect. For instance, if we wanted to
76 | # perform cross-validation, the manual `fit`/`transform` of the datasets would
77 | # make our life hard. Indeed, the solution here is to use a scikit-learn
78 | # pipeline in which the feature selection is a preprocessing stage applied
79 | # before training the model.
80 | #
81 | # Thus, start by creating a pipeline with the feature selector and the logistic
82 | # regression. Then, use cross-validation to get an estimate of the uncertainty
83 | # of your model generalization performance.
84 |
85 | # %%
86 | from sklearn.pipeline import make_pipeline
87 |
88 | # Write your code here.
89 |
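90 | # %% [markdown]
91 | # A hedged sketch of the pipeline approach (one possible solution). It assumes
92 | # `data` and `target` were generated in the first step of this exercise; with
93 | # the selection refitted inside each cross-validation split, the scores should
94 | # stay close to the 0.5 chance level.
95 |
96 | # %%
97 | from sklearn.linear_model import LogisticRegression
98 | from sklearn.model_selection import cross_val_score
99 |
100 | model = make_pipeline(
101 |     SelectKBest(score_func=f_classif, k=10), LogisticRegression()
102 | )
103 | scores = cross_val_score(model, data, target)
104 | print(f"Mean accuracy: {scores.mean():.3f}")
105 |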
--------------------------------------------------------------------------------
/python_scripts/feature_selection_limitation_model.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # kernelspec:
4 | # display_name: Python 3
5 | # name: python3
6 | # ---
7 |
8 | # %% [markdown]
9 | # # Limitation of selecting feature using a model
10 | #
11 | # In this notebook, we want to show a limitation when using a machine-learning
12 | # model to make a selection.
13 | #
14 | # Indeed, one can inspect a model and find relative feature importances. For
15 | # instance, the parameters `coef_` for the linear models or
16 | # `feature_importances_` for the tree-based models carry such information.
17 | # Therefore, this method works as long as the relative feature importances
18 | # given by the model are sufficient to select the meaningful features.
19 | #
20 | # Here, we will generate a dataset that contains a large number of random
21 | # features.
22 |
23 | # %%
24 | from sklearn.datasets import make_classification
25 |
26 | data, target = make_classification(
27 | n_samples=5000,
28 | n_features=100,
29 | n_informative=2,
30 | n_redundant=5,
31 | n_repeated=5,
32 | class_sep=0.3,
33 | random_state=0,
34 | )
35 |
36 | # %% [markdown]
37 | # First, let's build a model which does not make any feature selection.
38 |
39 | # %%
40 | from sklearn.ensemble import RandomForestClassifier
41 |
42 | model_without_selection = RandomForestClassifier()
43 |
44 | # %% [markdown]
45 | # We will evaluate this model using k-fold cross-validation and store the results
46 | # in a pandas dataframe.
47 |
48 | # %%
49 | import pandas as pd
50 | from sklearn.model_selection import cross_validate
51 |
52 | cv_results_without_selection = cross_validate(
53 | model_without_selection, data, target, cv=5
54 | )
55 | cv_results_without_selection = pd.DataFrame(cv_results_without_selection)
56 |
57 | # %% [markdown]
58 | # Then, we will build another model which will include a feature selection step
59 | # based on a random forest and evaluate it as well with cross-validation.
60 |
61 | # %%
62 | from sklearn.pipeline import make_pipeline
63 | from sklearn.feature_selection import SelectFromModel
64 |
65 | feature_selector = SelectFromModel(RandomForestClassifier())
66 | model_with_selection = make_pipeline(
67 | feature_selector, RandomForestClassifier()
68 | )
69 |
70 | # %%
71 | cv_results_with_selection = cross_validate(
72 | model_with_selection, data, target, cv=5
73 | )
74 | cv_results_with_selection = pd.DataFrame(cv_results_with_selection)
75 |
76 | # %% [markdown]
77 | # We can compare the testing score of the two models. For this purpose, we
78 | # combine the results in a single dataframe.
79 |
80 | # %%
81 | cv_results = pd.concat(
82 | [cv_results_without_selection, cv_results_with_selection],
83 | axis=1,
84 | keys=["Without feature selection", "With feature selection"],
85 | ).swaplevel(axis="columns")
86 |
87 | # %% [markdown]
88 | # Finally, we can check the testing score of each model.
89 |
90 | # %%
91 | import matplotlib.pyplot as plt
92 |
93 | color = {"whiskers": "black", "medians": "black", "caps": "black"}
94 | cv_results["test_score"].plot.box(color=color, vert=False)
95 | plt.xlabel("Accuracy")
96 | _ = plt.title("Limitation of using a random forest for feature selection")
97 |
98 | # %% [markdown]
99 | # The model that selected a subset of features performs worse than a random
100 | # forest fitted on the full dataset.
101 | #
102 | # We can rely on some aspects tackled in the notebook presenting the model
103 | # inspection to explain this behaviour. The decision tree's relative feature
104 | # importance will overestimate the importance of random features when the
105 | # decision tree overfits the training set.
106 | #
107 | # Therefore, it is good to keep in mind that feature selection relies on
108 | # procedures making assumptions which may not always hold.
109 |
--------------------------------------------------------------------------------
/python_scripts/linear_models_ex_01.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M4.01
16 | #
17 | # The aim of this exercise is two-fold:
18 | #
19 | # * understand the parametrization of a linear model;
20 | # * quantify the fitting accuracy of a set of such models.
21 | #
22 | # We will reuse part of the code of the course to:
23 | #
24 | # * load data;
25 | # * create the function representing a linear model.
26 | #
27 | # ## Prerequisites
28 | #
29 | # ### Data loading
30 |
31 | # %% [markdown]
32 | # ```{note}
33 | # If you want a deeper overview regarding this dataset, you can refer to the
34 | # Appendix - Datasets description section at the end of this MOOC.
35 | # ```
36 |
37 | # %%
38 | import pandas as pd
39 |
40 | penguins = pd.read_csv("../datasets/penguins_regression.csv")
41 | feature_name = "Flipper Length (mm)"
42 | target_name = "Body Mass (g)"
43 | data, target = penguins[[feature_name]], penguins[target_name]
44 |
45 | # %% [markdown]
46 | # ### Model definition
47 |
48 |
49 | # %%
50 | def linear_model_flipper_mass(
51 | flipper_length, weight_flipper_length, intercept_body_mass
52 | ):
53 | """Linear model of the form y = a * x + b"""
54 | body_mass = weight_flipper_length * flipper_length + intercept_body_mass
55 | return body_mass
56 |
57 |
58 | # %% [markdown]
59 | # ## Main exercise
60 | #
61 | # Define a vector `weights = [...]` and a vector `intercepts = [...]` of the
62 | # same length. Each pair of entries `(weights[i], intercepts[i])` defines a
63 | # different model. Use these vectors along with the vector
64 | # `flipper_length_range` to plot several linear models that could possibly fit
65 | # our data. Use the above helper function to visualize both the models and the
66 | # real samples.
67 |
68 | # %%
69 | import numpy as np
70 |
71 | flipper_length_range = np.linspace(data.min(), data.max(), num=300)
72 |
73 | # %%
74 | # Write your code here.
75 |
76 | # %% [markdown]
77 | # In the previous question, you were asked to create several linear models. The
78 | # visualization allowed you to qualitatively assess if a model was better than
79 | # another.
80 | #
81 | # Now, you should come up with a quantitative measure which indicates the
82 | # goodness of fit of each linear model and allows you to select the best model.
83 | # Define a function `goodness_fit_measure(true_values, predictions)` that takes
84 | # as inputs the true target values and the predictions and returns a single
85 | # scalar as output.
86 |
87 |
88 | # %%
89 | # Write your code here.
90 |
91 | # %% [markdown]
92 | # You can now copy and paste the code below to show the goodness of fit for each
93 | # model.
94 | #
95 | # ```python
96 | # for model_idx, (weight, intercept) in enumerate(zip(weights, intercepts)):
97 | # target_predicted = linear_model_flipper_mass(data, weight, intercept)
98 | # print(f"Model #{model_idx}:")
99 | # print(f"{weight:.2f} (g / mm) * flipper length + {intercept:.2f} (g)")
100 | # print(f"Error: {goodness_fit_measure(target, target_predicted):.3f}\n")
101 | # ```
102 |
103 | # %%
104 | # Write your code here.
105 |
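106 | # %% [markdown]
107 | # A minimal sketch (one possible solution; the weight and intercept values are
108 | # arbitrary) using the mean squared error as goodness-of-fit measure:
109 |
110 | # %%
111 | import matplotlib.pyplot as plt
112 |
113 | weights = [-40, 45, 90]
114 | intercepts = [13000, -5000, -14000]
115 |
116 | ax = plt.axes()
117 | ax.scatter(data, target, color="black", alpha=0.5)
118 | for weight, intercept in zip(weights, intercepts):
119 |     predictions = linear_model_flipper_mass(
120 |         flipper_length_range, weight, intercept
121 |     )
122 |     ax.plot(flipper_length_range, predictions)
123 | ax.set_xlabel(feature_name)
124 | _ = ax.set_ylabel(target_name)
125 |
126 |
127 | # %%
128 | def goodness_fit_measure(true_values, predictions):
129 |     """Return the mean squared error (lower is better)."""
130 |     errors = np.ravel(true_values) - np.ravel(predictions)
131 |     return np.mean(errors**2)
132 |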
--------------------------------------------------------------------------------
/python_scripts/matplotlibrc:
--------------------------------------------------------------------------------
1 | axes.labelsize: 18.0
2 | axes.linewidth: 1.875
3 | axes.titlesize: 18.0
4 | boxplot.whiskers: 1000
5 | boxplot.patchartist: True
6 | boxplot.boxprops.color: black
7 | boxplot.capprops.color: black
8 | boxplot.medianprops.color: black
9 | boxplot.whiskerprops.color: black
10 | boxplot.boxprops.linewidth: 3.0
11 | boxplot.capprops.linewidth: 3.0
12 | boxplot.medianprops.linewidth: 2.5
13 | boxplot.whiskerprops.linewidth: 3.0
14 | figure.titlesize: 22.0
15 | font.size: 18.0
16 | grid.linewidth: 1.5
17 | legend.fontsize: 16.5
18 | legend.title_fontsize: 18.0
19 | lines.linewidth: 3.5
20 | lines.markersize: 9.0
21 | patch.linewidth: 1.5
22 | xtick.labelsize: 16.5
23 | xtick.major.size: 9.0
24 | xtick.major.width: 1.875
25 | xtick.minor.size: 6.0
26 | xtick.minor.width: 1.5
27 | ytick.labelsize: 16.5
28 | ytick.major.size: 9.0
29 | ytick.major.width: 1.875
30 | ytick.minor.size: 6.0
31 | ytick.minor.width: 1.5
32 |
--------------------------------------------------------------------------------
/python_scripts/metrics_ex_02.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M7.03
16 | #
17 | # As with the classification metrics exercise, we will evaluate the regression
18 | # metrics within a cross-validation framework to get familiar with the syntax.
19 | #
20 | # We will use the Ames house prices dataset.
21 |
22 | # %%
23 | import pandas as pd
24 | import numpy as np
25 |
26 | ames_housing = pd.read_csv("../datasets/house_prices.csv")
27 | data = ames_housing.drop(columns="SalePrice")
28 | target = ames_housing["SalePrice"]
29 | data = data.select_dtypes(np.number)
30 | target /= 1000
31 |
32 | # %% [markdown]
33 | # ```{note}
34 | # If you want a deeper overview regarding this dataset, you can refer to the
35 | # Appendix - Datasets description section at the end of this MOOC.
36 | # ```
37 |
38 |
39 | # %% [markdown]
40 | # The first step will be to create a linear regression model.
41 |
42 | # %%
43 | # Write your code here.
44 |
45 | # %% [markdown]
46 | # Then, use the `cross_val_score` to estimate the generalization performance of
47 | # the model. Use a `KFold` cross-validation with 10 folds. Make the use of the
48 | # $R^2$ score explicit by assigning the parameter `scoring` (even though it is
49 | # the default score).
50 |
51 | # %%
52 | # Write your code here.
53 |
54 | # %% [markdown]
55 | # Then, instead of using the $R^2$ score, use the mean absolute error (MAE). You
56 | # may need to refer to the documentation for the `scoring` parameter.
57 |
58 | # %%
59 | # Write your code here.
60 |
61 | # %% [markdown]
62 | # Finally, use the `cross_validate` function and compute multiple scores/errors
63 | # at once by passing a list of scorers to the `scoring` parameter. You can
64 | # compute the $R^2$ score and the mean absolute error for instance.
65 |
66 | # %%
67 | # Write your code here.
68 |
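69 | # %% [markdown]
70 | # A hedged sketch of the three evaluations above (one possible solution):
71 |
72 | # %%
73 | from sklearn.linear_model import LinearRegression
74 | from sklearn.model_selection import KFold, cross_val_score, cross_validate
75 |
76 | model = LinearRegression()
77 | cv = KFold(n_splits=10)
78 | r2_scores = cross_val_score(model, data, target, cv=cv, scoring="r2")
79 | mae_scores = -cross_val_score(
80 |     model, data, target, cv=cv, scoring="neg_mean_absolute_error"
81 | )
82 | multi_scores = cross_validate(
83 |     model, data, target, cv=cv, scoring=["r2", "neg_mean_absolute_error"]
84 | )
85 | print(f"R2: {r2_scores.mean():.3f}, MAE: {mae_scores.mean():.3f} k$")
86 |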
--------------------------------------------------------------------------------
/python_scripts/parameter_tuning_ex_02.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M3.01
16 | #
17 | # The goal is to write an exhaustive search to find the best parameters
18 | # combination maximizing the model generalization performance.
19 | #
20 | # Here we use a small subset of the Adult Census dataset to make the code faster
21 | # to execute. Once your code works on the small subset, try to change
22 | # `train_size` to a larger value (e.g. 0.8 for 80% instead of 20%).
23 |
24 | # %%
25 | import pandas as pd
26 |
27 | from sklearn.model_selection import train_test_split
28 |
29 | adult_census = pd.read_csv("../datasets/adult-census.csv")
30 |
31 | target_name = "class"
32 | target = adult_census[target_name]
33 | data = adult_census.drop(columns=[target_name, "education-num"])
34 |
35 | data_train, data_test, target_train, target_test = train_test_split(
36 | data, target, train_size=0.2, random_state=42
37 | )
38 |
39 | # %%
40 | from sklearn.compose import make_column_transformer
41 | from sklearn.compose import make_column_selector as selector
42 | from sklearn.preprocessing import OrdinalEncoder
43 |
44 | categorical_preprocessor = OrdinalEncoder(
45 | handle_unknown="use_encoded_value", unknown_value=-1
46 | )
47 | preprocessor = make_column_transformer(
48 | (categorical_preprocessor, selector(dtype_include=object)),
49 | remainder="passthrough",
50 | )
51 |
52 | from sklearn.ensemble import HistGradientBoostingClassifier
53 | from sklearn.pipeline import Pipeline
54 |
55 | model = Pipeline(
56 | [
57 | ("preprocessor", preprocessor),
58 | ("classifier", HistGradientBoostingClassifier(random_state=42)),
59 | ]
60 | )
61 |
62 | # %% [markdown]
63 | # Using the previously defined model (called `model`) and two nested `for`
64 | # loops, search for the best combination of the `learning_rate` and
65 | # `max_leaf_nodes` parameters. In this regard, you need to train and test the
66 | # model by setting the parameters. The evaluation of the model should be
67 | # performed using `cross_val_score` on the training set. Use the following
68 | # parameters search:
69 | # - `learning_rate` for the values 0.01, 0.1, 1 and 10. This parameter controls
70 | # the ability of a new tree to correct the error of the previous sequence of
71 | # trees
72 | # - `max_leaf_nodes` for the values 3, 10, 30. This parameter controls the depth
73 | # of each tree.
74 |
75 | # %%
76 | # Write your code here.
77 |
78 | # %% [markdown]
79 | # Now use the test set to score the model using the best parameters that we
80 | # found using cross-validation. You will have to refit the model over the full
81 | # training set.
82 |
83 | # %%
84 | # Write your code here.
85 |
86 | # %%
87 |
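88 | # %% [markdown]
89 | # A minimal sketch of the nested loops (one possible solution): keep the best
90 | # cross-validated parameters, refit on the full training set, then score on
91 | # the test set.
92 |
93 | # %%
94 | from sklearn.model_selection import cross_val_score
95 |
96 | best_score, best_params = -float("inf"), None
97 | for learning_rate in [0.01, 0.1, 1, 10]:
98 |     for max_leaf_nodes in [3, 10, 30]:
99 |         model.set_params(
100 |             classifier__learning_rate=learning_rate,
101 |             classifier__max_leaf_nodes=max_leaf_nodes,
102 |         )
103 |         score = cross_val_score(model, data_train, target_train, cv=2).mean()
104 |         if score > best_score:
105 |             best_score = score
106 |             best_params = {
107 |                 "classifier__learning_rate": learning_rate,
108 |                 "classifier__max_leaf_nodes": max_leaf_nodes,
109 |             }
110 | print(f"Best mean CV accuracy: {best_score:.3f} with {best_params}")
111 |
112 | model.set_params(**best_params).fit(data_train, target_train)
113 | print(f"Test accuracy: {model.score(data_test, target_test):.3f}")
114 |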
--------------------------------------------------------------------------------
/python_scripts/parameter_tuning_ex_03.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M3.02
16 | #
17 | # The goal is to find the best set of hyperparameters which maximize the
18 | # generalization performance on a training set.
19 |
20 | # %%
21 | from sklearn.datasets import fetch_california_housing
22 | from sklearn.model_selection import train_test_split
23 |
24 | data, target = fetch_california_housing(return_X_y=True, as_frame=True)
25 | target *= 100 # rescale the target in k$
26 |
27 | data_train, data_test, target_train, target_test = train_test_split(
28 | data, target, random_state=42
29 | )
30 |
31 | # %% [markdown]
32 | # In this exercise, we progressively define the regression pipeline and later
33 | # tune its hyperparameters.
34 | #
35 | # Start by defining a pipeline that:
36 | # * uses a `StandardScaler` to normalize the numerical data;
37 | # * uses a `sklearn.neighbors.KNeighborsRegressor` as a predictive model.
38 |
39 | # %%
40 | # Write your code here.
41 |
42 | # %% [markdown]
43 | # Use `RandomizedSearchCV` with `n_iter=20` and
44 | # `scoring="neg_mean_absolute_error"` to tune the following hyperparameters
45 | # of the `model`:
46 | #
47 | # - the parameter `n_neighbors` of the `KNeighborsRegressor` with values
48 | # `np.logspace(0, 3, num=10).astype(np.int32)`;
49 | # - the parameter `with_mean` of the `StandardScaler` with possible values
50 | # `True` or `False`;
51 | # - the parameter `with_std` of the `StandardScaler` with possible values `True`
52 | # or `False`.
53 | #
54 | # The `scoring` function is expected to return higher values for better models,
55 | # since grid/random search objects **maximize** it. Because of that, error
56 | # metrics like `mean_absolute_error` must be negated (using the `neg_` prefix)
57 | # to work correctly (remember lower errors represent better models).
58 | #
59 | # Notice that in the notebook "Hyperparameter tuning by randomized-search" we
60 | # pass distributions to be sampled by the `RandomizedSearchCV`. In this case we
61 | # define a fixed grid of hyperparameters to be explored. Using a `GridSearchCV`
62 | # instead would explore all the possible combinations on the grid, which can be
63 | # costly to compute for large grids, whereas the parameter `n_iter` of the
64 | # `RandomizedSearchCV` controls the number of different random combinations that
65 | # are evaluated. Notice that setting `n_iter` larger than the number of possible
66 | # combinations in a grid (in this case 10 x 2 x 2 = 40) would lead to repeating
67 | # already-explored combinations.
68 | #
69 | # Once the computation has completed, print the best combination of parameters
70 | # stored in the `best_params_` attribute.
71 |
72 | # %%
73 | # Write your code here.
74 |
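75 | # %% [markdown]
76 | # A hedged sketch of both steps (one possible solution): the hyperparameter
77 | # names follow the `<step name>__<parameter>` convention used by pipelines
78 | # built with `make_pipeline`.
79 |
80 | # %%
81 | import numpy as np
82 | from sklearn.pipeline import make_pipeline
83 | from sklearn.preprocessing import StandardScaler
84 | from sklearn.neighbors import KNeighborsRegressor
85 | from sklearn.model_selection import RandomizedSearchCV
86 |
87 | model = make_pipeline(StandardScaler(), KNeighborsRegressor())
88 | param_distributions = {
89 |     "kneighborsregressor__n_neighbors": np.logspace(0, 3, num=10).astype(
90 |         np.int32
91 |     ),
92 |     "standardscaler__with_mean": [True, False],
93 |     "standardscaler__with_std": [True, False],
94 | }
95 | search = RandomizedSearchCV(
96 |     model,
97 |     param_distributions=param_distributions,
98 |     n_iter=20,
99 |     scoring="neg_mean_absolute_error",
100 |     random_state=0,
101 | )
102 | search.fit(data_train, target_train)
103 | print(search.best_params_)
104 |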
--------------------------------------------------------------------------------
/python_scripts/trees_dataset.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # kernelspec:
4 | # display_name: Python 3
5 | # name: python3
6 | # ---
7 |
8 | # %% [markdown]
9 | # # The penguins datasets
10 | #
11 | # In this notebook, we make a quick presentation of the [Palmer penguins
12 | # dataset](https://allisonhorst.github.io/palmerpenguins/). We use this
13 | # dataset for both classification and regression problems by selecting a subset
14 | # of the features to make our explanations intuitive.
15 | #
16 | # ## Classification dataset
17 | #
18 | # We use this dataset in a classification setting to predict the penguins'
19 | # species from anatomical information.
20 | #
21 | # Each penguin is from one of the three following species: Adelie, Gentoo, and
22 | # Chinstrap. See the illustration below depicting the three different penguin
23 | # species:
24 | #
25 | # 
27 | #
28 | # This problem is a classification problem since the target is categorical. We
29 | # limit our input data to a subset of the original features to simplify our
30 | # explanations when presenting the decision tree algorithm. Indeed, we use
31 | # features based on penguins' culmen measurement. You can learn more about the
32 | # penguins' culmen with the illustration below:
33 | #
34 | # 
36 | #
37 | # We start by loading this subset of the dataset.
38 |
39 | # %%
40 | import pandas as pd
41 |
42 | penguins = pd.read_csv("../datasets/penguins_classification.csv")
43 |
44 | culmen_columns = ["Culmen Length (mm)", "Culmen Depth (mm)"]
45 | target_column = "Species"
46 |
47 | # %% [markdown]
48 | # Let's check the dataset in more detail.
49 |
50 | # %%
51 | penguins
52 |
53 | # %% [markdown]
54 | # Since we have few samples, we can check a scatter plot to observe the
55 | # distribution of the samples.
56 |
57 | # %%
58 | import seaborn as sns
59 |
60 | pairplot_figure = sns.pairplot(penguins, hue="Species")
61 | pairplot_figure.fig.set_size_inches(9, 6.5)
62 |
63 | # %% [markdown]
64 | # First let's check the feature distributions by looking at the diagonal plots
65 | # of the pairplot. We can deduce the following intuitions:
66 | #
67 | # * The Adelie species can be differentiated from the Gentoo and Chinstrap
68 | # species depending on the culmen length;
69 | # * The Gentoo species can be differentiated from the Adelie and Chinstrap
70 | # species depending on the culmen depth.
71 | #
72 | # ## Regression dataset
73 | #
74 | # In a regression setting, the target is a continuous variable instead of
75 | # categories. Here, we use two features of the dataset to make such a problem:
76 | # the flipper length is used as data and the body mass as the target. In short,
77 | # we want to predict the body mass using the flipper length.
78 | #
79 | # We load the dataset and visualize the relationship between the flipper length
80 | # and the body mass of penguins.
81 |
82 | # %%
83 | penguins = pd.read_csv("../datasets/penguins_regression.csv")
84 |
85 | feature_name = "Flipper Length (mm)"
86 | target_column = "Body Mass (g)"
87 |
88 | # %%
89 | _ = sns.scatterplot(data=penguins, x=feature_name, y=target_column)
90 |
91 | # %% [markdown]
92 | # Here, we deal with a regression problem because our target is a continuous
93 | # variable ranging from 2.7 kg to 6.3 kg. From the scatter plot above, we
94 | # observe that we have a linear relationship between the flipper length and the
95 | # body mass. The longer the flipper of a penguin, the heavier the penguin.
96 |
--------------------------------------------------------------------------------
/python_scripts/trees_ex_01.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M5.01
16 | #
17 | # In the previous notebook, we showed how a tree with a depth of 1 level works.
18 | # The aim of this exercise is to repeat part of the previous experiment for a
19 | # tree with a depth of 2 levels to show how this parameter affects the feature
20 | # space partitioning.
21 | #
22 | # We first load the penguins dataset and split it into a training and a testing
23 | # set:
24 |
25 | # %%
26 | import pandas as pd
27 |
28 | penguins = pd.read_csv("../datasets/penguins_classification.csv")
29 | culmen_columns = ["Culmen Length (mm)", "Culmen Depth (mm)"]
30 | target_column = "Species"
31 |
32 | # %% [markdown]
33 | # ```{note}
34 | # If you want a deeper overview regarding this dataset, you can refer to the
35 | # Appendix - Datasets description section at the end of this MOOC.
36 | # ```
37 |
38 | # %%
39 | from sklearn.model_selection import train_test_split
40 |
41 | data, target = penguins[culmen_columns], penguins[target_column]
42 | data_train, data_test, target_train, target_test = train_test_split(
43 | data, target, random_state=0
44 | )
45 |
46 | # %% [markdown]
47 | # Create a decision tree classifier with a maximum depth of 2 levels and fit the
48 | # training data.
49 |
50 | # %%
51 | # Write your code here.
52 |
53 | # %% [markdown]
54 | # Now plot the data and the decision boundary of the trained classifier to see
55 | # the effect of increasing the depth of the tree.
56 | #
57 | # Hint: Use the class `DecisionBoundaryDisplay` from the module
58 | # `sklearn.inspection` as shown in previous course notebooks.
59 | #
60 | # ```{warning}
61 | # At this time, it is not possible to use `response_method="predict_proba"` for
62 | # multiclass problems on a single plot. This is a planned feature for a future
63 | # version of scikit-learn. In the mean time, you can use
64 | # `response_method="predict"` instead.
65 | # ```
66 |
67 | # %%
68 | # Write your code here.
69 |
70 | # %% [markdown]
71 | # Did we make use of the feature "Culmen Length"? Plot the tree using the
72 | # function `sklearn.tree.plot_tree` to find out!
73 |
74 | # %%
75 | # Write your code here.
76 |
77 | # %% [markdown]
78 | # Compute the accuracy of the decision tree on the testing data.
79 |
80 | # %%
81 | # Write your code here.
82 |
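83 | # %% [markdown]
84 | # A minimal sketch of the full exercise (one possible solution):
85 |
86 | # %%
87 | from sklearn.tree import DecisionTreeClassifier, plot_tree
88 | from sklearn.inspection import DecisionBoundaryDisplay
89 |
90 | tree = DecisionTreeClassifier(max_depth=2)
91 | tree.fit(data_train, target_train)
92 |
93 | # Decision boundary with the training points on top.
94 | display = DecisionBoundaryDisplay.from_estimator(
95 |     tree, data_train, response_method="predict", cmap="RdBu", alpha=0.5
96 | )
97 | _ = display.ax_.scatter(
98 |     data_train[culmen_columns[0]],
99 |     data_train[culmen_columns[1]],
100 |     c=target_train.astype("category").cat.codes,
101 |     edgecolor="black",
102 | )
103 |
104 | # %%
105 | _ = plot_tree(tree, feature_names=culmen_columns, class_names=tree.classes_)
106 |
107 | # %%
108 | tree.score(data_test, target_test)
109 |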
--------------------------------------------------------------------------------
/python_scripts/trees_ex_02.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M5.02
16 | #
17 | # The aim of this exercise is to find out whether a decision tree model is able
18 | # to extrapolate.
19 | #
20 | # By extrapolation, we refer to values predicted by a model outside of the range
21 | # of feature values seen during the training.
22 | #
23 | # We first load the regression data.
24 |
25 | # %%
26 | import pandas as pd
27 |
28 | penguins = pd.read_csv("../datasets/penguins_regression.csv")
29 |
30 | feature_name = "Flipper Length (mm)"
31 | target_name = "Body Mass (g)"
32 | data_train, target_train = penguins[[feature_name]], penguins[target_name]
33 |
34 | # %% [markdown]
35 | # ```{note}
36 | # If you want a deeper overview regarding this dataset, you can refer to the
37 | # Appendix - Datasets description section at the end of this MOOC.
38 | # ```
39 |
40 | # %% [markdown]
41 | # First, create two models, a linear regression model and a decision tree
42 | # regression model, and fit them on the training data. Limit the depth of the
43 | # decision tree to 3 levels.
44 |
45 | # %%
46 | # Write your code here.
47 |
48 | # %% [markdown]
49 | # Create a synthetic dataset containing all possible flipper lengths from the
50 | # minimum to the maximum of the training dataset. Get the predictions of each
51 | # model using this dataset.
52 |
53 | # %%
54 | # Write your code here.
55 |
56 | # %% [markdown]
57 | # Create a scatter plot containing the training samples and superimpose the
58 | # predictions of both models on top.
59 |
60 | # %%
61 | # Write your code here.
62 |
63 | # %% [markdown]
64 | # Now, we check the extrapolation capabilities of each model. Create a dataset
65 | # containing a broader range of values than your previous dataset, in other
66 | # words, add values below and above the minimum and the maximum of the flipper
67 | # length seen during training.
68 |
69 | # %%
70 | # Write your code here.
71 |
72 | # %% [markdown]
73 | # Finally, make predictions with both models on this new interval of data.
74 | # Repeat the plotting of the previous exercise.
75 |
76 | # %%
77 | # Write your code here.
78 |
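79 | # %% [markdown]
80 | # A hedged sketch of the extrapolation check (one possible solution; the 30 mm
81 | # offset below and above the training range is arbitrary):
82 |
83 | # %%
84 | import numpy as np
85 | import matplotlib.pyplot as plt
86 | from sklearn.linear_model import LinearRegression
87 | from sklearn.tree import DecisionTreeRegressor
88 |
89 | linear_regression = LinearRegression().fit(data_train, target_train)
90 | tree = DecisionTreeRegressor(max_depth=3).fit(data_train, target_train)
91 |
92 | offset = 30
93 | data_range = pd.DataFrame(
94 |     np.linspace(
95 |         data_train[feature_name].min() - offset,
96 |         data_train[feature_name].max() + offset,
97 |         num=300,
98 |     ),
99 |     columns=[feature_name],
100 | )
101 | plt.scatter(data_train, target_train, color="black", alpha=0.5)
102 | plt.plot(
103 |     data_range, linear_regression.predict(data_range), label="Linear regression"
104 | )
105 | plt.plot(data_range, tree.predict(data_range), label="Decision tree")
106 | _ = plt.legend()
107 |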
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | scikit-learn>=1.6
2 | pandas >= 1
3 | matplotlib
4 | seaborn >= 0.13
5 | plotly
6 | jupyter-book>=0.11
7 | jupytext
8 | beautifulsoup4
9 | IPython
10 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scikit-learn>=1.6
2 | pandas >= 1
3 | matplotlib
4 | seaborn >= 0.13
5 | plotly
6 | jupyterlab
7 | notebook
8 | IPython
9 |
--------------------------------------------------------------------------------
/slides/Makefile:
--------------------------------------------------------------------------------
1 | # Compilation is done via remarker and htmlark, both pip installable
2 |
3 | all: ml_concepts.html overfitting_vs_underfitting.html \
4 | learning_validation_curves.html bias_vs_variance.html \
5 | linear_models.html regularized_linear_models.html trees.html \
6 | ensemble.html concluding_remarks.html
7 |
8 |
9 | %.html: %.md custom.css
10 | # HTMLArk is to embed images and css
11 | remarker $< -c custom.css > $@
12 |
--------------------------------------------------------------------------------
/slides/README.md:
--------------------------------------------------------------------------------
1 | # View slides
2 |
3 | ## On the .github.io website
4 |
5 | The general pattern is `https://inria.github.io/scikit-learn-mooc/slides/?file=[FILENAME].md`
6 |
7 | Example for ML concepts slides:
8 | https://inria.github.io/scikit-learn-mooc/slides/?file=ml_concepts.md
9 |
10 | ## Locally
11 |
12 | Useful when working on the slides:
13 |
14 | ```bash
15 | # from the root folder of the repo
16 | python -m http.server
17 | 
18 | # open your browser on the right port (shown by the previous command) with the right md file
19 | firefox 'http://localhost:8000/slides/index.html?file=../slides/ml_concepts.md'
20 | ```
21 |
--------------------------------------------------------------------------------
/slides/Ubuntu/Ubuntu-Bold.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-Bold.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu/Ubuntu-BoldItalic.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-BoldItalic.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu/Ubuntu-Italic.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-Italic.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu/Ubuntu-Light.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-Light.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu/Ubuntu-LightItalic.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-LightItalic.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu/Ubuntu-Medium.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-Medium.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu/Ubuntu-MediumItalic.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-MediumItalic.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu/Ubuntu-Regular.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-Regular.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu_Mono/UbuntuMono-Bold.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu_Mono/UbuntuMono-Bold.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu_Mono/UbuntuMono-BoldItalic.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu_Mono/UbuntuMono-BoldItalic.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu_Mono/UbuntuMono-Italic.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu_Mono/UbuntuMono-Italic.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu_Mono/UbuntuMono-Regular.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu_Mono/UbuntuMono-Regular.ttf
--------------------------------------------------------------------------------
/slides/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Presentation
5 |
6 |
7 |
8 |
9 |
10 |
12 |
35 |
36 |
37 |
--------------------------------------------------------------------------------
/slides/intro_words.md:
--------------------------------------------------------------------------------
1 |
2 | Hi,
3 |
4 | Welcome to the Machine learning with scikit-learn course. The goal of
5 | this course is to teach you the practical aspects of machine learning. It
6 | focuses on tabular data, since such data is often encountered in industry.
7 | The course is light on maths and covers not only pure machine learning,
8 | but also the basics of data preparation and visualization for machine
9 | learning. Most of the content is centered on executable Python code that
10 | teaches how to analyse data with tools such as scikit-learn.
11 | 
12 | Our goal is to be didactic. If you know Python programming and basic
13 | numerics, you should be able to follow along. We hope that this course
14 | will help introduce more people to machine learning.
15 | 
16 |
--------------------------------------------------------------------------------