59 |
60 | # #######################################################################################
61 | # Interact link settings
62 | notebook_interface : "notebook"
63 | # notebook_interface: "classic" # The interface interactive links will activate ["classic", "jupyterlab"]
64 |
65 | sphinx:
66 | config:
67 | nb_custom_formats:
68 | .py:
69 | - jupytext.reads
70 | - fmt: py:percent
71 | # Needed for plotly rendering:
72 | # https://jupyterbook.org/interactive/interactive.html#plotly
73 | html_js_files:
74 | - https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js
75 |
76 | #######################################################################################
77 | # Launch button settings
78 | repository:
79 | url : https://github.com/INRIA/scikit-learn-mooc
80 | branch: main
81 |
82 | launch_buttons:
83 | binderhub_url: "https://mybinder.org"
84 | # colab_url: "https://colab.research.google.com" # Not working for now,
85 | # because it needs .ipynb
86 | # Disable thebe support since it does not start in the right folder, see
87 | # https://github.com/INRIA/scikit-learn-mooc/issues/669 for more details
88 | # thebe: true
89 |
90 | binder:
91 | binderhub_url : "https://mybinder.org"
92 | text : "Launch binder"
93 |
94 |
95 | latex:
96 | latex_engine : "xelatex"
97 | latex_documents:
98 | targetname: book.tex
99 |
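100 | # For reference, the nb_custom_formats entry above makes jupytext parse .py
101 | # files written in the "py:percent" format as notebooks. A minimal sketch of
102 | # such a script (hypothetical content) looks like:
103 | #
104 | #   # %% [markdown]
105 | #   # Some narrative text rendered as a Markdown cell.
106 | #
107 | #   # %%
108 | #   print("a code cell")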
--------------------------------------------------------------------------------
/jupyter-book/_static/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/jupyter-book/_static/favicon.ico
--------------------------------------------------------------------------------
/jupyter-book/_static/matomo.js:
--------------------------------------------------------------------------------
1 | var _paq = window._paq = window._paq || [];
2 | /* tracker methods like "setCustomDimension" should be called before "trackPageView" */
3 | _paq.push(['trackPageView']);
4 | _paq.push(['enableLinkTracking']);
5 | (function() {
6 | var u = "https://piwik.inria.fr/";
7 | _paq.push(['setTrackerUrl', u + 'piwik.php']);
8 | _paq.push(['setSiteId', '127']);
9 | var d = document,
10 | g = d.createElement('script'),
11 | s = d.getElementsByTagName('script')[0];
12 | g.async = true; // load the tracker script without blocking page rendering
13 | g.src = u + 'piwik.js';
14 | s.parentNode.insertBefore(g, s); // inject before the first script element
15 | })();
16 |
--------------------------------------------------------------------------------
/jupyter-book/_static/sklearn_mooc.css:
--------------------------------------------------------------------------------
1 |
2 | /*
3 | Note: the video and slides iframes currently use the same CSS styles but use
4 | different classes to get future-proof flexibility.
5 | */
6 |
7 | iframe.video {
8 | width: 100%;
9 | aspect-ratio: 4/3;
10 | margin-bottom: 1em;
11 | }
12 |
13 | iframe.slides {
14 | width: 100%;
15 | aspect-ratio: 4/3;
16 | margin-bottom: 1em;
17 | }
18 |
19 | /*
20 | Better highlighting of modules in toc.html; for some reason modules
21 | have aria-level="2" rather than being h2 elements
22 | */
23 | p[aria-level="2"] {
24 | font-size: 1.2em;
25 | margin-top: 2em;
26 | margin-bottom: 0.5em;
27 | font-weight: bold;
28 | }
29 |
30 | /* The ads on the landing page */
31 |
32 | div.mooc_add {
33 | display: table;
34 | }
35 |
36 | div.mooc_add a {
37 | color: #000000;
38 | display: block;
39 | border-radius: .4em;
40 | background-color: #F7931E;
41 | border: 1px solid #7b5a46;
42 | box-shadow: 1px 1px 1px #CA9875;
43 | padding: 5pt;
44 | }
45 |
46 | @media screen and (min-width: 900px) {
47 | div.mooc_add {
48 | width: 25ex;
49 | position: fixed;
50 | right: calc(5pt + .15 * (100vw - 900px));
51 | bottom: calc(5pt + max(0pt, .05*(100vh - 200px)));
52 | }
53 |
54 |
55 | div.footer {
56 | max-width: 60vw;
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/jupyter-book/_static/sklearn_mooc.js:
--------------------------------------------------------------------------------
1 | (function() {
2 | function inIframe() {
3 | try {
4 | return window.self !== window.top;
5 | } catch (e) {
6 | return true;
7 | }
8 | }
9 |
10 | function contentOnly() {
11 | var urlParams = new URLSearchParams(window.location.search);
12 | return urlParams.get('content_only') !== null;
13 | }
14 |
15 | function removeIfExists(el) {
16 | if (el) {
17 | el.remove();
18 |     }
19 | }
20 |
21 | function adjustBinderLink() {
22 |     // Rewrite Binder links to point to the .ipynb notebooks rather than
23 |     // the .py scripts. In an ideal world, there would be a way to do this
24 |     // in _config.yml, or you could tell Jupyter to open the .py with the
25 |     // Notebook interface, but ?factory=Notebook does not work on the
26 |     // mybinder.org URL, only on the hub.2i2c.mybinder.org URL.
27 | var elements = document.querySelectorAll('.dropdown-launch-buttons a');
28 | elements.forEach(
29 | function(el) {
30 | el.href = el.href.replace(/python_scripts\/(.+)\.py/, "notebooks/$1.ipynb");
31 | }
32 | );
33 | }
34 |
35 | function displayContentOnly() {
36 | removeIfExists(document.querySelector('#site-navigation'));
37 | removeIfExists(document.querySelector('.topbar'));
38 | removeIfExists(document.querySelector('.footer'));
39 | // the prev/next buttons at the bottom of the page may have a different
40 | // class (depending on the theme version maybe?), removing both to be
41 | // safe.
42 | removeIfExists(document.querySelector('.prev-next-bottom'));
43 | removeIfExists(document.querySelector('.prev-next-area'));
44 | var elementsToRemove = document.querySelectorAll('.remove-from-content-only');
45 | elementsToRemove.forEach(
46 | function(el) {
47 | removeIfExists(el);
48 | }
49 | );
50 | document.querySelector('#main-content').querySelector('.col-md-9').className = 'col-12';
51 |
52 | var style = document.createElement('style');
53 | style.appendChild(
54 | document.createTextNode(
55 | 'hypothesis-sidebar, hypothesis-notebook, hypothesis-adder{display:none!important;}'));
56 | document.getElementsByTagName('head')[0].appendChild(style);
57 | }
58 |
59 | document.addEventListener("DOMContentLoaded", function() {
60 | if (inIframe() || contentOnly()) {
61 | displayContentOnly();
62 | }
63 | adjustBinderLink();
64 | });
65 | }());
66 |
--------------------------------------------------------------------------------
/jupyter-book/appendix/acknowledgement.md:
--------------------------------------------------------------------------------
1 | # Acknowledgement
2 |
3 | ## Figure attributions
4 |
5 | The diagram presenting the API design in the module "The predictive modeling
6 | pipeline" used the following figures:
7 |
8 | - The "Parameters Free Icon" is licensed under CC-BY 3.0 -
9 | [source](https://www.onlinewebfonts.com/icon/512285)
10 | - The "Settings Gears SVG Vector" is licensed under CC0 -
11 | [source](https://www.svgrepo.com/svg/57066/settings-gears)
12 | - The "Close icon" is licensed under MIT -
13 | [source](https://www.iconfinder.com/icons/211652/close_icon)
14 |
--------------------------------------------------------------------------------
/jupyter-book/appendix/datasets_intro.md:
--------------------------------------------------------------------------------
1 | # Datasets description
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/appendix/notebook_timings.md:
--------------------------------------------------------------------------------
1 | # Notebook timings
2 |
3 | ```{nb-exec-table}
4 | ```
5 |
--------------------------------------------------------------------------------
/jupyter-book/appendix/toc_redirect.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Table of contents
4 |
--------------------------------------------------------------------------------
/jupyter-book/concluding_remarks_video.md:
--------------------------------------------------------------------------------
1 | # 🎥 Concluding remarks
2 |
3 | <!-- embedded video iframe -->
6 |
--------------------------------------------------------------------------------
/jupyter-book/datasets:
--------------------------------------------------------------------------------
1 | ../datasets
--------------------------------------------------------------------------------
/jupyter-book/ensemble/bagging_slides.md:
--------------------------------------------------------------------------------
1 | # 🎥 Intuitions on ensemble models: bagging
2 |
3 | <!-- embedded video iframe -->
6 |
7 | <!-- embedded slides iframe -->
9 |
10 | To navigate in the slides, **first click on the slides**, then:
11 | - press the **arrow keys** to go to the next/previous slide;
12 | - press **"P"** to toggle presenter mode to see the notes;
13 | - press **"F"** to toggle full-screen mode.
14 |
--------------------------------------------------------------------------------
/jupyter-book/ensemble/boosting_slides.md:
--------------------------------------------------------------------------------
1 | # 🎥 Intuitions on ensemble models: boosting
2 |
3 | <!-- embedded video iframe -->
6 |
7 | <!-- embedded slides iframe -->
9 |
10 | To navigate in the slides, **first click on the slides**, then:
11 | - press the **arrow keys** to go to the next/previous slide;
12 | - press **"P"** to toggle presenter mode to see the notes;
13 | - press **"F"** to toggle full-screen mode.
14 |
--------------------------------------------------------------------------------
/jupyter-book/ensemble/ensemble_boosting_index.md:
--------------------------------------------------------------------------------
1 | # Ensemble based on boosting
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/ensemble/ensemble_bootstrap_index.md:
--------------------------------------------------------------------------------
1 | # Ensemble method using bootstrapping
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/ensemble/ensemble_hyperparameters_index.md:
--------------------------------------------------------------------------------
1 | # Hyperparameter tuning with ensemble methods
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/ensemble/ensemble_module_intro.md:
--------------------------------------------------------------------------------
1 | # Module overview
2 |
3 | ## What you will learn
4 |
5 |
6 |
7 | This module goes into detail regarding algorithms that combine several
8 | models together, also known as ensembles of models. We will present two
9 | families of such techniques: (i) those based on bootstrapping and (ii) those
10 | based on boosting. We will present bagging and random forest, which belong to
11 | the former strategy, and AdaBoost and gradient-boosting decision trees, which
12 | belong to the latter strategy. Finally, we will go into detail regarding the
13 | hyperparameters that allow tuning these models, and compare them across models.
14 |
15 | ## Before getting started
16 |
17 |
18 |
19 | The required technical skills to carry on with this module are:
20 |
21 | - skills acquired during the "The Predictive Modeling Pipeline" module with
22 | basic usage of scikit-learn;
23 | - skills acquired during the "Selecting The Best Model" module, mainly around
24 | the concept of underfit/overfit and the usage of cross-validation in
25 | scikit-learn;
26 | - skills acquired during the modules "Linear Models" and
27 | "Decision Tree Models".
28 |
29 |
30 |
31 | ## Objectives and time schedule
32 |
33 |
34 |
35 | The objectives of this module are the following:
36 |
37 | - understand the principles behind bootstrapping and boosting;
38 | - get intuitions with specific models such as random forest
39 | and gradient boosting;
40 | - identify the important hyperparameters of random forest and gradient boosting
41 | decision trees as well as their typical values.
42 |
43 |
44 |
45 | The estimated time to go through this module is about 6 hours.
46 |
--------------------------------------------------------------------------------
/jupyter-book/ensemble/ensemble_module_take_away.md:
--------------------------------------------------------------------------------
1 | # Main take-away
2 |
3 | ## Wrap-up
4 |
5 |
6 |
7 | In this module, we discussed ensemble learners, which combine several
8 | simpler learners together. We saw two strategies:
9 |
10 | - one based on bootstrap samples, which allows learners to be fit in
11 | parallel;
12 | - the other, called boosting, which fits learners sequentially.
13 |
14 | From these two families, we mainly focused on giving intuitions regarding the
15 | internal machinery of the random forest and gradient-boosting models, which
16 | are state-of-the-art methods.
17 |
18 | ## To go further
19 |
20 |
21 |
22 | You can refer to the following scikit-learn examples which are related to
23 | the concepts approached in this module:
24 |
25 | - [Early-stopping in gradient-boosting](https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_early_stopping.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-early-stopping-py)
26 | - [Combining predictors using stacking](https://scikit-learn.org/stable/auto_examples/ensemble/plot_stack_predictors.html#sphx-glr-auto-examples-ensemble-plot-stack-predictors-py)
27 |
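28 | As a minimal sketch of these two families (the synthetic dataset and the
29 | hyperparameters are only illustrative), one can cross-validate a
30 | bagging-style and a boosting-style ensemble side by side:
31 |
32 | ```python
33 | from sklearn.datasets import make_classification
34 | from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
35 | from sklearn.model_selection import cross_val_score
36 |
37 | X, y = make_classification(n_samples=1_000, random_state=0)
38 |
39 | # Bagging family: deep trees fit independently on bootstrap samples.
40 | forest = RandomForestClassifier(n_estimators=100, random_state=0)
41 | # Boosting family: shallow trees fit sequentially to correct previous errors.
42 | boosting = HistGradientBoostingClassifier(random_state=0)
43 |
44 | for name, model in [("random forest", forest), ("gradient boosting", boosting)]:
45 |     scores = cross_val_score(model, X, y, cv=5)
46 |     print(f"{name}: {scores.mean():.3f} +/- {scores.std():.3f}")
47 | ```
48 |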
--------------------------------------------------------------------------------
/jupyter-book/ensemble/ensemble_quiz_m6_01.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M6.01
2 |
3 | ```{admonition} Question
4 | By default, a
5 | [`BaggingClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html)
6 | or [`BaggingRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html)
7 | draws:
8 |
9 | - a) random samples with replacement over training points
10 | - b) random samples with replacement over features
11 | - c) random samples without replacement over training points
12 | - d) random samples without replacement over features
13 |
14 | _Select all answers that apply_
15 |
16 | Hint: it is possible to access the documentation for those classes by
17 | clicking on the links on their names.
18 | ```
19 |
20 | +++
21 |
22 | ```{admonition} Question
23 | In a
24 | [`BaggingClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html)
25 | or [`BaggingRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html),
26 | the parameter `base_estimator` can be:
27 |
28 | - a) any predictor
29 | - b) a decision tree predictor
30 | - c) a linear model predictor
31 |
32 | _Select a single answer_
33 | ```
34 |
35 | +++
36 |
37 | ```{admonition} Question
38 |
39 | In the context of a classification problem, what are the differences between a
40 | bagging classifier and a random forest classifier:
41 |
42 | - a) in a random forest, the base model is always a decision tree
43 | - b) in a random forest, the split threshold values are decided completely at
44 | random
45 | - c) in a random forest, a random resampling is performed both over features
46 | as well as over samples
47 |
48 | _Select all answers that apply_
49 | ```
50 |
--------------------------------------------------------------------------------
/jupyter-book/ensemble/ensemble_quiz_m6_02.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M6.02
2 |
3 | ```{admonition} Question
4 | Select the correct statements:
5 |
6 | - a) Both bagging and boosting combine several predictors
7 | - b) Both bagging and boosting are based on decision trees
8 | - c) Boosting combines predictors sequentially
9 | - d) Bagging combines predictors simultaneously
10 |
11 | _Select all answers that apply_
12 | ```
13 |
14 | +++
15 |
16 | ```{admonition} Question
17 | Boosting algorithms learn their predictor:
18 |
19 | - a) by training predictors in parallel on slightly different datasets
20 | - b) by training predictors sequentially which correct previous prediction errors
21 | - c) by taking a linear combination of weak predictors
22 |
23 | _Select all answers that apply_
24 | ```
25 |
26 | +++
27 |
28 | ```{admonition} Question
29 | Histogram gradient boosting is an accelerated gradient boosting algorithm that:
30 |
31 | - a) takes a subsample of the original samples
32 | - b) bins the numerical features
33 | - c) takes a subsample of the original features
34 |
35 | _Select a single answer_
36 | ```
37 |
38 | +++
39 |
40 | ```{admonition} Question
41 | Boosting tends to overfit when increasing the number of predictors:
42 |
43 | - a) true
44 | - b) false
45 |
46 | _Select a single answer_
47 | ```
48 |
--------------------------------------------------------------------------------
/jupyter-book/ensemble/ensemble_quiz_m6_03.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M6.03
2 |
3 | ```{admonition} Question
4 | When compared to random forests, gradient boosting is usually trained using:
5 |
6 | - a) shallower trees
7 | - b) deeper trees
8 | - c) a subset of features
9 | - d) all features
10 |
11 | _Select all answers that apply_
12 | ```
13 |
14 | +++
15 |
16 | ```{admonition} Question
17 | Which hyperparameter(s) do not exist in random forest but exist in gradient boosting?
18 |
19 | - a) number of estimators
20 | - b) maximum depth
21 | - c) learning rate
22 |
23 | _Select all answers that apply_
24 | ```
25 |
26 | +++
27 |
28 | ```{admonition} Question
29 | Which of the following options are correct about the benefits of ensemble models?
30 |
31 | - a) Better generalization performance
32 | - b) Reduced sensitivity to hyperparameter tuning of individual predictors
33 | - c) Better interpretability
34 |
35 | _Select all answers that apply_
36 | ```
37 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/cross_validation_baseline_index.md:
--------------------------------------------------------------------------------
1 | # Comparing a model with simple baselines
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/cross_validation_choices_index.md:
--------------------------------------------------------------------------------
1 | # Choice of cross-validation
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/cross_validation_nested_index.md:
--------------------------------------------------------------------------------
1 | # Nested cross-validation
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/evaluation_module_intro.md:
--------------------------------------------------------------------------------
1 | # Module overview
2 |
3 | ## What you will learn
4 |
5 |
6 |
7 | In the previous module, we presented the general cross-validation framework
8 | and used it to evaluate models' performance. However, it is important to
9 | keep in mind that some elements of the cross-validation need to be decided
10 | depending on the nature of the problem: (i) the cross-validation strategy and
11 | (ii) the evaluation metrics. Besides, it is always good to compare a model's
12 | performance against some baseline model.
13 |
14 | In this module, we present both aspects and give insights on when to use a
15 | specific cross-validation strategy and which metric to use. In addition, we
16 | also give some insights regarding how to compare a model with a baseline.
17 |
18 | ## Before getting started
19 |
20 |
21 |
22 | The required technical skills to carry on with this module are:
23 |
24 | - skills acquired during the "The Predictive Modeling Pipeline" module with
25 | basic usage of scikit-learn;
26 | - skills acquired during the "Selecting The Best Model" module, mainly around
27 | the concept of underfit/overfit and the usage of cross-validation in
28 | scikit-learn.
29 |
30 |
31 |
32 | ## Objectives and time schedule
33 |
34 |
35 |
36 | The objectives of this module are the following:
37 |
38 | - understand the necessity of using an appropriate cross-validation strategy
39 | depending on the data;
40 | - get the intuitions behind comparing a model with some basic models that
41 | can be used as baseline;
42 | - understand the principles behind using nested cross-validation when the model
43 | needs to be evaluated as well as optimized;
44 | - understand the differences between regression and classification metrics;
45 | - understand the differences between metrics.
46 |
47 |
48 |
49 | The estimated time to go through this module is about 6 hours.
50 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/evaluation_module_take_away.md:
--------------------------------------------------------------------------------
1 | # Main take-away
2 |
3 | ## Wrap-up
4 |
5 |
6 |
7 | In this module, we presented the framework used in machine learning to
8 | evaluate a predictive model's performance: cross-validation.
9 |
10 | Besides, we presented several splitting strategies that can be used in the
11 | general cross-validation framework. These strategies should be used wisely
12 | when encountering some specific patterns or types of data.
13 |
14 | Finally, we showed how to perform nested cross-validation to select an optimal
15 | model and evaluate its generalization performance.
16 |
17 | ## To go further
18 |
19 |
20 |
21 | You can refer to the following scikit-learn examples which are related to
22 | the concepts approached in this module:
23 |
24 | - [Comparison of cross-validation strategies](https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py)
25 |
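26 | As a minimal sketch of nested cross-validation (the model and parameter grid
27 | are only illustrative), an inner loop tunes a hyperparameter while an outer
28 | loop evaluates the tuned model:
29 |
30 | ```python
31 | from sklearn.datasets import load_breast_cancer
32 | from sklearn.model_selection import GridSearchCV, cross_val_score
33 | from sklearn.svm import SVC
34 |
35 | X, y = load_breast_cancer(return_X_y=True)
36 |
37 | # Inner cross-validation: select the best C on each training split.
38 | tuned_model = GridSearchCV(SVC(), param_grid={"C": [0.1, 1, 10]}, cv=5)
39 | # Outer cross-validation: evaluate the tuned model on held-out folds.
40 | scores = cross_val_score(tuned_model, X, y, cv=5)
41 | print(f"{scores.mean():.3f} +/- {scores.std():.3f}")
42 | ```
43 |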
--------------------------------------------------------------------------------
/jupyter-book/evaluation/evaluation_quiz_m7_01.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M7.01
2 |
3 | ```{admonition} Question
4 | What is the benefit of using cross-validation?
5 |
6 | - a) Give information about performance variability
7 | - b) Remove the need to use a baseline algorithm
8 | - c) Give information regarding under- or over-fitting of a model
9 |
10 | _Select all answers that apply_
11 | ```
12 |
13 | +++
14 |
15 | ```{admonition} Question
16 | Does a dummy classifier or regressor rely on the feature values in the
17 | input data `X` to make its predictions?
18 |
19 | - a) Yes
20 | - b) No
21 |
22 | _Select a single answer_
23 | ```
24 |
25 | +++
26 |
27 | ```{admonition} Question
28 | Does a dummy classifier from scikit-learn always make constant predictions
29 | whatever the chosen strategy?
30 |
31 | - a) Yes
32 | - b) No
33 |
34 | _Select a single answer_
35 | ```
36 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/evaluation_quiz_m7_02.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M7.02
2 |
3 | ```{admonition} Question
4 | We have a dataset with patient records from 10 different hospitals, and our goal
5 | is to predict whether a patient has a disease or not. Let's also suppose that
6 | the classes ("disease" and "no-disease") are imbalanced. Additionally, we suspect
7 | that each hospital's data may have systematic biases due to factors like
8 | medical devices, policies, socioeconomic status of the patients, etc.
9 |
10 | Which cross-validation strategy is the most suitable for assessing the model's
11 | ability to make good predictions on patients from hospitals not seen during
12 | training?
13 |
14 | - a) Group stratified k-fold cross-validation
15 | - b) Group k-fold
16 | - c) Stratified k-fold cross-validation
17 | - d) Leave-one-out cross-validation
18 |
19 | _Select a single answer_
20 | ```
21 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/evaluation_quiz_m7_03.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M7.03
2 |
3 | ```{admonition} Question
4 | How to evaluate and tune the hyperparameters of a model?
5 |
6 | - a) Fit the model on the train set, set the parameters using the test set, and
7 | evaluate the model on the same test set
8 | - b) Fit the model on the train set, set the parameters using a validation set,
9 | and evaluate the model on the test set
10 | - c) Use nested cross-validation, with an inner cross-validation to tune the
11 | parameters of the model and an outer cross-validation to evaluate the model's
12 | performance
13 |
14 | _Select all answers that apply_
15 | ```
16 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/evaluation_quiz_m7_04.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M7.04
2 |
3 | ```{admonition} Question
4 | What is the default score in scikit-learn when using a classifier?
5 |
6 | - a) balanced accuracy
7 | - b) ROC-AUC
8 | - c) accuracy
9 |
10 | _Select a single answer_
11 | ```
12 |
13 | +++
14 |
15 | ```{admonition} Question
16 | Other than the decision threshold, metrics such as recall and precision also
17 | depend on the regularization parameters. Assuming that class "1" (in red) is the
18 | positive class, use the following figures to select which statements are true in
19 | this particular logistic regression model:
20 |
21 | 
22 | 
23 |
24 | - a) stronger regularization leads to higher precision
25 | - b) stronger regularization leads to lower precision
26 | - c) stronger regularization leads to higher recall
27 | - d) stronger regularization leads to lower recall
28 |
29 | _Select all answers that apply_
30 | ```
31 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/evaluation_quiz_m7_05.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M7.05
2 |
3 | ```{admonition} Question
4 | What is the default score in scikit-learn when using a regressor?
5 |
6 | - a) $R^2$
7 | - b) mean absolute error
8 | - c) median absolute error
9 |
10 | _Select a single answer_
11 | ```
12 |
13 | +++
14 |
15 | ```{admonition} Question
16 | If we observe that the values returned by
17 | `cross_val_score(model, X, y, scoring="r2")` increase after changing the model
18 | parameters, it means that the latest model:
19 |
20 | - a) generalizes better
21 | - b) generalizes worse
22 |
23 | _Select a single answer_
24 | ```
25 |
26 | +++
27 |
28 | ```{admonition} Question
29 | If all the values returned by
30 | `cross_val_score(model_A, X, y, scoring="neg_mean_squared_error")`
31 | are strictly lower than those returned by
32 | `cross_val_score(model_B, X, y, scoring="neg_mean_squared_error")`
33 | it means that `model_B` generalizes:
34 |
35 | - a) better than `model_A`
36 | - b) worse than `model_A`
37 |
38 | Hint: Remember that `"neg_mean_squared_error"` is an alias for the negative of
39 | the Mean Squared Error.
40 |
41 | _Select a single answer_
42 | ```
43 |
44 | +++
45 |
46 | ```{admonition} Question
47 | Values returned by `cross_val_score(model, X, y, scoring="neg_mean_squared_error")`
48 | are:
49 |
50 | - a) guaranteed to be positive or zero
51 | - b) guaranteed to be negative or zero
52 | - c) can be either positive or negative depending on the data
53 |
54 | _Select a single answer_
55 | ```
56 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/metrics_classification_index.md:
--------------------------------------------------------------------------------
1 | # Classification metrics
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/evaluation/metrics_regression_index.md:
--------------------------------------------------------------------------------
1 | # Regression metrics
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/feature_selection/feature_selection_limitation_index.md:
--------------------------------------------------------------------------------
1 | # Caveats of feature selection
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/feature_selection/feature_selection_module_intro.md:
--------------------------------------------------------------------------------
1 | # Module overview
2 |
3 | ## What you will learn
4 |
5 |
6 |
7 | This module gives some insights regarding feature selection. Besides motivating
8 | the benefit of using feature selection, we also illustrate some of the known
9 | caveats.
10 |
11 | ## Before getting started
12 |
13 |
14 |
15 | The required technical skills to carry on with this module are:
16 |
17 | - skills acquired during the "The Predictive Modeling Pipeline" module with
18 | basic usage of scikit-learn;
19 | - skills acquired during the "Selecting The Best Model" module, mainly around
20 | the concept of underfit/overfit and the usage of cross-validation in
21 | scikit-learn.
22 |
23 |
24 |
25 | ## Objectives and time schedule
26 |
27 |
28 |
29 | The objectives of this module are the following:
30 |
31 | - understand in which cases feature selection is beneficial;
32 | - be aware of the caveats and how to put feature selection techniques
33 | into practice.
34 |
35 |
36 |
37 | The estimated time to go through this module is about 50 minutes.
38 |
--------------------------------------------------------------------------------
/jupyter-book/feature_selection/feature_selection_module_take_away.md:
--------------------------------------------------------------------------------
1 | # Main take-away
2 |
3 | ## Wrap-up
4 |
5 |
6 |
7 | In this module, we presented the principle of feature selection. In short,
8 | feature selection is not a magical tool to get marginal gains. We tackled
9 | the following aspects:
10 |
11 | - you should use feature selection to speed up training and testing rather
12 | than to seek marginal performance gains;
13 | - you should be careful regarding the evaluation framework and how to include
14 | a feature selector within your pipeline;
15 | - you should be aware of the limitations of feature selectors based on
16 | machine-learning models.
17 |
18 | ## To go further
19 |
20 |
21 |
22 | You can refer to the following scikit-learn examples which are related to
23 | the concepts approached during this module:
24 |
25 | - [Recursive feature selection using cross-validation](https://scikit-learn.org/stable/auto_examples/feature_selection/plot_rfe_with_cross_validation.html#sphx-glr-auto-examples-feature-selection-plot-rfe-with-cross-validation-py)
26 |
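27 | As a minimal sketch of including the selector within the pipeline (the
28 | selector and its parameters are only illustrative), the selection step is
29 | then re-fit on each training split, which avoids leaking information from
30 | the test folds:
31 |
32 | ```python
33 | from sklearn.datasets import make_classification
34 | from sklearn.feature_selection import SelectKBest, f_classif
35 | from sklearn.linear_model import LogisticRegression
36 | from sklearn.model_selection import cross_val_score
37 | from sklearn.pipeline import make_pipeline
38 |
39 | X, y = make_classification(n_samples=500, n_features=50, random_state=0)
40 |
41 | # The selector is a pipeline step, so it only ever sees the training folds.
42 | model = make_pipeline(SelectKBest(f_classif, k=10), LogisticRegression())
43 | scores = cross_val_score(model, X, y, cv=5)
44 | print(f"{scores.mean():.3f} +/- {scores.std():.3f}")
45 | ```
46 |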
--------------------------------------------------------------------------------
/jupyter-book/feature_selection/feature_selection_quiz.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz
2 |
3 | ```{admonition} Question
4 | What is the main advantage of using feature selection?
5 |
6 | - a) speeding-up the training of an algorithm
7 | - b) fine tuning the model's performance
8 | - c) remove noisy features
9 |
10 | _Select a single answer_
11 | ```
12 |
13 | +++
14 |
15 | ```{admonition} Question
16 | When selecting features, the decision should be made using:
17 |
18 | - a) the entire dataset
19 | - b) the training set
20 | - c) the testing set
21 |
22 | _Select a single answer_
23 | ```
24 |
--------------------------------------------------------------------------------
/jupyter-book/figures:
--------------------------------------------------------------------------------
1 | ../figures
--------------------------------------------------------------------------------
/jupyter-book/interpretation/interpretation_quiz.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz
2 |
3 | ```{admonition} Question
4 | With the same dataset, feature importance might differ if:
5 |
6 | - a) we use two different models
7 | - b) we use two different train/test splits with the same model
8 | - c) we use the same model with a different set of hyper-parameters
9 | - d) we use the same model with the same set of hyper-parameters but a
10 | different `random_state`
11 | ```
12 |
13 | +++
14 |
15 | ```{admonition} Question
16 | In linear models, the feature importance:
17 |
18 | - a) might be inferred from the coefficients
19 | - b) might be inferred with `permutation_importance`
20 | - c) needs regularization to be inferred
21 | - d) is a built-in attribute
22 | ```
23 |
24 | +++
25 |
26 | ```{admonition} Question
27 | If two features are identical (and thus correlated):
28 |
29 | - a) their feature importances will be the same
30 | - b) their feature importances will be divided by 2
31 | - c) only one will receive all the feature importance, and the other will be 0
32 | - d) it depends
33 | ```
34 |
35 | +++
36 |
37 | ```{admonition} Question
38 | The feature importance provided by the scikit-learn random forest:
39 |
40 | - a) is biased towards categorical features
41 | - b) is biased towards continuous (high-cardinality) features
42 | - c) is independent of the train/test split
43 | - d) is independent of the hyper-parameters
44 | ```
45 |
46 | +++
47 |
48 | ```{admonition} Question
49 | To evaluate the feature importance for a specific model, one could:
50 |
51 | - a) drop a column and compare the scores
52 | - b) shuffle a column and compare the scores
53 | - c) set a whole column to 0 and compare the scores
54 | - d) replace a column's values with random numbers and compare the scores
55 | ```
56 |
--------------------------------------------------------------------------------
/jupyter-book/linear_models/linear_models_intuitions_index.md:
--------------------------------------------------------------------------------
1 | # Intuitions on linear models
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/linear_models/linear_models_module_intro.md:
--------------------------------------------------------------------------------
1 | # Module overview
2 |
3 | ## What you will learn
4 |
5 |
6 |
7 | In this module, we will go further into detail regarding models that use
8 | linear parametrization.
9 | We will see how to use this family of models for both classification and
10 | regression problems. Besides, we will explain how to fight over-fitting using
11 | regularization.
12 | Finally, we will show how linear models can be used with
13 | data presenting non-linearity.
14 |
15 | ## Before getting started
16 |
17 |
18 |
19 | The required technical skills to carry on with this module are:
20 |
21 | - skills acquired during the "The Predictive Modeling Pipeline" module with
22 | basic usage of scikit-learn;
23 | - skills acquired during the "Selecting The Best Model" module, mainly around
24 | the concept of underfit/overfit and the usage of cross-validation in
25 | scikit-learn.
26 |
27 |
28 |
29 | ## Objectives and time schedule
30 |
31 |
32 |
33 | In this module, your objectives are to:
34 |
35 | - understand the parametrization of linear models;
36 | - understand what linear models imply for both
37 | regression and classification;
38 | - get intuitions on linear models applied to higher-dimensional datasets;
39 | - understand the effect of regularization and how to set it;
40 | - understand how linear models can be used even with data showing a non-linear
41 | relationship with the target to be predicted.
42 |
43 |
44 |
45 | The estimated time to go through this module is about 6 hours.
46 |
--------------------------------------------------------------------------------
/jupyter-book/linear_models/linear_models_module_take_away.md:
--------------------------------------------------------------------------------
1 | # Main take-away
2 |
3 | ## Wrap-up
4 |
5 |
6 |
7 | In this module, we saw that:
8 |
9 | - the predictions of a linear model depend on a weighted sum of the values of
10 | the input features added to an intercept parameter;
11 | - fitting a linear model consists in adjusting both the weight coefficients and
12 | the intercept to minimize the prediction errors on the training set;
13 | - to train linear models successfully it is often required to scale the input
14 | features approximately to the same dynamic range;
15 | - regularization can be used to reduce over-fitting: weight coefficients are
16 | constrained to stay small when fitting;
17 | - the regularization hyperparameter needs to be fine-tuned by cross-validation
18 | for each new machine learning problem and dataset;
19 | - linear models can be used on problems where the target variable is not
20 | linearly related to the input features but this requires extra feature
21 | engineering work to transform the data in order to avoid under-fitting.
22 |
23 | ## To go further
24 |
25 |
26 |
27 | You can refer to the following scikit-learn examples which are related to
28 | the concepts approached during this module:
29 |
30 | - [Example of linear regression](https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html#sphx-glr-auto-examples-linear-model-plot-ols-py)
31 | - [Comparison between a linear regression and a ridge regressor](https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols_ridge_variance.html#sphx-glr-auto-examples-linear-model-plot-ols-ridge-variance-py)
32 |
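33 | As a minimal sketch of these take-aways (the dataset and the `alphas` grid
34 | are only illustrative), one can scale the features and tune the
35 | regularization strength by cross-validation in a single pipeline:
36 |
37 | ```python
38 | from sklearn.datasets import make_regression
39 | from sklearn.linear_model import RidgeCV
40 | from sklearn.pipeline import make_pipeline
41 | from sklearn.preprocessing import StandardScaler
42 |
43 | X, y = make_regression(n_samples=200, n_features=10, noise=10, random_state=0)
44 |
45 | # Scale features to comparable ranges, then tune alpha by internal CV.
46 | model = make_pipeline(StandardScaler(), RidgeCV(alphas=[0.01, 0.1, 1, 10]))
47 | model.fit(X, y)
48 | print(model[-1].alpha_)  # regularization strength selected by cross-validation
49 | ```
50 |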
--------------------------------------------------------------------------------
/jupyter-book/linear_models/linear_models_non_linear_index.md:
--------------------------------------------------------------------------------
1 | # Non-linear feature engineering for linear models
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/linear_models/linear_models_quiz_m4_01.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M4.01
2 |
3 | ```{admonition} Question
4 | What is a linear regression?
5 |
6 | - a) a model that outputs a continuous prediction as the sum of the values of a
7 | **limited** subset of the input features
8 | - b) a model that outputs a binary prediction based on a linear combination
9 | of the values of the input features
10 | - c) a model that outputs a continuous prediction as a weighted sum of the input
11 | features
12 |
13 | _Select a single answer_
14 | ```
15 |
16 | +++
17 |
18 | ```{admonition} Question
19 | Is it possible to get a perfect fit (zero prediction error on the training set)
20 | with a linear classifier **by itself** on a non-linearly separable dataset?
21 |
22 | - a) yes
23 | - b) no
24 |
25 | _Select a single answer_
26 | ```
27 |
28 | +++
29 |
30 | ```{admonition} Question
31 | If we fit a linear regression where `X` is a single column vector, how many
32 | parameters will our model be made of?
33 |
34 | - a) 1
35 | - b) 2
36 | - c) 3
37 |
38 | _Select a single answer_
39 | ```
40 |
41 | +++
42 |
43 | ```{admonition} Question
44 | If we train a scikit-learn `LinearRegression` with `X` being a single column
45 | vector and `y` a vector, `coef_` and `intercept_` will be respectively:
46 |
47 | - a) an array of shape (1, 1) and a number
48 | - b) an array of shape (1,) and an array of shape (1,)
49 | - c) an array of shape (1, 1) and an array of shape (1,)
50 | - d) an array of shape (1,) and a number
51 |
52 | _Select a single answer_
53 | ```
54 |
55 | +++
56 |
57 | ```{admonition} Question
58 | The decision boundaries of a logistic regression model:
59 |
60 | - a) split classes using only one of the input features
61 | - b) split classes using a combination of the input features
62 | - c) often have curved shapes
63 |
64 | _Select a single answer_
65 | ```
66 |
67 | +++
68 |
69 | ```{admonition} Question
70 | For a binary classification task, what is the shape of the array returned by the
71 | `predict_proba` method for 10 input samples?
72 |
73 | - a) (10,)
74 | - b) (10, 2)
75 | - c) (2, 10)
76 |
77 | _Select a single answer_
78 | ```
79 |
80 | +++
81 |
82 | ```{admonition} Question
83 | In logistic regression's `predict_proba` method in scikit-learn, which of the
84 | following statements is true regarding the predicted probabilities?
85 |
86 | - a) The sum of probabilities across different classes for a given sample is always equal to 1.0.
87 | - b) The sum of probabilities across all samples for a given class is always equal to 1.0.
88 | - c) The sum of probabilities across all features for a given class is always equal to 1.0.
89 |
90 | _Select a single answer_
91 | ```
92 |
--------------------------------------------------------------------------------
/jupyter-book/linear_models/linear_models_quiz_m4_02.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M4.02
2 |
3 | ```{admonition} Question
4 |
5 | Let us consider a pipeline that combines a polynomial feature extraction of
6 | degree 2 and a linear regression model. Let us assume that the linear regression
7 | coefficients are all non-zero and that the dataset contains a single feature.
8 | Is the prediction function of this pipeline a straight line?
9 |
10 | - a) yes
11 | - b) no
12 |
13 | _Select a single answer_
14 | ```
15 |
16 | +++
17 |
18 | ```{admonition} Question
19 | When fitting a linear regression where `X` has `n_features` columns and the
20 | target is a single continuous vector, what is the respective type/shape of
21 | `coef_` and `intercept_`?
22 |
23 | - a) it is not possible to fit a linear regression in dimension higher than 2
24 | - b) array of shape (`n_features`,) and a float
25 | - c) array of shape (1, `n_features`) and an array of shape (1,)
26 |
27 | _Select a single answer_
28 | ```
29 |
30 | +++
31 |
32 | ```{admonition} Question
33 | Combining (one or more) feature engineering transformers in a single pipeline:
34 |
35 | - a) increases the expressivity of the model
36 | - b) ensures that models extrapolate accurately regardless of the distribution of the data
37 | - c) may require tuning additional hyperparameters
38 | - d) inherently prevents any underfitting
39 |
40 | _Select all answers that apply_
41 | ```
42 |
--------------------------------------------------------------------------------
/jupyter-book/linear_models/linear_models_quiz_m4_03.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M4.03
2 |
3 | ```{admonition} Question
4 | Which of the following estimators can solve linear regression problems?
5 |
6 | - a) sklearn.linear_model.LinearRegression
7 | - b) sklearn.linear_model.LogisticRegression
8 | - c) sklearn.linear_model.Ridge
9 |
10 | _Select all answers that apply_
11 | ```
12 |
13 | +++
14 |
15 | ```{admonition} Question
16 | Regularization allows:
17 |
18 | - a) to create a model robust to outliers (samples that differ widely from
19 | other observations)
20 | - b) to reduce overfitting by forcing the weights to stay close to zero
21 | - c) to reduce underfitting by making the problem linearly separable
22 |
23 | _Select a single answer_
24 | ```
25 |
26 | +++
27 |
28 | ```{admonition} Question
29 | A ridge model is:
30 |
31 | - a) the same as linear regression with penalized weights
32 | - b) the same as logistic regression with penalized weights
33 | - c) a linear model
34 | - d) a non linear model
35 |
36 | _Select all answers that apply_
37 | ```
38 |
39 | +++
40 |
41 | ```{admonition} Question
42 | Assume that a data scientist has prepared a train/test split and plans to use
43 | the test for the final evaluation of a `Ridge` model. The parameter `alpha` of
44 | the `Ridge` model:
45 |
46 | - a) is internally tuned when calling `fit` on the train set
47 | - b) should be tuned by running cross-validation on a **train set**
48 | - c) should be tuned by running cross-validation on a **test set**
49 | - d) must be a positive number
50 |
51 | _Select all answers that apply_
52 | ```
53 |
54 | +++
55 |
56 | ```{admonition} Question
57 | Scaling the data before fitting a model:
58 |
59 | - a) is often useful for regularized linear models
60 | - b) is always necessary for regularized linear models
61 | - c) may speed-up fitting
62 | - d) has no impact on the optimal choice of the value of a regularization parameter
63 |
64 | _Select all answers that apply_
65 | ```
66 |
67 | +++
68 |
69 | ```{admonition} Question
70 | The effect of increasing the regularization strength in a ridge model is to:
71 |
72 | - a) shrink all weights towards zero
73 | - b) make all weights equal
74 | - c) set a subset of the weights to exactly zero
75 | - d) constrain all the weights to be positive
76 |
77 | _Select all answers that apply_
78 | ```
79 |
80 | +++
81 |
82 | ```{admonition} Question
83 | By default, a [`LogisticRegression`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) in scikit-learn applies:
84 |
85 | - a) no penalty
86 | - b) a penalty that shrinks the magnitude of the weights towards zero (also called "l2 penalty")
87 | - c) a penalty that ensures all weights are equal
88 |
89 | _Select a single answer_
90 | ```
91 |
92 | +++
93 |
94 | ```{admonition} Question
95 | The parameter `C` in a logistic regression is:
96 |
97 | - a) similar to the parameter `alpha` in a ridge regressor
98 | - b) similar to `1 / alpha` where `alpha` is the parameter of a ridge regressor
99 | - c) not controlling the regularization
100 |
101 | _Select a single answer_
102 | ```
103 |
104 | +++
105 |
106 | ```{admonition} Question
107 | In logistic regression, increasing the regularization strength (by
108 | decreasing the value of `C`) makes the model:
109 |
110 | - a) more likely to overfit to the training data
111 | - b) more confident: the values returned by `predict_proba` are closer to 0 or 1
112 | - c) less complex, potentially underfitting the training data
113 |
114 | _Select a single answer_
115 | ```
116 |
--------------------------------------------------------------------------------
/jupyter-book/linear_models/linear_models_regularization_index.md:
--------------------------------------------------------------------------------
1 | # Regularization in linear model
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/linear_models/linear_models_slides.md:
--------------------------------------------------------------------------------
1 | # 🎥 Intuitions on linear models
2 |
3 | <!-- embedded video iframe -->
6 |
7 | <!-- embedded slides iframe -->
9 |
10 | To navigate in the slides, **first click on the slides**, then:
11 | - press the **arrow keys** to go to the next/previous slide;
12 | - press **"P"** to toggle presenter mode to see the notes;
13 | - press **"F"** to toggle full-screen mode.
14 |
--------------------------------------------------------------------------------
/jupyter-book/linear_models/regularized_linear_models_slides.md:
--------------------------------------------------------------------------------
1 | # 🎥 Intuitions on regularized linear models
2 |
3 | <!-- embedded video iframe -->
6 |
7 | <!-- embedded slides iframe -->
9 |
10 | To navigate in the slides, **first click on the slides**, then:
11 | - press the **arrow keys** to go to the next/previous slide;
12 | - press **"P"** to toggle presenter mode to see the notes;
13 | - press **"F"** to toggle full-screen mode.
14 |
--------------------------------------------------------------------------------
/jupyter-book/ml_concepts/quiz_intro_01.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz Intro.01
2 |
3 | Consider the following case study: pricing apartments based on a real estate website. We have
4 | thousands of house descriptions with their price. Typically, an example of a
5 | house description is the following:
6 |
7 | "Great for entertaining: spacious, updated 2 bedroom, 1 bathroom apartment in
8 | Lakeview, 97630. The house will be available from May 1st. Close to nightlife
9 | with private backyard. Price ~$1,000,000."
10 |
11 | We are interested in predicting house prices from their description. One
12 | potential use case for this would be, as a buyer, to find houses that are cheap
13 | compared to their market value.
14 |
15 | ```{admonition} Question
16 | What kind of problem is it?
17 |
18 | - a) a supervised problem
19 | - b) an unsupervised problem
20 | - c) a classification problem
21 | - d) a regression problem
22 |
23 | _Select all answers that apply_
24 | ```
25 |
26 | +++
27 |
28 | ```{admonition} Question
29 | What are the features?
30 |
31 | - a) the number of rooms might be a feature
32 | - b) the post code of the house might be a feature
33 | - c) the price of the house might be a feature
34 |
35 | _Select all answers that apply_
36 | ```
37 |
38 | +++
39 |
40 | ```{admonition} Question
41 | What is the target variable?
42 |
43 | - a) the full text description is the target
44 | - b) the price of the house is the target
45 | - c) only house descriptions with no price mentioned are the target
46 |
47 | _Select a single answer_
48 | ```
49 |
50 | +++
51 |
52 | ```{admonition} Question
53 | What is a record (a sample)?
54 |
55 | - a) each house description is a record
56 | - b) each house price is a record
57 | - c) each kind of description (such as the house size) is a record
58 |
59 | _Select a single answer_
60 | ```
61 |
--------------------------------------------------------------------------------
/jupyter-book/ml_concepts/slides.md:
--------------------------------------------------------------------------------
1 | # 🎥 Introducing machine-learning concepts
2 |
3 | This presentation will teach you the basic concepts: what machine learning is,
4 | the types of sub-problems it covers, the vocabulary, and the general
5 | pipeline.
6 |
7 | <!-- embedded video iframe -->
10 |
11 | <!-- embedded slides iframe -->
13 |
14 | To navigate in the slides, **first click on the slides**, then:
15 | - press the **arrow keys** to go to the next/previous slide;
16 | - press **"P"** to toggle presenter mode to see the notes;
17 | - press **"F"** to toggle full-screen mode.
18 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/bias_vs_variance_quiz_m2_03.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M2.03
2 |
3 | ```{admonition} Question
4 | Fitting a model with a high bias:
5 |
6 | - a) causes an underfitted model?
7 | - b) causes an overfitted model?
8 | - c) increases the sensitivity of the learned prediction function to a random resampling of the training set observations?
9 | - d) causes the learned prediction function to make systematic errors?
10 |
11 | _Select all answers that apply_
12 | ```
13 |
14 | +++
15 |
16 | ```{admonition} Question
17 | Fitting a high variance model:
18 |
19 | - a) causes an underfitted model?
20 | - b) causes an overfitted model?
21 | - c) increases the sensitivity of the learned prediction function to a random resampling of the training set observations?
22 | - d) causes the learned prediction function to make systematic errors?
23 |
24 | _Select all answers that apply_
25 | ```
26 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/bias_vs_variance_slides.md:
--------------------------------------------------------------------------------
1 | # 🎥 Bias versus Variance
2 |
3 | <!-- embedded video iframe -->
6 |
7 | <!-- embedded slides iframe -->
9 |
10 | To navigate in the slides, **first click on the slides**, then:
11 | - press the **arrow keys** to go to the next/previous slide;
12 | - press **"P"** to toggle presenter mode to see the notes;
13 | - press **"F"** to toggle full-screen mode.
14 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/learning_validation_curves_quiz_m2_02.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M2.02
2 |
3 | ```{admonition} Question
4 | A model is overfitting when:
5 |
6 | - a) both the train and test errors are high
7 | - b) train error is low but test error is high
8 | - c) train error is high but the test error is low
9 | - d) both train and test errors are low
10 |
11 | _Select a single answer_
12 | ```
13 |
14 | +++
15 |
16 | ```{admonition} Question
17 | Assuming that we have a dataset with little noise, a model is underfitting when:
18 |
19 | - a) both the train and test errors are high
20 | - b) train error is low but test error is high
21 | - c) train error is high but the test error is low
22 | - d) both train and test errors are low
23 |
24 | _Select a single answer_
25 | ```
26 |
27 | +++
28 |
29 | ```{admonition} Question
30 | For a fixed training set, by sequentially adding parameters to give more
31 | flexibility to the model, we are more likely to observe:
32 |
33 | - a) a wider difference between train and test errors
34 | - b) a reduction in the difference between train and test errors
35 | - c) an increased or steady train error
36 | - d) a decrease in the train error
37 |
38 | _Select all answers that apply_
39 | ```
40 |
41 | +++
42 |
43 | ```{admonition} Question
44 | For a fixed choice of model parameters, if we increase the number of labeled
45 | observations in the training set, we are more likely to observe:
46 |
47 | - a) a wider difference between train and test errors
48 | - b) a reduction in the difference between train and test errors
49 | - c) an increased or steady train error
50 | - d) a decrease in the train error
51 |
52 | _Select all answers that apply_
53 | ```
54 |
55 | +++
56 |
57 | ```{admonition} Question
58 | Polynomial models with a high degree parameter:
59 |
60 | - a) always have the best test error (but can be slow to train)
61 | - b) underfit more than linear regression models
62 | - c) get lower training error than lower degree polynomial models
63 | - d) are more likely to overfit than lower degree polynomial models
64 |
65 | _Select all answers that apply_
66 | ```
67 |
68 | +++
69 |
70 | ```{admonition} Question
71 | If we choose the parameters of a model to get the best overfitting/underfitting
72 | tradeoff, we will always get a zero test error.
73 |
74 | - a) True
75 | - b) False
76 |
77 | _Select a single answer_
78 | ```
79 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/learning_validation_curves_slides.md:
--------------------------------------------------------------------------------
1 | # 🎥 Comparing train and test errors
2 |
3 | <!-- embedded video iframe -->
6 |
7 | <!-- embedded slides iframe -->
9 |
10 | To navigate in the slides, **first click on the slides**, then:
11 | - press the **arrow keys** to go to the next/previous slide;
12 | - press **"P"** to toggle presenter mode to see the notes;
13 | - press **"F"** to toggle full-screen mode.
14 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/overfit_bias_variance_index.md:
--------------------------------------------------------------------------------
1 | # Bias versus variance trade-off
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/overfit_module_intro.md:
--------------------------------------------------------------------------------
1 | # Module overview
2 |
3 | ## What you will learn
4 |
5 |
6 |
7 | This module gives an intuitive introduction to the very **fundamental
8 | concepts** of overfitting and underfitting in machine learning.
9 |
10 | Machine learning models can never make perfect predictions: the test error is
11 | never exactly zero. This failure comes from a **fundamental trade-off** between
12 | **modeling flexibility** and the **limited size of the training dataset**.
13 |
14 | The first presentation will define those problems and characterize how and why
15 | they arise.
16 |
17 | Then we will present a methodology to quantify those problems by **contrasting
18 | the train error with the test error** for various choices of model family and
19 | model parameters. More importantly, we will emphasize the **impact of the size
20 | of the training set on this trade-off**.
21 |
22 | Finally we will relate overfitting and underfitting to the concepts of
23 | statistical variance and bias.
24 |
25 | ## Before getting started
26 |
27 |
28 |
29 | The required technical skills to carry on with this module are:
30 |
31 | - skills acquired during the "The Predictive Modeling Pipeline" module with
32 | basic usage of scikit-learn.
33 |
34 |
35 |
36 | ## Objectives and time schedule
37 |
38 |
39 |
40 | The objectives of this module are the following:
41 |
42 | - understand the concept of overfitting and underfitting;
43 | - understand the concept of generalization;
44 | - understand the general cross-validation framework used to evaluate a model.
45 |
46 |
47 |
48 | The estimated time to go through this module is about 3 hours.
49 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/overfit_overfitting_underfitting_index.md:
--------------------------------------------------------------------------------
1 | # Overfitting and underfitting
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/overfit_take_away.md:
--------------------------------------------------------------------------------
1 | # Main take-away
2 |
3 | ## Wrap-up
4 |
5 | - **Overfitting** is caused by the **limited size of the training set**, the
6 | **noise** in the data, and the **high flexibility** of common machine learning
7 | models.
8 |
9 | - **Underfitting** happens when the learnt prediction functions suffer from
10 | **systematic errors**. This can be caused by a choice of model family and
11 | parameters, which leads to a **lack of flexibility** to capture the repeatable
12 | structure of the true data generating process.
13 |
14 | - For a fixed training set, the objective is to **minimize the test error** by
15 | adjusting the model family and its parameters to find the
16 | **best trade-off between overfitting and underfitting** (see the sketch below).
17 |
18 | - For a given choice of model family and parameters, **increasing the
19 | training set size will decrease overfitting** but can also cause an increase
20 | of underfitting.
21 |
22 | - The test error of a model that is neither overfitting nor underfitting can
23 | still be high if the variations of the target variable cannot be fully
24 | determined by the input features. This irreducible error is caused by what we
25 | sometimes call label noise. In practice, this often happens when we do not
26 | have access to important features for one reason or another.
27 |
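As a concrete illustration of these trade-offs, here is a minimal sketch (using
a synthetic dataset from `make_regression`, purely for illustration) that
contrasts train and test scores with scikit-learn's `validation_curve` as the
flexibility of a decision tree grows with `max_depth`:

```python
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import validation_curve
from sklearn.tree import DecisionTreeRegressor

# A small, noisy synthetic dataset, so that overfitting is easy to provoke.
X, y = make_regression(n_samples=200, n_features=5, noise=10.0, random_state=0)

# Cross-validated train/test scores for increasing model flexibility.
max_depths = np.arange(1, 11)
train_scores, test_scores = validation_curve(
    DecisionTreeRegressor(random_state=0),
    X,
    y,
    param_name="max_depth",
    param_range=max_depths,
    cv=5,
)

for depth, train, test in zip(
    max_depths, train_scores.mean(axis=1), test_scores.mean(axis=1)
):
    # A widening gap between train and test scores signals overfitting;
    # low scores on both sides signal underfitting.
    print(f"max_depth={depth:2d}  train R2={train:.2f}  test R2={test:.2f}")
```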
28 | ## To go further
29 |
30 | It is possible to give a precise mathematical treatment of the bias and the
31 | variance of a regression model. The Wikipedia article on the [Bias-variance
32 | tradeoff](https://en.wikipedia.org/wiki/Bias%E2%80%93variance_tradeoff) explains
33 | how the **squared test error can be decomposed as the sum of the squared bias,
34 | the variance and the irreducible error** for a given regression problem.
35 |
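Concretely, for the squared error at a fixed input $x$, with $f$ the true data
generating function, $\hat{f}$ the learnt prediction function and $\sigma^2$ the
variance of the label noise (the expectations being taken over resampled
training sets and noise), this standard decomposition reads:

$$
\mathbb{E}\big[(y - \hat{f}(x))^2\big]
= \underbrace{\big(\mathbb{E}[\hat{f}(x)] - f(x)\big)^2}_{\text{squared bias}}
+ \underbrace{\mathbb{E}\big[(\hat{f}(x) - \mathbb{E}[\hat{f}(x)])^2\big]}_{\text{variance}}
+ \underbrace{\sigma^2}_{\text{irreducible error}}
$$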
36 | The next chapters on linear models, decision trees and ensembles will give
37 | concrete examples on how to diagnose and how to tackle overfitting and
38 | underfitting.
39 |
40 | You can refer to the following scikit-learn examples which are related to
41 | the concepts approached during this module:
42 |
43 | - [Illustration of underfitting and overfitting concepts](https://scikit-learn.org/stable/auto_examples/model_selection/plot_underfitting_overfitting.html#sphx-glr-auto-examples-model-selection-plot-underfitting-overfitting-py)
44 | - [Difference between train and test scores](https://scikit-learn.org/stable/auto_examples/model_selection/plot_train_error_vs_test_error.html#sphx-glr-auto-examples-model-selection-plot-train-error-vs-test-error-py)
45 | - [Example of a validation curve](https://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html#sphx-glr-auto-examples-model-selection-plot-validation-curve-py)
46 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/overfit_validation_learning_curves_index.md:
--------------------------------------------------------------------------------
1 | # Validation and learning curves
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/overfitting_vs_under_fitting_quiz_m2_01.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M2.01
2 |
3 | ```{admonition} Question
4 | A model that is underfitting:
5 |
6 | - a) is too complex and thus highly flexible
7 | - b) is too constrained and thus limited by its expressivity
8 | - c) often makes prediction errors, even on training samples
9 | - d) focuses too much on noisy details of the training set
10 |
11 | _Select all answers that apply_
12 | ```
13 |
14 | +++
15 |
16 | ```{admonition} Question
17 | A model that is overfitting:
18 |
19 | - a) is too complex and thus highly flexible
20 | - b) is too constrained and thus limited by its expressivity
21 | - c) often makes prediction errors, even on training samples
22 | - d) focuses too much on noisy details of the training set
23 |
24 | _Select all answers that apply_
25 | ```
26 |
--------------------------------------------------------------------------------
/jupyter-book/overfit/overfitting_vs_under_fitting_slides.md:
--------------------------------------------------------------------------------
1 | # 🎥 Overfitting and Underfitting
2 |
10 | To navigate in the slides, **first click on the slides**, then:
11 | - press the **arrow keys** to go to the next/previous slide;
12 | - press **"P"** to toggle presenter mode to see the notes;
13 | - press **"F"** to toggle full-screen mode.
14 |
--------------------------------------------------------------------------------
/jupyter-book/predictive_modeling_pipeline/01_tabular_data_exploration_index.md:
--------------------------------------------------------------------------------
1 | # Tabular data exploration
2 |
3 | ```{tableofcontents}
4 | ```
5 |
--------------------------------------------------------------------------------
/jupyter-book/predictive_modeling_pipeline/01_tabular_data_exploration_quiz_m1_01.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M1.01
2 |
3 | ```{admonition} Question
4 | In the notebook "First look at our dataset", we used pandas and specifically
5 | `adult_census = pd.read_csv("../datasets/adult-census.csv")` to:
6 |
7 | - a) load a comma-separated values file
8 | - b) load a dataset already included in the pandas package
9 | - c) load a file only containing the survey features
10 | - d) load a file only containing the target of our classification problem:
11 | whether a person has a low or high income
12 | - e) load a file containing both the features and the target for our classification
13 | problem
14 |
15 | _Select all answers that apply_
16 | ```
17 |
18 | +++
19 |
20 | ```{admonition} Question
21 |
22 | In the previous notebook, we used:
23 |
24 | - a) pandas to gain insights about the dataset
25 | - b) pandas and seaborn to visually inspect the dataset
26 | - c) numpy and scipy to perform numerical inspection (for instance using
27 | `scipy.optimize.minimize`)
28 | - d) scikit-learn to fit some machine learning models
29 |
30 | _Select all answers that apply_
31 | ```
32 |
33 | +++
34 |
35 | ```{admonition} Question
36 | How is a tabular dataset organized?
37 |
38 | - a) a column represents a sample and a row represents a feature
39 | - b) a column represents a feature and a row represents a sample
40 | - c) the target variable is represented by a row
41 | - d) the target variable is represented by a column
42 |
43 | _Select all answers that apply_
44 | ```
45 |
46 | +++
47 |
48 | ```{admonition} Question
49 | A categorical variable is:
50 |
51 | - a) a variable with **only two** different possible values
52 | - b) a variable with continuous numerical values
53 | - c) a variable with a finite set of possible values
54 |
55 | _Select a single answer_
56 | ```
57 |
--------------------------------------------------------------------------------
/jupyter-book/predictive_modeling_pipeline/02_numerical_pipeline_index.md:
--------------------------------------------------------------------------------
1 | # Fitting a scikit-learn model on numerical data
2 |
3 | ```{tableofcontents}
4 | ```
5 |
--------------------------------------------------------------------------------
/jupyter-book/predictive_modeling_pipeline/02_numerical_pipeline_quiz_m1_02.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M1.02
2 |
3 | ```{admonition} Question
4 | Why do we need two sets: a train set and a test set?
5 |
6 | - a) to train the model faster
7 | - b) to validate the model on unseen data
8 | - c) to improve the accuracy of the model
9 |
10 | _Select all answers that apply_
11 | ```
12 |
13 | +++
14 |
15 | ```{admonition} Question
16 | The generalization performance of a scikit-learn model can be evaluated by:
17 |
18 | - a) calling `fit` to train the model on the **training set**, `predict` on the
19 | **test set** to get the predictions, and compute the score by passing the
20 | predictions and the true target values to some metric function
21 | - b) calling `fit` to train the model on the **training set** and `score` to compute
22 | the score on the **test set**
23 | - c) calling `cross_validate` by passing the model, the data and the target
24 | - d) calling `fit_transform` on the data and then `score` to compute
25 | the score on the **test set**
26 |
27 | _Select all answers that apply_
28 | ```
29 |
30 | +++
31 |
32 | ```{admonition} Question
33 | When calling `cross_validate(estimator, X, y, cv=5)`, the following happens:
34 |
35 | - a) `X` and `y` are internally split five times with non-overlapping test sets
36 | - b) `estimator.fit` is called 5 times on the full `X` and `y`
37 | - c) `estimator.fit` is called 5 times, each time on a different training set
38 | - d) a Python dictionary is returned containing a key/value containing a NumPy
39 | array with 5 scores computed on the **train sets**
40 | - e) a Python dictionary is returned containing a key/value containing a NumPy
41 | array with 5 scores computed on the **test sets**
42 |
43 | _Select all answers that apply_
44 | ```
45 |
46 | +++
47 |
48 | We define a 2-dimensional dataset represented graphically as follows:
49 |
50 | 
51 |
52 | ```{admonition} Question
53 | If we process the dataset using a `StandardScaler` with the default parameters,
54 | which of the following results do you expect:
55 |
56 | 
57 |
58 | - a) Preprocessing A
59 | - b) Preprocessing B
60 | - c) Preprocessing C
61 | - d) Preprocessing D
62 |
63 | _Select a single answer_
64 | ```
65 |
66 | +++
67 |
68 | ```{admonition} Question
69 | Look at the plots and the answers of the previous question. A `StandardScaler`
70 | transformer with the default parameters:
71 |
72 | - a) transforms the features so that they have similar ranges
73 | - b) transforms the features to lie in the [0.0, 1.0] range
74 | - c) transforms feature values that were originally positive-only into values that can
75 | be negative or positive
76 | - d) can help logistic regression converge faster (fewer iterations)
77 |
78 | _Select all answers that apply_
79 | ```
80 |
81 | +++
82 |
83 | ```{admonition} Question
84 | Cross-validation allows us to:
85 |
86 | - a) train the model faster
87 | - b) measure the generalization performance of the model
88 | - c) estimate the variability of the generalization score
89 |
90 | _Select all answers that apply_
91 | ```
92 |
93 | +++
94 |
95 | ```{admonition} Question
96 | `make_pipeline` (as well as `Pipeline`):
97 |
98 | - a) runs a cross-validation using the transformers and predictor given as
99 | parameters
100 | - b) combines one or several transformers and a predictor
101 | - c) tries several models at the same time
102 | - d) plots feature histograms automatically
103 |
104 | _Select all answers that apply_
105 | ```
106 |
--------------------------------------------------------------------------------
/jupyter-book/predictive_modeling_pipeline/02_numerical_pipeline_video_cross_validation.md:
--------------------------------------------------------------------------------
1 | # 🎥 Validation of a model
2 |
10 | To navigate in the slides, **first click on the slides**, then:
11 | - press the **arrow keys** to go to the next/previous slide;
12 | - press **"P"** to toggle presenter mode to see the notes;
13 | - press **"F"** to toggle full-screen mode.
14 |
--------------------------------------------------------------------------------
/jupyter-book/predictive_modeling_pipeline/03_categorical_pipeline_index.md:
--------------------------------------------------------------------------------
1 | # Handling categorical data
2 |
3 | ```{tableofcontents}
4 | ```
5 |
--------------------------------------------------------------------------------
/jupyter-book/predictive_modeling_pipeline/03_categorical_pipeline_quiz_m1_03.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M1.03
2 |
3 | ```{admonition} Question
4 | How are categorical variables represented?
5 |
6 | - a) a categorical feature is only represented by non-numerical data
7 | - b) a categorical feature represents a finite number of values called categories
8 | - c) a categorical feature can be represented by either numerical or non-numerical values
9 |
10 | _Select all answers that apply_
11 | ```
12 |
13 | +++
14 |
15 | ```{admonition} Question
16 | An ordinal variable:
17 |
18 | - a) is a categorical variable with a large number of different categories;
19 | - b) can be represented by integers or string labels;
20 | - c) is a categorical variable with a meaningful order.
21 |
22 | _Select all answers that apply_
23 | ```
24 |
25 | +++
26 |
27 | ```{admonition} Question
28 | One-hot encoding:
29 |
30 | - a) encodes each column with string-labeled values into a single integer-coded column
31 | - b) transforms a numerical variable into a categorical variable
32 | - c) creates one additional column for each possible category
33 | - d) transforms string-labeled variables using a numerical representation
34 |
35 | _Select all answers that apply_
36 | ```
37 |
38 | +++
39 |
40 | ```{admonition} Question
41 |
42 | Assume we have a dataset where each line describes a company. Which of the
43 | following columns should be considered as a meaningful **numerical feature** to
44 | train a machine learning model to classify companies:
45 |
46 | - a) the sector of activity ("construction", "retail", "energy", "insurance"...)
47 | - b) the phone number of the sales department
48 | - c) the number of employees
49 | - d) the profits of the last quarter
50 | - e) the post code of the headquarters
51 |
52 | _Select all answers that apply_
53 | ```
54 |
--------------------------------------------------------------------------------
/jupyter-book/predictive_modeling_pipeline/03_categorical_pipeline_visualization_video.md:
--------------------------------------------------------------------------------
1 | # 🎥 Visualizing scikit-learn pipelines in Jupyter
2 |
--------------------------------------------------------------------------------
/jupyter-book/predictive_modeling_pipeline/predictive_modeling_module_intro.md:
--------------------------------------------------------------------------------
1 | # Module overview
2 |
3 | ## What you will learn
4 |
5 |
6 |
7 | This module will give an example of a typical predictive modeling pipeline
8 | developed using tabular data (data that can be structured in a 2-dimensional
9 | table). We will present this pipeline in a progressive way. First, we will
10 | analyze the dataset used. Subsequently, we will train our first
11 | predictive pipeline with a subset of the dataset. Then, we will give particular
12 | attention to the type of data, numerical and categorical, that our model has to
13 | handle. Finally, we will extend our pipeline to use mixed types of data, i.e.
14 | numerical and categorical data.
15 |
16 | ## Before getting started
17 |
18 |
19 |
20 | The technical skills required to follow this module are:
21 |
22 | - basic knowledge of Python programming
23 | - some prior experience with the NumPy, pandas and Matplotlib libraries is
24 | recommended but not required
25 |
26 |
27 |
28 | For a quick introduction on these requirements, you can use the following resources:
29 | - [Introduction to Python](https://scipy-lectures.org/intro/language/python_language.html)
30 | - [Introduction to NumPy](https://sebastianraschka.com/blog/2020/numpy-intro.html)
31 | - [Introduction to Pandas](https://pandas.pydata.org/docs/user_guide/10min.html)
32 | - [Introduction to Matplotlib](https://sebastianraschka.com/blog/2020/numpy-intro.html#410-matplotlib)
33 |
34 | ## Objectives and time schedule
35 |
36 |
37 |
38 | The objectives of this module are the following:
39 |
40 | - build intuitions regarding an unknown dataset;
41 | - identify and differentiate numerical and categorical features;
42 | - create an advanced predictive pipeline with scikit-learn.
43 |
44 |
45 |
46 | The estimated time to go through this module is about 6 hours.
47 |
--------------------------------------------------------------------------------
/jupyter-book/predictive_modeling_pipeline/predictive_modeling_module_take_away.md:
--------------------------------------------------------------------------------
1 | # Main take-away
2 |
3 | ## Wrap-up
4 |
5 |
6 |
7 | In this module, you learned:
8 |
9 | - to create a scikit-learn predictive model;
10 | - about the scikit-learn API to train and test a predictive model;
11 | - to process numerical data, notably using a `Pipeline`;
12 | - to process categorical data, notably using a `OneHotEncoder` and an
13 | `OrdinalEncoder`;
14 | - to handle and process mixed data types (i.e. numerical and
15 | categorical data), notably using a `ColumnTransformer`; see the sketch below.
16 |
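The sketch below puts these pieces together end to end. It assumes the adult
census dataset is available at the path used throughout the notebooks, and the
column selection by dtype is one possible choice among others:

```python
import pandas as pd
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

adult_census = pd.read_csv("../datasets/adult-census.csv")
target = adult_census["class"]
data = adult_census.drop(columns="class")

# Scale the numerical columns and one-hot encode the categorical ones.
preprocessor = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include="number")),
    (
        OneHotEncoder(handle_unknown="ignore"),
        make_column_selector(dtype_exclude="number"),
    ),
)

# Chain the preprocessing and the predictor, then evaluate by cross-validation.
model = make_pipeline(preprocessor, LogisticRegression(max_iter=1000))
cv_results = cross_validate(model, data, target, cv=5)
print(cv_results["test_score"].mean())
```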
17 | ## To go further
18 |
19 |
20 |
21 | You can refer to the following scikit-learn examples which are related to
22 | the concepts approached during this module:
23 |
24 | - [Predictive machine learning pipeline with mixed data types](https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#sphx-glr-auto-examples-compose-plot-column-transformer-mixed-types-py)
25 | - [Importance of feature scaling](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html#sphx-glr-auto-examples-preprocessing-plot-scaling-importance-py)
26 |
--------------------------------------------------------------------------------
/jupyter-book/python_scripts:
--------------------------------------------------------------------------------
1 | ../python_scripts
--------------------------------------------------------------------------------
/jupyter-book/scikit-learn-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/jupyter-book/scikit-learn-logo.png
--------------------------------------------------------------------------------
/jupyter-book/toc.md:
--------------------------------------------------------------------------------
1 | # Table of contents
2 |
3 | ```{tableofcontents}
4 | ```
5 |
--------------------------------------------------------------------------------
/jupyter-book/trees/slides.md:
--------------------------------------------------------------------------------
1 | # 🎥 Intuitions on tree-based models
2 |
10 | To navigate in the slides, **first click on the slides**, then:
11 | - press the **arrow keys** to go to the next/previous slide;
12 | - press **"P"** to toggle presenter mode to see the notes;
13 | - press **"F"** to toggle full-screen mode.
14 |
--------------------------------------------------------------------------------
/jupyter-book/trees/trees_classification_index.md:
--------------------------------------------------------------------------------
1 | # Decision tree in classification
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/trees/trees_hyperparameters_index.md:
--------------------------------------------------------------------------------
1 | # Hyperparameters of decision tree
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/trees/trees_intuitions_index.md:
--------------------------------------------------------------------------------
1 | # Intuitions on tree-based models
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/trees/trees_module_intro.md:
--------------------------------------------------------------------------------
1 | # Module overview
2 |
3 | ## What you will learn
4 |
5 |
6 |
7 | This module presents decision tree models in detail. These models will be
8 | explained for both classification and regression problems. Besides, we will
9 | show which hyperparameters of decision trees influence their
10 | performance, allowing us to find the best trade-off between underfitting and overfitting.
11 |
12 | ## Before getting started
13 |
14 |
15 |
16 | The technical skills required to follow this module are:
17 |
18 | - skills acquired during the "The Predictive Modeling Pipeline" module with
19 | basic usage of scikit-learn;
20 | - skills acquired during the "Selecting The Best Model" module, mainly around
21 | the concept of underfit/overfit and the usage of cross-validation in
22 | scikit-learn.
23 |
24 |
25 |
26 | ## Objectives and time schedule
27 |
28 |
29 |
30 | The objectives of this module are the following:
31 |
32 | - understand how decision trees work in classification and regression;
33 | - check which tree parameters are important and how they influence the model.
34 |
35 |
36 |
37 | The estimated time to go through this module is about 3 hours.
38 |
--------------------------------------------------------------------------------
/jupyter-book/trees/trees_module_take_away.md:
--------------------------------------------------------------------------------
1 | # Main take-away
2 |
3 | ## Wrap-up
4 |
5 |
6 |
7 | In this module, we presented decision trees in detail. We saw that they:
8 |
9 | - are suited for both regression and classification problems;
10 | - are non-parametric models;
11 | - are not able to extrapolate (see the sketch below);
12 | - are sensitive to hyperparameter tuning.
13 |
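As a minimal sketch of the extrapolation point (on made-up 1D data, purely for
illustration), note how the prediction freezes at the value of a boundary leaf
outside the range seen during `fit`:

```python
import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(0)
X_train = rng.uniform(0, 10, size=(100, 1))
y_train = 3.0 * X_train.ravel() + rng.normal(scale=1.0, size=100)

tree = DecisionTreeRegressor(max_depth=3, random_state=0)
tree.fit(X_train, y_train)

# Inside [0, 10] the piecewise-constant prediction follows the trend, but at
# x=20 and x=-5 the tree simply repeats its right/left boundary leaf values.
print(tree.predict([[5.0], [20.0], [-5.0]]))
```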
14 | ## To go further
15 |
16 |
17 |
18 | You can refer to the following scikit-learn examples which are related to
19 | the concepts approached during this module:
20 |
21 | - [Example of decision tree regressor](https://scikit-learn.org/stable/auto_examples/tree/plot_tree_regression.html#sphx-glr-auto-examples-tree-plot-tree-regression-py)
22 | - [Example of decision tree classifier](https://scikit-learn.org/stable/auto_examples/tree/plot_iris_dtc.html#sphx-glr-auto-examples-tree-plot-iris-dtc-py)
23 | - [Understanding the tree structure in scikit-learn](https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html#sphx-glr-auto-examples-tree-plot-unveil-tree-structure-py)
24 | - [Post-pruning decision trees](https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py)
25 |
--------------------------------------------------------------------------------
/jupyter-book/trees/trees_quiz_m5_01.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M5.01
2 |
3 | ```{admonition} Question
4 | From the presentation given in the video, to which kinds of supervised learning
5 | tasks can decision trees be applied:
6 |
7 | - a) classification tasks
8 | - b) regression tasks
9 | - c) clustering tasks
10 |
11 | _Select all answers that apply_
12 | ```
13 |
14 | +++
15 |
16 | ```{admonition} Question
17 | A given split node in a decision tree classifier makes:
18 |
19 | - a) a binary decision considering a single feature at a time
20 | - b) a binary decision considering a combination of all the input features
21 | - c) multiple binary decisions considering a single feature
22 | - d) a binary decision considering a non-linear combination of all input
23 | features
24 |
25 | _Select a single answer_
26 | ```
27 |
28 | +++
29 |
30 | ```{admonition} Question
31 | Which aspect of the decision tree learning procedure is most typically used to
32 | control the underfitting/overfitting trade-off?
33 |
34 | - a) The number of children of a split node
35 | - b) The magnitude of the weight coefficients
36 | - c) The maximum depth of the decision tree
37 |
38 | _Select a single answer_
39 | ```
40 |
--------------------------------------------------------------------------------
/jupyter-book/trees/trees_quiz_m5_02.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M5.02
2 |
3 | ```{admonition} Question
4 | For a decision tree built in scikit-learn, a split:
5 |
6 | - a) will use a single feature to create a rule
7 | - b) will use a combination of the features to create a rule
8 | - c) will create multiple separations, one for each class
9 |
10 | _Select a single answer_
11 | ```
12 |
13 | +++
14 |
15 | ```{admonition} Question
16 | Trees are built incrementally:
17 |
18 | - a) by splitting data over and over
19 | - b) by refining the rules of each node
20 | - c) by refining the rules of each leaf
21 |
22 | _Select a single answer_
23 | ```
24 |
25 | +++
26 |
27 | ```{admonition} Question
28 | A decision tree split is built:
29 |
30 | - a) using a random threshold
31 | - b) using the median value of a single feature as a threshold
32 | - c) using a threshold that minimizes an error
33 |
34 | _Select all answers that apply_
35 | ```
36 |
--------------------------------------------------------------------------------
/jupyter-book/trees/trees_quiz_m5_03.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M5.03
2 |
3 | ```{admonition} Question
4 | When fitting a decision tree regressor in scikit-learn, the predicted values on
5 | a leaf correspond to:
6 |
7 | - a) the median of the training samples at this node
8 | - b) the mean of the training samples at this node
9 | - c) the most frequent value of the training samples at this node
10 |
11 | _Select a single answer_
12 | ```
13 |
14 | +++
15 |
16 | ```{admonition} Question
17 | Decision tree regressors can predict:
18 |
19 | - a) any values, including values larger or smaller than those observed in `y_train`;
20 | - b) only values in the range from `np.min(y_train)` to `np.max(y_train)`.
21 |
22 | _Select a single answer_
23 | ```
24 |
25 | +++
26 |
27 | ```{admonition} Question
28 | The predictions of a tree regressor correspond to:
29 |
30 | - a) a piecewise-linear function
31 | - b) a piecewise-constant function
32 | - c) a piecewise-cubic function
33 |
34 | _Select a single answer_
35 | ```
36 |
--------------------------------------------------------------------------------
/jupyter-book/trees/trees_quiz_m5_04.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M5.04
2 |
3 | ```{admonition} Question
4 | If a decision tree is overfitting, you need to increase the maximum depth.
5 |
6 | - a) True
7 | - b) False
8 |
9 | _Select a single answer_
10 | ```
11 |
12 | +++
13 |
14 | ```{admonition} Question
15 | How should you choose the maximum depth of a decision tree?
16 |
17 | - a) choosing the depth maximizing the score on a validation set with a
18 | cross-validation, with a grid-search for instance
19 | - b) choosing the depth maximizing the score on the train set
20 | - c) choosing the depth maximizing the score on the test set
21 |
22 | _Select all answers that apply_
23 | ```
24 |
--------------------------------------------------------------------------------
/jupyter-book/trees/trees_regression_index.md:
--------------------------------------------------------------------------------
1 | # Decision tree in regression
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/tuning/parameter_tuning_automated_index.md:
--------------------------------------------------------------------------------
1 | # Automated tuning
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/tuning/parameter_tuning_manual_index.md:
--------------------------------------------------------------------------------
1 | # Manual tuning
2 |
3 | ```{tableofcontents}
4 |
5 | ```
6 |
--------------------------------------------------------------------------------
/jupyter-book/tuning/parameter_tuning_manual_quiz_m3_01.md:
--------------------------------------------------------------------------------
1 | # ✅ Quiz M3.01
2 |
3 | ```{admonition} Question
4 | Which parameters below are hyperparameters of `HistGradientBoostingClassifier`?
5 | Remember we only consider hyperparameters to be those that potentially impact
6 | the result of the learning procedure and subsequent predictions.
7 |
8 | - a) `C`
9 | - b) `max_leaf_nodes`
10 | - c) `verbose`
11 | - d) `classes_`
12 | - e) `learning_rate`
13 |
14 | _Select all answers that apply_
15 | ```
16 |
17 | +++
18 |
19 | ````{admonition} Question
20 | Given an instance named `model` as defined by:
21 | ```python
22 | from sklearn.linear_model import LogisticRegression
23 | model = LogisticRegression()
24 | ```
25 |
26 | how do you get the value of the `C` parameter?
27 | - a) `model.get_parameters()['C']`
28 | - b) `model.get_params()['C']`
29 | - c) `model.get_params('C')`
30 | - d) `model.get_params['C']`
31 |
32 | _Select a single answer_
33 | ````
34 |
35 | +++
36 |
37 | ````{admonition} Question
38 | Given `model` defined by:
39 | ```python
40 | from sklearn.linear_model import LogisticRegression
41 |
42 | model = LogisticRegression()
43 | ```
44 |
45 | how do you set the value of the `C` parameter to `5`?
46 | - a) `model.set_params('C', 5)`
47 | - b) `model.set_params({'C': 5})`
48 | - c) `model.set_params()['C'] = 5`
49 | - d) `model.set_params(C=5)`
50 |
51 | _Select a single answer_
52 | ````
53 |
54 | +++
55 |
56 | ````{admonition} Question
57 | Given `model` defined by:
58 | ```python
59 | from sklearn.preprocessing import StandardScaler
60 | from sklearn.linear_model import LogisticRegression
61 | from sklearn.pipeline import Pipeline
62 |
63 | model = Pipeline([
64 | ('scaler', StandardScaler()),
65 | ('classifier', LogisticRegression())
66 | ])
67 | ```
68 |
69 | how do you set the value of the `C` parameter of the `LogisticRegression` component to 5:
70 | - a) `model.set_params(C=5) `
71 | - b) `model.set_params(logisticregression__C=5)`
72 | - c) `model.set_params(classifier__C=5) `
73 | - d) `model.set_params(classifier--C=5)`
74 |
75 | _Select a single answer_
76 | ````
77 |
--------------------------------------------------------------------------------
/jupyter-book/tuning/parameter_tuning_module_intro.md:
--------------------------------------------------------------------------------
1 | # Module overview
2 |
3 | ## What you will learn
4 |
5 |
6 |
7 | In the previous modules, we showed how to create, train, predict, and even
8 | evaluate a predictive model. However, we did not change the models'
9 | parameters that can be given when creating an instance. For example,
10 | for k-nearest neighbors, we initially used this default parameter:
11 | `n_neighbors=5` before trying other model parameters.
12 |
13 | These parameters are called **hyperparameters**: they are parameters
14 | used to control the learning process, for instance the parameter `k`
15 | of the k-nearest neighbors. Hyperparameters are specified by the user,
16 | often manually tuned (or by an exhaustive automatic search), and
17 | cannot be estimated from the data. They should not be confused with
18 | the other parameters that are inferred during the training
19 | process. These parameters define the model itself, for instance
20 | `coef_` for the linear models.
21 |
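As a minimal sketch of this distinction (on a toy dataset, purely for
illustration):

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=100, random_state=0)

model = LogisticRegression(C=0.5)  # hyperparameter: chosen by the user
print(model.get_params()["C"])     # readable even before fitting -> 0.5

model.fit(X, y)
print(model.coef_)                 # fitted parameter: only exists after `fit`

model.set_params(C=2.0)            # hyperparameters can be changed...
model.fit(X, y)                    # ...and the model refitted
```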
22 | In this module, we will first show that the hyperparameters have an impact on
23 | the performance of the model and that default values are not necessarily the
24 | best option. Subsequently, we will show how to set hyperparameters in a
25 | scikit-learn model. Finally, we will show strategies that allow picking a
26 | combination of hyperparameters that maximizes the model's performance.
27 |
28 | ## Before getting started
29 |
30 |
31 |
32 | The technical skills required to follow this module are:
33 |
34 | - skills acquired during the "The Predictive Modeling Pipeline" module with
35 | basic usage of scikit-learn;
36 | - skills related to using the cross-validation framework to evaluate a model.
37 |
38 |
39 |
40 | ## Objectives and time schedule
41 |
42 |
43 |
44 | The objectives of this module are the following:
45 |
46 | - understand what a model hyperparameter is;
47 | - understand how to get and set the value of a hyperparameter in a scikit-learn
48 | model;
49 | - be able to fine-tune a full predictive modeling pipeline;
50 | - understand and visualize the combination of parameters that improves the
51 | performance of a model.
52 |
53 |
54 |
55 | The estimated time to go through this module is about 3 hours.
56 |
--------------------------------------------------------------------------------
/jupyter-book/tuning/parameter_tuning_module_take_away.md:
--------------------------------------------------------------------------------
1 | # Main take-away
2 |
3 | ## Wrap-up
4 |
5 |
6 |
7 | - Hyperparameters have an impact on the models' performance and should be
8 | wisely chosen;
9 | - The search for the best hyperparameters can be automated with a grid-search
10 | approach or a randomized-search approach;
11 | - A grid-search can be computationally expensive and becomes less attractive as
12 | the number of hyperparameters to explore increases. Moreover, the combinations
13 | are sampled on a fixed, regular grid.
14 | - A randomized-search allows exploring within a fixed budget, even as the number
15 | of hyperparameters increases. In this case, combinations can be sampled either
16 | on a regular grid or from a given distribution. Both strategies are sketched below.
17 |
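Here is a minimal sketch of both strategies (the toy dataset, the parameter
names and the ranges are illustrative, not the course's reference solution):

```python
from scipy.stats import loguniform
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

X, y = make_classification(n_samples=200, random_state=0)
model = LogisticRegression()

# Grid-search: every combination of a fixed, regular grid is evaluated.
grid = GridSearchCV(model, param_grid={"C": [0.01, 0.1, 1, 10]}, cv=5)
grid.fit(X, y)
print(grid.best_params_)

# Randomized-search: a fixed budget of `n_iter` candidates is sampled, here
# from a log-uniform distribution over C.
rand = RandomizedSearchCV(
    model,
    param_distributions={"C": loguniform(1e-3, 1e3)},
    n_iter=10,
    cv=5,
    random_state=0,
)
rand.fit(X, y)
print(rand.best_params_)
```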
18 | ## To go further
19 |
20 |
21 |
22 | You can refer to the following scikit-learn examples which are related to
23 | the concepts approached during this module:
24 |
25 | - [Example of a grid-search](https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py)
26 | - [Example of a randomized-search](https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html#sphx-glr-auto-examples-model-selection-plot-randomized-search-py)
27 | - [Example of a nested cross-validation](https://scikit-learn.org/stable/auto_examples/model_selection/plot_nested_cross_validation_iris.html#sphx-glr-auto-examples-model-selection-plot-nested-cross-validation-iris-py)
28 |
--------------------------------------------------------------------------------
/jupyter-book/tuning/parameter_tuning_parallel_plot_video.md:
--------------------------------------------------------------------------------
1 | # 🎥 Analysis of hyperparameter search results
2 |
--------------------------------------------------------------------------------
/local-install-instructions.md:
--------------------------------------------------------------------------------
1 | # Local install instructions
2 |
3 | The course uses Python 3 and some data analysis packages such as NumPy, pandas,
4 | scikit-learn, and Matplotlib.
5 |
6 | ## Install Miniconda
7 |
8 | **This step is only necessary if you don't have conda installed already**:
9 |
10 | - download the Miniconda installer for your operating system (Windows, macOS
11 | or Linux) [here](https://docs.conda.io/en/latest/miniconda.html)
12 | - run the installer following the instructions
13 | [here](https://conda.io/projects/conda/en/latest/user-guide/install/index.html#regular-installation)
14 | depending on your operating system.
15 |
16 | ## Create conda environment
17 |
18 | ```sh
19 | # Clone this repo
20 | git clone https://github.com/INRIA/scikit-learn-mooc
21 | cd scikit-learn-mooc
22 | # Create a conda environment with the required packages for this tutorial:
23 | conda env create -f environment.yml
24 | ```
25 |
26 | ## Check your install
27 |
28 | To make sure you have all the necessary packages installed, we **strongly
29 | recommend** you to execute the `check_env.py` script located at the root of
30 | this repository:
31 |
32 | ```sh
33 | # Activate your conda environment
34 | conda activate scikit-learn-course
35 | python check_env.py
36 | ```
37 |
38 | Make sure that there is no `FAIL` in the output when running the `check_env.py`
39 | script, i.e. that its output looks similar to this:
40 |
41 | ```
42 | Using python in /home/lesteve/miniconda3/envs/scikit-learn-course
43 | 3.9.1 | packaged by conda-forge | (default, Jan 10 2021, 02:55:42)
44 | [GCC 9.3.0]
45 |
46 | [ OK ] numpy version 1.19.5
47 | [ OK ] scipy version 1.6.0
48 | [ OK ] matplotlib version 3.3.3
49 | [ OK ] sklearn version 1.6
50 | [ OK ] pandas version 2.0
51 | [ OK ] seaborn version 0.13
52 | [ OK ] notebook version 6.2.0
53 | [ OK ] plotly version 5.10.0
54 | ```
55 |
56 | ## Run Jupyter notebooks locally
57 |
58 | ```sh
59 | # Activate your conda environment
60 | conda activate scikit-learn-course
61 | jupyter notebook full-index.ipynb
62 | ```
63 |
64 | `full-index.ipynb` is an index file helping to navigate the notebooks.
65 | All the Jupyter notebooks are located in the `notebooks` folder.
66 |
--------------------------------------------------------------------------------
/notebooks/01_tabular_data_exploration_ex_01.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# \ud83d\udcdd Exercise M1.01"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Imagine we are interested in predicting penguins species based on two of their\n",
15 | "body measurements: culmen length and culmen depth. First we want to do some\n",
16 | "data exploration to get a feel for the data.\n",
17 | "\n",
18 | "What are the features? What is the target?"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "The data is located in `../datasets/penguins_classification.csv`, load it with\n",
26 | "`pandas` into a `DataFrame`."
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "# Write your code here."
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Show a few samples of the data.\n",
43 | "\n",
44 | "How many features are numerical? How many features are categorical?"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "# Write your code here."
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "What are the different penguins species available in the dataset and how many\n",
61 | "samples of each species are there? Hint: select the right column and use the\n",
62 | "[`value_counts`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.value_counts.html)\n",
63 | "method."
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "# Write your code here."
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {},
78 | "source": [
79 | "Plot histograms for the numerical features"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "# Write your code here."
89 | ]
90 | },
91 | {
92 | "cell_type": "markdown",
93 | "metadata": {},
94 | "source": [
95 | "Show features distribution for each class. Hint: use\n",
96 | "[`seaborn.pairplot`](https://seaborn.pydata.org/generated/seaborn.pairplot.html)"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "# Write your code here."
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "Looking at these distributions, how hard do you think it would be to classify\n",
113 | "the penguins only using `\"culmen depth\"` and `\"culmen length\"`?"
114 | ]
115 | }
116 | ],
117 | "metadata": {
118 | "jupytext": {
119 | "main_language": "python"
120 | },
121 | "kernelspec": {
122 | "display_name": "Python 3",
123 | "name": "python3"
124 | }
125 | },
126 | "nbformat": 4,
127 | "nbformat_minor": 5
128 | }
--------------------------------------------------------------------------------
/notebooks/ensemble_ex_01.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# \ud83d\udcdd Exercise M6.01\n",
8 | "\n",
9 | "The aim of this notebook is to investigate if we can tune the hyperparameters\n",
10 | "of a bagging regressor and evaluate the gain obtained.\n",
11 | "\n",
12 | "We will load the California housing dataset and split it into a training and a\n",
13 | "testing set."
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "from sklearn.datasets import fetch_california_housing\n",
23 | "from sklearn.model_selection import train_test_split\n",
24 | "\n",
25 | "data, target = fetch_california_housing(as_frame=True, return_X_y=True)\n",
26 | "target *= 100 # rescale the target in k$\n",
27 | "data_train, data_test, target_train, target_test = train_test_split(\n",
28 | " data, target, random_state=0, test_size=0.5\n",
29 | ")"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "<div class=\"admonition note alert alert-info\">\n",
37 | "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Note</p>\n",
38 | "<p class=\"last\">If you want a deeper overview regarding this dataset, you can refer to the\n",
39 | "Appendix - Datasets description section at the end of this MOOC.</p>\n",
40 | "</div>"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {},
46 | "source": [
47 | "Create a `BaggingRegressor` and provide a `DecisionTreeRegressor` to its\n",
48 | "parameter `estimator`. Train the regressor and evaluate its generalization\n",
49 | "performance on the testing set using the mean absolute error."
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "# Write your code here."
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "Now, create a `RandomizedSearchCV` instance using the previous model and tune\n",
66 | "the important parameters of the bagging regressor. Find the best parameters\n",
67 | "and check if you are able to find a set of parameters that improve the default\n",
68 | "regressor still using the mean absolute error as a metric.\n",
69 | "\n",
70 | "<div class=\"admonition tip alert alert-warning\">\n",
71 | "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Tip</p>\n",
72 | "<p class=\"last\">You can list the bagging regressor's parameters using the <tt>get_params</tt> method.</p>\n",
39 | "<p class=\"last\">If you want a deeper overview regarding this dataset, you can refer to the\n",
40 | "Appendix - Datasets description section at the end of this MOOC.</p>\n",
41 | "</div>"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "The first step will be to create a linear regression model."
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "# Write your code here."
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "Then, use the `cross_val_score` to estimate the generalization performance of\n",
65 | "the model. Use a `KFold` cross-validation with 10 folds. Make the use of the\n",
66 | "$R^2$ score explicit by assigning the parameter `scoring` (even though it is\n",
67 | "the default score)."
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "# Write your code here."
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "Then, instead of using the $R^2$ score, use the mean absolute error (MAE). You\n",
84 | "may need to refer to the documentation for the `scoring` parameter."
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "# Write your code here."
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "Finally, use the `cross_validate` function and compute multiple scores/errors\n",
101 | "at once by passing a list of scorers to the `scoring` parameter. You can\n",
102 | "compute the $R^2$ score and the mean absolute error for instance."
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "# Write your code here."
112 | ]
113 | }
114 | ],
115 | "metadata": {
116 | "jupytext": {
117 | "main_language": "python"
118 | },
119 | "kernelspec": {
120 | "display_name": "Python 3",
121 | "name": "python3"
122 | }
123 | },
124 | "nbformat": 4,
125 | "nbformat_minor": 5
126 | }
--------------------------------------------------------------------------------
/one-day-course-index.md:
--------------------------------------------------------------------------------
1 | # The predictive modeling pipeline
2 |
3 | ## Tabular data exploration
4 |
5 | - [First look at our dataset](./notebooks/01_tabular_data_exploration.ipynb)
6 | - [Exercise 01](./notebooks/01_tabular_data_exploration_ex_01.ipynb)
7 |
8 | ## Fitting a scikit-learn model on numerical data
9 |
10 | - [First model with scikit-learn](./notebooks/02_numerical_pipeline_introduction.ipynb)
11 | - [Exercise 01](./notebooks/02_numerical_pipeline_ex_00.ipynb)
12 | - [Working with numerical data](./notebooks/02_numerical_pipeline_hands_on.ipynb)
13 | - [Exercise 02](./notebooks/02_numerical_pipeline_ex_01.ipynb)
14 | - [Preprocessing for numerical features](./notebooks/02_numerical_pipeline_scaling.ipynb)
15 |
16 | ## Handling categorical data
17 |
18 | - [Encoding of categorical variables](./notebooks/03_categorical_pipeline.ipynb)
19 | - [Exercise 01](./notebooks/03_categorical_pipeline_ex_01.ipynb)
20 | - [Using numerical and categorical variables together](./notebooks/03_categorical_pipeline_column_transformer.ipynb)
21 | - [Exercise 02](./notebooks/03_categorical_pipeline_ex_02.ipynb)
22 |
23 | # Hyperparameter tuning
24 |
25 | ## Manual tuning
26 |
27 | - [Set and get hyperparameters in scikit-learn](./notebooks/parameter_tuning_manual.ipynb)
28 | - [Exercise 01](./notebooks/parameter_tuning_ex_02.ipynb)
29 |
30 | ## Automated tuning
31 |
32 | - [Hyperparameter tuning by grid-search](./notebooks/parameter_tuning_grid_search.ipynb)
33 | - [Hyperparameter tuning by randomized-search](./notebooks/parameter_tuning_randomized_search.ipynb)
34 | - [Exercise 02](./notebooks/parameter_tuning_ex_03.ipynb)
35 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.black]
2 | line-length = 79
3 | target_version = ['py38', 'py39', 'py310', 'py311']
4 | preview = true
5 | exclude = '''
6 | /(
7 | \.eggs # exclude a few common directories in the
8 | | \.git # root of the project
9 | | \.mypy_cache
10 | | \.vscode
11 | | build
12 | | dist
13 | )/
14 | '''
15 |
16 | [tool.ruff.lint]
17 | ignore = [
18 | 'E402', # module level import not at top of file
19 | 'F401', # imported but unused
20 | 'E501', # line too long
21 | 'E203', # whitespace before ':'
22 | ]
23 |
--------------------------------------------------------------------------------
/python_scripts/01_tabular_data_exploration_ex_01.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M1.01
16 |
17 | # %% [markdown]
18 | # Imagine we are interested in predicting penguin species based on two of their
19 | # body measurements: culmen length and culmen depth. First we want to do some
20 | # data exploration to get a feel for the data.
21 | #
22 | # What are the features? What is the target?
23 |
24 | # %% [markdown]
25 | # The data is located in `../datasets/penguins_classification.csv`, load it with
26 | # `pandas` into a `DataFrame`.
27 |
28 | # %%
29 | # Write your code here.
30 |
31 | # %% [markdown]
32 | # Show a few samples of the data.
33 | #
34 | # How many features are numerical? How many features are categorical?
35 |
36 | # %%
37 | # Write your code here.
38 |
39 | # %% [markdown]
40 | # What are the different penguin species available in the dataset and how many
41 | # samples of each species are there? Hint: select the right column and use the
42 | # [`value_counts`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.value_counts.html)
43 | # method.
44 |
45 | # %%
46 | # Write your code here.
47 |
48 | # %% [markdown]
49 | # Plot histograms for the numerical features
50 |
51 | # %%
52 | # Write your code here.
53 |
54 | # %% [markdown]
55 | # Show features distribution for each class. Hint: use
56 | # [`seaborn.pairplot`](https://seaborn.pydata.org/generated/seaborn.pairplot.html)
57 |
58 | # %%
59 | # Write your code here.
60 |
61 | # %% [markdown]
62 | # Looking at these distributions, how hard do you think it would be to classify
63 | # the penguins only using `"culmen depth"` and `"culmen length"`?
64 |
--------------------------------------------------------------------------------
/python_scripts/01_tabular_data_exploration_sol_01.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # kernelspec:
4 | # display_name: Python 3
5 | # name: python3
6 | # ---
7 |
8 | # %% [markdown]
9 | # # 📃 Solution for Exercise M1.01
10 |
11 | # %% [markdown]
12 | # Imagine we are interested in predicting penguin species based on two of their
13 | # body measurements: culmen length and culmen depth. First we want to do some
14 | # data exploration to get a feel for the data.
15 | #
16 | # What are the features? What is the target?
17 |
18 | # %% [markdown] tags=["solution"]
19 | # The features are `"culmen length"` and `"culmen depth"`. The target is the
20 | # penguin species.
21 |
22 | # %% [markdown]
23 | # The data is located in `../datasets/penguins_classification.csv`, load it with
24 | # `pandas` into a `DataFrame`.
25 |
26 | # %%
27 | # solution
28 | import pandas as pd
29 |
30 | penguins = pd.read_csv("../datasets/penguins_classification.csv")
31 |
32 | # %% [markdown]
33 | # Show a few samples of the data.
34 | #
35 | # How many features are numerical? How many features are categorical?
36 |
37 | # %% [markdown] tags=["solution"]
38 | # Both features, `"culmen length"` and `"culmen depth"` are numerical. There are
39 | # no categorical features in this dataset.
40 |
41 | # %%
42 | # solution
43 | penguins.head()
44 |
45 | # %% [markdown]
46 | # What are the different penguin species available in the dataset and how many
47 | # samples of each species are there? Hint: select the right column and use the
48 | # [`value_counts`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.value_counts.html)
49 | # method.
50 |
51 | # %%
52 | # solution
53 | penguins["Species"].value_counts()
54 |
55 | # %% [markdown]
56 | # Plot histograms for the numerical features
57 |
58 | # %%
59 | # solution
60 | _ = penguins.hist(figsize=(8, 4))
61 |
62 | # %% [markdown]
63 | # Show features distribution for each class. Hint: use
64 | # [`seaborn.pairplot`](https://seaborn.pydata.org/generated/seaborn.pairplot.html)
65 |
66 | # %%
67 | # solution
68 | import seaborn
69 |
70 | pairplot_figure = seaborn.pairplot(penguins, hue="Species")
71 |
72 | # %% [markdown] tags=["solution"]
73 | # We observe that the labels on the axis are overlapping. Even if it is not the
74 | # priority of this notebook, one can tweak them by increasing the height of each
75 | # subfigure.
76 |
77 | # %% tags=["solution"]
78 | pairplot_figure = seaborn.pairplot(penguins, hue="Species", height=4)
79 |
80 | # %% [markdown]
81 | # Looking at these distributions, how hard do you think it would be to classify
82 | # the penguins only using `"culmen depth"` and `"culmen length"`?
83 |
84 | # %% [markdown] tags=["solution"]
85 | # Looking at the previous scatter-plot showing `"culmen length"` and `"culmen
86 | # depth"`, the species are reasonably well separated:
87 | # - low culmen length -> Adelie
88 | # - low culmen depth -> Gentoo
89 | # - high culmen depth and high culmen length -> Chinstrap
90 | #
91 | # There is some small overlap between the species, so we can expect a
92 | # statistical model to perform well on this dataset but not perfectly.
93 |
--------------------------------------------------------------------------------
/python_scripts/02_numerical_pipeline_ex_00.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M1.02
16 | #
17 | # The goal of this exercise is to fit a similar model as in the previous
18 | # notebook to get familiar with manipulating scikit-learn objects and in
19 | # particular the `.fit/.predict/.score` API.
20 |
21 | # %% [markdown]
22 | # Let's load the adult census dataset with only numerical variables
23 |
24 | # %%
25 | import pandas as pd
26 |
27 | adult_census = pd.read_csv("../datasets/adult-census-numeric.csv")
28 | data = adult_census.drop(columns="class")
29 | target = adult_census["class"]
30 |
31 | # %% [markdown]
32 | # In the previous notebook we used `model = KNeighborsClassifier()`. All
33 | # scikit-learn models can be created without arguments. This is convenient
34 | # because it means that you don't need to understand the full details of a model
35 | # before starting to use it.
36 | #
37 | # One of the `KNeighborsClassifier` parameters is `n_neighbors`. It controls the
38 | # number of neighbors we are going to use to make a prediction for a new data
39 | # point.
40 | #
41 | # What is the default value of the `n_neighbors` parameter?
42 | #
43 | # **Hint**: Look at the documentation on the [scikit-learn
44 | # website](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)
45 | # or directly access the description inside your notebook by running the
46 | # following cell. This opens a pager pointing to the documentation.
47 |
48 | # %%
49 | from sklearn.neighbors import KNeighborsClassifier
50 |
51 | # KNeighborsClassifier?
52 |
53 | # %% [markdown]
54 | # Create a `KNeighborsClassifier` model with `n_neighbors=50`
55 |
56 | # %%
57 | # Write your code here.
58 |
59 | # %% [markdown]
60 | # Fit this model on the data and target loaded above
61 |
62 | # %%
63 | # Write your code here.
64 |
65 | # %% [markdown]
66 | # Use your model to make predictions on the first 10 data points inside the
67 | # data. Do they match the actual target values?
68 |
69 | # %%
70 | # Write your code here.
71 |
72 | # %% [markdown]
73 | # Compute the accuracy on the training data.
74 |
75 | # %%
76 | # Write your code here.
77 |
78 | # %% [markdown]
79 | # Now load the test data from `"../datasets/adult-census-numeric-test.csv"` and
80 | # compute the accuracy on the test data.
81 |
82 | # %%
83 | # Write your code here.
84 |
--------------------------------------------------------------------------------
/python_scripts/02_numerical_pipeline_ex_01.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M1.03
16 | #
17 | # The goal of this exercise is to compare the performance of our classifier in
18 | # the previous notebook (roughly 81% accuracy with `LogisticRegression`) to some
19 | # simple baseline classifiers. The simplest baseline classifier is one that
20 | # always predicts the same class, irrespective of the input data.
21 | #
22 | # - What would be the score of a model that always predicts `' >50K'`?
23 | # - What would be the score of a model that always predicts `' <=50K'`?
24 | # - Is 81% or 82% accuracy a good score for this problem?
25 | #
26 | # Use a `DummyClassifier` and do a train-test split to evaluate its accuracy on
27 | # the test set. This
28 | # [link](https://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators)
29 | # shows a few examples of how to evaluate the generalization performance of
30 | # these baseline models.
31 |
32 | # %%
33 | import pandas as pd
34 |
35 | adult_census = pd.read_csv("../datasets/adult-census.csv")
36 |
37 | # %% [markdown]
38 | # We first split our dataset to have the target separated from the data used to
39 | # train our predictive model.
40 |
41 | # %%
42 | target_name = "class"
43 | target = adult_census[target_name]
44 | data = adult_census.drop(columns=target_name)
45 |
46 | # %% [markdown]
47 | # We start by selecting only the numerical columns as seen in the previous
48 | # notebook.
49 |
50 | # %%
51 | numerical_columns = ["age", "capital-gain", "capital-loss", "hours-per-week"]
52 |
53 | data_numeric = data[numerical_columns]
54 |
55 | # %% [markdown]
56 | # Split the data and target into a train and test set.
57 |
58 | # %%
59 | from sklearn.model_selection import train_test_split
60 |
61 | # Write your code here.
62 |
63 | # %% [markdown]
64 | # Use a `DummyClassifier` such that the resulting classifier always predicts
65 | # the class `' >50K'`. What is the accuracy score on the test set? Repeat the
66 | # experiment by always predicting the class `' <=50K'`.
67 | #
68 | # Hint: you can set the `strategy` parameter of the `DummyClassifier` to achieve
69 | # the desired behavior.
70 |
71 | # %%
72 | from sklearn.dummy import DummyClassifier
73 |
74 | # Write your code here.
75 |
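76 | # %% [markdown]
77 | # Below is a minimal sketch of one possible approach, not the official
78 | # solution. It assumes the class labels keep their leading space, as stored in
79 | # the raw CSV file.
80 |
81 | # %%
82 | data_train, data_test, target_train, target_test = train_test_split(
83 |     data_numeric, target, random_state=42
84 | )
85 | # A baseline that constantly predicts the high-revenue class.
86 | high_revenue_clf = DummyClassifier(strategy="constant", constant=" >50K")
87 | high_revenue_clf.fit(data_train, target_train)
88 | high_revenue_clf.score(data_test, target_test)
89 |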
--------------------------------------------------------------------------------
/python_scripts/02_numerical_pipeline_sol_00.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # kernelspec:
4 | # display_name: Python 3
5 | # name: python3
6 | # ---
7 |
8 | # %% [markdown]
9 | # # 📃 Solution for Exercise M1.02
10 | #
11 | # The goal of this exercise is to fit a similar model as in the previous
12 | # notebook to get familiar with manipulating scikit-learn objects and in
13 | # particular the `.fit/.predict/.score` API.
14 |
15 | # %% [markdown]
16 | # Let's load the adult census dataset with only numerical variables.
17 |
18 | # %%
19 | import pandas as pd
20 |
21 | adult_census = pd.read_csv("../datasets/adult-census-numeric.csv")
22 | data = adult_census.drop(columns="class")
23 | target = adult_census["class"]
24 |
25 | # %% [markdown]
26 | # In the previous notebook we used `model = KNeighborsClassifier()`. All
27 | # scikit-learn models can be created without arguments. This is convenient
28 | # because it means that you don't need to understand the full details of a model
29 | # before starting to use it.
30 | #
31 | # One of the `KNeighborsClassifier` parameters is `n_neighbors`. It controls the
32 | # number of neighbors we are going to use to make a prediction for a new data
33 | # point.
34 | #
35 | # What is the default value of the `n_neighbors` parameter?
36 | #
37 | # **Hint**: Look at the documentation on the [scikit-learn
38 | # website](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)
39 | # or directly access the description inside your notebook by running the
40 | # following cell. This opens a pager pointing to the documentation.
41 |
42 | # %%
43 | from sklearn.neighbors import KNeighborsClassifier
44 |
45 | # KNeighborsClassifier?
46 |
47 | # %% [markdown] tags=["solution"]
48 | # We can see that the default value for `n_neighbors` is 5.
49 |
50 | # %% [markdown]
51 | # Create a `KNeighborsClassifier` model with `n_neighbors=50`
52 |
53 | # %%
54 | # solution
55 | model = KNeighborsClassifier(n_neighbors=50)
56 |
57 | # %% [markdown]
58 | # Fit this model on the data and target loaded above
59 |
60 | # %%
61 | # solution
62 | model.fit(data, target)
63 |
64 | # %% [markdown]
65 | # Use your model to make predictions on the first 10 data points inside the
66 | # data. Do they match the actual target values?
67 |
68 | # %%
69 | # solution
70 | first_data_values = data.iloc[:10]
71 | first_predictions = model.predict(first_data_values)
72 | first_predictions
73 |
74 | # %% tags=["solution"]
75 | first_target_values = target.iloc[:10]
76 | first_target_values
77 |
78 | # %% tags=["solution"]
79 | number_of_correct_predictions = (
80 | first_predictions == first_target_values
81 | ).sum()
82 | number_of_predictions = len(first_predictions)
83 | print(
84 | f"{number_of_correct_predictions}/{number_of_predictions} "
85 | "of predictions are correct"
86 | )
87 |
88 | # %% [markdown]
89 | # Compute the accuracy on the training data.
90 |
91 | # %%
92 | # solution
93 | model.score(data, target)
94 |
95 | # %% [markdown]
96 | # Now load the test data from `"../datasets/adult-census-numeric-test.csv"` and
97 | # compute the accuracy on the test data.
98 |
99 | # %%
100 | # solution
101 | adult_census_test = pd.read_csv("../datasets/adult-census-numeric-test.csv")
102 |
103 | data_test = adult_census_test.drop(columns="class")
104 | target_test = adult_census_test["class"]
105 |
106 | model.score(data_test, target_test)
107 |
108 | # %% [markdown] tags=["solution"]
109 | # Looking at the previous notebook, the accuracy seems slightly higher with
110 | # `n_neighbors=50` than with `n_neighbors=5` (the default value).
111 |
--------------------------------------------------------------------------------
/python_scripts/cross_validation_ex_02.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M7.01
16 | #
17 | # In this exercise we will define dummy classification baselines and use them as
18 | # reference to assess the relative predictive performance of a given model of
19 | # interest.
20 | #
21 | # We illustrate those baselines with the help of the Adult Census dataset, using
22 | # only the numerical features for the sake of simplicity.
23 |
24 | # %%
25 | import pandas as pd
26 |
27 | adult_census = pd.read_csv("../datasets/adult-census-numeric-all.csv")
28 | data, target = adult_census.drop(columns="class"), adult_census["class"]
29 |
30 | # %% [markdown]
31 | # First, define a `ShuffleSplit` cross-validation strategy taking half of the
32 | # samples as a testing set at each round. Let us use 10 cross-validation rounds.
33 |
34 | # %%
35 | # Write your code here.
36 |
37 | # %% [markdown]
38 | # Next, create a machine learning pipeline composed of a transformer to
39 | # standardize the data followed by a logistic regression classifier.
40 |
41 | # %%
42 | # Write your code here.
43 |
44 | # %% [markdown]
45 | # Compute the cross-validation (test) scores for the classifier on this dataset.
46 | # Store the results in a pandas Series as we did in the previous notebook.
47 |
48 | # %%
49 | # Write your code here.
50 |
51 | # %% [markdown]
52 | # Now, compute the cross-validation scores of a dummy classifier that constantly
53 | # predicts the most frequent class observed in the training set. Please refer to
54 | # the online documentation for the
55 | # [sklearn.dummy.DummyClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html)
56 | # class.
57 | #
58 | # Store the results in a second pandas Series.
59 |
60 | # %%
61 | # Write your code here.
62 |
63 | # %% [markdown]
64 | # Now that we collected the results from the baseline and the model, concatenate
65 | # the test scores as columns of a single pandas dataframe.
66 |
67 | # %%
68 | # Write your code here.
69 |
70 | # %% [markdown]
71 | #
72 | # Next, plot the histogram of the cross-validation test scores for both models
73 | # with the help of [pandas built-in plotting
74 | # function](https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html#histograms).
75 | #
76 | # What conclusions do you draw from the results?
77 |
78 | # %%
79 | # Write your code here.
80 |
81 | # %% [markdown]
82 | # Change the `strategy` of the dummy classifier to `"stratified"` and compute
83 | # the results. Similarly, compute the scores for `strategy="uniform"` and then
84 | # plot the distributions together with the other results.
85 | #
86 | # Are those new baselines better than the previous one? Why is this the case?
87 | #
88 | # Please refer to the scikit-learn documentation on
89 | # [sklearn.dummy.DummyClassifier](
90 | # https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html)
91 | # to find out about the meaning of the `"stratified"` and `"uniform"`
92 | # strategies.
93 |
94 | # %%
95 | # Write your code here.
96 |
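97 | # %% [markdown]
98 | # Below is a minimal sketch of the main steps above (one possible solution;
99 | # variable names are arbitrary): the cross-validation strategy, the pipeline,
100 | # a "most frequent" baseline, and the comparison of their test scores.
101 |
102 | # %%
103 | from sklearn.model_selection import ShuffleSplit, cross_validate
104 | from sklearn.pipeline import make_pipeline
105 | from sklearn.preprocessing import StandardScaler
106 | from sklearn.linear_model import LogisticRegression
107 | from sklearn.dummy import DummyClassifier
108 |
109 | cv = ShuffleSplit(n_splits=10, test_size=0.5, random_state=0)
110 | model = make_pipeline(StandardScaler(), LogisticRegression())
111 | model_scores = pd.Series(
112 |     cross_validate(model, data, target, cv=cv)["test_score"], name="model"
113 | )
114 | dummy = DummyClassifier(strategy="most_frequent")
115 | dummy_scores = pd.Series(
116 |     cross_validate(dummy, data, target, cv=cv)["test_score"],
117 |     name="most frequent baseline",
118 | )
119 | all_scores = pd.concat([model_scores, dummy_scores], axis=1)
120 | _ = all_scores.plot.hist(bins=30, alpha=0.7)
121 |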
--------------------------------------------------------------------------------
/python_scripts/ensemble_ex_01.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M6.01
16 | #
17 | # The aim of this notebook is to investigate if we can tune the hyperparameters
18 | # of a bagging regressor and evaluate the gain obtained.
19 | #
20 | # We will load the California housing dataset and split it into a training and a
21 | # testing set.
22 |
23 | # %%
24 | from sklearn.datasets import fetch_california_housing
25 | from sklearn.model_selection import train_test_split
26 |
27 | data, target = fetch_california_housing(as_frame=True, return_X_y=True)
28 | target *= 100 # rescale the target in k$
29 | data_train, data_test, target_train, target_test = train_test_split(
30 | data, target, random_state=0, test_size=0.5
31 | )
32 |
33 | # %% [markdown]
34 | # ```{note}
35 | # If you want a deeper overview regarding this dataset, you can refer to the
36 | # Appendix - Datasets description section at the end of this MOOC.
37 | # ```
38 |
39 | # %% [markdown]
40 | # Create a `BaggingRegressor` and provide a `DecisionTreeRegressor` to its
41 | # parameter `estimator`. Train the regressor and evaluate its generalization
42 | # performance on the testing set using the mean absolute error.
43 |
44 | # %%
45 | # Write your code here.
46 |
47 | # %% [markdown]
48 | # Now, create a `RandomizedSearchCV` instance using the previous model and tune
49 | # the important parameters of the bagging regressor. Find the best parameters
50 | # and check if you are able to find a set of parameters that improves on the
51 | # default regressor, still using the mean absolute error as a metric.
52 | #
53 | # ```{tip}
54 | # You can list the bagging regressor's parameters using the `get_params` method.
55 | # ```
56 |
57 | # %%
58 | # Write your code here.
59 |
--------------------------------------------------------------------------------
/python_scripts/ensemble_ex_02.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M6.02
16 | #
17 | # The aim of this exercise is to explore some attributes available in
18 | # scikit-learn's random forest.
19 | #
20 | # First, we load and split the penguins regression dataset.
21 |
22 | # %%
23 | import pandas as pd
24 | from sklearn.model_selection import train_test_split
25 |
26 | penguins = pd.read_csv("../datasets/penguins_regression.csv")
27 | feature_name = "Flipper Length (mm)"
28 | target_name = "Body Mass (g)"
29 | data, target = penguins[[feature_name]], penguins[target_name]
30 | data_train, data_test, target_train, target_test = train_test_split(
31 | data, target, random_state=0
32 | )
33 |
34 | # %% [markdown]
35 | # ```{note}
36 | # If you want a deeper overview regarding this dataset, you can refer to the
37 | # Appendix - Datasets description section at the end of this MOOC.
38 | # ```
39 |
40 | # %% [markdown]
41 | # Create a random forest containing three trees. Train the forest and check the
42 | # generalization performance on the testing set in terms of mean absolute error.
43 |
44 | # %%
45 | # Write your code here.
46 |
47 | # %% [markdown]
48 | # We now aim to plot the predictions from the individual trees in the forest.
49 | # For that purpose, you first have to create a new dataset containing evenly
50 | # spaced values for the flipper length over the interval between 170 mm and 230
51 | # mm.
52 |
53 | # %%
54 | # Write your code here.
55 |
56 | # %% [markdown]
57 | # The trees contained in the forest that you created can be accessed with the
58 | # attribute `estimators_`. Use them to predict the body mass corresponding to
59 | # the values in this newly created dataset. Similarly, find the predictions of
60 | # the random forest on this dataset.
61 |
62 | # %%
63 | # Write your code here.
64 |
65 | # %% [markdown]
66 | # Now make a plot that displays:
67 | # - the whole `data` using a scatter plot;
68 | # - the decision of each individual tree;
69 | # - the decision of the random forest.
70 |
71 | # %%
72 | # Write your code here.
73 |
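74 | # %% [markdown]
75 | # A minimal sketch of the full exercise (one possible solution):
76 |
77 | # %%
78 | import numpy as np
79 | import matplotlib.pyplot as plt
80 | from sklearn.ensemble import RandomForestRegressor
81 | from sklearn.metrics import mean_absolute_error
82 |
83 | forest = RandomForestRegressor(n_estimators=3, random_state=0)
84 | forest.fit(data_train, target_train)
85 | print(
86 |     "Mean absolute error: "
87 |     f"{mean_absolute_error(target_test, forest.predict(data_test)):.2f} g"
88 | )
89 |
90 | # %%
91 | # Evenly spaced flipper lengths between 170 mm and 230 mm.
92 | data_range = pd.DataFrame(
93 |     np.linspace(170, 230, num=100), columns=[feature_name]
94 | )
95 | plt.scatter(data, target, color="black", alpha=0.5)
96 | for tree_idx, tree in enumerate(forest.estimators_):
97 |     # The inner trees were fitted on a NumPy array, hence `.to_numpy()`.
98 |     tree_predictions = tree.predict(data_range.to_numpy())
99 |     plt.plot(data_range, tree_predictions, label=f"Tree #{tree_idx}")
100 | plt.plot(data_range, forest.predict(data_range), label="Random forest")
101 | _ = plt.legend()
102 |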
--------------------------------------------------------------------------------
/python_scripts/ensemble_ex_03.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M6.03
16 | #
17 | # The aim of this exercise is to:
18 | #
19 | # * verify if a random forest or a gradient-boosting decision tree overfits
20 | #   when the number of estimators is not properly chosen;
21 | # * use the early-stopping strategy to avoid adding unnecessary trees and get
22 | #   the best generalization performance.
23 | #
24 | # We use the California housing dataset to conduct our experiments.
25 |
26 | # %%
27 | from sklearn.datasets import fetch_california_housing
28 | from sklearn.model_selection import train_test_split
29 |
30 | data, target = fetch_california_housing(return_X_y=True, as_frame=True)
31 | target *= 100 # rescale the target in k$
32 | data_train, data_test, target_train, target_test = train_test_split(
33 | data, target, random_state=0, test_size=0.5
34 | )
35 |
36 | # %% [markdown]
37 | # ```{note}
38 | # If you want a deeper overview regarding this dataset, you can refer to the
39 | # Appendix - Datasets description section at the end of this MOOC.
40 | # ```
41 |
42 | # %% [markdown]
43 | # Create a gradient boosting decision tree with `max_depth=5` and
44 | # `learning_rate=0.5`.
45 |
46 | # %%
47 | # Write your code here.
48 |
49 | # %% [markdown]
50 | #
51 | # Also create a random forest with fully grown trees by setting `max_depth=None`.
52 |
53 | # %%
54 | # Write your code here.
55 |
56 | # %% [markdown]
57 | #
58 | # For both the gradient-boosting and random forest models, create a validation
59 | # curve using the training set to assess the impact of the number of trees on
60 | # the performance of each model. Evaluate the list of parameters `param_range =
61 | # np.array([1, 2, 5, 10, 20, 50, 100, 200])` and score it using
62 | # `neg_mean_absolute_error`. Remember to set `negate_score=True` to recover the
63 | # right sign of the Mean Absolute Error.
64 |
65 | # %%
66 | # Write your code here.
67 |
68 | # %% [markdown]
69 | # Random forest models improve when increasing the number of trees in the
70 | # ensemble. However, the scores reach a plateau where adding new trees just
71 | # makes fitting and scoring slower.
72 | #
73 | # Now repeat the analysis for the gradient boosting model.
74 |
75 | # %%
76 | # Write your code here.
77 |
78 |
79 | # %% [markdown]
80 | # Gradient boosting models overfit when the number of trees is too large.
81 | # Unlike random forests, gradient boosting offers an early-stopping option to
82 | # avoid adding unnecessary trees. Internally, the algorithm uses an
83 | # out-of-sample set to compute the generalization performance of the model at
84 | # each addition of a tree. Thus, if the generalization performance is not
85 | # improving for several iterations, it stops adding trees.
86 | #
87 | # Now, create a gradient-boosting model with `n_estimators=1_000`. This number
88 | # of trees is certainly too large as we have seen above. Change the parameter
89 | # `n_iter_no_change` such that the gradient boosting fitting stops after adding
90 | # 5 trees that do not improve the overall generalization performance.
91 |
92 | # %%
93 | # Write your code here.
94 |
95 | # %% [markdown]
96 | # Estimate the generalization performance of this model again using the
97 | # `sklearn.metrics.mean_absolute_error` metric but this time using the test set
98 | # that we held out at the beginning of the notebook. Compare the resulting value
99 | # with the values observed in the validation curve.
100 |
101 | # %%
102 | # Write your code here.
103 |
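104 | # %% [markdown]
105 | # A hedged sketch of the early-stopping step (one possible solution): fitting
106 | # stops once the internal validation score has not improved for 5 consecutive
107 | # iterations, and the held-out test set gives the final error estimate.
108 |
109 | # %%
110 | from sklearn.ensemble import GradientBoostingRegressor
111 | from sklearn.metrics import mean_absolute_error
112 |
113 | gbdt = GradientBoostingRegressor(
114 |     max_depth=5, learning_rate=0.5, n_estimators=1_000, n_iter_no_change=5
115 | )
116 | gbdt.fit(data_train, target_train)
117 | print(f"Number of trees actually used: {gbdt.n_estimators_}")
118 | print(
119 |     "Test mean absolute error: "
120 |     f"{mean_absolute_error(target_test, gbdt.predict(data_test)):.2f} k$"
121 | )
122 |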
--------------------------------------------------------------------------------
/python_scripts/ensemble_ex_04.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M6.04
16 | #
17 | # The aim of the exercise is to get familiar with the histogram
18 | # gradient-boosting in scikit-learn. Besides, we will use this model within a
19 | # cross-validation framework in order to inspect internal parameters found via
20 | # grid-search.
21 | #
22 | # We will use the California housing dataset.
23 |
24 | # %%
25 | from sklearn.datasets import fetch_california_housing
26 |
27 | data, target = fetch_california_housing(return_X_y=True, as_frame=True)
28 | target *= 100 # rescale the target in k$
29 |
30 | # %% [markdown]
31 | # First, create a histogram gradient boosting regressor. You can set the number
32 | # of trees to be large, and configure the model to use early-stopping.
33 |
34 | # %%
35 | # Write your code here.
36 |
37 | # %% [markdown]
38 | # We will use a grid-search to find some optimal parameters for this model. In
39 | # this grid-search, you should search for the following parameters:
40 | #
41 | # * `max_depth: [3, 8]`;
42 | # * `max_leaf_nodes: [15, 31]`;
43 | # * `learning_rate: [0.1, 1]`.
44 | #
45 | # Feel free to explore the space with additional values. Create the grid-search
46 | # providing the previous gradient boosting instance as the model.
47 |
48 | # %%
49 | # Write your code here.
50 |
51 | # %% [markdown]
52 | # Finally, we will run our experiment through cross-validation. In this regard,
53 | # define a 5-fold cross-validation. Besides, be sure to shuffle the data.
54 | # Subsequently, use the function `sklearn.model_selection.cross_validate` to run
55 | # the cross-validation. You should also set `return_estimator=True`, so that we
56 | # can investigate the inner model trained via cross-validation.
57 |
58 | # %%
59 | # Write your code here.
60 |
61 | # %% [markdown]
62 | # Now that we got the cross-validation results, print out the mean and standard
63 | # deviation score.
64 |
65 | # %%
66 | # Write your code here.
67 |
68 | # %% [markdown]
69 | # Then inspect the `estimator` entry of the results and check the best
70 | # parameters values. Besides, check the number of trees used by the model.
71 |
72 | # %%
73 | # Write your code here.
74 |
75 | # %% [markdown]
76 | # Inspect the results of the inner CV for each estimator of the outer CV.
77 | # Aggregate the mean test score for each parameter combination and make a box
78 | # plot of these scores.
79 |
80 | # %%
81 | # Write your code here.
82 |
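83 | # %% [markdown]
84 | # A minimal sketch of the model, grid-search and cross-validation steps above
85 | # (one possible solution):
86 |
87 | # %%
88 | from sklearn.ensemble import HistGradientBoostingRegressor
89 | from sklearn.model_selection import GridSearchCV, KFold, cross_validate
90 |
91 | hist_gbdt = HistGradientBoostingRegressor(
92 |     max_iter=1_000, early_stopping=True, random_state=0
93 | )
94 | params = {
95 |     "max_depth": [3, 8],
96 |     "max_leaf_nodes": [15, 31],
97 |     "learning_rate": [0.1, 1],
98 | }
99 | search = GridSearchCV(hist_gbdt, params)
100 | cv = KFold(n_splits=5, shuffle=True, random_state=0)
101 | results = cross_validate(search, data, target, cv=cv, return_estimator=True)
102 | scores = results["test_score"]
103 | print(f"R2 score: {scores.mean():.3f} +/- {scores.std():.3f}")
104 |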
--------------------------------------------------------------------------------
/python_scripts/ensemble_sol_01.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # kernelspec:
4 | # display_name: Python 3
5 | # name: python3
6 | # ---
7 |
8 | # %% [markdown]
9 | # # 📃 Solution for Exercise M6.01
10 | #
11 | # The aim of this notebook is to investigate if we can tune the hyperparameters
12 | # of a bagging regressor and evaluate the gain obtained.
13 | #
14 | # We will load the California housing dataset and split it into a training and a
15 | # testing set.
16 |
17 | # %%
18 | from sklearn.datasets import fetch_california_housing
19 | from sklearn.model_selection import train_test_split
20 |
21 | data, target = fetch_california_housing(as_frame=True, return_X_y=True)
22 | target *= 100 # rescale the target in k$
23 | data_train, data_test, target_train, target_test = train_test_split(
24 | data, target, random_state=0, test_size=0.5
25 | )
26 |
27 | # %% [markdown]
28 | # ```{note}
29 | # If you want a deeper overview regarding this dataset, you can refer to the
30 | # Appendix - Datasets description section at the end of this MOOC.
31 | # ```
32 |
33 | # %% [markdown]
34 | # Create a `BaggingRegressor` and provide a `DecisionTreeRegressor` to its
35 | # parameter `estimator`. Train the regressor and evaluate its generalization
36 | # performance on the testing set using the mean absolute error.
37 |
38 | # %%
39 | # solution
40 | from sklearn.metrics import mean_absolute_error
41 | from sklearn.tree import DecisionTreeRegressor
42 | from sklearn.ensemble import BaggingRegressor
43 |
44 | tree = DecisionTreeRegressor()
45 | bagging = BaggingRegressor(estimator=tree, n_jobs=2)
46 | bagging.fit(data_train, target_train)
47 | target_predicted = bagging.predict(data_test)
48 | print(
49 | "Basic mean absolute error of the bagging regressor:\n"
50 | f"{mean_absolute_error(target_test, target_predicted):.2f} k$"
51 | )
52 |
53 | # %% [markdown]
54 | # Now, create a `RandomizedSearchCV` instance using the previous model and tune
55 | # the important parameters of the bagging regressor. Find the best parameters
56 | # and check if you are able to find a set of parameters that improves on the
57 | # default regressor, still using the mean absolute error as a metric.
58 |
59 | # ```{tip}
60 | # You can list the bagging regressor's parameters using the `get_params` method.
61 | # ```
62 |
63 | # %%
64 | # solution
65 | for param in bagging.get_params().keys():
66 | print(param)
67 |
68 | # %% tags=["solution"]
69 | from scipy.stats import randint
70 | from sklearn.model_selection import RandomizedSearchCV
71 |
72 | param_grid = {
73 | "n_estimators": randint(10, 30),
74 | "max_samples": [0.5, 0.8, 1.0],
75 | "max_features": [0.5, 0.8, 1.0],
76 | "estimator__max_depth": randint(3, 10),
77 | }
78 | search = RandomizedSearchCV(
79 | bagging, param_grid, n_iter=20, scoring="neg_mean_absolute_error"
80 | )
81 | _ = search.fit(data_train, target_train)
82 |
83 | # %% tags=["solution"]
84 | import pandas as pd
85 |
86 | columns = [f"param_{name}" for name in param_grid.keys()]
87 | columns += ["mean_test_error", "std_test_error"]
88 | cv_results = pd.DataFrame(search.cv_results_)
89 | cv_results["mean_test_error"] = -cv_results["mean_test_score"]
90 | cv_results["std_test_error"] = cv_results["std_test_score"]
91 | cv_results[columns].sort_values(by="mean_test_error")
92 |
93 | # %% tags=["solution"]
94 | target_predicted = search.predict(data_test)
95 | print(
96 | "Mean absolute error after tuning of the bagging regressor:\n"
97 | f"{mean_absolute_error(target_test, target_predicted):.2f} k$"
98 | )
99 |
100 | # %% [markdown] tags=["solution"]
101 | # We see that the predictor provided by the bagging regressor does not need much
102 | # hyperparameter tuning compared to a single decision tree.
103 |
--------------------------------------------------------------------------------
/python_scripts/feature_selection_ex_01.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise 01
16 | #
17 | # The aim of this exercise is to highlight caveats to have in mind when using
18 | # feature selection. You have to be extremely careful regarding the set of data
19 | # on which you will compute the statistic that helps your feature selection
20 | # algorithm to decide which feature to select.
21 | #
22 | # On purpose, we will make you program the wrong way of doing feature selection
23 | # to gain insights.
24 | #
25 | # First, you will create a completely random dataset using NumPy. Using the
26 | # function `np.random.randn`, generate a matrix `data` containing 100 samples
27 | # and 100,000 features. Then, using the function `np.random.randint`, generate a
28 | # vector `target` with 100 samples containing either 0 or 1.
29 | #
30 | # This type of dimensionality is typical in bioinformatics when dealing with
31 | # RNA-seq. However, we will use completely randomized features such that we
32 | # don't have a link between the data and the target. Thus, no machine-learning
33 | # model should be able to perform better than chance level in terms of
34 | # generalization performance.
35 |
36 | # %%
37 | import numpy as np
38 |
39 | # Write your code here.
40 |
41 | # %% [markdown]
42 | # Now, create a logistic regression model and use cross-validation to check the
43 | # score of such a model. It will allow us to confirm that our model cannot
44 | # predict anything meaningful from random data.
45 |
46 | # %%
47 | # Write your code here.
48 |
49 | # %% [markdown]
50 | # Now, we will ask you to program the **wrong** pattern to select features.
51 | # Select the features by using the entire dataset. We will choose ten features
52 | # with the highest ANOVA F-score computed on the full dataset. Subsequently,
53 | # subsample the dataset `data` by selecting the features' subset. Finally, train
54 | # and test a logistic regression model.
55 | #
56 | # You should get some surprising results.
57 |
58 | # %%
59 | from sklearn.feature_selection import SelectKBest, f_classif
60 |
61 | # Write your code here.
62 |
63 | # %% [markdown]
64 | # Now, we will make you program the **right** way to do the feature selection.
65 | # First, split the dataset into a training and testing set. Then, fit the
66 | # feature selector on the training set. Then, transform both the training and
67 | # testing sets before you train and test the logistic regression.
68 |
69 | # %%
70 | from sklearn.model_selection import train_test_split
71 |
72 | # Write your code here.
73 |
74 | # %% [markdown]
75 | # However, the previous case is not perfect. For instance, if we wanted to
76 | # perform cross-validation, the manual `fit`/`transform` of the datasets would
77 | # make our life hard. Indeed, the solution here is to use a scikit-learn
78 | # pipeline in which the feature selection is a preprocessing stage applied
79 | # before training the model.
80 | #
81 | # Thus, start by creating a pipeline with the feature selector and the logistic
82 | # regression. Then, use cross-validation to get an estimate of the uncertainty
83 | # of your model generalization performance.
84 |
85 | # %%
86 | from sklearn.pipeline import make_pipeline
87 |
88 | # Write your code here.
89 |
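90 | # %% [markdown]
91 | # A hedged sketch of the pipeline approach (one possible solution). It assumes
92 | # `data` and `target` were generated in the first step of this exercise; with
93 | # the selection refitted inside each cross-validation split, the scores should
94 | # stay close to the 0.5 chance level.
95 |
96 | # %%
97 | from sklearn.linear_model import LogisticRegression
98 | from sklearn.model_selection import cross_val_score
99 |
100 | model = make_pipeline(
101 |     SelectKBest(score_func=f_classif, k=10), LogisticRegression()
102 | )
103 | scores = cross_val_score(model, data, target)
104 | print(f"Mean accuracy: {scores.mean():.3f}")
105 |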
--------------------------------------------------------------------------------
/python_scripts/feature_selection_limitation_model.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # kernelspec:
4 | # display_name: Python 3
5 | # name: python3
6 | # ---
7 |
8 | # %% [markdown]
9 | # # Limitation of selecting feature using a model
10 | #
11 | # In this notebook, we want to show a limitation when using a machine-learning
12 | # model to make a selection.
13 | #
14 | # Indeed, one can inspect a model and find relative feature importances. For
15 | # instance, the parameters `coef_` for the linear models or
16 | # `feature_importances_` for the tree-based models carry such information.
17 | # Therefore, this method works as long as the relative feature importances
18 | # given by the model are sufficient to select the meaningful features.
19 | #
20 | # Here, we will generate a dataset that contains a large number of random
21 | # features.
22 |
23 | # %%
24 | from sklearn.datasets import make_classification
25 |
26 | data, target = make_classification(
27 | n_samples=5000,
28 | n_features=100,
29 | n_informative=2,
30 | n_redundant=5,
31 | n_repeated=5,
32 | class_sep=0.3,
33 | random_state=0,
34 | )
35 |
36 | # %% [markdown]
37 | # First, let's build a model which does not make any feature selection.
38 |
39 | # %%
40 | from sklearn.ensemble import RandomForestClassifier
41 |
42 | model_without_selection = RandomForestClassifier()
43 |
44 | # %% [markdown]
45 | # We will evaluate this model using k-fold cross-validation and store the results
46 | # in a pandas dataframe.
47 |
48 | # %%
49 | import pandas as pd
50 | from sklearn.model_selection import cross_validate
51 |
52 | cv_results_without_selection = cross_validate(
53 | model_without_selection, data, target, cv=5
54 | )
55 | cv_results_without_selection = pd.DataFrame(cv_results_without_selection)
56 |
57 | # %% [markdown]
58 | # Then, we will build another model which will include a feature selection step
59 | # based on a random forest and evaluate it as well with cross-validation.
60 |
61 | # %%
62 | from sklearn.pipeline import make_pipeline
63 | from sklearn.feature_selection import SelectFromModel
64 |
65 | feature_selector = SelectFromModel(RandomForestClassifier())
66 | model_with_selection = make_pipeline(
67 | feature_selector, RandomForestClassifier()
68 | )
69 |
70 | # %%
71 | cv_results_with_selection = cross_validate(
72 | model_with_selection, data, target, cv=5
73 | )
74 | cv_results_with_selection = pd.DataFrame(cv_results_with_selection)
75 |
76 | # %% [markdown]
77 | # We can compare the testing score of the two models. For this purpose, we
78 | # combine the results in a single dataframe.
79 |
80 | # %%
81 | cv_results = pd.concat(
82 | [cv_results_without_selection, cv_results_with_selection],
83 | axis=1,
84 | keys=["Without feature selection", "With feature selection"],
85 | ).swaplevel(axis="columns")
86 |
87 | # %% [markdown]
88 | # Finally, we can check the testing score of each model.
89 |
90 | # %%
91 | import matplotlib.pyplot as plt
92 |
93 | color = {"whiskers": "black", "medians": "black", "caps": "black"}
94 | cv_results["test_score"].plot.box(color=color, vert=False)
95 | plt.xlabel("Accuracy")
96 | _ = plt.title("Limitation of using a random forest for feature selection")
97 |
98 | # %% [markdown]
99 | # The model that selected a subset of features performs worse than a random
100 | # forest fitted on the full dataset.
101 | #
102 | # We can rely on some aspects tackled in the notebook presenting the model
103 | # inspection to explain this behaviour. The decision tree's relative feature
104 | # importance will overestimate the importance of random features when the
105 | # decision tree overfits the training set.
106 | #
107 | # Therefore, it is good to keep in mind that feature selection relies on
108 | # procedures making assumptions which may not always hold.
109 |
--------------------------------------------------------------------------------
/python_scripts/linear_models_ex_01.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M4.01
16 | #
17 | # The aim of this exercise is two-fold:
18 | #
19 | # * understand the parametrization of a linear model;
20 | # * quantify the fitting accuracy of a set of such models.
21 | #
22 | # We will reuse part of the code of the course to:
23 | #
24 | # * load data;
25 | # * create the function representing a linear model.
26 | #
27 | # ## Prerequisites
28 | #
29 | # ### Data loading
30 |
31 | # %% [markdown]
32 | # ```{note}
33 | # If you want a deeper overview regarding this dataset, you can refer to the
34 | # Appendix - Datasets description section at the end of this MOOC.
35 | # ```
36 |
37 | # %%
38 | import pandas as pd
39 |
40 | penguins = pd.read_csv("../datasets/penguins_regression.csv")
41 | feature_name = "Flipper Length (mm)"
42 | target_name = "Body Mass (g)"
43 | data, target = penguins[[feature_name]], penguins[target_name]
44 |
45 | # %% [markdown]
46 | # ### Model definition
47 |
48 |
49 | # %%
50 | def linear_model_flipper_mass(
51 | flipper_length, weight_flipper_length, intercept_body_mass
52 | ):
53 | """Linear model of the form y = a * x + b"""
54 | body_mass = weight_flipper_length * flipper_length + intercept_body_mass
55 | return body_mass
56 |
57 |
58 | # %% [markdown]
59 | # ## Main exercise
60 | #
61 | # Define a vector `weights = [...]` and a vector `intercepts = [...]` of the
62 | # same length. Each pair of entries `(weights[i], intercepts[i])` defines a
63 | # different model. Use these vectors along with the vector
64 | # `flipper_length_range` to plot several linear models that could possibly fit
65 | # our data. Use the above helper function to visualize both the models and the
66 | # real samples.
67 |
68 | # %%
69 | import numpy as np
70 |
71 | flipper_length_range = np.linspace(data.min(), data.max(), num=300)
72 |
73 | # %%
74 | # Write your code here.
75 |
76 | # %% [markdown]
77 | # In the previous question, you were asked to create several linear models. The
78 | # visualization allowed you to qualitatively assess if a model was better than
79 | # another.
80 | #
81 | # Now, you should come up with a quantitative measure which indicates the
82 | # goodness of fit of each linear model and allows you to select the best model.
83 | # Define a function `goodness_fit_measure(true_values, predictions)` that takes
84 | # as inputs the true target values and the predictions and returns a single
85 | # scalar as output.
86 |
87 |
88 | # %%
89 | # Write your code here.
90 |
91 | # %% [markdown]
92 | # You can now copy and paste the code below to show the goodness of fit for each
93 | # model.
94 | #
95 | # ```python
96 | # for model_idx, (weight, intercept) in enumerate(zip(weights, intercepts)):
97 | # target_predicted = linear_model_flipper_mass(data, weight, intercept)
98 | # print(f"Model #{model_idx}:")
99 | # print(f"{weight:.2f} (g / mm) * flipper length + {intercept:.2f} (g)")
100 | # print(f"Error: {goodness_fit_measure(target, target_predicted):.3f}\n")
101 | # ```
102 |
103 | # %%
104 | # Write your code here.
105 |
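106 | # %% [markdown]
107 | # A minimal sketch (one possible solution; the weight and intercept values are
108 | # arbitrary) using the mean squared error as goodness-of-fit measure:
109 |
110 | # %%
111 | import matplotlib.pyplot as plt
112 |
113 | weights = [-40, 45, 90]
114 | intercepts = [13000, -5000, -14000]
115 |
116 | ax = plt.axes()
117 | ax.scatter(data, target, color="black", alpha=0.5)
118 | for weight, intercept in zip(weights, intercepts):
119 |     predictions = linear_model_flipper_mass(
120 |         flipper_length_range, weight, intercept
121 |     )
122 |     ax.plot(flipper_length_range, predictions)
123 | ax.set_xlabel(feature_name)
124 | _ = ax.set_ylabel(target_name)
125 |
126 |
127 | # %%
128 | def goodness_fit_measure(true_values, predictions):
129 |     """Return the mean squared error (lower is better)."""
130 |     errors = np.ravel(true_values) - np.ravel(predictions)
131 |     return np.mean(errors**2)
132 |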
--------------------------------------------------------------------------------
/python_scripts/matplotlibrc:
--------------------------------------------------------------------------------
1 | axes.labelsize: 18.0
2 | axes.linewidth: 1.875
3 | axes.titlesize: 18.0
4 | boxplot.whiskers: 1000
5 | boxplot.patchartist: True
6 | boxplot.boxprops.color: black
7 | boxplot.capprops.color: black
8 | boxplot.medianprops.color: black
9 | boxplot.whiskerprops.color: black
10 | boxplot.boxprops.linewidth: 3.0
11 | boxplot.capprops.linewidth: 3.0
12 | boxplot.medianprops.linewidth: 2.5
13 | boxplot.whiskerprops.linewidth: 3.0
14 | figure.titlesize: 22.0
15 | font.size: 18.0
16 | grid.linewidth: 1.5
17 | legend.fontsize: 16.5
18 | legend.title_fontsize: 18.0
19 | lines.linewidth: 3.5
20 | lines.markersize: 9.0
21 | patch.linewidth: 1.5
22 | xtick.labelsize: 16.5
23 | xtick.major.size: 9.0
24 | xtick.major.width: 1.875
25 | xtick.minor.size: 6.0
26 | xtick.minor.width: 1.5
27 | ytick.labelsize: 16.5
28 | ytick.major.size: 9.0
29 | ytick.major.width: 1.875
30 | ytick.minor.size: 6.0
31 | ytick.minor.width: 1.5
32 |
--------------------------------------------------------------------------------
/python_scripts/metrics_ex_02.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M7.03
16 | #
17 | # As with the classification metrics exercise, we will evaluate the regression
18 | # metrics within a cross-validation framework to get familiar with the syntax.
19 | #
20 | # We will use the Ames house prices dataset.
21 |
22 | # %%
23 | import pandas as pd
24 | import numpy as np
25 |
26 | ames_housing = pd.read_csv("../datasets/house_prices.csv")
27 | data = ames_housing.drop(columns="SalePrice")
28 | target = ames_housing["SalePrice"]
29 | data = data.select_dtypes(np.number)
30 | target /= 1000
31 |
32 | # %% [markdown]
33 | # ```{note}
34 | # If you want a deeper overview regarding this dataset, you can refer to the
35 | # Appendix - Datasets description section at the end of this MOOC.
36 | # ```
37 |
38 |
39 | # %% [markdown]
40 | # The first step will be to create a linear regression model.
41 |
42 | # %%
43 | # Write your code here.
44 |
45 | # %% [markdown]
46 | # Then, use the `cross_val_score` to estimate the generalization performance of
47 | # the model. Use a `KFold` cross-validation with 10 folds. Make the use of the
48 | # $R^2$ score explicit by assigning the parameter `scoring` (even though it is
49 | # the default score).
50 |
51 | # %%
52 | # Write your code here.
53 |
54 | # %% [markdown]
55 | # Then, instead of using the $R^2$ score, use the mean absolute error (MAE). You
56 | # may need to refer to the documentation for the `scoring` parameter.
57 |
58 | # %%
59 | # Write your code here.
60 |
61 | # %% [markdown]
62 | # Finally, use the `cross_validate` function and compute multiple scores/errors
63 | # at once by passing a list of scorers to the `scoring` parameter. You can
64 | # compute the $R^2$ score and the mean absolute error for instance.
65 |
66 | # %%
67 | # Write your code here.
68 |
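69 | # %% [markdown]
70 | # A hedged sketch of the three evaluations above (one possible solution):
71 |
72 | # %%
73 | from sklearn.linear_model import LinearRegression
74 | from sklearn.model_selection import KFold, cross_val_score, cross_validate
75 |
76 | model = LinearRegression()
77 | cv = KFold(n_splits=10)
78 | r2_scores = cross_val_score(model, data, target, cv=cv, scoring="r2")
79 | mae_scores = -cross_val_score(
80 |     model, data, target, cv=cv, scoring="neg_mean_absolute_error"
81 | )
82 | multi_scores = cross_validate(
83 |     model, data, target, cv=cv, scoring=["r2", "neg_mean_absolute_error"]
84 | )
85 | print(f"R2: {r2_scores.mean():.3f}, MAE: {mae_scores.mean():.3f} k$")
86 |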
--------------------------------------------------------------------------------
/python_scripts/parameter_tuning_ex_02.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M3.01
16 | #
17 | # The goal is to write an exhaustive search to find the best parameters
18 | # combination maximizing the model generalization performance.
19 | #
20 | # Here we use a small subset of the Adult Census dataset to make the code faster
21 | # to execute. Once your code works on the small subset, try to change
22 | # `train_size` to a larger value (e.g. 0.8 for 80% instead of 20%).
23 |
24 | # %%
25 | import pandas as pd
26 |
27 | from sklearn.model_selection import train_test_split
28 |
29 | adult_census = pd.read_csv("../datasets/adult-census.csv")
30 |
31 | target_name = "class"
32 | target = adult_census[target_name]
33 | data = adult_census.drop(columns=[target_name, "education-num"])
34 |
35 | data_train, data_test, target_train, target_test = train_test_split(
36 | data, target, train_size=0.2, random_state=42
37 | )
38 |
39 | # %%
40 | from sklearn.compose import make_column_transformer
41 | from sklearn.compose import make_column_selector as selector
42 | from sklearn.preprocessing import OrdinalEncoder
43 |
44 | categorical_preprocessor = OrdinalEncoder(
45 | handle_unknown="use_encoded_value", unknown_value=-1
46 | )
47 | preprocessor = make_column_transformer(
48 | (categorical_preprocessor, selector(dtype_include=object)),
49 | remainder="passthrough",
50 | )
51 |
52 | from sklearn.ensemble import HistGradientBoostingClassifier
53 | from sklearn.pipeline import Pipeline
54 |
55 | model = Pipeline(
56 | [
57 | ("preprocessor", preprocessor),
58 | ("classifier", HistGradientBoostingClassifier(random_state=42)),
59 | ]
60 | )
61 |
62 | # %% [markdown]
63 | # Using the previously defined model (called `model`) and two nested `for`
64 | # loops, search for the best combination of the `learning_rate` and
65 | # `max_leaf_nodes` parameters. In this regard, you need to train and test the
66 | # model by setting the parameters. The evaluation of the model should be
67 | # performed using `cross_val_score` on the training set. Use the following
68 | # parameters search:
69 | # - `learning_rate` for the values 0.01, 0.1, 1 and 10. This parameter controls
70 | # the ability of a new tree to correct the error of the previous sequence of
71 | # trees
72 | # - `max_leaf_nodes` for the values 3, 10, 30. This parameter controls the depth
73 | # of each tree.
74 |
75 | # %%
76 | # Write your code here.
77 |
78 | # %% [markdown]
79 | # Now use the test set to score the model using the best parameters that we
80 | # found using cross-validation. You will have to refit the model over the full
81 | # training set.
82 |
83 | # %%
84 | # Write your code here.
85 |
86 | # %%
87 |
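88 | # %% [markdown]
89 | # A minimal sketch of the nested loops (one possible solution): keep the best
90 | # cross-validated parameters, refit on the full training set, then score on
91 | # the test set.
92 |
93 | # %%
94 | from sklearn.model_selection import cross_val_score
95 |
96 | best_score, best_params = -float("inf"), None
97 | for learning_rate in [0.01, 0.1, 1, 10]:
98 |     for max_leaf_nodes in [3, 10, 30]:
99 |         model.set_params(
100 |             classifier__learning_rate=learning_rate,
101 |             classifier__max_leaf_nodes=max_leaf_nodes,
102 |         )
103 |         score = cross_val_score(model, data_train, target_train, cv=2).mean()
104 |         if score > best_score:
105 |             best_score = score
106 |             best_params = {
107 |                 "classifier__learning_rate": learning_rate,
108 |                 "classifier__max_leaf_nodes": max_leaf_nodes,
109 |             }
110 | print(f"Best mean CV accuracy: {best_score:.3f} with {best_params}")
111 |
112 | model.set_params(**best_params).fit(data_train, target_train)
113 | print(f"Test accuracy: {model.score(data_test, target_test):.3f}")
114 |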
--------------------------------------------------------------------------------
/python_scripts/parameter_tuning_ex_03.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M3.02
16 | #
17 | # The goal is to find the best set of hyperparameters which maximize the
18 | # generalization performance on a training set.
19 |
20 | # %%
21 | from sklearn.datasets import fetch_california_housing
22 | from sklearn.model_selection import train_test_split
23 |
24 | data, target = fetch_california_housing(return_X_y=True, as_frame=True)
25 | target *= 100 # rescale the target in k$
26 |
27 | data_train, data_test, target_train, target_test = train_test_split(
28 | data, target, random_state=42
29 | )
30 |
31 | # %% [markdown]
32 | # In this exercise, we progressively define the regression pipeline and later
33 | # tune its hyperparameters.
34 | #
35 | # Start by defining a pipeline that:
36 | # * uses a `StandardScaler` to normalize the numerical data;
37 | # * uses a `sklearn.neighbors.KNeighborsRegressor` as a predictive model.
38 |
39 | # %%
40 | # Write your code here.
41 |
42 | # %% [markdown]
43 | # Use `RandomizedSearchCV` with `n_iter=20` and
44 | # `scoring="neg_mean_absolute_error"` to tune the following hyperparameters
45 | # of the `model`:
46 | #
47 | # - the parameter `n_neighbors` of the `KNeighborsRegressor` with values
48 | # `np.logspace(0, 3, num=10).astype(np.int32)`;
49 | # - the parameter `with_mean` of the `StandardScaler` with possible values
50 | # `True` or `False`;
51 | # - the parameter `with_std` of the `StandardScaler` with possible values `True`
52 | # or `False`.
53 | #
54 | # The `scoring` function is expected to return higher values for better models,
55 | # since grid/random search objects **maximize** it. Because of that, error
56 | # metrics like `mean_absolute_error` must be negated (using the `neg_` prefix)
57 | # to work correctly (remember lower errors represent better models).
58 | #
59 | # Notice that in the notebook "Hyperparameter tuning by randomized-search" we
60 | # pass distributions to be sampled by the `RandomizedSearchCV`. In this case we
61 | # define a fixed grid of hyperparameters to be explored. Using a `GridSearchCV`
62 | # instead would explore all the possible combinations on the grid, which can be
63 | # costly to compute for large grids, whereas the parameter `n_iter` of the
64 | # `RandomizedSearchCV` controls the number of different random combinations that
65 | # are evaluated. Notice that setting `n_iter` larger than the number of possible
66 | # combinations in a grid (in this case 10 x 2 x 2 = 40) would lead to repeating
67 | # already-explored combinations.
68 | #
69 | # Once the computation has completed, print the best combination of parameters
70 | # stored in the `best_params_` attribute.
71 |
72 | # %%
73 | # Write your code here.
74 |
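75 | # %% [markdown]
76 | # A hedged sketch of both steps (one possible solution): the hyperparameter
77 | # names follow the `<step name>__<parameter>` convention used by pipelines
78 | # built with `make_pipeline`.
79 |
80 | # %%
81 | import numpy as np
82 | from sklearn.pipeline import make_pipeline
83 | from sklearn.preprocessing import StandardScaler
84 | from sklearn.neighbors import KNeighborsRegressor
85 | from sklearn.model_selection import RandomizedSearchCV
86 |
87 | model = make_pipeline(StandardScaler(), KNeighborsRegressor())
88 | param_distributions = {
89 |     "kneighborsregressor__n_neighbors": np.logspace(0, 3, num=10).astype(
90 |         np.int32
91 |     ),
92 |     "standardscaler__with_mean": [True, False],
93 |     "standardscaler__with_std": [True, False],
94 | }
95 | search = RandomizedSearchCV(
96 |     model,
97 |     param_distributions=param_distributions,
98 |     n_iter=20,
99 |     scoring="neg_mean_absolute_error",
100 |     random_state=0,
101 | )
102 | search.fit(data_train, target_train)
103 | print(search.best_params_)
104 |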
--------------------------------------------------------------------------------
/python_scripts/trees_dataset.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # kernelspec:
4 | # display_name: Python 3
5 | # name: python3
6 | # ---
7 |
8 | # %% [markdown]
9 | # # The penguins datasets
10 | #
11 | # In this notebook, we make a quick presentation of the [Palmer penguins
12 | # dataset](https://allisonhorst.github.io/palmerpenguins/). We use this
13 | # dataset for both classification and regression problems by selecting a subset
14 | # of the features to make our explanations intuitive.
15 | #
16 | # ## Classification dataset
17 | #
18 | # We use this dataset in a classification setting to predict the penguins'
19 | # species from anatomical information.
20 | #
21 | # Each penguin is from one of the three following species: Adelie, Gentoo, and
22 | # Chinstrap. See the illustration below depicting the three different penguin
23 | # species:
24 | #
25 | # 
27 | #
28 | # This problem is a classification problem since the target is categorical. We
29 | # limit our input data to a subset of the original features to simplify our
30 | # explanations when presenting the decision tree algorithm. Indeed, we use
31 | # features based on penguins' culmen measurement. You can learn more about the
32 | # penguins' culmen with the illustration below:
33 | #
34 | # 
36 | #
37 | # We start by loading this subset of the dataset.
38 |
39 | # %%
40 | import pandas as pd
41 |
42 | penguins = pd.read_csv("../datasets/penguins_classification.csv")
43 |
44 | culmen_columns = ["Culmen Length (mm)", "Culmen Depth (mm)"]
45 | target_column = "Species"
46 |
47 | # %% [markdown]
48 | # Let's check the dataset in more detail.
49 |
50 | # %%
51 | penguins
52 |
53 | # %% [markdown]
54 | # Since we have few samples, we can check a scatter plot to observe the
55 | # distribution of the samples.
56 |
57 | # %%
58 | import seaborn as sns
59 |
60 | pairplot_figure = sns.pairplot(penguins, hue="Species")
61 | pairplot_figure.fig.set_size_inches(9, 6.5)
62 |
63 | # %% [markdown]
64 | # First let's check the feature distributions by looking at the diagonal plots
65 | # of the pairplot. We can deduce the following intuitions:
66 | #
67 | # * The Adelie species can be differentiated from the Gentoo and Chinstrap
68 | # species depending on the culmen length;
69 | # * The Gentoo species can be differentiated from the Adelie and Chinstrap
70 | # species depending on the culmen depth.
71 | #
72 | # ## Regression dataset
73 | #
74 | # In a regression setting, the target is a continuous variable instead of
75 | # categories. Here, we use two features of the dataset to make such a problem:
76 | # the flipper length is used as data and the body mass as the target. In short,
77 | # we want to predict the body mass using the flipper length.
78 | #
79 | # We load the dataset and visualize the relationship between the flipper length
80 | # and the body mass of penguins.
81 |
82 | # %%
83 | penguins = pd.read_csv("../datasets/penguins_regression.csv")
84 |
85 | feature_name = "Flipper Length (mm)"
86 | target_column = "Body Mass (g)"
87 |
88 | # %%
89 | _ = sns.scatterplot(data=penguins, x=feature_name, y=target_column)
90 |
91 | # %% [markdown]
92 | # Here, we deal with a regression problem because our target is a continuous
93 | # variable ranging from 2.7 kg to 6.3 kg. From the scatter plot above, we
94 | # observe that we have a linear relationship between the flipper length and the
95 | # body mass. The longer the flipper of a penguin, the heavier the penguin.
96 |
--------------------------------------------------------------------------------
/python_scripts/trees_ex_01.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M5.01
16 | #
17 | # In the previous notebook, we showed how a tree with a depth of 1 level works.
18 | # The aim of this exercise is to repeat part of the previous experiment for a
19 | # tree with a depth of 2 levels to show how this parameter affects the feature
20 | # space partitioning.
21 | #
22 | # We first load the penguins dataset and split it into a training and a testing
23 | # set:
24 |
25 | # %%
26 | import pandas as pd
27 |
28 | penguins = pd.read_csv("../datasets/penguins_classification.csv")
29 | culmen_columns = ["Culmen Length (mm)", "Culmen Depth (mm)"]
30 | target_column = "Species"
31 |
32 | # %% [markdown]
33 | # ```{note}
34 | # If you want a deeper overview regarding this dataset, you can refer to the
35 | # Appendix - Datasets description section at the end of this MOOC.
36 | # ```
37 |
38 | # %%
39 | from sklearn.model_selection import train_test_split
40 |
41 | data, target = penguins[culmen_columns], penguins[target_column]
42 | data_train, data_test, target_train, target_test = train_test_split(
43 | data, target, random_state=0
44 | )
45 |
46 | # %% [markdown]
47 | # Create a decision tree classifier with a maximum depth of 2 levels and fit the
48 | # training data.
49 |
50 | # %%
51 | # Write your code here.
52 |
53 | # %% [markdown]
54 | # Now plot the data and the decision boundary of the trained classifier to see
55 | # the effect of increasing the depth of the tree.
56 | #
57 | # Hint: Use the class `DecisionBoundaryDisplay` from the module
58 | # `sklearn.inspection` as shown in previous course notebooks.
59 | #
60 | # ```{warning}
61 | # At this time, it is not possible to use `response_method="predict_proba"` for
62 | # multiclass problems on a single plot. This is a planned feature for a future
63 | # version of scikit-learn. In the mean time, you can use
64 | # `response_method="predict"` instead.
65 | # ```
66 |
67 | # %%
68 | # Write your code here.
69 |
70 | # %% [markdown]
71 | # Did we make use of the feature "Culmen Length"? Plot the tree using the
72 | # function `sklearn.tree.plot_tree` to find out!
73 |
74 | # %%
75 | # Write your code here.
76 |
77 | # %% [markdown]
78 | # Compute the accuracy of the decision tree on the testing data.
79 |
80 | # %%
81 | # Write your code here.
82 |
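83 | # %% [markdown]
84 | # A minimal sketch of the full exercise (one possible solution):
85 |
86 | # %%
87 | from sklearn.tree import DecisionTreeClassifier, plot_tree
88 | from sklearn.inspection import DecisionBoundaryDisplay
89 |
90 | tree = DecisionTreeClassifier(max_depth=2)
91 | tree.fit(data_train, target_train)
92 |
93 | # Decision boundary with the training points on top.
94 | display = DecisionBoundaryDisplay.from_estimator(
95 |     tree, data_train, response_method="predict", cmap="RdBu", alpha=0.5
96 | )
97 | _ = display.ax_.scatter(
98 |     data_train[culmen_columns[0]],
99 |     data_train[culmen_columns[1]],
100 |     c=target_train.astype("category").cat.codes,
101 |     edgecolor="black",
102 | )
103 |
104 | # %%
105 | _ = plot_tree(tree, feature_names=culmen_columns, class_names=tree.classes_)
106 |
107 | # %%
108 | tree.score(data_test, target_test)
109 |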
--------------------------------------------------------------------------------
/python_scripts/trees_ex_02.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.17.1
9 | # kernelspec:
10 | # display_name: Python 3
11 | # name: python3
12 | # ---
13 |
14 | # %% [markdown]
15 | # # 📝 Exercise M5.02
16 | #
17 | # The aim of this exercise is to find out whether a decision tree model is able
18 | # to extrapolate.
19 | #
20 | # By extrapolation, we refer to values predicted by a model outside of the range
21 | # of feature values seen during the training.
22 | #
23 | # We first load the regression data.
24 |
25 | # %%
26 | import pandas as pd
27 |
28 | penguins = pd.read_csv("../datasets/penguins_regression.csv")
29 |
30 | feature_name = "Flipper Length (mm)"
31 | target_name = "Body Mass (g)"
32 | data_train, target_train = penguins[[feature_name]], penguins[target_name]
33 |
34 | # %% [markdown]
35 | # ```{note}
36 | # If you want a deeper overview regarding this dataset, you can refer to the
37 | # Appendix - Datasets description section at the end of this MOOC.
38 | # ```
39 |
40 | # %% [markdown]
41 | # First, create two models, a linear regression model and a decision tree
42 | # regression model, and fit them on the training data. Limit the depth of the
43 | # decision tree to 3 levels.
44 |
45 | # %%
46 | # Write your code here.
47 |
48 | # %% [markdown]
49 | # Create a synthetic dataset containing all possible flipper lengths from the
50 | # minimum to the maximum of the training dataset. Get the predictions of each
51 | # model using this dataset.
52 |
53 | # %%
54 | # Write your code here.
55 |
56 | # %% [markdown]
57 | # Create a scatter plot containing the training samples and superimpose the
58 | # predictions of both models on top.
59 |
60 | # %%
61 | # Write your code here.
62 |
63 | # %% [markdown]
64 | # Now, we check the extrapolation capabilities of each model. Create a dataset
65 | # containing a broader range of values than your previous dataset, in other
66 | # words, add values below and above the minimum and the maximum of the flipper
67 | # length seen during training.
68 |
69 | # %%
70 | # Write your code here.
71 |
72 | # %% [markdown]
73 | # Finally, make predictions with both models on this new interval of data.
74 | # Repeat the plotting of the previous exercise.
75 |
76 | # %%
77 | # Write your code here.
78 |
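79 | # %% [markdown]
80 | # A hedged sketch of the extrapolation check (one possible solution; the 30 mm
81 | # offset below and above the training range is arbitrary):
82 |
83 | # %%
84 | import numpy as np
85 | import matplotlib.pyplot as plt
86 | from sklearn.linear_model import LinearRegression
87 | from sklearn.tree import DecisionTreeRegressor
88 |
89 | linear_regression = LinearRegression().fit(data_train, target_train)
90 | tree = DecisionTreeRegressor(max_depth=3).fit(data_train, target_train)
91 |
92 | offset = 30
93 | data_range = pd.DataFrame(
94 |     np.linspace(
95 |         data_train[feature_name].min() - offset,
96 |         data_train[feature_name].max() + offset,
97 |         num=300,
98 |     ),
99 |     columns=[feature_name],
100 | )
101 | plt.scatter(data_train, target_train, color="black", alpha=0.5)
102 | plt.plot(
103 |     data_range, linear_regression.predict(data_range), label="Linear regression"
104 | )
105 | plt.plot(data_range, tree.predict(data_range), label="Decision tree")
106 | _ = plt.legend()
107 |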
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | scikit-learn>=1.6
2 | pandas >= 1
3 | matplotlib
4 | seaborn >= 0.13
5 | plotly
6 | jupyter-book>=0.11
7 | jupytext
8 | beautifulsoup4
9 | IPython
10 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scikit-learn>=1.6
2 | pandas >= 1
3 | matplotlib
4 | seaborn >= 0.13
5 | plotly
6 | jupyterlab
7 | notebook
8 | IPython
9 |
--------------------------------------------------------------------------------
/slides/Makefile:
--------------------------------------------------------------------------------
1 | # Compilation is done via remarker and htmlark, both pip installable
2 |
3 | all: ml_concepts.html overfitting_vs_underfitting.html \
4 | learning_validation_curves.html bias_vs_variance.html \
5 | linear_models.html regularized_linear_models.html trees.html \
6 | ensemble.html concluding_remarks.html
7 |
8 |
9 | %.html: %.md custom.css
10 | # HTMLArk is to embed images and css
11 | remarker $< -c custom.css > $@
12 |
--------------------------------------------------------------------------------
/slides/README.md:
--------------------------------------------------------------------------------
1 | # View slides
2 |
3 | ## On the .github.io website
4 |
5 | The general pattern is `https://inria.github.io/scikit-learn-mooc/slides/?file=[FILENAME].md`
6 |
7 | Example for ML concepts slides:
8 | https://inria.github.io/scikit-learn-mooc/slides/?file=ml_concepts.md
9 |
10 | ## Locally
11 |
12 | Useful when working on the slides:
13 |
14 | ```bash
15 | # from the root folder of the repo
16 | python -m http.server
17 | 
18 | # open your browser on the right port (shown by the previous command) with the right md file
19 | firefox 'http://localhost:8000/slides/index.html?file=../slides/ml_concepts.md'
20 | ```
21 |
--------------------------------------------------------------------------------
/slides/Ubuntu/Ubuntu-Bold.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-Bold.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu/Ubuntu-BoldItalic.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-BoldItalic.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu/Ubuntu-Italic.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-Italic.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu/Ubuntu-Light.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-Light.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu/Ubuntu-LightItalic.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-LightItalic.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu/Ubuntu-Medium.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-Medium.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu/Ubuntu-MediumItalic.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-MediumItalic.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu/Ubuntu-Regular.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-Regular.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu_Mono/UbuntuMono-Bold.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu_Mono/UbuntuMono-Bold.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu_Mono/UbuntuMono-BoldItalic.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu_Mono/UbuntuMono-BoldItalic.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu_Mono/UbuntuMono-Italic.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu_Mono/UbuntuMono-Italic.ttf
--------------------------------------------------------------------------------
/slides/Ubuntu_Mono/UbuntuMono-Regular.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu_Mono/UbuntuMono-Regular.ttf
--------------------------------------------------------------------------------
/slides/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Presentation
5 |
6 |
7 |
8 |
9 |
10 |
12 |
35 |
36 |
37 |
--------------------------------------------------------------------------------
/slides/intro_words.md:
--------------------------------------------------------------------------------
1 |
2 | Hi,
3 |
4 | Welcome to the Machine learning with scikit-learn course. The goal of
5 | this course is to teach you the practical aspects of machine learning. It
6 | focuses on tabular data, since such data is often encountered in industry.
7 | The course is light on maths and covers not only pure machine learning,
8 | but also the basics of data preparation and visualization for machine
9 | learning. Most of the content is centered on executable Python code that
10 | teaches how to analyse data with tools such as scikit-learn.
11 | 
12 | Our goal is to be didactic. If you know Python programming and basic
13 | numerics, you should be able to follow along. We hope that this course
14 | will help introduce more people to machine learning.
15 | 
16 |
--------------------------------------------------------------------------------