├── .binder ├── environment.yml ├── postBuild └── start ├── .github └── workflows │ ├── deploy-gh-pages.yml │ ├── formatting.yml │ └── jupyter-book-pr-preview.yml ├── .gitignore ├── .jupyter ├── README.md └── jupyter_notebook_config.py ├── .pre-commit-config.yaml ├── CITATION.cff ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── build_tools ├── build_jupyter_book.sh ├── convert-python-script-to-notebook.py ├── generate-exercise-from-solution.py ├── generate-index.py ├── generate-quizzes.py ├── generate-wrap-up.py └── sanity-check.py ├── check_env.py ├── datasets ├── README.md ├── adult-census-numeric-all.csv ├── adult-census-numeric-test.csv ├── adult-census-numeric.csv ├── adult-census.csv ├── ames_housing_no_missing.csv ├── bike_rides.csv ├── blood_transfusion.csv ├── cps_85_wages.csv ├── financial-data │ ├── COP.csv │ ├── CVX.csv │ ├── TOT.csv │ ├── VLO.csv │ └── XOM.csv ├── house_prices.csv ├── penguins.csv ├── penguins_classification.csv └── penguins_regression.csv ├── environment-dev.yml ├── environment.yml ├── figures ├── README.md ├── api_diagram-columntransformer.svg ├── api_diagram-pipeline.fit.svg ├── api_diagram-pipeline.predict.svg ├── api_diagram-predictor.fit.svg ├── api_diagram-predictor.predict.svg ├── api_diagram-predictor.score.svg ├── api_diagram-transformer.fit.svg ├── api_diagram-transformer.fit_transform.svg ├── api_diagram-transformer.transform.svg ├── api_diagram.drawio ├── bagging.svg ├── bagging0.svg ├── bagging0_cross.svg ├── bagging_cross.svg ├── bagging_fit.svg ├── bagging_line.svg ├── bagging_overfit.svg ├── bagging_reg_blue.svg ├── bagging_reg_blue_grey.svg ├── bagging_reg_data.svg ├── bagging_reg_grey.svg ├── bagging_reg_grey_fitted.svg ├── bagging_trees.svg ├── bagging_trees_predict.svg ├── bagging_underfit.svg ├── bagging_vote.svg ├── boosting │ ├── boosting_iter1.svg │ ├── boosting_iter2.svg │ ├── boosting_iter3.svg │ ├── boosting_iter4.svg │ ├── boosting_iter_orange1.svg │ ├── boosting_iter_orange2.svg │ ├── 
boosting_iter_orange3.svg │ ├── boosting_iter_orange4.svg │ ├── boosting_iter_sized1.svg │ ├── boosting_iter_sized2.svg │ ├── boosting_iter_sized3.svg │ └── boosting_iter_sized4.svg ├── boosting0.svg ├── boosting0_cross.svg ├── boosting1.svg ├── boosting2.svg ├── boosting3.svg ├── boosting_reg_blue.svg ├── boosting_reg_data.svg ├── boosting_reg_grey.svg ├── boosting_trees1.svg ├── boosting_trees2.svg ├── boosting_trees3.svg ├── bossting_reg_blue.svg ├── categorical.svg ├── cross-val1-shuffle.svg ├── cross-val1.svg ├── cross-val2-shuffle.svg ├── cross-val2.svg ├── cross-val3-shuffle.svg ├── cross-val3.svg ├── cross-val4-shuffle.svg ├── cross-val4.svg ├── cross-val5-shuffle.svg ├── cross-val5.svg ├── cross_validation_diagram.png ├── cross_validation_train_test_diagram.png ├── different_models_complex_16.svg ├── different_models_complex_4.svg ├── dt_fit.svg ├── dt_overfit.svg ├── dt_underfit.svg ├── evaluation_quiz_precision_recall_C0.003.svg ├── evaluation_quiz_precision_recall_C1.svg ├── full_data.svg ├── grid_vs_random_search.svg ├── iris-silhouette.svg ├── iris-silhouette_gray.svg ├── iris_petal_length_cm_hist.svg ├── iris_petal_width_cm_hist.svg ├── iris_sepal_length_cm_hist.svg ├── iris_sepal_width_cm_hist.svg ├── legend_irises.svg ├── lin_not_separable.svg ├── lin_reg_2_points.svg ├── lin_reg_2_points_best_ridge.svg ├── lin_reg_2_points_best_ridge_grey.svg ├── lin_reg_2_points_no_penalty.svg ├── lin_reg_2_points_no_penalty_grey.svg ├── lin_reg_2_points_ridge.svg ├── lin_reg_2_points_ridge_grey.svg ├── lin_reg_3D.svg ├── lin_separable.svg ├── linear_data.svg ├── linear_fit.svg ├── linear_fit_red.svg ├── linear_ols.svg ├── linear_ols_test.svg ├── linear_splines.svg ├── linear_splines_test.svg ├── linreg_noreg_0.svg ├── linreg_noreg_0_nogrey.svg ├── linreg_noreg_1.svg ├── linreg_noreg_2.svg ├── linreg_noreg_3.svg ├── linreg_noreg_4.svg ├── linreg_noreg_5.svg ├── logistic_2D.svg ├── logistic_2D_C0.001.svg ├── logistic_2D_C1.svg ├── logistic_3D.svg ├── 
logistic_color.svg ├── mooc_computer.jpg ├── multinomial.svg ├── nested_cross_validation_diagram.png ├── numerical_pipeline_quiz_scaler.py ├── numerical_pipeline_quiz_scaler_original.png ├── numerical_pipeline_quiz_scaler_preprocessing.png ├── numerical_pipeline_wrap_up_quiz_comparison.png ├── ols_simple.svg ├── ols_simple_test.svg ├── ols_test.svg ├── people.svg ├── plot_cross_validation_diagram.py ├── plot_iris_visualization.py ├── plot_overfit_underfit.py ├── plot_parameter_tuning_cv.py ├── plot_precision_recall_visualization.py ├── plot_simple_decision_tree_adult_census.py ├── plot_slide_linear.py ├── plot_splines.py ├── plot_trees.py ├── polynomial_learning_curve_1179.svg ├── polynomial_learning_curve_145.svg ├── polynomial_learning_curve_42.svg ├── polynomial_learning_curve_6766.svg ├── polynomial_overfit.svg ├── polynomial_overfit_0.svg ├── polynomial_overfit_1.svg ├── polynomial_overfit_2.svg ├── polynomial_overfit_5.svg ├── polynomial_overfit_9.svg ├── polynomial_overfit_assymptotic.svg ├── polynomial_overfit_ntrain_1179.svg ├── polynomial_overfit_ntrain_145.svg ├── polynomial_overfit_ntrain_42.svg ├── polynomial_overfit_ntrain_6766.svg ├── polynomial_overfit_resample_0.svg ├── polynomial_overfit_resample_1.svg ├── polynomial_overfit_resample_2.svg ├── polynomial_overfit_resample_all.svg ├── polynomial_overfit_simple.svg ├── polynomial_overfit_simple_legend.svg ├── polynomial_overfit_test_1.svg ├── polynomial_overfit_test_2.svg ├── polynomial_overfit_test_5.svg ├── polynomial_overfit_test_9.svg ├── polynomial_overfit_truth.svg ├── polynomial_underfit_resample_0.svg ├── polynomial_underfit_resample_1.svg ├── polynomial_underfit_resample_2.svg ├── polynomial_underfit_resample_all.svg ├── polynomial_underfit_simple.svg ├── polynomial_validation_curve.svg ├── polynomial_validation_curve_1.svg ├── polynomial_validation_curve_15.svg ├── polynomial_validation_curve_2.svg ├── polynomial_validation_curve_5.svg ├── polynomial_validation_curve_9.svg ├── 
randomized_search_results.csv ├── ridge_0_withreg.svg ├── ridge_alpha_0.svg ├── ridge_alpha_50.0.svg ├── ridge_alpha_50.svg ├── ridge_alpha_500.0.svg ├── ridge_alpha_500.svg ├── ridge_alpha_None.svg ├── scikit-learn-logo.svg ├── shufflesplit_1.svg ├── shufflesplit_2.svg ├── shufflesplit_3.svg ├── shufflesplit_diagram.png ├── simple_decision_tree_adult_census.png ├── splines_cubic.svg ├── splines_cubic_test.svg ├── splines_test.svg ├── style_figs.py ├── supervised.png ├── target_bias.svg ├── target_bias_0.svg ├── target_bias_1.svg ├── target_bias_2.svg ├── target_variance.svg ├── target_variance_0.svg ├── target_variance_1.svg ├── target_variance_2.svg ├── test_scores_h_shuffle=false.svg ├── tree2D_1split.svg ├── tree2D_2split.svg ├── tree2D_3split.svg ├── tree_blue_orange1.svg ├── tree_blue_orange2.svg ├── tree_blue_orange3.svg ├── tree_example.svg ├── tree_regression1.svg ├── tree_regression2.svg ├── tree_regression3.svg ├── tree_regression4.svg ├── tree_regression_structure1.svg ├── tree_regression_structure2.svg ├── tree_regression_structure3.svg ├── unsupervised.png └── workflow.png ├── full-index.ipynb ├── jupyter-book ├── _config.yml ├── _static │ ├── favicon.ico │ ├── matomo.js │ ├── sklearn_mooc.css │ └── sklearn_mooc.js ├── _toc.yml ├── appendix │ ├── acknowledgement.md │ ├── datasets_intro.md │ ├── glossary.md │ ├── notebook_timings.md │ └── toc_redirect.md ├── concluding_remarks.md ├── concluding_remarks_video.md ├── datasets ├── ensemble │ ├── bagging_slides.md │ ├── boosting_slides.md │ ├── ensemble_boosting_index.md │ ├── ensemble_bootstrap_index.md │ ├── ensemble_hyperparameters_index.md │ ├── ensemble_module_intro.md │ ├── ensemble_module_take_away.md │ ├── ensemble_quiz_m6_01.md │ ├── ensemble_quiz_m6_02.md │ ├── ensemble_quiz_m6_03.md │ └── ensemble_wrap_up_quiz.md ├── evaluation │ ├── cross_validation_baseline_index.md │ ├── cross_validation_choices_index.md │ ├── cross_validation_nested_index.md │ ├── evaluation_module_intro.md │ ├── 
evaluation_module_take_away.md │ ├── evaluation_quiz_m7_01.md │ ├── evaluation_quiz_m7_02.md │ ├── evaluation_quiz_m7_03.md │ ├── evaluation_quiz_m7_04.md │ ├── evaluation_quiz_m7_05.md │ ├── evaluation_wrap_up_quiz.md │ ├── metrics_classification_index.md │ └── metrics_regression_index.md ├── feature_selection │ ├── feature_selection_limitation_index.md │ ├── feature_selection_module_intro.md │ ├── feature_selection_module_take_away.md │ └── feature_selection_quiz.md ├── figures ├── index.md ├── interpretation │ └── interpretation_quiz.md ├── linear_models │ ├── linear_models_intuitions_index.md │ ├── linear_models_module_intro.md │ ├── linear_models_module_take_away.md │ ├── linear_models_non_linear_index.md │ ├── linear_models_quiz_m4_01.md │ ├── linear_models_quiz_m4_02.md │ ├── linear_models_quiz_m4_03.md │ ├── linear_models_regularization_index.md │ ├── linear_models_slides.md │ ├── linear_models_wrap_up_quiz.md │ └── regularized_linear_models_slides.md ├── ml_concepts │ ├── quiz_intro_01.md │ └── slides.md ├── overfit │ ├── bias_vs_variance_quiz_m2_03.md │ ├── bias_vs_variance_slides.md │ ├── learning_validation_curves_quiz_m2_02.md │ ├── learning_validation_curves_slides.md │ ├── overfit_bias_variance_index.md │ ├── overfit_module_intro.md │ ├── overfit_overfitting_underfitting_index.md │ ├── overfit_take_away.md │ ├── overfit_validation_learning_curves_index.md │ ├── overfit_wrap_up_quiz.md │ ├── overfitting_vs_under_fitting_quiz_m2_01.md │ └── overfitting_vs_under_fitting_slides.md ├── predictive_modeling_pipeline │ ├── 01_tabular_data_exploration_index.md │ ├── 01_tabular_data_exploration_quiz_m1_01.md │ ├── 02_numerical_pipeline_index.md │ ├── 02_numerical_pipeline_quiz_m1_02.md │ ├── 02_numerical_pipeline_video_cross_validation.md │ ├── 03_categorical_pipeline_index.md │ ├── 03_categorical_pipeline_quiz_m1_03.md │ ├── 03_categorical_pipeline_visualization_video.md │ ├── predictive_modeling_module_intro.md │ ├── predictive_modeling_module_take_away.md │ 
└── wrap_up_quiz.md ├── python_scripts ├── scikit-learn-logo.png ├── toc.md ├── trees │ ├── slides.md │ ├── trees_classification_index.md │ ├── trees_hyperparameters_index.md │ ├── trees_intuitions_index.md │ ├── trees_module_intro.md │ ├── trees_module_take_away.md │ ├── trees_quiz_m5_01.md │ ├── trees_quiz_m5_02.md │ ├── trees_quiz_m5_03.md │ ├── trees_quiz_m5_04.md │ ├── trees_regression_index.md │ └── trees_wrap_up_quiz.md └── tuning │ ├── parameter_tuning_automated_index.md │ ├── parameter_tuning_automated_quiz_m3_02.md │ ├── parameter_tuning_manual_index.md │ ├── parameter_tuning_manual_quiz_m3_01.md │ ├── parameter_tuning_module_intro.md │ ├── parameter_tuning_module_take_away.md │ ├── parameter_tuning_parallel_plot_video.md │ └── parameter_tuning_wrap_up_quiz.md ├── local-install-instructions.md ├── notebooks ├── 01_tabular_data_exploration.ipynb ├── 01_tabular_data_exploration_ex_01.ipynb ├── 01_tabular_data_exploration_sol_01.ipynb ├── 02_numerical_pipeline_cross_validation.ipynb ├── 02_numerical_pipeline_ex_00.ipynb ├── 02_numerical_pipeline_ex_01.ipynb ├── 02_numerical_pipeline_hands_on.ipynb ├── 02_numerical_pipeline_introduction.ipynb ├── 02_numerical_pipeline_scaling.ipynb ├── 02_numerical_pipeline_sol_00.ipynb ├── 02_numerical_pipeline_sol_01.ipynb ├── 03_categorical_pipeline.ipynb ├── 03_categorical_pipeline_column_transformer.ipynb ├── 03_categorical_pipeline_ex_01.ipynb ├── 03_categorical_pipeline_ex_02.ipynb ├── 03_categorical_pipeline_sol_01.ipynb ├── 03_categorical_pipeline_sol_02.ipynb ├── 03_categorical_pipeline_visualization.ipynb ├── cross_validation_baseline.ipynb ├── cross_validation_ex_01.ipynb ├── cross_validation_ex_02.ipynb ├── cross_validation_grouping.ipynb ├── cross_validation_learning_curve.ipynb ├── cross_validation_nested.ipynb ├── cross_validation_sol_01.ipynb ├── cross_validation_sol_02.ipynb ├── cross_validation_stratification.ipynb ├── cross_validation_time.ipynb ├── cross_validation_train_test.ipynb ├── 
cross_validation_validation_curve.ipynb ├── datasets_adult_census.ipynb ├── datasets_ames_housing.ipynb ├── datasets_bike_rides.ipynb ├── datasets_blood_transfusion.ipynb ├── datasets_california_housing.ipynb ├── dev_features_importance.ipynb ├── ensemble_adaboost.ipynb ├── ensemble_bagging.ipynb ├── ensemble_ex_01.ipynb ├── ensemble_ex_02.ipynb ├── ensemble_ex_03.ipynb ├── ensemble_ex_04.ipynb ├── ensemble_gradient_boosting.ipynb ├── ensemble_hist_gradient_boosting.ipynb ├── ensemble_hyperparameters.ipynb ├── ensemble_introduction.ipynb ├── ensemble_random_forest.ipynb ├── ensemble_sol_01.ipynb ├── ensemble_sol_02.ipynb ├── ensemble_sol_03.ipynb ├── ensemble_sol_04.ipynb ├── feature_selection_ex_01.ipynb ├── feature_selection_introduction.ipynb ├── feature_selection_limitation_model.ipynb ├── feature_selection_sol_01.ipynb ├── linear_models_ex_01.ipynb ├── linear_models_ex_02.ipynb ├── linear_models_ex_03.ipynb ├── linear_models_ex_04.ipynb ├── linear_models_feature_engineering_classification.ipynb ├── linear_models_regularization.ipynb ├── linear_models_sol_01.ipynb ├── linear_models_sol_02.ipynb ├── linear_models_sol_03.ipynb ├── linear_models_sol_04.ipynb ├── linear_regression_in_sklearn.ipynb ├── linear_regression_non_linear_link.ipynb ├── linear_regression_without_sklearn.ipynb ├── logistic_regression.ipynb ├── matplotlibrc ├── metrics_classification.ipynb ├── metrics_ex_01.ipynb ├── metrics_ex_02.ipynb ├── metrics_regression.ipynb ├── metrics_sol_01.ipynb ├── metrics_sol_02.ipynb ├── parameter_tuning_ex_02.ipynb ├── parameter_tuning_ex_03.ipynb ├── parameter_tuning_grid_search.ipynb ├── parameter_tuning_manual.ipynb ├── parameter_tuning_nested.ipynb ├── parameter_tuning_parallel_plot.ipynb ├── parameter_tuning_randomized_search.ipynb ├── parameter_tuning_sol_02.ipynb ├── parameter_tuning_sol_03.ipynb ├── trees_classification.ipynb ├── trees_dataset.ipynb ├── trees_ex_01.ipynb ├── trees_ex_02.ipynb ├── trees_hyperparameters.ipynb ├── trees_regression.ipynb 
├── trees_sol_01.ipynb └── trees_sol_02.ipynb ├── one-day-course-index.md ├── plan.md ├── pyproject.toml ├── python_scripts ├── 01_tabular_data_exploration.py ├── 01_tabular_data_exploration_ex_01.py ├── 01_tabular_data_exploration_sol_01.py ├── 02_numerical_pipeline_cross_validation.py ├── 02_numerical_pipeline_ex_00.py ├── 02_numerical_pipeline_ex_01.py ├── 02_numerical_pipeline_hands_on.py ├── 02_numerical_pipeline_introduction.py ├── 02_numerical_pipeline_scaling.py ├── 02_numerical_pipeline_sol_00.py ├── 02_numerical_pipeline_sol_01.py ├── 03_categorical_pipeline.py ├── 03_categorical_pipeline_column_transformer.py ├── 03_categorical_pipeline_ex_01.py ├── 03_categorical_pipeline_ex_02.py ├── 03_categorical_pipeline_sol_01.py ├── 03_categorical_pipeline_sol_02.py ├── 03_categorical_pipeline_visualization.py ├── cross_validation_baseline.py ├── cross_validation_ex_01.py ├── cross_validation_ex_02.py ├── cross_validation_grouping.py ├── cross_validation_learning_curve.py ├── cross_validation_nested.py ├── cross_validation_sol_01.py ├── cross_validation_sol_02.py ├── cross_validation_stratification.py ├── cross_validation_time.py ├── cross_validation_train_test.py ├── cross_validation_validation_curve.py ├── datasets_adult_census.py ├── datasets_ames_housing.py ├── datasets_bike_rides.py ├── datasets_blood_transfusion.py ├── datasets_california_housing.py ├── dev_features_importance.py ├── ensemble_adaboost.py ├── ensemble_bagging.py ├── ensemble_ex_01.py ├── ensemble_ex_02.py ├── ensemble_ex_03.py ├── ensemble_ex_04.py ├── ensemble_gradient_boosting.py ├── ensemble_hist_gradient_boosting.py ├── ensemble_hyperparameters.py ├── ensemble_introduction.py ├── ensemble_random_forest.py ├── ensemble_sol_01.py ├── ensemble_sol_02.py ├── ensemble_sol_03.py ├── ensemble_sol_04.py ├── feature_selection_ex_01.py ├── feature_selection_introduction.py ├── feature_selection_limitation_model.py ├── feature_selection_sol_01.py ├── linear_models_ex_01.py ├── linear_models_ex_02.py 
├── linear_models_ex_03.py ├── linear_models_ex_04.py ├── linear_models_feature_engineering_classification.py ├── linear_models_regularization.py ├── linear_models_sol_01.py ├── linear_models_sol_02.py ├── linear_models_sol_03.py ├── linear_models_sol_04.py ├── linear_regression_in_sklearn.py ├── linear_regression_non_linear_link.py ├── linear_regression_without_sklearn.py ├── logistic_regression.py ├── matplotlibrc ├── metrics_classification.py ├── metrics_ex_01.py ├── metrics_ex_02.py ├── metrics_regression.py ├── metrics_sol_01.py ├── metrics_sol_02.py ├── parameter_tuning_ex_02.py ├── parameter_tuning_ex_03.py ├── parameter_tuning_grid_search.py ├── parameter_tuning_manual.py ├── parameter_tuning_nested.py ├── parameter_tuning_parallel_plot.py ├── parameter_tuning_randomized_search.py ├── parameter_tuning_sol_02.py ├── parameter_tuning_sol_03.py ├── trees_classification.py ├── trees_dataset.py ├── trees_ex_01.py ├── trees_ex_02.py ├── trees_hyperparameters.py ├── trees_regression.py ├── trees_sol_01.py └── trees_sol_02.py ├── requirements-dev.txt ├── requirements.txt ├── slides ├── Makefile ├── README.md ├── Ubuntu │ ├── UFL.txt │ ├── Ubuntu-Bold.ttf │ ├── Ubuntu-BoldItalic.ttf │ ├── Ubuntu-Italic.ttf │ ├── Ubuntu-Light.ttf │ ├── Ubuntu-LightItalic.ttf │ ├── Ubuntu-Medium.ttf │ ├── Ubuntu-MediumItalic.ttf │ └── Ubuntu-Regular.ttf ├── Ubuntu_Mono │ ├── UFL.txt │ ├── UbuntuMono-Bold.ttf │ ├── UbuntuMono-BoldItalic.ttf │ ├── UbuntuMono-Italic.ttf │ └── UbuntuMono-Regular.ttf ├── bagging.md ├── bias_vs_variance.md ├── boosting.md ├── concluding_remarks.md ├── custom.css ├── ensemble.md ├── index.html ├── intro_cross_validation.md ├── intro_words.md ├── learning_validation_curves.md ├── linear_models.md ├── ml_concepts.md ├── overfitting_vs_underfitting.md ├── regularized_linear_models.md └── trees.md └── workflow-notes.md /.binder/environment.yml: -------------------------------------------------------------------------------- 1 | name: scikit-learn-course 2 | 3 | 
channels: 4 | - conda-forge 5 | 6 | dependencies: 7 | - python=3.10 8 | - scikit-learn >= 1.1.1 9 | - pandas >= 1 10 | - matplotlib-base 11 | - seaborn 12 | - jupyterlab 13 | - notebook 14 | - jupytext 15 | - plotly 16 | -------------------------------------------------------------------------------- /.binder/postBuild: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | echo 'export OMP_NUM_THREADS=2' >> ~/.profile 3 | echo 'export OPENBLAS_NUM_THREADS=2' >> ~/.profile 4 | echo 'export MKL_NUM_THREADS=2' >> ~/.profile 5 | -------------------------------------------------------------------------------- /.binder/start: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export OMP_NUM_THREADS=2 4 | export OPENBLAS_NUM_THREADS=2 5 | export MKL_NUM_THREADS=2 6 | 7 | exec "$@" 8 | -------------------------------------------------------------------------------- /.github/workflows/deploy-gh-pages.yml: -------------------------------------------------------------------------------- 1 | name: deploy-gh-pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - test-ci* 8 | pull_request: 9 | branches: 10 | - main 11 | 12 | jobs: 13 | deploy-gh-pages: 14 | runs-on: ubuntu-latest 15 | env: 16 | OMP_NUM_THREADS: 1 17 | MKL_NUM_THREADS: 2 18 | OPENBLAS_NUM_THREADS: 2 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | with: 23 | fetch-depth: 0 24 | 25 | - name: Set up Python 26 | uses: actions/setup-python@v4 27 | with: 28 | python-version: 3.9 29 | 30 | - name: Install dependencies 31 | run: | 32 | pip install -r requirements-dev.txt 33 | 34 | - name: Cache jupyter-cache folder 35 | uses: actions/cache@v3 36 | env: 37 | cache-name: jupyter-cache 38 | with: 39 | path: jupyter-book/_build/.jupyter_cache 40 | key: v2-${{ github.ref }}-${{ hashFiles('python_scripts/**/*.py') }} 41 | restore-keys: | 42 | v2-${{ github.ref }}-${{ hashFiles('python_scripts/**/*.py') }} 43 | v2-${{ 
github.ref }} 44 | v2-refs/heads/main 45 | 46 | - name: Build the JupyterBook 47 | env: 48 | GITHUB_PULL_REQUEST_NUMBER: ${{github.event.number}} 49 | run: | 50 | bash build_tools/build_jupyter_book.sh 51 | 52 | - name: Save the PR number 53 | env: 54 | GITHUB_PULL_REQUEST_NUMBER: ${{github.event.number}} 55 | run: | 56 | echo "Storing PR number ${{github.event.number}} to 'pull_request_number' file" 57 | echo ${{github.event.number}} > pull_request_number 58 | 59 | - name: Upload jupyter-book artifact for preview in PRs 60 | if: ${{ github.event_name == 'pull_request' }} 61 | uses: actions/upload-artifact@v4 62 | with: 63 | name: jupyter-book 64 | path: | 65 | jupyter-book/_build/html 66 | pull_request_number 67 | 68 | - name: Update the main gh-page website 69 | if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} 70 | uses: peaceiris/actions-gh-pages@v3.6.1 71 | with: 72 | github_token: ${{ secrets.GITHUB_TOKEN }} 73 | publish_dir: ./jupyter-book/_build/html 74 | commit_message: "[ci skip] ${{ github.event.head_commit.message }}" 75 | -------------------------------------------------------------------------------- /.github/workflows/formatting.yml: -------------------------------------------------------------------------------- 1 | name: Formatting 2 | 3 | on: 4 | push: 5 | branches: 6 | - "main" 7 | 8 | pull_request: 9 | branches: 10 | - '*' 11 | 12 | jobs: 13 | run-linters: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v3 17 | 18 | - name: Set up Python 3.11 19 | uses: actions/setup-python@v4 20 | with: 21 | python-version: "3.11" 22 | allow-prereleases: true 23 | 24 | - name: Run the linters via pre-commit 25 | run: | 26 | python -m pip install pre-commit 27 | # only run pre-commit on the folder `python_scripts` 28 | pre-commit run --files python_scripts/* 29 | -------------------------------------------------------------------------------- /.github/workflows/jupyter-book-pr-preview.yml: 
-------------------------------------------------------------------------------- 1 | name: jupyter-book-pr-preview 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["deploy-gh-pages"] 6 | types: 7 | - completed 8 | 9 | jobs: 10 | deploy-preview: 11 | runs-on: ubuntu-latest 12 | if: ${{github.event.workflow_run.event == 'pull_request' && github.event.workflow_run.conclusion == 'success'}} 13 | steps: 14 | - name: 'Commit Status: Set Workflow Status as Pending' 15 | uses: myrotvorets/set-commit-status-action@1.1.6 16 | with: 17 | token: ${{ secrets.GITHUB_TOKEN }} 18 | status: pending 19 | sha: ${{ github.event.workflow_run.head_sha }} 20 | context: 'JupyterBook preview' 21 | 22 | - uses: actions/download-artifact@v4 23 | with: 24 | github-token: ${{secrets.GITHUB_TOKEN}} 25 | run-id: ${{ github.event.workflow_run.id }} 26 | name: jupyter-book 27 | 28 | - name: Get pull request number 29 | id: pull-request-number 30 | run: | 31 | export PULL_REQUEST_NUMBER=`cat pull_request_number` 32 | echo "PULL_REQUEST_NUMBER=$PULL_REQUEST_NUMBER" 33 | echo "result=${PULL_REQUEST_NUMBER}" >> $GITHUB_OUTPUT 34 | 35 | - uses: actions/setup-node@v3 36 | with: 37 | node-version: '16' 38 | - run: npm install --global netlify-cli@6 39 | - name: Deploy to Netlify 40 | env: 41 | NETLIFY_AUTH_TOKEN: ${{secrets.NETLIFY_AUTH_TOKEN}} 42 | NETLIFY_SITE_ID: ${{secrets.NETLIFY_SITE_ID}} 43 | run: | 44 | echo "Deploying PR ${{steps.pull-request-number.outputs.result}} to Netlify" 45 | netlify deploy --dir=jupyter-book/_build/html --alias=pull-request-${{steps.pull-request-number.outputs.result}} 46 | 47 | - name: 'Commit Status: Update deployment status' 48 | uses: myrotvorets/set-commit-status-action@1.1.6 49 | # Always run this step regardless of job failing early 50 | if: always() 51 | env: 52 | DEPLOY_SUCCESS: Successfully deployed preview. 53 | DEPLOY_FAILURE: Failed to deploy preview. 
54 | TARGET_URL_SUCCESS: https://pull-request-${{steps.pull-request-number.outputs.result}}--scikit-learn-mooc.netlify.app/_changed.html 55 | TARGET_URL_FAILURE: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} 56 | with: 57 | token: ${{ secrets.GITHUB_TOKEN }} 58 | status: ${{ job.status == 'success' && 'success' || 'failure' }} 59 | sha: ${{ github.event.workflow_run.head_sha }} 60 | context: 'JupyterBook preview' 61 | description: ${{ job.status == 'success' && env.DEPLOY_SUCCESS || env.DEPLOY_FAILURE }} 62 | targetUrl: ${{ job.status == 'success' && env.TARGET_URL_SUCCESS || env.TARGET_URL_FAILURE }} 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # exclude datasets and externals 2 | notebooks/datasets 3 | notebooks/joblib/ 4 | wrap-up/ 5 | 6 | # jupyter-book 7 | jupyter-book/_build 8 | 9 | # HTML slides are generated from the markdown source and should generally not 10 | # be committed. There are also some html files in the slides folder that do not 11 | # have a matching markdown source file. Those are manually git added on a case 12 | # by case basis. 
13 | slides/*.html 14 | 15 | # exclude temporary files 16 | .ipynb_checkpoints 17 | .DS_Store 18 | gmon.out 19 | __pycache__ 20 | *.pyc 21 | *.o 22 | *.so 23 | *.gcno 24 | *.swp 25 | *.egg-info 26 | *.egg 27 | *~ 28 | build 29 | dist 30 | lib/test 31 | doc/_build 32 | *env 33 | *ENV 34 | .idea 35 | *.code-workspace 36 | .vscode 37 | -------------------------------------------------------------------------------- /.jupyter/README.md: -------------------------------------------------------------------------------- 1 | This directory is used to set up Jupyter on Binder 2 | -------------------------------------------------------------------------------- /.jupyter/jupyter_notebook_config.py: -------------------------------------------------------------------------------- 1 | # To use jupytext in binder 2 | c.ContentsManager.preferred_jupytext_formats_read = "py:percent" # noqa 3 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | exclude: notebooks 8 | exclude_types: [svg] 9 | - id: trailing-whitespace 10 | exclude: notebooks 11 | exclude_types: [svg] 12 | - repo: https://github.com/psf/black 13 | rev: 23.1.0 14 | hooks: 15 | - id: black 16 | - repo: https://github.com/astral-sh/ruff-pre-commit 17 | rev: v0.11.2 18 | hooks: 19 | - id: ruff 20 | args: ["--fix", "--output-format=full"] 21 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this content, please cite it as below." 
3 | authors: 4 | - name: "The scikit-learn MOOC developers" 5 | title: "scikit-learn MOOC" 6 | version: latest 7 | doi: https://doi.org/10.5281/zenodo.7220306 8 | url: "https://github.com/INRIA/scikit-learn-mooc" 9 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PYTHON_SCRIPTS_DIR = python_scripts 2 | NOTEBOOKS_DIR = notebooks 3 | JUPYTER_BOOK_DIR = jupyter-book 4 | WRAP_UP_DIR = wrap-up 5 | JUPYTER_KERNEL := python3 6 | MINIMAL_NOTEBOOK_FILES = $(shell ls $(PYTHON_SCRIPTS_DIR)/*.py | perl -pe "s@$(PYTHON_SCRIPTS_DIR)@$(NOTEBOOKS_DIR)@" | perl -pe "s@\.py@.ipynb@") 7 | 8 | # This assumes that the folder mooc-scikit-learn-coordination and 9 | # scikit-learn-mooc are siblings, e.g. the repos are in the 10 | # ~/dev/mooc-scikit-learn-coordination and ~/dev/scikit-learn-mooc. This should 11 | # be the case in most development setups. If not then you can pass the 12 | # GITLAB_REPO_JUPYTERBOOK_DIR variable with 13 | # make -e GITLAB_REPO_JUPYTERBOOK_DIR=your/gitlab/repo/jupyter-book-dir/goes-here 14 | GITLAB_REPO_JUPYTERBOOK_DIR = ../mooc-scikit-learn-coordination/jupyter-book 15 | 16 | all: $(NOTEBOOKS_DIR) 17 | 18 | .PHONY: $(NOTEBOOKS_DIR) copy_matplotlibrc sanity_check_$(NOTEBOOKS_DIR) all \ 19 | exercises quizzes $(JUPYTER_BOOK_DIR) $(JUPYTER_BOOK_DIR)-clean $(JUPYTER_BOOK_DIR)-full-clean 20 | 21 | $(NOTEBOOKS_DIR): $(MINIMAL_NOTEBOOK_FILES) copy_matplotlibrc sanity_check_$(NOTEBOOKS_DIR) 22 | 23 | $(NOTEBOOKS_DIR)/%.ipynb: $(PYTHON_SCRIPTS_DIR)/%.py 24 | python build_tools/convert-python-script-to-notebook.py $< $@ 25 | 26 | copy_matplotlibrc: 27 | cp $(PYTHON_SCRIPTS_DIR)/matplotlibrc $(NOTEBOOKS_DIR)/ 28 | 29 | sanity_check_$(NOTEBOOKS_DIR): 30 | python build_tools/sanity-check.py $(PYTHON_SCRIPTS_DIR) $(NOTEBOOKS_DIR) 31 | 32 | exercises: 33 | python build_tools/generate-exercise-from-solution.py $(PYTHON_SCRIPTS_DIR) 34 | 35 | 
quizzes: 36 | python build_tools/generate-quizzes.py $(GITLAB_REPO_JUPYTERBOOK_DIR) $(JUPYTER_BOOK_DIR) 37 | 38 | full-index: 39 | python build_tools/generate-index.py 40 | 41 | run-code-in-wrap-up-quizzes: 42 | python build_tools/generate-wrap-up.py $(GITLAB_REPO_JUPYTERBOOK_DIR) $(WRAP_UP_DIR) 43 | jupytext --execute --to notebook $(WRAP_UP_DIR)/*.py 44 | 45 | $(JUPYTER_BOOK_DIR): 46 | jupyter-book build $(JUPYTER_BOOK_DIR) 47 | rm -rf $(JUPYTER_BOOK_DIR)/_build/html/{slides,figures} && cp -r slides figures $(JUPYTER_BOOK_DIR)/_build/html 48 | 49 | $(JUPYTER_BOOK_DIR)-clean: 50 | # keep jupyter-cache cache folder 51 | jupyter-book clean $(JUPYTER_BOOK_DIR) 52 | 53 | $(JUPYTER_BOOK_DIR)-full-clean: 54 | # deletes jupyter-cache cache folder 55 | rm -rf $(JUPYTER_BOOK_DIR)/_build 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # scikit-learn course 2 | 3 | This is the source code for the [Machine learning in Python with scikit-learn 4 | MOOC](https://www.fun-mooc.fr/en/courses/machine-learning-python-scikit-learn). 5 | Enroll for the full MOOC experience (quiz solutions, executable 6 | notebooks, discussion forum, etc.)! 7 | 8 | The MOOC is free and hosted on the [FUN-MOOC](https://fun-mooc.fr/) platform 9 | which does not use the student data for any purpose other than improving the 10 | educational material. 
11 | 12 | The static version of the course can be browsed online: https://inria.github.io/scikit-learn-mooc 13 | 14 | ## Course description 15 | 16 | The course description can be found here: 17 | https://inria.github.io/scikit-learn-mooc/index.html 18 | 19 | ## Follow the course online 20 | 21 | A few different ways are available: 22 | - Launch an online notebook environment using [![Binder](https://mybinder.org/badge_logo.svg)]( 23 | https://mybinder.org/v2/gh/INRIA/scikit-learn-mooc/main?filepath=full-index.ipynb) 24 | - Browse [website](https://inria.github.io/scikit-learn-mooc) generated with 25 | [Jupyter Book](https://jupyterbook.org/) 26 | 27 | ## Running the notebooks locally 28 | 29 | See instructions [here](./local-install-instructions.md) 30 | 31 | ## Contributing 32 | 33 | See [CONTRIBUTING.md](CONTRIBUTING.md) 34 | 35 | ## How to cite us 36 | 37 | The MOOC material is developed publicly under the [CC-BY 38 | license](https://github.com/INRIA/scikit-learn-mooc/blob/main/LICENSE). 39 | 40 | You can cite us through the project's Zenodo archive using the following DOI: 41 | [10.5281/zenodo.7220306](https://doi.org/10.5281/zenodo.7220306). 
42 | -------------------------------------------------------------------------------- /build_tools/build_jupyter_book.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -xe 3 | 4 | jupyter_book_dir=jupyter-book 5 | jupyter_book_build_dir="$jupyter_book_dir/_build/html" 6 | 7 | function show_error_logs { 8 | echo "Some notebooks failed, see logs below:" 9 | for f in $jupyter_book_build_dir/reports/*.log; do 10 | echo "================================================================================" 11 | echo $f 12 | echo "================================================================================" 13 | cat $f 14 | done 15 | # You need to exit with non-zero here to cause the build to fail 16 | exit 1 17 | } 18 | 19 | affected_jupyter_book_paths() { 20 | files=$(git diff --name-only origin/main...) 21 | # TODO: rather than the grep pattern below we could potentially look at 22 | # _toc.yml to know whether the file affects the JupyterBook 23 | echo "$files" | grep python_scripts | perl -pe 's@\.py$@.html@' 24 | echo "$files" | grep -P "$jupyter_book_dir/.+md$" | \ 25 | perl -pe "s@$jupyter_book_dir/(.+)\.md@\1.html@" 26 | } 27 | 28 | write_changed_html() { 29 | affected="$1" 30 | if [ -n "$GITHUB_PULL_REQUEST_NUMBER" ] 31 | then 32 | GITHUB_PULL_REQUEST_URL="https://github.com/inria/scikit-learn-mooc/pull/$GITHUB_PULL_REQUEST_NUMBER" 33 | echo "The following files may have been changed by PR $GITHUB_PULL_REQUEST_NUMBER:" 34 | echo "$affected" 35 | ( 36 | echo '' 37 | echo "Files changed by PR $GITHUB_PULL_REQUEST_URL" 38 | echo '

This PR JupyterBook index' 41 | echo '' 42 | ) > "$jupyter_book_build_dir/_changed.html" 43 | else 44 | echo "The variable 'GITHUB_PULL_REQUEST_NUMBER' is not defined: not writing the '_changed.html' file." 45 | fi 46 | } 47 | 48 | git remote -v 49 | git show --stat 50 | git log --color --graph --pretty=format:'%Cred%h%Creset -%C(yellow)%d%Creset %s %Cgreen(%cr) %C(bold blue)<%an>%Creset' --abbrev-commit -20 51 | git fetch origin main >&2 # || (echo QUICK BUILD: failed to get changed filenames for $git_range; return) 52 | git diff origin/main... --stat 53 | git diff origin/main... 54 | 55 | affected=$(affected_jupyter_book_paths) 56 | mkdir -p $jupyter_book_build_dir 57 | write_changed_html "$affected" 58 | 59 | make $jupyter_book_dir 2>&1 | tee $jupyter_book_dir/build.log 60 | 61 | 62 | # Grep the log to make sure there has been no errors when running the notebooks 63 | # since jupyter-book exit code is always 0 64 | grep 'Execution Failed' $jupyter_book_dir/build.log && show_error_logs || \ 65 | echo 'All notebooks ran successfully' 66 | -------------------------------------------------------------------------------- /build_tools/generate-quizzes.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pathlib import Path 3 | import sys 4 | 5 | from jupytext.myst import myst_to_notebook 6 | import jupytext 7 | 8 | 9 | def remove_solution(input_myst_str): 10 | """Removes solution from myst str. 
11 | 12 | This is based on solution having "solution" in their cell metadata tags 13 | """ 14 | nb = myst_to_notebook(input_myst_str) 15 | 16 | cell_tags_list = [c["metadata"].get("tags") for c in nb.cells] 17 | is_solution_list = [ 18 | tags is not None and "solution" in tags for tags in cell_tags_list 19 | ] 20 | nb.cells = [ 21 | cell 22 | for cell, is_solution in zip(nb.cells, is_solution_list) 23 | if not is_solution 24 | ] 25 | 26 | myst_nb_str = jupytext.writes(nb, fmt="myst") 27 | 28 | header_pattern = re.compile( 29 | r"---\njupytext.+---\s*", re.DOTALL | re.MULTILINE 30 | ) 31 | return re.sub(header_pattern, "", myst_nb_str) 32 | 33 | 34 | def write_exercise_myst(input_path, output_path): 35 | input_myst = input_path.read_text() 36 | 37 | output_myst = remove_solution(input_myst) 38 | output_path.write_text(output_myst) 39 | 40 | 41 | def write_all_exercises(input_root_path, output_root_path): 42 | print(input_root_path, output_root_path) 43 | input_exercises = Path(input_root_path).glob("**/*quiz*.md") 44 | 45 | for input_path in input_exercises: 46 | # FIXME there may be a better way with the pathlib API 47 | relative_path_str = re.sub( 48 | str(input_root_path) + "/?", "", str(input_path) 49 | ) 50 | output_path = Path(output_root_path).joinpath(relative_path_str) 51 | print(str(input_path), str(output_path)) 52 | write_exercise_myst(input_path, output_path) 53 | 54 | 55 | if __name__ == "__main__": 56 | input_root_path = sys.argv[1] 57 | output_root_path = sys.argv[2] 58 | 59 | write_all_exercises(input_root_path, output_root_path) 60 | -------------------------------------------------------------------------------- /build_tools/generate-wrap-up.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import glob 4 | 5 | 6 | def extract_python_code_blocks(md_file_path): 7 | """ 8 | Extract Python code blocks from a markdown file. 
9 | 10 | Args: 11 | md_file_path (str): Path to the markdown file 12 | 13 | Returns: 14 | list: List of extracted Python code blocks 15 | """ 16 | code_blocks = [] 17 | in_python_block = False 18 | current_block = [] 19 | 20 | with open(md_file_path, "r", encoding="utf-8") as file: 21 | for line in file: 22 | line = line.rstrip("\n") 23 | 24 | if line.strip() == "```python": 25 | in_python_block = True 26 | current_block = [] 27 | elif line.strip() == "```" and in_python_block: 28 | in_python_block = False 29 | code_blocks.append("\n".join(current_block)) 30 | elif in_python_block: 31 | current_block.append(line) 32 | 33 | return code_blocks 34 | 35 | 36 | def write_jupyter_notebook_file( 37 | code_blocks, output_file="notebook_from_md.py" 38 | ): 39 | """ 40 | Writes extracted code blocks to a Python file formatted as Jupyter notebook cells. 41 | 42 | Args: 43 | code_blocks (list): List of code blocks to write 44 | output_file (str): Path to the output file 45 | """ 46 | with open(output_file, "w", encoding="utf-8") as file: 47 | file.write( 48 | "# %% [markdown] \n # ## Notebook generated from Markdown file\n\n" 49 | ) 50 | 51 | for i, block in enumerate(code_blocks, 1): 52 | file.write(f"# %% [markdown]\n# ## Cell {i}\n\n# %%\n{block}\n\n") 53 | 54 | print( 55 | f"Successfully wrote {len(code_blocks)} code cells to" 56 | f" {output_file}" 57 | ) 58 | 59 | 60 | def process_quiz_files(input_path, output_dir): 61 | """ 62 | Process all wrap_up_quiz files in the input path and convert them to notebooks. 
63 | 64 | Args: 65 | input_path (str): Path to look for wrap_up_quiz files in subfolders 66 | output_dir (str): Directory to write the generated notebooks 67 | """ 68 | # Create output directory if it doesn't exist 69 | if not os.path.exists(output_dir): 70 | os.makedirs(output_dir) 71 | print(f"Created output directory: {output_dir}") 72 | 73 | # Find all files containing "wrap_up_quiz" in their name in the input path subfolders 74 | quiz_files = glob.glob( 75 | f"{input_path}/**/*wrap_up_quiz*.md", recursive=True 76 | ) 77 | 78 | if not quiz_files: 79 | print(f"No wrap_up_quiz.md files found in {input_path} subfolders.") 80 | return 81 | 82 | print(f"Found {len(quiz_files)} wrap_up_quiz files to process.") 83 | 84 | # Process each file 85 | for md_file_path in quiz_files: 86 | print(f"\nProcessing: {md_file_path}") 87 | 88 | # Extract code blocks 89 | code_blocks = extract_python_code_blocks(md_file_path) 90 | 91 | # Generate output filename 92 | subfolder = md_file_path.split(os.sep)[3] # Get subfolder name 93 | output_file = os.path.join(output_dir, f"{subfolder}_wrap_up_quiz.py") 94 | 95 | # Display results and write notebook file 96 | if code_blocks: 97 | print(f"Found {len(code_blocks)} Python code blocks") 98 | write_jupyter_notebook_file(code_blocks, output_file=output_file) 99 | else: 100 | print(f"No Python code blocks found in {md_file_path}.") 101 | 102 | 103 | if __name__ == "__main__": 104 | input_path = sys.argv[1] 105 | output_dir = sys.argv[2] 106 | 107 | process_quiz_files(input_path, output_dir) 108 | -------------------------------------------------------------------------------- /build_tools/sanity-check.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import difflib 4 | 5 | # TODO: we could get the list from .gitignore 6 | IGNORE_LIST = [ 7 | ".ipynb_checkpoints", 8 | "__pycache__", 9 | ] 10 | 11 | folder1, folder2 = sys.argv[1:3] 12 | 13 | 14 | def get_basename(folder): 15 
| contents = [] 16 | for fn in os.listdir(folder): 17 | content = os.path.splitext(os.path.basename(fn))[0] 18 | if content not in IGNORE_LIST: 19 | contents.append(content) 20 | return contents 21 | 22 | 23 | basenames1 = sorted(get_basename(folder1)) 24 | basenames2 = sorted(get_basename(folder2)) 25 | 26 | if basenames1 != basenames2: 27 | only_in_folder1 = set(basenames1) - set(basenames2) 28 | only_in_folder2 = set(basenames2) - set(basenames1) 29 | 30 | raise RuntimeError( 31 | f"Inconsistency between folder {folder1} and {folder2}\n" 32 | f"Only in folder {folder1}: {only_in_folder1}\n" 33 | f"Only in folder {folder2}: {only_in_folder2}" 34 | ) 35 | -------------------------------------------------------------------------------- /check_env.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import importlib 3 | 4 | OK = "\x1b[42m[ OK ]\x1b[0m" 5 | FAIL = "\x1b[41m[FAIL]\x1b[0m" 6 | 7 | try: 8 | from packaging.version import Version 9 | except ImportError: 10 | print( 11 | FAIL, "'packaging' package not installed, install it with conda or pip" 12 | ) 13 | sys.exit(1) 14 | 15 | # first check the python version 16 | print("Using python in", sys.prefix) 17 | print(sys.version) 18 | pyversion_str = f"{sys.version_info.major}.{sys.version_info.minor}" 19 | pyversion = Version(pyversion_str) 20 | 21 | if pyversion < Version("3.8"): 22 | print( 23 | FAIL, 24 | ( 25 | "Python version 3.8 or above is required," 26 | f" but {pyversion_str} is installed." 
27 | ), 28 | ) 29 | sys.exit(1) 30 | print() 31 | 32 | 33 | def import_version(pkg, min_ver, fail_msg=""): 34 | mod = None 35 | try: 36 | mod = importlib.import_module(pkg) 37 | if pkg in {"PIL"}: 38 | try: 39 | ver = mod.__version__ 40 | except AttributeError: 41 | try: 42 | ver = mod.VERSION 43 | except AttributeError: 44 | try: 45 | ver = mod.PILLOW_VERSION 46 | except Exception: 47 | raise 48 | else: 49 | ver = mod.__version__ 50 | if Version(ver) < Version(min_ver): 51 | print( 52 | FAIL, 53 | ( 54 | f"{lib} version {min_ver} or higher required, but" 55 | f" {ver} installed." 56 | ), 57 | ) 58 | else: 59 | print(OK, f"{pkg} version {ver}") 60 | except ImportError: 61 | print(FAIL, f"{pkg} not installed. {fail_msg}") 62 | return mod 63 | 64 | 65 | requirements = { 66 | "numpy": "1.16", 67 | "scipy": "1.2", 68 | "matplotlib": "3.0", 69 | "sklearn": "1.6", 70 | "pandas": "1", 71 | "seaborn": "0.11", 72 | "notebook": "5.7", 73 | "plotly": "5.10", 74 | } 75 | 76 | # now the dependencies 77 | for lib, required_version in list(requirements.items()): 78 | import_version(lib, required_version) 79 | -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | `cps_85_wages.csv` is available at https://www.openml.org/d/534 2 | `adult-census.csv` is available at https://www.openml.org/d/15950 3 | -------------------------------------------------------------------------------- /environment-dev.yml: -------------------------------------------------------------------------------- 1 | name: scikit-learn-course 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - scikit-learn >= 1.6 6 | - pandas >= 1 7 | - matplotlib-base 8 | - seaborn >= 0.13 9 | - plotly >= 5.10 10 | - jupytext 11 | - beautifulsoup4 12 | - IPython 13 | - packaging 14 | - pip 15 | - pip: 16 | - jupyter-book >= 0.11 17 | 
-------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: scikit-learn-course 2 | 3 | channels: 4 | - conda-forge 5 | 6 | dependencies: 7 | - scikit-learn >= 1.6 8 | - pandas >= 1 9 | - matplotlib-base 10 | - seaborn >= 0.13 11 | - jupyterlab 12 | - notebook 13 | - plotly >= 5.10 14 | - IPython 15 | - packaging 16 | -------------------------------------------------------------------------------- /figures/README.md: -------------------------------------------------------------------------------- 1 | This directory contains didactic figures and scripts that generate them. 2 | -------------------------------------------------------------------------------- /figures/boosting_trees1.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /figures/cross_validation_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/figures/cross_validation_diagram.png -------------------------------------------------------------------------------- /figures/cross_validation_train_test_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/figures/cross_validation_train_test_diagram.png -------------------------------------------------------------------------------- /figures/mooc_computer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/figures/mooc_computer.jpg 
-------------------------------------------------------------------------------- /figures/nested_cross_validation_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/figures/nested_cross_validation_diagram.png -------------------------------------------------------------------------------- /figures/numerical_pipeline_quiz_scaler.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import numpy as np 3 | 4 | rng = np.random.RandomState(0) 5 | 6 | X = rng.randn(100, 2) 7 | X[:, 0] += abs(X[:, 0].min()) + 1 8 | X[:, 1] *= 3 9 | 10 | # %% 11 | import seaborn as sns 12 | 13 | sns.set_context("talk") 14 | 15 | # %% 16 | import matplotlib.pyplot as plt 17 | import matplotlib.ticker as ticker 18 | 19 | ticks = [-6, -4, -2, 0, 2, 4, 6] 20 | 21 | _, ax = plt.subplots(figsize=(5, 5)) 22 | sns.scatterplot(x=X[:, 0], y=X[:, 1], s=30, edgecolor="black") 23 | ax.set_xlim(-6, 6) 24 | ax.set_ylim(-6, 6) 25 | ax.set_xlabel("Feature A") 26 | ax.set_ylabel("Feature B") 27 | ax.xaxis.set_ticks_position("bottom") 28 | ax.yaxis.set_ticks_position("left") 29 | ax.set_xticklabels(ticks) 30 | ax.xaxis.set_major_locator(ticker.FixedLocator(ticks)) 31 | ax.set_yticklabels(ticks) 32 | ax.yaxis.set_major_locator(ticker.FixedLocator(ticks)) 33 | ax.grid(visible=True) 34 | ax.set_title("Original dataset\n", loc="center") 35 | plt.savefig("numerical_pipeline_quiz_scaler_original.png", bbox_inches="tight") 36 | 37 | # %% 38 | from sklearn.preprocessing import StandardScaler 39 | from sklearn.preprocessing import MinMaxScaler 40 | 41 | standard_scaler_mean_only = StandardScaler(with_std=False).fit(X) 42 | standard_scaler_scale_only = StandardScaler(with_mean=False).fit(X) 43 | standard_scaler = StandardScaler().fit(X) 44 | min_max_scaler = MinMaxScaler().fit(X) 45 | 46 | # %% 47 | fig, axs = plt.subplots(ncols=2, 
nrows=2, figsize=(12, 10)) 48 | for idx, (ax, data) in enumerate( 49 | zip( 50 | axs.ravel(), 51 | [ 52 | standard_scaler_mean_only.transform(X), 53 | standard_scaler.transform(X), 54 | min_max_scaler.transform(X), 55 | standard_scaler_scale_only.transform(X), 56 | ], 57 | ) 58 | ): 59 | sns.scatterplot(x=data[:, 0], y=data[:, 1], s=30, edgecolor="black", ax=ax) 60 | ax.set_xlim(-6, 6) 61 | ax.set_ylim(-6, 6) 62 | ax.set_xlabel("Feature A") 63 | ax.set_ylabel("Feature B") 64 | ax.xaxis.set_ticks_position("bottom") 65 | ax.yaxis.set_ticks_position("left") 66 | ax.set_xticklabels(ticks) 67 | ax.xaxis.set_major_locator(ticker.FixedLocator(ticks)) 68 | ax.set_yticklabels(ticks) 69 | ax.yaxis.set_major_locator(ticker.FixedLocator(ticks)) 70 | ax.grid(visible=True) 71 | ax.set_title(f"Preprocessing {'ABCD'[idx]}\n") 72 | 73 | fig.subplots_adjust(hspace=0.6, wspace=0.5) 74 | plt.savefig( 75 | "numerical_pipeline_quiz_scaler_preprocessing.png", bbox_inches="tight" 76 | ) 77 | 78 | # %% 79 | -------------------------------------------------------------------------------- /figures/numerical_pipeline_quiz_scaler_original.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/figures/numerical_pipeline_quiz_scaler_original.png -------------------------------------------------------------------------------- /figures/numerical_pipeline_quiz_scaler_preprocessing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/figures/numerical_pipeline_quiz_scaler_preprocessing.png -------------------------------------------------------------------------------- /figures/numerical_pipeline_wrap_up_quiz_comparison.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/figures/numerical_pipeline_wrap_up_quiz_comparison.png -------------------------------------------------------------------------------- /figures/plot_cross_validation_diagram.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from matplotlib.patches import Patch 4 | from pathlib import Path 5 | from sklearn.model_selection import KFold, ShuffleSplit 6 | 7 | 8 | FIGURES_FOLDER = Path(__file__).parent 9 | cmap_cv = plt.cm.coolwarm 10 | 11 | plt.style.use(FIGURES_FOLDER / "../python_scripts/matplotlibrc") 12 | 13 | 14 | # + 15 | def plot_cv_indices(cv, X, y, ax, lw=50): 16 | """Create a sample plot for indices of a cross-validation object.""" 17 | splits = list(cv.split(X=X, y=y)) 18 | n_splits = len(splits) 19 | 20 | # Generate the training/testing visualizations for each CV split 21 | for ii, (train, test) in enumerate(splits): 22 | # Fill in indices with the training/test groups 23 | indices = np.zeros(shape=X.shape[0], dtype=np.int32) 24 | indices[train] = 1 25 | 26 | # Visualize the results 27 | ax.scatter( 28 | range(len(indices)), 29 | [ii + 0.5] * len(indices), 30 | c=indices, 31 | marker="_", 32 | lw=25, 33 | cmap=cmap_cv, 34 | vmin=-0.2, 35 | vmax=1.2, 36 | ) 37 | 38 | # Formatting 39 | yticklabels = list(range(n_splits)) 40 | ax.set( 41 | yticks=np.arange(n_splits) + 0.5, 42 | yticklabels=yticklabels, 43 | xlabel="Sample index", 44 | ylabel="CV iteration", 45 | ylim=[n_splits + 0.2, -0.2], 46 | xlim=[0, 50], 47 | ) 48 | ax.legend( 49 | [Patch(color=cmap_cv(0.8)), Patch(color=cmap_cv(0.02))], 50 | ["Training samples", "Testing samples"], 51 | loc=(1.02, 0.8), 52 | ) 53 | ax.set_title("{}".format(type(cv).__name__)) 54 | return ax 55 | 56 | 57 | n_points = 50 58 | X = np.random.randn(n_points, 10) 59 | y = np.random.randn(n_points) 60 | 61 | fig, ax = 
plt.subplots(figsize=(12, 4)) 62 | cv = KFold(5) 63 | _ = plot_cv_indices(cv, X, y, ax) 64 | plt.tight_layout() 65 | fig.savefig(FIGURES_FOLDER / "cross_validation_diagram.png") 66 | 67 | fig, ax = plt.subplots(figsize=(12, 4)) 68 | cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0) 69 | _ = plot_cv_indices(cv, X, y, ax) 70 | plt.tight_layout() 71 | fig.savefig(FIGURES_FOLDER / "shufflesplit_diagram.png") 72 | -------------------------------------------------------------------------------- /figures/plot_iris_visualization.py: -------------------------------------------------------------------------------- 1 | """ 2 | Some simple visualizations on the iris data. 3 | """ 4 | 5 | import numpy as np 6 | from sklearn import datasets 7 | from matplotlib import pyplot as plt 8 | import style_figs 9 | 10 | iris = datasets.load_iris() 11 | 12 | # Plot the histograms of each class for each feature 13 | 14 | 15 | X = iris.data 16 | y = iris.target 17 | for x, feature_name in zip(X.T, iris.feature_names): 18 | plt.figure(figsize=(2.5, 2)) 19 | patches = list() 20 | for this_y, target_name in enumerate(iris.target_names): 21 | patch = plt.hist( 22 | x[y == this_y], 23 | bins=np.linspace(x.min(), x.max(), 20), 24 | label=target_name, 25 | ) 26 | patches.append(patch[-1][0]) 27 | style_figs.light_axis() 28 | feature_name = feature_name.replace(" ", "_") 29 | feature_name = feature_name.replace("(", "") 30 | feature_name = feature_name.replace(")", "") 31 | plt.savefig("iris_{}_hist.svg".format(feature_name)) 32 | 33 | plt.figure(figsize=(6, 0.25)) 34 | plt.legend(patches, iris.target_names, ncol=3, loc=(0, -0.37), borderaxespad=0) 35 | style_figs.no_axis() 36 | plt.savefig("legend_irises.svg") 37 | -------------------------------------------------------------------------------- /figures/plot_simple_decision_tree_adult_census.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import numpy as np 3 | 4 | 
from scipy import ndimage 5 | 6 | import pandas as pd 7 | 8 | import matplotlib.pyplot as plt 9 | from matplotlib.pyplot import cm 10 | from matplotlib.colors import ListedColormap 11 | 12 | import seaborn as sns 13 | 14 | from sklearn.preprocessing import LabelEncoder 15 | from sklearn.tree import DecisionTreeClassifier 16 | 17 | 18 | HERE = Path(__file__).parent 19 | 20 | top = cm.get_cmap("Oranges", 128) 21 | bottom = cm.get_cmap("Blues_r", 128) 22 | 23 | colors = np.vstack( 24 | [bottom(np.linspace(0, 1, 128)), top(np.linspace(0, 1, 128))] 25 | ) 26 | blue_orange_cmap = ListedColormap(colors, name="BlueOrange") 27 | 28 | 29 | adult_census = pd.read_csv("../datasets/adult-census.csv") 30 | target_column = "class" 31 | 32 | n_samples_to_plot = 5000 33 | 34 | 35 | def plot_tree_decision_function(tree, X, y, ax=None): 36 | """Plot the different decision rules found by a `DecisionTreeClassifier`. 37 | 38 | Parameters 39 | ---------- 40 | tree : DecisionTreeClassifier instance 41 | The decision tree to inspect. 42 | X : dataframe of shape (n_samples, n_features) 43 | The data used to train the `tree` estimator. 44 | y : ndarray of shape (n_samples,) 45 | The target used to train the `tree` estimator. 46 | ax : matplotlib axis 47 | The matplotlib axis where to plot the different decision rules. 
48 | """ 49 | import numpy as np 50 | 51 | plt.figure(figsize=(12, 10)) 52 | h = 0.02 53 | x_min, x_max = 0, 100 54 | y_min, y_max = 0, 100 55 | xx, yy = np.meshgrid( 56 | np.arange(x_min, x_max, h), np.arange(y_min, y_max, h) 57 | ) 58 | 59 | Z = tree.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] 60 | Z = Z.reshape(xx.shape) 61 | faces = tree.tree_.apply(np.c_[xx.ravel(), yy.ravel()].astype(np.float32)) 62 | faces = faces.reshape(xx.shape) 63 | border = ndimage.laplace(faces) != 0 64 | if ax is None: 65 | ax = plt.gca() 66 | ax.scatter( 67 | X.iloc[:, 0], 68 | X.iloc[:, 1], 69 | c=np.array(["tab:blue", "tab:orange"])[y], 70 | s=60, 71 | alpha=0.7, 72 | vmin=0, 73 | vmax=1, 74 | ) 75 | levels = np.linspace(0, 1, 101) 76 | contours = ax.contourf( 77 | xx, yy, Z, alpha=0.4, levels=levels, cmap=blue_orange_cmap 78 | ) 79 | ax.get_figure().colorbar(contours, ticks=np.linspace(0, 1, 11)) 80 | ax.scatter(xx[border], yy[border], marker=".", s=1) 81 | ax.set_xlabel(X.columns[0]) 82 | ax.set_ylabel(X.columns[1]) 83 | ax.set_xlim([x_min, x_max]) 84 | ax.set_ylim([y_min, y_max]) 85 | sns.despine(offset=10) 86 | plt.savefig(HERE / "simple_decision_tree_adult_census.png") 87 | 88 | 89 | # select a subset of data 90 | data_subset = adult_census[:n_samples_to_plot] 91 | X = data_subset[["age", "hours-per-week"]] 92 | y = LabelEncoder().fit_transform(data_subset[target_column].to_numpy()) 93 | 94 | max_leaf_nodes = 3 95 | tree = DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes, random_state=0) 96 | tree.fit(X, y) 97 | 98 | # plot the decision function learned by the tree 99 | plot_tree_decision_function(tree, X, y) 100 | -------------------------------------------------------------------------------- /figures/plot_splines.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple example of overfit with splines 3 | """ 4 | import numpy as np 5 | from matplotlib import pyplot as plt 6 | import style_figs 7 | 8 | from sklearn 
import datasets, linear_model 9 | 10 | # Load the diabetes dataset 11 | diabetes = datasets.load_diabetes() 12 | 13 | 14 | # Use only one feature 15 | diabetes_X = diabetes.data[:, np.newaxis] 16 | diabetes_X_temp = diabetes_X[:, :, 2] 17 | 18 | # Split the data into training/testing sets 19 | diabetes_X_train = diabetes_X_temp[:-200:3] 20 | diabetes_X_test = diabetes_X_temp[-200:].T 21 | 22 | # Split the targets into training/testing sets 23 | diabetes_y_train = diabetes.target[:-200:3] 24 | diabetes_y_test = diabetes.target[-200:] 25 | 26 | # Sort the data and remove duplicates (for interpolation) 27 | order = np.argsort(diabetes_X_train.ravel()) 28 | X_train = diabetes_X_train.ravel()[order] 29 | y_train = diabetes_y_train[order] 30 | # Avoid duplicates 31 | y_train_ = list() 32 | for this_x in np.unique(X_train): 33 | y_train_.append(np.mean(y_train[X_train == this_x])) 34 | X_train = np.unique(X_train) 35 | 36 | y_train = np.array(y_train_) 37 | 38 | # Create linear regression object 39 | regr = linear_model.LinearRegression() 40 | 41 | # Train the model using the training sets 42 | regr.fit(X_train.reshape((-1, 1)), y_train) 43 | 44 | 45 | plt.figure(1, figsize=(0.8 * 4, 0.8 * 3), facecolor="none") 46 | # Plot with test data 47 | plt.clf() 48 | ax = plt.axes([0.1, 0.1, 0.9, 0.9]) 49 | 50 | plt.scatter(X_train, y_train, color="k", s=9) 51 | 52 | plt.plot( 53 | [-0.08, 0.12], 54 | regr.predict( 55 | [ 56 | [ 57 | -0.08, 58 | ], 59 | [ 60 | 0.12, 61 | ], 62 | ] 63 | ), 64 | linewidth=3, 65 | ) 66 | 67 | plt.axis("tight") 68 | ymin, ymax = plt.ylim() 69 | style_figs.light_axis() 70 | plt.ylabel("y", size=16, weight=600) 71 | plt.xlabel("x", size=16, weight=600) 72 | 73 | plt.savefig("ols_simple.svg", facecolor="none", edgecolor="none") 74 | 75 | plt.scatter(diabetes_X_test, diabetes_y_test, color="C1", s=9) 76 | plt.ylim(ymin, ymax) 77 | plt.xlim(-0.08, 0.12) 78 | 79 | plt.savefig("ols_test.svg", facecolor="none", edgecolor="none") 80 | 81 | 82 | # Plot cubic 
splines 83 | plt.clf() 84 | ax = plt.axes([0.1, 0.1, 0.9, 0.9]) 85 | 86 | from scipy import interpolate 87 | 88 | f = interpolate.interp1d( 89 | X_train, 90 | y_train, 91 | kind="quadratic", 92 | bounds_error=False, 93 | fill_value="extrapolate", 94 | ) 95 | plt.scatter(X_train, y_train, color="k", s=9, zorder=20) 96 | x_spline = np.linspace(-0.08, 0.12, 600) 97 | y_spline = f(x_spline) 98 | plt.plot(x_spline, y_spline, linewidth=3) 99 | 100 | plt.axis("tight") 101 | plt.xlim(-0.08, 0.12) 102 | plt.ylim(ymin, ymax) 103 | 104 | style_figs.light_axis() 105 | 106 | plt.ylabel("y", size=16, weight=600) 107 | plt.xlabel("x", size=16, weight=600) 108 | 109 | 110 | plt.savefig("splines_cubic.svg", facecolor="none", edgecolor="none") 111 | 112 | 113 | plt.scatter(diabetes_X_test, diabetes_y_test, color="C1", s=9) 114 | plt.savefig("splines_test.svg", facecolor="none", edgecolor="none") 115 | 116 | plt.show() 117 | -------------------------------------------------------------------------------- /figures/shufflesplit_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/figures/shufflesplit_diagram.png -------------------------------------------------------------------------------- /figures/simple_decision_tree_adult_census.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/figures/simple_decision_tree_adult_census.png -------------------------------------------------------------------------------- /figures/style_figs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple styling used for matplotlib figures 3 | """ 4 | 5 | from matplotlib import pyplot as plt 6 | 7 | # Configuration settings to help visibility on small screen / prints 8 | 
plt.rcParams["xtick.labelsize"] = 20 9 | plt.rcParams["ytick.labelsize"] = 20 10 | plt.rcParams["figure.titlesize"] = 15 11 | plt.rcParams["font.size"] = 20 12 | plt.rcParams["axes.labelsize"] = 20 13 | plt.rcParams["axes.facecolor"] = "none" 14 | plt.rcParams["legend.fontsize"] = 18 15 | plt.rcParams["lines.linewidth"] = 3 16 | plt.rcParams["figure.figsize"] = [0.8 * 6.4, 0.8 * 4.8] 17 | plt.rcParams["legend.frameon"] = False 18 | plt.rcParams["legend.columnspacing"] = 1.8 19 | plt.rcParams["legend.handlelength"] = 1.5 20 | plt.rcParams["legend.handletextpad"] = 0.5 21 | 22 | 23 | # Utility functions 24 | def light_axis(): 25 | "Hide the top and right spines" 26 | ax = plt.gca() 27 | for s in ("top", "right"): 28 | ax.spines[s].set_visible(False) 29 | plt.xticks(()) 30 | plt.yticks(()) 31 | plt.subplots_adjust(left=0.01, bottom=0.01, top=0.99, right=0.99) 32 | 33 | 34 | def no_axis(): 35 | plt.axis("off") 36 | plt.subplots_adjust(left=0.0, bottom=0.0, top=1, right=1) 37 | -------------------------------------------------------------------------------- /figures/supervised.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/figures/supervised.png -------------------------------------------------------------------------------- /figures/unsupervised.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/figures/unsupervised.png -------------------------------------------------------------------------------- /figures/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/figures/workflow.png -------------------------------------------------------------------------------- 
/jupyter-book/_config.yml: -------------------------------------------------------------------------------- 1 | ####################################################################################### 2 | # Book settings 3 | title : Scikit-learn course 4 | author: scikit-learn developers 5 | logo: 'scikit-learn-logo.png' 6 | copyright: "2022-2024" 7 | 8 | # Information about where the book exists on the web 9 | description: >- 10 | scikit-learn course 11 | exclude_patterns: 12 | - _build 13 | - Thumbs.db 14 | - .DS_Store 15 | - "**.ipynb_checkpoints" 16 | - "figures" 17 | - "datasets" 18 | - "README.md" 19 | 20 | 21 | ####################################################################################### 22 | # Execution settings 23 | execute: 24 | execute_notebooks : cache 25 | timeout : 300 26 | 27 | ####################################################################################### 28 | # Parse and render settings 29 | parse: 30 | myst_enable_extensions: 31 | - colon_fence 32 | - dollarmath 33 | - linkify 34 | - substitution 35 | 36 | ####################################################################################### 37 | # HTML-specific settings 38 | html: 39 | home_page_in_navbar : false 40 | use_edit_page_button : true 41 | use_repository_button : true 42 | use_issues_button : true 43 | favicon: _static/favicon.ico 44 | comments: 45 | hypothesis: true 46 | extra_footer: | 47 |

48 |
49 | Join the full MOOC experience 50 | Get officially certified! 51 |
52 | Brought to you under a CC-BY License by 53 | Inria Learning Lab, 54 | scikit-learn @ La Fondation Inria, 55 | Inria Academy, 56 | probabl, 57 | with many thanks to the scikit-learn community as a whole! 58 |
59 | 60 | # ####################################################################################### 61 | # Interact link settings 62 | notebook_interface : "notebook" 63 | # notebook_interface: "classic" # The interface interactive links will activate ["classic", "jupyterlab"] 64 | 65 | sphinx: 66 | config: 67 | nb_custom_formats: 68 | .py: 69 | - jupytext.reads 70 | - fmt: py:percent 71 | # Needed for plotly rendering: 72 | # https://jupyterbook.org/interactive/interactive.html#plotly 73 | html_js_files: 74 | - https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js 75 | 76 | ####################################################################################### 77 | # Launch button settings 78 | repository: 79 | url : https://github.com/INRIA/scikit-learn-mooc 80 | branch: main 81 | 82 | launch_buttons: 83 | binderhub_url: "https://mybinder.org" 84 | # colab_url: "https://colab.research.google.com" # Not working for now, 85 | # because it needs .ipynb 86 | # Disable thebe support since it does not start in the right folder, see 87 | # https://github.com/INRIA/scikit-learn-mooc/issues/669 for more details 88 | # thebe: true 89 | 90 | binder: 91 | binderhub_url : "https://mybinder.org" 92 | text : "Launch binder" 93 | 94 | 95 | latex: 96 | latex_engine : "xelatex" 97 | latex_documents: 98 | targetname: book.tex 99 | -------------------------------------------------------------------------------- /jupyter-book/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/jupyter-book/_static/favicon.ico -------------------------------------------------------------------------------- /jupyter-book/_static/matomo.js: -------------------------------------------------------------------------------- 1 | var _paq = window._paq = window._paq || []; 2 | /* tracker methods like "setCustomDimension" should be called before 
"trackPageView" */ 3 | _paq.push(['trackPageView']); 4 | _paq.push(['enableLinkTracking']); 5 | (function() { 6 | var u = "https://piwik.inria.fr/"; 7 | _paq.push(['setTrackerUrl', u + 'piwik.php']); 8 | _paq.push(['setSiteId', '127']); 9 | var d = document, 10 | g = d.createElement('script'), 11 | s = d.getElementsByTagName('script')[0]; 12 | g.async = true; 13 | g.src = u + 'piwik.js'; 14 | s.parentNode.insertBefore(g, s); 15 | })(); 16 | -------------------------------------------------------------------------------- /jupyter-book/_static/sklearn_mooc.css: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | Note: the video and slides iframes currently use the same CSS styles but use 4 | different classes to get future-proof flexibility. 5 | */ 6 | 7 | iframe.video { 8 | width: 100%; 9 | aspect-ratio: 4/3; 10 | margin-bottom: 1em; 11 | } 12 | 13 | iframe.slides { 14 | width: 100%; 15 | aspect-ratio: 4/3; 16 | margin-bottom: 1em; 17 | } 18 | 19 | /* 20 | Better highlighting of modules in toc.html, for some reason modules 21 | are aria-level="2" rather than h2 22 | */ 23 | p[aria-level="2"] { 24 | font-size: 1.2em; 25 | margin-top: 2em; 26 | margin-bottom: 0.5em; 27 | font-weight: bold; 28 | } 29 | 30 | /* The adds in the landing page */ 31 | 32 | div.mooc_add { 33 | display: table; 34 | } 35 | 36 | div.mooc_add a { 37 | color: #000000; 38 | display: block; 39 | border-radius: .4em; 40 | background-color: #F7931E; 41 | border: 1px solid #7b5a46; 42 | box-shadow: 1px 1px 1px #CA9875; 43 | padding: 5pt; 44 | } 45 | 46 | @media screen and (min-width: 900px) { 47 | div.mooc_add { 48 | width: 25ex; 49 | position: fixed; 50 | right: calc(5pt + .15 * (100vw - 900px)); 51 | bottom: calc(5pt + max(0pt, .05*(100vh - 200px))); 52 | } 53 | 54 | 55 | div.footer { 56 | max-width: 60vw; 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /jupyter-book/_static/sklearn_mooc.js: 
-------------------------------------------------------------------------------- 1 | (function() { 2 | function inIframe() { 3 | try { 4 | return window.self !== window.top; 5 | } catch (e) { 6 | return true; 7 | } 8 | } 9 | 10 | function contentOnly() { 11 | var urlParams = new URLSearchParams(window.location.search); 12 | return urlParams.get('content_only') !== null; 13 | } 14 | 15 | function removeIfExists(el) { 16 | if (el) { 17 | el.remove(); 18 | }; 19 | } 20 | 21 | function adjustBinderLink() { 22 | // Binder links to .py instead of .ipynb. In an ideal world, there 23 | // would be a way to do it in _config.yml or you could tell Jupyter to 24 | // use the Notebook interface to open the .py but ?factory=Notebook 25 | // does not work on the mybinder.org URL only on the 26 | // hub.2i2c.mybinder.org URL 27 | var elements = document.querySelectorAll('.dropdown-launch-buttons a'); 28 | elements.forEach( 29 | function(el) { 30 | el.href = el.href.replace(/python_scripts\/(.+)\.py/, "notebooks/$1.ipynb"); 31 | } 32 | ); 33 | } 34 | 35 | function displayContentOnly() { 36 | removeIfExists(document.querySelector('#site-navigation')); 37 | removeIfExists(document.querySelector('.topbar')); 38 | removeIfExists(document.querySelector('.footer')); 39 | // the prev/next buttons at the bottom of the page may have a different 40 | // class (depending on the theme version maybe?), removing both to be 41 | // safe. 
42 | removeIfExists(document.querySelector('.prev-next-bottom')); 43 | removeIfExists(document.querySelector('.prev-next-area')); 44 | var elementsToRemove = document.querySelectorAll('.remove-from-content-only'); 45 | elementsToRemove.forEach( 46 | function(el) { 47 | removeIfExists(el); 48 | } 49 | ); 50 | document.querySelector('#main-content').querySelector('.col-md-9').className = 'col-12'; 51 | 52 | var style = document.createElement('style'); 53 | style.appendChild( 54 | document.createTextNode( 55 | 'hypothesis-sidebar, hypothesis-notebook, hypothesis-adder{display:none!important;}')); 56 | document.getElementsByTagName('head')[0].appendChild(style); 57 | } 58 | 59 | document.addEventListener("DOMContentLoaded", function() { 60 | if (inIframe() || contentOnly()) { 61 | displayContentOnly(); 62 | } 63 | adjustBinderLink(); 64 | }); 65 | }()); 66 | -------------------------------------------------------------------------------- /jupyter-book/appendix/acknowledgement.md: -------------------------------------------------------------------------------- 1 | # Acknowledgement 2 | 3 | ## Figure attributions 4 | 5 | The diagram presenting the API design in the module "The predictive modeling 6 | pipeline" used the following figures: 7 | 8 | - The "Parameters Free Icon" is licensed under CC-BY 3.0 - 9 | [source](https://www.onlinewebfonts.com/icon/512285) 10 | - The "Settings Gears SVG Vector" is licensed under CC0 - 11 | [source](https://www.svgrepo.com/svg/57066/settings-gears) 12 | - The "Close icon" is licensed under MIT - 13 | [source](https://www.iconfinder.com/icons/211652/close_icon) 14 | -------------------------------------------------------------------------------- /jupyter-book/appendix/datasets_intro.md: -------------------------------------------------------------------------------- 1 | # Datasets description 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | -------------------------------------------------------------------------------- 
/jupyter-book/appendix/notebook_timings.md: -------------------------------------------------------------------------------- 1 | # Notebook timings 2 | 3 | ```{nb-exec-table} 4 | ``` 5 | -------------------------------------------------------------------------------- /jupyter-book/appendix/toc_redirect.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Table of contents 4 | -------------------------------------------------------------------------------- /jupyter-book/concluding_remarks_video.md: -------------------------------------------------------------------------------- 1 | # 🎥 Concluding remarks 2 | 3 | 6 | -------------------------------------------------------------------------------- /jupyter-book/datasets: -------------------------------------------------------------------------------- 1 | ../datasets -------------------------------------------------------------------------------- /jupyter-book/ensemble/bagging_slides.md: -------------------------------------------------------------------------------- 1 | # 🎥 Intuitions on ensemble models: bagging 2 | 3 | 6 | 7 | 9 | 10 | To navigate in the slides, **first click on the slides**, then: 11 | - press the **arrow keys** to go to the next/previous slide; 12 | - press **"P"** to toggle presenter mode to see the notes; 13 | - press **"F"** to toggle full-screen mode. 14 | -------------------------------------------------------------------------------- /jupyter-book/ensemble/boosting_slides.md: -------------------------------------------------------------------------------- 1 | # 🎥 Intuitions on ensemble models: boosting 2 | 3 | 6 | 7 | 9 | 10 | To navigate in the slides, **first click on the slides**, then: 11 | - press the **arrow keys** to go to the next/previous slide; 12 | - press **"P"** to toggle presenter mode to see the notes; 13 | - press **"F"** to toggle full-screen mode. 
14 | -------------------------------------------------------------------------------- /jupyter-book/ensemble/ensemble_boosting_index.md: -------------------------------------------------------------------------------- 1 | # Ensemble based on boosting 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | -------------------------------------------------------------------------------- /jupyter-book/ensemble/ensemble_bootstrap_index.md: -------------------------------------------------------------------------------- 1 | # Ensemble method using bootstrapping 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | -------------------------------------------------------------------------------- /jupyter-book/ensemble/ensemble_hyperparameters_index.md: -------------------------------------------------------------------------------- 1 | # Hyperparameter tuning with ensemble methods 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | -------------------------------------------------------------------------------- /jupyter-book/ensemble/ensemble_module_intro.md: -------------------------------------------------------------------------------- 1 | # Module overview 2 | 3 | ## What you will learn 4 | 5 | 6 | 7 | This module will go into details regarding algorithms that are combining 8 | several models together, also called ensemble of models. We will present two 9 | families of such techniques: (i) based on bootstrapping and (ii) based 10 | on boosting. We will present bagging and random forest that belong to the 11 | former strategy and AdaBoost and gradient boosting decision tree that belong 12 | to the latter strategy. Finally, we will go into details regarding the 13 | hyperparameters allowing to tune these models and compare them between models. 
14 | 15 | ## Before getting started 16 | 17 | 18 | 19 | The required technical skills to carry on this module are: 20 | 21 | - skills acquired during the "The Predictive Modeling Pipeline" module with 22 | basic usage of scikit-learn; 23 | - skills acquired during the "Selecting The Best Model" module, mainly around 24 | the concept of underfit/overfit and the usage of cross-validation in 25 | scikit-learn; 26 | - skills acquired during the modules "Linear Models" and 27 | "Decision Tree Models". 28 | 29 | 30 | 31 | ## Objectives and time schedule 32 | 33 | 34 | 35 | The objective in the module are the following: 36 | 37 | - understanding the principles behind bootstrapping and boosting; 38 | - get intuitions with specific models such as random forest 39 | and gradient boosting; 40 | - identify the important hyperparameters of random forest and gradient boosting 41 | decision trees as well as their typical values. 42 | 43 | 44 | 45 | The estimated time to go through this module is about 6 hours. 46 | -------------------------------------------------------------------------------- /jupyter-book/ensemble/ensemble_module_take_away.md: -------------------------------------------------------------------------------- 1 | # Main take-away 2 | 3 | ## Wrap-up 4 | 5 | 6 | 7 | So in this module, we discussed ensemble learners which are a type of 8 | learner that combines simpler learners together. We saw two strategies: 9 | 10 | - one based on bootstrap samples allowing learners to be fit in a parallel 11 | manner; 12 | - the other called boosting which fit learners sequentially. 13 | 14 | From these two families, we mainly focused on giving intuitions regarding the 15 | internal machinery of the random forest and gradient-boosting models which 16 | are state-of-the-art methods. 
17 | 18 | ## To go further 19 | 20 | 21 | 22 | You can refer to the following scikit-learn examples which are related to 23 | the concepts approached in this module: 24 | 25 | - [Early-stopping in gradient-boosting](https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_early_stopping.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-early-stopping-py) 26 | - [Combining predictors using stacking](https://scikit-learn.org/stable/auto_examples/ensemble/plot_stack_predictors.html#sphx-glr-auto-examples-ensemble-plot-stack-predictors-py) 27 | -------------------------------------------------------------------------------- /jupyter-book/ensemble/ensemble_quiz_m6_01.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz M6.01 2 | 3 | ```{admonition} Question 4 | By default, a 5 | [`BaggingClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html) 6 | or [`BaggingRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html) 7 | draw: 8 | 9 | - a) random samples with replacement over training points 10 | - b) random samples with replacement over features 11 | - c) random samples without replacement over training points 12 | - d) random samples without replacement over features 13 | 14 | _Select all answers that apply_ 15 | 16 | Hint: it is possible to access the documentation for those classes by 17 | clicking on the links on their names. 
18 | ``` 19 | 20 | +++ 21 | 22 | ```{admonition} Question 23 | In a 24 | [`BaggingClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html) 25 | or [`BaggingRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html), 26 | the parameter `base_estimator` can be: 27 | 28 | - a) any predictor 29 | - b) a decision tree predictor 30 | - c) a linear model predictor 31 | 32 | _Select a single answer_ 33 | ``` 34 | 35 | +++ 36 | 37 | ```{admonition} Question 38 | 39 | In the context of a classification problem, what are the differences between a 40 | bagging classifier and a random forest classifier: 41 | 42 | - a) in a random forest, the base model is always a decision tree 43 | - b) in a random forest, the split threshold values are decided completely at 44 | random 45 | - c) in a random forest, a random resampling is performed both over features 46 | as well as over samples 47 | 48 | _Select all answers that apply_ 49 | ``` 50 | -------------------------------------------------------------------------------- /jupyter-book/ensemble/ensemble_quiz_m6_02.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz M6.02 2 | 3 | ```{admonition} Question 4 | Select the correct statements: 5 | 6 | - a) Both bagging and boosting combine several predictors 7 | - b) Both bagging and boosting are based on decision trees 8 | - c) Boosting combines predictors sequentially 9 | - d) Bagging combines predictors simultaneously 10 | 11 | _Select all answers that apply_ 12 | ``` 13 | 14 | +++ 15 | 16 | ```{admonition} Question 17 | Boosting algorithms learn their predictor: 18 | 19 | - a) by training predictors in parallel on slightly different datasets 20 | - b) by training predictors sequentially which correct previous prediction errors 21 | - c) by taking a linear combination of weak predictors 22 | 23 | _Select all answers that apply_ 24 | ``` 25 | 26 | +++ 27 | 
28 | ```{admonition} Question 29 | Histogram gradient boosting is an accelerated gradient boosting algorithm that: 30 | 31 | - a) takes a subsample of the original samples 32 | - b) bins the numerical features 33 | - c) takes a subsample of the original features 34 | 35 | _Select a single answer_ 36 | ``` 37 | 38 | +++ 39 | 40 | ```{admonition} Question 41 | Boosting tends to overfit when increasing the number of predictors: 42 | 43 | - a) true 44 | - b) false 45 | 46 | _Select a single answer_ 47 | ``` 48 | -------------------------------------------------------------------------------- /jupyter-book/ensemble/ensemble_quiz_m6_03.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz M6.03 2 | 3 | ```{admonition} Question 4 | When compared to random forests, gradient boosting is usually trained using: 5 | 6 | - a) shallower trees 7 | - b) deeper trees 8 | - c) a subset of features 9 | - d) all features 10 | 11 | _Select all answers that apply_ 12 | ``` 13 | 14 | +++ 15 | 16 | ```{admonition} Question 17 | Which of the hyperparameter(s) do not exist in random forest but exists in gradient boosting: 18 | 19 | - a) number of estimators 20 | - b) maximum depth 21 | - c) learning rate 22 | 23 | _Select all answers that apply_ 24 | ``` 25 | 26 | +++ 27 | 28 | ```{admonition} Question 29 | Which of the following options are correct about the benefits of ensemble models? 
30 | 31 | - a) Better generalization performance 32 | - b) Reduced sensitivity to hyperparameter tuning of individual predictors 33 | - c) Better interpretability 34 | 35 | _Select all answers that apply_ 36 | ``` 37 | -------------------------------------------------------------------------------- /jupyter-book/evaluation/cross_validation_baseline_index.md: -------------------------------------------------------------------------------- 1 | # Comparing a model with simple baselines 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | -------------------------------------------------------------------------------- /jupyter-book/evaluation/cross_validation_choices_index.md: -------------------------------------------------------------------------------- 1 | # Choice of cross-validation 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | -------------------------------------------------------------------------------- /jupyter-book/evaluation/cross_validation_nested_index.md: -------------------------------------------------------------------------------- 1 | # Nested cross-validation 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | -------------------------------------------------------------------------------- /jupyter-book/evaluation/evaluation_module_intro.md: -------------------------------------------------------------------------------- 1 | # Module overview 2 | 3 | ## What you will learn 4 | 5 | 6 | 7 | In the previous module, we presented the general cross-validation framework 8 | and used it to evaluate models' performance. However, this is important to 9 | keep in mind that some elements in the cross-validation need to be decided 10 | depending on the nature of the problem: (i) the cross-validation strategy and 11 | (ii) the evaluation metrics. Besides, it is always good to compare the models' 12 | performance with some baseline model. 13 | 14 | In this module, we present both aspects and give insights on when to use a 15 | specific cross-validation strategy and a metric. 
In addition, we will also 16 | give some insights regarding how to compare a model with some baseline. 17 | 18 | ## Before getting started 19 | 20 | 21 | 22 | The required technical skills to carry on this module are: 23 | 24 | - skills acquired during the "The Predictive Modeling Pipeline" module with 25 | basic usage of scikit-learn; 26 | - skills acquired during the "Selecting The Best Model" module, mainly around 27 | the concept of underfit/overfit and the usage of cross-validation in 28 | scikit-learn. 29 | 30 | 31 | 32 | ## Objectives and time schedule 33 | 34 | 35 | 36 | The objective in the module are the following: 37 | 38 | - understand the necessity of using an appropriate cross-validation strategy 39 | depending on the data; 40 | - get the intuitions behind comparing a model with some basic models that 41 | can be used as baseline; 42 | - understand the principles behind using nested cross-validation when the model 43 | needs to be evaluated as well as optimized; 44 | - understand the differences between regression and classification metrics; 45 | - understand the differences between metrics. 46 | 47 | 48 | 49 | The estimated time to go through this module is about 6 hours. 50 | -------------------------------------------------------------------------------- /jupyter-book/evaluation/evaluation_module_take_away.md: -------------------------------------------------------------------------------- 1 | # Main take-away 2 | 3 | ## Wrap-up 4 | 5 | 6 | 7 | In this notebook, we presented the framework used in machine-learning to 8 | evaluate a predictive model's performance: the cross-validation. 9 | 10 | Besides, we presented several splitting strategies that can be used in the 11 | general cross-validation framework. These strategies should be used wisely 12 | when encountering some specific patterns or types of data. 13 | 14 | Finally, we show how to perform nested cross-validation to select an optimal 15 | model and evaluate its generalization performance. 
16 | 17 | ## To go further 18 | 19 | 20 | 21 | You can refer to the following scikit-learn examples which are related to 22 | the concepts approached in this module: 23 | 24 | - [Comparison of cross-validation strategies](https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py) 25 | -------------------------------------------------------------------------------- /jupyter-book/evaluation/evaluation_quiz_m7_01.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz M7.01 2 | 3 | ```{admonition} Question 4 | What is the benefit of using cross-validation? 5 | 6 | - a) Give information about performance variability 7 | - b) Remove the need to use a baseline algorithm 8 | - c) Give information regarding under- or over-fitting of a model 9 | 10 | _Select all answers that apply_ 11 | ``` 12 | 13 | +++ 14 | 15 | ```{admonition} Question 16 | Does a dummy classifier or regressor rely on the input feature values in 17 | the input data `X` to make the predictions? 18 | 19 | - a) Yes 20 | - b) No 21 | 22 | _Select a single answer_ 23 | ``` 24 | 25 | +++ 26 | 27 | ```{admonition} Question 28 | Does a dummy classifier from scikit-learn always make constant predictions 29 | whatever the chosen strategy? 30 | 31 | - a) Yes 32 | - b) No 33 | 34 | _Select a single answer_ 35 | ``` 36 | -------------------------------------------------------------------------------- /jupyter-book/evaluation/evaluation_quiz_m7_02.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz M7.02 2 | 3 | ```{admonition} Question 4 | We have a dataset with patient records from 10 different hospitals, and our goal 5 | is to predict whether a patient has a disease or not. Let's also suppose that 6 | the classes ("disease" and "no-disease") are imbalanced. 
Additionally, we suspect 7 | that each hospital's data may have systematic biases due to factors like 8 | medical devices, policies, socioeconomic status of the patients, etc. 9 | 10 | Which cross-validation strategy is the most suitable for assessing the model's 11 | ability to make good predictions on patients from hospitals not seen during 12 | training? 13 | 14 | - a) Group stratified k-fold cross-validation 15 | - b) Group k-fold 16 | - c) Stratified k-fold cross-validation 17 | - d) Leave-one-out cross-validation 18 | 19 | _Select a single answer_ 20 | ``` 21 | -------------------------------------------------------------------------------- /jupyter-book/evaluation/evaluation_quiz_m7_03.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz M7.03 2 | 3 | ```{admonition} Question 4 | How to evaluate and tune the hyperparameters of a model? 5 | 6 | - a) Fit the model on the train set, set the parameters using the test set, and 7 | evaluate the model on the same test set 8 | - b) Fit the model on the train set, set the parameters using a validation set, 9 | and evaluate the model on the test set 10 | - c) use a nested cross-validation, with an inner cross-validation to tune the 11 | parameters of the model and an outer cross-validation to evaluate the model's 12 | performance 13 | 14 | _Select all answers that apply_ 15 | ``` 16 | -------------------------------------------------------------------------------- /jupyter-book/evaluation/evaluation_quiz_m7_04.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz M7.04 2 | 3 | ```{admonition} Question 4 | What is the default score in scikit-learn when using a classifier? 
5 | 6 | - a) balanced accuracy 7 | - b) ROC-AUC 8 | - c) accuracy 9 | 10 | _Select a single answer_ 11 | ``` 12 | 13 | +++ 14 | 15 | ```{admonition} Question 16 | Other than the decision threshold, metrics such as recall and precision also 17 | depend on the regularization parameters. Assuming that class "1" (in red) is the 18 | positive class, use the following figures to select which statements are true in 19 | this particular logistic regression model: 20 | 21 | ![Precision-recall C=3e-3](../../figures/evaluation_quiz_precision_recall_C0.003.svg) 22 | ![Precision-recall C=1](../../figures/evaluation_quiz_precision_recall_C1.svg) 23 | 24 | - a) stronger regularization leads to higher precision 25 | - b) stronger regularization leads to lower precision 26 | - c) stronger regularization leads to higher recall 27 | - d) stronger regularization leads to lower recall 28 | 29 | _Select all answers that apply_ 30 | ``` 31 | -------------------------------------------------------------------------------- /jupyter-book/evaluation/evaluation_quiz_m7_05.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz M7.05 2 | 3 | ```{admonition} Question 4 | What is the default score in scikit-learn when using a regressor? 
5 | 6 | - a) $R^2$ 7 | - b) mean absolute error 8 | - c) median absolute error 9 | 10 | _Select a single answer_ 11 | ``` 12 | 13 | +++ 14 | 15 | ```{admonition} Question 16 | If we observe that the values returned by 17 | `cross_val_scores(model, X, y, scoring="r2")` increase after changing the model 18 | parameters, it means that the latest model: 19 | 20 | - a) generalizes better 21 | - b) generalizes worse 22 | 23 | _Select a single answer_ 24 | ``` 25 | 26 | +++ 27 | 28 | ```{admonition} Question 29 | If all the values returned by 30 | `cross_val_score(model_A, X, y, scoring="neg_mean_squared_error")` 31 | are strictly lower than those returned by 32 | `cross_val_score(model_B, X, y, scoring="neg_mean_squared_error")` 33 | it means that `model_B` generalizes: 34 | 35 | - a) better than `model_A` 36 | - b) worse than `model_A` 37 | 38 | Hint: Remember that `"neg_mean_squared_error"` is an alias for the negative of 39 | the Mean Squared Error. 40 | 41 | _Select a single answer_ 42 | ``` 43 | 44 | +++ 45 | 46 | ```{admonition} Question 47 | Values returned by `cross_val_scores(model, X, y, scoring="neg_mean_squared_error")` 48 | are: 49 | 50 | - a) guaranteed to be positive or zero 51 | - b) guaranteed to be negative or zero 52 | - c) can be either positive or negative depending on the data 53 | 54 | _Select a single answer_ 55 | ``` 56 | -------------------------------------------------------------------------------- /jupyter-book/evaluation/metrics_classification_index.md: -------------------------------------------------------------------------------- 1 | # Classification metrics 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | -------------------------------------------------------------------------------- /jupyter-book/evaluation/metrics_regression_index.md: -------------------------------------------------------------------------------- 1 | # Regression metrics 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | 
-------------------------------------------------------------------------------- /jupyter-book/feature_selection/feature_selection_limitation_index.md: -------------------------------------------------------------------------------- 1 | # Caveats of feature selection 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | -------------------------------------------------------------------------------- /jupyter-book/feature_selection/feature_selection_module_intro.md: -------------------------------------------------------------------------------- 1 | # Module overview 2 | 3 | ## What you will learn 4 | 5 | 6 | 7 | This module gives some insights regarding feature selection. Besides motivating 8 | the benefit of using feature selection, we also illustrate some of the known 9 | caveats. 10 | 11 | ## Before getting started 12 | 13 | 14 | 15 | The required technical skills to carry on this module are: 16 | 17 | - skills acquired during the "The Predictive Modeling Pipeline" module with 18 | basic usage of scikit-learn; 19 | - skills acquired during the "Selecting The Best Model" module, mainly around 20 | the concept of underfit/overfit and the usage of cross-validation in 21 | scikit-learn. 22 | 23 | 24 | 25 | ## Objectives and time schedule 26 | 27 | 28 | 29 | The objective in the module are the following: 30 | 31 | - understand in which case feature selection is beneficial; 32 | - be aware of the caveats and how to put into practice feature selection 33 | techniques. 34 | 35 | 36 | 37 | The estimated time to go through this module is about 50 minutes. 38 | -------------------------------------------------------------------------------- /jupyter-book/feature_selection/feature_selection_module_take_away.md: -------------------------------------------------------------------------------- 1 | # Main take-away 2 | 3 | ## Wrap-up 4 | 5 | 6 | 7 | In this module, we presented the principle of feature selection. 
In short, 8 | feature selection is not a magical tool to get marginal gains. We tackle 9 | the following aspects: 10 | 11 | - you should use feature selection to speed-up training and testing rather 12 | than seeking for marginal performance gains; 13 | - you should be careful regarding the framework and how to include a feature 14 | selector within your pipeline; 15 | - you should be aware of the limitation of a feature selector based on 16 | machine-learning models. 17 | 18 | ## To go further 19 | 20 | 21 | 22 | You can refer to the following scikit-learn examples which are related to 23 | the concepts approached during this module: 24 | 25 | - [Recursive feature selection using cross-validation](https://scikit-learn.org/stable/auto_examples/feature_selection/plot_rfe_with_cross_validation.html#sphx-glr-auto-examples-feature-selection-plot-rfe-with-cross-validation-py) 26 | -------------------------------------------------------------------------------- /jupyter-book/feature_selection/feature_selection_quiz.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz 2 | 3 | ```{admonition} Question 4 | What is the main advantage of using feature selection? 
5 | 6 | - a) speeding-up the training of an algorithm 7 | - b) fine tuning the model's performance 8 | - c) remove noisy features 9 | 10 | _Select a single answer_ 11 | ``` 12 | 13 | +++ 14 | 15 | ```{admonition} Question 16 | When selecting features, the decision should be made using: 17 | 18 | - a) the entire dataset 19 | - b) the training set 20 | - c) the testing set 21 | 22 | _Select a single answer_ 23 | ``` 24 | -------------------------------------------------------------------------------- /jupyter-book/figures: -------------------------------------------------------------------------------- 1 | ../figures -------------------------------------------------------------------------------- /jupyter-book/interpretation/interpretation_quiz.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz 2 | 3 | ```{admonition} Question 4 | With the same dataset, feature importance might differ if: 5 | 6 | - a) we use two different models 7 | - b) we use two different train/test splits with the same model 8 | - c) we use the same model with a different set of hyper-parameters 9 | - d) we use the same model with the same set of hyper-parameters but a different 10 | random_state 11 | ``` 12 | 13 | +++ 14 | 15 | ```{admonition} Question 16 | In linear models, the feature importance: 17 | 18 | - a) might be inferred from the coefficients 19 | - b) might be inferred by `importance_permutation` 20 | - c) needs regularization to infer the importance 21 | - d) is a built-in attribute 22 | ``` 23 | 24 | +++ 25 | 26 | ```{admonition} Question 27 | If two features are the same (thus correlated) 28 | 29 | - a) their feature importance will be the same 30 | - b) their feature importance will be divided by 2 31 | - c) only one will receive all the feature importance, the second one will be 0 32 | - d) it depends 33 | ``` 34 | 35 | +++ 36 | 37 | ```{admonition} Question 38 | The feature importance provided by the scikit-learn random forest: 39 | 40 | - a) has 
a bias for categorical features 41 | - b) has a bias for continuous (high cardinality) features 42 | - c) is independent from the train/test split 43 | - d) is independent from the hyper-parameters 44 | ``` 45 | 46 | +++ 47 | 48 | ```{admonition} Question 49 | To evaluate the feature importance for a specific model, one could: 50 | 51 | - a) drop a column and compare the score 52 | - b) shuffle a column and compare the score 53 | - c) set all values of a column to 0 and compare the score 54 | - d) change a column's values to random numbers and compare the score 55 | ``` 56 | -------------------------------------------------------------------------------- /jupyter-book/linear_models/linear_models_intuitions_index.md: -------------------------------------------------------------------------------- 1 | # Intuitions on linear models 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | -------------------------------------------------------------------------------- /jupyter-book/linear_models/linear_models_module_intro.md: -------------------------------------------------------------------------------- 1 | # Module overview 2 | 3 | ## What you will learn 4 | 5 | 6 | 7 | In this module, we will go further into details regarding models that use 8 | linear parametrization. 9 | We will see how to use this family of models for both classification and 10 | regression problems. Besides, we will explain how to fight over-fitting using 11 | regularization. 12 | Finally, we will show how linear models can be used with 13 | data presenting non-linearity. 14 | 15 | ## Before getting started 16 | 17 | 18 | 19 | The required technical skills to carry on this module are: 20 | 21 | - skills acquired during the "The Predictive Modeling Pipeline" module with 22 | basic usage of scikit-learn; 23 | - skills acquired during the "Selecting The Best Model" module, mainly around 24 | the concept of underfit/overfit and the usage of cross-validation in 25 | scikit-learn. 
26 | 27 | 28 | 29 | ## Objectives and time schedule 30 | 31 | 32 | 33 | In this module, your objectives are to: 34 | 35 | - understand the linear models parametrization; 36 | - understand the implication of linear models in both 37 | regression and classification; 38 | - get intuitions of linear models applied in higher dimensional dataset; 39 | - understand the effect of regularization and how to set it; 40 | - understand how linear models can be used even with data showing non-linear 41 | relationship with the target to be predicted. 42 | 43 | 44 | 45 | The estimated time to go through this module is about 6 hours. 46 | -------------------------------------------------------------------------------- /jupyter-book/linear_models/linear_models_module_take_away.md: -------------------------------------------------------------------------------- 1 | # Main take-away 2 | 3 | ## Wrap-up 4 | 5 | 6 | 7 | In this module, we saw that: 8 | 9 | - the predictions of a linear model depend on a weighted sum of the values of 10 | the input features added to an intercept parameter; 11 | - fitting a linear model consists in adjusting both the weight coefficients and 12 | the intercept to minimize the prediction errors on the training set; 13 | - to train linear models successfully it is often required to scale the input 14 | features approximately to the same dynamic range; 15 | - regularization can be used to reduce over-fitting: weight coefficients are 16 | constrained to stay small when fitting; 17 | - the regularization hyperparameter needs to be fine-tuned by cross-validation 18 | for each new machine learning problem and dataset; 19 | - linear models can be used on problems where the target variable is not 20 | linearly related to the input features but this requires extra feature 21 | engineering work to transform the data in order to avoid under-fitting. 
22 | 23 | ## To go further 24 | 25 | 26 | 27 | You can refer to the following scikit-learn examples which are related to 28 | the concepts approached during this module: 29 | 30 | - [Example of linear regression](https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html#sphx-glr-auto-examples-linear-model-plot-ols-py) 31 | - [Comparison between a linear regression and a ridge regressor](https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols_ridge_variance.html#sphx-glr-auto-examples-linear-model-plot-ols-ridge-variance-py) 32 | -------------------------------------------------------------------------------- /jupyter-book/linear_models/linear_models_non_linear_index.md: -------------------------------------------------------------------------------- 1 | # Non-linear feature engineering for linear models 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | -------------------------------------------------------------------------------- /jupyter-book/linear_models/linear_models_quiz_m4_01.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz M4.01 2 | 3 | ```{admonition} Question 4 | What is a linear regression? 5 | 6 | - a) a model that outputs a continuous prediction as the sum of the values of a 7 | **limited** subset of the input features 8 | - b) a model that outputs a binary prediction based on a linear combination 9 | of the values of the input features 10 | - c) a model that outputs a continuous prediction as a weighted sum of the input 11 | features 12 | 13 | _Select a single answer_ 14 | ``` 15 | 16 | +++ 17 | 18 | ```{admonition} Question 19 | Is it possible to get a perfect fit (zero prediction error on the training set) 20 | with a linear classifier **by itself** on a non-linearly separable dataset? 
21 | 22 | - a) yes 23 | - b) no 24 | 25 | _Select a single answer_ 26 | ``` 27 | 28 | +++ 29 | 30 | ```{admonition} Question 31 | If we fit a linear regression where `X` is a single column vector, how many 32 | parameters our model will be made of? 33 | 34 | - a) 1 35 | - b) 2 36 | - c) 3 37 | 38 | _Select a single answer_ 39 | ``` 40 | 41 | +++ 42 | 43 | ```{admonition} Question 44 | If we train a scikit-learn `LinearRegression` with `X` being a single column 45 | vector and `y` a vector, `coef_` and `intercept_` will be respectively: 46 | 47 | - a) an array of shape (1, 1) and a number 48 | - b) an array of shape (1,) and an array of shape (1,) 49 | - c) an array of shape (1, 1) and an array of shape (1,) 50 | - d) an array of shape (1,) and a number 51 | 52 | _Select a single answer_ 53 | ``` 54 | 55 | +++ 56 | 57 | ```{admonition} Question 58 | The decision boundaries of a logistic regression model: 59 | 60 | - a) split classes using only one of the input features 61 | - b) split classes using a combination of the input features 62 | - c) often have curved shapes 63 | 64 | _Select a single answer_ 65 | ``` 66 | 67 | +++ 68 | 69 | ```{admonition} Question 70 | For a binary classification task, what is the shape of the array returned by the 71 | `predict_proba` method for 10 input samples? 72 | 73 | - a) (10,) 74 | - b) (10, 2) 75 | - c) (2, 10) 76 | 77 | _Select a single answer_ 78 | ``` 79 | 80 | +++ 81 | 82 | ```{admonition} Question 83 | In logistic regression's `predict_proba` method in scikit-learn, which of the 84 | following statements is true regarding the predicted probabilities? 85 | 86 | - a) The sum of probabilities across different classes for a given sample is always equal to 1.0. 87 | - b) The sum of probabilities across all samples for a given class is always equal to 1.0. 88 | - c) The sum of probabilities across all features for a given class is always equal to 1.0. 
89 | 90 | _Select a single answer_ 91 | ``` 92 | -------------------------------------------------------------------------------- /jupyter-book/linear_models/linear_models_quiz_m4_02.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz M4.02 2 | 3 | ```{admonition} Question 4 | 5 | Let us consider a pipeline that combines a polynomial feature extraction of 6 | degree 2 and a linear regression model. Let us assume that the linear regression 7 | coefficients are all non-zero and that the dataset contains a single feature. 8 | Is the prediction function of this pipeline a straight line? 9 | 10 | - a) yes 11 | - b) no 12 | 13 | _Select a single answer_ 14 | ``` 15 | 16 | +++ 17 | 18 | ```{admonition} Question 19 | Fitting a linear regression where `X` has `n_features` columns and the target 20 | is a single continuous vector, what is the respective type/shape of `coef_` 21 | and `intercept_`? 22 | 23 | - a) it is not possible to fit a linear regression in dimension higher than 2 24 | - b) array of shape (`n_features`,) and a float 25 | - c) array of shape (1, `n_features`) and an array of shape (1,) 26 | 27 | _Select a single answer_ 28 | ``` 29 | 30 | +++ 31 | 32 | ```{admonition} Question 33 | Combining (one or more) feature engineering transformers in a single pipeline: 34 | 35 | - a) increases the expressivity of the model 36 | - b) ensures that models extrapolate accurately regardless of the distribution of the data 37 | - c) may require tuning additional hyperparameters 38 | - d) inherently prevents any underfitting 39 | 40 | _Select all answers that apply_ 41 | ``` 42 | -------------------------------------------------------------------------------- /jupyter-book/linear_models/linear_models_quiz_m4_03.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz M4.03 2 | 3 | ```{admonition} Question 4 | Which of the following estimators can solve linear regression problems? 
5 | 6 | - a) sklearn.linear_model.LinearRegression 7 | - b) sklearn.linear_model.LogisticRegression 8 | - c) sklearn.linear_model.Ridge 9 | 10 | _Select all answers that apply_ 11 | ``` 12 | 13 | +++ 14 | 15 | ```{admonition} Question 16 | Regularization allows: 17 | 18 | - a) to create a model robust to outliers (samples that differ widely from 19 | other observations) 20 | - b) to reduce overfitting by forcing the weights to stay close to zero 21 | - c) to reduce underfitting by making the problem linearly separable 22 | 23 | _Select a single answer_ 24 | ``` 25 | 26 | +++ 27 | 28 | ```{admonition} Question 29 | A ridge model is: 30 | 31 | - a) the same as linear regression with penalized weights 32 | - b) the same as logistic regression with penalized weights 33 | - c) a linear model 34 | - d) a non linear model 35 | 36 | _Select all answers that apply_ 37 | ``` 38 | 39 | +++ 40 | 41 | ```{admonition} Question 42 | Assume that a data scientist has prepared a train/test split and plans to use 43 | the test for the final evaluation of a `Ridge` model. 
The parameter `alpha` of 44 | the `Ridge` model: 45 | 46 | - a) is internally tuned when calling `fit` on the train set 47 | - b) should be tuned by running cross-validation on a **train set** 48 | - c) should be tuned by running cross-validation on a **test set** 49 | - d) must be a positive number 50 | 51 | _Select all answers that apply_ 52 | ``` 53 | 54 | +++ 55 | 56 | ```{admonition} Question 57 | Scaling the data before fitting a model: 58 | 59 | - a) is often useful for regularized linear models 60 | - b) is always necessary for regularized linear models 61 | - c) may speed-up fitting 62 | - d) has no impact on the optimal choice of the value of a regularization parameter 63 | 64 | _Select all answers that apply_ 65 | ``` 66 | 67 | +++ 68 | 69 | ```{admonition} Question 70 | The effect of increasing the regularization strength in a ridge model is to: 71 | 72 | - a) shrink all weights towards zero 73 | - b) make all weights equal 74 | - c) set a subset of the weights to exactly zero 75 | - d) constrain all the weights to be positive 76 | 77 | _Select all answers that apply_ 78 | ``` 79 | 80 | +++ 81 | 82 | ```{admonition} Question 83 | By default, a [`LogisticRegression`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) in scikit-learn applies: 84 | 85 | - a) no penalty 86 | - b) a penalty that shrinks the magnitude of the weights towards zero (also called "l2 penalty") 87 | - c) a penalty that ensures all weights are equal 88 | 89 | _Select a single answer_ 90 | ``` 91 | 92 | +++ 93 | 94 | ```{admonition} Question 95 | The parameter `C` in a logistic regression is: 96 | 97 | - a) similar to the parameter `alpha` in a ridge regressor 98 | - b) similar to `1 / alpha` where `alpha` is the parameter of a ridge regressor 99 | - c) not controlling the regularization 100 | 101 | _Select a single answer_ 102 | ``` 103 | 104 | +++ 105 | 106 | ```{admonition} Question 107 | In logistic regression, increasing the 
regularization strength (by 108 | decreasing the value of `C`) makes the model: 109 | 110 | - a) more likely to overfit to the training data 111 | - b) more confident: the values returned by `predict_proba` are closer to 0 or 1 112 | - c) less complex, potentially underfitting the training data 113 | 114 | _Select a single answer_ 115 | ``` 116 | -------------------------------------------------------------------------------- /jupyter-book/linear_models/linear_models_regularization_index.md: -------------------------------------------------------------------------------- 1 | # Regularization in linear model 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | -------------------------------------------------------------------------------- /jupyter-book/linear_models/linear_models_slides.md: -------------------------------------------------------------------------------- 1 | # 🎥 Intuitions on linear models 2 | 3 | 6 | 7 | 9 | 10 | To navigate in the slides, **first click on the slides**, then: 11 | - press the **arrow keys** to go to the next/previous slide; 12 | - press **"P"** to toggle presenter mode to see the notes; 13 | - press **"F"** to toggle full-screen mode. 14 | -------------------------------------------------------------------------------- /jupyter-book/linear_models/regularized_linear_models_slides.md: -------------------------------------------------------------------------------- 1 | # 🎥 Intuitions on regularized linear models 2 | 3 | 6 | 7 | 9 | 10 | To navigate in the slides, **first click on the slides**, then: 11 | - press the **arrow keys** to go to the next/previous slide; 12 | - press **"P"** to toggle presenter mode to see the notes; 13 | - press **"F"** to toggle full-screen mode. 
14 | -------------------------------------------------------------------------------- /jupyter-book/ml_concepts/quiz_intro_01.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz Intro.01 2 | 3 | Given a case study: pricing apartments based on a real estate website. We have 4 | thousands of house descriptions with their price. Typically, an example of a 5 | house description is the following: 6 | 7 | "Great for entertaining: spacious, updated 2 bedroom, 1 bathroom apartment in 8 | Lakeview, 97630. The house will be available from May 1st. Close to nightlife 9 | with private backyard. Price ~$1,000,000." 10 | 11 | We are interested in predicting house prices from their description. One 12 | potential use case for this would be, as a buyer, to find houses that are cheap 13 | compared to their market value. 14 | 15 | ```{admonition} Question 16 | What kind of problem is it? 17 | 18 | - a) a supervised problem 19 | - b) an unsupervised problem 20 | - c) a classification problem 21 | - d) a regression problem 22 | 23 | _Select all answers that apply_ 24 | ``` 25 | 26 | +++ 27 | 28 | ```{admonition} Question 29 | What are the features? 30 | 31 | - a) the number of rooms might be a feature 32 | - b) the post code of the house might be a feature 33 | - c) the price of the house might be a feature 34 | 35 | _Select all answers that apply_ 36 | ``` 37 | 38 | +++ 39 | 40 | ```{admonition} Question 41 | What is the target variable? 42 | 43 | - a) the full text description is the target 44 | - b) the price of the house is the target 45 | - c) only house description with no price mentioned are the target 46 | 47 | _Select a single answer_ 48 | ``` 49 | 50 | +++ 51 | 52 | ```{admonition} Question 53 | What is a record (a sample)? 
54 | 55 | - a) each house description is a record 56 | - b) each house price is a record 57 | - c) each kind of description (as the house size) is a record 58 | 59 | _Select a single answer_ 60 | ``` 61 | -------------------------------------------------------------------------------- /jupyter-book/ml_concepts/slides.md: -------------------------------------------------------------------------------- 1 | # 🎥 Introducing machine-learning concepts 2 | 3 | This presentation will teach you the basic concepts: what is machine learning, 4 | the type of sub-problems that it covers, the vocabulary and the general 5 | pipeline. 6 | 7 | 10 | 11 | 13 | 14 | To navigate in the slides, **first click on the slides**, then: 15 | - press the **arrow keys** to go to the next/previous slide; 16 | - press **"P"** to toggle presenter mode to see the notes; 17 | - press **"F"** to toggle full-screen mode. 18 | -------------------------------------------------------------------------------- /jupyter-book/overfit/bias_vs_variance_quiz_m2_03.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz M2.03 2 | 3 | ```{admonition} Question 4 | Fitting a model with a high bias: 5 | 6 | - a) causes an underfitted model? 7 | - b) causes an overfitted model? 8 | - c) increases the sensitivity of the learned prediction function to a random resampling of the training set observations? 9 | - d) causes the learned prediction function to make systematic errors? 10 | 11 | _Select all answers that apply_ 12 | ``` 13 | 14 | +++ 15 | 16 | ```{admonition} Question 17 | Fitting a high variance model: 18 | 19 | - a) causes an underfitted model? 20 | - b) causes an overfitted model? 21 | - c) increases the sensitivity of the learned prediction function to a random resampling of the training set observations? 22 | - d) causes the learned prediction function to make systematic errors? 
23 | 24 | _Select all answers that apply_ 25 | ``` 26 | -------------------------------------------------------------------------------- /jupyter-book/overfit/bias_vs_variance_slides.md: -------------------------------------------------------------------------------- 1 | # 🎥 Bias versus Variance 2 | 3 | 6 | 7 | 9 | 10 | To navigate in the slides, **first click on the slides**, then: 11 | - press the **arrow keys** to go to the next/previous slide; 12 | - press **"P"** to toggle presenter mode to see the notes; 13 | - press **"F"** to toggle full-screen mode. 14 | -------------------------------------------------------------------------------- /jupyter-book/overfit/learning_validation_curves_quiz_m2_02.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz M2.02 2 | 3 | ```{admonition} Question 4 | A model is overfitting when: 5 | 6 | - a) both the train and test errors are high 7 | - b) train error is low but test error is high 8 | - c) train error is high but the test error is low 9 | - d) both train and test errors are low 10 | 11 | _Select a single answer_ 12 | ``` 13 | 14 | +++ 15 | 16 | ```{admonition} Question 17 | Assuming that we have a dataset with little noise, a model is underfitting when: 18 | 19 | - a) both the train and test errors are high 20 | - b) train error is low but test error is high 21 | - c) train error is high but the test error is low 22 | - d) both train and test errors are low 23 | 24 | _Select a single answer_ 25 | ``` 26 | 27 | +++ 28 | 29 | ```{admonition} Question 30 | For a fixed training set, by sequentially adding parameters to give more 31 | flexibility to the model, we are more likely to observe: 32 | 33 | - a) a wider difference between train and test errors 34 | - b) a reduction in the difference between train and test errors 35 | - c) an increased or steady train error 36 | - d) a decrease in the train error 37 | 38 | _Select all answers that apply_ 39 | ``` 40 | 41 | +++ 42 | 43 | 
```{admonition} Question 44 | For a fixed choice of model parameters, if we increase the number of labeled 45 | observations in the training set, are we more likely to observe: 46 | 47 | - a) a wider difference between train and test errors 48 | - b) a reduction in the difference between train and test errors 49 | - c) an increased or steady train error 50 | - d) a decrease in the train error 51 | 52 | _Select all answers that apply_ 53 | ``` 54 | 55 | +++ 56 | 57 | ```{admonition} Question 58 | Polynomial models with a high degree parameter: 59 | 60 | - a) always have the best test error (but can be slow to train) 61 | - b) underfit more than linear regression models 62 | - c) get lower training error than lower degree polynomial models 63 | - d) are more likely to overfit than lower degree polynomial models 64 | 65 | _Select all answers that apply_ 66 | ``` 67 | 68 | +++ 69 | 70 | ```{admonition} Question 71 | If we chose the parameters of a model to get the best overfitting/underfitting 72 | tradeoff, we will always get a zero test error. 73 | 74 | - a) True 75 | - b) False 76 | 77 | _Select a single answer_ 78 | ``` 79 | -------------------------------------------------------------------------------- /jupyter-book/overfit/learning_validation_curves_slides.md: -------------------------------------------------------------------------------- 1 | # 🎥 Comparing train and test errors 2 | 3 | 6 | 7 | 9 | 10 | To navigate in the slides, **first click on the slides**, then: 11 | - press the **arrow keys** to go to the next/previous slide; 12 | - press **"P"** to toggle presenter mode to see the notes; 13 | - press **"F"** to toggle full-screen mode. 
14 | -------------------------------------------------------------------------------- /jupyter-book/overfit/overfit_bias_variance_index.md: -------------------------------------------------------------------------------- 1 | # Bias versus variance trade-off 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | -------------------------------------------------------------------------------- /jupyter-book/overfit/overfit_module_intro.md: -------------------------------------------------------------------------------- 1 | # Module overview 2 | 3 | ## What you will learn 4 | 5 | 6 | 7 | This module gives an intuitive introduction to the very **fundamental 8 | concepts** of overfitting and underfitting in machine learning. 9 | 10 | Machine learning models can never make perfect predictions: the test error is 11 | never exactly zero. This failure comes from a **fundamental trade-off** between 12 | **modeling flexibility** and the **limited size of the training dataset**. 13 | 14 | The first presentation will define those problems and characterize how and why 15 | they arise. 16 | 17 | Then we will present a methodology to quantify those problems by **contrasting 18 | the train error with the test error** for various choice of the model family, 19 | model parameters. More importantly, we will emphasize the **impact of the size 20 | of the training set on this trade-off**. 21 | 22 | Finally we will relate overfitting and underfitting to the concepts of 23 | statistical variance and bias. 24 | 25 | ## Before getting started 26 | 27 | 28 | 29 | The required technical skills to carry on this module are: 30 | 31 | - skills acquired during the "The Predictive Modeling Pipeline" module with 32 | basic usage of scikit-learn. 
33 | 34 | 35 | 36 | ## Objectives and time schedule 37 | 38 | 39 | 40 | The objectives of this module are the following: 41 | 42 | - understand the concept of overfitting and underfitting; 43 | - understand the concept of generalization; 44 | - understand the general cross-validation framework used to evaluate a model. 45 | 46 | 47 | 48 | The estimated time to go through this module is about 3 hours. 49 | -------------------------------------------------------------------------------- /jupyter-book/overfit/overfit_overfitting_underfitting_index.md: -------------------------------------------------------------------------------- 1 | # Overfitting and underfitting 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | -------------------------------------------------------------------------------- /jupyter-book/overfit/overfit_take_away.md: -------------------------------------------------------------------------------- 1 | # Main take-away 2 | 3 | ## Wrap-up 4 | 5 | - **Overfitting** is caused by the **limited size of the training set**, the 6 | **noise** in the data, and the **high flexibility** of common machine learning 7 | models. 8 | 9 | - **Underfitting** happens when the learnt prediction functions suffer from 10 | **systematic errors**. This can be caused by a choice of model family and 11 | parameters, which leads to a **lack of flexibility** to capture the repeatable 12 | structure of the true data generating process. 13 | 14 | - For a fixed training set, the objective is to **minimize the test error** by 15 | adjusting the model family and its parameters to find the 16 | **best trade-off between overfitting and underfitting**. 17 | 18 | - For a given choice of model family and parameters, **increasing the 19 | training set size will decrease overfitting** but can also cause an increase 20 | of underfitting. 
21 | 22 | - The test error of a model that is neither overfitting nor underfitting can 23 | still be high if the variations of the target variable cannot be fully 24 | determined by the input features. This irreducible error is caused by what we 25 | sometimes call label noise. In practice, this often happens when we do not 26 | have access to important features for one reason or another. 27 | 28 | ## To go further 29 | 30 | It is possible to give a precise mathematical treatment of the bias and the 31 | variance of a regression model. The Wikipedia article on the [Bias-variance 32 | tradeoff](https://en.wikipedia.org/wiki/Bias%E2%80%93variance_tradeoff) explains 33 | how the **squared test error can be decomposed as the sum of the squared bias, 34 | the variance and the irreducible error** for a given regression model. 35 | 36 | The next chapters on linear models, decision trees and ensembles will give 37 | concrete examples on how to diagnose and how to tackle overfitting and 38 | underfitting. 
39 | 40 | You can refer to the following scikit-learn examples which are related to 41 | the concepts approached during this module: 42 | 43 | - [Illustration of underfitting and overfitting concepts](https://scikit-learn.org/stable/auto_examples/model_selection/plot_underfitting_overfitting.html#sphx-glr-auto-examples-model-selection-plot-underfitting-overfitting-py) 44 | - [Difference between train and test scores](https://scikit-learn.org/stable/auto_examples/model_selection/plot_train_error_vs_test_error.html#sphx-glr-auto-examples-model-selection-plot-train-error-vs-test-error-py) 45 | - [Example of a validation curve](https://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html#sphx-glr-auto-examples-model-selection-plot-validation-curve-py) 46 | -------------------------------------------------------------------------------- /jupyter-book/overfit/overfit_validation_learning_curves_index.md: -------------------------------------------------------------------------------- 1 | # Validation and learning curves 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | -------------------------------------------------------------------------------- /jupyter-book/overfit/overfitting_vs_under_fitting_quiz_m2_01.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz M2.01 2 | 3 | ```{admonition} Question 4 | A model that is underfitting: 5 | 6 | - a) is too complex and thus highly flexible 7 | - b) is too constrained and thus limited by its expressivity 8 | - c) often makes prediction errors, even on training samples 9 | - d) focuses too much on noisy details of the training set 10 | 11 | _Select all answers that apply_ 12 | ``` 13 | 14 | +++ 15 | 16 | ```{admonition} Question 17 | A model that is overfitting: 18 | 19 | - a) is too complex and thus highly flexible 20 | - b) is too constrained and thus limited by its expressivity 21 | - c) often makes prediction errors, even on training samples 22 | - d) 
focuses too much on noisy details of the training set 23 | 24 | _Select all answers that apply_ 25 | ``` 26 | -------------------------------------------------------------------------------- /jupyter-book/overfit/overfitting_vs_under_fitting_slides.md: -------------------------------------------------------------------------------- 1 | # 🎥 Overfitting and Underfitting 2 | 3 | 6 | 7 | 9 | 10 | To navigate in the slides, **first click on the slides**, then: 11 | - press the **arrow keys** to go to the next/previous slide; 12 | - press **"P"** to toggle presenter mode to see the notes; 13 | - press **"F"** to toggle full-screen mode. 14 | -------------------------------------------------------------------------------- /jupyter-book/predictive_modeling_pipeline/01_tabular_data_exploration_index.md: -------------------------------------------------------------------------------- 1 | # Tabular data exploration 2 | 3 | ```{tableofcontents} 4 | ``` 5 | -------------------------------------------------------------------------------- /jupyter-book/predictive_modeling_pipeline/01_tabular_data_exploration_quiz_m1_01.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz M1.01 2 | 3 | ```{admonition} Question 4 | In the notebook "First look at our dataset", we used pandas and specifically 5 | `adult_census = pd.read_csv("../datasets/adult-census.csv")` to: 6 | 7 | - a) load a comma-separated values file 8 | - b) load a dataset already included in the pandas package 9 | - c) load a file only containing the survey features 10 | - d) load a file only containing the target of our classification problem: 11 | whether or not a person has a low or high income salary 12 | - e) load a file containing both the features and the target for our classification 13 | problem 14 | 15 | _Select all answers that apply_ 16 | ``` 17 | 18 | +++ 19 | 20 | ```{admonition} Question 21 | 22 | In the previous notebook, we used: 23 | 24 | - a) pandas to gain 
insights about the dataset 25 | - b) pandas and seaborn to visually inspect the dataset 26 | - c) numpy and scipy to perform numerical inspection (for instance using 27 | `scipy.optimize.minimize`) 28 | - d) scikit-learn to fit some machine learning models 29 | 30 | _Select all answers that apply_ 31 | ``` 32 | 33 | +++ 34 | 35 | ```{admonition} Question 36 | How is a tabular dataset organized? 37 | 38 | - a) a column represents a sample and a row represents a feature 39 | - b) a column represents a feature and a row represents a sample 40 | - c) the target variable is represented by a row 41 | - d) the target variable is represented by a column 42 | 43 | _Select all answers that apply_ 44 | ``` 45 | 46 | +++ 47 | 48 | ```{admonition} Question 49 | A categorical variable is: 50 | 51 | - a) a variable with **only two** different possible values 52 | - b) a variable with continuous numerical values 53 | - c) a variable with a finite set of possible values 54 | 55 | _Select a single answer_ 56 | ``` 57 | -------------------------------------------------------------------------------- /jupyter-book/predictive_modeling_pipeline/02_numerical_pipeline_index.md: -------------------------------------------------------------------------------- 1 | # Fitting a scikit-learn model on numerical data 2 | 3 | ```{tableofcontents} 4 | ``` 5 | -------------------------------------------------------------------------------- /jupyter-book/predictive_modeling_pipeline/02_numerical_pipeline_quiz_m1_02.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz M1.02 2 | 3 | ```{admonition} Question 4 | Why do we need two sets: a train set and a test set? 
5 | 6 | - a) to train the model faster 7 | - b) to validate the model on unseen data 8 | - c) to improve the accuracy of the model 9 | 10 | _Select all answers that apply_ 11 | ``` 12 | 13 | +++ 14 | 15 | ```{admonition} Question 16 | The generalization performance of a scikit-learn model can be evaluated by: 17 | 18 | - a) calling `fit` to train the model on the **training set**, `predict` on the 19 | **test set** to get the predictions, and compute the score by passing the 20 | predictions and the true target values to some metric function 21 | - b) calling `fit` to train the model on the **training set** and `score` to compute 22 | the score on the **test set** 23 | - c) calling `cross_validate` by passing the model, the data and the target 24 | - d) calling `fit_transform` on the data and then `score` to compute 25 | the score on the **test set** 26 | 27 | _Select all answers that apply_ 28 | ``` 29 | 30 | +++ 31 | 32 | ```{admonition} Question 33 | When calling `cross_validate(estimator, X, y, cv=5)`, the following happens: 34 | 35 | - a) `X` and `y` are internally split five times with non-overlapping test sets 36 | - b) `estimator.fit` is called 5 times on the full `X` and `y` 37 | - c) `estimator.fit` is called 5 times, each time on a different training set 38 | - d) a Python dictionary is returned containing a key/value containing a NumPy 39 | array with 5 scores computed on the **train sets** 40 | - e) a Python dictionary is returned containing a key/value containing a NumPy 41 | array with 5 scores computed on the **test sets** 42 | 43 | _Select all answers that apply_ 44 | ``` 45 | 46 | +++ 47 | 48 | We define a 2-dimensional dataset represented graphically as follows: 49 | 50 | ![Original dataset](../../figures/numerical_pipeline_quiz_scaler_original.png) 51 | 52 | ```{admonition} Question 53 | If we process the dataset using a `StandardScaler` with the default parameters, 54 | which of the following results do you expect: 55 | 56 | ![Preprocessed 
datasets](../../figures/numerical_pipeline_quiz_scaler_preprocessing.png) 57 | 58 | - a) Preprocessing A 59 | - b) Preprocessing B 60 | - c) Preprocessing C 61 | - d) Preprocessing D 62 | 63 | _Select a single answer_ 64 | ``` 65 | 66 | +++ 67 | 68 | ```{admonition} Question 69 | Look at the plots and the answers of the previous question. A `StandardScaler` 70 | transformer with the default parameter: 71 | 72 | - a) transforms the features so that they have similar ranges 73 | - b) transforms the features to lie in the [0.0, 1.0] range 74 | - c) transforms feature values that were originally positive-only into values that can 75 | be negative or positive 76 | - d) can help logistic regression converge faster (fewer iterations) 77 | 78 | _Select all answers that apply_ 79 | ``` 80 | 81 | +++ 82 | 83 | ```{admonition} Question 84 | Cross-validation allows us to: 85 | 86 | - a) train the model faster 87 | - b) measure the generalization performance of the model 88 | - c) estimate the variability of the generalization score 89 | 90 | _Select all answers that apply_ 91 | ``` 92 | 93 | +++ 94 | 95 | ```{admonition} Question 96 | `make_pipeline` (as well as `Pipeline`): 97 | 98 | - a) runs a cross-validation using the transformers and predictor given as 99 | parameters 100 | - b) combines one or several transformers and a predictor 101 | - c) tries several models at the same time 102 | - d) plots feature histogram automatically 103 | 104 | _Select all answers that apply_ 105 | ``` 106 | -------------------------------------------------------------------------------- /jupyter-book/predictive_modeling_pipeline/02_numerical_pipeline_video_cross_validation.md: -------------------------------------------------------------------------------- 1 | # 🎥 Validation of a model 2 | 3 | 6 | 7 | 9 | 10 | To navigate in the slides, **first click on the slides**, then: 11 | - press the **arrow keys** to go to the next/previous slide; 12 | - press **"P"** to toggle presenter mode to see 
the notes; 13 | - press **"F"** to toggle full-screen mode. 14 | -------------------------------------------------------------------------------- /jupyter-book/predictive_modeling_pipeline/03_categorical_pipeline_index.md: -------------------------------------------------------------------------------- 1 | # Handling categorical data 2 | 3 | ```{tableofcontents} 4 | ``` 5 | -------------------------------------------------------------------------------- /jupyter-book/predictive_modeling_pipeline/03_categorical_pipeline_quiz_m1_03.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz M1.03 2 | 3 | ```{admonition} Question 4 | How are categorical variables represented? 5 | 6 | - a) categorical feature is only represented by non-numerical data 7 | - b) categorical feature represents a finite number of values called categories 8 | - c) categorical feature can either be represented by numerical or non-numerical values 9 | 10 | _Select all answers that apply_ 11 | ``` 12 | 13 | +++ 14 | 15 | ```{admonition} Question 16 | An ordinal variable: 17 | 18 | - a) is a categorical variable with a large number of different categories; 19 | - b) can be represented by integers or string labels; 20 | - c) is a categorical variable with a meaningful order. 21 | 22 | _Select all answers that apply_ 23 | ``` 24 | 25 | +++ 26 | 27 | ```{admonition} Question 28 | One-hot encoding: 29 | 30 | - a) encodes each column with string-labeled values into a single integer-coded column 31 | - b) transforms a numerical variable into a categorical variable 32 | - c) creates one additional column for each possible category 33 | - d) transforms string-labeled variables using a numerical representation 34 | 35 | _Select all answers that apply_ 36 | ``` 37 | 38 | +++ 39 | 40 | ```{admonition} Question 41 | 42 | Assume we have a dataset where each line describes a company. 
Which of the 43 | following columns should be considered as a meaningful **numerical feature** to 44 | train a machine learning model to classify companies: 45 | 46 | - a) the sector of activity ("construction", "retail", "energy", "insurance"...) 47 | - b) the phone number of the sales department 48 | - c) the number of employees 49 | - d) the profits of the last quarter 50 | - e) the post code of the head quarters 51 | 52 | _Select all answers that apply_ 53 | ``` 54 | -------------------------------------------------------------------------------- /jupyter-book/predictive_modeling_pipeline/03_categorical_pipeline_visualization_video.md: -------------------------------------------------------------------------------- 1 | # 🎥 Visualizing scikit-learn pipelines in Jupyter 2 | 3 | 6 | -------------------------------------------------------------------------------- /jupyter-book/predictive_modeling_pipeline/predictive_modeling_module_intro.md: -------------------------------------------------------------------------------- 1 | # Module overview 2 | 3 | ## What you will learn 4 | 5 | 6 | 7 | This module will give an example of a typical predictive modeling pipeline 8 | developed using tabular data (data that can be structured in a 2-dimensional 9 | table). We will present this pipeline in a progressive way. First, we will make 10 | an analysis of the dataset used. Subsequently, we will train our first 11 | predictive pipeline with a subset of the dataset. Then, we will give particular 12 | attention to the type of data, numerical and categorical, that our model has to 13 | handle. Finally, we will extend our pipeline to use mixed types of data, i.e. 14 | numerical and categorical data. 
15 | 16 | ## Before getting started 17 | 18 | 19 | 20 | The required technical skills to carry on this module are: 21 | 22 | - basic knowledge of Python programming 23 | - some prior experience with the NumPy, pandas and Matplotlib libraries is 24 | recommended but not required 25 | 26 | 27 | 28 | For a quick introduction on these requirements, you can use the following resources: 29 | - [Introduction to Python](https://scipy-lectures.org/intro/language/python_language.html) 30 | - [Introduction to NumPy](https://sebastianraschka.com/blog/2020/numpy-intro.html) 31 | - [Introduction to Pandas](https://pandas.pydata.org/docs/user_guide/10min.html) 32 | - [Introduction to Matplotlib](https://sebastianraschka.com/blog/2020/numpy-intro.html#410-matplotlib) 33 | 34 | ## Objectives and time schedule 35 | 36 | 37 | 38 | The objectives of this module are the following: 39 | 40 | - build intuitions regarding an unknown dataset; 41 | - identify and differentiate numerical and categorical features; 42 | - create an advanced predictive pipeline with scikit-learn. 43 | 44 | 45 | 46 | The estimated time to go through this module is about 6 hours. 47 | -------------------------------------------------------------------------------- /jupyter-book/predictive_modeling_pipeline/predictive_modeling_module_take_away.md: -------------------------------------------------------------------------------- 1 | # Main take-away 2 | 3 | ## Wrap-up 4 | 5 | 6 | 7 | In this module, you learned: 8 | 9 | - to create a scikit-learn predictive model; 10 | - about the scikit-learn API to train and test a predictive model; 11 | - to process numerical data, notably using a `Pipeline`; 12 | - to process categorical data, notably using a `OneHotEncoder` and an 13 | `OrdinalEncoder`; 14 | - to handle and process mixed data types (i.e. numerical and 15 | categorical data), notably using a `ColumnTransformer`. 
16 | 17 | ## To go further 18 | 19 | 20 | 21 | You can refer to the following scikit-learn examples which are related to 22 | the concepts approached during this module: 23 | 24 | - [Predictive machine learning pipeline with mixed data types](https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#sphx-glr-auto-examples-compose-plot-column-transformer-mixed-types-py) 25 | - [Importance of feature scaling](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html#sphx-glr-auto-examples-preprocessing-plot-scaling-importance-py) 26 | -------------------------------------------------------------------------------- /jupyter-book/python_scripts: -------------------------------------------------------------------------------- 1 | ../python_scripts -------------------------------------------------------------------------------- /jupyter-book/scikit-learn-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/jupyter-book/scikit-learn-logo.png -------------------------------------------------------------------------------- /jupyter-book/toc.md: -------------------------------------------------------------------------------- 1 | # Table of contents 2 | 3 | ```{tableofcontents} 4 | ``` 5 | -------------------------------------------------------------------------------- /jupyter-book/trees/slides.md: -------------------------------------------------------------------------------- 1 | # 🎥 Intuitions on tree-based models 2 | 3 | 6 | 7 | 9 | 10 | To navigate in the slides, **first click on the slides**, then: 11 | - press the **arrow keys** to go to the next/previous slide; 12 | - press **"P"** to toggle presenter mode to see the notes; 13 | - press **"F"** to toggle full-screen mode. 
14 | -------------------------------------------------------------------------------- /jupyter-book/trees/trees_classification_index.md: -------------------------------------------------------------------------------- 1 | # Decision tree in classification 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | -------------------------------------------------------------------------------- /jupyter-book/trees/trees_hyperparameters_index.md: -------------------------------------------------------------------------------- 1 | # Hyperparameters of decision tree 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | -------------------------------------------------------------------------------- /jupyter-book/trees/trees_intuitions_index.md: -------------------------------------------------------------------------------- 1 | # Intuitions on tree-based models 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | -------------------------------------------------------------------------------- /jupyter-book/trees/trees_module_intro.md: -------------------------------------------------------------------------------- 1 | # Module overview 2 | 3 | ## What you will learn 4 | 5 | 6 | 7 | This module will present in details decision tree models. This model will be 8 | explained in both classification and regression problems. Besides, we will 9 | show which hyperparameters of the decision tree have an importance on their 10 | performance, allowing to find the best trade-off between under- and over-fit. 11 | 12 | ## Before getting started 13 | 14 | 15 | 16 | The required technical skills to carry on this module are: 17 | 18 | - skills acquired during the "The Predictive Modeling Pipeline" module with 19 | basic usage of scikit-learn; 20 | - skills acquired during the "Selecting The Best Model" module, mainly around 21 | the concept of underfit/overfit and the usage of cross-validation in 22 | scikit-learn. 
23 | 24 | 25 | 26 | ## Objectives and time schedule 27 | 28 | 29 | 30 | The objectives of this module are the following: 31 | 32 | - understand how decision trees work in classification and regression; 33 | - check which tree parameters are important and their influence. 34 | 35 | 36 | 37 | The estimated time to go through this module is about 3 hours. 38 | -------------------------------------------------------------------------------- /jupyter-book/trees/trees_module_take_away.md: -------------------------------------------------------------------------------- 1 | # Main take-away 2 | 3 | ## Wrap-up 4 | 5 | 6 | 7 | In this module, we presented decision trees in detail. We saw that they: 8 | 9 | - are suited for both regression and classification problems; 10 | - are non-parametric models; 11 | - are not able to extrapolate; 12 | - are sensitive to hyperparameter tuning. 13 | 14 | ## To go further 15 | 16 | 17 | 18 | You can refer to the following scikit-learn examples which are related to 19 | the concepts approached during this module: 20 | 21 | - [Example of decision tree regressor](https://scikit-learn.org/stable/auto_examples/tree/plot_tree_regression.html#sphx-glr-auto-examples-tree-plot-tree-regression-py) 22 | - [Example of decision tree classifier](https://scikit-learn.org/stable/auto_examples/tree/plot_iris_dtc.html#sphx-glr-auto-examples-tree-plot-iris-dtc-py) 23 | - [Understanding the tree structure in scikit-learn](https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html#sphx-glr-auto-examples-tree-plot-unveil-tree-structure-py) 24 | - [Post-pruning decision trees](https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py) 25 | -------------------------------------------------------------------------------- /jupyter-book/trees/trees_quiz_m5_01.md: -------------------------------------------------------------------------------- 1 | # 
✅ Quiz M5.01 2 | 3 | ```{admonition} Question 4 | From the presentation given in the video, for which kind of supervised learning 5 | tasks decision trees can be applied to: 6 | 7 | - a) classification tasks 8 | - b) regression tasks 9 | - c) clustering tasks 10 | 11 | _Select all answers that apply_ 12 | ``` 13 | 14 | +++ 15 | 16 | ```{admonition} Question 17 | A given split node in a decision tree classifier makes: 18 | 19 | - a) a binary decision considering a single feature at a time 20 | - b) a binary decision considering a combination of all the input features 21 | - c) multiple binary decisions considering a single feature 22 | - d) a binary decision considering a non-linear combination of all input 23 | features 24 | 25 | _Select a single answer_ 26 | ``` 27 | 28 | +++ 29 | 30 | ```{admonition} Question 31 | Which aspect of the decision tree learning procedure is most typically used to 32 | control the underfitting/overfitting trade-off? 33 | 34 | - a) The number of children of a split node 35 | - b) The magnitude of the weight coefficients 36 | - c) The maximum depth of the decision tree 37 | 38 | _Select a single answer_ 39 | ``` 40 | -------------------------------------------------------------------------------- /jupyter-book/trees/trees_quiz_m5_02.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz M5.02 2 | 3 | ```{admonition} Question 4 | For a decision tree built in scikit-learn, a split: 5 | 6 | - a) will use a single feature to create a rule 7 | - b) will use a combination of the features to create a rule 8 | - c) will create multiple separations, one for each class 9 | 10 | _Select a single answer_ 11 | ``` 12 | 13 | +++ 14 | 15 | ```{admonition} Question 16 | Trees are built incrementally: 17 | 18 | - a) by splitting data over and over 19 | - b) by refining the rules of each node 20 | - c) by refining the rules of each leaf 21 | 22 | _Select a single answer_ 23 | ``` 24 | 25 | +++ 26 | 27 | 
```{admonition} Question 28 | A decision tree split is built: 29 | 30 | - a) using a random threshold 31 | - b) using the median value of a single feature as a threshold 32 | - c) using a threshold that minimizes an error 33 | 34 | _Select all answers that apply_ 35 | ``` 36 | -------------------------------------------------------------------------------- /jupyter-book/trees/trees_quiz_m5_03.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz M5.03 2 | 3 | ```{admonition} Question 4 | When fitting a decision tree regressor in scikit-learn, the predicted values on 5 | a leaf correspond to: 6 | 7 | - a) the median of the training samples at this node 8 | - b) the mean of the training samples at this node 9 | - c) the most frequent value of the training samples at this node 10 | 11 | _Select a single answer_ 12 | ``` 13 | 14 | +++ 15 | 16 | ```{admonition} Question 17 | Decision tree regressors can predict: 18 | 19 | - a) any values, including values larger or smaller than those observed in `y_train`; 20 | - b) only values in the range from `np.min(y_train)` to `np.max(y_train)`. 21 | 22 | _Select a single answer_ 23 | ``` 24 | 25 | +++ 26 | 27 | ```{admonition} Question 28 | The predictions of a tree regressor correspond to: 29 | 30 | - a) a piecewise-linear function 31 | - b) a piecewise-constant function 32 | - c) a piecewise-cubic function 33 | 34 | _Select a single answer_ 35 | ``` 36 | -------------------------------------------------------------------------------- /jupyter-book/trees/trees_quiz_m5_04.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz M5.04 2 | 3 | ```{admonition} Question 4 | If a decision tree is overfitting, you need to increase the maximum depth. 5 | 6 | - a) True 7 | - b) False 8 | 9 | _Select a single answer_ 10 | ``` 11 | 12 | +++ 13 | 14 | ```{admonition} Question 15 | How should you choose the maximum depth of a decision tree? 
16 | 17 | - a) choosing the depth maximizing the score on a validation set with a 18 | cross-validation, with a grid-search for instance 19 | - b) choosing the depth maximizing the score on the train set 20 | - c) choosing the depth maximizing the score on the test set 21 | 22 | _Select all answers that apply_ 23 | ``` 24 | -------------------------------------------------------------------------------- /jupyter-book/trees/trees_regression_index.md: -------------------------------------------------------------------------------- 1 | # Decision tree in regression 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | -------------------------------------------------------------------------------- /jupyter-book/tuning/parameter_tuning_automated_index.md: -------------------------------------------------------------------------------- 1 | # Automated tuning 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | -------------------------------------------------------------------------------- /jupyter-book/tuning/parameter_tuning_manual_index.md: -------------------------------------------------------------------------------- 1 | # Manual tuning 2 | 3 | ```{tableofcontents} 4 | 5 | ``` 6 | -------------------------------------------------------------------------------- /jupyter-book/tuning/parameter_tuning_manual_quiz_m3_01.md: -------------------------------------------------------------------------------- 1 | # ✅ Quiz M3.01 2 | 3 | ```{admonition} Question 4 | Which parameters below are hyperparameters of `HistGradientBoostingClassifier`? 5 | Remember we only consider hyperparameters to be those that potentially impact 6 | the result of the learning procedure and subsequent predictions. 
7 | 8 | - a) `C` 9 | - b) `max_leaf_nodes` 10 | - c) `verbose` 11 | - d) `classes_` 12 | - e) `learning_rate` 13 | 14 | _Select all answers that apply_ 15 | ``` 16 | 17 | +++ 18 | 19 | ````{admonition} Question 20 | Given an instance named `model` as defined by: 21 | ```python 22 | from sklearn.linear_model import LogisticRegression 23 | model = LogisticRegression() 24 | ``` 25 | 26 | how do you get the value of the `C` parameter? 27 | - a) `model.get_parameters()['C']` 28 | - b) `model.get_params()['C']` 29 | - c) `model.get_params('C')` 30 | - d) `model.get_params['C']` 31 | 32 | _Select a single answer_ 33 | ```` 34 | 35 | +++ 36 | 37 | ````{admonition} Question 38 | Given `model` defined by: 39 | ```python 40 | from sklearn.linear_model import LogisticRegression 41 | 42 | model = LogisticRegression() 43 | ``` 44 | 45 | how do you set the value of the `C` parameter to `5`? 46 | - a) `model.set_params('C', 5)` 47 | - b) `model.set_params({'C': 5})` 48 | - c) `model.set_params()['C'] = 5` 49 | - d) `model.set_params(C=5)` 50 | 51 | _Select a single answer_ 52 | ```` 53 | 54 | +++ 55 | 56 | ````{admonition} Question 57 | Given `model` defined by: 58 | ```python 59 | from sklearn.preprocessing import StandardScaler 60 | from sklearn.linear_model import LogisticRegression 61 | from sklearn.pipeline import Pipeline 62 | 63 | model = Pipeline([ 64 | ('scaler', StandardScaler()), 65 | ('classifier', LogisticRegression()) 66 | ]) 67 | ``` 68 | 69 | how do you set the value of the `C` parameter of the `LogisticRegression` component to 5: 70 | - a) `model.set_params(C=5) ` 71 | - b) `model.set_params(logisticregression__C=5)` 72 | - c) `model.set_params(classifier__C=5) ` 73 | - d) `model.set_params(classifier--C=5)` 74 | 75 | _Select a single answer_ 76 | ```` 77 | -------------------------------------------------------------------------------- /jupyter-book/tuning/parameter_tuning_module_intro.md: 
-------------------------------------------------------------------------------- 1 | # Module overview 2 | 3 | ## What you will learn 4 | 5 | 6 | 7 | In the previous modules, we showed how to create, train, predict, and even 8 | evaluate a predictive model. However, we did not change the models' 9 | parameters that can be given when creating an instance. For example, 10 | for k-nearest neighbors, we initially used this default parameter: 11 | `n_neighbors=5` before trying other model parameters. 12 | 13 | These parameters are called **hyperparameters**: they are parameters 14 | used to control the learning process, for instance the parameter `k` 15 | of the k-nearest neighbors. Hyperparameters are specified by the user, 16 | often manually tuned (or by an exhaustive automatic search), and 17 | cannot be estimated from the data. They should not be confused with 18 | the other parameters that are inferred during the training 19 | process. These parameters define the model itself, for instance 20 | `coef_` for the linear models. 21 | 22 | In this module, we will first show that the hyperparameters have an impact on 23 | the performance of the model and that default values are not necessarily the 24 | best option. Subsequently, we will show how to set hyperparameters in 25 | scikit-learn model. Finally, we will show strategies allowing to pick-up a 26 | combination of hyperparameters that maximizes model's performance. 27 | 28 | ## Before getting started 29 | 30 | 31 | 32 | The required technical skills to carry on this module are: 33 | 34 | - skills acquired during the "The Predictive Modeling Pipeline" with basic 35 | usage of scikit-learn; 36 | - skills related to using the cross-validation framework to evaluate a model. 
37 | 38 | 39 | 40 | ## Objectives and time schedule 41 | 42 | 43 | 44 | The objectives of this module are the following: 45 | 46 | - understand what a model hyperparameter is; 47 | - understand how to get and set the value of a hyperparameter in a scikit-learn 48 | model; 49 | - be able to fine-tune a full predictive modeling pipeline; 50 | - understand and visualize the combination of parameters that improves the 51 | performance of a model. 52 | 53 | 54 | 55 | The estimated time to go through this module is about 3 hours. 56 | -------------------------------------------------------------------------------- /jupyter-book/tuning/parameter_tuning_module_take_away.md: -------------------------------------------------------------------------------- 1 | # Main take-away 2 | 3 | ## Wrap-up 4 | 5 | 6 | 7 | - Hyperparameters have an impact on the models' performance and should be 8 | wisely chosen; 9 | - The search for the best hyperparameters can be automated with a grid-search 10 | approach or a randomized search approach; 11 | - A grid-search can be computationally expensive and becomes less attractive as 12 | the number of hyperparameters to explore increases. Moreover, the combinations 13 | are sampled on a fixed, regular grid. 14 | - A randomized-search allows exploring within a fixed budget, even as the number 15 | of hyperparameters increases. In this case, combinations can be sampled either 16 | on a regular grid or from a given distribution. 
17 | 18 | ## To go further 19 | 20 | 21 | 22 | You can refer to the following scikit-learn examples which are related to 23 | the concepts approached during this module: 24 | 25 | - [Example of a grid-search](https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py) 26 | - [Example of a randomized-search](https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html#sphx-glr-auto-examples-model-selection-plot-randomized-search-py) 27 | - [Example of a nested cross-validation](https://scikit-learn.org/stable/auto_examples/model_selection/plot_nested_cross_validation_iris.html#sphx-glr-auto-examples-model-selection-plot-nested-cross-validation-iris-py) 28 | -------------------------------------------------------------------------------- /jupyter-book/tuning/parameter_tuning_parallel_plot_video.md: -------------------------------------------------------------------------------- 1 | # 🎥 Analysis of hyperparameter search results 2 | 3 | 6 | -------------------------------------------------------------------------------- /local-install-instructions.md: -------------------------------------------------------------------------------- 1 | # Local install instructions 2 | 3 | The course uses Python 3 and some data analysis packages such as Numpy, Pandas, 4 | scikit-learn, and matplotlib. 5 | 6 | ## Install Miniconda 7 | 8 | **This step is only necessary if you don't have conda installed already**: 9 | 10 | - download the Miniconda installer for your operating system (Windows, MacOSX 11 | or Linux) [here](https://docs.conda.io/en/latest/miniconda.html) 12 | - run the installer following the instructions 13 | [here](https://conda.io/projects/conda/en/latest/user-guide/install/index.html#regular-installation) 14 | depending on your operating system. 
15 | 16 | ## Create conda environment 17 | 18 | ```sh 19 | # Clone this repo 20 | git clone https://github.com/INRIA/scikit-learn-mooc 21 | cd scikit-learn-mooc 22 | # Create a conda environment with the required packages for this tutorial: 23 | conda env create -f environment.yml 24 | ``` 25 | 26 | ## Check your install 27 | 28 | To make sure you have all the necessary packages installed, we **strongly 29 | recommend** you to execute the `check_env.py` script located at the root of 30 | this repository: 31 | 32 | ```sh 33 | # Activate your conda environment 34 | conda activate scikit-learn-course 35 | python check_env.py 36 | ``` 37 | 38 | Make sure that there is no `FAIL` in the output when running the `check_env.py` 39 | script, i.e. that its output looks similar to this: 40 | 41 | ``` 42 | Using python in /home/lesteve/miniconda3/envs/scikit-learn-course 43 | 3.9.1 | packaged by conda-forge | (default, Jan 10 2021, 02:55:42) 44 | [GCC 9.3.0] 45 | 46 | [ OK ] numpy version 1.19.5 47 | [ OK ] scipy version 1.6.0 48 | [ OK ] matplotlib version 3.3.3 49 | [ OK ] sklearn version 1.6 50 | [ OK ] pandas version 2.0 51 | [ OK ] seaborn version 0.13 52 | [ OK ] notebook version 6.2.0 53 | [ OK ] plotly version 5.10.0 54 | ``` 55 | 56 | ## Run Jupyter notebooks locally 57 | 58 | ```sh 59 | # Activate your conda environment 60 | conda activate scikit-learn-course 61 | jupyter notebook full-index.ipynb 62 | ``` 63 | 64 | `full-index.ipynb` is an index file helping to navigate the notebooks. 65 | All the Jupyter notebooks are located in the `notebooks` folder. 
66 | -------------------------------------------------------------------------------- /notebooks/01_tabular_data_exploration_ex_01.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# \ud83d\udcdd Exercise M1.01" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Imagine we are interested in predicting penguins species based on two of their\n", 15 | "body measurements: culmen length and culmen depth. First we want to do some\n", 16 | "data exploration to get a feel for the data.\n", 17 | "\n", 18 | "What are the features? What is the target?" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "The data is located in `../datasets/penguins_classification.csv`, load it with\n", 26 | "`pandas` into a `DataFrame`." 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "# Write your code here." 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Show a few samples of the data.\n", 43 | "\n", 44 | "How many features are numerical? How many features are categorical?" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# Write your code here." 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "What are the different penguins species available in the dataset and how many\n", 61 | "samples of each species are there? Hint: select the right column and use the\n", 62 | "[`value_counts`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.value_counts.html)\n", 63 | "method." 
64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "# Write your code here." 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "Plot histograms for the numerical features" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "# Write your code here." 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "Show features distribution for each class. Hint: use\n", 96 | "[`seaborn.pairplot`](https://seaborn.pydata.org/generated/seaborn.pairplot.html)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "# Write your code here." 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "Looking at these distributions, how hard do you think it would be to classify\n", 113 | "the penguins only using `\"culmen depth\"` and `\"culmen length\"`?" 
114 | ] 115 | } 116 | ], 117 | "metadata": { 118 | "jupytext": { 119 | "main_language": "python" 120 | }, 121 | "kernelspec": { 122 | "display_name": "Python 3", 123 | "name": "python3" 124 | } 125 | }, 126 | "nbformat": 4, 127 | "nbformat_minor": 5 128 | } -------------------------------------------------------------------------------- /notebooks/ensemble_ex_01.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# \ud83d\udcdd Exercise M6.01\n", 8 | "\n", 9 | "The aim of this notebook is to investigate if we can tune the hyperparameters\n", 10 | "of a bagging regressor and evaluate the gain obtained.\n", 11 | "\n", 12 | "We will load the California housing dataset and split it into a training and a\n", 13 | "testing set." 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from sklearn.datasets import fetch_california_housing\n", 23 | "from sklearn.model_selection import train_test_split\n", 24 | "\n", 25 | "data, target = fetch_california_housing(as_frame=True, return_X_y=True)\n", 26 | "target *= 100 # rescale the target in k$\n", 27 | "data_train, data_test, target_train, target_test = train_test_split(\n", 28 | " data, target, random_state=0, test_size=0.5\n", 29 | ")" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "
\n", 37 | "

Note

\n", 38 | "

If you want a deeper overview regarding this dataset, you can refer to the\n", 39 | "Appendix - Datasets description section at the end of this MOOC.

\n", 40 | "
" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "Create a `BaggingRegressor` and provide a `DecisionTreeRegressor` to its\n", 48 | "parameter `estimator`. Train the regressor and evaluate its generalization\n", 49 | "performance on the testing set using the mean absolute error." 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "# Write your code here." 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "Now, create a `RandomizedSearchCV` instance using the previous model and tune\n", 66 | "the important parameters of the bagging regressor. Find the best parameters\n", 67 | "and check if you are able to find a set of parameters that improve the default\n", 68 | "regressor still using the mean absolute error as a metric.\n", 69 | "\n", 70 | "
\n", 71 | "

Tip

\n", 72 | "

You can list the bagging regressor's parameters using the get_params method.

\n", 73 | "
" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "# Write your code here." 83 | ] 84 | } 85 | ], 86 | "metadata": { 87 | "jupytext": { 88 | "main_language": "python" 89 | }, 90 | "kernelspec": { 91 | "display_name": "Python 3", 92 | "name": "python3" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 5 97 | } -------------------------------------------------------------------------------- /notebooks/matplotlibrc: -------------------------------------------------------------------------------- 1 | axes.labelsize: 18.0 2 | axes.linewidth: 1.875 3 | axes.titlesize: 18.0 4 | boxplot.whiskers: 1000 5 | boxplot.patchartist: True 6 | boxplot.boxprops.color: black 7 | boxplot.capprops.color: black 8 | boxplot.medianprops.color: black 9 | boxplot.whiskerprops.color: black 10 | boxplot.boxprops.linewidth: 3.0 11 | boxplot.capprops.linewidth: 3.0 12 | boxplot.medianprops.linewidth: 2.5 13 | boxplot.whiskerprops.linewidth: 3.0 14 | figure.titlesize: 22.0 15 | font.size: 18.0 16 | grid.linewidth: 1.5 17 | legend.fontsize: 16.5 18 | legend.title_fontsize: 18.0 19 | lines.linewidth: 3.5 20 | lines.markersize: 9.0 21 | patch.linewidth: 1.5 22 | xtick.labelsize: 16.5 23 | xtick.major.size: 9.0 24 | xtick.major.width: 1.875 25 | xtick.minor.size: 6.0 26 | xtick.minor.width: 1.5 27 | ytick.labelsize: 16.5 28 | ytick.major.size: 9.0 29 | ytick.major.width: 1.875 30 | ytick.minor.size: 6.0 31 | ytick.minor.width: 1.5 32 | -------------------------------------------------------------------------------- /notebooks/metrics_ex_02.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# \ud83d\udcdd Exercise M7.03\n", 8 | "\n", 9 | "As with the classification metrics exercise, we will evaluate the regression\n", 10 | "metrics within a cross-validation framework to 
get familiar with the syntax.\n", 11 | "\n", 12 | "We will use the Ames house prices dataset." 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "\n", 24 | "ames_housing = pd.read_csv(\"../datasets/house_prices.csv\")\n", 25 | "data = ames_housing.drop(columns=\"SalePrice\")\n", 26 | "target = ames_housing[\"SalePrice\"]\n", 27 | "data = data.select_dtypes(np.number)\n", 28 | "target /= 1000" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "lines_to_next_cell": 2 35 | }, 36 | "source": [ 37 | "
\n", 38 | "

Note

\n", 39 | "

If you want a deeper overview regarding this dataset, you can refer to the\n", 40 | "Appendix - Datasets description section at the end of this MOOC.

\n", 41 | "
" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "The first step will be to create a linear regression model." 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# Write your code here." 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "Then, use the `cross_val_score` to estimate the generalization performance of\n", 65 | "the model. Use a `KFold` cross-validation with 10 folds. Make the use of the\n", 66 | "$R^2$ score explicit by assigning the parameter `scoring` (even though it is\n", 67 | "the default score)." 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "# Write your code here." 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "Then, instead of using the $R^2$ score, use the mean absolute error (MAE). You\n", 84 | "may need to refer to the documentation for the `scoring` parameter." 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "# Write your code here." 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "Finally, use the `cross_validate` function and compute multiple scores/errors\n", 101 | "at once by passing a list of scorers to the `scoring` parameter. You can\n", 102 | "compute the $R^2$ score and the mean absolute error for instance." 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "# Write your code here." 
112 | ] 113 | } 114 | ], 115 | "metadata": { 116 | "jupytext": { 117 | "main_language": "python" 118 | }, 119 | "kernelspec": { 120 | "display_name": "Python 3", 121 | "name": "python3" 122 | } 123 | }, 124 | "nbformat": 4, 125 | "nbformat_minor": 5 126 | } -------------------------------------------------------------------------------- /one-day-course-index.md: -------------------------------------------------------------------------------- 1 | # The predictive modeling pipeline 2 | 3 | ## Tabular data exploration 4 | 5 | - [First look at our dataset](./notebooks/01_tabular_data_exploration.ipynb) 6 | - [Exercise 01](./notebooks/01_tabular_data_exploration_ex_01.ipynb) 7 | 8 | ## Fitting a scikit-learn model on numerical data 9 | 10 | - [First model with scikit-learn](./notebooks/02_numerical_pipeline_introduction.ipynb) 11 | - [Exercise 01](./notebooks/02_numerical_pipeline_ex_00.ipynb) 12 | - [Working with numerical data](./notebooks/02_numerical_pipeline_hands_on.ipynb) 13 | - [Exercise 02](./notebooks/02_numerical_pipeline_ex_01.ipynb) 14 | - [Preprocessing for numerical features](./notebooks/02_numerical_pipeline_scaling.ipynb) 15 | 16 | ## Handling categorical data 17 | 18 | - [Encoding of categorical variables](./notebooks/03_categorical_pipeline.ipynb) 19 | - [Exercise 01](./notebooks/03_categorical_pipeline_ex_01.ipynb) 20 | - [Using numerical and categorical variables together](./notebooks/03_categorical_pipeline_column_transformer.ipynb) 21 | - [Exercise 02](./notebooks/03_categorical_pipeline_ex_02.ipynb) 22 | 23 | # Hyperparameter tuning 24 | 25 | ## Manual tuning 26 | 27 | - [Set and get hyperparameters in scikit-learn](./notebooks/parameter_tuning_manual.ipynb) 28 | - [Exercise 01](./notebooks/parameter_tuning_ex_02.ipynb) 29 | 30 | ## Automated tuning 31 | 32 | - [Hyperparameter tuning by grid-search](./notebooks/parameter_tuning_grid_search.ipynb) 33 | - [Hyperparameter tuning by 
randomized-search](./notebooks/parameter_tuning_randomized_search.ipynb) 34 | - [Exercise 02](./notebooks/parameter_tuning_ex_03.ipynb) 35 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 79 3 | target_version = ['py38', 'py39', 'py310', 'py311'] 4 | preview = true 5 | exclude = ''' 6 | /( 7 | \.eggs # exclude a few common directories in the 8 | | \.git # root of the project 9 | | \.mypy_cache 10 | | \.vscode 11 | | build 12 | | dist 13 | )/ 14 | ''' 15 | 16 | [tool.ruff.lint] 17 | ignore = [ 18 | 'E402', # module level import not at top of file 19 | 'F401', # imported but unused 20 | 'E501', # line too long 21 | 'E203', # whitespace before ':' 22 | ] 23 | -------------------------------------------------------------------------------- /python_scripts/01_tabular_data_exploration_ex_01.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # text_representation: 5 | # extension: .py 6 | # format_name: percent 7 | # format_version: '1.3' 8 | # jupytext_version: 1.17.1 9 | # kernelspec: 10 | # display_name: Python 3 11 | # name: python3 12 | # --- 13 | 14 | # %% [markdown] 15 | # # 📝 Exercise M1.01 16 | 17 | # %% [markdown] 18 | # Imagine we are interested in predicting penguins species based on two of their 19 | # body measurements: culmen length and culmen depth. First we want to do some 20 | # data exploration to get a feel for the data. 21 | # 22 | # What are the features? What is the target? 23 | 24 | # %% [markdown] 25 | # The data is located in `../datasets/penguins_classification.csv`, load it with 26 | # `pandas` into a `DataFrame`. 27 | 28 | # %% 29 | # Write your code here. 30 | 31 | # %% [markdown] 32 | # Show a few samples of the data. 33 | # 34 | # How many features are numerical? How many features are categorical? 
35 | 36 | # %% 37 | # Write your code here. 38 | 39 | # %% [markdown] 40 | # What are the different penguins species available in the dataset and how many 41 | # samples of each species are there? Hint: select the right column and use the 42 | # [`value_counts`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.value_counts.html) 43 | # method. 44 | 45 | # %% 46 | # Write your code here. 47 | 48 | # %% [markdown] 49 | # Plot histograms for the numerical features 50 | 51 | # %% 52 | # Write your code here. 53 | 54 | # %% [markdown] 55 | # Show features distribution for each class. Hint: use 56 | # [`seaborn.pairplot`](https://seaborn.pydata.org/generated/seaborn.pairplot.html) 57 | 58 | # %% 59 | # Write your code here. 60 | 61 | # %% [markdown] 62 | # Looking at these distributions, how hard do you think it would be to classify 63 | # the penguins only using `"culmen depth"` and `"culmen length"`? 64 | -------------------------------------------------------------------------------- /python_scripts/01_tabular_data_exploration_sol_01.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # kernelspec: 4 | # display_name: Python 3 5 | # name: python3 6 | # --- 7 | 8 | # %% [markdown] 9 | # # 📃 Solution for Exercise M1.01 10 | 11 | # %% [markdown] 12 | # Imagine we are interested in predicting penguins species based on two of their 13 | # body measurements: culmen length and culmen depth. First we want to do some 14 | # data exploration to get a feel for the data. 15 | # 16 | # What are the features? What is the target? 17 | 18 | # %% [markdown] tags=["solution"] 19 | # The features are `"culmen length"` and `"culmen depth"`. The target is the 20 | # penguin species. 21 | 22 | # %% [markdown] 23 | # The data is located in `../datasets/penguins_classification.csv`, load it with 24 | # `pandas` into a `DataFrame`. 
25 | 26 | # %% 27 | # solution 28 | import pandas as pd 29 | 30 | penguins = pd.read_csv("../datasets/penguins_classification.csv") 31 | 32 | # %% [markdown] 33 | # Show a few samples of the data. 34 | # 35 | # How many features are numerical? How many features are categorical? 36 | 37 | # %% [markdown] tags=["solution"] 38 | # Both features, `"culmen length"` and `"culmen depth"` are numerical. There are 39 | # no categorical features in this dataset. 40 | 41 | # %% 42 | # solution 43 | penguins.head() 44 | 45 | # %% [markdown] 46 | # What are the different penguins species available in the dataset and how many 47 | # samples of each species are there? Hint: select the right column and use the 48 | # [`value_counts`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.value_counts.html) 49 | # method. 50 | 51 | # %% 52 | # solution 53 | penguins["Species"].value_counts() 54 | 55 | # %% [markdown] 56 | # Plot histograms for the numerical features 57 | 58 | # %% 59 | # solution 60 | _ = penguins.hist(figsize=(8, 4)) 61 | 62 | # %% [markdown] 63 | # Show features distribution for each class. Hint: use 64 | # [`seaborn.pairplot`](https://seaborn.pydata.org/generated/seaborn.pairplot.html) 65 | 66 | # %% 67 | # solution 68 | import seaborn 69 | 70 | pairplot_figure = seaborn.pairplot(penguins, hue="Species") 71 | 72 | # %% [markdown] tags=["solution"] 73 | # We observe that the labels on the axis are overlapping. Even if it is not the 74 | # priority of this notebook, one can tweak them by increasing the height of each 75 | # subfigure. 76 | 77 | # %% tags=["solution"] 78 | pairplot_figure = seaborn.pairplot(penguins, hue="Species", height=4) 79 | 80 | # %% [markdown] 81 | # Looking at these distributions, how hard do you think it would be to classify 82 | # the penguins only using `"culmen depth"` and `"culmen length"`? 
83 | 84 | # %% [markdown] tags=["solution"] 85 | # Looking at the previous scatter-plot showing `"culmen length"` and `"culmen 86 | # depth"`, the species are reasonably well separated: 87 | # - low culmen length -> Adelie 88 | # - low culmen depth -> Gentoo 89 | # - high culmen depth and high culmen length -> Chinstrap 90 | # 91 | # There is some small overlap between the species, so we can expect a 92 | # statistical model to perform well on this dataset but not perfectly. 93 | -------------------------------------------------------------------------------- /python_scripts/02_numerical_pipeline_ex_00.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # text_representation: 5 | # extension: .py 6 | # format_name: percent 7 | # format_version: '1.3' 8 | # jupytext_version: 1.17.1 9 | # kernelspec: 10 | # display_name: Python 3 11 | # name: python3 12 | # --- 13 | 14 | # %% [markdown] 15 | # # 📝 Exercise M1.02 16 | # 17 | # The goal of this exercise is to fit a similar model as in the previous 18 | # notebook to get familiar with manipulating scikit-learn objects and in 19 | # particular the `.fit/.predict/.score` API. 20 | 21 | # %% [markdown] 22 | # Let's load the adult census dataset with only numerical variables 23 | 24 | # %% 25 | import pandas as pd 26 | 27 | adult_census = pd.read_csv("../datasets/adult-census-numeric.csv") 28 | data = adult_census.drop(columns="class") 29 | target = adult_census["class"] 30 | 31 | # %% [markdown] 32 | # In the previous notebook we used `model = KNeighborsClassifier()`. All 33 | # scikit-learn models can be created without arguments. This is convenient 34 | # because it means that you don't need to understand the full details of a model 35 | # before starting to use it. 36 | # 37 | # One of the `KNeighborsClassifier` parameters is `n_neighbors`. 
It controls the 38 | # number of neighbors we are going to use to make a prediction for a new data 39 | # point. 40 | # 41 | # What is the default value of the `n_neighbors` parameter? 42 | # 43 | # **Hint**: Look at the documentation on the [scikit-learn 44 | # website](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html) 45 | # or directly access the description inside your notebook by running the 46 | # following cell. This opens a pager pointing to the documentation. 47 | 48 | # %% 49 | from sklearn.neighbors import KNeighborsClassifier 50 | 51 | # KNeighborsClassifier? 52 | 53 | # %% [markdown] 54 | # Create a `KNeighborsClassifier` model with `n_neighbors=50` 55 | 56 | # %% 57 | # Write your code here. 58 | 59 | # %% [markdown] 60 | # Fit this model on the data and target loaded above 61 | 62 | # %% 63 | # Write your code here. 64 | 65 | # %% [markdown] 66 | # Use your model to make predictions on the first 10 data points inside the 67 | # data. Do they match the actual target values? 68 | 69 | # %% 70 | # Write your code here. 71 | 72 | # %% [markdown] 73 | # Compute the accuracy on the training data. 74 | 75 | # %% 76 | # Write your code here. 77 | 78 | # %% [markdown] 79 | # Now load the test data from `"../datasets/adult-census-numeric-test.csv"` and 80 | # compute the accuracy on the test data. 81 | 82 | # %% 83 | # Write your code here. 
84 | -------------------------------------------------------------------------------- /python_scripts/02_numerical_pipeline_ex_01.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # text_representation: 5 | # extension: .py 6 | # format_name: percent 7 | # format_version: '1.3' 8 | # jupytext_version: 1.17.1 9 | # kernelspec: 10 | # display_name: Python 3 11 | # name: python3 12 | # --- 13 | 14 | # %% [markdown] 15 | # # 📝 Exercise M1.03 16 | # 17 | # The goal of this exercise is to compare the performance of our classifier in 18 | # the previous notebook (roughly 81% accuracy with `LogisticRegression`) to some 19 | # simple baseline classifiers. The simplest baseline classifier is one that 20 | # always predicts the same class, irrespective of the input data. 21 | # 22 | # - What would be the score of a model that always predicts `' >50K'`? 23 | # - What would be the score of a model that always predicts `' <=50K'`? 24 | # - Is 81% or 82% accuracy a good score for this problem? 25 | # 26 | # Use a `DummyClassifier` and do a train-test split to evaluate its accuracy on 27 | # the test set. This 28 | # [link](https://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators) 29 | # shows a few examples of how to evaluate the generalization performance of 30 | # these baseline models. 31 | 32 | # %% 33 | import pandas as pd 34 | 35 | adult_census = pd.read_csv("../datasets/adult-census.csv") 36 | 37 | # %% [markdown] 38 | # We first split our dataset to have the target separated from the data used to 39 | # train our predictive model. 40 | 41 | # %% 42 | target_name = "class" 43 | target = adult_census[target_name] 44 | data = adult_census.drop(columns=target_name) 45 | 46 | # %% [markdown] 47 | # We start by selecting only the numerical columns as seen in the previous 48 | # notebook. 
49 | 50 | # %% 51 | numerical_columns = ["age", "capital-gain", "capital-loss", "hours-per-week"] 52 | 53 | data_numeric = data[numerical_columns] 54 | 55 | # %% [markdown] 56 | # Split the data and target into a train and test set. 57 | 58 | # %% 59 | from sklearn.model_selection import train_test_split 60 | 61 | # Write your code here. 62 | 63 | # %% [markdown] 64 | # Use a `DummyClassifier` such that the resulting classifier always predict the 65 | # class `' >50K'`. What is the accuracy score on the test set? Repeat the 66 | # experiment by always predicting the class `' <=50K'`. 67 | # 68 | # Hint: you can set the `strategy` parameter of the `DummyClassifier` to achieve 69 | # the desired behavior. 70 | 71 | # %% 72 | from sklearn.dummy import DummyClassifier 73 | 74 | # Write your code here. 75 | -------------------------------------------------------------------------------- /python_scripts/02_numerical_pipeline_sol_00.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # kernelspec: 4 | # display_name: Python 3 5 | # name: python3 6 | # --- 7 | 8 | # %% [markdown] 9 | # # 📃 Solution for Exercise M1.02 10 | # 11 | # The goal of this exercise is to fit a similar model as in the previous 12 | # notebook to get familiar with manipulating scikit-learn objects and in 13 | # particular the `.fit/.predict/.score` API. 14 | 15 | # %% [markdown] 16 | # Let's load the adult census dataset with only numerical variables 17 | 18 | # %% 19 | import pandas as pd 20 | 21 | adult_census = pd.read_csv("../datasets/adult-census-numeric.csv") 22 | data = adult_census.drop(columns="class") 23 | target = adult_census["class"] 24 | 25 | # %% [markdown] 26 | # In the previous notebook we used `model = KNeighborsClassifier()`. All 27 | # scikit-learn models can be created without arguments. 
This is convenient 28 | # because it means that you don't need to understand the full details of a model 29 | # before starting to use it. 30 | # 31 | # One of the `KNeighborsClassifier` parameters is `n_neighbors`. It controls the 32 | # number of neighbors we are going to use to make a prediction for a new data 33 | # point. 34 | # 35 | # What is the default value of the `n_neighbors` parameter? 36 | # 37 | # **Hint**: Look at the documentation on the [scikit-learn 38 | # website](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html) 39 | # or directly access the description inside your notebook by running the 40 | # following cell. This opens a pager pointing to the documentation. 41 | 42 | # %% 43 | from sklearn.neighbors import KNeighborsClassifier 44 | 45 | # KNeighborsClassifier? 46 | 47 | # %% [markdown] tags=["solution"] 48 | # We can see that the default value for `n_neighbors` is 5. 49 | 50 | # %% [markdown] 51 | # Create a `KNeighborsClassifier` model with `n_neighbors=50` 52 | 53 | # %% 54 | # solution 55 | model = KNeighborsClassifier(n_neighbors=50) 56 | 57 | # %% [markdown] 58 | # Fit this model on the data and target loaded above 59 | 60 | # %% 61 | # solution 62 | model.fit(data, target) 63 | 64 | # %% [markdown] 65 | # Use your model to make predictions on the first 10 data points inside the 66 | # data. Do they match the actual target values? 
67 | 68 | # %% 69 | # solution 70 | first_data_values = data.iloc[:10] 71 | first_predictions = model.predict(first_data_values) 72 | first_predictions 73 | 74 | # %% tags=["solution"] 75 | first_target_values = target.iloc[:10] 76 | first_target_values 77 | 78 | # %% tags=["solution"] 79 | number_of_correct_predictions = ( 80 | first_predictions == first_target_values 81 | ).sum() 82 | number_of_predictions = len(first_predictions) 83 | print( 84 | f"{number_of_correct_predictions}/{number_of_predictions} " 85 | "of predictions are correct" 86 | ) 87 | 88 | # %% [markdown] 89 | # Compute the accuracy on the training data. 90 | 91 | # %% 92 | # solution 93 | model.score(data, target) 94 | 95 | # %% [markdown] 96 | # Now load the test data from `"../datasets/adult-census-numeric-test.csv"` and 97 | # compute the accuracy on the test data. 98 | 99 | # %% 100 | # solution 101 | adult_census_test = pd.read_csv("../datasets/adult-census-numeric-test.csv") 102 | 103 | data_test = adult_census_test.drop(columns="class") 104 | target_test = adult_census_test["class"] 105 | 106 | model.score(data_test, target_test) 107 | 108 | # %% [markdown] tags=["solution"] 109 | # Looking at the previous notebook, the accuracy seems slightly higher with 110 | # `n_neighbors=50` than with `n_neighbors=5` (the default value). 
111 | -------------------------------------------------------------------------------- /python_scripts/cross_validation_ex_02.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # text_representation: 5 | # extension: .py 6 | # format_name: percent 7 | # format_version: '1.3' 8 | # jupytext_version: 1.17.1 9 | # kernelspec: 10 | # display_name: Python 3 11 | # name: python3 12 | # --- 13 | 14 | # %% [markdown] 15 | # # 📝 Exercise M7.01 16 | # 17 | # In this exercise we will define dummy classification baselines and use them as 18 | # reference to assess the relative predictive performance of a given model of 19 | # interest. 20 | # 21 | # We illustrate those baselines with the help of the Adult Census dataset, using 22 | # only the numerical features for the sake of simplicity. 23 | 24 | # %% 25 | import pandas as pd 26 | 27 | adult_census = pd.read_csv("../datasets/adult-census-numeric-all.csv") 28 | data, target = adult_census.drop(columns="class"), adult_census["class"] 29 | 30 | # %% [markdown] 31 | # First, define a `ShuffleSplit` cross-validation strategy taking half of the 32 | # samples as a testing at each round. Let us use 10 cross-validation rounds. 33 | 34 | # %% 35 | # Write your code here. 36 | 37 | # %% [markdown] 38 | # Next, create a machine learning pipeline composed of a transformer to 39 | # standardize the data followed by a logistic regression classifier. 40 | 41 | # %% 42 | # Write your code here. 43 | 44 | # %% [markdown] 45 | # Compute the cross-validation (test) scores for the classifier on this dataset. 46 | # Store the results pandas Series as we did in the previous notebook. 47 | 48 | # %% 49 | # Write your code here. 50 | 51 | # %% [markdown] 52 | # Now, compute the cross-validation scores of a dummy classifier that constantly 53 | # predicts the most frequent class observed the training set. 
Please refer to 54 | # the online documentation for the [sklearn.dummy.DummyClassifier 55 | # ](https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html) 56 | # class. 57 | # 58 | # Store the results in a second pandas Series. 59 | 60 | # %% 61 | # Write your code here. 62 | 63 | # %% [markdown] 64 | # Now that we collected the results from the baseline and the model, concatenate 65 | # the test scores as columns a single pandas dataframe. 66 | 67 | # %% 68 | # Write your code here. 69 | 70 | # %% [markdown] 71 | # 72 | # Next, plot the histogram of the cross-validation test scores for both models 73 | # with the help of [pandas built-in plotting 74 | # function](https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html#histograms). 75 | # 76 | # What conclusions do you draw from the results? 77 | 78 | # %% 79 | # Write your code here. 80 | 81 | # %% [markdown] 82 | # Change the `strategy` of the dummy classifier to `"stratified"`, compute the 83 | # results. Similarly compute scores for `strategy="uniform"` and then the plot 84 | # the distribution together with the other results. 85 | # 86 | # Are those new baselines better than the previous one? Why is this the case? 87 | # 88 | # Please refer to the scikit-learn documentation on 89 | # [sklearn.dummy.DummyClassifier]( 90 | # https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html) 91 | # to find out about the meaning of the `"stratified"` and `"uniform"` 92 | # strategies. 93 | 94 | # %% 95 | # Write your code here. 
96 | -------------------------------------------------------------------------------- /python_scripts/ensemble_ex_01.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # text_representation: 5 | # extension: .py 6 | # format_name: percent 7 | # format_version: '1.3' 8 | # jupytext_version: 1.17.1 9 | # kernelspec: 10 | # display_name: Python 3 11 | # name: python3 12 | # --- 13 | 14 | # %% [markdown] 15 | # # 📝 Exercise M6.01 16 | # 17 | # The aim of this notebook is to investigate if we can tune the hyperparameters 18 | # of a bagging regressor and evaluate the gain obtained. 19 | # 20 | # We will load the California housing dataset and split it into a training and a 21 | # testing set. 22 | 23 | # %% 24 | from sklearn.datasets import fetch_california_housing 25 | from sklearn.model_selection import train_test_split 26 | 27 | data, target = fetch_california_housing(as_frame=True, return_X_y=True) 28 | target *= 100 # rescale the target in k$ 29 | data_train, data_test, target_train, target_test = train_test_split( 30 | data, target, random_state=0, test_size=0.5 31 | ) 32 | 33 | # %% [markdown] 34 | # ```{note} 35 | # If you want a deeper overview regarding this dataset, you can refer to the 36 | # Appendix - Datasets description section at the end of this MOOC. 37 | # ``` 38 | 39 | # %% [markdown] 40 | # Create a `BaggingRegressor` and provide a `DecisionTreeRegressor` to its 41 | # parameter `estimator`. Train the regressor and evaluate its generalization 42 | # performance on the testing set using the mean absolute error. 43 | 44 | # %% 45 | # Write your code here. 46 | 47 | # %% [markdown] 48 | # Now, create a `RandomizedSearchCV` instance using the previous model and tune 49 | # the important parameters of the bagging regressor. 
Find the best parameters 50 | # and check if you are able to find a set of parameters that improve the default 51 | # regressor still using the mean absolute error as a metric. 52 | # 53 | # ```{tip} 54 | # You can list the bagging regressor's parameters using the `get_params` method. 55 | # ``` 56 | 57 | # %% 58 | # Write your code here. 59 | -------------------------------------------------------------------------------- /python_scripts/ensemble_ex_02.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # text_representation: 5 | # extension: .py 6 | # format_name: percent 7 | # format_version: '1.3' 8 | # jupytext_version: 1.17.1 9 | # kernelspec: 10 | # display_name: Python 3 11 | # name: python3 12 | # --- 13 | 14 | # %% [markdown] 15 | # # 📝 Exercise M6.02 16 | # 17 | # The aim of this exercise is to explore some attributes available in 18 | # scikit-learn's random forest. 19 | # 20 | # First, we will fit the penguins regression dataset. 21 | 22 | # %% 23 | import pandas as pd 24 | from sklearn.model_selection import train_test_split 25 | 26 | penguins = pd.read_csv("../datasets/penguins_regression.csv") 27 | feature_name = "Flipper Length (mm)" 28 | target_name = "Body Mass (g)" 29 | data, target = penguins[[feature_name]], penguins[target_name] 30 | data_train, data_test, target_train, target_test = train_test_split( 31 | data, target, random_state=0 32 | ) 33 | 34 | # %% [markdown] 35 | # ```{note} 36 | # If you want a deeper overview regarding this dataset, you can refer to the 37 | # Appendix - Datasets description section at the end of this MOOC. 38 | # ``` 39 | 40 | # %% [markdown] 41 | # Create a random forest containing three trees. Train the forest and check the 42 | # generalization performance on the testing set in terms of mean absolute error. 43 | 44 | # %% 45 | # Write your code here. 
46 | 47 | # %% [markdown] 48 | # We now aim to plot the predictions from the individual trees in the forest. 49 | # For that purpose you have to create first a new dataset containing evenly 50 | # spaced values for the flipper length over the interval between 170 mm and 230 51 | # mm. 52 | 53 | # %% 54 | # Write your code here. 55 | 56 | # %% [markdown] 57 | # The trees contained in the forest that you created can be accessed with the 58 | # attribute `estimators_`. Use them to predict the body mass corresponding to 59 | # the values in this newly created dataset. Similarly find the predictions of 60 | # the random forest in this dataset. 61 | 62 | # %% 63 | # Write your code here. 64 | 65 | # %% [markdown] 66 | # Now make a plot that displays: 67 | # - the whole `data` using a scatter plot; 68 | # - the decision of each individual tree; 69 | # - the decision of the random forest. 70 | 71 | # %% 72 | # Write your code here. 73 | -------------------------------------------------------------------------------- /python_scripts/ensemble_ex_03.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # text_representation: 5 | # extension: .py 6 | # format_name: percent 7 | # format_version: '1.3' 8 | # jupytext_version: 1.17.1 9 | # kernelspec: 10 | # display_name: Python 3 11 | # name: python3 12 | # --- 13 | 14 | # %% [markdown] 15 | # # 📝 Exercise M6.03 16 | # 17 | # The aim of this exercise is to: 18 | # 19 | # * verify if a random forest or a gradient-boosting decision tree overfits if 20 | # the number of estimators is not properly chosen; 21 | # * use the early-stopping strategy to avoid adding unnecessary trees, to get 22 | # the best generalization performances. 23 | # 24 | # We use the California housing dataset to conduct our experiments. 
25 | 26 | # %% 27 | from sklearn.datasets import fetch_california_housing 28 | from sklearn.model_selection import train_test_split 29 | 30 | data, target = fetch_california_housing(return_X_y=True, as_frame=True) 31 | target *= 100 # rescale the target in k$ 32 | data_train, data_test, target_train, target_test = train_test_split( 33 | data, target, random_state=0, test_size=0.5 34 | ) 35 | 36 | # %% [markdown] 37 | # ```{note} 38 | # If you want a deeper overview regarding this dataset, you can refer to the 39 | # Appendix - Datasets description section at the end of this MOOC. 40 | # ``` 41 | 42 | # %% [markdown] 43 | # Create a gradient boosting decision tree with `max_depth=5` and 44 | # `learning_rate=0.5`. 45 | 46 | # %% 47 | # Write your code here. 48 | 49 | # %% [markdown] 50 | # 51 | # Also create a random forest with fully grown trees by setting `max_depth=None`. 52 | 53 | # %% 54 | # Write your code here. 55 | 56 | # %% [markdown] 57 | # 58 | # For both the gradient-boosting and random forest models, create a validation 59 | # curve using the training set to assess the impact of the number of trees on 60 | # the performance of each model. Evaluate the list of parameters `param_range = 61 | # np.array([1, 2, 5, 10, 20, 50, 100, 200])` and score it using 62 | # `neg_mean_absolute_error`. Remember to set `negate_score=True` to recover the 63 | # right sign of the Mean Absolute Error. 64 | 65 | # %% 66 | # Write your code here. 67 | 68 | # %% [markdown] 69 | # Random forest models improve when increasing the number of trees in the 70 | # ensemble. However, the scores reach a plateau where adding new trees just 71 | # makes fitting and scoring slower. 72 | # 73 | # Now repeat the analysis for the gradient boosting model. 74 | 75 | # %% 76 | # Write your code here. 77 | 78 | 79 | # %% [markdown] 80 | # Gradient boosting models overfit when the number of trees is too large. 
To 81 | # avoid adding a new unnecessary tree, unlike random-forest gradient-boosting 82 | # offers an early-stopping option. Internally, the algorithm uses an 83 | # out-of-sample set to compute the generalization performance of the model at 84 | # each addition of a tree. Thus, if the generalization performance is not 85 | # improving for several iterations, it stops adding trees. 86 | # 87 | # Now, create a gradient-boosting model with `n_estimators=1_000`. This number 88 | # of trees is certainly too large as we have seen above. Change the parameter 89 | # `n_iter_no_change` such that the gradient boosting fitting stops after adding 90 | # 5 trees to avoid deterioration of the overall generalization performance. 91 | 92 | # %% 93 | # Write your code here. 94 | 95 | # %% [markdown] 96 | # Estimate the generalization performance of this model again using the 97 | # `sklearn.metrics.mean_absolute_error` metric but this time using the test set 98 | # that we held out at the beginning of the notebook. Compare the resulting value 99 | # with the values observed in the validation curve. 100 | 101 | # %% 102 | # Write your code here. 103 | -------------------------------------------------------------------------------- /python_scripts/ensemble_ex_04.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # text_representation: 5 | # extension: .py 6 | # format_name: percent 7 | # format_version: '1.3' 8 | # jupytext_version: 1.17.1 9 | # kernelspec: 10 | # display_name: Python 3 11 | # name: python3 12 | # --- 13 | 14 | # %% [markdown] 15 | # # 📝 Exercise M6.04 16 | # 17 | # The aim of the exercise is to get familiar with the histogram 18 | # gradient-boosting in scikit-learn. Besides, we will use this model within a 19 | # cross-validation framework in order to inspect internal parameters found via 20 | # grid-search. 21 | # 22 | # We will use the California housing dataset. 
23 | 24 | # %% 25 | from sklearn.datasets import fetch_california_housing 26 | 27 | data, target = fetch_california_housing(return_X_y=True, as_frame=True) 28 | target *= 100 # rescale the target in k$ 29 | 30 | # %% [markdown] 31 | # First, create a histogram gradient boosting regressor. You can set the trees 32 | # number to be large, and configure the model to use early-stopping. 33 | 34 | # %% 35 | # Write your code here. 36 | 37 | # %% [markdown] 38 | # We will use a grid-search to find some optimal parameter for this model. In 39 | # this grid-search, you should search for the following parameters: 40 | # 41 | # * `max_depth: [3, 8]`; 42 | # * `max_leaf_nodes: [15, 31]`; 43 | # * `learning_rate: [0.1, 1]`. 44 | # 45 | # Feel free to explore the space with additional values. Create the grid-search 46 | # providing the previous gradient boosting instance as the model. 47 | 48 | # %% 49 | # Write your code here. 50 | 51 | # %% [markdown] 52 | # Finally, we will run our experiment through cross-validation. In this regard, 53 | # define a 5-fold cross-validation. Besides, be sure to shuffle the data. 54 | # Subsequently, use the function `sklearn.model_selection.cross_validate` to run 55 | # the cross-validation. You should also set `return_estimator=True`, so that we 56 | # can investigate the inner model trained via cross-validation. 57 | 58 | # %% 59 | # Write your code here. 60 | 61 | # %% [markdown] 62 | # Now that we got the cross-validation results, print out the mean and standard 63 | # deviation score. 64 | 65 | # %% 66 | # Write your code here. 67 | 68 | # %% [markdown] 69 | # Then inspect the `estimator` entry of the results and check the best 70 | # parameters values. Besides, check the number of trees used by the model. 71 | 72 | # %% 73 | # Write your code here. 74 | 75 | # %% [markdown] 76 | # Inspect the results of the inner CV for each estimator of the outer CV. 
77 | # Aggregate the mean test score for each parameter combination and make a box 78 | # plot of these scores. 79 | 80 | # %% 81 | # Write your code here. 82 | -------------------------------------------------------------------------------- /python_scripts/ensemble_sol_01.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # kernelspec: 4 | # display_name: Python 3 5 | # name: python3 6 | # --- 7 | 8 | # %% [markdown] 9 | # # 📃 Solution for Exercise M6.01 10 | # 11 | # The aim of this notebook is to investigate if we can tune the hyperparameters 12 | # of a bagging regressor and evaluate the gain obtained. 13 | # 14 | # We will load the California housing dataset and split it into a training and a 15 | # testing set. 16 | 17 | # %% 18 | from sklearn.datasets import fetch_california_housing 19 | from sklearn.model_selection import train_test_split 20 | 21 | data, target = fetch_california_housing(as_frame=True, return_X_y=True) 22 | target *= 100 # rescale the target in k$ 23 | data_train, data_test, target_train, target_test = train_test_split( 24 | data, target, random_state=0, test_size=0.5 25 | ) 26 | 27 | # %% [markdown] 28 | # ```{note} 29 | # If you want a deeper overview regarding this dataset, you can refer to the 30 | # Appendix - Datasets description section at the end of this MOOC. 31 | # ``` 32 | 33 | # %% [markdown] 34 | # Create a `BaggingRegressor` and provide a `DecisionTreeRegressor` to its 35 | # parameter `estimator`. Train the regressor and evaluate its generalization 36 | # performance on the testing set using the mean absolute error. 
37 | 38 | # %% 39 | # solution 40 | from sklearn.metrics import mean_absolute_error 41 | from sklearn.tree import DecisionTreeRegressor 42 | from sklearn.ensemble import BaggingRegressor 43 | 44 | tree = DecisionTreeRegressor() 45 | bagging = BaggingRegressor(estimator=tree, n_jobs=2) 46 | bagging.fit(data_train, target_train) 47 | target_predicted = bagging.predict(data_test) 48 | print( 49 | "Basic mean absolute error of the bagging regressor:\n" 50 | f"{mean_absolute_error(target_test, target_predicted):.2f} k$" 51 | ) 52 | 53 | # %% [markdown] 54 | # Now, create a `RandomizedSearchCV` instance using the previous model and tune 55 | # the important parameters of the bagging regressor. Find the best parameters 56 | # and check if you are able to find a set of parameters that improve the default 57 | # regressor still using the mean absolute error as a metric. 58 | 59 | # ```{tip} 60 | # You can list the bagging regressor's parameters using the `get_params` method. 61 | # ``` 62 | 63 | # %% 64 | # solution 65 | for param in bagging.get_params().keys(): 66 | print(param) 67 | 68 | # %% tags=["solution"] 69 | from scipy.stats import randint 70 | from sklearn.model_selection import RandomizedSearchCV 71 | 72 | param_grid = { 73 | "n_estimators": randint(10, 30), 74 | "max_samples": [0.5, 0.8, 1.0], 75 | "max_features": [0.5, 0.8, 1.0], 76 | "estimator__max_depth": randint(3, 10), 77 | } 78 | search = RandomizedSearchCV( 79 | bagging, param_grid, n_iter=20, scoring="neg_mean_absolute_error" 80 | ) 81 | _ = search.fit(data_train, target_train) 82 | 83 | # %% tags=["solution"] 84 | import pandas as pd 85 | 86 | columns = [f"param_{name}" for name in param_grid.keys()] 87 | columns += ["mean_test_error", "std_test_error"] 88 | cv_results = pd.DataFrame(search.cv_results_) 89 | cv_results["mean_test_error"] = -cv_results["mean_test_score"] 90 | cv_results["std_test_error"] = cv_results["std_test_score"] 91 | cv_results[columns].sort_values(by="mean_test_error") 92 | 93 | 
# %% tags=["solution"] 94 | target_predicted = search.predict(data_test) 95 | print( 96 | "Mean absolute error after tuning of the bagging regressor:\n" 97 | f"{mean_absolute_error(target_test, target_predicted):.2f} k$" 98 | ) 99 | 100 | # %% [markdown] tags=["solution"] 101 | # We see that the predictor provided by the bagging regressor does not need much 102 | # hyperparameter tuning compared to a single decision tree. 103 | -------------------------------------------------------------------------------- /python_scripts/feature_selection_ex_01.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # text_representation: 5 | # extension: .py 6 | # format_name: percent 7 | # format_version: '1.3' 8 | # jupytext_version: 1.17.1 9 | # kernelspec: 10 | # display_name: Python 3 11 | # name: python3 12 | # --- 13 | 14 | # %% [markdown] 15 | # # 📝 Exercise 01 16 | # 17 | # The aim of this exercise is to highlight caveats to have in mind when using 18 | # feature selection. You have to be extremely careful regarding the set of data 19 | # on which you will compute the statistic that helps your feature selection 20 | # algorithm to decide which feature to select. 21 | # 22 | # On purpose, we will make you program the wrong way of doing feature selection 23 | # to gain insights. 24 | # 25 | # First, you will create a completely random dataset using NumPy. Using the 26 | # function `np.random.randn`, generate a matrix `data` containing 100 samples 27 | # and 100,000 features. Then, using the function `np.random.randint`, generate a 28 | # vector `target` with 100 samples containing either 0 or 1. 29 | # 30 | # This type of dimensionality is typical in bioinformatics when dealing with 31 | # RNA-seq. However, we will use completely randomized features such that we 32 | # don't have a link between the data and the target. 
Thus, the generalization 33 | # performance of any machine-learning model should not perform better than the 34 | # chance-level. 35 | 36 | # %% 37 | import numpy as np 38 | 39 | # Write your code here. 40 | 41 | # %% [markdown] 42 | # Now, create a logistic regression model and use cross-validation to check the 43 | # score of such a model. It will allow us to confirm that our model cannot 44 | # predict anything meaningful from random data. 45 | 46 | # %% 47 | # Write your code here. 48 | 49 | # %% [markdown] 50 | # Now, we will ask you to program the **wrong** pattern to select features. 51 | # Select the features using the entire dataset. We will choose ten features 52 | # with the highest ANOVA F-score computed on the full dataset. Subsequently, 53 | # subsample the dataset `data` by selecting the features' subset. Finally, train 54 | # and test a logistic regression model. 55 | # 56 | # You should get some surprising results. 57 | 58 | # %% 59 | from sklearn.feature_selection import SelectKBest, f_classif 60 | 61 | # Write your code here. 62 | 63 | # %% [markdown] 64 | # Now, we will make you program the **right** way to do the feature selection. 65 | # First, split the dataset into a training and testing set. Then, fit the 66 | # feature selector on the training set. Then, transform both the training and 67 | # testing sets before you train and test the logistic regression. 68 | 69 | # %% 70 | from sklearn.model_selection import train_test_split 71 | 72 | # Write your code here. 73 | 74 | # %% [markdown] 75 | # However, the previous case is not perfect. For instance, if we were asking to 76 | # perform cross-validation, the manual `fit`/`transform` of the datasets will 77 | # make our life hard. Indeed, the solution here is to use a scikit-learn 78 | # pipeline in which the feature selection will be a preprocessing stage before 79 | # training the model. 
80 | # 81 | # Thus, start by creating a pipeline with the feature selector and the logistic 82 | # regression. Then, use cross-validation to get an estimate of the uncertainty 83 | # of your model generalization performance. 84 | 85 | # %% 86 | from sklearn.pipeline import make_pipeline 87 | 88 | # Write your code here. 89 | -------------------------------------------------------------------------------- /python_scripts/feature_selection_limitation_model.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # kernelspec: 4 | # display_name: Python 3 5 | # name: python3 6 | # --- 7 | 8 | # %% [markdown] 9 | # # Limitation of selecting feature using a model 10 | # 11 | # In this notebook, we want to show a limitation when using a machine-learning 12 | # model to make a selection. 13 | # 14 | # Indeed, one can inspect a model and find relative feature importances. For 15 | # instance, the parameters `coef_` for the linear models or 16 | # `feature_importances_` for the tree-based models carries such information. 17 | # Therefore, this method works as far as the relative feature importances given 18 | # by the model is sufficient to select the meaningful feature. 19 | # 20 | # Here, we will generate a dataset that contains a large number of random 21 | # features. 22 | 23 | # %% 24 | from sklearn.datasets import make_classification 25 | 26 | data, target = make_classification( 27 | n_samples=5000, 28 | n_features=100, 29 | n_informative=2, 30 | n_redundant=5, 31 | n_repeated=5, 32 | class_sep=0.3, 33 | random_state=0, 34 | ) 35 | 36 | # %% [markdown] 37 | # First, let's build a model which will not make any features selection. 38 | 39 | # %% 40 | from sklearn.ensemble import RandomForestClassifier 41 | 42 | model_without_selection = RandomForestClassifier() 43 | 44 | # %% [markdown] 45 | # We will evaluate this model by a k-fold cross validation and store the results 46 | # in a pandas dataframe. 
47 | 48 | # %% 49 | import pandas as pd 50 | from sklearn.model_selection import cross_validate 51 | 52 | cv_results_without_selection = cross_validate( 53 | model_without_selection, data, target, cv=5 54 | ) 55 | cv_results_without_selection = pd.DataFrame(cv_results_without_selection) 56 | 57 | # %% [markdown] 58 | # Then, we will build another model which will include a feature selection step 59 | # based on a random forest and evaluate it as well with cross-validation. 60 | 61 | # %% 62 | from sklearn.pipeline import make_pipeline 63 | from sklearn.feature_selection import SelectFromModel 64 | 65 | feature_selector = SelectFromModel(RandomForestClassifier()) 66 | model_with_selection = make_pipeline( 67 | feature_selector, RandomForestClassifier() 68 | ) 69 | 70 | # %% 71 | cv_results_with_selection = cross_validate( 72 | model_with_selection, data, target, cv=5 73 | ) 74 | cv_results_with_selection = pd.DataFrame(cv_results_with_selection) 75 | 76 | # %% [markdown] 77 | # We can compare the testing score of the two models. For this matter, we are 78 | # combining results in a single dataframe. 79 | 80 | # %% 81 | cv_results = pd.concat( 82 | [cv_results_without_selection, cv_results_with_selection], 83 | axis=1, 84 | keys=["Without feature selection", "With feature selection"], 85 | ).swaplevel(axis="columns") 86 | 87 | # %% [markdown] 88 | # Finally, we can check the testing score of each model. 89 | 90 | # %% 91 | import matplotlib.pyplot as plt 92 | 93 | color = {"whiskers": "black", "medians": "black", "caps": "black"} 94 | cv_results["test_score"].plot.box(color=color, vert=False) 95 | plt.xlabel("Accuracy") 96 | _ = plt.title("Limitation of using a random forest for feature selection") 97 | 98 | # %% [markdown] 99 | # The model that selected a subset of features is less performant than a random 100 | # forest fitted on the full dataset. 
101 | # 102 | # We can rely on some aspects tackled in the notebook presenting the model 103 | # inspection to explain this behaviour. The decision tree's relative feature 104 | # importance will overestimate the importance of random feature when the 105 | # decision tree overfits the training set. 106 | # 107 | # Therefore, it is good to keep in mind that feature selection relies on 108 | # procedures making some assumptions, which can be perfectible. 109 | -------------------------------------------------------------------------------- /python_scripts/linear_models_ex_01.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # text_representation: 5 | # extension: .py 6 | # format_name: percent 7 | # format_version: '1.3' 8 | # jupytext_version: 1.17.1 9 | # kernelspec: 10 | # display_name: Python 3 11 | # name: python3 12 | # --- 13 | 14 | # %% [markdown] 15 | # # 📝 Exercise M4.01 16 | # 17 | # The aim of this exercise is two-fold: 18 | # 19 | # * understand the parametrization of a linear model; 20 | # * quantify the fitting accuracy of a set of such models. 21 | # 22 | # We will reuse part of the code of the course to: 23 | # 24 | # * load data; 25 | # * create the function representing a linear model. 26 | # 27 | # ## Prerequisites 28 | # 29 | # ### Data loading 30 | 31 | # %% [markdown] 32 | # ```{note} 33 | # If you want a deeper overview regarding this dataset, you can refer to the 34 | # Appendix - Datasets description section at the end of this MOOC. 
35 | # ``` 36 | 37 | # %% 38 | import pandas as pd 39 | 40 | penguins = pd.read_csv("../datasets/penguins_regression.csv") 41 | feature_name = "Flipper Length (mm)" 42 | target_name = "Body Mass (g)" 43 | data, target = penguins[[feature_name]], penguins[target_name] 44 | 45 | # %% [markdown] 46 | # ### Model definition 47 | 48 | 49 | # %% 50 | def linear_model_flipper_mass( 51 | flipper_length, weight_flipper_length, intercept_body_mass 52 | ): 53 | """Linear model of the form y = a * x + b""" 54 | body_mass = weight_flipper_length * flipper_length + intercept_body_mass 55 | return body_mass 56 | 57 | 58 | # %% [markdown] 59 | # ## Main exercise 60 | # 61 | # Define a vector `weights = [...]` and a vector `intercepts = [...]` of the 62 | # same length. Each pair of entries `(weights[i], intercepts[i])` tags a 63 | # different model. Use these vectors along with the vector 64 | # `flipper_length_range` to plot several linear models that could possibly fit 65 | # our data. Use the above helper function to visualize both the models and the 66 | # real samples. 67 | 68 | # %% 69 | import numpy as np 70 | 71 | flipper_length_range = np.linspace(data.min(), data.max(), num=300) 72 | 73 | # %% 74 | # Write your code here. 75 | 76 | # %% [markdown] 77 | # In the previous question, you were asked to create several linear models. The 78 | # visualization allowed you to qualitatively assess if a model was better than 79 | # another. 80 | # 81 | # Now, you should come up with a quantitative measure which indicates the 82 | # goodness of fit of each linear model and allows you to select the best model. 83 | # Define a function `goodness_fit_measure(true_values, predictions)` that takes 84 | # as inputs the true target values and the predictions and returns a single 85 | # scalar as output. 86 | 87 | 88 | # %% 89 | # Write your code here. 90 | 91 | # %% [markdown] 92 | # You can now copy and paste the code below to show the goodness of fit for each 93 | # model. 
94 | # 95 | # ```python 96 | # for model_idx, (weight, intercept) in enumerate(zip(weights, intercepts)): 97 | # target_predicted = linear_model_flipper_mass(data, weight, intercept) 98 | # print(f"Model #{model_idx}:") 99 | # print(f"{weight:.2f} (g / mm) * flipper length + {intercept:.2f} (g)") 100 | # print(f"Error: {goodness_fit_measure(target, target_predicted):.3f}\n") 101 | # ``` 102 | 103 | # %% 104 | # Write your code here. 105 | -------------------------------------------------------------------------------- /python_scripts/matplotlibrc: -------------------------------------------------------------------------------- 1 | axes.labelsize: 18.0 2 | axes.linewidth: 1.875 3 | axes.titlesize: 18.0 4 | boxplot.whiskers: 1000 5 | boxplot.patchartist: True 6 | boxplot.boxprops.color: black 7 | boxplot.capprops.color: black 8 | boxplot.medianprops.color: black 9 | boxplot.whiskerprops.color: black 10 | boxplot.boxprops.linewidth: 3.0 11 | boxplot.capprops.linewidth: 3.0 12 | boxplot.medianprops.linewidth: 2.5 13 | boxplot.whiskerprops.linewidth: 3.0 14 | figure.titlesize: 22.0 15 | font.size: 18.0 16 | grid.linewidth: 1.5 17 | legend.fontsize: 16.5 18 | legend.title_fontsize: 18.0 19 | lines.linewidth: 3.5 20 | lines.markersize: 9.0 21 | patch.linewidth: 1.5 22 | xtick.labelsize: 16.5 23 | xtick.major.size: 9.0 24 | xtick.major.width: 1.875 25 | xtick.minor.size: 6.0 26 | xtick.minor.width: 1.5 27 | ytick.labelsize: 16.5 28 | ytick.major.size: 9.0 29 | ytick.major.width: 1.875 30 | ytick.minor.size: 6.0 31 | ytick.minor.width: 1.5 32 | -------------------------------------------------------------------------------- /python_scripts/metrics_ex_02.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # text_representation: 5 | # extension: .py 6 | # format_name: percent 7 | # format_version: '1.3' 8 | # jupytext_version: 1.17.1 9 | # kernelspec: 10 | # display_name: Python 3 11 | # name: 
python3 12 | # --- 13 | 14 | # %% [markdown] 15 | # # 📝 Exercise M7.03 16 | # 17 | # As with the classification metrics exercise, we will evaluate the regression 18 | # metrics within a cross-validation framework to get familiar with the syntax. 19 | # 20 | # We will use the Ames house prices dataset. 21 | 22 | # %% 23 | import pandas as pd 24 | import numpy as np 25 | 26 | ames_housing = pd.read_csv("../datasets/house_prices.csv") 27 | data = ames_housing.drop(columns="SalePrice") 28 | target = ames_housing["SalePrice"] 29 | data = data.select_dtypes(np.number) 30 | target /= 1000 31 | 32 | # %% [markdown] 33 | # ```{note} 34 | # If you want a deeper overview regarding this dataset, you can refer to the 35 | # Appendix - Datasets description section at the end of this MOOC. 36 | # ``` 37 | 38 | 39 | # %% [markdown] 40 | # The first step will be to create a linear regression model. 41 | 42 | # %% 43 | # Write your code here. 44 | 45 | # %% [markdown] 46 | # Then, use the `cross_val_score` to estimate the generalization performance of 47 | # the model. Use a `KFold` cross-validation with 10 folds. Make the use of the 48 | # $R^2$ score explicit by assigning the parameter `scoring` (even though it is 49 | # the default score). 50 | 51 | # %% 52 | # Write your code here. 53 | 54 | # %% [markdown] 55 | # Then, instead of using the $R^2$ score, use the mean absolute error (MAE). You 56 | # may need to refer to the documentation for the `scoring` parameter. 57 | 58 | # %% 59 | # Write your code here. 60 | 61 | # %% [markdown] 62 | # Finally, use the `cross_validate` function and compute multiple scores/errors 63 | # at once by passing a list of scorers to the `scoring` parameter. You can 64 | # compute the $R^2$ score and the mean absolute error for instance. 65 | 66 | # %% 67 | # Write your code here. 
68 | -------------------------------------------------------------------------------- /python_scripts/parameter_tuning_ex_02.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # text_representation: 5 | # extension: .py 6 | # format_name: percent 7 | # format_version: '1.3' 8 | # jupytext_version: 1.17.1 9 | # kernelspec: 10 | # display_name: Python 3 11 | # name: python3 12 | # --- 13 | 14 | # %% [markdown] 15 | # # 📝 Exercise M3.01 16 | # 17 | # The goal is to write an exhaustive search to find the best parameters 18 | # combination maximizing the model generalization performance. 19 | # 20 | # Here we use a small subset of the Adult Census dataset to make the code faster 21 | # to execute. Once your code works on the small subset, try to change 22 | # `train_size` to a larger value (e.g. 0.8 for 80% instead of 20%). 23 | 24 | # %% 25 | import pandas as pd 26 | 27 | from sklearn.model_selection import train_test_split 28 | 29 | adult_census = pd.read_csv("../datasets/adult-census.csv") 30 | 31 | target_name = "class" 32 | target = adult_census[target_name] 33 | data = adult_census.drop(columns=[target_name, "education-num"]) 34 | 35 | data_train, data_test, target_train, target_test = train_test_split( 36 | data, target, train_size=0.2, random_state=42 37 | ) 38 | 39 | # %% 40 | from sklearn.compose import make_column_transformer 41 | from sklearn.compose import make_column_selector as selector 42 | from sklearn.preprocessing import OrdinalEncoder 43 | 44 | categorical_preprocessor = OrdinalEncoder( 45 | handle_unknown="use_encoded_value", unknown_value=-1 46 | ) 47 | preprocessor = make_column_transformer( 48 | (categorical_preprocessor, selector(dtype_include=object)), 49 | remainder="passthrough", 50 | ) 51 | 52 | from sklearn.ensemble import HistGradientBoostingClassifier 53 | from sklearn.pipeline import Pipeline 54 | 55 | model = Pipeline( 56 | [ 57 | ("preprocessor", 
preprocessor), 58 | ("classifier", HistGradientBoostingClassifier(random_state=42)), 59 | ] 60 | ) 61 | 62 | # %% [markdown] 63 | # Use the previously defined model (called `model`) and using two nested `for` 64 | # loops, make a search of the best combinations of the `learning_rate` and 65 | # `max_leaf_nodes` parameters. In this regard, you need to train and test the 66 | # model by setting the parameters. The evaluation of the model should be 67 | # performed using `cross_val_score` on the training set. Use the following 68 | # parameters search: 69 | # - `learning_rate` for the values 0.01, 0.1, 1 and 10. This parameter controls 70 | # the ability of a new tree to correct the error of the previous sequence of 71 | # trees 72 | # - `max_leaf_nodes` for the values 3, 10, 30. This parameter controls the depth 73 | # of each tree. 74 | 75 | # %% 76 | # Write your code here. 77 | 78 | # %% [markdown] 79 | # Now use the test set to score the model using the best parameters that we 80 | # found using cross-validation. You will have to refit the model over the full 81 | # training set. 82 | 83 | # %% 84 | # Write your code here. 85 | 86 | # %% 87 | -------------------------------------------------------------------------------- /python_scripts/parameter_tuning_ex_03.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # text_representation: 5 | # extension: .py 6 | # format_name: percent 7 | # format_version: '1.3' 8 | # jupytext_version: 1.17.1 9 | # kernelspec: 10 | # display_name: Python 3 11 | # name: python3 12 | # --- 13 | 14 | # %% [markdown] 15 | # # 📝 Exercise M3.02 16 | # 17 | # The goal is to find the best set of hyperparameters which maximize the 18 | # generalization performance on a training set. 
19 | 20 | # %% 21 | from sklearn.datasets import fetch_california_housing 22 | from sklearn.model_selection import train_test_split 23 | 24 | data, target = fetch_california_housing(return_X_y=True, as_frame=True) 25 | target *= 100 # rescale the target in k$ 26 | 27 | data_train, data_test, target_train, target_test = train_test_split( 28 | data, target, random_state=42 29 | ) 30 | 31 | # %% [markdown] 32 | # In this exercise, we progressively define the regression pipeline and later 33 | # tune its hyperparameters. 34 | # 35 | # Start by defining a pipeline that: 36 | # * uses a `StandardScaler` to normalize the numerical data; 37 | # * uses a `sklearn.neighbors.KNeighborsRegressor` as a predictive model. 38 | 39 | # %% 40 | # Write your code here. 41 | 42 | # %% [markdown] 43 | # Use `RandomizedSearchCV` with `n_iter=20` and 44 | # `scoring="neg_mean_absolute_error"` to tune the following hyperparameters 45 | # of the `model`: 46 | # 47 | # - the parameter `n_neighbors` of the `KNeighborsRegressor` with values 48 | # `np.logspace(0, 3, num=10).astype(np.int32)`; 49 | # - the parameter `with_mean` of the `StandardScaler` with possible values 50 | # `True` or `False`; 51 | # - the parameter `with_std` of the `StandardScaler` with possible values `True` 52 | # or `False`. 53 | # 54 | # The `scoring` function is expected to return higher values for better models, 55 | # since grid/random search objects **maximize** it. Because of that, error 56 | # metrics like `mean_absolute_error` must be negated (using the `neg_` prefix) 57 | # to work correctly (remember lower errors represent better models). 58 | # 59 | # Notice that in the notebook "Hyperparameter tuning by randomized-search" we 60 | # pass distributions to be sampled by the `RandomizedSearchCV`. In this case we 61 | # define a fixed grid of hyperparameters to be explored. 
Using a `GridSearchCV` 62 | # instead would explore all the possible combinations on the grid, which can be 63 | # costly to compute for large grids, whereas the parameter `n_iter` of the 64 | # `RandomizedSearchCV` controls the number of different random combination that 65 | # are evaluated. Notice that setting `n_iter` larger than the number of possible 66 | # combinations in a grid (in this case 10 x 2 x 2 = 40) would lead to repeating 67 | # already-explored combinations. 68 | # 69 | # Once the computation has completed, print the best combination of parameters 70 | # stored in the `best_params_` attribute. 71 | 72 | # %% 73 | # Write your code here. 74 | -------------------------------------------------------------------------------- /python_scripts/trees_dataset.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # kernelspec: 4 | # display_name: Python 3 5 | # name: python3 6 | # --- 7 | 8 | # %% [markdown] 9 | # # The penguins datasets 10 | # 11 | # In this notebook, we make a quick presentation of the [Palmer penguins 12 | # dataset](https://allisonhorst.github.io/palmerpenguins/) dataset. We use this 13 | # dataset for both classification and regression problems by selecting a subset 14 | # of the features to make our explanations intuitive. 15 | # 16 | # ## Classification dataset 17 | # 18 | # We use this dataset in classification setting to predict the penguins' 19 | # species from anatomical information. 20 | # 21 | # Each penguin is from one of the three following species: Adelie, Gentoo, and 22 | # Chinstrap. See the illustration below depicting the three different penguin 23 | # species: 24 | # 25 | # ![Image of 26 | # penguins](https://github.com/allisonhorst/palmerpenguins/raw/main/man/figures/lter_penguins.png) 27 | # 28 | # This problem is a classification problem since the target is categorical. 
We 29 | # limit our input data to a subset of the original features to simplify our 30 | # explanations when presenting the decision tree algorithm. Indeed, we use 31 | # features based on penguins' culmen measurement. You can learn more about the 32 | # penguins' culmen with the illustration below: 33 | # 34 | # ![Image of 35 | # culmen](https://github.com/allisonhorst/palmerpenguins/raw/main/man/figures/culmen_depth.png) 36 | # 37 | # We start by loading this subset of the dataset. 38 | 39 | # %% 40 | import pandas as pd 41 | 42 | penguins = pd.read_csv("../datasets/penguins_classification.csv") 43 | 44 | culmen_columns = ["Culmen Length (mm)", "Culmen Depth (mm)"] 45 | target_column = "Species" 46 | 47 | # %% [markdown] 48 | # Let's check the dataset in more detail. 49 | 50 | # %% 51 | penguins 52 | 53 | # %% [markdown] 54 | # Since we have few samples, we can check a scatter plot to observe the 55 | # sample distribution. 56 | 57 | # %% 58 | import seaborn as sns 59 | 60 | pairplot_figure = sns.pairplot(penguins, hue="Species") 61 | pairplot_figure.fig.set_size_inches(9, 6.5) 62 | 63 | # %% [markdown] 64 | # First let's check the feature distributions by looking at the diagonal plots 65 | # of the pairplot. We can deduce the following intuitions: 66 | # 67 | # * The Adelie species can be differentiated from the Gentoo and Chinstrap 68 | # species depending on the culmen length; 69 | # * The Gentoo species can be differentiated from the Adelie and Chinstrap 70 | # species depending on the culmen depth. 71 | # 72 | # ## Regression dataset 73 | # 74 | # In a regression setting, the target is a continuous variable instead of 75 | # categories. Here, we use two features of the dataset to make such a problem: 76 | # the flipper length is used as data and the body mass as the target. In short, 77 | # we want to predict the body mass using the flipper length. 
78 | # 79 | # We load the dataset and visualize the relationship between the flipper length 80 | # and the body mass of penguins. 81 | 82 | # %% 83 | penguins = pd.read_csv("../datasets/penguins_regression.csv") 84 | 85 | feature_name = "Flipper Length (mm)" 86 | target_column = "Body Mass (g)" 87 | 88 | # %% 89 | _ = sns.scatterplot(data=penguins, x=feature_name, y=target_column) 90 | 91 | # %% [markdown] 92 | # Here, we deal with a regression problem because our target is a continuous 93 | # variable ranging from 2.7 kg to 6.3 kg. From the scatter plot above, we 94 | # observe that we have a linear relationship between the flipper length and the 95 | # body mass. The longer the flipper of a penguin, the heavier the penguin. 96 | -------------------------------------------------------------------------------- /python_scripts/trees_ex_01.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # text_representation: 5 | # extension: .py 6 | # format_name: percent 7 | # format_version: '1.3' 8 | # jupytext_version: 1.17.1 9 | # kernelspec: 10 | # display_name: Python 3 11 | # name: python3 12 | # --- 13 | 14 | # %% [markdown] 15 | # # 📝 Exercise M5.01 16 | # 17 | # In the previous notebook, we showed how a tree with 1 level depth works. The 18 | # aim of this exercise is to repeat part of the previous experiment for a tree 19 | # with 2 levels depth to show how such parameter affects the feature space 20 | # partitioning. 
21 | # 22 | # We first load the penguins dataset and split it into training and testing 23 | # sets: 24 | 25 | # %% 26 | import pandas as pd 27 | 28 | penguins = pd.read_csv("../datasets/penguins_classification.csv") 29 | culmen_columns = ["Culmen Length (mm)", "Culmen Depth (mm)"] 30 | target_column = "Species" 31 | 32 | # %% [markdown] 33 | # ```{note} 34 | # If you want a deeper overview regarding this dataset, you can refer to the 35 | # Appendix - Datasets description section at the end of this MOOC. 36 | # ``` 37 | 38 | # %% 39 | from sklearn.model_selection import train_test_split 40 | 41 | data, target = penguins[culmen_columns], penguins[target_column] 42 | data_train, data_test, target_train, target_test = train_test_split( 43 | data, target, random_state=0 44 | ) 45 | 46 | # %% [markdown] 47 | # Create a decision tree classifier with a maximum depth of 2 levels and fit the 48 | # training data. 49 | 50 | # %% 51 | # Write your code here. 52 | 53 | # %% [markdown] 54 | # Now plot the data and the decision boundary of the trained classifier to see 55 | # the effect of increasing the depth of the tree. 56 | # 57 | # Hint: Use the class `DecisionBoundaryDisplay` from the module 58 | # `sklearn.inspection` as shown in previous course notebooks. 59 | # 60 | # ```{warning} 61 | # At this time, it is not possible to use `response_method="predict_proba"` for 62 | # multiclass problems on a single plot. This is a planned feature for a future 63 | # version of scikit-learn. In the meantime, you can use 64 | # `response_method="predict"` instead. 65 | # ``` 66 | 67 | # %% 68 | # Write your code here. 69 | 70 | # %% [markdown] 71 | # Did we make use of the feature "Culmen Length"? Plot the tree using the 72 | # function `sklearn.tree.plot_tree` to find out! 73 | 74 | # %% 75 | # Write your code here. 76 | 77 | # %% [markdown] 78 | # Compute the accuracy of the decision tree on the testing data. 79 | 80 | # %% 81 | # Write your code here. 
82 | -------------------------------------------------------------------------------- /python_scripts/trees_ex_02.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # text_representation: 5 | # extension: .py 6 | # format_name: percent 7 | # format_version: '1.3' 8 | # jupytext_version: 1.17.1 9 | # kernelspec: 10 | # display_name: Python 3 11 | # name: python3 12 | # --- 13 | 14 | # %% [markdown] 15 | # # 📝 Exercise M5.02 16 | # 17 | # The aim of this exercise is to find out whether a decision tree model is able 18 | # to extrapolate. 19 | # 20 | # By extrapolation, we refer to values predicted by a model outside of the range 21 | # of feature values seen during the training. 22 | # 23 | # We first load the regression data. 24 | 25 | # %% 26 | import pandas as pd 27 | 28 | penguins = pd.read_csv("../datasets/penguins_regression.csv") 29 | 30 | feature_name = "Flipper Length (mm)" 31 | target_name = "Body Mass (g)" 32 | data_train, target_train = penguins[[feature_name]], penguins[target_name] 33 | 34 | # %% [markdown] 35 | # ```{note} 36 | # If you want a deeper overview regarding this dataset, you can refer to the 37 | # Appendix - Datasets description section at the end of this MOOC. 38 | # ``` 39 | 40 | # %% [markdown] 41 | # First, create two models, a linear regression model and a decision tree 42 | # regression model, and fit them on the training data. Limit the depth at 3 43 | # levels for the decision tree. 44 | 45 | # %% 46 | # Write your code here. 47 | 48 | # %% [markdown] 49 | # Create a synthetic dataset containing all possible flipper length from the 50 | # minimum to the maximum of the training dataset. Get the predictions of each 51 | # model using this dataset. 52 | 53 | # %% 54 | # Write your code here. 55 | 56 | # %% [markdown] 57 | # Create a scatter plot containing the training samples and superimpose the 58 | # predictions of both models on the top. 
59 | 60 | # %% 61 | # Write your code here. 62 | 63 | # %% [markdown] 64 | # Now, we check the extrapolation capabilities of each model. Create a dataset 65 | # containing a broader range of values than your previous dataset, in other 66 | # words, add values below and above the minimum and the maximum of the flipper 67 | # length seen during training. 68 | 69 | # %% 70 | # Write your code here. 71 | 72 | # %% [markdown] 73 | # Finally, make predictions with both models on this new interval of data. 74 | # Repeat the plotting of the previous exercise. 75 | 76 | # %% 77 | # Write your code here. 78 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | scikit-learn>=1.6 2 | pandas >= 1 3 | matplotlib 4 | seaborn >= 0.13 5 | plotly 6 | jupyter-book>=0.11 7 | jupytext 8 | beautifulsoup4 9 | IPython 10 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn>=1.6 2 | pandas >= 1 3 | matplotlib 4 | seaborn >= 0.13 5 | plotly 6 | jupyterlab 7 | notebook 8 | IPython 9 | -------------------------------------------------------------------------------- /slides/Makefile: -------------------------------------------------------------------------------- 1 | # Compilation is done via remarker and htmlark, both pip installable 2 | 3 | all: ml_concepts.html overfitting_vs_underfitting.html \ 4 | learning_validation_curves.html bias_vs_variance.html \ 5 | linear_models.html regularized_linear_models.html trees.html \ 6 | ensemble.html concluding_remarks.html 7 | 8 | 9 | %.html: %.md custom.css 10 | # HTMLArk is used to embed images and css 11 | remarker $< -c custom.css > $@ 12 | -------------------------------------------------------------------------------- /slides/README.md: 
-------------------------------------------------------------------------------- 1 | # View slides 2 | 3 | ## On the .github.io website 4 | 5 | The general pattern is `https://inria.github.io/scikit-learn-mooc/slides/?file=[FILENAME].md` 6 | 7 | Example for ML concepts slides: 8 | https://inria.github.io/scikit-learn-mooc/slides/?file=ml_concepts.md 9 | 10 | ## Locally 11 | 12 | Useful when working on the slides: 13 | 14 | ```py 15 | # on the root repo folder 16 | python -m http.server 17 | 18 | # open your browser with the right port (from previous command) using the right md file 19 | firefox 'http://localhost:8000/slides/index.html?file=../slides/ml_concepts.md' 20 | ``` 21 | -------------------------------------------------------------------------------- /slides/Ubuntu/Ubuntu-Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-Bold.ttf -------------------------------------------------------------------------------- /slides/Ubuntu/Ubuntu-BoldItalic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-BoldItalic.ttf -------------------------------------------------------------------------------- /slides/Ubuntu/Ubuntu-Italic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-Italic.ttf -------------------------------------------------------------------------------- /slides/Ubuntu/Ubuntu-Light.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-Light.ttf 
-------------------------------------------------------------------------------- /slides/Ubuntu/Ubuntu-LightItalic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-LightItalic.ttf -------------------------------------------------------------------------------- /slides/Ubuntu/Ubuntu-Medium.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-Medium.ttf -------------------------------------------------------------------------------- /slides/Ubuntu/Ubuntu-MediumItalic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-MediumItalic.ttf -------------------------------------------------------------------------------- /slides/Ubuntu/Ubuntu-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu/Ubuntu-Regular.ttf -------------------------------------------------------------------------------- /slides/Ubuntu_Mono/UbuntuMono-Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu_Mono/UbuntuMono-Bold.ttf -------------------------------------------------------------------------------- /slides/Ubuntu_Mono/UbuntuMono-BoldItalic.ttf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu_Mono/UbuntuMono-BoldItalic.ttf -------------------------------------------------------------------------------- /slides/Ubuntu_Mono/UbuntuMono-Italic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu_Mono/UbuntuMono-Italic.ttf -------------------------------------------------------------------------------- /slides/Ubuntu_Mono/UbuntuMono-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/ee3bf0ee7997e0a2b46439e0559dd5882f25727e/slides/Ubuntu_Mono/UbuntuMono-Regular.ttf -------------------------------------------------------------------------------- /slides/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Presentation 5 | 6 | 7 | 8 | 9 | 10 | 12 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /slides/intro_words.md: -------------------------------------------------------------------------------- 1 | 2 | Hi, 3 | 4 | Welcome to the Machine-learning with scikit-learn course. The goal of 5 | this course is to teach you practical aspects of machine learning. It 6 | focuses on tabular data, given that such data is often encountered in the 7 | industry. The course is light on maths, and focuses on practical aspects, 8 | not only about pure machine learning, but also about the basics of data 9 | preparation and visualization for machine learning. Most of the content is 10 | centered on executable Python code that teaches how to analyse the data, 11 | with tools such as scikit-learn. 12 | 13 | Our goal is to be didactic. If you know Python programming and basic 14 | numerics, you should be able to follow along. 
We hope that this course 15 | will help introduce more people to machine learning. 16 | --------------------------------------------------------------------------------