├── .nojekyll
├── images
    ├── check_env-1.png
    ├── check_env-2.png
    └── download-repo.png
├── slides
    ├── images
    │   ├── PDSH.png
    │   ├── esl.png
    │   ├── imlp.png
    │   ├── l1_kink.png
    │   ├── logit.png
    │   ├── api-table.png
    │   ├── kfold_cv.png
    │   ├── l1l2ball.png
    │   ├── l2_l1_l0.png
    │   ├── ovr_lines.png
    │   ├── svm_or_lr.png
    │   ├── ames_scaling.png
    │   ├── binary_loss.png
    │   ├── gradient_2d.png
    │   ├── gradient_3d.png
    │   ├── group_kfold.png
    │   ├── max_depth_1.png
    │   ├── max_depth_4.png
    │   ├── max_margin.png
    │   ├── no_pruning.png
    │   ├── ram_prices.png
    │   ├── sklearn-docs.png
    │   ├── sklearn_logo.png
    │   ├── time_series1.png
    │   ├── time_series2.png
    │   ├── time_series3.png
    │   ├── tree_pruned.png
    │   ├── triazine_bar.png
    │   ├── average_voting.png
    │   ├── boston_scaling.png
    │   ├── feature_sample.png
    │   ├── grid_ccp_alpha.png
    │   ├── grid_max_depth.png
    │   ├── instability_1.png
    │   ├── instability_2.png
    │   ├── max_margin_C_1.png
    │   ├── mpl_tree_plot.png
    │   ├── ovr_boundaries.png
    │   ├── pruning_alpha.png
    │   ├── random_forest.png
    │   ├── splits_kinect.png
    │   ├── stratified_cv.png
    │   ├── time_series_cv.png
    │   ├── bias_vs_variance.png
    │   ├── binning_quantiles.png
    │   ├── bootstrap_sample.png
    │   ├── elasticnet_search.png
    │   ├── grad_boost_depth2.png
    │   ├── grad_boost_term_1.png
    │   ├── grad_boost_term_2.png
    │   ├── grad_boost_term_3.png
    │   ├── graphviz_jupyter.png
    │   ├── graphviz_source.png
    │   ├── knn_boundary_k1.png
    │   ├── knn_boundary_k3.png
    │   ├── l1l2_elasticnet.png
    │   ├── max_leaf_nodes_8.png
    │   ├── max_margin_C_0.1.png
    │   ├── ram_prices_test.png
    │   ├── ram_prices_train.png
    │   ├── robust_regression.png
    │   ├── shuffle_split_cv.png
    │   ├── supervised-ml-api.png
    │   ├── threefold_split.png
    │   ├── train-test-split.png
    │   ├── tree_illustration.png
    │   ├── tree_importances.png
    │   ├── tree_prediction.png
    │   ├── voting_classifier.png
    │   ├── forest_importances.png
    │   ├── grid_max_leaf_nodes.png
    │   ├── gridsearch_workflow.png
    │   ├── l1l2ball_intersect.png
    │   ├── lasso_alpha_search.png
    │   ├── lasso_coefficients.png
    │   ├── ridge_alpha_search.png
    │   ├── ridge_coefficients.png
    │   ├── xgboost_hist_bench.png
    │   ├── ames_housing_scatter.png
    │   ├── boston_housing_scatter.png
    │   ├── cross_validation_new.png
    │   ├── hist_gradient_boosting.png
    │   ├── knn_boundary_dataset.png
    │   ├── knn_boundary_varying_k.png
    │   ├── knn_model_complexity.png
    │   ├── lasso_alpha_triazine.png
    │   ├── linear_boundary_vector.png
    │   ├── linear_regression_1d.png
    │   ├── logreg_regularization.png
    │   ├── lr_coefficients_large.png
    │   ├── matrix-representation.png
    │   ├── min_samples_split_50.png
    │   ├── ridge_alpha_triazine.png
    │   ├── ridge_alpha_triazines.png
    │   ├── ridge_learning_curve.png
    │   ├── supervised-ml-workflow.png
    │   ├── train_test_split_new.png
    │   ├── xgboost_sklearn_bench.png
    │   ├── gradient_learning_rates.png
    │   ├── grid_search_n_neighbors.png
    │   ├── knn_boundary_test_points.png
    │   ├── ridge_alpha_search_poly.png
    │   ├── ridge_coefficients_alpha.png
    │   ├── ridge_coefficients_large.png
    │   ├── grad_boost_regression_steps.png
    │   ├── linear_svm_regularization.png
    │   ├── repeated_stratified_kfold.png
    │   ├── ridge_alpha_search_cv_runs.png
    │   ├── time_series_walk_forward_cv.png
    │   ├── train_test_validation_split.png
    │   ├── tree_building_iteration_1.png
    │   ├── tree_building_iteration_2.png
    │   ├── tree_building_iteration_9.png
    │   ├── grid_search_cross_validation.png
    │   ├── overfitting_validation_set_1.png
    │   ├── overfitting_validation_set_2.png
    │   ├── overfitting_validation_set_3.png
    │   ├── overfitting_validation_set_4.png
    │   ├── grid_search_cross_validation_new.png
    │   ├── train_test_set_2d_classification.png
    │   ├── overfitting_underfitting_cartoon_full.png
    │   ├── overfitting_underfitting_cartoon_train.png
    │   └── overfitting_underfitting_cartoon_generalization.png
    ├── 01-reminder-supervised-learning.html
    ├── style.css
    ├── 04-linear-models-classification.html
    ├── 06-gradient-boosting.html
    ├── 03-linear-models-regression.html
    └── 02-cross-validation-grid-search.html
├── .gitignore
├── notebooks
    ├── solutions
    │   ├── grid_search_k_neighbors.py
    │   ├── bike_regression.py
    │   ├── adult_classification.py
    │   └── linear_models_diabetes.py
    ├── 04 - Linear Models for Classification.ipynb
    ├── 06 - Gradient Boosting.ipynb
    ├── data
    │   ├── bank-campaign-desc.text
    │   └── ram_price.csv
    ├── 01 - Review of Supervised Learning.ipynb
    ├── 02 - Cross-validation and Grid Search.ipynb
    ├── 05 - Trees.ipynb
    └── 03 - Linear Models for Regression.ipynb
├── LICENSE
├── check_env.ipynb
└── README.md


/.nojekyll:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/images/check_env-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/images/check_env-1.png


--------------------------------------------------------------------------------
/images/check_env-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/images/check_env-2.png


--------------------------------------------------------------------------------
/slides/images/PDSH.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/PDSH.png


--------------------------------------------------------------------------------
/slides/images/esl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/esl.png


--------------------------------------------------------------------------------
/slides/images/imlp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/imlp.png


--------------------------------------------------------------------------------
/images/download-repo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/images/download-repo.png


--------------------------------------------------------------------------------
/slides/images/l1_kink.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/l1_kink.png


--------------------------------------------------------------------------------
/slides/images/logit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/logit.png


--------------------------------------------------------------------------------
/slides/images/api-table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/api-table.png


--------------------------------------------------------------------------------
/slides/images/kfold_cv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/kfold_cv.png


--------------------------------------------------------------------------------
/slides/images/l1l2ball.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/l1l2ball.png


--------------------------------------------------------------------------------
/slides/images/l2_l1_l0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/l2_l1_l0.png


--------------------------------------------------------------------------------
/slides/images/ovr_lines.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ovr_lines.png


--------------------------------------------------------------------------------
/slides/images/svm_or_lr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/svm_or_lr.png


--------------------------------------------------------------------------------
/slides/images/ames_scaling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ames_scaling.png


--------------------------------------------------------------------------------
/slides/images/binary_loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/binary_loss.png


--------------------------------------------------------------------------------
/slides/images/gradient_2d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/gradient_2d.png


--------------------------------------------------------------------------------
/slides/images/gradient_3d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/gradient_3d.png


--------------------------------------------------------------------------------
/slides/images/group_kfold.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/group_kfold.png


--------------------------------------------------------------------------------
/slides/images/max_depth_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/max_depth_1.png


--------------------------------------------------------------------------------
/slides/images/max_depth_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/max_depth_4.png


--------------------------------------------------------------------------------
/slides/images/max_margin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/max_margin.png


--------------------------------------------------------------------------------
/slides/images/no_pruning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/no_pruning.png


--------------------------------------------------------------------------------
/slides/images/ram_prices.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ram_prices.png


--------------------------------------------------------------------------------
/slides/images/sklearn-docs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/sklearn-docs.png


--------------------------------------------------------------------------------
/slides/images/sklearn_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/sklearn_logo.png


--------------------------------------------------------------------------------
/slides/images/time_series1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/time_series1.png


--------------------------------------------------------------------------------
/slides/images/time_series2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/time_series2.png


--------------------------------------------------------------------------------
/slides/images/time_series3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/time_series3.png


--------------------------------------------------------------------------------
/slides/images/tree_pruned.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/tree_pruned.png


--------------------------------------------------------------------------------
/slides/images/triazine_bar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/triazine_bar.png


--------------------------------------------------------------------------------
/slides/images/average_voting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/average_voting.png


--------------------------------------------------------------------------------
/slides/images/boston_scaling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/boston_scaling.png


--------------------------------------------------------------------------------
/slides/images/feature_sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/feature_sample.png


--------------------------------------------------------------------------------
/slides/images/grid_ccp_alpha.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grid_ccp_alpha.png


--------------------------------------------------------------------------------
/slides/images/grid_max_depth.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grid_max_depth.png


--------------------------------------------------------------------------------
/slides/images/instability_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/instability_1.png


--------------------------------------------------------------------------------
/slides/images/instability_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/instability_2.png


--------------------------------------------------------------------------------
/slides/images/max_margin_C_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/max_margin_C_1.png


--------------------------------------------------------------------------------
/slides/images/mpl_tree_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/mpl_tree_plot.png


--------------------------------------------------------------------------------
/slides/images/ovr_boundaries.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ovr_boundaries.png


--------------------------------------------------------------------------------
/slides/images/pruning_alpha.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/pruning_alpha.png


--------------------------------------------------------------------------------
/slides/images/random_forest.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/random_forest.png


--------------------------------------------------------------------------------
/slides/images/splits_kinect.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/splits_kinect.png


--------------------------------------------------------------------------------
/slides/images/stratified_cv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/stratified_cv.png


--------------------------------------------------------------------------------
/slides/images/time_series_cv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/time_series_cv.png


--------------------------------------------------------------------------------
/slides/images/bias_vs_variance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/bias_vs_variance.png


--------------------------------------------------------------------------------
/slides/images/binning_quantiles.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/binning_quantiles.png


--------------------------------------------------------------------------------
/slides/images/bootstrap_sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/bootstrap_sample.png


--------------------------------------------------------------------------------
/slides/images/elasticnet_search.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/elasticnet_search.png


--------------------------------------------------------------------------------
/slides/images/grad_boost_depth2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grad_boost_depth2.png


--------------------------------------------------------------------------------
/slides/images/grad_boost_term_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grad_boost_term_1.png


--------------------------------------------------------------------------------
/slides/images/grad_boost_term_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grad_boost_term_2.png


--------------------------------------------------------------------------------
/slides/images/grad_boost_term_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grad_boost_term_3.png


--------------------------------------------------------------------------------
/slides/images/graphviz_jupyter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/graphviz_jupyter.png


--------------------------------------------------------------------------------
/slides/images/graphviz_source.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/graphviz_source.png


--------------------------------------------------------------------------------
/slides/images/knn_boundary_k1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/knn_boundary_k1.png


--------------------------------------------------------------------------------
/slides/images/knn_boundary_k3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/knn_boundary_k3.png


--------------------------------------------------------------------------------
/slides/images/l1l2_elasticnet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/l1l2_elasticnet.png


--------------------------------------------------------------------------------
/slides/images/max_leaf_nodes_8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/max_leaf_nodes_8.png


--------------------------------------------------------------------------------
/slides/images/max_margin_C_0.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/max_margin_C_0.1.png


--------------------------------------------------------------------------------
/slides/images/ram_prices_test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ram_prices_test.png


--------------------------------------------------------------------------------
/slides/images/ram_prices_train.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ram_prices_train.png


--------------------------------------------------------------------------------
/slides/images/robust_regression.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/robust_regression.png


--------------------------------------------------------------------------------
/slides/images/shuffle_split_cv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/shuffle_split_cv.png


--------------------------------------------------------------------------------
/slides/images/supervised-ml-api.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/supervised-ml-api.png


--------------------------------------------------------------------------------
/slides/images/threefold_split.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/threefold_split.png


--------------------------------------------------------------------------------
/slides/images/train-test-split.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/train-test-split.png


--------------------------------------------------------------------------------
/slides/images/tree_illustration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/tree_illustration.png


--------------------------------------------------------------------------------
/slides/images/tree_importances.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/tree_importances.png


--------------------------------------------------------------------------------
/slides/images/tree_prediction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/tree_prediction.png


--------------------------------------------------------------------------------
/slides/images/voting_classifier.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/voting_classifier.png


--------------------------------------------------------------------------------
/slides/images/forest_importances.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/forest_importances.png


--------------------------------------------------------------------------------
/slides/images/grid_max_leaf_nodes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grid_max_leaf_nodes.png


--------------------------------------------------------------------------------
/slides/images/gridsearch_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/gridsearch_workflow.png


--------------------------------------------------------------------------------
/slides/images/l1l2ball_intersect.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/l1l2ball_intersect.png


--------------------------------------------------------------------------------
/slides/images/lasso_alpha_search.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/lasso_alpha_search.png


--------------------------------------------------------------------------------
/slides/images/lasso_coefficients.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/lasso_coefficients.png


--------------------------------------------------------------------------------
/slides/images/ridge_alpha_search.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_alpha_search.png


--------------------------------------------------------------------------------
/slides/images/ridge_coefficients.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_coefficients.png


--------------------------------------------------------------------------------
/slides/images/xgboost_hist_bench.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/xgboost_hist_bench.png


--------------------------------------------------------------------------------
/slides/images/ames_housing_scatter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ames_housing_scatter.png


--------------------------------------------------------------------------------
/slides/images/boston_housing_scatter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/boston_housing_scatter.png


--------------------------------------------------------------------------------
/slides/images/cross_validation_new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/cross_validation_new.png


--------------------------------------------------------------------------------
/slides/images/hist_gradient_boosting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/hist_gradient_boosting.png


--------------------------------------------------------------------------------
/slides/images/knn_boundary_dataset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/knn_boundary_dataset.png


--------------------------------------------------------------------------------
/slides/images/knn_boundary_varying_k.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/knn_boundary_varying_k.png


--------------------------------------------------------------------------------
/slides/images/knn_model_complexity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/knn_model_complexity.png


--------------------------------------------------------------------------------
/slides/images/lasso_alpha_triazine.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/lasso_alpha_triazine.png


--------------------------------------------------------------------------------
/slides/images/linear_boundary_vector.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/linear_boundary_vector.png


--------------------------------------------------------------------------------
/slides/images/linear_regression_1d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/linear_regression_1d.png


--------------------------------------------------------------------------------
/slides/images/logreg_regularization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/logreg_regularization.png


--------------------------------------------------------------------------------
/slides/images/lr_coefficients_large.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/lr_coefficients_large.png


--------------------------------------------------------------------------------
/slides/images/matrix-representation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/matrix-representation.png


--------------------------------------------------------------------------------
/slides/images/min_samples_split_50.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/min_samples_split_50.png


--------------------------------------------------------------------------------
/slides/images/ridge_alpha_triazine.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_alpha_triazine.png


--------------------------------------------------------------------------------
/slides/images/ridge_alpha_triazines.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_alpha_triazines.png


--------------------------------------------------------------------------------
/slides/images/ridge_learning_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_learning_curve.png


--------------------------------------------------------------------------------
/slides/images/supervised-ml-workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/supervised-ml-workflow.png


--------------------------------------------------------------------------------
/slides/images/train_test_split_new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/train_test_split_new.png


--------------------------------------------------------------------------------
/slides/images/xgboost_sklearn_bench.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/xgboost_sklearn_bench.png


--------------------------------------------------------------------------------
/slides/images/gradient_learning_rates.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/gradient_learning_rates.png


--------------------------------------------------------------------------------
/slides/images/grid_search_n_neighbors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grid_search_n_neighbors.png


--------------------------------------------------------------------------------
/slides/images/knn_boundary_test_points.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/knn_boundary_test_points.png


--------------------------------------------------------------------------------
/slides/images/ridge_alpha_search_poly.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_alpha_search_poly.png


--------------------------------------------------------------------------------
/slides/images/ridge_coefficients_alpha.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_coefficients_alpha.png


--------------------------------------------------------------------------------
/slides/images/ridge_coefficients_large.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_coefficients_large.png


--------------------------------------------------------------------------------
/slides/images/grad_boost_regression_steps.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grad_boost_regression_steps.png


--------------------------------------------------------------------------------
/slides/images/linear_svm_regularization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/linear_svm_regularization.png


--------------------------------------------------------------------------------
/slides/images/repeated_stratified_kfold.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/repeated_stratified_kfold.png


--------------------------------------------------------------------------------
/slides/images/ridge_alpha_search_cv_runs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_alpha_search_cv_runs.png


--------------------------------------------------------------------------------
/slides/images/time_series_walk_forward_cv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/time_series_walk_forward_cv.png


--------------------------------------------------------------------------------
/slides/images/train_test_validation_split.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/train_test_validation_split.png


--------------------------------------------------------------------------------
/slides/images/tree_building_iteration_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/tree_building_iteration_1.png


--------------------------------------------------------------------------------
/slides/images/tree_building_iteration_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/tree_building_iteration_2.png


--------------------------------------------------------------------------------
/slides/images/tree_building_iteration_9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/tree_building_iteration_9.png


--------------------------------------------------------------------------------
/slides/images/grid_search_cross_validation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grid_search_cross_validation.png


--------------------------------------------------------------------------------
/slides/images/overfitting_validation_set_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/overfitting_validation_set_1.png


--------------------------------------------------------------------------------
/slides/images/overfitting_validation_set_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/overfitting_validation_set_2.png


--------------------------------------------------------------------------------
/slides/images/overfitting_validation_set_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/overfitting_validation_set_3.png


--------------------------------------------------------------------------------
/slides/images/overfitting_validation_set_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/overfitting_validation_set_4.png


--------------------------------------------------------------------------------
/slides/images/grid_search_cross_validation_new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grid_search_cross_validation_new.png


--------------------------------------------------------------------------------
/slides/images/train_test_set_2d_classification.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/train_test_set_2d_classification.png


--------------------------------------------------------------------------------
/slides/images/overfitting_underfitting_cartoon_full.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/overfitting_underfitting_cartoon_full.png


--------------------------------------------------------------------------------
/slides/images/overfitting_underfitting_cartoon_train.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/overfitting_underfitting_cartoon_train.png


--------------------------------------------------------------------------------
/slides/images/overfitting_underfitting_cartoon_generalization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/overfitting_underfitting_cartoon_generalization.png


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # exclude datasets and externals
 2 | notebooks/datasets
 3 | notebooks/joblib/
 4 | 
 5 | # exclude temporary files
 6 | .ipynb_checkpoints
 7 | .DS_Store
 8 | gmon.out
 9 | __pycache__
10 | *.pyc
11 | *.o
12 | *.so
13 | *.gcno
14 | *.swp
15 | *.egg-info
16 | *.egg
17 | *~
18 | build
19 | dist
20 | lib/test
21 | doc/_build
22 | *env
23 | *ENV
24 | .idea
25 | 


--------------------------------------------------------------------------------
/notebooks/solutions/grid_search_k_neighbors.py:
--------------------------------------------------------------------------------
 1 | from sklearn.neighbors import KNeighborsClassifier
 2 | # NOTE: GridSearchCV, plt and the X/y train/test splits are not defined in this snippet; presumably the loading notebook provides them.
 3 | param_grid = {'n_neighbors': [1, 3, 5, 7, 10]}
 4 | # cross-validated search over n_neighbors; training scores are kept so both curves can be plotted below
 5 | grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid,
 6 |                     return_train_score=True)
 7 | grid.fit(X_train, y_train)
 8 | 
 9 | print("best parameters: %s" % grid.best_params_)
10 | print("Training set accuracy: %s" % grid.score(X_train, y_train))
11 | print("Test set accuracy: %s" % grid.score(X_test, y_test))
12 | results = grid.cv_results_  # per-candidate cross-validation results
13 | plt.plot(param_grid['n_neighbors'], results['mean_train_score'], label="train")
14 | plt.plot(param_grid['n_neighbors'], results['mean_test_score'], label="test")  # "test" here is the mean cross-validation score, not X_test
15 | plt.legend()
16 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Andreas Mueller
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/notebooks/solutions/bike_regression.py:
--------------------------------------------------------------------------------
 1 | data = pd.read_csv("data/bike_day_raw.csv")
 2 | X = data.drop("cnt", axis=1)
 3 | y = data.cnt
 4 | 
 5 | display(data.head())
 6 | 
 7 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
 8 | 
 9 | from sklearn.linear_model import LinearRegression
10 | 
11 | # for other models you should scale here
12 | 
13 | lr = LinearRegression().fit(X_train, y_train)
14 | 
15 | print(lr.score(X_train, y_train))
16 | 
17 | print(lr.score(X_test, y_test))
18 | 
19 | from sklearn.metrics import mean_squared_error
20 | y_pred = lr.predict(X_test)
21 | print(mean_squared_error(y_test, y_pred))
22 | 
23 | 
24 | from sklearn.compose import make_column_transformer
25 | from sklearn.preprocessing import OneHotEncoder
26 | ohe = make_column_transformer(
27 |     (OneHotEncoder(sparse=False), X_train.columns[:6]),
28 |     remainder='passthrough')
29 | 
30 | X_train_ohe = ohe.fit_transform(X_train)
31 | X_test_ohe = ohe.transform(X_test)
32 | 
33 | X_train.shape
34 | 
35 | X_train_ohe.shape
36 | 
37 | 
38 | lr = LinearRegression().fit(X_train_ohe, y_train)
39 | 
40 | print(lr.score(X_train_ohe, y_train))
41 | 
42 | print(lr.score(X_test_ohe, y_test))
43 | 
44 | from sklearn.metrics import mean_squared_error
45 | y_pred = lr.predict(X_test_ohe)
46 | 


--------------------------------------------------------------------------------
/notebooks/04 - Linear Models for Classification.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {},
 6 |    "source": [
 7 |     "# Linear Models for Classification"
 8 |    ]
 9 |   },
10 |   {
11 |    "cell_type": "markdown",
12 |    "metadata": {},
13 |    "source": [
14 |     "# Exercise\n",
15 |     "Load and preprocess the adult data as before.\n",
16 |     "Include dummy encoding and scaling.\n",
17 |     "Learn a logistic regression model and visualize the coefficients.\n",
18 |     "Then grid-search the regularization parameter C.\n",
19 |     "Compare the coefficients of the best model with the coefficients of a model with more regularization."
20 |    ]
21 |   },
22 |   {
23 |    "cell_type": "code",
24 |    "execution_count": null,
25 |    "metadata": {},
26 |    "outputs": [],
27 |    "source": [
28 |     "import pandas as pd\n",
29 |     "adult = pd.read_csv(\"data/adult.csv\", index_col=0)"
30 |    ]
31 |   },
32 |   {
33 |    "cell_type": "code",
34 |    "execution_count": null,
35 |    "metadata": {},
36 |    "outputs": [],
37 |    "source": [
38 |     "# %load solutions/adult_classification.py"
39 |    ]
40 |   }
41 |  ],
42 |  "metadata": {
43 |   "anaconda-cloud": {},
44 |   "kernelspec": {
45 |    "display_name": "Python 3",
46 |    "language": "python",
47 |    "name": "python3"
48 |   },
49 |   "language_info": {
50 |    "codemirror_mode": {
51 |     "name": "ipython",
52 |     "version": 3
53 |    },
54 |    "file_extension": ".py",
55 |    "mimetype": "text/x-python",
56 |    "name": "python",
57 |    "nbconvert_exporter": "python",
58 |    "pygments_lexer": "ipython3",
59 |    "version": "3.7.3"
60 |   }
61 |  },
62 |  "nbformat": 4,
63 |  "nbformat_minor": 4
64 | }
65 | 


--------------------------------------------------------------------------------
/notebooks/solutions/adult_classification.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | data = pd.read_csv("data/adult.csv", index_col=0)
 3 | data.head()
 4 | 
 5 | income = data.income
 6 | data_features = data.drop("income", axis=1)
 7 | 
 8 | display(data_features.head())
 9 | 
10 | 
11 | ### one hot encode data
12 | data_one_hot = pd.get_dummies(data_features)
13 | data_one_hot.head()
14 | 
15 | 
16 | ### Preprocessing
17 | from sklearn.preprocessing import StandardScaler
18 | from sklearn.model_selection import train_test_split
19 | X_train, X_test, y_train, y_test = train_test_split(data_one_hot, income)
20 | 
21 | scaler = StandardScaler().fit(X_train)
22 | X_train_scaled = scaler.transform(X_train)
23 | 
24 | ### Cross-validation with default parameters
25 | from sklearn.model_selection import cross_val_score
26 | from sklearn.linear_model import LogisticRegression
27 | 
28 | scores = cross_val_score(LogisticRegression(), X_train_scaled, y_train)
29 | print(scores.mean())
30 | 
31 | 
32 | ### do grid search
33 | 
34 | import numpy as np
35 | 
36 | param_grid = {'C': np.logspace(-3, 3, 7)}
37 | param_grid
38 | 
39 | from sklearn.model_selection import GridSearchCV
40 | grid = GridSearchCV(LogisticRegression(), param_grid,
41 |                     return_train_score=True)
42 | 
43 | grid.fit(X_train_scaled, y_train)
44 | 
45 | grid.best_params_
46 | grid.best_score_
47 | 
48 | # some visualization
49 | 
50 | import pandas as pd
51 | %matplotlib inline
52 | res = pd.DataFrame(grid.cv_results_)
53 | res.mean_test_score.plot()
54 | res.mean_train_score.plot()
55 | import matplotlib.pyplot as plt
56 | plt.xscale("log")
57 | 
58 | grid.score(X_test, y_test)
59 | 
60 | important = np.argsort(np.abs(grid.best_estimator_.coef_)).ravel()
61 | 
62 | plt.barh(range(10), grid.best_estimator_.coef_.ravel()[important[-10:]])
63 | plt.yticks(range(10), X_train.columns[important[-10:]]);
64 | 


--------------------------------------------------------------------------------
/check_env.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": [
 9 |     "from distutils.version import LooseVersion as Version\n",
10 |     "import sys\n",
11 |     "\n",
12 |     "\n",
13 |     "OK = '\\x1b[42m[ OK ]\\x1b[0m'\n",
14 |     "FAIL = \"\\x1b[41m[FAIL]\\x1b[0m\"\n",
15 |     "\n",
16 |     "try:\n",
17 |     "    import importlib\n",
18 |     "except ImportError:\n",
19 |     "    print(FAIL, \"Python version 3.5 is required,\"\n",
20 |     "                \" but %s is installed.\" % sys.version)\n",
21 |     "\n",
22 |     "    \n",
23 |     "def import_version(pkg, min_ver, fail_msg=\"\"):\n",
24 |     "    mod = None\n",
25 |     "    try:\n",
26 |     "        mod = importlib.import_module(pkg)\n",
27 |     "        ver = mod.__version__\n",
28 |     "        if Version(ver) < min_ver:\n",
29 |     "            print(FAIL, \"%s version %s or higher required, but %s installed.\"\n",
30 |     "                  % (pkg, min_ver, ver))\n",
31 |     "        else:\n",
32 |     "            print(OK, '%s version %s' % (pkg, ver))\n",
33 |     "    except ImportError:\n",
34 |     "        print(FAIL, '%s not installed. %s' % (pkg, fail_msg))\n",
35 |     "    return mod\n",
36 |     "\n",
37 |     "\n",
38 |     "# first check the python version\n",
39 |     "print('Using python in', sys.prefix)\n",
40 |     "print(sys.version)\n",
41 |     "pyversion = Version(sys.version)\n",
42 |     "if pyversion < \"3.5\":\n",
43 |     "    print(FAIL, \"Python version 3.5 is required,\"\n",
44 |     "                \" but %s is installed.\" % sys.version)\n",
45 |     "print()\n",
46 |     "requirements = {'numpy': \"1.6.1\", 'scipy': \"1.0\", 'matplotlib': \"2.0\",\n",
47 |     "                'IPython': \"3.0\", 'sklearn': \"0.22.1\", 'pandas': \"0.18\"}\n",
48 |     "\n",
49 |     "# now the dependencies\n",
50 |     "for lib, required_version in list(requirements.items()):\n",
51 |     "    import_version(lib, required_version)"
52 |    ]
53 |   }
54 |  ],
55 |  "metadata": {
56 |   "anaconda-cloud": {},
57 |   "kernelspec": {
58 |    "display_name": "Python 3",
59 |    "language": "python",
60 |    "name": "python3"
61 |   },
62 |   "language_info": {
63 |    "codemirror_mode": {
64 |     "name": "ipython",
65 |     "version": 3
66 |    },
67 |    "file_extension": ".py",
68 |    "mimetype": "text/x-python",
69 |    "name": "python",
70 |    "nbconvert_exporter": "python",
71 |    "pygments_lexer": "ipython3",
72 |    "version": "3.7.3"
73 |   }
74 |  },
75 |  "nbformat": 4,
76 |  "nbformat_minor": 4
77 | }
78 | 


--------------------------------------------------------------------------------
/slides/01-reminder-supervised-learning.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 |   <head>
  4 |     <title>Introduction to Supervised Learning</title>
  5 |     <meta charset="utf-8">
  6 |     <link rel="stylesheet" href="style.css">
  7 |     <style>
  8 |       @import url(https://fonts.googleapis.com/css?family=Garamond);
  9 |       @import url(https://fonts.googleapis.com/css?family=Muli:400,700,400italic);
 10 |       @import url(https://fonts.googleapis.com/css?family=Ubuntu+Mono:400,700,400italic);
 11 |     </style>
 12 |   </head>
 13 |   <body>
 14 |     <textarea id="source">
 15 | 
 16 | class: center, middle
 17 | 
 18 | ![:scale 40%](images/sklearn_logo.png)
 19 | 
 20 | 
 21 | ### Intermediate Machine learning with scikit-learn
 22 | 
 23 | # Reminder: Scikit-learn API
 24 | 
 25 | Andreas C. Müller
 26 | 
 27 | Columbia University, scikit-learn
 28 | 
 29 | .smaller[https://github.com/amueller/ml-workshop-2-of-4]
 30 | 
 31 | 
 32 | ---
 33 | 
 34 | class: center
 35 | 
 36 | # scikit-learn documentation
 37 | ![:scale 60%](images/sklearn-docs.png)
 38 | 
 39 | <a href="http://scikit-learn.org/" style="color:black; font-size:50px; text-decoration:None" >scikit-learn.org</a>
 40 | 
 41 | ---
 42 | 
 43 | # Other Resources
 44 | 
 45 | .center[
 46 | ![:scale 25%](images/PDSH.png)&nbsp;&nbsp;&nbsp;
 47 | ![:scale 25%](images/imlp.png)&nbsp;&nbsp;&nbsp;
 48 | ![:scale 25%](images/esl.png)
 49 | ]
 50 | Lecture: http://www.cs.columbia.edu/~amueller/comsw4995s19/schedule/
 51 | 
 52 | https://www.youtube.com/andreasmueller
 53 | 
 54 | Videos and more slides!
 55 | 
 56 | ---
 57 | class: center
 58 | 
 59 | # Representing Data
 60 | 
 61 | ![:scale 100%](images/matrix-representation.png)
 62 | ---
 63 | class: center
 64 | 
 65 | # Training and Test Data
 66 | 
 67 | ![:scale 80%](images/train-test-split.png)
 68 | 
 69 | ---
 70 | class: center
 71 | 
 72 | # Supervised ML Workflow
 73 | 
 74 | ![:scale 100%](images/supervised-ml-api.png)
 75 | 
 76 | ---
 77 | # KNN with scikit-learn
 78 | 
 79 | ```python
 80 | from sklearn.model_selection import train_test_split
 81 | X_train, X_test, y_train, y_test = train_test_split(X, y)
 82 | 
 83 | from sklearn.neighbors import KNeighborsClassifier
 84 | knn = KNeighborsClassifier(n_neighbors=1)
 85 | knn.fit(X_train, y_train)
 86 | print("accuracy: ", knn.score(X_test, y_test))
 87 | y_pred = knn.predict(X_test)
 88 | ```
 89 | accuracy: 0.77
 90 | 
 91 | ???
 92 | 
 93 | ---
 94 | class: center, middle
 95 | 
 96 | # Scikit-Learn API Summary
 97 | 
 98 | ![:scale 80%](images/api-table.png)
 99 | 
100 | ---
101 | class: center, middle
102 | 
103 | 
104 | # Notebook: Review of Supervised learning
105 | 
106 |     </textarea>
107 |     <script src="https://remarkjs.com/downloads/remark-latest.min.js"></script>
108 |     <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
109 | 
110 |     <script>
111 |     // Config Remark
112 |     remark.macros['scale'] = function (percentage) {  // slide macro, used in the markdown as ![:scale 40%](images/foo.png)
113 |         var url = this;  // the macro is invoked with `this` set to the link target, used below as the image src
114 |         return '<img src="' + url + '" style="width: ' + percentage + '" />';  // percentage is inserted verbatim as the CSS width
115 |     };
116 |     config_remark = {
117 |         highlightStyle: 'github',
118 |         highlightSpans: true,
119 |         highlightLines: true,
120 |         ratio: "16:9"
121 |     };
122 |       var slideshow = remark.create(config_remark);
123 | 
124 |     // Configure MathJax
125 |     MathJax.Hub.Config({
126 |     tex2jax: {
127 |         inlineMath: [['$','$'], ['\\(','\\)']],
128 |         skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] /* removed 'code' entry*/
129 |     }
130 |     });
131 |     MathJax.Hub.Queue(function() {
132 |         var all = MathJax.Hub.getAllJax(), i;
133 |         for(i = 0; i < all.length; i += 1) {
134 |             all[i].SourceElement().parentNode.className += ' has-jax';
135 |         }
136 |     });
137 |     </script>
138 |   </body>
139 | </html>
140 | 


--------------------------------------------------------------------------------
/notebooks/solutions/linear_models_diabetes.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | from sklearn.linear_model import Lasso, Ridge, LinearRegression
 5 | from sklearn.model_selection import train_test_split, cross_val_score
 6 | from sklearn.datasets import load_diabetes
 7 | 
 8 | diabetes = load_diabetes()
 9 | 
10 | # create dataframe for easy boxplot
11 | df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
12 | df.boxplot()
13 | # histogram of the regression target, drawn in its own figure
14 | plt.figure()
15 | plt.title("Target distribution")
16 | plt.hist(diabetes.target, bins="auto")
17 | # hold out a test split (default split size; no random_state is passed, so results vary between runs)
18 | X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
19 |                                                     diabetes.target)
20 | # 10-fold cross-validation on the unscaled training data with default hyper-parameters
21 | scores_lr = cross_val_score(LinearRegression(), X_train, y_train, cv=10)
22 | print("Linear regression score:", scores_lr.mean())
23 | scores_ridge = cross_val_score(Ridge(), X_train, y_train, cv=10)
24 | print("Ridge Regression score:", scores_ridge.mean())
25 | 
26 | # With scaled data
27 | from sklearn.preprocessing import StandardScaler
28 | scaler = StandardScaler().fit(X_train)
29 | X_train_scaled = scaler.transform(X_train)
30 | X_test_scaled = scaler.transform(X_test)
31 | # same comparison as above, now on the standardized features
32 | scores_lr = cross_val_score(LinearRegression(), X_train_scaled, y_train, cv=10)
33 | print("Linear regression w/ scaling:", scores_lr.mean())
34 | scores_ridge = cross_val_score(Ridge(), X_train_scaled, y_train, cv=10)
35 | print("Ridge regression w/ scaling:", scores_ridge.mean())
36 | # tune Ridge's alpha over 7 log-spaced values from 1e-3 to 1e3
37 | from sklearn.model_selection import GridSearchCV
38 | param_grid = {'alpha': np.logspace(-3, 3, 7)}
39 | grid = GridSearchCV(Ridge(), param_grid, cv=10, return_train_score=True)
40 | grid.fit(X_train_scaled, y_train)
41 | # plot mean train and mean cross-validation score against alpha on a log x-axis
42 | res = pd.DataFrame(grid.cv_results_)
43 | res.plot("param_alpha", ["mean_train_score", "mean_test_score"], logx=True)
44 | plt.title("Ridge grid search")
45 | 
46 | 
47 | print(grid.best_params_, grid.best_score_)
48 | # unregularized fit on the same scaled data, used in the coefficient comparisons below
49 | lr = LinearRegression().fit(X_train_scaled, y_train)
50 | 
51 | plt.figure()
52 | plt.title("Coefficients LR vs Ridge")
53 | plt.hlines(0, 0, X_train.shape[1], linewidth=.5)
54 | plt.plot(grid.best_estimator_.coef_, 'o', label="Ridge({})".format(grid.best_params_['alpha']))
55 | plt.plot(lr.coef_, 'o', label="LR", alpha=.6)
56 | plt.legend()
57 | # repeat the alpha search for Lasso, kept in a separate variable so the Ridge search stays available
58 | from sklearn.model_selection import GridSearchCV
59 | param_grid = {'alpha': np.logspace(-3, 3, 7)}
60 | grid_lasso = GridSearchCV(Lasso(), param_grid, cv=10, return_train_score=True)
61 | grid_lasso.fit(X_train_scaled, y_train)
62 | 
63 | res = pd.DataFrame(grid_lasso.cv_results_)
64 | res.plot("param_alpha", ["mean_train_score", "mean_test_score"], logx=True)
65 | plt.title("Lasso grid search")
66 | print(grid_lasso.best_params_, grid_lasso.best_score_)
67 | # coefficients of the best Ridge, the best Lasso and plain linear regression in one figure
68 | plt.figure()
69 | plt.title("coefficients")
70 | plt.hlines(0, 0, X_train.shape[1], linewidth=.5)
71 | plt.plot(grid.best_estimator_.coef_, 'o', label="Ridge({})".format(grid.best_params_['alpha']))
72 | plt.plot(grid_lasso.best_estimator_.coef_, 'o', label="Lasso({})".format(grid_lasso.best_params_['alpha']))
73 | plt.plot(lr.coef_, 'o', label="LR", alpha=.6)
74 | plt.legend()
75 | # expand the scaled features with PolynomialFeatures (default degree, bias column omitted)
76 | from sklearn.preprocessing import PolynomialFeatures
77 | poly = PolynomialFeatures(include_bias=False)
78 | 
79 | X_train_poly = poly.fit_transform(X_train_scaled)
80 | X_test_poly = poly.transform(X_test_scaled)
81 | # cross-validate both models on the polynomial features
82 | scores_lr = cross_val_score(LinearRegression(), X_train_poly, y_train, cv=10)
83 | print("Linear regression poly features:", scores_lr.mean())
84 | scores_ridge = cross_val_score(Ridge(), X_train_poly, y_train, cv=10)
85 | print("Ridge regression poly features:", scores_ridge.mean())
86 | # re-tune Ridge on the polynomial features; note that this rebinds `grid` and `res`
87 | from sklearn.model_selection import GridSearchCV
88 | param_grid = {'alpha': np.logspace(-3, 3, 7)}
89 | grid = GridSearchCV(Ridge(), param_grid, cv=10, return_train_score=True)
90 | grid.fit(X_train_poly, y_train)
91 | 
92 | res = pd.DataFrame(grid.cv_results_)
93 | res.plot("param_alpha", ["mean_train_score", "mean_test_score"], logx=True)
94 | plt.title("Ridge grid search with polynomial features")
95 | 
96 | 
97 | print(grid.best_params_, grid.best_score_)
98 | # score with polynomial features is worse!


--------------------------------------------------------------------------------
/slides/style.css:
--------------------------------------------------------------------------------
  1 | body {
  2 | font-family: 'Muli';
  3 | font-size: 140%;
  4 | }
  5 | h1, h2 {
  6 | font-family: 'Garamond';
  7 | font-weight: normal;
  8 | margin-top: 10px;
  9 | margin-bottom: 10px;
 10 | }
 11 | .remark-slide-content h1 {
 12 | font-size: 70px;
 13 | text-align: center;
 14 | }
 15 | .remark-slide-content p, .remark-slide-content li {
 16 | font-size:30px;
 17 | line-height: 1.4;
 18 | }
 19 | .remark-code {
 20 | font-size:30px;
 21 | }
 22 | .remark-slide-content p {
 23 |     margin: 5px;
 24 | }
 25 | .remark-slide-container .spacious p,
 26 | .remark-slide-container .spacious li{
 27 |     margin-bottom: 50px;
 28 |     margin-top: 50px;
 29 | }
 30 | .remark-slide-container .spacious h1{
 31 |     margin-bottom: 50px;
 32 | }
 33 | .remark-slide-container .some-space p,
 34 | .remark-slide-container .some-space li,
 35 | .remark-slide-container .some-space h1{
 36 |     margin-bottom: 30px;
 37 | }
 38 | .reset-column {
 39 |     overflow: auto;
 40 |     width: 100%;
 41 | }
 42 | .remark-slide-container .compact p, .remark-slide-container .compact li, .remark-slide-container .compact pre{
 43 | line-height: 1.1;
 44 | margin: 0px 0;
 45 | }
 46 | .remark-slide-container .compact .MathJax_Display{
 47 |     line-height: 1.1;
 48 |     margin: 1px 0;
 49 |     }
 50 | .remark-slide-container .compact h1{
 51 | margin-bottom: 3px;
 52 | }
 53 | .padding-top {
 54 |     padding-top: 100px;
 55 | }
 56 | .remark-slide-content .smaller p, .remark-slide-content .smaller p .MathJax, .remark-slide-content .smaller li,
 57 | .remark-slide-content .smaller .remark-code,  .smaller .remark-code-line,.remark-slide-content .smaller a,
 58 |  .remark-slide-content .smaller .dataframe{
 59 |     font-size: 25px;
 60 | }
 61 | 
 62 | .remark-slide-content .smallest p, .remark-slide-content .smallest .MathJax, .remark-slide-content .smallest li, .remark-slide-content .smallest .remark-code,
 63 | .smallest .remark-code-line, .remark-slide-content .smallest .dataframe, .remark-slide-content  span.smallest{
 64 | font-size: 20px;
 65 | }
 66 | .remark-slide-content .tiny p, .remark-slide-content .tiny li, .remark-slide-content .tiny .remark-code,
 67 | .tiny .remark-code-line, .remark-slide-content .tiny .dataframe{
 68 | font-size: 16px;
 69 | }
 70 | .normal {
 71 |     font-size: 30px;
 72 | }
 73 | .quote_author {
 74 |     display: block;
 75 |     text-align: right;
 76 |     margin-top: 20px;
 77 |     font-size: 30px;
 78 |     font-family: 'Garamond';
 79 | }
 80 | .larger, .larger .remark-code {
 81 |     font-size: 40px;
 82 | }
 83 | .largest, .largest .remark-code {
 84 |     font-size: 50px;
 85 | }
 86 | .left-column, .right-column {
 87 |     width: 48%;
 88 | }
 89 | .right-column{
 90 |     float: right;
 91 | }
 92 | .left-column{
 93 |     float: left;
 94 | }
 95 | .clear-column{
 96 |     clear: both;
 97 | }
 98 | .narrow-right-column {
 99 |     float: right;
100 |     width: 32%
101 | }
102 | .wide-left-column {
103 |     float: left;
104 |     width: 65%
105 | }
106 | .narrow-left-column {
107 |     float: left;
108 |     width: 32%
109 | }
110 | .wide-right-column {
111 |     float: right;
112 |     width: 65%
113 | }
114 | 
115 | .invisible {
116 |     visibility: hidden
117 | }
118 | .tiny-code .remark-code, .tiny-code .remark-inline-code, .remark-inline-code .tiny-code{
119 | font-size: 15px; /* shrink fenced and inline code nested inside a .tiny-code element; the reversed selector is kept only for backward compatibility */
120 | }
121 | .remark-code, .remark-inline-code  { font-family: 'Ubuntu Mono';}
122 | .hljs.remark-code {background: #e0e0e0}
123 | 
124 | /* Some additional styling taken from the Jupyter notebook CSS */
125 | table.dataframe {
126 | border: none;
127 | border-collapse: collapse;
128 | border-spacing: 0;
129 | color: black;
130 | table-layout: fixed;
131 | }
132 | table.dataframe thead {
133 | border-bottom: 1px solid black;
134 | vertical-align: bottom;
135 | }
136 | table.dataframe tr,
137 | table.dataframe th,
138 | table.dataframe td {
139 | text-align: right;
140 | vertical-align: middle;
141 | padding: 0.5em 0.5em;
142 | line-height: normal;
143 | white-space: normal;
144 | max-width: none;
145 | border: none;
146 | }
147 | table.dataframe th {
148 | font-weight: bold;
149 | }
150 | table.dataframe tbody tr:nth-child(odd) {
151 | background: #f5f5f5;
152 | }
153 | table.dataframe tbody tr:hover {
154 | background: rgba(66, 165, 245, 0.2);
155 | }


--------------------------------------------------------------------------------
/notebooks/06 - Gradient Boosting.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import numpy as np\n",
 10 |     "import matplotlib.pyplot as plt\n",
 11 |     "import pandas as pd\n",
 12 |     "import sklearn\n",
 13 |     "sklearn.set_config(print_changed_only=True)"
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "markdown",
 18 |    "metadata": {},
 19 |    "source": [
 20 |     "# Gradient Boosting"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": null,
 26 |    "metadata": {},
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
 30 |     "from sklearn.datasets import load_breast_cancer\n",
 31 |     "from sklearn.model_selection import train_test_split\n",
 32 |     "cancer = load_breast_cancer()\n",
 33 |     "\n",
 34 |     "X_train, X_test, y_train, y_test = train_test_split(\n",
 35 |     "    cancer.data, cancer.target, random_state=0)\n",
 36 |     "\n",
 37 |     "gbrt = GradientBoostingClassifier(random_state=0)\n",
 38 |     "gbrt.fit(X_train, y_train)\n",
 39 |     "\n",
 40 |     "print(\"accuracy on training set: %f\" % gbrt.score(X_train, y_train))\n",
 41 |     "print(\"accuracy on test set: %f\" % gbrt.score(X_test, y_test))"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": null,
 47 |    "metadata": {},
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "gbrt = GradientBoostingClassifier(random_state=0, max_depth=1)\n",
 51 |     "gbrt.fit(X_train, y_train)\n",
 52 |     "\n",
 53 |     "print(\"accuracy on training set: %f\" % gbrt.score(X_train, y_train))\n",
 54 |     "print(\"accuracy on test set: %f\" % gbrt.score(X_test, y_test))"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": null,
 60 |    "metadata": {},
 61 |    "outputs": [],
 62 |    "source": [
 63 |     "gbrt = GradientBoostingClassifier(random_state=0, learning_rate=0.01)\n",
 64 |     "gbrt.fit(X_train, y_train)\n",
 65 |     "\n",
 66 |     "print(\"accuracy on training set: %f\" % gbrt.score(X_train, y_train))\n",
 67 |     "print(\"accuracy on test set: %f\" % gbrt.score(X_test, y_test))"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "gbrt = GradientBoostingClassifier(random_state=0, max_depth=1)\n",
 77 |     "gbrt.fit(X_train, y_train)\n",
 78 |     "\n",
 79 |     "plt.barh(range(cancer.data.shape[1]), gbrt.feature_importances_)\n",
 80 |     "plt.yticks(range(cancer.data.shape[1]), cancer.feature_names);\n",
 81 |     "ax = plt.gca()\n",
 82 |     "ax.set_position([0.4, .2, .9, .9])"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": null,
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "from xgboost import XGBClassifier\n",
 92 |     "xgb = XGBClassifier()\n",
 93 |     "xgb.fit(X_train, y_train)\n",
 94 |     "print(\"accuracy on training set: %f\" % xgb.score(X_train, y_train))\n",
 95 |     "print(\"accuracy on test set: %f\" % xgb.score(X_test, y_test))"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": null,
101 |    "metadata": {},
102 |    "outputs": [],
103 |    "source": [
104 |     "from xgboost import XGBClassifier\n",
105 |     "xgb = XGBClassifier(n_estimators=1000)\n",
106 |     "xgb.fit(X_train, y_train)\n",
107 |     "print(\"accuracy on training set: %f\" % xgb.score(X_train, y_train))\n",
108 |     "print(\"accuracy on test set: %f\" % xgb.score(X_test, y_test))"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "markdown",
113 |    "metadata": {},
114 |    "source": [
115 |     "# Exercise\n",
116 |     "Use GradientBoostingRegressor on the Bike dataset.\n",
117 |     "Search over the ``learning_rate`` and ``max_depth`` using ``GridSearchCV``.\n",
118 |     "What happens if you change ``n_estimators``?\n",
119 |     "\n",
120 |     "Compare the speed of XGBRegressor with GradientBoostingRegressor. How well does XGBRegressor do with defaults on the ``Bike`` dataset? Can you make it do better?"
121 |    ]
122 |   }
123 |  ],
124 |  "metadata": {
125 |   "anaconda-cloud": {},
126 |   "kernelspec": {
127 |    "display_name": "Python 3",
128 |    "language": "python",
129 |    "name": "python3"
130 |   },
131 |   "language_info": {
132 |    "codemirror_mode": {
133 |     "name": "ipython",
134 |     "version": 3
135 |    },
136 |    "file_extension": ".py",
137 |    "mimetype": "text/x-python",
138 |    "name": "python",
139 |    "nbconvert_exporter": "python",
140 |    "pygments_lexer": "ipython3",
141 |    "version": "3.7.3"
142 |   }
143 |  },
144 |  "nbformat": 4,
145 |  "nbformat_minor": 4
146 | }
147 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | Intermediate Machine Learning with scikit-learn
  2 | ========================================================
  3 | 
  4 | Part 2 of 4
  5 | -----------
  6 | Other parts:
  7 | - [Part 1](https://github.com/amueller/ml-workshop-1-of-4)
  8 | - [Part 3](https://github.com/amueller/ml-workshop-3-of-4)
  9 | - [Part 4](https://github.com/amueller/ml-workshop-4-of-4)
 10 | 
 11 | 
 12 | Content
 13 | -------
 14 | - [Reminder on supervised learning](https://amueller.github.io/ml-workshop-2-of-4/slides/01-reminder-supervised-learning.html)
 15 | - [Grid search and cross-validation](https://amueller.github.io/ml-workshop-2-of-4/slides/02-cross-validation-grid-search.html)
 16 | - [Linear models for regression](https://amueller.github.io/ml-workshop-2-of-4/slides/03-linear-models-regression.html)
 17 | - [Linear models for classification](https://amueller.github.io/ml-workshop-2-of-4/slides/04-linear-models-classification.html)
 18 | - [Decision trees](https://amueller.github.io/ml-workshop-2-of-4/slides/05-trees-forests.html)
 19 | - [Random Forests](https://amueller.github.io/ml-workshop-2-of-4/slides/05-trees-forests.html#26)
 20 | 
 21 | 
 22 | Instructor
 23 | -----------
 24 | 
 25 | - [Andreas Mueller](http://amueller.github.io) [@amuellerml](https://twitter.com/amuellerml) - Columbia University; [Book: Introduction to Machine Learning with Python](http://shop.oreilly.com/product/0636920030515.do)
 26 | 
 27 | ---
 28 | 
 29 | This repository will contain the teaching material and other info associated
 30 | with the "Intermediate Machine Learning with scikit-learn" course.
 31 | 
 32 | About the workshop
 33 | ------------------
 34 | Scikit-learn is a machine learning library in Python, that has become a
 35 | valuable tool for many data science practitioners. This workshop will go beyond
 36 | the basics and show how to effectively evaluate and tune algorithms. We will
 37 | also discuss the most important machine learning algorithms that you're likely
 38 | to see in practice, how and when to use them, and some details about how they
 39 | work internally. The session will focus on linear models for classification and
 40 | regression and tree-based models, including random forests.
 41 | 
 42 | Prerequisites
 43 | -------------
 44 | This workshop assumes familiarity with Jupyter notebooks and basics of pandas, matplotlib and numpy.
 45 | It also assumes familiarity with the basics of supervised learning, like training and test data and basics of model evaluation.
 46 | You should have built a model with scikit-learn (or attended Introduction to Machine Learning with scikit-learn) before
 47 | taking this workshop.
 48 | 
 49 | Obtaining the Tutorial Material
 50 | --------------------------------
 51 | 
 52 | 
 53 | If you are familiar with git, it is most convenient if you clone the GitHub repository. This
 54 | is highly encouraged as it allows you to easily synchronize any changes to the material.
 55 | 
 56 | ```
 57 | git clone https://github.com/amueller/ml-workshop-2-of-4.git
 58 | ```
 59 | 
 60 | If you are not familiar with git, you can download the repository as a .zip file by heading over to the GitHub repository (https://github.com/amueller/ml-workshop-2-of-4) in your browser and clicking the green “Download” button in the upper right.
 61 | 
 62 | ![](images/download-repo.png)
 63 | 
 64 | Please note that I may add to and improve the material until shortly before the tutorial session, and we recommend that you update your copy of the materials one day before the tutorial. If you have a GitHub account and forked/cloned the repository via GitHub, you can sync your existing fork via the following command:
 65 | 
 66 | ```
 67 | git pull origin master
 68 | ```
 69 | 
 70 | 
 71 | Installation Notes
 72 | ------------------
 73 | 
 74 | This tutorial will require recent installations of
 75 | 
 76 | - [NumPy](http://www.numpy.org)
 77 | - [SciPy](http://www.scipy.org)
 78 | - [matplotlib](http://matplotlib.org)
 79 | - [pillow](https://python-pillow.org)
 80 | - [pandas](http://pandas.pydata.org)
 81 | - [scikit-learn](http://scikit-learn.org/stable/) (>=0.22.1)
 82 | - [IPython](http://ipython.readthedocs.org/en/stable/)
 83 | - [Jupyter Notebook](http://jupyter.org)
 84 | 
 85 | The last one is important, you should be able to type:
 86 | 
 87 |     jupyter notebook
 88 | 
 89 | in your terminal window and see the notebook panel load in your web browser.
 90 | Try opening and running a notebook from the material to check that it works.
 91 | 
 92 | For users who do not yet have these  packages installed, a relatively
 93 | painless way to install all the requirements is to use a Python distribution
 94 | such as [Anaconda](https://www.continuum.io/downloads), which includes
 95 | the most relevant Python packages for science, math, engineering, and
 96 | data analysis; Anaconda can be downloaded and installed for free
 97 | including commercial use and redistribution.
 98 | The code examples in this tutorial require Python 3.5 or later.
 99 | 
100 | After obtaining the material, we **strongly recommend** that you open and execute
101 | the Jupyter Notebook `check_env.ipynb` that is located at the
102 | top level of this repository. You can open the notebook
103 | by executing
104 | 
105 | ```bash
106 | jupyter notebook check_env.ipynb
107 | ```
108 | 
109 | inside this repository. Inside the Notebook, you can run the code cell by
110 | clicking on the "Run Cells" button as illustrated in the figure below:
111 | 
112 | ![](images/check_env-1.png)
113 | 
114 | 
115 | Finally, if your environment satisfies the requirements for the tutorials, the executed code cell will produce an output message as shown below:
116 | 
117 | ![](images/check_env-2.png)
118 | 


--------------------------------------------------------------------------------
/notebooks/data/bank-campaign-desc.text:
--------------------------------------------------------------------------------
 1 | Citation Request:
 2 |   This dataset is publicly available for research. The details are described in [Moro et al., 2014]. 
 3 |   Please include this citation if you plan to use this database:
 4 | 
 5 |   [Moro et al., 2014] S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, In press, http://dx.doi.org/10.1016/j.dss.2014.03.001
 6 | 
 7 |   Available at: [pdf] http://dx.doi.org/10.1016/j.dss.2014.03.001
 8 |                 [bib] http://www3.dsi.uminho.pt/pcortez/bib/2014-dss.txt
 9 | 
10 | 1. Title: Bank Marketing (with social/economic context)
11 | 
12 | 2. Sources
13 |    Created by: Sérgio Moro (ISCTE-IUL), Paulo Cortez (Univ. Minho) and Paulo Rita (ISCTE-IUL) @ 2014
14 |    
15 | 3. Past Usage:
16 | 
17 |   The full dataset (bank-additional-full.csv) was described and analyzed in:
18 | 
19 |   S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems (2014), doi:10.1016/j.dss.2014.03.001.
20 |  
21 | 4. Relevant Information:
22 | 
23 |    This dataset is based on "Bank Marketing" UCI dataset (please check the description at: http://archive.ics.uci.edu/ml/datasets/Bank+Marketing).
24 |    The data is enriched by the addition of five new social and economic features/attributes (national wide indicators from a ~10M population country), published by the Banco de Portugal and publicly available at: https://www.bportugal.pt/estatisticasweb.
25 |    This dataset is almost identical to the one used in [Moro et al., 2014] (it does not include all attributes due to privacy concerns). 
26 |    Using the rminer package and R tool (http://cran.r-project.org/web/packages/rminer/), we found that the addition of the five new social and economic attributes (made available here) leads to substantial improvement in the prediction of a success, even when the duration of the call is not included. Note: the file can be read in R using: d=read.table("bank-additional-full.csv",header=TRUE,sep=";")
27 |    
28 |    The zip file includes two datasets: 
29 |       1) bank-additional-full.csv with all examples, ordered by date (from May 2008 to November 2010).
30 |       2) bank-additional.csv with 10% of the examples (4119), randomly selected from bank-additional-full.csv.
31 |    The smaller dataset is provided to test more computationally demanding machine learning algorithms (e.g., SVM).
32 | 
33 |    The binary classification goal is to predict if the client will subscribe a bank term deposit (variable y).
34 | 
35 | 5. Number of Instances: 41188 for bank-additional-full.csv
36 | 
37 | 6. Number of Attributes: 20 + output attribute.
38 | 
39 | 7. Attribute information:
40 | 
41 |    For more information, read [Moro et al., 2014].
42 | 
43 |    Input variables:
44 |    # bank client data:
45 |    1 - age (numeric)
46 |    2 - job : type of job (categorical: "admin.","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown")
47 |    3 - marital : marital status (categorical: "divorced","married","single","unknown"; note: "divorced" means divorced or widowed)
48 |    4 - education (categorical: "basic.4y","basic.6y","basic.9y","high.school","illiterate","professional.course","university.degree","unknown")
49 |    5 - default: has credit in default? (categorical: "no","yes","unknown")
50 |    6 - housing: has housing loan? (categorical: "no","yes","unknown")
51 |    7 - loan: has personal loan? (categorical: "no","yes","unknown")
52 |    # related with the last contact of the current campaign:
53 |    8 - contact: contact communication type (categorical: "cellular","telephone") 
54 |    9 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")
55 |   10 - day_of_week: last contact day of the week (categorical: "mon","tue","wed","thu","fri")
56 |   11 - duration: last contact duration, in seconds (numeric). Important note:  this attribute highly affects the output target (e.g., if duration=0 then y="no"). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
57 |    # other attributes:
58 |   12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
59 |   13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
60 |   14 - previous: number of contacts performed before this campaign and for this client (numeric)
61 |   15 - poutcome: outcome of the previous marketing campaign (categorical: "failure","nonexistent","success")
62 |    # social and economic context attributes
63 |   16 - emp.var.rate: employment variation rate - quarterly indicator (numeric)
64 |   17 - cons.price.idx: consumer price index - monthly indicator (numeric)     
65 |   18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric)     
66 |   19 - euribor3m: euribor 3 month rate - daily indicator (numeric)
67 |   20 - nr.employed: number of employees - quarterly indicator (numeric)
68 | 
69 |   Output variable (desired target):
70 |   21 - y - has the client subscribed a term deposit? (binary: "yes","no")
71 | 
72 | 8. Missing Attribute Values: There are several missing values in some categorical attributes, all coded with the "unknown" label. These missing values can be treated as a possible class label or using deletion or imputation techniques. 
73 | 


--------------------------------------------------------------------------------
/notebooks/01 - Review of Supervised Learning.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Review of Supervised Learning with scikit-learn"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "import numpy as np\n",
 17 |     "import pandas as pd\n",
 18 |     "import matplotlib.pyplot as plt\n",
 19 |     "import sklearn\n",
 20 |     "sklearn.set_config(print_changed_only=True)"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": null,
 26 |    "metadata": {},
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "# read data.\n",
 30 |     "# you can find a description in data/bank-campaign-desc.text\n",
 31 |     "data = pd.read_csv(\"data/bank-campaign.csv\")"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": null,
 37 |    "metadata": {},
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "data.shape"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": null,
 46 |    "metadata": {},
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "data.columns"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "data.head()"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": null,
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "y = data.target"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "X = data.drop(\"target\", axis=1)"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": null,
 82 |    "metadata": {},
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "X.shape"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": null,
 91 |    "metadata": {},
 92 |    "outputs": [],
 93 |    "source": [
 94 |     "y.shape"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": null,
100 |    "metadata": {},
101 |    "outputs": [],
102 |    "source": [
103 |     "y.head()"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": null,
109 |    "metadata": {},
110 |    "outputs": [],
111 |    "source": [
112 |     "data.target.value_counts()"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": null,
118 |    "metadata": {},
119 |    "outputs": [],
120 |    "source": [
121 |     "data.target.value_counts(normalize=True)"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "markdown",
126 |    "metadata": {},
127 |    "source": [
128 |     "Splitting the data:"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": null,
134 |    "metadata": {},
135 |    "outputs": [],
136 |    "source": [
137 |     "from sklearn.model_selection import train_test_split\n",
138 |     "X_train, X_test, y_train, y_test = train_test_split(\n",
139 |     "    X, y, test_size=.2, random_state=42, stratify=y)"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": null,
145 |    "metadata": {},
146 |    "outputs": [],
147 |    "source": [
148 |     "np.sum(y_train == \"yes\") / len(y_train)"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": null,
154 |    "metadata": {},
155 |    "outputs": [],
156 |    "source": [
157 |     "np.sum(y_test == \"yes\") / len(y_test)"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": null,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": []
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": null,
170 |    "metadata": {},
171 |    "outputs": [],
172 |    "source": [
173 |     "# import model\n",
174 |     "from sklearn.linear_model import LogisticRegression\n",
175 |     "# instantiate model, set parameters\n",
176 |     "lr = LogisticRegression(C=0.1, max_iter=1000)\n",
177 |     "# fit model\n",
178 |     "lr.fit(X_train, y_train)\n",
179 |     "lr.coef_"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "markdown",
184 |    "metadata": {},
185 |    "source": [
186 |     "Make predictions:"
187 |    ]
188 |   },
189 |   {
190 |    "cell_type": "code",
191 |    "execution_count": null,
192 |    "metadata": {},
193 |    "outputs": [],
194 |    "source": [
195 |     "lr.score(X_train, y_train)"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "code",
200 |    "execution_count": null,
201 |    "metadata": {},
202 |    "outputs": [],
203 |    "source": [
204 |     "(y_train == \"no\").mean()"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": null,
210 |    "metadata": {},
211 |    "outputs": [],
212 |    "source": [
213 |     "lr.score(X_test, y_test)"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "markdown",
218 |    "metadata": {},
219 |    "source": [
220 |     "# https://github.com/amueller/ml-workshop-2-of-4"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "markdown",
225 |    "metadata": {},
226 |    "source": [
227 |     "# Exercise\n",
228 |     "Load the dataset ``data/bike_day_raw.csv``, which has the regression target ``cnt``.\n",
229 |     "This dataset is hourly bike rentals in the citybike platform. The ``cnt`` column is the number of rentals, which we want to predict from date and weather data.\n",
230 |     "\n",
231 |     "Split the data into a training and a test set using ``train_test_split``.\n",
232 |     "Use the ``LinearRegression`` class to learn a regression model on this data. You can evaluate with the ``score`` method, which provides the $R^2$ or using the ``mean_squared_error`` function from ``sklearn.metrics`` (or write it yourself in numpy)."
233 |    ]
234 |   },
235 |   {
236 |    "cell_type": "code",
237 |    "execution_count": null,
238 |    "metadata": {},
239 |    "outputs": [],
240 |    "source": [
241 |     "# %load solutions/bike_regression.py"
242 |    ]
243 |   }
244 |  ],
245 |  "metadata": {
246 |   "anaconda-cloud": {},
247 |   "kernelspec": {
248 |    "display_name": "Python 3",
249 |    "language": "python",
250 |    "name": "python3"
251 |   },
252 |   "language_info": {
253 |    "codemirror_mode": {
254 |     "name": "ipython",
255 |     "version": 3
256 |    },
257 |    "file_extension": ".py",
258 |    "mimetype": "text/x-python",
259 |    "name": "python",
260 |    "nbconvert_exporter": "python",
261 |    "pygments_lexer": "ipython3",
262 |    "version": "3.7.6"
263 |   }
264 |  },
265 |  "nbformat": 4,
266 |  "nbformat_minor": 4
267 | }
268 | 


--------------------------------------------------------------------------------
/notebooks/02 - Cross-validation and Grid Search.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Cross-validation"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "import matplotlib.pyplot as plt\n",
 17 |     "import numpy as np\n",
 18 |     "import sklearn\n",
 19 |     "sklearn.set_config(print_changed_only=True)"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": null,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "from sklearn.datasets import load_digits\n",
 29 |     "from sklearn.model_selection import train_test_split\n",
 30 |     "\n",
 31 |     "digits = load_digits()\n",
 32 |     "X_train, X_test, y_train, y_test = train_test_split(\n",
 33 |     "    digits.data, digits.target)"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": null,
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "from sklearn.model_selection import cross_val_score\n",
 43 |     "from sklearn.neighbors import KNeighborsClassifier"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": null,
 49 |    "metadata": {},
 50 |    "outputs": [],
 51 |    "source": [
 52 |     "cross_val_score(KNeighborsClassifier(),\n",
 53 |     "                X_train, y_train, cv=5)"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": null,
 59 |    "metadata": {},
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "from sklearn.model_selection import KFold, RepeatedStratifiedKFold"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "code",
 67 |    "execution_count": null,
 68 |    "metadata": {},
 69 |    "outputs": [],
 70 |    "source": [
 71 |     "cross_val_score(KNeighborsClassifier(),\n",
 72 |     "                X_train, y_train, cv=KFold(n_splits=10, shuffle=True, random_state=42))"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": null,
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "cross_val_score(KNeighborsClassifier(),\n",
 82 |     "                X_train, y_train,\n",
 83 |     "                cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42))"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "markdown",
 88 |    "metadata": {},
 89 |    "source": [
 90 |     "Grid Searches\n",
 91 |     "================="
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "markdown",
 96 |    "metadata": {},
 97 |    "source": [
 98 |     "Grid search with built-in cross-validation"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": null,
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "from sklearn.model_selection import GridSearchCV\n",
108 |     "from sklearn.svm import SVC"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "markdown",
113 |    "metadata": {},
114 |    "source": [
115 |     "Define parameter grid:"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": null,
121 |    "metadata": {},
122 |    "outputs": [],
123 |    "source": [
124 |     "import numpy as np\n",
125 |     "\n",
126 |     "param_grid = {'C': 10. ** np.arange(-3, 3),\n",
127 |     "              'gamma' : 10. ** np.arange(-5, 0)}\n",
128 |     "\n",
129 |     "np.set_printoptions(suppress=True)\n",
130 |     "print(param_grid)"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": null,
136 |    "metadata": {},
137 |    "outputs": [],
138 |    "source": [
139 |     "grid_search = GridSearchCV(SVC(), param_grid, verbose=3)"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "markdown",
144 |    "metadata": {},
145 |    "source": [
146 |     "A GridSearchCV object behaves just like a normal classifier."
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": null,
152 |    "metadata": {},
153 |    "outputs": [],
154 |    "source": [
155 |     "grid_search.fit(X_train, y_train)"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": null,
161 |    "metadata": {},
162 |    "outputs": [],
163 |    "source": [
164 |     "grid_search.predict(X_test)"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": null,
170 |    "metadata": {},
171 |    "outputs": [],
172 |    "source": [
173 |     "grid_search.score(X_test, y_test)"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": null,
179 |    "metadata": {},
180 |    "outputs": [],
181 |    "source": [
182 |     "grid_search.best_params_"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": null,
188 |    "metadata": {},
189 |    "outputs": [],
190 |    "source": [
191 |     "grid_search.best_score_"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": null,
197 |    "metadata": {},
198 |    "outputs": [],
199 |    "source": [
200 |     "grid_search.best_estimator_"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": null,
206 |    "metadata": {},
207 |    "outputs": [],
208 |    "source": [
209 |     "# We extract just the scores\n",
210 |     "\n",
211 |     "scores = grid_search.cv_results_['mean_test_score']\n",
212 |     "scores = np.array(scores).reshape(6, 5)\n",
213 |     "\n",
214 |     "plt.matshow(scores)\n",
215 |     "plt.xlabel('gamma')\n",
216 |     "plt.ylabel('C')\n",
217 |     "plt.colorbar()\n",
218 |     "plt.xticks(np.arange(5), param_grid['gamma'])\n",
219 |     "plt.yticks(np.arange(6), param_grid['C']);"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "markdown",
224 |    "metadata": {},
225 |    "source": [
226 |     "# Exercises\n",
227 |     "Use GridSearchCV to adjust n_neighbors of KNeighborsClassifier."
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "code",
232 |    "execution_count": null,
233 |    "metadata": {},
234 |    "outputs": [],
235 |    "source": [
236 |     "# %load solutions/grid_search_k_neighbors.py"
237 |    ]
238 |   }
239 |  ],
240 |  "metadata": {
241 |   "anaconda-cloud": {},
242 |   "kernelspec": {
243 |    "display_name": "Python 3",
244 |    "language": "python",
245 |    "name": "python3"
246 |   },
247 |   "language_info": {
248 |    "codemirror_mode": {
249 |     "name": "ipython",
250 |     "version": 3
251 |    },
252 |    "file_extension": ".py",
253 |    "mimetype": "text/x-python",
254 |    "name": "python",
255 |    "nbconvert_exporter": "python",
256 |    "pygments_lexer": "ipython3",
257 |    "version": "3.7.3"
258 |   }
259 |  },
260 |  "nbformat": 4,
261 |  "nbformat_minor": 4
262 | }
263 | 


--------------------------------------------------------------------------------
/notebooks/data/ram_price.csv:
--------------------------------------------------------------------------------
  1 | ,date,price
  2 | 0,1957.0,411041792.0
  3 | 1,1959.0,67947725.0
  4 | 2,1960.0,5242880.0
  5 | 3,1965.0,2642412.0
  6 | 4,1970.0,734003.0
  7 | 5,1973.0,399360.0
  8 | 6,1974.0,314573.0
  9 | 7,1975.0,421888.0
 10 | 8,1975.08,180224.0
 11 | 9,1975.25,67584.0
 12 | 10,1975.75,49920.0
 13 | 11,1976.0,40704.0
 14 | 12,1976.17,48960.0
 15 | 13,1976.42,23040.0
 16 | 14,1976.58,32000.0
 17 | 15,1977.08,36800.0
 18 | 16,1978.17,28000.0
 19 | 17,1978.25,29440.0
 20 | 18,1978.33,19200.0
 21 | 19,1978.5,24000.0
 22 | 20,1978.58,16000.0
 23 | 21,1978.75,15200.0
 24 | 22,1979.0,10528.0
 25 | 23,1979.75,6704.0
 26 | 24,1980.0,6480.0
 27 | 25,1981.0,8800.0
 28 | 26,1981.58,4479.0
 29 | 27,1982.0,3520.0
 30 | 28,1982.17,4464.0
 31 | 29,1982.67,1980.0
 32 | 30,1983.0,2396.0
 33 | 31,1983.67,1980.0
 34 | 32,1984.0,1379.0
 35 | 33,1984.58,1331.0
 36 | 34,1985.0,880.0
 37 | 35,1985.33,720.0
 38 | 36,1985.42,550.0
 39 | 37,1985.5,420.0
 40 | 38,1985.58,350.0
 41 | 39,1985.67,300.0
 42 | 40,1985.83,300.0
 43 | 41,1985.92,300.0
 44 | 42,1986.0,300.0
 45 | 43,1986.08,300.0
 46 | 44,1986.17,300.0
 47 | 45,1986.25,300.0
 48 | 46,1986.33,190.0
 49 | 47,1986.42,190.0
 50 | 48,1986.5,190.0
 51 | 49,1986.58,190.0
 52 | 50,1986.67,190.0
 53 | 51,1986.75,190.0
 54 | 52,1986.92,190.0
 55 | 53,1987.0,176.0
 56 | 54,1987.08,176.0
 57 | 55,1987.17,157.0
 58 | 56,1987.25,154.0
 59 | 57,1987.33,154.0
 60 | 58,1987.42,154.0
 61 | 59,1987.5,154.0
 62 | 60,1987.58,154.0
 63 | 61,1987.67,163.0
 64 | 62,1987.75,133.0
 65 | 63,1987.83,163.0
 66 | 64,1987.92,163.0
 67 | 65,1988.0,163.0
 68 | 66,1988.08,182.0
 69 | 67,1988.17,199.0
 70 | 68,1988.33,199.0
 71 | 69,1988.42,199.0
 72 | 70,1988.5,505.0
 73 | 71,1988.58,505.0
 74 | 72,1988.67,505.0
 75 | 73,1988.75,505.0
 76 | 74,1988.83,505.0
 77 | 75,1988.92,505.0
 78 | 76,1989.0,505.0
 79 | 77,1989.08,505.0
 80 | 78,1989.17,505.0
 81 | 79,1989.25,505.0
 82 | 80,1989.42,344.0
 83 | 81,1989.5,197.0
 84 | 82,1989.58,188.0
 85 | 83,1989.67,188.0
 86 | 84,1989.75,128.0
 87 | 85,1989.83,117.0
 88 | 86,1989.92,113.0
 89 | 87,1990.0,106.0
 90 | 88,1990.17,98.3
 91 | 89,1990.33,98.3
 92 | 90,1990.42,89.5
 93 | 91,1990.5,82.8
 94 | 92,1990.58,81.1
 95 | 93,1990.67,71.5
 96 | 94,1990.75,59.0
 97 | 95,1990.83,51.0
 98 | 96,1990.92,45.5
 99 | 97,1991.0,44.5
100 | 98,1991.08,44.5
101 | 99,1991.17,45.0
102 | 100,1991.25,45.0
103 | 101,1991.33,45.0
104 | 102,1991.42,43.8
105 | 103,1991.5,43.8
106 | 104,1991.58,41.3
107 | 105,1991.67,46.3
108 | 106,1991.75,45.0
109 | 107,1991.83,39.8
110 | 108,1991.92,39.8
111 | 109,1992.0,36.3
112 | 110,1992.08,36.3
113 | 111,1992.17,36.3
114 | 112,1992.25,34.8
115 | 113,1992.33,30.0
116 | 114,1992.42,32.5
117 | 115,1992.5,33.5
118 | 116,1992.58,31.0
119 | 117,1992.67,27.5
120 | 118,1992.75,26.3
121 | 119,1992.83,26.3
122 | 120,1992.92,26.3
123 | 121,1993.0,33.1
124 | 122,1993.08,27.5
125 | 123,1993.17,27.5
126 | 124,1993.25,27.5
127 | 125,1993.33,27.5
128 | 126,1993.42,30.0
129 | 127,1993.5,30.0
130 | 128,1993.58,30.0
131 | 129,1993.67,30.0
132 | 130,1993.75,36.0
133 | 131,1993.83,39.8
134 | 132,1993.92,35.8
135 | 133,1994.0,35.8
136 | 134,1994.08,35.8
137 | 135,1994.17,36.0
138 | 136,1994.25,37.3
139 | 137,1994.33,37.3
140 | 138,1994.42,37.3
141 | 139,1994.5,38.5
142 | 140,1994.58,37.0
143 | 141,1994.67,34.0
144 | 142,1994.75,33.5
145 | 143,1994.83,32.3
146 | 144,1994.92,32.3
147 | 145,1995.0,32.3
148 | 146,1995.08,32.0
149 | 147,1995.17,32.0
150 | 148,1995.25,31.2
151 | 149,1995.33,31.2
152 | 150,1995.42,31.1
153 | 151,1995.5,31.2
154 | 152,1995.58,30.6
155 | 153,1995.67,33.1
156 | 154,1995.75,33.1
157 | 155,1995.83,30.9
158 | 156,1995.92,30.9
159 | 157,1996.0,29.9
160 | 158,1996.08,28.8
161 | 159,1996.17,26.1
162 | 160,1996.25,24.7
163 | 161,1996.33,17.2
164 | 162,1996.42,14.9
165 | 163,1996.5,11.3
166 | 164,1996.58,9.06
167 | 165,1996.67,8.44
168 | 166,1996.75,8.0
169 | 167,1996.83,5.25
170 | 168,1996.92,5.25
171 | 169,1997.0,4.63
172 | 170,1997.08,3.63
173 | 171,1997.17,3.0
174 | 172,1997.25,3.0
175 | 173,1997.33,3.0
176 | 174,1997.42,3.69
177 | 175,1997.5,4.0
178 | 176,1997.58,4.13
179 | 177,1997.67,3.63
180 | 178,1997.75,3.41
181 | 179,1997.83,3.25
182 | 180,1997.92,2.16
183 | 181,1998.0,2.16
184 | 182,1998.08,0.91
185 | 183,1998.17,0.97
186 | 184,1998.25,1.22
187 | 185,1998.33,1.19
188 | 186,1998.42,0.97
189 | 187,1998.58,1.03
190 | 188,1998.67,0.97
191 | 189,1998.75,1.16
192 | 190,1998.83,0.84
193 | 191,1998.92,0.84
194 | 192,1999.08,1.44
195 | 193,1999.13,0.84
196 | 194,1999.17,1.25
197 | 195,1999.25,1.25
198 | 196,1999.33,0.86
199 | 197,1999.5,0.78
200 | 198,1999.67,0.87
201 | 199,1999.75,1.04
202 | 200,1999.83,1.34
203 | 201,1999.92,2.35
204 | 202,2000.0,1.56
205 | 203,2000.08,1.48
206 | 204,2000.17,1.08
207 | 205,2000.25,0.84
208 | 206,2000.33,0.7
209 | 207,2000.42,0.9
210 | 208,2000.5,0.77
211 | 209,2000.58,0.84
212 | 210,2000.67,1.07
213 | 211,2000.75,1.12
214 | 212,2000.83,1.12
215 | 213,2000.92,0.9
216 | 214,2001.0,0.75
217 | 215,2001.08,0.464
218 | 216,2001.17,0.464
219 | 217,2001.25,0.383
220 | 218,2001.33,0.387
221 | 219,2001.42,0.305
222 | 220,2001.5,0.352
223 | 221,2001.5,0.27
224 | 222,2001.58,0.191
225 | 223,2001.67,0.191
226 | 224,2001.75,0.169
227 | 225,2001.77,0.148
228 | 226,2002.08,0.134
229 | 227,2002.08,0.207
230 | 228,2002.25,0.193
231 | 229,2002.33,0.193
232 | 230,2002.42,0.33
233 | 231,2002.58,0.193
234 | 232,2002.75,0.193
235 | 233,2003.17,0.176
236 | 234,2003.25,0.076
237 | 235,2003.33,0.126
238 | 236,2003.42,0.115
239 | 237,2003.5,0.133
240 | 238,2003.58,0.129
241 | 239,2003.67,0.143
242 | 240,2003.75,0.148
243 | 241,2003.83,0.16
244 | 242,2003.99,0.166
245 | 243,2004.0,0.174
246 | 244,2004.08,0.148
247 | 245,2004.17,0.146
248 | 246,2004.33,0.156
249 | 247,2004.42,0.203
250 | 248,2004.5,0.176
251 | 249,2005.25,0.185
252 | 250,2005.42,0.149
253 | 251,2005.83,0.116
254 | 252,2005.92,0.185
255 | 253,2006.17,0.112
256 | 254,2006.33,0.073
257 | 255,2006.5,0.082
258 | 256,2006.67,0.073
259 | 257,2006.75,0.088
260 | 258,2006.83,0.098
261 | 259,2006.99,0.092
262 | 260,2007.0,0.082
263 | 261,2007.08,0.078
264 | 262,2007.17,0.066
265 | 263,2007.33,0.0464
266 | 264,2007.5,0.0386
267 | 265,2007.67,0.0351
268 | 266,2007.75,0.0322
269 | 267,2007.83,0.0244
270 | 268,2007.92,0.0244
271 | 269,2008.0,0.0232
272 | 270,2008.08,0.022
273 | 271,2008.33,0.022
274 | 272,2008.5,0.0207
275 | 273,2008.58,0.0176
276 | 274,2008.67,0.0146
277 | 275,2008.83,0.011
278 | 276,2008.92,0.0098
279 | 277,2009.0,0.0098
280 | 278,2009.08,0.0107
281 | 279,2009.25,0.0105
282 | 280,2009.42,0.0115
283 | 281,2009.5,0.011
284 | 282,2009.58,0.0127
285 | 283,2009.75,0.0183
286 | 284,2009.92,0.0205
287 | 285,2010.0,0.019
288 | 286,2010.08,0.0202
289 | 287,2010.17,0.0195
290 | 288,2010.33,0.0242
291 | 289,2010.5,0.021
292 | 290,2010.58,0.022
293 | 291,2010.75,0.0171
294 | 292,2010.83,0.0146
295 | 293,2010.92,0.0122
296 | 294,2011.0,0.01
297 | 295,2011.08,0.0103
298 | 296,2011.33,0.01
299 | 297,2011.42,0.0085
300 | 298,2011.67,0.0054
301 | 299,2011.75,0.0051
302 | 300,2012.0,0.0049
303 | 301,2012.08,0.0049
304 | 302,2012.25,0.005
305 | 303,2012.33,0.0049
306 | 304,2012.58,0.0048
307 | 305,2012.67,0.004
308 | 306,2012.83,0.0037
309 | 307,2013.0,0.0043
310 | 308,2013.08,0.0054
311 | 309,2013.33,0.0067
312 | 310,2013.42,0.0061
313 | 311,2013.58,0.0073
314 | 312,2013.67,0.0065
315 | 313,2013.75,0.0082
316 | 314,2013.83,0.0085
317 | 315,2013.92,0.0079
318 | 316,2014.08,0.0095
319 | 317,2014.17,0.0079
320 | 318,2014.25,0.0073
321 | 319,2014.42,0.0079
322 | 320,2014.58,0.0085
323 | 321,2014.67,0.0085
324 | 322,2014.83,0.0085
325 | 323,2015.0,0.0078
326 | 324,2015.08,0.0073
327 | 325,2015.25,0.0061
328 | 326,2015.33,0.0056
329 | 327,2015.5,0.0049
330 | 328,2015.58,0.0045
331 | 329,2015.67,0.0043
332 | 330,2015.75,0.0042
333 | 331,2015.83,0.0038
334 | 332,2015.92,0.0037
335 | 


--------------------------------------------------------------------------------
/notebooks/05 - Trees.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Trees"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "import numpy as np\n",
 17 |     "import matplotlib.pyplot as plt\n",
 18 |     "import sklearn\n",
 19 |     "sklearn.set_config(print_changed_only=True)\n",
 20 |     "import pandas as pd\n",
 21 |     "from sklearn.model_selection import train_test_split\n",
 22 |     "from sklearn.pipeline import make_pipeline\n",
 23 |     "from sklearn.preprocessing import scale, StandardScaler"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": null,
 29 |    "metadata": {},
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "from sklearn.datasets import load_breast_cancer\n",
 33 |     "cancer = load_breast_cancer()"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": null,
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "print(cancer.DESCR)"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "X_train, X_test, y_train, y_test = train_test_split(\n",
 52 |     "    cancer.data, cancer.target, stratify=cancer.target, random_state=0)"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "markdown",
 57 |    "metadata": {},
 58 |    "source": [
 59 |     "# tree visualization"
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "code",
 64 |    "execution_count": null,
 65 |    "metadata": {},
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "from sklearn.tree import DecisionTreeClassifier, plot_tree\n",
 69 |     "tree = DecisionTreeClassifier(max_depth=2)\n",
 70 |     "tree.fit(X_train, y_train)"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": null,
 76 |    "metadata": {},
 77 |    "outputs": [],
 78 |    "source": [
 79 |     "plt.figure(dpi=200)\n",
 80 |     "plot_tree(tree, feature_names=cancer.feature_names, filled=True)"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "markdown",
 85 |    "metadata": {},
 86 |    "source": [
 87 |     "# Parameter Tuning"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": null,
 93 |    "metadata": {},
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "tree = DecisionTreeClassifier().fit(X_train, y_train)\n",
 97 |     "plt.figure(figsize=(15, 5))\n",
 98 |     "plot_tree(tree, feature_names=cancer.feature_names, filled=True)"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": null,
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "tree = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)\n",
108 |     "plt.figure(figsize=(15, 5))\n",
109 |     "plot_tree(tree, feature_names=cancer.feature_names, filled=True)"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": null,
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": [
118 |     "tree = DecisionTreeClassifier(max_leaf_nodes=8).fit(X_train, y_train)\n",
119 |     "plot_tree(tree, feature_names=cancer.feature_names, filled=True)"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": null,
125 |    "metadata": {},
126 |    "outputs": [],
127 |    "source": [
128 |     "tree = DecisionTreeClassifier(min_samples_split=50).fit(X_train, y_train)\n",
129 |     "plot_tree(tree, feature_names=cancer.feature_names, filled=True)"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": null,
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": [
138 |     "tree = DecisionTreeClassifier(min_impurity_decrease=.01).fit(X_train, y_train)\n",
139 |     "plot_tree(tree, feature_names=cancer.feature_names, filled=True)"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": null,
145 |    "metadata": {},
146 |    "outputs": [],
147 |    "source": [
148 |     "from sklearn.model_selection import GridSearchCV\n",
149 |     "param_grid = {'max_depth':range(1, 7)}\n",
150 |     "grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid=param_grid, cv=10)\n",
151 |     "grid.fit(X_train, y_train)"
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "code",
156 |    "execution_count": null,
157 |    "metadata": {},
158 |    "outputs": [],
159 |    "source": [
160 |     "from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit\n",
161 |     "param_grid = {'max_depth':range(1, 7)}\n",
162 |     "grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid=param_grid,\n",
163 |     "                    cv=StratifiedShuffleSplit(100), return_train_score=True)\n",
164 |     "grid.fit(X_train, y_train)"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": null,
170 |    "metadata": {},
171 |    "outputs": [],
172 |    "source": [
173 |     "scores = pd.DataFrame(grid.cv_results_)\n",
174 |     "scores.plot(x='param_max_depth', y=['mean_train_score', 'mean_test_score'], ax=plt.gca())\n",
175 |     "plt.legend(loc=(1, 0))"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": null,
181 |    "metadata": {},
182 |    "outputs": [],
183 |    "source": [
184 |     "from sklearn.model_selection import GridSearchCV\n",
185 |     "param_grid = {'max_leaf_nodes': range(2, 20)}\n",
186 |     "grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid=param_grid,\n",
187 |     "                    cv=StratifiedShuffleSplit(100, random_state=1),\n",
188 |     "                   return_train_score=True)\n",
189 |     "grid.fit(X_train, y_train)\n",
190 |     "\n",
191 |     "scores = pd.DataFrame(grid.cv_results_)\n",
192 |     "scores.plot(x='param_max_leaf_nodes', y=['mean_train_score', 'mean_test_score'], ax=plt.gca())\n",
193 |     "plt.legend(loc=(1, 0))"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "code",
198 |    "execution_count": null,
199 |    "metadata": {},
200 |    "outputs": [],
201 |    "source": [
202 |     "scores = pd.DataFrame(grid.cv_results_)\n",
203 |     "scores.plot(x='param_max_leaf_nodes', y='mean_train_score', yerr='std_train_score', ax=plt.gca())\n",
204 |     "scores.plot(x='param_max_leaf_nodes', y='mean_test_score', yerr='std_test_score', ax=plt.gca())"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": null,
210 |    "metadata": {},
211 |    "outputs": [],
212 |    "source": [
213 |     "grid.best_params_"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": null,
219 |    "metadata": {},
220 |    "outputs": [],
221 |    "source": [
222 |     "plot_tree(grid.best_estimator_, feature_names=cancer.feature_names, filled=True)"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "code",
227 |    "execution_count": null,
228 |    "metadata": {},
229 |    "outputs": [],
230 |    "source": [
231 |     "pd.Series(grid.best_estimator_.feature_importances_,\n",
232 |     "          index=cancer.feature_names).plot(kind=\"barh\")"
233 |    ]
234 |   },
235 |   {
236 |    "cell_type": "markdown",
237 |    "metadata": {},
238 |    "source": [
239 |     "# Exercise\n",
240 |     "Apply a decision tree to the \"adult\" dataset and visualize it.\n",
241 |     "\n",
242 |     "Tune parameters with grid-search; try at least max_leaf_nodes and max_depth, but separately.\n",
243 |     "\n",
244 |     "Visualize the resulting tree and its feature importances."
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": null,
250 |    "metadata": {},
251 |    "outputs": [],
252 |    "source": []
253 |   }
254 |  ],
255 |  "metadata": {
256 |   "anaconda-cloud": {},
257 |   "kernelspec": {
258 |    "display_name": "root *",
259 |    "language": "python",
260 |    "name": "conda-root-py"
261 |   },
262 |   "language_info": {
263 |    "codemirror_mode": {
264 |     "name": "ipython",
265 |     "version": 3
266 |    },
267 |    "file_extension": ".py",
268 |    "mimetype": "text/x-python",
269 |    "name": "python",
270 |    "nbconvert_exporter": "python",
271 |    "pygments_lexer": "ipython3",
272 |    "version": "3.7.3"
273 |   }
274 |  },
275 |  "nbformat": 4,
276 |  "nbformat_minor": 4
277 | }
278 | 


--------------------------------------------------------------------------------
/notebooks/03 - Linear Models for Regression.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Linear Models for Regression"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "import matplotlib.pyplot as plt\n",
 17 |     "import numpy as np\n",
 18 |     "import sklearn\n",
 19 |     "sklearn.set_config(print_changed_only=True)"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": null,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "from sklearn.linear_model import Ridge, LinearRegression"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": null,
 34 |    "metadata": {},
 35 |    "outputs": [],
 36 |    "source": [
 37 |     "from sklearn.model_selection import cross_val_score"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": null,
 43 |    "metadata": {},
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "from sklearn.datasets import load_boston\n",
 47 |     "boston = load_boston()"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": null,
 53 |    "metadata": {},
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "X, y = boston.data, boston.target"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": null,
 62 |    "metadata": {},
 63 |    "outputs": [],
 64 |    "source": [
 65 |     "print(boston.DESCR)"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": null,
 71 |    "metadata": {},
 72 |    "outputs": [],
 73 |    "source": [
 74 |     "X.shape"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": null,
 80 |    "metadata": {},
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "fig, axes = plt.subplots(3, 5, figsize=(20, 10))\n",
 84 |     "for i, ax in enumerate(axes.ravel()):\n",
 85 |     "    if i > 12:\n",
 86 |     "        ax.set_visible(False)\n",
 87 |     "        continue\n",
 88 |     "    ax.plot(X[:, i], y, 'o', alpha=.5)\n",
 89 |     "    ax.set_title(\"{}: {}\".format(i, boston.feature_names[i]))\n",
 90 |     "    ax.set_ylabel(\"MEDV\")"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": null,
 96 |    "metadata": {},
 97 |    "outputs": [],
 98 |    "source": [
 99 |     "print(X.shape)\n",
100 |     "print(y.shape)"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": null,
106 |    "metadata": {},
107 |    "outputs": [],
108 |    "source": [
109 |     "from sklearn.model_selection import train_test_split\n",
110 |     "X_train, X_test, y_train, y_test = train_test_split(\n",
111 |     "    X, y, random_state=42)"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": null,
117 |    "metadata": {},
118 |    "outputs": [],
119 |    "source": [
120 |     "np.mean(cross_val_score(LinearRegression(),\n",
121 |     "                        X_train, y_train, cv=10))"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": null,
127 |    "metadata": {},
128 |    "outputs": [],
129 |    "source": [
130 |     "np.mean(cross_val_score(\n",
131 |     "        Ridge(), X_train, y_train, cv=10))"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": null,
137 |    "metadata": {},
138 |    "outputs": [],
139 |    "source": [
140 |     "from sklearn.model_selection import GridSearchCV\n",
141 |     "param_grid = {'alpha': np.logspace(-3, 3, 14)}\n",
142 |     "print(param_grid)"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "code",
147 |    "execution_count": null,
148 |    "metadata": {},
149 |    "outputs": [],
150 |    "source": [
151 |     "grid = GridSearchCV(Ridge(), param_grid, cv=10, return_train_score=True)\n",
152 |     "grid.fit(X_train, y_train)"
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "code",
157 |    "execution_count": null,
158 |    "metadata": {},
159 |    "outputs": [],
160 |    "source": [
161 |     "import pandas as pd\n",
162 |     "plt.figure(dpi=200)\n",
163 |     "results = pd.DataFrame(grid.cv_results_)\n",
164 |     "results.plot('param_alpha', 'mean_train_score', ax=plt.gca())\n",
165 |     "results.plot('param_alpha', 'mean_test_score', ax=plt.gca())\n",
166 |     "\n",
167 |     "plt.legend()\n",
168 |     "plt.xscale(\"log\")"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "code",
173 |    "execution_count": null,
174 |    "metadata": {},
175 |    "outputs": [],
176 |    "source": [
177 |     "from sklearn.preprocessing import PolynomialFeatures, scale\n",
178 |     "# being lazy and not really doing things properly whoops\n",
179 |     "X_poly = PolynomialFeatures(include_bias=False).fit_transform(scale(X))\n",
180 |     "print(X_poly.shape)\n",
181 |     "X_train, X_test, y_train, y_test = train_test_split(X_poly, y, random_state=42)"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": null,
187 |    "metadata": {},
188 |    "outputs": [],
189 |    "source": [
190 |     "np.mean(cross_val_score(LinearRegression(),\n",
191 |     "                        X_train, y_train, cv=10))"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": null,
197 |    "metadata": {},
198 |    "outputs": [],
199 |    "source": [
200 |     "np.mean(cross_val_score(Ridge(),\n",
201 |     "                        X_train, y_train, cv=10))"
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "code",
206 |    "execution_count": null,
207 |    "metadata": {},
208 |    "outputs": [],
209 |    "source": [
210 |     "grid = GridSearchCV(Ridge(), param_grid, cv=10, return_train_score=True)\n",
211 |     "grid.fit(X_train, y_train)"
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "code",
216 |    "execution_count": null,
217 |    "metadata": {},
218 |    "outputs": [],
219 |    "source": [
220 |     "results = pd.DataFrame(grid.cv_results_)\n",
221 |     "\n",
222 |     "results.plot('param_alpha', 'mean_train_score', ax=plt.gca())\n",
223 |     "results.plot('param_alpha', 'mean_test_score', ax=plt.gca())\n",
224 |     "plt.legend()\n",
225 |     "plt.xscale(\"log\")"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": null,
231 |    "metadata": {},
232 |    "outputs": [],
233 |    "source": [
234 |     "print(grid.best_params_)\n",
235 |     "print(grid.best_score_)"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": null,
241 |    "metadata": {},
242 |    "outputs": [],
243 |    "source": [
244 |     "lr = LinearRegression().fit(X_train, y_train)\n",
245 |     "plt.scatter(range(X_poly.shape[1]), lr.coef_, c=np.sign(lr.coef_), cmap=\"bwr_r\")"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": null,
251 |    "metadata": {},
252 |    "outputs": [],
253 |    "source": [
254 |     "ridge = grid.best_estimator_\n",
255 |     "plt.scatter(range(X_poly.shape[1]), ridge.coef_, c=np.sign(ridge.coef_), cmap=\"bwr_r\")"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "code",
260 |    "execution_count": null,
261 |    "metadata": {},
262 |    "outputs": [],
263 |    "source": [
264 |     "ridge100 = Ridge(alpha=100).fit(X_train, y_train)\n",
265 |     "ridge1 = Ridge(alpha=1).fit(X_train, y_train)\n",
266 |     "plt.figure(figsize=(8, 4))\n",
267 |     "\n",
268 |     "plt.plot(ridge1.coef_, 'o', label=\"alpha=1\")\n",
269 |     "plt.plot(ridge.coef_, 'o', label=\"alpha=14\")\n",
270 |     "plt.plot(ridge100.coef_, 'o', label=\"alpha=100\")\n",
271 |     "plt.legend()"
272 |    ]
273 |   },
274 |   {
275 |    "cell_type": "code",
276 |    "execution_count": null,
277 |    "metadata": {},
278 |    "outputs": [],
279 |    "source": [
280 |     "from sklearn.linear_model import Lasso\n",
281 |     "\n",
282 |     "lasso = Lasso().fit(X_train, y_train)\n",
283 |     "print(\"Training set score: {:.2f}\".format(lasso.score(X_train, y_train)))\n",
284 |     "print(\"Test set score: {:.2f}\".format(lasso.score(X_test, y_test)))\n",
285 |     "print(\"Number of features used:\", np.sum(lasso.coef_ != 0))"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "markdown",
290 |    "metadata": {},
291 |    "source": [
292 |     "# Exercise\n",
293 |     "Load the diabetes dataset using ``sklearn.datasets.load_diabetes``. Apply ``LinearRegression``, ``Ridge`` and ``Lasso`` and visualize the coefficients. Try polynomial features."
294 |    ]
295 |   },
296 |   {
297 |    "cell_type": "code",
298 |    "execution_count": null,
299 |    "metadata": {},
300 |    "outputs": [],
301 |    "source": [
302 |     "# %load solutions/linear_models_diabetes.py"
303 |    ]
304 |   }
305 |  ],
306 |  "metadata": {
307 |   "anaconda-cloud": {},
308 |   "kernelspec": {
309 |    "display_name": "root *",
310 |    "language": "python",
311 |    "name": "conda-root-py"
312 |   },
313 |   "language_info": {
314 |    "codemirror_mode": {
315 |     "name": "ipython",
316 |     "version": 3
317 |    },
318 |    "file_extension": ".py",
319 |    "mimetype": "text/x-python",
320 |    "name": "python",
321 |    "nbconvert_exporter": "python",
322 |    "pygments_lexer": "ipython3",
323 |    "version": "3.7.3"
324 |   }
325 |  },
326 |  "nbformat": 4,
327 |  "nbformat_minor": 4
328 | }
329 | 


--------------------------------------------------------------------------------
/slides/04-linear-models-classification.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 |   <head>
  4 |     <title>Linear Models for Classification</title>
  5 |     <meta charset="utf-8">
  6 |     <link rel="stylesheet" href="style.css">
  7 |     <style>
  8 |         @import url(https://fonts.googleapis.com/css?family=Garamond);
  9 |         @import url(https://fonts.googleapis.com/css?family=Muli:400,700,400italic);
 10 |         @import url(https://fonts.googleapis.com/css?family=Ubuntu+Mono:400,700,400italic);
 11 |       </style>
 12 |     </head>
 13 |     <body>
 14 |       <textarea id="source">
 15 | class: center, middle
 16 | 
 17 | ![:scale 40%](images/sklearn_logo.png)
 18 | 
 19 | ### Intermediate Machine learning with scikit-learn
 20 | 
 21 | # Linear Models for Classification
 22 | 
 23 | Andreas C. Müller
 24 | 
 25 | Columbia University, scikit-learn
 26 | 
 27 | .smaller[https://github.com/amueller/ml-workshop-2-of-4]
 28 | 
 29 | ---
 30 | class: center, middle
 31 | 
 32 | # Linear models for <strong>binary</strong> classification
 33 | 
 34 | ???
 35 | We'll first start with linear models for binary
 36 | classification, so if there are only two classes. That makes
 37 | the models much easier to understand.
 38 | 
 39 | ---
 40 | .center[
 41 | ![:scale 55%](images/linear_boundary_vector.png)
 42 | ]
 43 | 
 44 | $$\hat{y} = \text{sign}(w^T \textbf{x} + b) = \text{sign}\left(\sum\limits_{i}w_ix_i + b\right)$$
 45 | 
 46 | ???
 47 | Similar to the regression case, basically all linear models
 48 | for classification have the same way to make predictions. As
 49 | with regression, they compute an inner product of a weight
 50 | vector w with the feature vector x, and add some bias b. The
 51 | result of that is a real number, as in regression. For
 52 | classification, however, we only look at the sign of the
 53 | result, so whether it is negative or positive. If it's
 54 | positive, we predict one class, usually called +1, if it's
 55 | negative, we predict the other class, usually called -1. If
 56 | the result is 0, by convention the positive class is
 57 | predicted, but because it's a floating point number that
 58 | doesn't really happen in practice. You'll see that sometimes
 59 | in my notation I will not have a $b$. That's because you can
 60 | always add a constant feature to x to achieve the same
 61 | effect (though you would then need to leave that feature
 62 | out of the regularization). So when I write $w^Tx$ without a
 63 | $b$ assume that there is a constant feature added that is
 64 | not part of any regularization.
 65 | 
 66 | Geometrically, what the formula means is that the decision
 67 | boundary of a linear classifier will be a hyperplane in the
 68 | feature space, where w is the normal vector of that plane.
 69 | In the 2d example here, it's just a line separating red and
 70 | blue. Everything on the right hand side would be classified
 71 | as blue by this classifier, and everything on the left-hand
 72 | side as red.
 73 | 
 74 | Questions? So again, the learning here consists of finding
 75 | parameters w and b based on the training set, and that is
 76 | where the different algorithms differ. There are quite a lot
 77 | of algorithms out there, and there are also quite a lot in
 78 | scikit-learn, but we'll only discuss the most common ones.
 79 | 
 80 | The most straight-forward way to approach finding w and b is
 81 | to use the framework of empirical risk minimization that we
 82 | talked about last time, so finding parameters that minimize
 83 | some loss on the training set. Where classification differs
 84 | quite a bit from regression is on how we want to measure
 85 | misclassifications.
 86 | 
 87 | 
 88 | ---
 89 | 
 90 | # Picking a loss?
 91 | 
 92 | $$\hat{y} = \text{sign}(w^T \textbf{x} + b)$$
 93 | 
 94 | `$$\min_{w \in ℝ^{p}, b \in \mathbb{R}} \sum_{i=1}^n 1_{y_i \neq \text{sign}(w^T \textbf{x}_i + b)}$$`
 95 | 
 96 | .center[
 97 | ![:scale 40%](images/binary_loss.png)
 98 | ]
 99 | 
100 | ???
101 | So we need to define a loss function for given w and b that
102 | tells us how well they fit the training set. Obvious idea:
103 | Minimize number of misclassifications aka 0-1 loss but this
104 | loss is non-convex, not continuous and minimizing it is
105 | NP-hard. So we need to relax it, which basically means we
106 | want to find a convex upper bound for this loss. This is not
107 | done on the actual prediction, but on the inner product $w^T
108 | x$, which is also called the decision function. So this
109 | graph here has the inner product on the x axis, and shows
110 | what the loss would be for class 1. The 0-1 loss is zero if
111 | the decision function is positive, and one if it's negative.
112 | Because a positive decision function means a positive
113 | prediction, means correct classification in the case of y=1.
114 | A negative prediction means a wrong classification, which is
115 | penalized by the 0-1 loss with a loss of 1, i.e. one
116 | mistake.
117 | 
118 | The other losses we'll talk about are mostly the hinge loss
119 | and the log loss. You can see they are both upper bounds on
120 | the 0-1 loss but they are convex and continuous. Both of
121 | these losses care not only that you make a correct
122 | prediction, but also "how correct" your prediction is, i.e.
123 | how positive or negative your decision function is. We'll
124 | talk a bit more about the motivation of these two losses,
125 | starting with the logistic loss.
126 | 
127 | 
128 | ---
129 | 
130 | # Logistic Regression
131 | 
132 | 
133 | .left-column[
134 | $$\log\left(\frac{p(y=1|x)}{p(y=-1|x)}\right) = w^T\textbf{x} + b$$
135 | 
136 | $$p(y=1|\textbf{x}) = \frac{1}{1+e^{-w^T\textbf{x} -b }}$$
137 | 
138 | `$$\min_{w \in ℝ^{p}, b \in \mathbb{R}} \sum_{i=1}^n \log(\exp(-y_i(w^T \textbf{x}_i + b)) + 1)$$`
139 | 
140 | 
141 | $$\hat{y} = \text{sign}(w^T\textbf{x} + b)$$
142 | ]
143 | .right-column[
144 | ![:scale 90%](images/logit.png)]
145 | 
146 | 
147 | ???
148 | Logistic regression is probably the most commonly used
149 | linear classifier, maybe the most commonly used classifier
150 | overall. The idea is to model the log-odds, which is log
151 | p(y=1|x) - log p(y=-1|x) as a linear function, as shown here.
152 | Rearranging the formula, you get a model of p(y=1|x) as 1
153 | over 1 + ... This function is called the logistic sigmoid,
154 | and is drawn to the right here. Basically it squashes the
155 | linear function $w^Tx$ between 0 and 1, so that it can model
156 | a probability.
157 | 
158 | Given this equation for p(y|x), what we want to do is
159 | maximize the probability of the training set under this
160 | model. This approach is known as maximum likelihood.
161 | Basically you want to find w and b such that they assign
162 | maximum probability to the labels observed in the training
163 | data. You can rearrange that a bit and end up with this
164 | equation here, which contains the log-loss as seen on the
165 | last slide.
166 | 
167 | The prediction is the class with the higher probability. In
168 | the binary case, that's the same as asking whether the
169 | probability of class 1 is bigger or smaller than .5. And as
170 | you can see from the plot of the logistic sigmoid, the
171 | probability of the class +1 is greater than .5 exactly if
172 | the decision function $w^T x$ is greater than 0. So
173 | predicting the class with maximum probability is the same as
174 | predicting which side of the hyperplane given by w we are
175 | on.
176 | 
177 | Ok so this is logistic regression. We minimize this loss and
178 | get a w which defines a hyper plane. But if you think back
179 | to last time, this is only part of what we want. This
180 | formulation tries to fit the training data, but it doesn't
181 | care about finding a simple solution.
182 | 
183 | ---
184 | 
185 | 
186 | # Penalized Logistic Regression
187 | 
188 | `$$\min_{w \in ℝ^{p}, b \in \mathbb{R}}C \sum_{i=1}^n\log(\exp(-y_i(w^T \textbf{x}_i + b )) + 1) + ||w||_2^2$$`
189 | 
190 | `$$\min_{w \in ℝ^{p}, b \in \mathbb{R}}C \sum_{i=1}^n\log(\exp(-y_i (w^T \textbf{x}_i + b)) + 1) + ||w||_1$$`
191 | 
192 | - C is inverse to alpha (or alpha / n_samples)
193 | ???
194 | 
195 | - Both versions strongly convex, l2 version smooth (differentiable).
196 | 
197 | - All points contribute to $w$ (dense solution to dual).
198 | 
199 | So we can do the same we did for regression: we can add
200 | regularization terms using the L1 and L2 norm. The effects
201 | are the same as for regression: both push the coefficients
202 | towards zero, but the l1 norm encourages coefficients to be
203 | exactly zero, for the same reasons we discussed last time.
204 | 
205 | You could also use a mixed penalty to get something like the
206 | elasticnet. That's not implemented in the logisticregression
207 | class in scikit-learn right now, but it's certainly a
208 | sensible thing to do.
209 | 
210 | Here I used a slightly different notation than last time,
211 | though. I'm not using alpha to multiply the regularizer,
212 | instead I'm using C to multiply the loss. That's mostly
213 | because that's how it's done in scikit-learn and it has only
214 | historic reasons. The idea is exactly the same, only now C
215 | is 1 over alpha. So large C means heavy weight to the loss,
216 | means little regularization, while small C means less weight
217 | on the loss, means strong regularization.
218 | 
219 | Depending on the model, there might be a factor of n_samples
220 | in there somewhere. Usually we try to make the objective as
221 | independent of the number of samples as possible in
222 | scikit-learn, but that might lead to surprises if you're not
223 | aware of it.
224 | 
225 | Some side-notes on the optimization problem: here, as in
226 | regression, having more regularization makes the
227 | optimization problem easier. You might have seen this in
228 | your homework already, if you decrease C, meaning you add
229 | more regularization, your model fits more quickly.
230 | 
231 | One particular property of the logistic loss, compared to
232 | the hinge loss we'll discuss next is that each data point
233 | contributes to the loss, so each data point has an effect on
234 | the solution. That's also true for all the regression models
235 | we saw last time.
236 | ---
237 | # Effect of regularization
238 | 
239 | 
240 | .center[
241 | ![:scale 90%](images/logreg_regularization.png)
242 | ]
243 | 
244 | - Small C (a lot of regularization) limits the influence of individual points!
245 | 
246 | ???
247 | 
248 | So I spared you the coefficient plots, because they look
249 | the same as for regression. All the things I said about
250 | model complexity and dependency on the number of features
251 | and samples are as true for classification as they are for
252 | regression.
253 | 
254 | There is another interesting way to think about
255 | regularization that I found helpful, though. I'm not going
256 | to walk through the math for this, but you can reformulate
257 | the optimization problem and find that what the C parameter
258 | does is actually limit the influence of individual data
259 | points. With very large C, we said we have no
260 | regularization. It also means individual data points can
261 | have basically unlimited influence, as you can see here.
262 | There are two outliers here, which basically completely tilt
263 | the decision boundary. But if we decrease C, and therefore
264 | increase the regularization, what happens is that the
265 | influence of these outlier points becomes limited, and the
266 | other points get more influence.
267 | 
268 | ---
269 | 
270 | # Max-Margin and Support Vectors
271 | 
272 | .center[
273 | ![:scale 75%](images/max_margin.png)
274 | ]
275 | 
276 | ???
277 | A point is within the margin if $y_i (w^T x_i + b)$ is smaller
278 | than one. That means if you have a smaller w, you basically
279 | have a larger margin: even points on the correct side need to
280 | be further from the boundary. If you're on the wrong side,
281 | you'll always have a loss. If you're on the correct side but
282 | your $w^T x + b$ is small, then you also have a loss.
283 | ---
284 | class: center
285 | 
286 | # Max-Margin and Support Vectors
287 | `$$ \min_{w \in \mathbb{R}^p, b \in \mathbb{R}} C \sum_{i=1}^n \max(0, 1 - y_i (w^T\mathbf{x}_i + b)) + ||w||^2_2 $$`
288 | 
289 | $$\text{Within margin} \Leftrightarrow y_i(w^T x_i + b)< 1$$
290 | 
291 | Smaller $w \Rightarrow$ larger margin
292 | 
293 | 
294 | ---
295 | class: center
296 | 
297 | # Max-Margin and Support Vectors
298 | 
299 | .left-column[
300 | ![:scale 80%](images/max_margin_C_0.1.png)
301 | ]
302 | .right-column[
303 | ![:scale 80%](images/max_margin_C_1.png)
304 | ]
305 | 
306 | ???
307 | Here are two examples on the same dataset, where I learned a
308 | linear support vector machine with C=0.1 and C=1. With
309 | C=0.1, you have a wider margin. There are points inside the
310 | margin and all the points inside the margin are support
311 | vectors which contribute to the solution. Points that are
312 | outside of the margin and on the correct side don't
313 | contribute to the solution. These points are classified
314 | correctly with enough margin, so they're ignored. The normal
315 | vector is w and basically, the size of the margin is the
316 | inverse of the length of w. C=0.1 means I have less emphasis
317 | on the data fitting and more emphasis on shrinking w.
318 | This will lead to a smaller w. If I have a larger C, that means
319 | less regularization, which will lead to a larger w, and a larger w
320 | means a smaller margin. So there are fewer points inside
321 | the margin and therefore fewer support vectors.
322 | More regularization usually means a larger margin but more
323 | points inside the margin. Also, more support vectors mean
324 | there are more data points that actually influence the
325 | solution.
326 | ---
327 | 
328 | 
329 | # (soft margin) linear SVM
330 | 
331 | .larger[
332 | `$$\min_{w \in ℝ^{p}, b \in \mathbb{R}}C \sum_{i=1}^n\max(0,1-y_i(w^T \textbf{x}_i + b)) + ||w||_2^2$$`
333 | 
334 | `$$\min_{w \in ℝ^{p}, b \in \mathbb{R}}C \sum_{i=1}^n\max(0,1-y_i(w^T \textbf{x}_i + b))+ ||w||_1$$`
335 | ]
336 | 
337 | ???
338 | - Both versions strongly convex, neither smooth.
339 | 
340 | - Only some points contribute (the support vectors) to $w$ (sparse solution to dual).
341 | 
342 | Moving from logistic regression to linear SVMs is just a
343 | matter of changing the loss from the log loss to the hinge
344 | loss. The hinge-loss is defined as ... And we can penalize
345 | using either l1 or l2 norm, or again, in principle also
346 | elastic net. This formulation with the hinge loss doesn't
347 | really make sense without the penalty, because of the
348 | formulation of the hinge loss. What this loss says is
349 | basically "if you predict the right class with a margin of
350 | 1, there is no loss". Otherwise the loss is linear in the
351 | decision function. So you need to be on the right side of
352 | the hyperplane by a given amount, and then there is no more
353 | loss. That's the reason you need the penalty, for the 1 to
354 | make sense. Otherwise you could just scale up $w$ to make it
355 | far enough on the right side. But the regularization
356 | penalizes growing $w$.
357 | 
358 | The hinge loss has a kink, same as the l1 norm, and so it's
359 | not a smooth optimization problem any more, but that's not
360 | really a big deal. What's interesting is that all the points
361 | that are classified correctly with a margin of at least 1
362 | have a loss of zero, and so they don't influence the
363 | solution any more. All the points that are not classified
364 | correctly by this margin are the ones that do influence the
365 | solution and they are called the support vectors.
366 | 
367 | FIXME graph of not influencing the solution?
368 | 
369 | 
370 | ---
371 | class: center
372 | # Logistic Regression vs SVM
373 | .compact[
374 | `$$\min_{w \in ℝ^{p}, b \in \mathbb{R}}C \sum_{i=1}^n\log(\exp(-y_i(w^T \textbf{x}_i+b)) + 1) + ||w||_2^2$$`
375 | `$$\min_{w \in ℝ^{p}, b \in \mathbb{R}}C \sum_{i=1}^n\max(0,1-y_i(w^T \textbf{x}_i + b)) + ||w||_2^2$$`
376 | 
377 | ![:scale 35%](images/binary_loss.png)
378 | ]
379 | ???
380 | 
381 | So this is the main difference between logistic regression
382 | and linear SVMs: Does it penalize misclassifications
383 | according to the green line, or according to the blue line?
384 | In practice it doesn't make a big difference.
385 | 
386 | ---
387 | 
388 | 
389 | # SVM or LogReg?
390 | 
391 | .center[
392 | ![:scale 80%](images/svm_or_lr.png)
393 | ]
394 | 
395 | - Need compact model or believe solution is sparse? Use L1
396 | 
397 | ???
398 | So which one of them should you use? If you need probability
399 | estimates, you should use logistic regression. If you don't,
400 | you can pick either, and it doesn't really matter. Logistic
401 | regression can be a bit faster to optimize in theory. If
402 | you're in a setting where there are many more features than
403 | samples, it might make sense to use linear SVMs and solve
404 | the dual, but you can actually solve either of the problems
405 | in the dual, and we'll talk about what that means in
406 | practice in a little bit.
407 | 
408 | ---
409 | 
410 | class: center, middle
411 | 
412 | # Multiclass classification
413 | 
414 | 
415 | ???
416 | 
417 | Ok, so I think that's enough on the two loss functions and
418 | regularization, and hopefully you have a bit of a feel for
419 | how these two classifiers work, and also an understanding
420 | that they are in fact quite similar in practice.
421 | 
422 | Next I want to look at how to go from binary classification
423 | to multi-class classification. Basically there is a simple
424 | but hacky way, and there's a slightly more complicated but
425 | theoretically sound way.
426 | 
427 | ---
428 | class: center
429 | 
430 | # Reduction to Binary Classification
431 | 
432 | # One Vs Rest
433 | 
434 | For 4 classes:
435 | 
436 | 1v{2,3,4}, 2v{1,3,4}, 3v{1,2,4}, 4v{1,2,3}
437 | 
438 | In general:
439 | 
440 | n binary classifiers - each on all data
441 | 
442 | ???
443 | 
444 | 
445 | Let's start with One vs Rest. Here, we learn one binary
446 | classifier for each class against the remaining classes. So
447 | let's say we have 4 classes, called 1 to 4.  First we learn
448 | a binary classifier of the points in class 1 vs the points
449 | in the classes 2, 3 and 4.  Then, we do the same for class
450 | 2, and so on. That way we end up building as many classifiers
451 | as we have classes.
452 | 
453 | ---
454 | 
455 | class: spacious
456 | 
457 | # Prediction with One Vs Rest
458 | 
459 | 
460 | "Class with highest score"
461 | 
462 | $$\hat{y} = \text{arg}\max_{i \in Y} \textbf{w}^T_i\textbf{x} + b_i$$
463 | 
464 | 
465 | ???
466 | To make a prediction, we compute the decision function of
467 | all classifiers, say 4 in the example, on a new data point.
468 | The one with the highest score for the positive class, the
469 | single class, wins, and that class is predicted.
470 | 
471 | It's a little bit unclear why this works as well as it does.
472 | Maybe there are some papers about that now, but I'm not sure.
473 | 
474 | So in this case we have one coefficient vector w and one
475 | bias b for each class.
476 | 
477 | ---
478 | 
479 | # One vs Rest Prediction
480 | 
481 | .center[
482 | ![:scale 80%](images/ovr_lines.png)
483 | ]
484 | 
485 | 
486 | ???
487 | Here is an illustration of what that looks like.
488 | Unfortunately it's a bit hard to draw 4 classes in general
489 | position in 2 dimensions, so I only used 3 classes here. So
490 | each class has an associated coefficient vector and bias,
491 | corresponding to a line. The line tries to separate this
492 | class from the other two classes.
493 | # Fixme draw ws?
494 | 
495 | ---
496 | 
497 | # One vs Rest Prediction
498 | 
499 | .center[
500 | ![:scale 80%](images/ovr_boundaries.png)
501 | ]
502 | 
503 | 
504 | ???
505 | 
506 | Here are the decision boundaries resulting from these
507 | three binary classifiers. Basically what they say is that
508 | the line that is closest decides the class. What you can not
509 | see here is that each of the lines also has a magnitude
510 | associated with it. It's not only the direction of the
511 | normal vector that matters, but also the length. You can
512 | think of that as some form of uncertainty attached to the
513 | line.
514 | 
515 | ---
516 | 
517 | 
518 | # Multinomial Logistic Regression
519 | 
520 | Probabilistic multi-class model:
521 | 
522 | `$$p(y=i|x) = \frac{e^{\textbf{w}_i^T\textbf{x} + b_i}}{\sum_{j=1}^k e^{\textbf{w}_j^T\textbf{x} + b_j}}$$`
523 | 
524 | `$$\min_{w \in ℝ^{pk}, b \in \mathbb{R}^k} -\sum_{i=1}^n \log(p(y=y_i|x_i, w, b))$$`
525 | 
526 | $$\hat{y} = \text{arg} \max_{i=1,...,k} \textbf{w}^T_i\textbf{x} + b_i$$
527 | 
528 | - Same prediction rule as OvR !
529 | 
530 | ???
531 | The binary logistic regression case can be generalized to
532 | multinomial logistic regression, in which we model the
533 | probability that y is class i using this formula,
534 | which is also known as softmax. The probability is
535 | proportional to e to the $w_i^T x + b_i$, which is the same as
536 | in the binary case. But now we need to normalize it so that
537 | the sum over all classes is one. So we just divide it by
538 | this sum.
539 | 
540 | ---
541 | class: center, middle
542 | 
543 | 
544 | # Notebook: Linear Models for Classification
545 | 
546 | 
547 |     </textarea>
548 |     <script src="https://remarkjs.com/downloads/remark-latest.min.js"></script>
549 |     <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
550 | 
551 |     <script>
552 |     // Config Remark
553 |     remark.macros['scale'] = function (percentage) {
554 |         var url = this;
555 |         return '<img src="' + url + '" style="width: ' + percentage + '" />';
556 |     };
557 |     config_remark = {
558 |         highlightStyle: 'github',
559 |         highlightSpans: true,
560 |         highlightLines: true,
561 |         ratio: "16:9"
562 |     };
563 |       var slideshow = remark.create(config_remark);
564 |     // Configure MathJax
565 |     MathJax.Hub.Config({
566 |     tex2jax: {
567 |         inlineMath: [['$','$'], ['\\(','\\)']],
568 |         skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] /* removed 'code' entry*/
569 |     }
570 |     });
571 |     MathJax.Hub.Queue(function() {
572 |         var all = MathJax.Hub.getAllJax(), i;
573 |         for(i = 0; i < all.length; i += 1) {
574 |             all[i].SourceElement().parentNode.className += ' has-jax';
575 |         }
576 |     });
577 |     </script>
578 |   </body>
579 | </html>
580 | 


--------------------------------------------------------------------------------
/slides/06-gradient-boosting.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 |   <head>
  4 |     <title>Gradient Boosting</title>
  5 |     <meta charset="utf-8">
  6 |     <link rel="stylesheet" href="style.css">
  7 |     <style>
  8 |         @import url(https://fonts.googleapis.com/css?family=Garamond);
  9 |         @import url(https://fonts.googleapis.com/css?family=Muli:400,700,400italic);
 10 |         @import url(https://fonts.googleapis.com/css?family=Ubuntu+Mono:400,700,400italic);
 11 |       </style>
 12 |     </head>
 13 |     <body>
 14 |       <textarea id="source">
 15 | 
 16 | class: center, middle
 17 | 
 18 | ![:scale 40%](images/sklearn_logo.png)
 19 | 
 20 | ### Intermediate  Machine learning with scikit-learn
 21 | 
 22 | # Gradient Boosting
 23 | 
 24 | Andreas C. Müller
 25 | 
 26 | Columbia University, scikit-learn
 27 | 
 28 | .smaller[https://github.com/amueller/ml-workshop-2-of-4]
 29 | 
 30 | ---
 31 | # Reminder: Gradient Descent
 32 | 
 33 | .right-column[![:scale 100%](images/gradient_3d.png)]
 34 | 
 35 | Want: $$\arg \min_w F(w)$$
 36 | 
 37 | Initialize $w^{(0)}$
 38 | 
 39 | $$w^{(i+1)} \leftarrow w^{(i)} - \eta_i\frac{d}{dw}F(w^{(i)})$$
 40 | 
 41 | Converges to local minimum
 42 | 
 43 | ???
 44 | First, let's talk about Gradient Descent. So we have some
 45 | function we want to minimize; here the function is the loss on the
 46 | training data set plus the regularizer. F is the objective
 47 | of the model and I want to find the best parameter setting
 48 | w. The way gradient descent works is that we initialize it
 49 | with some w, then we compute the gradient, and then we walk down
 50 | the gradient by a small step. This converges to a local
 51 | minimum in the function. For linear models, there's only one
 52 | global minimum. Basically, any optimization algorithm you
 53 | can think of will always converge to the same solution, they
 54 | only converge at different speeds.
 55 | 
 56 | ---
 57 | 
 58 | # Reminder: Gradient Descent
 59 | 
 60 | .center[
 61 | ![:scale 40%](images/gradient_2d.png)
 62 | ]
 63 | 
 64 | $$w^{(i+1)} \leftarrow w^{(i)} - \eta_i\frac{d}{dw}F(w^{(i)})$$
 65 | 
 66 | 
 67 | ???
 68 | ---
 69 | # Pick a learning rate
 70 | 
 71 | .center[
 72 | ![:scale 90%](images/gradient_learning_rates.png)
 73 | ]
 74 | 
 75 | $$w^{(i+1)} \leftarrow w^{(i)} - \eta_i\frac{d}{dw}F(w^{(i)})$$
 76 | ???
 77 | A little bit of an issue here is picking a learning rate,
 78 | which is how big a step you are making. If you make too
 79 | small a step, then you're going to get stuck wherever you
 80 | started. If you pick the right step size, you can make very
 81 | quick progress. If you pick too big a step size, you're
 82 | going to be missing the target and just jumping around the
 83 | target. This is a very simple optimizer. But it's not great
 84 | and it’s not very fast because you need to compute gradients
 85 | over the whole dataset. What you can do instead is you can
 86 | approximate the gradient by looking at only a single data point
 87 | at a time.
 88 | 
 89 | ---
 90 | 
 91 | # (Stochastic) Gradient Descent
 92 | .smaller[
 93 | Logistic Regression Objective:
 94 | 
 95 | `$$F(w, b) = C \sum_{i=1}^n\log(\exp(-y_i(w^T \textbf{x}_i + b)) + 1) + ||w||_2^2$$`
 96 | 
 97 | Gradient:
 98 | 
 99 | `$$\frac{d}{dw}F(w, b) = \frac{d}{dw} \left( C \sum_{i=1}^n\log(\exp(-y_i(w^T \textbf{x}_i + b)) + 1) + ||w||_2^2 \right)$$`
100 | 
101 | Stochastic Gradient: Pick $x_i$ randomly, then
102 | 
103 | `$$\frac{d}{dw}F(w, b) \approx \frac{d}{dw} \left( C \log(\exp(-y_i(w^T \textbf{x}_i + b)) + 1) + \frac{1}{n}||w||_2^2 \right)$$`
104 | 
105 | In practice: just iterate over i.
106 | ]
107 | 
108 | ???
109 | Looking at Logistic Regression, the function that Logistic
110 | Regression wants to minimize is the log loss plus the
111 | regularizer. Instead of looking at the gradient for the
112 | whole sum here, we can get a stochastic approximation of the
113 | gradient by looking at only one term of the sum at a time. In
114 | practice, we iterate over the dataset and go one by one
115 | through all the data points. We make sure we shuffle them
116 | before. Then we do small gradient steps. This is a very bad
117 | optimizer compared to the SAG but it can go very quickly
118 | over a lot of data. 
119 | 
120 | 
121 | ---
122 | # Boosting 
123 | .larger[
124 | $$f(x) = \sum_k g_k(x)$$
125 | 
126 |  - Family of algorithms to create "strong" learner $f$ from "weak" learners $g_k$.
127 |  - AdaBoost, GentleBoost, LogitBoost, …
128 | ]
129 | ???
130 |  - Trees or stumps work best
131 |  - Gradient Boosting often the best of the bunch
132 |  - Many specialized algorithms (ranking etc)
133 | 
134 | This is an instance of a more general family of models,
135 | called boosting models, which all iteratively try to improve
136 | a model built up from weak learners. Gradient boosting is
137 | this particular technique where we are trying to fit the
138 | residuals, and it's been found to work very well in
139 | practice, in particular if you're using shallow trees as the
140 | weak learners. In principle, you could use any model as a
141 | weak learner, but trees just work really well.
142 | ---
143 | class: center, middle
144 | # Gradient Boosting
145 | ???
146 | Gradient boosting is one of the most successful supervised
147 | machine learning methods in practice. It's often used in
148 | Kaggle to win competitions, it's used for credit scoring,
149 | it's one of the standard tools of the trade. It's one of the
150 | best off-the-shelf models. A standard implementation that
151 | people use is XGBoost, but there's also an implementation in
152 | scikit-learn, and we'll talk about both of them.
153 | 
154 | Last time we talked about Random forests, which builds many
155 | trees independently, each randomized in a different way, and
156 | then averages their predictions. Gradient boosting on the
157 | other hand builds trees one by one in a sequential manner,
158 | with each tree requiring the results of previous trees.
159 | Often, Gradient boosting is done with very small trees, or
160 | even decision stumps, which is trees of depth one, so a
161 | single split.
162 | 
163 | ---
164 | class: center, middle
165 | # Boosting (in General)
166 | 
167 | ???
168 |  - “Meta-algorithm” to create strong learners from weak learners.
169 |  - AdaBoost, GentleBoost, …
170 |  - Trees or stumps work best
171 |  - Gradient Boosting often the best of the bunch
172 |  - Many specialized algorithms (ranking etc)
173 | 
174 | This is an instance of a more general family of models,
175 | called boosting models, which all iteratively try to improve
176 | a model built up from weak learners. Gradient boosting is
177 | this particular technique where we are trying to fit the
178 | residuals, and it's been found to work very well in
179 | practice, in particular if you're using shallow trees as the
180 | weak learners. In principle, you could use any model as a
181 | weak learner, but trees just work really well.
182 | 
183 | ---
184 | 
185 | # Gradient Boosting Algorithm
186 | 
187 | 
188 | 
189 | `$$ f_{1}(x) \approx y  $$`
190 | 
191 | `$$ f_{2}(x) \approx y - f_{1}(x) $$`
192 | 
193 | `$$ f_{3}(x) \approx y - f_{1}(x) - f_{2}(x)$$`
194 | 
195 | --
196 | 
197 | $y \approx$ ![:scale 22.5%](images/grad_boost_term_1.png) + ![:scale 22.5%](images/grad_boost_term_2.png) + ![:scale 20%](images/grad_boost_term_3.png) + ...
198 | ???
199 | Let's look at the regression case first. We start by
200 | building a single tree f1 to try to predict the output y.
201 | But we strongly restrict f1, so it will be rather bad at
202 | predicting y. Next, we'll look at the residual of this first
203 | model, so y - f1(x). We now train a new model f2 to try and
204 | predict this residual, in other words to correct the
205 | mistakes made by f1. Again, this will be a very simple
206 | model, so it will still not be able to fix all errors. Then,
207 | we look at the residual of both of the models together, so y
208 | - f1(x) - f2(x), so the mistakes that could not be fixed by
209 | f2, and we build f3 to fix that, and so on. This is natural
210 | for regression. For classification this is not as clear. For
211 | binary classification you use log-loss, or rather you apply
212 | the logistic function to get a binary prediction, for
213 | multi-class you can use 1 vs rest.
214 | 
215 | So we're sequentially building up a model using what's
216 | called "weak learners", small trees, and create a more
217 | powerful composite model.
218 | 
219 | ---
220 | # Gradient Boosting Algorithm
221 | 
222 | 
223 | 
224 | `$$ f_{1}(x) \approx y  $$`
225 | 
226 | `$$ f_{2}(x) \approx y - \gamma f_{1}(x) $$`
227 | 
228 | `$$ f_{3}(x) \approx y - \gamma f_{1}(x) - \gamma f_{2}(x)$$`
229 | 
230 | $y \approx \gamma$ ![:scale 22.5%](images/grad_boost_term_1.png) + $\gamma$ ![:scale 22.5%](images/grad_boost_term_2.png) + $\gamma$ ![:scale 20%](images/grad_boost_term_3.png) + ...
231 | <br />
232 | <br />
233 | Learning rate $\gamma$, e.g. 0.1
234 | 
235 | ???
236 | - Iteratively add regression trees to model
237 | - Use log loss for classification
238 | - Discount update by learning rate
239 | 
240 | FIXME plot for regression models
241 | Come back to this
242 | ---
243 | class: compact
244 | # Gradient Boosting is Gradient Descent
245 | 
246 | .left-column[
247 | ## Linear regression
248 | $$ L(\mathbf{x}_i, y_i, \mathbf{w}, b) = \sum_i(y_i - \hat{y}_i)^2 $$
249 | $$ = \sum_i(y_i - w^T \mathbf{x}_i - b)^2 $$
250 | 
251 | optimize:
252 | 
253 | `$$\min_{w \in \mathbb{R}^p, b\in\mathbb{R}} \sum_{i=1}^n (y_i - w^T\mathbf{x}_i - b)^2$$`
254 | 
255 | gradient descent:
256 | $$w_{j+1} = w_j - \gamma \frac{\partial L(\mathbf{x}_i, y_i, \mathbf{w}, b)}{\partial\mathbf{w}}$$
257 | ]
258 | --
259 | .right-column[
260 | ## Gradient Boosting
261 | $$ L(y_i, \hat{y}_i) = \sum_i(y_i - \hat{y}_i)^2 $$
262 | 
263 | optimize:
264 | 
265 | `$$\min_{\hat{y}\in \mathbb{R}^n} \sum_{i=1}^n (y_i - \hat{y_i})^2$$`
266 | 
267 | gradient descent:
268 | $$\hat{y}_{j+1} = \hat{y}_j - \gamma \frac{\partial L(y_i, \hat{y}_i)}{\partial\hat{y}}$$
269 | ]
270 | ---
271 | # GradientBoostingRegressor
272 | 
273 | .center[
274 | ![:scale 50%](images/grad_boost_regression_steps.png)
275 | ]
276 | ???
277 | Here's an illustration for doing this for regression. This
278 | is a 1D regression dataset for illustration purposes; here
279 | the feature is on the x-axis, and the prediction is on the
280 | y-axis.
281 | 
282 | 
283 | In the first step, I'm just fitting my tree to the data. I
284 | use a simple tree of depth 3. The depth 3 tree is not able
285 | to completely model the data and so the orange is the tree
286 | that was fit. After this first step, I look at the total
287 | predictions. So this is just gamma times the predictions
288 | made by the first tree. So you see everything is sort of
289 | squashed together. This is the effect of gamma. The blue
290 | points here in this next panel are the data minus the
291 | predictions from step one, and so this is the residual.
292 | It still looks pretty much the same, because we took only a
293 | small step. Then I fit another tree to these residuals, again
294 | of depth three. Then here's the total prediction, which is
295 | gamma times the first tree plus gamma times the second tree,
296 | plotted against the original data. The orange has more
297 | steps now because it's a combination of two trees.
298 | 
299 | As I continue the same procedure until step five, you can
300 | see that residual gets much smaller and it has learned most
301 | of the variations in the data. A total linear combination of
302 | all the trees and learning looks like this. And if I keep
303 | doing this, then at some point, residuals will become very
304 | small. And the total prediction will fit the data better and
305 | better.
306 | 
307 | The question is can I extrapolate?
308 | 
309 | The answer is no.
310 | 
311 | The question is how is this different?
312 | 
313 | And it's quite different. It’s kind of hard for me to answer
314 | his question.
315 | 
316 | A) The combination of stumps is different than building a
317 | deeper tree because you always apply all the splits to the
318 | whole dataset. Whereas if you make a tree deeper, you will
319 | have different splits on the deeper nodes. If you have
320 | already like 10 nodes, and you want to grow one more level,
321 | each of these parts of the data will have a different split.
322 | Whereas if I add another stump it will be on a global level.
323 | In decision trees, decisions are very hard. And here you
324 | don't trust any hard decisions. So you only go a little bit
325 | in each direction.
326 | 
327 | B) This works way better.
328 | 
329 | So this was for regression and because it's easier to
330 | visualize, you can do the same thing for classification.
331 | 
332 | Basically, what gradient boosting for classification does is
333 | you’re letting regression trees learn the decision function.
334 | So basically, you're doing regression again, but you're
335 | doing logistic regression. Instead of using a linear model
336 | for logistic regression, we're now using this linear
337 | combination of trees. So in other words, what we're doing is
338 | we're applying a log loss, and we're trying to find the
339 | regression function that has a small log loss. This f is the
340 | decision function that we're doing. And so inside a gradient
341 | boosting classifier, you're not actually learning
342 | classification trees, you're learning regression trees,
343 | which are trying to predict the probability, which is quite
344 | different.
345 | 
346 | The question is, how they're different?
347 | 
348 | The same is true for f1. If I fit f1, exactly to y this
349 | becomes 0. One reason is the gamma, which means I only go a
350 | small step. But even if I set gamma equal to one, the point
351 | is that I restricted my f1 to not completely fit the data.
352 | I'm not trying to completely overfit the data, I'm trying to
353 | use a simple model. So if I use a tree of depth one, there
354 | will be a very large residual.
355 | ---
356 | # Gradient boosting for classification
357 | Logistic regression:
358 | 
359 | `$$\min_{w \in \mathbb{R}^{p}, b \in \mathbb{R}}\sum_{i=1}^n\log(\exp(-y_i(w^T \textbf{x}_i + b )) + 1)$$`
360 | 
361 | Gradient boosting:
362 | `$$\min_{\hat{y}\in \mathbb{R}^n}\sum_{i=1}^n\log(\exp(-y_i \hat{y}_i) + 1)$$`
363 | 
364 | ---
365 | # Multiclass Gradient Boosting
366 | Multinomial logistic regression:
367 | `$$p(y=c|x) = \frac{e^{\textbf{w}_c^T\textbf{x} + b_c}}{\sum_{j=1}^k e^{\textbf{w}_j^T\textbf{x} + b_j}}$$`
368 | Multi-class gradient boosting:
369 | `$$p(y=c|x) = \frac{e^{\hat{y}^{(c)}}}{\sum_{j=1}^k e^{\hat{y}^{(j)}}}$$`
370 | One regression tree per class (per gradient step).
371 | ---
372 | # GradientBoostingClassifier / HistGradientBoostingClassifier
373 | 
374 | .center[
375 | ![:scale 70%](images/grad_boost_depth2.png)
376 | ]
377 | ???
378 | Here’s an illustration of what this might look like for
379 | classification. This is basically the same thing going on.
380 | I’m plotting the probabilities assigned by the model.
381 | 
382 | Here's a first estimator, which makes some mistakes here and
383 | makes a bunch of mistakes in the middle. This is a decision
384 | tree of depth two. Then I fit the model to the residuals
385 | here. To minimize the log loss on this dataset, it changes
386 | something in the middle, and so on. And you can see that as
387 | I add more and more estimators it fits the data better and
388 | better and gets more and more complex decision boundaries.
389 | 
390 | Since it works like a gradient descent on log loss, it's a
391 | little bit harder to visualize this for classification.
392 | 
393 | White means a probability of 0.5 (which is a tie). Red means
394 | high probability for the red class. Blue means a high
395 | probability for the blue class. Here, clearly, it hasn't fit
396 | the data perfectly. So I could add more and more models and
397 | then, in the end, it would fit the data perfectly.
398 | 
399 | For multi-class, one regression tree is fit per class in each step, combined via softmax.
400 | 
401 | ---
402 | class:spacious
403 | # Gradient Boosting Advantages
404 | 
405 | # Early stopping
406 | 
407 | - Adding trees can lead to overfitting
408 | - Stop adding trees when validation accuracy stops increasing
409 | 
410 | two choices:
411 | 
412 | - pick number of trees and tune learning rate
413 | 
414 | - pick learning rate, use early stopping
415 | ???
416 | As I said earlier, if you add more and more trees, the
417 | algorithms can overfit. So instead of searching for the
418 | learning rate, or searching for number of trees, you can
419 | also do early stopping. Because this is a sequential
420 | algorithm that gets better and better the more trees you
421 | add, you can just use a validation set and stop adding trees
422 | once you overfit. You can do that both with scikit-learn and
423 | XGBoost.
424 | 
425 | Basically, the idea is that you pick a large number of
426 | estimators and you have a separate validation set and if the
427 | validation set accuracy doesn't improve or if it decreases
428 | for a number of iterations, say five, then you just stop the
429 | learning. This way, you get results faster, because you
430 | don't keep learning but also you possibly get a better model
431 | because you avoid the overfitting.
432 | 
433 | The only downside of early stopping is that you have less
434 | data to train your model because you need the validation
435 | set for early stopping. You’ll still need a separate test
436 | set to see how well the model actually does. You can't use
437 | the same set for early stopping and for evaluation.
438 | ---
439 | 
440 | class:spacious
441 | # Tuning of Gradient Boosting
442 | - Typically strong pruning via max_depth
443 | 
444 | - Tune max_features
445 | - Tune column subsampling, row subsampling
446 | - Regularization
447 | - Pick learning rate and do early stopping
448 | - Or Pick n_estimators, tune learning rate (if not early stopping)
449 | ???
450 | There are several things you can tune about the gradient
451 | boosting. A common approach is to pick the number of
452 | estimators that you have time for. So runtime is obviously
453 | linear in the number of estimators because you need to build
454 | each tree one by one. Each tree is built with the same
455 | parameters. And then you can tune the learning rate to see
456 | how strongly you want to fit the data.
457 | 
458 | You can also tune something like max features if you want to
459 | add more randomness, but that's actually not very commonly
460 | used. You can also subsample the data if you want faster
461 | training. And typically there's a maximum depth.
462 | Traditionally, it was like maximum depth of one, two or
463 | three, though in kaggle people are doing like 8 to 10 or
464 | something. But typically, the depth is much, much smaller
465 | than for random forests. This means the model will be small
466 | in memory, and also will be faster to predict since you have
467 | less deep trees to traverse.
468 | Before we go into implementation I want to talk a little bit
469 | about analyzing the models. So again, because it’s a
470 | tree-based model, the prediction is a linear combination of
471 | trees, you can look at feature importance like the same way
472 | we did for random forest and trees.
473 | 
474 | ---
475 | class: middle
476 | # Improvements:
477 | # "extreme" gradient boosting
478 | 
479 | [XGBoost: A Scalable Tree Boosting System, 2016](http://dmlc.cs.washington.edu/data/pdf/XGBoostArxiv.pdf)
480 | ---
481 | class: center, middle
482 | # Speeding up tree-building via binning
483 | ---
484 | # Split finding is slow
485 | 
486 | ```python
487 | for feature in features:
488 |     for threshold in thresholds(f):
489 |         gain = compute_gain(feature, threshold)
490 |         if gain > best_gain:
491 |             best_split = (feature, threshold)
492 | ```
493 | 
494 | - Thresholds: all unique values of feature.
495 | - Scanning thresholds = sorting: O(n log n)
496 | ---
497 | # Binning features
498 | 
499 | ![:scale 100%](images/binning_quantiles.png)
500 | ---
501 | # Binning features
502 | 
503 | .left-column[
504 | ## Original
505 | ```
506 | [[5. , 2. , 3.5, 1. ],
507 |  [4.9, 3. , 1.4, 0.2],
508 |  [4.4, 2.9, 1.4, 0.2],
509 |  [5. , 2.3, 3.3, 1. ],
510 |  [4.9, 2.5, 4.5, 1.7],
511 |  [6.3, 2.5, 5. , 1.9],
512 |  [6.3, 2.3, 4.4, 1.3],
513 |  [5. , 3.5, 1.3, 0.3],
514 |  [6.1, 2.8, 4.7, 1.2],
515 |  [5. , 3.5, 1.6, 0.6]]
516 |  ```
517 |  ]
518 |  .right-column[
519 | ## Binned
520 | ```
521 | [[1, 0, 1, 1],
522 |  [0, 2, 0, 1],
523 |  [0, 1, 0, 1],
524 |  [1, 0, 1, 1],
525 |  [0, 0, 2, 3],
526 |  [3, 0, 3, 4],
527 |  [3, 0, 2, 2],
528 |  [1, 4, 0, 1],
529 |  [3, 1, 3, 2],
530 |  [1, 4, 1, 1]]
531 | ```
532 |  ]
533 | ---
534 | # Binned split finding is fast
535 | .left-column[
536 | ![:scale 100%](images/xgboost-exact.png)
537 | ]
538 | --
539 | .right-column[
540 | ![:scale 100%](images/xgboost-binning.png)
541 | ]
542 | ---
543 | class: spacious
544 | 
545 | # A better split criterion
546 | - Include second order (hessian) and regularizer
547 | ![:scale 100%](images/xgboost-criterion.png)
548 | 
549 | ---
550 | 
551 | # Aggressive sub-sampling
552 | .padding-top[
553 | According to user feedback, using column sub-sampling prevents over-fitting even more so than the traditional row sub-sampling. (XGBoost: A Scalable Tree Boosting System, 2016)
554 | ]
555 | ---
556 | # HistGradientBoostingClassifier
557 | 
558 | ![:scale 100%](images/hist_gradient_boosting.png)
559 | ---
560 | 
561 | .left-column[
562 | 
563 | ## GradientBoostingClassifier
564 | 
565 | - no binning
566 | - single core
567 | - sparse data support
568 | ]
569 | 
570 | .right-column[
571 | ## HistGradientBoostingClassifier
572 | - binning
573 | - multicore
574 | - no sparse data support
575 | - missing value support
576 | - soon: monotonicity support
577 | - soonish: native categorical variables
578 | ]
579 | 
580 | ---
581 | # XGBoost
582 | `conda install -c conda-forge xgboost`
583 | 
584 | ```python
585 | from xgboost import XGBClassifier
586 | xgb = XGBClassifier()
587 | xgb.fit(X_train, y_train)
588 | xgb.score(X_test, y_test))
589 | ```
590 | 
591 | - supports missing values
592 | - GPU training
593 | - networked parallel training
594 | - monotonicity constraints
595 | - supports sparse data
596 | 
597 | ???
598 | In terms of implementation, there's a couple of
599 | implementation methods. Scikit-learn has gradient boosting
600 | classifier and gradient boosting regressor. Unfortunately,
601 | they are rather slow.
602 | 
603 | So one of the most commonly used packages is XGBoost, which
604 | you can install from conda forge for example. It's
605 | completely scikit-learn compatible. So you can import
606 | XGBClassifier if you want you can use it with grid search or
607 | with pipelines.
608 | 
609 | The cool thing is it's fast. It supports missing values so
610 | you don't need to do an imputation strategy, you can put
611 | them directly into XGBoost. And it also supports multi-core.
612 | 
613 | The gradient boosting in scikit-learn is just sequential on
614 | a single core which on most of the machines are not great.
615 | If you have a big machine you want to use all the cores in
616 | parallel. You can’t do that with scikit-learn but you can do
617 | it with XGBoost.
618 | 
619 | For random forest, on the other hand, scikit-learn can also
620 | run random forest in parallel, because it's much easier.
621 | XGBoost also has a random forest. XGBoost also has some
622 | enhancements to the algorithm. It has L1 and L2 penalties
623 | for the leaves, so you can do something like an elastic net
624 | or lasso penalty to the leaves. I think by default, it's
625 | disabled. But they are not really training like the vanilla
626 | decision trees.
627 | 
628 | One of the reasons they're faster is because they have a
629 | faster implementation. But you can make it even faster if you
630 | use approximate splits in the trees.
631 | 
632 | If you want to find a split, you need to search over all
633 | features and you need to search over all possible thresholds
634 | on the feature. Searching over all possible thresholds on
635 | the feature means sorting the feature, which is an O(n log n)
636 | operation. Instead, what you can do is you can bin the
637 | features, and then it's a linear time operation but the
638 | threshold will be approximate.
639 | 
640 | And that will make your computation much faster.
641 | - Efficient implementation of gradient boosting (5x sklearn)
642 | - Improvements on original algorithm
643 | - https://arxiv.org/abs/1603.02754
644 | - Adds l1 and l2 penalty on leaf-weights
645 | - Fast approximate split finding
646 | - Scikit-learn compatible interface
647 | 
648 | ---
649 | # LightGBM
650 | `conda install -c conda-forge lightgbm`
651 | 
652 | ```python
653 | from lightgbm.sklearn import LGBMClassifier
654 | lgbm = LGBMClassifier()
655 | lgbm.fit(X_train, y_train)
656 | lgbm.score(X_test, y_test))
657 | ```
658 | 
659 | - supports missing values
660 | - natively supports categorical variables
661 | - GPU training
662 | - networked parallel training
663 | - monotonicity constraints
664 | - supports sparse data
665 | ---
666 | # CatBoost
667 | .smallest[
668 | `conda install -c conda-forge catboost`
669 | ```python
670 | from catboost.sklearn import CatBoostClassifier
671 | catb = CatBoostClassifier()
672 | catb.fit(X_train, y_train)
673 | catb.score(X_test, y_test))
674 | ```
675 | ]
676 | .smaller[
677 | - optimized for categorical variables
678 | - uses one feature / threshold for all splits on a given level aka symmetric trees
679 | - Symmetric trees are "different" but can be much faster
680 | - supports missing value
681 | - GPU training
682 | - monotonicity constraints
683 | - uses bagged and smoothed version of target encoding for categorical variables
684 | - lots of tooling
685 | ]
686 | ---
687 | class:spacious
688 | # Gradient Boosting Advantages
689 | 
690 | - Very fast using HistGradientBoosting (or XGBoost, LightGBM)
691 | - Small model size
692 | - Typically more accurate than Random Forests
693 | 
694 | - "old" GradientBoosting in sklearn is comparatively slow
695 | ???
696 | It's sort of slower to train if you're training serially, but
697 | if you parallelize it, it's often faster to train since it has
698 | a much smaller model size. So the trees are usually not as
699 | deep and you don't need as many because you're doing much
700 | more focused learning where you try to correct the mistakes
701 | of the other models. So usually it’s very fast to predict
702 | because prediction can happen in parallel over all the
703 | trees, while learning cannot happen in parallel over all the
704 | trees necessarily. Usually, this is more accurate than
705 | random forests.
706 | 
707 | The question is: for the same number of estimators, how does
708 | a low versus a high learning rate change the model?
709 | 
710 | High learning rate allows you to fit the data more strongly
711 | and also overfit the data more strongly.
712 | 
713 | 
714 | ---
715 | class: middle
716 | # Concluding tree-based models
717 | ---
718 | class:spacious
719 | # When to use tree-based models
720 | - Model non-linear relationships
721 | - Doesn’t care about scaling, no need for feature engineering
722 | - Single tree: very interpretable (if small)
723 | - Random forests very robust, good benchmark
724 | - Gradient boosting often best performance with careful tuning
725 | 
726 | ???
727 | To summarize, tree-based models are really very, very
728 | popular family of models. They're very commonly used. You
729 | probably want to use them if you want nonlinear
730 | relationships, or if you have a lot of different kinds of
731 | weird features because they really don't care about scaling
732 | of the features, so it allows you to get rid of mostly
733 | preprocessing.
734 | 
735 | If you want very interpretable model, single trees or small
736 | single trees are good ideas, because it's one of the few
737 | models that you can sort of write down on a blackboard and
738 | show to someone, and they'll have some idea of what's going
739 | on. That's impossible for any other models.
740 | 
741 | Random forests are great because they're very robust. And
742 | you don't have to tune anything, you just run a random
743 | forest with 100 trees and with the default settings and it
744 | will work great.
745 | 
746 | And so this is one of my first benchmarks. Usually, I first run
747 | a logistic regression then I run random forests.
748 | 
749 | Gradient Boosting is often the best performing model, sort
750 | of the centered toolbox. Sometimes you need to tune a little
751 | bit more, so you need to tune the depth of the trees or the
752 | learning rate, and so on. But if you tune them, then they
753 | will usually beat random forest on those data sets.
754 | 
755 | One case where you might not want to use trees is if you
756 | have very high dimensional sparse data then linear models
757 | might work better but also your mileage may vary. On the
758 | contrary, if you have like a low dimensional space,
759 | tree-based models are probably a good bet.
760 | 
761 | Next thing we'll talk about is a different way to build
762 | ensembles, different way to put together multiple models.
763 | 
764 | ---
765 | class: center, middle
766 | 
767 | # Questions ?
768 | 
769 |     </textarea>
770 |     <script src="https://remarkjs.com/downloads/remark-latest.min.js"></script>
771 |     <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
772 | 
773 |     <script>
774 |     // Config Remark
775 |     remark.macros['scale'] = function (percentage) {
776 |         var url = this;
777 |         return '<img src="' + url + '" style="width: ' + percentage + '" />';
778 |     };
779 |     config_remark = {
780 |         highlightStyle: 'magula',
781 |         highlightSpans: true,
782 |         highlightLines: true,
783 |         ratio: "16:9"
784 |     };
785 |       var slideshow = remark.create(config_remark);
786 |     // Configure MathJax
787 |     MathJax.Hub.Config({
788 |     tex2jax: {
789 |         inlineMath: [['$','$'], ['\\(','\\)']],
790 |         skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] /* removed 'code' entry*/
791 |     }
792 |     });
793 |     MathJax.Hub.Queue(function() {
794 |         var all = MathJax.Hub.getAllJax(), i;
795 |         for(i = 0; i < all.length; i += 1) {
796 |             all[i].SourceElement().parentNode.className += ' has-jax';
797 |         }
798 |     });
799 |     </script>
800 |   </body>
801 | </html>
802 | 


--------------------------------------------------------------------------------
/slides/03-linear-models-regression.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 |   <head>
  4 |     <title>Linear Models for Regression</title>
  5 |     <meta charset="utf-8">
  6 |     <link rel="stylesheet" href="style.css">
  7 |     <style>
  8 |         @import url(https://fonts.googleapis.com/css?family=Garamond);
  9 |         @import url(https://fonts.googleapis.com/css?family=Muli:400,700,400italic);
 10 |         @import url(https://fonts.googleapis.com/css?family=Ubuntu+Mono:400,700,400italic);
 11 |       </style>
 12 |     </head>
 13 |     <body>
 14 |       <textarea id="source">
 15 | 
 16 | class: center, middle
 17 | 
 18 | ![:scale 40%](images/sklearn_logo.png)
 19 | 
 20 | ### Intermediate Machine learning with scikit-learn
 21 | 
 22 | # Linear Models for Regression
 23 | 
 24 | Andreas C. Müller
 25 | 
 26 | Columbia University, scikit-learn
 27 | 
 28 | .smaller[https://github.com/amueller/ml-workshop-2-of-4]
 29 | 
 30 | ---
 31 | class:center
 32 | 
 33 | # Linear Models for Regression
 34 | 
 35 | ![:scale 50%](images/linear_regression_1d.png)
 36 | 
 37 | $$\hat{y} = w^T \mathbf{x} + b = \sum_{i=1}^p w_i x_i +b$$
 38 | 
 39 | ???
 40 | Predictions in all linear models for regression are of the
 41 | form shown here: It's an inner product of the features with
 42 | some coefficient or weight vector w, and some bias or
 43 | intercept b. In other words, the output is a weighted sum of
 44 | the inputs, possibly with a shift. here i runs over the
 45 | features and x_i is one feature of the data point x. These
 46 | models are called linear models because they are linear in
 47 | the parameters w. The way I wrote it down here they are also
 48 | linear in the features x_i. However, you can replace the
 49 | features by any non-linear function of the inputs, and it'll
 50 | still be a linear model.
 51 | 
 52 | There are many different linear models for regression, and
 53 | they all share this formula for making predictions. The
 54 | difference between them is in how they find w and b based on
 55 | the training data.
 56 | 
 57 | ---
 58 | # Ordinary Least Squares
 59 | 
 60 | $$\hat{y} = w^T \mathbf{x} + b = \sum_{i=1}^p w_i x_i +b $$
 61 | 
 62 | `$$\min_{w \in \mathbb{R}^p, b\in\mathbb{R}} \sum_{i=1}^n (w^T\mathbf{x}_i + b - y_i)^2$$`
 63 | 
 64 | Unique solution if $\mathbf{X} = (\mathbf{x}_1, ... \mathbf{x}_n)^T$ has full column rank.
 65 | ???
 66 | The most straight-forward solution that goes back to Gauss
 67 | is ordinary least squares. In ordinary least squares, we find w
 68 | and b such that the predictions on the training set are as
 69 | accurate as possible according to the squared error. That
 70 | intuitively makes sense: we want the predictions to be good
 71 | on the training set. If there are more samples than features
 72 | (and the samples span the whole feature space), then there
 73 | is a unique solution. The problem is what's called a least
 74 | squares problem, which is particularly easy to optimize and
 75 | get the unique solution to.
 76 | 
 77 | However, if there are more features than samples, there are
 78 | usually many perfect solutions that lead to 0 error on the
 79 | training set. Then it's not clear which solution to pick.
 80 | Even if there are more samples than features, if there are
 81 | strong correlations among features the results might be
 82 | unstable, and we'll see some examples of that soon.
 83 | 
 84 | Before we look at examples, I want to introduce a popular
 85 | alternative.
 86 | 
 87 | ---
 88 | # Ridge Regression
 89 | 
 90 | `$$ \min_{w \in \mathbb{R}^p, b\in\mathbb{R}} \sum_{i=1}^n (w^T\mathbf{x}_i + b - y_i)^2 + \alpha ||w||^2 $$`
 91 | 
 92 | Always has a unique solution.
 93 | 
 94 | Tuning parameter alpha.
 95 | 
 96 | ???
 97 | In Ridge regression we add another term to the optimization
 98 | problem. Not only do we want to fit the training data well,
 99 | we also want w to have a small squared l2 norm or squared
100 | euclidean norm. The idea here is that we're decreasing the
101 | "slope" along each of the features by pushing the
102 | coefficients towards zero. This constrains the model to be
103 | more simple.
104 | 
105 | So there are two terms in this optimization problem, which
106 | is also called the objective function of the model: the data
107 | fitting term here that wants to be close to the training
108 | data according to the squared norm, and the penalty or
109 | regularization term here that wants w to have small norm,
110 | and that doesn't depend on the data.
111 | 
112 | Usually these two goals are somewhat opposing. If we made w
113 | zero, the second term would be zero, but the predictions
114 | would be bad. So we need to trade off between these two. The
115 | trade off is problem specific and is specified by the user.
116 | If we set alpha to zero, we get linear regression, if we set
117 | alpha to infinity we get a constant model. Obviously usually
118 | we want something in between.
119 | 
120 | This is a very typical example of a general principle in
121 | machine learning, called regularized empirical risk
122 | minimization.
123 | 
124 | ---
125 | # (regularized) Empirical Risk Minimization
126 | 
127 | `$$ \min_{f \in F} \sum_{i=1}^n L(f(\mathbf{x}_i), y_i) + \alpha R(f) $$`
128 | 
129 | ???
130 | FIXME pointers data fitting / regularization!
131 | 
132 | Many models in machine learning, like linear models, SVMs
133 | and neural networks follow the general framework of
134 | empirical risk minimization, which you can see here. We
135 | formulate the machine learning problem as an optimization
136 | problem over a family of functions. In our case that was the
137 | family of linear functions parametrized by w and b. The
138 | minimization problem consists of two parts, the data fitting
139 | part and the model complexity part. The data fitting part
140 | says that the predictions made by our functions should be
141 | accurate according to some loss L. For our regression
142 | problems that was the squared loss. The model complexity
143 | part says that we prefer simple models and penalizes
144 | complicated f. Most machine learning algorithms can be cast
145 | into this, with a particular choice of family of functions
146 | F, loss function L and regularizer R. And most of machine
147 | learning theory is built around this framework. People prove
148 | for different choices of F and L and R that if you minimize
149 | this, you'll be able to generalize well. And that makes
150 | intuitive sense. To do well on the test set, we definitely
151 | want to do reasonably well on the training set. We don't
152 | expect that we can do better on a test set than the training
153 | set. But we also want to minimize the performance difference
154 | between training and test set. If we restrict our model to
155 | be simple via the regularizer R, we have better chances of
156 | the model generalizing.
157 | 
158 | ---
159 | # Reminder on model complexity
160 | 
161 | ![:scale 80%](images/overfitting_underfitting_cartoon_full.png)
162 | 
163 | ???
164 | I hope this sounds familiar from what we talked about last
165 | time. This is a particular way of dealing with overfitting
166 | and underfitting. For this framework in general, or for
167 | ridge regression in particular, trading off the data fitting
168 | and the regularization changes the model complexity. If we
169 | set alpha high we restrict the model, and we will be on the
170 | left side of the graph. If we make alpha small, we allow the
171 | model to fit the data more, and we're on the right side of
172 | the graph.
173 | 
174 | ---
175 | # Ames Housing Dataset
176 | 
177 | ![:scale 80%](images/ames_housing_scatter.png)
178 | .tiny[
179 | ```python
180 | print(X.shape)
181 | print(y.shape)
182 | ```
183 | ```
184 | (2195, 79)
185 | (2195,)
186 | ```
187 | ]
188 | ???
189 | Ok after all this pretty abstract talk, let's make this
190 | concrete. Let's do some regression on the boston housing
191 | dataset. After the last homework you're hopefully familiar
192 | with it. The idea is to predict prices of property in the
193 | boston area in different neighborhoods. This is a dataset
194 | from the 70s I think, so everything is pretty cheap. Most of
195 | the features you can see are continuous, with the exception
196 | of the charlston river variable which says whether the
197 | neighborhood is on the river.
198 | 
199 | Keep in mind that this data lives in a 13 dimensional space
200 | and these univariate plots only look at 13 different
201 | projections of the data, and can't capture any of the
202 | interactions.
203 | 
204 | But still we can see that the price clearly depends on some
205 | of these variables. It's also pretty clear that the
206 | dependency is non-linear for some of the variables. We'll
207 | still start with a linear model, because it's a very simple
208 | class of models, and I'd always start approaching any model
209 | from the simplest baseline. In this case it's linear
210 | regression. We're having 506 samples and 13 features. We
211 | have much more samples than features. Linear regression
212 | should work just fine. Also it's a tiny dataset, so
213 | basically anything we'll try will run instantaneously, which
214 | is also good to keep in mind.
215 | 
216 | Another thing that you can see in this graph is that the
217 | features have very different scales. Here's a box plot that
218 | shows that even more clearly.
219 | ---
220 | 
221 | ![:scale 100%](images/ames_scaling.png)
222 | 
223 | ???
224 | That's something that will trip up the distance based
225 | models we talked about last time, as well as the linear
226 | models we're talking about today.  For the penalized models
227 | the different scales mean that different features are
228 | penalized differently, which you usually want to avoid.
229 | Usually there is no particular semantics attached to the
230 | fact that one feature has different magnitudes than another.
231 | We could measure something in inches instead of miles, and
232 | that would change the outcome of the model. That's certainly
233 | not something we want. A good idea is to scale the data to
234 | get rid of this effect. We'll talk about that and other
235 | preprocessing methods in-depth on Wednesday next week. Today
236 | I'm mostly gonna ignore this. But let's get started with
237 | Linear Regression
238 | 
239 | ---
240 | # Coefficient of determination R^2
241 | 
242 | `$$ R^2(y, \hat{y}) = 1 - \frac{\sum_{i=0}^{n - 1} (y_i - \hat{y}_i)^2}{\sum_{i=0}^{n - 1} (y_i - \bar{y})^2} $$`
243 | 
244 | `$$ \bar{y} =  \frac{1}{n} \sum_{i=0}^{n - 1} y_i$$`
245 | 
246 | Can be negative for biased estimators - or the test set!
247 | 
248 | ???
249 | The scores are R squared or coefficient of determination.
250 | This is basically a score that's usually between zero and
251 | one, where one means perfect prediction or perfect
252 | correlation and zero means a random prediction. What it does
253 | is it computes the mean of the targets over the data you're
254 | evaluating it on and then it looks at the distance between
255 | the prediction and the ground truth relative to the mean. If
256 | it's negative, it means you do a worse job at predicting and
257 | just predicting the mean. It can happen if your model was
258 | really bad and bias. The other reason is if you use a test
259 | set. This is guaranteed to be positive on the data it was
260 | fit on with an unbiased linear model, which will nearly
261 | never apply to what we’re doing.
262 | 
263 | The R^2 can be misleading if there's outliers in the training
264 | data and some consider it a bad metric. Max Kuhn, author of APM
265 | thinks it's a bad metric. It's not clear to me that MSE is much
266 | better in general, though. Reducing anything to a single number
267 | is tricky.
268 | 
269 | ---
270 | class: smaller
271 | # Preprocessing
272 | ```python
273 | cat_preprocessing = make_pipeline(
274 |     SimpleImputer(strategy='constant', fill_value='NA'),
275 |     OneHotEncoder(handle_unknown='ignore'))
276 | 
277 | cont_preprocessing = make_pipeline(
278 |     SimpleImputer(),
279 |     StandardScaler())
280 | 
281 | preprocess = make_column_transformer(
282 |     (cat_preprocessing, make_column_selector(dtype_include='object')),
283 |     remainder=cont_preprocessing)
284 | 
285 | X_train, X_test, y_train, y_test = train_test_split(
286 |     X, y, random_state=0)
287 | 
288 | X_train_pre = preprocess.fit_transform(X_train)
289 | ridge = Ridge().fit(X_train_pre, y_train)
290 | 
291 | X_test_pre = preprocess.transform(X_test)
292 | ridge.score(X_test_pre)
293 | ```
294 | ```
295 | 0.95
296 | ```
297 | ???
298 | Let’s look at two simple models. Linear regression and Ridge
299 | regression. What I've done is I’ve split the data into
300 | training and test set and used 10 fold cross-validation to
301 | evaluate them. Here I use cross_val_score together with the
302 | model, the training data, training labels, and 10 fold
303 | cross-validation. This will return 10 scores and I'm going
304 | to compute the mean of them. I'm doing this for both linear
305 | regression and Ridge regression. Here ridge regression
306 | uses the default value of alpha, which is 1. Here these two scores
307 | are quite similar.
308 | 
309 | 
310 | ---
311 | 
312 | .smallest[
313 | ```python
314 | from sklearn.model_selection import GridSearchCV
315 | param_grid = {'alpha': np.logspace(-3, 3, 13)}
316 | print(param_grid)
317 | ```
318 | ```
319 | {'alpha': array([ 0.001,  0.003, 0.01, 0.032, 0.1, 0.316, 1., 3.162,
320 |                     10., 31.623, 100., 316.228, 1000.])}
321 | ```
322 | 
323 | ```python
324 | grid = GridSearchCV(Ridge(), param_grid, cv=10)
325 | grid.fit(X_train, y_train)
326 | ```
327 | ]
328 | .center[
329 | ![:scale 50%](images/ridge_alpha_search.png)
330 | ]
331 | 
332 | ???
333 | Coming back to the ridge regression we used the standard
334 | alpha of one which is a reasonable default, but by no means,
335 | this is guaranteed to make sense in this particular problem.
336 | Here I’ve done the grid search. As we talked about on
337 | Monday, I defined a parameter grid where the key is the
338 | parameter I want to search (alpha in Ridge) and the
339 | parameters I want to try. For regularization parameters,
340 | like alpha, it’s usually good to do them on the logarithmic
341 | grid. I do a relatively fine grid here with 13 different
342 | points mostly because I wanted to have a nice plot. In reality,
343 | I would use three or six points or something like that.
344 | I’ve instantiated GridSearchCV with Ridge(), the parameter
345 | grid and do 10 fold cross-validation and then I called
346 | grid.fit. I’ve reported the mean training score and mean
347 | test score (R^2) over 10 cross-validation folds for each of the
348 | parameter settings. Okay, you can see a couple things here.
349 | A) There's a lot of uncertainty B) The training set is
350 | always better than the test set. C) The most important thing
351 | that you can see here is that regularization didn't help.
352 | Making alpha as small as possible is the best. What I'm
353 | going to do next is I'm going to modify this dataset a
354 | little bit so that we can see the effect of the
355 | regularization. I’m going to modify it by using a polynomial
356 | expansion, which again we're going to talk about a little bit
357 | more on Wednesday.
358 | 
359 | ---
360 | .padding-top[
361 | .left-column[
362 | ![:scale 100%](images/ridge_alpha_search.png)
363 | ]
364 | .right-column[
365 | ![:scale 100%](images/ridge_alpha_search_cv_runs.png)
366 | ]
367 | ]
368 | ---
369 | # Triazine Dataset
370 | 
371 | ```python
372 | triazines = fetch_openml('triazines')
373 | triazines.data.shape
374 | ```
375 | ```
376 | (186, 60)
377 | ```
378 | ```python
379 | pd.Series(triazines.target).hist()
380 | ```
381 | .center[
382 | ![:scale 40%](images/triazine_bar.png)
383 | ]
384 | 
385 | ---
386 | ```python
387 | X_train, X_test, y_train, y_test = train_test_split(
388 |     triazines.data, triazines.target, random_state=0)
389 | 
390 | cross_val_score(LinearRegression(), X_train, y_train, cv=5)
391 | ```
392 | ```
393 | array([-4.749e+24, -9.224e+24, -7.317e+23, -2.318e+23, -2.733e+22])
394 | ```
395 | ```python
396 | cross_val_score(Ridge(), X_train, y_train, cv=5)
397 | ```
398 | ```
399 | array([0.263, 0.455, 0.024, 0.23 , 0.036])
400 | ```
401 | 
402 | ---
403 | ```python
404 | param_grid = {'alpha': np.logspace(-3, 3, 13)}
405 | 
406 | grid = GridSearchCV(Ridge(), param_grid, cv=RepeatedKFold(10, 5),
407 |                     return_train_score=True)
408 | grid.fit(X_train, y_train)
409 | ```
410 | .center[
411 | ![:scale 40%](images/ridge_alpha_triazine.png)
412 | ]
413 | 
414 | ---
415 | # Plotting coefficient values (LR)
416 | 
417 | ```python
418 | lr = LinearRegression().fit(X_train, y_train)
419 | plt.scatter(range(X_train.shape[1]), lr.coef_,
420 |             c=np.sign(lr.coef_), cmap='bwr_r')
421 | ```
422 | .center[
423 | ![:scale 55%](images/lr_coefficients_large.png)
424 | ]
425 | 
426 | ???
427 | In the previous slide, linear regression did nearly as well
428 | as the Ridge Regression with the default parameters but
429 | let's look at the coefficients of a linear model. These are
430 | the coefficients of the linear model trained on the
431 | polynomial features. I plot them adding a color to represent
432 | positive and negative. The magnitude here is 1 and then 13
433 | zeros. We have two features, one is like, probably more than
434 | trillions times two and then there's another one that's very
435 | negative. What I conclude from this is, these 2 features are
436 | very highly correlated. The model makes both of them really,
437 | really big and then they cancel each other out. This is not
438 | a very nice model, because it hurts numerical stability
439 | and also it tells me that these 2 features are like
440 | extremely important, but they might not be important at all.
441 | Like the other features might be more important, but they're
442 | nullified by this cancellation effect. They (0.2 and -0.8)
443 | need to cancel each other out because the predictions are
444 | reasonable and all the houses only cost like $70,000.
445 | 
446 | ---
447 | # Ridge Coefficients
448 | 
449 | ```python
450 | ridge = grid.best_estimator_
451 | plt.scatter(range(X_train.shape[1]), ridge.coef_,
452 |             c=np.sign(ridge.coef_), cmap="bwr_r")
453 | ```
454 | .center[
455 | ![:scale 55%](images/ridge_coefficients.png)
456 | ]
457 | 
458 | ???
459 | Let's look at the Ridge model. This is the best estimator,
460 | which is the model that was found in a grid search with the
461 | best parameter settings. This looks much more reasonable.
462 | This feature, which was a very negative one, still is very
463 | negative. But now this is actually three and minus three. So
464 | this is a much more reasonable range. We can also look at
465 | the features and the effect of different values of alpha.
466 | Here is a Ridge with 3 different values of alpha. With the
467 | previous random seed, alpha equal to 14 was the best, and
468 | now we have it equal to 30-something. The green one is more
469 | or less the best setting and then there's a smaller and a
470 | bigger one. You can see that basically what alpha does is,
471 | on average, it pushes all the coefficients toward zero. So
472 | here you can see this coefficient shrank going from 1 to 14
473 | going to 0 and the same here. So basically, they all push
474 | the different features towards 0. If you look at this long
475 | enough, you can see things that are interesting, the first
476 | one with alpha equal to one it's positive, and with alpha
477 | equal to 100, it's negative. That means depending on how
478 | much you regularize the direction of effect goes in opposite
479 | directions, what that tells me is don't interpret your
480 | models too much because clearly, either it has a positive or
481 | negative effect, it can't have both.
482 | 
483 | ---
484 | ```python
485 | ridge100 = Ridge(alpha=100).fit(X_train, y_train)
486 | ridge1 = Ridge(alpha=.1).fit(X_train, y_train)
487 | plt.figure(figsize=(8, 4))
488 | 
489 | plt.plot(ridge1.coef_, 'o', label="alpha=.1")
490 | plt.plot(ridge.coef_, 'o', label=f"alpha={ridge.alpha:.2f}")
491 | plt.plot(ridge100.coef_, 'o', label="alpha=100")
492 | plt.legend()
493 | ```
494 | 
495 | .center[
496 | ![:scale 60%](images/ridge_coefficients_alpha.png)
497 | ]
498 | 
499 | ???
500 | 
501 | One other way to visualize the coefficient is to look at the
502 | coefficient path or regularization path. On the x-axis is
503 | the alpha, and on the y-axis, the coefficient magnitude.
504 | Basically, I looped over all of the different alphas and you
505 | can see how the coefficients shrink towards zero as alpha increases.
506 | There are some very big coefficients that go to zero very
507 | quickly and some coefficients here that stay the same for a
508 | long time.
509 | 
510 | 
511 | ---
512 | # Lasso Regression
513 | 
514 | `$$ \min_{w \in \mathbb{R}^p, b\in\mathbb{R}} \sum_{i=1}^n (w^T\mathbf{x}_i + b - y_i)^2 + \alpha ||w||_1 $$`
515 | 
516 | - Shrinks w towards zero like Ridge
517 | 
518 | - Sets some w exactly to zero - automatic feature selection!
519 | 
520 | ???
521 | Lasso Regression looks very similar to Ridge Regression. The
522 | only thing that is changed is we use the L1 norm instead of
523 | the L2 norm. L2 norm is the sum of squares, the L1 norm is
524 | the sum of the absolute values. So again, we are shrinking w
525 | towards 0, but we're shrinking it in a different way. The L2
526 | norm penalizes very large coefficients more, the L1 norm
527 | penalizes all coefficients equally. What this does in
528 | practice is it sets some entries of w to exactly 0. It does
529 | automatic feature selection: a coefficient of zero means the
530 | feature doesn't influence the prediction, so you can just drop
531 | it out of the model. This model does feature selection
532 | together with prediction. Ideally what you would want is,
533 | let's say you want a model that does feature selection.
534 | The goal is to make our model automatically select the
535 | features that are good. What you would want is to penalize the
536 | number of features that it uses, and that would be the L0 norm.
537 | 
538 | ---
539 | # Grid-Search for Lasso
540 | ```python
541 | param_grid = {'alpha': np.logspace(-3, 0, 13)}
542 | print(param_grid)
543 | ```
544 | ```
545 | {'alpha': array([0.001, 0.002, 0.003, 0.006, 0.01, 0.018, 0.032, 0.056,
546 |                     0.1, 0.178, 0.316, 0.562, 1.])}
547 | ```
548 | ```python
549 | grid = GridSearchCV(Lasso(normalize=True), param_grid, cv=10)
550 | grid.fit(X_train, y_train)
551 | 
552 | print(grid.best_params_)
553 | print(grid.best_score_)
554 | ```
555 | ```
556 | {'alpha': 0.0016}
557 | 0.163
558 | ```
559 | 
560 | 
561 | ???
562 | 
563 | Now we can do Grid Search again, the default parameters
564 | usually don't work very well for Lasso. I use alpha on the
565 | logarithmic grid. I fitted and then I get the best score.
566 | ---
567 | 
568 | ![:scale 90%](images/lasso_alpha_triazine.png)
569 | 
570 | ???
571 | 
572 | Looking at the training and test set performance, you can see
573 | that if you increase the regularization, this model gets
574 | really bad. The ridge regression didn't go this badly. If
575 | you set the regularization to one, all coefficients become
576 | zero. Other than that there's reasonable performance, which
577 | is about as good as the ridge performance.
578 | ---
579 | .center[
580 | ![:scale 60%](images/lasso_coefficients.png)
581 | ]
582 | 
583 | ```python
584 | print(X_train.shape)
585 | np.sum(grid.best_estimator_.coef_ != 0)  # non-zero coefs of the best Lasso
586 | ```
587 | ```
588 | (139, 60)
589 | 13
590 | ```
591 | 
592 | ???
593 | 
594 | These are the coefficients of the model. Out of the 60
595 | features, it only selected 13 that are non-zero, the other
596 | ones are exactly zero. You can see this visualized here. The
597 | white ones are exactly zero and the other ones are non-zero.
598 | If I wanted I could prune the feature space a lot and that
599 | makes the model possibly more interpretable. There's a
600 | slight caveat here: if two of the features are very
601 | correlated, Lasso will pick one of them at random and make
602 | the other one zero. Just because something's zero doesn't
603 | mean it's not important. It means you can drop it out of
604 | this model. If you have two features that are identical, one
605 | of them will be zero and one of them will be not zero and
606 | it's going to be randomly selected. That makes
607 | interpretation a little bit harder.
608 | 
609 | ---
610 | # Elastic Net
611 | 
612 | - Combines benefits of Ridge and Lasso
613 | 
614 | - two parameters to tune.
615 | 
616 | 
617 | `$$\min_{w \in \mathbb{R}^p, b\in\mathbb{R}} \sum_{i=1}^n ||w^T\mathbf{x}_i + b - y_i||^2 + \alpha_1 ||w||_1 +  \alpha_2 ||w||^2_2 $$`
618 | 
619 | ???
620 | You can also combine them. This is actually what works best in
621 | practice. This is what's called the Elastic Net. Elastic Net
622 | tries to combine both of these penalizations together. You
623 | now have both terms, you have the L1 norm and the L2 norm
624 | and you have different values of alpha. Basically, this
625 | generalizes both. Depending on how you choose the two alphas,
626 | it can become ridge, it can become Lasso, or it can become
627 | anything in between. Generally, ridge helps generalization.
628 | So it's a good idea to have the ridge penalty in there, but
629 | also maybe if there are some features that are really not
630 | useful, the L1 penalty helps make them exactly zero.
631 | 
632 | ---
633 | # Parametrization in scikit-learn
634 | `$$\min_{w \in \mathbb{R}^p, b\in\mathbb{R}} \sum_{i=1}^n (w^T\mathbf{x}_i + b - y_i)^2 + \alpha \eta ||w||_1 +  \alpha (1 - \eta) ||w||^2_2 $$`
635 | 
636 | Where $\eta$ is the relative amount of l1 penalty (`l1_ratio` in the code).
637 | ???
638 | The way this is parameterized in scikit-learn is slightly
639 | different. In scikit-learn, you have a parameter alpha,
640 | which is the amount of regularization and then there's a
641 | parameter called l1_ratio, that says how much of the penalty
642 | should be L1 and L2. If you make this one, you have Lasso,
643 | if you make it zero, you have a Ridge. Don't use Lasso or
644 | Ridge and set alpha zero, because the solver will not handle
645 | it well. If you actually want alpha equal to zero, use
646 | linear regression. Now we have more parameters to tune, but
647 | we just have a more general model. This actually works
648 | pretty well often.
649 | 
650 | ---
651 | # Grid-searching ElasticNet
652 | 
653 | ```python
654 | from sklearn.linear_model import ElasticNet
655 | param_grid = {'alpha': np.logspace(-4, -1, 10),
656 |               'l1_ratio': [0.01, .1, .5, .8, .9, .95, .98, 1]}
657 | 
658 | grid = GridSearchCV(ElasticNet(), param_grid, cv=10)
659 | grid.fit(X_train, y_train)
660 | 
661 | print(grid.best_params_)
662 | print(grid.best_score_)
663 | ```
664 | ```
665 | {'alpha': 0.001, 'l1_ratio': 0.9}
666 | 0.100
667 | ```
668 | ```python
669 | (grid.best_estimator_.coef_!= 0).sum()
670 | ```
671 | ```
672 | 10
673 | ```
674 | ???
675 | Here is me doing a grid search. If you have two parameters
676 | for the grid search it will do all possible combinations.
677 | Here I do a logarithmic space for alpha and for the
678 | l1_ratio, I use something that’s very close to zero and
679 | something that's very close to one and some stuff in
680 | between. Analyzing the output of a 2D grid search is a
681 | little bit harder, because we can’t do the nice curve
682 | anymore.
683 | 
684 | ---
685 | class: smaller
686 | # Analyzing grid-search results
687 | ```python
688 | import pandas as pd
689 | res = pd.pivot_table(pd.DataFrame(grid.cv_results_),
690 |     values='mean_test_score', index='param_alpha', columns='param_l1_ratio')
691 | ```
692 | .center[
693 | ![:scale 60%](images/elasticnet_search.png)
694 | ]
695 | ???
696 | The way that I like to do it is, here's grid.cv_results_.
697 | And I put it in a data frame and then I'll make a pivot
698 | table where the values are test score, the index is one
699 | parameter and the columns are the other parameter. It allows
700 | me to visualize the grid search nicely. This is alpha and
701 | this is l1_ratio and you can see that if the l1_ratio is
702 | pretty high, there are some pretty good results, if you set
703 | the alpha accordingly. So here's like the diagonal of pretty
704 | good things. This is the model that did best. There's a
705 | slight caveat here that right now I did this with
706 | cross-validation and so this is the cross-validation
707 | accuracy. Last time I said, this is not really a good
708 | measure of generalization performance. So here, I searched
709 | way more parameters, I tried like 5 or 10 times as many
710 | models. So it's likely that by chance, I'll get better
711 | results. I didn't do this here in particular, because the
712 | data set is small and very noisy but in practice, if you
713 | want to compare models, you should evaluate it on a test set
714 | and see which of the models actually are better on the test.
715 | One more thing, why this is helpful is if the best value is
716 | on the edge of this graph that means my ranges were too
717 | small. Question is why we're using r square instead of the
718 | squared loss, one of the answers is that's the default in
719 | scikit-learn and the other answer is it's nice to know the
720 | range so you know that perfect prediction is one and you
721 | have some idea of what 0.5 means, the RMSE (the other norm
722 | that you usually use is the RMSE) depends on the scale of
723 | the output. So for example for the housing prices, it might
724 | be interesting to see what is the standard error in terms of
725 | dollars. If you want, like something that is in the units of
726 | the output, RMSE is good or mean absolute error might even
727 | be better. If you want something that is independent of the
728 | units of the output r square is pretty good because you know
729 | it's going to be between zero and one and it measures
730 | something like the correlation and so if it's like 0.9, you
731 | know it’s a pretty good model. If my RMSE is 10,000 I don't
732 | know if I have a good model or a bad model; it depends on what the
733 | range of the outputs is. The last thing I want to talk about
734 | today is this was basically changing the regularization
735 | parts. The types of regularization we looked at are
736 | Ridge which is an L2 penalty, Lasso which is an L1 penalty and
737 | combining the two of them which is Elastic Net. So now I want to
738 | talk about changing the first part, which was the squared
739 | loss of the predictions, basically.
740 | 
741 | ---
742 | class: center, middle
743 | 
744 | # Notebook: Linear Models for Regression
745 | 
746 |     </textarea>
747 |     <script src="https://remarkjs.com/downloads/remark-latest.min.js"></script>
748 |     <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
749 | 
750 |     <script>
751 |     // Configure remark: register the `scale` image macro, then create the slideshow.
752 |     remark.macros['scale'] = function (percentage) {  // used in slides as ![:scale 50%](path)
753 |         var url = this;  // the macro's `this` value is used as the image URL
754 |         return '<img src="' + url + '" style="width: ' + percentage + '" />';
755 |     };
756 |     config_remark = {  // options handed to remark.create() below
757 |         highlightStyle: 'github',
758 |         highlightSpans: true,
759 |         highlightLines: true,
760 |         ratio: "16:9"
761 |     };
762 |       var slideshow = remark.create(config_remark);
763 | 
764 |     // Configure MathJax
765 |     MathJax.Hub.Config({
766 |     tex2jax: {
767 |         inlineMath: [['$','$'], ['\\(','\\)']],
768 |         processEscapes: true,
769 |         skipTags: ['script', 'noscript', 'style'] /* removed 'code' entry*/
770 |     }
771 |     });
772 |     </script>
773 |   </body>
774 | </html>
775 | 


--------------------------------------------------------------------------------
/slides/02-cross-validation-grid-search.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 |   <head>
  4 |     <title>Cross Validation and Grid Search</title>
  5 |     <meta charset="utf-8">
  6 |     <link rel="stylesheet" href="style.css">
  7 |     <style>
  8 |         @import url(https://fonts.googleapis.com/css?family=Garamond);
  9 |         @import url(https://fonts.googleapis.com/css?family=Muli:400,700,400italic);
 10 |         @import url(https://fonts.googleapis.com/css?family=Ubuntu+Mono:400,700,400italic);
 11 |       </style>
 12 |     </head>
 13 |     <body>
 14 |       <textarea id="source">
 15 | class: center, middle
 16 | 
 17 | ![:scale 40%](images/sklearn_logo.png)
 18 | 
 19 | ### Intermediate Machine learning with scikit-learn
 20 | 
 21 | # Cross Validation and Grid Search
 22 | 
 23 | Andreas C. Müller
 24 | 
 25 | Columbia University, scikit-learn
 26 | 
 27 | .smaller[https://github.com/amueller/ml-workshop-2-of-4]
 28 | 
 29 | ---
 30 | class: center
 31 | 
 32 | # Influence of Number of Neighbors
 33 | 
 34 | ![:scale 50%](images/knn_boundary_k1.png)
 35 | 
 36 | ???
 37 | So this was the predictions as made by one-nearest neighbor.
 38 | But we can also consider more neighbors, for example three. Here is the
 39 | three nearest neighbors for each of the points and the corresponding
 40 | labels.
 41 | We can then make a prediction by considering the majority among these
 42 | three neighbors.
 43 | And as you can see, in this case all the points changed their labels! (I
 44 | was actually quite surprised when I saw that, I just picked some points
 45 | at random).
 46 | Clearly the number of neighbors that we consider matters a lot. But what
 47 | is the right number?
 48 | This is a problem you’ll encounter a lot in machine learning, the
 49 | problem of tuning parameters of the model, also called hyper-parameters,
 50 | which can not be learned directly from the data.
 51 | ---
 52 | class: center
 53 | 
 54 | # Influence of Number of Neighbors
 55 | 
 56 | ![:scale 50%](images/knn_boundary_k3.png)
 57 | 
 58 | ???
 59 | So this was the predictions as made by one-nearest neighbor.
 60 | But we can also consider more neighbors, for example three. Here is the
 61 | three nearest neighbors for each of the points and the corresponding
 62 | labels.
 63 | We can then make a prediction by considering the majority among these
 64 | three neighbors.
 65 | And as you can see, in this case all the points changed their labels! (I
 66 | was actually quite surprised when I saw that, I just picked some points
 67 | at random).
 68 | Clearly the number of neighbors that we consider matters a lot. But what
 69 | is the right number?
 70 | This is a problem you’ll encounter a lot in machine learning, the
 71 | problem of tuning parameters of the model, also called hyper-parameters,
 72 | which can not be learned directly from the data.
 73 | ---
 74 | class: center, some-space
 75 | 
 76 | # Influence of n_neighbors
 77 | 
 78 | ![:scale 45%](images/knn_boundary_varying_k.png)
 79 | 
 80 | ???
 81 | Here’s an overview of how the classification changes if we consider
 82 | different numbers of neighbors.
 83 | You can see as red and blue circles the training data. And the background
 84 | is colored according to which class a datapoint would be assigned to
 85 | for each location.
 86 | For one neighbor, you can see that each point in the training set has
 87 | a little area around it that would be classified according to its
 88 | label. This means all the training points would be classified correctly,
 89 | but it leads to a very complex shape of the decision boundary.
 90 | If we increase the number of neighbors, the boundary between red and
 91 | blue simplifies, and with 40 neighbors we mostly end up with a line.
 92 | This also means that now many of the training data points would be
 93 | labeled incorrectly.
 94 | ---
 95 | class: center, spacious
 96 | 
 97 | # Model complexity
 98 | 
 99 | ![:scale 75%](images/knn_model_complexity.png)
100 | 
101 | ???
102 | We can look at this in more detail by comparing training and test set
103 | scores for the different numbers of neighbors.
104 | Here, I did a random 75%/25% split again. This is a very noisy plot as
105 | the dataset is very small and I only did a random split, but you can
106 | see a trend here.
107 | You can see that for a single neighbor, the training score is 1 so perfect
108 | accuracy, but the test score is only 70%.  If we increase the number of
109 | neighbors we consider, the training score goes down, but the test score
110 | goes up, with an optimum at 19 and 21, but then both go down again.
111 | 
112 | This is a very typical behavior, that I sketched in a schematic for you.
113 | ---
114 | class: center, spacious
115 | # Overfitting and Underfitting
116 | 
117 | ![:scale 80%](images/overfitting_underfitting_cartoon_train.png)
118 | 
119 | ???
120 | here is a cartoon version of how this chart looks in general, though
121 | it's horizontally flipped relative to the one we saw for knn.
122 | This chart has accuracy on the y axis, and the abstract concept of model
123 | complexity on the x axis.
124 | If we make our machine learning models more complex, we will get better
125 | training set accuracy, as the model will be able to capture more of the
126 | variations in the data.
127 | ---
128 | class: center, spacious
129 | # Overfitting and Underfitting
130 | 
131 | ![:scale 80%](images/overfitting_underfitting_cartoon_generalization.png)
132 | 
133 | ???
134 | But if we look at the generalization performance, we get a different
135 | story. If the model complexity is too low, the model will not be able
136 | to capture the main trends, and a more complex model means better
137 | generalization.
138 | However, if we make the model too complex, generalization performance
139 | drops again, because we basically learn to memorize the dataset.
140 | 
141 | ---
142 | class: center, spacious
143 | # Overfitting and Underfitting
144 | 
145 | ![:scale 80%](images/overfitting_underfitting_cartoon_full.png)
146 | 
147 | ???
148 | If we use too simple a model, this is often called underfitting, while
149 | if we use too complex a model, this is called overfitting. And somewhere
150 | in the middle is a sweet spot.
151 | Most models have some way to tune model complexity, and we’ll see many
152 | of them in the next couple of weeks.
153 | So going back to nearest neighbors, what parameters correspond to high
154 | model complexity and what to low model complexity? high n_neighbors =
155 | low complexity!
156 | 
157 | ---
158 | 
159 | # So far: Train-test-split
160 | 
161 | ![:scale 100%](images/train_test_split_new.png)
162 | 
163 | ???
164 | So far we’ve worked with a split of the data into a training and a test
165 | set, built the model on the training set, and evaluated on the test set.
166 | So now, let's say we want to adjust the parameter n_neighbors in the k
167 | neighbors algorithm, how could we do this?
168 | [split, try out different values of k, choose the best on test set]
169 | What’s the problem with that?
170 | - good for choosing k, overly optimistic for getting accuracy!
171 | 
172 | ---
173 | 
174 | # Threefold split
175 | ![:scale 100%](images/train_test_validation_split.png)
176 | ???
177 | The simplest way to combat this overfitting to the test set is by using
178 | a three-fold split of the data, into a training, a validation and a
179 | test set as we just did.
180 | We use the training set for model building, the validation set for
181 | parameter selection and the test set for a final evaluation of the model.
182 | So how many models should you try out on the test set? Only one! Ideally
183 | you use the test-set exactly once, otherwise you make a multiple
184 | hypothesis testing error!
185 | 
186 | What are downsides of this? We lose a lot of data for evaluation, and
187 | the results depend on the particular sampling.
188 | ---
189 | class: center
190 | # Overfitting the validation set
191 | 
192 | ![:scale 80%](images/overfitting_validation_set_1.png)
193 | 
194 | ---
195 | class: center
196 | # Overfitting the validation set
197 | 
198 | ![:scale 80%](images/overfitting_validation_set_2.png)
199 | ---
200 | class: center
201 | # Overfitting the validation set
202 | 
203 | ![:scale 80%](images/overfitting_validation_set_3.png)
204 | ---
205 | class: center
206 | # Overfitting the validation set
207 | 
208 | ![:scale 80%](images/overfitting_validation_set_4.png)
209 | ---
210 | # Threefold Split for Hyper-Parameters
211 | 
212 | .smaller[
213 | ```python
214 | X_trainval, X_test, y_trainval, y_test = train_test_split(X, y)
215 | X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval)
216 | 
217 | val_scores = []
218 | neighbors = np.arange(1, 15, 2)
219 | for i in neighbors:
220 |     knn = KNeighborsClassifier(n_neighbors=i)
221 |     knn.fit(X_train, y_train)
222 |     val_scores.append(knn.score(X_val, y_val))
223 | print(f"best validation score: {np.max(val_scores):.3}")
224 | best_n_neighbors = neighbors[np.argmax(val_scores)]
225 | print("best n_neighbors:", best_n_neighbors)
226 | 
227 | knn = KNeighborsClassifier(n_neighbors=best_n_neighbors)
228 | knn.fit(X_trainval, y_trainval)
229 | print(f"test-set score: {knn.score(X_test, y_test):.3f}")
230 | ```
231 | 
232 | ```
233 | best validation score: 0.991
234 | best n_neighbors: 11
235 | test-set score: 0.951
236 | ```
237 | ]
238 | ???
239 | FIXME complex code
240 | Here is an implementation of the three-fold split for selecting the
241 | number of neighbors.
242 | For each number of neighbors that we want to try, we build a model on
243 | the training set, and evaluate it on the validation set.
244 | We then pick the best validation set score, here that’s 99.1%, achieved
245 | when using 11 neighbors.
246 | We then retrain the model with this parameter, and evaluate on the test set.
247 | The retraining step is somewhat optional. We could also just use the best
248 | model. But retraining allows us to make better use of all the data.
249 | 
250 | Still, depending on the test-set size we might be using only 70% or 80%
251 | of the data, and our results depend on how exactly we split the datasets.
252 | So how can we make this more robust?
253 | ---
254 | # Cross-validation
255 | .center[
256 | ![:scale 80%](images/cross_validation_new.png)
257 | ]
258 | ???
259 | The answer is of course cross-validation. In cross-validation, you split
260 | your data into multiple folds, usually 5 or 10, and build multiple models.
261 | You start by using fold1 as the test data, and the remaining ones as the
262 | training data. You build your model on the training data, and evaluate
263 | it on the test fold.
264 | For each of the splits of the data, you get a model evaluation and a
265 | score. In the end, you can aggregate the scores, for example by taking
266 | the mean.
267 | What are the pros and cons of this?
268 | Each data point is in the test-set exactly once!
269 | Takes 5 or 10 times longer!
270 | Better data use (larger training sets).
271 | Does that solve all problems? No, it replaces only one of the splits,
272 | usually the inner one!
273 | --
274 | .smaller[
275 | 
276 | pro: more stable, more data
277 | 
278 | con: slower
279 | 
280 | ]
281 | ???
282 | 
283 | ---
284 | 
285 | class: center, some-space
286 | # Cross-validation + test set
287 | 
288 | ![:scale 105%](images/grid_search_cross_validation_new.png)
289 | 
290 | ???
291 | Here is what the workflow looks like when we are using five-fold
292 | cross-validation together with a test-set split for adjusting parameters.
293 | We start out by splitting off the test data, and then we perform
294 | cross-validation on the training set.
295 | Once we found the right setting of the parameters, we retrain on the
296 | whole training set and evaluate on the test set.
297 | ---
298 | # Grid-Search with Cross-Validation
299 | 
300 | .smaller[
301 | ```python
302 | from sklearn.model_selection import cross_val_score
303 | 
304 | X_train, X_test, y_train, y_test = train_test_split(X, y)
305 | cross_val_scores = []
306 | 
307 | for i in neighbors:
308 |     knn = KNeighborsClassifier(n_neighbors=i)
309 |     scores = cross_val_score(knn, X_train, y_train, cv=10)
310 |     cross_val_scores.append(np.mean(scores))
311 | 
312 | print(f"best cross-validation score: {np.max(cross_val_scores):.3}")
313 | best_n_neighbors = neighbors[np.argmax(cross_val_scores)]
314 | print(f"best n_neighbors: {best_n_neighbors}")
315 | 
316 | knn = KNeighborsClassifier(n_neighbors=best_n_neighbors)
317 | knn.fit(X_train, y_train)
318 | print(f"test-set score: {knn.score(X_test, y_test):.3f}")
319 | ```
320 | 
321 | ```
322 | best cross-validation score: 0.967
323 | best n_neighbors: 9
324 | test-set score: 0.965
325 | ```
326 | ]
327 | 
328 | ???
329 | Here is an implementation of this  for k nearest neighbors.
330 | 
331 | We split the data, then we iterate over all parameters and for each of
332 | them we do cross-validation.
333 | 
334 | We had seven different values of n_neighbors, and we are running 10 fold
335 | cross-validation. How many models do we train in total?
336 | 10 * 7 + 1 = 71 (the one is the final model)
337 | ---
338 | class: center, middle
339 | ![:scale 80%](images/gridsearch_workflow.png)
340 | 
341 | ???
342 | Here is a conceptual overview of this way of tuning parameters, we start
343 | off with the dataset and a candidate set of parameters we want to try,
344 | labeled parameter grid, for example the number of neighbors.
345 | 
346 | We split the dataset into training and test set. We use cross-validation
347 | and the parameter grid to find the best parameters.
348 | We use the best parameters and the training set to build a model with
349 | the best parameters,
350 | and finally evaluate it on the test set.
351 | Because this is such a common pattern, there is a helper class for this
352 | in scikit-learn, called GridSearchCV, which does most of these steps
353 | for you.
354 | ---
355 | # GridSearchCV
356 | .smaller[
357 | ```python
358 | from sklearn.model_selection import GridSearchCV
359 | 
360 | X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
361 | 
362 | 
363 | param_grid = {'n_neighbors':  np.arange(1, 30, 2)}
364 | grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=10,
365 |                    return_train_score=True)
366 | grid.fit(X_train, y_train)
367 | print(f"best mean cross-validation score: {grid.best_score_}")
368 | print(f"best parameters: {grid.best_params_}")
369 | print(f"test-set score: {grid.score(X_test, y_test):.3f}")
370 | ```
371 | 
372 | ```
373 | best mean cross-validation score: 0.967
374 | best parameters: {'n_neighbors': 9}
375 | test-set score: 0.993
376 | ```
377 | ]
378 | ???
379 | Here is an example.
380 | We still need to split our data into training and test set.
381 | We declare the parameters we want to search over as a dictionary. In
382 | this example the parameter is just n_neighbors and the values we want
383 | to try out are a range. The keys of the dictionary are the parameter
384 | names and the values are the parameter settings we want to try. If you
385 | specify multiple parameters, all possible combinations are tried. This
386 | is where the name grid-search comes from - it’s an exhaustive search
387 | over all possible parameter combinations that you specify.
388 | 
389 | GridSearchCV is a class, and it behaves just like any other model in
390 | scikit-learn, with a fit, predict and score method.
391 | It’s what we call a meta-estimator, since you give it one estimator,
392 | here the KNeighborsClassifier, and from that GridSearchCV constructs a
393 | new estimator that does the parameter search for you.
394 | You also specify the parameters you want to search, and the
395 | cross-validation strategy.
396 | Then GridSearchCV does all the other things we talked about, it does the
397 | cross-validation and parameter selection, and retrains a model with the
398 | best parameter settings that were found.
399 | We can check out the best cross-validation score and the best parameter
400 | setting with the best_score_ and best_params_ attributes.
401 | And finally we can compute the accuracy on the test set, simply by
402 | using the score method! That will use the retrained model under the hood.
403 | 
404 | ---
405 | class: compact
406 | 
407 | # GridSearchCV Results
408 | .tiny[
409 | ```python
410 | import pandas as pd
411 | results = pd.DataFrame(grid.cv_results_)  # one row per parameter setting tried
412 | results.columns
413 | ```
414 | ```
415 | Index(['mean_fit_time', 'mean_score_time', 'mean_test_score',
416 |        'mean_train_score', 'param_n_neighbors', 'params', 'rank_test_score',
417 |        'split0_test_score', 'split0_train_score', 'split1_test_score',
418 |        'split1_train_score', 'split2_test_score', 'split2_train_score',
419 |        'split3_test_score', 'split3_train_score', 'split4_test_score',
420 |        'split4_train_score', 'split5_test_score', 'split5_train_score',
421 |        'split6_test_score', 'split6_train_score', 'split7_test_score',
422 |        'split7_train_score', 'split8_test_score', 'split8_train_score',
423 |        'split9_test_score', 'split9_train_score', 'std_fit_time',
424 |        'std_score_time', 'std_test_score', 'std_train_score'],
425 |       dtype='object')
426 | ```
427 | 
428 | ```python
429 | results.params  # the parameter dict of each candidate
430 | ```
431 | ```
432 | 0     {'n_neighbors': 1}
433 | 1     {'n_neighbors': 3}
434 | 2     {'n_neighbors': 5}
435 | 3     {'n_neighbors': 7}
436 | 4     {'n_neighbors': 9}
437 | 5    {'n_neighbors': 11}
438 | 6    {'n_neighbors': 13}
439 | Name: params, dtype: object
440 | ```
441 | ]
442 | 
443 | ???
444 | FIXME text size
445 | GridSearchCV also computes a lot of interesting statistics for you, which
446 | are stored in the cv_results_ attribute. That attribute is a dictionary,
447 | but it’s easiest to convert it to a pandas dataframe to look at it.
448 | Here you can see the columns. There’s mean fit time, mean score time,
449 | mean test scores, mean training scores, standard deviations and scores
450 | for each individual split of the data.
451 | And there is one row for each setting of the parameters we tried out.
452 | ---
453 | class: center
454 | # n_neighbors Search Results
455 | 
456 | ![:scale 70%](images/grid_search_n_neighbors.png)
457 | ???
458 | We can use this for example to plot the results of cross-validation over
459 | the different parameters.
460 | Here are the mean training score and mean test score together with one
461 | standard deviation.
462 | ---
463 | class: spacious
464 | # Nested Cross-Validation
465 | 
466 | - Replace outer split by CV loop
467 | - Doesn’t yield single model
468 | (inner loop might have different best parameter settings)
469 | - Takes a long time, not that useful in practice
470 | 
471 | ???
472 | We could additionally replace the outer split of the data by
473 | cross-validation. That would yield what’s known as nested
474 | cross-validation.
475 | This is sometimes interesting when comparing different models, but it will
476 | not actually yield one final model. It will yield one model for each loop
477 | of the outer fold, which might have different settings of the parameters.
478 | Also, this takes a really long time to train, by an additional factor
479 | of 5 or 10, so this is not used very commonly in practice.
480 | 
481 | But let’s dive into the cross-validation a bit more.
482 | ---
483 | class: center, middle
484 | # Cross-Validation Strategies
485 | 
486 | ???
487 | So I mentioned k-fold cross validation, where k is usually 5 or 10,
488 | but there are many other strategies.
489 | 
490 | One of the most commonly used ones is stratified k-fold cross-validation.
491 | ---
492 | .center[
493 | ![:scale 90%](images/kfold_cv.png)
494 | ]
495 | ---
496 | class: compact
497 | .center[
498 | ![:scale 90%](images/stratified_cv.png)
499 | ]
500 | .smallest[
501 | Stratified:
502 | Ensure relative class frequencies in each fold reflect relative class
503 | frequencies on the whole dataset.]
504 | 
505 | ???
506 | The idea behind stratified k-fold cross-validation is that you want the
507 | test set to be as representative of the dataset as possible.
508 | StratifiedKFold preserves the class frequencies in each fold to be the
509 | same as of the overall dataset.
510 | Here is an example of a dataset with three classes that are ordered. If
511 | you apply standard three-fold to this, the first third of the data would
512 | be in the first fold, the second in the second fold and the third in
513 | the third fold. Because this data is sorted, that would be particularly
514 | bad. If you use stratified cross-validation it would make sure that each
515 | fold has exactly 1/3 of the data from each class.
516 | 
517 | This is also helpful if your data is very imbalanced. If some of the
518 | classes are very rare, it could otherwise happen that a class is not
519 | present at all in a particular fold.
520 | ---
521 | # Importance of Stratification
522 | .smaller[
523 | ```python
524 | y.value_counts()  # number of samples per class
525 | ```
526 | ```
527 | 0    60
528 | 1    40
529 | ```
530 | ```python
531 | from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
532 | from sklearn.dummy import DummyClassifier
533 | 
534 | dc = DummyClassifier('most_frequent')
535 | skf = StratifiedKFold(n_splits=5, shuffle=True)
536 | res = cross_val_score(dc, X, y, cv=skf)
537 | np.mean(res), res.std()
538 | ```
539 | ```
540 | (0.6, 0.0)
541 | ```
542 | ```python
543 | kf = KFold(n_splits=5, shuffle=True)  # plain (non-stratified) 5-fold, shuffled
544 | res = cross_val_score(dc, X, y, cv=kf)
545 | np.mean(res), res.std()
546 | ```
547 | ```
548 | (0.6, 0.063)
549 | ```
550 | ]
551 | ---
552 | class: spacious
553 | # Repeated KFold and LeaveOneOut
554 | 
555 | - LeaveOneOut : KFold(n_splits=n_samples) <br />
556 | High variance, takes a long time <br />
557 | .tiny[(see [Raschka](https://arxiv.org/pdf/1811.12808.pdf) for a review and [Varoquaux](https://hal.inria.fr/hal-01545002/file/paper.pdf) for empirical evaluation)]
558 | 
559 | - Better: ShuffleSplit (aka Monte Carlo) <br />
560 | Repeatedly sample a test set with replacement
561 | 
562 | - Even Better: RepeatedKFold. <br />
563 | Apply KFold or StratifiedKFold multiple times with shuffled data.
564 | 
565 | ???
566 | If you want even better estimates of the generalization performance,
567 | you could try to increase the number of folds, with the extreme
568 | of creating one fold per sample. That’s called “LeaveOneOut
569 | cross-validation”. However, because the test-set is so small every time,
570 | and the training sets all have very large overlap, this method has very
571 | high variance.
572 | A better way to get a robust estimate is to run 5-fold or 10-fold
573 | cross-validation multiple times, while shuffling the dataset.
574 | ---
575 | class: compact
576 | .center[
577 | ![:scale 100%](images/shuffle_split_cv.png)
578 | ]
579 | .smaller[Number of iterations and test set size independent]
580 | 
581 | ???
582 | Another interesting variant is shuffle split and stratified shuffle
583 | split. In shuffle split, we repeatedly sample disjoint training and test
584 | sets randomly.
585 | You only have to specify the number of iterations, the training set size
586 | and the test set size. This also allows you to run many iterations with
587 | reasonably large test-sets.
588 | It’s also great if you have a very large training set and you want to
589 | subsample it to get quicker results.
590 | ---
591 | class: compact
592 | .center[
593 | ![:scale 100%](images/repeated_stratified_kfold.png)
594 | ]
595 | .smaller[
596 | Potentially less variance than StratifiedShuffleSplit.<br />
597 | Five times five fold or at most ten times ten fold is sufficient.
598 | ]
599 | ???
600 | ---
601 | class: spacious
602 | # Defaults in scikit-learn
603 | 
604 | - 5-fold in 0.22 (used to be 3 fold)
605 | - For classification cross-validation is stratified
606 | - train_test_split has stratify option:
607 | train_test_split(X, y, stratify=y)
608 | 
609 | - No shuffle by default!
610 | 
611 | ???
612 | By default, all cross-validation strategies are five fold.
613 | If you do cross-validation for classification, it will be stratified
614 | by default.
615 | Because of how the interface is done, that’s not true for
616 | train_test_split and if you want a stratified train_test_split, which
617 | is always a good idea, you should use stratify=y
618 | Another thing that’s important to keep in mind is that by default
619 | scikit-learn doesn’t shuffle! So if you run cross-validation twice
620 | with the default parameters, it will yield exactly the same results.
621 | ---
622 | class: center, middle
623 | # Cross-Validation with non-iid data
624 | 
625 | ???
626 | ---
627 | # Grouped Data
628 | ### Assume we have data (medical, product, user...) from 5 cities
629 | - New York, San Francisco, Los Angeles, Chicago, Houston.
630 | 
631 | We can assume data within a city is more correlated than between cities.
632 | 
633 | ### Usage Scenarios
634 | - Assume all future users will be in one of these cities: i.i.d.
635 | - Assume we want to generalize to predict for a new city: not i.i.d.
636 | 
637 | ???
638 | Shipped product in 4 cities. Might ship in another one?
639 | States: you have all the states, no new state will start to exist
640 | 
641 | Similar thing for multiple measurements per patient.
642 | Or geospatial data.
643 | ---
644 | ![:scale 100%](images/group_kfold.png)
645 | 
646 | ???
647 | A somewhat more complicated approach is group k-fold.
648 | This is actually for data that doesn’t fulfill our IID assumption and
649 | has correlations between samples.
650 | The idea is that there are several groups in the data that each contain
651 | highly correlated samples.
652 | You could think about patient data where you have multiple samples for
653 | each patient, then the groups would be which patient a measurement was
654 | taken from.
655 | If you want to know how well your model generalizes to new patients,
656 | you need to ensure that the measurements from each patient are either
657 | all in the training set, or all in the test set.
658 | And that’s what GroupKFold does.
659 | In this example, there are four groups, and we want three folds. The
660 | data is divided such that each group is contained in exactly one fold.
661 | There are several other cross-validation methods in scikit-learn that
662 | use these groups.
663 | ---
664 | class: center
665 | # Correlations in time (and/or space)
666 | 
667 | ![:scale 70%](images/time_series1.png)
668 | 
669 | ???
670 | Not necessarily obvious that there is a time component!
671 | Data collection usually happens over time!
672 | 
673 | ---
674 | class: center
675 | # Correlations in time (and/or space)
676 | 
677 | ![:scale 70%](images/time_series2.png)
678 | 
679 | ???
680 | Not necessarily obvious that there is a time component!
681 | Data collection usually happens over time!
682 | 
683 | ---
684 | class: center
685 | # Correlations in time (and/or space)
686 | 
687 | ![:scale 70%](images/time_series3.png)
688 | 
689 | ???
690 | Not necessarily obvious that there is a time component!
691 | Data collection usually happens over time!
692 | 
693 | ---
694 | ![:scale 100%](images/time_series_walk_forward_cv.png)
695 | ???
696 | Another common case of data that’s not independent is time
697 | series. Usually today’s stock price is correlated with yesterday’s and
698 | tomorrow’s. If you randomly split time series, this makes predictions
699 | deceivingly simple. In applications, you usually have data up to some
700 | point, and then try to make predictions for the future, in other words,
701 | you’re trying to make a forecast.
702 | The TimeSeriesSplit in scikit-learn simulates that, by taking increasing
703 | chunks of data from the past and making predictions on the next
704 | chunk. This is quite different from the other ways to do cross-validation,
705 | in that the training sets are all overlapping, but it’s more appropriate
706 | for time-series.
707 | ---
708 | ![:scale 100%](images/time_series_cv.png)
709 | 
710 | ???
711 | Another common case of data that’s not independent is time
712 | series. Usually today’s stock price is correlated with yesterday’s and
713 | tomorrow’s. If you randomly split time series, this makes predictions
714 | deceivingly simple. In applications, you usually have data up to some
715 | point, and then try to make predictions for the future, in other words,
716 | you’re trying to make a forecast.
717 | The TimeSeriesSplit in scikit-learn simulates that, by taking increasing
718 | chunks of data from the past and making predictions on the next
719 | chunk. This is quite different from the other ways to do cross-validation,
720 | in that the training sets are all overlapping, but it’s more appropriate
721 | for time-series.
722 | ---
723 | # Using Cross-Validation Generators
724 | 
725 | .tiny[
726 | ```python
727 | from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit, RepeatedStratifiedKFold
728 | kfold = KFold(n_splits=5)  # 5 folds, default (unshuffled) order
729 | skfold = StratifiedKFold(n_splits=5, shuffle=True)  # 5 shuffled, stratified folds
730 | ss = ShuffleSplit(n_splits=20, train_size=.4, test_size=.3)  # 20 random 40% / 30% train/test draws
731 | rs = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)  # 5 folds x 10 repeats = 50 scores
732 | 
733 | print("KFold:")
734 | print(cross_val_score(KNeighborsClassifier(), X, y, cv=kfold))  # pass the splitter object as cv
735 | 
736 | print("StratifiedKFold:")
737 | print(cross_val_score(KNeighborsClassifier(), X, y, cv=skfold))
738 | 
739 | print("ShuffleSplit:")
740 | print(cross_val_score(KNeighborsClassifier(), X, y, cv=ss))
741 | 
742 | print("RepeatedStratifiedKFold:")
743 | print(cross_val_score(KNeighborsClassifier(), X, y, cv=rs))
744 | ```
745 | 
746 | ```
747 | KFold:
748 | [ 0.93  0.96  0.96  0.98  0.96]
749 | StratifiedKFold:
750 | [0.98 0.96 0.96 0.97 0.96]
751 | ShuffleSplit:
752 | [0.98 0.96 0.96 0.98 0.94 0.96 0.95 0.98 0.97 0.92 0.94 0.97 0.95 0.92
753 |  0.98 0.98 0.97 0.94 0.97 0.95]
754 | RepeatedStratifiedKFold:
755 | [0.99 0.96 0.97 0.97 0.95 0.98 0.97 0.98 0.97 0.96 0.97 0.99 0.94 0.96
756 |  0.96 0.98 0.97 0.96 0.96 0.97 0.97 0.96 0.96 0.96 0.98 0.96 0.97 0.97
757 |  0.97 0.96 0.96 0.95 0.96 0.99 0.98 0.93 0.96 0.98 0.98 0.96 0.96 0.95
758 |  0.97 0.97 0.96 0.97 0.97 0.97 0.96 0.96]
759 | ```
760 | ]
761 | ???
762 | Ok, so how do we use these cross-validation generators? We can simply
763 | pass the object to the cv parameter of the cross_val_score function,
764 | instead of passing a number. Then that generator will be used.
765 | Here are some examples for k-neighbors classifier.
766 | We instantiate a KFold object with the number of splits equal to 5,
767 | and then pass it to cross_val_score.
768 | We can do the same with StratifiedKFold, and we can also shuffle if we
769 | like, or we can use Shuffle split.
770 | 
771 | ---
772 | # cross_validate function
773 | .smaller[
774 | ```python
775 | from sklearn.model_selection import cross_validate
776 | res = cross_validate(KNeighborsClassifier(), X, y, return_train_score=True,
777 |                      scoring=["accuracy", "roc_auc"])  # several metrics in one CV run
778 | res_df = pd.DataFrame(res)  # timing plus test_*/train_* columns, one row per fold
779 | ```
780 | 
781 | ```
782 | fit_time	score_time	test_accuracy	test_roc_auc	train_accuracy	train_roc_auc
783 | 0.000839	0.010204    0.965217	    0.996609	    0.980176	    0.997654
784 | 0.000870	0.014424    0.956522	    0.983689	    0.975771	    0.998650
785 | 0.000603	0.009298    0.982301	    0.999329	    0.971491	    0.996977
786 | 0.000698	0.006670    0.955752	    0.984071	    0.978070	    0.997820
787 | 0.000611	0.006559    0.964602	    0.994634	    0.978070	    0.998026
788 | ```
789 | ]
790 | 
791 | ???
792 | FIXME alignment
793 | ---
794 | 
795 | class: center, middle
796 | 
797 | # Notebook: Cross-validation and grid search
798 | 
799 |     </textarea>
800 |     <script src="https://remarkjs.com/downloads/remark-latest.min.js"></script>
801 |     <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
802 | 
803 |     <script>
804 |     // Config Remark
805 |     remark.macros['scale'] = function (percentage) {
806 |         var url = this;
807 |         return '<img src="' + url + '" style="width: ' + percentage + '" />';
808 |     };
809 |     config_remark = {
810 |         highlightStyle: 'github',
811 |         highlightSpans: true,
812 |         highlightLines: true,
813 |         ratio: "16:9"
814 |     };
815 |       var slideshow = remark.create(config_remark);
816 | 
817 |     // Configure MathJax
818 |     MathJax.Hub.Config({
819 |     tex2jax: {
820 |         inlineMath: [['$','$'], ['\\(','\\)']],
821 |         skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] /* removed 'code' entry*/
822 |     }
823 |     });
824 |     MathJax.Hub.Queue(function() {
825 |         var all = MathJax.Hub.getAllJax(), i;
826 |         for(i = 0; i < all.length; i += 1) {
827 |             all[i].SourceElement().parentNode.className += ' has-jax';
828 |         }
829 |     });
830 |     </script>
831 |   </body>
832 | </html>
833 | 


--------------------------------------------------------------------------------