├── .nojekyll
├── images
├── check_env-1.png
├── check_env-2.png
└── download-repo.png
├── slides
├── images
│ ├── PDSH.png
│ ├── esl.png
│ ├── imlp.png
│ ├── l1_kink.png
│ ├── logit.png
│ ├── api-table.png
│ ├── kfold_cv.png
│ ├── l1l2ball.png
│ ├── l2_l1_l0.png
│ ├── ovr_lines.png
│ ├── svm_or_lr.png
│ ├── ames_scaling.png
│ ├── binary_loss.png
│ ├── gradient_2d.png
│ ├── gradient_3d.png
│ ├── group_kfold.png
│ ├── max_depth_1.png
│ ├── max_depth_4.png
│ ├── max_margin.png
│ ├── no_pruning.png
│ ├── ram_prices.png
│ ├── sklearn-docs.png
│ ├── sklearn_logo.png
│ ├── time_series1.png
│ ├── time_series2.png
│ ├── time_series3.png
│ ├── tree_pruned.png
│ ├── triazine_bar.png
│ ├── average_voting.png
│ ├── boston_scaling.png
│ ├── feature_sample.png
│ ├── grid_ccp_alpha.png
│ ├── grid_max_depth.png
│ ├── instability_1.png
│ ├── instability_2.png
│ ├── max_margin_C_1.png
│ ├── mpl_tree_plot.png
│ ├── ovr_boundaries.png
│ ├── pruning_alpha.png
│ ├── random_forest.png
│ ├── splits_kinect.png
│ ├── stratified_cv.png
│ ├── time_series_cv.png
│ ├── bias_vs_variance.png
│ ├── binning_quantiles.png
│ ├── bootstrap_sample.png
│ ├── elasticnet_search.png
│ ├── grad_boost_depth2.png
│ ├── grad_boost_term_1.png
│ ├── grad_boost_term_2.png
│ ├── grad_boost_term_3.png
│ ├── graphviz_jupyter.png
│ ├── graphviz_source.png
│ ├── knn_boundary_k1.png
│ ├── knn_boundary_k3.png
│ ├── l1l2_elasticnet.png
│ ├── max_leaf_nodes_8.png
│ ├── max_margin_C_0.1.png
│ ├── ram_prices_test.png
│ ├── ram_prices_train.png
│ ├── robust_regression.png
│ ├── shuffle_split_cv.png
│ ├── supervised-ml-api.png
│ ├── threefold_split.png
│ ├── train-test-split.png
│ ├── tree_illustration.png
│ ├── tree_importances.png
│ ├── tree_prediction.png
│ ├── voting_classifier.png
│ ├── forest_importances.png
│ ├── grid_max_leaf_nodes.png
│ ├── gridsearch_workflow.png
│ ├── l1l2ball_intersect.png
│ ├── lasso_alpha_search.png
│ ├── lasso_coefficients.png
│ ├── ridge_alpha_search.png
│ ├── ridge_coefficients.png
│ ├── xgboost_hist_bench.png
│ ├── ames_housing_scatter.png
│ ├── boston_housing_scatter.png
│ ├── cross_validation_new.png
│ ├── hist_gradient_boosting.png
│ ├── knn_boundary_dataset.png
│ ├── knn_boundary_varying_k.png
│ ├── knn_model_complexity.png
│ ├── lasso_alpha_triazine.png
│ ├── linear_boundary_vector.png
│ ├── linear_regression_1d.png
│ ├── logreg_regularization.png
│ ├── lr_coefficients_large.png
│ ├── matrix-representation.png
│ ├── min_samples_split_50.png
│ ├── ridge_alpha_triazine.png
│ ├── ridge_alpha_triazines.png
│ ├── ridge_learning_curve.png
│ ├── supervised-ml-workflow.png
│ ├── train_test_split_new.png
│ ├── xgboost_sklearn_bench.png
│ ├── gradient_learning_rates.png
│ ├── grid_search_n_neighbors.png
│ ├── knn_boundary_test_points.png
│ ├── ridge_alpha_search_poly.png
│ ├── ridge_coefficients_alpha.png
│ ├── ridge_coefficients_large.png
│ ├── grad_boost_regression_steps.png
│ ├── linear_svm_regularization.png
│ ├── repeated_stratified_kfold.png
│ ├── ridge_alpha_search_cv_runs.png
│ ├── time_series_walk_forward_cv.png
│ ├── train_test_validation_split.png
│ ├── tree_building_iteration_1.png
│ ├── tree_building_iteration_2.png
│ ├── tree_building_iteration_9.png
│ ├── grid_search_cross_validation.png
│ ├── overfitting_validation_set_1.png
│ ├── overfitting_validation_set_2.png
│ ├── overfitting_validation_set_3.png
│ ├── overfitting_validation_set_4.png
│ ├── grid_search_cross_validation_new.png
│ ├── train_test_set_2d_classification.png
│ ├── overfitting_underfitting_cartoon_full.png
│ ├── overfitting_underfitting_cartoon_train.png
│ └── overfitting_underfitting_cartoon_generalization.png
├── 01-reminder-supervised-learning.html
├── style.css
├── 04-linear-models-classification.html
├── 06-gradient-boosting.html
├── 03-linear-models-regression.html
└── 02-cross-validation-grid-search.html
├── .gitignore
├── notebooks
├── solutions
│ ├── grid_search_k_neighbors.py
│ ├── bike_regression.py
│ ├── adult_classification.py
│ └── linear_models_diabetes.py
├── 04 - Linear Models for Classification.ipynb
├── 06 - Gradient Boosting.ipynb
├── data
│ ├── bank-campaign-desc.text
│ └── ram_price.csv
├── 01 - Review of Supervised Learning.ipynb
├── 02 - Cross-validation and Grid Search.ipynb
├── 05 - Trees.ipynb
└── 03 - Linear Models for Regression.ipynb
├── LICENSE
├── check_env.ipynb
└── README.md
/.nojekyll:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/images/check_env-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/images/check_env-1.png
--------------------------------------------------------------------------------
/images/check_env-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/images/check_env-2.png
--------------------------------------------------------------------------------
/slides/images/PDSH.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/PDSH.png
--------------------------------------------------------------------------------
/slides/images/esl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/esl.png
--------------------------------------------------------------------------------
/slides/images/imlp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/imlp.png
--------------------------------------------------------------------------------
/images/download-repo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/images/download-repo.png
--------------------------------------------------------------------------------
/slides/images/l1_kink.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/l1_kink.png
--------------------------------------------------------------------------------
/slides/images/logit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/logit.png
--------------------------------------------------------------------------------
/slides/images/api-table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/api-table.png
--------------------------------------------------------------------------------
/slides/images/kfold_cv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/kfold_cv.png
--------------------------------------------------------------------------------
/slides/images/l1l2ball.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/l1l2ball.png
--------------------------------------------------------------------------------
/slides/images/l2_l1_l0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/l2_l1_l0.png
--------------------------------------------------------------------------------
/slides/images/ovr_lines.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ovr_lines.png
--------------------------------------------------------------------------------
/slides/images/svm_or_lr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/svm_or_lr.png
--------------------------------------------------------------------------------
/slides/images/ames_scaling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ames_scaling.png
--------------------------------------------------------------------------------
/slides/images/binary_loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/binary_loss.png
--------------------------------------------------------------------------------
/slides/images/gradient_2d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/gradient_2d.png
--------------------------------------------------------------------------------
/slides/images/gradient_3d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/gradient_3d.png
--------------------------------------------------------------------------------
/slides/images/group_kfold.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/group_kfold.png
--------------------------------------------------------------------------------
/slides/images/max_depth_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/max_depth_1.png
--------------------------------------------------------------------------------
/slides/images/max_depth_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/max_depth_4.png
--------------------------------------------------------------------------------
/slides/images/max_margin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/max_margin.png
--------------------------------------------------------------------------------
/slides/images/no_pruning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/no_pruning.png
--------------------------------------------------------------------------------
/slides/images/ram_prices.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ram_prices.png
--------------------------------------------------------------------------------
/slides/images/sklearn-docs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/sklearn-docs.png
--------------------------------------------------------------------------------
/slides/images/sklearn_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/sklearn_logo.png
--------------------------------------------------------------------------------
/slides/images/time_series1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/time_series1.png
--------------------------------------------------------------------------------
/slides/images/time_series2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/time_series2.png
--------------------------------------------------------------------------------
/slides/images/time_series3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/time_series3.png
--------------------------------------------------------------------------------
/slides/images/tree_pruned.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/tree_pruned.png
--------------------------------------------------------------------------------
/slides/images/triazine_bar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/triazine_bar.png
--------------------------------------------------------------------------------
/slides/images/average_voting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/average_voting.png
--------------------------------------------------------------------------------
/slides/images/boston_scaling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/boston_scaling.png
--------------------------------------------------------------------------------
/slides/images/feature_sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/feature_sample.png
--------------------------------------------------------------------------------
/slides/images/grid_ccp_alpha.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grid_ccp_alpha.png
--------------------------------------------------------------------------------
/slides/images/grid_max_depth.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grid_max_depth.png
--------------------------------------------------------------------------------
/slides/images/instability_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/instability_1.png
--------------------------------------------------------------------------------
/slides/images/instability_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/instability_2.png
--------------------------------------------------------------------------------
/slides/images/max_margin_C_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/max_margin_C_1.png
--------------------------------------------------------------------------------
/slides/images/mpl_tree_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/mpl_tree_plot.png
--------------------------------------------------------------------------------
/slides/images/ovr_boundaries.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ovr_boundaries.png
--------------------------------------------------------------------------------
/slides/images/pruning_alpha.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/pruning_alpha.png
--------------------------------------------------------------------------------
/slides/images/random_forest.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/random_forest.png
--------------------------------------------------------------------------------
/slides/images/splits_kinect.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/splits_kinect.png
--------------------------------------------------------------------------------
/slides/images/stratified_cv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/stratified_cv.png
--------------------------------------------------------------------------------
/slides/images/time_series_cv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/time_series_cv.png
--------------------------------------------------------------------------------
/slides/images/bias_vs_variance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/bias_vs_variance.png
--------------------------------------------------------------------------------
/slides/images/binning_quantiles.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/binning_quantiles.png
--------------------------------------------------------------------------------
/slides/images/bootstrap_sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/bootstrap_sample.png
--------------------------------------------------------------------------------
/slides/images/elasticnet_search.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/elasticnet_search.png
--------------------------------------------------------------------------------
/slides/images/grad_boost_depth2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grad_boost_depth2.png
--------------------------------------------------------------------------------
/slides/images/grad_boost_term_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grad_boost_term_1.png
--------------------------------------------------------------------------------
/slides/images/grad_boost_term_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grad_boost_term_2.png
--------------------------------------------------------------------------------
/slides/images/grad_boost_term_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grad_boost_term_3.png
--------------------------------------------------------------------------------
/slides/images/graphviz_jupyter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/graphviz_jupyter.png
--------------------------------------------------------------------------------
/slides/images/graphviz_source.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/graphviz_source.png
--------------------------------------------------------------------------------
/slides/images/knn_boundary_k1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/knn_boundary_k1.png
--------------------------------------------------------------------------------
/slides/images/knn_boundary_k3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/knn_boundary_k3.png
--------------------------------------------------------------------------------
/slides/images/l1l2_elasticnet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/l1l2_elasticnet.png
--------------------------------------------------------------------------------
/slides/images/max_leaf_nodes_8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/max_leaf_nodes_8.png
--------------------------------------------------------------------------------
/slides/images/max_margin_C_0.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/max_margin_C_0.1.png
--------------------------------------------------------------------------------
/slides/images/ram_prices_test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ram_prices_test.png
--------------------------------------------------------------------------------
/slides/images/ram_prices_train.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ram_prices_train.png
--------------------------------------------------------------------------------
/slides/images/robust_regression.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/robust_regression.png
--------------------------------------------------------------------------------
/slides/images/shuffle_split_cv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/shuffle_split_cv.png
--------------------------------------------------------------------------------
/slides/images/supervised-ml-api.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/supervised-ml-api.png
--------------------------------------------------------------------------------
/slides/images/threefold_split.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/threefold_split.png
--------------------------------------------------------------------------------
/slides/images/train-test-split.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/train-test-split.png
--------------------------------------------------------------------------------
/slides/images/tree_illustration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/tree_illustration.png
--------------------------------------------------------------------------------
/slides/images/tree_importances.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/tree_importances.png
--------------------------------------------------------------------------------
/slides/images/tree_prediction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/tree_prediction.png
--------------------------------------------------------------------------------
/slides/images/voting_classifier.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/voting_classifier.png
--------------------------------------------------------------------------------
/slides/images/forest_importances.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/forest_importances.png
--------------------------------------------------------------------------------
/slides/images/grid_max_leaf_nodes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grid_max_leaf_nodes.png
--------------------------------------------------------------------------------
/slides/images/gridsearch_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/gridsearch_workflow.png
--------------------------------------------------------------------------------
/slides/images/l1l2ball_intersect.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/l1l2ball_intersect.png
--------------------------------------------------------------------------------
/slides/images/lasso_alpha_search.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/lasso_alpha_search.png
--------------------------------------------------------------------------------
/slides/images/lasso_coefficients.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/lasso_coefficients.png
--------------------------------------------------------------------------------
/slides/images/ridge_alpha_search.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_alpha_search.png
--------------------------------------------------------------------------------
/slides/images/ridge_coefficients.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_coefficients.png
--------------------------------------------------------------------------------
/slides/images/xgboost_hist_bench.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/xgboost_hist_bench.png
--------------------------------------------------------------------------------
/slides/images/ames_housing_scatter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ames_housing_scatter.png
--------------------------------------------------------------------------------
/slides/images/boston_housing_scatter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/boston_housing_scatter.png
--------------------------------------------------------------------------------
/slides/images/cross_validation_new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/cross_validation_new.png
--------------------------------------------------------------------------------
/slides/images/hist_gradient_boosting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/hist_gradient_boosting.png
--------------------------------------------------------------------------------
/slides/images/knn_boundary_dataset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/knn_boundary_dataset.png
--------------------------------------------------------------------------------
/slides/images/knn_boundary_varying_k.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/knn_boundary_varying_k.png
--------------------------------------------------------------------------------
/slides/images/knn_model_complexity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/knn_model_complexity.png
--------------------------------------------------------------------------------
/slides/images/lasso_alpha_triazine.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/lasso_alpha_triazine.png
--------------------------------------------------------------------------------
/slides/images/linear_boundary_vector.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/linear_boundary_vector.png
--------------------------------------------------------------------------------
/slides/images/linear_regression_1d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/linear_regression_1d.png
--------------------------------------------------------------------------------
/slides/images/logreg_regularization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/logreg_regularization.png
--------------------------------------------------------------------------------
/slides/images/lr_coefficients_large.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/lr_coefficients_large.png
--------------------------------------------------------------------------------
/slides/images/matrix-representation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/matrix-representation.png
--------------------------------------------------------------------------------
/slides/images/min_samples_split_50.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/min_samples_split_50.png
--------------------------------------------------------------------------------
/slides/images/ridge_alpha_triazine.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_alpha_triazine.png
--------------------------------------------------------------------------------
/slides/images/ridge_alpha_triazines.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_alpha_triazines.png
--------------------------------------------------------------------------------
/slides/images/ridge_learning_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_learning_curve.png
--------------------------------------------------------------------------------
/slides/images/supervised-ml-workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/supervised-ml-workflow.png
--------------------------------------------------------------------------------
/slides/images/train_test_split_new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/train_test_split_new.png
--------------------------------------------------------------------------------
/slides/images/xgboost_sklearn_bench.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/xgboost_sklearn_bench.png
--------------------------------------------------------------------------------
/slides/images/gradient_learning_rates.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/gradient_learning_rates.png
--------------------------------------------------------------------------------
/slides/images/grid_search_n_neighbors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grid_search_n_neighbors.png
--------------------------------------------------------------------------------
/slides/images/knn_boundary_test_points.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/knn_boundary_test_points.png
--------------------------------------------------------------------------------
/slides/images/ridge_alpha_search_poly.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_alpha_search_poly.png
--------------------------------------------------------------------------------
/slides/images/ridge_coefficients_alpha.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_coefficients_alpha.png
--------------------------------------------------------------------------------
/slides/images/ridge_coefficients_large.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_coefficients_large.png
--------------------------------------------------------------------------------
/slides/images/grad_boost_regression_steps.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grad_boost_regression_steps.png
--------------------------------------------------------------------------------
/slides/images/linear_svm_regularization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/linear_svm_regularization.png
--------------------------------------------------------------------------------
/slides/images/repeated_stratified_kfold.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/repeated_stratified_kfold.png
--------------------------------------------------------------------------------
/slides/images/ridge_alpha_search_cv_runs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_alpha_search_cv_runs.png
--------------------------------------------------------------------------------
/slides/images/time_series_walk_forward_cv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/time_series_walk_forward_cv.png
--------------------------------------------------------------------------------
/slides/images/train_test_validation_split.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/train_test_validation_split.png
--------------------------------------------------------------------------------
/slides/images/tree_building_iteration_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/tree_building_iteration_1.png
--------------------------------------------------------------------------------
/slides/images/tree_building_iteration_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/tree_building_iteration_2.png
--------------------------------------------------------------------------------
/slides/images/tree_building_iteration_9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/tree_building_iteration_9.png
--------------------------------------------------------------------------------
/slides/images/grid_search_cross_validation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grid_search_cross_validation.png
--------------------------------------------------------------------------------
/slides/images/overfitting_validation_set_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/overfitting_validation_set_1.png
--------------------------------------------------------------------------------
/slides/images/overfitting_validation_set_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/overfitting_validation_set_2.png
--------------------------------------------------------------------------------
/slides/images/overfitting_validation_set_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/overfitting_validation_set_3.png
--------------------------------------------------------------------------------
/slides/images/overfitting_validation_set_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/overfitting_validation_set_4.png
--------------------------------------------------------------------------------
/slides/images/grid_search_cross_validation_new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grid_search_cross_validation_new.png
--------------------------------------------------------------------------------
/slides/images/train_test_set_2d_classification.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/train_test_set_2d_classification.png
--------------------------------------------------------------------------------
/slides/images/overfitting_underfitting_cartoon_full.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/overfitting_underfitting_cartoon_full.png
--------------------------------------------------------------------------------
/slides/images/overfitting_underfitting_cartoon_train.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/overfitting_underfitting_cartoon_train.png
--------------------------------------------------------------------------------
/slides/images/overfitting_underfitting_cartoon_generalization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/overfitting_underfitting_cartoon_generalization.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # exclude datasets and externals
2 | notebooks/datasets
3 | notebooks/joblib/
4 |
5 | # exclude temporary files
6 | .ipynb_checkpoints
7 | .DS_Store
8 | gmon.out
9 | __pycache__
10 | *.pyc
11 | *.o
12 | *.so
13 | *.gcno
14 | *.swp
15 | *.egg-info
16 | *.egg
17 | *~
18 | build
19 | dist
20 | lib/test
21 | doc/_build
22 | *env
23 | *ENV
24 | .idea
25 |
--------------------------------------------------------------------------------
/notebooks/solutions/grid_search_k_neighbors.py:
--------------------------------------------------------------------------------
1 | from sklearn.neighbors import KNeighborsClassifier
2 | # NOTE(review): GridSearchCV, plt and the X_train/X_test/y_train/y_test splits are not defined here -- presumably supplied by the notebook that loads this solution
3 | param_grid = {'n_neighbors': [1, 3, 5, 7, 10]}
4 | # cross-validated search over the candidate neighbor counts; training scores are kept for the plot below
5 | grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid,
6 | return_train_score=True)
7 | grid.fit(X_train, y_train)
8 | # report the selected setting and its scores, then plot mean train vs. mean cross-validation score for each n_neighbors
9 | print("best parameters: %s" % grid.best_params_)
10 | print("Training set accuracy: %s" % grid.score(X_train, y_train))
11 | print("Test set accuracy: %s" % grid.score(X_test, y_test))
12 | results = grid.cv_results_
13 | plt.plot(param_grid['n_neighbors'], results['mean_train_score'], label="train")
14 | plt.plot(param_grid['n_neighbors'], results['mean_test_score'], label="test")
15 | plt.legend()
16 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Andreas Mueller
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/notebooks/solutions/bike_regression.py:
--------------------------------------------------------------------------------
1 | data = pd.read_csv("data/bike_day_raw.csv")
2 | X = data.drop("cnt", axis=1)
3 | y = data.cnt
4 |
5 | display(data.head())
6 |
7 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
8 |
9 | from sklearn.linear_model import LinearRegression
10 |
11 | # for other models you should scale here
12 |
13 | lr = LinearRegression().fit(X_train, y_train)
14 |
15 | print(lr.score(X_train, y_train))
16 |
17 | print(lr.score(X_test, y_test))
18 |
19 | from sklearn.metrics import mean_squared_error
20 | y_pred = lr.predict(X_test)
21 | print(mean_squared_error(y_test, y_pred))
22 |
23 |
24 | from sklearn.compose import make_column_transformer
25 | from sklearn.preprocessing import OneHotEncoder
26 | ohe = make_column_transformer(
27 | (OneHotEncoder(sparse=False), X_train.columns[:6]),
28 | remainder='passthrough')
29 |
30 | X_train_ohe = ohe.fit_transform(X_train)
31 | X_test_ohe = ohe.transform(X_test)
32 |
33 | X_train.shape
34 |
35 | X_train_ohe.shape
36 |
37 |
38 | lr = LinearRegression().fit(X_train_ohe, y_train)
39 |
40 | print(lr.score(X_train_ohe, y_train))
41 |
42 | print(lr.score(X_test_ohe, y_test))
43 |
44 | from sklearn.metrics import mean_squared_error
45 | y_pred = lr.predict(X_test_ohe)
46 |
--------------------------------------------------------------------------------
/notebooks/04 - Linear Models for Classification.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Linear Models for Classification"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Exercise\n",
15 | "Load and preprocess the adult data as before.\n",
16 | "Include dummy encoding and scaling.\n",
17 | "Learn a logistic regression model and visualize the coefficients.\n",
18 | "Then grid-search the regularization parameter C.\n",
19 | "Compare the coefficients of the best model with the coefficients of a model with more regularization."
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "import pandas as pd\n",
29 | "adult = pd.read_csv(\"data/adult.csv\", index_col=0)"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "# %load solutions/adult_classification.py"
39 | ]
40 | }
41 | ],
42 | "metadata": {
43 | "anaconda-cloud": {},
44 | "kernelspec": {
45 | "display_name": "Python 3",
46 | "language": "python",
47 | "name": "python3"
48 | },
49 | "language_info": {
50 | "codemirror_mode": {
51 | "name": "ipython",
52 | "version": 3
53 | },
54 | "file_extension": ".py",
55 | "mimetype": "text/x-python",
56 | "name": "python",
57 | "nbconvert_exporter": "python",
58 | "pygments_lexer": "ipython3",
59 | "version": "3.7.3"
60 | }
61 | },
62 | "nbformat": 4,
63 | "nbformat_minor": 4
64 | }
65 |
--------------------------------------------------------------------------------
/notebooks/solutions/adult_classification.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | data = pd.read_csv("data/adult.csv", index_col=0)
3 | data.head()
4 |
5 | income = data.income
6 | data_features = data.drop("income", axis=1)
7 |
8 | display(data_features.head())
9 |
10 |
11 | ### one hot encode data
12 | data_one_hot = pd.get_dummies(data_features)
13 | data_one_hot.head()
14 |
15 |
16 | ### Preprocessing
17 | from sklearn.preprocessing import StandardScaler
18 | from sklearn.model_selection import train_test_split
19 | X_train, X_test, y_train, y_test = train_test_split(data_one_hot, income)
20 |
21 | scaler = StandardScaler().fit(X_train)
22 | X_train_scaled = scaler.transform(X_train)
23 |
24 | ### Cross-validation with default parameters
25 | from sklearn.model_selection import cross_val_score
26 | from sklearn.linear_model import LogisticRegression
27 |
28 | scores = cross_val_score(LogisticRegression(), X_train_scaled, y_train)
29 | print(scores.mean())
30 |
31 |
32 | ### do grid search
33 |
34 | import numpy as np
35 |
36 | param_grid = {'C': np.logspace(-3, 3, 7)}
37 | param_grid
38 |
39 | from sklearn.model_selection import GridSearchCV
40 | grid = GridSearchCV(LogisticRegression(), param_grid,
41 | return_train_score=True)
42 |
43 | grid.fit(X_train_scaled, y_train)
44 |
45 | grid.best_params_
46 | grid.best_score_
47 |
48 | # some visualization
49 |
50 | import pandas as pd
51 | %matplotlib inline
52 | res = pd.DataFrame(grid.cv_results_)
53 | res.mean_test_score.plot()
54 | res.mean_train_score.plot()
55 | import matplotlib.pyplot as plt
56 | plt.xscale("log")
57 |
58 | grid.score(X_test, y_test)
59 |
60 | important = np.argsort(np.abs(grid.best_estimator_.coef_)).ravel()
61 |
62 | plt.barh(range(10), grid.best_estimator_.coef_.ravel()[important[-10:]])
63 | plt.yticks(range(10), X_train.columns[important[-10:]]);
64 |
--------------------------------------------------------------------------------
/check_env.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from distutils.version import LooseVersion as Version\n",
10 | "import sys\n",
11 | "\n",
12 | "\n",
13 | "OK = '\\x1b[42m[ OK ]\\x1b[0m'\n",
14 | "FAIL = \"\\x1b[41m[FAIL]\\x1b[0m\"\n",
15 | "\n",
16 | "try:\n",
17 | " import importlib\n",
18 | "except ImportError:\n",
19 | " print(FAIL, \"Python version 3.5 is required,\"\n",
20 | " \" but %s is installed.\" % sys.version)\n",
21 | "\n",
22 | " \n",
23 | "def import_version(pkg, min_ver, fail_msg=\"\"):  # import pkg, report its version against min_ver, return the module (None if missing)\n",
24 | "    mod = None\n",
25 | "    try:\n",
26 | "        mod = importlib.import_module(pkg)\n",
27 | "        ver = mod.__version__\n",
28 | "        if Version(ver) < min_ver:\n",
29 | "            print(FAIL, \"%s version %s or higher required, but %s installed.\"\n",
30 | "                  % (pkg, min_ver, ver))  # was `lib`, a global that only exists inside the caller's loop\n",
31 | "        else:\n",
32 | "            print(OK, '%s version %s' % (pkg, ver))\n",
33 | "    except ImportError:\n",
34 | "        print(FAIL, '%s not installed. %s' % (pkg, fail_msg))\n",
35 | "    return mod\n",
36 | "\n",
37 | "\n",
38 | "# first check the python version\n",
39 | "print('Using python in', sys.prefix)\n",
40 | "print(sys.version)\n",
41 | "pyversion = Version(sys.version)\n",
42 | "if pyversion < \"3.5\":\n",
43 | " print(FAIL, \"Python version 3.5 is required,\"\n",
44 | " \" but %s is installed.\" % sys.version)\n",
45 | "print()\n",
46 | "requirements = {'numpy': \"1.6.1\", 'scipy': \"1.0\", 'matplotlib': \"2.0\",\n",
47 | " 'IPython': \"3.0\", 'sklearn': \"0.22.1\", 'pandas': \"0.18\"}\n",
48 | "\n",
49 | "# now the dependencies\n",
50 | "for lib, required_version in list(requirements.items()):\n",
51 | " import_version(lib, required_version)"
52 | ]
53 | }
54 | ],
55 | "metadata": {
56 | "anaconda-cloud": {},
57 | "kernelspec": {
58 | "display_name": "Python 3",
59 | "language": "python",
60 | "name": "python3"
61 | },
62 | "language_info": {
63 | "codemirror_mode": {
64 | "name": "ipython",
65 | "version": 3
66 | },
67 | "file_extension": ".py",
68 | "mimetype": "text/x-python",
69 | "name": "python",
70 | "nbconvert_exporter": "python",
71 | "pygments_lexer": "ipython3",
72 | "version": "3.7.3"
73 | }
74 | },
75 | "nbformat": 4,
76 | "nbformat_minor": 4
77 | }
78 |
--------------------------------------------------------------------------------
/slides/01-reminder-supervised-learning.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Introduction to Supervised Learning
5 |
6 |
7 |
12 |
13 |
14 |
107 |
108 |
109 |
110 |
138 |
139 |
140 |
--------------------------------------------------------------------------------
/notebooks/solutions/linear_models_diabetes.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from sklearn.linear_model import Lasso, Ridge, LinearRegression
5 | from sklearn.model_selection import train_test_split, cross_val_score
6 | from sklearn.datasets import load_diabetes
7 | # load the diabetes regression dataset via scikit-learn's loader
8 | diabetes = load_diabetes()
9 |
10 | # create dataframe for easy boxplot
11 | df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
12 | df.boxplot()
13 | # histogram of the regression target in a separate figure
14 | plt.figure()
15 | plt.title("Target distribution")
16 | plt.hist(diabetes.target, bins="auto")
17 | # hold out a test set (no random_state is passed, so the split differs between runs)
18 | X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
19 | diabetes.target)
20 | # 10-fold cross-validation of plain linear regression vs. Ridge with default settings on the raw features
21 | scores_lr = cross_val_score(LinearRegression(), X_train, y_train, cv=10)
22 | print("Linear regression score:", scores_lr.mean())
23 | scores_ridge = cross_val_score(Ridge(), X_train, y_train, cv=10)
24 | print("Ridge Regression score:", scores_ridge.mean())
25 |
26 | # With scaled data
27 | from sklearn.preprocessing import StandardScaler
28 | scaler = StandardScaler().fit(X_train)
29 | X_train_scaled = scaler.transform(X_train)
30 | X_test_scaled = scaler.transform(X_test)
31 | # same comparison on the standardized features
32 | scores_lr = cross_val_score(LinearRegression(), X_train_scaled, y_train, cv=10)
33 | print("Linear regression w/ scaling:", scores_lr.mean())
34 | scores_ridge = cross_val_score(Ridge(), X_train_scaled, y_train, cv=10)
35 | print("Ridge regression w/ scaling:", scores_ridge.mean())
36 | # grid-search the Ridge penalty alpha over 7 log-spaced values from 1e-3 to 1e3
37 | from sklearn.model_selection import GridSearchCV
38 | param_grid = {'alpha': np.logspace(-3, 3, 7)}
39 | grid = GridSearchCV(Ridge(), param_grid, cv=10, return_train_score=True)
40 | grid.fit(X_train_scaled, y_train)
41 | # plot mean train / cross-validation score against alpha on a log x-axis
42 | res = pd.DataFrame(grid.cv_results_)
43 | res.plot("param_alpha", ["mean_train_score", "mean_test_score"], logx=True)
44 | plt.title("Ridge grid search")
45 |
46 |
47 | print(grid.best_params_, grid.best_score_)
48 | # unregularized baseline fit, used for the coefficient comparisons below
49 | lr = LinearRegression().fit(X_train_scaled, y_train)
50 | # coefficients of the best Ridge model next to those of plain linear regression
51 | plt.figure()
52 | plt.title("Coefficients LR vs Ridge")
53 | plt.hlines(0, 0, X_train.shape[1], linewidth=.5)
54 | plt.plot(grid.best_estimator_.coef_, 'o', label="Ridge({})".format(grid.best_params_['alpha']))
55 | plt.plot(lr.coef_, 'o', label="LR", alpha=.6)
56 | plt.legend()
57 | # repeat the alpha search with Lasso, kept in its own grid_lasso object
58 | from sklearn.model_selection import GridSearchCV
59 | param_grid = {'alpha': np.logspace(-3, 3, 7)}
60 | grid_lasso = GridSearchCV(Lasso(), param_grid, cv=10, return_train_score=True)
61 | grid_lasso.fit(X_train_scaled, y_train)
62 |
63 | res = pd.DataFrame(grid_lasso.cv_results_)
64 | res.plot("param_alpha", ["mean_train_score", "mean_test_score"], logx=True)
65 | plt.title("Lasso grid search")
66 | print(grid_lasso.best_params_, grid_lasso.best_score_)
67 | # coefficients of the best Ridge, best Lasso and plain linear regression in one figure
68 | plt.figure()
69 | plt.title("coefficients")
70 | plt.hlines(0, 0, X_train.shape[1], linewidth=.5)
71 | plt.plot(grid.best_estimator_.coef_, 'o', label="Ridge({})".format(grid.best_params_['alpha']))
72 | plt.plot(grid_lasso.best_estimator_.coef_, 'o', label="Lasso({})".format(grid_lasso.best_params_['alpha']))
73 | plt.plot(lr.coef_, 'o', label="LR", alpha=.6)
74 | plt.legend()
75 | # expand the scaled features with PolynomialFeatures (default degree, bias column disabled)
76 | from sklearn.preprocessing import PolynomialFeatures
77 | poly = PolynomialFeatures(include_bias=False)
78 | # NOTE(review): X_test_poly is computed below but not used again in this script
79 | X_train_poly = poly.fit_transform(X_train_scaled)
80 | X_test_poly = poly.transform(X_test_scaled)
81 | # cross-validate both models on the expanded features
82 | scores_lr = cross_val_score(LinearRegression(), X_train_poly, y_train, cv=10)
83 | print("Linear regression poly features:", scores_lr.mean())
84 | scores_ridge = cross_val_score(Ridge(), X_train_poly, y_train, cv=10)
85 | print("Ridge regression poly features:", scores_ridge.mean())
86 | # Ridge alpha search on the polynomial features; this rebinds grid and res from the earlier search
87 | from sklearn.model_selection import GridSearchCV
88 | param_grid = {'alpha': np.logspace(-3, 3, 7)}
89 | grid = GridSearchCV(Ridge(), param_grid, cv=10, return_train_score=True)
90 | grid.fit(X_train_poly, y_train)
91 |
92 | res = pd.DataFrame(grid.cv_results_)
93 | res.plot("param_alpha", ["mean_train_score", "mean_test_score"], logx=True)
94 | plt.title("Ridge grid search with polynomial features")
95 |
96 |
97 | print(grid.best_params_, grid.best_score_)
98 | # score with polynomial features is worse!
--------------------------------------------------------------------------------
/slides/style.css:
--------------------------------------------------------------------------------
1 | body {
2 | font-family: 'Muli';
3 | font-size: 140%;
4 | }
5 | h1, h2 {
6 | font-family: 'Garamond';
7 | font-weight: normal;
8 | margin-top: 10px;
9 | margin-bottom: 10px;
10 | }
11 | .remark-slide-content h1 {
12 | font-size: 70px;
13 | text-align: center;
14 | }
15 | .remark-slide-content p, .remark-slide-content li {
16 | font-size:30px;
17 | line-height: 1.4;
18 | }
19 | .remark-code {
20 | font-size:30px;
21 | }
22 | .remark-slide-content p {
23 | margin: 5px;
24 | }
25 | .remark-slide-container .spacious p,
26 | .remark-slide-container .spacious li{
27 | margin-bottom: 50px;
28 | margin-top: 50px;
29 | }
30 | .remark-slide-container .spacious h1{
31 | margin-bottom: 50px;
32 | }
33 | .remark-slide-container .some-space p,
34 | .remark-slide-container .some-space li,
35 | .remark-slide-container .some-space h1{
36 | margin-bottom: 30px;
37 | }
38 | .reset-column {
39 | overflow: auto;
40 | width: 100%;
41 | }
42 | .remark-slide-container .compact p, .remark-slide-container .compact li, .remark-slide-container .compact pre{
43 | line-height: 1.1;
44 | margin: 0px 0;
45 | }
46 | .remark-slide-container .compact .MathJax_Display{
47 | line-height: 1.1;
48 | margin: 1px 0;
49 | }
50 | .remark-slide-container .compact h1{
51 | margin-bottom: 3px;
52 | }
53 | .padding-top {
54 | padding-top: 100px;
55 | }
56 | .remark-slide-content .smaller p, .remark-slide-content .smaller p .MathJax, .remark-slide-content .smaller li,
57 | .remark-slide-content .smaller .remark-code, .smaller .remark-code-line,.remark-slide-content .smaller a,
58 | .remark-slide-content .smaller .dataframe{
59 | font-size: 25px;
60 | }
61 |
62 | .remark-slide-content .smallest p, .remark-slide-content .smallest .MathJax, .remark-slide-content .smallest li, .remark-slide-content .smallest .remark-code,
63 | .smallest .remark-code-line, .remark-slide-content .smallest .dataframe, .remark-slide-content span.smallest{
64 | font-size: 20px;
65 | }
66 | .remark-slide-content .tiny p, .remark-slide-content .tiny li, .remark-slide-content .tiny .remark-code,
67 | .tiny .remark-code-line, .remark-slide-content .tiny .dataframe{
68 | font-size: 16px;
69 | }
70 | .normal {
71 | font-size: 30px;
72 | }
73 | .quote_author {
74 | display: block;
75 | text-align: right;
76 | margin-top: 20px;
77 | font-size: 30px;
78 | font-family: 'Garamond';
79 | }
80 | .larger, .larger .remark-code {
81 | font-size: 40px;
82 | }
83 | .largest, .largest .remark-code {
84 | font-size: 50px;
85 | }
86 | .left-column, .right-column {
87 | width: 48%;
88 | }
89 | .right-column{
90 | float: right;
91 | }
92 | .left-column{
93 | float: left;
94 | }
95 | .clear-column{
96 | clear: both;
97 | }
98 | .narrow-right-column {
99 | float: right;
100 | width: 32%
101 | }
102 | .wide-left-column {
103 | float: left;
104 | width: 65%
105 | }
106 | .narrow-left-column {
107 | float: left;
108 | width: 32%
109 | }
110 | .wide-right-column {
111 | float: right;
112 | width: 65%
113 | }
114 |
115 | .invisible {
116 | visibility: hidden
117 | }
118 | .tiny-code .remark-code, .remark-inline-code .tiny-code{
119 | font-size: 15px;
120 | }
121 | .remark-code, .remark-inline-code { font-family: 'Ubuntu Mono';}
122 | .hljs.remark-code {background: #e0e0e0}
123 |
124 | /* Some additional styling taken from the Jupyter notebook CSS */
125 | table.dataframe {
126 | border: none;
127 | border-collapse: collapse;
128 | border-spacing: 0;
129 | color: black;
130 | table-layout: fixed;
131 | }
132 | table.dataframe thead {
133 | border-bottom: 1px solid black;
134 | vertical-align: bottom;
135 | }
136 | table.dataframe tr,
137 | table.dataframe th,
138 | table.dataframe td {
139 | text-align: right;
140 | vertical-align: middle;
141 | padding: 0.5em 0.5em;
142 | line-height: normal;
143 | white-space: normal;
144 | max-width: none;
145 | border: none;
146 | }
147 | table.dataframe th {
148 | font-weight: bold;
149 | }
150 | table.dataframe tbody tr:nth-child(odd) {
151 | background: #f5f5f5;
152 | }
153 | table.dataframe tbody tr:hover {
154 | background: rgba(66, 165, 245, 0.2);
155 | }
--------------------------------------------------------------------------------
/notebooks/06 - Gradient Boosting.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np\n",
10 | "import matplotlib.pyplot as plt\n",
11 | "import pandas as pd\n",
12 | "import sklearn\n",
13 | "sklearn.set_config(print_changed_only=True)"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "# Gradient Boosting"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "from sklearn.ensemble import GradientBoostingClassifier\n",
30 | "from sklearn.datasets import load_breast_cancer\n",
31 | "from sklearn.model_selection import train_test_split\n",
32 | "cancer = load_breast_cancer()\n",
33 | "\n",
34 | "X_train, X_test, y_train, y_test = train_test_split(\n",
35 | " cancer.data, cancer.target, random_state=0)\n",
36 | "\n",
37 | "gbrt = GradientBoostingClassifier(random_state=0)\n",
38 | "gbrt.fit(X_train, y_train)\n",
39 | "\n",
40 | "print(\"accuracy on training set: %f\" % gbrt.score(X_train, y_train))\n",
41 | "print(\"accuracy on test set: %f\" % gbrt.score(X_test, y_test))"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "gbrt = GradientBoostingClassifier(random_state=0, max_depth=1)\n",
51 | "gbrt.fit(X_train, y_train)\n",
52 | "\n",
53 | "print(\"accuracy on training set: %f\" % gbrt.score(X_train, y_train))\n",
54 | "print(\"accuracy on test set: %f\" % gbrt.score(X_test, y_test))"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "gbrt = GradientBoostingClassifier(random_state=0, learning_rate=0.01)\n",
64 | "gbrt.fit(X_train, y_train)\n",
65 | "\n",
66 | "print(\"accuracy on training set: %f\" % gbrt.score(X_train, y_train))\n",
67 | "print(\"accuracy on test set: %f\" % gbrt.score(X_test, y_test))"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "gbrt = GradientBoostingClassifier(random_state=0, max_depth=1)\n",
77 | "gbrt.fit(X_train, y_train)\n",
78 | "\n",
79 | "plt.barh(range(cancer.data.shape[1]), gbrt.feature_importances_)\n",
80 | "plt.yticks(range(cancer.data.shape[1]), cancer.feature_names);\n",
81 | "ax = plt.gca()\n",
82 | "ax.set_position([0.4, .2, .9, .9])"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "from xgboost import XGBClassifier\n",
92 | "xgb = XGBClassifier()\n",
93 | "xgb.fit(X_train, y_train)\n",
94 | "print(\"accuracy on training set: %f\" % xgb.score(X_train, y_train))\n",
95 | "print(\"accuracy on test set: %f\" % xgb.score(X_test, y_test))"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "metadata": {},
102 | "outputs": [],
103 | "source": [
104 | "from xgboost import XGBClassifier\n",
105 | "xgb = XGBClassifier(n_estimators=1000)\n",
106 | "xgb.fit(X_train, y_train)\n",
107 | "print(\"accuracy on training set: %f\" % xgb.score(X_train, y_train))\n",
108 | "print(\"accuracy on test set: %f\" % xgb.score(X_test, y_test))"
109 | ]
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "metadata": {},
114 | "source": [
115 | "# Exercise\n",
116 | "Use GradientBoostingRegressor on the Bike dataset.\n",
117 | "Search over the ``learning_rate`` and ``max_depth`` using ``GridSearchCV``.\n",
118 | "What happens if you change ``n_estimators``?\n",
119 | "\n",
120 | "Compare the speed of XGBRegressor with GradientBoostingRegressor. How well does XGBRegressor do with defaults on the ``Bike`` dataset? Can you make it do better?"
121 | ]
122 | }
123 | ],
124 | "metadata": {
125 | "anaconda-cloud": {},
126 | "kernelspec": {
127 | "display_name": "Python 3",
128 | "language": "python",
129 | "name": "python3"
130 | },
131 | "language_info": {
132 | "codemirror_mode": {
133 | "name": "ipython",
134 | "version": 3
135 | },
136 | "file_extension": ".py",
137 | "mimetype": "text/x-python",
138 | "name": "python",
139 | "nbconvert_exporter": "python",
140 | "pygments_lexer": "ipython3",
141 | "version": "3.7.3"
142 | }
143 | },
144 | "nbformat": 4,
145 | "nbformat_minor": 4
146 | }
147 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Intermediate Machine Learning with scikit-learn
2 | ========================================================
3 |
4 | Part 2 of 4
5 | -----------
6 | Other parts:
7 | - [Part 1](https://github.com/amueller/ml-workshop-1-of-4)
8 | - [Part 3](https://github.com/amueller/ml-workshop-3-of-4)
9 | - [Part 4](https://github.com/amueller/ml-workshop-4-of-4)
10 |
11 |
12 | Content
13 | -------
14 | - [Reminder on supervised learning](https://amueller.github.io/ml-workshop-2-of-4/slides/01-reminder-supervised-learning.html)
15 | - [Grid search and cross-validation](https://amueller.github.io/ml-workshop-2-of-4/slides/02-cross-validation-grid-search.html)
16 | - [Linear models for regression](https://amueller.github.io/ml-workshop-2-of-4/slides/03-linear-models-regression.html)
17 | - [Linear models for classification](https://amueller.github.io/ml-workshop-2-of-4/slides/04-linear-models-classification.html)
18 | - [Decision trees](https://amueller.github.io/ml-workshop-2-of-4/slides/05-trees-forests.html)
19 | - [Random Forests](https://amueller.github.io/ml-workshop-2-of-4/slides/05-trees-forests.html#26)
20 |
21 |
22 | Instructor
23 | -----------
24 |
25 | - [Andreas Mueller](http://amueller.github.io) [@amuellerml](https://twitter.com/amuellerml) - Columbia University; [Book: Introduction to Machine Learning with Python](http://shop.oreilly.com/product/0636920030515.do)
26 |
27 | ---
28 |
29 | This repository will contain the teaching material and other info associated
30 | with the "Intermediate Machine Learning with scikit-learn" course.
31 |
32 | About the workshop
33 | ------------------
34 | Scikit-learn is a machine learning library in Python that has become a
35 | valuable tool for many data science practitioners. This workshop will go beyond
36 | the basics and show how to effectively evaluate and tune algorithms. We will
37 | also discuss the most important machine learning algorithms that you're likely
38 | to see in practice, how and when to use them, and some details about how they
39 | work internally. The session will focus on linear models for classification and
40 | regression and tree-based models, including random forests.
41 |
42 | Prerequisites
43 | -------------
44 | This workshop assumes familiarity with Jupyter notebooks and basics of pandas, matplotlib and numpy.
45 | It also assumes familiarity with the basics of supervised learning, like training and test data and basics of model evaluation.
46 | You should have built a model with scikit-learn (or attended Introduction to Machine learning with scikit-learn) before
47 | taking this workshop.
48 |
49 | Obtaining the Tutorial Material
50 | --------------------------------
51 |
52 |
53 | If you are familiar with git, it is most convenient if you clone the GitHub repository. This
54 | is highly encouraged as it allows you to easily synchronize any changes to the material.
55 |
56 | ```
57 | git clone https://github.com/amueller/ml-workshop-2-of-4.git
58 | ```
59 |
60 | If you are not familiar with git, you can download the repository as a .zip file by heading over to the GitHub repository (https://github.com/amueller/ml-workshop-2-of-4) in your browser and clicking the green “Download” button in the upper right.
61 |
62 | 
63 |
64 | Please note that I may add and improve the material until shortly before the tutorial session, and I recommend that you update your copy of the materials one day before the tutorials. If you have a GitHub account and forked/cloned the repository via GitHub, you can sync your existing fork via the following command:
65 |
66 | ```
67 | git pull origin master
68 | ```
69 |
70 |
71 | Installation Notes
72 | ------------------
73 |
74 | This tutorial will require recent installations of
75 |
76 | - [NumPy](http://www.numpy.org)
77 | - [SciPy](http://www.scipy.org)
78 | - [matplotlib](http://matplotlib.org)
79 | - [pillow](https://python-pillow.org)
80 | - [pandas](http://pandas.pydata.org)
81 | - [scikit-learn](http://scikit-learn.org/stable/) (>=0.22.1)
82 | - [IPython](http://ipython.readthedocs.org/en/stable/)
83 | - [Jupyter Notebook](http://jupyter.org)
84 |
85 | The last one is important, you should be able to type:
86 |
87 | jupyter notebook
88 |
89 | in your terminal window and see the notebook panel load in your web browser.
90 | Try opening and running a notebook from the material to check that it works.
91 |
92 | For users who do not yet have these packages installed, a relatively
93 | painless way to install all the requirements is to use a Python distribution
94 | such as [Anaconda](https://www.continuum.io/downloads), which includes
95 | the most relevant Python packages for science, math, engineering, and
96 | data analysis; Anaconda can be downloaded and installed for free
97 | including commercial use and redistribution.
98 | The code examples in this tutorial require Python 3.5 or later.
99 |
100 | After obtaining the material, we **strongly recommend** that you open and execute
101 | the Jupyter Notebook `check_env.ipynb` that is located at the
102 | top level of this repository. Inside the repository, you can open the notebook
103 | by executing
104 |
105 | ```bash
106 | jupyter notebook check_env.ipynb
107 | ```
108 |
109 | inside this repository. Inside the Notebook, you can run the code cell by
110 | clicking on the "Run Cells" button as illustrated in the figure below:
111 |
112 | 
113 |
114 |
115 | Finally, if your environment satisfies the requirements for the tutorials, the executed code cell will produce an output message as shown below:
116 |
117 | 
118 |
--------------------------------------------------------------------------------
/notebooks/data/bank-campaign-desc.text:
--------------------------------------------------------------------------------
1 | Citation Request:
2 | This dataset is publicly available for research. The details are described in [Moro et al., 2014].
3 | Please include this citation if you plan to use this database:
4 |
5 | [Moro et al., 2014] S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, In press, http://dx.doi.org/10.1016/j.dss.2014.03.001
6 |
7 | Available at: [pdf] http://dx.doi.org/10.1016/j.dss.2014.03.001
8 | [bib] http://www3.dsi.uminho.pt/pcortez/bib/2014-dss.txt
9 |
10 | 1. Title: Bank Marketing (with social/economic context)
11 |
12 | 2. Sources
13 | Created by: Sérgio Moro (ISCTE-IUL), Paulo Cortez (Univ. Minho) and Paulo Rita (ISCTE-IUL) @ 2014
14 |
15 | 3. Past Usage:
16 |
17 | The full dataset (bank-additional-full.csv) was described and analyzed in:
18 |
19 | S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems (2014), doi:10.1016/j.dss.2014.03.001.
20 |
21 | 4. Relevant Information:
22 |
23 | This dataset is based on "Bank Marketing" UCI dataset (please check the description at: http://archive.ics.uci.edu/ml/datasets/Bank+Marketing).
24 | The data is enriched by the addition of five new social and economic features/attributes (national wide indicators from a ~10M population country), published by the Banco de Portugal and publicly available at: https://www.bportugal.pt/estatisticasweb.
25 | This dataset is almost identical to the one used in [Moro et al., 2014] (it does not include all attributes due to privacy concerns).
26 | Using the rminer package and R tool (http://cran.r-project.org/web/packages/rminer/), we found that the addition of the five new social and economic attributes (made available here) lead to substantial improvement in the prediction of a success, even when the duration of the call is not included. Note: the file can be read in R using: d=read.table("bank-additional-full.csv",header=TRUE,sep=";")
27 |
28 | The zip file includes two datasets:
29 | 1) bank-additional-full.csv with all examples, ordered by date (from May 2008 to November 2010).
30 | 2) bank-additional.csv with 10% of the examples (4119), randomly selected from bank-additional-full.csv.
31 | The smallest dataset is provided to test more computationally demanding machine learning algorithms (e.g., SVM).
32 |
33 | The binary classification goal is to predict if the client will subscribe a bank term deposit (variable y).
34 |
35 | 5. Number of Instances: 41188 for bank-additional-full.csv
36 |
37 | 6. Number of Attributes: 20 + output attribute.
38 |
39 | 7. Attribute information:
40 |
41 | For more information, read [Moro et al., 2014].
42 |
43 | Input variables:
44 | # bank client data:
45 | 1 - age (numeric)
46 | 2 - job : type of job (categorical: "admin.","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown")
47 | 3 - marital : marital status (categorical: "divorced","married","single","unknown"; note: "divorced" means divorced or widowed)
48 | 4 - education (categorical: "basic.4y","basic.6y","basic.9y","high.school","illiterate","professional.course","university.degree","unknown")
49 | 5 - default: has credit in default? (categorical: "no","yes","unknown")
50 | 6 - housing: has housing loan? (categorical: "no","yes","unknown")
51 | 7 - loan: has personal loan? (categorical: "no","yes","unknown")
52 | # related with the last contact of the current campaign:
53 | 8 - contact: contact communication type (categorical: "cellular","telephone")
54 | 9 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")
55 | 10 - day_of_week: last contact day of the week (categorical: "mon","tue","wed","thu","fri")
56 | 11 - duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y="no"). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
57 | # other attributes:
58 | 12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
59 | 13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
60 | 14 - previous: number of contacts performed before this campaign and for this client (numeric)
61 | 15 - poutcome: outcome of the previous marketing campaign (categorical: "failure","nonexistent","success")
62 | # social and economic context attributes
63 | 16 - emp.var.rate: employment variation rate - quarterly indicator (numeric)
64 | 17 - cons.price.idx: consumer price index - monthly indicator (numeric)
65 | 18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric)
66 | 19 - euribor3m: euribor 3 month rate - daily indicator (numeric)
67 | 20 - nr.employed: number of employees - quarterly indicator (numeric)
68 |
69 | Output variable (desired target):
70 | 21 - y - has the client subscribed a term deposit? (binary: "yes","no")
71 |
72 | 8. Missing Attribute Values: There are several missing values in some categorical attributes, all coded with the "unknown" label. These missing values can be treated as a possible class label or using deletion or imputation techniques.
73 |
--------------------------------------------------------------------------------
/notebooks/01 - Review of Supervised Learning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Review of Supervised Learning with scikit-learn"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import numpy as np\n",
17 | "import pandas as pd\n",
18 | "import matplotlib.pyplot as plt\n",
19 | "import sklearn\n",
20 | "sklearn.set_config(print_changed_only=True)"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "# read data.\n",
30 | "# you can find a description in data/bank-campaign-desc.text\n",
31 | "data = pd.read_csv(\"data/bank-campaign.csv\")"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "data.shape"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "data.columns"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "data.head()"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "y = data.target"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "X = data.drop(\"target\", axis=1)"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "X.shape"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "y.shape"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "y.head()"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "data.target.value_counts()"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "data.target.value_counts(normalize=True)"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "Splitting the data:"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "from sklearn.model_selection import train_test_split\n",
138 | "X_train, X_test, y_train, y_test = train_test_split(\n",
139 | " X, y, test_size=.2, random_state=42, stratify=y)"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "np.sum(y_train == \"yes\") / len(y_train)"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "np.sum(y_test == \"yes\") / len(y_test)"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": []
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": null,
170 | "metadata": {},
171 | "outputs": [],
172 | "source": [
173 | "# import model\n",
174 | "from sklearn.linear_model import LogisticRegression\n",
175 | "# instantiate model, set parameters\n",
176 | "lr = LogisticRegression(C=0.1, max_iter=1000)\n",
177 | "# fit model\n",
178 | "lr.fit(X_train, y_train)\n",
179 | "lr.coef_"
180 | ]
181 | },
182 | {
183 | "cell_type": "markdown",
184 | "metadata": {},
185 | "source": [
186 | "Make predictions:"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": null,
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "lr.score(X_train, y_train)"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "(y_train == \"no\").mean()"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "lr.score(X_test, y_test)"
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "metadata": {},
219 | "source": [
220 | "# https://github.com/amueller/ml-workshop-2-of-4"
221 | ]
222 | },
223 | {
224 | "cell_type": "markdown",
225 | "metadata": {},
226 | "source": [
227 | "# Exercise\n",
228 | "Load the dataset ``data/bike_day_raw.csv``, which has the regression target ``cnt``.\n",
229 | "This dataset contains daily bike rentals in the citybike platform. The ``cnt`` column is the number of rentals, which we want to predict from date and weather data.\n",
230 | "\n",
231 | "Split the data into a training and a test set using ``train_test_split``.\n",
232 | "Use the ``LinearRegression`` class to learn a regression model on this data. You can evaluate with the ``score`` method, which provides the $R^2$ or using the ``mean_squared_error`` function from ``sklearn.metrics`` (or write it yourself in numpy)."
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": null,
238 | "metadata": {},
239 | "outputs": [],
240 | "source": [
241 | "# %load solutions/bike_regression.py"
242 | ]
243 | }
244 | ],
245 | "metadata": {
246 | "anaconda-cloud": {},
247 | "kernelspec": {
248 | "display_name": "Python 3",
249 | "language": "python",
250 | "name": "python3"
251 | },
252 | "language_info": {
253 | "codemirror_mode": {
254 | "name": "ipython",
255 | "version": 3
256 | },
257 | "file_extension": ".py",
258 | "mimetype": "text/x-python",
259 | "name": "python",
260 | "nbconvert_exporter": "python",
261 | "pygments_lexer": "ipython3",
262 | "version": "3.7.6"
263 | }
264 | },
265 | "nbformat": 4,
266 | "nbformat_minor": 4
267 | }
268 |
--------------------------------------------------------------------------------
/notebooks/02 - Cross-validation and Grid Search.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Cross-validation"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import matplotlib.pyplot as plt\n",
17 | "import numpy as np\n",
18 | "import sklearn\n",
19 | "sklearn.set_config(print_changed_only=True)"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "from sklearn.datasets import load_digits\n",
29 | "from sklearn.model_selection import train_test_split\n",
30 | "\n",
31 | "digits = load_digits()\n",
32 | "X_train, X_test, y_train, y_test = train_test_split(\n",
33 | " digits.data, digits.target)"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "from sklearn.model_selection import cross_val_score\n",
43 | "from sklearn.neighbors import KNeighborsClassifier"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "cross_val_score(KNeighborsClassifier(),\n",
53 | " X_train, y_train, cv=5)"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "from sklearn.model_selection import KFold, RepeatedStratifiedKFold"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "cross_val_score(KNeighborsClassifier(),\n",
72 | " X_train, y_train, cv=KFold(n_splits=10, shuffle=True, random_state=42))"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "cross_val_score(KNeighborsClassifier(),\n",
82 | " X_train, y_train,\n",
83 | " cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42))"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "metadata": {},
89 | "source": [
90 | "Grid Searches\n",
91 | "================="
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "Grid-Search with built-in cross-validation"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "from sklearn.model_selection import GridSearchCV\n",
108 | "from sklearn.svm import SVC"
109 | ]
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "metadata": {},
114 | "source": [
115 | "Define parameter grid:"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "import numpy as np\n",
125 | "\n",
126 | "param_grid = {'C': 10. ** np.arange(-3, 3),\n",
127 | " 'gamma' : 10. ** np.arange(-5, 0)}\n",
128 | "\n",
129 | "np.set_printoptions(suppress=True)\n",
130 | "print(param_grid)"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "metadata": {},
137 | "outputs": [],
138 | "source": [
139 | "grid_search = GridSearchCV(SVC(), param_grid, verbose=3)"
140 | ]
141 | },
142 | {
143 | "cell_type": "markdown",
144 | "metadata": {},
145 | "source": [
146 | "A GridSearchCV object behaves just like a normal classifier."
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "grid_search.fit(X_train, y_train)"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "grid_search.predict(X_test)"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": null,
170 | "metadata": {},
171 | "outputs": [],
172 | "source": [
173 | "grid_search.score(X_test, y_test)"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "metadata": {},
180 | "outputs": [],
181 | "source": [
182 | "grid_search.best_params_"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": null,
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "grid_search.best_score_"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": null,
197 | "metadata": {},
198 | "outputs": [],
199 | "source": [
200 | "grid_search.best_estimator_"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": null,
206 | "metadata": {},
207 | "outputs": [],
208 | "source": [
209 | "# We extract just the scores\n",
210 | "\n",
211 | "scores = grid_search.cv_results_['mean_test_score']\n",
212 | "scores = np.array(scores).reshape(6, 5)\n",
213 | "\n",
214 | "plt.matshow(scores)\n",
215 | "plt.xlabel('gamma')\n",
216 | "plt.ylabel('C')\n",
217 | "plt.colorbar()\n",
218 | "plt.xticks(np.arange(5), param_grid['gamma'])\n",
219 | "plt.yticks(np.arange(6), param_grid['C']);"
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "metadata": {},
225 | "source": [
226 | "# Exercises\n",
227 | "Use GridSearchCV to adjust n_neighbors of KNeighborsClassifier."
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": null,
233 | "metadata": {},
234 | "outputs": [],
235 | "source": [
236 | "# %load solutions/grid_search_k_neighbors.py"
237 | ]
238 | }
239 | ],
240 | "metadata": {
241 | "anaconda-cloud": {},
242 | "kernelspec": {
243 | "display_name": "Python 3",
244 | "language": "python",
245 | "name": "python3"
246 | },
247 | "language_info": {
248 | "codemirror_mode": {
249 | "name": "ipython",
250 | "version": 3
251 | },
252 | "file_extension": ".py",
253 | "mimetype": "text/x-python",
254 | "name": "python",
255 | "nbconvert_exporter": "python",
256 | "pygments_lexer": "ipython3",
257 | "version": "3.7.3"
258 | }
259 | },
260 | "nbformat": 4,
261 | "nbformat_minor": 4
262 | }
263 |
--------------------------------------------------------------------------------
/notebooks/data/ram_price.csv:
--------------------------------------------------------------------------------
1 | ,date,price
2 | 0,1957.0,411041792.0
3 | 1,1959.0,67947725.0
4 | 2,1960.0,5242880.0
5 | 3,1965.0,2642412.0
6 | 4,1970.0,734003.0
7 | 5,1973.0,399360.0
8 | 6,1974.0,314573.0
9 | 7,1975.0,421888.0
10 | 8,1975.08,180224.0
11 | 9,1975.25,67584.0
12 | 10,1975.75,49920.0
13 | 11,1976.0,40704.0
14 | 12,1976.17,48960.0
15 | 13,1976.42,23040.0
16 | 14,1976.58,32000.0
17 | 15,1977.08,36800.0
18 | 16,1978.17,28000.0
19 | 17,1978.25,29440.0
20 | 18,1978.33,19200.0
21 | 19,1978.5,24000.0
22 | 20,1978.58,16000.0
23 | 21,1978.75,15200.0
24 | 22,1979.0,10528.0
25 | 23,1979.75,6704.0
26 | 24,1980.0,6480.0
27 | 25,1981.0,8800.0
28 | 26,1981.58,4479.0
29 | 27,1982.0,3520.0
30 | 28,1982.17,4464.0
31 | 29,1982.67,1980.0
32 | 30,1983.0,2396.0
33 | 31,1983.67,1980.0
34 | 32,1984.0,1379.0
35 | 33,1984.58,1331.0
36 | 34,1985.0,880.0
37 | 35,1985.33,720.0
38 | 36,1985.42,550.0
39 | 37,1985.5,420.0
40 | 38,1985.58,350.0
41 | 39,1985.67,300.0
42 | 40,1985.83,300.0
43 | 41,1985.92,300.0
44 | 42,1986.0,300.0
45 | 43,1986.08,300.0
46 | 44,1986.17,300.0
47 | 45,1986.25,300.0
48 | 46,1986.33,190.0
49 | 47,1986.42,190.0
50 | 48,1986.5,190.0
51 | 49,1986.58,190.0
52 | 50,1986.67,190.0
53 | 51,1986.75,190.0
54 | 52,1986.92,190.0
55 | 53,1987.0,176.0
56 | 54,1987.08,176.0
57 | 55,1987.17,157.0
58 | 56,1987.25,154.0
59 | 57,1987.33,154.0
60 | 58,1987.42,154.0
61 | 59,1987.5,154.0
62 | 60,1987.58,154.0
63 | 61,1987.67,163.0
64 | 62,1987.75,133.0
65 | 63,1987.83,163.0
66 | 64,1987.92,163.0
67 | 65,1988.0,163.0
68 | 66,1988.08,182.0
69 | 67,1988.17,199.0
70 | 68,1988.33,199.0
71 | 69,1988.42,199.0
72 | 70,1988.5,505.0
73 | 71,1988.58,505.0
74 | 72,1988.67,505.0
75 | 73,1988.75,505.0
76 | 74,1988.83,505.0
77 | 75,1988.92,505.0
78 | 76,1989.0,505.0
79 | 77,1989.08,505.0
80 | 78,1989.17,505.0
81 | 79,1989.25,505.0
82 | 80,1989.42,344.0
83 | 81,1989.5,197.0
84 | 82,1989.58,188.0
85 | 83,1989.67,188.0
86 | 84,1989.75,128.0
87 | 85,1989.83,117.0
88 | 86,1989.92,113.0
89 | 87,1990.0,106.0
90 | 88,1990.17,98.3
91 | 89,1990.33,98.3
92 | 90,1990.42,89.5
93 | 91,1990.5,82.8
94 | 92,1990.58,81.1
95 | 93,1990.67,71.5
96 | 94,1990.75,59.0
97 | 95,1990.83,51.0
98 | 96,1990.92,45.5
99 | 97,1991.0,44.5
100 | 98,1991.08,44.5
101 | 99,1991.17,45.0
102 | 100,1991.25,45.0
103 | 101,1991.33,45.0
104 | 102,1991.42,43.8
105 | 103,1991.5,43.8
106 | 104,1991.58,41.3
107 | 105,1991.67,46.3
108 | 106,1991.75,45.0
109 | 107,1991.83,39.8
110 | 108,1991.92,39.8
111 | 109,1992.0,36.3
112 | 110,1992.08,36.3
113 | 111,1992.17,36.3
114 | 112,1992.25,34.8
115 | 113,1992.33,30.0
116 | 114,1992.42,32.5
117 | 115,1992.5,33.5
118 | 116,1992.58,31.0
119 | 117,1992.67,27.5
120 | 118,1992.75,26.3
121 | 119,1992.83,26.3
122 | 120,1992.92,26.3
123 | 121,1993.0,33.1
124 | 122,1993.08,27.5
125 | 123,1993.17,27.5
126 | 124,1993.25,27.5
127 | 125,1993.33,27.5
128 | 126,1993.42,30.0
129 | 127,1993.5,30.0
130 | 128,1993.58,30.0
131 | 129,1993.67,30.0
132 | 130,1993.75,36.0
133 | 131,1993.83,39.8
134 | 132,1993.92,35.8
135 | 133,1994.0,35.8
136 | 134,1994.08,35.8
137 | 135,1994.17,36.0
138 | 136,1994.25,37.3
139 | 137,1994.33,37.3
140 | 138,1994.42,37.3
141 | 139,1994.5,38.5
142 | 140,1994.58,37.0
143 | 141,1994.67,34.0
144 | 142,1994.75,33.5
145 | 143,1994.83,32.3
146 | 144,1994.92,32.3
147 | 145,1995.0,32.3
148 | 146,1995.08,32.0
149 | 147,1995.17,32.0
150 | 148,1995.25,31.2
151 | 149,1995.33,31.2
152 | 150,1995.42,31.1
153 | 151,1995.5,31.2
154 | 152,1995.58,30.6
155 | 153,1995.67,33.1
156 | 154,1995.75,33.1
157 | 155,1995.83,30.9
158 | 156,1995.92,30.9
159 | 157,1996.0,29.9
160 | 158,1996.08,28.8
161 | 159,1996.17,26.1
162 | 160,1996.25,24.7
163 | 161,1996.33,17.2
164 | 162,1996.42,14.9
165 | 163,1996.5,11.3
166 | 164,1996.58,9.06
167 | 165,1996.67,8.44
168 | 166,1996.75,8.0
169 | 167,1996.83,5.25
170 | 168,1996.92,5.25
171 | 169,1997.0,4.63
172 | 170,1997.08,3.63
173 | 171,1997.17,3.0
174 | 172,1997.25,3.0
175 | 173,1997.33,3.0
176 | 174,1997.42,3.69
177 | 175,1997.5,4.0
178 | 176,1997.58,4.13
179 | 177,1997.67,3.63
180 | 178,1997.75,3.41
181 | 179,1997.83,3.25
182 | 180,1997.92,2.16
183 | 181,1998.0,2.16
184 | 182,1998.08,0.91
185 | 183,1998.17,0.97
186 | 184,1998.25,1.22
187 | 185,1998.33,1.19
188 | 186,1998.42,0.97
189 | 187,1998.58,1.03
190 | 188,1998.67,0.97
191 | 189,1998.75,1.16
192 | 190,1998.83,0.84
193 | 191,1998.92,0.84
194 | 192,1999.08,1.44
195 | 193,1999.13,0.84
196 | 194,1999.17,1.25
197 | 195,1999.25,1.25
198 | 196,1999.33,0.86
199 | 197,1999.5,0.78
200 | 198,1999.67,0.87
201 | 199,1999.75,1.04
202 | 200,1999.83,1.34
203 | 201,1999.92,2.35
204 | 202,2000.0,1.56
205 | 203,2000.08,1.48
206 | 204,2000.17,1.08
207 | 205,2000.25,0.84
208 | 206,2000.33,0.7
209 | 207,2000.42,0.9
210 | 208,2000.5,0.77
211 | 209,2000.58,0.84
212 | 210,2000.67,1.07
213 | 211,2000.75,1.12
214 | 212,2000.83,1.12
215 | 213,2000.92,0.9
216 | 214,2001.0,0.75
217 | 215,2001.08,0.464
218 | 216,2001.17,0.464
219 | 217,2001.25,0.383
220 | 218,2001.33,0.387
221 | 219,2001.42,0.305
222 | 220,2001.5,0.352
223 | 221,2001.5,0.27
224 | 222,2001.58,0.191
225 | 223,2001.67,0.191
226 | 224,2001.75,0.169
227 | 225,2001.77,0.148
228 | 226,2002.08,0.134
229 | 227,2002.08,0.207
230 | 228,2002.25,0.193
231 | 229,2002.33,0.193
232 | 230,2002.42,0.33
233 | 231,2002.58,0.193
234 | 232,2002.75,0.193
235 | 233,2003.17,0.176
236 | 234,2003.25,0.076
237 | 235,2003.33,0.126
238 | 236,2003.42,0.115
239 | 237,2003.5,0.133
240 | 238,2003.58,0.129
241 | 239,2003.67,0.143
242 | 240,2003.75,0.148
243 | 241,2003.83,0.16
244 | 242,2003.99,0.166
245 | 243,2004.0,0.174
246 | 244,2004.08,0.148
247 | 245,2004.17,0.146
248 | 246,2004.33,0.156
249 | 247,2004.42,0.203
250 | 248,2004.5,0.176
251 | 249,2005.25,0.185
252 | 250,2005.42,0.149
253 | 251,2005.83,0.116
254 | 252,2005.92,0.185
255 | 253,2006.17,0.112
256 | 254,2006.33,0.073
257 | 255,2006.5,0.082
258 | 256,2006.67,0.073
259 | 257,2006.75,0.088
260 | 258,2006.83,0.098
261 | 259,2006.99,0.092
262 | 260,2007.0,0.082
263 | 261,2007.08,0.078
264 | 262,2007.17,0.066
265 | 263,2007.33,0.0464
266 | 264,2007.5,0.0386
267 | 265,2007.67,0.0351
268 | 266,2007.75,0.0322
269 | 267,2007.83,0.0244
270 | 268,2007.92,0.0244
271 | 269,2008.0,0.0232
272 | 270,2008.08,0.022
273 | 271,2008.33,0.022
274 | 272,2008.5,0.0207
275 | 273,2008.58,0.0176
276 | 274,2008.67,0.0146
277 | 275,2008.83,0.011
278 | 276,2008.92,0.0098
279 | 277,2009.0,0.0098
280 | 278,2009.08,0.0107
281 | 279,2009.25,0.0105
282 | 280,2009.42,0.0115
283 | 281,2009.5,0.011
284 | 282,2009.58,0.0127
285 | 283,2009.75,0.0183
286 | 284,2009.92,0.0205
287 | 285,2010.0,0.019
288 | 286,2010.08,0.0202
289 | 287,2010.17,0.0195
290 | 288,2010.33,0.0242
291 | 289,2010.5,0.021
292 | 290,2010.58,0.022
293 | 291,2010.75,0.0171
294 | 292,2010.83,0.0146
295 | 293,2010.92,0.0122
296 | 294,2011.0,0.01
297 | 295,2011.08,0.0103
298 | 296,2011.33,0.01
299 | 297,2011.42,0.0085
300 | 298,2011.67,0.0054
301 | 299,2011.75,0.0051
302 | 300,2012.0,0.0049
303 | 301,2012.08,0.0049
304 | 302,2012.25,0.005
305 | 303,2012.33,0.0049
306 | 304,2012.58,0.0048
307 | 305,2012.67,0.004
308 | 306,2012.83,0.0037
309 | 307,2013.0,0.0043
310 | 308,2013.08,0.0054
311 | 309,2013.33,0.0067
312 | 310,2013.42,0.0061
313 | 311,2013.58,0.0073
314 | 312,2013.67,0.0065
315 | 313,2013.75,0.0082
316 | 314,2013.83,0.0085
317 | 315,2013.92,0.0079
318 | 316,2014.08,0.0095
319 | 317,2014.17,0.0079
320 | 318,2014.25,0.0073
321 | 319,2014.42,0.0079
322 | 320,2014.58,0.0085
323 | 321,2014.67,0.0085
324 | 322,2014.83,0.0085
325 | 323,2015.0,0.0078
326 | 324,2015.08,0.0073
327 | 325,2015.25,0.0061
328 | 326,2015.33,0.0056
329 | 327,2015.5,0.0049
330 | 328,2015.58,0.0045
331 | 329,2015.67,0.0043
332 | 330,2015.75,0.0042
333 | 331,2015.83,0.0038
334 | 332,2015.92,0.0037
335 |
--------------------------------------------------------------------------------
/notebooks/05 - Trees.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Trees"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import numpy as np\n",
17 | "import matplotlib.pyplot as plt\n",
18 | "import sklearn\n",
19 | "sklearn.set_config(print_changed_only=True)\n",
20 | "import pandas as pd\n",
21 | "from sklearn.model_selection import train_test_split\n",
22 | "from sklearn.pipeline import make_pipeline\n",
23 | "from sklearn.preprocessing import scale, StandardScaler"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "from sklearn.datasets import load_breast_cancer\n",
33 | "cancer = load_breast_cancer()"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "print(cancer.DESCR)"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "X_train, X_test, y_train, y_test = train_test_split(\n",
52 | " cancer.data, cancer.target, stratify=cancer.target, random_state=0)"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "# Tree Visualization"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "from sklearn.tree import DecisionTreeClassifier, plot_tree\n",
69 | "tree = DecisionTreeClassifier(max_depth=2)\n",
70 | "tree.fit(X_train, y_train)"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "plt.figure(dpi=200)\n",
80 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "# Parameter Tuning"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "tree = DecisionTreeClassifier().fit(X_train, y_train)\n",
97 | "plt.figure(figsize=(15, 5))\n",
98 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "tree = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)\n",
108 | "plt.figure(figsize=(15, 5))\n",
109 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "tree = DecisionTreeClassifier(max_leaf_nodes=8).fit(X_train, y_train)\n",
119 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": [
128 | "tree = DecisionTreeClassifier(min_samples_split=50).fit(X_train, y_train)\n",
129 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "tree = DecisionTreeClassifier(min_impurity_decrease=.01).fit(X_train, y_train)\n",
139 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "from sklearn.model_selection import GridSearchCV\n",
149 | "param_grid = {'max_depth':range(1, 7)}\n",
150 | "grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid=param_grid, cv=10)\n",
151 | "grid.fit(X_train, y_train)"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {},
158 | "outputs": [],
159 | "source": [
160 | "from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit\n",
161 | "param_grid = {'max_depth':range(1, 7)}\n",
162 | "grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid=param_grid,\n",
163 | " cv=StratifiedShuffleSplit(100), return_train_score=True)\n",
164 | "grid.fit(X_train, y_train)"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": null,
170 | "metadata": {},
171 | "outputs": [],
172 | "source": [
173 | "scores = pd.DataFrame(grid.cv_results_)\n",
174 | "scores.plot(x='param_max_depth', y=['mean_train_score', 'mean_test_score'], ax=plt.gca())\n",
175 | "plt.legend(loc=(1, 0))"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "from sklearn.model_selection import GridSearchCV\n",
185 | "param_grid = {'max_leaf_nodes': range(2, 20)}\n",
186 | "grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid=param_grid,\n",
187 | " cv=StratifiedShuffleSplit(100, random_state=1),\n",
188 | " return_train_score=True)\n",
189 | "grid.fit(X_train, y_train)\n",
190 | "\n",
191 | "scores = pd.DataFrame(grid.cv_results_)\n",
192 | "scores.plot(x='param_max_leaf_nodes', y=['mean_train_score', 'mean_test_score'], ax=plt.gca())\n",
193 | "plt.legend(loc=(1, 0))"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {},
200 | "outputs": [],
201 | "source": [
202 | "scores = pd.DataFrame(grid.cv_results_)\n",
203 | "scores.plot(x='param_max_leaf_nodes', y='mean_train_score', yerr='std_train_score', ax=plt.gca())\n",
204 | "scores.plot(x='param_max_leaf_nodes', y='mean_test_score', yerr='std_test_score', ax=plt.gca())"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "grid.best_params_"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "plot_tree(grid.best_estimator_, feature_names=cancer.feature_names, filled=True)"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": null,
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "pd.Series(grid.best_estimator_.feature_importances_,\n",
232 | " index=cancer.feature_names).plot(kind=\"barh\")"
233 | ]
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {},
238 | "source": [
239 | "# Exercise\n",
240 | "Apply a decision tree to the \"adult\" dataset and visualize it.\n",
241 | "\n",
242 | "Tune parameters with grid-search; try at least max_leaf_nodes and max_depth, but separately.\n",
243 | "\n",
244 | "Visualize the resulting tree and its feature importances."
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": null,
250 | "metadata": {},
251 | "outputs": [],
252 | "source": []
253 | }
254 | ],
255 | "metadata": {
256 | "anaconda-cloud": {},
257 | "kernelspec": {
258 | "display_name": "root *",
259 | "language": "python",
260 | "name": "conda-root-py"
261 | },
262 | "language_info": {
263 | "codemirror_mode": {
264 | "name": "ipython",
265 | "version": 3
266 | },
267 | "file_extension": ".py",
268 | "mimetype": "text/x-python",
269 | "name": "python",
270 | "nbconvert_exporter": "python",
271 | "pygments_lexer": "ipython3",
272 | "version": "3.7.3"
273 | }
274 | },
275 | "nbformat": 4,
276 | "nbformat_minor": 4
277 | }
278 |
--------------------------------------------------------------------------------
/notebooks/03 - Linear Models for Regression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Linear Models for Regression"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import matplotlib.pyplot as plt\n",
17 | "import numpy as np\n",
18 | "import sklearn\n",
19 | "sklearn.set_config(print_changed_only=True)"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "from sklearn.linear_model import Ridge, LinearRegression"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "from sklearn.model_selection import cross_val_score"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "from sklearn.datasets import load_boston\n",
47 | "boston = load_boston()"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "X, y = boston.data, boston.target"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "print(boston.DESCR)"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "X.shape"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "fig, axes = plt.subplots(3, 5, figsize=(20, 10))\n",
84 | "for i, ax in enumerate(axes.ravel()):\n",
85 | " if i > 12:\n",
86 | " ax.set_visible(False)\n",
87 | " continue\n",
88 | " ax.plot(X[:, i], y, 'o', alpha=.5)\n",
89 | " ax.set_title(\"{}: {}\".format(i, boston.feature_names[i]))\n",
90 | " ax.set_ylabel(\"MEDV\")"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "metadata": {},
97 | "outputs": [],
98 | "source": [
99 | "print(X.shape)\n",
100 | "print(y.shape)"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "from sklearn.model_selection import train_test_split\n",
110 | "X_train, X_test, y_train, y_test = train_test_split(\n",
111 | " X, y, random_state=42)"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "np.mean(cross_val_score(LinearRegression(),\n",
121 | " X_train, y_train, cv=10))"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "np.mean(cross_val_score(\n",
131 | " Ridge(), X_train, y_train, cv=10))"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": null,
137 | "metadata": {},
138 | "outputs": [],
139 | "source": [
140 | "from sklearn.model_selection import GridSearchCV\n",
141 | "param_grid = {'alpha': np.logspace(-3, 3, 14)}\n",
142 | "print(param_grid)"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {},
149 | "outputs": [],
150 | "source": [
151 | "grid = GridSearchCV(Ridge(), param_grid, cv=10, return_train_score=True)\n",
152 | "grid.fit(X_train, y_train)"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {},
159 | "outputs": [],
160 | "source": [
161 | "import pandas as pd\n",
162 | "plt.figure(dpi=200)\n",
163 | "results = pd.DataFrame(grid.cv_results_)\n",
164 | "results.plot('param_alpha', 'mean_train_score', ax=plt.gca())\n",
165 | "results.plot('param_alpha', 'mean_test_score', ax=plt.gca())\n",
166 | "\n",
167 | "plt.legend()\n",
168 | "plt.xscale(\"log\")"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "metadata": {},
175 | "outputs": [],
176 | "source": [
177 | "from sklearn.preprocessing import PolynomialFeatures, scale\n",
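178 | "# NOTE: shortcut -- scale() is applied to all of X before the train/test split, so test-set statistics leak into preprocessing; the proper approach is a Pipeline whose scaler is fit on the training data only\n",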
179 | "X_poly = PolynomialFeatures(include_bias=False).fit_transform(scale(X))\n",
180 | "print(X_poly.shape)\n",
181 | "X_train, X_test, y_train, y_test = train_test_split(X_poly, y, random_state=42)"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": null,
187 | "metadata": {},
188 | "outputs": [],
189 | "source": [
190 | "np.mean(cross_val_score(LinearRegression(),\n",
191 | " X_train, y_train, cv=10))"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": null,
197 | "metadata": {},
198 | "outputs": [],
199 | "source": [
200 | "np.mean(cross_val_score(Ridge(),\n",
201 | " X_train, y_train, cv=10))"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "metadata": {},
208 | "outputs": [],
209 | "source": [
210 | "grid = GridSearchCV(Ridge(), param_grid, cv=10, return_train_score=True)\n",
211 | "grid.fit(X_train, y_train)"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": null,
217 | "metadata": {},
218 | "outputs": [],
219 | "source": [
220 | "results = pd.DataFrame(grid.cv_results_)\n",
221 | "\n",
222 | "results.plot('param_alpha', 'mean_train_score', ax=plt.gca())\n",
223 | "results.plot('param_alpha', 'mean_test_score', ax=plt.gca())\n",
224 | "plt.legend()\n",
225 | "plt.xscale(\"log\")"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": null,
231 | "metadata": {},
232 | "outputs": [],
233 | "source": [
234 | "print(grid.best_params_)\n",
235 | "print(grid.best_score_)"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": null,
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "lr = LinearRegression().fit(X_train, y_train)\n",
245 | "plt.scatter(range(X_poly.shape[1]), lr.coef_, c=np.sign(lr.coef_), cmap=\"bwr_r\")"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": null,
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "ridge = grid.best_estimator_\n",
255 | "plt.scatter(range(X_poly.shape[1]), ridge.coef_, c=np.sign(ridge.coef_), cmap=\"bwr_r\")"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": null,
261 | "metadata": {},
262 | "outputs": [],
263 | "source": [
264 | "ridge100 = Ridge(alpha=100).fit(X_train, y_train)\n",
265 | "ridge1 = Ridge(alpha=1).fit(X_train, y_train)\n",
266 | "plt.figure(figsize=(8, 4))\n",
267 | "\n",
268 | "plt.plot(ridge1.coef_, 'o', label=\"alpha=1\")\n",
269 | "plt.plot(ridge.coef_, 'o', label=\"alpha=14\")\n",
270 | "plt.plot(ridge100.coef_, 'o', label=\"alpha=100\")\n",
271 | "plt.legend()"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": null,
277 | "metadata": {},
278 | "outputs": [],
279 | "source": [
280 | "from sklearn.linear_model import Lasso\n",
281 | "\n",
282 | "lasso = Lasso().fit(X_train, y_train)\n",
283 | "print(\"Training set score: {:.2f}\".format(lasso.score(X_train, y_train)))\n",
284 | "print(\"Test set score: {:.2f}\".format(lasso.score(X_test, y_test)))\n",
285 | "print(\"Number of features used:\", np.sum(lasso.coef_ != 0))"
286 | ]
287 | },
288 | {
289 | "cell_type": "markdown",
290 | "metadata": {},
291 | "source": [
292 | "# Exercise\n",
293 | "Load the diabetes dataset using ``sklearn.datasets.load_diabetes``. Apply ``LinearRegression``, ``Ridge`` and ``Lasso`` and visualize the coefficients. Try polynomial features."
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": null,
299 | "metadata": {},
300 | "outputs": [],
301 | "source": [
302 | "# %load solutions/linear_models_diabetes.py"
303 | ]
304 | }
305 | ],
306 | "metadata": {
307 | "anaconda-cloud": {},
308 | "kernelspec": {
309 | "display_name": "root *",
310 | "language": "python",
311 | "name": "conda-root-py"
312 | },
313 | "language_info": {
314 | "codemirror_mode": {
315 | "name": "ipython",
316 | "version": 3
317 | },
318 | "file_extension": ".py",
319 | "mimetype": "text/x-python",
320 | "name": "python",
321 | "nbconvert_exporter": "python",
322 | "pygments_lexer": "ipython3",
323 | "version": "3.7.3"
324 | }
325 | },
326 | "nbformat": 4,
327 | "nbformat_minor": 4
328 | }
329 |
--------------------------------------------------------------------------------
/slides/04-linear-models-classification.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Linear Models for Classification
5 |
6 |
7 |
12 |
13 |
14 |
548 |
549 |
550 |
551 |
578 |
579 |
580 |
--------------------------------------------------------------------------------
/slides/06-gradient-boosting.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Gradient Boosting
5 |
6 |
7 |
12 |
13 |
14 |
770 |
771 |
772 |
773 |
800 |
801 |
802 |
--------------------------------------------------------------------------------
/slides/03-linear-models-regression.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Linear Models for Regression
5 |
6 |
7 |
12 |
13 |
14 |
747 |
748 |
749 |
750 |
773 |
774 |
775 |
--------------------------------------------------------------------------------
/slides/02-cross-validation-grid-search.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Cross Validation and Grid Search
5 |
6 |
7 |
12 |
13 |
14 |
800 |
801 |
802 |
803 |
831 |
832 |
833 |
--------------------------------------------------------------------------------