├── .nojekyll ├── images ├── check_env-1.png ├── check_env-2.png └── download-repo.png ├── slides ├── images │ ├── PDSH.png │ ├── esl.png │ ├── imlp.png │ ├── l1_kink.png │ ├── logit.png │ ├── api-table.png │ ├── kfold_cv.png │ ├── l1l2ball.png │ ├── l2_l1_l0.png │ ├── ovr_lines.png │ ├── svm_or_lr.png │ ├── ames_scaling.png │ ├── binary_loss.png │ ├── gradient_2d.png │ ├── gradient_3d.png │ ├── group_kfold.png │ ├── max_depth_1.png │ ├── max_depth_4.png │ ├── max_margin.png │ ├── no_pruning.png │ ├── ram_prices.png │ ├── sklearn-docs.png │ ├── sklearn_logo.png │ ├── time_series1.png │ ├── time_series2.png │ ├── time_series3.png │ ├── tree_pruned.png │ ├── triazine_bar.png │ ├── average_voting.png │ ├── boston_scaling.png │ ├── feature_sample.png │ ├── grid_ccp_alpha.png │ ├── grid_max_depth.png │ ├── instability_1.png │ ├── instability_2.png │ ├── max_margin_C_1.png │ ├── mpl_tree_plot.png │ ├── ovr_boundaries.png │ ├── pruning_alpha.png │ ├── random_forest.png │ ├── splits_kinect.png │ ├── stratified_cv.png │ ├── time_series_cv.png │ ├── bias_vs_variance.png │ ├── binning_quantiles.png │ ├── bootstrap_sample.png │ ├── elasticnet_search.png │ ├── grad_boost_depth2.png │ ├── grad_boost_term_1.png │ ├── grad_boost_term_2.png │ ├── grad_boost_term_3.png │ ├── graphviz_jupyter.png │ ├── graphviz_source.png │ ├── knn_boundary_k1.png │ ├── knn_boundary_k3.png │ ├── l1l2_elasticnet.png │ ├── max_leaf_nodes_8.png │ ├── max_margin_C_0.1.png │ ├── ram_prices_test.png │ ├── ram_prices_train.png │ ├── robust_regression.png │ ├── shuffle_split_cv.png │ ├── supervised-ml-api.png │ ├── threefold_split.png │ ├── train-test-split.png │ ├── tree_illustration.png │ ├── tree_importances.png │ ├── tree_prediction.png │ ├── voting_classifier.png │ ├── forest_importances.png │ ├── grid_max_leaf_nodes.png │ ├── gridsearch_workflow.png │ ├── l1l2ball_intersect.png │ ├── lasso_alpha_search.png │ ├── lasso_coefficients.png │ ├── ridge_alpha_search.png │ ├── ridge_coefficients.png │ ├── 
xgboost_hist_bench.png │ ├── ames_housing_scatter.png │ ├── boston_housing_scatter.png │ ├── cross_validation_new.png │ ├── hist_gradient_boosting.png │ ├── knn_boundary_dataset.png │ ├── knn_boundary_varying_k.png │ ├── knn_model_complexity.png │ ├── lasso_alpha_triazine.png │ ├── linear_boundary_vector.png │ ├── linear_regression_1d.png │ ├── logreg_regularization.png │ ├── lr_coefficients_large.png │ ├── matrix-representation.png │ ├── min_samples_split_50.png │ ├── ridge_alpha_triazine.png │ ├── ridge_alpha_triazines.png │ ├── ridge_learning_curve.png │ ├── supervised-ml-workflow.png │ ├── train_test_split_new.png │ ├── xgboost_sklearn_bench.png │ ├── gradient_learning_rates.png │ ├── grid_search_n_neighbors.png │ ├── knn_boundary_test_points.png │ ├── ridge_alpha_search_poly.png │ ├── ridge_coefficients_alpha.png │ ├── ridge_coefficients_large.png │ ├── grad_boost_regression_steps.png │ ├── linear_svm_regularization.png │ ├── repeated_stratified_kfold.png │ ├── ridge_alpha_search_cv_runs.png │ ├── time_series_walk_forward_cv.png │ ├── train_test_validation_split.png │ ├── tree_building_iteration_1.png │ ├── tree_building_iteration_2.png │ ├── tree_building_iteration_9.png │ ├── grid_search_cross_validation.png │ ├── overfitting_validation_set_1.png │ ├── overfitting_validation_set_2.png │ ├── overfitting_validation_set_3.png │ ├── overfitting_validation_set_4.png │ ├── grid_search_cross_validation_new.png │ ├── train_test_set_2d_classification.png │ ├── overfitting_underfitting_cartoon_full.png │ ├── overfitting_underfitting_cartoon_train.png │ └── overfitting_underfitting_cartoon_generalization.png ├── 01-reminder-supervised-learning.html ├── style.css ├── 04-linear-models-classification.html ├── 06-gradient-boosting.html ├── 03-linear-models-regression.html └── 02-cross-validation-grid-search.html ├── .gitignore ├── notebooks ├── solutions │ ├── grid_search_k_neighbors.py │ ├── bike_regression.py │ ├── adult_classification.py │ └── linear_models_diabetes.py 
├── 04 - Linear Models for Classification.ipynb ├── 06 - Gradient Boosting.ipynb ├── data │ ├── bank-campaign-desc.text │ └── ram_price.csv ├── 01 - Review of Supervised Learning.ipynb ├── 02 - Cross-validation and Grid Search.ipynb ├── 05 - Trees.ipynb └── 03 - Linear Models for Regression.ipynb ├── LICENSE ├── check_env.ipynb └── README.md /.nojekyll: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /images/check_env-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/images/check_env-1.png -------------------------------------------------------------------------------- /images/check_env-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/images/check_env-2.png -------------------------------------------------------------------------------- /slides/images/PDSH.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/PDSH.png -------------------------------------------------------------------------------- /slides/images/esl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/esl.png -------------------------------------------------------------------------------- /slides/images/imlp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/imlp.png -------------------------------------------------------------------------------- /images/download-repo.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/images/download-repo.png -------------------------------------------------------------------------------- /slides/images/l1_kink.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/l1_kink.png -------------------------------------------------------------------------------- /slides/images/logit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/logit.png -------------------------------------------------------------------------------- /slides/images/api-table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/api-table.png -------------------------------------------------------------------------------- /slides/images/kfold_cv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/kfold_cv.png -------------------------------------------------------------------------------- /slides/images/l1l2ball.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/l1l2ball.png -------------------------------------------------------------------------------- /slides/images/l2_l1_l0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/l2_l1_l0.png -------------------------------------------------------------------------------- /slides/images/ovr_lines.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ovr_lines.png -------------------------------------------------------------------------------- /slides/images/svm_or_lr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/svm_or_lr.png -------------------------------------------------------------------------------- /slides/images/ames_scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ames_scaling.png -------------------------------------------------------------------------------- /slides/images/binary_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/binary_loss.png -------------------------------------------------------------------------------- /slides/images/gradient_2d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/gradient_2d.png -------------------------------------------------------------------------------- /slides/images/gradient_3d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/gradient_3d.png -------------------------------------------------------------------------------- /slides/images/group_kfold.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/group_kfold.png 
-------------------------------------------------------------------------------- /slides/images/max_depth_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/max_depth_1.png -------------------------------------------------------------------------------- /slides/images/max_depth_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/max_depth_4.png -------------------------------------------------------------------------------- /slides/images/max_margin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/max_margin.png -------------------------------------------------------------------------------- /slides/images/no_pruning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/no_pruning.png -------------------------------------------------------------------------------- /slides/images/ram_prices.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ram_prices.png -------------------------------------------------------------------------------- /slides/images/sklearn-docs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/sklearn-docs.png -------------------------------------------------------------------------------- /slides/images/sklearn_logo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/sklearn_logo.png -------------------------------------------------------------------------------- /slides/images/time_series1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/time_series1.png -------------------------------------------------------------------------------- /slides/images/time_series2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/time_series2.png -------------------------------------------------------------------------------- /slides/images/time_series3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/time_series3.png -------------------------------------------------------------------------------- /slides/images/tree_pruned.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/tree_pruned.png -------------------------------------------------------------------------------- /slides/images/triazine_bar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/triazine_bar.png -------------------------------------------------------------------------------- /slides/images/average_voting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/average_voting.png -------------------------------------------------------------------------------- /slides/images/boston_scaling.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/boston_scaling.png -------------------------------------------------------------------------------- /slides/images/feature_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/feature_sample.png -------------------------------------------------------------------------------- /slides/images/grid_ccp_alpha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grid_ccp_alpha.png -------------------------------------------------------------------------------- /slides/images/grid_max_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grid_max_depth.png -------------------------------------------------------------------------------- /slides/images/instability_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/instability_1.png -------------------------------------------------------------------------------- /slides/images/instability_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/instability_2.png -------------------------------------------------------------------------------- /slides/images/max_margin_C_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/max_margin_C_1.png 
-------------------------------------------------------------------------------- /slides/images/mpl_tree_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/mpl_tree_plot.png -------------------------------------------------------------------------------- /slides/images/ovr_boundaries.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ovr_boundaries.png -------------------------------------------------------------------------------- /slides/images/pruning_alpha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/pruning_alpha.png -------------------------------------------------------------------------------- /slides/images/random_forest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/random_forest.png -------------------------------------------------------------------------------- /slides/images/splits_kinect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/splits_kinect.png -------------------------------------------------------------------------------- /slides/images/stratified_cv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/stratified_cv.png -------------------------------------------------------------------------------- /slides/images/time_series_cv.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/time_series_cv.png -------------------------------------------------------------------------------- /slides/images/bias_vs_variance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/bias_vs_variance.png -------------------------------------------------------------------------------- /slides/images/binning_quantiles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/binning_quantiles.png -------------------------------------------------------------------------------- /slides/images/bootstrap_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/bootstrap_sample.png -------------------------------------------------------------------------------- /slides/images/elasticnet_search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/elasticnet_search.png -------------------------------------------------------------------------------- /slides/images/grad_boost_depth2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grad_boost_depth2.png -------------------------------------------------------------------------------- /slides/images/grad_boost_term_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grad_boost_term_1.png -------------------------------------------------------------------------------- 
/slides/images/grad_boost_term_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grad_boost_term_2.png -------------------------------------------------------------------------------- /slides/images/grad_boost_term_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grad_boost_term_3.png -------------------------------------------------------------------------------- /slides/images/graphviz_jupyter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/graphviz_jupyter.png -------------------------------------------------------------------------------- /slides/images/graphviz_source.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/graphviz_source.png -------------------------------------------------------------------------------- /slides/images/knn_boundary_k1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/knn_boundary_k1.png -------------------------------------------------------------------------------- /slides/images/knn_boundary_k3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/knn_boundary_k3.png -------------------------------------------------------------------------------- /slides/images/l1l2_elasticnet.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/l1l2_elasticnet.png -------------------------------------------------------------------------------- /slides/images/max_leaf_nodes_8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/max_leaf_nodes_8.png -------------------------------------------------------------------------------- /slides/images/max_margin_C_0.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/max_margin_C_0.1.png -------------------------------------------------------------------------------- /slides/images/ram_prices_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ram_prices_test.png -------------------------------------------------------------------------------- /slides/images/ram_prices_train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ram_prices_train.png -------------------------------------------------------------------------------- /slides/images/robust_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/robust_regression.png -------------------------------------------------------------------------------- /slides/images/shuffle_split_cv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/shuffle_split_cv.png -------------------------------------------------------------------------------- 
/slides/images/supervised-ml-api.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/supervised-ml-api.png -------------------------------------------------------------------------------- /slides/images/threefold_split.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/threefold_split.png -------------------------------------------------------------------------------- /slides/images/train-test-split.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/train-test-split.png -------------------------------------------------------------------------------- /slides/images/tree_illustration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/tree_illustration.png -------------------------------------------------------------------------------- /slides/images/tree_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/tree_importances.png -------------------------------------------------------------------------------- /slides/images/tree_prediction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/tree_prediction.png -------------------------------------------------------------------------------- /slides/images/voting_classifier.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/voting_classifier.png -------------------------------------------------------------------------------- /slides/images/forest_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/forest_importances.png -------------------------------------------------------------------------------- /slides/images/grid_max_leaf_nodes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grid_max_leaf_nodes.png -------------------------------------------------------------------------------- /slides/images/gridsearch_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/gridsearch_workflow.png -------------------------------------------------------------------------------- /slides/images/l1l2ball_intersect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/l1l2ball_intersect.png -------------------------------------------------------------------------------- /slides/images/lasso_alpha_search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/lasso_alpha_search.png -------------------------------------------------------------------------------- /slides/images/lasso_coefficients.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/lasso_coefficients.png 
-------------------------------------------------------------------------------- /slides/images/ridge_alpha_search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_alpha_search.png -------------------------------------------------------------------------------- /slides/images/ridge_coefficients.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_coefficients.png -------------------------------------------------------------------------------- /slides/images/xgboost_hist_bench.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/xgboost_hist_bench.png -------------------------------------------------------------------------------- /slides/images/ames_housing_scatter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ames_housing_scatter.png -------------------------------------------------------------------------------- /slides/images/boston_housing_scatter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/boston_housing_scatter.png -------------------------------------------------------------------------------- /slides/images/cross_validation_new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/cross_validation_new.png -------------------------------------------------------------------------------- /slides/images/hist_gradient_boosting.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/hist_gradient_boosting.png -------------------------------------------------------------------------------- /slides/images/knn_boundary_dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/knn_boundary_dataset.png -------------------------------------------------------------------------------- /slides/images/knn_boundary_varying_k.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/knn_boundary_varying_k.png -------------------------------------------------------------------------------- /slides/images/knn_model_complexity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/knn_model_complexity.png -------------------------------------------------------------------------------- /slides/images/lasso_alpha_triazine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/lasso_alpha_triazine.png -------------------------------------------------------------------------------- /slides/images/linear_boundary_vector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/linear_boundary_vector.png -------------------------------------------------------------------------------- /slides/images/linear_regression_1d.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/linear_regression_1d.png -------------------------------------------------------------------------------- /slides/images/logreg_regularization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/logreg_regularization.png -------------------------------------------------------------------------------- /slides/images/lr_coefficients_large.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/lr_coefficients_large.png -------------------------------------------------------------------------------- /slides/images/matrix-representation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/matrix-representation.png -------------------------------------------------------------------------------- /slides/images/min_samples_split_50.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/min_samples_split_50.png -------------------------------------------------------------------------------- /slides/images/ridge_alpha_triazine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_alpha_triazine.png -------------------------------------------------------------------------------- /slides/images/ridge_alpha_triazines.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_alpha_triazines.png 
-------------------------------------------------------------------------------- /slides/images/ridge_learning_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_learning_curve.png -------------------------------------------------------------------------------- /slides/images/supervised-ml-workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/supervised-ml-workflow.png -------------------------------------------------------------------------------- /slides/images/train_test_split_new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/train_test_split_new.png -------------------------------------------------------------------------------- /slides/images/xgboost_sklearn_bench.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/xgboost_sklearn_bench.png -------------------------------------------------------------------------------- /slides/images/gradient_learning_rates.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/gradient_learning_rates.png -------------------------------------------------------------------------------- /slides/images/grid_search_n_neighbors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grid_search_n_neighbors.png -------------------------------------------------------------------------------- /slides/images/knn_boundary_test_points.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/knn_boundary_test_points.png -------------------------------------------------------------------------------- /slides/images/ridge_alpha_search_poly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_alpha_search_poly.png -------------------------------------------------------------------------------- /slides/images/ridge_coefficients_alpha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_coefficients_alpha.png -------------------------------------------------------------------------------- /slides/images/ridge_coefficients_large.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_coefficients_large.png -------------------------------------------------------------------------------- /slides/images/grad_boost_regression_steps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grad_boost_regression_steps.png -------------------------------------------------------------------------------- /slides/images/linear_svm_regularization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/linear_svm_regularization.png -------------------------------------------------------------------------------- /slides/images/repeated_stratified_kfold.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/repeated_stratified_kfold.png -------------------------------------------------------------------------------- /slides/images/ridge_alpha_search_cv_runs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/ridge_alpha_search_cv_runs.png -------------------------------------------------------------------------------- /slides/images/time_series_walk_forward_cv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/time_series_walk_forward_cv.png -------------------------------------------------------------------------------- /slides/images/train_test_validation_split.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/train_test_validation_split.png -------------------------------------------------------------------------------- /slides/images/tree_building_iteration_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/tree_building_iteration_1.png -------------------------------------------------------------------------------- /slides/images/tree_building_iteration_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/tree_building_iteration_2.png -------------------------------------------------------------------------------- /slides/images/tree_building_iteration_9.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/tree_building_iteration_9.png -------------------------------------------------------------------------------- /slides/images/grid_search_cross_validation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grid_search_cross_validation.png -------------------------------------------------------------------------------- /slides/images/overfitting_validation_set_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/overfitting_validation_set_1.png -------------------------------------------------------------------------------- /slides/images/overfitting_validation_set_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/overfitting_validation_set_2.png -------------------------------------------------------------------------------- /slides/images/overfitting_validation_set_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/overfitting_validation_set_3.png -------------------------------------------------------------------------------- /slides/images/overfitting_validation_set_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/overfitting_validation_set_4.png -------------------------------------------------------------------------------- /slides/images/grid_search_cross_validation_new.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/grid_search_cross_validation_new.png -------------------------------------------------------------------------------- /slides/images/train_test_set_2d_classification.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/train_test_set_2d_classification.png -------------------------------------------------------------------------------- /slides/images/overfitting_underfitting_cartoon_full.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/overfitting_underfitting_cartoon_full.png -------------------------------------------------------------------------------- /slides/images/overfitting_underfitting_cartoon_train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/overfitting_underfitting_cartoon_train.png -------------------------------------------------------------------------------- /slides/images/overfitting_underfitting_cartoon_generalization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-2-of-4/HEAD/slides/images/overfitting_underfitting_cartoon_generalization.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # exclude datasets and externals 2 | notebooks/datasets 3 | notebooks/joblib/ 4 | 5 | # exclude temporary files 6 | .ipynb_checkpoints 7 | .DS_Store 8 | gmon.out 9 | __pycache__ 10 | *.pyc 11 | *.o 12 | *.so 13 | *.gcno 14 | *.swp 15 | *.egg-info 16 | *.egg 17 | *~ 18 | build 19 | dist 20 | lib/test 21 | doc/_build 22
| *env 23 | *ENV 24 | .idea 25 | -------------------------------------------------------------------------------- /notebooks/solutions/grid_search_k_neighbors.py: -------------------------------------------------------------------------------- 1 | from sklearn.neighbors import KNeighborsClassifier 2 | 3 | param_grid = {'n_neighbors': [1, 3, 5, 7, 10]} 4 | 5 | grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, 6 | return_train_score=True) 7 | grid.fit(X_train, y_train) 8 | 9 | print("best parameters: %s" % grid.best_params_) 10 | print("Training set accuracy: %s" % grid.score(X_train, y_train)) 11 | print("Test set accuracy: %s" % grid.score(X_test, y_test)) 12 | results = grid.cv_results_ 13 | plt.plot(param_grid['n_neighbors'], results['mean_train_score'], label="train") 14 | plt.plot(param_grid['n_neighbors'], results['mean_test_score'], label="test") 15 | plt.legend() 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Andreas Mueller 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /notebooks/solutions/bike_regression.py: -------------------------------------------------------------------------------- 1 | data = pd.read_csv("data/bike_day_raw.csv") 2 | X = data.drop("cnt", axis=1) 3 | y = data.cnt 4 | 5 | display(data.head()) 6 | 7 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 8 | 9 | from sklearn.linear_model import LinearRegression 10 | 11 | # for other models you should scale here 12 | 13 | lr = LinearRegression().fit(X_train, y_train) 14 | 15 | print(lr.score(X_train, y_train)) 16 | 17 | print(lr.score(X_test, y_test)) 18 | 19 | from sklearn.metrics import mean_squared_error 20 | y_pred = lr.predict(X_test) 21 | print(mean_squared_error(y_test, y_pred)) 22 | 23 | 24 | from sklearn.compose import make_column_transformer 25 | from sklearn.preprocessing import OneHotEncoder 26 | ohe = make_column_transformer( 27 | (OneHotEncoder(sparse=False), X_train.columns[:6]), 28 | remainder='passthrough') 29 | 30 | X_train_ohe = ohe.fit_transform(X_train) 31 | X_test_ohe = ohe.transform(X_test) 32 | 33 | X_train.shape 34 | 35 | X_train_ohe.shape 36 | 37 | 38 | lr = LinearRegression().fit(X_train_ohe, y_train) 39 | 40 | print(lr.score(X_train_ohe, y_train)) 41 | 42 | print(lr.score(X_test_ohe, y_test)) 43 | 44 | from sklearn.metrics import mean_squared_error 45 | y_pred = lr.predict(X_test_ohe) 46 | -------------------------------------------------------------------------------- /notebooks/04 - Linear Models for Classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": 
"markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Linear Models for Classification" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Exercise\n", 15 | "Load and preprocess the adult data as before.\n", 16 | "include dummy encoding and scaling\n", 17 | "Learn a logistic regression model and visualize the coefficients.\n", 18 | "Then grid-search the regularization parameter C.\n", 19 | "Compare the coefficients of the best model with the coefficients of a model with more regularization." 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import pandas as pd\n", 29 | "adult = pd.read_csv(\"data/adult.csv\", index_col=0)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "# %load solutions/adult_classification.py" 39 | ] 40 | } 41 | ], 42 | "metadata": { 43 | "anaconda-cloud": {}, 44 | "kernelspec": { 45 | "display_name": "Python 3", 46 | "language": "python", 47 | "name": "python3" 48 | }, 49 | "language_info": { 50 | "codemirror_mode": { 51 | "name": "ipython", 52 | "version": 3 53 | }, 54 | "file_extension": ".py", 55 | "mimetype": "text/x-python", 56 | "name": "python", 57 | "nbconvert_exporter": "python", 58 | "pygments_lexer": "ipython3", 59 | "version": "3.7.3" 60 | } 61 | }, 62 | "nbformat": 4, 63 | "nbformat_minor": 4 64 | } 65 | -------------------------------------------------------------------------------- /notebooks/solutions/adult_classification.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | data = pd.read_csv("data/adult.csv", index_col=0) 3 | data.head() 4 | 5 | income = data.income 6 | data_features = data.drop("income", axis=1) 7 | 8 | display(data_features.head()) 9 | 10 | 11 | ### one hot encode data 12 | data_one_hot = pd.get_dummies(data_features) 13 | 
data_one_hot.head() 14 | 15 | 16 | ### Preprocessing 17 | from sklearn.preprocessing import StandardScaler 18 | from sklearn.model_selection import train_test_split 19 | X_train, X_test, y_train, y_test = train_test_split(data_one_hot, income) 20 | 21 | scaler = StandardScaler().fit(X_train) 22 | X_train_scaled = scaler.transform(X_train) 23 | 24 | ### Cross-validation with default parameters 25 | from sklearn.model_selection import cross_val_score 26 | from sklearn.linear_model import LogisticRegression 27 | 28 | scores = cross_val_score(LogisticRegression(), X_train_scaled, y_train) 29 | print(scores.mean()) 30 | 31 | 32 | ### do grid search 33 | 34 | import numpy as np 35 | 36 | param_grid = {'C': np.logspace(-3, 3, 7)} 37 | param_grid 38 | 39 | from sklearn.model_selection import GridSearchCV 40 | grid = GridSearchCV(LogisticRegression(), param_grid, 41 | return_train_score=True) 42 | 43 | grid.fit(X_train_scaled, y_train) 44 | 45 | grid.best_params_ 46 | grid.best_score_ 47 | 48 | # some visualization 49 | 50 | import pandas as pd 51 | %matplotlib inline 52 | res = pd.DataFrame(grid.cv_results_) 53 | res.mean_test_score.plot() 54 | res.mean_train_score.plot() 55 | import matplotlib.pyplot as plt 56 | plt.xscale("log") 57 | 58 | grid.score(X_test, y_test) 59 | 60 | important = np.argsort(np.abs(grid.best_estimator_.coef_)).ravel() 61 | 62 | plt.barh(range(10), grid.best_estimator_.coef_.ravel()[important[-10:]]) 63 | plt.yticks(range(10), X_train.columns[important[-10:]]); 64 | -------------------------------------------------------------------------------- /check_env.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from distutils.version import LooseVersion as Version\n", 10 | "import sys\n", 11 | "\n", 12 | "\n", 13 | "OK = '\\x1b[42m[ OK ]\\x1b[0m'\n", 14 | "FAIL = 
\"\\x1b[41m[FAIL]\\x1b[0m\"\n", 15 | "\n", 16 | "try:\n", 17 | " import importlib\n", 18 | "except ImportError:\n", 19 | " print(FAIL, \"Python version 3.5 is required,\"\n", 20 | " \" but %s is installed.\" % sys.version)\n", 21 | "\n", 22 | " \n", 23 | "def import_version(pkg, min_ver, fail_msg=\"\"):\n", 24 | " mod = None\n", 25 | " try:\n", 26 | " mod = importlib.import_module(pkg)\n", 27 | " ver = mod.__version__\n", 28 | " if Version(ver) < min_ver:\n", 29 | " print(FAIL, \"%s version %s or higher required, but %s installed.\"\n", 30 | " % (lib, min_ver, ver))\n", 31 | " else:\n", 32 | " print(OK, '%s version %s' % (pkg, ver))\n", 33 | " except ImportError:\n", 34 | " print(FAIL, '%s not installed. %s' % (pkg, fail_msg))\n", 35 | " return mod\n", 36 | "\n", 37 | "\n", 38 | "# first check the python version\n", 39 | "print('Using python in', sys.prefix)\n", 40 | "print(sys.version)\n", 41 | "pyversion = Version(sys.version)\n", 42 | "if pyversion < \"3.5\":\n", 43 | " print(FAIL, \"Python version 3.5 is required,\"\n", 44 | " \" but %s is installed.\" % sys.version)\n", 45 | "print()\n", 46 | "requirements = {'numpy': \"1.6.1\", 'scipy': \"1.0\", 'matplotlib': \"2.0\",\n", 47 | " 'IPython': \"3.0\", 'sklearn': \"0.22.1\", 'pandas': \"0.18\"}\n", 48 | "\n", 49 | "# now the dependencies\n", 50 | "for lib, required_version in list(requirements.items()):\n", 51 | " import_version(lib, required_version)" 52 | ] 53 | } 54 | ], 55 | "metadata": { 56 | "anaconda-cloud": {}, 57 | "kernelspec": { 58 | "display_name": "Python 3", 59 | "language": "python", 60 | "name": "python3" 61 | }, 62 | "language_info": { 63 | "codemirror_mode": { 64 | "name": "ipython", 65 | "version": 3 66 | }, 67 | "file_extension": ".py", 68 | "mimetype": "text/x-python", 69 | "name": "python", 70 | "nbconvert_exporter": "python", 71 | "pygments_lexer": "ipython3", 72 | "version": "3.7.3" 73 | } 74 | }, 75 | "nbformat": 4, 76 | "nbformat_minor": 4 77 | } 78 | 
-------------------------------------------------------------------------------- /slides/01-reminder-supervised-learning.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Introduction to Supervised Learning 5 | 6 | 7 | 12 | 13 | 14 | 107 | 108 | 109 | 110 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /notebooks/solutions/linear_models_diabetes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from sklearn.linear_model import Lasso, Ridge, LinearRegression 5 | from sklearn.model_selection import train_test_split, cross_val_score 6 | from sklearn.datasets import load_diabetes 7 | 8 | diabetes = load_diabetes() 9 | 10 | # create dataframe for easy boxplot 11 | df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names) 12 | df.boxplot() 13 | 14 | plt.figure() 15 | plt.title("Target distribution") 16 | plt.hist(diabetes.target, bins="auto") 17 | 18 | X_train, X_test, y_train, y_test = train_test_split(diabetes.data, 19 | diabetes.target) 20 | 21 | scores_lr = cross_val_score(LinearRegression(), X_train, y_train, cv=10) 22 | print("Linear regression score:", scores_lr.mean()) 23 | scores_ridge = cross_val_score(Ridge(), X_train, y_train, cv=10) 24 | print("Ridge Regression score:", scores_ridge.mean()) 25 | 26 | # With scaled data 27 | from sklearn.preprocessing import StandardScaler 28 | scaler = StandardScaler().fit(X_train) 29 | X_train_scaled = scaler.transform(X_train) 30 | X_test_scaled = scaler.transform(X_test) 31 | 32 | scores_lr = cross_val_score(LinearRegression(), X_train_scaled, y_train, cv=10) 33 | print("Linear regression w/ scaling:", scores_lr.mean()) 34 | scores_ridge = cross_val_score(Ridge(), X_train_scaled, y_train, cv=10) 35 | print("Ridge regression w/ scaling:", scores_ridge.mean()) 36 | 37 | from 
sklearn.model_selection import GridSearchCV 38 | param_grid = {'alpha': np.logspace(-3, 3, 7)} 39 | grid = GridSearchCV(Ridge(), param_grid, cv=10, return_train_score=True) 40 | grid.fit(X_train_scaled, y_train) 41 | 42 | res = pd.DataFrame(grid.cv_results_) 43 | res.plot("param_alpha", ["mean_train_score", "mean_test_score"], logx=True) 44 | plt.title("Ridge grid search") 45 | 46 | 47 | print(grid.best_params_, grid.best_score_) 48 | 49 | lr = LinearRegression().fit(X_train_scaled, y_train) 50 | 51 | plt.figure() 52 | plt.title("Coefficients LR vs Ridge") 53 | plt.hlines(0, 0, X_train.shape[1], linewidth=.5) 54 | plt.plot(grid.best_estimator_.coef_, 'o', label="Ridge({})".format(grid.best_params_['alpha'])) 55 | plt.plot(lr.coef_, 'o', label="LR", alpha=.6) 56 | plt.legend() 57 | 58 | from sklearn.model_selection import GridSearchCV 59 | param_grid = {'alpha': np.logspace(-3, 3, 7)} 60 | grid_lasso = GridSearchCV(Lasso(), param_grid, cv=10, return_train_score=True) 61 | grid_lasso.fit(X_train_scaled, y_train) 62 | 63 | res = pd.DataFrame(grid_lasso.cv_results_) 64 | res.plot("param_alpha", ["mean_train_score", "mean_test_score"], logx=True) 65 | plt.title("Lasso grid search") 66 | print(grid_lasso.best_params_, grid_lasso.best_score_) 67 | 68 | plt.figure() 69 | plt.title("coefficients") 70 | plt.hlines(0, 0, X_train.shape[1], linewidth=.5) 71 | plt.plot(grid.best_estimator_.coef_, 'o', label="Ridge({})".format(grid.best_params_['alpha'])) 72 | plt.plot(grid_lasso.best_estimator_.coef_, 'o', label="Lasso({})".format(grid_lasso.best_params_['alpha'])) 73 | plt.plot(lr.coef_, 'o', label="LR", alpha=.6) 74 | plt.legend() 75 | 76 | from sklearn.preprocessing import PolynomialFeatures 77 | poly = PolynomialFeatures(include_bias=False) 78 | 79 | X_train_poly = poly.fit_transform(X_train_scaled) 80 | X_test_poly = poly.transform(X_test_scaled) 81 | 82 | scores_lr = cross_val_score(LinearRegression(), X_train_poly, y_train, cv=10) 83 | print("Linear regression poly 
features:", scores_lr.mean()) 84 | scores_ridge = cross_val_score(Ridge(), X_train_poly, y_train, cv=10) 85 | print("Ridge regression poly features:", scores_ridge.mean()) 86 | 87 | from sklearn.model_selection import GridSearchCV 88 | param_grid = {'alpha': np.logspace(-3, 3, 7)} 89 | grid = GridSearchCV(Ridge(), param_grid, cv=10, return_train_score=True) 90 | grid.fit(X_train_poly, y_train) 91 | 92 | res = pd.DataFrame(grid.cv_results_) 93 | res.plot("param_alpha", ["mean_train_score", "mean_test_score"], logx=True) 94 | plt.title("Ridge grid search with polynomial features") 95 | 96 | 97 | print(grid.best_params_, grid.best_score_) 98 | # score with polynomial features is worse! -------------------------------------------------------------------------------- /slides/style.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: 'Muli'; 3 | font-size: 140%; 4 | } 5 | h1, h2 { 6 | font-family: 'Garamond'; 7 | font-weight: normal; 8 | margin-top: 10px; 9 | margin-bottom: 10px; 10 | } 11 | .remark-slide-content h1 { 12 | font-size: 70px; 13 | text-align: center; 14 | } 15 | .remark-slide-content p, .remark-slide-content li { 16 | font-size:30px; 17 | line-height: 1.4; 18 | } 19 | .remark-code { 20 | font-size:30px; 21 | } 22 | .remark-slide-content p { 23 | margin: 5px; 24 | } 25 | .remark-slide-container .spacious p, 26 | .remark-slide-container .spacious li{ 27 | margin-bottom: 50px; 28 | margin-top: 50px; 29 | } 30 | .remark-slide-container .spacious h1{ 31 | margin-bottom: 50px; 32 | } 33 | .remark-slide-container .some-space p, 34 | .remark-slide-container .some-space li, 35 | .remark-slide-container .some-space h1{ 36 | margin-bottom: 30px; 37 | } 38 | .reset-column { 39 | overflow: auto; 40 | width: 100%; 41 | } 42 | .remark-slide-container .compact p, .remark-slide-container .compact li, .remark-slide-container .compact pre{ 43 | line-height: 1.1; 44 | margin: 0px 0; 45 | } 46 | 
.remark-slide-container .compact .MathJax_Display{ 47 | line-height: 1.1; 48 | margin: 1px 0; 49 | } 50 | .remark-slide-container .compact h1{ 51 | margin-bottom: 3px; 52 | } 53 | .padding-top { 54 | padding-top: 100px; 55 | } 56 | .remark-slide-content .smaller p, .remark-slide-content .smaller p .MathJax, .remark-slide-content .smaller li, 57 | .remark-slide-content .smaller .remark-code, .smaller .remark-code-line,.remark-slide-content .smaller a, 58 | .remark-slide-content .smaller .dataframe{ 59 | font-size: 25px; 60 | } 61 | 62 | .remark-slide-content .smallest p, .remark-slide-content .smallest .MathJax, .remark-slide-content .smallest li, .remark-slide-content .smallest .remark-code, 63 | .smallest .remark-code-line, .remark-slide-content .smallest .dataframe, .remark-slide-content span.smallest{ 64 | font-size: 20px; 65 | } 66 | .remark-slide-content .tiny p, .remark-slide-content .tiny li, .remark-slide-content .tiny .remark-code, 67 | .tiny .remark-code-line, .remark-slide-content .tiny .dataframe{ 68 | font-size: 16px; 69 | } 70 | .normal { 71 | font-size: 30px; 72 | } 73 | .quote_author { 74 | display: block; 75 | text-align: right; 76 | margin-top: 20px; 77 | font-size: 30px; 78 | font-family: 'Garamond'; 79 | } 80 | .larger, .larger .remark-code { 81 | font-size: 40px; 82 | } 83 | .largest, .largest .remark-code { 84 | font-size: 50px; 85 | } 86 | .left-column, .right-column { 87 | width: 48%; 88 | } 89 | .right-column{ 90 | float: right; 91 | } 92 | .left-column{ 93 | float: left; 94 | } 95 | .clear-column{ 96 | clear: both; 97 | } 98 | .narrow-right-column { 99 | float: right; 100 | width: 32% 101 | } 102 | .wide-left-column { 103 | float: left; 104 | width: 65% 105 | } 106 | .narrow-left-column { 107 | float: left; 108 | width: 32% 109 | } 110 | .wide-right-column { 111 | float: right; 112 | width: 65% 113 | } 114 | 115 | .invisible { 116 | visibility: hidden 117 | } 118 | .tiny-code .remark-code, .remark-inline-code .tiny-code{ 119 | font-size: 
15px; 120 | } 121 | .remark-code, .remark-inline-code { font-family: 'Ubuntu Mono';} 122 | .hljs.remark-code {background: #e0e0e0} 123 | 124 | /* Some additional styling taken from the Jupyter notebook CSS */ 125 | table.dataframe { 126 | border: none; 127 | border-collapse: collapse; 128 | border-spacing: 0; 129 | color: black; 130 | table-layout: fixed; 131 | } 132 | table.dataframe thead { 133 | border-bottom: 1px solid black; 134 | vertical-align: bottom; 135 | } 136 | table.dataframe tr, 137 | table.dataframe th, 138 | table.dataframe td { 139 | text-align: right; 140 | vertical-align: middle; 141 | padding: 0.5em 0.5em; 142 | line-height: normal; 143 | white-space: normal; 144 | max-width: none; 145 | border: none; 146 | } 147 | table.dataframe th { 148 | font-weight: bold; 149 | } 150 | table.dataframe tbody tr:nth-child(odd) { 151 | background: #f5f5f5; 152 | } 153 | table.dataframe tbody tr:hover { 154 | background: rgba(66, 165, 245, 0.2); 155 | } -------------------------------------------------------------------------------- /notebooks/06 - Gradient Boosting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import matplotlib.pyplot as plt\n", 11 | "import pandas as pd\n", 12 | "import sklearn\n", 13 | "sklearn.set_config(print_changed_only=True)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Gradient Boosting" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "from sklearn.ensemble import GradientBoostingClassifier\n", 30 | "from sklearn.datasets import load_breast_cancer\n", 31 | "from sklearn.model_selection import train_test_split\n", 32 | "cancer = load_breast_cancer()\n", 33 | "\n", 34 | "X_train, 
X_test, y_train, y_test = train_test_split(\n", 35 | " cancer.data, cancer.target, random_state=0)\n", 36 | "\n", 37 | "gbrt = GradientBoostingClassifier(random_state=0)\n", 38 | "gbrt.fit(X_train, y_train)\n", 39 | "\n", 40 | "print(\"accuracy on training set: %f\" % gbrt.score(X_train, y_train))\n", 41 | "print(\"accuracy on test set: %f\" % gbrt.score(X_test, y_test))" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "gbrt = GradientBoostingClassifier(random_state=0, max_depth=1)\n", 51 | "gbrt.fit(X_train, y_train)\n", 52 | "\n", 53 | "print(\"accuracy on training set: %f\" % gbrt.score(X_train, y_train))\n", 54 | "print(\"accuracy on test set: %f\" % gbrt.score(X_test, y_test))" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "gbrt = GradientBoostingClassifier(random_state=0, learning_rate=0.01)\n", 64 | "gbrt.fit(X_train, y_train)\n", 65 | "\n", 66 | "print(\"accuracy on training set: %f\" % gbrt.score(X_train, y_train))\n", 67 | "print(\"accuracy on test set: %f\" % gbrt.score(X_test, y_test))" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "gbrt = GradientBoostingClassifier(random_state=0, max_depth=1)\n", 77 | "gbrt.fit(X_train, y_train)\n", 78 | "\n", 79 | "plt.barh(range(cancer.data.shape[1]), gbrt.feature_importances_)\n", 80 | "plt.yticks(range(cancer.data.shape[1]), cancer.feature_names);\n", 81 | "ax = plt.gca()\n", 82 | "ax.set_position([0.4, .2, .9, .9])" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "from xgboost import XGBClassifier\n", 92 | "xgb = XGBClassifier()\n", 93 | "xgb.fit(X_train, y_train)\n", 94 | "print(\"accuracy on training set: %f\" % xgb.score(X_train, 
y_train))\n", 95 | "print(\"accuracy on test set: %f\" % xgb.score(X_test, y_test))" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "from xgboost import XGBClassifier\n", 105 | "xgb = XGBClassifier(n_estimators=1000)\n", 106 | "xgb.fit(X_train, y_train)\n", 107 | "print(\"accuracy on training set: %f\" % xgb.score(X_train, y_train))\n", 108 | "print(\"accuracy on test set: %f\" % xgb.score(X_test, y_test))" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "# Exercise\n", 116 | "Use GradientBoostingRegressor on the Bike dataset.\n", 117 | "Search over the ``learning_rate`` and ``max_depth`` using ``GridSearchCV``.\n", 118 | "What happens if you change ``n_estimators``?\n", 119 | "\n", 120 | "Compare the speed of XGBRegressor with GradientBoostingRegressor. How well does XGBRegressor do with defaults on the ``Bike`` dataset? Can you make it do better?" 
121 | ] 122 | } 123 | ], 124 | "metadata": { 125 | "anaconda-cloud": {}, 126 | "kernelspec": { 127 | "display_name": "Python 3", 128 | "language": "python", 129 | "name": "python3" 130 | }, 131 | "language_info": { 132 | "codemirror_mode": { 133 | "name": "ipython", 134 | "version": 3 135 | }, 136 | "file_extension": ".py", 137 | "mimetype": "text/x-python", 138 | "name": "python", 139 | "nbconvert_exporter": "python", 140 | "pygments_lexer": "ipython3", 141 | "version": "3.7.3" 142 | } 143 | }, 144 | "nbformat": 4, 145 | "nbformat_minor": 4 146 | } 147 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Intermediate Machine Learning with scikit-learn 2 | ======================================================== 3 | 4 | Part 2 of 4 5 | ----------- 6 | Other parts: 7 | - [Part 1](https://github.com/amueller/ml-workshop-1-of-4) 8 | - [Part 3](https://github.com/amueller/ml-workshop-3-of-4) 9 | - [Part 4](https://github.com/amueller/ml-workshop-4-of-4) 10 | 11 | 12 | Content 13 | ------- 14 | - [Reminder on supervised learning](https://amueller.github.io/ml-workshop-2-of-4/slides/01-reminder-supervised-learning.html) 15 | - [Grid search and cross-validation](https://amueller.github.io/ml-workshop-2-of-4/slides/02-cross-validation-grid-search.html) 16 | - [Linear models for regression](https://amueller.github.io/ml-workshop-2-of-4/slides/03-linear-models-regression.html) 17 | - [Linear models for classification](https://amueller.github.io/ml-workshop-2-of-4/slides/04-linear-models-classification.html) 18 | - [Decision trees](https://amueller.github.io/ml-workshop-2-of-4/slides/05-trees-forests.html) 19 | - [Random Forests](https://amueller.github.io/ml-workshop-2-of-4/slides/05-trees-forests.html#26) 20 | 21 | 22 | Instructor 23 | ----------- 24 | 25 | - [Andreas Mueller](http://amueller.github.io) [@amuellerml](https://twitter.com/amuellerml) - 
Columbia University; [Book: Introduction to Machine Learning with Python](http://shop.oreilly.com/product/0636920030515.do) 26 | 27 | --- 28 | 29 | This repository will contain the teaching material and other info associated 30 | with the "Intermediate Machine Learning with scikit-learn" course. 31 | 32 | About the workshop 33 | ------------------ 34 | Scikit-learn is a machine learning library in Python, that has become a 35 | valuable tool for many data science practitioners. This workshop will go beyond 36 | the basics and show how to effectively evaluate and tune algorithms. We will 37 | also discuss the most important machine learning algorithms that you're likely 38 | to see in practice, how and when to use them, and some details about how they 39 | work internally. The session will focus on linear models for classification and 40 | regression and tree-based models, including random forests. 41 | 42 | Prerequisites 43 | ------------- 44 | This workshop assumes familiarity with Jupyter notebooks and basics of pandas, matplotlib and numpy. 45 | It also assumes familiarity with the basics of supervised learning, like training and test data and basics of model evaluation. 46 | You should have built a model with scikit-learn (or attended Introduction to Machine learning with scikit-learn) before 47 | taking this workshop. 48 | 49 | Obtaining the Tutorial Material 50 | -------------------------------- 51 | 52 | 53 | If you are familiar with git, it is most convenient if you clone the GitHub repository. This 54 | is highly encouraged as it allows you to easily synchronize any changes to the material. 55 | 56 | ``` 57 | git clone https://github.com/amueller/ml-workshop-2-of-4.git 58 | ``` 59 | 60 | If you are not familiar with git, you can download the repository as a .zip file by heading over to the GitHub repository (https://github.com/amueller/ml-workshop-2-of-4) in your browser and clicking the green “Download” button in the upper right. 
61 | 62 | ![](images/download-repo.png) 63 | 64 | Please note that I may add and improve the material until shortly before the tutorial session, and we recommend that you update your copy of the materials one day before the tutorials. If you have a GitHub account and forked/cloned the repository via GitHub, you can sync your existing fork via the following commands: 65 | 66 | ``` 67 | git pull origin master 68 | ``` 69 | 70 | 71 | Installation Notes 72 | ------------------ 73 | 74 | This tutorial will require recent installations of 75 | 76 | - [NumPy](http://www.numpy.org) 77 | - [SciPy](http://www.scipy.org) 78 | - [matplotlib](http://matplotlib.org) 79 | - [pillow](https://python-pillow.org) 80 | - [pandas](http://pandas.pydata.org) 81 | - [scikit-learn](http://scikit-learn.org/stable/) (>=0.22.1) 82 | - [IPython](http://ipython.readthedocs.org/en/stable/) 83 | - [Jupyter Notebook](http://jupyter.org) 84 | 85 | The last one is important, you should be able to type: 86 | 87 | jupyter notebook 88 | 89 | in your terminal window and see the notebook panel load in your web browser. 90 | Try opening and running a notebook from the material to check that it works. 91 | 92 | For users who do not yet have these packages installed, a relatively 93 | painless way to install all the requirements is to use a Python distribution 94 | such as [Anaconda](https://www.continuum.io/downloads), which includes 95 | the most relevant Python packages for science, math, engineering, and 96 | data analysis; Anaconda can be downloaded and installed for free 97 | including commercial use and redistribution. 98 | The code examples in this tutorial require Python 3.5 or later. 99 | 100 | After obtaining the material, we **strongly recommend** that you open and execute 101 | a Jupyter Notebook `jupyter notebook check_env.ipynb` that is located at the 102 | top level of this repository. 
Inside the repository, you can open the notebook 103 | by executing 104 | 105 | ```bash 106 | jupyter notebook check_env.ipynb 107 | ``` 108 | 109 | inside this repository. Inside the Notebook, you can run the code cell by 110 | clicking on the "Run Cells" button as illustrated in the figure below: 111 | 112 | ![](images/check_env-1.png) 113 | 114 | 115 | Finally, if your environment satisfies the requirements for the tutorials, the executed code cell will produce an output message as shown below: 116 | 117 | ![](images/check_env-2.png) 118 | -------------------------------------------------------------------------------- /notebooks/data/bank-campaign-desc.text: -------------------------------------------------------------------------------- 1 | Citation Request: 2 | This dataset is publicly available for research. The details are described in [Moro et al., 2014]. 3 | Please include this citation if you plan to use this database: 4 | 5 | [Moro et al., 2014] S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, In press, http://dx.doi.org/10.1016/j.dss.2014.03.001 6 | 7 | Available at: [pdf] http://dx.doi.org/10.1016/j.dss.2014.03.001 8 | [bib] http://www3.dsi.uminho.pt/pcortez/bib/2014-dss.txt 9 | 10 | 1. Title: Bank Marketing (with social/economic context) 11 | 12 | 2. Sources 13 | Created by: Sérgio Moro (ISCTE-IUL), Paulo Cortez (Univ. Minho) and Paulo Rita (ISCTE-IUL) @ 2014 14 | 15 | 3. Past Usage: 16 | 17 | The full dataset (bank-additional-full.csv) was described and analyzed in: 18 | 19 | S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems (2014), doi:10.1016/j.dss.2014.03.001. 20 | 21 | 4. Relevant Information: 22 | 23 | This dataset is based on "Bank Marketing" UCI dataset (please check the description at: http://archive.ics.uci.edu/ml/datasets/Bank+Marketing). 
24 | The data is enriched by the addition of five new social and economic features/attributes (national wide indicators from a ~10M population country), published by the Banco de Portugal and publicly available at: https://www.bportugal.pt/estatisticasweb. 25 | This dataset is almost identical to the one used in [Moro et al., 2014] (it does not include all attributes due to privacy concerns). 26 | Using the rminer package and R tool (http://cran.r-project.org/web/packages/rminer/), we found that the addition of the five new social and economic attributes (made available here) lead to substantial improvement in the prediction of a success, even when the duration of the call is not included. Note: the file can be read in R using: d=read.table("bank-additional-full.csv",header=TRUE,sep=";") 27 | 28 | The zip file includes two datasets: 29 | 1) bank-additional-full.csv with all examples, ordered by date (from May 2008 to November 2010). 30 | 2) bank-additional.csv with 10% of the examples (4119), randomly selected from bank-additional-full.csv. 31 | The smallest dataset is provided to test more computationally demanding machine learning algorithms (e.g., SVM). 32 | 33 | The binary classification goal is to predict if the client will subscribe a bank term deposit (variable y). 34 | 35 | 5. Number of Instances: 41188 for bank-additional-full.csv 36 | 37 | 6. Number of Attributes: 20 + output attribute. 38 | 39 | 7. Attribute information: 40 | 41 | For more information, read [Moro et al., 2014]. 
42 | 43 | Input variables: 44 | # bank client data: 45 | 1 - age (numeric) 46 | 2 - job : type of job (categorical: "admin.","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown") 47 | 3 - marital : marital status (categorical: "divorced","married","single","unknown"; note: "divorced" means divorced or widowed) 48 | 4 - education (categorical: "basic.4y","basic.6y","basic.9y","high.school","illiterate","professional.course","university.degree","unknown") 49 | 5 - default: has credit in default? (categorical: "no","yes","unknown") 50 | 6 - housing: has housing loan? (categorical: "no","yes","unknown") 51 | 7 - loan: has personal loan? (categorical: "no","yes","unknown") 52 | # related with the last contact of the current campaign: 53 | 8 - contact: contact communication type (categorical: "cellular","telephone") 54 | 9 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec") 55 | 10 - day_of_week: last contact day of the week (categorical: "mon","tue","wed","thu","fri") 56 | 11 - duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y="no"). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model. 
57 | # other attributes: 58 | 12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact) 59 | 13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted) 60 | 14 - previous: number of contacts performed before this campaign and for this client (numeric) 61 | 15 - poutcome: outcome of the previous marketing campaign (categorical: "failure","nonexistent","success") 62 | # social and economic context attributes 63 | 16 - emp.var.rate: employment variation rate - quarterly indicator (numeric) 64 | 17 - cons.price.idx: consumer price index - monthly indicator (numeric) 65 | 18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric) 66 | 19 - euribor3m: euribor 3 month rate - daily indicator (numeric) 67 | 20 - nr.employed: number of employees - quarterly indicator (numeric) 68 | 69 | Output variable (desired target): 70 | 21 - y - has the client subscribed a term deposit? (binary: "yes","no") 71 | 72 | 8. Missing Attribute Values: There are several missing values in some categorical attributes, all coded with the "unknown" label. These missing values can be treated as a possible class label or using deletion or imputation techniques. 
73 | -------------------------------------------------------------------------------- /notebooks/01 - Review of Supervised Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Review of Supervised Learning with scikit-learn" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "import sklearn\n", 20 | "sklearn.set_config(print_changed_only=True)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# read data.\n", 30 | "# you can find a description in data/bank-campaign-desc.txt\n", 31 | "data = pd.read_csv(\"data/bank-campaign.csv\")" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "data.shape" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "data.columns" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "data.head()" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "y = data.target" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "X = data.drop(\"target\", axis=1)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "X.shape" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | 
"metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "y.shape" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "y.head()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "data.target.value_counts()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "data.target.value_counts(normalize=True)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "Splitting the data:" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "from sklearn.model_selection import train_test_split\n", 138 | "X_train, X_test, y_train, y_test = train_test_split(\n", 139 | " X, y, test_size=.2, random_state=42, stratify=y)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "np.sum(y_train == \"yes\") / len(y_train)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "np.sum(y_test == \"yes\") / len(y_test)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "# import model\n", 174 | "from sklearn.linear_model import LogisticRegression\n", 175 | "# instantiate model, set parameters\n", 176 | "lr = LogisticRegression(C=0.1, max_iter=1000)\n", 177 | "# fit model\n", 178 | "lr.fit(X_train, y_train)\n", 179 | "lr.coef_" 180 | ] 181 | 
}, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "Make predictions:" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "lr.score(X_train, y_train)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "(y_train == \"no\").mean()" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "lr.score(X_test, y_test)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "# https://github.com/amueller/ml-workshop-2-of-4" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "# Exercise\n", 228 | "Load the dataset ``data/bike_day_raw.csv``, which has the regression target ``cnt``.\n", 229 | "This dataset is hourly bike rentals in the citybike platform. The ``cnt`` column is the number of rentals, which we want to predict from date and weather data.\n", 230 | "\n", 231 | "Split the data into a training and a test set using ``train_test_split``.\n", 232 | "Use the ``LinearRegression`` class to learn a regression model on this data. You can evaluate with the ``score`` method, which provides the $R^2$ or using the ``mean_squared_error`` function from ``sklearn.metrics`` (or write it yourself in numpy)." 
233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "# %load solutions/bike_regression.py" 242 | ] 243 | } 244 | ], 245 | "metadata": { 246 | "anaconda-cloud": {}, 247 | "kernelspec": { 248 | "display_name": "Python 3", 249 | "language": "python", 250 | "name": "python3" 251 | }, 252 | "language_info": { 253 | "codemirror_mode": { 254 | "name": "ipython", 255 | "version": 3 256 | }, 257 | "file_extension": ".py", 258 | "mimetype": "text/x-python", 259 | "name": "python", 260 | "nbconvert_exporter": "python", 261 | "pygments_lexer": "ipython3", 262 | "version": "3.7.6" 263 | } 264 | }, 265 | "nbformat": 4, 266 | "nbformat_minor": 4 267 | } 268 | -------------------------------------------------------------------------------- /notebooks/02 - Cross-validation and Grid Search.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Cross-validation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import matplotlib.pyplot as plt\n", 17 | "import numpy as np\n", 18 | "import sklearn\n", 19 | "sklearn.set_config(print_changed_only=True)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "from sklearn.datasets import load_digits\n", 29 | "from sklearn.model_selection import train_test_split\n", 30 | "\n", 31 | "digits = load_digits()\n", 32 | "X_train, X_test, y_train, y_test = train_test_split(\n", 33 | " digits.data, digits.target)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "from sklearn.model_selection import cross_val_score\n", 43 | "from sklearn.neighbors import 
KNeighborsClassifier" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "cross_val_score(KNeighborsClassifier(),\n", 53 | " X_train, y_train, cv=5)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "from sklearn.model_selection import KFold, RepeatedStratifiedKFold" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "cross_val_score(KNeighborsClassifier(),\n", 72 | " X_train, y_train, cv=KFold(n_splits=10, shuffle=True, random_state=42))" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "cross_val_score(KNeighborsClassifier(),\n", 82 | " X_train, y_train,\n", 83 | " cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42))" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "Grid Searches\n", 91 | "=================" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "Grid-Search with built-in cross validation" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "from sklearn.model_selection import GridSearchCV\n", 108 | "from sklearn.svm import SVC" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "Define parameter grid:" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "import numpy as np\n", 125 | "\n", 126 | "param_grid = {'C': 10. ** np.arange(-3, 3),\n", 127 | " 'gamma' : 10. 
** np.arange(-5, 0)}\n", 128 | "\n", 129 | "np.set_printoptions(suppress=True)\n", 130 | "print(param_grid)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "grid_search = GridSearchCV(SVC(), param_grid, verbose=3)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "A GridSearchCV object behaves just like a normal classifier." 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "grid_search.fit(X_train, y_train)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "grid_search.predict(X_test)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "grid_search.score(X_test, y_test)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "grid_search.best_params_" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "grid_search.best_score_" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "grid_search.best_estimator_" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "# We extract just the scores\n", 210 | "\n", 211 | "scores = grid_search.cv_results_['mean_test_score']\n", 212 | "scores = np.array(scores).reshape(6, 5)\n", 213 | "\n", 214 | "plt.matshow(scores)\n", 215 | "plt.xlabel('gamma')\n", 216 | "plt.ylabel('C')\n", 217 | 
"plt.colorbar()\n", 218 | "plt.xticks(np.arange(5), param_grid['gamma'])\n", 219 | "plt.yticks(np.arange(6), param_grid['C']);" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "# Exercises\n", 227 | "Use GridSearchCV to adjust n_neighbors of KNeighborsClassifier." 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "# %load solutions/grid_search_k_neighbors.py" 237 | ] 238 | } 239 | ], 240 | "metadata": { 241 | "anaconda-cloud": {}, 242 | "kernelspec": { 243 | "display_name": "Python 3", 244 | "language": "python", 245 | "name": "python3" 246 | }, 247 | "language_info": { 248 | "codemirror_mode": { 249 | "name": "ipython", 250 | "version": 3 251 | }, 252 | "file_extension": ".py", 253 | "mimetype": "text/x-python", 254 | "name": "python", 255 | "nbconvert_exporter": "python", 256 | "pygments_lexer": "ipython3", 257 | "version": "3.7.3" 258 | } 259 | }, 260 | "nbformat": 4, 261 | "nbformat_minor": 4 262 | } 263 | -------------------------------------------------------------------------------- /notebooks/data/ram_price.csv: -------------------------------------------------------------------------------- 1 | ,date,price 2 | 0,1957.0,411041792.0 3 | 1,1959.0,67947725.0 4 | 2,1960.0,5242880.0 5 | 3,1965.0,2642412.0 6 | 4,1970.0,734003.0 7 | 5,1973.0,399360.0 8 | 6,1974.0,314573.0 9 | 7,1975.0,421888.0 10 | 8,1975.08,180224.0 11 | 9,1975.25,67584.0 12 | 10,1975.75,49920.0 13 | 11,1976.0,40704.0 14 | 12,1976.17,48960.0 15 | 13,1976.42,23040.0 16 | 14,1976.58,32000.0 17 | 15,1977.08,36800.0 18 | 16,1978.17,28000.0 19 | 17,1978.25,29440.0 20 | 18,1978.33,19200.0 21 | 19,1978.5,24000.0 22 | 20,1978.58,16000.0 23 | 21,1978.75,15200.0 24 | 22,1979.0,10528.0 25 | 23,1979.75,6704.0 26 | 24,1980.0,6480.0 27 | 25,1981.0,8800.0 28 | 26,1981.58,4479.0 29 | 27,1982.0,3520.0 30 | 28,1982.17,4464.0 31 | 29,1982.67,1980.0 32 | 
30,1983.0,2396.0 33 | 31,1983.67,1980.0 34 | 32,1984.0,1379.0 35 | 33,1984.58,1331.0 36 | 34,1985.0,880.0 37 | 35,1985.33,720.0 38 | 36,1985.42,550.0 39 | 37,1985.5,420.0 40 | 38,1985.58,350.0 41 | 39,1985.67,300.0 42 | 40,1985.83,300.0 43 | 41,1985.92,300.0 44 | 42,1986.0,300.0 45 | 43,1986.08,300.0 46 | 44,1986.17,300.0 47 | 45,1986.25,300.0 48 | 46,1986.33,190.0 49 | 47,1986.42,190.0 50 | 48,1986.5,190.0 51 | 49,1986.58,190.0 52 | 50,1986.67,190.0 53 | 51,1986.75,190.0 54 | 52,1986.92,190.0 55 | 53,1987.0,176.0 56 | 54,1987.08,176.0 57 | 55,1987.17,157.0 58 | 56,1987.25,154.0 59 | 57,1987.33,154.0 60 | 58,1987.42,154.0 61 | 59,1987.5,154.0 62 | 60,1987.58,154.0 63 | 61,1987.67,163.0 64 | 62,1987.75,133.0 65 | 63,1987.83,163.0 66 | 64,1987.92,163.0 67 | 65,1988.0,163.0 68 | 66,1988.08,182.0 69 | 67,1988.17,199.0 70 | 68,1988.33,199.0 71 | 69,1988.42,199.0 72 | 70,1988.5,505.0 73 | 71,1988.58,505.0 74 | 72,1988.67,505.0 75 | 73,1988.75,505.0 76 | 74,1988.83,505.0 77 | 75,1988.92,505.0 78 | 76,1989.0,505.0 79 | 77,1989.08,505.0 80 | 78,1989.17,505.0 81 | 79,1989.25,505.0 82 | 80,1989.42,344.0 83 | 81,1989.5,197.0 84 | 82,1989.58,188.0 85 | 83,1989.67,188.0 86 | 84,1989.75,128.0 87 | 85,1989.83,117.0 88 | 86,1989.92,113.0 89 | 87,1990.0,106.0 90 | 88,1990.17,98.3 91 | 89,1990.33,98.3 92 | 90,1990.42,89.5 93 | 91,1990.5,82.8 94 | 92,1990.58,81.1 95 | 93,1990.67,71.5 96 | 94,1990.75,59.0 97 | 95,1990.83,51.0 98 | 96,1990.92,45.5 99 | 97,1991.0,44.5 100 | 98,1991.08,44.5 101 | 99,1991.17,45.0 102 | 100,1991.25,45.0 103 | 101,1991.33,45.0 104 | 102,1991.42,43.8 105 | 103,1991.5,43.8 106 | 104,1991.58,41.3 107 | 105,1991.67,46.3 108 | 106,1991.75,45.0 109 | 107,1991.83,39.8 110 | 108,1991.92,39.8 111 | 109,1992.0,36.3 112 | 110,1992.08,36.3 113 | 111,1992.17,36.3 114 | 112,1992.25,34.8 115 | 113,1992.33,30.0 116 | 114,1992.42,32.5 117 | 115,1992.5,33.5 118 | 116,1992.58,31.0 119 | 117,1992.67,27.5 120 | 118,1992.75,26.3 121 | 119,1992.83,26.3 122 | 120,1992.92,26.3 123 | 
121,1993.0,33.1 124 | 122,1993.08,27.5 125 | 123,1993.17,27.5 126 | 124,1993.25,27.5 127 | 125,1993.33,27.5 128 | 126,1993.42,30.0 129 | 127,1993.5,30.0 130 | 128,1993.58,30.0 131 | 129,1993.67,30.0 132 | 130,1993.75,36.0 133 | 131,1993.83,39.8 134 | 132,1993.92,35.8 135 | 133,1994.0,35.8 136 | 134,1994.08,35.8 137 | 135,1994.17,36.0 138 | 136,1994.25,37.3 139 | 137,1994.33,37.3 140 | 138,1994.42,37.3 141 | 139,1994.5,38.5 142 | 140,1994.58,37.0 143 | 141,1994.67,34.0 144 | 142,1994.75,33.5 145 | 143,1994.83,32.3 146 | 144,1994.92,32.3 147 | 145,1995.0,32.3 148 | 146,1995.08,32.0 149 | 147,1995.17,32.0 150 | 148,1995.25,31.2 151 | 149,1995.33,31.2 152 | 150,1995.42,31.1 153 | 151,1995.5,31.2 154 | 152,1995.58,30.6 155 | 153,1995.67,33.1 156 | 154,1995.75,33.1 157 | 155,1995.83,30.9 158 | 156,1995.92,30.9 159 | 157,1996.0,29.9 160 | 158,1996.08,28.8 161 | 159,1996.17,26.1 162 | 160,1996.25,24.7 163 | 161,1996.33,17.2 164 | 162,1996.42,14.9 165 | 163,1996.5,11.3 166 | 164,1996.58,9.06 167 | 165,1996.67,8.44 168 | 166,1996.75,8.0 169 | 167,1996.83,5.25 170 | 168,1996.92,5.25 171 | 169,1997.0,4.63 172 | 170,1997.08,3.63 173 | 171,1997.17,3.0 174 | 172,1997.25,3.0 175 | 173,1997.33,3.0 176 | 174,1997.42,3.69 177 | 175,1997.5,4.0 178 | 176,1997.58,4.13 179 | 177,1997.67,3.63 180 | 178,1997.75,3.41 181 | 179,1997.83,3.25 182 | 180,1997.92,2.16 183 | 181,1998.0,2.16 184 | 182,1998.08,0.91 185 | 183,1998.17,0.97 186 | 184,1998.25,1.22 187 | 185,1998.33,1.19 188 | 186,1998.42,0.97 189 | 187,1998.58,1.03 190 | 188,1998.67,0.97 191 | 189,1998.75,1.16 192 | 190,1998.83,0.84 193 | 191,1998.92,0.84 194 | 192,1999.08,1.44 195 | 193,1999.13,0.84 196 | 194,1999.17,1.25 197 | 195,1999.25,1.25 198 | 196,1999.33,0.86 199 | 197,1999.5,0.78 200 | 198,1999.67,0.87 201 | 199,1999.75,1.04 202 | 200,1999.83,1.34 203 | 201,1999.92,2.35 204 | 202,2000.0,1.56 205 | 203,2000.08,1.48 206 | 204,2000.17,1.08 207 | 205,2000.25,0.84 208 | 206,2000.33,0.7 209 | 207,2000.42,0.9 210 | 208,2000.5,0.77 
211 | 209,2000.58,0.84 212 | 210,2000.67,1.07 213 | 211,2000.75,1.12 214 | 212,2000.83,1.12 215 | 213,2000.92,0.9 216 | 214,2001.0,0.75 217 | 215,2001.08,0.464 218 | 216,2001.17,0.464 219 | 217,2001.25,0.383 220 | 218,2001.33,0.387 221 | 219,2001.42,0.305 222 | 220,2001.5,0.352 223 | 221,2001.5,0.27 224 | 222,2001.58,0.191 225 | 223,2001.67,0.191 226 | 224,2001.75,0.169 227 | 225,2001.77,0.148 228 | 226,2002.08,0.134 229 | 227,2002.08,0.207 230 | 228,2002.25,0.193 231 | 229,2002.33,0.193 232 | 230,2002.42,0.33 233 | 231,2002.58,0.193 234 | 232,2002.75,0.193 235 | 233,2003.17,0.176 236 | 234,2003.25,0.076 237 | 235,2003.33,0.126 238 | 236,2003.42,0.115 239 | 237,2003.5,0.133 240 | 238,2003.58,0.129 241 | 239,2003.67,0.143 242 | 240,2003.75,0.148 243 | 241,2003.83,0.16 244 | 242,2003.99,0.166 245 | 243,2004.0,0.174 246 | 244,2004.08,0.148 247 | 245,2004.17,0.146 248 | 246,2004.33,0.156 249 | 247,2004.42,0.203 250 | 248,2004.5,0.176 251 | 249,2005.25,0.185 252 | 250,2005.42,0.149 253 | 251,2005.83,0.116 254 | 252,2005.92,0.185 255 | 253,2006.17,0.112 256 | 254,2006.33,0.073 257 | 255,2006.5,0.082 258 | 256,2006.67,0.073 259 | 257,2006.75,0.088 260 | 258,2006.83,0.098 261 | 259,2006.99,0.092 262 | 260,2007.0,0.082 263 | 261,2007.08,0.078 264 | 262,2007.17,0.066 265 | 263,2007.33,0.0464 266 | 264,2007.5,0.0386 267 | 265,2007.67,0.0351 268 | 266,2007.75,0.0322 269 | 267,2007.83,0.0244 270 | 268,2007.92,0.0244 271 | 269,2008.0,0.0232 272 | 270,2008.08,0.022 273 | 271,2008.33,0.022 274 | 272,2008.5,0.0207 275 | 273,2008.58,0.0176 276 | 274,2008.67,0.0146 277 | 275,2008.83,0.011 278 | 276,2008.92,0.0098 279 | 277,2009.0,0.0098 280 | 278,2009.08,0.0107 281 | 279,2009.25,0.0105 282 | 280,2009.42,0.0115 283 | 281,2009.5,0.011 284 | 282,2009.58,0.0127 285 | 283,2009.75,0.0183 286 | 284,2009.92,0.0205 287 | 285,2010.0,0.019 288 | 286,2010.08,0.0202 289 | 287,2010.17,0.0195 290 | 288,2010.33,0.0242 291 | 289,2010.5,0.021 292 | 290,2010.58,0.022 293 | 291,2010.75,0.0171 294 | 
292,2010.83,0.0146 295 | 293,2010.92,0.0122 296 | 294,2011.0,0.01 297 | 295,2011.08,0.0103 298 | 296,2011.33,0.01 299 | 297,2011.42,0.0085 300 | 298,2011.67,0.0054 301 | 299,2011.75,0.0051 302 | 300,2012.0,0.0049 303 | 301,2012.08,0.0049 304 | 302,2012.25,0.005 305 | 303,2012.33,0.0049 306 | 304,2012.58,0.0048 307 | 305,2012.67,0.004 308 | 306,2012.83,0.0037 309 | 307,2013.0,0.0043 310 | 308,2013.08,0.0054 311 | 309,2013.33,0.0067 312 | 310,2013.42,0.0061 313 | 311,2013.58,0.0073 314 | 312,2013.67,0.0065 315 | 313,2013.75,0.0082 316 | 314,2013.83,0.0085 317 | 315,2013.92,0.0079 318 | 316,2014.08,0.0095 319 | 317,2014.17,0.0079 320 | 318,2014.25,0.0073 321 | 319,2014.42,0.0079 322 | 320,2014.58,0.0085 323 | 321,2014.67,0.0085 324 | 322,2014.83,0.0085 325 | 323,2015.0,0.0078 326 | 324,2015.08,0.0073 327 | 325,2015.25,0.0061 328 | 326,2015.33,0.0056 329 | 327,2015.5,0.0049 330 | 328,2015.58,0.0045 331 | 329,2015.67,0.0043 332 | 330,2015.75,0.0042 333 | 331,2015.83,0.0038 334 | 332,2015.92,0.0037 335 | -------------------------------------------------------------------------------- /notebooks/05 - Trees.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Trees" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "import sklearn\n", 19 | "sklearn.set_config(print_changed_only=True)\n", 20 | "import pandas as pd\n", 21 | "from sklearn.model_selection import train_test_split\n", 22 | "from sklearn.pipeline import make_pipeline\n", 23 | "from sklearn.preprocessing import scale, StandardScaler" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from sklearn.datasets import load_breast_cancer\n", 
33 | "cancer = load_breast_cancer()" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "print(cancer.DESCR)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "X_train, X_test, y_train, y_test = train_test_split(\n", 52 | " cancer.data, cancer.target, stratify=cancer.target, random_state=0)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "# tree visualization" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "from sklearn.tree import DecisionTreeClassifier, plot_tree\n", 69 | "tree = DecisionTreeClassifier(max_depth=2)\n", 70 | "tree.fit(X_train, y_train)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "plt.figure(dpi=200)\n", 80 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "# Parameter Tuning" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "tree = DecisionTreeClassifier().fit(X_train, y_train)\n", 97 | "plt.figure(figsize=(15, 5))\n", 98 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "tree = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)\n", 108 | "plt.figure(figsize=(15, 5))\n", 109 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | 
"outputs": [], 117 | "source": [ 118 | "tree = DecisionTreeClassifier(max_leaf_nodes=8).fit(X_train, y_train)\n", 119 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "tree = DecisionTreeClassifier(min_samples_split=50).fit(X_train, y_train)\n", 129 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "tree = DecisionTreeClassifier(min_impurity_decrease=.01).fit(X_train, y_train)\n", 139 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "from sklearn.model_selection import GridSearchCV\n", 149 | "param_grid = {'max_depth':range(1, 7)}\n", 150 | "grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid=param_grid, cv=10)\n", 151 | "grid.fit(X_train, y_train)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit\n", 161 | "param_grid = {'max_depth':range(1, 7)}\n", 162 | "grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid=param_grid,\n", 163 | " cv=StratifiedShuffleSplit(100), return_train_score=True)\n", 164 | "grid.fit(X_train, y_train)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "scores = pd.DataFrame(grid.cv_results_)\n", 174 | "scores.plot(x='param_max_depth', y=['mean_train_score', 'mean_test_score'], ax=plt.gca())\n", 175 | "plt.legend(loc=(1, 0))" 176 | ] 
177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "from sklearn.model_selection import GridSearchCV\n", 185 | "param_grid = {'max_leaf_nodes': range(2, 20)}\n", 186 | "grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid=param_grid,\n", 187 | " cv=StratifiedShuffleSplit(100, random_state=1),\n", 188 | " return_train_score=True)\n", 189 | "grid.fit(X_train, y_train)\n", 190 | "\n", 191 | "scores = pd.DataFrame(grid.cv_results_)\n", 192 | "scores.plot(x='param_max_leaf_nodes', y=['mean_train_score', 'mean_test_score'], ax=plt.gca())\n", 193 | "plt.legend(loc=(1, 0))" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "scores = pd.DataFrame(grid.cv_results_)\n", 203 | "scores.plot(x='param_max_leaf_nodes', y='mean_train_score', yerr='std_train_score', ax=plt.gca())\n", 204 | "scores.plot(x='param_max_leaf_nodes', y='mean_test_score', yerr='std_test_score', ax=plt.gca())" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "grid.best_params_" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "plot_tree(grid.best_estimator_, feature_names=cancer.feature_names, filled=True)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "pd.Series(grid.best_estimator_.feature_importances_,\n", 232 | " index=cancer.feature_names).plot(kind=\"barh\")" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "# Exercise\n", 240 | "Apply a decision tree to the \"adult\" dataset and visualize it.\n", 241 | "\n", 242 | "Tune parameters with 
grid-search; try at least max_leaf_nodes and max_depth, but separately.\n", 243 | "\n", 244 | "Visualize the resulting tree and its feature importances." 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [] 253 | } 254 | ], 255 | "metadata": { 256 | "anaconda-cloud": {}, 257 | "kernelspec": { 258 | "display_name": "root *", 259 | "language": "python", 260 | "name": "conda-root-py" 261 | }, 262 | "language_info": { 263 | "codemirror_mode": { 264 | "name": "ipython", 265 | "version": 3 266 | }, 267 | "file_extension": ".py", 268 | "mimetype": "text/x-python", 269 | "name": "python", 270 | "nbconvert_exporter": "python", 271 | "pygments_lexer": "ipython3", 272 | "version": "3.7.3" 273 | } 274 | }, 275 | "nbformat": 4, 276 | "nbformat_minor": 4 277 | } 278 | -------------------------------------------------------------------------------- /notebooks/03 - Linear Models for Regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Linear Models for Regression" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import matplotlib.pyplot as plt\n", 17 | "import numpy as np\n", 18 | "import sklearn\n", 19 | "sklearn.set_config(print_changed_only=True)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "from sklearn.linear_model import Ridge, LinearRegression" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "from sklearn.model_selection import cross_val_score" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45
| "source": [ 46 | "from sklearn.datasets import load_boston\n", 47 | "boston = load_boston()" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "X, y = boston.data, boston.target" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "print(boston.DESCR)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "X.shape" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "fig, axes = plt.subplots(3, 5, figsize=(20, 10))\n", 84 | "for i, ax in enumerate(axes.ravel()):\n", 85 | " if i > 12:\n", 86 | " ax.set_visible(False)\n", 87 | " continue\n", 88 | " ax.plot(X[:, i], y, 'o', alpha=.5)\n", 89 | " ax.set_title(\"{}: {}\".format(i, boston.feature_names[i]))\n", 90 | " ax.set_ylabel(\"MEDV\")" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "print(X.shape)\n", 100 | "print(y.shape)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "from sklearn.model_selection import train_test_split\n", 110 | "X_train, X_test, y_train, y_test = train_test_split(\n", 111 | " X, y, random_state=42)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "np.mean(cross_val_score(LinearRegression(),\n", 121 | " X_train, y_train, cv=10))" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "np.mean(cross_val_score(\n", 131 | " Ridge(), X_train, y_train, cv=10))" 
132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "from sklearn.model_selection import GridSearchCV\n", 141 | "param_grid = {'alpha': np.logspace(-3, 3, 14)}\n", 142 | "print(param_grid)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "grid = GridSearchCV(Ridge(), param_grid, cv=10, return_train_score=True)\n", 152 | "grid.fit(X_train, y_train)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "import pandas as pd\n", 162 | "plt.figure(dpi=200)\n", 163 | "results = pd.DataFrame(grid.cv_results_)\n", 164 | "results.plot('param_alpha', 'mean_train_score', ax=plt.gca())\n", 165 | "results.plot('param_alpha', 'mean_test_score', ax=plt.gca())\n", 166 | "\n", 167 | "plt.legend()\n", 168 | "plt.xscale(\"log\")" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "from sklearn.preprocessing import PolynomialFeatures, scale\n", 178 | "# being lazy and not really doing things properly whoops\n", 179 | "X_poly = PolynomialFeatures(include_bias=False).fit_transform(scale(X))\n", 180 | "print(X_poly.shape)\n", 181 | "X_train, X_test, y_train, y_test = train_test_split(X_poly, y, random_state=42)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "np.mean(cross_val_score(LinearRegression(),\n", 191 | " X_train, y_train, cv=10))" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "np.mean(cross_val_score(Ridge(),\n", 201 | " X_train, y_train, cv=10))" 202 | ] 203 | }, 204 | { 205 | 
"cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "grid = GridSearchCV(Ridge(), param_grid, cv=10, return_train_score=True)\n", 211 | "grid.fit(X_train, y_train)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "results = pd.DataFrame(grid.cv_results_)\n", 221 | "\n", 222 | "results.plot('param_alpha', 'mean_train_score', ax=plt.gca())\n", 223 | "results.plot('param_alpha', 'mean_test_score', ax=plt.gca())\n", 224 | "plt.legend()\n", 225 | "plt.xscale(\"log\")" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "print(grid.best_params_)\n", 235 | "print(grid.best_score_)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "lr = LinearRegression().fit(X_train, y_train)\n", 245 | "plt.scatter(range(X_poly.shape[1]), lr.coef_, c=np.sign(lr.coef_), cmap=\"bwr_r\")" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "ridge = grid.best_estimator_\n", 255 | "plt.scatter(range(X_poly.shape[1]), ridge.coef_, c=np.sign(ridge.coef_), cmap=\"bwr_r\")" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "ridge100 = Ridge(alpha=100).fit(X_train, y_train)\n", 265 | "ridge1 = Ridge(alpha=1).fit(X_train, y_train)\n", 266 | "plt.figure(figsize=(8, 4))\n", 267 | "\n", 268 | "plt.plot(ridge1.coef_, 'o', label=\"alpha=1\")\n", 269 | "plt.plot(ridge.coef_, 'o', label=\"alpha=14\")\n", 270 | "plt.plot(ridge100.coef_, 'o', label=\"alpha=100\")\n", 271 | "plt.legend()" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | 
"execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "from sklearn.linear_model import Lasso\n", 281 | "\n", 282 | "lasso = Lasso().fit(X_train, y_train)\n", 283 | "print(\"Training set score: {:.2f}\".format(lasso.score(X_train, y_train)))\n", 284 | "print(\"Test set score: {:.2f}\".format(lasso.score(X_test, y_test)))\n", 285 | "print(\"Number of features used:\", np.sum(lasso.coef_ != 0))" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "# Exercise\n", 293 | "Load the diabetes dataset using ``sklearn.datasets.load_diabetes``. Apply ``LinearRegression``, ``Ridge`` and ``Lasso`` and visualize the coefficients. Try polynomial features." 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "# %load solutions/linear_models_diabetes.py" 303 | ] 304 | } 305 | ], 306 | "metadata": { 307 | "anaconda-cloud": {}, 308 | "kernelspec": { 309 | "display_name": "root *", 310 | "language": "python", 311 | "name": "conda-root-py" 312 | }, 313 | "language_info": { 314 | "codemirror_mode": { 315 | "name": "ipython", 316 | "version": 3 317 | }, 318 | "file_extension": ".py", 319 | "mimetype": "text/x-python", 320 | "name": "python", 321 | "nbconvert_exporter": "python", 322 | "pygments_lexer": "ipython3", 323 | "version": "3.7.3" 324 | } 325 | }, 326 | "nbformat": 4, 327 | "nbformat_minor": 4 328 | } 329 | -------------------------------------------------------------------------------- /slides/04-linear-models-classification.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Linear Models for Classification 5 | 6 | 7 | 12 | 13 | 14 | 548 | 549 | 550 | 551 | 578 | 579 | 580 | -------------------------------------------------------------------------------- /slides/06-gradient-boosting.html: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Gradient Boosting 5 | 6 | 7 | 12 | 13 | 14 | 770 | 771 | 772 | 773 | 800 | 801 | 802 | -------------------------------------------------------------------------------- /slides/03-linear-models-regression.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Linear Models for Regression 5 | 6 | 7 | 12 | 13 | 14 | 747 | 748 | 749 | 750 | 773 | 774 | 775 | -------------------------------------------------------------------------------- /slides/02-cross-validation-grid-search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Cross Validation and Grid Search 5 | 6 | 7 | 12 | 13 | 14 | 800 | 801 | 802 | 803 | 831 | 832 | 833 | --------------------------------------------------------------------------------