├── .gitignore ├── README.Rmd ├── README.md ├── _config.yml ├── data ├── US Presidential Data.csv ├── adult.RData ├── ames_data.RData ├── ex.data ├── german_credit.RData ├── lm_nn_Yacht_NN2.RData ├── meta.data ├── meta_cnames.Rdata ├── ml_gbm_fit.RData ├── ml_gbm_fit2.RData ├── ml_gbm_reg_linear.RData ├── ml_gbm_tune.RData ├── ml_randomforest_m1.RData ├── ml_rf_OOB_RMSE.RData ├── ml_rf_ames_randomForest.RData ├── ml_rf_dec.RData ├── ml_rf_grid_perf.data ├── ml_rf_h2o.grid.RData ├── ml_rf_hypergrid_oobrmse.RData ├── ml_rf_m2.RData ├── ml_rf_oob_comp.RData ├── ml_rf_optimal_ranger.RData ├── ml_rf_random_grid.RData ├── ml_rf_xgb.fit1.RData ├── ml_rf_xgb.fit3.RData ├── model_auc.RData ├── model_eta.RData ├── negative-words.txt ├── osmsa_PLZ_14.RData ├── positive-words.txt └── titanic.RData ├── machine_learning.Rproj ├── misc └── along.Rmd ├── rcode ├── GESISPanel.R ├── a1_intro_r.R ├── a2_intro_ml.R ├── b1_regression.R ├── b2_regularization.R ├── c1_trees_bagging.R ├── c2_random_forests.R ├── c2b_random_forests_h2o.R ├── c3_gbm_regression.R ├── c3b_gbm_regression_h2o.R ├── creating_rcode.R ├── d_neuralNetworks.R ├── e_Clustering.R ├── f_dangers_ml.R ├── fitting.R ├── g_reticulate_umap.R ├── incourse1.R ├── incourse2.R ├── ml_part1.R ├── preparing_bagging.R ├── purl_the_slides.R └── randomforests_boosting.R ├── slides ├── a1_intro_ml.Rmd ├── a1_intro_ml.html ├── a1_intro_ml.md ├── a1_intro_ml.pdf ├── a1_intro_r_cache │ └── slidy │ │ ├── __packages │ │ ├── unnamed-chunk-51_1231435b8811d585dacc7bafd9d553ac.RData │ │ ├── unnamed-chunk-51_1231435b8811d585dacc7bafd9d553ac.rdb │ │ └── unnamed-chunk-51_1231435b8811d585dacc7bafd9d553ac.rdx ├── a2_intro_r.Rmd ├── a2_intro_r.html ├── a2_intro_r.md ├── a2_intro_r.pdf ├── a2_intro_r_cache │ ├── beamer │ │ ├── __packages │ │ ├── unnamed-chunk-51_a69c0e7fdfc8fd360351fd72e763ebfb.RData │ │ ├── unnamed-chunk-51_a69c0e7fdfc8fd360351fd72e763ebfb.rdb │ │ └── unnamed-chunk-51_a69c0e7fdfc8fd360351fd72e763ebfb.rdx │ └── slidy │ │ ├── __packages │ │ ├── unnamed-chunk-51_11dfc3d248c92bee11d12b1ab257dc47.RData │ │ ├── unnamed-chunk-51_11dfc3d248c92bee11d12b1ab257dc47.rdb │ │ └── unnamed-chunk-51_11dfc3d248c92bee11d12b1ab257dc47.rdx ├── a2_intro_r_files │ ├── figure-beamer │ │ └── unnamed-chunk-51-1.pdf │ └── figure-slidy │ │ └── unnamed-chunk-51-1.png ├── b1_regression.Rmd ├── b1_regression.html ├── b1_regression.md ├── b1_regression.pdf ├── b1_regression_files │ └── figure-slidy │ │ ├── unnamed-chunk-25-1.png │ │ ├── unnamed-chunk-26-1.png │ │ ├── unnamed-chunk-3-1.png │ │ ├── unnamed-chunk-48-1.png │ │ ├── unnamed-chunk-49-1.png │ │ ├── unnamed-chunk-51-1.png │ │ ├── unnamed-chunk-52-1.png │ │ ├── unnamed-chunk-53-1.png │ │ └── unnamed-chunk-58-1.png ├── b2_regularization.Rmd ├── b2_regularization.md ├── b2_regularization.pdf ├── c1_trees_bagging.Rmd ├── c1_trees_bagging.md ├── c1_trees_bagging.pdf ├── c2_random_forests.Rmd ├── c2_random_forests.aux ├── c2_random_forests.md ├── c2_random_forests.pdf ├── c2_random_forests.vrb ├── c3_gbm_regression.md ├── c3_gbm_regression.pdf ├── c3_gbm_regression_short.Rmd ├── c3_gbm_regression_short.html ├── c3_gbm_regression_short.md ├── c3_gbm_regression_short.pdf ├── c3b_gbm_regression_h2o.Rmd ├── d_neuralNetworks.Rmd ├── d_neuralNetworks.html ├── d_neuralNetworks.pdf ├── e_Clustering-exported.html ├── e_Clustering.Rmd ├── e_Clustering.html ├── e_Clustering.md ├── f_dangers_ml.Rmd ├── f_dangers_ml.html ├── f_dangers_ml.md ├── figure │ ├── 3d-coordinate-plane.png │ ├── 450px-Overfitting.svg.png │ ├── AmesTableau01.png │ ├── 
ArtificialNeuronModel_english.png │ ├── BBRXC.png │ ├── Blausen_0657_MultipolarNeuron.png │ ├── Decision-Tree-Example.jpg │ ├── Diagslr.PNG │ ├── OneHotEncoding.PNG │ ├── Overfitting_fig1.PNG │ ├── Picture3.jpg │ ├── SMLProcess.png │ ├── The_Signal_and_the_Noise.jpg │ ├── activation_funs.PNG │ ├── activations-1.png │ ├── addins.PNG │ ├── bagging3.png │ ├── bias_variance_tradeoff.PNG │ ├── bias_variance_tradeoff2.png │ ├── biglasso.PNG │ ├── book_ml1.jpg │ ├── boosted-trees-process.png │ ├── boosting-in-action-1.png │ ├── bostondata.PNG │ ├── bostonscaled.PNG │ ├── class01-1.png │ ├── classification_regression.png │ ├── confusionMatrix.png │ ├── content_flowchart1.png │ ├── datasetsload.PNG │ ├── decissiontree.PNG │ ├── dplyr_vignette.PNG │ ├── dt_amesdata.PNG │ ├── duckduckgo.PNG │ ├── electoral_precedent.png │ ├── ex_regression_tree.png │ ├── expl_rf.png │ ├── factor3vars_visreg.PNG │ ├── fig3_loglambda.PNG │ ├── fig3_loglambda.svg │ ├── four_regmods.PNG │ ├── gbmtopmodelsvars.PNG │ ├── ggpairs_yacht.png │ ├── gradient_descent.png │ ├── influentalValues_lasso.PNG │ ├── interplot_wt_disp.PNG │ ├── iris.png │ ├── kyphosis_helppage.PNG │ ├── learning_rate_comparison.png │ ├── limeplot.png │ ├── magrittr_vignette.jpg │ ├── ml_emoji.png │ ├── ml_ice_curves.png │ ├── ml_rf_errorrate_m1.png │ ├── ml_rf_hist_OOB_RMSE.png │ ├── ml_rf_varimp_ranger.png │ ├── ml_tb_rpart_iris.png │ ├── mtcars_model_interact.PNG │ ├── neuralnetfig.PNG │ ├── neuralnets.PNG │ ├── nyc_map.png │ ├── overview_ml_algorithms.jpg │ ├── package_gbm.PNG │ ├── pic_hiddenlayers.PNG │ ├── prediction_mtcars.PNG │ ├── random_trees_fig1.PNG │ ├── reg_3algos.PNG │ ├── resid_fitted.PNG │ ├── ridgeTop25influentalVars.PNG │ ├── ridge_coef.png │ ├── stargazertabex.PNG │ ├── stochastic_gradient_descent.png │ ├── swissfertality.PNG │ ├── taskviewmachinelearning.PNG │ ├── three_algos_complete.PNG │ ├── titanicdata.PNG │ ├── top-20-r-packages-machine-learning-downloads.jpg │ ├── top10gbms.PNG │ ├── tree-correlation-1.png │ ├── tree-variance-1.svg │ ├── tree.ps │ ├── tree_m1.PNG │ ├── unsupervisedLearning.png │ ├── visreg.PNG │ ├── visreg2.PNG │ ├── visreg_m6.PNG │ ├── visregcat.PNG │ └── visregplot1.PNG ├── g_reticulate_umap-exported.html ├── g_reticulate_umap.Rmd ├── long │ ├── c2_random_forests.Rmd │ ├── c2_random_forests.pdf │ ├── d_neuralNetworks.Rmd │ └── d_neuralNetworks.pdf └── old │ ├── A_ml_motiv.Rmd │ ├── a1_intro_r.Rmd │ ├── a2_intro_ml.Rmd │ ├── a2_intro_ml.html │ ├── a2_intro_ml.md │ ├── a2_intro_ml.pdf │ ├── a_intro_ml.Rmd │ ├── a_intro_ml.html │ ├── a_intro_ml.md │ ├── a_intro_ml.pdf │ ├── advanced_regression.Rmd │ ├── b1_regression.Rmd │ ├── b2_lasso_regression (2).Rmd │ ├── b2_lasso_regression.Rmd │ ├── b2_lasso_regression.html │ ├── b2_lasso_regression.log │ ├── b2_lasso_regression.md │ ├── b2_lasso_regression.nav │ ├── b2_lasso_regression.snm │ ├── b2_lasso_regression.tex │ ├── b2_lasso_regression.toc │ ├── b2_lasso_regression.vrb │ ├── b2_lasso_regression_files │ ├── figure-beamer │ │ ├── unnamed-chunk-12-1.pdf │ │ ├── unnamed-chunk-17-1.pdf │ │ ├── unnamed-chunk-20-1.pdf │ │ ├── unnamed-chunk-23-1.pdf │ │ ├── unnamed-chunk-25-1.pdf │ │ ├── unnamed-chunk-27-1.pdf │ │ └── unnamed-chunk-36-1.pdf │ └── figure-slidy │ │ ├── unnamed-chunk-12-1.png │ │ ├── unnamed-chunk-17-1.png │ │ ├── unnamed-chunk-20-1.png │ │ ├── unnamed-chunk-23-1.png │ │ ├── unnamed-chunk-25-1.png │ │ ├── unnamed-chunk-27-1.png │ │ └── unnamed-chunk-36-1.png │ ├── b2_regularization.Rmd │ ├── b_lasso_regression.Rmd │ ├── c2_random_forests.Rmd │ ├── 
c_bagging_boosting_trees.Rmd │ ├── c_bagging_boosting_trees.pdf │ ├── caret.Rmd │ ├── conditional_inference_trees.Rmd │ ├── d_neuralNetworks.Rmd │ ├── doParallel.Rmd │ ├── evaluation.pdf │ ├── gradient_boosting.Rmd │ ├── gradient_boosting.pdf │ ├── lasso_regression.pdf │ ├── logit_model.Rmd │ ├── ml_part1.Rmd │ ├── ml_part1.log │ ├── ml_part1.pdf │ ├── random_forests.Rmd │ ├── supervised_learning.Rmd │ └── supervised_learning.html └── tutorial ├── g_ml_applying_algorithms.Rmd ├── ml_exercises.Rmd ├── ml_exercises.html ├── ml_exercises.pdf ├── ml_exercises_a1_introR.Rmd ├── ml_exercises_a1_introR.log ├── ml_exercises_a1_introR.pdf ├── ml_exercises_a1_introR.tex ├── ml_exercises_a_visualizing.Rmd ├── ml_exercises_a_visualizing.pdf ├── ml_exercises_b_regression.Rmd ├── ml_exercises_b_regression.pdf ├── ml_exercises_c1_treesbagging.Rmd ├── ml_exercises_c1_treesbagging.pdf ├── ml_exercises_c2_randomforests.Rmd ├── ml_exercises_c2_randomforests.pdf ├── ml_exercises_c3_xtremeboosting.Rmd ├── ml_exercises_c3_xtremeboosting.pdf ├── ml_exercises_d_neuralnets.Rmd ├── ml_exercises_d_neuralnets.pdf └── prepare_apply_5ml.Rmd /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "README" 3 | author: "Jan-Philipp Kolb" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: md_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## Part A - introduction 13 | 14 | - A1 - [Introduction to machine learning](slides/a1_intro_ml.md) ([pdf](slides/a1_intro_ml.pdf)) 15 | - A2 - [Laying the foundations in R](slides/a1_intro_r.md) ([Browser](slides/a1_intro_r.md) | [pdf](slides/a1_intro_r.pdf) | [rcode](rcode/a1_intro_r.R)) 16 | 17 | 21 | ## Part B - lasso and ridge regression 22 | 23 | - [B1 - A small recap on linear regression](slides/b1_regression.md) ([Browser](slides/b1_regression.md) | [pdf](slides/b1_regression.pdf) | [rcode](rcode/b1_regression.R)) 24 | - [B2 - Using regularization to prevent overfitting and perform feature selection](slides/b2_regularization.md) ([Browser](slides/b2_regularization.md) | [pdf](slides/b2_regularization.pdf) | [rcode](rcode/b2_regularization.R)) 25 | 26 | 27 | 30 | 31 | 32 | ## Part C - Supervised Learning: Bagging and Boosting, tree-methods 33 | 34 | - [C1 - Supervised learning: tress and bagging](slides/c1_trees_bagging.md) ([Browser](slides/c1_trees_bagging.md) | [pdf](slides/c1_trees_bagging.pdf) | [rcode](rcode/c1_trees_bagging.R)) 35 | 36 | - [C2 - Supervised learning: random forests](slides/c2_random_forests.md) ([Browser](slides/c2_random_forests.md) | [pdf](slides/c2_random_forests.pdf) | [rcode](rcode/c2_random_forests.R)) 37 | 38 | - [C3 - Supervised learning: gradient boosting](slides/c3_gbm_regression.md) ([Browser](slides/c3_gbm_regression.md) | [pdf](slides/c3_gbm_regression.pdf) | [rcode](rcode/c3_gbm_regression.R)) 39 | 40 | 41 | ## Part D - Supervised Learning: Neural Network 42 | 43 | - [D - Supervised learning: neural network](slides/d_neuralNetworks.md) ([Browser](slides/d_neuralNetworks.md) | [pdf](slides/d_neuralNetworks.pdf) | [rcode](rcode/d_neuralNetworks.R)) 44 | 45 | ## Part E - Unsupervised Learning: kmeans, hdbscan 46 | 47 | - [E - Unsupervised Learning: kmeans, hdbscan](slides/e_Clustering.md) 48 
| 49 | ## Part F - The dangers of machine learning 50 | 51 | - [F - The dangers of machine learning](slides/f_dangers_ml.md) 52 | 53 | ## Part G - `reticulate` package: Umap 54 | 55 | - [G - reticulate package: Umap](slides/g_reticulate_umap.md) 56 | 57 | 58 | 59 | # Remarks 60 | 61 | The sources are often linked in the headline. Please ask if something is unclear. 62 | 63 | 66 | 67 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Machine Learning with R 2 | ----------------------- 3 | 4 | Part A - introduction 5 | --------------------- 6 | 7 | - A1 - Laying the foundations in R ([Browser](slides/a1_intro_r.md) | 8 | [pdf](slides/a1_intro_r.pdf) | [rcode](rcode/a1_intro_r.R)) 9 | 10 | 14 | 15 | Part B - lasso and ridge regression 16 | ----------------------------------- 17 | 18 | - [B1 - A small recap on linear regression](slides/b1_regression.md) 19 | ([Browser](slides/b1_regression.md) | 20 | [pdf](slides/b1_regression.pdf) | [rcode](rcode/b1_regression.R)) 21 | - [B2 - Using regularization to prevent overfitting and perform 22 | feature selection](slides/b2_regularization.md) 23 | ([Browser](slides/b2_regularization.md) | 24 | [pdf](slides/b2_regularization.pdf) | 25 | [rcode](rcode/b2_regularization.R)) 26 | 27 | 30 | 31 | Part C - Supervised Learning: Bagging and Boosting, tree-methods 32 | ---------------------------------------------------------------- 33 | 34 | - [C1 - Supervised learning: tress and 35 | bagging](slides/c1_trees_bagging.md) 36 | ([Browser](slides/c1_trees_bagging.md) | 37 | [pdf](slides/c1_trees_bagging.pdf) | 38 | [rcode](rcode/c1_trees_bagging.R)) 39 | 40 | - [C2 - Supervised learning: random 41 | forests](slides/c2_random_forests.md) 42 | ([Browser](slides/c2_random_forests.md) | 43 | [pdf](slides/c2_random_forests.pdf) | 44 | [rcode](rcode/c2_random_forests.R)) 45 | 46 | - [C3 - Supervised learning: gradient 47 | boosting](slides/c3_gbm_regression.md) 48 | ([Browser](slides/c3_gbm_regression.md) | 49 | [pdf](slides/c3_gbm_regression.pdf) | 50 | [rcode](rcode/c3_gbm_regression.R)) 51 | 52 | Part D - Supervised Learning: Neural Network 53 | -------------------------------------------- 54 | 55 | - [D - Supervised learning: neural 56 | network](slides/d_neuralNetworks.md) 57 | ([Browser](slides/d_neuralNetworks.md) | 58 | [pdf](slides/d_neuralNetworks.pdf) | 59 | [rcode](rcode/d_neuralNetworks.R)) 60 | 61 | Part E - Unsupervised Learning: kmeans, hdbscan 62 | ----------------------------------------------- 63 | 64 | - [E - Unsupervised Learning: kmeans, hdbscan](slides/e_Clustering.md) 65 | 66 | Part F - The dangers of machine learning 67 | ---------------------------------------- 68 | 69 | - [F - The dangers of machine learning](slides/f_dangers_ml.md) 70 | 71 | Part G - `reticulate` package: Umap 72 | ----------------------------------- 73 | 74 | - [G - reticulate package: Umap](slides/g_reticulate_umap.md) 75 | 76 | 79 | 93 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-leap-day -------------------------------------------------------------------------------- /data/adult.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/adult.RData 
-------------------------------------------------------------------------------- /data/ames_data.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ames_data.RData -------------------------------------------------------------------------------- /data/ex.data: -------------------------------------------------------------------------------- 1 | TITLE extra line 2 | # a comment 3 | 2 3 5 7 4 | 11 13 17 5 | -------------------------------------------------------------------------------- /data/german_credit.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/german_credit.RData -------------------------------------------------------------------------------- /data/lm_nn_Yacht_NN2.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/lm_nn_Yacht_NN2.RData -------------------------------------------------------------------------------- /data/meta.data: -------------------------------------------------------------------------------- 1 | age 2 | workclass 3 | fnlwgt 4 | education 5 | education-num 6 | marital-status 7 | occupation 8 | relationhip 9 | race 10 | sex 11 | capital-gain 12 | capital-loss 13 | hours-per-week 14 | native-country 15 | class 16 | -------------------------------------------------------------------------------- /data/meta_cnames.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/meta_cnames.Rdata -------------------------------------------------------------------------------- /data/ml_gbm_fit.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_gbm_fit.RData -------------------------------------------------------------------------------- /data/ml_gbm_fit2.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_gbm_fit2.RData -------------------------------------------------------------------------------- /data/ml_gbm_reg_linear.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_gbm_reg_linear.RData -------------------------------------------------------------------------------- /data/ml_gbm_tune.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_gbm_tune.RData -------------------------------------------------------------------------------- /data/ml_randomforest_m1.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_randomforest_m1.RData 
-------------------------------------------------------------------------------- /data/ml_rf_OOB_RMSE.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_OOB_RMSE.RData -------------------------------------------------------------------------------- /data/ml_rf_ames_randomForest.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_ames_randomForest.RData -------------------------------------------------------------------------------- /data/ml_rf_dec.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_dec.RData -------------------------------------------------------------------------------- /data/ml_rf_grid_perf.data: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_grid_perf.data -------------------------------------------------------------------------------- /data/ml_rf_h2o.grid.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_h2o.grid.RData -------------------------------------------------------------------------------- /data/ml_rf_hypergrid_oobrmse.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_hypergrid_oobrmse.RData -------------------------------------------------------------------------------- /data/ml_rf_m2.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_m2.RData -------------------------------------------------------------------------------- /data/ml_rf_oob_comp.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_oob_comp.RData -------------------------------------------------------------------------------- /data/ml_rf_optimal_ranger.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_optimal_ranger.RData -------------------------------------------------------------------------------- /data/ml_rf_random_grid.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_random_grid.RData -------------------------------------------------------------------------------- /data/ml_rf_xgb.fit1.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_xgb.fit1.RData 
-------------------------------------------------------------------------------- /data/ml_rf_xgb.fit3.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_xgb.fit3.RData -------------------------------------------------------------------------------- /data/model_auc.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/model_auc.RData -------------------------------------------------------------------------------- /data/model_eta.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/model_eta.RData -------------------------------------------------------------------------------- /data/negative-words.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/negative-words.txt -------------------------------------------------------------------------------- /data/osmsa_PLZ_14.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/osmsa_PLZ_14.RData -------------------------------------------------------------------------------- /data/titanic.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/titanic.RData -------------------------------------------------------------------------------- /machine_learning.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /misc/along.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Along the project" 3 | author: "Jan-Philipp Kolb" 4 | date: "16 1 2020" 5 | output: beamer_presentation 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = FALSE) 10 | ``` 11 | 12 | ## 13 | 14 | [The Lasso under Heteroscedasticity](https://statistics.berkeley.edu/sites/default/files/tech-reports/783.pdf) 15 | 16 | https://sites.google.com/site/nationalekonomigrunder/regression-analysis/assumptions -------------------------------------------------------------------------------- /rcode/GESISPanel.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=FALSE------------------------------------------------ 2 | knitr::opts_chunk$set(echo = TRUE) 3 | 4 | ## ------------------------------------------------------------------------ 5 | wave <- "fb" 6 | 7 | ## ------------------------------------------------------------------------ 8 | wavedatapath <- "J:/Work/GESISPanel_DATA/01_post_processing/c01/f_2018/fb/02_master/data/STATA14/" 9 | 10 | ## 
------------------------------------------------------------------------ 11 | setwd(wavedatapath) 12 | dat <- readstata13::read.dta13("fb_master_20180814_COMPLETE.dta",convert.factors = F) 13 | 14 | ## ------------------------------------------------------------------------ 15 | ncol(dat) 16 | 17 | ## ------------------------------------------------------------------------ 18 | indvar_aapor <- grep("za006a",colnames(dat)) 19 | 20 | colnames(dat)[indvar_aapor] 21 | 22 | ## ------------------------------------------------------------------------ 23 | waves <- paste0(rep(letters[1:6],each=6,),rep(letters[1:6],6)) 24 | waves <- waves[-which(waves%in%c("ad","ae","af","fc","fd","fe","ff"))] 25 | 26 | G_response_list <- list() 27 | for (i in 1:length(waves)){ 28 | ind_aapor <- which(colnames(dat)==paste0(waves[i],"za006a")) 29 | respvar <- dat[,ind_aapor] 30 | dat1 <- dat[!="-22",] 31 | G_response <- rep(0,nrow(dat1)) 32 | G_response[dat1[,ind_aapor]%in%c("211","212","319","21121","211221")] <- 1 33 | G_response_list[[i]] <- G_response 34 | } 35 | 36 | sumtab_resp <- lapply(G_response_list,table) 37 | 38 | sumtab_resp2 <- do.call(rbind, sumtab_resp) 39 | 40 | ## ------------------------------------------------------------------------ 41 | table(dat$D_response) 42 | 43 | -------------------------------------------------------------------------------- /rcode/c2b_random_forests_h2o.R: -------------------------------------------------------------------------------- 1 | #' --- 2 | #' title: "Random Forests with h2o" 3 | #' author: "Jan-Philipp Kolb" 4 | #' date: "24 Mai 2019" 5 | #' output: html_document 6 | #' --- 7 | #' 8 | ## ----setup, include=FALSE------------------------------------------------ 9 | knitr::opts_chunk$set(echo = TRUE) 10 | 11 | #' 12 | #' ## The Ames housing data 13 | #' 14 | ## ------------------------------------------------------------------------ 15 | set.seed(123) 16 | ames_split <- rsample::initial_split(AmesHousing::make_ames(), 17 | prop = .7) 18 | ames_train <- rsample::training(ames_split) 19 | ames_test <- rsample::testing(ames_split) 20 | 21 | #' 22 | #' 23 | #' 24 | #' ## Full grid search with H2O 25 | #' 26 | ## ------------------------------------------------------------------------ 27 | library(h2o) # an extremely fast java-based platform 28 | 29 | #' 30 | #' 31 | #' - If you ran the grid search code above you probably noticed the code took a while to run. 32 | #' - `ranger` is computationally efficient, but as the grid search space expands, the manual for loop process becomes less efficient. 33 | #' - `h2o` is a powerful and efficient java-based interface that provides parallel distributed algorithms. 34 | #' - `h2o` allows for different optimal search paths in our grid search. This allows us to be more efficient in tuning our models. Here, I demonstrate how to tune a random forest model with `h2o`. Lets go ahead and start up h2o: 35 | #' 36 | #' 39 | #' 40 | #' 41 | #' 42 | ## ------------------------------------------------------------------------ 43 | # start up h2o 44 | h2o.no_progress() 45 | h2o.init(max_mem_size = "5g") 46 | 47 | #' 48 | #' ## Random forests with `h2o` 49 | #' 50 | #' - We can try a comprehensive (full cartesian) grid search, which means we will examine every combination of hyperparameter settings that we specify in `hyper_grid.h2o`. 51 | #' - We search across 96 models but since we perform a full cartesian search this process is not any faster. 
52 | #' - Note that the best performing model has an OOB RMSE of 24504, which is lower than what we achieved previously. 53 | #' - This is because some of the default settings regarding minimum node size, tree depth, etc. are more “generous” than in `ranger` and `randomForest`. 54 | #' - E.g. `h2o` has a default minimum node size of one whereas the `ranger` and `randomForest` default is 5. 55 | #' 56 | #' 57 | #' ## Preparation for `h2o` 58 | #' 59 | ## ------------------------------------------------------------------------ 60 | # create feature names 61 | y <- "Sale_Price" 62 | x <- setdiff(names(ames_train), y) 63 | # turn training set into h2o object 64 | train.h2o <- as.h2o(ames_train) 65 | # hyperparameter grid 66 | hyper_grid.h2o <- list( 67 | ntrees = seq(200, 500, by = 100), 68 | mtries = seq(20, 30, by = 2), 69 | sample_rate = c(.55, .632, .70, .80) 70 | ) 71 | 72 | #' 73 | #' ## 74 | #' 75 | #' 78 | #' 79 | ## ----eval=F-------------------------------------------------------------- 80 | ## # build grid search 81 | ## grid <- h2o.grid( 82 | ## algorithm = "randomForest", 83 | ## grid_id = "rf_grid", 84 | ## x = x, 85 | ## y = y, 86 | ## training_frame = train.h2o, 87 | ## hyper_params = hyper_grid.h2o, 88 | ## search_criteria = list(strategy = "Cartesian") 89 | ## ) 90 | 91 | #' 92 | ## ----eval=F,echo=F------------------------------------------------------- 93 | ## save(grid,file="../data/ml_rf_h2o.grid.RData") 94 | 95 | #' 96 | ## ----echo=F-------------------------------------------------------------- 97 | load("../data/ml_rf_h2o.grid.RData") 98 | 99 | #' 100 | #' 101 | ## ------------------------------------------------------------------------ 102 | # collect the results and sort by our model performance 103 | # metric of choice 104 | grid_perf <- h2o.getGrid( 105 | grid_id = "rf_grid", 106 | sort_by = "mse", 107 | decreasing = FALSE 108 | ) 109 | 110 | #' 111 | #' ## 112 | #' 113 | ## ----eval=F,echo=F------------------------------------------------------- 114 | ## save(grid_perf,file = "../data/ml_rf_grid_perf.data") 115 | 116 | #' 117 | ## ----echo=F,eval=T------------------------------------------------------- 118 | load("../data/ml_rf_grid_perf.data") 119 | 120 | #' 121 | #' 122 | ## ----eval=T-------------------------------------------------------------- 123 | print(grid_perf) 124 | 125 | #' 126 | #' 127 | #' ## Combinatorial explosion 128 | #' 129 | #' - Because of the [**combinatorial explosion**](https://en.wikipedia.org/wiki/Combinatorial_explosion), each additional hyperparameter has a huge effect on the run time. 130 | #' - `h2o` provides an additional grid search path called “RandomDiscrete”, which jumps from one random combination to another and stops once a certain level of improvement has been reached, a certain amount of time has been exceeded, or a certain number of models has been run (or a combination of these criteria has been met). 131 | #' - A random discrete search path will likely not find the optimal model, but it does a good job of finding a very good model. 132 | #' 133 | #' - E.g., the following code searches 2,025 hyperparameter combinations. 134 | #' - Our random grid search will stop if none of the last 10 models provides a 0.5% improvement in MSE. 135 | #' - If we keep finding improvements, the grid search is cut off after 1,800 seconds (30 minutes). 136 | #' - Our grid search assessed 190 models and the best model (max_depth = 30, min_rows = 1, mtries = 25, nbins = 30, ntrees = 200, sample_rate = .8) achieved an RMSE of 24686. 137 | #'
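#' - A quick way to verify that count (a sketch, not part of the original script): once the `hyper_grid.h2o` list in the next chunk has been created, the size of the full grid follows directly from the number of values per hyperparameter.
#'
## ----eval=F--------------------------------------------------------------
## # number of hyperparameter combinations in the random-search grid below:
## # 3 * 3 * 5 * 3 * 5 * 3 = 2025
## prod(lengths(hyper_grid.h2o))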
138 | #' 139 | #' ## 140 | #' 141 | ## ------------------------------------------------------------------------ 142 | # hyperparameter grid 143 | hyper_grid.h2o <- list( 144 | ntrees = seq(200, 500, by = 150), 145 | mtries = seq(15, 35, by = 10), 146 | max_depth = seq(20, 40, by = 5), 147 | min_rows = seq(1, 5, by = 2), 148 | nbins = seq(10, 30, by = 5), 149 | sample_rate = c(.55, .632, .75) 150 | ) 151 | 152 | #' 153 | ## ------------------------------------------------------------------------ 154 | # random grid search criteria 155 | search_criteria <- list( 156 | strategy = "RandomDiscrete", 157 | stopping_metric = "mse", 158 | stopping_tolerance = 0.005, 159 | stopping_rounds = 10, 160 | max_runtime_secs = 30*60 161 | ) 162 | 163 | #' 164 | #' 165 | #' ## 166 | #' 167 | #' 170 | #' 171 | ## ------------------------------------------------------------------------ 172 | # build grid search 173 | random_grid <- h2o.grid( 174 | algorithm = "randomForest", 175 | grid_id = "rf_grid2", 176 | x = x, 177 | y = y, 178 | training_frame = train.h2o, 179 | hyper_params = hyper_grid.h2o, 180 | search_criteria = search_criteria 181 | ) 182 | 183 | #' 184 | ## ------------------------------------------------------------------------ 185 | # collect the results and sort by our model performance 186 | # metric of choice 187 | grid_perf2 <- h2o.getGrid( 188 | grid_id = "rf_grid2", 189 | sort_by = "mse", 190 | decreasing = FALSE 191 | ) 192 | 193 | #' 194 | ## ----eval=F,echo=F------------------------------------------------------- 195 | ## save(random_grid,grid_perf2,file="../data/ml_rf_random_grid.RData") 196 | 197 | #' 198 | ## ----echo=F,eval=T------------------------------------------------------- 199 | load("../data/ml_rf_random_grid.RData") 200 | 201 | #' 202 | #' 203 | #' ## 204 | #' 205 | ## ----eval=T-------------------------------------------------------------- 206 | print(grid_perf2) 207 | 208 | #' 209 | #' ## Hold-out test 210 | #' 211 | #' - Once we’ve identifed the best model we can get that model and apply it to our hold-out test set to compute our final test error. 212 | #' 213 | ## ------------------------------------------------------------------------ 214 | # Grab the model_id for the top model, 215 | # chosen by validation error 216 | best_model_id <- grid_perf2@model_ids[[1]] 217 | best_model <- h2o.getModel(best_model_id) 218 | 219 | #' 220 | #' 223 | #' 224 | ## ----eval=F-------------------------------------------------------------- 225 | ## # Now let’s evaluate the model performance on a test set 226 | ## ames_test.h2o <- as.h2o(ames_test) 227 | ## best_model_perf <- h2o.performance(model = best_model, 228 | ## newdata = ames_test.h2o) 229 | ## 230 | ## # RMSE of best model 231 | ## h2o.mse(best_model_perf) %>% sqrt() 232 | 233 | #' 234 | #' 235 | #' - We have reduced our RMSE to near 23,000, which is a 10K reduction compared to elastic nets and bagging. 
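#'
#' - To generate actual predictions from the selected model (a sketch, not part of the original code; it assumes the `ames_test.h2o` frame created in the chunk above):
#'
## ----eval=F--------------------------------------------------------------
## # score the hold-out set with the best model and pull the result back into R
## pred.h2o <- h2o.predict(best_model, newdata = ames_test.h2o)
## head(as.data.frame(pred.h2o))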
236 | #' 237 | #' 238 | #' ## Links 239 | #' 240 | #' - [Download h2o](http://h2o.ai/download/) 241 | #' 242 | -------------------------------------------------------------------------------- /rcode/c3b_gbm_regression_h2o.R: -------------------------------------------------------------------------------- 1 | #' --- 2 | #' title: "Gradient boosting with h2o" 3 | #' author: "Jan-Philipp Kolb" 4 | #' date: "24 Mai 2019" 5 | #' output: ioslides_presentation 6 | #' --- 7 | #' 8 | ## ----setup, include=FALSE------------------------------------------------ 9 | knitr::opts_chunk$set(echo = FALSE) 10 | 11 | #' 12 | #' 13 | #' ## h2o 14 | #' 15 | ## ------------------------------------------------------------------------ 16 | library(h2o) # a java-based platform 17 | 18 | #' 19 | #' 20 | #' The h2o R package is a powerful and efficient java-based interface that allows for local and cluster-based deployment. It comes with a fairly comprehensive online resource that includes methodology and code documentation along with tutorials. 21 | #' 22 | #' ## Features include: 23 | #' 24 | #' - Distributed and parallelized computation on either a single node or a multi-node cluster. 25 | #' - Automatic early stopping based on convergence of user-specified metrics to user-specified relative tolerance. 26 | #' - Stochastic GBM with column and row sampling (per split and per tree) for better generalization. 27 | #' - Support for exponential families (Poisson, Gamma, Tweedie) and loss functions in addition to binomial (Bernoulli), Gaussian and multinomial distributions, such as Quantile regression (including Laplace). 28 | #' - Grid search for hyperparameter optimization and model selection. 29 | #' - Data-distributed, which means the entire dataset does not need to fit into memory on a single node, hence scales to any size training set. 30 | #' - Uses histogram approximations of continuous variables for speedup. 31 | #' - Uses dynamic binning - bin limits are reset at each tree level based on the split bins’ min and max values discovered during the last pass. 32 | #' - Uses squared error to determine optimal splits. 33 | #' 36 | #' - Unlimited factor levels. 37 | #' - Multiclass trees (one for each class) built in parallel with each other. 38 | #' - Apache 2.0 Licensed. 39 | #' - Model export in plain Java code for deployment in production environments. 
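#'
#' ## A minimal example
#'
#' - The slides above list features only; the following sketch (not part of the original material) shows what a basic `h2o.gbm()` fit could look like. It assumes the Ames training data and the `train.h2o` frame created in the random-forest slides:
#'
## ----eval=F--------------------------------------------------------------
## # illustrative hyperparameters only - in practice tune them via h2o.grid()
## h2o.init(max_mem_size = "5g")
## gbm_h2o_fit <- h2o.gbm(
##   x = setdiff(names(ames_train), "Sale_Price"),
##   y = "Sale_Price",
##   training_frame = train.h2o,
##   ntrees = 500,
##   learn_rate = 0.05,
##   max_depth = 5
## )
## h2o.rmse(gbm_h2o_fit)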
40 | #' 41 | #' ## 42 | -------------------------------------------------------------------------------- /rcode/creating_rcode.R: -------------------------------------------------------------------------------- 1 | # Jan-Philipp Kolb 2 | # Thu May 23 13:02:28 2019 3 | 4 | #-------------------------------------------------# 5 | # Installing necessary packages 6 | #-------------------------------------------------# 7 | 8 | necpackages <- c("knitr","rmarkdown","tidyverse") 9 | 10 | 11 | for (i in 1:length(necpackages)){ 12 | if (!require(necpackages[i])){ 13 | install.packages(necpackages[i]) 14 | } 15 | library(necpackages[i]) 16 | } 17 | 18 | #-------------------------------------------------# 19 | # Load libraries 20 | #-------------------------------------------------# 21 | 22 | library(knitr) 23 | library(rmarkdown) 24 | library(lme4) 25 | 26 | #-------------------------------------------------# 27 | # Define paths 28 | #-------------------------------------------------# 29 | 30 | main_path <- "D:/Daten/GitHub/machine_learning/" 31 | main_path <- "D:/github/machine_learning/" 32 | slide_path <- paste0(main_path,"slides/") 33 | rcode_path <- paste0(main_path,"rcode/") 34 | 35 | #-------------------------------------------------# 36 | # Parts of the presentation 37 | #-------------------------------------------------# 38 | 39 | dirnamen <- dir(slide_path) 40 | presparts <- grep(".Rmd",dirnamen,value = T) 41 | 42 | 43 | # setwd("D:/gitlab/IntroDataAnalysis/rcode/") 44 | setwd(rcode_path) 45 | 46 | for (i in 1:length(presparts)){ 47 | purl(paste0("../slides/",presparts[i]),documentation = 2) 48 | } 49 | 50 | #-------------------------------------------------# 51 | # Creating pdf slides 52 | #-------------------------------------------------# 53 | 54 | # setwd("D:/Daten/GitLab/IntroDataAnalysis/slides") 55 | setwd(slide_path) 56 | 57 | 58 | for (i in 1:length(presparts)){ 59 | rmarkdown::render(presparts[i], "beamer_presentation") 60 | } 61 | 62 | 63 | for (i in 1:length(presparts)){ 64 | rmarkdown::render(presparts[i], "all") 65 | } 66 | 67 | 68 | for (i in 3:length(presparts)){ 69 | rmarkdown::render(presparts[i], "md_document") 70 | } 71 | 72 | # B1_DataProcessing 73 | 74 | 75 | #-------------------------------------------------# 76 | # Create rcode in course 77 | #-------------------------------------------------# 78 | 79 | setwd(rcode_path) 80 | 81 | purl("../slides/C2_hierarchMods.Rmd",documentation = 2) 82 | purl("../slides/D1_webScrapping.Rmd",documentation = 2) 83 | purl("../slides/D2_dataCleaning.Rmd",documentation = 2) 84 | 85 | #-------------------------------------------------# 86 | # Install necessary packages 87 | #-------------------------------------------------# 88 | 89 | 90 | install.packages("lme4") 91 | 92 | #-------------------------------------------------# 93 | # Links 94 | #-------------------------------------------------# 95 | 96 | 97 | # https://rmarkdown.rstudio.com/authoring_quick_tour.html 98 | # https://www.r-bloggers.com/function-to-simplify-loading-and-installing-packages/ -------------------------------------------------------------------------------- /rcode/e_Clustering.R: -------------------------------------------------------------------------------- 1 | #' --- 2 | #' title: "Clustering" 3 | #' author: "Jan-Philipp Kolb and Alexander Murray-Watters" 4 | #' date: "18 Januar 2019" 5 | #' output: beamer_presentation 6 | #' --- 7 | #' 8 | ## ----setupClustering, include=FALSE-------------------------------------- 9 | knitr::opts_chunk$set(echo = FALSE) 10 | 
11 | #' 12 | #' 13 | #' 14 | #' 15 | #' ## Resources 16 | #' 17 | #' 18 | ## ----echo=F, eval=FALSE-------------------------------------------------- 19 | ## slides_path <- getwd() 20 | ## git_path <- gsub("slides","",slides_path) 21 | ## if (Sys.info()$nodename=="MAC14077"){ 22 | ## git_path <- "D:/Daten/GitHub/machine_learning/" 23 | ## slides_path <- paste0(git_path,"/slides") 24 | ## } 25 | 26 | #' 27 | #' 28 | #' - [Package `kknn`](https://cran.r-project.org/web/packages/kknn/kknn.pdf) 29 | #' 30 | ## ----eval=F-------------------------------------------------------------- 31 | ## install.packages("kknn") 32 | 33 | #' 34 | ## ------------------------------------------------------------------------ 35 | library("kknn") 36 | 37 | #' 38 | #' 39 | #' ## [Geographic clustering of UK cities](https://www.r-bloggers.com/geographic-clustering-of-uk-cities/) 40 | #' 41 | #' Animated example: 42 | #' https://towardsdatascience.com/the-5-clustering-algorithms-data-scientists-need-to-know-a36d136ef68 43 | #' 44 | #' 45 | #' ## Exercise: Kmeans 46 | #' 47 | #' Apply kmeans to the `iris` dataset with 2, 3, and 4 48 | #' clusters. Produce three scatter plots, with the points colored 49 | #' according to cluster assignment. 50 | #' 51 | #' 52 | #' ## hdbscan 53 | #' 54 | #' A fairly new alternative to kmeans, hdbscan does not require you to 55 | #' specify the number of clusters to be assigned. It only requires a 56 | #' decision as to the minimum number of points needed to be included in a 57 | #' cluster. This minimum number acts as a smoothing parameter (such as a 58 | #' density bandwidth parameter or a histogram's bin/bar width), with lower 59 | #' values finding more clusters. Other advantages of hdbscan include its ability to handle clusters of varying density and to label points that fit no cluster as noise. 60 | #' 61 | ## ---- eval=FALSE--------------------------------------------------------- 62 | ## install.packages("dbscan") 63 | 64 | #' 65 | #' 66 | #' 67 | ## ------------------------------------------------------------------------ 68 | library(ggplot2) 69 | library(dplyr) 70 | library(maps) 71 | library(dbscan) 72 | 73 | ## Example where kmeans finds only 1 cluster. 74 | two.clust.eg <- rbind(matrix(rnorm(1000, sd = 0.8), ncol=2), 75 | matrix(rnorm(100, mean = 120, sd = 0.12), ncol = 2)) 76 | 77 | clust <- kmeans(two.clust.eg, centers=2) 78 | 79 | plot(two.clust.eg, col = clust$cluster) 80 | ## points(clust$centers, col = 1:2, pch = 8, cex = 2) 81 | 82 | 83 | 84 | #' 85 | ## ------------------------------------------------------------------------ 86 | 87 | 88 | 89 | 90 | 91 | data(moons) 92 | 93 | ## Running HDBscan with the minimum number of points set to 3. 94 | res <- dbscan::hdbscan(moons, minPts = 3) 95 | 96 | plot(moons, col = res$cluster + 1, main="R implementation") 97 | 98 | #' 99 | #' 100 | #' 101 | #' ## Exercise: Apply kmeans to the moons dataset and compare the results. 102 | #' -- Be sure to try different numbers of centers. 103 | #' 104 | #' 105 | #' ## Exercise: Apply hdbscan to the moons dataset with different minimums for the number of points. 106 | #' 107 | #' ## Exercise: Apply both kmeans and hdbscan to the `ChickWeight` dataset's "weight" and "Time" variables, and see how well you can get each to perform.
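#'
#' A possible starting point for the moons/kmeans comparison above (a sketch, not one of the original solutions):
#'
## ---- eval=FALSE---------------------------------------------------------
## ## fit kmeans to the moons data for several values of k and plot side by side
## par(mfrow = c(1, 3))
## for (k in 2:4) {
##   km <- kmeans(moons, centers = k)
##   plot(moons, col = km$cluster + 1, main = paste("kmeans, k =", k))
## }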
108 | #' 109 | #' 110 | #' 111 | #' 112 | ## ---- eval=FALSE, echo=FALSE--------------------------------------------- 113 | ## ## kmeans 114 | ## plot(ChickWeight[,1:2], col=kmeans(ChickWeight[,1:2], centers=4)$centers) 115 | ## 116 | ## ## hdbscan, minPts=10 117 | ## plot(ChickWeight[,1:2], col=dbscan::hdbscan(ChickWeight[,1:2], minPts=10)$cluster) 118 | ## 119 | ## ## Diet cat. for comparison. 120 | ## plot(ChickWeight[,1:2], col=ChickWeight$Diet) 121 | ## 122 | ## ## Chick cat. for comparison. 123 | ## plot(ChickWeight[,1:2], col=ChickWeight$Chick) 124 | ## 125 | ## 126 | 127 | #' 128 | #' 129 | ## ---- eval=FALSE--------------------------------------------------------- 130 | ## load(paste0(git_path,"/data/osmsa_PLZ_14.RData")) 131 | 132 | #' 133 | #' 134 | #' 135 | #' ## [US Census Data](https://elitedatascience.com/datasets) 136 | #' 137 | #' - [US Census Data (Clustering)](https://archive.ics.uci.edu/ml/datasets/US+Census+Data+%281990%29) – Clustering based on demographics is a tried and true way to perform market research and segmentation. 138 | #' 139 | #' 140 | #' 141 | #' ## Links 142 | #' 143 | #' - [Using clusterlab to benchmark clustering algorithms](https://www.r-bloggers.com/using-clusterlab-to-benchmark-clustering-algorithms/) 144 | -------------------------------------------------------------------------------- /rcode/fitting.R: -------------------------------------------------------------------------------- 1 | simbias <- function(seed=8765){ 2 | # The default seed guarantees a nice histogram. This is the only 3 | # reason that accepting the default, x1c <- simbias(), is required in the lesson. 4 | # The effect will be evident with other seeds as well. 5 | set.seed(seed) 6 | temp <- rnorm(100) 7 | # Point A 8 | x1 <- (temp + rnorm(100))/sqrt(2) 9 | x2 <- (temp + rnorm(100))/sqrt(2) 10 | x3 <- rnorm(100) 11 | # Function to simulate regression of y on 2 variables. 12 | f <- function(k){ 13 | # Point B 14 | y <- x1 + x2 + x3 + .3*rnorm(100) 15 | # Point C 16 | c(lm(y ~ x1 + x2)$coef[2], 17 | lm(y ~ x1 + x3)$coef[2]) 18 | } 19 | # Point D 20 | sapply(1:150, f) 21 | } 22 | 23 | # Illustrate the effect of bogus regressors on residual squared error. 24 | bogus <- function(){ 25 | temp <- swiss 26 | # Add 41 columns of random regressors to a copy of the swiss data. 27 | for(n in 1:41){temp[,paste0("random",n)] <- rnorm(nrow(temp))} 28 | # Define a function to compute the deviance of Fertility regressed 29 | # on all regressors up to column n. The function, deviance(model), computes 30 | # the residual sum of squares of the model given as its argument. 31 | f <- function(n){deviance(lm(Fertility ~ ., temp[,1:n]))} 32 | # Apply f to data from n=6, i.e., the legitimate regressors, 33 | # through n=47, i.e., a full complement of bogus regressors. 34 | rss <- sapply(6:47, f) 35 | # Display result. 36 | plot(0:41, rss, xlab="Number of bogus regressors.", ylab="Residual squared error.", 37 | main="Residual Squared Error for Swiss Data\nUsing Irrelevant (Bogus) Regressors", 38 | pch=21, bg='red') 39 | } 40 | 41 | # Plot histograms illustrating bias in estimates of a regressor 42 | # coefficient 1) when an uncorrelated regressor is missing and 43 | # 2) when a correlated regressor is missing. 
44 | x1hist <- function(x1c){ 45 | p1 <- hist(x1c[1,], plot=FALSE) 46 | p2 <- hist(x1c[2,], plot=FALSE) 47 | yrange <- c(0, max(p1$counts, p2$counts)) 48 | plot(p1, col=rgb(0,0,1,1/4), xlim=range(x1c), ylim=yrange, xlab="Estimated coefficient of x1", 49 | main="Bias Effect of Omitted Regressor") 50 | plot(p2, col=rgb(1,0,0,1/4), xlim=range(x1c), ylim=yrange, add=TRUE) 51 | legend(1.1, 40, c("Uncorrelated regressor, x3, omitted", "Correlated regressor, x2, omitted"), 52 | fill=c(rgb(0,0,1,1/4), rgb(1,0,0,1/4))) 53 | } 54 | 55 | -------------------------------------------------------------------------------- /rcode/incourse1.R: -------------------------------------------------------------------------------- 1 | # Jan-Phillip Kolb 2 | # 3 | 4 | 5 | # install.packages("lme4") 6 | 7 | library(lme4) 8 | 9 | install.packages("keras") 10 | 11 | # to coop overfitting 12 | install.packages("glmnet") 13 | 14 | # xgboost 15 | 16 | install.packages("xgboost") 17 | 18 | install.packages("rpart") 19 | 20 | install.packages("gbm") 21 | 22 | install.packages("nnet") 23 | 24 | ?knn 25 | 26 | ?kmeans 27 | 28 | kmeans() 29 | 30 | install.packages("tidyverse") 31 | 32 | ############################# 33 | 34 | path1<-"https://raw.githubusercontent.com/" 35 | path2<- "thomaspernet/data_csv_r/master/data/" 36 | dname <- "titanic_csv.csv" 37 | titanic <- read.csv(paste0(path1,path2,dname)) 38 | 39 | data(Titanic) 40 | head(Titanic) 41 | 42 | install.packages("datasets.load") 43 | 44 | install.packages("colourpicker") 45 | c("#8B2323", "#7FFFD4") 46 | 47 | 48 | # lme4:: 49 | 50 | ### Exercise swiss data 51 | 52 | # 1) 53 | data(swiss) 54 | dim(swiss) 55 | nrow(swiss) 56 | ncol(swiss) 57 | 58 | head(swiss,n=10) 59 | tail(swiss) 60 | View(swiss) 61 | str(swiss) 62 | 63 | # install.packages("DT") 64 | 65 | DT::datatable(swiss) 66 | 67 | #### 68 | 69 | data(airquality) 70 | 71 | (airq <- data.table::data.table(airquality)) 72 | 73 | airq 74 | 75 | rm(airq) 76 | 77 | ### Solution: random number 78 | 79 | set.seed(10) 80 | (x <- runif(8)) 81 | 82 | 83 | round(exp(diff(log(x))), 1) 84 | 85 | clean_titanic <- titanic %>% 86 | mutate(pclass=factor(pclass,levels = c(1, 2, 3), 87 | labels=c('Upper','Middle','Lower')), 88 | survived = factor(survived,levels = c(0, 1), 89 | labels=c('No', 'Yes'))) %>% 90 | na.omit() 91 | 92 | library(dplyr) 93 | 94 | tit_wna <- na.omit(titanic) 95 | 96 | # mutate(tit_wna,...) 
97 | 98 | clean_titanic <- mutate(,pclass=factor(pclass,levels = c(1, 2, 3), 99 | labels=c('Upper','Middle','Lower')))) 100 | 101 | 102 | numerics <- c(1,2,3) 103 | str(numerics) 104 | 105 | charvec <- c("hj",7,"iu") 106 | str(charvec) 107 | 108 | ab <- as.factor(c(1,2,1,2)) 109 | str(ab) 110 | ######################### 111 | 112 | library(dplyr) 113 | library(tidyr) 114 | stocks <- tibble( 115 | time = as.Date('2009-01-01') + 0:9, 116 | X = rnorm(10, 0, 1), 117 | Y = rnorm(10, 0, 2), 118 | Z = rnorm(10, 0, 4) 119 | ) 120 | 121 | 122 | head(gather(stocks, "stock", "price", -time)) 123 | -------------------------------------------------------------------------------- /rcode/incourse2.R: -------------------------------------------------------------------------------- 1 | # Jan-Philipp Kolb 2 | # Mon Jun 03 16:47:47 2019 3 | # In course part 2 4 | 5 | data(mtcars) 6 | 7 | m1 <- lm(mpg~wt,data=mtcars) 8 | 9 | sum_mod <- summary(m1) 10 | sum_mod$coefficients 11 | 12 | ############################## 13 | 14 | dev.off() 15 | 16 | plot(mtcars$wt,mtcars$mpg) 17 | abline(m1) 18 | segments(mtcars$wt, mtcars$mpg, mtcars$wt, pre, col="red") 19 | 20 | ################################# 21 | 22 | 23 | ames_data <- AmesHousing::make_ames()# 1) 24 | # alternative 25 | library(AmesHousing) 26 | ames_data <- make_ames() 27 | 28 | 29 | colnames(ames_data) 30 | m1 <- lm(Sale_Price ~ Gr_Liv_Area + TotRms_AbvGrd, data = ames_data) 31 | m2 <- lm(Sale_Price ~ Gr_Liv_Area, data = ames_data) 32 | m3 <- lm(Sale_Price ~ TotRms_AbvGrd, data = ames_data) 33 | 34 | m1$coefficients 35 | m2$coefficients 36 | m3$coefficients 37 | 38 | ########## 39 | 40 | for (i in 1:3){ 41 | eval(parse(text=paste0("summary(m",i,")"))) 42 | } 43 | 44 | ################################# 45 | 46 | ?glmnet 47 | 48 | library(AmesHousing) 49 | ames_data <- AmesHousing::make_ames() 50 | 51 | ncol(ames_data) 52 | 53 | ames_train_x <- model.matrix(Sale_Price ~ ., ames_train)[, -1] 54 | ames_train_y <- log(ames_train$Sale_Price) 55 | ames_test_x <- model.matrix(Sale_Price ~ ., ames_test)[, -1] 56 | ames_test_y <- log(ames_test$Sale_Price) 57 | 58 | library(glmnet) 59 | ames_ridge <- glmnet(x = ames_train_x,y = ames_train_y, 60 | alpha = 0) 61 | 62 | coef(ames_ridge) 63 | 64 | #################################### 65 | 66 | install.packages("lars") 67 | library(lars) # 1) 68 | data(diabetes) 69 | 70 | 71 | library(glmnet) #2) 72 | # Create the scatterplots 73 | set.seed(1234) 74 | par(mfrow=c(2,5)) 75 | for(i in 1:10){ # 3) 76 | plot(diabetes$x[,i], diabetes$y) 77 | abline(lm(diabetes$y~diabetes$x[,i]),col="red") 78 | } 79 | 80 | model_ols <- lm(diabetes$y ~ diabetes$x) # 4) 81 | summary(model_ols) 82 | 83 | lambdas <- 10^seq(7, -3) 84 | model_ridge <- glmnet(diabetes$x, diabetes$y, 85 | alpha = 0, lambda = lambdas) 86 | plot.glmnet(model_ridge, xvar = "lambda", label = TRUE) 87 | 88 | cv_fit <- cv.glmnet(x=diabetes$x, y=diabetes$y, 89 | alpha = 0, nlambda = 1000) 90 | cv_fit$lambda.min 91 | 92 | plot.cv.glmnet(cv_fit) 93 | 94 | fit <- glmnet(x=diabetes$x, y=diabetes$y, 95 | alpha = 0, lambda=cv_fit$lambda.min) 96 | fit$beta 97 | 98 | fit <- glmnet(x=diabetes$x, y=diabetes$y, 99 | alpha = 0, lambda=cv_fit$lambda.1se) 100 | fit$beta 101 | 102 | # install.packages("rpart") 103 | 104 | library(caret) 105 | intrain <- createDataPartition(y=diabetes$y, 106 | p = 0.8, 107 | list = FALSE) 108 | training <- diabetes[intrain,] 109 | testing <- diabetes[-intrain,] 110 | 111 | cv_ridge <- cv.glmnet(x=training$x, y=training$y, 112 | alpha = 0, nlambda = 1000) 113 
| ridge_reg <- glmnet(x=training$x, y=training$y, 114 | alpha = 0, lambda=cv_ridge$lambda.min) 115 | ridge_reg$beta 116 | 117 | ridge_reg <- glmnet(x=training$x, y=training$y, 118 | alpha = 0, lambda=cv_ridge$lambda.1se) 119 | ridge_reg$beta 120 | 121 | ridge_reg <- glmnet(x=training$x, y=training$y, 122 | alpha = 0, lambda=cv_ridge$lambda.min) 123 | ridge_pred<-predict.glmnet(ridge_reg, 124 | s = cv_ridge$lambda.min,newx = testing$x) 125 | sd((ridge_pred - testing$y)^2)/sqrt(length(testing$y)) 126 | 127 | 128 | ridge_reg <- glmnet(x=training$x, y=training$y, 129 | alpha = 0, lambda=cv_ridge$lambda.1se) 130 | ridge_pred <- predict.glmnet(ridge_reg, 131 | s = cv_ridge$lambda.1se, newx = testing$x) 132 | sd((ridge_pred - testing$y)^2)/sqrt(length(testing$y)) 133 | 134 | ols_reg <- lm(y ~ x, data = training) 135 | summary(ols_reg) 136 | 137 | ols_pred <- predict(ols_reg, newdata=testing$x, 138 | type = "response") 139 | sd((ols_pred - testing$y)^2)/sqrt(length(testing$y)) 140 | 141 | coef(model_ols) 142 | 143 | 144 | library(Metrics) 145 | mse(testing$y,ols_pred) 146 | mse(ridge_pred,testing$y) 147 | -------------------------------------------------------------------------------- /rcode/preparing_bagging.R: -------------------------------------------------------------------------------- 1 | # Jan-Philipp Kolb 2 | # Thu May 02 11:09:41 2019 3 | # Source: https://www.r-bloggers.com/machine-learning-explained-bagging/ 4 | 5 | require(data.table) 6 | library(rpart) 7 | require(ggplot2) 8 | 9 | set.seed(456) 10 | 11 | ## Reading data 12 | bagging_data <- data.table(airquality) 13 | 14 | ggplot(bagging_data,aes(Wind,Ozone))+geom_point()+ 15 | ggtitle("Ozone vs wind speed") 16 | 17 | data_test <- na.omit(bagging_data[,.(Ozone,Wind)]) 18 | 19 | ## Training data 20 | 21 | train_index <- sample.int(nrow(data_test), 22 | size=round(nrow(data_test)*0.8), 23 | replace = F) 24 | 25 | data_test[train_index,train:=TRUE][-train_index,train:=FALSE] 26 | 27 | ## Model without bagging 28 | no_bag_model <- rpart(Ozone~Wind,data_test[train_index],control=rpart.control(minsplit=6)) 29 | result_no_bag <- predict(no_bag_model,bagging_data) 30 | 31 | ##Training of the bagged model 32 | n_model=100 33 | bagged_models=list() 34 | for (i in 1:n_model) 35 | { 36 | new_sample=sample(train_index,size=length(train_index),replace=T) 37 | bagged_models=c(bagged_models,list(rpart(Ozone~Wind,data_test[new_sample],control=rpart.control(minsplit=6)))) 38 | } 39 | 40 | ##Getting estimate from the bagged model 41 | bagged_result=NULL 42 | i=0 43 | for (from_bag_model in bagged_models) 44 | { 45 | if (is.null(bagged_result)) 46 | bagged_result=predict(from_bag_model,bagging_data) 47 | else 48 | bagged_result=(i*bagged_result+predict(from_bag_model,bagging_data))/(i+1) 49 | i=i+1 50 | } 51 | 52 | ##Plot 53 | require(ggplot2) 54 | gg=ggplot(data_test,aes(Wind,Ozone))+geom_point(aes(color=train)) 55 | for (tree_model in bagged_models[1:100]) 56 | { 57 | prediction=predict(tree_model,bagging_data) 58 | data_plot=data.table(Wind=bagging_data$Wind,Ozone=prediction) 59 | gg=gg+geom_line(data=data_plot[order(Wind)],aes(x=Wind,y=Ozone),alpha=0.2) 60 | } 61 | data_bagged=data.table(Wind=bagging_data$Wind,Ozone=bagged_result) 62 | gg=gg+geom_line(data=data_bagged[order(Wind)],aes(x=Wind,y=Ozone),color='green') 63 | 64 | data_no_bag=data.table(Wind=bagging_data$Wind,Ozone=result_no_bag) 65 | gg=gg+geom_line(data=data_no_bag[order(Wind)],aes(x=Wind,y=Ozone),color='red') 66 | gg 67 | 
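## Possible extension (a sketch, not in the original script): compare the
## test-set MSE of the single tree and the bagged ensemble
test_dt <- data_test[train == FALSE]
mse_single <- mean((predict(no_bag_model, test_dt) - test_dt$Ozone)^2)
bag_preds <- sapply(bagged_models, predict, newdata = test_dt) # one column per tree
mse_bagged <- mean((rowMeans(bag_preds) - test_dt$Ozone)^2)
c(single_tree = mse_single, bagged = mse_bagged)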
-------------------------------------------------------------------------------- /rcode/purl_the_slides.R: -------------------------------------------------------------------------------- 1 | # Jan-Philipp Kolb 2 | # Fri Sep 28 11:27:43 2018 3 | 4 | 5 | library(knitr) 6 | 7 | setwd("D:/github/machine_learning/slides") 8 | purl("GESISPanel.Rmd") 9 | -------------------------------------------------------------------------------- /rcode/randomforests_boosting.R: -------------------------------------------------------------------------------- 1 | # Random Forests and Boosting 2 | 3 | # Bagging suffers from tree correlation, which reduces the overall performance of the model. 4 | # Random forests are a modification of bagging that builds a large collection of de-correlated trees. 5 | # Similar to bagging, each tree is grown on a bootstrap-resampled data set, 6 | # which makes them different and decorrelates them. 7 | 8 | library(rsample) # data splitting 9 | library(randomForest) # basic implementation 10 | library(ranger) # a faster implementation of randomForest 11 | library(caret) 12 | 13 | 14 | ## The Ames housing data 15 | 16 | load("../data/ames_data.RData") 17 | set.seed(123) 18 | ames_split <- rsample::initial_split(ames_data,prop=.7) 19 | ames_train <- rsample::training(ames_split) 20 | ames_test <- rsample::testing(ames_split) 21 | 22 | ############ 23 | 24 | set.seed(123) 25 | # default RF model 26 | (m1 <- randomForest(formula = Sale_Price ~ .,data=ames_train)) 27 | 28 | plot(m1) 29 | 30 | # ntreeTry - We want enough trees to stabilize the error but using too 31 | # many trees is inefficient, esp. for large data sets. 32 | 33 | # mtry - number of variables as candidates at each split. 34 | # When mtry=p -> bagging. 35 | # When mtry=1 the split variable is completely random. 36 | 37 | # package ranger is faster; importance = "impurity" stores variable importance 38 | library(ranger) 39 | ames_ranger <- ranger(formula=Sale_Price ~ ., 40 | data = ames_train,num.trees = 500, 41 | mtry = floor((ncol(ames_train)-1) / 3),importance = "impurity") 42 | 43 | ames_ranger 44 | head(ames_ranger$predictions) 45 | 46 | ## tuning with a hypergrid 47 | 48 | hyper_grid <- expand.grid( 49 | mtry = seq(20, 30, by = 2), 50 | node_size = seq(3, 9, by = 2), 51 | sample_size = c(.55, .632, .70, .80), 52 | OOB_RMSE = 0 53 | ) 54 | 55 | nrow(hyper_grid) 56 | 57 | for(i in 1:nrow(hyper_grid)) { 58 | model <- ranger(formula= Sale_Price ~ .,data= ames_train, 59 | num.trees = 500,mtry= hyper_grid$mtry[i], 60 | min.node.size = hyper_grid$node_size[i], 61 | sample.fraction = hyper_grid$sample_size[i], 62 | seed = 123) 63 | # add OOB error to grid 64 | hyper_grid$OOB_RMSE[i] <- sqrt(model$prediction.error) 65 | } 66 | 67 | head(dplyr::arrange(hyper_grid, OOB_RMSE), 10) 68 | 69 | # Variable importance 70 | # (taken from ames_ranger, which was fit with importance = "impurity" above) 71 | varimp_ranger <- ames_ranger$variable.importance 72 | 73 | lattice::barchart(sort(varimp_ranger)[1:25],col="royalblue") 74 | 75 | pred_randomForest <- predict(m1, ames_test) 76 | head(pred_randomForest) 77 | 78 | ######################################################## 79 | # Boosting 80 | 81 | library(rsample) # data splitting 82 | library(gbm) # basic implementation 83 | library(xgboost) # a faster implementation of gbm 84 | library(caret) # aggregator package - machine learning 85 | library(pdp) # model visualization 86 | library(ggplot2) # model visualization 87 | library(lime) # model visualization 88 | 89 | ames_data <- AmesHousing::make_ames() 90 | set.seed(123) 91 | ames_split <- initial_split(ames_data,prop=.7) 92 | ames_train <- training(ames_split) 93
ames_test <- testing(ames_split) 94 | 95 | # distribution - depends on the response (e.g. bernoulli for binomial) 96 | # n.trees - number of trees to fit 97 | # interaction.depth - 1 is for additive model 98 | # 2 allows for 2-way interactions 99 | # cv.folds - number of cross validation folds 100 | # shrinkage - learning rate - a smaller learning rate typically requires more trees. 101 | 102 | gbm.fit <- gbm(formula = Sale_Price ~ .,distribution="gaussian", 103 | data = ames_train,n.trees = 100,interaction.depth = 1, 104 | shrinkage = 0.001,cv.folds = 5) 105 | 106 | # the CV RMSE tells us how many dollars the predictions are off on average (about $29,133 in the source tutorial's full run) 107 | sqrt(min(gbm.fit$cv.error)) 108 | 109 | 110 | # make prediction with the CV-optimal number of trees 111 | pred <- predict(gbm.fit, ames_test, n.trees = gbm.perf(gbm.fit, method = "cv", plot.it = FALSE)) 112 | 113 | -------------------------------------------------------------------------------- /slides/a1_intro_ml.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning - what is it" 3 | date: "`r format(Sys.time(), '%d %B, %Y')`" 4 | fontsize: 10pt 5 | output: 6 | slidy_presentation: 7 | highlight: haddock 8 | keep_md: yes 9 | beamer_presentation: 10 | colortheme: dolphin 11 | fig_height: 3 12 | fig_width: 5 13 | fonttheme: structuresmallcapsserif 14 | highlight: haddock 15 | theme: Dresden 16 | --- 17 | 18 | ```{r setup, include=FALSE} 19 | knitr::opts_chunk$set(echo = FALSE) 20 | ``` 21 | 22 | 23 | ## Target of the course 24 | 25 | - What is machine learning? 26 | - Why do we need it? / When do we need it? 27 | - How to prepare your data for ML 28 | 29 | ## Preliminaries 30 | 31 | - This topic is huge - we concentrate on presenting the applications in R 32 | - The participants usually differ a lot in knowledge and prior experience - please tell us if the pace is too fast or too slow. 33 | - We have many [**exercises**](http://web.math.ku.dk/~helle/R-intro/exercises.pdf) because in the end you only learn by doing things yourself 34 | - We have many [**examples**](https://www.showmeshiny.com/) - try them! 35 | - If there are questions - always ask 36 | - R is more fun together - ask your neighbor 37 | 38 | 39 | 40 | ## Introduction round 41 | 42 | ### Please tell us briefly... 43 | 44 | - Where are you from? What are you studying / what do you work on? 45 | - What is your experience level in R/other programming languages? 46 | - What are your expectations of this course? 47 | - Where do you think you can use Machine Learning in the future? 48 | 49 | 50 | 51 | 52 | ## [Prediction vs interpretability](https://machinelearningmastery.com/model-prediction-versus-interpretation-in-machine-learning/) 53 | 54 | - There is a trade-off between model prediction accuracy and model interpretability. 55 | 56 | - It is critical to have a clear idea of which of the two is the priority. 57 | 58 | 59 | ## [The bias-variance tradeoff](https://en.wikipedia.org/wiki/Bias%E2%80%93variance_tradeoff) (I) 60 | 61 | - The bias–variance tradeoff is the property of a set of predictive models whereby models with a lower bias in parameter estimation have a higher variance of the parameter estimates across samples, and vice versa.
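A minimal simulation sketch (added here; not part of the original slides): very flexible models fit the training data better, but beyond some point their error on new data rises again.

```{r bias-variance-sketch, echo=TRUE, eval=FALSE}
# compare test error of polynomial fits of increasing flexibility
set.seed(1)
x <- runif(200, -2, 2)
y <- sin(x) + rnorm(200, sd = 0.3)
dat <- data.frame(x, y)
train <- sample(200, 100)
for (deg in c(1, 3, 9, 15)) {
  fit <- lm(y ~ poly(x, deg), data = dat[train, ])
  test_mse <- mean((dat$y[-train] - predict(fit, newdata = dat[-train, ]))^2)
  cat("degree", deg, "- test MSE:", round(test_mse, 3), "\n")
}
```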
62 | 63 | [![](figure/bias_variance_tradeoff2.png)](https://towardsdatascience.com/understanding-the-bias-variance-tradeoff-165e6942b229) 64 | 65 | 70 | 71 | ## The bias-variance tradeoff (II) 72 | 73 | ![](figure/bias_variance_tradeoff.PNG) 74 | 75 | 76 | ## [Bootstrapping](https://www.statmethods.net/advstats/bootstrapping.html) 77 | 78 | - [**Bootstrap**](https://www.datacamp.com/community/tutorials/bootstrap-r) is a method of inference about a population using sample data. 79 | 80 | ## [The curse of dimensionality](https://www.freecodecamp.org/news/the-curse-of-dimensionality-how-we-can-save-big-data-from-itself-d9fa0f872335/) 81 | 82 | - We have a high number of possible features 83 | - We want to find the best representation of data in a lower-dimensional space 84 | 85 | 91 | 92 | 106 | 107 | ## [Regression and classification](https://www.youtube.com/watch?v=Z0v9QMkA3dA&list=PLOg0ngHtcqbPTlZzRHA2ocQZqB1D_qZ5V&index=2) 108 | 109 | ### Regression problem 110 | 111 | y is quantitative 112 | 113 | ### Classification problem 114 | 115 | y is binomial/categorical 116 | -------------------------------------------------------------------------------- /slides/a1_intro_ml.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning - what is it" 3 | date: "13 January, 2020" 4 | fontsize: 10pt 5 | output: 6 | slidy_presentation: 7 | highlight: haddock 8 | keep_md: yes 9 | beamer_presentation: 10 | colortheme: dolphin 11 | fig_height: 3 12 | fig_width: 5 13 | fonttheme: structuresmallcapsserif 14 | highlight: haddock 15 | theme: Dresden 16 | --- 17 | 18 | 19 | 20 | 21 | ## Target of the course 22 | 23 | - What is machine learning? 24 | - Why do we need it? / When do we need it? 25 | - How to prepare your data for ML 26 | 27 | ## Preliminaries 28 | 29 | - This topic is huge - we concentrate on presenting the applications in R 30 | - The participants usually differ a lot in knowledge and prior experience - please tell us if the pace is too fast or too slow. 31 | - We have many [**exercises**](http://web.math.ku.dk/~helle/R-intro/exercises.pdf) because in the end you only learn by doing things yourself 32 | - We have many [**examples**](https://www.showmeshiny.com/) - try them! 33 | - If there are questions - always ask 34 | - R is more fun together - ask your neighbor 35 | 36 | 37 | 38 | ## Introduction round 39 | 40 | ### Please tell us briefly... 41 | 42 | - Where are you from? What are you studying / what do you work on? 43 | - What is your experience level in R/other programming languages? 44 | - What are your expectations of this course? 45 | - Where do you think you can use Machine Learning in the future? 46 | 47 | 48 | 49 | 50 | ## [Prediction vs interpretability](https://machinelearningmastery.com/model-prediction-versus-interpretation-in-machine-learning/) 51 | 52 | - There is a trade-off between model prediction accuracy and model interpretability. 53 | 54 | - It is critical to have a clear idea of which of the two is the priority. 55 | 56 | 57 | ## [Bootstrapping](https://www.statmethods.net/advstats/bootstrapping.html) 58 | 59 | - [**Bootstrap**](https://www.datacamp.com/community/tutorials/bootstrap-r) is a method of inference about a population using sample data.
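A minimal sketch (added; not in the original slides) of a nonparametric bootstrap of the sample mean, using base R only:

```r
# resample the data with replacement many times and look at the spread of the statistic
set.seed(1)
x <- rnorm(50, mean = 10)
boot_means <- replicate(2000, mean(sample(x, replace = TRUE)))
quantile(boot_means, c(0.025, 0.975))  # simple percentile bootstrap interval
```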
60 | 61 | ## [The curse of dimensionality](https://www.freecodecamp.org/news/the-curse-of-dimensionality-how-we-can-save-big-data-from-itself-d9fa0f872335/) 62 | 63 | - We have a high number of possible features 64 | - We want to find the best representation of data in a lower-dimensional space 65 | 66 | 72 | 73 | 87 | -------------------------------------------------------------------------------- /slides/a1_intro_ml.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a1_intro_ml.pdf -------------------------------------------------------------------------------- /slides/a1_intro_r_cache/slidy/__packages: -------------------------------------------------------------------------------- 1 | base 2 | methods 3 | datasets 4 | utils 5 | grDevices 6 | graphics 7 | stats 8 | knitr 9 | dplyr 10 | magrittr 11 | data.table 12 | purrr 13 | tidyr 14 | MASS 15 | tidyverse 16 | ggplot2 17 | tibble 18 | readr 19 | stringr 20 | forcats 21 | -------------------------------------------------------------------------------- /slides/a1_intro_r_cache/slidy/unnamed-chunk-51_1231435b8811d585dacc7bafd9d553ac.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a1_intro_r_cache/slidy/unnamed-chunk-51_1231435b8811d585dacc7bafd9d553ac.RData -------------------------------------------------------------------------------- /slides/a1_intro_r_cache/slidy/unnamed-chunk-51_1231435b8811d585dacc7bafd9d553ac.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a1_intro_r_cache/slidy/unnamed-chunk-51_1231435b8811d585dacc7bafd9d553ac.rdb -------------------------------------------------------------------------------- /slides/a1_intro_r_cache/slidy/unnamed-chunk-51_1231435b8811d585dacc7bafd9d553ac.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a1_intro_r_cache/slidy/unnamed-chunk-51_1231435b8811d585dacc7bafd9d553ac.rdx -------------------------------------------------------------------------------- /slides/a2_intro_r.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a2_intro_r.pdf -------------------------------------------------------------------------------- /slides/a2_intro_r_cache/beamer/__packages: -------------------------------------------------------------------------------- 1 | base 2 | methods 3 | datasets 4 | utils 5 | grDevices 6 | graphics 7 | stats 8 | knitr 9 | dplyr 10 | magrittr 11 | data.table 12 | purrr 13 | tidyr 14 | MASS 15 | tidyverse 16 | ggplot2 17 | tibble 18 | readr 19 | stringr 20 | forcats 21 | -------------------------------------------------------------------------------- /slides/a2_intro_r_cache/beamer/unnamed-chunk-51_a69c0e7fdfc8fd360351fd72e763ebfb.RData: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a2_intro_r_cache/beamer/unnamed-chunk-51_a69c0e7fdfc8fd360351fd72e763ebfb.RData -------------------------------------------------------------------------------- /slides/a2_intro_r_cache/beamer/unnamed-chunk-51_a69c0e7fdfc8fd360351fd72e763ebfb.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a2_intro_r_cache/beamer/unnamed-chunk-51_a69c0e7fdfc8fd360351fd72e763ebfb.rdb -------------------------------------------------------------------------------- /slides/a2_intro_r_cache/beamer/unnamed-chunk-51_a69c0e7fdfc8fd360351fd72e763ebfb.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a2_intro_r_cache/beamer/unnamed-chunk-51_a69c0e7fdfc8fd360351fd72e763ebfb.rdx -------------------------------------------------------------------------------- /slides/a2_intro_r_cache/slidy/__packages: -------------------------------------------------------------------------------- 1 | base 2 | methods 3 | datasets 4 | utils 5 | grDevices 6 | graphics 7 | stats 8 | knitr 9 | dplyr 10 | magrittr 11 | data.table 12 | purrr 13 | tidyr 14 | MASS 15 | tidyverse 16 | ggplot2 17 | tibble 18 | readr 19 | stringr 20 | forcats 21 | -------------------------------------------------------------------------------- /slides/a2_intro_r_cache/slidy/unnamed-chunk-51_11dfc3d248c92bee11d12b1ab257dc47.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a2_intro_r_cache/slidy/unnamed-chunk-51_11dfc3d248c92bee11d12b1ab257dc47.RData -------------------------------------------------------------------------------- /slides/a2_intro_r_cache/slidy/unnamed-chunk-51_11dfc3d248c92bee11d12b1ab257dc47.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a2_intro_r_cache/slidy/unnamed-chunk-51_11dfc3d248c92bee11d12b1ab257dc47.rdb -------------------------------------------------------------------------------- /slides/a2_intro_r_cache/slidy/unnamed-chunk-51_11dfc3d248c92bee11d12b1ab257dc47.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a2_intro_r_cache/slidy/unnamed-chunk-51_11dfc3d248c92bee11d12b1ab257dc47.rdx -------------------------------------------------------------------------------- /slides/a2_intro_r_files/figure-beamer/unnamed-chunk-51-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a2_intro_r_files/figure-beamer/unnamed-chunk-51-1.pdf -------------------------------------------------------------------------------- /slides/a2_intro_r_files/figure-slidy/unnamed-chunk-51-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a2_intro_r_files/figure-slidy/unnamed-chunk-51-1.png -------------------------------------------------------------------------------- /slides/b1_regression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b1_regression.pdf -------------------------------------------------------------------------------- /slides/b1_regression_files/figure-slidy/unnamed-chunk-25-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b1_regression_files/figure-slidy/unnamed-chunk-25-1.png -------------------------------------------------------------------------------- /slides/b1_regression_files/figure-slidy/unnamed-chunk-26-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b1_regression_files/figure-slidy/unnamed-chunk-26-1.png -------------------------------------------------------------------------------- /slides/b1_regression_files/figure-slidy/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b1_regression_files/figure-slidy/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /slides/b1_regression_files/figure-slidy/unnamed-chunk-48-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b1_regression_files/figure-slidy/unnamed-chunk-48-1.png -------------------------------------------------------------------------------- /slides/b1_regression_files/figure-slidy/unnamed-chunk-49-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b1_regression_files/figure-slidy/unnamed-chunk-49-1.png -------------------------------------------------------------------------------- /slides/b1_regression_files/figure-slidy/unnamed-chunk-51-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b1_regression_files/figure-slidy/unnamed-chunk-51-1.png -------------------------------------------------------------------------------- /slides/b1_regression_files/figure-slidy/unnamed-chunk-52-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b1_regression_files/figure-slidy/unnamed-chunk-52-1.png -------------------------------------------------------------------------------- /slides/b1_regression_files/figure-slidy/unnamed-chunk-53-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b1_regression_files/figure-slidy/unnamed-chunk-53-1.png -------------------------------------------------------------------------------- /slides/b1_regression_files/figure-slidy/unnamed-chunk-58-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b1_regression_files/figure-slidy/unnamed-chunk-58-1.png -------------------------------------------------------------------------------- /slides/b2_regularization.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b2_regularization.pdf -------------------------------------------------------------------------------- /slides/c1_trees_bagging.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/c1_trees_bagging.pdf -------------------------------------------------------------------------------- /slides/c2_random_forests.aux: -------------------------------------------------------------------------------- 1 | \relax 2 | \providecommand\hyper@newdestlabel[2]{} 3 | \providecommand\BKM@entry[2]{} 4 | \providecommand\HyperFirstAtBeginDocument{\AtBeginDocument} 5 | \HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined 6 | \global\let\oldcontentsline\contentsline 7 | \gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}} 8 | \global\let\oldnewlabel\newlabel 9 | \gdef\newlabel#1#2{\newlabelxx{#1}#2} 10 | \gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}} 11 | \AtEndDocument{\ifx\hyper@anchor\@undefined 12 | \let\contentsline\oldcontentsline 13 | \let\newlabel\oldnewlabel 14 | \fi} 15 | \fi} 16 | \global\let\hyper@last\relax 17 | \gdef\HyperFirstAtBeginDocument#1{#1} 18 | \providecommand\HyField@AuxAddToFields[1]{} 19 | \providecommand\HyField@AuxAddToCoFields[2]{} 20 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{1}{1/1}{}{0}}} 21 | \@writefile{nav}{\headcommand {\beamer@framepages {1}{1}}} 22 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{2}{2/2}{}{0}}} 23 | \@writefile{nav}{\headcommand {\beamer@framepages {2}{2}}} 24 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{3}{3/3}{}{0}}} 25 | \@writefile{nav}{\headcommand {\beamer@framepages {3}{3}}} 26 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{4}{4/4}{}{0}}} 27 | \@writefile{nav}{\headcommand {\beamer@framepages {4}{4}}} 28 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{5}{5/5}{}{0}}} 29 | \@writefile{nav}{\headcommand {\beamer@framepages {5}{5}}} 30 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{6}{6/6}{}{0}}} 31 | \@writefile{nav}{\headcommand {\beamer@framepages {6}{6}}} 32 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{7}{7/7}{}{0}}} 33 | \@writefile{nav}{\headcommand {\beamer@framepages {7}{7}}} 34 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{8}{8/8}{}{0}}} 35 | \@writefile{nav}{\headcommand {\beamer@framepages {8}{8}}} 36 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{9}{9/9}{}{0}}} 37 | \@writefile{nav}{\headcommand {\beamer@framepages {9}{9}}} 38 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{10}{10/10}{}{0}}} 39 | \@writefile{nav}{\headcommand {\beamer@framepages {10}{10}}} 40 | \@writefile{nav}{\headcommand 
{\slideentry {0}{0}{11}{11/11}{}{0}}} 41 | \@writefile{nav}{\headcommand {\beamer@framepages {11}{11}}} 42 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{12}{12/12}{}{0}}} 43 | \@writefile{nav}{\headcommand {\beamer@framepages {12}{12}}} 44 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{13}{13/13}{}{0}}} 45 | \@writefile{nav}{\headcommand {\beamer@framepages {13}{13}}} 46 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{14}{14/14}{}{0}}} 47 | \@writefile{nav}{\headcommand {\beamer@framepages {14}{14}}} 48 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{15}{15/15}{}{0}}} 49 | \@writefile{nav}{\headcommand {\beamer@framepages {15}{15}}} 50 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{16}{16/16}{}{0}}} 51 | \@writefile{nav}{\headcommand {\beamer@framepages {16}{16}}} 52 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{17}{17/17}{}{0}}} 53 | \@writefile{nav}{\headcommand {\beamer@framepages {17}{17}}} 54 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{18}{18/18}{}{0}}} 55 | \@writefile{nav}{\headcommand {\beamer@framepages {18}{18}}} 56 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{19}{19/19}{}{0}}} 57 | \@writefile{nav}{\headcommand {\beamer@framepages {19}{19}}} 58 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{20}{20/20}{}{0}}} 59 | \@writefile{nav}{\headcommand {\beamer@framepages {20}{20}}} 60 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{21}{21/21}{}{0}}} 61 | \@writefile{nav}{\headcommand {\beamer@framepages {21}{21}}} 62 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{22}{22/22}{}{0}}} 63 | \@writefile{nav}{\headcommand {\beamer@framepages {22}{22}}} 64 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{23}{23/23}{}{0}}} 65 | \@writefile{nav}{\headcommand {\beamer@framepages {23}{23}}} 66 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{24}{24/24}{}{0}}} 67 | \@writefile{nav}{\headcommand {\beamer@framepages {24}{24}}} 68 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{25}{25/25}{}{0}}} 69 | \@writefile{nav}{\headcommand {\beamer@framepages {25}{25}}} 70 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{26}{26/26}{}{0}}} 71 | \@writefile{nav}{\headcommand {\beamer@framepages {26}{26}}} 72 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{27}{27/27}{}{0}}} 73 | \@writefile{nav}{\headcommand {\beamer@framepages {27}{27}}} 74 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{28}{28/28}{}{0}}} 75 | \@writefile{nav}{\headcommand {\beamer@framepages {28}{28}}} 76 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{29}{29/29}{}{0}}} 77 | \@writefile{nav}{\headcommand {\beamer@framepages {29}{29}}} 78 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{30}{30/30}{}{0}}} 79 | \@writefile{nav}{\headcommand {\beamer@framepages {30}{30}}} 80 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{31}{31/31}{}{0}}} 81 | \@writefile{nav}{\headcommand {\beamer@framepages {31}{31}}} 82 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{32}{32/32}{}{0}}} 83 | \@writefile{nav}{\headcommand {\beamer@framepages {32}{32}}} 84 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{33}{33/33}{}{0}}} 85 | \@writefile{nav}{\headcommand {\beamer@framepages {33}{33}}} 86 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{34}{34/34}{}{0}}} 87 | \@writefile{nav}{\headcommand {\beamer@framepages {34}{34}}} 88 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{35}{35/35}{}{0}}} 89 | \@writefile{nav}{\headcommand {\beamer@framepages {35}{35}}} 90 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{36}{36/36}{}{0}}} 91 | \@writefile{nav}{\headcommand 
{\beamer@framepages {36}{36}}} 92 | \@writefile{nav}{\headcommand {\beamer@partpages {1}{36}}} 93 | \@writefile{nav}{\headcommand {\beamer@subsectionpages {1}{36}}} 94 | \@writefile{nav}{\headcommand {\beamer@sectionpages {1}{36}}} 95 | \@writefile{nav}{\headcommand {\beamer@documentpages {36}}} 96 | \@writefile{nav}{\headcommand {\gdef \inserttotalframenumber {36}}} 97 | -------------------------------------------------------------------------------- /slides/c2_random_forests.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/c2_random_forests.pdf -------------------------------------------------------------------------------- /slides/c2_random_forests.vrb: -------------------------------------------------------------------------------- 1 | \frametitle{Predicting} 2 | \protect\hypertarget{predicting}{} 3 | 4 | \begin{itemize} 5 | \tightlist 6 | \item 7 | With the preferred model we can use the traditional predict function 8 | to make predictions on a new data set. 9 | \item 10 | We can use this for all our model types (\texttt{randomForest} and 11 | \texttt{ranger}); although the outputs differ slightly. 12 | \end{itemize} 13 | 14 | \begin{Shaded} 15 | \begin{Highlighting}[] 16 | \CommentTok{# randomForest} 17 | \NormalTok{pred_randomForest <-}\StringTok{ }\KeywordTok{predict}\NormalTok{(ames_randomForest, ames_test)} 18 | \KeywordTok{head}\NormalTok{(pred_randomForest)} 19 | \end{Highlighting} 20 | \end{Shaded} 21 | 22 | \begin{verbatim} 23 | ## 1 2 3 4 5 6 24 | ## 113543.1 185556.4 259258.1 190943.9 179071.0 480952.3 25 | \end{verbatim} 26 | 27 | \begin{Shaded} 28 | \begin{Highlighting}[] 29 | \CommentTok{# ranger} 30 | \NormalTok{pred_ranger <-}\StringTok{ }\KeywordTok{predict}\NormalTok{(ames_ranger, ames_test)} 31 | \KeywordTok{head}\NormalTok{(pred_ranger}\OperatorTok{$}\NormalTok{predictions)} 32 | \end{Highlighting} 33 | \end{Shaded} 34 | 35 | \begin{verbatim} 36 | ## [1] 129258.1 186520.7 265628.2 197745.5 175517.6 392691.7 37 | \end{verbatim} 38 | 39 | -------------------------------------------------------------------------------- /slides/c3_gbm_regression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/c3_gbm_regression.pdf -------------------------------------------------------------------------------- /slides/c3_gbm_regression_short.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/c3_gbm_regression_short.pdf -------------------------------------------------------------------------------- /slides/c3b_gbm_regression_h2o.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Gradient boosting with h2o" 3 | author: "Jan-Philipp Kolb" 4 | date: "24 Mai 2019" 5 | output: ioslides_presentation 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = FALSE) 10 | ``` 11 | 12 | 13 | ## h2o 14 | 15 | ```{r} 16 | library(h2o) # a java-based platform 17 | ``` 18 | 19 | 20 | The h2o R package is a powerful and efficient java-based interface that allows for local and cluster-based deployment. 
It comes with a fairly comprehensive online resource that includes methodology and code documentation along with tutorials. 21 | 22 | ## Features include: 23 | 24 | - Distributed and parallelized computation on either a single node or a multi-node cluster. 25 | - Automatic early stopping based on convergence of user-specified metrics to user-specified relative tolerance. 26 | - Stochastic GBM with column and row sampling (per split and per tree) for better generalization. 27 | - Support for exponential families (Poisson, Gamma, Tweedie) and additional loss functions such as quantile regression (including Laplace), in addition to binomial (Bernoulli), Gaussian and multinomial distributions. 28 | - Grid search for hyperparameter optimization and model selection. 29 | - Data-distributed, which means the entire dataset does not need to fit into memory on a single node, hence it scales to training sets of any size. 30 | - Uses histogram approximations of continuous variables for speedup. 31 | - Uses dynamic binning - bin limits are reset at each tree level based on the split bins’ min and max values discovered during the last pass. 32 | - Uses squared error to determine optimal splits. 33 | 36 | - Unlimited factor levels. 37 | - Multiclass trees (one for each class) built in parallel with each other. 38 | - Apache 2.0 Licensed. 39 | - Model export in plain Java code for deployment in production environments. 40 | 41 | ## 42 | -------------------------------------------------------------------------------- /slides/d_neuralNetworks.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Neural Networks" 3 | author: "Jan-Philipp Kolb" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | fontsize: 10pt 6 | output: 7 | beamer_presentation: 8 | colortheme: dolphin 9 | fig_height: 3 10 | fig_width: 5 11 | fig_caption: no 12 | fonttheme: structuresmallcapsserif 13 | highlight: haddock 14 | theme: Dresden 15 | pdf_document: 16 | keep_tex: yes 17 | toc: yes 18 | slidy_presentation: 19 | css: mycss.css 20 | keep_md: yes 21 | --- 22 | 23 | ```{r setup, include=FALSE} 24 | knitr::opts_chunk$set(echo = T,message=F,warning=F,eval=T) 25 | ``` 26 | 27 | ## Examples of multi-neuron networks 28 | 29 | 30 | ![](figure/neuralnets.PNG) 31 | 32 | 33 | 45 | 64 | 65 | ## Artificial Neuron 66 | 67 | 74 | 75 | - Inputs correspond to raw data values 76 | 79 | - The transfer function sums all the inputs together (cumulative inputs). 80 | - If the summed input values reach a specified threshold, the activation function generates an output signal (all or nothing). 81 | - The output signal then moves to a raw output or other neurons. 82 | - This basic artificial neuron is combined with multiple other artificial neurons to create an ANN. 83 | 84 | 87 | 88 | ![](figure/ArtificialNeuronModel_english.png){height=40%} 89 | 90 | 94 | 95 | 104 | 105 | 117 | 118 | 121 | 122 | 123 | ## Activation Functions 124 | 125 | - The capability of ANNs to learn any function (given sufficient training examples) is dependent on the appropriate selection of the [**activation function(s)**](https://en.wikipedia.org/wiki/Activation_function) present in the network. 126 | - They enable the ANN to learn non-linear properties present in the data. 130 | - The input into the activation function is the weighted sum of the input features from the preceding layer. 131 | - Let $o_j$ be the output of the $j$th neuron in a given layer of a network with $p$ input features.
132 | 133 | $$ 134 | o_j = \Phi(b_j + \sum\limits_{i=1}^{p} w_i x_i) 135 | $$ 136 | 137 | 138 | 139 | ## Common ANN Activation functions 140 | 141 | ![](figure/activations-1.png){height=90%} 142 | 143 | 144 | 149 | 150 | ## The output ($o_j$)... 151 | 152 | - ... can feed into the output layer of a neural network, or in deeper architectures may feed into additional hidden layers. 153 | - The activation function determines if the sum of the weighted inputs plus a bias term is sufficiently large to trigger the firing of the neuron. 154 | - There is no universally best activation function, but researchers have provided guidance on which activation functions work well for ANN solutions to many common problems. 155 | - The choice of the activation function governs the required data scaling necessary for ANN analysis. 156 | 159 | 160 | ## How ANNs Learn 161 | 162 | 166 | 167 | 168 | - We have some features $(X)$ describing an output ($y$) 169 | 172 | - To begin training our single-layer one-neuron neural network we initially randomly assign weights. 173 | - We then run the neural network with the random weights and record the outputs generated. 174 | - This is called a forward pass. The output values, here called $\hat{y}$, are a function of the input values ($X$), the random initial weights ($w$) and our choice of the threshold function ($T$). 175 | 176 | 177 | $$ 178 | \hat{y}= f(X,w,T) 179 | $$ 180 | 181 | 182 | 183 | ## Choice of the performance function 184 | 185 | - Once we have our ANN output values ($\hat{y}$) we can compare them to the data set output values ($y$). 186 | - To do this we use a performance function $P$. 187 | - The choice of the performance function is up to the analyst; here we choose the 188 | 191 | Sum of Squared Errors (SSE). 192 | 193 | -------------------------------------------------------------------------------- /slides/d_neuralNetworks.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/d_neuralNetworks.pdf -------------------------------------------------------------------------------- /slides/e_Clustering.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Clustering" 3 | author: "Jan-Philipp Kolb and Alexander Murray-Watters" 4 | date: "18 January 2019" 5 | output: 6 | slidy_presentation: 7 | keep_md: yes 8 | --- 9 | 10 | ```{r setupClustering, include=FALSE} 11 | knitr::opts_chunk$set(echo = FALSE,eval=F) 12 | ``` 13 | 14 | 15 | 16 | 17 | ## Resources 18 | 19 | 20 | ```{r,echo=F, eval=FALSE} 21 | slides_path <- getwd() 22 | git_path <- gsub("slides","",slides_path) 23 | if (Sys.info()$nodename=="MAC14077"){ 24 | git_path <- "D:/Daten/GitHub/machine_learning/" 25 | slides_path <- paste0(git_path,"/slides") 26 | } 27 | ``` 28 | 29 | 30 | - [Package `kknn`](https://cran.r-project.org/web/packages/kknn/kknn.pdf) 31 | 32 | ```{r,eval=F} 33 | install.packages("kknn") 34 | ``` 35 | 36 | ```{r} 37 | library("kknn") 38 | ``` 39 | 40 | 41 | ## [Geographic clustering of UK cities](https://www.r-bloggers.com/geographic-clustering-of-uk-cities/) 42 | 43 | Animated example: 44 | https://towardsdatascience.com/the-5-clustering-algorithms-data-scientists-need-to-know-a36d136ef68 45 | 46 | 47 | ## Exercise: Kmeans 48 | 49 | Apply kmeans to the `iris` dataset with 2, 3, and 4 50 | clusters. Produce three scatter plots, with the points colored 51 | according to cluster assignment.
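One possible solution sketch (added; not part of the original exercise text), using the four numeric iris columns and base graphics:

```{r kmeans-iris-sketch, echo=TRUE, eval=FALSE}
# fit kmeans with 2, 3 and 4 clusters and colour points by cluster assignment
for (k in 2:4) {
  km <- kmeans(iris[, 1:4], centers = k, nstart = 25)
  plot(iris$Petal.Length, iris$Petal.Width, col = km$cluster,
       pch = 19, main = paste("kmeans with", k, "clusters"))
}
```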
52 | 53 | 54 | 55 | ## hdbscan 56 | 57 | A fairly new alternative to kmeans, hdbscan does not require you to 58 | specify the number of categories to be assigned. It only requires a 59 | decision as to the minimum number of points needed to be included in a 60 | cluster. This minimum number acts as a smoothing parameter (such as a 61 | density bandwidth parameter or a histogram's bin/bar width), with lower 62 | values finding more clusters. Other advantages of hdbscan include its ability to find clusters of varying shape and density, and to label points that do not fit any cluster as noise. 63 | 64 | ```{r, eval=FALSE} 65 | install.packages("dbscan") 66 | ``` 67 | 68 | 69 | 70 | ```{r} 71 | library(ggplot2) 72 | library(dplyr) 73 | library(maps) 74 | library(dbscan) 75 | 76 | ## Toy example: two clusters of very different size and spread. 77 | two.clust.eg <- rbind(matrix(rnorm(1000, sd = 0.8), ncol=2), 78 | matrix(rnorm(100, mean = 120, sd = 0.12), ncol = 2)) 79 | 80 | clust <- kmeans(two.clust.eg, centers=2) 81 | 82 | plot(two.clust.eg, col = clust$cluster) 83 | ## points(clust$centers, col = 1:2, pch = 8, cex = 2) 84 | 85 | 86 | ``` 87 | 88 | ```{r} 89 | 90 | 91 | 92 | 93 | 94 | data(moons) 95 | 96 | ## Running HDBSCAN with the minimum number of points set to 3. 97 | res <- dbscan::hdbscan(moons, minPts = 3) 98 | 99 | plot(moons, col = res$cluster + 1, main="R implementation") 100 | ``` 101 | 102 | 103 | 104 | ## Exercise: Apply kmeans to the moons dataset and compare the results. 105 | -- Be sure to try different numbers of centers. 106 | 107 | 108 | ## Exercise: Apply hdbscan to the moons dataset with different minimums for the number of points. 109 | 110 | ## Exercise: Apply both kmeans and hdbscan to the `ChickWeight` dataset's "weight" "Time" variables, and see how well you can get each to perform. 111 | 112 | 113 | 114 | 115 | ## Solution sketch: kmeans and hdbscan on the `ChickWeight` data 116 | ```{r, eval=FALSE, echo=FALSE} 117 | ## kmeans 118 | plot(ChickWeight[,1:2], col=kmeans(ChickWeight[,1:2], centers=4)$cluster) 119 | 120 | ## hdbscan, minPts=10 121 | plot(ChickWeight[,1:2], col=dbscan::hdbscan(ChickWeight[,1:2], minPts=10)$cluster) 122 | 123 | ## Diet cat. for comparison. 124 | plot(ChickWeight[,1:2], col=ChickWeight$Diet) 125 | 126 | ## Chick cat. for comparison. 127 | plot(ChickWeight[,1:2], col=ChickWeight$Chick) 128 | 129 | 130 | ``` 131 | 132 | 133 | ```{r, eval=FALSE} 134 | load(paste0(git_path,"/data/osmsa_PLZ_14.RData")) 135 | ``` 136 | 137 | 138 | 139 | ## [US Census Data](https://elitedatascience.com/datasets) 140 | 141 | - [US Census Data (Clustering)](https://archive.ics.uci.edu/ml/datasets/US+Census+Data+%281990%29) – Clustering based on demographics is a tried and true way to perform market research and segmentation.
142 | 143 | 144 | 145 | ## Links 146 | 147 | - [Using clusterlab to benchmark clustering algorithms](https://www.r-bloggers.com/using-clusterlab-to-benchmark-clustering-algorithms/) 148 | -------------------------------------------------------------------------------- /slides/e_Clustering.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Clustering" 3 | author: "Jan-Philipp Kolb and Alexander Murray-Watters" 4 | date: "18 January 2019" 5 | output: 6 | slidy_presentation: 7 | keep_md: yes 8 | --- 9 | 10 | 11 | 12 | 13 | 14 | 15 | ## Resources 16 | 17 | 18 | 19 | 20 | 21 | - [Package `kknn`](https://cran.r-project.org/web/packages/kknn/kknn.pdf) 22 | 23 | 24 | 25 | 26 | 27 | 28 | ## [Geographic clustering of UK cities](https://www.r-bloggers.com/geographic-clustering-of-uk-cities/) 29 | 30 | Animated example: 31 | https://towardsdatascience.com/the-5-clustering-algorithms-data-scientists-need-to-know-a36d136ef68 32 | 33 | 34 | ## Exercise: Kmeans 35 | 36 | Apply kmeans to the `iris` dataset with 2, 3, and 4 37 | clusters. Produce three scatter plots, with the points colored 38 | according to cluster assignment. 39 | 40 | 41 | ## hdbscan 42 | 43 | A fairly new alternative to kmeans, hdbscan does not require you to 44 | specify the number of categories to be assigned. It only requires a 45 | decision as to the minimum number of points needed to be included in a 46 | cluster. This minimum number acts as a smoothing parameter (such as a 47 | density bandwidth parameter or a histogram's bin/bar width), with lower 48 | values finding more clusters. Other advantages of hdbscan include its ability to find clusters of varying shape and density, and to label points that do not fit any cluster as noise. 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | ## Exercise: Apply kmeans to the moons dataset and compare the results. 61 | -- Be sure to try different numbers of centers. 62 | 63 | 64 | ## Exercise: Apply hdbscan to the moons dataset with different minimums for the number of points. 65 | 66 | ## Exercise: Apply both kmeans and hdbscan to the `ChickWeight` dataset's "weight" "Time" variables, and see how well you can get each to perform. 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | ## [US Census Data](https://elitedatascience.com/datasets) 79 | 80 | - [US Census Data (Clustering)](https://archive.ics.uci.edu/ml/datasets/US+Census+Data+%281990%29) – Clustering based on demographics is a tried and true way to perform market research and segmentation.
81 | 82 | 83 | 84 | ## Links 85 | 86 | - [Using clusterlab to benchmark clustering algorithms](https://www.r-bloggers.com/using-clusterlab-to-benchmark-clustering-algorithms/) 87 | -------------------------------------------------------------------------------- /slides/figure/3d-coordinate-plane.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/3d-coordinate-plane.png -------------------------------------------------------------------------------- /slides/figure/450px-Overfitting.svg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/450px-Overfitting.svg.png -------------------------------------------------------------------------------- /slides/figure/AmesTableau01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/AmesTableau01.png -------------------------------------------------------------------------------- /slides/figure/ArtificialNeuronModel_english.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ArtificialNeuronModel_english.png -------------------------------------------------------------------------------- /slides/figure/BBRXC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/BBRXC.png -------------------------------------------------------------------------------- /slides/figure/Blausen_0657_MultipolarNeuron.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/Blausen_0657_MultipolarNeuron.png -------------------------------------------------------------------------------- /slides/figure/Decision-Tree-Example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/Decision-Tree-Example.jpg -------------------------------------------------------------------------------- /slides/figure/Diagslr.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/Diagslr.PNG -------------------------------------------------------------------------------- /slides/figure/OneHotEncoding.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/OneHotEncoding.PNG -------------------------------------------------------------------------------- /slides/figure/Overfitting_fig1.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/Overfitting_fig1.PNG -------------------------------------------------------------------------------- /slides/figure/Picture3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/Picture3.jpg -------------------------------------------------------------------------------- /slides/figure/SMLProcess.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/SMLProcess.png -------------------------------------------------------------------------------- /slides/figure/The_Signal_and_the_Noise.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/The_Signal_and_the_Noise.jpg -------------------------------------------------------------------------------- /slides/figure/activation_funs.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/activation_funs.PNG -------------------------------------------------------------------------------- /slides/figure/activations-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/activations-1.png -------------------------------------------------------------------------------- /slides/figure/addins.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/addins.PNG -------------------------------------------------------------------------------- /slides/figure/bagging3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/bagging3.png -------------------------------------------------------------------------------- /slides/figure/bias_variance_tradeoff.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/bias_variance_tradeoff.PNG -------------------------------------------------------------------------------- /slides/figure/bias_variance_tradeoff2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/bias_variance_tradeoff2.png -------------------------------------------------------------------------------- /slides/figure/biglasso.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/biglasso.PNG -------------------------------------------------------------------------------- /slides/figure/book_ml1.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/book_ml1.jpg -------------------------------------------------------------------------------- /slides/figure/boosted-trees-process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/boosted-trees-process.png -------------------------------------------------------------------------------- /slides/figure/boosting-in-action-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/boosting-in-action-1.png -------------------------------------------------------------------------------- /slides/figure/bostondata.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/bostondata.PNG -------------------------------------------------------------------------------- /slides/figure/bostonscaled.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/bostonscaled.PNG -------------------------------------------------------------------------------- /slides/figure/class01-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/class01-1.png -------------------------------------------------------------------------------- /slides/figure/classification_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/classification_regression.png -------------------------------------------------------------------------------- /slides/figure/confusionMatrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/confusionMatrix.png -------------------------------------------------------------------------------- /slides/figure/content_flowchart1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/content_flowchart1.png -------------------------------------------------------------------------------- /slides/figure/datasetsload.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/datasetsload.PNG -------------------------------------------------------------------------------- /slides/figure/decissiontree.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/decissiontree.PNG 
-------------------------------------------------------------------------------- /slides/figure/dplyr_vignette.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/dplyr_vignette.PNG -------------------------------------------------------------------------------- /slides/figure/dt_amesdata.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/dt_amesdata.PNG -------------------------------------------------------------------------------- /slides/figure/duckduckgo.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/duckduckgo.PNG -------------------------------------------------------------------------------- /slides/figure/electoral_precedent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/electoral_precedent.png -------------------------------------------------------------------------------- /slides/figure/ex_regression_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ex_regression_tree.png -------------------------------------------------------------------------------- /slides/figure/expl_rf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/expl_rf.png -------------------------------------------------------------------------------- /slides/figure/factor3vars_visreg.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/factor3vars_visreg.PNG -------------------------------------------------------------------------------- /slides/figure/fig3_loglambda.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/fig3_loglambda.PNG -------------------------------------------------------------------------------- /slides/figure/four_regmods.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/four_regmods.PNG -------------------------------------------------------------------------------- /slides/figure/gbmtopmodelsvars.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/gbmtopmodelsvars.PNG -------------------------------------------------------------------------------- /slides/figure/ggpairs_yacht.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ggpairs_yacht.png -------------------------------------------------------------------------------- /slides/figure/gradient_descent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/gradient_descent.png -------------------------------------------------------------------------------- /slides/figure/influentalValues_lasso.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/influentalValues_lasso.PNG -------------------------------------------------------------------------------- /slides/figure/interplot_wt_disp.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/interplot_wt_disp.PNG -------------------------------------------------------------------------------- /slides/figure/iris.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/iris.png -------------------------------------------------------------------------------- /slides/figure/kyphosis_helppage.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/kyphosis_helppage.PNG -------------------------------------------------------------------------------- /slides/figure/learning_rate_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/learning_rate_comparison.png -------------------------------------------------------------------------------- /slides/figure/limeplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/limeplot.png -------------------------------------------------------------------------------- /slides/figure/magrittr_vignette.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/magrittr_vignette.jpg -------------------------------------------------------------------------------- /slides/figure/ml_emoji.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ml_emoji.png -------------------------------------------------------------------------------- /slides/figure/ml_ice_curves.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ml_ice_curves.png -------------------------------------------------------------------------------- 
/slides/figure/ml_rf_errorrate_m1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ml_rf_errorrate_m1.png -------------------------------------------------------------------------------- /slides/figure/ml_rf_hist_OOB_RMSE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ml_rf_hist_OOB_RMSE.png -------------------------------------------------------------------------------- /slides/figure/ml_rf_varimp_ranger.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ml_rf_varimp_ranger.png -------------------------------------------------------------------------------- /slides/figure/ml_tb_rpart_iris.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ml_tb_rpart_iris.png -------------------------------------------------------------------------------- /slides/figure/mtcars_model_interact.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/mtcars_model_interact.PNG -------------------------------------------------------------------------------- /slides/figure/neuralnetfig.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/neuralnetfig.PNG -------------------------------------------------------------------------------- /slides/figure/neuralnets.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/neuralnets.PNG -------------------------------------------------------------------------------- /slides/figure/nyc_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/nyc_map.png -------------------------------------------------------------------------------- /slides/figure/overview_ml_algorithms.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/overview_ml_algorithms.jpg -------------------------------------------------------------------------------- /slides/figure/package_gbm.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/package_gbm.PNG -------------------------------------------------------------------------------- /slides/figure/pic_hiddenlayers.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/pic_hiddenlayers.PNG -------------------------------------------------------------------------------- /slides/figure/prediction_mtcars.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/prediction_mtcars.PNG -------------------------------------------------------------------------------- /slides/figure/random_trees_fig1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/random_trees_fig1.PNG -------------------------------------------------------------------------------- /slides/figure/reg_3algos.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/reg_3algos.PNG -------------------------------------------------------------------------------- /slides/figure/resid_fitted.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/resid_fitted.PNG -------------------------------------------------------------------------------- /slides/figure/ridgeTop25influentalVars.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ridgeTop25influentalVars.PNG -------------------------------------------------------------------------------- /slides/figure/ridge_coef.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ridge_coef.png -------------------------------------------------------------------------------- /slides/figure/stargazertabex.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/stargazertabex.PNG -------------------------------------------------------------------------------- /slides/figure/stochastic_gradient_descent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/stochastic_gradient_descent.png -------------------------------------------------------------------------------- /slides/figure/swissfertality.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/swissfertality.PNG -------------------------------------------------------------------------------- /slides/figure/taskviewmachinelearning.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/taskviewmachinelearning.PNG 
-------------------------------------------------------------------------------- /slides/figure/three_algos_complete.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/three_algos_complete.PNG -------------------------------------------------------------------------------- /slides/figure/titanicdata.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/titanicdata.PNG -------------------------------------------------------------------------------- /slides/figure/top-20-r-packages-machine-learning-downloads.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/top-20-r-packages-machine-learning-downloads.jpg -------------------------------------------------------------------------------- /slides/figure/top10gbms.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/top10gbms.PNG -------------------------------------------------------------------------------- /slides/figure/tree-correlation-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/tree-correlation-1.png -------------------------------------------------------------------------------- /slides/figure/tree_m1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/tree_m1.PNG -------------------------------------------------------------------------------- /slides/figure/unsupervisedLearning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/unsupervisedLearning.png -------------------------------------------------------------------------------- /slides/figure/visreg.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/visreg.PNG -------------------------------------------------------------------------------- /slides/figure/visreg2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/visreg2.PNG -------------------------------------------------------------------------------- /slides/figure/visreg_m6.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/visreg_m6.PNG -------------------------------------------------------------------------------- /slides/figure/visregcat.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/visregcat.PNG -------------------------------------------------------------------------------- /slides/figure/visregplot1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/visregplot1.PNG -------------------------------------------------------------------------------- /slides/long/c2_random_forests.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/long/c2_random_forests.pdf -------------------------------------------------------------------------------- /slides/long/d_neuralNetworks.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/long/d_neuralNetworks.pdf -------------------------------------------------------------------------------- /slides/old/A_ml_motiv.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Motivation for Machine Learning" 3 | author: "Jan-Philipp Kolb" 4 | date: "18 Januar 2019" 5 | output: beamer_presentation 6 | --- 7 | 8 | ```{r setupmlmotiv, include=FALSE} 9 | knitr::opts_chunk$set(echo = FALSE) 10 | ``` 11 | 12 | 13 | 14 | 15 | ## [Time measurement](https://www.r-bloggers.com/5-ways-to-measure-running-time-of-r-code/) 16 | 17 | ```{r} 18 | start_time <- Sys.time() 19 | ab <- runif(10000000) 20 | end_time <- Sys.time() 21 | 22 | end_time - start_time 23 | ``` 24 | 25 | 26 | ## How many cores are available 27 | 28 | 29 | ```{r} 30 | library(doParallel) 31 | detectCores() 32 | ``` 33 | 34 | ## 35 | 36 | ```{r} 37 | cl <- makeCluster(detectCores()) 38 | registerDoParallel(cl) 39 | ``` 40 | 41 | ```{r} 42 | start_time <- Sys.time() 43 | ab <- runif(10000000) 44 | end_time <- Sys.time() 45 | 46 | end_time - start_time 47 | ``` 48 | 49 | ```{r} 50 | stopCluster(cl) 51 | ``` 52 | 53 | 54 | ```{r} 55 | ?parallel::makeCluster 56 | ``` 57 | 58 | 59 | 60 | 61 | ## Links 62 | 63 | - [Presentations on ‘Elements of Neural Networks & Deep Learning’ ](https://www.r-bloggers.com/my-presentations-on-elements-of-neural-networks-deep-learning-parts-45/) 64 | 65 | - [Understanding the Magic of Neural Networks](https://www.r-bloggers.com/understanding-the-magic-of-neural-networks/) 66 | 67 | - [Neural Text Modelling with R package ruimtehol](https://www.r-bloggers.com/neural-text-modelling-with-r-package-ruimtehol/) 68 | 69 | - [Feature Selection using Genetic Algorithms in R](https://www.r-bloggers.com/feature-selection-using-genetic-algorithms-in-r/) 70 | 71 | - [Lecture slides: Real-World Data Science (Fraud Detection, Customer Churn & Predictive Maintenance)](https://www.r-bloggers.com/lecture-slides-real-world-data-science-fraud-detection-customer-churn-predictive-maintenance/) 72 | 73 | - [Automated Dashboard for Credit Modelling with Decision trees and Random forests in R](https://www.r-bloggers.com/automated-dashboard-for-credit-modelling-with-decision-trees-and-random-forests-in-r/) 74 | 75 | - [Looking Back at Google’s Research Efforts in 2018](https://ai.googleblog.com/2019/01/looking-back-at-googles-research.html) 76 | 77 | - [Selecting ‘special’ photos on your 
phone](https://www.r-bloggers.com/selecting-special-photos-on-your-phone/) 78 | 79 | 80 | - [Open Source AI, ML & Data Science News](https://www.r-bloggers.com/ai-machine-learning-and-data-science-roundup-january-2019/) 81 | -------------------------------------------------------------------------------- /slides/old/a2_intro_ml.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Introducing Machine Learning " 3 | author: "Jan-Philipp Kolb" 4 | date: "03 May 2019" 5 | output: 6 | slidy_presentation: 7 | keep_md: yes 8 | --- 9 | 10 | 11 | 12 | ## [Modern Machine Learning Algorithms](https://elitedatascience.com/machine-learning-algorithms) 13 | 14 | Categorizing machine learning algorithms is tricky, and there are several reasonable approaches; they can be grouped into generative/discriminative, parametric/non-parametric, supervised/unsupervised, and so on. 15 | 16 | 17 | 20 | 21 | ## [Machine Learning - Components](https://www.linkedin.com/pulse/20140822073217-180198720-6-components-of-a-machine-learning-algorithm) 22 | 23 | - Feature Extraction + Domain knowledge 24 | 25 | - Feature Selection 26 | 27 | - Choice of Algorithm 28 | 29 | Naive Bayes, [Support Vector Machines](https://github.com/Japhilko/DataAnalysis/blob/master/Machine%20Learning/SupportVectorMachines.md), Decision Trees, k-Means Clustering, ... 30 | 31 | - Training 32 | 33 | - Choice of Metrics/Evaluation Criteria 34 | 35 | - Testing 36 | 37 | 38 | ## [Feature selection](https://en.wikipedia.org/wiki/Feature_selection) 39 | 40 | 41 | ## [Supervised vs unsupervised learning](https://towardsdatascience.com/supervised-vs-unsupervised-learning-14f68e32ea8d) 42 | 43 | ### Supervised Learning 44 | 45 | - we have prior knowledge of what the output values for our samples should be. 46 | 47 | 48 | ## Task: Find R-packages 49 | 50 | Go to https://cran.r-project.org/ and search for packages that,... 51 | 52 | - can be used for lasso regression 53 | 54 | 57 | 58 | ## Task View Machine Learning 59 | 60 | 61 | ![](figure/taskviewmachinelearning.PNG) 62 | 63 | 64 | 65 | ## Install all packages of a task view 66 | 67 | 68 | ```r 69 | install.packages("ctv") 70 | ctv::install.views("MachineLearning") 71 | ``` 72 | 73 | ## [Prediction vs. 
Causation in Regression Analysis](https://statisticalhorizons.com/prediction-vs-causation-in-regression-analysis) 74 | 75 | ## Literature for machine learning 76 | 77 | ![](figure/book_ml1.jpg) 78 | 79 | 80 | 88 | 89 | ## Introduction to machine learning with R 90 | 91 | - [Your First Machine Learning Project in R Step-By-Step](https://machinelearningmastery.com/machine-learning-in-r-step-by-step/) 92 | 93 | 94 | - chapter about machine learning in [awesome R](https://awesome-r.com/) 95 | 96 | 97 | - [Shiny App for machine learning](https://www.showmeshiny.com/machlearn/) 98 | 99 | 100 | ## [The Curse of Dimensionality](https://elitedatascience.com/dimensionality-reduction-algorithms) 101 | 102 | ![](figure/3d-coordinate-plane.png) 103 | 104 | 105 | ## Links 106 | 107 | - [Presentations on ‘Elements of Neural Networks & Deep Learning’ ](https://www.r-bloggers.com/my-presentations-on-elements-of-neural-networks-deep-learning-parts-45/) 108 | 109 | - [Understanding the Magic of Neural Networks](https://www.r-bloggers.com/understanding-the-magic-of-neural-networks/) 110 | 111 | - [Neural Text Modelling with R package ruimtehol](https://www.r-bloggers.com/neural-text-modelling-with-r-package-ruimtehol/) 112 | 113 | - [Feature Selection using Genetic Algorithms in R](https://www.r-bloggers.com/feature-selection-using-genetic-algorithms-in-r/) 114 | 115 | - [Lecture slides: Real-World Data Science (Fraud Detection, Customer Churn & Predictive Maintenance)](https://www.r-bloggers.com/lecture-slides-real-world-data-science-fraud-detection-customer-churn-predictive-maintenance/) 116 | 117 | - [Automated Dashboard for Credit Modelling with Decision trees and Random forests in R](https://www.r-bloggers.com/automated-dashboard-for-credit-modelling-with-decision-trees-and-random-forests-in-r/) 118 | 119 | - [Looking Back at Google’s Research Efforts in 2018](https://ai.googleblog.com/2019/01/looking-back-at-googles-research.html) 120 | 121 | - [Selecting ‘special’ photos on your phone](https://www.r-bloggers.com/selecting-special-photos-on-your-phone/) 122 | 123 | 124 | - [Open Source AI, ML & Data Science News](https://www.r-bloggers.com/ai-machine-learning-and-data-science-roundup-january-2019/) 125 | 135 | 136 | - Google`s [Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course/) 137 | 138 | - [A prelude to machine learning](https://eight2late.wordpress.com/2017/02/23/a-prelude-to-machine-learning/) 139 | 140 | - [caret webinar by Max Kuhn - on youtube](https://www.youtube.com/watch?v=7Jbb2ItbTC4) 141 | 142 | - [learn-math-for-data-science](https://elitedatascience.com/learn-math-for-data-science) 143 | - [learn-statistics-for-data-science](https://elitedatascience.com/learn-statistics-for-data-science) 144 | 145 | - [machine-learning-projects-for-beginners](https://elitedatascience.com/machine-learning-projects-for-beginners) 146 | 147 | 148 | - [An Introduction to machine learning](http://www-bcf.usc.edu/~gareth/ISL/) 149 | - [ISLR book](http://www-bcf.usc.edu/~gareth/ISL/ISLR%20Seventh%20Printing.pdf) 150 | -------------------------------------------------------------------------------- /slides/old/a2_intro_ml.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/a2_intro_ml.pdf -------------------------------------------------------------------------------- /slides/old/a_intro_ml.Rmd: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: "Introducing Machine Learning " 3 | author: "Jan-Philipp Kolb" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | beamer_presentation: 7 | colortheme: rose 8 | fonttheme: structurebold 9 | highlight: pygments 10 | theme: Darmstadt 11 | fig_width: 8 12 | fig_height: 4 13 | slidy_presentation: 14 | keep_md: yes 15 | --- 16 | 17 | ```{r setupMlintro, include=FALSE} 18 | knitr::opts_chunk$set(echo = TRUE,cache=T,warning=F) 19 | ``` 20 | 21 | ## Intro Machine Learning 22 | 23 | 26 | 27 | Categorizing machine learning algorithms is tricky 28 | 29 | - ... they can be grouped into generative/discriminative, parametric/non-parametric, supervised/unsupervised, and so on. 30 | 31 | - Scikit-Learn’s documentation page groups algorithms by their learning mechanism. This produces categories such as: Generalized linear models, Support vector machines, nearest neighbors, decision trees, neural networks, ... 32 | 33 | 34 | ## [Machine Learning - Components](https://www.linkedin.com/pulse/20140822073217-180198720-6-components-of-a-machine-learning-algorithm) 35 | 36 | - Feature Extraction + Domain knowledge 37 | 38 | - Feature Selection 39 | 40 | - Choice of Algorithm - e.g. Naive Bayes, [Support Vector Machines](https://github.com/Japhilko/DataAnalysis/blob/master/Machine%20Learning/SupportVectorMachines.md), Decision Trees, k-Means Clustering, ... 41 | 42 | - Training 43 | 44 | - Choice of Metrics/Evaluation Criteria 45 | 46 | - Testing 47 | 48 | 49 | ## [Feature selection](https://elitedatascience.com/dimensionality-reduction-algorithms#feature-selection) 50 | 51 | 54 | 55 | Feature selection is for filtering irrelevant or redundant features from your dataset. The key difference between feature selection and extraction is that feature selection keeps a subset of the original features while feature extraction creates brand new ones. 56 | 57 | To be clear, some supervised algorithms already have built-in feature selection, such as Regularized Regression and Random Forests. Typically, we recommend starting with these algorithms if they fit your task. 58 | 59 | As a stand-alone task, feature selection can be unsupervised (e.g. Variance Thresholds) or supervised (e.g. Genetic Algorithms). You can also combine multiple methods if needed. 60 | 61 | 62 | ## [Supervised vs unsupervised learning](https://towardsdatascience.com/supervised-vs-unsupervised-learning-14f68e32ea8d) 63 | 64 | ### Supervised Learning 65 | 66 | - we have prior knowledge of what the output values for our samples should be. 67 | 68 | ### [Unsupervised Learning](https://lagunita.stanford.edu/c4x/HumanitiesScience/StatLearning/asset/unsupervised.pdf) 69 | 70 | - In unsupervised learning we observe only the features $X_1, X_2,...,X_p$ 71 | . We are not interested in prediction, because we do not have an 72 | associated response variable $Y$. 73 | 74 | 75 | ## Task: Find R-packages 76 | 77 | Go to https://cran.r-project.org/ and search for packages that,... 78 | 79 | - can be used for lasso regression 80 | 81 | 84 | 85 | ## Task View Machine Learning 86 | 87 | 88 | ![](figure/taskviewmachinelearning.PNG) 89 | 90 | 91 | ### Install all packages of a task view 92 | 93 | ```{r,eval=F} 94 | install.packages("ctv") 95 | ctv::install.views("MachineLearning") 96 | ``` 97 | 98 | ## [Prediction vs. 
Causation in Regression Analysis](https://statisticalhorizons.com/prediction-vs-causation-in-regression-analysis) 99 | 100 | ## R-packages needed for machine learning 101 | 102 | - caret: Classification and Regression Training 103 | - ggplot2: Create Elegant Data Visualisations Using the Grammar of Graphics 104 | - mlbench 105 | - class 106 | - caTools 107 | - randomForest 108 | - impute 109 | - ranger 110 | - kernlab 111 | - class 112 | - glmnet 113 | - naivebayes 114 | -rpart 115 | -rpart.plot 116 | 117 | 118 | 119 | 124 | 125 | 126 | 127 | 128 | 136 | 137 | ## Introduction to machine learning with R 138 | 139 | - [Your First Machine Learning Project in R Step-By-Step](https://machinelearningmastery.com/machine-learning-in-r-step-by-step/) 140 | 141 | 142 | - chapter about machine learning in [awesome R](https://awesome-r.com/) 143 | 144 | 145 | - [Shiny App for machine learning](https://www.showmeshiny.com/machlearn/) 146 | 147 | 148 | 149 | ## [Time measurement](https://www.r-bloggers.com/5-ways-to-measure-running-time-of-r-code/) 150 | 151 | ```{r} 152 | start_time <- Sys.time() 153 | ab <- runif(10000000) 154 | end_time <- Sys.time() 155 | 156 | end_time - start_time 157 | ``` 158 | 159 | 160 | ## How many cores are available 161 | 162 | 163 | ```{r} 164 | library(doParallel) 165 | detectCores() 166 | ``` 167 | 168 | ## 169 | 170 | ```{r} 171 | cl <- makeCluster(detectCores()) 172 | registerDoParallel(cl) 173 | ``` 174 | 175 | ```{r} 176 | start_time <- Sys.time() 177 | ab <- runif(10000000) 178 | end_time <- Sys.time() 179 | 180 | end_time - start_time 181 | ``` 182 | 183 | ```{r} 184 | stopCluster(cl) 185 | ``` 186 | 187 | 188 | ```{r,eval=F} 189 | ?parallel::makeCluster 190 | ``` 191 | 192 | 193 | 194 | 195 | ## Links 196 | 197 | - [Presentations on ‘Elements of Neural Networks & Deep Learning’ ](https://www.r-bloggers.com/my-presentations-on-elements-of-neural-networks-deep-learning-parts-45/) 198 | 199 | - [Understanding the Magic of Neural Networks](https://www.r-bloggers.com/understanding-the-magic-of-neural-networks/) 200 | 201 | - [Neural Text Modelling with R package ruimtehol](https://www.r-bloggers.com/neural-text-modelling-with-r-package-ruimtehol/) 202 | 203 | - [Feature Selection using Genetic Algorithms in R](https://www.r-bloggers.com/feature-selection-using-genetic-algorithms-in-r/) 204 | 205 | - [Lecture slides: Real-World Data Science (Fraud Detection, Customer Churn & Predictive Maintenance)](https://www.r-bloggers.com/lecture-slides-real-world-data-science-fraud-detection-customer-churn-predictive-maintenance/) 206 | 207 | - [Automated Dashboard for Credit Modelling with Decision trees and Random forests in R](https://www.r-bloggers.com/automated-dashboard-for-credit-modelling-with-decision-trees-and-random-forests-in-r/) 208 | 209 | - [Looking Back at Google’s Research Efforts in 2018](https://ai.googleblog.com/2019/01/looking-back-at-googles-research.html) 210 | 211 | - [Selecting ‘special’ photos on your phone](https://www.r-bloggers.com/selecting-special-photos-on-your-phone/) 212 | 213 | 214 | - [Open Source AI, ML & Data Science News](https://www.r-bloggers.com/ai-machine-learning-and-data-science-roundup-january-2019/) 215 | 222 | 223 | - Google`s [Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course/) 224 | 225 | - [A prelude to machine learning](https://eight2late.wordpress.com/2017/02/23/a-prelude-to-machine-learning/) 226 | 227 | - [caret webinar on youtube](https://www.youtube.com/watch?v=7Jbb2ItbTC4) 228 | 229 | 
- [beginner-mistakes](https://elitedatascience.com/beginner-mistakes) 230 | 231 | 232 | 235 | 236 | -------------------------------------------------------------------------------- /slides/old/a_intro_ml.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Introducing Machine Learning " 3 | author: "Jan-Philipp Kolb" 4 | date: "03 May 2019" 5 | output: 6 | slidy_presentation: 7 | keep_md: yes 8 | --- 9 | 10 | 11 | 12 | 13 | 14 | ## [Machine Learning - Components](https://www.linkedin.com/pulse/20140822073217-180198720-6-components-of-a-machine-learning-algorithm) 15 | 16 | - Feature Extraction + Domain knowledge 17 | 18 | - Feature Selection 19 | 20 | - Choice of Algorithm 21 | 22 | Naive Bayes, [Support Vector Machines](https://github.com/Japhilko/DataAnalysis/blob/master/Machine%20Learning/SupportVectorMachines.md), Decision Trees, k-Means Clustering, ... 23 | 24 | - Training 25 | 26 | - Choice of Metrics/Evaluation Criteria 27 | 28 | - Testing 29 | 30 | 31 | ## [Feature selection](https://en.wikipedia.org/wiki/Feature_selection) 32 | 33 | 34 | ## [Supervised vs unsupervised learning](https://towardsdatascience.com/supervised-vs-unsupervised-learning-14f68e32ea8d) 35 | 36 | ### Supervised Learning 37 | 38 | - we have prior knowledge of what the output values for our samples should be. 39 | 40 | 41 | ## Task: Find R-packages 42 | 43 | Go to https://cran.r-project.org/ and search for packages that,... 44 | 45 | - can be used for lasso regression 46 | 47 | 50 | 51 | ## Task View Machine Learning 52 | 53 | 54 | ![](figure/taskviewmachinelearning.PNG) 55 | 56 | 57 | 58 | ## Install all packages of a task view 59 | 60 | 61 | ```r 62 | install.packages("ctv") 63 | ctv::install.views("MachineLearning") 64 | ``` 65 | 66 | ## [Prediction vs. 
Causation in Regression Analysis](https://statisticalhorizons.com/prediction-vs-causation-in-regression-analysis) 67 | 68 | ## Literature for machine learning 69 | 70 | ![](figure/book_ml1.jpg) 71 | 72 | 73 | 81 | 82 | ## Introduction to machine learning with R 83 | 84 | - [Your First Machine Learning Project in R Step-By-Step](https://machinelearningmastery.com/machine-learning-in-r-step-by-step/) 85 | 86 | 87 | - chapter about machine learning in [awesome R](https://awesome-r.com/) 88 | 89 | 90 | - [Shiny App for machine learning](https://www.showmeshiny.com/machlearn/) 91 | 92 | 93 | 94 | ## [Time measurement](https://www.r-bloggers.com/5-ways-to-measure-running-time-of-r-code/) 95 | 96 | 97 | ```r 98 | start_time <- Sys.time() 99 | ab <- runif(10000000) 100 | end_time <- Sys.time() 101 | 102 | end_time - start_time 103 | ``` 104 | 105 | ``` 106 | ## Time difference of 1.286074 secs 107 | ``` 108 | 109 | 110 | ## How many cores are available 111 | 112 | 113 | 114 | ```r 115 | library(doParallel) 116 | ``` 117 | 118 | ``` 119 | ## Warning: package 'doParallel' was built under R version 3.5.2 120 | ``` 121 | 122 | ``` 123 | ## Loading required package: foreach 124 | ``` 125 | 126 | ``` 127 | ## Warning: package 'foreach' was built under R version 3.5.1 128 | ``` 129 | 130 | ``` 131 | ## Loading required package: iterators 132 | ``` 133 | 134 | ``` 135 | ## Loading required package: parallel 136 | ``` 137 | 138 | ```r 139 | detectCores() 140 | ``` 141 | 142 | ``` 143 | ## [1] 4 144 | ``` 145 | 146 | ## 147 | 148 | 149 | ```r 150 | cl <- makeCluster(detectCores()) 151 | registerDoParallel(cl) 152 | ``` 153 | 154 | 155 | ```r 156 | start_time <- Sys.time() 157 | ab <- runif(10000000) 158 | end_time <- Sys.time() 159 | 160 | end_time - start_time 161 | ``` 162 | 163 | ``` 164 | ## Time difference of 0.454026 secs 165 | ``` 166 | 167 | 168 | ```r 169 | stopCluster(cl) 170 | ``` 171 | 172 | 173 | 174 | ```r 175 | ?parallel::makeCluster 176 | ``` 177 | 178 | 179 | 180 | 181 | ## Links 182 | 183 | - [Presentations on ‘Elements of Neural Networks & Deep Learning’ ](https://www.r-bloggers.com/my-presentations-on-elements-of-neural-networks-deep-learning-parts-45/) 184 | 185 | - [Understanding the Magic of Neural Networks](https://www.r-bloggers.com/understanding-the-magic-of-neural-networks/) 186 | 187 | - [Neural Text Modelling with R package ruimtehol](https://www.r-bloggers.com/neural-text-modelling-with-r-package-ruimtehol/) 188 | 189 | - [Feature Selection using Genetic Algorithms in R](https://www.r-bloggers.com/feature-selection-using-genetic-algorithms-in-r/) 190 | 191 | - [Lecture slides: Real-World Data Science (Fraud Detection, Customer Churn & Predictive Maintenance)](https://www.r-bloggers.com/lecture-slides-real-world-data-science-fraud-detection-customer-churn-predictive-maintenance/) 192 | 193 | - [Automated Dashboard for Credit Modelling with Decision trees and Random forests in R](https://www.r-bloggers.com/automated-dashboard-for-credit-modelling-with-decision-trees-and-random-forests-in-r/) 194 | 195 | - [Looking Back at Google’s Research Efforts in 2018](https://ai.googleblog.com/2019/01/looking-back-at-googles-research.html) 196 | 197 | - [Selecting ‘special’ photos on your phone](https://www.r-bloggers.com/selecting-special-photos-on-your-phone/) 198 | 199 | 200 | - [Open Source AI, ML & Data Science News](https://www.r-bloggers.com/ai-machine-learning-and-data-science-roundup-january-2019/) 201 | 208 | 209 | - Google`s [Machine Learning Crash 
Course](https://developers.google.com/machine-learning/crash-course/) 210 | 211 | - [A prelude to machine learning](https://eight2late.wordpress.com/2017/02/23/a-prelude-to-machine-learning/) 212 | 213 | - [caret webinar on youtube](https://www.youtube.com/watch?v=7Jbb2ItbTC4) 214 | 215 | - [beginner-mistakes](https://elitedatascience.com/beginner-mistakes) 216 | -------------------------------------------------------------------------------- /slides/old/a_intro_ml.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/a_intro_ml.pdf -------------------------------------------------------------------------------- /slides/old/advanced_regression.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Advanced Regression" 3 | author: "Jan-Philipp Kolb" 4 | date: "18 Januar 2019" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## [Marginal effects](https://www.r-bloggers.com/ggeffects-0-8-0-now-on-cran-marginal-effects-for-regression-models-rstats/) 13 | 14 | - marginal effects for regression models -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression.nav: -------------------------------------------------------------------------------- 1 | \beamer@endinputifotherversion {3.36pt} 2 | \headcommand {\slideentry {0}{0}{1}{1/1}{}{0}} 3 | \headcommand {\beamer@framepages {1}{1}} 4 | \headcommand {\slideentry {0}{0}{2}{2/2}{}{0}} 5 | \headcommand {\beamer@framepages {2}{2}} 6 | \headcommand {\slideentry {0}{0}{3}{3/3}{}{0}} 7 | \headcommand {\beamer@framepages {3}{3}} 8 | \headcommand {\slideentry {0}{0}{4}{4/4}{}{0}} 9 | \headcommand {\beamer@framepages {4}{4}} 10 | \headcommand {\slideentry {0}{0}{5}{5/5}{}{0}} 11 | \headcommand {\beamer@framepages {5}{5}} 12 | \headcommand {\slideentry {0}{0}{6}{6/6}{}{0}} 13 | \headcommand {\beamer@framepages {6}{6}} 14 | \headcommand {\slideentry {0}{0}{7}{7/7}{}{0}} 15 | \headcommand {\beamer@framepages {7}{7}} 16 | \headcommand {\slideentry {0}{0}{8}{8/8}{}{0}} 17 | \headcommand {\beamer@framepages {8}{8}} 18 | \headcommand {\slideentry {0}{0}{9}{9/9}{}{0}} 19 | \headcommand {\beamer@framepages {9}{9}} 20 | \headcommand {\slideentry {0}{0}{10}{10/10}{}{0}} 21 | \headcommand {\beamer@framepages {10}{10}} 22 | \headcommand {\slideentry {0}{0}{11}{11/11}{}{0}} 23 | \headcommand {\beamer@framepages {11}{11}} 24 | \headcommand {\slideentry {0}{0}{12}{12/12}{}{0}} 25 | \headcommand {\beamer@framepages {12}{12}} 26 | \headcommand {\slideentry {0}{0}{13}{13/13}{}{0}} 27 | \headcommand {\beamer@framepages {13}{13}} 28 | \headcommand {\slideentry {0}{0}{14}{14/14}{}{0}} 29 | \headcommand {\beamer@framepages {14}{14}} 30 | \headcommand {\slideentry {0}{0}{15}{15/15}{}{0}} 31 | \headcommand {\beamer@framepages {15}{15}} 32 | \headcommand {\slideentry {0}{0}{16}{16/16}{}{0}} 33 | \headcommand {\beamer@framepages {16}{16}} 34 | \headcommand {\slideentry {0}{0}{17}{17/17}{}{0}} 35 | \headcommand {\beamer@framepages {17}{17}} 36 | \headcommand {\slideentry {0}{0}{18}{18/18}{}{0}} 37 | \headcommand {\beamer@framepages {18}{18}} 38 | \headcommand {\slideentry {0}{0}{19}{19/19}{}{0}} 39 | \headcommand {\beamer@framepages {19}{19}} 40 | \headcommand {\slideentry {0}{0}{20}{20/20}{}{0}} 41 | \headcommand {\beamer@framepages {20}{20}} 42 | \headcommand 
{\slideentry {0}{0}{21}{21/21}{}{0}} 43 | \headcommand {\beamer@framepages {21}{21}} 44 | \headcommand {\slideentry {0}{0}{22}{22/22}{}{0}} 45 | \headcommand {\beamer@framepages {22}{22}} 46 | \headcommand {\slideentry {0}{0}{23}{23/23}{}{0}} 47 | \headcommand {\beamer@framepages {23}{23}} 48 | \headcommand {\slideentry {0}{0}{24}{24/24}{}{0}} 49 | \headcommand {\beamer@framepages {24}{24}} 50 | \headcommand {\slideentry {0}{0}{25}{25/25}{}{0}} 51 | \headcommand {\beamer@framepages {25}{25}} 52 | \headcommand {\slideentry {0}{0}{26}{26/26}{}{0}} 53 | \headcommand {\beamer@framepages {26}{26}} 54 | \headcommand {\slideentry {0}{0}{27}{27/27}{}{0}} 55 | \headcommand {\beamer@framepages {27}{27}} 56 | \headcommand {\slideentry {0}{0}{28}{28/28}{}{0}} 57 | \headcommand {\beamer@framepages {28}{28}} 58 | \headcommand {\slideentry {0}{0}{29}{29/29}{}{0}} 59 | \headcommand {\beamer@framepages {29}{29}} 60 | \headcommand {\slideentry {0}{0}{30}{30/30}{}{0}} 61 | \headcommand {\beamer@framepages {30}{30}} 62 | \headcommand {\slideentry {0}{0}{31}{31/31}{}{0}} 63 | \headcommand {\beamer@framepages {31}{31}} 64 | \headcommand {\slideentry {0}{0}{32}{32/32}{}{0}} 65 | \headcommand {\beamer@framepages {32}{32}} 66 | \headcommand {\slideentry {0}{0}{33}{33/33}{}{0}} 67 | \headcommand {\beamer@framepages {33}{33}} 68 | \headcommand {\slideentry {0}{0}{34}{34/34}{}{0}} 69 | \headcommand {\beamer@framepages {34}{34}} 70 | \headcommand {\slideentry {0}{0}{35}{35/35}{}{0}} 71 | \headcommand {\beamer@framepages {35}{35}} 72 | \headcommand {\slideentry {0}{0}{36}{36/36}{}{0}} 73 | \headcommand {\beamer@framepages {36}{36}} 74 | \headcommand {\slideentry {0}{0}{37}{37/37}{}{0}} 75 | \headcommand {\beamer@framepages {37}{37}} 76 | \headcommand {\slideentry {0}{0}{38}{38/38}{}{0}} 77 | \headcommand {\beamer@framepages {38}{38}} 78 | \headcommand {\slideentry {0}{0}{39}{39/39}{}{0}} 79 | \headcommand {\beamer@framepages {39}{39}} 80 | \headcommand {\slideentry {0}{0}{40}{40/40}{}{0}} 81 | \headcommand {\beamer@framepages {40}{40}} 82 | \headcommand {\slideentry {0}{0}{41}{41/41}{}{0}} 83 | \headcommand {\beamer@framepages {41}{41}} 84 | \headcommand {\slideentry {0}{0}{42}{42/42}{}{0}} 85 | \headcommand {\beamer@framepages {42}{42}} 86 | \headcommand {\slideentry {0}{0}{43}{43/43}{}{0}} 87 | \headcommand {\beamer@framepages {43}{43}} 88 | \headcommand {\slideentry {0}{0}{44}{44/44}{}{0}} 89 | \headcommand {\beamer@framepages {44}{44}} 90 | \headcommand {\slideentry {0}{0}{45}{45/45}{}{0}} 91 | \headcommand {\beamer@framepages {45}{45}} 92 | \headcommand {\slideentry {0}{0}{46}{46/46}{}{0}} 93 | \headcommand {\beamer@framepages {46}{46}} 94 | \headcommand {\slideentry {0}{0}{47}{47/47}{}{0}} 95 | \headcommand {\beamer@framepages {47}{47}} 96 | \headcommand {\slideentry {0}{0}{48}{48/48}{}{0}} 97 | \headcommand {\beamer@framepages {48}{48}} 98 | \headcommand {\slideentry {0}{0}{49}{49/49}{}{0}} 99 | \headcommand {\beamer@framepages {49}{49}} 100 | \headcommand {\slideentry {0}{0}{50}{50/50}{}{0}} 101 | \headcommand {\beamer@framepages {50}{50}} 102 | \headcommand {\slideentry {0}{0}{51}{51/51}{}{0}} 103 | \headcommand {\beamer@framepages {51}{51}} 104 | \headcommand {\slideentry {0}{0}{52}{52/52}{}{0}} 105 | \headcommand {\beamer@framepages {52}{52}} 106 | \headcommand {\slideentry {0}{0}{53}{53/53}{}{0}} 107 | \headcommand {\beamer@framepages {53}{53}} 108 | \headcommand {\slideentry {0}{0}{54}{54/54}{}{0}} 109 | \headcommand {\beamer@framepages {54}{54}} 110 | \headcommand {\slideentry {0}{0}{55}{55/55}{}{0}} 111 
| \headcommand {\beamer@framepages {55}{55}} 112 | \headcommand {\slideentry {0}{0}{56}{56/56}{}{0}} 113 | \headcommand {\beamer@framepages {56}{56}} 114 | \headcommand {\slideentry {0}{0}{57}{57/57}{}{0}} 115 | \headcommand {\beamer@framepages {57}{57}} 116 | \headcommand {\slideentry {0}{0}{58}{58/58}{}{0}} 117 | \headcommand {\beamer@framepages {58}{58}} 118 | \headcommand {\slideentry {0}{0}{59}{59/59}{}{0}} 119 | \headcommand {\beamer@framepages {59}{59}} 120 | \headcommand {\slideentry {0}{0}{60}{60/60}{}{0}} 121 | \headcommand {\beamer@framepages {60}{60}} 122 | \headcommand {\slideentry {0}{0}{61}{61/61}{}{0}} 123 | \headcommand {\beamer@framepages {61}{61}} 124 | \headcommand {\slideentry {0}{0}{62}{62/62}{}{0}} 125 | \headcommand {\beamer@framepages {62}{62}} 126 | \headcommand {\slideentry {0}{0}{63}{63/63}{}{0}} 127 | \headcommand {\beamer@framepages {63}{63}} 128 | \headcommand {\slideentry {0}{0}{64}{64/64}{}{0}} 129 | \headcommand {\beamer@framepages {64}{64}} 130 | \headcommand {\slideentry {0}{0}{65}{65/65}{}{0}} 131 | \headcommand {\beamer@framepages {65}{65}} 132 | \headcommand {\slideentry {0}{0}{66}{66/66}{}{0}} 133 | \headcommand {\beamer@framepages {66}{66}} 134 | \headcommand {\slideentry {0}{0}{67}{67/67}{}{0}} 135 | \headcommand {\beamer@framepages {67}{67}} 136 | \headcommand {\slideentry {0}{0}{68}{68/68}{}{0}} 137 | \headcommand {\beamer@framepages {68}{68}} 138 | \headcommand {\slideentry {0}{0}{69}{69/69}{}{0}} 139 | \headcommand {\beamer@framepages {69}{69}} 140 | \headcommand {\slideentry {0}{0}{70}{70/70}{}{0}} 141 | \headcommand {\beamer@framepages {70}{70}} 142 | \headcommand {\slideentry {0}{0}{71}{71/71}{}{0}} 143 | \headcommand {\beamer@framepages {71}{71}} 144 | \headcommand {\slideentry {0}{0}{72}{72/72}{}{0}} 145 | \headcommand {\beamer@framepages {72}{72}} 146 | \headcommand {\slideentry {0}{0}{73}{73/73}{}{0}} 147 | \headcommand {\beamer@framepages {73}{73}} 148 | \headcommand {\beamer@partpages {1}{73}} 149 | \headcommand {\beamer@subsectionpages {1}{73}} 150 | \headcommand {\beamer@sectionpages {1}{73}} 151 | \headcommand {\beamer@documentpages {73}} 152 | \headcommand {\def \inserttotalframenumber {73}} 153 | -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression.snm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression.snm -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression.tex: -------------------------------------------------------------------------------- 1 | \documentclass[10pt,ignorenonframetext,]{beamer} 2 | \setbeamertemplate{caption}[numbered] 3 | \setbeamertemplate{caption label separator}{: } 4 | \setbeamercolor{caption name}{fg=normal text.fg} 5 | \beamertemplatenavigationsymbolsempty 6 | \usepackage{lmodern} 7 | \usepackage{amssymb,amsmath} 8 | \usepackage{ifxetex,ifluatex} 9 | \usepackage{fixltx2e} % provides \textsubscript 10 | \ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex 11 | \usepackage[T1]{fontenc} 12 | \usepackage[utf8]{inputenc} 13 | \else % if luatex or xelatex 14 | \ifxetex 15 | \usepackage{mathspec} 16 | \else 17 | \usepackage{fontspec} 18 | \fi 19 | \defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase} 20 | \fi 21 | \usetheme[]{Dresden} 22 | \usecolortheme{dolphin} 23 | \usefonttheme{structuresmallcapsserif} 24 | % use upquote if 
available, for straight quotes in verbatim environments 25 | \IfFileExists{upquote.sty}{\usepackage{upquote}}{} 26 | % use microtype if available 27 | \IfFileExists{microtype.sty}{% 28 | \usepackage{microtype} 29 | \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts 30 | }{} 31 | \newif\ifbibliography 32 | \hypersetup{ 33 | pdftitle={Regularization methods}, 34 | pdfauthor={Jan-Philipp Kolb}, 35 | pdfborder={0 0 0}, 36 | breaklinks=true} 37 | \urlstyle{same} % don't use monospace font for urls 38 | 39 | % Prevent slide breaks in the middle of a paragraph: 40 | \widowpenalties 1 10000 41 | \raggedbottom 42 | 43 | \AtBeginPart{ 44 | \let\insertpartnumber\relax 45 | \let\partname\relax 46 | \frame{\partpage} 47 | } 48 | \AtBeginSection{ 49 | \ifbibliography 50 | \else 51 | \let\insertsectionnumber\relax 52 | \let\sectionname\relax 53 | \frame{\sectionpage} 54 | \fi 55 | } 56 | \AtBeginSubsection{ 57 | \let\insertsubsectionnumber\relax 58 | \let\subsectionname\relax 59 | \frame{\subsectionpage} 60 | } 61 | 62 | \setlength{\parindent}{0pt} 63 | \setlength{\parskip}{6pt plus 2pt minus 1pt} 64 | \setlength{\emergencystretch}{3em} % prevent overfull lines 65 | \providecommand{\tightlist}{% 66 | \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} 67 | \setcounter{secnumdepth}{0} 68 | 69 | \title{Regularization methods} 70 | \author{Jan-Philipp Kolb} 71 | \date{17 Mai, 2019} 72 | 73 | \begin{document} 74 | \frame{\titlepage} 75 | 76 | \begin{frame} 77 | 78 | \end{frame} 79 | 80 | \end{document} 81 | -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression.toc: -------------------------------------------------------------------------------- 1 | \beamer@endinputifotherversion {3.36pt} 2 | -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression.vrb: -------------------------------------------------------------------------------- 1 | \frametitle{Further packages} 2 | \protect\hypertarget{further-packages}{} 3 | 4 | \begin{Shaded} 5 | \begin{Highlighting}[] 6 | \CommentTok{# https://cran.rstudio.com/web/packages/biglasso/biglasso.pdf} 7 | \KeywordTok{install.packages}\NormalTok{(}\StringTok{"biglasso"}\NormalTok{)} 8 | \end{Highlighting} 9 | \end{Shaded} 10 | 11 | -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-12-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-12-1.pdf -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-17-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-17-1.pdf -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-20-1.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-20-1.pdf -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-23-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-23-1.pdf -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-25-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-25-1.pdf -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-27-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-27-1.pdf -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-36-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-36-1.pdf -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-12-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-12-1.png -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-17-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-17-1.png -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-20-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-20-1.png -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-23-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-23-1.png -------------------------------------------------------------------------------- 
/slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-25-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-25-1.png -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-27-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-27-1.png -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-36-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-36-1.png -------------------------------------------------------------------------------- /slides/old/b_lasso_regression.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lasso Regression" 3 | author: "Jan-Philipp Kolb" 4 | date: "25 September 2018" 5 | output: beamer_presentation 6 | --- 7 | 8 | ```{r setuplasso, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE,message = F,warning = F) 10 | ``` 11 | 12 | 13 | ## [Lasso Regression](https://en.wikipedia.org/wiki/Lasso_(statistics)) 14 | 15 | ### Lasso - least absolute shrinkage and selection operator 16 | 17 | - lasso is a regression analysis method that performs variable selection and regularization (reduce overfitting) 18 | - We want to enhance prediction accuracy and interpretability of the statistical model. 19 | 20 | 23 | 24 | - We could remove less important variables, after checking that they are not important. 25 | - We can do that manually by examining p-values of coefficients and discarding those variables whose coefficients are not significant. 26 | - This can become tedious for classification problems with many independent variables 27 | 28 | 29 | ## History of lasso 30 | 31 | - Originally introduced in geophysics literature in 1986 32 | - Independently rediscovered and popularized in 1996 by Robert Tibshirani, who coined the term and provided further insights into the observed performance. 33 | 34 | 35 | 36 | Lasso was originally formulated for least squares models and this simple case reveals a substantial amount about the behavior of the estimator, including its relationship to ridge regression and best subset selection and the connections between lasso coefficient estimates and so-called soft thresholding. It also reveals that (like standard linear regression) the coefficient estimates need not be unique if covariates are collinear. 37 | 38 | ## Lasso for other models than least squares 39 | 40 | Though originally defined for least squares, lasso regularization is easily extended to a wide variety of statistical models including generalized linear models, generalized estimating equations, proportional hazards models, and M-estimators, in a straightforward fashion. 
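A minimal sketch of this extension (added here as an illustration, not part of the original slides; the simulated data and object names are made up): `glmnet` takes a `family` argument, so the same L1 penalty can be applied to, for example, a Poisson GLM for count data.

```{r, eval=F}
library(glmnet)
set.seed(1)
# illustrative data: a design matrix and a Poisson-distributed response
x <- matrix(rnorm(100 * 10), 100, 10)
y <- rpois(100, lambda = exp(x[, 1]))
# lasso-penalized Poisson regression; cv.glmnet chooses lambda by cross-validation
cv_fit <- cv.glmnet(x, y, family = "poisson", alpha = 1)
coef(cv_fit, s = "lambda.min")
```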
41 | 42 | - Lasso’s ability to perform subset selection relies on the form of the constraint and has a variety of interpretations including in terms of geometry, Bayesian statistics, and convex analysis. 43 | 44 | The LASSO is closely related to basis pursuit denoising. 45 | 46 | 47 | 48 | 49 | ## What is [lasso regression](http://www.statisticshowto.com/lasso-regression/) 50 | 51 | - Lasso regression uses shrinkage 52 | - data values are shrunk towards a central point 53 | 54 | - [Ridge and lasso regularization work by adding a penalty term to the log likelihood function.](https://eight2late.wordpress.com/2017/07/11/a-gentle-introduction-to-logistic-regression-and-lasso-regularisation-using-r/) 55 | 56 | - A tuning parameter, $\lambda$ controls the strength of the L1 penalty. 57 | 58 | $$ 59 | \sum\limits_{i=1}^n \big( y_i -\beta_0 - \sum\limits_{j=1}^p \beta_jx_{ij} \big)^2 + \lambda \sum\limits_{j=1}^p |\beta_j| = RSS + \lambda\sum\limits_{j=1}^p |\beta_j|. 60 | $$ 61 | 67 | 68 | ## [Regularization](https://en.wikipedia.org/wiki/Regularization_(mathematics)) 69 | 70 | 71 | regularization is the process of adding information in order to solve an ill-posed problem or to prevent [overfitting](https://en.wikipedia.org/wiki/Overfitting). 72 | 73 | ![](figure/450px-Overfitting.svg.png) 74 | 75 | The green line represents an overfitted model and the black line represents a regularized model. While the green line best follows the training data, it is too dependent on that data and it is likely to have a higher error rate on new unseen data, compared to the black line. 76 | 77 | 80 | 81 | 82 | ## [The L1 norm explained](https://stats.stackexchange.com/questions/347257/geometrical-interpretation-of-l1-regression) 83 | 84 | ![](figure/BBRXC.png) 85 | 86 | ## [Ridge Regression and the Lasso](https://www.r-bloggers.com/ridge-regression-and-the-lasso/) 87 | 88 | ```{r} 89 | swiss <- datasets::swiss 90 | x <- model.matrix(Fertility~., swiss)[,-1] 91 | y <- swiss$Fertility 92 | lambda <- 10^seq(10, -2, length = 100) 93 | ``` 94 | 95 | ## Test and train dataset 96 | 97 | ```{r} 98 | library(glmnet) 99 | set.seed(489) 100 | train = sample(1:nrow(x), nrow(x)/2) 101 | test = (-train) 102 | ytest = y[test] 103 | ``` 104 | 105 | 106 | ## A first ols model 107 | 108 | ```{r} 109 | #OLS 110 | swisslm <- lm(Fertility~., data = swiss) 111 | coef(swisslm) 112 | ``` 113 | 114 | ## A ridge model 115 | 116 | ```{r} 117 | #ridge 118 | ridge.mod <- glmnet(x, y, alpha = 0, lambda = lambda) 119 | predict(ridge.mod, s = 0, type = 'coefficients')[1:6,] 120 | ``` 121 | 122 | 123 | ## Lasso regression with package `glmnet` 124 | 125 | ```{r,eval=F} 126 | install.packages("glmnet") 127 | ``` 128 | 129 | ```{r} 130 | library(glmnet) 131 | ``` 132 | 133 | ```{r} 134 | x=matrix(rnorm(100*20),100,20) 135 | g2=sample(1:2,100,replace=TRUE) 136 | fit2=glmnet(x,g2,family="binomial") 137 | ``` 138 | 139 | ```{r,eval=T} 140 | caret::varImp(fit2,lambda=0.0007567) 141 | ``` 142 | 143 | 144 | ## 145 | 146 | - LASSO is a feature selection method. 147 | 150 | - LASSO regression has inbuilt penalization functions to reduce overfitting. 151 | 154 | 155 | 156 | ## 157 | 158 | - The logarithmic function is used for the link between probability and logits 159 | 160 | - The Logit function is used to [linearize sigmoid curves](https://de.wikipedia.org/wiki/Logit). 
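A short sketch of that link (added for illustration; `qlogis()` and `plogis()` are base R's logit and inverse-logit functions):

```{r, eval=F}
p <- seq(0.01, 0.99, by = 0.01)
logit_p <- qlogis(p)           # log(p / (1 - p)): maps (0, 1) onto the real line
all.equal(plogis(logit_p), p)  # the inverse logit (sigmoid) recovers the probabilities
plot(p, logit_p, type = "l", xlab = "probability p", ylab = "logit(p)")
```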
161 | 162 | 165 | 166 | ## The package `caret` 167 | 168 | - Classification and Regression Training 169 | 170 | ```{r,eval=F} 171 | install.packages("caret") 172 | ``` 173 | 174 | ```{r} 175 | library("caret") 176 | ``` 177 | 178 | - [**Vignette `caret` package **](https://cran.r-project.org/web/packages/caret/vignettes/caret.html) - 179 | 180 | ## 181 | 182 | ```{r,eval=F} 183 | ?caret::train 184 | ``` 185 | 186 | 187 | 188 | ```{r,eval=F} 189 | logit<-train(,data = gp.train.c, 190 | method = 'glm', 191 | family = 'binomial', 192 | trControl = ctrl0)") 193 | ``` 194 | 195 | 196 | ## Further packages 197 | 198 | ```{r,eval=F} 199 | # https://cran.rstudio.com/web/packages/biglasso/biglasso.pdf 200 | install.packages("biglasso") 201 | ``` 202 | 203 | 204 | 205 | ## Links 206 | 207 | 208 | [A comprehensive beginners guide for Linear, Ridge and Lasso Regression](https://www.analyticsvidhya.com/blog/2017/06/a-comprehensive-guide-for-linear-ridge-and-lasso-regression/) 209 | 210 | - Course for statistical learning - [Youtube - Videos](https://www.r-bloggers.com/in-depth-introduction-to-machine-learning-in-15-hours-of-expert-videos/) 211 | 212 | - [pcLasso: a new method for sparse regression](https://www.r-bloggers.com/pclasso-a-new-method-for-sparse-regression/) 213 | 214 | - [Youtube - lasso regression - clearly explained](https://www.youtube.com/watch?v=NGf0voTMlcs) 215 | 216 | - [Glmnet Vignette](https://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html) 217 | 218 | - [Regularization Methods in R](https://www.geo.fu-berlin.de/en/v/soga/Geodata-analysis/multiple-regression/Regularization-Methods/Regularization-Methods-in-R/index.html) 219 | 220 | - [A gentle introduction to logistic regression and lasso regularisation using R](https://eight2late.wordpress.com/2017/07/11/a-gentle-introduction-to-logistic-regression-and-lasso-regularisation-using-r/) 221 | 222 | - [Penalized Regression in R](https://machinelearningmastery.com/penalized-regression-in-r/) 223 | 224 | - [Penalized Logistic Regression Essentials in R](http://www.sthda.com/english/articles/36-classification-methods-essentials/149-penalized-logistic-regression-essentials-in-r-ridge-lasso-and-elastic-net/) -------------------------------------------------------------------------------- /slides/old/c_bagging_boosting_trees.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/c_bagging_boosting_trees.pdf -------------------------------------------------------------------------------- /slides/old/caret.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The package caret" 3 | author: "Jan-Philipp Kolb" 4 | date: "21 November 2018" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## Loading the libraries 13 | 14 | ```{r} 15 | library(MLmetrics) 16 | library(party) 17 | library(partykit) 18 | library(caret) 19 | ``` 20 | 21 | ## An example dataset 22 | 23 | ```{r} 24 | n <-1000 25 | gp.train.c <- data.frame(D_dropout=as.factor(sample(c("yes","no"),n,replace=T)), 26 | sd_habit=runif(n), 27 | sd_identify=runif(n), 28 | another_var=as.factor(sample(c("yes","no","maybe"),n,replace=T))) 29 | ``` 30 | 31 | ## Preliminaries 32 | 33 | - No missing values are possible in the dataset 34 | 35 | 36 | ## 37 | 38 | ```{r} 39 | cvIndex <- caret::createFolds(gp.train.c$D_dropout, 10, 
returnTrain = T) 40 | fiveStats <- function(...) c(twoClassSummary(...), defaultSummary(...)) 41 | 42 | ctrl <- caret::trainControl(method = "cv", 43 | number = 10, 44 | index = cvIndex, 45 | summaryFunction = fiveStats, 46 | classProbs = TRUE) 47 | ``` 48 | 49 | 50 | 51 | 52 | 53 | ```{r} 54 | grid <- expand.grid(alpha = c(0,1), 55 | lambda = seq(0.5,0,length=50)) 56 | ``` 57 | 58 | 59 | ```{r} 60 | lasso<-caret::train( D_dropout ~ sd_habit + sd_identify +another_var , 61 | data=gp.train.c,method ='glmnet', 62 | family= 'binomial',trControl = ctrl, 63 | tuneGrid = grid,metric = 'Kappa') 64 | ``` 65 | 66 | -------------------------------------------------------------------------------- /slides/old/conditional_inference_trees.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Conditional Inference Trees" 3 | author: "Jan-Philipp Kolb" 4 | date: "28 Juni 2019" 5 | output: pdf_presentation 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | 13 | ## [ctree example](https://datawookie.netlify.com/blog/2013/05/package-party-conditional-inference-trees/) 14 | 15 | ```{r,eval=F} 16 | install.packages("party") 17 | ``` 18 | 19 | ## The data behind 20 | 21 | ```{r} 22 | airq <- subset(airquality, !is.na(Ozone)) 23 | summary(airq$Temp) 24 | ``` 25 | 26 | ## A first model 27 | 28 | ```{r} 29 | library(party) 30 | ``` 31 | 32 | 33 | ```{r} 34 | air.ct <- ctree(Ozone ~ ., data = airq, controls = ctree_control(maxsurrogate = 3)) 35 | ``` 36 | 37 | 38 | ## The plot for `ctree` 39 | 40 | ```{r} 41 | plot(air.ct) 42 | ``` 43 | 44 | 45 | 46 | 47 | ## Recursive partitioning algorithms are special cases of a 48 | simple two-stage algorithm 49 | 50 | - First partition the observations by univariate splits in a recursive way and 51 | - second fit a constant model in each cell of the resulting partition. 
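A minimal sketch of this two-stage view, reusing the `airq` data created above (the `partykit` package shown on the following slides is assumed to be installed): the splits define a partition of the observations, and the fitted value in each terminal node is simply a constant, here the node mean of `Ozone`.

```{r,eval=F}
library(partykit)
airq <- subset(airquality, !is.na(Ozone))
# stage 1: recursive univariate splits define a partition of the observations
air_ct <- ctree(Ozone ~ ., data = airq)
# stage 2: a constant model is fitted in each cell - here the node mean of Ozone
node_id <- predict(air_ct, type = "node")  # terminal node of each observation
tapply(airq$Ozone, node_id, mean)          # the fitted constants per node
head(predict(air_ct))                      # predictions are exactly these node means
```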
52 | 53 | 54 | ## [`ctree` - Regression](https://stats.stackexchange.com/questions/171301/interpreting-ctree-partykit-output-in-r) 55 | 56 | ```{r} 57 | library(partykit) 58 | ``` 59 | 60 | ```{r,eval=F} 61 | ?ctree 62 | ``` 63 | 64 | ```{r} 65 | airq <- subset(airquality, !is.na(Ozone)) 66 | airct <- ctree(Ozone ~ ., data = airq) 67 | plot(airct, type = "simple") 68 | ``` 69 | 70 | ## Links 71 | 72 | - [**Vignette**](https://cran.r-project.org/web/packages/partykit/vignettes/ctree.pdf) for package `partykit` 73 | 74 | - [Conditional Inference Trees](https://rpubs.com/awanindra01/ctree) -------------------------------------------------------------------------------- /slides/old/doParallel.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "parallel" 3 | author: "Jan-Philipp Kolb" 4 | date: "30 Januar 2019" 5 | output: beamer_presentation 6 | --- 7 | 8 | ```{r, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | 13 | ## [Time measurement](https://www.r-bloggers.com/5-ways-to-measure-running-time-of-r-code/) 14 | 15 | ```{r} 16 | start_time <- Sys.time() 17 | ab <- runif(10000000) 18 | end_time <- Sys.time() 19 | 20 | end_time - start_time 21 | ``` 22 | 23 | 24 | ## How many cores are available 25 | 26 | 27 | ```{r} 28 | library(doParallel) 29 | detectCores() 30 | ``` 31 | 32 | ## 33 | 34 | ```{r} 35 | cl <- makeCluster(detectCores()) 36 | registerDoParallel(cl) 37 | ``` 38 | 39 | ```{r} 40 | start_time <- Sys.time() 41 | ab <- runif(10000000) 42 | end_time <- Sys.time() 43 | 44 | end_time - start_time 45 | ``` 46 | 47 | ```{r} 48 | stopCluster(cl) 49 | ``` 50 | 51 | 52 | ```{r} 53 | ?parallel::makeCluster 54 | ``` 55 | 56 | -------------------------------------------------------------------------------- /slides/old/evaluation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/evaluation.pdf -------------------------------------------------------------------------------- /slides/old/gradient_boosting.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Gradient Boosting" 3 | author: "Jan-Philipp Kolb" 4 | date: "4 September 2018" 5 | output: beamer_presentation 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = FALSE) 10 | ``` 11 | 12 | 13 | ## [Gradient boosting](https://en.wikipedia.org/wiki/Gradient_boosting) 14 | 15 | Gradient boosting is a machine learning technique for regression and classification problems, which produces a prediction model in the form of an ensemble of weak prediction models, typically decision trees. It builds the model in a stage-wise fashion like other boosting methods do, and it generalizes them by allowing optimization of an arbitrary differentiable loss function. 16 | 17 | The idea of gradient boosting originated in the observation by Leo Breiman that boosting can be interpreted as an optimization algorithm on a suitable cost function. 18 | 19 | 20 | Breiman, L. (1997). "Arcing The Edge". Technical Report 486. Statistics Department, University of California, Berkeley. 21 | 22 | 23 | 25 | 26 | ## Explicit algorithms 27 | 28 | Explicit regression gradient boosting algorithms were subsequently developed by Jerome H. 
Friedman,[2][3] simultaneously with the more general functional gradient boosting perspective of Llew Mason, Jonathan Baxter, Peter Bartlett and Marcus Frean.[4][5] 29 | 30 | 31 | The latter two papers introduced the view of boosting algorithms as iterative functional gradient descent algorithms. That is, algorithms that optimize a cost function over function space by iteratively choosing a function (weak hypothesis) that points in the negative gradient direction. This functional gradient view of boosting has led to the development of boosting algorithms in many areas of machine learning and statistics beyond regression and classification. 32 | 33 | 34 | ## [**Advantages of gradient boosting**](http://uc-r.github.io/gbm_regression) 35 | 36 | - Often provides predictive accuracy that cannot be beat. 37 | - Lots of flexibility - can optimize on different loss functions and provides several hyperparameter tuning options that make the function fit very flexible. 38 | - No data pre-processing required - often works great with categorical and numerical values as is. 39 | - Handles missing data - imputation not required. 40 | 41 | ## [**Disadvantages**](http://uc-r.github.io/gbm_regression) of gradient boosting 42 | 43 | 44 | - GBMs will continue improving to minimize all errors. This can overemphasize outliers and cause overfitting. Must use cross-validation to neutralize. 45 | - Computationally expensive - GBMs often require many trees (>1000) which can be time and memory exhaustive. 46 | - The high flexibility results in many parameters that interact and influence heavily the behavior of the approach (number of iterations, tree depth, regularization parameters, etc.). This requires a large grid search during tuning. 47 | - Less interpretable although this is easily addressed with various tools (variable importance, partial dependence plots, LIME, etc.). 48 | 49 | 50 | ## Two types of errors for tree methods 51 | 52 | ### Bias related errors 53 | 54 | - Adaptive boosting 55 | - Gradient boosting 56 | 57 | ### Variance related errors 58 | 59 | - Bagging 60 | - Random forest 61 | 62 | 69 | 70 | 71 | 72 | ## [Gradient Boosting for Linear Regression - why does it not work?](https://stats.stackexchange.com/questions/186966/gradient-boosting-for-linear-regression-why-does-it-not-work) 73 | 74 | While learning about Gradient Boosting, I haven't heard about any constraints regarding the properties of a "weak classifier" that the method uses to build and ensemble model. However, I could not imagine an application of a GB that uses linear regression, and in fact when I've performed some tests - it doesn't work. I was testing the most standard approach with a gradient of sum of squared residuals and adding the subsequent models together. 75 | 76 | The obvious problem is that the residuals from the first model are populated in such manner that there is really no regression line to fit anymore. My another observation is that a sum of subsequent linear regression models can be represented as a single regression model as well (adding all intercepts and corresponding coefficients) so I cannot imagine how that could ever improve the model. The last observation is that a linear regression (the most typical approach) is using sum of squared residuals as a loss function - the same one that GB is using. 
77 | 78 | I also thought about lowering the learning rate or using only a subset of predictors for each iteration, but that could still be summed up to a single model representation eventually, so I guess it would bring no improvement. 79 | 80 | What am I missing here? Is linear regression somehow inappropriate to use with Gradient Boosting? Is it because the linear regression uses the sum of squared residuals as a loss function? Are there any particular constraints on the weak predictors so they can be applied to Gradient Boosting? 81 | 82 | 83 | 84 | ## Links 85 | 86 | - [**Gradient Boosting Machines**](http://uc-r.github.io/gbm_regression) 87 | 88 | 89 | - [How to Visualize Gradient Boosting Decision Trees With XGBoost in Python](https://machinelearningmastery.com/visualize-gradient-boosting-decision-trees-xgboost-python/) 90 | 91 | 92 | -------------------------------------------------------------------------------- /slides/old/gradient_boosting.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/gradient_boosting.pdf -------------------------------------------------------------------------------- /slides/old/lasso_regression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/lasso_regression.pdf -------------------------------------------------------------------------------- /slides/old/logit_model.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Logit model" 3 | author: "Jan-Philipp Kolb" 4 | date: "4 September 2018" 5 | output: beamer_presentation 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = FALSE) 10 | ``` 11 | 12 | ## 13 | 14 | - The logarithmic function is used for the link between probability and logits 15 | 16 | - The Logit function is used to [linearize sigmoid curves](https://de.wikipedia.org/wiki/Logit). 
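A small numerical illustration of this link in base R: `qlogis()` maps probabilities to logits (log-odds) and `plogis()` is the inverse transformation that maps logits back to probabilities.

```{r}
p <- c(0.1, 0.5, 0.9)   # probabilities
(lo <- qlogis(p))       # logits: log(p / (1 - p))
plogis(lo)              # the inverse logit recovers the probabilities
```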
17 | 18 | 21 | 22 | ## The package `caret` 23 | 24 | - Classification and Regression Training 25 | 26 | ```{r,eval=F} 27 | install.packages("caret") 28 | ``` 29 | 30 | ```{r} 31 | library("caret") 32 | ``` 33 | 34 | - [**Vignette of the `caret` package**](https://cran.r-project.org/web/packages/caret/vignettes/caret.html) 35 | 36 | ## 37 | 38 | ```{r,eval=F} 39 | ?caret::train 40 | ``` 41 | 42 | 43 | 44 | ```{r,eval=F} 45 | logit <- train(D_dropout ~ ., data = gp.train.c, # assumed formula: dropout vs. all other predictors 46 | method = 'glm', 47 | family = 'binomial', 48 | trControl = ctrl0) 49 | ``` 50 | 51 | -------------------------------------------------------------------------------- /slides/old/ml_part1.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning with R - part 1" 3 | author: "Jan-Philipp Kolb" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | fontsize: 10pt 6 | output: 7 | beamer_presentation: 8 | colortheme: dolphin 9 | fig_height: 3 10 | fig_width: 5 11 | fig_caption: no 12 | fonttheme: structuresmallcapsserif 13 | highlight: haddock 14 | theme: Dresden 15 | pdf_document: 16 | keep_tex: yes 17 | toc: yes 18 | slidy_presentation: 19 | css: mycss.css 20 | keep_md: yes 21 | --- 22 | 23 | ```{r, include=FALSE} 24 | knitr::opts_chunk$set(echo = FALSE,message = F,warning=F) 25 | ``` 26 | 27 | # Introduction to R 28 | 29 | ```{r child = 'a1_intro_r.Rmd'} 30 | ``` 31 | 32 | # Introduction to machine learning 33 | 34 | ```{r child = 'a2_intro_ml.Rmd'} 35 | ``` 36 | 37 | # Simple regression 38 | 39 | ```{r child = 'b1_regression.Rmd'} 40 | ``` 41 | -------------------------------------------------------------------------------- /slides/old/ml_part1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/ml_part1.pdf -------------------------------------------------------------------------------- /slides/old/random_forests.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Decision Trees and Random Forests" 3 | author: "Jan-Philipp Kolb" 4 | date: "1 October 2018" 5 | output: beamer_presentation 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = FALSE) 10 | ``` 11 | 12 | 13 | ## [Decision Trees](http://www.statmethods.net/advstats/cart.html) 14 | 15 | [Regression tree vs. classification tree](http://www.statmethods.net/advstats/cart.html) 16 | 17 | 18 | ```{r} 19 | library(rpart) 20 | ``` 21 | 22 | Grow a tree 23 | 24 | ```{r} 25 | fit <- rpart(Kyphosis ~ Age + Number + Start, 26 | method="class", data=kyphosis) 27 | 28 | printcp(fit) # display the results 29 | plotcp(fit) # visualize cross-validation results 30 | summary(fit) # detailed summary of splits 31 | ``` 32 | 33 | ```{r} 34 | # plot tree 35 | plot(fit, uniform=TRUE, 36 | main="Classification Tree for Kyphosis") 37 | text(fit, use.n=TRUE, all=TRUE, cex=.8) 38 | ``` 39 | 40 | [Decision Trees and Random Forest](https://cran.r-project.org/doc/contrib/Zhao_R_and_data_mining.pdf) 41 | 42 | 43 | 44 | ## [Random Forest](https://www.datascience.com/resources/notebooks/random-forest-intro) 45 | 46 | > Random forest aims to reduce the previously mentioned correlation issue by choosing only a subsample of the feature space at each split. Essentially, it aims to make the trees de-correlated and prune the trees by setting a stopping criteria for node splits, which I will cover in more detail later. 
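A short sketch of how this feature subsampling looks in practice (assuming the `randomForest` package is installed): the `mtry` argument sets how many predictors are drawn as split candidates at each node, and it is what de-correlates the trees.

```{r,eval=F}
library(randomForest)
set.seed(42)
# mtry = 2: only two of the four iris predictors are considered at each split
rf_decor  <- randomForest(Species ~ ., data = iris, mtry = 2, ntree = 500)
# mtry = 4: all predictors are considered - essentially bagged trees
rf_bagged <- randomForest(Species ~ ., data = iris, mtry = 4, ntree = 500)
rf_decor$err.rate[500, "OOB"]   # out-of-bag error with de-correlated trees
rf_bagged$err.rate[500, "OOB"]  # out-of-bag error of the bagging-like forest
```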
47 | 48 | ## [Random forest](https://en.wikipedia.org/wiki/Random_forest) 49 | 50 | - Ensemble learning method - multitude of decision trees 51 | - Random forests correct for decision trees' habit of overfitting to their training set. 52 | 53 | 54 | ![](figure/expl_rf.png) 55 | 56 | 57 | 60 | 61 | 62 | ```{r,eval=F} 63 | install.packages("randomForest") 64 | # https://www.instituteofanalytics.com/forum/uploads/editor/ls/4kivialj5lvj.pdf 65 | # devtools::install_github('araastat/reprtree') 66 | ``` 67 | 68 | 69 | 70 | ```{r,eval=F} 71 | library(randomForest) 72 | library(reprtree) 73 | 74 | model <- randomForest(Species ~ ., data=iris, importance=TRUE, ntree=500, mtry = 2, do.trace=100) 75 | 76 | reprtree:::plot.getTree(model) 77 | ``` 78 | 79 | 86 | 87 | 88 | ## Random forests in package `caret` 89 | 90 | - [models: A List of Available Models in train](https://rdrr.io/cran/caret/man/models.html) 91 | 92 | - [Practical guide to implement machine learning with CARET package in R](https://www.analyticsvidhya.com/blog/2016/12/practical-guide-to-implement-machine-learning-with-caret-package-in-r-with-practice-problem/) 93 | 94 | 95 | ## Links 96 | 97 | - [The Random Forest Algorithm](https://towardsdatascience.com/the-random-forest-algorithm-d457d499ffcd) 98 | 99 | - CRAN Task View [Machine & Statistical Learning](http://cran.r-project.org/web/views/MachineLearning.html) -------------------------------------------------------------------------------- /slides/old/supervised_learning.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Part 1 - Intro Supervised Learning" 3 | author: "Jan-Philipp Kolb" 4 | date: "2 4 2019" 5 | output: ioslides_presentation 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = FALSE,message = F) 10 | ``` 11 | 12 | ## [Supervised Learning](https://www.datasciencecentral.com/profiles/blogs/supervised-learning-everything-you-need-to-know) 13 | 14 | ![](figure/SMLProcess.png) 15 | 16 | ## R-packages for machine learning 17 | 18 | ![](figure/top-20-r-packages-machine-learning-downloads.jpg) 19 | 20 | 21 | ## [k-nearest neighbour](https://www.r-bloggers.com/k-nearest-neighbor-step-by-step-tutorial/) 22 | 23 | ```{r} 24 | library(caret) 25 | library(e1071) 26 | ``` 27 | 28 | 29 | ```{r} 30 | data1 <- read.csv("../data/US Presidential Data.csv") 31 | ``` 32 | 33 | ```{r} 34 | #Partitioning the data into training and validation data 35 | set.seed(101) 36 | index = createDataPartition(data1$Win.Loss, p = 0.7, list = F ) 37 | train = data1[index,] 38 | validation = data1[-index,] 39 | ``` 40 | 41 | ```{r} 42 | # Explore data 43 | dim(train) 44 | dim(validation) 45 | names(train) 46 | head(train) 47 | head(validation) 48 | ``` 49 | 50 | 51 | ```{r} 52 | # Setting levels for both training and validation data 53 | levels(train$Win.Loss) <- make.names(levels(factor(train$Win.Loss))) 54 | levels(validation$Win.Loss) <- make.names(levels(factor(validation$Win.Loss))) 55 | ``` 56 | 57 | ## [](https://www.dataiku.com/learn/guide/academy/machine-learning/identify_clusters.html) 58 | 59 | ![How to identify clusters and name them](figure/nyc_map.png) 60 | 61 | 62 | ## Links 63 | 64 | - [Your First Machine Learning Project in R Step-By-Step](https://machinelearningmastery.com/machine-learning-in-r-step-by-step/) 65 | 66 | - [Top 20 R Machine Learning and Data Science packages](https://www.kdnuggets.com/2015/06/top-20-r-machine-learning-packages.html) 67 | 68 | - [Statistical NLP on 
OpenStreetMap](https://machinelearnings.co/statistical-nlp-on-openstreetmap-b9d573e6cc86) 69 | 70 | - [How to identify clusters and name them](https://www.dataiku.com/learn/guide/academy/machine-learning/identify_clusters.html) 71 | 72 | - [Setting the course for Machine Learning](https://blog.arup.io/setting-the-course-for-machine-learning-760133aa334d) 73 | 74 | - [The provision of urban green space and its accessibility: Spatial data effects in Brussels](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0204684) 75 | 76 | - [Residential scene classification for gridded population sampling in developing countries using deep convolutional neural networks on satellite imagery](https://ij-healthgeographics.biomedcentral.com/articles/10.1186/s12942-018-0132-1) 77 | 78 | 79 | - [Using Convolutional Neural Networks to detect features in satellite images](http://ataspinar.com/2017/12/04/using-convolutional-neural-networks-to-detect-features-in-sattelite-images/) 80 | 81 | 82 | -------------------------------------------------------------------------------- /tutorial/g_ml_applying_algorithms.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Applying machine learning algorithms – exercises" 3 | author: "Jan-Philipp Kolb" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | fontsize: 10pt 6 | output: 7 | beamer_presentation: 8 | colortheme: dolphin 9 | fig_height: 3 10 | fig_width: 5 11 | fig_caption: no 12 | fonttheme: structuresmallcapsserif 13 | highlight: haddock 14 | theme: Dresden 15 | pdf_document: 16 | keep_tex: yes 17 | toc: yes 18 | slidy_presentation: 19 | css: mycss.css 20 | keep_md: yes 21 | --- 22 | 23 | ```{r setup, include=FALSE} 24 | knitr::opts_chunk$set(echo = TRUE) 25 | ``` 26 | 27 | ## [](https://www.r-exercises.com/2017/09/15/applying-machine-learning-algorithms-exercises/) 28 | 29 | ### Exercise 1 30 | 31 | Create a list named “control” that runs a 10-fold cross-validation. HINT: Use trainControl(). 32 | 33 | ### Exercise 2 34 | 35 | Use the metric of “Accuracy” to evaluate models. 36 | 37 | ### Exercise 3 38 | 39 | Build the “LDA”, “CART”, “kNN”, “SVM” and “RF” models. 40 | 41 | ### Exercise 4 42 | 43 | Create a list of the 5 models you just built and name it “results”. HINT: Use `resamples()`. 44 | 45 | ### Exercise 5 46 | 47 | Report the accuracy of each model by using the summary function on the list “results”. HINT: Use summary(). 48 | 49 | ### Exercise 6 50 | 51 | Create a plot of the model evaluation results and compare the spread and the mean accuracy of each model. HINT: Use dotplot(). 52 | 53 | ### Exercise 7 54 | 55 | Which model seems to be the most accurate? 56 | 57 | ### Exercise 8 58 | 59 | Summarize the results of the best model and print them. HINT: Use print(). 60 | 61 | ### Exercise 9 62 | 63 | Run the “LDA” model directly on the validation set to create a factor named “predictions”. HINT: Use predict(). 64 | 65 | ### Exercise 10 66 | 67 | Summarize the results in a confusion matrix. HINT: Use `confusionMatrix()`. 
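The exercises above do not fix a dataset; a sketch of what the first exercises could look like on the built-in `iris` data (the data used in the tutorial the exercises are taken from), assuming `caret` and the respective model packages are installed:

```{r,eval=F}
library(caret)
control <- trainControl(method = "cv", number = 10)   # Exercise 1
metric <- "Accuracy"                                   # Exercise 2
set.seed(7)
# Exercise 3 (three of the five requested models, as an illustration)
fit.lda  <- train(Species ~ ., data = iris, method = "lda",
                  metric = metric, trControl = control)
fit.cart <- train(Species ~ ., data = iris, method = "rpart",
                  metric = metric, trControl = control)
fit.knn  <- train(Species ~ ., data = iris, method = "knn",
                  metric = metric, trControl = control)
results <- resamples(list(lda = fit.lda, cart = fit.cart,
                          knn = fit.knn))              # Exercise 4
summary(results)                                       # Exercise 5
dotplot(results)                                       # Exercise 6
```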
68 | -------------------------------------------------------------------------------- /tutorial/ml_exercises.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/tutorial/ml_exercises.pdf -------------------------------------------------------------------------------- /tutorial/ml_exercises_a1_introR.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "ML exercises - basics R" 3 | author: "Jan-Philipp Kolb" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | fontsize: 10pt 6 | output: 7 | slidy_presentation: 8 | css: mycss.css 9 | keep_md: yes 10 | pdf_document: 11 | keep_tex: yes 12 | toc: yes 13 | beamer_presentation: 14 | colortheme: dolphin 15 | fig_height: 3 16 | fig_width: 5 17 | fig_caption: no 18 | fonttheme: structuresmallcapsserif 19 | highlight: haddock 20 | theme: Dresden 21 | --- 22 | 23 | ```{r setup, include=FALSE} 24 | knitr::opts_chunk$set(echo = T,warning=F,message=F) 25 | ``` 26 | 27 | 28 | ## Exercise: Find R-packages 29 | 30 | Go to https://cran.r-project.org/ and search for packages that can be used: 31 | 32 | 1) to reduce overfitting 33 | 2) for regression trees 34 | 3) for gradient boosting 35 | 4) for neural networks 36 | 5) for clustering 37 | 38 | ## Solution: Find R-packages 39 | 40 | ```{r,eval=F} 41 | install.packages("glmnet") #1) 42 | install.packages("rpart") #2) 43 | install.packages("gbm") #3) 44 | install.packages("neuralnet") #4) 45 | install.packages("kknn") #5) 46 | ``` 47 | 48 | ## Exercise: load built-in data 49 | 50 | ### Load the the built-in dataset `swiss` 51 | 54 | 1) How many observations and variables are available? 55 | 2) What is the scale level of the variables? 
56 | 57 | ### Interactive data table 58 | 59 | 3) Create an interactive data table 60 | 61 | ## Solution: load built-in data 62 | 63 | ```{r} 64 | # 1) 65 | data(swiss) 66 | dim(swiss) 67 | str(swiss) 68 | ``` 69 | 70 | ```{r,eval=F} 71 | # 2) 72 | DT::datatable(swiss) 73 | ``` 74 | 75 | ## [Exercise](https://www.datacamp.com/community/tutorials/pipe-r-tutorial): random numbers 76 | 77 | ```{r,echo=F} 78 | x <- c(0.109, 0.359, 0.63, 0.996, 0.515, 0.142, 0.017, 79 | 0.829, 0.907) 80 | x <- runif(8) 81 | ``` 82 | 83 | 1) Draw 8 random numbers from the uniform distribution and save them in a vector `x` 84 | 2) Compute the logarithm of `x`, return suitably lagged and iterated differences, 85 | 3) compute the exponential function and round the result 86 | 87 | ```{r,echo=F} 88 | round(exp(diff(log(x))), 1) 89 | ``` 90 | 91 | ## Solution: random numbers 92 | 93 | ```{r,echo=F} 94 | x <- runif(8) #1) 95 | round(exp(diff(log(x))), 1) #2) and 3) 96 | ``` 97 | 98 | 99 | 100 | 108 | -------------------------------------------------------------------------------- /tutorial/ml_exercises_a1_introR.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/tutorial/ml_exercises_a1_introR.pdf -------------------------------------------------------------------------------- /tutorial/ml_exercises_a1_introR.tex: -------------------------------------------------------------------------------- 1 | \PassOptionsToPackage{unicode=true}{hyperref} % options for packages loaded elsewhere 2 | \PassOptionsToPackage{hyphens}{url} 3 | % 4 | \documentclass[ 5 | 10pt, 6 | ignorenonframetext, 7 | ]{beamer} 8 | \usepackage{pgfpages} 9 | \setbeamertemplate{caption}[numbered] 10 | \setbeamertemplate{caption label separator}{: } 11 | \setbeamercolor{caption name}{fg=normal text.fg} 12 | \beamertemplatenavigationsymbolsempty 13 | % Prevent slide breaks in the middle of a paragraph: 14 | \widowpenalties 1 10000 15 | \raggedbottom 16 | \setbeamertemplate{part page}{ 17 | \centering 18 | \begin{beamercolorbox}[sep=16pt,center]{part title} 19 | \usebeamerfont{part title}\insertpart\par 20 | \end{beamercolorbox} 21 | } 22 | \setbeamertemplate{section page}{ 23 | \centering 24 | \begin{beamercolorbox}[sep=12pt,center]{part title} 25 | \usebeamerfont{section title}\insertsection\par 26 | \end{beamercolorbox} 27 | } 28 | \setbeamertemplate{subsection page}{ 29 | \centering 30 | \begin{beamercolorbox}[sep=8pt,center]{part title} 31 | \usebeamerfont{subsection title}\insertsubsection\par 32 | \end{beamercolorbox} 33 | } 34 | \AtBeginPart{ 35 | \frame{\partpage} 36 | } 37 | \AtBeginSection{ 38 | \ifbibliography 39 | \else 40 | \frame{\sectionpage} 41 | \fi 42 | } 43 | \AtBeginSubsection{ 44 | \frame{\subsectionpage} 45 | } 46 | \usepackage{lmodern} 47 | \usepackage{amssymb,amsmath} 48 | \usepackage{ifxetex,ifluatex} 49 | \ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex 50 | \usepackage[T1]{fontenc} 51 | \usepackage[utf8]{inputenc} 52 | \usepackage{textcomp} % provides euro and other symbols 53 | \else % if luatex or xelatex 54 | \usepackage{unicode-math} 55 | \defaultfontfeatures{Scale=MatchLowercase} 56 | \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} 57 | \fi 58 | \usetheme[]{Dresden} 59 | \usecolortheme{dolphin} 60 | \usefonttheme{structuresmallcapsserif} 61 | % use upquote if available, for straight quotes in verbatim environments 62 | \IfFileExists{upquote.sty}{\usepackage{upquote}}{} 63 | 
\IfFileExists{microtype.sty}{% use microtype if available 64 | \usepackage[]{microtype} 65 | \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts 66 | }{} 67 | \makeatletter 68 | \@ifundefined{KOMAClassName}{% if non-KOMA class 69 | \IfFileExists{parskip.sty}{% 70 | \usepackage{parskip} 71 | }{% else 72 | \setlength{\parindent}{0pt} 73 | \setlength{\parskip}{6pt plus 2pt minus 1pt}} 74 | }{% if KOMA class 75 | \KOMAoptions{parskip=half}} 76 | \makeatother 77 | \usepackage{xcolor} 78 | \IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available 79 | \IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} 80 | \hypersetup{ 81 | pdftitle={ML exercises - basics R}, 82 | pdfauthor={Jan-Philipp Kolb}, 83 | pdfborder={0 0 0}, 84 | breaklinks=true} 85 | \urlstyle{same} % don't use monospace font for urls 86 | \newif\ifbibliography 87 | \usepackage{color} 88 | \usepackage{fancyvrb} 89 | \newcommand{\VerbBar}{|} 90 | \newcommand{\VERB}{\Verb[commandchars=\\\{\}]} 91 | \DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} 92 | % Add ',fontsize=\small' for more characters per line 93 | \newenvironment{Shaded}{}{} 94 | \newcommand{\AlertTok}[1]{\textcolor[rgb]{1.00,0.00,0.00}{#1}} 95 | \newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.00,0.50,0.00}{#1}} 96 | \newcommand{\AttributeTok}[1]{#1} 97 | \newcommand{\BaseNTok}[1]{#1} 98 | \newcommand{\BuiltInTok}[1]{#1} 99 | \newcommand{\CharTok}[1]{\textcolor[rgb]{0.00,0.50,0.50}{#1}} 100 | \newcommand{\CommentTok}[1]{\textcolor[rgb]{0.00,0.50,0.00}{#1}} 101 | \newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.00,0.50,0.00}{#1}} 102 | \newcommand{\ConstantTok}[1]{#1} 103 | \newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.00,0.00,1.00}{#1}} 104 | \newcommand{\DataTypeTok}[1]{#1} 105 | \newcommand{\DecValTok}[1]{#1} 106 | \newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.00,0.50,0.00}{#1}} 107 | \newcommand{\ErrorTok}[1]{\textcolor[rgb]{1.00,0.00,0.00}{\textbf{#1}}} 108 | \newcommand{\ExtensionTok}[1]{#1} 109 | \newcommand{\FloatTok}[1]{#1} 110 | \newcommand{\FunctionTok}[1]{#1} 111 | \newcommand{\ImportTok}[1]{#1} 112 | \newcommand{\InformationTok}[1]{\textcolor[rgb]{0.00,0.50,0.00}{#1}} 113 | \newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.00,0.00,1.00}{#1}} 114 | \newcommand{\NormalTok}[1]{#1} 115 | \newcommand{\OperatorTok}[1]{#1} 116 | \newcommand{\OtherTok}[1]{\textcolor[rgb]{1.00,0.25,0.00}{#1}} 117 | \newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{1.00,0.25,0.00}{#1}} 118 | \newcommand{\RegionMarkerTok}[1]{#1} 119 | \newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.50,0.50}{#1}} 120 | \newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.00,0.50,0.50}{#1}} 121 | \newcommand{\StringTok}[1]{\textcolor[rgb]{0.00,0.50,0.50}{#1}} 122 | \newcommand{\VariableTok}[1]{#1} 123 | \newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.00,0.50,0.50}{#1}} 124 | \newcommand{\WarningTok}[1]{\textcolor[rgb]{0.00,0.50,0.00}{\textbf{#1}}} 125 | \setlength{\emergencystretch}{3em} % prevent overfull lines 126 | \providecommand{\tightlist}{% 127 | \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} 128 | \setcounter{secnumdepth}{-2} 129 | 130 | % set default figure placement to htbp 131 | \makeatletter 132 | \def\fps@figure{htbp} 133 | \makeatother 134 | 135 | 136 | \title{ML exercises - basics R} 137 | \author{Jan-Philipp Kolb} 138 | \date{03 Juni, 2019} 139 | 140 | \begin{document} 141 | \frame{\titlepage} 142 | 143 | \begin{frame}{Exercise: Find R-packages} 144 | 
\protect\hypertarget{exercise-find-r-packages}{} 145 | 146 | Go to \url{https://cran.r-project.org/} and search for packages that can 147 | be used: 148 | 149 | \begin{enumerate} 150 | [1)] 151 | \tightlist 152 | \item 153 | to reduce overfitting 154 | \item 155 | for regression trees 156 | \item 157 | for gradient boosting 158 | \item 159 | for neural networks 160 | \item 161 | for clustering 162 | \end{enumerate} 163 | 164 | \end{frame} 165 | 166 | \begin{frame}[fragile]{Solution: Find R-packages} 167 | \protect\hypertarget{solution-find-r-packages}{} 168 | 169 | \begin{Shaded} 170 | \begin{Highlighting}[] 171 | \KeywordTok{install.packages}\NormalTok{(}\StringTok{"glmnet"}\NormalTok{) }\CommentTok{#1)} 172 | \KeywordTok{install.packages}\NormalTok{(}\StringTok{"rpart"}\NormalTok{) }\CommentTok{#2)} 173 | \KeywordTok{install.packages}\NormalTok{(}\StringTok{"gbm"}\NormalTok{) }\CommentTok{#3)} 174 | \KeywordTok{install.packages}\NormalTok{(}\StringTok{"neuralnet"}\NormalTok{) }\CommentTok{#4)} 175 | \KeywordTok{install.packages}\NormalTok{(}\StringTok{"kknn"}\NormalTok{) }\CommentTok{#5)} 176 | \end{Highlighting} 177 | \end{Shaded} 178 | 179 | \end{frame} 180 | 181 | \begin{frame}[fragile]{Exercise: load built-in data} 182 | \protect\hypertarget{exercise-load-built-in-data}{} 183 | 184 | \begin{block}{Load the the built-in dataset \texttt{swiss}} 185 | 186 | \begin{enumerate} 187 | [1)] 188 | \tightlist 189 | \item 190 | How many observations and variables are available? 191 | \item 192 | What is the scale level of the variables? 193 | \end{enumerate} 194 | 195 | \end{block} 196 | 197 | \begin{block}{Interactive data table} 198 | 199 | \begin{enumerate} 200 | [1)] 201 | \setcounter{enumi}{2} 202 | \tightlist 203 | \item 204 | Create an interactive data table 205 | \end{enumerate} 206 | 207 | \end{block} 208 | 209 | \end{frame} 210 | 211 | \begin{frame}[fragile]{Solution: load built-in data} 212 | \protect\hypertarget{solution-load-built-in-data}{} 213 | 214 | \begin{Shaded} 215 | \begin{Highlighting}[] 216 | \CommentTok{# 1)} 217 | \KeywordTok{data}\NormalTok{(swiss) } 218 | \KeywordTok{dim}\NormalTok{(swiss) } 219 | \end{Highlighting} 220 | \end{Shaded} 221 | 222 | \begin{verbatim} 223 | ## [1] 47 6 224 | \end{verbatim} 225 | 226 | \begin{Shaded} 227 | \begin{Highlighting}[] 228 | \KeywordTok{str}\NormalTok{(swiss) } 229 | \end{Highlighting} 230 | \end{Shaded} 231 | 232 | \begin{verbatim} 233 | ## 'data.frame': 47 obs. of 6 variables: 234 | ## $ Fertility : num 80.2 83.1 92.5 85.8 76.9 76.1 83.8 92.4 82.4 82.9 ... 235 | ## $ Agriculture : num 17 45.1 39.7 36.5 43.5 35.3 70.2 67.8 53.3 45.2 ... 236 | ## $ Examination : int 15 6 5 12 17 9 16 14 12 16 ... 237 | ## $ Education : int 12 9 5 7 15 7 7 8 7 13 ... 238 | ## $ Catholic : num 9.96 84.84 93.4 33.77 5.16 ... 239 | ## $ Infant.Mortality: num 22.2 22.2 20.2 20.3 20.6 26.6 23.6 24.9 21 24.4 ... 
240 | \end{verbatim} 241 | 242 | \begin{Shaded} 243 | \begin{Highlighting}[] 244 | \CommentTok{# 2)} 245 | \NormalTok{DT}\OperatorTok{::}\KeywordTok{datatable}\NormalTok{(swiss)} 246 | \end{Highlighting} 247 | \end{Shaded} 248 | 249 | \end{frame} 250 | 251 | \begin{frame}[fragile]{\href{https://www.datacamp.com/community/tutorials/pipe-r-tutorial}{Exercise}: 252 | random numbers} 253 | \protect\hypertarget{exercise-random-numbers}{} 254 | 255 | \begin{enumerate} 256 | [1)] 257 | \tightlist 258 | \item 259 | Draw 8 random numbers from the uniform distribution and save them in a 260 | vector \texttt{x} 261 | \item 262 | Compute the logarithm of \texttt{x}, return suitably lagged and 263 | iterated differences, 264 | \item 265 | compute the exponential function and round the result 266 | \end{enumerate} 267 | 268 | \begin{verbatim} 269 | ## [1] 6.4 0.6 2.2 0.7 1.5 0.8 1.0 270 | \end{verbatim} 271 | 272 | \end{frame} 273 | 274 | \begin{frame}[fragile]{Solution: random numbers} 275 | \protect\hypertarget{solution-random-numbers}{} 276 | 277 | \begin{verbatim} 278 | ## [1] 1.0 1.1 0.3 0.5 2.1 0.6 6.2 279 | \end{verbatim} 280 | 281 | \end{frame} 282 | 283 | \end{document} 284 | -------------------------------------------------------------------------------- /tutorial/ml_exercises_a_visualizing.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Visualizing dataset as preparation for machine learning' 3 | author: "Jan-Philipp Kolb" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | fontsize: 10pt 6 | output: 7 | beamer_presentation: 8 | theme: Dresden 9 | colortheme: dolphin 10 | fig_height: 3 11 | fig_width: 5 12 | fig_caption: no 13 | fonttheme: structuresmallcapsserif 14 | highlight: haddock 15 | --- 16 | 17 | ```{r setup, include=FALSE} 18 | knitr::opts_chunk$set(echo = T,message = F,warning=F,cache=F) 19 | ``` 20 | 21 | ## Exercises: Visualizing dataset to apply machine learning 22 | 23 | - Exercise based on [r-exercises - visualizing for ml](https://www.r-exercises.com/2017/09/08/visualizing-dataset-to-apply-machine-learning-exercises/) 24 | 25 | 26 | ### Exercise 1 27 | 28 | Create a variable “x” and attach to it the input attributes of the “iris” dataset. HINT: Use columns 1 to 4. 29 | 30 | ### Exercise 2 31 | 32 | Create a variable “y” and attach to it the output attribute of the “iris” dataset. HINT: Use column 5. 33 | 34 | ### Exercise 3 35 | 36 | Create a whisker plot (boxplot) for the variable of the first column of the “iris” dataset. HINT: Use `boxplot()`. 37 | 38 | ## Exercises 39 | 40 | ### Exercise 4 41 | 42 | Now create a whisker plot for each one of the four input variables of the “iris” dataset in one image. HINT: Use par(). 43 | 44 | ### Exercise 5 45 | 46 | Create a barplot to breakdown your output attribute. HINT: Use plot(). 47 | 48 | ### Exercise 6 49 | 50 | Create a scatterplot matrix of the “iris” dataset using the “x” and “y” variables. HINT: Use featurePlot(). 51 | 52 | ### Exercise 7 53 | 54 | Create a scatterplot matrix with ellipses around each separated group. HINT: Use plot="ellipse". 55 | 56 | ## Exercises 57 | 58 | ### Exercise 8 59 | 60 | Create box and whisker plots of each input variable again, but this time broken down into separated plots for each class. HINT: Use plot="box". 61 | 62 | ### Exercise 9 63 | 64 | Create a list named “scales” that includes the “x” and “y” variables and set relation to “free” for both of them. 
HINT: Use `list()` 65 | 66 | ### Exercise 10 67 | 68 | Create a density plot matrix for each attribute by class value. HINT: Use `featurePlot()`. 69 | 70 | 71 | ## [Solutions](https://www.r-exercises.com/2017/09/08/visualizing-dataset-to-apply-machine-learning-exercises-solutions/) 72 | 73 | ### Solution Exercise 1 74 | 75 | 78 | 79 | ```{r} 80 | library(caret) 81 | data(iris) 82 | validation <- createDataPartition(iris$Species, p=0.80, 83 | list=FALSE) 84 | validation20 <- iris[-validation,] 85 | iris <- iris[validation,] 86 | x <- iris[,1:4] 87 | ``` 88 | 89 | ### Solution Exercise 2 90 | 91 | ```{r} 92 | library(caret) 93 | y <- iris[,5] 94 | ``` 95 | 96 | 97 | ## Solution Exercise 3 98 | 99 | ```{r} 100 | library(caret) 101 | boxplot(x[,1], main=names(iris)[1]) 102 | ``` 103 | 104 | 105 | ## Solution Exercise 4 106 | 107 | ```{r} 108 | library(caret) 109 | par(mfrow=c(1,4)) 110 | for(i in 1:4) { 111 | boxplot(x[,i], main=names(iris)[i]) 112 | } 113 | ``` 114 | 115 | 116 | ## Solution Exercise 5 117 | 118 | ```{r} 119 | library(caret) 120 | plot(y) 121 | ``` 122 | 123 | 124 | ## Solutions - Visualizing ML 125 | 126 | ### Solution Exercise 6 127 | 128 | ```{r} 129 | library(caret) 130 | featurePlot(x=x, y=y) 131 | ``` 132 | 133 | 134 | ## Solution Exercise 7 135 | 136 | ```{r} 137 | # install.packages("ellipse") 138 | library(ellipse) 139 | library(caret) 140 | featurePlot(x=x, y=y,plot="ellipse") 141 | ``` 142 | 143 | 144 | ## Solutions - Visualizing ML 145 | 146 | ### Solution Exercise 8 147 | 148 | ```{r} 149 | library(caret) 150 | featurePlot(x=x, y=y, plot="box") 151 | ``` 152 | 153 | 154 | ## Solutions - Visualizing ML 155 | 156 | ### Solution Exercise 9 157 | 158 | ```{r} 159 | library(caret) 160 | scales <- list(x=list(relation="free"), y=list(relation="free")) 161 | ``` 162 | 163 | 164 | 165 | ### Solution Exercise 10 166 | 167 | ```{r,eval=F} 168 | library(caret) 169 | scales <- list(x=list(relation="free"), y=list(relation="free")) 170 | featurePlot(x=x, y=y, plot="density", scales=scales) 171 | ``` 172 | 173 | ## Solution Exercise 10 174 | 175 | ```{r,echo=F} 176 | library(caret) 177 | scales <- list(x=list(relation="free"), y=list(relation="free")) 178 | featurePlot(x=x, y=y, plot="density", scales=scales) 179 | ``` 180 | -------------------------------------------------------------------------------- /tutorial/ml_exercises_a_visualizing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/tutorial/ml_exercises_a_visualizing.pdf -------------------------------------------------------------------------------- /tutorial/ml_exercises_b_regression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/tutorial/ml_exercises_b_regression.pdf -------------------------------------------------------------------------------- /tutorial/ml_exercises_c1_treesbagging.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning - Decision Trees Exercises/Solution" 3 | author: "Jan-Philipp Kolb" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | fontsize: 10pt 6 | output: 7 | beamer_presentation: 8 | colortheme: dolphin 9 | fig_height: 3 10 | fig_width: 5 11 | fig_caption: no 12 | fonttheme: structuresmallcapsserif 13 | highlight: haddock 14 | theme: Dresden 15 | 
pdf_document: 16 | keep_tex: yes 17 | toc: yes 18 | slidy_presentation: 19 | css: mycss.css 20 | keep_md: yes 21 | --- 22 | 23 | ```{r setup, include=FALSE} 24 | knitr::opts_chunk$set(echo = TRUE,cache=T,warning = F,message = F) 25 | ``` 26 | 27 | ## [Exercise - `rpart` Kyphosis](https://www.r-exercises.com/2016/12/13/recursive-partitioning-and-regression-trees-exercises/) 28 | 29 | ### Consider the Kyphosis data frame 30 | 31 | 34 | 35 | 1) Which variables are in the `kyphosis` dataset? 36 | 2) Build a tree to classify Kyphosis from Age, Number and Start. 37 | 38 | ### Consider the tree built above. 39 | 40 | 3) Which variables are used to explain Kyphosis presence? 41 | 4) How many observations do the terminal nodes contain? 42 | 43 | ### Consider the Kyphosis data frame. 44 | 45 | 5) Build a tree using the first 60 observations of kyphosis. 46 | 6) Predict the kyphosis presence for the other 21 observations. 47 | 7) What is the misclassification rate (prediction error)? 48 | 49 | ## [The dataset kyphosis](https://www.r-exercises.com/2016/12/13/recursive-partitioning-and-regression-trees-solutions/) 50 | 51 | ### The dataset contains (1): 52 | 53 | - Kyphosis: a factor with levels absent and present, indicating if a kyphosis (a type of deformation) was present after the operation. 54 | - Age: in months. 55 | - Number: the number of vertebrae involved. 56 | - Start: the number of the first (topmost) vertebra operated on. 57 | 58 | ```{r} 59 | data(kyphosis,package = "rpart") 60 | dplyr::glimpse(kyphosis) 61 | ``` 62 | 63 | ## Build the tree (2) 64 | 65 | ```{r} 66 | (TREE<-rpart::rpart(Kyphosis~Age+Number+Start, 67 | data=kyphosis,method="class")) 68 | ``` 69 | 70 | ## [Plot the result](https://www.rdocumentation.org/packages/rpart.plot/versions/3.0.7/topics/rpart.plot) 71 | 72 | ```{r} 73 | rpart.plot::rpart.plot(TREE,extra=1) 74 | ``` 75 | 76 | 77 | ```{r,eval=F,echo=F} 78 | rpart.plot::rpart.plot(TREE,type=5) 79 | rpart.plot::rpart.plot(TREE,extra=1) 80 | ``` 81 | 82 | ## Answers 83 | 84 | 3) Which variables are used to explain Kyphosis presence? 85 | 86 | - The variables are Start and Age 87 | 88 | 4) How many observations do the terminal nodes contain? 89 | 90 | 93 | 94 | - The terminal nodes have 29, 12, 12, 3 and 8 observations 95 | 96 | 99 | 100 | 5) Build a tree using the first 60 observations of kyphosis. 101 | 102 | - `y` is a factor $\Rightarrow$ we choose `method="class"` 103 | ```{r} 104 | TREE <- rpart::rpart(Kyphosis ~ Age + Number + Start, 105 | data=kyphosis[1:60,],method="class") 106 | ``` 107 | 108 | 109 | ## Further answers 110 | 111 | 6) Predict the kyphosis presence for the other 21 observations. 112 | 113 | ```{r} 114 | PR <- predict(TREE,kyphosis[61:81,],type='class') 115 | ``` 116 | 117 | 118 | 7) What is the [**misclassification rate**](https://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/) (prediction error)? 119 | 120 | ```{r} 121 | test <- kyphosis$Kyphosis[61:81] 122 | table(PR,test) 123 | ``` 124 | 125 | ```{r} 126 | (rate <- 100*length(which(PR!=test))/length(PR)) 127 | ``` 128 | 129 | ```{r,echo=F,eval=F} 130 | cat('the misclassification rate is:',rate) 131 | ``` 132 | 133 | 134 | ## Exercise `rpart` - `iris` 135 | 136 | ### Consider the `iris` data frame 137 | 138 | 1) Build a tree to classify Species from the other variables. 139 | 2) Plot the tree and add node information. 140 | 141 | ### Consider the tree built before 142 | 143 | 3) Prune the tree using the median complexity parameter (cp) associated with the tree. 
144 | 4) Plot the pruned and the original tree in the same window. 145 | 5) In which terminal node is each observation of `iris` classified? 146 | 6) Which species has a flower with `Petal.Length` greater than 2.45 and `Petal.Width` less than 1.75? 147 | 148 | ## Solution - `rpart` - `iris` (I) 149 | 150 | 1) Build a tree to classify Species from the other variables. 151 | 152 | ```{r} 153 | (TREE2 <- rpart::rpart(Species ~ ., data=iris,method="class")) 154 | ``` 155 | 156 | ## Solution - `rpart` - `iris` (II) 157 | 158 | 159 | 2) Plot the tree and add node information. 160 | 161 | ```{r} 162 | library(rpart.plot) 163 | rpart.plot(TREE2) 164 | ``` 165 | 166 | ```{r,echo=F,eval=F} 167 | library(rpart.plot) 168 | png("figure/ml_tb_rpart_iris.png") 169 | rpart.plot(TREE2) 170 | dev.off() 171 | ``` 172 | 173 | 174 | 175 | ## Solution - `rpart` - `iris` (III) 176 | 177 | 178 | 3) Prune the tree using the median complexity parameter (cp) associated with the tree. 179 | 180 | ```{r} 181 | TP <- rpart::prune(TREE2,cp=median(TREE2$cptable[,'CP'])) 182 | ``` 183 | 184 | 4) Plot the pruned and the original tree in the same window. 185 | 186 | ```{r,fig.height=3,echo=F,eval=F} 187 | par(mfrow=c(1,2)) 188 | plot(TREE2);text(TREE2,use.n=T) 189 | plot(TP);text(TP,use.n=T) 190 | ``` 191 | 192 | ```{r,fig.height=3,eval=F} 193 | par(mfrow=c(1,2)) 194 | rpart.plot(TREE2) 195 | rpart.plot(TP) 196 | ``` 197 | 198 | ## The plotted results 199 | 200 | ```{r,echo=F} 201 | par(mfrow=c(1,2)) 202 | rpart.plot(TREE2) 203 | rpart.plot(TP) 204 | ``` 205 | 206 | 207 | ## Solution - `rpart` - `iris` (IV) 208 | 209 | 5) In which terminal node is each observation of `iris` classified? 210 | 211 | ```{r} 212 | TREE2$where 213 | ``` 214 | 215 | ## Solution - `rpart` - `iris` (V) 216 | 217 | 218 | 6) Which species has a flower with `Petal.Length` greater than 2.45 and `Petal.Width` less than 1.75? 
219 | 220 | ```{r} 221 | print('versicolor') 222 | unique(iris[iris$Petal.Length>2.45 & 223 | iris$Petal.Width<1.75,"Species"]) 224 | ``` 225 | 226 | -------------------------------------------------------------------------------- /tutorial/ml_exercises_c1_treesbagging.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/tutorial/ml_exercises_c1_treesbagging.pdf -------------------------------------------------------------------------------- /tutorial/ml_exercises_c2_randomforests.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/tutorial/ml_exercises_c2_randomforests.pdf -------------------------------------------------------------------------------- /tutorial/ml_exercises_c3_xtremeboosting.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "ML Exercises - Gradient Boosting" 3 | author: "Jan-Philipp Kolb" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | fontsize: 10pt 6 | output: 7 | beamer_presentation: 8 | theme: Dresden 9 | colortheme: dolphin 10 | fig_height: 3 11 | fig_width: 5 12 | fig_caption: no 13 | fonttheme: structuresmallcapsserif 14 | highlight: haddock 15 | --- 16 | 17 | ```{r setup, include=FALSE} 18 | knitr::opts_chunk$set(echo = TRUE,cache=T,message=F,warning=F) 19 | ``` 20 | 21 | ## [eXtremely Boost your machine learning Exercises (Part-1)](https://www.r-exercises.com/2017/09/24/extremely-boost-your-machine-learning-exercises-part-1/) 22 | 23 | ```{r,eval=F,echo=F} 24 | install.packages("xgboost") 25 | ``` 26 | 27 | 28 | - eXtreme Gradient Boosting is a machine learning model which became really popular a few years ago after winning several Kaggle competitions. 29 | - It is a very powerful algorithm that uses an ensemble of weak learners to obtain a strong learner. 30 | - Its R implementation is available in the `xgboost` package and it is really worth including in anyone's machine learning portfolio. 31 | 32 | 39 | 40 | ## Boosting Exercises - first part 41 | 42 | ### Exercise 1 43 | Load the `xgboost` library and download the German Credit dataset. Your goal will be to predict creditability (the first column in the dataset). 44 | 45 | ### Exercise 2 46 | Convert columns `c(2,4,5,7,8,9,10,11,12,13,15,16,17,18,19,20)` to factors and then encode them as dummy variables. HINT: use the command `model.matrix()` 47 | 48 | ### Exercise 3 49 | Split the data into training and test sets (700:300). Create an `xgb.DMatrix` for both sets with Creditability as the label. 50 | 51 | ## Boosting Exercises - second part 52 | 53 | ### Exercise 4 54 | Train `xgboost` with a logistic objective, 30 rounds of training and a maximal depth of 2. 55 | 56 | ### Exercise 5 57 | To check model performance, calculate the test set classification error. 58 | 59 | ### Exercise 6 60 | Plot predictor importance. 61 | 62 | ## Boosting Exercises - third part 63 | 64 | ### Exercise 7 65 | Use `xgb.train()` instead of `xgboost()` to add both train and test sets as a watchlist. Train the model with the same parameters, but 100 rounds, to see how it performs during training. 66 | 67 | ### Exercise 8 68 | Train the model again, adding AUC and Log Loss as evaluation metrics. 69 | 70 | ### Exercise 9 71 | Plot how AUC and Log Loss for the train and test sets change during the training process. Use a plotting function/library of your choice. 
72 | 73 | ### Exercise 10 74 | Check how setting parameter eta to 0.01 influences the AUC and Log Loss curves. 75 | image_pdf 76 | 77 | ## [Solutions: boosting exercises](https://www.r-exercises.com/2017/09/24/extremely-boost-your-machine-learning-solutions-part-1/) 78 | 79 | ### Solution Exercise 1 - import dataset 80 | 81 | ```{r} 82 | library(xgboost) 83 | ``` 84 | 85 | 86 | ```{r,eval=F} 87 | url <- "http://freakonometrics.free.fr/german_credit.csv" 88 | credit <- read.csv(url, header = TRUE, sep = ",") 89 | ``` 90 | 91 | 92 | ```{r,eval=F,echo=F} 93 | save(credit,file="../data/german_credit.RData") 94 | ``` 95 | 96 | ```{r,echo=F} 97 | load("../data/german_credit.RData") 98 | ``` 99 | 100 | ```{r} 101 | head(credit) 102 | ``` 103 | 104 | ## Solutions boosting exercises - first part 105 | 106 | ### Solution Exercise 2 - convert columns 107 | 108 | ```{r} 109 | factor_columns <- c(2,4,5,7,8,9,10,11,12,13,15,16,17,18,19,20) 110 | for(i in factor_columns) credit[,i] <- as.factor(credit[,i]) 111 | X <- model.matrix(~ . - Creditability, data=credit) 112 | ``` 113 | 114 | ### Solution Exercise 3 115 | 116 | ```{r} 117 | inTraining <- sample(1:nrow(credit),size=700) 118 | dtrain <- xgboost::xgb.DMatrix(X[inTraining,], 119 | label=credit$Creditability[inTraining]) 120 | dtest <- xgboost::xgb.DMatrix(X[-inTraining,], 121 | label=credit$Creditability[-inTraining]) 122 | ``` 123 | 124 | ## Solutions boosting exercises - second part 125 | 126 | ### Solution Exercise 4 - train `xgboost` model 127 | 128 | ```{r} 129 | model <- xgboost(data = dtrain, 130 | max_depth = 2, 131 | nrounds = 30, 132 | objective = "binary:logistic") 133 | ``` 134 | 135 | ## Solutions boosting exercises - third part 136 | 137 | ### Solution Exercise 5 138 | 139 | ```{r} 140 | err<-mean(round(predict(model,dtest))!=getinfo(dtest,'label')) 141 | print(paste("test-error=", err)) 142 | ``` 143 | 144 | 145 | ### Solution Exercise 6 146 | 147 | ```{r,eval=F} 148 | importance.matrix <- xgb.importance(model = model, 149 | feature_names = colnames(X)) 150 | xgb.plot.importance(importance.matrix) 151 | ``` 152 | 153 | ## Importance plot 154 | 155 | ```{r,echo=F} 156 | importance.matrix <- xgb.importance(model = model, 157 | feature_names = colnames(X)) 158 | xgb.plot.importance(importance.matrix) 159 | ``` 160 | 161 | 162 | ## Solution Exercise 7 163 | 164 | ```{r} 165 | model_watchlist <- xgb.train(data = dtrain, 166 | max_depth = 2,nrounds = 100, 167 | objective = "binary:logistic", 168 | watchlist = list(train=dtrain, 169 | test=dtest)) 170 | ``` 171 | 172 | 173 | 174 | ## Solution Exercise 8 175 | 176 | ```{r,eval=F} 177 | model_auc<-xgb.train(data = dtrain,max_depth = 2, 178 | nrounds = 100,objective = "binary:logistic", 179 | watchlist = list(train=dtrain,test=dtest), 180 | eval_metric = 'auc',eval_metric = 'logloss') 181 | ``` 182 | 183 | ```{r,eval=F,echo=F} 184 | save(model_auc,file="../data/model_auc.RData") 185 | ``` 186 | 187 | ```{r,echo=F} 188 | load("../data/model_auc.RData") 189 | ``` 190 | 191 | ## Output `model_auc` 192 | 193 | ```{r} 194 | model_auc 195 | ``` 196 | 197 | 198 | ## Solution Exercise 9 199 | 200 | ```{r,eval=F} 201 | library(tidyverse) 202 | model_auc$evaluation_log %>% 203 | gather(metric, value, -iter) %>% 204 | separate(metric, c('set','metric')) %>% 205 | ggplot(aes(iter, value, color = set)) + 206 | geom_line() + 207 | facet_grid(metric~.) 
208 | ``` 209 | 210 | ## Evaluation plot 211 | 212 | ```{r,echo=F} 213 | library(tidyverse) 214 | model_auc$evaluation_log %>% 215 | gather(metric, value, -iter) %>% 216 | separate(metric, c('set','metric')) %>% 217 | ggplot(aes(iter, value, color = set)) + 218 | geom_line() + 219 | facet_grid(metric~.) 220 | ``` 221 | 222 | 223 | ## Solution Exercise 10 224 | 225 | ```{r,eval=F} 226 | model_eta<-xgb.train(data=dtrain,max_depth = 2,eta = 0.05, 227 | nrounds = 100,objective = "binary:logistic", 228 | watchlist = list(train=dtrain, test=dtest), 229 | eval_metric = 'auc',eval_metric = 'logloss') 230 | ``` 231 | 232 | ```{r,echo=F,eval=F} 233 | save(model_eta,file="../data/model_eta.RData") 234 | ``` 235 | 236 | ```{r,echo=F} 237 | load("../data/model_eta.RData") 238 | ``` 239 | 240 | ## Output of model eta 241 | 242 | ```{r} 243 | model_eta 244 | ``` 245 | 246 | -------------------------------------------------------------------------------- /tutorial/ml_exercises_c3_xtremeboosting.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/tutorial/ml_exercises_c3_xtremeboosting.pdf -------------------------------------------------------------------------------- /tutorial/ml_exercises_d_neuralnets.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/tutorial/ml_exercises_d_neuralnets.pdf -------------------------------------------------------------------------------- /tutorial/prepare_apply_5ml.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Prepare Machine Learning" 3 | author: "Jan-Philipp Kolb" 4 | date: "31 5 2019" 5 | output: beamer_presentation 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = FALSE) 10 | ``` 11 | 12 | ## [How to prepare and apply machine learning to your dataset](https://www.r-exercises.com/2017/08/25/machine-learning-tutorial/) 13 | 14 | ### Content of this section 15 | 16 | 1) Use one of the most popular machine learning packages in R. 17 | 2) Explore a dataset by using statistical summaries and data visualization. 18 | 3) Build 5 machine-learning models, pick the best, and build confidence that the accuracy is reliable. 19 | 20 | ### standard and necessary steps: 21 | 22 | 1. Define Problem. 23 | 2. Prepare Data. 24 | 3. Evaluate Algorithms. 25 | 4. Improve Results. 26 | 5. Present Results. --------------------------------------------------------------------------------
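As a concrete entry point for step 2 (Prepare Data), a small sketch with `caret` (assuming the `iris` data, which the linked tutorial uses) that holds back a validation split before any models are built and compared:

```{r,eval=F}
library(caret)
data(iris)
set.seed(7)
in_train <- createDataPartition(iris$Species, p = 0.80, list = FALSE)
validation <- iris[-in_train, ]   # 20% held back for the final check
training   <- iris[ in_train, ]   # 80% used to train and compare the models
dim(training)
dim(validation)
```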