├── .gitignore ├── README.Rmd ├── README.md ├── _config.yml ├── data ├── US Presidential Data.csv ├── adult.RData ├── ames_data.RData ├── ex.data ├── german_credit.RData ├── lm_nn_Yacht_NN2.RData ├── meta.data ├── meta_cnames.Rdata ├── ml_gbm_fit.RData ├── ml_gbm_fit2.RData ├── ml_gbm_reg_linear.RData ├── ml_gbm_tune.RData ├── ml_randomforest_m1.RData ├── ml_rf_OOB_RMSE.RData ├── ml_rf_ames_randomForest.RData ├── ml_rf_dec.RData ├── ml_rf_grid_perf.data ├── ml_rf_h2o.grid.RData ├── ml_rf_hypergrid_oobrmse.RData ├── ml_rf_m2.RData ├── ml_rf_oob_comp.RData ├── ml_rf_optimal_ranger.RData ├── ml_rf_random_grid.RData ├── ml_rf_xgb.fit1.RData ├── ml_rf_xgb.fit3.RData ├── model_auc.RData ├── model_eta.RData ├── negative-words.txt ├── osmsa_PLZ_14.RData ├── positive-words.txt └── titanic.RData ├── machine_learning.Rproj ├── misc └── along.Rmd ├── rcode ├── GESISPanel.R ├── a1_intro_r.R ├── a2_intro_ml.R ├── b1_regression.R ├── b2_regularization.R ├── c1_trees_bagging.R ├── c2_random_forests.R ├── c2b_random_forests_h2o.R ├── c3_gbm_regression.R ├── c3b_gbm_regression_h2o.R ├── creating_rcode.R ├── d_neuralNetworks.R ├── e_Clustering.R ├── f_dangers_ml.R ├── fitting.R ├── g_reticulate_umap.R ├── incourse1.R ├── incourse2.R ├── ml_part1.R ├── preparing_bagging.R ├── purl_the_slides.R └── randomforests_boosting.R ├── slides ├── a1_intro_ml.Rmd ├── a1_intro_ml.html ├── a1_intro_ml.md ├── a1_intro_ml.pdf ├── a1_intro_r_cache │ └── slidy │ │ ├── __packages │ │ ├── unnamed-chunk-51_1231435b8811d585dacc7bafd9d553ac.RData │ │ ├── unnamed-chunk-51_1231435b8811d585dacc7bafd9d553ac.rdb │ │ └── unnamed-chunk-51_1231435b8811d585dacc7bafd9d553ac.rdx ├── a2_intro_r.Rmd ├── a2_intro_r.html ├── a2_intro_r.md ├── a2_intro_r.pdf ├── a2_intro_r_cache │ ├── beamer │ │ ├── __packages │ │ ├── unnamed-chunk-51_a69c0e7fdfc8fd360351fd72e763ebfb.RData │ │ ├── unnamed-chunk-51_a69c0e7fdfc8fd360351fd72e763ebfb.rdb │ │ └── unnamed-chunk-51_a69c0e7fdfc8fd360351fd72e763ebfb.rdx │ └── slidy │ │ ├── __packages │ │ ├── unnamed-chunk-51_11dfc3d248c92bee11d12b1ab257dc47.RData │ │ ├── unnamed-chunk-51_11dfc3d248c92bee11d12b1ab257dc47.rdb │ │ └── unnamed-chunk-51_11dfc3d248c92bee11d12b1ab257dc47.rdx ├── a2_intro_r_files │ ├── figure-beamer │ │ └── unnamed-chunk-51-1.pdf │ └── figure-slidy │ │ └── unnamed-chunk-51-1.png ├── b1_regression.Rmd ├── b1_regression.html ├── b1_regression.md ├── b1_regression.pdf ├── b1_regression_files │ └── figure-slidy │ │ ├── unnamed-chunk-25-1.png │ │ ├── unnamed-chunk-26-1.png │ │ ├── unnamed-chunk-3-1.png │ │ ├── unnamed-chunk-48-1.png │ │ ├── unnamed-chunk-49-1.png │ │ ├── unnamed-chunk-51-1.png │ │ ├── unnamed-chunk-52-1.png │ │ ├── unnamed-chunk-53-1.png │ │ └── unnamed-chunk-58-1.png ├── b2_regularization.Rmd ├── b2_regularization.md ├── b2_regularization.pdf ├── c1_trees_bagging.Rmd ├── c1_trees_bagging.md ├── c1_trees_bagging.pdf ├── c2_random_forests.Rmd ├── c2_random_forests.aux ├── c2_random_forests.md ├── c2_random_forests.pdf ├── c2_random_forests.vrb ├── c3_gbm_regression.md ├── c3_gbm_regression.pdf ├── c3_gbm_regression_short.Rmd ├── c3_gbm_regression_short.html ├── c3_gbm_regression_short.md ├── c3_gbm_regression_short.pdf ├── c3b_gbm_regression_h2o.Rmd ├── d_neuralNetworks.Rmd ├── d_neuralNetworks.html ├── d_neuralNetworks.pdf ├── e_Clustering-exported.html ├── e_Clustering.Rmd ├── e_Clustering.html ├── e_Clustering.md ├── f_dangers_ml.Rmd ├── f_dangers_ml.html ├── f_dangers_ml.md ├── figure │ ├── 3d-coordinate-plane.png │ ├── 450px-Overfitting.svg.png │ ├── AmesTableau01.png │ ├── 
ArtificialNeuronModel_english.png │ ├── BBRXC.png │ ├── Blausen_0657_MultipolarNeuron.png │ ├── Decision-Tree-Example.jpg │ ├── Diagslr.PNG │ ├── OneHotEncoding.PNG │ ├── Overfitting_fig1.PNG │ ├── Picture3.jpg │ ├── SMLProcess.png │ ├── The_Signal_and_the_Noise.jpg │ ├── activation_funs.PNG │ ├── activations-1.png │ ├── addins.PNG │ ├── bagging3.png │ ├── bias_variance_tradeoff.PNG │ ├── bias_variance_tradeoff2.png │ ├── biglasso.PNG │ ├── book_ml1.jpg │ ├── boosted-trees-process.png │ ├── boosting-in-action-1.png │ ├── bostondata.PNG │ ├── bostonscaled.PNG │ ├── class01-1.png │ ├── classification_regression.png │ ├── confusionMatrix.png │ ├── content_flowchart1.png │ ├── datasetsload.PNG │ ├── decissiontree.PNG │ ├── dplyr_vignette.PNG │ ├── dt_amesdata.PNG │ ├── duckduckgo.PNG │ ├── electoral_precedent.png │ ├── ex_regression_tree.png │ ├── expl_rf.png │ ├── factor3vars_visreg.PNG │ ├── fig3_loglambda.PNG │ ├── fig3_loglambda.svg │ ├── four_regmods.PNG │ ├── gbmtopmodelsvars.PNG │ ├── ggpairs_yacht.png │ ├── gradient_descent.png │ ├── influentalValues_lasso.PNG │ ├── interplot_wt_disp.PNG │ ├── iris.png │ ├── kyphosis_helppage.PNG │ ├── learning_rate_comparison.png │ ├── limeplot.png │ ├── magrittr_vignette.jpg │ ├── ml_emoji.png │ ├── ml_ice_curves.png │ ├── ml_rf_errorrate_m1.png │ ├── ml_rf_hist_OOB_RMSE.png │ ├── ml_rf_varimp_ranger.png │ ├── ml_tb_rpart_iris.png │ ├── mtcars_model_interact.PNG │ ├── neuralnetfig.PNG │ ├── neuralnets.PNG │ ├── nyc_map.png │ ├── overview_ml_algorithms.jpg │ ├── package_gbm.PNG │ ├── pic_hiddenlayers.PNG │ ├── prediction_mtcars.PNG │ ├── random_trees_fig1.PNG │ ├── reg_3algos.PNG │ ├── resid_fitted.PNG │ ├── ridgeTop25influentalVars.PNG │ ├── ridge_coef.png │ ├── stargazertabex.PNG │ ├── stochastic_gradient_descent.png │ ├── swissfertality.PNG │ ├── taskviewmachinelearning.PNG │ ├── three_algos_complete.PNG │ ├── titanicdata.PNG │ ├── top-20-r-packages-machine-learning-downloads.jpg │ ├── top10gbms.PNG │ ├── tree-correlation-1.png │ ├── tree-variance-1.svg │ ├── tree.ps │ ├── tree_m1.PNG │ ├── unsupervisedLearning.png │ ├── visreg.PNG │ ├── visreg2.PNG │ ├── visreg_m6.PNG │ ├── visregcat.PNG │ └── visregplot1.PNG ├── g_reticulate_umap-exported.html ├── g_reticulate_umap.Rmd ├── long │ ├── c2_random_forests.Rmd │ ├── c2_random_forests.pdf │ ├── d_neuralNetworks.Rmd │ └── d_neuralNetworks.pdf └── old │ ├── A_ml_motiv.Rmd │ ├── a1_intro_r.Rmd │ ├── a2_intro_ml.Rmd │ ├── a2_intro_ml.html │ ├── a2_intro_ml.md │ ├── a2_intro_ml.pdf │ ├── a_intro_ml.Rmd │ ├── a_intro_ml.html │ ├── a_intro_ml.md │ ├── a_intro_ml.pdf │ ├── advanced_regression.Rmd │ ├── b1_regression.Rmd │ ├── b2_lasso_regression (2).Rmd │ ├── b2_lasso_regression.Rmd │ ├── b2_lasso_regression.html │ ├── b2_lasso_regression.log │ ├── b2_lasso_regression.md │ ├── b2_lasso_regression.nav │ ├── b2_lasso_regression.snm │ ├── b2_lasso_regression.tex │ ├── b2_lasso_regression.toc │ ├── b2_lasso_regression.vrb │ ├── b2_lasso_regression_files │ ├── figure-beamer │ │ ├── unnamed-chunk-12-1.pdf │ │ ├── unnamed-chunk-17-1.pdf │ │ ├── unnamed-chunk-20-1.pdf │ │ ├── unnamed-chunk-23-1.pdf │ │ ├── unnamed-chunk-25-1.pdf │ │ ├── unnamed-chunk-27-1.pdf │ │ └── unnamed-chunk-36-1.pdf │ └── figure-slidy │ │ ├── unnamed-chunk-12-1.png │ │ ├── unnamed-chunk-17-1.png │ │ ├── unnamed-chunk-20-1.png │ │ ├── unnamed-chunk-23-1.png │ │ ├── unnamed-chunk-25-1.png │ │ ├── unnamed-chunk-27-1.png │ │ └── unnamed-chunk-36-1.png │ ├── b2_regularization.Rmd │ ├── b_lasso_regression.Rmd │ ├── c2_random_forests.Rmd │ ├── 
c_bagging_boosting_trees.Rmd │ ├── c_bagging_boosting_trees.pdf │ ├── caret.Rmd │ ├── conditional_inference_trees.Rmd │ ├── d_neuralNetworks.Rmd │ ├── doParallel.Rmd │ ├── evaluation.pdf │ ├── gradient_boosting.Rmd │ ├── gradient_boosting.pdf │ ├── lasso_regression.pdf │ ├── logit_model.Rmd │ ├── ml_part1.Rmd │ ├── ml_part1.log │ ├── ml_part1.pdf │ ├── random_forests.Rmd │ ├── supervised_learning.Rmd │ └── supervised_learning.html └── tutorial ├── g_ml_applying_algorithms.Rmd ├── ml_exercises.Rmd ├── ml_exercises.html ├── ml_exercises.pdf ├── ml_exercises_a1_introR.Rmd ├── ml_exercises_a1_introR.log ├── ml_exercises_a1_introR.pdf ├── ml_exercises_a1_introR.tex ├── ml_exercises_a_visualizing.Rmd ├── ml_exercises_a_visualizing.pdf ├── ml_exercises_b_regression.Rmd ├── ml_exercises_b_regression.pdf ├── ml_exercises_c1_treesbagging.Rmd ├── ml_exercises_c1_treesbagging.pdf ├── ml_exercises_c2_randomforests.Rmd ├── ml_exercises_c2_randomforests.pdf ├── ml_exercises_c3_xtremeboosting.Rmd ├── ml_exercises_c3_xtremeboosting.pdf ├── ml_exercises_d_neuralnets.Rmd ├── ml_exercises_d_neuralnets.pdf └── prepare_apply_5ml.Rmd /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "README" 3 | author: "Jan-Philipp Kolb" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: md_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## Part A - introduction 13 | 14 | - A1 - [Introduction to machine learning](slides/a1_intro_ml.md) ([pdf](slides/a1_intro_ml.pdf)) 15 | - A2 - [Laying the foundations in R](slides/a1_intro_r.md) ([Browser](slides/a1_intro_r.md) | [pdf](slides/a1_intro_r.pdf) | [rcode](rcode/a1_intro_r.R)) 16 | 17 | 21 | ## Part B - lasso and ridge regression 22 | 23 | - [B1 - A small recap on linear regression](slides/b1_regression.md) ([Browser](slides/b1_regression.md) | [pdf](slides/b1_regression.pdf) | [rcode](rcode/b1_regression.R)) 24 | - [B2 - Using regularization to prevent overfitting and perform feature selection](slides/b2_regularization.md) ([Browser](slides/b2_regularization.md) | [pdf](slides/b2_regularization.pdf) | [rcode](rcode/b2_regularization.R)) 25 | 26 | 27 | 30 | 31 | 32 | ## Part C - Supervised Learning: Bagging and Boosting, tree-methods 33 | 34 | - [C1 - Supervised learning: tress and bagging](slides/c1_trees_bagging.md) ([Browser](slides/c1_trees_bagging.md) | [pdf](slides/c1_trees_bagging.pdf) | [rcode](rcode/c1_trees_bagging.R)) 35 | 36 | - [C2 - Supervised learning: random forests](slides/c2_random_forests.md) ([Browser](slides/c2_random_forests.md) | [pdf](slides/c2_random_forests.pdf) | [rcode](rcode/c2_random_forests.R)) 37 | 38 | - [C3 - Supervised learning: gradient boosting](slides/c3_gbm_regression.md) ([Browser](slides/c3_gbm_regression.md) | [pdf](slides/c3_gbm_regression.pdf) | [rcode](rcode/c3_gbm_regression.R)) 39 | 40 | 41 | ## Part D - Supervised Learning: Neural Network 42 | 43 | - [D - Supervised learning: neural network](slides/d_neuralNetworks.md) ([Browser](slides/d_neuralNetworks.md) | [pdf](slides/d_neuralNetworks.pdf) | [rcode](rcode/d_neuralNetworks.R)) 44 | 45 | ## Part E - Unsupervised Learning: kmeans, hdbscan 46 | 47 | - [E - Unsupervised Learning: kmeans, hdbscan](slides/e_Clustering.md) 48 
| 49 | ## Part F - The dangers of machine learning 50 | 51 | - [F - The dangers of machine learning](slides/f_dangers_ml.md) 52 | 53 | ## Part G - `reticulate` package: Umap 54 | 55 | - [G - reticulate package: Umap](slides/g_reticulate_umap.md) 56 | 57 | 58 | 59 | # Remarks 60 | 61 | The sources are often linked in the headline. Please ask if something is unclear. 62 | 63 | 66 | 67 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Machine Learning with R 2 | ----------------------- 3 | 4 | Part A - introduction 5 | --------------------- 6 | 7 | - A1 - Laying the foundations in R ([Browser](slides/a1_intro_r.md) | 8 | [pdf](slides/a1_intro_r.pdf) | [rcode](rcode/a1_intro_r.R)) 9 | 10 | 14 | 15 | Part B - lasso and ridge regression 16 | ----------------------------------- 17 | 18 | - [B1 - A small recap on linear regression](slides/b1_regression.md) 19 | ([Browser](slides/b1_regression.md) | 20 | [pdf](slides/b1_regression.pdf) | [rcode](rcode/b1_regression.R)) 21 | - [B2 - Using regularization to prevent overfitting and perform 22 | feature selection](slides/b2_regularization.md) 23 | ([Browser](slides/b2_regularization.md) | 24 | [pdf](slides/b2_regularization.pdf) | 25 | [rcode](rcode/b2_regularization.R)) 26 | 27 | 30 | 31 | Part C - Supervised Learning: Bagging and Boosting, tree-methods 32 | ---------------------------------------------------------------- 33 | 34 | - [C1 - Supervised learning: tress and 35 | bagging](slides/c1_trees_bagging.md) 36 | ([Browser](slides/c1_trees_bagging.md) | 37 | [pdf](slides/c1_trees_bagging.pdf) | 38 | [rcode](rcode/c1_trees_bagging.R)) 39 | 40 | - [C2 - Supervised learning: random 41 | forests](slides/c2_random_forests.md) 42 | ([Browser](slides/c2_random_forests.md) | 43 | [pdf](slides/c2_random_forests.pdf) | 44 | [rcode](rcode/c2_random_forests.R)) 45 | 46 | - [C3 - Supervised learning: gradient 47 | boosting](slides/c3_gbm_regression.md) 48 | ([Browser](slides/c3_gbm_regression.md) | 49 | [pdf](slides/c3_gbm_regression.pdf) | 50 | [rcode](rcode/c3_gbm_regression.R)) 51 | 52 | Part D - Supervised Learning: Neural Network 53 | -------------------------------------------- 54 | 55 | - [D - Supervised learning: neural 56 | network](slides/d_neuralNetworks.md) 57 | ([Browser](slides/d_neuralNetworks.md) | 58 | [pdf](slides/d_neuralNetworks.pdf) | 59 | [rcode](rcode/d_neuralNetworks.R)) 60 | 61 | Part E - Unsupervised Learning: kmeans, hdbscan 62 | ----------------------------------------------- 63 | 64 | - [E - Unsupervised Learning: kmeans, hdbscan](slides/e_Clustering.md) 65 | 66 | Part F - The dangers of machine learning 67 | ---------------------------------------- 68 | 69 | - [F - The dangers of machine learning](slides/f_dangers_ml.md) 70 | 71 | Part G - `reticulate` package: Umap 72 | ----------------------------------- 73 | 74 | - [G - reticulate package: Umap](slides/g_reticulate_umap.md) 75 | 76 | 79 | 93 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-leap-day -------------------------------------------------------------------------------- /data/adult.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/adult.RData 
-------------------------------------------------------------------------------- /data/ames_data.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ames_data.RData -------------------------------------------------------------------------------- /data/ex.data: -------------------------------------------------------------------------------- 1 | TITLE extra line 2 | # a comment 3 | 2 3 5 7 4 | 11 13 17 5 | -------------------------------------------------------------------------------- /data/german_credit.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/german_credit.RData -------------------------------------------------------------------------------- /data/lm_nn_Yacht_NN2.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/lm_nn_Yacht_NN2.RData -------------------------------------------------------------------------------- /data/meta.data: -------------------------------------------------------------------------------- 1 | age 2 | workclass 3 | fnlwgt 4 | education 5 | education-num 6 | marital-status 7 | occupation 8 | relationhip 9 | race 10 | sex 11 | capital-gain 12 | capital-loss 13 | hours-per-week 14 | native-country 15 | class 16 | -------------------------------------------------------------------------------- /data/meta_cnames.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/meta_cnames.Rdata -------------------------------------------------------------------------------- /data/ml_gbm_fit.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_gbm_fit.RData -------------------------------------------------------------------------------- /data/ml_gbm_fit2.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_gbm_fit2.RData -------------------------------------------------------------------------------- /data/ml_gbm_reg_linear.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_gbm_reg_linear.RData -------------------------------------------------------------------------------- /data/ml_gbm_tune.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_gbm_tune.RData -------------------------------------------------------------------------------- /data/ml_randomforest_m1.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_randomforest_m1.RData 
-------------------------------------------------------------------------------- /data/ml_rf_OOB_RMSE.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_OOB_RMSE.RData -------------------------------------------------------------------------------- /data/ml_rf_ames_randomForest.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_ames_randomForest.RData -------------------------------------------------------------------------------- /data/ml_rf_dec.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_dec.RData -------------------------------------------------------------------------------- /data/ml_rf_grid_perf.data: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_grid_perf.data -------------------------------------------------------------------------------- /data/ml_rf_h2o.grid.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_h2o.grid.RData -------------------------------------------------------------------------------- /data/ml_rf_hypergrid_oobrmse.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_hypergrid_oobrmse.RData -------------------------------------------------------------------------------- /data/ml_rf_m2.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_m2.RData -------------------------------------------------------------------------------- /data/ml_rf_oob_comp.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_oob_comp.RData -------------------------------------------------------------------------------- /data/ml_rf_optimal_ranger.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_optimal_ranger.RData -------------------------------------------------------------------------------- /data/ml_rf_random_grid.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_random_grid.RData -------------------------------------------------------------------------------- /data/ml_rf_xgb.fit1.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_xgb.fit1.RData 
-------------------------------------------------------------------------------- /data/ml_rf_xgb.fit3.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/ml_rf_xgb.fit3.RData -------------------------------------------------------------------------------- /data/model_auc.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/model_auc.RData -------------------------------------------------------------------------------- /data/model_eta.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/model_eta.RData -------------------------------------------------------------------------------- /data/negative-words.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/negative-words.txt -------------------------------------------------------------------------------- /data/osmsa_PLZ_14.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/osmsa_PLZ_14.RData -------------------------------------------------------------------------------- /data/titanic.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/data/titanic.RData -------------------------------------------------------------------------------- /machine_learning.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /misc/along.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Along the project" 3 | author: "Jan-Philipp Kolb" 4 | date: "16 1 2020" 5 | output: beamer_presentation 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = FALSE) 10 | ``` 11 | 12 | ## 13 | 14 | [The Lasso under Heteroscedasticity](https://statistics.berkeley.edu/sites/default/files/tech-reports/783.pdf) 15 | 16 | https://sites.google.com/site/nationalekonomigrunder/regression-analysis/assumptions -------------------------------------------------------------------------------- /rcode/GESISPanel.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=FALSE------------------------------------------------ 2 | knitr::opts_chunk$set(echo = TRUE) 3 | 4 | ## ------------------------------------------------------------------------ 5 | wave <- "fb" 6 | 7 | ## ------------------------------------------------------------------------ 8 | wavedatapath <- "J:/Work/GESISPanel_DATA/01_post_processing/c01/f_2018/fb/02_master/data/STATA14/" 9 | 10 | ## 
------------------------------------------------------------------------ 11 | setwd(wavedatapath) 12 | dat <- readstata13::read.dta13("fb_master_20180814_COMPLETE.dta",convert.factors = F) 13 | 14 | ## ------------------------------------------------------------------------ 15 | ncol(dat) 16 | 17 | ## ------------------------------------------------------------------------ 18 | indvar_aapor <- grep("za006a",colnames(dat)) 19 | 20 | colnames(dat)[indvar_aapor] 21 | 22 | ## ------------------------------------------------------------------------ 23 | waves <- paste0(rep(letters[1:6],each=6,),rep(letters[1:6],6)) 24 | waves <- waves[-which(waves%in%c("ad","ae","af","fc","fd","fe","ff"))] 25 | 26 | G_response_list <- list() 27 | for (i in 1:length(waves)){ 28 | ind_aapor <- which(colnames(dat)==paste0(waves[i],"za006a")) 29 | respvar <- dat[,ind_aapor] 30 | dat1 <- dat[!="-22",] 31 | G_response <- rep(0,nrow(dat1)) 32 | G_response[dat1[,ind_aapor]%in%c("211","212","319","21121","211221")] <- 1 33 | G_response_list[[i]] <- G_response 34 | } 35 | 36 | sumtab_resp <- lapply(G_response_list,table) 37 | 38 | sumtab_resp2 <- do.call(rbind, sumtab_resp) 39 | 40 | ## ------------------------------------------------------------------------ 41 | table(dat$D_response) 42 | 43 | -------------------------------------------------------------------------------- /rcode/c2b_random_forests_h2o.R: -------------------------------------------------------------------------------- 1 | #' --- 2 | #' title: "Random Forests with h2o" 3 | #' author: "Jan-Philipp Kolb" 4 | #' date: "24 Mai 2019" 5 | #' output: html_document 6 | #' --- 7 | #' 8 | ## ----setup, include=FALSE------------------------------------------------ 9 | knitr::opts_chunk$set(echo = TRUE) 10 | 11 | #' 12 | #' ## The Ames housing data 13 | #' 14 | ## ------------------------------------------------------------------------ 15 | set.seed(123) 16 | ames_split <- rsample::initial_split(AmesHousing::make_ames(), 17 | prop = .7) 18 | ames_train <- rsample::training(ames_split) 19 | ames_test <- rsample::testing(ames_split) 20 | 21 | #' 22 | #' 23 | #' 24 | #' ## Full grid search with H2O 25 | #' 26 | ## ------------------------------------------------------------------------ 27 | library(h2o) # an extremely fast java-based platform 28 | 29 | #' 30 | #' 31 | #' - If you ran the grid search code above you probably noticed the code took a while to run. 32 | #' - `ranger` is computationally efficient, but as the grid search space expands, the manual for loop process becomes less efficient. 33 | #' - `h2o` is a powerful and efficient java-based interface that provides parallel distributed algorithms. 34 | #' - `h2o` allows for different optimal search paths in our grid search. This allows us to be more efficient in tuning our models. Here, I demonstrate how to tune a random forest model with `h2o`. Lets go ahead and start up h2o: 35 | #' 36 | #' 39 | #' 40 | #' 41 | #' 42 | ## ------------------------------------------------------------------------ 43 | # start up h2o 44 | h2o.no_progress() 45 | h2o.init(max_mem_size = "5g") 46 | 47 | #' 48 | #' ## Random forests with `h2o` 49 | #' 50 | #' - We can try a comprehensive (full cartesian) grid search, which means we will examine every combination of hyperparameter settings that we specify in `hyper_grid.h2o`. 51 | #' - We search across 96 models but since we perform a full cartesian search this process is not any faster. 
52 | #' - Note that the best performing model has an OOB RMSE of 24504, which is lower than what we achieved previously. 53 | #' - This is because some of the default settings regarding minimum node size, tree depth, etc. are more “generous” than in `ranger` and `randomForest`. 54 | #' - E.g. `h2o` has a default minimum node size of one whereas the `ranger` and `randomForest` default is 5. 55 | #' 56 | #' 57 | #' ## Preparation for `h2o` 58 | #' 59 | ## ------------------------------------------------------------------------ 60 | # create feature names 61 | y <- "Sale_Price" 62 | x <- setdiff(names(ames_train), y) 63 | # turn training set into h2o object 64 | train.h2o <- as.h2o(ames_train) 65 | # hyperparameter grid 66 | hyper_grid.h2o <- list( 67 | ntrees = seq(200, 500, by = 100), 68 | mtries = seq(20, 30, by = 2), 69 | sample_rate = c(.55, .632, .70, .80) 70 | ) 71 | 72 | #' 73 | #' ## 74 | #' 75 | #' 78 | #' 79 | ## ----eval=F-------------------------------------------------------------- 80 | ## # build grid search 81 | ## grid <- h2o.grid( 82 | ## algorithm = "randomForest", 83 | ## grid_id = "rf_grid", 84 | ## x = x, 85 | ## y = y, 86 | ## training_frame = train.h2o, 87 | ## hyper_params = hyper_grid.h2o, 88 | ## search_criteria = list(strategy = "Cartesian") 89 | ## ) 90 | 91 | #' 92 | ## ----eval=F,echo=F------------------------------------------------------- 93 | ## save(grid,file="../data/ml_rf_h2o.grid.RData") 94 | 95 | #' 96 | ## ----echo=F-------------------------------------------------------------- 97 | load("../data/ml_rf_h2o.grid.RData") 98 | 99 | #' 100 | #' 101 | ## ------------------------------------------------------------------------ 102 | # collect the results and sort by our model performance 103 | # metric of choice 104 | grid_perf <- h2o.getGrid( 105 | grid_id = "rf_grid", 106 | sort_by = "mse", 107 | decreasing = FALSE 108 | ) 109 | 110 | #' 111 | #' ## 112 | #' 113 | ## ----eval=F,echo=F------------------------------------------------------- 114 | ## save(grid_perf,file = "../data/ml_rf_grid_perf.data") 115 | 116 | #' 117 | ## ----echo=F,eval=T------------------------------------------------------- 118 | load("../data/ml_rf_grid_perf.data") 119 | 120 | #' 121 | #' 122 | ## ----eval=T-------------------------------------------------------------- 123 | print(grid_perf) 124 | 125 | #' 126 | #' 127 | #' ## Combinatorial explosion 128 | #' 129 | #' - Because of the [**combinatorial explosion**](https://en.wikipedia.org/wiki/Combinatorial_explosion), each additional hyperparameter has a huge effect on the run time. 130 | #' - `h2o` provides an additional grid search path called “RandomDiscrete”, which jumps from one random combination to another and stops once a certain level of improvement has been reached, a certain amount of time has been exceeded, or a certain number of models has been run (or a combination of these criteria has been met). 131 | #' - A random discrete search path will likely not find the optimal model, but it does a good job of finding a very good model. 132 | #' 133 | #' - E.g., the following code searches 2,025 hyperparameter combinations. 134 | #' - Our random grid search will stop if none of the last 10 models provides a 0.5% improvement in MSE. 135 | #' - If we keep finding improvements, the grid search is cut off after 1,800 seconds (30 minutes). 136 | #' - Our grid search assessed 190 models and the best model (max_depth = 30, min_rows = 1, mtries = 25, nbins = 30, ntrees = 200, sample_rate = .8) achieved an RMSE of 24686. 137 | #'
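#' - A quick way to verify that count (a sketch, not part of the original script): once the `hyper_grid.h2o` list in the next chunk has been created, the size of the full grid follows directly from the number of values per hyperparameter.
#'
## ----eval=F--------------------------------------------------------------
## # number of hyperparameter combinations in the random-search grid below:
## # 3 * 3 * 5 * 3 * 5 * 3 = 2025
## prod(lengths(hyper_grid.h2o))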
138 | #' 139 | #' ## 140 | #' 141 | ## ------------------------------------------------------------------------ 142 | # hyperparameter grid 143 | hyper_grid.h2o <- list( 144 | ntrees = seq(200, 500, by = 150), 145 | mtries = seq(15, 35, by = 10), 146 | max_depth = seq(20, 40, by = 5), 147 | min_rows = seq(1, 5, by = 2), 148 | nbins = seq(10, 30, by = 5), 149 | sample_rate = c(.55, .632, .75) 150 | ) 151 | 152 | #' 153 | ## ------------------------------------------------------------------------ 154 | # random grid search criteria 155 | search_criteria <- list( 156 | strategy = "RandomDiscrete", 157 | stopping_metric = "mse", 158 | stopping_tolerance = 0.005, 159 | stopping_rounds = 10, 160 | max_runtime_secs = 30*60 161 | ) 162 | 163 | #' 164 | #' 165 | #' ## 166 | #' 167 | #' 170 | #' 171 | ## ------------------------------------------------------------------------ 172 | # build grid search 173 | random_grid <- h2o.grid( 174 | algorithm = "randomForest", 175 | grid_id = "rf_grid2", 176 | x = x, 177 | y = y, 178 | training_frame = train.h2o, 179 | hyper_params = hyper_grid.h2o, 180 | search_criteria = search_criteria 181 | ) 182 | 183 | #' 184 | ## ------------------------------------------------------------------------ 185 | # collect the results and sort by our model performance 186 | # metric of choice 187 | grid_perf2 <- h2o.getGrid( 188 | grid_id = "rf_grid2", 189 | sort_by = "mse", 190 | decreasing = FALSE 191 | ) 192 | 193 | #' 194 | ## ----eval=F,echo=F------------------------------------------------------- 195 | ## save(random_grid,grid_perf2,file="../data/ml_rf_random_grid.RData") 196 | 197 | #' 198 | ## ----echo=F,eval=T------------------------------------------------------- 199 | load("../data/ml_rf_random_grid.RData") 200 | 201 | #' 202 | #' 203 | #' ## 204 | #' 205 | ## ----eval=T-------------------------------------------------------------- 206 | print(grid_perf2) 207 | 208 | #' 209 | #' ## Hold-out test 210 | #' 211 | #' - Once we’ve identifed the best model we can get that model and apply it to our hold-out test set to compute our final test error. 212 | #' 213 | ## ------------------------------------------------------------------------ 214 | # Grab the model_id for the top model, 215 | # chosen by validation error 216 | best_model_id <- grid_perf2@model_ids[[1]] 217 | best_model <- h2o.getModel(best_model_id) 218 | 219 | #' 220 | #' 223 | #' 224 | ## ----eval=F-------------------------------------------------------------- 225 | ## # Now let’s evaluate the model performance on a test set 226 | ## ames_test.h2o <- as.h2o(ames_test) 227 | ## best_model_perf <- h2o.performance(model = best_model, 228 | ## newdata = ames_test.h2o) 229 | ## 230 | ## # RMSE of best model 231 | ## h2o.mse(best_model_perf) %>% sqrt() 232 | 233 | #' 234 | #' 235 | #' - We have reduced our RMSE to near 23,000, which is a 10K reduction compared to elastic nets and bagging. 
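#'
#' - To generate actual predictions from the selected model (a sketch, not part of the original code; it assumes the `ames_test.h2o` frame created in the chunk above):
#'
## ----eval=F--------------------------------------------------------------
## # score the hold-out set with the best model and pull the result back into R
## pred.h2o <- h2o.predict(best_model, newdata = ames_test.h2o)
## head(as.data.frame(pred.h2o))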
236 | #' 237 | #' 238 | #' ## Links 239 | #' 240 | #' - [Download h2o](http://h2o.ai/download/) 241 | #' 242 | -------------------------------------------------------------------------------- /rcode/c3b_gbm_regression_h2o.R: -------------------------------------------------------------------------------- 1 | #' --- 2 | #' title: "Gradient boosting with h2o" 3 | #' author: "Jan-Philipp Kolb" 4 | #' date: "24 Mai 2019" 5 | #' output: ioslides_presentation 6 | #' --- 7 | #' 8 | ## ----setup, include=FALSE------------------------------------------------ 9 | knitr::opts_chunk$set(echo = FALSE) 10 | 11 | #' 12 | #' 13 | #' ## h2o 14 | #' 15 | ## ------------------------------------------------------------------------ 16 | library(h2o) # a java-based platform 17 | 18 | #' 19 | #' 20 | #' The h2o R package is a powerful and efficient java-based interface that allows for local and cluster-based deployment. It comes with a fairly comprehensive online resource that includes methodology and code documentation along with tutorials. 21 | #' 22 | #' ## Features include: 23 | #' 24 | #' - Distributed and parallelized computation on either a single node or a multi-node cluster. 25 | #' - Automatic early stopping based on convergence of user-specified metrics to user-specified relative tolerance. 26 | #' - Stochastic GBM with column and row sampling (per split and per tree) for better generalization. 27 | #' - Support for exponential families (Poisson, Gamma, Tweedie) and loss functions in addition to binomial (Bernoulli), Gaussian and multinomial distributions, such as Quantile regression (including Laplace). 28 | #' - Grid search for hyperparameter optimization and model selection. 29 | #' - Data-distributed, which means the entire dataset does not need to fit into memory on a single node, hence scales to any size training set. 30 | #' - Uses histogram approximations of continuous variables for speedup. 31 | #' - Uses dynamic binning - bin limits are reset at each tree level based on the split bins’ min and max values discovered during the last pass. 32 | #' - Uses squared error to determine optimal splits. 33 | #' 36 | #' - Unlimited factor levels. 37 | #' - Multiclass trees (one for each class) built in parallel with each other. 38 | #' - Apache 2.0 Licensed. 39 | #' - Model export in plain Java code for deployment in production environments. 
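#'
#' ## A minimal example
#'
#' - The slides above list features only; the following sketch (not part of the original material) shows what a basic `h2o.gbm()` fit could look like. It assumes the Ames training data and the `train.h2o` frame created in the random-forest slides:
#'
## ----eval=F--------------------------------------------------------------
## # illustrative hyperparameters only - in practice tune them via h2o.grid()
## h2o.init(max_mem_size = "5g")
## gbm_h2o_fit <- h2o.gbm(
##   x = setdiff(names(ames_train), "Sale_Price"),
##   y = "Sale_Price",
##   training_frame = train.h2o,
##   ntrees = 500,
##   learn_rate = 0.05,
##   max_depth = 5
## )
## h2o.rmse(gbm_h2o_fit)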
40 | #' 41 | #' ## 42 | -------------------------------------------------------------------------------- /rcode/creating_rcode.R: -------------------------------------------------------------------------------- 1 | # Jan-Philipp Kolb 2 | # Thu May 23 13:02:28 2019 3 | 4 | #-------------------------------------------------# 5 | # Installing necessary packages 6 | #-------------------------------------------------# 7 | 8 | necpackages <- c("knitr","rmarkdown","tidyverse") 9 | 10 | 11 | for (i in 1:length(necpackages)){ 12 | if (!require(necpackages[i])){ 13 | install.packages(necpackages[i]) 14 | } 15 | library(necpackages[i]) 16 | } 17 | 18 | #-------------------------------------------------# 19 | # Load libraries 20 | #-------------------------------------------------# 21 | 22 | library(knitr) 23 | library(rmarkdown) 24 | library(lme4) 25 | 26 | #-------------------------------------------------# 27 | # Define paths 28 | #-------------------------------------------------# 29 | 30 | main_path <- "D:/Daten/GitHub/machine_learning/" 31 | main_path <- "D:/github/machine_learning/" 32 | slide_path <- paste0(main_path,"slides/") 33 | rcode_path <- paste0(main_path,"rcode/") 34 | 35 | #-------------------------------------------------# 36 | # Parts of the presentation 37 | #-------------------------------------------------# 38 | 39 | dirnamen <- dir(slide_path) 40 | presparts <- grep(".Rmd",dirnamen,value = T) 41 | 42 | 43 | # setwd("D:/gitlab/IntroDataAnalysis/rcode/") 44 | setwd(rcode_path) 45 | 46 | for (i in 1:length(presparts)){ 47 | purl(paste0("../slides/",presparts[i]),documentation = 2) 48 | } 49 | 50 | #-------------------------------------------------# 51 | # Creating pdf slides 52 | #-------------------------------------------------# 53 | 54 | # setwd("D:/Daten/GitLab/IntroDataAnalysis/slides") 55 | setwd(slide_path) 56 | 57 | 58 | for (i in 1:length(presparts)){ 59 | rmarkdown::render(presparts[i], "beamer_presentation") 60 | } 61 | 62 | 63 | for (i in 1:length(presparts)){ 64 | rmarkdown::render(presparts[i], "all") 65 | } 66 | 67 | 68 | for (i in 3:length(presparts)){ 69 | rmarkdown::render(presparts[i], "md_document") 70 | } 71 | 72 | # B1_DataProcessing 73 | 74 | 75 | #-------------------------------------------------# 76 | # Create rcode in course 77 | #-------------------------------------------------# 78 | 79 | setwd(rcode_path) 80 | 81 | purl("../slides/C2_hierarchMods.Rmd",documentation = 2) 82 | purl("../slides/D1_webScrapping.Rmd",documentation = 2) 83 | purl("../slides/D2_dataCleaning.Rmd",documentation = 2) 84 | 85 | #-------------------------------------------------# 86 | # Install necessary packages 87 | #-------------------------------------------------# 88 | 89 | 90 | install.packages("lme4") 91 | 92 | #-------------------------------------------------# 93 | # Links 94 | #-------------------------------------------------# 95 | 96 | 97 | # https://rmarkdown.rstudio.com/authoring_quick_tour.html 98 | # https://www.r-bloggers.com/function-to-simplify-loading-and-installing-packages/ -------------------------------------------------------------------------------- /rcode/e_Clustering.R: -------------------------------------------------------------------------------- 1 | #' --- 2 | #' title: "Clustering" 3 | #' author: "Jan-Philipp Kolb and Alexander Murray-Watters" 4 | #' date: "18 Januar 2019" 5 | #' output: beamer_presentation 6 | #' --- 7 | #' 8 | ## ----setupClustering, include=FALSE-------------------------------------- 9 | knitr::opts_chunk$set(echo = FALSE) 10 | 
11 | #' 12 | #' 13 | #' 14 | #' 15 | #' ## Resources 16 | #' 17 | #' 18 | ## ----echo=F, eval=FALSE-------------------------------------------------- 19 | ## slides_path <- getwd() 20 | ## git_path <- gsub("slides","",slides_path) 21 | ## if (Sys.info()$nodename=="MAC14077"){ 22 | ## git_path <- "D:/Daten/GitHub/machine_learning/" 23 | ## slides_path <- paste0(git_path,"/slides") 24 | ## } 25 | 26 | #' 27 | #' 28 | #' - [Package `kknn`](https://cran.r-project.org/web/packages/kknn/kknn.pdf) 29 | #' 30 | ## ----eval=F-------------------------------------------------------------- 31 | ## install.packages("kknn") 32 | 33 | #' 34 | ## ------------------------------------------------------------------------ 35 | library("kknn") 36 | 37 | #' 38 | #' 39 | #' ## [Geographic clustering of UK cities](https://www.r-bloggers.com/geographic-clustering-of-uk-cities/) 40 | #' 41 | #' Animated example: 42 | #' https://towardsdatascience.com/the-5-clustering-algorithms-data-scientists-need-to-know-a36d136ef68 43 | #' 44 | #' 45 | #' ## Exercise: Kmeans 46 | #' 47 | #' Apply kmeans to the `iris` dataset with 2, 3, and 4 48 | #' clusters. Produce three scatter plots, with the points colored 49 | #' according to cluster assignment. 50 | #' 51 | #' 52 | #' ## hdbscan 53 | #' 54 | #' A fairly new alternative to kmeans, hdbscan does not require you to 55 | #' specify the number of clusters to be assigned. It only requires a 56 | #' decision as to the minimum number of points needed to be included in a 57 | #' cluster. This minimum number acts as a smoothing parameter (such as a 58 | #' density bandwidth parameter or a histogram's bin/bar width), with lower 59 | #' values finding more clusters. Other advantages of hdbscan include its ability to handle clusters of varying density and to label points that fit no cluster as noise. 60 | #' 61 | ## ---- eval=FALSE--------------------------------------------------------- 62 | ## install.packages("dbscan") 63 | 64 | #' 65 | #' 66 | #' 67 | ## ------------------------------------------------------------------------ 68 | library(ggplot2) 69 | library(dplyr) 70 | library(maps) 71 | library(dbscan) 72 | 73 | ## Example where kmeans finds only 1 cluster. 74 | two.clust.eg <- rbind(matrix(rnorm(1000, sd = 0.8), ncol=2), 75 | matrix(rnorm(100, mean = 120, sd = 0.12), ncol = 2)) 76 | 77 | clust <- kmeans(two.clust.eg, centers=2) 78 | 79 | plot(two.clust.eg, col = clust$cluster) 80 | ## points(clust$centers, col = 1:2, pch = 8, cex = 2) 81 | 82 | 83 | 84 | #' 85 | ## ------------------------------------------------------------------------ 86 | 87 | 88 | 89 | 90 | 91 | data(moons) 92 | 93 | ## Running HDBscan with the minimum number of points set to 3. 94 | res <- dbscan::hdbscan(moons, minPts = 3) 95 | 96 | plot(moons, col = res$cluster + 1, main="R implementation") 97 | 98 | #' 99 | #' 100 | #' 101 | #' ## Exercise: Apply kmeans to the moons dataset and compare the results. 102 | #' -- Be sure to try different numbers of centers. 103 | #' 104 | #' 105 | #' ## Exercise: Apply hdbscan to the moons dataset with different minimums for the number of points. 106 | #' 107 | #' ## Exercise: Apply both kmeans and hdbscan to the `ChickWeight` dataset's "weight" and "Time" variables, and see how well you can get each to perform.
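#'
#' A possible starting point for the moons/kmeans comparison above (a sketch, not one of the original solutions):
#'
## ---- eval=FALSE---------------------------------------------------------
## ## fit kmeans to the moons data for several values of k and plot side by side
## par(mfrow = c(1, 3))
## for (k in 2:4) {
##   km <- kmeans(moons, centers = k)
##   plot(moons, col = km$cluster + 1, main = paste("kmeans, k =", k))
## }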
108 | #' 109 | #' 110 | #' 111 | #' 112 | ## ---- eval=FALSE, echo=FALSE--------------------------------------------- 113 | ## ## kmeans 114 | ## plot(ChickWeight[,1:2], col=kmeans(ChickWeight[,1:2], centers=4)$centers) 115 | ## 116 | ## ## hdbscan, minPts=10 117 | ## plot(ChickWeight[,1:2], col=dbscan::hdbscan(ChickWeight[,1:2], minPts=10)$cluster) 118 | ## 119 | ## ## Diet cat. for comparison. 120 | ## plot(ChickWeight[,1:2], col=ChickWeight$Diet) 121 | ## 122 | ## ## Chick cat. for comparison. 123 | ## plot(ChickWeight[,1:2], col=ChickWeight$Chick) 124 | ## 125 | ## 126 | 127 | #' 128 | #' 129 | ## ---- eval=FALSE--------------------------------------------------------- 130 | ## load(paste0(git_path,"/data/osmsa_PLZ_14.RData")) 131 | 132 | #' 133 | #' 134 | #' 135 | #' ## [US Census Data](https://elitedatascience.com/datasets) 136 | #' 137 | #' - [US Census Data (Clustering)](https://archive.ics.uci.edu/ml/datasets/US+Census+Data+%281990%29) – Clustering based on demographics is a tried and true way to perform market research and segmentation. 138 | #' 139 | #' 140 | #' 141 | #' ## Links 142 | #' 143 | #' - [Using clusterlab to benchmark clustering algorithms](https://www.r-bloggers.com/using-clusterlab-to-benchmark-clustering-algorithms/) 144 | -------------------------------------------------------------------------------- /rcode/fitting.R: -------------------------------------------------------------------------------- 1 | simbias <- function(seed=8765){ 2 | # The default seed guarantees a nice histogram. This is the only 3 | # reason that accepting the default, x1c <- simbias(), is required in the lesson. 4 | # The effect will be evident with other seeds as well. 5 | set.seed(seed) 6 | temp <- rnorm(100) 7 | # Point A 8 | x1 <- (temp + rnorm(100))/sqrt(2) 9 | x2 <- (temp + rnorm(100))/sqrt(2) 10 | x3 <- rnorm(100) 11 | # Function to simulate regression of y on 2 variables. 12 | f <- function(k){ 13 | # Point B 14 | y <- x1 + x2 + x3 + .3*rnorm(100) 15 | # Point C 16 | c(lm(y ~ x1 + x2)$coef[2], 17 | lm(y ~ x1 + x3)$coef[2]) 18 | } 19 | # Point D 20 | sapply(1:150, f) 21 | } 22 | 23 | # Illustrate the effect of bogus regressors on residual squared error. 24 | bogus <- function(){ 25 | temp <- swiss 26 | # Add 41 columns of random regressors to a copy of the swiss data. 27 | for(n in 1:41){temp[,paste0("random",n)] <- rnorm(nrow(temp))} 28 | # Define a function to compute the deviance of Fertility regressed 29 | # on all regressors up to column n. The function, deviance(model), computes 30 | # the residual sum of squares of the model given as its argument. 31 | f <- function(n){deviance(lm(Fertility ~ ., temp[,1:n]))} 32 | # Apply f to data from n=6, i.e., the legitimate regressors, 33 | # through n=47, i.e., a full complement of bogus regressors. 34 | rss <- sapply(6:47, f) 35 | # Display result. 36 | plot(0:41, rss, xlab="Number of bogus regressors.", ylab="Residual squared error.", 37 | main="Residual Squared Error for Swiss Data\nUsing Irrelevant (Bogus) Regressors", 38 | pch=21, bg='red') 39 | } 40 | 41 | # Plot histograms illustrating bias in estimates of a regressor 42 | # coefficient 1) when an uncorrelated regressor is missing and 43 | # 2) when a correlated regressor is missing. 
44 | x1hist <- function(x1c){ 45 | p1 <- hist(x1c[1,], plot=FALSE) 46 | p2 <- hist(x1c[2,], plot=FALSE) 47 | yrange <- c(0, max(p1$counts, p2$counts)) 48 | plot(p1, col=rgb(0,0,1,1/4), xlim=range(x1c), ylim=yrange, xlab="Estimated coefficient of x1", 49 | main="Bias Effect of Omitted Regressor") 50 | plot(p2, col=rgb(1,0,0,1/4), xlim=range(x1c), ylim=yrange, add=TRUE) 51 | legend(1.1, 40, c("Uncorrelated regressor, x3, omitted", "Correlated regressor, x2, omitted"), 52 | fill=c(rgb(0,0,1,1/4), rgb(1,0,0,1/4))) 53 | } 54 | 55 | -------------------------------------------------------------------------------- /rcode/incourse1.R: -------------------------------------------------------------------------------- 1 | # Jan-Phillip Kolb 2 | # 3 | 4 | 5 | # install.packages("lme4") 6 | 7 | library(lme4) 8 | 9 | install.packages("keras") 10 | 11 | # to coop overfitting 12 | install.packages("glmnet") 13 | 14 | # xgboost 15 | 16 | install.packages("xgboost") 17 | 18 | install.packages("rpart") 19 | 20 | install.packages("gbm") 21 | 22 | install.packages("nnet") 23 | 24 | ?knn 25 | 26 | ?kmeans 27 | 28 | kmeans() 29 | 30 | install.packages("tidyverse") 31 | 32 | ############################# 33 | 34 | path1<-"https://raw.githubusercontent.com/" 35 | path2<- "thomaspernet/data_csv_r/master/data/" 36 | dname <- "titanic_csv.csv" 37 | titanic <- read.csv(paste0(path1,path2,dname)) 38 | 39 | data(Titanic) 40 | head(Titanic) 41 | 42 | install.packages("datasets.load") 43 | 44 | install.packages("colourpicker") 45 | c("#8B2323", "#7FFFD4") 46 | 47 | 48 | # lme4:: 49 | 50 | ### Exercise swiss data 51 | 52 | # 1) 53 | data(swiss) 54 | dim(swiss) 55 | nrow(swiss) 56 | ncol(swiss) 57 | 58 | head(swiss,n=10) 59 | tail(swiss) 60 | View(swiss) 61 | str(swiss) 62 | 63 | # install.packages("DT") 64 | 65 | DT::datatable(swiss) 66 | 67 | #### 68 | 69 | data(airquality) 70 | 71 | (airq <- data.table::data.table(airquality)) 72 | 73 | airq 74 | 75 | rm(airq) 76 | 77 | ### Solution: random number 78 | 79 | set.seed(10) 80 | (x <- runif(8)) 81 | 82 | 83 | round(exp(diff(log(x))), 1) 84 | 85 | clean_titanic <- titanic %>% 86 | mutate(pclass=factor(pclass,levels = c(1, 2, 3), 87 | labels=c('Upper','Middle','Lower')), 88 | survived = factor(survived,levels = c(0, 1), 89 | labels=c('No', 'Yes'))) %>% 90 | na.omit() 91 | 92 | library(dplyr) 93 | 94 | tit_wna <- na.omit(titanic) 95 | 96 | # mutate(tit_wna,...) 
97 | 98 | clean_titanic <- mutate(,pclass=factor(pclass,levels = c(1, 2, 3), 99 | labels=c('Upper','Middle','Lower')))) 100 | 101 | 102 | numerics <- c(1,2,3) 103 | str(numerics) 104 | 105 | charvec <- c("hj",7,"iu") 106 | str(charvec) 107 | 108 | ab <- as.factor(c(1,2,1,2)) 109 | str(ab) 110 | ######################### 111 | 112 | library(dplyr) 113 | library(tidyr) 114 | stocks <- tibble( 115 | time = as.Date('2009-01-01') + 0:9, 116 | X = rnorm(10, 0, 1), 117 | Y = rnorm(10, 0, 2), 118 | Z = rnorm(10, 0, 4) 119 | ) 120 | 121 | 122 | head(gather(stocks, "stock", "price", -time)) 123 | -------------------------------------------------------------------------------- /rcode/incourse2.R: -------------------------------------------------------------------------------- 1 | # Jan-Philipp Kolb 2 | # Mon Jun 03 16:47:47 2019 3 | # In course part 2 4 | 5 | data(mtcars) 6 | 7 | m1 <- lm(mpg~wt,data=mtcars) 8 | 9 | sum_mod <- summary(m1) 10 | sum_mod$coefficients 11 | 12 | ############################## 13 | 14 | dev.off() 15 | 16 | plot(mtcars$wt,mtcars$mpg) 17 | abline(m1) 18 | segments(mtcars$wt, mtcars$mpg, mtcars$wt, pre, col="red") 19 | 20 | ################################# 21 | 22 | 23 | ames_data <- AmesHousing::make_ames()# 1) 24 | # alternative 25 | library(AmesHousing) 26 | ames_data <- make_ames() 27 | 28 | 29 | colnames(ames_data) 30 | m1 <- lm(Sale_Price ~ Gr_Liv_Area + TotRms_AbvGrd, data = ames_data) 31 | m2 <- lm(Sale_Price ~ Gr_Liv_Area, data = ames_data) 32 | m3 <- lm(Sale_Price ~ TotRms_AbvGrd, data = ames_data) 33 | 34 | m1$coefficients 35 | m2$coefficients 36 | m3$coefficients 37 | 38 | ########## 39 | 40 | for (i in 1:3){ 41 | eval(parse(text=paste0("summary(m",i,")"))) 42 | } 43 | 44 | ################################# 45 | 46 | ?glmnet 47 | 48 | library(AmesHousing) 49 | ames_data <- AmesHousing::make_ames() 50 | 51 | ncol(ames_data) 52 | 53 | ames_train_x <- model.matrix(Sale_Price ~ ., ames_train)[, -1] 54 | ames_train_y <- log(ames_train$Sale_Price) 55 | ames_test_x <- model.matrix(Sale_Price ~ ., ames_test)[, -1] 56 | ames_test_y <- log(ames_test$Sale_Price) 57 | 58 | library(glmnet) 59 | ames_ridge <- glmnet(x = ames_train_x,y = ames_train_y, 60 | alpha = 0) 61 | 62 | coef(ames_ridge) 63 | 64 | #################################### 65 | 66 | install.packages("lars") 67 | library(lars) # 1) 68 | data(diabetes) 69 | 70 | 71 | library(glmnet) #2) 72 | # Create the scatterplots 73 | set.seed(1234) 74 | par(mfrow=c(2,5)) 75 | for(i in 1:10){ # 3) 76 | plot(diabetes$x[,i], diabetes$y) 77 | abline(lm(diabetes$y~diabetes$x[,i]),col="red") 78 | } 79 | 80 | model_ols <- lm(diabetes$y ~ diabetes$x) # 4) 81 | summary(model_ols) 82 | 83 | lambdas <- 10^seq(7, -3) 84 | model_ridge <- glmnet(diabetes$x, diabetes$y, 85 | alpha = 0, lambda = lambdas) 86 | plot.glmnet(model_ridge, xvar = "lambda", label = TRUE) 87 | 88 | cv_fit <- cv.glmnet(x=diabetes$x, y=diabetes$y, 89 | alpha = 0, nlambda = 1000) 90 | cv_fit$lambda.min 91 | 92 | plot.cv.glmnet(cv_fit) 93 | 94 | fit <- glmnet(x=diabetes$x, y=diabetes$y, 95 | alpha = 0, lambda=cv_fit$lambda.min) 96 | fit$beta 97 | 98 | fit <- glmnet(x=diabetes$x, y=diabetes$y, 99 | alpha = 0, lambda=cv_fit$lambda.1se) 100 | fit$beta 101 | 102 | # install.packages("rpart") 103 | 104 | library(caret) 105 | intrain <- createDataPartition(y=diabetes$y, 106 | p = 0.8, 107 | list = FALSE) 108 | training <- diabetes[intrain,] 109 | testing <- diabetes[-intrain,] 110 | 111 | cv_ridge <- cv.glmnet(x=training$x, y=training$y, 112 | alpha = 0, nlambda = 1000) 113 
| ridge_reg <- glmnet(x=training$x, y=training$y, 114 | alpha = 0, lambda=cv_ridge$lambda.min) 115 | ridge_reg$beta 116 | 117 | ridge_reg <- glmnet(x=training$x, y=training$y, 118 | alpha = 0, lambda=cv_ridge$lambda.1se) 119 | ridge_reg$beta 120 | 121 | ridge_reg <- glmnet(x=training$x, y=training$y, 122 | alpha = 0, lambda=cv_ridge$lambda.min) 123 | ridge_pred<-predict.glmnet(ridge_reg, 124 | s = cv_ridge$lambda.min,newx = testing$x) 125 | sd((ridge_pred - testing$y)^2)/sqrt(length(testing$y)) 126 | 127 | 128 | ridge_reg <- glmnet(x=training$x, y=training$y, 129 | alpha = 0, lambda=cv_ridge$lambda.1se) 130 | ridge_pred <- predict.glmnet(ridge_reg, 131 | s = cv_ridge$lambda.1se, newx = testing$x) 132 | sd((ridge_pred - testing$y)^2)/sqrt(length(testing$y)) 133 | 134 | ols_reg <- lm(y ~ x, data = training) 135 | summary(ols_reg) 136 | 137 | ols_pred <- predict(ols_reg, newdata=testing$x, 138 | type = "response") 139 | sd((ols_pred - testing$y)^2)/sqrt(length(testing$y)) 140 | 141 | coef(model_ols) 142 | 143 | 144 | library(Metrics) 145 | mse(testing$y,ols_pred) 146 | mse(ridge_pred,testing$y) 147 | -------------------------------------------------------------------------------- /rcode/preparing_bagging.R: -------------------------------------------------------------------------------- 1 | # Jan-Philipp Kolb 2 | # Thu May 02 11:09:41 2019 3 | # Source: https://www.r-bloggers.com/machine-learning-explained-bagging/ 4 | 5 | require(data.table) 6 | library(rpart) 7 | require(ggplot2) 8 | 9 | set.seed(456) 10 | 11 | ## Reading data 12 | bagging_data <- data.table(airquality) 13 | 14 | ggplot(bagging_data,aes(Wind,Ozone))+geom_point()+ 15 | ggtitle("Ozone vs wind speed") 16 | 17 | data_test <- na.omit(bagging_data[,.(Ozone,Wind)]) 18 | 19 | ## Training data 20 | 21 | train_index <- sample.int(nrow(data_test), 22 | size=round(nrow(data_test)*0.8), 23 | replace = F) 24 | 25 | data_test[train_index,train:=TRUE][-train_index,train:=FALSE] 26 | 27 | ## Model without bagging 28 | no_bag_model <- rpart(Ozone~Wind,data_test[train_index],control=rpart.control(minsplit=6)) 29 | result_no_bag <- predict(no_bag_model,bagging_data) 30 | 31 | ##Training of the bagged model 32 | n_model=100 33 | bagged_models=list() 34 | for (i in 1:n_model) 35 | { 36 | new_sample=sample(train_index,size=length(train_index),replace=T) 37 | bagged_models=c(bagged_models,list(rpart(Ozone~Wind,data_test[new_sample],control=rpart.control(minsplit=6)))) 38 | } 39 | 40 | ##Getting estimate from the bagged model 41 | bagged_result=NULL 42 | i=0 43 | for (from_bag_model in bagged_models) 44 | { 45 | if (is.null(bagged_result)) 46 | bagged_result=predict(from_bag_model,bagging_data) 47 | else 48 | bagged_result=(i*bagged_result+predict(from_bag_model,bagging_data))/(i+1) 49 | i=i+1 50 | } 51 | 52 | ##Plot 53 | require(ggplot2) 54 | gg=ggplot(data_test,aes(Wind,Ozone))+geom_point(aes(color=train)) 55 | for (tree_model in bagged_models[1:100]) 56 | { 57 | prediction=predict(tree_model,bagging_data) 58 | data_plot=data.table(Wind=bagging_data$Wind,Ozone=prediction) 59 | gg=gg+geom_line(data=data_plot[order(Wind)],aes(x=Wind,y=Ozone),alpha=0.2) 60 | } 61 | data_bagged=data.table(Wind=bagging_data$Wind,Ozone=bagged_result) 62 | gg=gg+geom_line(data=data_bagged[order(Wind)],aes(x=Wind,y=Ozone),color='green') 63 | 64 | data_no_bag=data.table(Wind=bagging_data$Wind,Ozone=result_no_bag) 65 | gg=gg+geom_line(data=data_no_bag[order(Wind)],aes(x=Wind,y=Ozone),color='red') 66 | gg 67 | 
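## Possible extension (a sketch, not in the original script): compare the
## test-set MSE of the single tree and the bagged ensemble
test_dt <- data_test[train == FALSE]
mse_single <- mean((predict(no_bag_model, test_dt) - test_dt$Ozone)^2)
bag_preds <- sapply(bagged_models, predict, newdata = test_dt) # one column per tree
mse_bagged <- mean((rowMeans(bag_preds) - test_dt$Ozone)^2)
c(single_tree = mse_single, bagged = mse_bagged)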
-------------------------------------------------------------------------------- /rcode/purl_the_slides.R: -------------------------------------------------------------------------------- 1 | # Jan-Philipp Kolb 2 | # Fri Sep 28 11:27:43 2018 3 | 4 | 5 | library(knitr) 6 | 7 | setwd("D:/github/machine_learning/slides") 8 | purl("GESISPanel.Rmd") 9 | -------------------------------------------------------------------------------- /rcode/randomforests_boosting.R: -------------------------------------------------------------------------------- 1 | # Random Forests and Boosting 2 | 3 | # Bagging suffers from tree correlation, which reduces the overall performance of the model. 4 | # Random forests are a modification of bagging that builds a large collection of de-correlated trees. 5 | # Similar to bagging, each tree is grown on a bootstrap-resampled data set, 6 | # which makes them different and decorrelates them. 7 | 8 | library(rsample) # data splitting 9 | library(randomForest) # basic implementation 10 | library(ranger) # a faster implementation of randomForest 11 | library(caret) 12 | 13 | 14 | ## The Ames housing data 15 | 16 | load("../data/ames_data.RData") 17 | set.seed(123) 18 | ames_split <- rsample::initial_split(ames_data,prop=.7) 19 | ames_train <- rsample::training(ames_split) 20 | ames_test <- rsample::testing(ames_split) 21 | 22 | ############ 23 | 24 | set.seed(123) 25 | # default RF model 26 | (m1 <- randomForest(formula = Sale_Price ~ .,data=ames_train)) 27 | 28 | plot(m1) 29 | 30 | # ntreeTry - We want enough trees to stabilize the error but using too 31 | # many trees is inefficient, esp. for large data sets. 32 | 33 | # mtry - number of variables as candidates at each split. 34 | # When mtry=p -> bagging. 35 | # When mtry=1 the split variable is completely random. 36 | 37 | # package ranger is faster; importance = "impurity" stores variable importance 38 | library(ranger) 39 | ames_ranger <- ranger(formula=Sale_Price ~ ., 40 | data = ames_train,num.trees = 500, 41 | mtry = floor((ncol(ames_train)-1) / 3),importance = "impurity") 42 | 43 | ames_ranger 44 | head(ames_ranger$predictions) 45 | 46 | ## tuning with a hypergrid 47 | 48 | hyper_grid <- expand.grid( 49 | mtry = seq(20, 30, by = 2), 50 | node_size = seq(3, 9, by = 2), 51 | sample_size = c(.55, .632, .70, .80), 52 | OOB_RMSE = 0 53 | ) 54 | 55 | nrow(hyper_grid) 56 | 57 | for(i in 1:nrow(hyper_grid)) { 58 | model <- ranger(formula= Sale_Price ~ .,data= ames_train, 59 | num.trees = 500,mtry= hyper_grid$mtry[i], 60 | min.node.size = hyper_grid$node_size[i], 61 | sample.fraction = hyper_grid$sample_size[i], 62 | seed = 123) 63 | # add OOB error to grid 64 | hyper_grid$OOB_RMSE[i] <- sqrt(model$prediction.error) 65 | } 66 | 67 | head(dplyr::arrange(hyper_grid, OOB_RMSE), 10) 68 | 69 | # Variable importance 70 | # (taken from ames_ranger, which was fit with importance = "impurity" above) 71 | varimp_ranger <- ames_ranger$variable.importance 72 | 73 | lattice::barchart(sort(varimp_ranger)[1:25],col="royalblue") 74 | 75 | pred_randomForest <- predict(m1, ames_test) 76 | head(pred_randomForest) 77 | 78 | ######################################################## 79 | # Boosting 80 | 81 | library(rsample) # data splitting 82 | library(gbm) # basic implementation 83 | library(xgboost) # a faster implementation of gbm 84 | library(caret) # aggregator package - machine learning 85 | library(pdp) # model visualization 86 | library(ggplot2) # model visualization 87 | library(lime) # model visualization 88 | 89 | ames_data <- AmesHousing::make_ames() 90 | set.seed(123) 91 | ames_split <- initial_split(ames_data,prop=.7) 92 | ames_train <- training(ames_split) 93
ames_test <- testing(ames_split) 94 | 95 | # distribution - depends on the response (e.g. bernoulli for binomial) 96 | # n.trees - number of trees to fit 97 | # interaction.depth - 1 is for additive model 98 | # 2 allows for 2-way interactions 99 | # cv.folds - number of cross validation folds 100 | # shrinkage - learning rate - a smaller learning rate typically requires more trees. 101 | 102 | gbm.fit <- gbm(formula = Sale_Price ~ .,distribution="gaussian", 103 | data = ames_train,n.trees = 100,interaction.depth = 1, 104 | shrinkage = 0.001,cv.folds = 5) 105 | 106 | # the CV RMSE tells us how many dollars the predictions are off on average (about $29,133 in the source tutorial's full run) 107 | sqrt(min(gbm.fit$cv.error)) 108 | 109 | 110 | # make prediction with the CV-optimal number of trees 111 | pred <- predict(gbm.fit, ames_test, n.trees = gbm.perf(gbm.fit, method = "cv", plot.it = FALSE)) 112 | 113 | -------------------------------------------------------------------------------- /slides/a1_intro_ml.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning - what is it" 3 | date: "`r format(Sys.time(), '%d %B, %Y')`" 4 | fontsize: 10pt 5 | output: 6 | slidy_presentation: 7 | highlight: haddock 8 | keep_md: yes 9 | beamer_presentation: 10 | colortheme: dolphin 11 | fig_height: 3 12 | fig_width: 5 13 | fonttheme: structuresmallcapsserif 14 | highlight: haddock 15 | theme: Dresden 16 | --- 17 | 18 | ```{r setup, include=FALSE} 19 | knitr::opts_chunk$set(echo = FALSE) 20 | ``` 21 | 22 | 23 | ## Target of the course 24 | 25 | - What is machine learning? 26 | - Why do we need it? / When do we need it? 27 | - How to prepare your data for ML 28 | 29 | ## Preliminaries 30 | 31 | - This topic is huge - we concentrate on presenting the applications in R 32 | - The participants usually differ a lot in knowledge and prior experience - please tell us if the pace is too fast or too slow. 33 | - We have many [**exercises**](http://web.math.ku.dk/~helle/R-intro/exercises.pdf) because in the end you only learn by doing things yourself 34 | - We have many [**examples**](https://www.showmeshiny.com/) - try them! 35 | - If there are questions - always ask 36 | - R is more fun together - ask your neighbor 37 | 38 | 39 | 40 | ## Introduction round 41 | 42 | ### Please tell us briefly... 43 | 44 | - Where are you from? What are you studying / what do you work on? 45 | - What is your experience level in R/other programming languages? 46 | - What are your expectations of this course? 47 | - Where do you think you can use Machine Learning in the future? 48 | 49 | 50 | 51 | 52 | ## [Prediction vs interpretability](https://machinelearningmastery.com/model-prediction-versus-interpretation-in-machine-learning/) 53 | 54 | - There is a trade-off between model prediction accuracy and model interpretability. 55 | 56 | - It is critical to have a clear idea of which of the two is the priority. 57 | 58 | 59 | ## [The bias-variance tradeoff](https://en.wikipedia.org/wiki/Bias%E2%80%93variance_tradeoff) (I) 60 | 61 | - The bias–variance tradeoff is the property of a set of predictive models whereby models with a lower bias in parameter estimation have a higher variance of the parameter estimates across samples, and vice versa.
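A minimal simulation sketch (added here; not part of the original slides): very flexible models fit the training data better, but beyond some point their error on new data rises again.

```{r bias-variance-sketch, echo=TRUE, eval=FALSE}
# compare test error of polynomial fits of increasing flexibility
set.seed(1)
x <- runif(200, -2, 2)
y <- sin(x) + rnorm(200, sd = 0.3)
dat <- data.frame(x, y)
train <- sample(200, 100)
for (deg in c(1, 3, 9, 15)) {
  fit <- lm(y ~ poly(x, deg), data = dat[train, ])
  test_mse <- mean((dat$y[-train] - predict(fit, newdata = dat[-train, ]))^2)
  cat("degree", deg, "- test MSE:", round(test_mse, 3), "\n")
}
```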
62 | 63 | [![](figure/bias_variance_tradeoff2.png)](https://towardsdatascience.com/understanding-the-bias-variance-tradeoff-165e6942b229) 64 | 65 | 70 | 71 | ## The bias-variance tradeoff (II) 72 | 73 | ![](figure/bias_variance_tradeoff.PNG) 74 | 75 | 76 | ## [Bootstrapping](https://www.statmethods.net/advstats/bootstrapping.html) 77 | 78 | - [**Bootstrap**](https://www.datacamp.com/community/tutorials/bootstrap-r) is a method of inference about a population using sample data. 79 | 80 | ## [The curse of dimensionality](https://www.freecodecamp.org/news/the-curse-of-dimensionality-how-we-can-save-big-data-from-itself-d9fa0f872335/) 81 | 82 | - We have a high number of possible features 83 | - We want to find the best representation of data in a lower-dimensional space 84 | 85 | 91 | 92 | 106 | 107 | ## [Regression and classification](https://www.youtube.com/watch?v=Z0v9QMkA3dA&list=PLOg0ngHtcqbPTlZzRHA2ocQZqB1D_qZ5V&index=2) 108 | 109 | ### Regression problem 110 | 111 | y is quantitative 112 | 113 | ### Classification problem 114 | 115 | y is binomial/categorical 116 | -------------------------------------------------------------------------------- /slides/a1_intro_ml.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning - what is it" 3 | date: "13 January, 2020" 4 | fontsize: 10pt 5 | output: 6 | slidy_presentation: 7 | highlight: haddock 8 | keep_md: yes 9 | beamer_presentation: 10 | colortheme: dolphin 11 | fig_height: 3 12 | fig_width: 5 13 | fonttheme: structuresmallcapsserif 14 | highlight: haddock 15 | theme: Dresden 16 | --- 17 | 18 | 19 | 20 | 21 | ## Target of the course 22 | 23 | - What is machine learning? 24 | - Why do we need it? / When do we need it? 25 | - How to prepare your data for ML 26 | 27 | ## Preliminaries 28 | 29 | - This topic is huge - we concentrate on presenting the applications in R 30 | - The participants usually differ a lot in knowledge and prior experience - please tell us if the pace is too fast or too slow. 31 | - We have many [**exercises**](http://web.math.ku.dk/~helle/R-intro/exercises.pdf) because in the end you only learn by doing things yourself 32 | - We have many [**examples**](https://www.showmeshiny.com/) - try them! 33 | - If there are questions - always ask 34 | - R is more fun together - ask your neighbor 35 | 36 | 37 | 38 | ## Introduction round 39 | 40 | ### Please tell us briefly... 41 | 42 | - Where are you from? What are you studying / what do you work on? 43 | - What is your experience level in R/other programming languages? 44 | - What are your expectations of this course? 45 | - Where do you think you can use Machine Learning in the future? 46 | 47 | 48 | 49 | 50 | ## [Prediction vs interpretability](https://machinelearningmastery.com/model-prediction-versus-interpretation-in-machine-learning/) 51 | 52 | - There is a trade-off between model prediction accuracy and model interpretability. 53 | 54 | - It is critical to have a clear idea of which of the two is the priority. 55 | 56 | 57 | ## [Bootstrapping](https://www.statmethods.net/advstats/bootstrapping.html) 58 | 59 | - [**Bootstrap**](https://www.datacamp.com/community/tutorials/bootstrap-r) is a method of inference about a population using sample data.
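A minimal sketch (added; not in the original slides) of a nonparametric bootstrap of the sample mean, using base R only:

```r
# resample the data with replacement many times and look at the spread of the statistic
set.seed(1)
x <- rnorm(50, mean = 10)
boot_means <- replicate(2000, mean(sample(x, replace = TRUE)))
quantile(boot_means, c(0.025, 0.975))  # simple percentile bootstrap interval
```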
60 | 61 | ## [The curse of dimensionality](https://www.freecodecamp.org/news/the-curse-of-dimensionality-how-we-can-save-big-data-from-itself-d9fa0f872335/) 62 | 63 | - We have a high number of possible features 64 | - We want to find the best representation of data in a lower-dimensional space 65 | 66 | 72 | 73 | 87 | -------------------------------------------------------------------------------- /slides/a1_intro_ml.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a1_intro_ml.pdf -------------------------------------------------------------------------------- /slides/a1_intro_r_cache/slidy/__packages: -------------------------------------------------------------------------------- 1 | base 2 | methods 3 | datasets 4 | utils 5 | grDevices 6 | graphics 7 | stats 8 | knitr 9 | dplyr 10 | magrittr 11 | data.table 12 | purrr 13 | tidyr 14 | MASS 15 | tidyverse 16 | ggplot2 17 | tibble 18 | readr 19 | stringr 20 | forcats 21 | -------------------------------------------------------------------------------- /slides/a1_intro_r_cache/slidy/unnamed-chunk-51_1231435b8811d585dacc7bafd9d553ac.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a1_intro_r_cache/slidy/unnamed-chunk-51_1231435b8811d585dacc7bafd9d553ac.RData -------------------------------------------------------------------------------- /slides/a1_intro_r_cache/slidy/unnamed-chunk-51_1231435b8811d585dacc7bafd9d553ac.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a1_intro_r_cache/slidy/unnamed-chunk-51_1231435b8811d585dacc7bafd9d553ac.rdb -------------------------------------------------------------------------------- /slides/a1_intro_r_cache/slidy/unnamed-chunk-51_1231435b8811d585dacc7bafd9d553ac.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a1_intro_r_cache/slidy/unnamed-chunk-51_1231435b8811d585dacc7bafd9d553ac.rdx -------------------------------------------------------------------------------- /slides/a2_intro_r.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a2_intro_r.pdf -------------------------------------------------------------------------------- /slides/a2_intro_r_cache/beamer/__packages: -------------------------------------------------------------------------------- 1 | base 2 | methods 3 | datasets 4 | utils 5 | grDevices 6 | graphics 7 | stats 8 | knitr 9 | dplyr 10 | magrittr 11 | data.table 12 | purrr 13 | tidyr 14 | MASS 15 | tidyverse 16 | ggplot2 17 | tibble 18 | readr 19 | stringr 20 | forcats 21 | -------------------------------------------------------------------------------- /slides/a2_intro_r_cache/beamer/unnamed-chunk-51_a69c0e7fdfc8fd360351fd72e763ebfb.RData: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a2_intro_r_cache/beamer/unnamed-chunk-51_a69c0e7fdfc8fd360351fd72e763ebfb.RData -------------------------------------------------------------------------------- /slides/a2_intro_r_cache/beamer/unnamed-chunk-51_a69c0e7fdfc8fd360351fd72e763ebfb.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a2_intro_r_cache/beamer/unnamed-chunk-51_a69c0e7fdfc8fd360351fd72e763ebfb.rdb -------------------------------------------------------------------------------- /slides/a2_intro_r_cache/beamer/unnamed-chunk-51_a69c0e7fdfc8fd360351fd72e763ebfb.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a2_intro_r_cache/beamer/unnamed-chunk-51_a69c0e7fdfc8fd360351fd72e763ebfb.rdx -------------------------------------------------------------------------------- /slides/a2_intro_r_cache/slidy/__packages: -------------------------------------------------------------------------------- 1 | base 2 | methods 3 | datasets 4 | utils 5 | grDevices 6 | graphics 7 | stats 8 | knitr 9 | dplyr 10 | magrittr 11 | data.table 12 | purrr 13 | tidyr 14 | MASS 15 | tidyverse 16 | ggplot2 17 | tibble 18 | readr 19 | stringr 20 | forcats 21 | -------------------------------------------------------------------------------- /slides/a2_intro_r_cache/slidy/unnamed-chunk-51_11dfc3d248c92bee11d12b1ab257dc47.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a2_intro_r_cache/slidy/unnamed-chunk-51_11dfc3d248c92bee11d12b1ab257dc47.RData -------------------------------------------------------------------------------- /slides/a2_intro_r_cache/slidy/unnamed-chunk-51_11dfc3d248c92bee11d12b1ab257dc47.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a2_intro_r_cache/slidy/unnamed-chunk-51_11dfc3d248c92bee11d12b1ab257dc47.rdb -------------------------------------------------------------------------------- /slides/a2_intro_r_cache/slidy/unnamed-chunk-51_11dfc3d248c92bee11d12b1ab257dc47.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a2_intro_r_cache/slidy/unnamed-chunk-51_11dfc3d248c92bee11d12b1ab257dc47.rdx -------------------------------------------------------------------------------- /slides/a2_intro_r_files/figure-beamer/unnamed-chunk-51-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a2_intro_r_files/figure-beamer/unnamed-chunk-51-1.pdf -------------------------------------------------------------------------------- /slides/a2_intro_r_files/figure-slidy/unnamed-chunk-51-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/a2_intro_r_files/figure-slidy/unnamed-chunk-51-1.png -------------------------------------------------------------------------------- /slides/b1_regression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b1_regression.pdf -------------------------------------------------------------------------------- /slides/b1_regression_files/figure-slidy/unnamed-chunk-25-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b1_regression_files/figure-slidy/unnamed-chunk-25-1.png -------------------------------------------------------------------------------- /slides/b1_regression_files/figure-slidy/unnamed-chunk-26-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b1_regression_files/figure-slidy/unnamed-chunk-26-1.png -------------------------------------------------------------------------------- /slides/b1_regression_files/figure-slidy/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b1_regression_files/figure-slidy/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /slides/b1_regression_files/figure-slidy/unnamed-chunk-48-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b1_regression_files/figure-slidy/unnamed-chunk-48-1.png -------------------------------------------------------------------------------- /slides/b1_regression_files/figure-slidy/unnamed-chunk-49-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b1_regression_files/figure-slidy/unnamed-chunk-49-1.png -------------------------------------------------------------------------------- /slides/b1_regression_files/figure-slidy/unnamed-chunk-51-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b1_regression_files/figure-slidy/unnamed-chunk-51-1.png -------------------------------------------------------------------------------- /slides/b1_regression_files/figure-slidy/unnamed-chunk-52-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b1_regression_files/figure-slidy/unnamed-chunk-52-1.png -------------------------------------------------------------------------------- /slides/b1_regression_files/figure-slidy/unnamed-chunk-53-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b1_regression_files/figure-slidy/unnamed-chunk-53-1.png -------------------------------------------------------------------------------- /slides/b1_regression_files/figure-slidy/unnamed-chunk-58-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b1_regression_files/figure-slidy/unnamed-chunk-58-1.png -------------------------------------------------------------------------------- /slides/b2_regularization.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/b2_regularization.pdf -------------------------------------------------------------------------------- /slides/c1_trees_bagging.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/c1_trees_bagging.pdf -------------------------------------------------------------------------------- /slides/c2_random_forests.aux: -------------------------------------------------------------------------------- 1 | \relax 2 | \providecommand\hyper@newdestlabel[2]{} 3 | \providecommand\BKM@entry[2]{} 4 | \providecommand\HyperFirstAtBeginDocument{\AtBeginDocument} 5 | \HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined 6 | \global\let\oldcontentsline\contentsline 7 | \gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}} 8 | \global\let\oldnewlabel\newlabel 9 | \gdef\newlabel#1#2{\newlabelxx{#1}#2} 10 | \gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}} 11 | \AtEndDocument{\ifx\hyper@anchor\@undefined 12 | \let\contentsline\oldcontentsline 13 | \let\newlabel\oldnewlabel 14 | \fi} 15 | \fi} 16 | \global\let\hyper@last\relax 17 | \gdef\HyperFirstAtBeginDocument#1{#1} 18 | \providecommand\HyField@AuxAddToFields[1]{} 19 | \providecommand\HyField@AuxAddToCoFields[2]{} 20 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{1}{1/1}{}{0}}} 21 | \@writefile{nav}{\headcommand {\beamer@framepages {1}{1}}} 22 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{2}{2/2}{}{0}}} 23 | \@writefile{nav}{\headcommand {\beamer@framepages {2}{2}}} 24 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{3}{3/3}{}{0}}} 25 | \@writefile{nav}{\headcommand {\beamer@framepages {3}{3}}} 26 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{4}{4/4}{}{0}}} 27 | \@writefile{nav}{\headcommand {\beamer@framepages {4}{4}}} 28 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{5}{5/5}{}{0}}} 29 | \@writefile{nav}{\headcommand {\beamer@framepages {5}{5}}} 30 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{6}{6/6}{}{0}}} 31 | \@writefile{nav}{\headcommand {\beamer@framepages {6}{6}}} 32 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{7}{7/7}{}{0}}} 33 | \@writefile{nav}{\headcommand {\beamer@framepages {7}{7}}} 34 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{8}{8/8}{}{0}}} 35 | \@writefile{nav}{\headcommand {\beamer@framepages {8}{8}}} 36 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{9}{9/9}{}{0}}} 37 | \@writefile{nav}{\headcommand {\beamer@framepages {9}{9}}} 38 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{10}{10/10}{}{0}}} 39 | \@writefile{nav}{\headcommand {\beamer@framepages {10}{10}}} 40 | \@writefile{nav}{\headcommand 
{\slideentry {0}{0}{11}{11/11}{}{0}}} 41 | \@writefile{nav}{\headcommand {\beamer@framepages {11}{11}}} 42 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{12}{12/12}{}{0}}} 43 | \@writefile{nav}{\headcommand {\beamer@framepages {12}{12}}} 44 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{13}{13/13}{}{0}}} 45 | \@writefile{nav}{\headcommand {\beamer@framepages {13}{13}}} 46 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{14}{14/14}{}{0}}} 47 | \@writefile{nav}{\headcommand {\beamer@framepages {14}{14}}} 48 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{15}{15/15}{}{0}}} 49 | \@writefile{nav}{\headcommand {\beamer@framepages {15}{15}}} 50 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{16}{16/16}{}{0}}} 51 | \@writefile{nav}{\headcommand {\beamer@framepages {16}{16}}} 52 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{17}{17/17}{}{0}}} 53 | \@writefile{nav}{\headcommand {\beamer@framepages {17}{17}}} 54 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{18}{18/18}{}{0}}} 55 | \@writefile{nav}{\headcommand {\beamer@framepages {18}{18}}} 56 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{19}{19/19}{}{0}}} 57 | \@writefile{nav}{\headcommand {\beamer@framepages {19}{19}}} 58 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{20}{20/20}{}{0}}} 59 | \@writefile{nav}{\headcommand {\beamer@framepages {20}{20}}} 60 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{21}{21/21}{}{0}}} 61 | \@writefile{nav}{\headcommand {\beamer@framepages {21}{21}}} 62 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{22}{22/22}{}{0}}} 63 | \@writefile{nav}{\headcommand {\beamer@framepages {22}{22}}} 64 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{23}{23/23}{}{0}}} 65 | \@writefile{nav}{\headcommand {\beamer@framepages {23}{23}}} 66 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{24}{24/24}{}{0}}} 67 | \@writefile{nav}{\headcommand {\beamer@framepages {24}{24}}} 68 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{25}{25/25}{}{0}}} 69 | \@writefile{nav}{\headcommand {\beamer@framepages {25}{25}}} 70 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{26}{26/26}{}{0}}} 71 | \@writefile{nav}{\headcommand {\beamer@framepages {26}{26}}} 72 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{27}{27/27}{}{0}}} 73 | \@writefile{nav}{\headcommand {\beamer@framepages {27}{27}}} 74 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{28}{28/28}{}{0}}} 75 | \@writefile{nav}{\headcommand {\beamer@framepages {28}{28}}} 76 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{29}{29/29}{}{0}}} 77 | \@writefile{nav}{\headcommand {\beamer@framepages {29}{29}}} 78 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{30}{30/30}{}{0}}} 79 | \@writefile{nav}{\headcommand {\beamer@framepages {30}{30}}} 80 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{31}{31/31}{}{0}}} 81 | \@writefile{nav}{\headcommand {\beamer@framepages {31}{31}}} 82 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{32}{32/32}{}{0}}} 83 | \@writefile{nav}{\headcommand {\beamer@framepages {32}{32}}} 84 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{33}{33/33}{}{0}}} 85 | \@writefile{nav}{\headcommand {\beamer@framepages {33}{33}}} 86 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{34}{34/34}{}{0}}} 87 | \@writefile{nav}{\headcommand {\beamer@framepages {34}{34}}} 88 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{35}{35/35}{}{0}}} 89 | \@writefile{nav}{\headcommand {\beamer@framepages {35}{35}}} 90 | \@writefile{nav}{\headcommand {\slideentry {0}{0}{36}{36/36}{}{0}}} 91 | \@writefile{nav}{\headcommand 
{\beamer@framepages {36}{36}}} 92 | \@writefile{nav}{\headcommand {\beamer@partpages {1}{36}}} 93 | \@writefile{nav}{\headcommand {\beamer@subsectionpages {1}{36}}} 94 | \@writefile{nav}{\headcommand {\beamer@sectionpages {1}{36}}} 95 | \@writefile{nav}{\headcommand {\beamer@documentpages {36}}} 96 | \@writefile{nav}{\headcommand {\gdef \inserttotalframenumber {36}}} 97 | -------------------------------------------------------------------------------- /slides/c2_random_forests.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/c2_random_forests.pdf -------------------------------------------------------------------------------- /slides/c2_random_forests.vrb: -------------------------------------------------------------------------------- 1 | \frametitle{Predicting} 2 | \protect\hypertarget{predicting}{} 3 | 4 | \begin{itemize} 5 | \tightlist 6 | \item 7 | With the preferred model we can use the traditional predict function 8 | to make predictions on a new data set. 9 | \item 10 | We can use this for all our model types (\texttt{randomForest} and 11 | \texttt{ranger}); although the outputs differ slightly. 12 | \end{itemize} 13 | 14 | \begin{Shaded} 15 | \begin{Highlighting}[] 16 | \CommentTok{# randomForest} 17 | \NormalTok{pred_randomForest <-}\StringTok{ }\KeywordTok{predict}\NormalTok{(ames_randomForest, ames_test)} 18 | \KeywordTok{head}\NormalTok{(pred_randomForest)} 19 | \end{Highlighting} 20 | \end{Shaded} 21 | 22 | \begin{verbatim} 23 | ## 1 2 3 4 5 6 24 | ## 113543.1 185556.4 259258.1 190943.9 179071.0 480952.3 25 | \end{verbatim} 26 | 27 | \begin{Shaded} 28 | \begin{Highlighting}[] 29 | \CommentTok{# ranger} 30 | \NormalTok{pred_ranger <-}\StringTok{ }\KeywordTok{predict}\NormalTok{(ames_ranger, ames_test)} 31 | \KeywordTok{head}\NormalTok{(pred_ranger}\OperatorTok{$}\NormalTok{predictions)} 32 | \end{Highlighting} 33 | \end{Shaded} 34 | 35 | \begin{verbatim} 36 | ## [1] 129258.1 186520.7 265628.2 197745.5 175517.6 392691.7 37 | \end{verbatim} 38 | 39 | -------------------------------------------------------------------------------- /slides/c3_gbm_regression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/c3_gbm_regression.pdf -------------------------------------------------------------------------------- /slides/c3_gbm_regression_short.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/c3_gbm_regression_short.pdf -------------------------------------------------------------------------------- /slides/c3b_gbm_regression_h2o.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Gradient boosting with h2o" 3 | author: "Jan-Philipp Kolb" 4 | date: "24 Mai 2019" 5 | output: ioslides_presentation 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = FALSE) 10 | ``` 11 | 12 | 13 | ## h2o 14 | 15 | ```{r} 16 | library(h2o) # a java-based platform 17 | ``` 18 | 19 | 20 | The h2o R package is a powerful and efficient java-based interface that allows for local and cluster-based deployment. 
It comes with a fairly comprehensive online resource that includes methodology and code documentation along with tutorials. 21 | 22 | ## Features include: 23 | 24 | - Distributed and parallelized computation on either a single node or a multi-node cluster. 25 | - Automatic early stopping based on convergence of user-specified metrics to user-specified relative tolerance. 26 | - Stochastic GBM with column and row sampling (per split and per tree) for better generalization. 27 | - Support for exponential families (Poisson, Gamma, Tweedie) and additional loss functions such as quantile regression (including Laplace), in addition to binomial (Bernoulli), Gaussian and multinomial distributions. 28 | - Grid search for hyperparameter optimization and model selection. 29 | - Data-distributed, which means the entire dataset does not need to fit into memory on a single node, hence it scales to training sets of any size. 30 | - Uses histogram approximations of continuous variables for speedup. 31 | - Uses dynamic binning - bin limits are reset at each tree level based on the split bins’ min and max values discovered during the last pass. 32 | - Uses squared error to determine optimal splits. 33 | 36 | - Unlimited factor levels. 37 | - Multiclass trees (one for each class) built in parallel with each other. 38 | - Apache 2.0 Licensed. 39 | - Model export in plain Java code for deployment in production environments. 40 | 41 | ## 42 | -------------------------------------------------------------------------------- /slides/d_neuralNetworks.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Neural Networks" 3 | author: "Jan-Philipp Kolb" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | fontsize: 10pt 6 | output: 7 | beamer_presentation: 8 | colortheme: dolphin 9 | fig_height: 3 10 | fig_width: 5 11 | fig_caption: no 12 | fonttheme: structuresmallcapsserif 13 | highlight: haddock 14 | theme: Dresden 15 | pdf_document: 16 | keep_tex: yes 17 | toc: yes 18 | slidy_presentation: 19 | css: mycss.css 20 | keep_md: yes 21 | --- 22 | 23 | ```{r setup, include=FALSE} 24 | knitr::opts_chunk$set(echo = T,message=F,warning=F,eval=T) 25 | ``` 26 | 27 | ## Examples of multi-neuron networks 28 | 29 | 30 | ![](figure/neuralnets.PNG) 31 | 32 | 33 | 45 | 64 | 65 | ## Artificial Neuron 66 | 67 | 74 | 75 | - Inputs correspond to raw data values 76 | 79 | - The transfer function sums all the inputs together (cumulative inputs). 80 | - If the summed input values reach a specified threshold, the activation function generates an output signal (all or nothing). 81 | - The output signal then moves to a raw output or other neurons. 82 | - This basic artificial neuron is combined with multiple other artificial neurons to create an ANN. 83 | 84 | 87 | 88 | ![](figure/ArtificialNeuronModel_english.png){height=40%} 89 | 90 | 94 | 95 | 104 | 105 | 117 | 118 | 121 | 122 | 123 | ## Activation Functions 124 | 125 | - The capability of ANNs to learn any function (given sufficient training examples) is dependent on the appropriate selection of the [**activation function(s)**](https://en.wikipedia.org/wiki/Activation_function) present in the network. 126 | - They enable the ANN to learn non-linear properties present in the data. 130 | - The input into the activation function is the weighted sum of the input features from the preceding layer. 131 | - Let $o_j$ be the output of the $j$th neuron in a given layer of a network with $p$ input features.
132 | 133 | $$ 134 | o_j = \Phi(b_j + \sum\limits_{i=1}^{p} w_i x_i) 135 | $$ 136 | 137 | 138 | 139 | ## Common ANN Activation functions 140 | 141 | ![](figure/activations-1.png){height=90%} 142 | 143 | 144 | 149 | 150 | ## The output ($o_j$)... 151 | 152 | - ... can feed into the output layer of a neural network, or in deeper architectures may feed into additional hidden layers. 153 | - The activation function determines if the sum of the weighted inputs plus a bias term is sufficiently large to trigger the firing of the neuron. 154 | - There is no universally best activation function, but researchers have provided guidance on which activation functions work well for ANN solutions to many common problems. 155 | - The choice of the activation function governs the required data scaling necessary for ANN analysis. 156 | 159 | 160 | ## How ANNs Learn 161 | 162 | 166 | 167 | 168 | - We have some features $(X)$ describing an output ($y$) 169 | 172 | - To begin training our single-layer one-neuron neural network we initially randomly assign weights. 173 | - We then run the neural network with the random weights and record the outputs generated. 174 | - This is called a forward pass. The output values, here called $\hat{y}$, are a function of the input values ($X$), the random initial weights ($w$) and our choice of the threshold function ($T$). 175 | 176 | 177 | $$ 178 | \hat{y}= f(X,w,T) 179 | $$ 180 | 181 | 182 | 183 | ## Choice of the performance function 184 | 185 | - Once we have our ANN output values ($\hat{y}$) we can compare them to the data set output values ($y$). 186 | - To do this we use a performance function $P$. 187 | - The choice of the performance function is up to the analyst; here we choose the 188 | 191 | Sum of Squared Errors (SSE). 192 | 193 | -------------------------------------------------------------------------------- /slides/d_neuralNetworks.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/d_neuralNetworks.pdf -------------------------------------------------------------------------------- /slides/e_Clustering.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Clustering" 3 | author: "Jan-Philipp Kolb and Alexander Murray-Watters" 4 | date: "18 January 2019" 5 | output: 6 | slidy_presentation: 7 | keep_md: yes 8 | --- 9 | 10 | ```{r setupClustering, include=FALSE} 11 | knitr::opts_chunk$set(echo = FALSE,eval=F) 12 | ``` 13 | 14 | 15 | 16 | 17 | ## Resources 18 | 19 | 20 | ```{r,echo=F, eval=FALSE} 21 | slides_path <- getwd() 22 | git_path <- gsub("slides","",slides_path) 23 | if (Sys.info()$nodename=="MAC14077"){ 24 | git_path <- "D:/Daten/GitHub/machine_learning/" 25 | slides_path <- paste0(git_path,"/slides") 26 | } 27 | ``` 28 | 29 | 30 | - [Package `kknn`](https://cran.r-project.org/web/packages/kknn/kknn.pdf) 31 | 32 | ```{r,eval=F} 33 | install.packages("kknn") 34 | ``` 35 | 36 | ```{r} 37 | library("kknn") 38 | ``` 39 | 40 | 41 | ## [Geographic clustering of UK cities](https://www.r-bloggers.com/geographic-clustering-of-uk-cities/) 42 | 43 | Animated example: 44 | https://towardsdatascience.com/the-5-clustering-algorithms-data-scientists-need-to-know-a36d136ef68 45 | 46 | 47 | ## Exercise: Kmeans 48 | 49 | Apply kmeans to the `iris` dataset with 2, 3, and 4 50 | clusters. Produce three scatter plots, with the points colored 51 | according to cluster assignment.
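One possible solution sketch (added; not part of the original exercise text), using the four numeric iris columns and base graphics:

```{r kmeans-iris-sketch, echo=TRUE, eval=FALSE}
# fit kmeans with 2, 3 and 4 clusters and colour points by cluster assignment
for (k in 2:4) {
  km <- kmeans(iris[, 1:4], centers = k, nstart = 25)
  plot(iris$Petal.Length, iris$Petal.Width, col = km$cluster,
       pch = 19, main = paste("kmeans with", k, "clusters"))
}
```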
52 | 53 | 54 | 55 | ## hdbscan 56 | 57 | A fairly new alternative to kmeans, hdbscan does not require you to 58 | specify the number of categories to be assigned. It only requires a 59 | decision as to the minimum number of points needed to be included in a 60 | cluster. This minimum number acts as a smoothing parameter (such as a 61 | density bandwidth parameter or a histogram's bin/bar width), with lower 62 | values finding more clusters. Other advantages of hdbscan include its ability to find clusters of varying shape and density, and to label points that do not fit any cluster as noise. 63 | 64 | ```{r, eval=FALSE} 65 | install.packages("dbscan") 66 | ``` 67 | 68 | 69 | 70 | ```{r} 71 | library(ggplot2) 72 | library(dplyr) 73 | library(maps) 74 | library(dbscan) 75 | 76 | ## Toy example: two clusters of very different size and spread. 77 | two.clust.eg <- rbind(matrix(rnorm(1000, sd = 0.8), ncol=2), 78 | matrix(rnorm(100, mean = 120, sd = 0.12), ncol = 2)) 79 | 80 | clust <- kmeans(two.clust.eg, centers=2) 81 | 82 | plot(two.clust.eg, col = clust$cluster) 83 | ## points(clust$centers, col = 1:2, pch = 8, cex = 2) 84 | 85 | 86 | ``` 87 | 88 | ```{r} 89 | 90 | 91 | 92 | 93 | 94 | data(moons) 95 | 96 | ## Running HDBSCAN with the minimum number of points set to 3. 97 | res <- dbscan::hdbscan(moons, minPts = 3) 98 | 99 | plot(moons, col = res$cluster + 1, main="R implementation") 100 | ``` 101 | 102 | 103 | 104 | ## Exercise: Apply kmeans to the moons dataset and compare the results. 105 | -- Be sure to try different numbers of centers. 106 | 107 | 108 | ## Exercise: Apply hdbscan to the moons dataset with different minimums for the number of points. 109 | 110 | ## Exercise: Apply both kmeans and hdbscan to the `ChickWeight` dataset's "weight" "Time" variables, and see how well you can get each to perform. 111 | 112 | 113 | 114 | 115 | ## Solution sketch: kmeans and hdbscan on the `ChickWeight` data 116 | ```{r, eval=FALSE, echo=FALSE} 117 | ## kmeans 118 | plot(ChickWeight[,1:2], col=kmeans(ChickWeight[,1:2], centers=4)$cluster) 119 | 120 | ## hdbscan, minPts=10 121 | plot(ChickWeight[,1:2], col=dbscan::hdbscan(ChickWeight[,1:2], minPts=10)$cluster) 122 | 123 | ## Diet cat. for comparison. 124 | plot(ChickWeight[,1:2], col=ChickWeight$Diet) 125 | 126 | ## Chick cat. for comparison. 127 | plot(ChickWeight[,1:2], col=ChickWeight$Chick) 128 | 129 | 130 | ``` 131 | 132 | 133 | ```{r, eval=FALSE} 134 | load(paste0(git_path,"/data/osmsa_PLZ_14.RData")) 135 | ``` 136 | 137 | 138 | 139 | ## [US Census Data](https://elitedatascience.com/datasets) 140 | 141 | - [US Census Data (Clustering)](https://archive.ics.uci.edu/ml/datasets/US+Census+Data+%281990%29) – Clustering based on demographics is a tried and true way to perform market research and segmentation.
142 | 143 | 144 | 145 | ## Links 146 | 147 | - [Using clusterlab to benchmark clustering algorithms](https://www.r-bloggers.com/using-clusterlab-to-benchmark-clustering-algorithms/) 148 | -------------------------------------------------------------------------------- /slides/e_Clustering.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Clustering" 3 | author: "Jan-Philipp Kolb and Alexander Murray-Watters" 4 | date: "18 January 2019" 5 | output: 6 | slidy_presentation: 7 | keep_md: yes 8 | --- 9 | 10 | 11 | 12 | 13 | 14 | 15 | ## Resources 16 | 17 | 18 | 19 | 20 | 21 | - [Package `kknn`](https://cran.r-project.org/web/packages/kknn/kknn.pdf) 22 | 23 | 24 | 25 | 26 | 27 | 28 | ## [Geographic clustering of UK cities](https://www.r-bloggers.com/geographic-clustering-of-uk-cities/) 29 | 30 | Animated example: 31 | https://towardsdatascience.com/the-5-clustering-algorithms-data-scientists-need-to-know-a36d136ef68 32 | 33 | 34 | ## Exercise: Kmeans 35 | 36 | Apply kmeans to the `iris` dataset with 2, 3, and 4 37 | clusters. Produce three scatter plots, with the points colored 38 | according to cluster assignment. 39 | 40 | 41 | ## hdbscan 42 | 43 | A fairly new alternative to kmeans, hdbscan does not require you to 44 | specify the number of categories to be assigned. It only requires a 45 | decision as to the minimum number of points needed to be included in a 46 | cluster. This minimum number acts as a smoothing parameter (such as a 47 | density bandwidth parameter or a histogram's bin/bar width), with lower 48 | values finding more clusters. Other advantages of hdbscan include its ability to find clusters of varying shape and density, and to label points that do not fit any cluster as noise. 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | ## Exercise: Apply kmeans to the moons dataset and compare the results. 61 | -- Be sure to try different numbers of centers. 62 | 63 | 64 | ## Exercise: Apply hdbscan to the moons dataset with different minimums for the number of points. 65 | 66 | ## Exercise: Apply both kmeans and hdbscan to the `ChickWeight` dataset's "weight" "Time" variables, and see how well you can get each to perform. 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | ## [US Census Data](https://elitedatascience.com/datasets) 79 | 80 | - [US Census Data (Clustering)](https://archive.ics.uci.edu/ml/datasets/US+Census+Data+%281990%29) – Clustering based on demographics is a tried and true way to perform market research and segmentation.
81 | 82 | 83 | 84 | ## Links 85 | 86 | - [Using clusterlab to benchmark clustering algorithms](https://www.r-bloggers.com/using-clusterlab-to-benchmark-clustering-algorithms/) 87 | -------------------------------------------------------------------------------- /slides/figure/3d-coordinate-plane.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/3d-coordinate-plane.png -------------------------------------------------------------------------------- /slides/figure/450px-Overfitting.svg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/450px-Overfitting.svg.png -------------------------------------------------------------------------------- /slides/figure/AmesTableau01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/AmesTableau01.png -------------------------------------------------------------------------------- /slides/figure/ArtificialNeuronModel_english.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ArtificialNeuronModel_english.png -------------------------------------------------------------------------------- /slides/figure/BBRXC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/BBRXC.png -------------------------------------------------------------------------------- /slides/figure/Blausen_0657_MultipolarNeuron.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/Blausen_0657_MultipolarNeuron.png -------------------------------------------------------------------------------- /slides/figure/Decision-Tree-Example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/Decision-Tree-Example.jpg -------------------------------------------------------------------------------- /slides/figure/Diagslr.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/Diagslr.PNG -------------------------------------------------------------------------------- /slides/figure/OneHotEncoding.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/OneHotEncoding.PNG -------------------------------------------------------------------------------- /slides/figure/Overfitting_fig1.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/Overfitting_fig1.PNG -------------------------------------------------------------------------------- /slides/figure/Picture3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/Picture3.jpg -------------------------------------------------------------------------------- /slides/figure/SMLProcess.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/SMLProcess.png -------------------------------------------------------------------------------- /slides/figure/The_Signal_and_the_Noise.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/The_Signal_and_the_Noise.jpg -------------------------------------------------------------------------------- /slides/figure/activation_funs.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/activation_funs.PNG -------------------------------------------------------------------------------- /slides/figure/activations-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/activations-1.png -------------------------------------------------------------------------------- /slides/figure/addins.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/addins.PNG -------------------------------------------------------------------------------- /slides/figure/bagging3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/bagging3.png -------------------------------------------------------------------------------- /slides/figure/bias_variance_tradeoff.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/bias_variance_tradeoff.PNG -------------------------------------------------------------------------------- /slides/figure/bias_variance_tradeoff2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/bias_variance_tradeoff2.png -------------------------------------------------------------------------------- /slides/figure/biglasso.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/biglasso.PNG -------------------------------------------------------------------------------- /slides/figure/book_ml1.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/book_ml1.jpg -------------------------------------------------------------------------------- /slides/figure/boosted-trees-process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/boosted-trees-process.png -------------------------------------------------------------------------------- /slides/figure/boosting-in-action-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/boosting-in-action-1.png -------------------------------------------------------------------------------- /slides/figure/bostondata.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/bostondata.PNG -------------------------------------------------------------------------------- /slides/figure/bostonscaled.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/bostonscaled.PNG -------------------------------------------------------------------------------- /slides/figure/class01-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/class01-1.png -------------------------------------------------------------------------------- /slides/figure/classification_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/classification_regression.png -------------------------------------------------------------------------------- /slides/figure/confusionMatrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/confusionMatrix.png -------------------------------------------------------------------------------- /slides/figure/content_flowchart1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/content_flowchart1.png -------------------------------------------------------------------------------- /slides/figure/datasetsload.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/datasetsload.PNG -------------------------------------------------------------------------------- /slides/figure/decissiontree.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/decissiontree.PNG 
-------------------------------------------------------------------------------- /slides/figure/dplyr_vignette.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/dplyr_vignette.PNG -------------------------------------------------------------------------------- /slides/figure/dt_amesdata.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/dt_amesdata.PNG -------------------------------------------------------------------------------- /slides/figure/duckduckgo.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/duckduckgo.PNG -------------------------------------------------------------------------------- /slides/figure/electoral_precedent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/electoral_precedent.png -------------------------------------------------------------------------------- /slides/figure/ex_regression_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ex_regression_tree.png -------------------------------------------------------------------------------- /slides/figure/expl_rf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/expl_rf.png -------------------------------------------------------------------------------- /slides/figure/factor3vars_visreg.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/factor3vars_visreg.PNG -------------------------------------------------------------------------------- /slides/figure/fig3_loglambda.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/fig3_loglambda.PNG -------------------------------------------------------------------------------- /slides/figure/four_regmods.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/four_regmods.PNG -------------------------------------------------------------------------------- /slides/figure/gbmtopmodelsvars.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/gbmtopmodelsvars.PNG -------------------------------------------------------------------------------- /slides/figure/ggpairs_yacht.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ggpairs_yacht.png -------------------------------------------------------------------------------- /slides/figure/gradient_descent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/gradient_descent.png -------------------------------------------------------------------------------- /slides/figure/influentalValues_lasso.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/influentalValues_lasso.PNG -------------------------------------------------------------------------------- /slides/figure/interplot_wt_disp.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/interplot_wt_disp.PNG -------------------------------------------------------------------------------- /slides/figure/iris.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/iris.png -------------------------------------------------------------------------------- /slides/figure/kyphosis_helppage.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/kyphosis_helppage.PNG -------------------------------------------------------------------------------- /slides/figure/learning_rate_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/learning_rate_comparison.png -------------------------------------------------------------------------------- /slides/figure/limeplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/limeplot.png -------------------------------------------------------------------------------- /slides/figure/magrittr_vignette.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/magrittr_vignette.jpg -------------------------------------------------------------------------------- /slides/figure/ml_emoji.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ml_emoji.png -------------------------------------------------------------------------------- /slides/figure/ml_ice_curves.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ml_ice_curves.png -------------------------------------------------------------------------------- 
/slides/figure/ml_rf_errorrate_m1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ml_rf_errorrate_m1.png -------------------------------------------------------------------------------- /slides/figure/ml_rf_hist_OOB_RMSE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ml_rf_hist_OOB_RMSE.png -------------------------------------------------------------------------------- /slides/figure/ml_rf_varimp_ranger.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ml_rf_varimp_ranger.png -------------------------------------------------------------------------------- /slides/figure/ml_tb_rpart_iris.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ml_tb_rpart_iris.png -------------------------------------------------------------------------------- /slides/figure/mtcars_model_interact.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/mtcars_model_interact.PNG -------------------------------------------------------------------------------- /slides/figure/neuralnetfig.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/neuralnetfig.PNG -------------------------------------------------------------------------------- /slides/figure/neuralnets.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/neuralnets.PNG -------------------------------------------------------------------------------- /slides/figure/nyc_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/nyc_map.png -------------------------------------------------------------------------------- /slides/figure/overview_ml_algorithms.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/overview_ml_algorithms.jpg -------------------------------------------------------------------------------- /slides/figure/package_gbm.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/package_gbm.PNG -------------------------------------------------------------------------------- /slides/figure/pic_hiddenlayers.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/pic_hiddenlayers.PNG -------------------------------------------------------------------------------- /slides/figure/prediction_mtcars.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/prediction_mtcars.PNG -------------------------------------------------------------------------------- /slides/figure/random_trees_fig1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/random_trees_fig1.PNG -------------------------------------------------------------------------------- /slides/figure/reg_3algos.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/reg_3algos.PNG -------------------------------------------------------------------------------- /slides/figure/resid_fitted.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/resid_fitted.PNG -------------------------------------------------------------------------------- /slides/figure/ridgeTop25influentalVars.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ridgeTop25influentalVars.PNG -------------------------------------------------------------------------------- /slides/figure/ridge_coef.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/ridge_coef.png -------------------------------------------------------------------------------- /slides/figure/stargazertabex.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/stargazertabex.PNG -------------------------------------------------------------------------------- /slides/figure/stochastic_gradient_descent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/stochastic_gradient_descent.png -------------------------------------------------------------------------------- /slides/figure/swissfertality.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/swissfertality.PNG -------------------------------------------------------------------------------- /slides/figure/taskviewmachinelearning.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/taskviewmachinelearning.PNG 
-------------------------------------------------------------------------------- /slides/figure/three_algos_complete.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/three_algos_complete.PNG -------------------------------------------------------------------------------- /slides/figure/titanicdata.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/titanicdata.PNG -------------------------------------------------------------------------------- /slides/figure/top-20-r-packages-machine-learning-downloads.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/top-20-r-packages-machine-learning-downloads.jpg -------------------------------------------------------------------------------- /slides/figure/top10gbms.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/top10gbms.PNG -------------------------------------------------------------------------------- /slides/figure/tree-correlation-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/tree-correlation-1.png -------------------------------------------------------------------------------- /slides/figure/tree_m1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/tree_m1.PNG -------------------------------------------------------------------------------- /slides/figure/unsupervisedLearning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/unsupervisedLearning.png -------------------------------------------------------------------------------- /slides/figure/visreg.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/visreg.PNG -------------------------------------------------------------------------------- /slides/figure/visreg2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/visreg2.PNG -------------------------------------------------------------------------------- /slides/figure/visreg_m6.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/visreg_m6.PNG -------------------------------------------------------------------------------- /slides/figure/visregcat.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/visregcat.PNG -------------------------------------------------------------------------------- /slides/figure/visregplot1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/figure/visregplot1.PNG -------------------------------------------------------------------------------- /slides/long/c2_random_forests.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/long/c2_random_forests.pdf -------------------------------------------------------------------------------- /slides/long/d_neuralNetworks.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/long/d_neuralNetworks.pdf -------------------------------------------------------------------------------- /slides/old/A_ml_motiv.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Motivation for Machine Learning" 3 | author: "Jan-Philipp Kolb" 4 | date: "18 Januar 2019" 5 | output: beamer_presentation 6 | --- 7 | 8 | ```{r setupmlmotiv, include=FALSE} 9 | knitr::opts_chunk$set(echo = FALSE) 10 | ``` 11 | 12 | 13 | 14 | 15 | ## [Time measurement](https://www.r-bloggers.com/5-ways-to-measure-running-time-of-r-code/) 16 | 17 | ```{r} 18 | start_time <- Sys.time() 19 | ab <- runif(10000000) 20 | end_time <- Sys.time() 21 | 22 | end_time - start_time 23 | ``` 24 | 25 | 26 | ## How many cores are available 27 | 28 | 29 | ```{r} 30 | library(doParallel) 31 | detectCores() 32 | ``` 33 | 34 | ## 35 | 36 | ```{r} 37 | cl <- makeCluster(detectCores()) 38 | registerDoParallel(cl) 39 | ``` 40 | 41 | ```{r} 42 | start_time <- Sys.time() 43 | ab <- runif(10000000) 44 | end_time <- Sys.time() 45 | 46 | end_time - start_time 47 | ``` 48 | 49 | ```{r} 50 | stopCluster(cl) 51 | ``` 52 | 53 | 54 | ```{r} 55 | ?parallel::makeCluster 56 | ``` 57 | 58 | 59 | 60 | 61 | ## Links 62 | 63 | - [Presentations on ‘Elements of Neural Networks & Deep Learning’ ](https://www.r-bloggers.com/my-presentations-on-elements-of-neural-networks-deep-learning-parts-45/) 64 | 65 | - [Understanding the Magic of Neural Networks](https://www.r-bloggers.com/understanding-the-magic-of-neural-networks/) 66 | 67 | - [Neural Text Modelling with R package ruimtehol](https://www.r-bloggers.com/neural-text-modelling-with-r-package-ruimtehol/) 68 | 69 | - [Feature Selection using Genetic Algorithms in R](https://www.r-bloggers.com/feature-selection-using-genetic-algorithms-in-r/) 70 | 71 | - [Lecture slides: Real-World Data Science (Fraud Detection, Customer Churn & Predictive Maintenance)](https://www.r-bloggers.com/lecture-slides-real-world-data-science-fraud-detection-customer-churn-predictive-maintenance/) 72 | 73 | - [Automated Dashboard for Credit Modelling with Decision trees and Random forests in R](https://www.r-bloggers.com/automated-dashboard-for-credit-modelling-with-decision-trees-and-random-forests-in-r/) 74 | 75 | - [Looking Back at Google’s Research Efforts in 2018](https://ai.googleblog.com/2019/01/looking-back-at-googles-research.html) 76 | 77 | - [Selecting ‘special’ photos on your 
phone](https://www.r-bloggers.com/selecting-special-photos-on-your-phone/) 78 | 79 | 80 | - [Open Source AI, ML & Data Science News](https://www.r-bloggers.com/ai-machine-learning-and-data-science-roundup-january-2019/) 81 | -------------------------------------------------------------------------------- /slides/old/a2_intro_ml.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Introducing Machine Learning " 3 | author: "Jan-Philipp Kolb" 4 | date: "03 May 2019" 5 | output: 6 | slidy_presentation: 7 | keep_md: yes 8 | --- 9 | 10 | 11 | 12 | ## [Modern Machine Learning Algorithms](https://elitedatascience.com/machine-learning-algorithms) 13 | 14 | Categorizing machine learning algorithms is tricky, and there are several reasonable approaches; they can be grouped into generative/discriminative, parametric/non-parametric, supervised/unsupervised, and so on. 15 | 16 | 17 | 20 | 21 | ## [Machine Learning - Components](https://www.linkedin.com/pulse/20140822073217-180198720-6-components-of-a-machine-learning-algorithm) 22 | 23 | - Feature Extraction + Domain knowledge 24 | 25 | - Feature Selection 26 | 27 | - Choice of Algorithm 28 | 29 | Naive Bayes, [Support Vector Machines](https://github.com/Japhilko/DataAnalysis/blob/master/Machine%20Learning/SupportVectorMachines.md), Decision Trees, k-Means Clustering, ... 30 | 31 | - Training 32 | 33 | - Choice of Metrics/Evaluation Criteria 34 | 35 | - Testing 36 | 37 | 38 | ## [Feature selection](https://en.wikipedia.org/wiki/Feature_selection) 39 | 40 | 41 | ## [Supervised vs unsupervised learning](https://towardsdatascience.com/supervised-vs-unsupervised-learning-14f68e32ea8d) 42 | 43 | ### Supervised Learning 44 | 45 | - we have prior knowledge of what the output values for our samples should be. 46 | 47 | 48 | ## Task: Find R-packages 49 | 50 | Go to https://cran.r-project.org/ and search for packages that,... 51 | 52 | - can be used for lasso regression 53 | 54 | 57 | 58 | ## Task View Machine Learning 59 | 60 | 61 | ![](figure/taskviewmachinelearning.PNG) 62 | 63 | 64 | 65 | ## Install all packages of a task view 66 | 67 | 68 | ```r 69 | install.packages("ctv") 70 | ctv::install.views("MachineLearning") 71 | ``` 72 | 73 | ## [Prediction vs. 
Causation in Regression Analysis](https://statisticalhorizons.com/prediction-vs-causation-in-regression-analysis) 74 | 75 | ## Literature for machine learning 76 | 77 | ![](figure/book_ml1.jpg) 78 | 79 | 80 | 88 | 89 | ## Introduction to machine learning with R 90 | 91 | - [Your First Machine Learning Project in R Step-By-Step](https://machinelearningmastery.com/machine-learning-in-r-step-by-step/) 92 | 93 | 94 | - chapter about machine learning in [awesome R](https://awesome-r.com/) 95 | 96 | 97 | - [Shiny App for machine learning](https://www.showmeshiny.com/machlearn/) 98 | 99 | 100 | ## [The Curse of Dimensionality](https://elitedatascience.com/dimensionality-reduction-algorithms) 101 | 102 | ![](figure/3d-coordinate-plane.png) 103 | 104 | 105 | ## Links 106 | 107 | - [Presentations on ‘Elements of Neural Networks & Deep Learning’ ](https://www.r-bloggers.com/my-presentations-on-elements-of-neural-networks-deep-learning-parts-45/) 108 | 109 | - [Understanding the Magic of Neural Networks](https://www.r-bloggers.com/understanding-the-magic-of-neural-networks/) 110 | 111 | - [Neural Text Modelling with R package ruimtehol](https://www.r-bloggers.com/neural-text-modelling-with-r-package-ruimtehol/) 112 | 113 | - [Feature Selection using Genetic Algorithms in R](https://www.r-bloggers.com/feature-selection-using-genetic-algorithms-in-r/) 114 | 115 | - [Lecture slides: Real-World Data Science (Fraud Detection, Customer Churn & Predictive Maintenance)](https://www.r-bloggers.com/lecture-slides-real-world-data-science-fraud-detection-customer-churn-predictive-maintenance/) 116 | 117 | - [Automated Dashboard for Credit Modelling with Decision trees and Random forests in R](https://www.r-bloggers.com/automated-dashboard-for-credit-modelling-with-decision-trees-and-random-forests-in-r/) 118 | 119 | - [Looking Back at Google’s Research Efforts in 2018](https://ai.googleblog.com/2019/01/looking-back-at-googles-research.html) 120 | 121 | - [Selecting ‘special’ photos on your phone](https://www.r-bloggers.com/selecting-special-photos-on-your-phone/) 122 | 123 | 124 | - [Open Source AI, ML & Data Science News](https://www.r-bloggers.com/ai-machine-learning-and-data-science-roundup-january-2019/) 125 | 135 | 136 | - Google`s [Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course/) 137 | 138 | - [A prelude to machine learning](https://eight2late.wordpress.com/2017/02/23/a-prelude-to-machine-learning/) 139 | 140 | - [caret webinar by Max Kuhn - on youtube](https://www.youtube.com/watch?v=7Jbb2ItbTC4) 141 | 142 | - [learn-math-for-data-science](https://elitedatascience.com/learn-math-for-data-science) 143 | - [learn-statistics-for-data-science](https://elitedatascience.com/learn-statistics-for-data-science) 144 | 145 | - [machine-learning-projects-for-beginners](https://elitedatascience.com/machine-learning-projects-for-beginners) 146 | 147 | 148 | - [An Introduction to machine learning](http://www-bcf.usc.edu/~gareth/ISL/) 149 | - [ISLR book](http://www-bcf.usc.edu/~gareth/ISL/ISLR%20Seventh%20Printing.pdf) 150 | -------------------------------------------------------------------------------- /slides/old/a2_intro_ml.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/a2_intro_ml.pdf -------------------------------------------------------------------------------- /slides/old/a_intro_ml.Rmd: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: "Introducing Machine Learning " 3 | author: "Jan-Philipp Kolb" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | beamer_presentation: 7 | colortheme: rose 8 | fonttheme: structurebold 9 | highlight: pygments 10 | theme: Darmstadt 11 | fig_width: 8 12 | fig_height: 4 13 | slidy_presentation: 14 | keep_md: yes 15 | --- 16 | 17 | ```{r setupMlintro, include=FALSE} 18 | knitr::opts_chunk$set(echo = TRUE,cache=T,warning=F) 19 | ``` 20 | 21 | ## Intro Machine Learning 22 | 23 | 26 | 27 | Categorizing machine learning algorithms is tricky 28 | 29 | - ... they can be grouped into generative/discriminative, parametric/non-parametric, supervised/unsupervised, and so on. 30 | 31 | - Scikit-Learn’s documentation page groups algorithms by their learning mechanism. This produces categories such as: Generalized linear models, Support vector machines, nearest neighbors, decision trees, neural networks, ... 32 | 33 | 34 | ## [Machine Learning - Components](https://www.linkedin.com/pulse/20140822073217-180198720-6-components-of-a-machine-learning-algorithm) 35 | 36 | - Feature Extraction + Domain knowledge 37 | 38 | - Feature Selection 39 | 40 | - Choice of Algorithm - e.g. Naive Bayes, [Support Vector Machines](https://github.com/Japhilko/DataAnalysis/blob/master/Machine%20Learning/SupportVectorMachines.md), Decision Trees, k-Means Clustering, ... 41 | 42 | - Training 43 | 44 | - Choice of Metrics/Evaluation Criteria 45 | 46 | - Testing 47 | 48 | 49 | ## [Feature selection](https://elitedatascience.com/dimensionality-reduction-algorithms#feature-selection) 50 | 51 | 54 | 55 | Feature selection is for filtering irrelevant or redundant features from your dataset. The key difference between feature selection and extraction is that feature selection keeps a subset of the original features while feature extraction creates brand new ones. 56 | 57 | To be clear, some supervised algorithms already have built-in feature selection, such as Regularized Regression and Random Forests. Typically, we recommend starting with these algorithms if they fit your task. 58 | 59 | As a stand-alone task, feature selection can be unsupervised (e.g. Variance Thresholds) or supervised (e.g. Genetic Algorithms). You can also combine multiple methods if needed. 60 | 61 | 62 | ## [Supervised vs unsupervised learning](https://towardsdatascience.com/supervised-vs-unsupervised-learning-14f68e32ea8d) 63 | 64 | ### Supervised Learning 65 | 66 | - we have prior knowledge of what the output values for our samples should be. 67 | 68 | ### [Unsupervised Learning](https://lagunita.stanford.edu/c4x/HumanitiesScience/StatLearning/asset/unsupervised.pdf) 69 | 70 | - In unsupervised learning we observe only the features $X_1, X_2,...,X_p$ 71 | . We are not interested in prediction, because we do not have an 72 | associated response variable $Y$. 73 | 74 | 75 | ## Task: Find R-packages 76 | 77 | Go to https://cran.r-project.org/ and search for packages that,... 78 | 79 | - can be used for lasso regression 80 | 81 | 84 | 85 | ## Task View Machine Learning 86 | 87 | 88 | ![](figure/taskviewmachinelearning.PNG) 89 | 90 | 91 | ### Install all packages of a task view 92 | 93 | ```{r,eval=F} 94 | install.packages("ctv") 95 | ctv::install.views("MachineLearning") 96 | ``` 97 | 98 | ## [Prediction vs. 
Causation in Regression Analysis](https://statisticalhorizons.com/prediction-vs-causation-in-regression-analysis) 99 | 100 | ## R-packages needed for machine learning 101 | 102 | - caret: Classification and Regression Training 103 | - ggplot2: Create Elegant Data Visualisations Using the Grammar of Graphics 104 | - mlbench 105 | - class 106 | - caTools 107 | - randomForest 108 | - impute 109 | - ranger 110 | - kernlab 111 | - class 112 | - glmnet 113 | - naivebayes 114 | -rpart 115 | -rpart.plot 116 | 117 | 118 | 119 | 124 | 125 | 126 | 127 | 128 | 136 | 137 | ## Introduction to machine learning with R 138 | 139 | - [Your First Machine Learning Project in R Step-By-Step](https://machinelearningmastery.com/machine-learning-in-r-step-by-step/) 140 | 141 | 142 | - chapter about machine learning in [awesome R](https://awesome-r.com/) 143 | 144 | 145 | - [Shiny App for machine learning](https://www.showmeshiny.com/machlearn/) 146 | 147 | 148 | 149 | ## [Time measurement](https://www.r-bloggers.com/5-ways-to-measure-running-time-of-r-code/) 150 | 151 | ```{r} 152 | start_time <- Sys.time() 153 | ab <- runif(10000000) 154 | end_time <- Sys.time() 155 | 156 | end_time - start_time 157 | ``` 158 | 159 | 160 | ## How many cores are available 161 | 162 | 163 | ```{r} 164 | library(doParallel) 165 | detectCores() 166 | ``` 167 | 168 | ## 169 | 170 | ```{r} 171 | cl <- makeCluster(detectCores()) 172 | registerDoParallel(cl) 173 | ``` 174 | 175 | ```{r} 176 | start_time <- Sys.time() 177 | ab <- runif(10000000) 178 | end_time <- Sys.time() 179 | 180 | end_time - start_time 181 | ``` 182 | 183 | ```{r} 184 | stopCluster(cl) 185 | ``` 186 | 187 | 188 | ```{r,eval=F} 189 | ?parallel::makeCluster 190 | ``` 191 | 192 | 193 | 194 | 195 | ## Links 196 | 197 | - [Presentations on ‘Elements of Neural Networks & Deep Learning’ ](https://www.r-bloggers.com/my-presentations-on-elements-of-neural-networks-deep-learning-parts-45/) 198 | 199 | - [Understanding the Magic of Neural Networks](https://www.r-bloggers.com/understanding-the-magic-of-neural-networks/) 200 | 201 | - [Neural Text Modelling with R package ruimtehol](https://www.r-bloggers.com/neural-text-modelling-with-r-package-ruimtehol/) 202 | 203 | - [Feature Selection using Genetic Algorithms in R](https://www.r-bloggers.com/feature-selection-using-genetic-algorithms-in-r/) 204 | 205 | - [Lecture slides: Real-World Data Science (Fraud Detection, Customer Churn & Predictive Maintenance)](https://www.r-bloggers.com/lecture-slides-real-world-data-science-fraud-detection-customer-churn-predictive-maintenance/) 206 | 207 | - [Automated Dashboard for Credit Modelling with Decision trees and Random forests in R](https://www.r-bloggers.com/automated-dashboard-for-credit-modelling-with-decision-trees-and-random-forests-in-r/) 208 | 209 | - [Looking Back at Google’s Research Efforts in 2018](https://ai.googleblog.com/2019/01/looking-back-at-googles-research.html) 210 | 211 | - [Selecting ‘special’ photos on your phone](https://www.r-bloggers.com/selecting-special-photos-on-your-phone/) 212 | 213 | 214 | - [Open Source AI, ML & Data Science News](https://www.r-bloggers.com/ai-machine-learning-and-data-science-roundup-january-2019/) 215 | 222 | 223 | - Google`s [Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course/) 224 | 225 | - [A prelude to machine learning](https://eight2late.wordpress.com/2017/02/23/a-prelude-to-machine-learning/) 226 | 227 | - [caret webinar on youtube](https://www.youtube.com/watch?v=7Jbb2ItbTC4) 228 | 229 | 
- [beginner-mistakes](https://elitedatascience.com/beginner-mistakes) 230 | 231 | 232 | 235 | 236 | -------------------------------------------------------------------------------- /slides/old/a_intro_ml.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Introducing Machine Learning " 3 | author: "Jan-Philipp Kolb" 4 | date: "03 May 2019" 5 | output: 6 | slidy_presentation: 7 | keep_md: yes 8 | --- 9 | 10 | 11 | 12 | 13 | 14 | ## [Machine Learning - Components](https://www.linkedin.com/pulse/20140822073217-180198720-6-components-of-a-machine-learning-algorithm) 15 | 16 | - Feature Extraction + Domain knowledge 17 | 18 | - Feature Selection 19 | 20 | - Choice of Algorithm 21 | 22 | Naive Bayes, [Support Vector Machines](https://github.com/Japhilko/DataAnalysis/blob/master/Machine%20Learning/SupportVectorMachines.md), Decision Trees, k-Means Clustering, ... 23 | 24 | - Training 25 | 26 | - Choice of Metrics/Evaluation Criteria 27 | 28 | - Testing 29 | 30 | 31 | ## [Feature selection](https://en.wikipedia.org/wiki/Feature_selection) 32 | 33 | 34 | ## [Supervised vs unsupervised learning](https://towardsdatascience.com/supervised-vs-unsupervised-learning-14f68e32ea8d) 35 | 36 | ### Supervised Learning 37 | 38 | - we have prior knowledge of what the output values for our samples should be. 39 | 40 | 41 | ## Task: Find R-packages 42 | 43 | Go to https://cran.r-project.org/ and search for packages that,... 44 | 45 | - can be used for lasso regression 46 | 47 | 50 | 51 | ## Task View Machine Learning 52 | 53 | 54 | ![](figure/taskviewmachinelearning.PNG) 55 | 56 | 57 | 58 | ## Install all packages of a task view 59 | 60 | 61 | ```r 62 | install.packages("ctv") 63 | ctv::install.views("MachineLearning") 64 | ``` 65 | 66 | ## [Prediction vs. 
Causation in Regression Analysis](https://statisticalhorizons.com/prediction-vs-causation-in-regression-analysis) 67 | 68 | ## Literature for machine learning 69 | 70 | ![](figure/book_ml1.jpg) 71 | 72 | 73 | 81 | 82 | ## Introduction to machine learning with R 83 | 84 | - [Your First Machine Learning Project in R Step-By-Step](https://machinelearningmastery.com/machine-learning-in-r-step-by-step/) 85 | 86 | 87 | - chapter about machine learning in [awesome R](https://awesome-r.com/) 88 | 89 | 90 | - [Shiny App for machine learning](https://www.showmeshiny.com/machlearn/) 91 | 92 | 93 | 94 | ## [Time measurement](https://www.r-bloggers.com/5-ways-to-measure-running-time-of-r-code/) 95 | 96 | 97 | ```r 98 | start_time <- Sys.time() 99 | ab <- runif(10000000) 100 | end_time <- Sys.time() 101 | 102 | end_time - start_time 103 | ``` 104 | 105 | ``` 106 | ## Time difference of 1.286074 secs 107 | ``` 108 | 109 | 110 | ## How many cores are available 111 | 112 | 113 | 114 | ```r 115 | library(doParallel) 116 | ``` 117 | 118 | ``` 119 | ## Warning: package 'doParallel' was built under R version 3.5.2 120 | ``` 121 | 122 | ``` 123 | ## Loading required package: foreach 124 | ``` 125 | 126 | ``` 127 | ## Warning: package 'foreach' was built under R version 3.5.1 128 | ``` 129 | 130 | ``` 131 | ## Loading required package: iterators 132 | ``` 133 | 134 | ``` 135 | ## Loading required package: parallel 136 | ``` 137 | 138 | ```r 139 | detectCores() 140 | ``` 141 | 142 | ``` 143 | ## [1] 4 144 | ``` 145 | 146 | ## 147 | 148 | 149 | ```r 150 | cl <- makeCluster(detectCores()) 151 | registerDoParallel(cl) 152 | ``` 153 | 154 | 155 | ```r 156 | start_time <- Sys.time() 157 | ab <- runif(10000000) 158 | end_time <- Sys.time() 159 | 160 | end_time - start_time 161 | ``` 162 | 163 | ``` 164 | ## Time difference of 0.454026 secs 165 | ``` 166 | 167 | 168 | ```r 169 | stopCluster(cl) 170 | ``` 171 | 172 | 173 | 174 | ```r 175 | ?parallel::makeCluster 176 | ``` 177 | 178 | 179 | 180 | 181 | ## Links 182 | 183 | - [Presentations on ‘Elements of Neural Networks & Deep Learning’ ](https://www.r-bloggers.com/my-presentations-on-elements-of-neural-networks-deep-learning-parts-45/) 184 | 185 | - [Understanding the Magic of Neural Networks](https://www.r-bloggers.com/understanding-the-magic-of-neural-networks/) 186 | 187 | - [Neural Text Modelling with R package ruimtehol](https://www.r-bloggers.com/neural-text-modelling-with-r-package-ruimtehol/) 188 | 189 | - [Feature Selection using Genetic Algorithms in R](https://www.r-bloggers.com/feature-selection-using-genetic-algorithms-in-r/) 190 | 191 | - [Lecture slides: Real-World Data Science (Fraud Detection, Customer Churn & Predictive Maintenance)](https://www.r-bloggers.com/lecture-slides-real-world-data-science-fraud-detection-customer-churn-predictive-maintenance/) 192 | 193 | - [Automated Dashboard for Credit Modelling with Decision trees and Random forests in R](https://www.r-bloggers.com/automated-dashboard-for-credit-modelling-with-decision-trees-and-random-forests-in-r/) 194 | 195 | - [Looking Back at Google’s Research Efforts in 2018](https://ai.googleblog.com/2019/01/looking-back-at-googles-research.html) 196 | 197 | - [Selecting ‘special’ photos on your phone](https://www.r-bloggers.com/selecting-special-photos-on-your-phone/) 198 | 199 | 200 | - [Open Source AI, ML & Data Science News](https://www.r-bloggers.com/ai-machine-learning-and-data-science-roundup-january-2019/) 201 | 208 | 209 | - Google`s [Machine Learning Crash 
Course](https://developers.google.com/machine-learning/crash-course/) 210 | 211 | - [A prelude to machine learning](https://eight2late.wordpress.com/2017/02/23/a-prelude-to-machine-learning/) 212 | 213 | - [caret webinar on youtube](https://www.youtube.com/watch?v=7Jbb2ItbTC4) 214 | 215 | - [beginner-mistakes](https://elitedatascience.com/beginner-mistakes) 216 | -------------------------------------------------------------------------------- /slides/old/a_intro_ml.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/a_intro_ml.pdf -------------------------------------------------------------------------------- /slides/old/advanced_regression.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Advanced Regression" 3 | author: "Jan-Philipp Kolb" 4 | date: "18 Januar 2019" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## [Marginal effects](https://www.r-bloggers.com/ggeffects-0-8-0-now-on-cran-marginal-effects-for-regression-models-rstats/) 13 | 14 | - marginal effects for regression models -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression.nav: -------------------------------------------------------------------------------- 1 | \beamer@endinputifotherversion {3.36pt} 2 | \headcommand {\slideentry {0}{0}{1}{1/1}{}{0}} 3 | \headcommand {\beamer@framepages {1}{1}} 4 | \headcommand {\slideentry {0}{0}{2}{2/2}{}{0}} 5 | \headcommand {\beamer@framepages {2}{2}} 6 | \headcommand {\slideentry {0}{0}{3}{3/3}{}{0}} 7 | \headcommand {\beamer@framepages {3}{3}} 8 | \headcommand {\slideentry {0}{0}{4}{4/4}{}{0}} 9 | \headcommand {\beamer@framepages {4}{4}} 10 | \headcommand {\slideentry {0}{0}{5}{5/5}{}{0}} 11 | \headcommand {\beamer@framepages {5}{5}} 12 | \headcommand {\slideentry {0}{0}{6}{6/6}{}{0}} 13 | \headcommand {\beamer@framepages {6}{6}} 14 | \headcommand {\slideentry {0}{0}{7}{7/7}{}{0}} 15 | \headcommand {\beamer@framepages {7}{7}} 16 | \headcommand {\slideentry {0}{0}{8}{8/8}{}{0}} 17 | \headcommand {\beamer@framepages {8}{8}} 18 | \headcommand {\slideentry {0}{0}{9}{9/9}{}{0}} 19 | \headcommand {\beamer@framepages {9}{9}} 20 | \headcommand {\slideentry {0}{0}{10}{10/10}{}{0}} 21 | \headcommand {\beamer@framepages {10}{10}} 22 | \headcommand {\slideentry {0}{0}{11}{11/11}{}{0}} 23 | \headcommand {\beamer@framepages {11}{11}} 24 | \headcommand {\slideentry {0}{0}{12}{12/12}{}{0}} 25 | \headcommand {\beamer@framepages {12}{12}} 26 | \headcommand {\slideentry {0}{0}{13}{13/13}{}{0}} 27 | \headcommand {\beamer@framepages {13}{13}} 28 | \headcommand {\slideentry {0}{0}{14}{14/14}{}{0}} 29 | \headcommand {\beamer@framepages {14}{14}} 30 | \headcommand {\slideentry {0}{0}{15}{15/15}{}{0}} 31 | \headcommand {\beamer@framepages {15}{15}} 32 | \headcommand {\slideentry {0}{0}{16}{16/16}{}{0}} 33 | \headcommand {\beamer@framepages {16}{16}} 34 | \headcommand {\slideentry {0}{0}{17}{17/17}{}{0}} 35 | \headcommand {\beamer@framepages {17}{17}} 36 | \headcommand {\slideentry {0}{0}{18}{18/18}{}{0}} 37 | \headcommand {\beamer@framepages {18}{18}} 38 | \headcommand {\slideentry {0}{0}{19}{19/19}{}{0}} 39 | \headcommand {\beamer@framepages {19}{19}} 40 | \headcommand {\slideentry {0}{0}{20}{20/20}{}{0}} 41 | \headcommand {\beamer@framepages {20}{20}} 42 | \headcommand 
{\slideentry {0}{0}{21}{21/21}{}{0}} 43 | \headcommand {\beamer@framepages {21}{21}} 44 | \headcommand {\slideentry {0}{0}{22}{22/22}{}{0}} 45 | \headcommand {\beamer@framepages {22}{22}} 46 | \headcommand {\slideentry {0}{0}{23}{23/23}{}{0}} 47 | \headcommand {\beamer@framepages {23}{23}} 48 | \headcommand {\slideentry {0}{0}{24}{24/24}{}{0}} 49 | \headcommand {\beamer@framepages {24}{24}} 50 | \headcommand {\slideentry {0}{0}{25}{25/25}{}{0}} 51 | \headcommand {\beamer@framepages {25}{25}} 52 | \headcommand {\slideentry {0}{0}{26}{26/26}{}{0}} 53 | \headcommand {\beamer@framepages {26}{26}} 54 | \headcommand {\slideentry {0}{0}{27}{27/27}{}{0}} 55 | \headcommand {\beamer@framepages {27}{27}} 56 | \headcommand {\slideentry {0}{0}{28}{28/28}{}{0}} 57 | \headcommand {\beamer@framepages {28}{28}} 58 | \headcommand {\slideentry {0}{0}{29}{29/29}{}{0}} 59 | \headcommand {\beamer@framepages {29}{29}} 60 | \headcommand {\slideentry {0}{0}{30}{30/30}{}{0}} 61 | \headcommand {\beamer@framepages {30}{30}} 62 | \headcommand {\slideentry {0}{0}{31}{31/31}{}{0}} 63 | \headcommand {\beamer@framepages {31}{31}} 64 | \headcommand {\slideentry {0}{0}{32}{32/32}{}{0}} 65 | \headcommand {\beamer@framepages {32}{32}} 66 | \headcommand {\slideentry {0}{0}{33}{33/33}{}{0}} 67 | \headcommand {\beamer@framepages {33}{33}} 68 | \headcommand {\slideentry {0}{0}{34}{34/34}{}{0}} 69 | \headcommand {\beamer@framepages {34}{34}} 70 | \headcommand {\slideentry {0}{0}{35}{35/35}{}{0}} 71 | \headcommand {\beamer@framepages {35}{35}} 72 | \headcommand {\slideentry {0}{0}{36}{36/36}{}{0}} 73 | \headcommand {\beamer@framepages {36}{36}} 74 | \headcommand {\slideentry {0}{0}{37}{37/37}{}{0}} 75 | \headcommand {\beamer@framepages {37}{37}} 76 | \headcommand {\slideentry {0}{0}{38}{38/38}{}{0}} 77 | \headcommand {\beamer@framepages {38}{38}} 78 | \headcommand {\slideentry {0}{0}{39}{39/39}{}{0}} 79 | \headcommand {\beamer@framepages {39}{39}} 80 | \headcommand {\slideentry {0}{0}{40}{40/40}{}{0}} 81 | \headcommand {\beamer@framepages {40}{40}} 82 | \headcommand {\slideentry {0}{0}{41}{41/41}{}{0}} 83 | \headcommand {\beamer@framepages {41}{41}} 84 | \headcommand {\slideentry {0}{0}{42}{42/42}{}{0}} 85 | \headcommand {\beamer@framepages {42}{42}} 86 | \headcommand {\slideentry {0}{0}{43}{43/43}{}{0}} 87 | \headcommand {\beamer@framepages {43}{43}} 88 | \headcommand {\slideentry {0}{0}{44}{44/44}{}{0}} 89 | \headcommand {\beamer@framepages {44}{44}} 90 | \headcommand {\slideentry {0}{0}{45}{45/45}{}{0}} 91 | \headcommand {\beamer@framepages {45}{45}} 92 | \headcommand {\slideentry {0}{0}{46}{46/46}{}{0}} 93 | \headcommand {\beamer@framepages {46}{46}} 94 | \headcommand {\slideentry {0}{0}{47}{47/47}{}{0}} 95 | \headcommand {\beamer@framepages {47}{47}} 96 | \headcommand {\slideentry {0}{0}{48}{48/48}{}{0}} 97 | \headcommand {\beamer@framepages {48}{48}} 98 | \headcommand {\slideentry {0}{0}{49}{49/49}{}{0}} 99 | \headcommand {\beamer@framepages {49}{49}} 100 | \headcommand {\slideentry {0}{0}{50}{50/50}{}{0}} 101 | \headcommand {\beamer@framepages {50}{50}} 102 | \headcommand {\slideentry {0}{0}{51}{51/51}{}{0}} 103 | \headcommand {\beamer@framepages {51}{51}} 104 | \headcommand {\slideentry {0}{0}{52}{52/52}{}{0}} 105 | \headcommand {\beamer@framepages {52}{52}} 106 | \headcommand {\slideentry {0}{0}{53}{53/53}{}{0}} 107 | \headcommand {\beamer@framepages {53}{53}} 108 | \headcommand {\slideentry {0}{0}{54}{54/54}{}{0}} 109 | \headcommand {\beamer@framepages {54}{54}} 110 | \headcommand {\slideentry {0}{0}{55}{55/55}{}{0}} 111 
| \headcommand {\beamer@framepages {55}{55}} 112 | \headcommand {\slideentry {0}{0}{56}{56/56}{}{0}} 113 | \headcommand {\beamer@framepages {56}{56}} 114 | \headcommand {\slideentry {0}{0}{57}{57/57}{}{0}} 115 | \headcommand {\beamer@framepages {57}{57}} 116 | \headcommand {\slideentry {0}{0}{58}{58/58}{}{0}} 117 | \headcommand {\beamer@framepages {58}{58}} 118 | \headcommand {\slideentry {0}{0}{59}{59/59}{}{0}} 119 | \headcommand {\beamer@framepages {59}{59}} 120 | \headcommand {\slideentry {0}{0}{60}{60/60}{}{0}} 121 | \headcommand {\beamer@framepages {60}{60}} 122 | \headcommand {\slideentry {0}{0}{61}{61/61}{}{0}} 123 | \headcommand {\beamer@framepages {61}{61}} 124 | \headcommand {\slideentry {0}{0}{62}{62/62}{}{0}} 125 | \headcommand {\beamer@framepages {62}{62}} 126 | \headcommand {\slideentry {0}{0}{63}{63/63}{}{0}} 127 | \headcommand {\beamer@framepages {63}{63}} 128 | \headcommand {\slideentry {0}{0}{64}{64/64}{}{0}} 129 | \headcommand {\beamer@framepages {64}{64}} 130 | \headcommand {\slideentry {0}{0}{65}{65/65}{}{0}} 131 | \headcommand {\beamer@framepages {65}{65}} 132 | \headcommand {\slideentry {0}{0}{66}{66/66}{}{0}} 133 | \headcommand {\beamer@framepages {66}{66}} 134 | \headcommand {\slideentry {0}{0}{67}{67/67}{}{0}} 135 | \headcommand {\beamer@framepages {67}{67}} 136 | \headcommand {\slideentry {0}{0}{68}{68/68}{}{0}} 137 | \headcommand {\beamer@framepages {68}{68}} 138 | \headcommand {\slideentry {0}{0}{69}{69/69}{}{0}} 139 | \headcommand {\beamer@framepages {69}{69}} 140 | \headcommand {\slideentry {0}{0}{70}{70/70}{}{0}} 141 | \headcommand {\beamer@framepages {70}{70}} 142 | \headcommand {\slideentry {0}{0}{71}{71/71}{}{0}} 143 | \headcommand {\beamer@framepages {71}{71}} 144 | \headcommand {\slideentry {0}{0}{72}{72/72}{}{0}} 145 | \headcommand {\beamer@framepages {72}{72}} 146 | \headcommand {\slideentry {0}{0}{73}{73/73}{}{0}} 147 | \headcommand {\beamer@framepages {73}{73}} 148 | \headcommand {\beamer@partpages {1}{73}} 149 | \headcommand {\beamer@subsectionpages {1}{73}} 150 | \headcommand {\beamer@sectionpages {1}{73}} 151 | \headcommand {\beamer@documentpages {73}} 152 | \headcommand {\def \inserttotalframenumber {73}} 153 | -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression.snm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression.snm -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression.tex: -------------------------------------------------------------------------------- 1 | \documentclass[10pt,ignorenonframetext,]{beamer} 2 | \setbeamertemplate{caption}[numbered] 3 | \setbeamertemplate{caption label separator}{: } 4 | \setbeamercolor{caption name}{fg=normal text.fg} 5 | \beamertemplatenavigationsymbolsempty 6 | \usepackage{lmodern} 7 | \usepackage{amssymb,amsmath} 8 | \usepackage{ifxetex,ifluatex} 9 | \usepackage{fixltx2e} % provides \textsubscript 10 | \ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex 11 | \usepackage[T1]{fontenc} 12 | \usepackage[utf8]{inputenc} 13 | \else % if luatex or xelatex 14 | \ifxetex 15 | \usepackage{mathspec} 16 | \else 17 | \usepackage{fontspec} 18 | \fi 19 | \defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase} 20 | \fi 21 | \usetheme[]{Dresden} 22 | \usecolortheme{dolphin} 23 | \usefonttheme{structuresmallcapsserif} 24 | % use upquote if 
available, for straight quotes in verbatim environments 25 | \IfFileExists{upquote.sty}{\usepackage{upquote}}{} 26 | % use microtype if available 27 | \IfFileExists{microtype.sty}{% 28 | \usepackage{microtype} 29 | \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts 30 | }{} 31 | \newif\ifbibliography 32 | \hypersetup{ 33 | pdftitle={Regularization methods}, 34 | pdfauthor={Jan-Philipp Kolb}, 35 | pdfborder={0 0 0}, 36 | breaklinks=true} 37 | \urlstyle{same} % don't use monospace font for urls 38 | 39 | % Prevent slide breaks in the middle of a paragraph: 40 | \widowpenalties 1 10000 41 | \raggedbottom 42 | 43 | \AtBeginPart{ 44 | \let\insertpartnumber\relax 45 | \let\partname\relax 46 | \frame{\partpage} 47 | } 48 | \AtBeginSection{ 49 | \ifbibliography 50 | \else 51 | \let\insertsectionnumber\relax 52 | \let\sectionname\relax 53 | \frame{\sectionpage} 54 | \fi 55 | } 56 | \AtBeginSubsection{ 57 | \let\insertsubsectionnumber\relax 58 | \let\subsectionname\relax 59 | \frame{\subsectionpage} 60 | } 61 | 62 | \setlength{\parindent}{0pt} 63 | \setlength{\parskip}{6pt plus 2pt minus 1pt} 64 | \setlength{\emergencystretch}{3em} % prevent overfull lines 65 | \providecommand{\tightlist}{% 66 | \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} 67 | \setcounter{secnumdepth}{0} 68 | 69 | \title{Regularization methods} 70 | \author{Jan-Philipp Kolb} 71 | \date{17 Mai, 2019} 72 | 73 | \begin{document} 74 | \frame{\titlepage} 75 | 76 | \begin{frame} 77 | 78 | \end{frame} 79 | 80 | \end{document} 81 | -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression.toc: -------------------------------------------------------------------------------- 1 | \beamer@endinputifotherversion {3.36pt} 2 | -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression.vrb: -------------------------------------------------------------------------------- 1 | \frametitle{Further packages} 2 | \protect\hypertarget{further-packages}{} 3 | 4 | \begin{Shaded} 5 | \begin{Highlighting}[] 6 | \CommentTok{# https://cran.rstudio.com/web/packages/biglasso/biglasso.pdf} 7 | \KeywordTok{install.packages}\NormalTok{(}\StringTok{"biglasso"}\NormalTok{)} 8 | \end{Highlighting} 9 | \end{Shaded} 10 | 11 | -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-12-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-12-1.pdf -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-17-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-17-1.pdf -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-20-1.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-20-1.pdf -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-23-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-23-1.pdf -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-25-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-25-1.pdf -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-27-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-27-1.pdf -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-36-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-beamer/unnamed-chunk-36-1.pdf -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-12-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-12-1.png -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-17-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-17-1.png -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-20-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-20-1.png -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-23-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-23-1.png -------------------------------------------------------------------------------- 
/slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-25-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-25-1.png -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-27-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-27-1.png -------------------------------------------------------------------------------- /slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-36-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/b2_lasso_regression_files/figure-slidy/unnamed-chunk-36-1.png -------------------------------------------------------------------------------- /slides/old/b_lasso_regression.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lasso Regression" 3 | author: "Jan-Philipp Kolb" 4 | date: "25 September 2018" 5 | output: beamer_presentation 6 | --- 7 | 8 | ```{r setuplasso, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE,message = F,warning = F) 10 | ``` 11 | 12 | 13 | ## [Lasso Regression](https://en.wikipedia.org/wiki/Lasso_(statistics)) 14 | 15 | ### Lasso - least absolute shrinkage and selection operator 16 | 17 | - lasso is a regression analysis method that performs variable selection and regularization (reduce overfitting) 18 | - We want to enhance prediction accuracy and interpretability of the statistical model. 19 | 20 | 23 | 24 | - We could remove less important variables, after checking that they are not important. 25 | - We can do that manually by examining p-values of coefficients and discarding those variables whose coefficients are not significant. 26 | - This can become tedious for classification problems with many independent variables 27 | 28 | 29 | ## History of lasso 30 | 31 | - Originally introduced in geophysics literature in 1986 32 | - Independently rediscovered and popularized in 1996 by Robert Tibshirani, who coined the term and provided further insights into the observed performance. 33 | 34 | 35 | 36 | Lasso was originally formulated for least squares models and this simple case reveals a substantial amount about the behavior of the estimator, including its relationship to ridge regression and best subset selection and the connections between lasso coefficient estimates and so-called soft thresholding. It also reveals that (like standard linear regression) the coefficient estimates need not be unique if covariates are collinear. 37 | 38 | ## Lasso for other models than least squares 39 | 40 | Though originally defined for least squares, lasso regularization is easily extended to a wide variety of statistical models including generalized linear models, generalized estimating equations, proportional hazards models, and M-estimators, in a straightforward fashion. 
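A minimal sketch of this extension (added here as an illustration, not part of the original slides; the simulated data and object names are made up): `glmnet` takes a `family` argument, so the same L1 penalty can be applied to, for example, a Poisson GLM for count data.

```{r, eval=F}
library(glmnet)
set.seed(1)
# illustrative data: a design matrix and a Poisson-distributed response
x <- matrix(rnorm(100 * 10), 100, 10)
y <- rpois(100, lambda = exp(x[, 1]))
# lasso-penalized Poisson regression; cv.glmnet chooses lambda by cross-validation
cv_fit <- cv.glmnet(x, y, family = "poisson", alpha = 1)
coef(cv_fit, s = "lambda.min")
```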
41 | 42 | - Lasso’s ability to perform subset selection relies on the form of the constraint and has a variety of interpretations including in terms of geometry, Bayesian statistics, and convex analysis. 43 | 44 | The LASSO is closely related to basis pursuit denoising. 45 | 46 | 47 | 48 | 49 | ## What is [lasso regression](http://www.statisticshowto.com/lasso-regression/) 50 | 51 | - Lasso regression uses shrinkage 52 | - data values are shrunk towards a central point 53 | 54 | - [Ridge and lasso regularization work by adding a penalty term to the log likelihood function.](https://eight2late.wordpress.com/2017/07/11/a-gentle-introduction-to-logistic-regression-and-lasso-regularisation-using-r/) 55 | 56 | - A tuning parameter, $\lambda$ controls the strength of the L1 penalty. 57 | 58 | $$ 59 | \sum\limits_{i=1}^n \big( y_i -\beta_0 - \sum\limits_{j=1}^p \beta_jx_{ij} \big)^2 + \lambda \sum\limits_{j=1}^p |\beta_j| = RSS + \lambda\sum\limits_{j=1}^p |\beta_j|. 60 | $$ 61 | 67 | 68 | ## [Regularization](https://en.wikipedia.org/wiki/Regularization_(mathematics)) 69 | 70 | 71 | regularization is the process of adding information in order to solve an ill-posed problem or to prevent [overfitting](https://en.wikipedia.org/wiki/Overfitting). 72 | 73 | ![](figure/450px-Overfitting.svg.png) 74 | 75 | The green line represents an overfitted model and the black line represents a regularized model. While the green line best follows the training data, it is too dependent on that data and it is likely to have a higher error rate on new unseen data, compared to the black line. 76 | 77 | 80 | 81 | 82 | ## [The L1 norm explained](https://stats.stackexchange.com/questions/347257/geometrical-interpretation-of-l1-regression) 83 | 84 | ![](figure/BBRXC.png) 85 | 86 | ## [Ridge Regression and the Lasso](https://www.r-bloggers.com/ridge-regression-and-the-lasso/) 87 | 88 | ```{r} 89 | swiss <- datasets::swiss 90 | x <- model.matrix(Fertility~., swiss)[,-1] 91 | y <- swiss$Fertility 92 | lambda <- 10^seq(10, -2, length = 100) 93 | ``` 94 | 95 | ## Test and train dataset 96 | 97 | ```{r} 98 | library(glmnet) 99 | set.seed(489) 100 | train = sample(1:nrow(x), nrow(x)/2) 101 | test = (-train) 102 | ytest = y[test] 103 | ``` 104 | 105 | 106 | ## A first ols model 107 | 108 | ```{r} 109 | #OLS 110 | swisslm <- lm(Fertility~., data = swiss) 111 | coef(swisslm) 112 | ``` 113 | 114 | ## A ridge model 115 | 116 | ```{r} 117 | #ridge 118 | ridge.mod <- glmnet(x, y, alpha = 0, lambda = lambda) 119 | predict(ridge.mod, s = 0, type = 'coefficients')[1:6,] 120 | ``` 121 | 122 | 123 | ## Lasso regression with package `glmnet` 124 | 125 | ```{r,eval=F} 126 | install.packages("glmnet") 127 | ``` 128 | 129 | ```{r} 130 | library(glmnet) 131 | ``` 132 | 133 | ```{r} 134 | x=matrix(rnorm(100*20),100,20) 135 | g2=sample(1:2,100,replace=TRUE) 136 | fit2=glmnet(x,g2,family="binomial") 137 | ``` 138 | 139 | ```{r,eval=T} 140 | caret::varImp(fit2,lambda=0.0007567) 141 | ``` 142 | 143 | 144 | ## 145 | 146 | - LASSO is a feature selection method. 147 | 150 | - LASSO regression has inbuilt penalization functions to reduce overfitting. 151 | 154 | 155 | 156 | ## 157 | 158 | - The logarithmic function is used for the link between probability and logits 159 | 160 | - The Logit function is used to [linearize sigmoid curves](https://de.wikipedia.org/wiki/Logit). 
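A short sketch of that link (added for illustration; `qlogis()` and `plogis()` are base R's logit and inverse-logit functions):

```{r, eval=F}
p <- seq(0.01, 0.99, by = 0.01)
logit_p <- qlogis(p)           # log(p / (1 - p)): maps (0, 1) onto the real line
all.equal(plogis(logit_p), p)  # the inverse logit (sigmoid) recovers the probabilities
plot(p, logit_p, type = "l", xlab = "probability p", ylab = "logit(p)")
```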
161 | 162 | 165 | 166 | ## The package `caret` 167 | 168 | - Classification and Regression Training 169 | 170 | ```{r,eval=F} 171 | install.packages("caret") 172 | ``` 173 | 174 | ```{r} 175 | library("caret") 176 | ``` 177 | 178 | - [**Vignette `caret` package **](https://cran.r-project.org/web/packages/caret/vignettes/caret.html) - 179 | 180 | ## 181 | 182 | ```{r,eval=F} 183 | ?caret::train 184 | ``` 185 | 186 | 187 | 188 | ```{r,eval=F} 189 | logit<-train(,data = gp.train.c, 190 | method = 'glm', 191 | family = 'binomial', 192 | trControl = ctrl0)") 193 | ``` 194 | 195 | 196 | ## Further packages 197 | 198 | ```{r,eval=F} 199 | # https://cran.rstudio.com/web/packages/biglasso/biglasso.pdf 200 | install.packages("biglasso") 201 | ``` 202 | 203 | 204 | 205 | ## Links 206 | 207 | 208 | [A comprehensive beginners guide for Linear, Ridge and Lasso Regression](https://www.analyticsvidhya.com/blog/2017/06/a-comprehensive-guide-for-linear-ridge-and-lasso-regression/) 209 | 210 | - Course for statistical learning - [Youtube - Videos](https://www.r-bloggers.com/in-depth-introduction-to-machine-learning-in-15-hours-of-expert-videos/) 211 | 212 | - [pcLasso: a new method for sparse regression](https://www.r-bloggers.com/pclasso-a-new-method-for-sparse-regression/) 213 | 214 | - [Youtube - lasso regression - clearly explained](https://www.youtube.com/watch?v=NGf0voTMlcs) 215 | 216 | - [Glmnet Vignette](https://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html) 217 | 218 | - [Regularization Methods in R](https://www.geo.fu-berlin.de/en/v/soga/Geodata-analysis/multiple-regression/Regularization-Methods/Regularization-Methods-in-R/index.html) 219 | 220 | - [A gentle introduction to logistic regression and lasso regularisation using R](https://eight2late.wordpress.com/2017/07/11/a-gentle-introduction-to-logistic-regression-and-lasso-regularisation-using-r/) 221 | 222 | - [Penalized Regression in R](https://machinelearningmastery.com/penalized-regression-in-r/) 223 | 224 | - [Penalized Logistic Regression Essentials in R](http://www.sthda.com/english/articles/36-classification-methods-essentials/149-penalized-logistic-regression-essentials-in-r-ridge-lasso-and-elastic-net/) -------------------------------------------------------------------------------- /slides/old/c_bagging_boosting_trees.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/c_bagging_boosting_trees.pdf -------------------------------------------------------------------------------- /slides/old/caret.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The package caret" 3 | author: "Jan-Philipp Kolb" 4 | date: "21 November 2018" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## Loading the libraries 13 | 14 | ```{r} 15 | library(MLmetrics) 16 | library(party) 17 | library(partykit) 18 | library(caret) 19 | ``` 20 | 21 | ## An example dataset 22 | 23 | ```{r} 24 | n <-1000 25 | gp.train.c <- data.frame(D_dropout=as.factor(sample(c("yes","no"),n,replace=T)), 26 | sd_habit=runif(n), 27 | sd_identify=runif(n), 28 | another_var=as.factor(sample(c("yes","no","maybe"),n,replace=T))) 29 | ``` 30 | 31 | ## Preliminaries 32 | 33 | - No missing values are possible in the dataset 34 | 35 | 36 | ## 37 | 38 | ```{r} 39 | cvIndex <- caret::createFolds(gp.train.c$D_dropout, 10, 
returnTrain = T) 40 | fiveStats <- function(...) c(twoClassSummary(...), defaultSummary(...)) 41 | 42 | ctrl <- caret::trainControl(method = "cv", 43 | number = 10, 44 | index = cvIndex, 45 | summaryFunction = fiveStats, 46 | classProbs = TRUE) 47 | ``` 48 | 49 | 50 | 51 | 52 | 53 | ```{r} 54 | grid <- expand.grid(alpha = c(0,1), 55 | lambda = seq(0.5,0,length=50)) 56 | ``` 57 | 58 | 59 | ```{r} 60 | lasso<-caret::train( D_dropout ~ sd_habit + sd_identify +another_var , 61 | data=gp.train.c,method ='glmnet', 62 | family= 'binomial',trControl = ctrl, 63 | tuneGrid = grid,metric = 'Kappa') 64 | ``` 65 | 66 | -------------------------------------------------------------------------------- /slides/old/conditional_inference_trees.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Conditional Inference Trees" 3 | author: "Jan-Philipp Kolb" 4 | date: "28 Juni 2019" 5 | output: pdf_presentation 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | 13 | ## [ctree example](https://datawookie.netlify.com/blog/2013/05/package-party-conditional-inference-trees/) 14 | 15 | ```{r,eval=F} 16 | install.packages("party") 17 | ``` 18 | 19 | ## The data behind 20 | 21 | ```{r} 22 | airq <- subset(airquality, !is.na(Ozone)) 23 | summary(airq$Temp) 24 | ``` 25 | 26 | ## A first model 27 | 28 | ```{r} 29 | library(party) 30 | ``` 31 | 32 | 33 | ```{r} 34 | air.ct <- ctree(Ozone ~ ., data = airq, controls = ctree_control(maxsurrogate = 3)) 35 | ``` 36 | 37 | 38 | ## The plot for `ctree` 39 | 40 | ```{r} 41 | plot(air.ct) 42 | ``` 43 | 44 | 45 | 46 | 47 | ## Recursive partitioning algorithms are special cases of a 48 | simple two-stage algorithm 49 | 50 | - First partition the observations by univariate splits in a recursive way and 51 | - second fit a constant model in each cell of the resulting partition. 
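A minimal sketch of this two-stage view, reusing the `airq` data created above (the `partykit` package shown on the following slides is assumed to be installed): the splits define a partition of the observations, and the fitted value in each terminal node is simply a constant, here the node mean of `Ozone`.

```{r,eval=F}
library(partykit)
airq <- subset(airquality, !is.na(Ozone))
# stage 1: recursive univariate splits define a partition of the observations
air_ct <- ctree(Ozone ~ ., data = airq)
# stage 2: a constant model is fitted in each cell - here the node mean of Ozone
node_id <- predict(air_ct, type = "node")  # terminal node of each observation
tapply(airq$Ozone, node_id, mean)          # the fitted constants per node
head(predict(air_ct))                      # predictions are exactly these node means
```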
52 | 53 | 54 | ## [`ctree` - Regression](https://stats.stackexchange.com/questions/171301/interpreting-ctree-partykit-output-in-r) 55 | 56 | ```{r} 57 | library(partykit) 58 | ``` 59 | 60 | ```{r,eval=F} 61 | ?ctree 62 | ``` 63 | 64 | ```{r} 65 | airq <- subset(airquality, !is.na(Ozone)) 66 | airct <- ctree(Ozone ~ ., data = airq) 67 | plot(airct, type = "simple") 68 | ``` 69 | 70 | ## Links 71 | 72 | - [**Vignette**](https://cran.r-project.org/web/packages/partykit/vignettes/ctree.pdf) for package `partykit` 73 | 74 | - [Conditional Inference Trees](https://rpubs.com/awanindra01/ctree) -------------------------------------------------------------------------------- /slides/old/doParallel.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "parallel" 3 | author: "Jan-Philipp Kolb" 4 | date: "30 Januar 2019" 5 | output: beamer_presentation 6 | --- 7 | 8 | ```{r, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | 13 | ## [Time measurement](https://www.r-bloggers.com/5-ways-to-measure-running-time-of-r-code/) 14 | 15 | ```{r} 16 | start_time <- Sys.time() 17 | ab <- runif(10000000) 18 | end_time <- Sys.time() 19 | 20 | end_time - start_time 21 | ``` 22 | 23 | 24 | ## How many cores are available 25 | 26 | 27 | ```{r} 28 | library(doParallel) 29 | detectCores() 30 | ``` 31 | 32 | ## 33 | 34 | ```{r} 35 | cl <- makeCluster(detectCores()) 36 | registerDoParallel(cl) 37 | ``` 38 | 39 | ```{r} 40 | start_time <- Sys.time() 41 | ab <- runif(10000000) 42 | end_time <- Sys.time() 43 | 44 | end_time - start_time 45 | ``` 46 | 47 | ```{r} 48 | stopCluster(cl) 49 | ``` 50 | 51 | 52 | ```{r} 53 | ?parallel::makeCluster 54 | ``` 55 | 56 | -------------------------------------------------------------------------------- /slides/old/evaluation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/evaluation.pdf -------------------------------------------------------------------------------- /slides/old/gradient_boosting.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Gradient Boosting" 3 | author: "Jan-Philipp Kolb" 4 | date: "4 September 2018" 5 | output: beamer_presentation 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = FALSE) 10 | ``` 11 | 12 | 13 | ## [Gradient boosting](https://en.wikipedia.org/wiki/Gradient_boosting) 14 | 15 | Gradient boosting is a machine learning technique for regression and classification problems, which produces a prediction model in the form of an ensemble of weak prediction models, typically decision trees. It builds the model in a stage-wise fashion like other boosting methods do, and it generalizes them by allowing optimization of an arbitrary differentiable loss function. 16 | 17 | The idea of gradient boosting originated in the observation by Leo Breiman that boosting can be interpreted as an optimization algorithm on a suitable cost function. 18 | 19 | 20 | Breiman, L. (1997). "Arcing The Edge". Technical Report 486. Statistics Department, University of California, Berkeley. 21 | 22 | 23 | 25 | 26 | ## Explicit algorithms 27 | 28 | Explicit regression gradient boosting algorithms were subsequently developed by Jerome H. 
Friedman,[2][3] simultaneously with the more general functional gradient boosting perspective of Llew Mason, Jonathan Baxter, Peter Bartlett and Marcus Frean.[4][5] 29 | 30 | 31 | The latter two papers introduced the view of boosting algorithms as iterative functional gradient descent algorithms. That is, algorithms that optimize a cost function over function space by iteratively choosing a function (weak hypothesis) that points in the negative gradient direction. This functional gradient view of boosting has led to the development of boosting algorithms in many areas of machine learning and statistics beyond regression and classification. 32 | 33 | 34 | ## [**Advantages of gradient boosting**](http://uc-r.github.io/gbm_regression) 35 | 36 | - Often provides predictive accuracy that cannot be beat. 37 | - Lots of flexibility - can optimize on different loss functions and provides several hyperparameter tuning options that make the function fit very flexible. 38 | - No data pre-processing required - often works great with categorical and numerical values as is. 39 | - Handles missing data - imputation not required. 40 | 41 | ## [**Disadvantages**](http://uc-r.github.io/gbm_regression) of gradient boosting 42 | 43 | 44 | - GBMs will continue improving to minimize all errors. This can overemphasize outliers and cause overfitting. Must use cross-validation to neutralize. 45 | - Computationally expensive - GBMs often require many trees (>1000) which can be time and memory exhaustive. 46 | - The high flexibility results in many parameters that interact and influence heavily the behavior of the approach (number of iterations, tree depth, regularization parameters, etc.). This requires a large grid search during tuning. 47 | - Less interpretable although this is easily addressed with various tools (variable importance, partial dependence plots, LIME, etc.). 48 | 49 | 50 | ## Two types of errors for tree methods 51 | 52 | ### Bias related errors 53 | 54 | - Adaptive boosting 55 | - Gradient boosting 56 | 57 | ### Variance related errors 58 | 59 | - Bagging 60 | - Random forest 61 | 62 | 69 | 70 | 71 | 72 | ## [Gradient Boosting for Linear Regression - why does it not work?](https://stats.stackexchange.com/questions/186966/gradient-boosting-for-linear-regression-why-does-it-not-work) 73 | 74 | While learning about Gradient Boosting, I haven't heard about any constraints regarding the properties of a "weak classifier" that the method uses to build and ensemble model. However, I could not imagine an application of a GB that uses linear regression, and in fact when I've performed some tests - it doesn't work. I was testing the most standard approach with a gradient of sum of squared residuals and adding the subsequent models together. 75 | 76 | The obvious problem is that the residuals from the first model are populated in such manner that there is really no regression line to fit anymore. My another observation is that a sum of subsequent linear regression models can be represented as a single regression model as well (adding all intercepts and corresponding coefficients) so I cannot imagine how that could ever improve the model. The last observation is that a linear regression (the most typical approach) is using sum of squared residuals as a loss function - the same one that GB is using. 
77 | 78 | I also thought about lowering the learning rate or using only a subset of predictors for each iteration, but that could still be summed up to a single model representation eventually, so I guess it would bring no improvement. 79 | 80 | What am I missing here? Is linear regression somehow inappropriate to use with Gradient Boosting? Is it because the linear regression uses the sum of squared residuals as a loss function? Are there any particular constraints on the weak predictors so they can be applied to Gradient Boosting? 81 | 82 | 83 | 84 | ## Links 85 | 86 | - [**Gradient Boosting Machines**](http://uc-r.github.io/gbm_regression) 87 | 88 | 89 | - [How to Visualize Gradient Boosting Decision Trees With XGBoost in Python](https://machinelearningmastery.com/visualize-gradient-boosting-decision-trees-xgboost-python/) 90 | 91 | 92 | -------------------------------------------------------------------------------- /slides/old/gradient_boosting.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/gradient_boosting.pdf -------------------------------------------------------------------------------- /slides/old/lasso_regression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/lasso_regression.pdf -------------------------------------------------------------------------------- /slides/old/logit_model.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Logit model" 3 | author: "Jan-Philipp Kolb" 4 | date: "4 September 2018" 5 | output: beamer_presentation 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = FALSE) 10 | ``` 11 | 12 | ## 13 | 14 | - The logarithmic function is used for the link between probability and logits 15 | 16 | - The Logit function is used to [linearize sigmoid curves](https://de.wikipedia.org/wiki/Logit). 
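A small numerical illustration of this link in base R: `qlogis()` maps probabilities to logits (log-odds) and `plogis()` is the inverse transformation that maps logits back to probabilities.

```{r}
p <- c(0.1, 0.5, 0.9)   # probabilities
(lo <- qlogis(p))       # logits: log(p / (1 - p))
plogis(lo)              # the inverse logit recovers the probabilities
```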
17 | 18 | 21 | 22 | ## The package `caret` 23 | 24 | - Classification and Regression Training 25 | 26 | ```{r,eval=F} 27 | install.packages("caret") 28 | ``` 29 | 30 | ```{r} 31 | library("caret") 32 | ``` 33 | 34 | - [**Vignette of the `caret` package**](https://cran.r-project.org/web/packages/caret/vignettes/caret.html) 35 | 36 | ## 37 | 38 | ```{r,eval=F} 39 | ?caret::train 40 | ``` 41 | 42 | 43 | 44 | ```{r,eval=F} 45 | logit <- train(D_dropout ~ ., data = gp.train.c, # assumed formula: dropout vs. all other predictors 46 | method = 'glm', 47 | family = 'binomial', 48 | trControl = ctrl0) 49 | ``` 50 | 51 | -------------------------------------------------------------------------------- /slides/old/ml_part1.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning with R - part 1" 3 | author: "Jan-Philipp Kolb" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | fontsize: 10pt 6 | output: 7 | beamer_presentation: 8 | colortheme: dolphin 9 | fig_height: 3 10 | fig_width: 5 11 | fig_caption: no 12 | fonttheme: structuresmallcapsserif 13 | highlight: haddock 14 | theme: Dresden 15 | pdf_document: 16 | keep_tex: yes 17 | toc: yes 18 | slidy_presentation: 19 | css: mycss.css 20 | keep_md: yes 21 | --- 22 | 23 | ```{r, include=FALSE} 24 | knitr::opts_chunk$set(echo = FALSE,message = F,warning=F) 25 | ``` 26 | 27 | # Introduction to R 28 | 29 | ```{r child = 'a1_intro_r.Rmd'} 30 | ``` 31 | 32 | # Introduction to machine learning 33 | 34 | ```{r child = 'a2_intro_ml.Rmd'} 35 | ``` 36 | 37 | # Simple regression 38 | 39 | ```{r child = 'b1_regression.Rmd'} 40 | ``` 41 | -------------------------------------------------------------------------------- /slides/old/ml_part1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/slides/old/ml_part1.pdf -------------------------------------------------------------------------------- /slides/old/random_forests.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Decision Trees and Random Forests" 3 | author: "Jan-Philipp Kolb" 4 | date: "1 October 2018" 5 | output: beamer_presentation 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = FALSE) 10 | ``` 11 | 12 | 13 | ## [Decision Trees](http://www.statmethods.net/advstats/cart.html) 14 | 15 | [Regression tree vs. classification tree](http://www.statmethods.net/advstats/cart.html) 16 | 17 | 18 | ```{r} 19 | library(rpart) 20 | ``` 21 | 22 | Grow a tree 23 | 24 | ```{r} 25 | fit <- rpart(Kyphosis ~ Age + Number + Start, 26 | method="class", data=kyphosis) 27 | 28 | printcp(fit) # display the results 29 | plotcp(fit) # visualize cross-validation results 30 | summary(fit) # detailed summary of splits 31 | ``` 32 | 33 | ```{r} 34 | # plot tree 35 | plot(fit, uniform=TRUE, 36 | main="Classification Tree for Kyphosis") 37 | text(fit, use.n=TRUE, all=TRUE, cex=.8) 38 | ``` 39 | 40 | [Decision Trees and Random Forest](https://cran.r-project.org/doc/contrib/Zhao_R_and_data_mining.pdf) 41 | 42 | 43 | 44 | ## [Random Forest](https://www.datascience.com/resources/notebooks/random-forest-intro) 45 | 46 | > Random forest aims to reduce the previously mentioned correlation issue by choosing only a subsample of the feature space at each split. Essentially, it aims to make the trees de-correlated and prune the trees by setting a stopping criteria for node splits, which I will cover in more detail later. 
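A short sketch of how this feature subsampling looks in practice (assuming the `randomForest` package is installed): the `mtry` argument sets how many predictors are drawn as split candidates at each node, and it is what de-correlates the trees.

```{r,eval=F}
library(randomForest)
set.seed(42)
# mtry = 2: only two of the four iris predictors are considered at each split
rf_decor  <- randomForest(Species ~ ., data = iris, mtry = 2, ntree = 500)
# mtry = 4: all predictors are considered - essentially bagged trees
rf_bagged <- randomForest(Species ~ ., data = iris, mtry = 4, ntree = 500)
rf_decor$err.rate[500, "OOB"]   # out-of-bag error with de-correlated trees
rf_bagged$err.rate[500, "OOB"]  # out-of-bag error of the bagging-like forest
```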
47 | 48 | ## [Random forest](https://en.wikipedia.org/wiki/Random_forest) 49 | 50 | - Ensemble learning method - multitude of decision trees 51 | - Random forests correct for decision trees' habit of overfitting to their training set. 52 | 53 | 54 | ![](figure/expl_rf.png) 55 | 56 | 57 | 60 | 61 | 62 | ```{r,eval=F} 63 | install.packages("randomForest") 64 | # https://www.instituteofanalytics.com/forum/uploads/editor/ls/4kivialj5lvj.pdf 65 | # devtools::install_github('araastat/reprtree') 66 | ``` 67 | 68 | 69 | 70 | ```{r,eval=F} 71 | library(randomForest) 72 | library(reprtree) 73 | 74 | model <- randomForest(Species ~ ., data=iris, importance=TRUE, ntree=500, mtry = 2, do.trace=100) 75 | 76 | reprtree:::plot.getTree(model) 77 | ``` 78 | 79 | 86 | 87 | 88 | ## Random forests in package `caret` 89 | 90 | - [models: A List of Available Models in train](https://rdrr.io/cran/caret/man/models.html) 91 | 92 | - [Practical guide to implement machine learning with CARET package in R](https://www.analyticsvidhya.com/blog/2016/12/practical-guide-to-implement-machine-learning-with-caret-package-in-r-with-practice-problem/) 93 | 94 | 95 | ## Links 96 | 97 | - [The Random Forest Algorithm](https://towardsdatascience.com/the-random-forest-algorithm-d457d499ffcd) 98 | 99 | - CRAN Task View [Machine & Statistical Learning](http://cran.r-project.org/web/views/MachineLearning.html) -------------------------------------------------------------------------------- /slides/old/supervised_learning.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Part 1 - Intro Supervised Learning" 3 | author: "Jan-Philipp Kolb" 4 | date: "2 4 2019" 5 | output: ioslides_presentation 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = FALSE,message = F) 10 | ``` 11 | 12 | ## [Supervised Learning](https://www.datasciencecentral.com/profiles/blogs/supervised-learning-everything-you-need-to-know) 13 | 14 | ![](figure/SMLProcess.png) 15 | 16 | ## R-packages for machine learning 17 | 18 | ![](figure/top-20-r-packages-machine-learning-downloads.jpg) 19 | 20 | 21 | ## [k-nearest neighbour](https://www.r-bloggers.com/k-nearest-neighbor-step-by-step-tutorial/) 22 | 23 | ```{r} 24 | library(caret) 25 | library(e1071) 26 | ``` 27 | 28 | 29 | ```{r} 30 | data1 <- read.csv("../data/US Presidential Data.csv") 31 | ``` 32 | 33 | ```{r} 34 | #Partitioning the data into training and validation data 35 | set.seed(101) 36 | index = createDataPartition(data1$Win.Loss, p = 0.7, list = F ) 37 | train = data1[index,] 38 | validation = data1[-index,] 39 | ``` 40 | 41 | ```{r} 42 | # Explore data 43 | dim(train) 44 | dim(validation) 45 | names(train) 46 | head(train) 47 | head(validation) 48 | ``` 49 | 50 | 51 | ```{r} 52 | # Setting levels for both training and validation data 53 | levels(train$Win.Loss) <- make.names(levels(factor(train$Win.Loss))) 54 | levels(validation$Win.Loss) <- make.names(levels(factor(validation$Win.Loss))) 55 | ``` 56 | 57 | ## [](https://www.dataiku.com/learn/guide/academy/machine-learning/identify_clusters.html) 58 | 59 | ![How to identify clusters and name them](figure/nyc_map.png) 60 | 61 | 62 | ## Links 63 | 64 | - [Your First Machine Learning Project in R Step-By-Step](https://machinelearningmastery.com/machine-learning-in-r-step-by-step/) 65 | 66 | - [Top 20 R Machine Learning and Data Science packages](https://www.kdnuggets.com/2015/06/top-20-r-machine-learning-packages.html) 67 | 68 | - [Statistical NLP on 
OpenStreetMap](https://machinelearnings.co/statistical-nlp-on-openstreetmap-b9d573e6cc86) 69 | 70 | - [How to identify clusters and name them](https://www.dataiku.com/learn/guide/academy/machine-learning/identify_clusters.html) 71 | 72 | - [Setting the course for Machine Learning](https://blog.arup.io/setting-the-course-for-machine-learning-760133aa334d) 73 | 74 | - [The provision of urban green space and its accessibility: Spatial data effects in Brussels](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0204684) 75 | 76 | - [Residential scene classification for gridded population sampling in developing countries using deep convolutional neural networks on satellite imagery](https://ij-healthgeographics.biomedcentral.com/articles/10.1186/s12942-018-0132-1) 77 | 78 | 79 | - [Using Convolutional Neural Networks to detect features in satellite images](http://ataspinar.com/2017/12/04/using-convolutional-neural-networks-to-detect-features-in-sattelite-images/) 80 | 81 | 82 | -------------------------------------------------------------------------------- /tutorial/g_ml_applying_algorithms.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Applying machine learning algorithms – exercises" 3 | author: "Jan-Philipp Kolb" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | fontsize: 10pt 6 | output: 7 | beamer_presentation: 8 | colortheme: dolphin 9 | fig_height: 3 10 | fig_width: 5 11 | fig_caption: no 12 | fonttheme: structuresmallcapsserif 13 | highlight: haddock 14 | theme: Dresden 15 | pdf_document: 16 | keep_tex: yes 17 | toc: yes 18 | slidy_presentation: 19 | css: mycss.css 20 | keep_md: yes 21 | --- 22 | 23 | ```{r setup, include=FALSE} 24 | knitr::opts_chunk$set(echo = TRUE) 25 | ``` 26 | 27 | ## [](https://www.r-exercises.com/2017/09/15/applying-machine-learning-algorithms-exercises/) 28 | 29 | ### Exercise 1 30 | 31 | Create a list named “control” that runs a 10-fold cross-validation. HINT: Use trainControl(). 32 | 33 | ### Exercise 2 34 | 35 | Use the metric of “Accuracy” to evaluate models. 36 | 37 | ### Exercise 3 38 | 39 | Build the “LDA”, “CART”, “kNN”, “SVM” and “RF” models. 40 | 41 | ### Exercise 4 42 | 43 | Create a list of the 5 models you just built and name it “results”. HINT: Use `resamples()`. 44 | 45 | ### Exercise 5 46 | 47 | Report the accuracy of each model by using the summary function on the list “results”. HINT: Use summary(). 48 | 49 | ### Exercise 6 50 | 51 | Create a plot of the model evaluation results and compare the spread and the mean accuracy of each model. HINT: Use dotplot(). 52 | 53 | ### Exercise 7 54 | 55 | Which model seems to be the most accurate? 56 | 57 | ### Exercise 8 58 | 59 | Summarize the results of the best model and print them. HINT: Use print(). 60 | 61 | ### Exercise 9 62 | 63 | Run the “LDA” model directly on the validation set to create a factor named “predictions”. HINT: Use predict(). 64 | 65 | ### Exercise 10 66 | 67 | Summarize the results in a confusion matrix. HINT: Use `confusionMatrix()`. 
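The exercises above do not fix a dataset; a sketch of what the first exercises could look like on the built-in `iris` data (the data used in the tutorial the exercises are taken from), assuming `caret` and the respective model packages are installed:

```{r,eval=F}
library(caret)
control <- trainControl(method = "cv", number = 10)   # Exercise 1
metric <- "Accuracy"                                   # Exercise 2
set.seed(7)
# Exercise 3 (three of the five requested models, as an illustration)
fit.lda  <- train(Species ~ ., data = iris, method = "lda",
                  metric = metric, trControl = control)
fit.cart <- train(Species ~ ., data = iris, method = "rpart",
                  metric = metric, trControl = control)
fit.knn  <- train(Species ~ ., data = iris, method = "knn",
                  metric = metric, trControl = control)
results <- resamples(list(lda = fit.lda, cart = fit.cart,
                          knn = fit.knn))              # Exercise 4
summary(results)                                       # Exercise 5
dotplot(results)                                       # Exercise 6
```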
68 | -------------------------------------------------------------------------------- /tutorial/ml_exercises.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/tutorial/ml_exercises.pdf -------------------------------------------------------------------------------- /tutorial/ml_exercises_a1_introR.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "ML exercises - basics R" 3 | author: "Jan-Philipp Kolb" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | fontsize: 10pt 6 | output: 7 | slidy_presentation: 8 | css: mycss.css 9 | keep_md: yes 10 | pdf_document: 11 | keep_tex: yes 12 | toc: yes 13 | beamer_presentation: 14 | colortheme: dolphin 15 | fig_height: 3 16 | fig_width: 5 17 | fig_caption: no 18 | fonttheme: structuresmallcapsserif 19 | highlight: haddock 20 | theme: Dresden 21 | --- 22 | 23 | ```{r setup, include=FALSE} 24 | knitr::opts_chunk$set(echo = T,warning=F,message=F) 25 | ``` 26 | 27 | 28 | ## Exercise: Find R-packages 29 | 30 | Go to https://cran.r-project.org/ and search for packages that can be used: 31 | 32 | 1) to reduce overfitting 33 | 2) for regression trees 34 | 3) for gradient boosting 35 | 4) for neural networks 36 | 5) for clustering 37 | 38 | ## Solution: Find R-packages 39 | 40 | ```{r,eval=F} 41 | install.packages("glmnet") #1) 42 | install.packages("rpart") #2) 43 | install.packages("gbm") #3) 44 | install.packages("neuralnet") #4) 45 | install.packages("kknn") #5) 46 | ``` 47 | 48 | ## Exercise: load built-in data 49 | 50 | ### Load the the built-in dataset `swiss` 51 | 54 | 1) How many observations and variables are available? 55 | 2) What is the scale level of the variables? 
56 | 57 | ### Interactive data table 58 | 59 | 3) Create an interactive data table 60 | 61 | ## Solution: load built-in data 62 | 63 | ```{r} 64 | # 1) 65 | data(swiss) 66 | dim(swiss) 67 | str(swiss) 68 | ``` 69 | 70 | ```{r,eval=F} 71 | # 2) 72 | DT::datatable(swiss) 73 | ``` 74 | 75 | ## [Exercise](https://www.datacamp.com/community/tutorials/pipe-r-tutorial): random numbers 76 | 77 | ```{r,echo=F} 78 | x <- c(0.109, 0.359, 0.63, 0.996, 0.515, 0.142, 0.017, 79 | 0.829, 0.907) 80 | x <- runif(8) 81 | ``` 82 | 83 | 1) Draw 8 random numbers from the uniform distribution and save them in a vector `x` 84 | 2) Compute the logarithm of `x`, return suitably lagged and iterated differences, 85 | 3) compute the exponential function and round the result 86 | 87 | ```{r,echo=F} 88 | round(exp(diff(log(x))), 1) 89 | ``` 90 | 91 | ## Solution: random numbers 92 | 93 | ```{r,echo=F} 94 | x <- runif(8) #1) 95 | round(exp(diff(log(x))), 1) #2) and 3) 96 | ``` 97 | 98 | 99 | 100 | 108 | -------------------------------------------------------------------------------- /tutorial/ml_exercises_a1_introR.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/tutorial/ml_exercises_a1_introR.pdf -------------------------------------------------------------------------------- /tutorial/ml_exercises_a1_introR.tex: -------------------------------------------------------------------------------- 1 | \PassOptionsToPackage{unicode=true}{hyperref} % options for packages loaded elsewhere 2 | \PassOptionsToPackage{hyphens}{url} 3 | % 4 | \documentclass[ 5 | 10pt, 6 | ignorenonframetext, 7 | ]{beamer} 8 | \usepackage{pgfpages} 9 | \setbeamertemplate{caption}[numbered] 10 | \setbeamertemplate{caption label separator}{: } 11 | \setbeamercolor{caption name}{fg=normal text.fg} 12 | \beamertemplatenavigationsymbolsempty 13 | % Prevent slide breaks in the middle of a paragraph: 14 | \widowpenalties 1 10000 15 | \raggedbottom 16 | \setbeamertemplate{part page}{ 17 | \centering 18 | \begin{beamercolorbox}[sep=16pt,center]{part title} 19 | \usebeamerfont{part title}\insertpart\par 20 | \end{beamercolorbox} 21 | } 22 | \setbeamertemplate{section page}{ 23 | \centering 24 | \begin{beamercolorbox}[sep=12pt,center]{part title} 25 | \usebeamerfont{section title}\insertsection\par 26 | \end{beamercolorbox} 27 | } 28 | \setbeamertemplate{subsection page}{ 29 | \centering 30 | \begin{beamercolorbox}[sep=8pt,center]{part title} 31 | \usebeamerfont{subsection title}\insertsubsection\par 32 | \end{beamercolorbox} 33 | } 34 | \AtBeginPart{ 35 | \frame{\partpage} 36 | } 37 | \AtBeginSection{ 38 | \ifbibliography 39 | \else 40 | \frame{\sectionpage} 41 | \fi 42 | } 43 | \AtBeginSubsection{ 44 | \frame{\subsectionpage} 45 | } 46 | \usepackage{lmodern} 47 | \usepackage{amssymb,amsmath} 48 | \usepackage{ifxetex,ifluatex} 49 | \ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex 50 | \usepackage[T1]{fontenc} 51 | \usepackage[utf8]{inputenc} 52 | \usepackage{textcomp} % provides euro and other symbols 53 | \else % if luatex or xelatex 54 | \usepackage{unicode-math} 55 | \defaultfontfeatures{Scale=MatchLowercase} 56 | \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} 57 | \fi 58 | \usetheme[]{Dresden} 59 | \usecolortheme{dolphin} 60 | \usefonttheme{structuresmallcapsserif} 61 | % use upquote if available, for straight quotes in verbatim environments 62 | \IfFileExists{upquote.sty}{\usepackage{upquote}}{} 63 | 
\IfFileExists{microtype.sty}{% use microtype if available 64 | \usepackage[]{microtype} 65 | \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts 66 | }{} 67 | \makeatletter 68 | \@ifundefined{KOMAClassName}{% if non-KOMA class 69 | \IfFileExists{parskip.sty}{% 70 | \usepackage{parskip} 71 | }{% else 72 | \setlength{\parindent}{0pt} 73 | \setlength{\parskip}{6pt plus 2pt minus 1pt}} 74 | }{% if KOMA class 75 | \KOMAoptions{parskip=half}} 76 | \makeatother 77 | \usepackage{xcolor} 78 | \IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available 79 | \IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} 80 | \hypersetup{ 81 | pdftitle={ML exercises - basics R}, 82 | pdfauthor={Jan-Philipp Kolb}, 83 | pdfborder={0 0 0}, 84 | breaklinks=true} 85 | \urlstyle{same} % don't use monospace font for urls 86 | \newif\ifbibliography 87 | \usepackage{color} 88 | \usepackage{fancyvrb} 89 | \newcommand{\VerbBar}{|} 90 | \newcommand{\VERB}{\Verb[commandchars=\\\{\}]} 91 | \DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} 92 | % Add ',fontsize=\small' for more characters per line 93 | \newenvironment{Shaded}{}{} 94 | \newcommand{\AlertTok}[1]{\textcolor[rgb]{1.00,0.00,0.00}{#1}} 95 | \newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.00,0.50,0.00}{#1}} 96 | \newcommand{\AttributeTok}[1]{#1} 97 | \newcommand{\BaseNTok}[1]{#1} 98 | \newcommand{\BuiltInTok}[1]{#1} 99 | \newcommand{\CharTok}[1]{\textcolor[rgb]{0.00,0.50,0.50}{#1}} 100 | \newcommand{\CommentTok}[1]{\textcolor[rgb]{0.00,0.50,0.00}{#1}} 101 | \newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.00,0.50,0.00}{#1}} 102 | \newcommand{\ConstantTok}[1]{#1} 103 | \newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.00,0.00,1.00}{#1}} 104 | \newcommand{\DataTypeTok}[1]{#1} 105 | \newcommand{\DecValTok}[1]{#1} 106 | \newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.00,0.50,0.00}{#1}} 107 | \newcommand{\ErrorTok}[1]{\textcolor[rgb]{1.00,0.00,0.00}{\textbf{#1}}} 108 | \newcommand{\ExtensionTok}[1]{#1} 109 | \newcommand{\FloatTok}[1]{#1} 110 | \newcommand{\FunctionTok}[1]{#1} 111 | \newcommand{\ImportTok}[1]{#1} 112 | \newcommand{\InformationTok}[1]{\textcolor[rgb]{0.00,0.50,0.00}{#1}} 113 | \newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.00,0.00,1.00}{#1}} 114 | \newcommand{\NormalTok}[1]{#1} 115 | \newcommand{\OperatorTok}[1]{#1} 116 | \newcommand{\OtherTok}[1]{\textcolor[rgb]{1.00,0.25,0.00}{#1}} 117 | \newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{1.00,0.25,0.00}{#1}} 118 | \newcommand{\RegionMarkerTok}[1]{#1} 119 | \newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.50,0.50}{#1}} 120 | \newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.00,0.50,0.50}{#1}} 121 | \newcommand{\StringTok}[1]{\textcolor[rgb]{0.00,0.50,0.50}{#1}} 122 | \newcommand{\VariableTok}[1]{#1} 123 | \newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.00,0.50,0.50}{#1}} 124 | \newcommand{\WarningTok}[1]{\textcolor[rgb]{0.00,0.50,0.00}{\textbf{#1}}} 125 | \setlength{\emergencystretch}{3em} % prevent overfull lines 126 | \providecommand{\tightlist}{% 127 | \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} 128 | \setcounter{secnumdepth}{-2} 129 | 130 | % set default figure placement to htbp 131 | \makeatletter 132 | \def\fps@figure{htbp} 133 | \makeatother 134 | 135 | 136 | \title{ML exercises - basics R} 137 | \author{Jan-Philipp Kolb} 138 | \date{03 Juni, 2019} 139 | 140 | \begin{document} 141 | \frame{\titlepage} 142 | 143 | \begin{frame}{Exercise: Find R-packages} 144 | 
\protect\hypertarget{exercise-find-r-packages}{} 145 | 146 | Go to \url{https://cran.r-project.org/} and search for packages that can 147 | be used: 148 | 149 | \begin{enumerate} 150 | [1)] 151 | \tightlist 152 | \item 153 | to reduce overfitting 154 | \item 155 | for regression trees 156 | \item 157 | for gradient boosting 158 | \item 159 | for neural networks 160 | \item 161 | for clustering 162 | \end{enumerate} 163 | 164 | \end{frame} 165 | 166 | \begin{frame}[fragile]{Solution: Find R-packages} 167 | \protect\hypertarget{solution-find-r-packages}{} 168 | 169 | \begin{Shaded} 170 | \begin{Highlighting}[] 171 | \KeywordTok{install.packages}\NormalTok{(}\StringTok{"glmnet"}\NormalTok{) }\CommentTok{#1)} 172 | \KeywordTok{install.packages}\NormalTok{(}\StringTok{"rpart"}\NormalTok{) }\CommentTok{#2)} 173 | \KeywordTok{install.packages}\NormalTok{(}\StringTok{"gbm"}\NormalTok{) }\CommentTok{#3)} 174 | \KeywordTok{install.packages}\NormalTok{(}\StringTok{"neuralnet"}\NormalTok{) }\CommentTok{#4)} 175 | \KeywordTok{install.packages}\NormalTok{(}\StringTok{"kknn"}\NormalTok{) }\CommentTok{#5)} 176 | \end{Highlighting} 177 | \end{Shaded} 178 | 179 | \end{frame} 180 | 181 | \begin{frame}[fragile]{Exercise: load built-in data} 182 | \protect\hypertarget{exercise-load-built-in-data}{} 183 | 184 | \begin{block}{Load the the built-in dataset \texttt{swiss}} 185 | 186 | \begin{enumerate} 187 | [1)] 188 | \tightlist 189 | \item 190 | How many observations and variables are available? 191 | \item 192 | What is the scale level of the variables? 193 | \end{enumerate} 194 | 195 | \end{block} 196 | 197 | \begin{block}{Interactive data table} 198 | 199 | \begin{enumerate} 200 | [1)] 201 | \setcounter{enumi}{2} 202 | \tightlist 203 | \item 204 | Create an interactive data table 205 | \end{enumerate} 206 | 207 | \end{block} 208 | 209 | \end{frame} 210 | 211 | \begin{frame}[fragile]{Solution: load built-in data} 212 | \protect\hypertarget{solution-load-built-in-data}{} 213 | 214 | \begin{Shaded} 215 | \begin{Highlighting}[] 216 | \CommentTok{# 1)} 217 | \KeywordTok{data}\NormalTok{(swiss) } 218 | \KeywordTok{dim}\NormalTok{(swiss) } 219 | \end{Highlighting} 220 | \end{Shaded} 221 | 222 | \begin{verbatim} 223 | ## [1] 47 6 224 | \end{verbatim} 225 | 226 | \begin{Shaded} 227 | \begin{Highlighting}[] 228 | \KeywordTok{str}\NormalTok{(swiss) } 229 | \end{Highlighting} 230 | \end{Shaded} 231 | 232 | \begin{verbatim} 233 | ## 'data.frame': 47 obs. of 6 variables: 234 | ## $ Fertility : num 80.2 83.1 92.5 85.8 76.9 76.1 83.8 92.4 82.4 82.9 ... 235 | ## $ Agriculture : num 17 45.1 39.7 36.5 43.5 35.3 70.2 67.8 53.3 45.2 ... 236 | ## $ Examination : int 15 6 5 12 17 9 16 14 12 16 ... 237 | ## $ Education : int 12 9 5 7 15 7 7 8 7 13 ... 238 | ## $ Catholic : num 9.96 84.84 93.4 33.77 5.16 ... 239 | ## $ Infant.Mortality: num 22.2 22.2 20.2 20.3 20.6 26.6 23.6 24.9 21 24.4 ... 
240 | \end{verbatim} 241 | 242 | \begin{Shaded} 243 | \begin{Highlighting}[] 244 | \CommentTok{# 2)} 245 | \NormalTok{DT}\OperatorTok{::}\KeywordTok{datatable}\NormalTok{(swiss)} 246 | \end{Highlighting} 247 | \end{Shaded} 248 | 249 | \end{frame} 250 | 251 | \begin{frame}[fragile]{\href{https://www.datacamp.com/community/tutorials/pipe-r-tutorial}{Exercise}: 252 | random numbers} 253 | \protect\hypertarget{exercise-random-numbers}{} 254 | 255 | \begin{enumerate} 256 | [1)] 257 | \tightlist 258 | \item 259 | Draw 8 random numbers from the uniform distribution and save them in a 260 | vector \texttt{x} 261 | \item 262 | Compute the logarithm of \texttt{x}, return suitably lagged and 263 | iterated differences, 264 | \item 265 | compute the exponential function and round the result 266 | \end{enumerate} 267 | 268 | \begin{verbatim} 269 | ## [1] 6.4 0.6 2.2 0.7 1.5 0.8 1.0 270 | \end{verbatim} 271 | 272 | \end{frame} 273 | 274 | \begin{frame}[fragile]{Solution: random numbers} 275 | \protect\hypertarget{solution-random-numbers}{} 276 | 277 | \begin{verbatim} 278 | ## [1] 1.0 1.1 0.3 0.5 2.1 0.6 6.2 279 | \end{verbatim} 280 | 281 | \end{frame} 282 | 283 | \end{document} 284 | -------------------------------------------------------------------------------- /tutorial/ml_exercises_a_visualizing.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Visualizing dataset as preparation for machine learning' 3 | author: "Jan-Philipp Kolb" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | fontsize: 10pt 6 | output: 7 | beamer_presentation: 8 | theme: Dresden 9 | colortheme: dolphin 10 | fig_height: 3 11 | fig_width: 5 12 | fig_caption: no 13 | fonttheme: structuresmallcapsserif 14 | highlight: haddock 15 | --- 16 | 17 | ```{r setup, include=FALSE} 18 | knitr::opts_chunk$set(echo = T,message = F,warning=F,cache=F) 19 | ``` 20 | 21 | ## Exercises: Visualizing dataset to apply machine learning 22 | 23 | - Exercise based on [r-exercises - visualizing for ml](https://www.r-exercises.com/2017/09/08/visualizing-dataset-to-apply-machine-learning-exercises/) 24 | 25 | 26 | ### Exercise 1 27 | 28 | Create a variable “x” and attach to it the input attributes of the “iris” dataset. HINT: Use columns 1 to 4. 29 | 30 | ### Exercise 2 31 | 32 | Create a variable “y” and attach to it the output attribute of the “iris” dataset. HINT: Use column 5. 33 | 34 | ### Exercise 3 35 | 36 | Create a whisker plot (boxplot) for the variable of the first column of the “iris” dataset. HINT: Use `boxplot()`. 37 | 38 | ## Exercises 39 | 40 | ### Exercise 4 41 | 42 | Now create a whisker plot for each one of the four input variables of the “iris” dataset in one image. HINT: Use par(). 43 | 44 | ### Exercise 5 45 | 46 | Create a barplot to breakdown your output attribute. HINT: Use plot(). 47 | 48 | ### Exercise 6 49 | 50 | Create a scatterplot matrix of the “iris” dataset using the “x” and “y” variables. HINT: Use featurePlot(). 51 | 52 | ### Exercise 7 53 | 54 | Create a scatterplot matrix with ellipses around each separated group. HINT: Use plot="ellipse". 55 | 56 | ## Exercises 57 | 58 | ### Exercise 8 59 | 60 | Create box and whisker plots of each input variable again, but this time broken down into separated plots for each class. HINT: Use plot="box". 61 | 62 | ### Exercise 9 63 | 64 | Create a list named “scales” that includes the “x” and “y” variables and set relation to “free” for both of them. 
HINT: Use `list()` 65 | 66 | ### Exercise 10 67 | 68 | Create a density plot matrix for each attribute by class value. HINT: Use `featurePlot()`. 69 | 70 | 71 | ## [Solutions](https://www.r-exercises.com/2017/09/08/visualizing-dataset-to-apply-machine-learning-exercises-solutions/) 72 | 73 | ### Solution Exercise 1 74 | 75 | 78 | 79 | ```{r} 80 | library(caret) 81 | data(iris) 82 | validation <- createDataPartition(iris$Species, p=0.80, 83 | list=FALSE) 84 | validation20 <- iris[-validation,] 85 | iris <- iris[validation,] 86 | x <- iris[,1:4] 87 | ``` 88 | 89 | ### Solution Exercise 2 90 | 91 | ```{r} 92 | library(caret) 93 | y <- iris[,5] 94 | ``` 95 | 96 | 97 | ## Solution Exercise 3 98 | 99 | ```{r} 100 | library(caret) 101 | boxplot(x[,1], main=names(iris)[1]) 102 | ``` 103 | 104 | 105 | ## Solution Exercise 4 106 | 107 | ```{r} 108 | library(caret) 109 | par(mfrow=c(1,4)) 110 | for(i in 1:4) { 111 | boxplot(x[,i], main=names(iris)[i]) 112 | } 113 | ``` 114 | 115 | 116 | ## Solution Exercise 5 117 | 118 | ```{r} 119 | library(caret) 120 | plot(y) 121 | ``` 122 | 123 | 124 | ## Solutions - Visualizing ML 125 | 126 | ### Solution Exercise 6 127 | 128 | ```{r} 129 | library(caret) 130 | featurePlot(x=x, y=y) 131 | ``` 132 | 133 | 134 | ## Solution Exercise 7 135 | 136 | ```{r} 137 | # install.packages("ellipse") 138 | library(ellipse) 139 | library(caret) 140 | featurePlot(x=x, y=y,plot="ellipse") 141 | ``` 142 | 143 | 144 | ## Solutions - Visualizing ML 145 | 146 | ### Solution Exercise 8 147 | 148 | ```{r} 149 | library(caret) 150 | featurePlot(x=x, y=y, plot="box") 151 | ``` 152 | 153 | 154 | ## Solutions - Visualizing ML 155 | 156 | ### Solution Exercise 9 157 | 158 | ```{r} 159 | library(caret) 160 | scales <- list(x=list(relation="free"), y=list(relation="free")) 161 | ``` 162 | 163 | 164 | 165 | ### Solution Exercise 10 166 | 167 | ```{r,eval=F} 168 | library(caret) 169 | scales <- list(x=list(relation="free"), y=list(relation="free")) 170 | featurePlot(x=x, y=y, plot="density", scales=scales) 171 | ``` 172 | 173 | ## Solution Exercise 10 174 | 175 | ```{r,echo=F} 176 | library(caret) 177 | scales <- list(x=list(relation="free"), y=list(relation="free")) 178 | featurePlot(x=x, y=y, plot="density", scales=scales) 179 | ``` 180 | -------------------------------------------------------------------------------- /tutorial/ml_exercises_a_visualizing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/tutorial/ml_exercises_a_visualizing.pdf -------------------------------------------------------------------------------- /tutorial/ml_exercises_b_regression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/tutorial/ml_exercises_b_regression.pdf -------------------------------------------------------------------------------- /tutorial/ml_exercises_c1_treesbagging.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning - Decision Trees Exercises/Solution" 3 | author: "Jan-Philipp Kolb" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | fontsize: 10pt 6 | output: 7 | beamer_presentation: 8 | colortheme: dolphin 9 | fig_height: 3 10 | fig_width: 5 11 | fig_caption: no 12 | fonttheme: structuresmallcapsserif 13 | highlight: haddock 14 | theme: Dresden 15 | 
pdf_document: 16 | keep_tex: yes 17 | toc: yes 18 | slidy_presentation: 19 | css: mycss.css 20 | keep_md: yes 21 | --- 22 | 23 | ```{r setup, include=FALSE} 24 | knitr::opts_chunk$set(echo = TRUE,cache=T,warning = F,message = F) 25 | ``` 26 | 27 | ## [Exercise - `rpart` Kyphosis](https://www.r-exercises.com/2016/12/13/recursive-partitioning-and-regression-trees-exercises/) 28 | 29 | ### Consider the Kyphosis data frame 30 | 31 | 34 | 35 | 1) Which variables are in the `kyphosis` dataset? 36 | 2) Build a tree to classify Kyphosis from Age, Number and Start. 37 | 38 | ### Consider the tree built above. 39 | 40 | 3) Which variables are used to explain Kyphosis presence? 41 | 4) How many observations do the terminal nodes contain? 42 | 43 | ### Consider the Kyphosis data frame. 44 | 45 | 5) Build a tree using the first 60 observations of kyphosis. 46 | 6) Predict the kyphosis presence for the other 21 observations. 47 | 7) What is the misclassification rate (prediction error)? 48 | 49 | ## [The dataset kyphosis](https://www.r-exercises.com/2016/12/13/recursive-partitioning-and-regression-trees-solutions/) 50 | 51 | ### The dataset contains (1): 52 | 53 | - Kyphosis: a factor with levels absent and present, indicating if a kyphosis (a type of deformation) was present after the operation. 54 | - Age: in months. 55 | - Number: the number of vertebrae involved. 56 | - Start: the number of the first (topmost) vertebra operated on. 57 | 58 | ```{r} 59 | data(kyphosis,package = "rpart") 60 | dplyr::glimpse(kyphosis) 61 | ``` 62 | 63 | ## Build the tree (2) 64 | 65 | ```{r} 66 | (TREE<-rpart::rpart(Kyphosis~Age+Number+Start, 67 | data=kyphosis,method="class")) 68 | ``` 69 | 70 | ## [Plot the result](https://www.rdocumentation.org/packages/rpart.plot/versions/3.0.7/topics/rpart.plot) 71 | 72 | ```{r} 73 | rpart.plot::rpart.plot(TREE,extra=1) 74 | ``` 75 | 76 | 77 | ```{r,eval=F,echo=F} 78 | rpart.plot::rpart.plot(TREE,type=5) 79 | rpart.plot::rpart.plot(TREE,extra=1) 80 | ``` 81 | 82 | ## Answers 83 | 84 | 3) Which variables are used to explain Kyphosis presence? 85 | 86 | - The variables are Start and Age 87 | 88 | 4) How many observations do the terminal nodes contain? 89 | 90 | 93 | 94 | - The terminal nodes have 29, 12, 12, 3 and 8 observations 95 | 96 | 99 | 100 | 5) Build a tree using the first 60 observations of kyphosis. 101 | 102 | - `y` is a factor $\Rightarrow$ we choose `method="class"` 103 | ```{r} 104 | TREE <- rpart::rpart(Kyphosis ~ Age + Number + Start, 105 | data=kyphosis[1:60,],method="class") 106 | ``` 107 | 108 | 109 | ## Further answers 110 | 111 | 6) Predict the kyphosis presence for the other 21 observations. 112 | 113 | ```{r} 114 | PR <- predict(TREE,kyphosis[61:81,],type='class') 115 | ``` 116 | 117 | 118 | 7) What is the [**misclassification rate**](https://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/) (prediction error)? 119 | 120 | ```{r} 121 | test <- kyphosis$Kyphosis[61:81] 122 | table(PR,test) 123 | ``` 124 | 125 | ```{r} 126 | (rate <- 100*length(which(PR!=test))/length(PR)) 127 | ``` 128 | 129 | ```{r,echo=F,eval=F} 130 | cat('the misclassification rate is:',rate) 131 | ``` 132 | 133 | 134 | ## Exercise `rpart` - `iris` 135 | 136 | ### Consider the `iris` data frame 137 | 138 | 1) Build a tree to classify Species from the other variables. 139 | 2) Plot the tree and add node information. 140 | 141 | ### Consider the tree built before 142 | 143 | 3) Prune the tree using the median complexity parameter (cp) associated with the tree. 
144 | 4) Plot the pruned and the original tree in the same window. 145 | 5) In which terminal node is each observation of `iris` classified? 146 | 6) Which species has a flower with `Petal.Length` greater than 2.45 and `Petal.Width` less than 1.75? 147 | 148 | ## Solution - `rpart` - `iris` (I) 149 | 150 | 1) Build a tree to classify Species from the other variables. 151 | 152 | ```{r} 153 | (TREE2 <- rpart::rpart(Species ~ ., data=iris,method="class")) 154 | ``` 155 | 156 | ## Solution - `rpart` - `iris` (II) 157 | 158 | 159 | 2) Plot the tree and add node information. 160 | 161 | ```{r} 162 | library(rpart.plot) 163 | rpart.plot(TREE2) 164 | ``` 165 | 166 | ```{r,echo=F,eval=F} 167 | library(rpart.plot) 168 | png("figure/ml_tb_rpart_iris.png") 169 | rpart.plot(TREE2) 170 | dev.off() 171 | ``` 172 | 173 | 174 | 175 | ## Solution - `rpart` - `iris` (III) 176 | 177 | 178 | 3) Prune the tree using the median complexity parameter (cp) associated with the tree. 179 | 180 | ```{r} 181 | TP <- rpart::prune(TREE2,cp=median(TREE2$cptable[,'CP'])) 182 | ``` 183 | 184 | 4) Plot the pruned and the original tree in the same window. 185 | 186 | ```{r,fig.height=3,echo=F,eval=F} 187 | par(mfrow=c(1,2)) 188 | plot(TREE2);text(TREE2,use.n=T) 189 | plot(TP);text(TP,use.n=T) 190 | ``` 191 | 192 | ```{r,fig.height=3,eval=F} 193 | par(mfrow=c(1,2)) 194 | rpart.plot(TREE2) 195 | rpart.plot(TP) 196 | ``` 197 | 198 | ## The plotted results 199 | 200 | ```{r,echo=F} 201 | par(mfrow=c(1,2)) 202 | rpart.plot(TREE2) 203 | rpart.plot(TP) 204 | ``` 205 | 206 | 207 | ## Solution - `rpart` - `iris` (IV) 208 | 209 | 5) In which terminal node is each observation of `iris` classified? 210 | 211 | ```{r} 212 | TREE2$where 213 | ``` 214 | 215 | ## Solution - `rpart` - `iris` (V) 216 | 217 | 218 | 6) Which species has a flower with `Petal.Length` greater than 2.45 and `Petal.Width` less than 1.75? 
219 | 220 | ```{r} 221 | print('versicolor') 222 | unique(iris[iris$Petal.Length>2.45 & 223 | iris$Petal.Width<1.75,"Species"]) 224 | ``` 225 | 226 | -------------------------------------------------------------------------------- /tutorial/ml_exercises_c1_treesbagging.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/tutorial/ml_exercises_c1_treesbagging.pdf -------------------------------------------------------------------------------- /tutorial/ml_exercises_c2_randomforests.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/tutorial/ml_exercises_c2_randomforests.pdf -------------------------------------------------------------------------------- /tutorial/ml_exercises_c3_xtremeboosting.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "ML Exercises - Gradient Boosting" 3 | author: "Jan-Philipp Kolb" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | fontsize: 10pt 6 | output: 7 | beamer_presentation: 8 | theme: Dresden 9 | colortheme: dolphin 10 | fig_height: 3 11 | fig_width: 5 12 | fig_caption: no 13 | fonttheme: structuresmallcapsserif 14 | highlight: haddock 15 | --- 16 | 17 | ```{r setup, include=FALSE} 18 | knitr::opts_chunk$set(echo = TRUE,cache=T,message=F,warning=F) 19 | ``` 20 | 21 | ## [eXtremely Boost your machine learning Exercises (Part-1)](https://www.r-exercises.com/2017/09/24/extremely-boost-your-machine-learning-exercises-part-1/) 22 | 23 | ```{r,eval=F,echo=F} 24 | install.packages("xgboost") 25 | ``` 26 | 27 | 28 | - eXtreme Gradient Boosting is a machine learning model which became really popular a few years ago after winning several Kaggle competitions. 29 | - It is a very powerful algorithm that uses an ensemble of weak learners to obtain a strong learner. 30 | - Its R implementation is available in the `xgboost` package and it is really worth including in anyone's machine learning portfolio. 31 | 32 | 39 | 40 | ## Boosting Exercises - first part 41 | 42 | ### Exercise 1 43 | Load the `xgboost` library and download the German Credit dataset. Your goal will be to predict creditability (the first column in the dataset). 44 | 45 | ### Exercise 2 46 | Convert columns `c(2,4,5,7,8,9,10,11,12,13,15,16,17,18,19,20)` to factors and then encode them as dummy variables. HINT: use the command `model.matrix()` 47 | 48 | ### Exercise 3 49 | Split the data into training and test sets (700:300). Create an `xgb.DMatrix` for both sets with Creditability as the label. 50 | 51 | ## Boosting Exercises - second part 52 | 53 | ### Exercise 4 54 | Train `xgboost` with a logistic objective, 30 rounds of training and a maximal depth of 2. 55 | 56 | ### Exercise 5 57 | To check model performance, calculate the test set classification error. 58 | 59 | ### Exercise 6 60 | Plot predictor importance. 61 | 62 | ## Boosting Exercises - third part 63 | 64 | ### Exercise 7 65 | Use `xgb.train()` instead of `xgboost()` to add both train and test sets as a watchlist. Train the model with the same parameters, but 100 rounds, to see how it performs during training. 66 | 67 | ### Exercise 8 68 | Train the model again, adding AUC and Log Loss as evaluation metrics. 69 | 70 | ### Exercise 9 71 | Plot how AUC and Log Loss for the train and test sets change during the training process. Use a plotting function/library of your choice. 
72 | 73 | ### Exercise 10 74 | Check how setting parameter eta to 0.01 influences the AUC and Log Loss curves. 75 | image_pdf 76 | 77 | ## [Solutions: boosting exercises](https://www.r-exercises.com/2017/09/24/extremely-boost-your-machine-learning-solutions-part-1/) 78 | 79 | ### Solution Exercise 1 - import dataset 80 | 81 | ```{r} 82 | library(xgboost) 83 | ``` 84 | 85 | 86 | ```{r,eval=F} 87 | url <- "http://freakonometrics.free.fr/german_credit.csv" 88 | credit <- read.csv(url, header = TRUE, sep = ",") 89 | ``` 90 | 91 | 92 | ```{r,eval=F,echo=F} 93 | save(credit,file="../data/german_credit.RData") 94 | ``` 95 | 96 | ```{r,echo=F} 97 | load("../data/german_credit.RData") 98 | ``` 99 | 100 | ```{r} 101 | head(credit) 102 | ``` 103 | 104 | ## Solutions boosting exercises - first part 105 | 106 | ### Solution Exercise 2 - convert columns 107 | 108 | ```{r} 109 | factor_columns <- c(2,4,5,7,8,9,10,11,12,13,15,16,17,18,19,20) 110 | for(i in factor_columns) credit[,i] <- as.factor(credit[,i]) 111 | X <- model.matrix(~ . - Creditability, data=credit) 112 | ``` 113 | 114 | ### Solution Exercise 3 115 | 116 | ```{r} 117 | inTraining <- sample(1:nrow(credit),size=700) 118 | dtrain <- xgboost::xgb.DMatrix(X[inTraining,], 119 | label=credit$Creditability[inTraining]) 120 | dtest <- xgboost::xgb.DMatrix(X[-inTraining,], 121 | label=credit$Creditability[-inTraining]) 122 | ``` 123 | 124 | ## Solutions boosting exercises - second part 125 | 126 | ### Solution Exercise 4 - train `xgboost` model 127 | 128 | ```{r} 129 | model <- xgboost(data = dtrain, 130 | max_depth = 2, 131 | nrounds = 30, 132 | objective = "binary:logistic") 133 | ``` 134 | 135 | ## Solutions boosting exercises - third part 136 | 137 | ### Solution Exercise 5 138 | 139 | ```{r} 140 | err<-mean(round(predict(model,dtest))!=getinfo(dtest,'label')) 141 | print(paste("test-error=", err)) 142 | ``` 143 | 144 | 145 | ### Solution Exercise 6 146 | 147 | ```{r,eval=F} 148 | importance.matrix <- xgb.importance(model = model, 149 | feature_names = colnames(X)) 150 | xgb.plot.importance(importance.matrix) 151 | ``` 152 | 153 | ## Importance plot 154 | 155 | ```{r,echo=F} 156 | importance.matrix <- xgb.importance(model = model, 157 | feature_names = colnames(X)) 158 | xgb.plot.importance(importance.matrix) 159 | ``` 160 | 161 | 162 | ## Solution Exercise 7 163 | 164 | ```{r} 165 | model_watchlist <- xgb.train(data = dtrain, 166 | max_depth = 2,nrounds = 100, 167 | objective = "binary:logistic", 168 | watchlist = list(train=dtrain, 169 | test=dtest)) 170 | ``` 171 | 172 | 173 | 174 | ## Solution Exercise 8 175 | 176 | ```{r,eval=F} 177 | model_auc<-xgb.train(data = dtrain,max_depth = 2, 178 | nrounds = 100,objective = "binary:logistic", 179 | watchlist = list(train=dtrain,test=dtest), 180 | eval_metric = 'auc',eval_metric = 'logloss') 181 | ``` 182 | 183 | ```{r,eval=F,echo=F} 184 | save(model_auc,file="../data/model_auc.RData") 185 | ``` 186 | 187 | ```{r,echo=F} 188 | load("../data/model_auc.RData") 189 | ``` 190 | 191 | ## Output `model_auc` 192 | 193 | ```{r} 194 | model_auc 195 | ``` 196 | 197 | 198 | ## Solution Exercise 9 199 | 200 | ```{r,eval=F} 201 | library(tidyverse) 202 | model_auc$evaluation_log %>% 203 | gather(metric, value, -iter) %>% 204 | separate(metric, c('set','metric')) %>% 205 | ggplot(aes(iter, value, color = set)) + 206 | geom_line() + 207 | facet_grid(metric~.) 
208 | ``` 209 | 210 | ## Evaluation plot 211 | 212 | ```{r,echo=F} 213 | library(tidyverse) 214 | model_auc$evaluation_log %>% 215 | gather(metric, value, -iter) %>% 216 | separate(metric, c('set','metric')) %>% 217 | ggplot(aes(iter, value, color = set)) + 218 | geom_line() + 219 | facet_grid(metric~.) 220 | ``` 221 | 222 | 223 | ## Solution Exercise 10 224 | 225 | ```{r,eval=F} 226 | model_eta<-xgb.train(data=dtrain,max_depth = 2,eta = 0.05, 227 | nrounds = 100,objective = "binary:logistic", 228 | watchlist = list(train=dtrain, test=dtest), 229 | eval_metric = 'auc',eval_metric = 'logloss') 230 | ``` 231 | 232 | ```{r,echo=F,eval=F} 233 | save(model_eta,file="../data/model_eta.RData") 234 | ``` 235 | 236 | ```{r,echo=F} 237 | load("../data/model_eta.RData") 238 | ``` 239 | 240 | ## Output of model eta 241 | 242 | ```{r} 243 | model_eta 244 | ``` 245 | 246 | -------------------------------------------------------------------------------- /tutorial/ml_exercises_c3_xtremeboosting.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/tutorial/ml_exercises_c3_xtremeboosting.pdf -------------------------------------------------------------------------------- /tutorial/ml_exercises_d_neuralnets.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Japhilko/machine_learning/f66edd594c737270dbb161f023fc55c332407bfc/tutorial/ml_exercises_d_neuralnets.pdf -------------------------------------------------------------------------------- /tutorial/prepare_apply_5ml.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Prepare Machine Learning" 3 | author: "Jan-Philipp Kolb" 4 | date: "31 5 2019" 5 | output: beamer_presentation 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = FALSE) 10 | ``` 11 | 12 | ## [How to prepare and apply machine learning to your dataset](https://www.r-exercises.com/2017/08/25/machine-learning-tutorial/) 13 | 14 | ### Content of this section 15 | 16 | 1) Use one of the most popular machine learning packages in R. 17 | 2) Explore a dataset by using statistical summaries and data visualization. 18 | 3) Build 5 machine-learning models, pick the best, and build confidence that the accuracy is reliable. 19 | 20 | ### standard and necessary steps: 21 | 22 | 1. Define Problem. 23 | 2. Prepare Data. 24 | 3. Evaluate Algorithms. 25 | 4. Improve Results. 26 | 5. Present Results. --------------------------------------------------------------------------------
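As a concrete entry point for step 2 (Prepare Data), a small sketch with `caret` (assuming the `iris` data, which the linked tutorial uses) that holds back a validation split before any models are built and compared:

```{r,eval=F}
library(caret)
data(iris)
set.seed(7)
in_train <- createDataPartition(iris$Species, p = 0.80, list = FALSE)
validation <- iris[-in_train, ]   # 20% held back for the final check
training   <- iris[ in_train, ]   # 80% used to train and compare the models
dim(training)
dim(validation)
```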