├── 1_Overview.pdf ├── 2_Regularized_Regression.pdf ├── 3_Trees_Forests.pdf ├── 4_Deep_Learning.pdf ├── 5_Unsupervised.pdf ├── 6_confounders_with_ML.pdf ├── 7_causal_forest.pdf ├── 8_optimal_policy.pdf ├── 9_reinforcement_learning.pdf ├── Example Lecture 3 └── deep_learning_example.R ├── Examples lecture 1 ├── Data │ ├── job_corps.csv │ ├── mylemon.csv │ ├── used_cars_test.csv │ └── used_cars_train.csv ├── examples_first_lecture.html └── examples_first_lecture.ipynb ├── Group Data Challenge 2025 ├── data_challenge.pdf ├── juice.csv ├── new_grocery.csv ├── orange_juice.html ├── orange_juice.ipynb └── orange_juice.r ├── Individual Home Assignment 2025 ├── grading_grid.pdf └── research_proposal.pdf ├── Literature ├── Athey_2017.pdf ├── Athey_et_al_2019.pdf ├── Belloni_et_al_2012.pdf ├── Belloni_et_al_2014a.pdf ├── Belloni_et_al_2014b.pdf ├── Cagala_et_al_2021.pdf ├── Chernozhukov_et_al_2017.pdf ├── Chetverikov_et_al_2020.pdf ├── Google flu trends.pdf ├── Mullainathan_Spiess_2017.pdf └── Semenova_Chernozhukov_2020.pdf ├── PC Lab 1 ├── help files │ └── glmnet_package.pdf ├── penalize_regression_tutorial.r ├── penalized_regression_solution.html ├── penalized_regression_solution.ipynb ├── penalized_regression_tutorial.ipynb ├── student-mat-test.Rdata └── student-mat-train.Rdata ├── PC Lab 2 ├── browser-sites.txt ├── browser_2006.csv ├── browser_new.csv ├── help files │ ├── grf.pdf │ └── rpart.pdf ├── trees_foests_solution.html ├── trees_foests_solution.ipynb ├── trees_foests_tutorial.ipynb └── trees_foests_tutorial.r ├── PC Lab 3 ├── help files │ ├── R_ K-Means Clustering.html │ └── R_ Principal Components Analysis.html ├── rollcall-members.Rdata ├── rollcall-votes.Rdata ├── unsupervised_solution.html ├── unsupervised_solution.ipynb ├── unsupervised_tutorial.ipynb └── unsupervised_tutorial.r ├── PC Lab 4 ├── help files │ ├── glmnet_package.pdf │ └── hdm_package.pdf ├── job_corps.csv ├── post_double_selection_solution.html ├── post_double_selection_solution.ipynb ├── 
post_double_selection_tutorial.ipynb └── post_double_selection_tutorial.r ├── PC Lab 5 ├── double_machine_learning_solution.html ├── double_machine_learning_solution.ipynb ├── double_machine_learning_tutorial.ipynb ├── double_machine_learning_tutorial.r ├── help files │ ├── glmnet_package.pdf │ └── grf_package.pdf └── job_corps.csv ├── PC Lab 6 ├── causal_forest.html ├── causal_forest.ipynb ├── causal_forest.r ├── fundraising.csv └── help files │ └── grf_package.pdf ├── PC Lab 7 ├── fundraising.csv ├── help files │ ├── grf_package.pdf │ └── rpart_package.pdf ├── optimal_policy_learning.html ├── optimal_policy_learning.ipynb └── optimal_policy_learning.r ├── README.md ├── Stata Example ├── ajr_example.do ├── ivlasso.ado ├── lassoutils.ado ├── pdslasso.ado └── rlasso.ado └── binder └── environment.yml /1_Overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/1_Overview.pdf -------------------------------------------------------------------------------- /2_Regularized_Regression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/2_Regularized_Regression.pdf -------------------------------------------------------------------------------- /3_Trees_Forests.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/3_Trees_Forests.pdf -------------------------------------------------------------------------------- /4_Deep_Learning.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/4_Deep_Learning.pdf -------------------------------------------------------------------------------- /5_Unsupervised.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/5_Unsupervised.pdf -------------------------------------------------------------------------------- /6_confounders_with_ML.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/6_confounders_with_ML.pdf -------------------------------------------------------------------------------- /7_causal_forest.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/7_causal_forest.pdf -------------------------------------------------------------------------------- /8_optimal_policy.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/8_optimal_policy.pdf -------------------------------------------------------------------------------- /9_reinforcement_learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/9_reinforcement_learning.pdf -------------------------------------------------------------------------------- /Example Lecture 3/deep_learning_example.R: -------------------------------------------------------------------------------- 1 | ### Lab: Deep Learning 2 | 3 | ## In this 
version of the Ch10 lab, we use the `luz` package, which interfaces to the 4 | ## `torch` package which in turn links to efficient 5 | ## `C++` code in the LibTorch library. 6 | 7 | ## This version of the lab was produced by Daniel Falbel and Sigrid 8 | ## Keydana, both data scientists at RStudio where these packages were 9 | ## produced. 10 | 11 | ## An advantage over our original `keras` implementation is that this 12 | ## version does not require a separate `python` installation. 13 | 14 | ########################################## 15 | ## Single Layer Network on Hitters Data ## 16 | ########################################## 17 | 18 | ## Load various packages 19 | library(ISLR2) 20 | library(glmnet) 21 | library(torch) 22 | library(luz) # high-level interface for torch 23 | library(torchvision) # for datasets and image transformation 24 | library(torchdatasets) # for datasets we are going to use 25 | library(zeallot) 26 | library(ggplot2) 27 | library(grf) 28 | 29 | ## Loading the dataset 30 | ## We use the example data with baseball player salaries from lecture 4 31 | Gitters <- na.omit(Hitters) 32 | n <- nrow(Gitters) 33 | print(paste("Number of observations:", n)) 34 | 35 | ## Define test sample 36 | set.seed(13) 37 | ntest <- trunc(n / 3) 38 | testid <- sample(1:n, ntest) 39 | 40 | 41 | ####################### 42 | ## Linear Regression ## 43 | ####################### 44 | lfit <- lm(Salary ~ ., data = Gitters[-testid, ]) 45 | summary(lfit) 46 | lpred <- predict(lfit, Gitters[testid, ]) 47 | print(paste("MAE:", mean(abs(Gitters$Salary[testid] - lpred)))) 48 | 49 | ## Define y and x as matrix 50 | x <- scale(model.matrix(Salary ~ .
- 1, data = Gitters)) 51 | print(paste("Number of controls:", ncol(x))) 52 | y <- Gitters$Salary 53 | 54 | ############ 55 | ## Lasso ## 56 | ########### 57 | cvfit <- cv.glmnet(x[-testid, ], y[-testid], type.measure = "mae") 58 | coef(cvfit) 59 | cpred <- predict(cvfit, x[testid, ], s = "lambda.min") 60 | print(paste("MAE:",mean(abs(y[testid] - cpred)))) 61 | 62 | ################### 63 | ## Random Forest ## 64 | ################### 65 | forest <- regression_forest(x[-testid, ], y[-testid]) 66 | fpred <- predict(forest, x[testid, ]) 67 | print(paste("MAE:",mean(abs(y[testid] - fpred$prediction)))) 68 | 69 | #################### 70 | ## Neural Network ## 71 | #################### 72 | 73 | torch_manual_seed(13) 74 | 75 | # single hidden layer 76 | # 10 hidden units 77 | # ReLU activation function 78 | # dropout layer, in which a random 40% of the 10 activations from the 79 | # previous layer are set to zero during each iteration of the stochastic 80 | # gradient descent algorithm 81 | # One output 82 | # linear output function 83 | modnn <- nn_module( 84 | initialize = function(input_size) { 85 | self$hidden <- nn_linear(input_size, 10) 86 | self$activation <- nn_relu() 87 | self$dropout <- nn_dropout(0.4) 88 | self$output <- nn_linear(10, 1) 89 | }, 90 | forward = function(x) { 91 | x %>% 92 | self$hidden() %>% 93 | self$activation() %>% 94 | self$dropout() %>% 95 | self$output() 96 | } 97 | ) 98 | 99 | # Specify optimisation algorithm 100 | # Here mse loss 101 | modnn <- modnn %>% 102 | setup( 103 | loss = nn_mse_loss(), 104 | optimizer = optim_rmsprop, 105 | metrics = list(luz_metric_mae()) 106 | ) %>% 107 | set_hparams(input_size = ncol(x)) 108 | 109 | # Train the neural network in 1500 iterations 110 | fitted <- modnn %>% 111 | fit( 112 | data = list(x[-testid, ], matrix(y[-testid], ncol = 1)), 113 | valid_data = list(x[testid, ], matrix(y[testid], ncol = 1)), 114 | epochs = 1500 115 | ) 116 | #plot(fitted) 117 | 118 | 119 | npred <- predict(fitted, x[testid, 
]) 120 | mean(abs(y[testid] - npred)) 121 | -------------------------------------------------------------------------------- /Examples lecture 1/Data/used_cars_test.csv: -------------------------------------------------------------------------------- 1 | "","first_price","mileage","age_car_years","diesel","other_car_owner","bmw_320","opel_astra","mercedes_c","vw_golf","vw_passat","pm_green","private_seller","guarantee","inspection","maintenance_cert","co2_em","euro_norm","mile_20","mile_30","mile_40","mile_50","mile_100","mile_150","mileage2","mileage3","mileage4","age_3","age_6","age_car_years2","age_car_years3","age_car_years4","dur_next_ins_0","dur_next_ins_1_2","new_inspection","euro_1","euro_2","euro_3","euro_4","euro_5","euro_6" 2 | "1",25.5,79.85,3.1,1,1,1,0,0,0,0,1,0,0,2,0,124,5,1,1,1,1,0,0,6376.0225,509125.41,40653664,1,0,9.6099997,29.791,92.352097,0,0,1,0,0,0,0,1,0 3 | "3",7.47,142.5,9.6,0,4,1,0,0,0,0,1,0,0,0,0,182,4,1,1,1,1,1,0,20306.25,2893640.5,412343776,1,1,92.160004,884.73602,8493.4658,1,0,0,0,0,0,1,0,0 4 | "10",20.882,76.85,3.3,1,1,0,0,0,0,1,1,0,0,1,1,135,5,1,1,1,1,0,0,5905.9224,453870.16,34879920,1,0,10.89,35.937,118.5921,0,1,0,0,0,0,0,1,0 5 | "12",11.389,143,7.6,1,1,1,0,0,0,0,1,0,0,1,1,131,4,1,1,1,1,1,0,20449,2924207,418161600,1,1,57.759998,438.97601,3336.2175,0,1,0,0,0,0,1,0,0 6 | "15",23.015,47.2,1.4,1,1,0,0,0,0,1,1,0,0,0,0,123,5,1,1,1,0,0,0,2227.8401,105154.05,4963271,0,0,1.96,2.744,3.8415999,1,0,0,0,0,0,0,1,0 7 | "17",25.26,42.495,2.9,1,1,1,0,0,0,0,1,0,0,0,1,123,5,1,1,1,0,0,0,1805.8251,76738.531,3261004,0,0,8.4099998,24.389,70.728104,1,0,0,0,0,0,0,1,0 8 | "18",19.029,29.4,2.3,1,1,0,0,0,0,1,1,0,0,1,1,122,5,1,0,0,0,0,0,864.35999,25412.184,747118.19,0,0,5.29,12.167,27.9841,0,1,0,0,0,0,0,1,0 9 | "24",17.339,54.936,4.4,1,1,0,0,0,1,0,0,0,0,0,1,139,5,1,1,1,1,0,0,3017.9641,165794.88,9108107,1,0,19.360001,85.183998,374.8096,1,0,0,0,0,0,0,1,0 10 | 
"26",8.96,75.19,7.4,1,4,0,0,0,1,0,1,0,0,1,1,137,4,1,1,1,1,0,0,5653.5361,425089.38,31962470,1,1,54.759998,405.224,2998.6577,0,1,0,0,0,0,1,0,0 11 | "27",13.65,71,4.7,1,2,0,0,0,1,0,1,1,0,0,1,125,5,1,1,1,1,0,0,5041,357911,25411680,1,0,22.09,103.823,487.96811,1,0,0,0,0,0,0,1,0 12 | "28",20.42,127.634,3.2,1,1,0,0,0,0,1,1,0,0,1,1,155,5,1,1,1,1,1,0,16290.438,2079213.8,265378368,1,0,10.24,32.768002,104.8576,0,1,0,0,0,0,0,1,0 13 | "29",15.36,22.931,3.2,0,0,0,1,0,0,0,1,0,0,1,1,144,5,1,0,0,0,0,0,525.83075,12057.825,276498,1,0,10.24,32.768002,104.8576,0,1,0,0,0,0,0,1,0 14 | "34",7,186.3,4.3,1,1,0,1,0,0,0,1,0,0,0,1,134,5,1,1,1,1,1,1,34707.691,6466042.5,1204623744,1,0,18.49,79.507004,341.8801,1,0,0,0,0,0,0,1,0 15 | "36",10.61,94,7.4,1,0,0,0,0,0,1,1,1,0,2,0,149,5,1,1,1,1,0,0,8836,830584,78074896,1,1,54.759998,405.224,2998.6577,0,0,1,0,0,0,0,1,0 16 | "40",6.9,184,8,1,0,0,0,0,1,0,1,1,0,2,0,159,4,1,1,1,1,1,1,33856,6229504,1146228736,1,1,64,512,4096,0,0,1,0,0,0,1,0,0 17 | "41",32.78,44.778,4.8,0,2,1,0,0,0,0,1,0,0,1,1,235,5,1,1,1,0,0,0,2005.0693,89782.992,4020302.8,1,0,23.040001,110.592,530.84161,0,1,0,0,0,0,0,1,0 18 | "42",14.53,69.028,3.6,1,1,0,0,0,1,0,1,0,0,1,1,119,5,1,1,1,1,0,0,4764.8647,328909.09,22703936,1,0,12.96,46.655998,167.96159,0,1,0,0,0,0,0,1,0 19 | "43",24.16,119,1.9,1,1,1,0,0,0,0,1,0,0,1,1,112,5,1,1,1,1,1,0,14161,1685159,200533920,0,0,3.6099999,6.8590002,13.0321,0,1,0,0,0,0,0,1,0 20 | "44",5.669,155.1,12.5,1,3,1,0,0,0,0,0,1,0,0,1,158,3,1,1,1,1,1,1,24056.01,3731087.3,578691648,1,1,156.25,1953.125,24414.063,1,0,0,0,0,1,0,0,0 21 | "45",22.46,54.49,4,1,2,0,0,0,0,1,1,0,0,0,0,151,5,1,1,1,1,0,0,2969.1602,161789.53,8815912,1,0,16,64,256,1,0,0,0,0,0,0,1,0 22 | "46",15.98,111.1,3.1,1,1,0,0,0,0,1,1,0,0,1,1,120,5,1,1,1,1,1,0,12343.21,1371330.6,152354832,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0 23 | "51",10.27,154.907,3.8,1,1,0,1,0,0,0,1,0,0,1,1,148,5,1,1,1,1,1,1,23996.178,3717176,575816576,1,0,14.44,54.872002,208.5136,0,1,0,0,0,0,0,1,0 24 | 
"52",22.48,69.97,3.2,1,1,1,0,0,0,0,1,0,0,2,0,123,5,1,1,1,1,0,0,4895.8008,342559.19,23968866,1,0,10.24,32.768002,104.8576,0,0,1,0,0,0,0,1,0 25 | "54",29.27,25.86,1.6,1,1,1,0,0,0,0,1,0,0,0,1,119,6,1,0,0,0,0,0,668.73962,17293.605,447212.66,0,0,2.5599999,4.0960002,6.5535998,1,0,0,0,0,0,0,0,1 26 | "56",13.39,62.715,3.5,1,2,0,1,0,0,0,1,0,0,0,0,119,5,1,1,1,1,0,0,3933.1711,246668.83,15469836,1,0,12.25,42.875,150.0625,1,0,0,0,0,0,0,1,0 27 | "60",11.86,75.9,2.2,1,1,0,1,0,0,0,1,0,0,1,1,113,5,1,1,1,1,0,0,5760.8101,437245.47,33186932,0,0,4.8400002,10.648,23.4256,0,1,0,0,0,0,0,1,0 28 | "63",7.2,155,6.3,1,1,0,0,0,1,0,1,0,0,0,1,137,4,1,1,1,1,1,1,24025,3723875,577200640,1,1,39.689999,250.047,1575.2961,1,0,0,0,0,0,1,0,0 29 | "66",5.63,171,10.4,1,0,0,0,0,0,1,1,0,0,1,1,156,4,1,1,1,1,1,1,29241,5000211,855036096,1,1,108.16,1124.864,11698.586,0,1,0,0,0,0,1,0,0 30 | "68",9.57,144,10.1,0,0,1,0,0,0,0,1,0,0,1,1,182,4,1,1,1,1,1,0,20736,2985984,429981696,1,1,102.01,1030.301,10406.04,0,1,0,0,0,0,1,0,0 31 | "71",13.61,20.85,1.4,0,1,0,1,0,0,0,1,0,0,1,1,134,5,1,0,0,0,0,0,434.7225,9063.9639,188983.66,0,0,1.96,2.744,3.8415999,0,1,0,0,0,0,0,1,0 32 | "72",20.119,160,4,1,2,0,0,1,0,0,0,1,0,0,1,128,5,1,1,1,1,1,1,25600,4096000,655360000,1,0,16,64,256,1,0,0,0,0,0,0,1,0 33 | "76",10.71,160.871,3.5,1,1,0,0,0,1,0,1,0,0,1,1,128,5,1,1,1,1,1,1,25879.479,4163257.5,669747392,1,0,12.25,42.875,150.0625,0,1,0,0,0,0,0,1,0 34 | "77",14.68,125,3.4,1,1,0,0,0,0,1,1,0,0,1,1,135,5,1,1,1,1,1,0,15625,1953125,244140624,1,0,11.56,39.304001,133.63361,0,1,0,0,0,0,0,1,0 35 | "80",22.335,18.704,1.2,1,1,0,0,0,0,1,1,0,0,1,1,125,6,0,0,0,0,0,0,349.83963,6543.4004,122387.76,0,0,1.4400001,1.728,2.0736001,0,1,0,0,0,0,0,0,1 36 | "81",8.29,163.44,6.3,1,1,0,0,0,1,0,1,0,0,0,1,122,4,1,1,1,1,1,1,26712.633,4365913,713564800,1,1,39.689999,250.047,1575.2961,1,0,0,0,0,0,1,0,0 37 | "82",17.48,150,3.3,1,1,1,0,0,0,0,0,0,0,2,1,124,5,1,1,1,1,1,1,22500,3375000,506249984,1,0,10.89,35.937,118.5921,0,0,1,0,0,0,0,1,0 38 | 
"83",15.84,19,1.7,0,0,0,0,0,1,0,1,0,0,0,1,124,5,0,0,0,0,0,0,361,6859,130321,0,0,2.8900001,4.9130001,8.3521004,1,0,0,0,0,0,0,1,0 39 | "86",14.99,91.735,5.1,1,1,0,0,0,0,1,1,0,0,1,1,120,5,1,1,1,1,0,0,8415.3105,771978.5,70817448,1,0,26.01,132.651,676.52008,0,1,0,0,0,0,0,1,0 40 | "88",15.27,109.6,2.4,1,1,0,1,0,0,0,1,0,1,0,1,154,5,1,1,1,1,1,0,12012.16,1316532.8,144291984,0,0,5.7600002,13.824,33.177601,1,0,0,0,0,0,0,1,0 41 | "91",6.09,140,6.7,1,2,0,0,0,1,0,1,0,0,0,1,137,4,1,1,1,1,1,0,19600,2744000,384160000,1,1,44.889999,300.763,2015.1121,1,0,0,0,0,0,1,0,0 42 | "92",29.25,31.988,1.5,1,1,0,0,1,0,0,1,0,0,0,0,128,5,1,1,0,0,0,0,1023.2321,32731.15,1047004,0,0,2.25,3.375,5.0625,1,0,0,0,0,0,0,1,0 43 | "94",11.71,199.95,8,0,2,0,0,1,0,0,1,0,0,0,1,229,4,1,1,1,1,1,1,39980.004,7994001.5,1598400640,1,1,64,512,4096,1,0,0,0,0,0,1,0,0 44 | "97",13.37,112,6.2,1,2,1,0,0,0,0,1,0,0,0,1,142,5,1,1,1,1,1,0,12544,1404928,157351936,1,1,38.439999,238.328,1477.6335,1,0,0,0,0,0,0,1,0 45 | "98",15.359,61.404,5.8,0,1,0,0,0,1,0,1,0,0,1,1,139,5,1,1,1,1,0,0,3770.4512,231520.78,14216302,1,0,33.639999,195.112,1131.6497,0,1,0,0,0,0,0,1,0 46 | "99",16.53,114.5,3.5,1,1,0,0,0,0,1,1,0,0,1,0,125,6,1,1,1,1,1,0,13110.25,1501123.6,171878656,1,0,12.25,42.875,150.0625,0,1,0,0,0,0,0,0,1 47 | "100",6.87,171.914,3.9,1,0,0,1,0,0,0,1,0,0,2,1,117,5,1,1,1,1,1,1,29554.424,5080819,873463936,1,0,15.21,59.319,231.3441,0,0,1,0,0,0,0,1,0 48 | "102",16.91,88.639,3.3,1,0,0,0,0,0,1,1,0,0,0,1,120,5,1,1,1,1,0,0,7856.8726,696425.31,61730444,1,0,10.89,35.937,118.5921,1,0,0,0,0,0,0,1,0 49 | "103",20.35,22.81,3.8,1,0,0,0,0,0,1,1,0,0,2,0,139,5,1,0,0,0,0,0,520.29608,11867.954,270708.03,1,0,14.44,54.872002,208.5136,0,0,1,0,0,0,0,1,0 50 | "104",8.05,120,8.9,1,3,0,0,0,0,1,1,0,0,0,0,177,4,1,1,1,1,1,0,14400,1728000,207360000,1,1,79.209999,704.96899,6274.2241,1,0,0,0,0,0,1,0,0 51 | "107",18.34,25.117,1.4,0,1,0,0,0,1,0,1,0,0,1,1,126,5,1,0,0,0,0,0,630.86371,15845.403,397989,0,0,1.96,2.744,3.8415999,0,1,0,0,0,0,0,1,0 52 | 
"110",22.72,42.8,1.5,1,1,0,0,0,0,1,1,0,0,1,1,120,5,1,1,1,0,0,0,1831.84,78402.75,3355637.8,0,0,2.25,3.375,5.0625,0,1,0,0,0,0,0,1,0 53 | "113",18.25,150.99,4.1,1,1,1,0,0,0,0,1,0,0,1,0,128,5,1,1,1,1,1,1,22797.98,3442267,519747904,1,0,16.809999,68.920998,282.57611,0,1,0,0,0,0,0,1,0 54 | "114",16.85,57,4.1,1,1,0,0,0,0,1,1,0,0,0,1,123,5,1,1,1,1,0,0,3249,185193,10556001,1,0,16.809999,68.920998,282.57611,1,0,0,0,0,0,0,1,0 55 | "120",35.7,42,1.1,1,1,0,0,1,0,0,1,0,0,1,1,108,6,1,1,1,0,0,0,1764,74088,3111696,0,0,1.21,1.331,1.4641,0,1,0,0,0,0,0,0,1 56 | "121",8.26,108,6.2,0,0,0,0,0,1,0,1,0,0,1,0,170,4,1,1,1,1,1,0,11664,1259712,136048896,1,1,38.439999,238.328,1477.6335,0,1,0,0,0,0,1,0,0 57 | "125",13.31,98,4.1,1,2,0,1,0,0,0,0,1,0,0,1,156,5,1,1,1,1,0,0,9604,941192,92236816,1,0,16.809999,68.920998,282.57611,1,0,0,0,0,0,0,1,0 58 | "126",17.045,134.7,3.9,1,1,0,0,0,0,1,1,0,1,0,1,120,5,1,1,1,1,1,0,18144.09,2444009,329208000,1,0,15.21,59.319,231.3441,1,0,0,0,0,0,0,1,0 59 | "131",14.74,37.703,1.4,0,1,0,1,0,0,0,1,0,0,1,0,139,6,1,1,0,0,0,0,1421.5162,53595.426,2020708.4,0,0,1.96,2.744,3.8415999,0,1,0,0,0,0,0,0,1 60 | "132",7.1,84.78,7,0,1,0,0,0,1,0,1,0,0,1,1,176,4,1,1,1,1,0,0,7187.6484,609368.81,51662288,1,1,49,343,2401,0,1,0,0,0,0,1,0,0 61 | "136",6.63,95.55,5.9,0,1,0,0,0,1,0,1,0,0,0,1,165,5,1,1,1,1,0,0,9129.8027,872352.63,83353296,1,0,34.810001,205.379,1211.7361,1,0,0,0,0,0,0,1,0 62 | "137",30.65,30.456,1.1,1,1,1,0,0,0,0,1,0,0,2,0,123,5,1,1,0,0,0,0,927.56793,28250.01,860382.25,0,0,1.21,1.331,1.4641,0,0,1,0,0,0,0,1,0 63 | "138",6.29,189.285,9.5,1,0,1,0,0,0,0,1,0,0,0,0,158,4,1,1,1,1,1,1,35828.813,6781856.5,1283703680,1,1,90.25,857.375,8145.0625,1,0,0,0,0,0,1,0,0 64 | "139",23.109,31.972,1.2,1,1,1,0,0,0,0,1,0,0,1,1,119,5,1,1,0,0,0,0,1022.2088,32682.059,1044910.8,0,0,1.4400001,1.728,2.0736001,0,1,0,0,0,0,0,1,0 65 | "140",11.56,110.8,4.7,1,1,1,0,0,0,0,1,0,0,1,0,109,5,1,1,1,1,1,0,12276.64,1360251.8,150715888,1,0,22.09,103.823,487.96811,0,1,0,0,0,0,0,1,0 66 | 
"141",29.06,29,1.6,1,0,1,0,0,0,0,1,0,0,0,1,129,6,1,0,0,0,0,0,841,24389,707281,0,0,2.5599999,4.0960002,6.5535998,1,0,0,0,0,0,0,0,1 67 | "143",16.26,88.315,3.5,1,1,0,0,0,0,1,1,0,0,1,1,123,6,1,1,1,1,0,0,7799.5391,688816.31,60832812,1,0,12.25,42.875,150.0625,0,1,0,0,0,0,0,0,1 68 | "144",21.12,72.656,2.4,1,1,0,0,1,0,0,1,0,0,1,1,124,5,1,1,1,1,0,0,5278.8945,383543.34,27866726,0,0,5.7600002,13.824,33.177601,0,1,0,0,0,0,0,1,0 69 | "145",22.389,30.681,4.4,1,2,1,0,0,0,0,1,0,0,1,1,142,4,1,1,0,0,0,0,941.32379,28880.754,886090.44,1,0,19.360001,85.183998,374.8096,0,1,0,0,0,0,1,0,0 70 | "147",12.65,150.3,3.1,1,1,0,0,0,1,0,1,0,0,1,1,109,5,1,1,1,1,1,1,22590.09,3395290.5,510312160,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0 71 | "148",17.38,131.672,4.2,1,1,0,0,1,0,0,1,0,0,0,1,134,5,1,1,1,1,1,0,17337.516,2282865.3,300589440,1,0,17.639999,74.087997,311.16959,1,0,0,0,0,0,0,1,0 72 | "150",18.34,93.472,4.9,1,2,0,0,0,0,1,1,0,1,1,1,125,5,1,1,1,1,0,0,8737.0146,816666.25,76335424,1,0,24.01,117.649,576.4801,0,1,0,0,0,0,0,1,0 73 | "152",10.75,95.223,4.3,1,1,0,0,0,1,0,1,0,0,1,1,109,5,1,1,1,1,0,0,9067.4199,863426.94,82218104,1,0,18.49,79.507004,341.8801,0,1,0,0,0,0,0,1,0 74 | "153",19.699,55.641,2,1,1,1,0,0,0,0,1,0,0,1,1,119,5,1,1,1,1,0,0,3095.9209,172260.14,9584726,0,0,4,8,16,0,1,0,0,0,0,0,1,0 75 | "154",21.354,69.998,3.2,1,1,0,0,0,0,1,1,0,0,2,1,120,5,1,1,1,1,0,0,4899.7202,342970.59,24007256,1,0,10.24,32.768002,104.8576,0,0,1,0,0,0,0,1,0 76 | "158",20.01,88.112,3.6,1,2,0,0,0,0,1,1,0,0,1,1,123,5,1,1,1,1,0,0,7763.7246,684077.31,60275420,1,0,12.96,46.655998,167.96159,0,1,0,0,0,0,0,1,0 77 | "159",10.97,174,7,1,0,1,0,0,0,0,1,0,0,1,1,150,5,1,1,1,1,1,1,30276,5268024,916636160,1,1,49,343,2401,0,1,0,0,0,0,0,1,0 78 | "160",26.62,59.998,3.2,1,1,1,0,0,0,0,1,0,0,2,1,125,5,1,1,1,1,0,0,3599.76,215978.41,12958272,1,0,10.24,32.768002,104.8576,0,0,1,0,0,0,0,1,0 79 | "161",7.23,173,8.8,0,2,1,0,0,0,0,1,0,0,1,1,182,4,1,1,1,1,1,1,29929,5177717,895745024,1,1,77.440002,681.47198,5996.9536,0,1,0,0,0,0,1,0,0 80 | 
"165",19.74,72.645,3.3,1,1,0,0,0,0,1,1,0,0,1,1,1,5,1,1,1,1,0,0,5277.2959,383369.16,27849854,1,0,10.89,35.937,118.5921,0,1,0,0,0,0,0,1,0 81 | "166",15.83,66.8,4.5,1,2,0,0,0,0,1,1,0,0,1,1,120,4,1,1,1,1,0,0,4462.2402,298077.63,19911586,1,0,20.25,91.125,410.0625,0,1,0,0,0,0,1,0,0 82 | "169",9.23,128,9.2,1,1,0,0,0,0,1,1,0,0,1,1,158,4,1,1,1,1,1,0,16384,2097152,268435456,1,1,84.639999,778.68799,7163.9297,0,1,0,0,0,0,1,0,0 83 | "170",14.26,128.2,3.4,1,1,0,0,0,0,1,1,0,1,1,1,121,5,1,1,1,1,1,0,16435.24,2106997.8,270117120,1,0,11.56,39.304001,133.63361,0,1,0,0,0,0,0,1,0 84 | "171",15.62,106,4.2,0,1,0,0,0,0,1,1,0,0,1,1,163,5,1,1,1,1,1,0,11236,1191016,126247696,1,0,17.639999,74.087997,311.16959,0,1,0,0,0,0,0,1,0 85 | "172",20.06,21.5,1.5,0,0,0,0,0,1,0,1,0,0,0,0,117,5,1,0,0,0,0,0,462.25,9938.375,213675.06,0,0,2.25,3.375,5.0625,1,0,0,0,0,0,0,1,0 86 | "173",4.79,133.1,7.5,1,2,0,1,0,0,0,1,0,0,0,1,146,4,1,1,1,1,1,0,17715.609,2357947.8,313842848,1,1,56.25,421.875,3164.0625,1,0,0,0,0,0,1,0,0 87 | "175",21.83,89.976,4.6,1,1,0,0,1,0,0,1,0,0,1,1,180,4,1,1,1,1,0,0,8095.6807,728416.94,65540044,1,0,21.16,97.335999,447.74561,0,1,0,0,0,0,1,0,0 88 | "178",14.12,124.471,4.4,1,1,0,0,0,0,1,1,0,0,1,1,120,5,1,1,1,1,1,0,15493.03,1928432.9,240033968,1,0,19.360001,85.183998,374.8096,0,1,0,0,0,0,0,1,0 89 | "180",5.92,132,7.7,1,0,0,0,0,1,0,0,1,0,0,1,137,4,1,1,1,1,1,0,17424,2299968,303595776,1,1,59.290001,456.53299,3515.3042,1,0,0,0,0,0,1,0,0 90 | "181",23.08,61.208,3.2,1,1,1,0,0,0,0,1,0,0,1,1,124,5,1,1,1,1,0,0,3746.4192,229310.83,14035657,1,0,10.24,32.768002,104.8576,0,1,0,0,0,0,0,1,0 91 | "182",10.48,173.51,2.8,1,1,0,1,0,0,0,1,0,0,1,1,129,5,1,1,1,1,1,1,30105.721,5223643.5,906354368,0,0,7.8400002,21.952,61.465599,0,1,0,0,0,0,0,1,0 92 | "183",13.955,163.9,5.5,1,2,0,0,0,0,1,0,1,0,0,1,159,5,1,1,1,1,1,1,26863.211,4402880,721632064,1,0,30.25,166.375,915.0625,1,0,0,0,0,0,0,1,0 93 | 
"184",11.94,177,4.3,1,2,0,0,0,0,1,1,0,0,0,1,121,5,1,1,1,1,1,1,31329,5545233,981506240,1,0,18.49,79.507004,341.8801,1,0,0,0,0,0,0,1,0 94 | "185",8.09,125,4.2,1,0,0,0,0,1,0,0,0,1,0,1,119,5,1,1,1,1,1,0,15625,1953125,244140624,1,0,17.639999,74.087997,311.16959,1,0,0,0,0,0,0,1,0 95 | "186",10.61,182.301,3.9,1,1,1,0,0,0,0,1,0,0,1,0,120,5,1,1,1,1,1,1,33233.656,6058528.5,1104475776,1,0,15.21,59.319,231.3441,0,1,0,0,0,0,0,1,0 96 | "187",18.7,164.618,2.9,1,1,0,0,0,0,1,1,0,1,1,1,125,5,1,1,1,1,1,1,27099.086,4460997.5,734360448,0,0,8.4099998,24.389,70.728104,0,1,0,0,0,0,0,1,0 97 | "188",10.91,91.54,4.8,0,2,0,1,0,0,0,1,0,0,0,1,144,5,1,1,1,1,0,0,8379.5713,767066,70217224,1,0,23.040001,110.592,530.84161,1,0,0,0,0,0,0,1,0 98 | "192",29.081,63.169,2.9,1,3,1,0,0,0,0,1,0,0,0,0,123,5,1,1,1,1,0,0,3990.3225,252064.69,15922674,0,0,8.4099998,24.389,70.728104,1,0,0,0,0,0,0,1,0 99 | "195",15.199,39,1.2,1,1,0,1,0,0,0,1,0,0,2,1,104,5,1,1,0,0,0,0,1521,59319,2313441,0,0,1.4400001,1.728,2.0736001,0,0,1,0,0,0,0,1,0 100 | "196",12.77,98,4.1,1,1,0,1,0,0,0,1,0,0,0,1,134,5,1,1,1,1,0,0,9604,941192,92236816,1,0,16.809999,68.920998,282.57611,1,0,0,0,0,0,0,1,0 101 | "199",16,14.104,1.2,0,1,0,0,0,1,0,1,0,0,1,1,119,5,0,0,0,0,0,0,198.92282,2805.6074,39570.285,0,0,1.4400001,1.728,2.0736001,0,1,0,0,0,0,0,1,0 102 | "200",17.08,147,4.2,1,1,1,0,0,0,0,1,0,0,1,1,140,5,1,1,1,1,1,0,21609,3176523,466948896,1,0,17.639999,74.087997,311.16959,0,1,0,0,0,0,0,1,0 103 | -------------------------------------------------------------------------------- /Examples lecture 1/Data/used_cars_train.csv: -------------------------------------------------------------------------------- 1 | 
"","first_price","mileage","age_car_years","diesel","other_car_owner","bmw_320","opel_astra","mercedes_c","vw_golf","vw_passat","pm_green","private_seller","guarantee","inspection","maintenance_cert","co2_em","euro_norm","mile_20","mile_30","mile_40","mile_50","mile_100","mile_150","mileage2","mileage3","mileage4","age_3","age_6","age_car_years2","age_car_years3","age_car_years4","dur_next_ins_0","dur_next_ins_1_2","new_inspection","euro_1","euro_2","euro_3","euro_4","euro_5","euro_6" 2 | "2",21.91,77.1,3.7,1,1,0,0,0,0,1,1,0,0,0,1,136,5,1,1,1,1,0,0,5944.4102,458314,35336012,1,0,13.69,50.653,187.41611,1,0,0,0,0,0,0,1,0 3 | "4",14.58,45.45,5,0,2,0,0,0,0,1,1,0,0,1,1,145,5,1,1,1,0,0,0,2065.7024,93886.18,4267127,1,0,25,125,625,0,1,0,0,0,0,0,1,0 4 | "5",17.98,183.5,3.6,1,1,1,0,0,0,0,1,0,0,1,1,124,5,1,1,1,1,1,1,33672.25,6178858,1133820416,1,0,12.96,46.655998,167.96159,0,1,0,0,0,0,0,1,0 5 | "6",19.03,74.85,3.1,1,1,0,0,0,0,1,1,0,0,1,1,125,5,1,1,1,1,0,0,5602.5225,419348.81,31388258,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0 6 | "7",10.969,174,6.8,1,1,0,0,1,0,0,1,0,0,1,1,154,5,1,1,1,1,1,1,30276,5268024,916636160,1,1,46.240002,314.43201,2138.1377,0,1,0,0,0,0,0,1,0 7 | "8",24.11,51.001,2.3,1,2,1,0,0,0,0,1,0,0,0,0,123,5,1,1,1,1,0,0,2601.1021,132658.8,6765731.5,0,0,5.29,12.167,27.9841,1,0,0,0,0,0,0,1,0 8 | "9",13.26,62,2.6,1,2,0,0,0,1,0,1,0,0,0,0,119,5,1,1,1,1,0,0,3844,238328,14776336,0,0,6.7600002,17.576,45.697601,1,0,0,0,0,0,0,1,0 9 | "11",23.2,16.901,1.4,1,1,1,0,0,0,0,1,0,0,0,1,119,5,0,0,0,0,0,0,285.6438,4827.666,81592.383,0,0,1.96,2.744,3.8415999,1,0,0,0,0,0,0,1,0 10 | "13",13.65,119.636,4.2,1,0,0,0,0,0,1,0,0,0,2,0,123,5,1,1,1,1,1,0,14312.772,1712322.9,204855456,1,0,17.639999,74.087997,311.16959,0,0,1,0,0,0,0,1,0 11 | "14",11.74,83,7.4,1,2,1,0,0,0,0,0,1,0,0,1,120,5,1,1,1,1,0,0,6889,571787,47458320,1,1,54.759998,405.224,2998.6577,1,0,0,0,0,0,0,1,0 12 | 
"16",12.07,46.36,7.1,0,1,1,0,0,0,0,1,0,0,1,1,147,4,1,1,1,0,0,0,2149.2495,99639.211,4619274,1,1,50.41,357.91101,2541.1682,0,1,0,0,0,0,1,0,0 13 | "19",16.79,18.4,1.1,1,1,0,0,0,1,0,1,0,1,1,1,92,6,0,0,0,0,0,0,338.56,6229.5039,114622.88,0,0,1.21,1.331,1.4641,0,1,0,0,0,0,0,0,1 14 | "20",8.18,110.375,9.4,1,2,0,0,0,0,1,1,0,0,0,1,156,4,1,1,1,1,1,0,12182.641,1344659,148416736,1,1,88.360001,830.58398,7807.4897,1,0,0,0,0,0,1,0,0 15 | "21",5.43,151,13,0,0,1,0,0,0,0,1,0,0,1,0,199,4,1,1,1,1,1,1,22801,3442951,519885600,1,1,169,2197,28561,0,1,0,0,0,0,1,0,0 16 | "22",16.719,94.435,3.3,1,1,0,0,0,0,1,1,0,0,2,1,123,5,1,1,1,1,0,0,8917.9688,842168.44,79530176,1,0,10.89,35.937,118.5921,0,0,1,0,0,0,0,1,0 17 | "23",22.42,84.89,4.3,1,1,0,0,1,0,0,1,0,0,0,1,124,5,1,1,1,1,0,0,7206.312,611743.81,51930936,1,0,18.49,79.507004,341.8801,1,0,0,0,0,0,0,1,0 18 | "25",8.82,83,4,1,0,0,1,0,0,0,1,0,0,2,1,119,5,1,1,1,1,0,0,6889,571787,47458320,1,0,16,64,256,0,0,1,0,0,0,0,1,0 19 | "30",11.24,95.2,2.9,1,1,0,1,0,0,0,1,0,1,1,1,120,5,1,1,1,1,0,0,9063.04,862801.44,82138696,0,0,8.4099998,24.389,70.728104,0,1,0,0,0,0,0,1,0 20 | "31",14.54,85.606,4.8,1,0,0,0,1,0,0,0,0,0,2,0,130,5,1,1,1,1,0,0,7328.3872,627353.94,53705260,1,0,23.040001,110.592,530.84161,0,0,1,0,0,0,0,1,0 21 | "32",29.78,69.89,3.4,1,0,1,0,0,0,0,1,0,0,0,1,123,5,1,1,1,1,0,0,4884.6123,341385.53,23859436,1,0,11.56,39.304001,133.63361,1,0,0,0,0,0,0,1,0 22 | "33",22.43,22.208,1,0,1,0,0,0,0,1,1,0,0,1,1,119,6,1,0,0,0,0,0,493.19525,10952.881,243241.56,0,0,1,1,1,0,1,0,0,0,0,0,0,1 23 | "35",9.27,97,3.5,1,1,0,1,0,0,0,1,0,1,0,1,120,5,1,1,1,1,0,0,9409,912673,88529280,1,0,12.25,42.875,150.0625,1,0,0,0,0,0,0,1,0 24 | "37",10.33,93.1,5.4,0,1,0,0,0,0,1,1,0,0,0,1,158,5,1,1,1,1,0,0,8667.6104,806954.5,75127464,1,0,29.16,157.464,850.3056,1,0,0,0,0,0,0,1,0 25 | "38",17.71,92.568,3.1,1,1,0,0,0,0,1,1,0,1,1,1,120,5,1,1,1,1,0,0,8568.835,793199.88,73424928,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0 26 | 
"39",10.9,129.781,5,1,0,0,0,0,0,1,0,0,0,2,0,120,5,1,1,1,1,1,0,16843.107,2185915.5,283690272,1,0,25,125,625,0,0,1,0,0,0,0,1,0 27 | "47",6.71,98.82,12,0,2,1,0,0,0,0,1,0,0,1,1,185,4,1,1,1,1,0,0,9765.3926,965016.06,95362888,1,1,144,1728,20736,0,1,0,0,0,0,1,0,0 28 | "48",21.8,76.5,4.6,1,2,0,0,1,0,0,1,0,1,1,1,136,5,1,1,1,1,0,0,5852.25,447697.13,34248832,1,0,21.16,97.335999,447.74561,0,1,0,0,0,0,0,1,0 29 | "49",20.24,17,1.3,1,1,0,0,0,1,0,1,0,0,2,0,119,5,0,0,0,0,0,0,289,4913,83521,0,0,1.6900001,2.197,2.8561001,0,0,1,0,0,0,0,1,0 30 | "50",13.26,126.248,3.1,1,1,0,0,0,1,0,1,0,0,1,1,109,5,1,1,1,1,1,0,15938.558,2012211,254037616,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0 31 | "53",6.49,159,7.3,1,2,0,0,0,1,0,1,0,0,0,1,122,4,1,1,1,1,1,1,25281,4019679,639128960,1,1,53.290001,389.017,2839.8242,1,0,0,0,0,0,1,0,0 32 | "55",24.01,96,3.1,1,0,1,0,0,0,0,1,0,0,2,0,119,5,1,1,1,1,0,0,9216,884736,84934656,1,0,9.6099997,29.791,92.352097,0,0,1,0,0,0,0,1,0 33 | "57",33.89,25.575,1.4,1,1,0,0,0,0,1,1,0,0,0,1,156,5,1,0,0,0,0,0,654.08063,16728.111,427821.47,0,0,1.96,2.744,3.8415999,1,0,0,0,0,0,0,1,0 34 | "58",18.85,81.804,2.6,1,1,0,0,0,0,1,1,0,0,1,1,113,5,1,1,1,1,0,0,6691.8945,547423.75,44781452,0,0,6.7600002,17.576,45.697601,0,1,0,0,0,0,0,1,0 35 | "59",6.55,65.24,10.1,0,1,1,0,0,0,0,1,0,0,0,1,182,4,1,1,1,1,0,0,4256.2578,277678.25,18115728,1,1,102.01,1030.301,10406.04,1,0,0,0,0,0,1,0,0 36 | "61",9.72,200,7.2,1,1,1,0,0,0,0,1,0,0,1,1,150,5,1,1,1,1,1,1,40000,8e+06,1.6e+09,1,1,51.84,373.24799,2687.3855,0,1,0,0,0,0,0,1,0 37 | "62",16.44,166,3.3,1,1,0,0,1,0,0,1,0,0,0,1,124,5,1,1,1,1,1,1,27556,4574296,759333120,1,0,10.89,35.937,118.5921,1,0,0,0,0,0,0,1,0 38 | "64",15.2,70,4.4,1,1,1,0,0,0,0,1,0,0,2,1,142,5,1,1,1,1,0,0,4900,343000,24010000,1,0,19.360001,85.183998,374.8096,0,0,1,0,0,0,0,1,0 39 | "65",14.04,51.5,4.9,0,2,0,0,1,0,0,1,0,0,1,1,164,5,1,1,1,1,0,0,2652.25,136590.88,7034430,1,0,24.01,117.649,576.4801,0,1,0,0,0,0,0,1,0 40 | 
"67",33.9,16.994,1.6,1,2,0,0,1,0,0,1,0,0,1,1,171,6,0,0,0,0,0,0,288.79605,4907.7998,83403.148,0,0,2.5599999,4.0960002,6.5535998,0,1,0,0,0,0,0,0,1 41 | "69",25.35,23.33,1,1,1,0,0,0,0,1,1,0,0,1,0,119,6,1,0,0,0,0,0,544.28888,12698.26,296250.41,0,0,1,1,1,0,1,0,0,0,0,0,0,1 42 | "70",9.049,199.98,7.6,1,1,0,0,0,0,1,1,0,0,0,1,189,4,1,1,1,1,1,1,39992,7997600,1599360128,1,1,57.759998,438.97601,3336.2175,1,0,0,0,0,0,1,0,0 43 | "73",12.68,143.4,7.7,1,0,0,0,1,0,0,1,0,0,0,1,161,4,1,1,1,1,1,0,20563.561,2948814.5,422860000,1,1,59.290001,456.53299,3515.3042,1,0,0,0,0,0,1,0,0 44 | "74",22.38,111.326,3.4,1,1,0,0,1,0,0,1,0,1,2,0,136,5,1,1,1,1,1,0,12393.479,1379716.4,153598304,1,0,11.56,39.304001,133.63361,0,0,1,0,0,0,0,1,0 45 | "75",5.5,129.651,7.7,0,2,0,1,0,0,0,1,0,0,1,1,163,4,1,1,1,1,1,0,16809.381,2179353.3,282555328,1,1,59.290001,456.53299,3515.3042,0,1,0,0,0,0,1,0,0 46 | "78",9.43,139.9,3.8,1,1,0,0,0,1,0,1,0,1,0,1,109,5,1,1,1,1,1,0,19572.01,2738124.3,383063584,1,0,14.44,54.872002,208.5136,1,0,0,0,0,0,0,1,0 47 | "79",14.82,189.2,3.8,1,1,0,0,1,0,0,1,0,0,0,1,133,5,1,1,1,1,1,1,35796.641,6772724.5,1281399424,1,0,14.44,54.872002,208.5136,1,0,0,0,0,0,0,1,0 48 | "84",19.59,24.976,1,0,1,0,0,0,1,0,1,0,0,1,0,126,6,1,0,0,0,0,0,623.8006,15580.043,389127.16,0,0,1,1,1,0,1,0,0,0,0,0,0,1 49 | "85",8.02,149,10,0,0,0,0,0,0,1,1,0,0,2,1,214,4,1,1,1,1,1,0,22201,3307949,492884416,1,1,100,1000,10000,0,0,1,0,0,0,1,0,0 50 | "87",13.17,65.77,5.1,1,3,0,0,0,1,0,1,0,0,1,1,109,5,1,1,1,1,0,0,4325.6929,284500.81,18711620,1,0,26.01,132.651,676.52008,0,1,0,0,0,0,0,1,0 51 | "89",16.85,13.55,1.7,0,1,0,1,0,0,0,0,0,0,2,0,137,5,0,0,0,0,0,0,183.60249,2487.814,33709.879,0,0,2.8900001,4.9130001,8.3521004,0,0,1,0,0,0,0,1,0 52 | "90",13.47,106.5,5.4,1,2,0,0,0,0,1,1,0,0,1,0,170,5,1,1,1,1,1,0,11342.25,1207949.6,128646632,1,0,29.16,157.464,850.3056,0,1,0,0,0,0,0,1,0 53 | "93",9.92,180,4.1,1,1,0,0,0,0,1,1,0,0,0,1,116,5,1,1,1,1,1,1,32400,5832000,1049760000,1,0,16.809999,68.920998,282.57611,1,0,0,0,0,0,0,1,0 54 | 
"95",13.12,100.898,8,1,1,0,0,0,0,1,1,0,0,1,1,177,4,1,1,1,1,1,0,10180.406,1027182.6,103640672,1,1,64,512,4096,0,1,0,0,0,0,1,0,0 55 | "96",21.91,21.336,1,1,1,0,0,0,1,0,1,0,1,1,1,117,6,1,0,0,0,0,0,455.22488,9712.6787,207229.7,0,0,1,1,1,0,1,0,0,0,0,0,0,1 56 | "101",18.62,108.697,3.1,1,1,0,0,0,0,1,1,0,0,1,1,120,5,1,1,1,1,1,0,11815.038,1284259.1,139595120,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0 57 | "105",17.15,89.414,3.1,1,2,0,0,0,0,1,1,0,1,1,1,135,5,1,1,1,1,0,0,7994.8633,714852.69,63917840,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0 58 | "106",24.36,96.35,3.3,1,1,0,0,1,0,0,1,0,0,1,1,128,5,1,1,1,1,0,0,9283.3223,894448.13,86180080,1,0,10.89,35.937,118.5921,0,1,0,0,0,0,0,1,0 59 | "108",12.059,123.862,8.7,0,0,1,0,0,0,0,1,0,0,0,1,182,4,1,1,1,1,1,0,15341.795,1900265.4,235370672,1,1,75.690002,658.50299,5728.9761,1,0,0,0,0,0,1,0,0 60 | "109",13.27,118.955,3.2,1,2,0,0,0,1,0,1,0,0,0,1,148,5,1,1,1,1,1,0,14150.292,1683248,200230768,1,0,10.24,32.768002,104.8576,1,0,0,0,0,0,0,1,0 61 | "111",15.319,125,2.4,1,1,0,0,0,0,1,1,0,0,0,1,135,5,1,1,1,1,1,0,15625,1953125,244140624,0,0,5.7600002,13.824,33.177601,1,0,0,0,0,0,0,1,0 62 | "112",17.34,112.601,3.2,1,1,0,0,1,0,0,1,0,0,1,1,127,5,1,1,1,1,1,0,12678.985,1427666.4,160756672,1,0,10.24,32.768002,104.8576,0,1,0,0,0,0,0,1,0 63 | "115",21.06,18.55,1,1,1,0,0,0,0,1,1,0,0,2,1,120,5,0,0,0,0,0,0,344.10251,6383.1016,118406.53,0,0,1,1,1,0,0,1,0,0,0,0,1,0 64 | "116",9.3,99.6,7.6,1,2,0,0,0,1,0,1,1,0,1,1,137,4,1,1,1,1,0,0,9920.1602,988047.94,98409576,1,1,57.759998,438.97601,3336.2175,0,1,0,0,0,0,1,0,0 65 | "117",9.62,56.979,7.4,1,1,0,0,0,1,0,1,0,0,1,1,137,4,1,1,1,1,0,0,3246.6064,184988.39,10540453,1,1,54.759998,405.224,2998.6577,0,1,0,0,0,0,1,0,0 66 | "118",10.889,189.3,3,1,0,0,0,0,0,1,1,0,0,2,1,121,5,1,1,1,1,1,1,35834.488,6783469,1284110720,1,0,9,27,81,0,0,1,0,0,0,0,1,0 67 | "119",18.21,78.95,3.2,1,0,0,0,0,0,1,1,0,0,2,0,122,5,1,1,1,1,0,0,6233.1025,492103.44,38851568,1,0,10.24,32.768002,104.8576,0,0,1,0,0,0,0,1,0 68 | 
"122",9.87,181,4.8,1,2,0,0,0,1,0,1,1,0,0,0,139,5,1,1,1,1,1,1,32761,5929741,1073283136,1,0,23.040001,110.592,530.84161,1,0,0,0,0,0,0,1,0 69 | "123",11.915,111.35,4.1,1,1,0,0,0,1,0,1,0,0,1,1,128,5,1,1,1,1,1,0,12398.822,1380608.9,153730800,1,0,16.809999,68.920998,282.57611,0,1,0,0,0,0,0,1,0 70 | "124",6.268,183,4.6,1,2,0,1,0,0,0,1,0,0,0,1,119,5,1,1,1,1,1,1,33489,6128487,1121513088,1,0,21.16,97.335999,447.74561,1,0,0,0,0,0,0,1,0 71 | "127",23.79,24.422,1.2,1,1,0,0,0,0,1,1,0,0,1,1,135,5,1,0,0,0,0,0,596.43408,14566.113,355733.63,0,0,1.4400001,1.728,2.0736001,0,1,0,0,0,0,0,1,0 72 | "128",12.02,89.498,3.2,1,1,0,0,0,1,0,1,0,0,1,1,125,5,1,1,1,1,0,0,8009.8921,716869.31,64158368,1,0,10.24,32.768002,104.8576,0,1,0,0,0,0,0,1,0 73 | "129",9.74,185,9,0,4,1,0,0,0,0,1,0,0,0,1,196,4,1,1,1,1,1,1,34225,6331625,1171350656,1,1,81,729,6561,1,0,0,0,0,0,1,0,0 74 | "130",5.109,105.098,7.4,1,0,0,1,0,0,0,1,0,0,1,0,149,4,1,1,1,1,1,0,11045.59,1160869.4,122005048,1,1,54.759998,405.224,2998.6577,0,1,0,0,0,0,1,0,0 75 | "133",16.38,87.317,5.1,0,1,1,0,0,0,0,1,0,0,2,1,159,5,1,1,1,1,0,0,7624.2583,665727.38,58129316,1,0,26.01,132.651,676.52008,0,0,1,0,0,0,0,1,0 76 | "134",13.26,173.136,7.8,0,1,0,0,0,0,1,1,0,1,0,1,204,4,1,1,1,1,1,1,29976.074,5189937.5,898565056,1,1,60.84,474.552,3701.5056,1,0,0,0,0,0,1,0,0 77 | "135",26.83,50,3,1,1,1,0,0,0,0,1,0,1,1,1,124,5,1,1,1,1,0,0,2500,125000,6250000,1,0,9,27,81,0,1,0,0,0,0,0,1,0 78 | "142",6.91,149.8,8.9,1,2,0,1,0,0,0,1,0,1,0,0,159,4,1,1,1,1,1,0,22440.039,3361518,503555392,1,1,79.209999,704.96899,6274.2241,1,0,0,0,0,0,1,0,0 79 | "146",15.4,87.385,4.3,1,2,0,0,0,0,1,1,0,0,1,1,116,5,1,1,1,1,0,0,7636.1382,667283.94,58310608,1,0,18.49,79.507004,341.8801,0,1,0,0,0,0,0,1,0 80 | "149",21.07,26.086,1,0,1,0,0,0,1,0,1,0,0,1,1,121,6,1,0,0,0,0,0,680.47937,17750.986,463052.22,0,0,1,1,1,0,1,0,0,0,0,0,0,1 81 | "151",15.93,106.02,4.3,1,1,0,0,1,0,0,1,0,0,2,1,133,5,1,1,1,1,1,0,11240.24,1191690.3,126343008,1,0,18.49,79.507004,341.8801,0,0,1,0,0,0,0,1,0 82 | 
"155",12.75,96,8.1,0,2,1,0,0,0,0,1,0,0,1,1,194,4,1,1,1,1,0,0,9216,884736,84934656,1,1,65.610001,531.44098,4304.6719,0,1,0,0,0,0,1,0,0 83 | "156",13.42,102.648,4.6,1,2,0,0,0,1,0,1,0,1,1,1,121,5,1,1,1,1,1,0,10536.612,1081562.1,111020192,1,0,21.16,97.335999,447.74561,0,1,0,0,0,0,0,1,0 84 | "157",1.2,158.2,17.6,0,0,0,0,0,0,1,1,0,0,2,0,216,3,1,1,1,1,1,1,25027.24,3959309.3,626362752,1,1,309.76001,5451.7759,95951.258,0,0,1,0,0,1,0,0,0 85 | "162",19.49,102.943,3.7,1,1,0,0,0,0,1,0,0,0,2,0,125,5,1,1,1,1,1,0,10597.262,1090913.9,112301944,1,0,13.69,50.653,187.41611,0,0,1,0,0,0,0,1,0 86 | "163",13.249,94.41,3.1,1,1,0,0,0,1,0,1,0,0,1,1,128,5,1,1,1,1,0,0,8913.248,841499.75,79445992,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0 87 | "164",34.149,20.217,1.2,1,1,0,0,0,0,1,1,0,1,1,1,119,6,1,0,0,0,0,0,408.72708,8263.2354,167057.83,0,0,1.4400001,1.728,2.0736001,0,1,0,0,0,0,0,0,1 88 | "167",25.78,33.235,2.7,1,1,1,0,0,0,0,1,0,0,0,1,124,5,1,1,0,0,0,0,1104.5652,36710.227,1220064.4,0,0,7.29,19.683001,53.1441,1,0,0,0,0,0,0,1,0 89 | "168",3.94,159,8.7,1,2,0,0,0,0,1,0,1,0,2,0,177,4,1,1,1,1,1,1,25281,4019679,639128960,1,1,75.690002,658.50299,5728.9761,0,0,1,0,0,0,1,0,0 90 | "174",17.46,119.95,4.2,1,1,0,0,0,0,1,1,0,0,1,1,139,5,1,1,1,1,1,0,14388.003,1725840.9,207014608,1,0,17.639999,74.087997,311.16959,0,1,0,0,0,0,0,1,0 91 | "176",29.499,18.238,1.1,1,1,0,0,1,0,0,1,0,1,1,0,108,6,0,0,0,0,0,0,332.62463,6066.4082,110639.16,0,0,1.21,1.331,1.4641,0,1,0,0,0,0,0,0,1 92 | "177",13.91,55.6,3.7,1,1,0,0,0,0,1,1,0,0,0,1,116,5,1,1,1,1,0,0,3091.3601,171879.61,9556507,1,0,13.69,50.653,187.41611,1,0,0,0,0,0,0,1,0 93 | "179",19.67,77,3.7,1,1,0,0,0,0,1,0,1,0,0,1,136,5,1,1,1,1,0,0,5929,456533,35153040,1,0,13.69,50.653,187.41611,1,0,0,0,0,0,0,1,0 94 | "189",8.5,41.326,5.4,1,1,0,1,0,0,0,1,0,0,0,1,149,5,1,1,1,0,0,0,1707.8383,70578.125,2916711.5,1,0,29.16,157.464,850.3056,1,0,0,0,0,0,0,1,0 95 | 
"190",21.73,53.71,7.7,0,0,0,0,1,0,0,1,0,1,0,1,235,4,1,1,1,1,0,0,2884.7642,154940.69,8321864,1,1,59.290001,456.53299,3515.3042,1,0,0,0,0,0,1,0,0 96 | "191",17.52,49.308,2.8,1,1,1,0,0,0,0,1,0,0,0,1,119,5,1,1,1,0,0,0,2431.2788,119881.5,5911117,0,0,7.8400002,21.952,61.465599,1,0,0,0,0,0,0,1,0 97 | "193",16.85,150,3.4,1,1,1,0,0,0,0,0,0,0,2,1,124,5,1,1,1,1,1,1,22500,3375000,506249984,1,0,11.56,39.304001,133.63361,0,0,1,0,0,0,0,1,0 98 | "194",9.72,123,4.2,1,1,0,0,0,1,0,1,0,0,0,1,109,5,1,1,1,1,1,0,15129,1860867,228886640,1,0,17.639999,74.087997,311.16959,1,0,0,0,0,0,0,1,0 99 | "197",27.68,20.489,1,1,1,1,0,0,0,0,1,0,1,1,0,127,5,1,0,0,0,0,0,419.79913,8601.2646,176231.3,0,0,1,1,1,0,1,0,0,0,0,0,1,0 100 | "198",9.49,175.1,7.8,1,2,1,0,0,0,0,1,1,0,0,1,146,4,1,1,1,1,1,1,30660.01,5368568,940036224,1,1,60.84,474.552,3701.5056,1,0,0,0,0,0,1,0,0 101 | -------------------------------------------------------------------------------- /Group Data Challenge 2025/data_challenge.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Group Data Challenge 2025/data_challenge.pdf -------------------------------------------------------------------------------- /Group Data Challenge 2025/orange_juice.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Wholesale Manager" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "**Author:**\n", 15 | "[Anthony Strittmatter](http://www.anthonystrittmatter.com)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "You manage a wholesale store. The data file juice.csv contains orange juice sales (sales) and prices (price) of different grocery stores that you deliver. 
Your product range contains three different orange juice brands: Tropicana, Minute Maid, and Dominicks. Some stores advertise/feature specific orange juice brands, which is indicated by the dummy variable feat. The data contains also the store ID (id). You deliver new grocery stores. The new stores sent you the file new grocery.csv, which\n", 23 | "contains the planned prices and advertisements for the different brands. Your job as wholesale manager is to predict the sales of the new grocery stores and deliver the right amount of orange juice." 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Load Packages and Data" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "[1] \"Packages and data successfully loaded.\"\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "######################## Load Packages and Data ########################\n", 48 | "\n", 49 | "# Load packages\n", 50 | "library(rpart)\n", 51 | "library(rpart.plot)\n", 52 | "library(grf)\n", 53 | "library(glmnet)\n", 54 | "\n", 55 | "# Load data\n", 56 | "juice <- read.csv(\"juice.csv\", sep = \",\")\n", 57 | "new_grocery <- read.csv(\"new_grocery.csv\", sep = \",\")\n", 58 | "\n", 59 | "print('Packages and data successfully loaded.')\n", 60 | "\n", 61 | "#############################################################################" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "## Inspect Data" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 2, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/html": [ 79 | "\n", 80 | "\n", 81 | "\n", 82 | "\t\n", 83 | "\t\n", 84 | "\t\n", 85 | "\t\n", 86 | "\t\n", 87 | "\t\n", 88 | "\n", 89 | "
Xidsalespricebrandfeat
1 1140 11970 2.47 minute.maid0
3 7182 30205 1.57 dominicks 1
4 1741 3521 2.55 minute.maid0
5 1725 11777 1.41 dominicks 0
6 7565 129151 2.05 minute.maid1
8 5617 7104 3.74 tropicana 0
\n" 90 | ], 91 | "text/latex": [ 92 | "\\begin{tabular}{r|llllll}\n", 93 | " X & id & sales & price & brand & feat\\\\\n", 94 | "\\hline\n", 95 | "\t 1 & 1140 & 11970 & 2.47 & minute.maid & 0 \\\\\n", 96 | "\t 3 & 7182 & 30205 & 1.57 & dominicks & 1 \\\\\n", 97 | "\t 4 & 1741 & 3521 & 2.55 & minute.maid & 0 \\\\\n", 98 | "\t 5 & 1725 & 11777 & 1.41 & dominicks & 0 \\\\\n", 99 | "\t 6 & 7565 & 129151 & 2.05 & minute.maid & 1 \\\\\n", 100 | "\t 8 & 5617 & 7104 & 3.74 & tropicana & 0 \\\\\n", 101 | "\\end{tabular}\n" 102 | ], 103 | "text/markdown": [ 104 | "\n", 105 | "| X | id | sales | price | brand | feat |\n", 106 | "|---|---|---|---|---|---|\n", 107 | "| 1 | 1140 | 11970 | 2.47 | minute.maid | 0 |\n", 108 | "| 3 | 7182 | 30205 | 1.57 | dominicks | 1 |\n", 109 | "| 4 | 1741 | 3521 | 2.55 | minute.maid | 0 |\n", 110 | "| 5 | 1725 | 11777 | 1.41 | dominicks | 0 |\n", 111 | "| 6 | 7565 | 129151 | 2.05 | minute.maid | 1 |\n", 112 | "| 8 | 5617 | 7104 | 3.74 | tropicana | 0 |\n", 113 | "\n" 114 | ], 115 | "text/plain": [ 116 | " X id sales price brand feat\n", 117 | "1 1 1140 11970 2.47 minute.maid 0 \n", 118 | "2 3 7182 30205 1.57 dominicks 1 \n", 119 | "3 4 1741 3521 2.55 minute.maid 0 \n", 120 | "4 5 1725 11777 1.41 dominicks 0 \n", 121 | "5 6 7565 129151 2.05 minute.maid 1 \n", 122 | "6 8 5617 7104 3.74 tropicana 0 " 123 | ] 124 | }, 125 | "metadata": {}, 126 | "output_type": "display_data" 127 | }, 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "[1] \"Old data: 9685 observations\"\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "######################## Describe Old Data ########################\n", 138 | "\n", 139 | "# Print first few rows of old data\n", 140 | "head(juice)\n", 141 | "\n", 142 | "# Number of observations\n", 143 | "print(paste0('Old data: ',nrow(juice),' observations'))\n", 144 | "\n", 145 | "######################################################################" 146 | ] 147 | }, 148 | { 149 | 
"cell_type": "code", 150 | "execution_count": 3, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/html": [ 156 | "\n", 157 | "\n", 158 | "\n", 159 | "\t\n", 160 | "\t\n", 161 | "\t\n", 162 | "\t\n", 163 | "\t\n", 164 | "\t\n", 165 | "\n", 166 | "
Xidpricebrandfeat
2 10171 1.81 dominicks 1
7 7489 NA tropicana 0
10 7559 3.29 tropicana 0
11 1236 1.77 minute.maid1
16 5361 1.53 dominicks 0
17 108 1.42 dominicks 0
\n" 167 | ], 168 | "text/latex": [ 169 | "\\begin{tabular}{r|lllll}\n", 170 | " X & id & price & brand & feat\\\\\n", 171 | "\\hline\n", 172 | "\t 2 & 10171 & 1.81 & dominicks & 1 \\\\\n", 173 | "\t 7 & 7489 & NA & tropicana & 0 \\\\\n", 174 | "\t 10 & 7559 & 3.29 & tropicana & 0 \\\\\n", 175 | "\t 11 & 1236 & 1.77 & minute.maid & 1 \\\\\n", 176 | "\t 16 & 5361 & 1.53 & dominicks & 0 \\\\\n", 177 | "\t 17 & 108 & 1.42 & dominicks & 0 \\\\\n", 178 | "\\end{tabular}\n" 179 | ], 180 | "text/markdown": [ 181 | "\n", 182 | "| X | id | price | brand | feat |\n", 183 | "|---|---|---|---|---|\n", 184 | "| 2 | 10171 | 1.81 | dominicks | 1 |\n", 185 | "| 7 | 7489 | NA | tropicana | 0 |\n", 186 | "| 10 | 7559 | 3.29 | tropicana | 0 |\n", 187 | "| 11 | 1236 | 1.77 | minute.maid | 1 |\n", 188 | "| 16 | 5361 | 1.53 | dominicks | 0 |\n", 189 | "| 17 | 108 | 1.42 | dominicks | 0 |\n", 190 | "\n" 191 | ], 192 | "text/plain": [ 193 | " X id price brand feat\n", 194 | "1 2 10171 1.81 dominicks 1 \n", 195 | "2 7 7489 NA tropicana 0 \n", 196 | "3 10 7559 3.29 tropicana 0 \n", 197 | "4 11 1236 1.77 minute.maid 1 \n", 198 | "5 16 5361 1.53 dominicks 0 \n", 199 | "6 17 108 1.42 dominicks 0 " 200 | ] 201 | }, 202 | "metadata": {}, 203 | "output_type": "display_data" 204 | }, 205 | { 206 | "name": "stdout", 207 | "output_type": "stream", 208 | "text": [ 209 | "[1] \"New data: 3262 observations\"\n" 210 | ] 211 | } 212 | ], 213 | "source": [ 214 | "######################## Describe Old Data ########################\n", 215 | "\n", 216 | "# Print first few rows of new data\n", 217 | "head(new_grocery)\n", 218 | "\n", 219 | "# Number of observations\n", 220 | "print(paste0('New data: ',nrow(new_grocery),' observations'))\n", 221 | "\n", 222 | "######################################################################" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "## Prepare Data" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | 
"execution_count": 4, 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "data": { 239 | "text/plain": [ 240 | " sales price missing minute.maid \n", 241 | " Min. : 63 Min. :0.000 Min. :0.00000 Min. :0.0000 \n", 242 | " 1st Qu.: 4800 1st Qu.:1.710 1st Qu.:0.00000 1st Qu.:0.0000 \n", 243 | " Median : 8256 Median :2.120 Median :0.00000 Median :0.0000 \n", 244 | " Mean : 17023 Mean :2.174 Mean :0.04801 Mean :0.3284 \n", 245 | " 3rd Qu.: 16896 3rd Qu.:2.720 3rd Qu.:0.00000 3rd Qu.:1.0000 \n", 246 | " Max. :716415 Max. :4.170 Max. :1.00000 Max. :1.0000 \n", 247 | " dominicks tropicana featured \n", 248 | " Min. :0.0000 Min. :0.000 Min. :0.0000 \n", 249 | " 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 \n", 250 | " Median :0.0000 Median :0.000 Median :0.0000 \n", 251 | " Mean :0.3405 Mean :0.331 Mean :0.2355 \n", 252 | " 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:0.0000 \n", 253 | " Max. :1.0000 Max. :1.000 Max. :1.0000 " 254 | ] 255 | }, 256 | "metadata": {}, 257 | "output_type": "display_data" 258 | }, 259 | { 260 | "name": "stdout", 261 | "output_type": "stream", 262 | "text": [ 263 | "[1] \"Data is prepared.\"\n" 264 | ] 265 | } 266 | ], 267 | "source": [ 268 | "######################## Data Preparation ########################\n", 269 | "\n", 270 | "# Generate dummy for missing prices\n", 271 | "missing <- (is.na(juice$price) == TRUE)\n", 272 | "new_missing <- (is.na(new_grocery$price) == TRUE)\n", 273 | "\n", 274 | "# Replace missing prices with zero\n", 275 | "juice$price[is.na(juice$price)] <-0\n", 276 | "new_grocery$price[is.na(new_grocery$price)] <-0\n", 277 | "\n", 278 | "# Generate Dummies for Brands\n", 279 | "brand_1 <- (juice$brand == \"minute.maid\")\n", 280 | "brand_2 <- (juice$brand == \"dominicks\")\n", 281 | "brand_3 <- (juice$brand == \"tropicana\")\n", 282 | "\n", 283 | "new_brand_1 <- (new_grocery$brand == \"minute.maid\")\n", 284 | "new_brand_2 <- (new_grocery$brand == \"dominicks\")\n", 285 | "new_brand_3 <- (new_grocery$brand == \"tropicana\")\n", 
286 | "\n", 287 | "# Generate outcome and control variables\n", 288 | "y <- as.matrix(juice$sales)\n", 289 | "colnames(y) <- c(\"sales\")\n", 290 | "\n", 291 | "x <- as.matrix(cbind(juice$price, missing, brand_1, brand_2, brand_3, juice$feat))\n", 292 | "colnames(x) <- c(\"price\", \"missing\", \"minute.maid\", \"dominicks\", \"tropicana\", \"featured\")\n", 293 | "\n", 294 | "new_x <- as.matrix(cbind(new_grocery$price, new_missing, new_brand_1, new_brand_2, new_brand_3, new_grocery$feat))\n", 295 | "colnames(new_x) <- c(\"price\", \"missing\", \"minute.maid\", \"dominicks\", \"tropicana\", \"featured\")\n", 296 | "\n", 297 | "# Descriptive statistics\n", 298 | "summary(cbind(y,x))\n", 299 | "\n", 300 | "print('Data is prepared.')\n", 301 | "\n", 302 | "#############################################################################" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "**$\\Rightarrow$ It is possible to add non-linear and interaction terms.**" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "## Generate Training and Test Sample" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 5, 322 | "metadata": {}, 323 | "outputs": [ 324 | { 325 | "name": "stdout", 326 | "output_type": "stream", 327 | "text": [ 328 | "[1] \"Training and test samples created.\"\n" 329 | ] 330 | } 331 | ], 332 | "source": [ 333 | "######################## Training and Test Samples ########################\n", 334 | "\n", 335 | "set.seed(???)\n", 336 | "\n", 337 | "# Generate variable with the rows in training data\n", 338 | "\n", 339 | "\n", 340 | "print('Training and test samples created.')\n", 341 | "\n", 342 | "#############################################################################" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "## Predict Orange Juice Prices in Training Sample and Assess 
Model in Test Sample" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 6, 355 | "metadata": {}, 356 | "outputs": [ 357 | { 358 | "name": "stdout", 359 | "output_type": "stream", 360 | "text": [ 361 | "[1] \"R-squared Penalized Regression: 0.278\"\n" 362 | ] 363 | } 364 | ], 365 | "source": [ 366 | "######################## LASSO, Ridge, Elastic Net ##############################\n", 367 | "\n", 368 | "set.seed(???)\n", 369 | "penalized.cv <- ???\n", 370 | "\n", 371 | "\n", 372 | "# Fitted values\n", 373 | "pred_penalized <- ???\n", 374 | "\n", 375 | "# Calculate the MSE\n", 376 | "MSE_penalized <- mean((y[-training_set] - pred_penalized[-training_set])^2)\n", 377 | "R2_penalized <- round(1- MSE_penalized/var(y[-training_set]), digits = 3)\n", 378 | "\n", 379 | "print(paste0(\"R-squared Penalized Regression: \", R2_penalized))\n", 380 | " \n", 381 | "################################################################" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 11, 387 | "metadata": {}, 388 | "outputs": [ 389 | { 390 | "name": "stdout", 391 | "output_type": "stream", 392 | "text": [ 393 | "[1] \"R-squared Tree: 0.365\"\n" 394 | ] 395 | } 396 | ], 397 | "source": [ 398 | "###################### Regression Tree #######################\n", 399 | "\n", 400 | "set.seed(???)\n", 401 | "# Prepare data for tree estimator\n", 402 | "outcome <- y[training_set]\n", 403 | "tree_data <- data.frame(outcome, x[training_set,])\n", 404 | "\n", 405 | "deep_tree <- ???\n", 406 | "\n", 407 | "# Optimal tree size\n", 408 | "op.index <- ???\n", 409 | "\n", 410 | "## Select the Tree that Minimises CV-MSE\n", 411 | "cp.vals <- ???\n", 412 | "\n", 413 | "# Prune the deep tree\n", 414 | "pruned_tree <- ???\n", 415 | "\n", 416 | "## Plot tree structure\n", 417 | "#rpart.plot(pruned_tree,digits=3)\n", 418 | "\n", 419 | "# Fitted values\n", 420 | "predtree <- ???\n", 421 | "\n", 422 | "# Calculate the MSE\n", 423 | "MSEtree <- 
mean((y[-training_set] - predtree[-training_set])^2)\n", 424 | "R2tree <- round(1- MSEtree/var(y[-training_set]), digits = 3)\n", 425 | "\n", 426 | "print(paste0(\"R-squared Tree: \", R2tree))\n", 427 | "\n", 428 | "################################################################" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 8, 434 | "metadata": {}, 435 | "outputs": [ 436 | { 437 | "name": "stdout", 438 | "output_type": "stream", 439 | "text": [ 440 | "[1] \"R-squared Forest: 0.411\"\n" 441 | ] 442 | } 443 | ], 444 | "source": [ 445 | "######################## Random Forest #######################\n", 446 | "\n", 447 | "set.seed(???)\n", 448 | "\n", 449 | "rep <- ??? # number of trees\n", 450 | "cov <- ??? # share of covariates\n", 451 | "frac <- ??? # fraction of subsample\n", 452 | "min_obs <- ??? # max. size of terminal leaves in trees\n", 453 | "\n", 454 | "# Build Forest\n", 455 | "forest <- ???\n", 456 | "\n", 457 | "# Fitted values\n", 458 | "predforest <- ???\n", 459 | "\n", 460 | "# Calculate MSE\n", 461 | "MSEforest <- mean((y[-training_set] - predforest[-training_set])^2)\n", 462 | "R2forest <- round(1- MSEforest/var(y[-training_set]), digits = 3)\n", 463 | "\n", 464 | "print(paste0(\"R-squared Forest: \", R2forest))\n", 465 | "\n", 466 | "################################################################" 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "metadata": {}, 472 | "source": [ 473 | "## Select Favorite Model and Extrapolate to New Data" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 9, 479 | "metadata": {}, 480 | "outputs": [ 481 | { 482 | "name": "stdout", 483 | "output_type": "stream", 484 | "text": [ 485 | "[1] \"Out-of-sample sales are predicted.\"\n" 486 | ] 487 | } 488 | ], 489 | "source": [ 490 | "######################## Out-of-Sample Prediction #######################\n", 491 | "\n", 492 | "# Fitted values\n", 493 | "new_prediction <- ???\n", 494 | "\n", 495 | 
"print('Out-of-sample sales are predicted.')\n", 496 | "\n", 497 | "###########################################################################" 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": {}, 503 | "source": [ 504 | "## Store Out-of-Sample Predictions" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 10, 510 | "metadata": {}, 511 | "outputs": [ 512 | { 513 | "name": "stdout", 514 | "output_type": "stream", 515 | "text": [ 516 | "[1] \"File is stored.\"\n", 517 | "[1] \"Send your results to anthony.strittmatter@unibas.ch\"\n" 518 | ] 519 | } 520 | ], 521 | "source": [ 522 | "######################## Store Results #######################\n", 523 | "\n", 524 | "id_new <- as.matrix(new_grocery$id)\n", 525 | "\n", 526 | "# Replace ??? with your group name\n", 527 | "write.csv(cbind(id_new,new_prediction),\"???.csv\")\n", 528 | "\n", 529 | "print('File is stored.')\n", 530 | "print('Send your results to anthony.strittmatter@unibas.ch')\n", 531 | "\n", 532 | "################################################################" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [] 541 | } 542 | ], 543 | "metadata": { 544 | "kernelspec": { 545 | "display_name": "R", 546 | "language": "R", 547 | "name": "ir" 548 | }, 549 | "language_info": { 550 | "codemirror_mode": "r", 551 | "file_extension": ".r", 552 | "mimetype": "text/x-r-source", 553 | "name": "R", 554 | "pygments_lexer": "r", 555 | "version": "3.6.1" 556 | } 557 | }, 558 | "nbformat": 4, 559 | "nbformat_minor": 4 560 | } 561 | -------------------------------------------------------------------------------- /Group Data Challenge 2025/orange_juice.r: -------------------------------------------------------------------------------- 1 | ######################## Load Packages and Data ######################## 2 | 3 | # Load packages 4 | library(rpart) 5 | library(rpart.plot) 6 | 
library(grf) 7 | library(glmnet) 8 | 9 | # Load data 10 | juice <- read.csv("juice.csv", sep = ",") 11 | new_grocery <- read.csv("new_grocery.csv", sep = ",") 12 | 13 | print('Packages and data successfully loaded.') 14 | 15 | ############################################################################# 16 | 17 | ######################## Describe Old Data ######################## 18 | 19 | # Print first few rows of old data 20 | head(juice) 21 | 22 | # Number of observations 23 | print(paste0('Old data: ',nrow(juice),' observations')) 24 | 25 | ###################################################################### 26 | 27 | ######################## Describe Old Data ######################## 28 | 29 | # Print first few rows of new data 30 | head(new_grocery) 31 | 32 | # Number of observations 33 | print(paste0('New data: ',nrow(new_grocery),' observations')) 34 | 35 | ###################################################################### 36 | 37 | ######################## Data Preparation ######################## 38 | 39 | # Generate dummy for missing prices 40 | missing <- (is.na(juice$price) == TRUE) 41 | new_missing <- (is.na(new_grocery$price) == TRUE) 42 | 43 | # Replace missing prices with zero 44 | juice$price[is.na(juice$price)] <-0 45 | new_grocery$price[is.na(new_grocery$price)] <-0 46 | 47 | # Generate Dummies for Brands 48 | brand_1 <- (juice$brand == "minute.maid") 49 | brand_2 <- (juice$brand == "dominicks") 50 | brand_3 <- (juice$brand == "tropicana") 51 | 52 | new_brand_1 <- (new_grocery$brand == "minute.maid") 53 | new_brand_2 <- (new_grocery$brand == "dominicks") 54 | new_brand_3 <- (new_grocery$brand == "tropicana") 55 | 56 | # Generate outcome and control variables 57 | y <- as.matrix(juice$sales) 58 | colnames(y) <- c("sales") 59 | 60 | x <- as.matrix(cbind(juice$price, missing, brand_1, brand_2, brand_3, juice$feat)) 61 | colnames(x) <- c("price", "missing", "minute.maid", "dominicks", "tropicana", "featured") 62 | 63 | new_x <- 
as.matrix(cbind(new_grocery$price, new_missing, new_brand_1, new_brand_2, new_brand_3, new_grocery$feat)) 64 | colnames(new_x) <- c("price", "missing", "minute.maid", "dominicks", "tropicana", "featured") 65 | 66 | # Descriptive statistics 67 | summary(cbind(y,x)) 68 | 69 | print('Data is prepared.') 70 | 71 | ############################################################################# 72 | 73 | ######################## Training and Test Samples ######################## 74 | 75 | set.seed(???) 76 | 77 | # Generate variable with the rows in training data 78 | 79 | 80 | print('Training and test samples created.') 81 | 82 | ############################################################################# 83 | 84 | ######################## LASSO, Ridge, Elastic Net ############################## 85 | 86 | set.seed(???) 87 | penalized.cv <- ??? 88 | 89 | 90 | # Fitted values 91 | pred_penalized <- ??? 92 | 93 | # Calculate the MSE 94 | MSE_penalized <- mean((y[-training_set] - pred_penalized[-training_set])^2) 95 | R2_penalized <- round(1- MSE_penalized/var(y[-training_set]), digits = 3) 96 | 97 | print(paste0("R-squared Penalized Regression: ", R2_penalized)) 98 | 99 | ################################################################ 100 | 101 | ###################### Regression Tree ####################### 102 | 103 | set.seed(???) 104 | # Prepare data for tree estimator 105 | outcome <- y[training_set] 106 | tree_data <- data.frame(outcome, x[training_set,]) 107 | 108 | deep_tree <- ??? 109 | 110 | # Optimal tree size 111 | op.index <- ??? 112 | 113 | ## Select the Tree that Minimises CV-MSE 114 | cp.vals <- ??? 115 | 116 | # Prune the deep tree 117 | pruned_tree <- ??? 118 | 119 | ## Plot tree structure 120 | #rpart.plot(pruned_tree,digits=3) 121 | 122 | # Fitted values 123 | predtree <- ??? 
124 | 125 | # Calculate the MSE 126 | MSEtree <- mean((y[-training_set] - predtree[-training_set])^2) 127 | R2tree <- round(1- MSEtree/var(y[-training_set]), digits = 3) 128 | 129 | print(paste0("R-squared Tree: ", R2tree)) 130 | 131 | ################################################################ 132 | 133 | ######################## Random Forest ####################### 134 | 135 | set.seed(???) 136 | 137 | rep <- ??? # number of trees 138 | cov <- ??? # share of covariates 139 | frac <- ??? # fraction of subsample 140 | min_obs <- ??? # max. size of terminal leaves in trees 141 | 142 | # Build Forest 143 | forest <- ??? 144 | 145 | # Fitted values 146 | predforest <- ??? 147 | 148 | # Calculate MSE 149 | MSEforest <- mean((y[-training_set] - predforest[-training_set])^2) 150 | R2forest <- round(1- MSEforest/var(y[-training_set]), digits = 3) 151 | 152 | print(paste0("R-squared Forest: ", R2forest)) 153 | 154 | ################################################################ 155 | 156 | ######################## Out-of-Sample Prediction ####################### 157 | 158 | # Fitted values 159 | new_prediction <- ??? 160 | 161 | print('Out-of-sample sales are predicted.') 162 | 163 | ########################################################################### 164 | 165 | ######################## Store Results ####################### 166 | 167 | id_new <- as.matrix(new_grocery$id) 168 | 169 | # Replace ??? 
with your group name 170 | write.csv(cbind(id_new,new_prediction),"???.csv") 171 | 172 | print('File is stored.') 173 | print('Send your results to anthony.strittmatter@unibas.ch') 174 | 175 | ################################################################ 176 | 177 | 178 | -------------------------------------------------------------------------------- /Individual Home Assignment 2025/grading_grid.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Individual Home Assignment 2025/grading_grid.pdf -------------------------------------------------------------------------------- /Individual Home Assignment 2025/research_proposal.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Individual Home Assignment 2025/research_proposal.pdf -------------------------------------------------------------------------------- /Literature/Athey_2017.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Athey_2017.pdf -------------------------------------------------------------------------------- /Literature/Athey_et_al_2019.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Athey_et_al_2019.pdf -------------------------------------------------------------------------------- /Literature/Belloni_et_al_2012.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Belloni_et_al_2012.pdf -------------------------------------------------------------------------------- /Literature/Belloni_et_al_2014a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Belloni_et_al_2014a.pdf -------------------------------------------------------------------------------- /Literature/Belloni_et_al_2014b.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Belloni_et_al_2014b.pdf -------------------------------------------------------------------------------- /Literature/Cagala_et_al_2021.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Cagala_et_al_2021.pdf -------------------------------------------------------------------------------- /Literature/Chernozhukov_et_al_2017.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Chernozhukov_et_al_2017.pdf -------------------------------------------------------------------------------- /Literature/Chetverikov_et_al_2020.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Chetverikov_et_al_2020.pdf -------------------------------------------------------------------------------- /Literature/Google flu 
trends.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Google flu trends.pdf -------------------------------------------------------------------------------- /Literature/Mullainathan_Spiess_2017.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Mullainathan_Spiess_2017.pdf -------------------------------------------------------------------------------- /Literature/Semenova_Chernozhukov_2020.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Semenova_Chernozhukov_2020.pdf -------------------------------------------------------------------------------- /PC Lab 1/help files/glmnet_package.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 1/help files/glmnet_package.pdf -------------------------------------------------------------------------------- /PC Lab 1/penalize_regression_tutorial.r: -------------------------------------------------------------------------------- 1 | ######################## Load Packages and Data ######################## 2 | 3 | # Load packages 4 | library(glmnet) 5 | library(corrplot) 6 | 7 | # Load data 8 | load("student-mat-train.Rdata") 9 | load("student-mat-test.Rdata") 10 | 11 | # Number of observations 12 | print(paste0('Training set: ',nrow(train),' obs')) 13 | print(paste0('Test set: ',nrow(test),' obs')) 14 | 15 | ########################################################################### 16 | 17 | 
######################## Correlation analysis ######################## 18 | cor <- round(cor(train[,c(1:25)]),2) # Variable 26 is the dependent variable 19 | corrplot(cor) 20 | 21 | ######################## Estimation of the linear regression ######################## 22 | 23 | ols <- lm(G3 ~ ., data = train) 24 | summary(ols) 25 | 26 | # Calculate the MSE 27 | test$predols <- predict(ols, newdata = test) 28 | 29 | predMSEols <- mean((test$G3 - test$predols)^2) 30 | print(predMSEols) 31 | 32 | ######################################################################################## 33 | 34 | ######################## OLS model ######################## 35 | 36 | ols_small <- lm(??? , data = train) 37 | 38 | # Calculate the MSE 39 | test$predols_small <- predict(ols_small, newdata = test) 40 | 41 | predMSEols_small <- mean((test$G3 - test$predols_small)^2) 42 | print(predMSEols_small) 43 | 44 | ######################## Lasso Path ######################## 45 | 46 | # We make a plot that shows how the Lasso coefficients change with lambda 47 | # glmnet is the standard R package for Lasso, Ridge, and Elastic Net 48 | # alpha is a parameter that allows us to specify a Lasso, Ridge, or Elastic Net model 49 | # alpha = 1 for Lasso; alpha = 0 for Ridge, 0 < alpha < 1 for Elastic Net 50 | # The control variables are train[,c(1:25)] 51 | # The outcome variable is train$G3 (math grades) 52 | 53 | # Estimate a Lasso model 54 | lasso <- glmnet(as.matrix(train[,c(1:25)]), train$G3, alpha = 1) # We save the model under the name "lasso" 55 | plot(lasso, xvar = "lambda", label = TRUE) 56 | 57 | ############################################################### 58 | 59 | ######################## Cross-Validation ######################## 60 | 61 | # Set starting value for replicability 62 | set.seed(27112019) 63 | 64 | # cv.glmnet performs a cross-validation to determine the optimal lambda value 65 | # type.measure specifies the measure we use to assess the model accuracy (here MSE) 66 |
nfolds specifies the number of cross-validation folds we use (here 5) 67 | 68 | # Cross-validate the Lasso 69 | lasso.cv <- cv.glmnet(as.matrix(train[,c(1:25)]), train$G3, type.measure = "mse", nfolds = 5, alpha = 1) 70 | 71 | # Plot the MSE for the different lambda values 72 | plot(lasso.cv) 73 | 74 | ##################################################################### 75 | 76 | ######################## Optimal Lambda Value ######################## 77 | 78 | # Print the optimal lambda value 79 | print(paste0("Optimal lambda that minimizes cross-validated MSE: ", lasso.cv$lambda.min)) 80 | print(paste0("Optimal lambda using one-standard-error-rule: ", lasso.cv$lambda.1se)) 81 | 82 | ######################################################################### 83 | 84 | ######################## Lasso Coefficients ######################## 85 | 86 | # Print Lasso coefficients 87 | print(coef(lasso.cv, s = "lambda.min")) 88 | 89 | # Save for later comparison 90 | coef_lasso1 <- coef(lasso.cv, s = "lambda.min") 91 | 92 | ####################################################################### 93 | 94 | ######################## Test Sample MSE ######################## 95 | 96 | # Estimate the fitted values of the Lasso model in the test sample 97 | # We use the model "lasso.cv" and the lambda value which we estimated in the training sample 98 | # The control variables "newx" are from the test sample 99 | 100 | # Fitted values 101 | test$predlasso <- predict(lasso.cv, newx = as.matrix(test[,c(1:25)]), s = lasso.cv$lambda.min) 102 | 103 | # Calculate the MSE 104 | predMSElasso <- mean((test$G3 - test$predlasso)^2) 105 | print(paste0("MSE: ", predMSElasso)) 106 | 107 | ##################################################################### 108 | 109 | ######################## Different Starting Value ######################## 110 | 111 | # Change the starting value 112 | set.seed(27112025) # 27112024 113 | 114 | # Re-estimate the Lasso model 115 | lasso.cv <- cv.glmnet(???) 
116 | 117 | # Store the coefficients 118 | coef_lasso2 <- coef(lasso.cv, s = ???) 119 | print(cbind(coef_lasso1, coef_lasso2)) 120 | 121 | # Calculate the fitted values 122 | test$predlasso2 <- predict(lasso.cv, newx = as.matrix(test[,c(1:25)]), s = lasso.cv$lambda.min) 123 | 124 | # Correlation between the fitted values of the two Lasso models 125 | cor_fit <- cor(test$predlasso,test$predlasso2) 126 | print(paste0("Correlation between fitted values: ", cor_fit)) 127 | 128 | ######################## Ridge Path ######################## 129 | 130 | # alpha = 0 specifies a Ridge model 131 | 132 | # Estimate the Ridge 133 | ridge <- glmnet(as.matrix(train[,c(1:25)]), train$G3, alpha = ???) 134 | 135 | # Plot the path of the Ridge coefficients 136 | plot(ridge, xvar = "lambda", label = TRUE) 137 | 138 | ############################################################### 139 | 140 | ######################## Cross-Validation ######################## 141 | 142 | # Set starting value 143 | set.seed(27112019) 144 | 145 | # Cross-validate the Ridge model 146 | ridge.cv <- cv.glmnet(???) 
147 | 148 | # Plot the MSE in the cross-validation samples 149 | plot(ridge.cv) 150 | 151 | ##################################################################### 152 | 153 | ######################## Optimal Lambda Value ######################## 154 | 155 | # Print the optimal lambda value 156 | print(paste0("Optimal lambda that minimizes cross-validated MSE: ", ???)) 157 | print(paste0("Optimal lambda using one-standard-error-rule: ", ???)) 158 | 159 | ######################################################################### 160 | 161 | ######################## Ridge Coefficients ######################## 162 | 163 | # Print Ridge coefficients 164 | print(coef(ridge.cv, s = "lambda.min")) 165 | 166 | # Save for later comparison 167 | coef_ridge <- coef(ridge.cv, s = "lambda.min") 168 | 169 | ####################################################################### 170 | 171 | ######################## Test Sample MSE ######################## 172 | 173 | # Estimate fitted values in test sample 174 | test$predridge <- predict(ridge, newx = ???, s = ???) 175 | 176 | # Calculate the MSE 177 | predMSEridge <- ??? 
178 | print(paste0("MSE: ", predMSEridge)) 179 | 180 | ################################################################### 181 | 182 | ######################## Compare Lasso and Ridge Coefficients ######################## 183 | 184 | # Pick the coefficients of Dalc and Walc 185 | comp <- cbind(coef(ols)[23:24], coef_lasso1[23:24], coef_lasso2[23:24], coef_ridge[23:24]) 186 | colnames(comp) <- c("OLS", "Lasso1", "Lasso2", "Ridge") 187 | print(comp) 188 | 189 | ######################################################################################### 190 | 191 | ######################## Compare the MSE ######################## 192 | 193 | # Print the MSE of the OLS, Lasso and Ridge models 194 | print(c(predMSEols, predMSElasso, predMSEridge)) 195 | 196 | #################################################################### 197 | 198 | ######################## Compare models ######################## 199 | 200 | # Visualize the predictions (Predicted vs Actual) 201 | plot(test$G3,test$predols,xlim=c(5,20),ylim=c(4,16), col= "darkgreen", xlab = "Actual Grades", ylab = "Predicted Grades" ) 202 | par(new=TRUE) 203 | plot(test$G3,test$predlasso,xlim=c(5,20),ylim=c(4,16), col= "blue", xlab = "", ylab = "" ) 204 | par(new=TRUE) 205 | plot(test$G3,test$predridge,xlim=c(5,20),ylim=c(4,16), col= "red", xlab = "", ylab = "" ) 206 | abline(a=0,b=1) 207 | legend(16, 9, c("OLS", "Lasso", "Ridge"), col = c("darkgreen", "blue", "red"), pch = c(21, 21, 21)) 208 | 209 | #################################################################### 210 | -------------------------------------------------------------------------------- /PC Lab 1/student-mat-test.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 1/student-mat-test.Rdata -------------------------------------------------------------------------------- /PC Lab 
1/student-mat-train.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 1/student-mat-train.Rdata -------------------------------------------------------------------------------- /PC Lab 2/browser-sites.txt: -------------------------------------------------------------------------------- 1 | atdmt.com 2 | yahoo.com 3 | whenu.com 4 | weatherbug.com 5 | msn.com 6 | google.com 7 | aol.com 8 | questionmarket.com 9 | googlesyndication.com-o02 10 | casalemedia.com 11 | mywebsearch.com 12 | myspace.com 13 | pointroll.com 14 | atwola.com 15 | yieldmanager.com 16 | live.com 17 | aim.com 18 | mediaplex.com 19 | precisionclick.com 20 | tribalfusion.com 21 | insightexpressai.com 22 | trafficmp.com 23 | ebay.com 24 | realmedia.com 25 | zedo.com 26 | advertising.com 27 | microsoft.com 28 | hotbar.com 29 | adrevolver.com 30 | ru4.com 31 | 180solutions.com 32 | nextag.com 33 | accuweather.com 34 | overture.com 35 | hotmail.com 36 | passport.com 37 | my-etrust.com 38 | starware.com 39 | relevantknowledge.com 40 | myway.com 41 | partner2profit.com 42 | ditto.com 43 | kanoodle.com 44 | ebayobjects.com 45 | mcafee.com 46 | comcast.net 47 | fastclick.net 48 | adbrite.com 49 | vpptechnologies.com 50 | specificclick.net 51 | serving-sys.com 52 | weather.com 53 | adserver.com 54 | licenseacquisition.org 55 | pogo.com 56 | go.com 57 | btgrab.com 58 | bellsouth.net 59 | intellisrv.net 60 | dell.com 61 | waol.exe 62 | cnn.com 63 | facebook.com 64 | incredibarvuz1.com 65 | burstnet.com 66 | adknowledge.com 67 | funwebproducts.com 68 | belnk.com 69 | netscape.com 70 | mysearch.com 71 | real.com 72 | liveperson.net 73 | adsonar.com 74 | passport.net 75 | euroclick.com 76 | m7z.net 77 | mywebface.com 78 | kazaa.com 79 | bestoffersnetworks.com 80 | vitalstream.com 81 | tacoda.net 82 | unicast.com 83 | offeroptimizer.com 84 | bankofamerica.com 
85 | acsd.exe 86 | gator.com 87 | quickbrowsersearch.com 88 | revsci.net 89 | personalweb.com 90 | rr.com 91 | msnusers.com 92 | zango.com 93 | earthlink.net 94 | mapquest.com 95 | falkag.net 96 | freeze.com 97 | amazon.com 98 | net-offers.net 99 | shopperreports.com 100 | dellfix.com 101 | plaxo.com 102 | ysbweb.com 103 | googleadservices.com 104 | qnsr.com 105 | revenue.net 106 | adultfriendfinder.com 107 | addynamix.com 108 | seekmo.com 109 | verizon.net 110 | cox.net 111 | metricsdirect.com 112 | akamai.net 113 | admarketplace.net 114 | amazon.com-o01 115 | aolacsd.exe 116 | opinionsquare.com 117 | interclick.com 118 | peoplepc.com 119 | go.com-o04 120 | realtechnetwork.net 121 | freezecoldcash.com 122 | ask.com 123 | contextweb.com 124 | intellitxt.com 125 | yceml.net 126 | about.com 127 | youtube.com 128 | wikipedia.org 129 | surfaccuracy.com 130 | windowsmedia.com 131 | craigslist.org 132 | hackerwatch.org 133 | foxsports.com 134 | spamblockerutility.com 135 | walmart.com 136 | navexcel.com 137 | partypoker.com 138 | wellsfargo.com 139 | travelzoo.com 140 | photobucket.com 141 | viewpoint.com 142 | nielsennetpanel.com 143 | mymailstamp.com 144 | windows.com 145 | optonline.net 146 | eguard.com 147 | aolcdn.com 148 | musicmatch.com 149 | qksz.net 150 | cometsystems.com 151 | netzero.net 152 | specificmedia.com 153 | paypal.com 154 | iwon.com 155 | monster.com-o01 156 | vmn.net 157 | juno.com 158 | information.com 159 | sysupdates.com 160 | 2o7.net 161 | adwave.com 162 | need2find.com 163 | target.com 164 | ebayrtm.com 165 | match.com 166 | bridgetrack.com 167 | comcastsupport.com 168 | rs6.net 169 | screensavers.com 170 | footprint.net 171 | sportsline.com 172 | adelphia.net 173 | smileycentral.com 174 | dlqm.net 175 | careerbuilder.com 176 | mlb.com 177 | searchignite.com 178 | wachovia.com 179 | expedia.com 180 | thinktarget.com 181 | authnow.com 182 | dotomi.com 183 | blogspot.com 184 | hpdjjs.com 185 | chase.com 186 | outerinfo.com 187 | nscpcdn.com 188 | 
vonage.com 189 | searchscout.com 190 | compuserve.com 191 | lycos.com 192 | xanga.com 193 | websearch.com 194 | azjmp.com 195 | tmcs.net-o01 196 | exitexchange.com 197 | toshibapc.com 198 | runescape.com 199 | weatherstudio.com 200 | imdb.com 201 | adecn.com 202 | bargain-buddy.net 203 | carsdirect.com 204 | mspaceads.com 205 | apple.com 206 | ups.com 207 | 88.80.5.21 208 | exct.net 209 | cingular.com 210 | foodnetwork.com 211 | go.com-o03 212 | excite.com 213 | capitalone.com 214 | imiclk.com 215 | overstock.com 216 | bloglines.com 217 | compfused.com 218 | morpheus.com 219 | foxnews.com 220 | marketwatch.com 221 | wamu.com 222 | monster.com 223 | adobe.com 224 | 888.com 225 | untd.com 226 | abetterinternet.com 227 | centralmedia.ws 228 | valuead.com 229 | targetsaver.com 230 | lynxtrack.com 231 | cartoonnetwork.com 232 | netflix.com 233 | chitika.net 234 | geocities.com 235 | qsrch.com 236 | drsnsrch.com 237 | autobytel.com 238 | web-nexus.net 239 | webservicehosts.com 240 | sharewareonline.com 241 | llnwd.net 242 | instantnavigation.com 243 | nick.com 244 | nfl.com 245 | oingo.com 246 | lightningcast.net 247 | altbill.com 248 | xolox.nl 249 | superpages.com 250 | classmates.com 251 | aavalue.com 252 | bluestreak.com 253 | southwest.com 254 | whitepages.com 255 | usps.com 256 | webhancer.com 257 | bbc.co.uk 258 | true.com 259 | bearshare.com 260 | citibank.com 261 | blackplanet.com 262 | pch.com 263 | att.net 264 | autoweb.com 265 | insightexpress.com 266 | charter.net 267 | alumnigroup.org 268 | verizonwireless.com 269 | fedex.com 270 | mobilesidewalk.com 271 | netteller.com 272 | webshots.com 273 | sprint.com 274 | orbitz.com 275 | bestbuy.com 276 | grandstreetinteractive.com 277 | paypopup.com 278 | cheaptickets.com 279 | dell4me.com 280 | new.net 281 | nytimes.com 282 | nyadmcncserve-05y06a.com 283 | aoltpspd.exe 284 | toprebates.com 285 | jcpenney.com 286 | geotrust.com 287 | travelocity.com 288 | qvc.com 289 | 4at1.com 290 | cpmstar.com 291 | bizrate.com 
292 | ticketmaster.com 293 | usbank.com 294 | tripod.com 295 | buy.com 296 | nascar.com 297 | aebn.net 298 | infospace.com 299 | wxbug.com 300 | contextuads.com 301 | bns1.net 302 | download.com 303 | gocyberlink.com 304 | 192.168.1.1 305 | dvlabs.com 306 | defamer.com 307 | tracking101.com 308 | accountonline.com 309 | hbmediapro.com 310 | usatoday.com 311 | bigfishgames.com 312 | neopets.com 313 | adoutput.com 314 | sbc.com 315 | noaa.gov 316 | lowermybills.com 317 | kmpads.com 318 | directtrack.com 319 | clicksor.com 320 | legacy.com 321 | eajmp.com 322 | nastydollars.com 323 | worldofwarcraft.com 324 | mirarsearch.com 325 | verizon.com 326 | miniclip.com 327 | iwin.com 328 | peel.com 329 | hgtv.com 330 | amaena.com 331 | sprintpcs.com 332 | shopping.com 333 | webmd.com 334 | clearchannel.com 335 | winamp.com 336 | reference.com 337 | interpolls.com 338 | americangreetings.com 339 | tmcs.net 340 | midtenmedia.com 341 | domainsponsor.com 342 | thunderdownloads.com 343 | akamaistream.net 344 | livejournal.com 345 | tx.us 346 | onlinerewardcenter.com 347 | msn.com-o18 348 | sony.com 349 | dogpile.com 350 | nba.com 351 | citysearch.com 352 | connextra.com 353 | nickjr.com 354 | t-mobile.com 355 | winfixer.com 356 | adlegend.com 357 | adsrevenue.net 358 | sears.com 359 | ap.org 360 | luna.net 361 | shockwave.com 362 | hsn.com 363 | fl.us 364 | mypoints.com 365 | mozilla.org 366 | aresgalaxy.org 367 | realtor.com 368 | addictinggames.com 369 | clickbooth.com 370 | amateurmatch.com 371 | worldnow.com 372 | surveys.com 373 | pa.us 374 | arcaderockstar.com 375 | coolsavings.com 376 | yournewsletters.net 377 | liquidmedianetworks.com 378 | everythinggirl.com 379 | perfectmatch.com 380 | stockgroup.com 381 | netster.com 382 | bidclix.com 383 | dropspam.com 384 | hp.com 385 | drivecleaner.com 386 | consumerpromotioncenter.com 387 | aolwbspd.exe 388 | americanexpress.com 389 | totaltalk.com 390 | wwe.com 391 | kontera.com 392 | gamehouse.com 393 | circuitcity.com 394 | 
yimg.com 395 | lightningcast.com 396 | edgefcs.net 397 | wunderground.com 398 | realarcade.com 399 | singlesnet.com 400 | azcentral.com 401 | yellowpages.com 402 | eharmony.com 403 | paviliondownload.com 404 | insightbb.com 405 | imageshack.us 406 | shopzilla.com 407 | ca.gov 408 | donotchangeme.com 409 | ca.us 410 | sourceforge.net 411 | washingtonpost.com 412 | adjuggler.com 413 | careercast.com 414 | bangbros1.com 415 | scripps.com-o01 416 | migente.com 417 | homedepot.com 418 | winantivirus.com 419 | irs.gov 420 | blockbuster.com 421 | kodakgallery.com 422 | nih.gov 423 | aol.com-o07 424 | icq.com 425 | wordcents.com 426 | drudgereport.com 427 | quizilla.com 428 | srch-results.com 429 | inqwire.com 430 | ign.com 431 | oinadserver.com 432 | azoogleads.com 433 | incredimail.com 434 | shopathome.com 435 | mtv.com 436 | fidelity.com 437 | bullseye-network.com 438 | flash-gear.com 439 | proficient.com 440 | autotrader.com 441 | charter.com 442 | healthology.com 443 | evite.com 444 | checkm8.com 445 | rsc01.net 446 | oasei.com 447 | heavy.com 448 | slotch.com 449 | passion.com 450 | nbc.com 451 | trafficmarketplace.com 452 | univision.com 453 | priceline.com 454 | flickr.com 455 | andale.com 456 | dealtime.com 457 | yfdirect.com 458 | entrepreneur.com 459 | go.com-o01 460 | webmd.com-o01 461 | sexsearch.com 462 | pornaccess.com 463 | gcion.com 464 | shoplocal.com 465 | kliptracker.com 466 | nationalcity.com 467 | bbeplayer.com 468 | videodome.com 469 | 204.95.60.12 470 | napster.com 471 | myweather.net 472 | msnbc.com 473 | linkexchange.com 474 | searchmarketing.com 475 | angelfire.com 476 | callwave.com 477 | sonnerie.net 478 | scout.com 479 | rivals.com 480 | altnet.com 481 | spynet.com 482 | macromedia.com 483 | ed.gov 484 | wannawatch.com 485 | frontiernet.net 486 | flycell.com 487 | edgesuite.net 488 | 89.com 489 | nc.us 490 | ticketmaster.com-o01 491 | flowgo.com 492 | cnet.com 493 | oddcast.com 494 | answers.com 495 | timeinc.net 496 | m5-systems.com 497 | 
guideforyou.com 498 | rn11.com 499 | lowes.com 500 | lifescript.com 501 | shop.com 502 | errorsafe.com 503 | cams.com 504 | macys.com 505 | aa.com 506 | addictingclips.com 507 | victoriassecret.com 508 | orchardbank.com 509 | bravenet.com 510 | imesh.com 511 | nextel.com 512 | screensandthemes.com 513 | suntrust.com 514 | discovercard.com 515 | nbads.com 516 | consumerincentiverewards.com 517 | valueclick.com 518 | google.com-o03 519 | cbs.com 520 | bannerspace.com 521 | technorati.com 522 | cjt1.net 523 | exactsearch.net 524 | munky.com 525 | cs.com 526 | kohls.com 527 | tagged.com 528 | babycenter.com 529 | ebaumsworld.com 530 | userplane.com 531 | mediaplazza.com 532 | netzerovoice.com 533 | gamespot.com 534 | keen.com 535 | bebo.com 536 | rsc02.net 537 | sysupdates2.com 538 | imlive.com 539 | oldnavy.com 540 | regalinteractive.com 541 | weightwatchers.com 542 | subsag.com 543 | aol.com-o08 544 | azlyrics.com 545 | freeringtonesnow.com 546 | freewebs.com 547 | toysrus.com 548 | hollywood.com 549 | findwhat.com 550 | local.com 551 | webroot.com 552 | tvguide.com 553 | ny.us 554 | resultsmaster.com 555 | jamster.com 556 | gms1.net 557 | switchboard.com 558 | nicheseek.com 559 | intelius.com 560 | hi5.com 561 | glispa.com 562 | gannettonline.com 563 | cstv.com 564 | adengage.com 565 | superbrewards.com 566 | videocodezone.com 567 | symantecliveupdate.com 568 | pbskids.org 569 | revresda.com 570 | americansingles.com 571 | ugo.com-o02 572 | job.com 573 | installshield.com 574 | eprize.net 575 | metacafe.com 576 | focalex.com 577 | cciads.us 578 | perfectgonzo.com 579 | kbb.com 580 | reunion.com 581 | eproof.com 582 | tripadvisor.com 583 | bellsouth.com 584 | search.com 585 | comcast.com 586 | ivillage.com 587 | sun.com 588 | regionsnet.com 589 | mininova.org 590 | beliefnet.com 591 | intellicast.com 592 | fastonlineusers.com 593 | gamespot.com-o01 594 | expedia.com-o01 595 | military.com 596 | musicnet.com 597 | 53.com 598 | oh.us 599 | itrack.it 600 | 
officedepot.com 601 | adultadworld.com 602 | univision.com-o01 603 | youravon.com 604 | blackboard.com 605 | yahoo.net 606 | casinolasvegas.com 607 | warnerbros.com 608 | delta.com 609 | go.com-o02 610 | deepnetexplorer.co.uk 611 | mozilla.com 612 | opentracker.net 613 | break.com 614 | catcha10.com 615 | hotels.com 616 | hallmark.com 617 | sportsbook.com 618 | mycheckfree.com 619 | ezboard.com 620 | pro-market.net 621 | mate1.com 622 | awempire.com 623 | jigzone.com 624 | bangbrosnetwork.com 625 | marketlinx.com 626 | tickle.com 627 | bbandt.com 628 | mercuras.com 629 | adtology2.com 630 | bluemountain.com 631 | freepornofreeporn.com 632 | internet-optimizer.com 633 | autotrader.com-o01 634 | blogger.com 635 | kraftfoods.com 636 | loveaccess.com 637 | shutterfly.com 638 | stopzilla.com 639 | xmradio.com 640 | ga.us 641 | ancestry.com 642 | honda.com 643 | fulltiltpoker.com 644 | il.us 645 | ibsys.com 646 | imixserver.com 647 | barnesandnoble.com 648 | pricegrabber.com 649 | constantcontact.com 650 | zonelabs.com 651 | pimpyourpro.com 652 | netflame.cc 653 | slide.com 654 | xnxx.com 655 | upromise.com 656 | livesexbar.com 657 | videosz.com 658 | freeweblayouts.net 659 | limewire.com 660 | ameritrade.com 661 | freelaptop4you.com 662 | nickarcade.com 663 | utkn.com 664 | nj.us 665 | 360i.com 666 | finestresults.com 667 | asseenontvnetwork.com 668 | typepad.com 669 | efax.com 670 | regions.com 671 | emachines.com 672 | playaudiomessage.com 673 | bofunk.com 674 | millsberry.com 675 | cpvfeed.com 676 | allrecipes.com 677 | clubpenguin.com 678 | eversave.com 679 | ppmdating.com 680 | lexico.com 681 | usaa.com 682 | directv.com 683 | postini.com 684 | secure-banking.com 685 | eyewonder.com 686 | boston.com 687 | ibanking-services.com 688 | astrology.com 689 | datinggold.com 690 | mlxchange.com 691 | travelhook.net 692 | custhelp.com 693 | mn.us 694 | zwire.com 695 | emarketmakers.com 696 | gamefaqs.com 697 | premiumproductsonline.com 698 | chrysler.com 699 | prodigy.net 
700 | tv.com 701 | windowsmedia.com-o04 702 | smashits.com 703 | 65.115.67.11 704 | snapfish.com 705 | commerceonlinebanking.com 706 | bbt.com 707 | linksynergy.com 708 | yahoo.com-o08 709 | freecodesource.com 710 | streamate.com 711 | freecreditreport.com 712 | intuit.com 713 | rapid-pass.net 714 | artistdirect.com 715 | servedbyadbutler.com 716 | sidestep.com 717 | adult.com 718 | alltel.net 719 | bcentral.com 720 | openbank.com 721 | nichedsites.com 722 | cars.com 723 | gm.com 724 | adshuffle.com 725 | freeslots.com 726 | blink.com 727 | candystand.com 728 | monstermarketplace.com 729 | columbiahouse.com 730 | pncbank.com 731 | discovery.com 732 | hsbcbillpay.com 733 | movietickets.com 734 | page-not-found.net 735 | fandango.com 736 | providianservices.com 737 | carad.com 738 | homestead.com 739 | realcastmedia.com 740 | webratsmusic.com 741 | scottrade.com 742 | cs102175.com 743 | fnismls.com 744 | shopperssavingcenter.com 745 | hit-now.com 746 | whatismyip.com 747 | costco.com 748 | bolt.com 749 | bmgmusic.com 750 | myhealthwealthandhappiness.com 751 | symantec.com 752 | forbes.com 753 | digitalcity.com 754 | live365.com 755 | firstadsolution.com 756 | linkconnector.com 757 | freepagegraphics.com 758 | imgfarm.com 759 | insightexpresserdd.com 760 | pcsecurityshield.com 761 | allposters.com-o01 762 | msnvideo.com 763 | miva.com 764 | jackpotmadness.com 765 | mbnanetaccess.com 766 | newcarinsider.com 767 | edmunds.com 768 | net-nucleus.com 769 | popcap.com 770 | alt.com 771 | staples.com 772 | ussearch.com 773 | bankone.com 774 | rootv.com 775 | citizensbankonline.com 776 | juggcrew.com 777 | navyfcu.org 778 | nordstrom.com 779 | webstat.com 780 | inklineglobal.com 781 | seeq.com 782 | onetruemedia.com 783 | paltalk.com 784 | sonypictures.com 785 | 204.181.57.155 786 | commerceonline.com 787 | friendster.com 788 | slate.com 789 | hermoment.com 790 | lovehappens.com 791 | mi.us 792 | kmart.com 793 | paidsurveys.com 794 | 123greetings.com 795 | blinko.com 796 | 
citizensbank.com 797 | sirius.com 798 | qrs1.net 799 | adbureau.net 800 | turn.com 801 | abcdistributing.com 802 | fundsxpress.com 803 | pichunter.com 804 | cbsnews.com 805 | 216.139.222.230 806 | anywho.com 807 | sedoparking.com 808 | householdbank.com 809 | treborwear.com 810 | evault.ws 811 | vh1.com 812 | financialcontent.com 813 | gap.com 814 | active.com 815 | exclusivegiftcards.com 816 | michigan.gov 817 | dada-mobile.net 818 | textplussolutions.com 819 | myriadmarket.com 820 | ifriends.net 821 | aptimus.com 822 | valueclick.net 823 | pennyweb.com 824 | blackpeoplemeet.com 825 | eltpath.com 826 | yahoo.com-o46 827 | sysprotect.com 828 | dadamobile.com 829 | cpxinteractive.com 830 | clickspring.net 831 | staples-deals.com 832 | myyearbook.com 833 | bravenetmedianetwork.com 834 | etrade.com 835 | marykayintouch.com 836 | 64.39.16.166 837 | moregamers.com 838 | redorbit.com 839 | tmz.com 840 | blogrolling.com 841 | checkfree.com 842 | samsclub.com 843 | va.us 844 | united.com 845 | certified-safe-downloads.com 846 | aimtoday.com 847 | toseeka.com 848 | bidz.com 849 | gamespy.com 850 | nylottery.org 851 | godaddy.com 852 | rsc03.net 853 | altavista.com 854 | ltdcommodities.com 855 | bhg.com 856 | opm.gov 857 | onlinemediaoutlet.com 858 | beboframe.com 859 | cafepress.com 860 | tarot.com 861 | webgavel.com 862 | rapmls.com 863 | ztod.com 864 | marriott.com 865 | walgreens.com 866 | rovion.com 867 | ultimatebet.com 868 | ea.com 869 | petfinder.com 870 | winsoftware.com 871 | literotica.com 872 | websourcedtraffic.com 873 | 032439.com 874 | marketbanker.com 875 | clearchannelmusic.com 876 | colonize.com 877 | searchfeed.com 878 | eimg.net 879 | shermanstravel.com 880 | key.com 881 | multi-pops.com 882 | yandex.ru 883 | us.com 884 | kinghost.com 885 | sublimedirectory.com 886 | gogotools.com 887 | camcrush.com 888 | trafficexplorer.com 889 | myfamily.com 890 | gay.com 891 | freegiftworld.com 892 | dexonline.com 893 | trade-in-value.com 894 | shopyourbargain.com 895 
| dyndns.org 896 | bizrate.com-o01 897 | xctrk.com 898 | webtoolcafe.com 899 | zappos.com 900 | wi.us 901 | toptvbytes.com 902 | 157.22.32.111 903 | hotfreelayouts.com 904 | registrydefender.com 905 | zap2it.com 906 | 64.136.28.49 907 | afy11.net 908 | 207.97.212.250 909 | invisionfree.com 910 | bravenet.com-o01 911 | gadgetcity.com 912 | army.mil 913 | yourgiftcards.com 914 | craigslist.com 915 | usairways.com 916 | drivelinemedia.com 917 | edline.net 918 | dayport.com 919 | axill.com 920 | smartbargains.com 921 | newgrounds.com 922 | 216.155.193.91 923 | providian.com 924 | statcounter.com 925 | ajc.com 926 | oprah.com 927 | slingo.com 928 | continental.com 929 | relevantchoice.com 930 | toontown.com 931 | thumbplay.com 932 | jacquielawson.com 933 | hotwire.com 934 | nwa.com 935 | atomz.com 936 | nsgalleries.com 937 | uclick.com 938 | mercurial.ca 939 | schwab.com 940 | nvero.net 941 | ediets.com 942 | ichotelsgroup.com 943 | 216.133.243.28 944 | aggregateknowledge.com 945 | topix.net 946 | flalottery.com 947 | dlv4.com 948 | mybloglog.com 949 | lanxtra.com 950 | away.com 951 | grab.com 952 | tipany.com 953 | quickbooks.com 954 | instream.com 955 | pbs.org 956 | findology.com 957 | business.com 958 | cmt.com 959 | myinsiderdeals.com 960 | imagine-msn.com 961 | nhl.com 962 | modern-singles.net 963 | addfreestats.com 964 | rent.com 965 | homegain.com 966 | freeones.com 967 | jetblue.com 968 | loanweb.com 969 | findarticles.com 970 | iwon.com-o04 971 | incredigames.com 972 | webkinz.com 973 | dealerconnection.com 974 | streamaudio.com 975 | grantmedia.com 976 | home123info.com 977 | exittracking.com 978 | worldsex.com 979 | yfdmedia.com 980 | automotive.com 981 | cursormania.com 982 | tradedoubler.com 983 | bedbathandbeyond.com 984 | equifax.com 985 | hotornot.com 986 | falkag.de 987 | chicagotribune.com 988 | airtran.com 989 | thebreastcancersite.com 990 | charmingshoppes.com 991 | ugo.com 992 | cox.com 993 | spicymint.com 994 | real.com-o01 995 | targetnet.com 996 
| effectivebrand.com 997 | dallascowboys.com 998 | leadgenetwork.com 999 | in.us 1000 | vistaprint.com 1001 | -------------------------------------------------------------------------------- /PC Lab 2/help files/grf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 2/help files/grf.pdf -------------------------------------------------------------------------------- /PC Lab 2/help files/rpart.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 2/help files/rpart.pdf -------------------------------------------------------------------------------- /PC Lab 2/trees_foests_tutorial.r: -------------------------------------------------------------------------------- 1 | ######################## Load Packages and Data ######################## 2 | 3 | # Load packages 4 | library(rpart) 5 | library(rpart.plot) 6 | library(grf) 7 | library(DiagrammeR) 8 | 9 | # Load data 10 | data_2006 <-read.csv("browser_2006.csv", sep = ",") 11 | data_new <-read.csv("browser_new.csv", sep = ",") 12 | 13 | # Data preparation 14 | y_2006 <- as.matrix(data_2006[,2]) 15 | x_2006 <- as.matrix(data_2006[,c(3:ncol(data_2006))]) 16 | id_2006 <- as.matrix(data_2006[,1]) 17 | x_new <- as.matrix(data_new[,c(2:ncol(data_new))]) 18 | id_new <- as.matrix(data_new[,1]) 19 | 20 | print('Packages and data successfully loaded.') 21 | 22 | ############################################################################# 23 | 24 | ######################## Average Spending ######################## 25 | 26 | spending <- round(???, digits=2) 27 | print(paste0("In 2006, the average spending is ", spending, " US-dollars")) 28 | 29 | #################################################################### 30 | 31 | 
######################## Online Time ######################## 32 | 33 | freq <- round(x_2006[id_2006==921,x_2006[id_2006==921,] == ???], digit = 0) 34 | page <- names(freq) 35 | 36 | print(paste0("Household 921 is most of the time on the webpage ", page)) 37 | print(paste0(freq, "% of the online time is the household on this webpage")) 38 | 39 | ################################################################ 40 | 41 | ######################## Log Transformation ######################## 42 | 43 | log_y_2006 = as.matrix(???) # take logarithm 44 | 45 | # Cumulative Distribution of Spending 46 | plot(ecdf(y_2006), xlab = "Spending in US-Dollars", sub = "(Truncated at 20,000 US-Dollars)", 47 | ylab = "cdf", main = "Distribution of Spending", xlim= c(0,20000)) 48 | 49 | # Cumulative Distribution of Log Spendiung 50 | plot(ecdf(log_y_2006), xlab = "log Spending", ylab = "cdf", main = "Distribution of Log Spending") 51 | 52 | ####################################################################### 53 | 54 | ######################## Training and Test Samples ######################## 55 | 56 | set.seed(1001) 57 | # Generate variable with the rows in training data 58 | size <- floor(0.5 * nrow(data_2006)) 59 | training_set <- sample(seq_len(nrow(data_2006)), size = size) 60 | 61 | print('Training and test samples created.') 62 | 63 | ############################################################################# 64 | 65 | ######################## Shallow Tree ######################## 66 | 67 | # Prepare data for tree estimator 68 | outcome <- log_y_2006[training_set] 69 | tree_data_2006 <- data.frame(outcome, x_2006[training_set,]) 70 | 71 | # Build shallow tree 72 | set.seed(1001) 73 | shallow_tree <- rpart(formula = outcome ~., data = tree_data_2006, method = "anova", xval = 10, 74 | y = TRUE, control = rpart.control(cp = 0.00002, minbucket=150)) 75 | # Note: 'minbucket=100' imposes the restriction that each terminal leave should contain at least 100 observations. 
76 | # The algorithm 'rpart' stops growing trees when either one leave has less than 100 observations or 77 | # the MSE gain of addidng one addidtional leave is below cp=0.00002. 78 | 79 | ## Plot tree structure 80 | rpart.plot(shallow_tree,digits=3) 81 | 82 | # bizrate.com 83 | # fedex.com 84 | 85 | ################################################################ 86 | 87 | ######################## Deep Tree ######################## 88 | set.seed(1001) 89 | deep_tree <- rpart(formula = outcome ~., data = tree_data_2006, ???) 90 | 91 | print('Relative CV-MSE for different tree sizes') 92 | print(deep_tree$cptable) 93 | 94 | # Plot CV-MSE 95 | plotcp(deep_tree) 96 | 97 | ############################################################# 98 | 99 | ######################## Optimal Tree Size ######################## 100 | 101 | op.index <- which.min(deep_tree$cptable[, "xerror"]) 102 | op.size <- deep_tree$cptable[op.index, "nsplit"] +1 103 | print(paste0("Optimal number final leaves: ", op.size)) 104 | 105 | ##################################################################### 106 | 107 | ######################## Pruned Tree ######################## 108 | 109 | # Select the Tree that Minimises CV-MSE 110 | # Get cp-value that corresponds to optimal tree size 111 | cp.vals <- deep_tree$cptable[op.index, "CP"] 112 | 113 | # Prune the deep tree 114 | pruned_tree <- prune(???, cp = cp.vals) 115 | 116 | ## Plot tree structure 117 | rpart.plot(pruned_tree,digits=3) 118 | 119 | # aggregateknowledge.com 120 | 121 | ################################################################ 122 | 123 | ######################## Out-of-Sample Performance ######################## 124 | 125 | # Predict log online spending 126 | pred_tree <- predict(???, newdata= as.data.frame(x_2006)) 127 | 128 | # Test sample data 129 | outcome_test <- log_y_2006[-training_set] 130 | pred_tree_test <- pred_tree[-training_set] 131 | 132 | # R-squared 133 | MSE_tree <- mean((outcome_test-pred_tree_test)^2) 134 | 
r2_tree <- round(1- MSE_tree/var(outcome_test), digits = 3) 135 | print(paste0("Test sample R-squared: ", r2_tree)) 136 | 137 | ############################################################################## 138 | 139 | ######################## Random Forest ######################## 140 | 141 | rep <- 1000 # number of trees 142 | cov <- 1/3 # share of covariates 143 | frac <- 1/2 # fraction of subsample 144 | min_obs <- 100 # max. size of terminal leaves in trees 145 | 146 | # Build Forest 147 | set.seed(10001) 148 | forest1 <- regression_forest(x_2006[training_set,],log_y_2006[training_set,], 149 | mtry = floor(cov*ncol(x_2006)), sample.fraction = frac, num.trees = rep, 150 | min.node.size = min_obs, honesty=FALSE) 151 | 152 | print('Forest is built.') 153 | 154 | ################################################################## 155 | 156 | ######################## Plot Example Tree ######################## 157 | 158 | # Plot a tree of the forest 159 | # Just an illustration, overall the forest contains 1000 trees 160 | tree <- get_tree(???,1) # here we select tree number 1 161 | plot(tree) 162 | 163 | ##################################################################### 164 | 165 | ######################## Variable Importance ######################## 166 | 167 | # Plot the variable importantance 168 | # First we consider only first split 169 | imp1 <- variable_importance(forest1, max.depth = 1) 170 | print(cbind(colnames(x_2006[,imp1>0.02]),imp1[imp1>0.02])) 171 | 172 | # Now we consider the first four splits 173 | imp2 <- round(variable_importance(forest1, decay.exponent = 2, max.depth = 4), digits = 3) 174 | print(cbind(colnames(x_2006[,imp2>0.02]),imp2[imp2>0.02])) 175 | 176 | ######################################################################## 177 | 178 | ######################## Out-of-Sample Performance ######################## 179 | 180 | # Prediction 181 | fit <- predict(???, newdata = x_2006[-training_set,])$predictions 182 | 183 | # R-squared 184 | 
SST <- mean(((log_y_2006[-training_set,])-mean((log_y_2006[-training_set,])))^2) 185 | MSE1 <- mean(((log_y_2006[-training_set,])-fit)^2) 186 | r2_1 <- round(1- MSE1/SST, digits = 3) 187 | print(paste0("Test sample R-squared: ", r2_1)) 188 | 189 | ############################################################################# 190 | 191 | ######################## Area Under the Curve (AUC) ######################## 192 | 193 | sizes <- c(1000,500,400,300, 200, 100, 50, 40,30,20,10, 5,4,3,2,1) # Select a grid of sample sizes 194 | # Prepare matrix to store results 195 | auc <- matrix(NA, nrow = length(sizes), ncol = 3) 196 | colnames(auc) <- c("Trees", "AUC", "Marginal AUC") 197 | auc[,1] <- sizes 198 | # Sum of Squares Total 199 | SST <- mean(((log_y_2006[-training_set,])-(mean(log_y_2006[-training_set,])))^2) 200 | 201 | set.seed(10001) # set starting value 202 | for (t in sizes){ 203 | # Estimate Forests 204 | forest <- regression_forest(x_2006[training_set,],(log_y_2006[training_set,]), mtry = floor(cov*ncol(x_2006)), 205 | sample.fraction = frac, num.trees = t, min.node.size = min_obs, honesty=FALSE) 206 | fit <- predict(forest, newdata = x_2006[-training_set,])$predictions # prediction in test sample 207 | auc[auc[,1]== t,2] <- 1- mean(((log_y_2006[-training_set,])-fit)^2)/SST # store R-squared 208 | } 209 | auc[,3] <- auc[,2] - rbind(as.matrix(auc[-1,2]),auc[nrow(auc),2]) 210 | 211 | # Marginal AUC 212 | plot(auc[,1],auc[,2],type = "o",xlab="Trees", ylab= "R-squared", main = "AUC") 213 | abline(a=0,b=0, col="red") 214 | 215 | ################################################################################ 216 | 217 | ######################## Deep Forest ######################## 218 | 219 | min_obs <- 5 220 | # Build Forest 221 | forest2 <- regression_forest(x_2006[training_set,],log_y_2006[training_set,], 222 | ???) 
223 | 224 | # Prediction 225 | fit <- predict(forest2, newdata = x_2006[-training_set,])$predictions 226 | 227 | # R-squared 228 | SST <- mean((log_y_2006[-training_set,]-mean(log_y_2006[-training_set,]))^2) 229 | MSE2 <- mean((log_y_2006[-training_set,]-fit)^2) 230 | r2_2 <- round(1- MSE2/SST, digits = 3) 231 | print(cbind(r2_1,r2_2)) 232 | 233 | # Plot tree 234 | tree <- get_tree(forest2, 34) 235 | plot(tree) 236 | 237 | ############################################################### 238 | 239 | ######################## Store Prediction for Hold-out-Sample ######################## 240 | 241 | # Hold-out-Sample Prediction 242 | fit_new <- predict(???, newdata = x_new)$predictions 243 | 244 | results <- as.matrix(cbind(id_new,fit_new)) # store ID's and predictions in oine matrix 245 | colnames(results) <- c("id","predictions") # label columns 246 | 247 | # Store results 248 | write.csv(results, "predictions.csv") 249 | 250 | print('Results for the hold-out-sample stored.') 251 | 252 | ######################################################################################### 253 | -------------------------------------------------------------------------------- /PC Lab 3/help files/R_ K-Means Clustering.html: -------------------------------------------------------------------------------- 1 | R: K-Means Clustering 2 | 3 | 4 | 5 | 6 |
kmeans {stats}R Documentation
7 | 8 |

9 | K-Means Clustering 10 |

11 | 12 |

Description

13 | 14 |

Perform k-means clustering on a data matrix. 15 |

16 | 17 | 18 |

Usage

19 | 20 |
 21 | kmeans(x, centers, iter.max = 10, nstart = 1,
 22 |        algorithm = c("Hartigan-Wong", "Lloyd", "Forgy",
 23 |                      "MacQueen"), trace=FALSE)
 24 | ## S3 method for class 'kmeans'
 25 | fitted(object, method = c("centers", "classes"), ...)
 26 | 
27 | 28 | 29 |

Arguments

30 | 31 | 32 | 33 | 38 | 39 | 44 | 45 | 48 | 49 | 53 | 54 | 59 | 60 | 64 | 65 | 71 | 72 | 78 | 79 | 82 |
x 34 |

numeric matrix of data, or an object that can be coerced to 35 | such a matrix (such as a numeric vector or a data frame with all 36 | numeric columns).

37 |
centers 40 |

either the number of clusters, say k, or a set of 41 | initial (distinct) cluster centres. If a number, a random set of 42 | (distinct) rows in x is chosen as the initial centres.

43 |
iter.max 46 |

the maximum number of iterations allowed.

47 |
nstart 50 |

if centers is a number, how many random sets 51 | should be chosen?

52 |
algorithm 55 |

character: may be abbreviated. Note that 56 | "Lloyd" and "Forgy" are alternative names for one 57 | algorithm.

58 |
object 61 |

an R object of class "kmeans", typically the 62 | result ob of ob <- kmeans(..).

63 |
method 66 |

character: may be abbreviated. "centers" causes 67 | fitted to return cluster centers (one for each input point) and 68 | "classes" causes fitted to return a vector of class 69 | assignments.

70 |
trace 73 |

logical or integer number, currently only used in the 74 | default method ("Hartigan-Wong"): if positive (or true), 75 | tracing information on the progress of the algorithm is 76 | produced. Higher values may produce more tracing information.

77 |
... 80 |

not used.

81 |
83 | 84 | 85 |

Details

86 | 87 |

The data given by x are clustered by the k-means method, 88 | which aims to partition the points into k groups such that the 89 | sum of squares from points to the assigned cluster centres is minimized. 90 | At the minimum, all cluster centres are at the mean of their Voronoi 91 | sets (the set of data points which are nearest to the cluster centre). 92 |

93 |

The algorithm of Hartigan and Wong (1979) is used by default. Note 94 | that some authors use k-means to refer to a specific algorithm 95 | rather than the general method: most commonly the algorithm given by 96 | MacQueen (1967) but sometimes that given by Lloyd (1957) and Forgy 97 | (1965). The Hartigan–Wong algorithm generally does a better job than 98 | either of those, but trying several random starts (nstart> 99 | 1) is often recommended. In rare cases, when some of the points 100 | (rows of x) are extremely close, the algorithm may not converge 101 | in the “Quick-Transfer” stage, signalling a warning (and 102 | returning ifault = 4). Slight 103 | rounding of the data may be advisable in that case. 104 |

105 |

For ease of programmatic exploration, k=1 is allowed, notably 106 | returning the center and withinss. 107 |

108 |

Except for the Lloyd–Forgy method, k clusters will always be 109 | returned if a number is specified. 110 | If an initial matrix of centres is supplied, it is possible that 111 | no point will be closest to one or more centres, which is currently 112 | an error for the Hartigan–Wong method. 113 |

114 | 115 | 116 |

Value

117 | 118 |

kmeans returns an object of class "kmeans" which has a 119 | print and a fitted method. It is a list with at least 120 | the following components: 121 |

122 | 123 | 124 | 130 | 131 | 134 | 135 | 138 | 139 | 143 | 144 | 148 | 149 | 153 | 154 | 157 | 158 | 161 | 162 | 166 |
cluster 125 | 126 |

A vector of integers (from 1:k) indicating the cluster to 127 | which each point is allocated. 128 |

129 |
centers 132 |

A matrix of cluster centres.

133 |
totss 136 |

The total sum of squares.

137 |
withinss 140 |

Vector of within-cluster sum of squares, 141 | one component per cluster.

142 |
tot.withinss 145 |

Total within-cluster sum of squares, 146 | i.e. sum(withinss).

147 |
betweenss 150 |

The between-cluster sum of squares, 151 | i.e. totss-tot.withinss.

152 |
size 155 |

The number of points in each cluster.

156 |
iter 159 |

The number of (outer) iterations.

160 |
ifault 163 |

integer: indicator of a possible algorithm problem 164 | – for experts.

165 |
167 | 168 | 169 |

References

170 | 171 |

Forgy, E. W. (1965). 172 | Cluster analysis of multivariate data: efficiency vs interpretability 173 | of classifications. 174 | Biometrics, 21, 768–769. 175 |

176 |

Hartigan, J. A. and Wong, M. A. (1979). 177 | Algorithm AS 136: A K-means clustering algorithm. 178 | Applied Statistics, 28, 100–108. 179 | doi: 10.2307/2346830. 180 |

181 |

Lloyd, S. P. (1957, 1982). 182 | Least squares quantization in PCM. 183 | Technical Note, Bell Laboratories. 184 | Published in 1982 in IEEE Transactions on Information Theory, 185 | 28, 128–137. 186 |

187 |

MacQueen, J. (1967). 188 | Some methods for classification and analysis of multivariate 189 | observations. 190 | In Proceedings of the Fifth Berkeley Symposium on Mathematical 191 | Statistics and Probability, 192 | eds L. M. Le Cam & J. Neyman, 193 | 1, pp. 281–297. 194 | Berkeley, CA: University of California Press. 195 |

196 | 197 | 198 |

Examples

199 | 200 |
201 | require(graphics)
202 | 
203 | # a 2-dimensional example
204 | x <- rbind(matrix(rnorm(100, sd = 0.3), ncol = 2),
205 |            matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2))
206 | colnames(x) <- c("x", "y")
207 | (cl <- kmeans(x, 2))
208 | plot(x, col = cl$cluster)
209 | points(cl$centers, col = 1:2, pch = 8, cex = 2)
210 | 
211 | # sum of squares
212 | ss <- function(x) sum(scale(x, scale = FALSE)^2)
213 | 
214 | ## cluster centers "fitted" to each obs.:
215 | fitted.x <- fitted(cl);  head(fitted.x)
216 | resid.x <- x - fitted(cl)
217 | 
218 | ## Equalities : ----------------------------------
219 | cbind(cl[c("betweenss", "tot.withinss", "totss")], # the same two columns
220 |          c(ss(fitted.x), ss(resid.x),    ss(x)))
221 | stopifnot(all.equal(cl$ totss,        ss(x)),
222 | 	  all.equal(cl$ tot.withinss, ss(resid.x)),
223 | 	  ## these three are the same:
224 | 	  all.equal(cl$ betweenss,    ss(fitted.x)),
225 | 	  all.equal(cl$ betweenss, cl$totss - cl$tot.withinss),
226 | 	  ## and hence also
227 | 	  all.equal(ss(x), ss(fitted.x) + ss(resid.x))
228 | 	  )
229 | 
230 | kmeans(x,1)$withinss # trivial one-cluster, (its W.SS == ss(x))
231 | 
232 | ## random starts do help here with too many clusters
233 | ## (and are often recommended anyway!):
234 | (cl <- kmeans(x, 5, nstart = 25))
235 | plot(x, col = cl$cluster)
236 | points(cl$centers, col = 1:5, pch = 8)
237 | 
238 | 239 |
[Package stats version 4.1.0 Index]
240 | 241 | -------------------------------------------------------------------------------- /PC Lab 3/help files/R_ Principal Components Analysis.html: -------------------------------------------------------------------------------- 1 | R: Principal Components Analysis 2 | 3 | 4 | 5 | 6 |
prcomp {stats}R Documentation
7 | 8 |

Principal Components Analysis

9 | 10 |

Description

11 | 12 |

Performs a principal components analysis on the given data matrix 13 | and returns the results as an object of class prcomp.

14 | 15 | 16 |

Usage

17 | 18 |
 19 | prcomp(x, ...)
 20 | 
 21 | ## S3 method for class 'formula'
 22 | prcomp(formula, data = NULL, subset, na.action, ...)
 23 | 
 24 | ## Default S3 method:
 25 | prcomp(x, retx = TRUE, center = TRUE, scale. = FALSE,
 26 |        tol = NULL, rank. = NULL, ...)
 27 | 
 28 | ## S3 method for class 'prcomp'
 29 | predict(object, newdata, ...)
 30 | 
31 | 32 | 33 |

Arguments

34 | 35 | 36 | 37 | 41 | 42 | 48 | 49 | 53 | 54 | 61 | 62 | 66 | 67 | 71 | 72 | 76 | 77 | 83 | 84 | 92 | 93 | 103 | 104 | 110 | 111 | 114 | 115 | 124 |
formula 38 |

a formula with no response variable, referring only to 39 | numeric variables.

40 |
data 43 |

an optional data frame (or similar: see 44 | model.frame) containing the variables in the 45 | formula formula. By default the variables are taken from 46 | environment(formula).

47 |
subset 50 |

an optional vector used to select rows (observations) of the 51 | data matrix x.

52 |
na.action 55 |

a function which indicates what should happen 56 | when the data contain NAs. The default is set by 57 | the na.action setting of options, and is 58 | na.fail if that is unset. The ‘factory-fresh’ 59 | default is na.omit.

60 |
... 63 |

arguments passed to or from other methods. If x is 64 | a formula one might specify scale. or tol.

65 |
x 68 |

a numeric or complex matrix (or data frame) which provides 69 | the data for the principal components analysis.

70 |
retx 73 |

a logical value indicating whether the rotated variables 74 | should be returned.

75 |
center 78 |

a logical value indicating whether the variables 79 | should be shifted to be zero centered. Alternately, a vector of 80 | length equal the number of columns of x can be supplied. 81 | The value is passed to scale.

82 |
scale. 85 |

a logical value indicating whether the variables should 86 | be scaled to have unit variance before the analysis takes 87 | place. The default is FALSE for consistency with S, but 88 | in general scaling is advisable. Alternatively, a vector of length 89 | equal the number of columns of x can be supplied. The 90 | value is passed to scale.

91 |
tol 94 |

a value indicating the magnitude below which components 95 | should be omitted. (Components are omitted if their 96 | standard deviations are less than or equal to tol times the 97 | standard deviation of the first component.) With the default null 98 | setting, no components are omitted (unless rank. is specified 99 | less than min(dim(x)).). Other settings for tol could be 100 | tol = 0 or tol = sqrt(.Machine$double.eps), which 101 | would omit essentially constant components.

102 |
rank. 105 |

optionally, a number specifying the maximal rank, i.e., 106 | maximal number of principal components to be used. Can be set as 107 | alternative or in addition to tol, useful notably when the 108 | desired rank is considerably smaller than the dimensions of the matrix.

109 |
object 112 |

object of class inheriting from "prcomp"

113 |
newdata 116 |

An optional data frame or matrix in which to look for 117 | variables with which to predict. If omitted, the scores are used. 118 | If the original fit used a formula or a data frame or a matrix with 119 | column names, newdata must contain columns with the same 120 | names. Otherwise it must contain the same number of columns, to be 121 | used in the same order. 122 |

123 |
125 | 126 | 127 |

Details

128 | 129 |

The calculation is done by a singular value decomposition of the 130 | (centered and possibly scaled) data matrix, not by using 131 | eigen on the covariance matrix. This 132 | is generally the preferred method for numerical accuracy. The 133 | print method for these objects prints the results in a nice 134 | format and the plot method produces a scree plot. 135 |

136 |

Unlike princomp, variances are computed with the usual 137 | divisor N - 1. 138 |

139 |

Note that scale = TRUE cannot be used if there are zero or 140 | constant (for center = TRUE) variables. 141 |

142 | 143 | 144 |

Value

145 | 146 |

prcomp returns a list with class "prcomp" 147 | containing the following components: 148 |

149 | 150 | 151 | 157 | 158 | 163 | 164 | 172 | 173 | 176 |
sdev 152 |

the standard deviations of the principal components 153 | (i.e., the square roots of the eigenvalues of the 154 | covariance/correlation matrix, though the calculation 155 | is actually done with the singular values of the data matrix).

156 |
rotation 159 |

the matrix of variable loadings (i.e., a matrix 160 | whose columns contain the eigenvectors). The function 161 | princomp returns this in the element loadings.

162 |
x 165 |

if retx is true the value of the rotated data (the 166 | centred (and scaled if requested) data multiplied by the 167 | rotation matrix) is returned. Hence, cov(x) is the 168 | diagonal matrix diag(sdev^2). For the formula method, 169 | napredict() is applied to handle the treatment of values 170 | omitted by the na.action.

171 |
center, scale 174 |

the centering and scaling used, or FALSE.

175 |
177 | 178 | 179 |

Note

180 | 181 |

The signs of the columns of the rotation matrix are arbitrary, and 182 | so may differ between different programs for PCA, and even between 183 | different builds of R. 184 |

185 | 186 | 187 |

References

188 | 189 |

Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) 190 | The New S Language. 191 | Wadsworth & Brooks/Cole. 192 |

193 |

Mardia, K. V., J. T. Kent, and J. M. Bibby (1979) 194 | Multivariate Analysis, London: Academic Press. 195 |

196 |

Venables, W. N. and B. D. Ripley (2002) 197 | Modern Applied Statistics with S, Springer-Verlag. 198 |

199 | 200 | 201 |

See Also

202 | 203 |

biplot.prcomp, screeplot, 204 | princomp, cor, cov, 205 | svd, eigen. 206 |

207 | 208 | 209 |

Examples

210 | 211 |
212 | C <- chol(S <- toeplitz(.9 ^ (0:31))) # Cov.matrix and its root
213 | all.equal(S, crossprod(C))
214 | set.seed(17)
215 | X <- matrix(rnorm(32000), 1000, 32)
216 | Z <- X %*% C  ## ==>  cov(Z) ~=  C'C = S
217 | all.equal(cov(Z), S, tol = 0.08)
218 | pZ <- prcomp(Z, tol = 0.1)
219 | summary(pZ) # only ~14 PCs (out of 32)
220 | ## or choose only 3 PCs more directly:
221 | pz3 <- prcomp(Z, rank. = 3)
222 | summary(pz3) # same numbers as the first 3 above
223 | stopifnot(ncol(pZ$rotation) == 14, ncol(pz3$rotation) == 3,
224 |           all.equal(pz3$sdev, pZ$sdev, tol = 1e-15)) # exactly equal typically
225 | 
226 | ## signs are random
227 | require(graphics)
228 | ## the variances of the variables in the
229 | ## USArrests data vary by orders of magnitude, so scaling is appropriate
230 | prcomp(USArrests)  # inappropriate
231 | prcomp(USArrests, scale = TRUE)
232 | prcomp(~ Murder + Assault + Rape, data = USArrests, scale = TRUE)
233 | plot(prcomp(USArrests))
234 | summary(prcomp(USArrests, scale = TRUE))
235 | biplot(prcomp(USArrests, scale = TRUE))
236 | 
237 | 
238 | 239 |
[Package stats version 4.0.2 Index]
240 | 241 | -------------------------------------------------------------------------------- /PC Lab 3/rollcall-members.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 3/rollcall-members.Rdata -------------------------------------------------------------------------------- /PC Lab 3/rollcall-votes.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 3/rollcall-votes.Rdata -------------------------------------------------------------------------------- /PC Lab 3/unsupervised_tutorial.r: -------------------------------------------------------------------------------- 1 | ######################## Load Data ######################## 2 | 3 | ### Load data 4 | load("rollcall-votes.Rdata") 5 | load("rollcall-members.Rdata") 6 | 7 | print('Data loaded.') 8 | 9 | ############################################################## 10 | 11 | print('# Counts of Democrats, Republicans and one special politician') 12 | table(members$party) 13 | 14 | print('# Shares of Democrats, Republicans and one special politician') 15 | round(table(members$party)/nrow(members),3) 16 | 17 | # Count missing votings for each politician and plot the counts 18 | missings <- rowSums(votes[,(1:ncol(votes))]==0) 19 | 20 | # No. politicians who always voted 21 | sum(missings == 0) 22 | 23 | # Shares of missing votings 24 | s_missings <- missings/(ncol(votes)-1) 25 | 26 | # Histogram with 100 bins 27 | hist(???, breaks = 100) 28 | 29 | # Counts - yes and nos 30 | yeas <- rowSums(votes[,(1:ncol(votes))]== ???) 31 | nays <- rowSums(votes[,(1:ncol(votes))]== ???) 
32 | 33 | # Plots - Party 34 | plot(yeas, nays, col = members$party) 35 | legend('topleft', legend = levels(members$party), col = 1:3, pch = 1) 36 | 37 | # PCA 38 | pr.out = prcomp(??? , center = TRUE, scale = TRUE) 39 | 40 | # No of principal components 41 | dim(pr.out$rotation)[2] 42 | 43 | # variance explained by each component 44 | pr.var = pr.out$sdev^2 45 | 46 | # Proportion of variance explained 47 | pve=pr.var/sum(pr.var) 48 | 49 | # Print first 10 PC 50 | pve[1:10] 51 | 52 | # Plot the first 10 PC 53 | barplot(pve[1:10], xlab=" Principal Component ", ylab=" Proportion of Variance Explained ", ylim=c(0,1)) 54 | barplot(cumsum(pve[1:10]), xlab=" Principal Component ", ylab ="Cumulative Proportion of Variance Explained ", ylim=c(0,1)) 55 | 56 | # Plot the first two principal components, color the party membership 57 | plot(pr.out$x[,1], pr.out$x[,2], xlab = "PC1", ylab = "PC2", col = members$party, main = "Top two PC directions") 58 | legend('bottomright', legend = levels(members$party), col = 1:3, pch = 1) 59 | 60 | ## Far right (very conservative) 61 | head(sort(???)) 62 | 63 | ## Far left (very liberal) 64 | head(sort(???, decreasing=???)) 65 | 66 | # PC 2 67 | head(sort(???)) 68 | # No clear pattern based on party and state information 69 | 70 | # Look at the largest loadings in PC2 to discern an interpretation. 71 | loadings <- pr.out$rotation 72 | loadings[order(abs(loadings[,2]), decreasing=TRUE)[1:5],2] 73 | 74 | # Analyze voting behavior 75 | table(votes[,1146]) 76 | table(votes[,658]) 77 | table(votes[,1090]) 78 | 79 | # Either everyone voted "yea" or missed the voting. 80 | # These votes all correspond to near-unanimous symbolic action. 81 | 82 | # Mystery Solved: the second PC is just attendance! 
83 | head(sort(rowSums(votes==0), decreasing=TRUE)) 84 | 85 | set.seed(11122019) 86 | 87 | # K-means clustering with 2 clusters 88 | km.out = kmeans(???, 2, nstart = 20) 89 | km.out$cluster 90 | 91 | # Tabulate party vs cluster 92 | table(members$party, km.out$cluster) 93 | 94 | # How to analyze the optimal number of clusters 95 | 96 | sse <- c() 97 | sse[1] <- Inf 98 | 99 | for (ind_cl in c(2:20)) { 100 | set.seed(3) 101 | km.out = kmeans (votes, ind_cl, nstart = 20) 102 | sse[ind_cl] = km.out$tot.withinss 103 | } 104 | 105 | plot(sse) 106 | # Optimum 4-5 clusters 107 | 108 | # Plot the 5 clusters on the PC components graph 109 | set.seed(3) 110 | km.out = kmeans (???, ???, nstart = 20) 111 | 112 | # Plot the first two principal components color the party membership 113 | plot(pr.out$x[,1], pr.out$x[,2], xlab = "PC1", ylab = "PC2", col = km.out$cluster, main = "Top two PC directions with 5 clusters") 114 | legend('bottomright', legend = c("Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4", "Cluster 5"), col = 1:5, pch = 1) 115 | 116 | # Analyzing how the number of starts work 117 | set.seed (3) 118 | print('With nstart = 1') 119 | km.out = kmeans (votes,6, nstart = ???) 120 | km.out$tot.withinss 121 | 122 | print('With nstart = 20') 123 | km.out =kmeans (votes,6, nstart = ???) 
124 | km.out$tot.withinss 125 | -------------------------------------------------------------------------------- /PC Lab 4/help files/glmnet_package.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 4/help files/glmnet_package.pdf -------------------------------------------------------------------------------- /PC Lab 4/help files/hdm_package.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 4/help files/hdm_package.pdf -------------------------------------------------------------------------------- /PC Lab 4/post_double_selection_tutorial.r: -------------------------------------------------------------------------------- 1 | ############################################################################## 2 | ######################## Load Packages and the Data ######################## 3 | ############################################################################## 4 | 5 | ### Load the packages 6 | library(fBasics) # use for descriptive statistics 7 | library(tidyverse) # use for handling data 8 | library(caret) # use for handling data 9 | library(lmtest) # use for heteroscedasticity robust standard errors 10 | library(sandwich) # use for heteroscedasticity robust standard errors 11 | library(hdm) # use for Lasso and Post-Double-Selection 12 | library(glmnet) # use for lasso and Elastic Net regularized Generalized Linear Models 13 | options(warn=-1) # supress warnings 14 | 15 | print('All packages successfully installed and loaded.') 16 | 17 | ### Load the Data 18 | set.seed(12345678) 19 | df <- read.csv("job_corps.csv",header=TRUE, sep=",") # load data from csv-file 20 | df <- df[sample(c(1:nrow(df)), size=3000, replace =F),] # Select a random subsample of 
3000 observations 21 | print('Data successfully loaded.') 22 | 23 | ############################################################################## 24 | 25 | ############################################################################## 26 | ######################## Descriptive Statistics ############################ 27 | ############################################################################## 28 | 29 | ## Table with Descriptive Statistics 30 | desc <- fBasics::basicStats(df) %>% t() %>% as.data.frame() %>% 31 | select(Mean, Stdev, Minimum, Maximum, nobs) 32 | print(round(desc, digits=2)) 33 | 34 | ############################################################################## 35 | 36 | ######################################################################### 37 | ######################## Univariate OLS Regression ##################### 38 | ######################################################################### 39 | 40 | ## Univariate OLS 41 | ols1 <- lm(EARNY4 ~ participation, data = df) 42 | summary(ols1) 43 | 44 | ## Store results 45 | results <- as.matrix(coef(summary(ols1))[2, c("Estimate", "Std. 
Error", "Pr(>|t|)")]) 46 | 47 | # Prepare matrix to store results 48 | res <- matrix(NA,nrow=3,ncol=5) 49 | colnames(res) <- c("Univariate OLS", "Multivariate OLS1", "Multivariate OLS2", 50 | "Multivariate OLS3", "Multivariate OLS4") 51 | rownames(res) <- rownames(results) 52 | res[,1] <- results 53 | 54 | print(round(res[,1], digits=2)) 55 | 56 | ######################################################################## 57 | 58 | ######################################################################## 59 | ######################## Standardized Differences ##################### 60 | ######################################################################## 61 | 62 | ## Means and standard deviations for the participants (D=1) 63 | desc_1 <- fBasics::basicStats(df[df$participation==1,]) %>% t() %>% as.data.frame() %>% select(Mean, Stdev) 64 | 65 | ## Means and standard deviations for the non-participants (D=0) 66 | desc_0 <- fBasics::basicStats(df[df$participation==0,]) %>% t() %>% as.data.frame() %>% select(Mean, Stdev) 67 | 68 | # Make table and add standardized differences 69 | desc <- cbind(desc_1[-c(1:3),],desc_0[-c(1:3),], 70 | 100*abs(desc_1[-c(1:3),1]-desc_0[-c(1:3),1])/sqrt(0.5*(desc_1[-c(1:3),2]^2+desc_0[-c(1:3),2]^2))) 71 | colnames(desc) <- c("D=1 Mean", "D=1 Std.Dev.", "D=0 Mean", "D=0 Std.Dev.", "Std.Diff.") 72 | print(round(desc, digits=2)) 73 | 74 | ######################################################################## 75 | 76 | ######################################################################### 77 | ######################## Multivariate OLS Regression ################### 78 | ######################################################################### 79 | 80 | ## Multivariate OLS 81 | ols2 <- lm(EARNY4 ~ participation + age_1 + age_3 + livespou + publich, data = df) 82 | summary(ols2) 83 | # Question: Why do we omit age_2? 84 | 85 | ## Store results 86 | results <- as.matrix(coef(summary(ols2))[2, c("Estimate", "Std. 
Error", "Pr(>|t|)")]) 87 | res[,2] <- results 88 | print(round(res[,c(1:2)], digits=2)) 89 | 90 | ## Relative change in the estimated effect 91 | print(paste0("Relative change in the estimated effect: ",round(100*(res[1,2]-res[1,1])/res[1,1], digits=1),"%")) 92 | 93 | ######################################################################## 94 | 95 | ######################################################################### 96 | 97 | ## Multivariate OLS 98 | ols3 <- lm(EARNY4 ~ ???, data = df) 99 | summary(ols3) 100 | 101 | ## Store results 102 | results <- as.matrix(coef(summary(ols3))[2, c("Estimate", "Std. Error", "Pr(>|t|)")]) 103 | res[,3] <- results 104 | print(round(res[,c(1:3)], digits=2)) 105 | 106 | ## Relative change in the estimated effect 107 | print(paste0("Relative change in the estimated effect: ",round(100*(res[1,3]-res[1,2])/res[1,2], digits=1),"%")) 108 | 109 | ######################################################################## 110 | 111 | ############################################################################### 112 | 113 | ## Generate first-order interactions between all control variables 114 | interactions <- t(apply(df[,-c(1,2,3,6,11)], 1, combn, 2, prod)) 115 | colnames(interactions) <- paste("Inter.V", combn(1:ncol(df[,-c(1,2,3,6,11)]), 2, paste, collapse="V"), sep="") 116 | print(paste0("Maximm number of interaction terms: ", ncol(interactions))) 117 | 118 | ## Merge basline characteristics with interaction terms 119 | df_merge <- as.data.frame(cbind(df[,-c(1,2,3,6,11)], interactions)) 120 | 121 | ## Eliminate collinear variables 122 | df2 = cor(df_merge) 123 | df2[is.na(df2)] <- 1 124 | hc = findCorrelation(df2, cutoff=0.8) # putt any value as a "cutoff" 125 | hc = sort(hc) 126 | df_int = cbind(df[,c(1,3)],df_merge[,-c(hc)]) 127 | print(paste0("Total number of control variables: ", ncol(df_int)-2)) 128 | 129 | ############################################################################### 130 | 131 | 
############################################################################### 132 | 133 | ## Multivariate OLS with all baseline characteristics and interaction terms 134 | ols4 <- lm(EARNY4 ~ ., data = df_int) 135 | 136 | ## Store results 137 | results <- as.matrix(coef(summary(ols4))[2, c("Estimate", "Std. Error", "Pr(>|t|)")]) 138 | res[,4] <- results 139 | print(round(res[,c(1:4)], digits=2)) 140 | 141 | ## Relative change in the estimated effect 142 | print(paste0("Relative change in the estimated effect: ",round(100*(res[1,4]-res[1,3])/res[1,3], digits=1),"%")) 143 | 144 | ######################################################################## 145 | 146 | ############################################################################### 147 | 148 | # Set starting value for replicability 149 | set.seed(123456) 150 | 151 | # Specify number of random variables 152 | cols <- 1000 153 | 154 | # Generate random variables 155 | redundant_x <- matrix(rnorm(nrow(df_int)*cols), nrow = nrow(df_int)) # We draw from a random standard normal distribution 156 | colnames(redundant_x) <- paste("Rand.", 1:cols, sep="") 157 | 158 | # Merge random variables with baseline characteritics and interaction terms 159 | df_rand <- as.data.frame(cbind(df_int, redundant_x)) 160 | print(paste0("Total number of control variables: ", ncol(df_rand)-2)) 161 | 162 | ############################################################################### 163 | 164 | ############################################################################### 165 | 166 | ## Multivariate OLS with all baseline characteristics, interaction terms, and random variables 167 | ols5 <- lm(EARNY4 ~ ., data = df_rand) 168 | 169 | ## Store results 170 | results <- as.matrix(coef(summary(ols5))[2, c("Estimate", "Std. 
Error", "Pr(>|t|)")]) 171 | res[,5] <- results 172 | print(round(res, digits=2)) 173 | 174 | ## Relative change in the estimated effect 175 | print(paste0("Relative change in the estimated effect: ",round(100*(res[1,5]-res[1,4])/res[1,4], digits=1),"%")) 176 | 177 | ######################################################################## 178 | 179 | ############################################################################### 180 | ########################### Earnings Equation ################################# 181 | ############################################################################### 182 | 183 | # Predict earnings 184 | N <- nrow(df) 185 | st1 <- rlasso(as.matrix(df[,c(4:ncol(df))]), as.matrix(df$EARNY4), 186 | penalty = list(homoscedastic = FALSE, c= 1.1, gamma = 0.1/log(N))) 187 | summary(st1) 188 | 189 | # Store selected variables 190 | n1<- names(st1$coefficients[(st1$coefficients != 0) == TRUE])[-1] 191 | 192 | ############################################################################### 193 | 194 | ############################################################################### 195 | ######################### Participation Probability ########################### 196 | ############################################################################### 197 | 198 | # Predict participation 199 | N <- nrow(df) 200 | st2 <- rlasso(as.matrix(df[,c(4:ncol(df))]), as.matrix(df$participation), 201 | penalty = list(homoscedastic = FALSE, c= 1.1, gamma = 0.1/log(N))) 202 | summary(st2) 203 | 204 | # Store selected variables 205 | n2<- names(st2$coefficients[(st2$coefficients != 0) == TRUE])[-1] 206 | 207 | ############################################################################### 208 | 209 | ############################################################################### 210 | ################################# Post-Lasso ################################## 211 | ############################################################################### 212 | 213 | # 
Take union of selected covariates 214 | selected_covariates <- c("participation", unique(c(n1, n2))) 215 | 216 | # Setup the formula of the linear regression model 217 | sumx <- paste(selected_covariates, collapse = " + ") 218 | linear <- paste("EARNY4",paste(sumx, sep=" + "), sep=" ~ ") 219 | linear <- as.formula(linear) 220 | 221 | # Post-Lasso regression 222 | ols <- lm(linear, data = df) 223 | summary(ols) 224 | 225 | # Heteroskedasticity robust standard errors 226 | #coeftest(ols, vcov = vcovHC(ols, type = "HC1")) 227 | 228 | ############################################################################### 229 | 230 | ############################################################################### 231 | ################## Estimate the Treatment Effect Directly ##################### 232 | ############################################################################### 233 | 234 | # Post-Double-Selection Procedure 235 | dsp <- rlassoEffect(as.matrix(df[,c(4:ncol(df))]), as.matrix(df$EARNY4) 236 | , as.matrix(df$participation), model = TRUE, penalty = list(homoscedastic = FALSE), method = "double selection") 237 | summary(dsp) 238 | 239 | ############################################################################### 240 | # Earning Equation 241 | ############################################################################### 242 | 243 | # Predict earnings 244 | 245 | # Store selected variables 246 | 247 | ############################################################################### 248 | # Participation Probability 249 | ############################################################################### 250 | 251 | # Predict participation 252 | 253 | # Store selected variables 254 | 255 | ############################################################################### 256 | # Post-Lasso Model 257 | ############################################################################### 258 | 259 | # Take union of selected covariates 260 | selected_covariates <- 
c("participation", unique(c(n1, n2))) 261 | 262 | # Setup the formula of the linear regression model 263 | sumx <- paste(selected_covariates, collapse = " + ") 264 | linear <- paste("EARNY4",paste(sumx, sep=" + "), sep=" ~ ") 265 | linear <- as.formula(linear) 266 | 267 | # Post-Lasso OLS regression 268 | ols <- lm(linear, data = df_rand) 269 | summary(ols) 270 | 271 | ############################################################################### 272 | 273 | #################################################################### 274 | ################# Cross-Validated Lasso ############################ 275 | #################################################################### 276 | 277 | set.seed(123456789) # Starting value 278 | 279 | # Cross-validated Lasso in earnings equation 280 | lasso_earn <- cv.glmnet(as.matrix(df_int[,c(3:ncol(df_int))]), as.matrix(df$EARNY4), 281 | alpha=1, nfolds = 10, type.measure = 'mse', standardize = TRUE) 282 | # alpha =1 is Lasso, alpha = 0 is Ridgde 283 | # nfolds - number of cross-validation folds 284 | # type.measure - measure for model accuracy 285 | 286 | plot(lasso_earn) 287 | 288 | #################################################################### 289 | 290 | #################################################################### 291 | 292 | # Plot Lasso coefficients 293 | coef(lasso_earn,s = lasso_earn$lambda.1se) 294 | # $lambda.min - Lambda that minimizes cross-validated MSE 295 | # $lambda.1se - Lambda of 1 standard error rule 296 | 297 | #################################################################### 298 | 299 | #################################################################### 300 | 301 | # Select covariates with non-zero coefficients 302 | coef <- predict(lasso_earn,s = lasso_earn$lambda.min, type = "nonzero") # 303 | colnames <- colnames(df_int[,c(3:ncol(df_int))]) 304 | n1 <- colnames[unlist(coef)] 305 | print(paste0("Number of Selected Variables Earnings Equation: ",length(n1))) 306 | print("Selected 
Variables:") 307 | print(n1) 308 | 309 | #################################################################### 310 | 311 | #################################################################### 312 | 313 | set.seed(123456789) # Starting value 314 | 315 | # Cross-validated Lasso in participation equation 316 | lasso_part <- cv.glmnet(???, 317 | alpha=1, nfolds = 10, type.measure = 'mse', standardize = TRUE) 318 | plot(lasso_part) 319 | 320 | #################################################################### 321 | 322 | #################################################################### 323 | 324 | # Select covariates with non-zero coefficients 325 | coef <- predict(???,s = ???, type = "nonzero") # 326 | colnames <- colnames(df_int[,c(3:ncol(df_int))]) 327 | print(paste0("Number of Selected Variables Participation Equation: ",length(n2))) 328 | print("Selected Variables:") 329 | print(n2) 330 | 331 | #################################################################### 332 | 333 | ############################################################################### 334 | # Post-Lasso Model 335 | ############################################################################### 336 | 337 | # Take union of selected covariates 338 | selected_covariates <- c(???) 
339 | 340 | # Setup the formula of the linear regression model 341 | sumx <- paste(selected_covariates, collapse = " + ") 342 | linear <- paste("EARNY4",paste(sumx, sep=" + "), sep=" ~ ") 343 | linear <- as.formula(linear) 344 | 345 | # Post-Lasso OLS regression 346 | ols <- lm(linear, data = df_int) 347 | summary(ols) 348 | 349 | ############################################################################### 350 | -------------------------------------------------------------------------------- /PC Lab 5/double_machine_learning_tutorial.r: -------------------------------------------------------------------------------- 1 | ############################################################################## 2 | ######################## Load Packages and the Data ######################## 3 | ############################################################################## 4 | 5 | ### Load the packages 6 | library(fBasics) # use for descriptive statistics 7 | library(tidyverse) # use for handling data 8 | library(DiagrammeR) # use for plotting trees 9 | library(lmtest) # use for heteroscedasticity robust standard errors 10 | library(sandwich) # use for heteroscedasticity robust standard errors 11 | library(grf) # use for generalized random forest 12 | library(glmnet) # use for lasso and Elastic Net regularized Generalized Linear Models 13 | options(warn=-1) # supress warnings 14 | 15 | print('All packages successfully installed and loaded.') 16 | 17 | ### Load the Data 18 | set.seed(12345678) 19 | df <- read.csv("job_corps.csv",header=TRUE, sep=",") # load data from csv-file 20 | df <- df[sample(c(1:nrow(df)), size=3000, replace =F),] # Select a random subsample of 3000 observations 21 | print('Data successfully loaded.') 22 | 23 | ############################################################################## 24 | 25 | ############################################################################## 26 | ######################## Descriptive Statistics ############################ 
27 | ############################################################################## 28 | 29 | ## Table with Descriptive Statistics 30 | desc <- fBasics::basicStats(df) %>% t() %>% as.data.frame() %>% 31 | select(Mean, Stdev, Minimum, Maximum, nobs) 32 | print(round(desc, digits=2)) 33 | 34 | ############################################################################## 35 | 36 | ############################################################################### 37 | ######################### Sample Splitting #################################### 38 | ############################################################################### 39 | 40 | # Set starting value 41 | set.seed(123456789) 42 | 43 | # Partition Samples for Cross-Fitting 44 | df_part <- modelr::resample_partition(df, c(obs_A = 0.5, obs_B = 0.5)) # Split sample in strata of equal size 45 | df_obs_A <- as.data.frame(df_part$obs_A) # Sample A 46 | df_obs_B <- as.data.frame(df_part$obs_B) # Sample B 47 | 48 | ## Generate Variables 49 | # Outcome variable 50 | earnings_obs_A <- as.matrix(df_obs_A[,1]) 51 | earnings_obs_B <- as.matrix(df_obs_B[,1]) 52 | 53 | # Treatment variable 54 | treat = 3 #Select treatment 2= offer to participate, 3 = actual participation 55 | treat_obs_A <- as.matrix(df_obs_A[,treat]) 56 | treat_obs_B <- as.matrix(df_obs_B[,treat]) 57 | 58 | # Covariates 59 | covariates_obs_A <- as.matrix(df_obs_A[,c(4:ncol(df_obs_A))]) 60 | covariates_obs_B <- as.matrix(df_obs_B[,c(4:ncol(df_obs_B))]) 61 | 62 | print('Sample partitioning ready.') 63 | 64 | ############################################################################## 65 | 66 | ############################################################################### 67 | ########### Conditional Potential Earnings under Non-Participation ############ 68 | ############################################################################### 69 | 70 | p = 1 # 1 for LASSO, 0 for Ridge 71 | 72 | # Set starting value 73 | set.seed(123456789) 74 | 75 | # Estimate 
Lasso among non-participants in Sample A 76 | # Use cross-validation to select optimal lambda value 77 | lasso_y0_A <- cv.glmnet(covariates_obs_A[treat_obs_A==0,], earnings_obs_A[treat_obs_A==0,], 78 | alpha=p, type.measure = 'mse') 79 | # Plot the cross-validated MSE 80 | plot(lasso_y0_A) 81 | 82 | # Extrapolate the fitted values to Sample B 83 | y0hat_B <- predict(lasso_y0_A, newx = covariates_obs_B, type = 'response', s = lasso_y0_A$lambda.min) 84 | 85 | # Estimate Lasso among non-participants in Sample B 86 | lasso_y0_B <- cv.glmnet(covariates_obs_B[treat_obs_B==0,], earnings_obs_B[treat_obs_B==0,], 87 | alpha=p, type.measure = 'mse') 88 | # Plot the cross-validated MSE 89 | plot(lasso_y0_B) 90 | 91 | # Extrapolate the fitted values to Sample A 92 | y0hat_A <- predict(lasso_y0_B, newx = covariates_obs_A, type = 'response', s= lasso_y0_B$lambda.min) 93 | 94 | # Merge fitted values of both samples 95 | y0hat <- rbind(y0hat_A,y0hat_B) 96 | 97 | ################################################################################# 98 | 99 | ############################################################################### 100 | ########### Conditional Potential Earnings under Participation ############ 101 | ############################################################################### 102 | 103 | p = 1 # 1 for LASSO, 0 for Ridge 104 | 105 | # Set starting value 106 | set.seed(123456789) 107 | 108 | # Estimate Lasso among participants in Sample A 109 | # Use cross-validation to select optimal lambda value 110 | lasso_y1_A <- cv.glmnet(covariates_obs_A[treat_obs_A==1,], earnings_obs_A[treat_obs_A==1,], 111 | alpha=p, type.measure = 'mse') 112 | plot(lasso_y1_A) 113 | 114 | # Extrapolate the fitted values to Sample B 115 | y1hat_B <- predict(lasso_y1_A, newx = covariates_obs_B, type = 'response', s = lasso_y1_A$lambda.min) 116 | 117 | # Estimate Lasso among participants in Sample B 118 | lasso_y1_B <- cv.glmnet(covariates_obs_B[treat_obs_B==1,], 
earnings_obs_B[treat_obs_B==1,], 119 | alpha=p, type.measure = 'mse') 120 | plot(lasso_y1_B) 121 | 122 | # Extrapolate the fitted values to Sample A 123 | y1hat_A <- predict(lasso_y1_B, newx = covariates_obs_A, type = 'response', s= lasso_y1_B$lambda.min) 124 | 125 | # Merge the fitted values of both samples 126 | y1hat <- rbind(y1hat_A,y1hat_B) 127 | 128 | ################################################################################# 129 | 130 | ############################################################################### 131 | ########################### Propensity Score ################################## 132 | ############################################################################### 133 | 134 | # Propensity Score 135 | p = 1 # 1 for LASSO, 0 for Ridge 136 | 137 | # Set starting value 138 | set.seed(123456789) 139 | 140 | # Estimate Logit-Lasso in Sample A 141 | # Use cross-validation to select optimal lambda value 142 | lasso_p_A <- cv.glmnet(covariates_obs_A, treat_obs_A, alpha=p, type.measure = 'mse', family="binomial") 143 | plot(lasso_p_A) 144 | 145 | # Extrapolate the fitted values to Sample B 146 | pscore_B <- predict(lasso_p_A, newx = covariates_obs_B, type = 'response', s= lasso_p_A$lambda.min) 147 | 148 | # Estimate Logit-Lasso in Sample B 149 | lasso_p_B <- cv.glmnet(covariates_obs_B, treat_obs_B, alpha=p, type.measure = 'mse', family="binomial") 150 | plot(lasso_p_B) 151 | 152 | # Extrapolate the fitted values to Sample A 153 | pscore_A <- predict(lasso_p_B, newx = covariates_obs_A, type = 'response', s= lasso_p_B$lambda.min) 154 | 155 | # Merge the fitted values of both samples 156 | pscore <- rbind(pscore_A,pscore_B) 157 | 158 | ############################################################################### 159 | 160 | ############################################################################### 161 | ################################### ATE Score ################################# 162 | 
############################################################################### 163 | 164 | # Merge earnings outcome of Sample A and B 165 | earnings_obs <- rbind(earnings_obs_A,earnings_obs_B) 166 | 167 | # Merge treatmente of Sample A and B 168 | treat_obs <- rbind(treat_obs_A,treat_obs_B) 169 | 170 | # Calculate the ATE score using the formula described above 171 | Y_ate_star = invisible(???) 172 | 173 | # Calculate ATE 174 | # It is the sample average of the ATE score 175 | ate <- round(mean(Y_ate_star), digits = 2) 176 | 177 | # Calculate the standard errors of the ATE 178 | # Square root of the quotient of variance of the ATE score and the sample size 179 | se_ate <- round(sqrt(var(Y_ate_star)/length(Y_ate_star)), digits = 2) 180 | 181 | 182 | print(paste0("Average Treatment Effect (ATE): ", ate)) 183 | print(paste0("Standard Error for ATE: ", se_ate)) 184 | 185 | ############################################################################### 186 | 187 | ############################################################################### 188 | ################################## ATET Score ################################# 189 | ############################################################################### 190 | 191 | ## Unconditional Treatment probability 192 | p = mean(pscore) 193 | 194 | # Calculate the ATET score using the formula described above 195 | Y_atet_star = invisible(???) 
196 | 197 | # Calculate ATET 198 | # It is the sample average of the ATET score 199 | atet <- round(mean(Y_atet_star), digits = 2) 200 | 201 | # Calculate the standard errors of the ATET 202 | # Square root of the quotient of variance of the ATET score and the sample size 203 | se_atet <- round(sqrt(var(Y_atet_star)/length(Y_atet_star)), digits = 2) 204 | 205 | print(paste0("Average Treatment Effect for Treated (ATET): ", atet)) 206 | print(paste0("Standard Error for ATET: ", se_atet)) 207 | 208 | ############################################################################### 209 | 210 | ############################################################################### 211 | ##################################### CATEs ################################### 212 | ############################################################################### 213 | 214 | # Merge covariates of Sample A and B 215 | covariates_obs <- rbind(covariates_obs_A,covariates_obs_B) 216 | 217 | # Generate a new data frame 218 | # Merge the ATE score and the covariates 219 | colnames(Y_ate_star) <- "y_star" 220 | Y_star <- as.data.frame(cbind(Y_ate_star,covariates_obs[,-c(3,8)])) 221 | 222 | # Estimate an OLS regression 223 | # Regress the ATE score on the covariates 224 | cates <- lm(y_star ~., Y_star) 225 | 226 | # Heteroskedasticity robust standard errors 227 | coeftest(cates, vcov = vcovHC(cates, type = "HC1")) 228 | 229 | ############################################################################### 230 | 231 | ############################################################################### 232 | 233 | # Calculate the predicted effect size for each observation 234 | fit <- predict(cates) 235 | 236 | # Count the observations with positive and negative effects 237 | print(paste0("Number of individuals with positive effects: ", length(fit[fit>=0]))) 238 | print(paste0("Number of individuals with negative effects: ", length(fit[fit<0]))) 239 | 240 | 
############################################################################### 241 | 242 | ############################################################################### 243 | ################ Plot Cumulative Distribution of CATEs ######################## 244 | ############################################################################### 245 | 246 | plot(ecdf(fit), col="blue", xlim = c(-100,150), xlab="Effect Size (in Dollars)", 247 | ylab="Cumulative Distribution", main="Cumulative Distibution of the CATEs") 248 | abline(v=0, col="red") 249 | 250 | ############################################################################### 251 | 252 | ############################################################################### 253 | ######################## Description of CATEs ################################# 254 | ############################################################################### 255 | 256 | ## Means and standard deviations for individuals with positive effects 257 | desc_1 <- fBasics::basicStats(Y_star[fit >= 0,-1]) %>% t() %>% as.data.frame() %>% select(Mean, Stdev) 258 | 259 | ## Means and standard deviations for individuals with negative effects 260 | desc_0 <- fBasics::basicStats(Y_star[fit < 0,-1]) %>% t() %>% as.data.frame() %>% select(Mean, Stdev) 261 | 262 | # Make table and add standardized differences 263 | desc <- cbind(desc_1,desc_0, 264 | 100*abs(desc_1[,1]-desc_0[,1])/sqrt(0.5*(desc_1[,2]^2+desc_0[,2]^2))) 265 | colnames(desc) <- c("Mean (Pos.)", "Std.Dev. (Pos.)", "Mean (Neg.)", "Std.Dev. 
(Neg.)", "Std.Diff.") 266 | print(round(desc, digits=2)) 267 | 268 | ############################################################################### 269 | 270 | ############################################################################### 271 | ########### Conditional Potential Earnings under Non-Participation ############ 272 | ############################################################################### 273 | 274 | # Set starting value 275 | set.seed(123456789) 276 | 277 | # Tuning parameters for forest 278 | trees = 1000 # number of trees in the forest 279 | frac = 0.5 # share of subsample used for each tree 280 | cov = floor(1/2*ncol(covariates_obs)) # number of covariates used for each tree 281 | min = 10 # minimum sample size in the terminal leaves of the trees 282 | 283 | # Estimate Random Forest among non-participants in Sample A 284 | forest_y0_A <- regression_forest(covariates_obs_A[treat_obs_A==0,], earnings_obs_A[treat_obs_A==0,], 285 | num.trees = trees, sample.fraction = frac, mtry = cov, min.node.size = min) 286 | 287 | # Extrapolate the fitted values to Sample B 288 | y0hat_B <- as.matrix(predict(forest_y0_A, newdata = covariates_obs_B)$predictions) 289 | 290 | print("Random Forest for Sample A estimated.") 291 | 292 | ################################################################################# 293 | 294 | ################################################################################# 295 | 296 | # Plot one tree from the random forest 297 | plot(tree <- get_tree(forest_y0_A, 1)) 298 | # the last number is the tree number 299 | # it can be varied from 1 to 1000 300 | 301 | ################################################################################# 302 | 303 | ################################################################################# 304 | 305 | # Count the splitting frequencies for each covariate 306 | split <- split_frequencies(forest_y0_A, max.depth = 4) 307 | # max.depth specifies the maximum tree depth we consider 308 | 
309 | # Label the results 310 | colnames(split) <- colnames(covariates_obs) 311 | rownames(split) <- c("Depth 1", "Depth 2", "Depth 3", "Depth 4") 312 | 313 | print(t(split)) 314 | 315 | ################################################################################# 316 | 317 | ################################################################################# 318 | 319 | # Estimate Random Forest among non-participants in Sample B 320 | forest_y0_B <- regression_forest(???) 321 | 322 | # Extrapolate the fitted values to Sample A 323 | y0hat_A <- as.matrix(predict(forest_y0_B, newdata = covariates_obs_A)$predictions) 324 | 325 | # Merge fitted values of both samples 326 | y0hat <- rbind(y0hat_A,y0hat_B) 327 | 328 | print("Random Forest for Sample B estimated.") 329 | 330 | ################################################################################# 331 | 332 | ############################################################################### 333 | ########################### Propensity Score ################################## 334 | ############################################################################### 335 | 336 | # Set starting value 337 | set.seed(123456789) 338 | 339 | # Tuning parameters for forest 340 | trees = 1000 341 | frac = 0.5 342 | cov = floor(1/2*ncol(covariates_obs)) 343 | min = 10 344 | 345 | # Estimate Random Forest in Sample A 346 | forest_p_A <- regression_forest(covariates_obs_A, treat_obs_A, 347 | num.trees = trees, sample.fraction = frac, mtry = cov, min.node.size = min) 348 | 349 | # Extrapolate the fitted values to Sample B 350 | pscore_B <- as.matrix(predict(forest_p_A, newdata = covariates_obs_B)$predictions) 351 | 352 | ############## 353 | 354 | # Estimate Random Forest in Sample B 355 | forest_p_B <- regression_forest(covariates_obs_B, treat_obs_B, 356 | num.trees = trees, sample.fraction = frac, mtry = cov, min.node.size = min) 357 | 358 | # Extrapolate the fitted values to Sample A 359 | pscore_A <- 
as.matrix(predict(forest_p_B, newdata = covariates_obs_A)$predictions) 360 | 361 | # Merge the fitted values of both samples 362 | pscore <- rbind(pscore_A,pscore_B) 363 | 364 | print("Propensity score is estimated.") 365 | 366 | ############################################################################### 367 | 368 | ############################################################################### 369 | ################################## ATET Score ################################# 370 | ############################################################################### 371 | 372 | ## Unconditional Treatment probability 373 | p = mean(pscore) 374 | 375 | # Calculate the ATET score using the formula described above 376 | Y_atet_star = invisible(treat_obs*(earnings_obs - y0hat)/p 377 | - (1-treat_obs)*pscore*(earnings_obs - y0hat)/(p*(1-pscore))) 378 | 379 | # Calculate ATET 380 | # It is the sample average of the ATET score 381 | atet <- round(mean(Y_atet_star), digits = 2) 382 | 383 | # Calculate the standard errors of the ATET 384 | # Square root of the quotient of variance of the ATET score and the sample size 385 | se_atet <- round(sqrt(var(Y_atet_star)/length(Y_atet_star)), digits = 2) 386 | 387 | print(paste0("Average Treatment Effect for Treated (ATET): ", atet)) 388 | print(paste0("Standard Error for ATET: ", se_atet)) 389 | 390 | ############################################################################### 391 | -------------------------------------------------------------------------------- /PC Lab 5/help files/glmnet_package.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 5/help files/glmnet_package.pdf -------------------------------------------------------------------------------- /PC Lab 5/help files/grf_package.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 5/help files/grf_package.pdf
--------------------------------------------------------------------------------
/PC Lab 6/causal_forest.r:
--------------------------------------------------------------------------------
######################## Load Packages ########################

# List of required packages
pkgs <- c('fBasics', 'corrplot', 'tidyverse', 'grf', 'plotmo')

# Load packages
for(pkg in pkgs){
    library(pkg, character.only = TRUE)
}
options(warn=-1) # suppress warnings

print('All packages successfully installed and loaded.')

##################################################################

######################## Load Data Frame ########################

# Load data frame
df <- read.csv("fundraising.csv",header=TRUE, sep=",")

# Outcome Variable
outcome <- c("char_giving")

# Treatment Variables
treatment <- c("treat")

# Covariates/Features
covariates <- c("amount_pre", "amount_lastpre", "amount_maxpre", "H_number_yearbefore", "H_ngifts",
                "H_littleask", "H_bigask", "H_nyears", "H_frequency", "H_medinc", "H_medinc_mdum",
                "H_Avg_years_ed", "H_Avg_years_ed_mdum")


all_variables <- c(outcome, treatment, covariates)

print('Data frame successfully loaded and sample selected.')

####################################################################

######################## Table with Descriptive Statistics ########################

desc <- fBasics::basicStats(df) %>% t() %>% as.data.frame() %>%
  select(Mean, Stdev, Minimum, Maximum, nobs)
print(round(desc, digits=2))

#####################################################################################

######################## Correlation Matrix ########################

# Columns 1-2 are outcome and treatment; correlate the covariates only
corr = cor(df[,-c(1:2)])
corrplot(corr, type = "upper", tl.col = "black")

######################################################################

######################## Partition the Samples ########################
set.seed(100239) # set starting value for random number generator

# Partition Hold-Out-Sample
df_part <- modelr::resample_partition(df, c(obs = 0.8, hold_out = 0.2))
df_obs <- as.data.frame(df_part$obs) # Training and estimation sample
df_hold_out <- as.data.frame(df_part$hold_out) # Hold-out-sample

print('Samples are partitioned.')

######################## Generate Variables ########################

# Outcome
giving_hold_out <- as.matrix(df_hold_out[,1])
giving_obs <- as.matrix(df_obs[,1])

# Treatment
treat_hold_out <- as.matrix(df_hold_out[,2])
treat_obs <- as.matrix(df_obs[,2])

# Covariates
covariates_hold_out <- as.matrix(df_hold_out[,c(3:ncol(df_hold_out))])
covariates_obs <- as.matrix(df_obs[,c(3:ncol(df_obs))])

print('The data is now ready for your analysis!')

#######################################################################

######################## Causal Forest ########################
set.seed(100244)

# Tuning parameters
min_tree = 100 # Minimum size of terminal leaves
num_trees = 1000 # Number of trees in forest
cov_frac = 1/2 # Fraction of covariates in each tree
sample_part= 0.5 # Fraction of sample used for each tree (subsampling)

# Causal Forest
cates <- causal_forest(covariates_obs, giving_obs, treat_obs,
                       sample.fraction = sample_part, mtry = floor(cov_frac*ncol(covariates_obs)),
                       num.trees = num_trees, min.node.size = min_tree,
                       honesty = TRUE, honesty.fraction = 0.5)

print('Forest is ready!')

###################################################################

#################################################################################

# Plot one tree from the random forest
plot(tree <- get_tree(cates, 1))
# the last number is the tree number
# it can be varied from 1 to 1000

#################################################################################

#################################################################################

# Count the splitting frequencies for each covariate
split <- split_frequencies(cates, max.depth = 4)
# max.depth specifies the maximum tree depth we consider

# Label the results
colnames(split) <- colnames(covariates_obs)
rownames(split) <- c("Depth 1", "Depth 2", "Depth 3", "Depth 4")

print(t(split))

#################################################################################

######################### ATE ###############################

average_treatment_effect(cates, target.sample = c("all"))

#############################################################

###############################################################################

# Calculate the predicted effect size for each observation
fit <- predict(cates, covariates_hold_out, estimate.variance = FALSE)$predictions

# Count the observations with positive and negative effects
print(paste0("Number of individuals with positive effects: ", length(fit[fit>=0])))
print(paste0("Number of individuals with negative effects: ", length(fit[fit<0])))

print(paste0("Share of individuals with positive effects: ", round(100*length(fit[fit>=0])/length(fit),digits=1), "%"))

###############################################################################

###############################################################################
################ Plot Cumulative Distribution of CATEs ########################
###############################################################################

plot(ecdf(fit), col="blue", xlim = c(-25,25), xlab="Effect Size (in Dollars)",
     ylab="Cumulative Distribution", main="Cumulative Distribution of the CATEs")
abline(v=0, col="red")

###############################################################################

###############################################################################
######################## Description of CATEs #################################
###############################################################################

## Means and standard deviations for individuals with positive effects
desc_1 <- fBasics::basicStats(covariates_hold_out[fit >= 0,]) %>% t() %>% as.data.frame() %>% select(Mean, Stdev)

## Means and standard deviations for individuals with negative effects
desc_0 <- fBasics::basicStats(covariates_hold_out[fit < 0,]) %>% t() %>% as.data.frame() %>% select(Mean, Stdev)

# Make table and add standardized differences
desc <- cbind(desc_1,desc_0,
              100*abs(desc_1[,1]-desc_0[,1])/sqrt(0.5*(desc_1[,2]^2+desc_0[,2]^2)))
colnames(desc) <- c("Mean (Pos.)", "Std.Dev. (Pos.)", "Mean (Neg.)", "Std.Dev. (Neg.)", "Std.Diff.")
print(round(desc, digits=2))

###############################################################################


--------------------------------------------------------------------------------
/PC Lab 6/help files/grf_package.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 6/help files/grf_package.pdf
--------------------------------------------------------------------------------
/PC Lab 7/help files/grf_package.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 7/help files/grf_package.pdf
--------------------------------------------------------------------------------
/PC Lab 7/help files/rpart_package.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 7/help files/rpart_package.pdf
--------------------------------------------------------------------------------
/PC Lab 7/optimal_policy_learning.r:
--------------------------------------------------------------------------------
######################## Load Packages ########################

# List of required packages
pkgs <- c('fBasics', 'corrplot', 'rpart', 'rpart.plot', 'tidyverse', 'grf', 'caret')

# Load packages
for(pkg in pkgs){
    library(pkg, character.only = TRUE)
}
options(warn=-1) # suppress warnings

print('All packages successfully installed and loaded.')

###################################################################

######################## Load Data Frame ########################

# Load data frame
df
<- read.csv("fundraising.csv",header=TRUE, sep=",")

# Outcome Variable
outcome <- c("char_giving")

# Treatment Variables
treatment <- c("treat")

# Covariates/Features
covariates <- c("amount_pre", "amount_lastpre", "amount_maxpre", "H_number_yearbefore", "H_ngifts",
                "H_littleask", "H_bigask", "H_nyears", "H_frequency", "H_medinc", "H_medinc_mdum",
                "H_Avg_years_ed", "H_Avg_years_ed_mdum")

all_variables <- c(outcome, treatment, covariates)

print('Data frame successfully loaded and sample selected.')

######################################################################

######################## Table with Descriptive Statistics ########################

desc <- fBasics::basicStats(df) %>% t() %>% as.data.frame() %>%
  select(Mean, Stdev, Minimum, Maximum, nobs)
print(round(desc, digits=2))

#####################################################################################

######################## Correlation Matrix ########################

corr = cor(df[,-c(1:2)])
corrplot(corr, type = "upper", tl.col = "black")

######################################################################

######################## Partition the Samples ########################
set.seed(100233) # set starting value for random number generator

# Partition Hold-Out-Sample
df_part <- modelr::resample_partition(df, c(obs = 0.8, hold_out = 0.2))
df_obs <- as.data.frame(df_part$obs) # Training and estimation sample
df_hold_out <- as.data.frame(df_part$hold_out) # Hold-out-sample

# Partition Samples for Cross-Fitting
df_part <- modelr::resample_partition(df_obs, c(obs_A = 0.5, obs_B = 0.5))
df_obs_A <- as.data.frame(df_part$obs_A) # Sample A
df_obs_B <- as.data.frame(df_part$obs_B) # Sample B

print('Samples are partitioned.')

######################## Generate Variables ########################

# Outcome
giving_hold_out <- as.matrix(df_hold_out[,1])
giving_obs <- as.matrix(df_obs[,1])
giving_obs_A <- as.matrix(df_obs_A[,1])
giving_obs_B <- as.matrix(df_obs_B[,1])

# Treatment
treat_hold_out <- as.matrix(df_hold_out[,2])
treat_obs <- as.matrix(df_obs[,2])
treat_obs_A <- as.matrix(df_obs_A[,2])
treat_obs_B <- as.matrix(df_obs_B[,2])

# Covariates
covariates_hold_out <- as.matrix(df_hold_out[,c(3:ncol(df_hold_out))])
covariates_obs <- as.matrix(df_obs[,c(3:ncol(df_obs))])
covariates_obs_A <- as.matrix(df_obs_A[,c(3:ncol(df_obs_A))])
covariates_obs_B <- as.matrix(df_obs_B[,c(3:ncol(df_obs_B))])

######################## Standardise Covariates ########################

# Scaling parameters are estimated on the full estimation sample and then
# applied to all subsamples so they share one common standardisation
preProcValues <- preProcess(covariates_obs, method = c("center", "scale"))
covariates_hold_out <- predict(preProcValues, covariates_hold_out)
covariates_obs <- predict(preProcValues, covariates_obs)
covariates_obs_A <- predict(preProcValues, covariates_obs_A)
covariates_obs_B <- predict(preProcValues, covariates_obs_B)

df_obs <- as.data.frame(cbind(giving_obs,treat_obs,covariates_obs))
df_obs_A <- as.data.frame(cbind(giving_obs_A,treat_obs_A,covariates_obs_A))
df_obs_B <- as.data.frame(cbind(giving_obs_B,treat_obs_B,covariates_obs_B))
covariates_hold_out <- as.data.frame(covariates_hold_out)

print('Covariates are standardised.')
print('The data is now ready for your analysis!')

###########################################################################

######################## Potential Outcomes ########################
set.seed(100243)

# Tuning parameters
min_tree = 20
# Number of trees is set to a very low value in order to increase the computational speed in this tutorial
num_trees = 100 # Use at least 1,000 trees
cov_frac = 1/3
sample_part= 0.5

# Build generalised random forest

# Use Sample A to predict Sample B
# Potential outcome under treatment
f_y1_A <- regression_forest(covariates_obs_A[treat_obs_A == 1,], giving_obs_A[treat_obs_A == 1, ],
                            sample.fraction = sample_part, mtry = floor(cov_frac*ncol(covariates_obs)),
                            num.trees = num_trees, min.node.size = min_tree,
                            honesty = TRUE, honesty.fraction = 0.5)
y1hat_B <- as.matrix(predict(f_y1_A, covariates_obs_B)$predictions)
y1hat_B_hold_out <- as.matrix(predict(f_y1_A, covariates_hold_out)$predictions)

# Potential outcome under non-treatment
f_y0_A <- regression_forest(covariates_obs_A[treat_obs_A == 0,], giving_obs_A[treat_obs_A == 0, ],
                            sample.fraction = sample_part, mtry = floor(cov_frac*ncol(covariates_obs)),
                            num.trees = num_trees, min.node.size = min_tree,
                            honesty = TRUE, honesty.fraction = 0.5)
y0hat_B <- as.matrix(predict(f_y0_A, covariates_obs_B)$predictions)
y0hat_B_hold_out <- as.matrix(predict(f_y0_A, covariates_hold_out)$predictions)

###########################################################################

# Use Sample B to predict Sample A
# Potential outcome under treatment
f_y1_B <- regression_forest(covariates_obs_B[treat_obs_B == 1,], giving_obs_B[treat_obs_B == 1, ],
                            sample.fraction = sample_part, mtry = floor(cov_frac*ncol(covariates_obs)),
                            num.trees = num_trees, min.node.size = min_tree,
                            honesty = TRUE, honesty.fraction = 0.5)
y1hat_A <- as.matrix(predict(f_y1_B, covariates_obs_A)$predictions)
y1hat_A_hold_out <- as.matrix(predict(f_y1_B, covariates_hold_out)$predictions)

# Potential outcome under non-treatment
f_y0_B <- regression_forest(covariates_obs_B[treat_obs_B == 0,], giving_obs_B[treat_obs_B == 0, ],
                            sample.fraction = sample_part, mtry = floor(cov_frac*ncol(covariates_obs)),
                            num.trees = num_trees, min.node.size = min_tree,
                            honesty = TRUE, honesty.fraction = 0.5)
y0hat_A <- as.matrix(predict(f_y0_B, covariates_obs_A)$predictions)
y0hat_A_hold_out <- as.matrix(predict(f_y0_B, covariates_hold_out)$predictions)

###########################################################################

# Merge the fitted values from samples A and B
y1hat <- rbind(y1hat_A,y1hat_B)
y0hat <- rbind(y0hat_A,y0hat_B)

y1hat_hold_out <- (y1hat_A_hold_out+y1hat_B_hold_out)/2
y0hat_hold_out <- (y0hat_A_hold_out+y0hat_B_hold_out)/2

print("Potential outcomes are estimated")

###########################################################################

######################## Propensity Score ########################
set.seed(100242)

# Tuning parameters
min_tree = 20
num_trees = 100 # Use at least 1,000 trees
cov_frac = 1/3
sample_part= 0.5

# Use Sample A to predict Sample B
f_p_A <- regression_forest(covariates_obs_A, treat_obs_A,
                           sample.fraction = sample_part, mtry = floor(cov_frac*ncol(covariates_obs)),
                           num.trees = num_trees, min.node.size = min_tree,
                           honesty = TRUE, honesty.fraction = 0.5)
pscore_B <- as.matrix(predict(f_p_A, covariates_obs_B)$predictions)
pscore_B_hold_out <- as.matrix(predict(f_p_A, covariates_hold_out)$predictions)

# Use Sample B to predict Sample A
f_p_B <- regression_forest(covariates_obs_B, treat_obs_B,
                           sample.fraction = sample_part, mtry = floor(cov_frac*ncol(covariates_obs)),
                           num.trees = num_trees, min.node.size = min_tree,
                           honesty = TRUE, honesty.fraction = 0.5)
pscore_A <- as.matrix(predict(f_p_B, covariates_obs_A)$predictions)
pscore_A_hold_out <- as.matrix(predict(f_p_B, covariates_hold_out)$predictions)

pscore <- rbind(pscore_A,pscore_B)
pscore_hold_out <- (pscore_A_hold_out+pscore_B_hold_out)/2

print("Propensity scores are estimated")

###########################################################################

######################## Average Treatment Effects (ATE) ########################

# Merge samples A and B
giving_obs <- rbind(giving_obs_A,giving_obs_B)
treat_obs <- rbind(treat_obs_A,treat_obs_B)

# Generate Modified Outcome (doubly robust / AIPW score)
Y_star = y1hat - y0hat + treat_obs*(giving_obs - y1hat)/pscore -
  (1-treat_obs)*(giving_obs - y0hat)/(1-pscore)

# Average Treatment Effect (ATE)
ATE <- round(mean(Y_star), digits=1)
print(paste0("Average Treatment Effect (ATE): ", ATE))

# Standard error
SD_ATE <- round(sqrt(var(Y_star)/length(Y_star)),digits=1)
print(paste0("Standard Error for ATE: ", SD_ATE))

####################################################################################

######################## Individualised Treatment Rules ########################

set.seed(1234567)

# Define transformed Variables
sign = sign(Y_star)
lambda = abs(Y_star)
Z <- factor(sign, labels = c("Don't", "Treat"))
df_obs <- rbind(df_obs_A,df_obs_B)

# Generate linear formula for tree
sumx <- paste(covariates, collapse = " + ")
linear <- paste("Z",paste(sumx, sep=" + "), sep=" ~ ")
linear <- as.formula(linear)

######################## Build a Shallow Tree ########################

# Tree
tree_1 <- rpart(formula = linear, # Predict sign of treatment
                data = df_obs,
                weights = lambda, # Larger absolute effect -> Higher weight
                method = "class",
                control = rpart.control(cp = 2.00e-10,maxdepth = 3, minbucket=10))

# Plot MSE in CV-Sample
rpart.plot(tree_1,digits=3)

# Predict policy rule to hold-out-sample
pi_tree1_hold_out = as.matrix(predict(tree_1, newdata=covariates_hold_out))

####################################################################################

############################# Build a Deeper Tree #################################

set.seed(1234567)

# Tree
tree_2 <- rpart(formula = linear, # Predict sign of treatment
                data = df_obs,
                weights = lambda, # Larger absolute effect --> Higher weight
                method = "class",
                control = rpart.control(cp = 2.00e-10, minbucket=10))

# Find optimal tree sizes
op.index_2 <- which.min(tree_2$cptable[, "xerror"])
print(paste0("Optimal number of splits: ", tree_2$cptable[op.index_2, "nsplit"]))

# Plot CV-Error
plotcp(tree_2, minline = TRUE)
abline(v = op.index_2, lty = "dashed")

######################## Select the Tree that Minimises CV-MSE ########################

# Get cp-value that corresponds to optimal tree sizes
cp.vals_2 <- tree_2$cptable[op.index_2, "CP"]

# Prune the trees
prune_tree_2 <- prune(tree_2, cp = cp.vals_2)

# Plot pruned tree
rpart.plot(prune_tree_2,digits=3, main = "Pruned Tree")

# Predict policy rule to hold-out-sample
pi_tree2_hold_out = as.matrix(predict(prune_tree_2, newdata=covariates_hold_out))

#########################################################################################

######################## Share of Treated ########################

# Rule based on shallow tree (ITR1)
rule_tree_1 <- as.numeric(pi_tree1_hold_out[,2]> .5)
# Rule based on deeper tree (ITR2)
rule_tree_2 <- as.numeric(pi_tree2_hold_out[,2]> .5)

print('Descriptives of Policy Rules')
desc <- fBasics::basicStats(cbind(rule_tree_1,rule_tree_2)) %>% t() %>% as.data.frame() %>%
  select(Mean, nobs)
print(round(desc, digits=5))

print('Correlation between the Policy Rules')
corr = cor(cbind(rule_tree_1,rule_tree_2))
print(corr)

#####################################################################

######################## Average Giving Under Policy Rule ########################

# Generate Modified Outcome
y_1_hold_out = y1hat_hold_out + treat_hold_out*(giving_hold_out - y1hat_hold_out)/pscore_hold_out
y_0_hold_out = y0hat_hold_out + (1-treat_hold_out)*(giving_hold_out - y0hat_hold_out)/(1-pscore_hold_out)

# Calculate expected average giving under the different policy rules
O_tree_1 <- round(mean(rule_tree_1*y_1_hold_out + (1-rule_tree_1)*y_0_hold_out), digits = 2)
O_tree_2 <- round(mean(rule_tree_2*y_1_hold_out + (1-rule_tree_2)*y_0_hold_out), digits = 2)

print('Average Givings Under')
print(paste0("Shallow Tree: ",O_tree_1))
print(paste0("Pruned Tree: ",O_tree_2))

#####################################################################################

######################## Policy Value Compared to Everybody is Treated ########################

#Modified Outcome
Y_star_hold_out = y_1_hold_out - y_0_hold_out

# Estimate Policy Value
tree_all <- round(mean((rule_tree_2-1)*Y_star_hold_out), digits = 2)
se_tree_all <- round(sqrt(var((rule_tree_2-1)*Y_star_hold_out)/length(Y_star_hold_out)), digits = 2)

print('Total Policy Value Compared to Everybody is Treated')
print(paste0("Average Gain of Pruned Tree: ", tree_all))
print(paste0("Standard Error: ", se_tree_all))

#round(mean(giving_hold_out[treat_hold_out==1,]), digits = 2)
#round(mean((rule_tree_2-1)*Y_star_hold_out)/mean(giving_hold_out[treat_hold_out==1,]), digits = 2)

################################################################################################

######################## Policy Value Compared to Nobody is Treated ########################

# Estimate Policy Value
tree_no <- round(mean(rule_tree_2*Y_star_hold_out), digits = 2)
se_tree_no <- round(sqrt(var(rule_tree_2*Y_star_hold_out)/length(Y_star_hold_out)), digits = 2)

print('Total Policy Value Compared to Nobody is Treated')
print(paste0("Average Gain of Pruned Tree: ", tree_no))
print(paste0("Standard Error: ", se_tree_no))

#round(mean(giving_hold_out[treat_hold_out==0,]), digits = 2)
#round(mean(rule_tree_2*Y_star_hold_out)/mean(giving_hold_out[treat_hold_out==0,]), digits = 2)

################################################################################################

######################## Policy Value Compared to Random Assignment ########################

# Estimate Policy Value
R1_tree_2 <- round(1/2*mean((2*rule_tree_2-1)*Y_star_hold_out), digits = 2)
se_tree_2 <- round(sqrt(1/4*var((2*rule_tree_2-1)*Y_star_hold_out)/length(Y_star_hold_out)), digits = 2)


print('Total Policy Value Compared to Random Assignment')
print(paste0("Average Gain of Pruned Tree: ", R1_tree_2))
print(paste0("Standard Error: ", se_tree_2))

#round((mean(giving_hold_out[treat_hold_out==1,])+mean(giving_hold_out[treat_hold_out==0,]))/2, digits = 2)
#round(1/2*mean((2*rule_tree_2-1)*Y_star_hold_out)/(mean(giving_hold_out[treat_hold_out==1,])+
#      mean(giving_hold_out[treat_hold_out==0,]))/2, digits = 2)

################################################################################################


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Machine-Learning-Course
Machine Learning for Economists and Business Analysts

[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/AStrittmatter/Machine-Learning-Course/HEAD) 5 | 6 | Machine learning estimation methods are gaining more and more popularity. Compared to conventional estimation methods, machine learning can solve statistical prediction tasks in a data-adaptive way. Furthermore, machine learning can deal with high-dimensional variable spaces in a relatively flexible way. Prediction methods are used in many different business and economic domains. Examples of prediction tasks are: the prediction of sales for a grocery store, such that logisticians can ship products before they are sold; or the prediction of the probability of becoming drug addicted later in life, such that drug prevention programs can be targeted at adolescents at high risk. 7 | 8 | Besides predictions, economists and managers are often interested in causal questions. Examples of causal questions are: What are the effects of tweets by Elon Musk on Bitcoin? What impact does lowering the central bank interest rate have on GDP? Does participation in training programs reduce the unemployment duration? Machine learning cannot give us an automatic answer to causal questions without using an empirical design. However, machine learning estimates can serve as input factors for these empirical designs. Furthermore, we can estimate heterogeneous effects with machine learning. 9 | 10 | The course covers different predictive and causal machine learning methods. A focus will be on the application of these methods in practical R programming sessions.
11 | 12 | Predictive Machine Learning: 13 | - Regularized Regression 14 | - Trees and Forests 15 | - Unsupervised Machine Learning 16 | 17 | Causal Machine Learning 18 | - Double Selection Procedure 19 | - Debiased Machine Learning 20 | - Causal Forests 21 | - Optimal Policy Learning 22 | - Reinforcement Learning 23 | -------------------------------------------------------------------------------- /Stata Example/ajr_example.do: -------------------------------------------------------------------------------- 1 | clear 2 | 3 | // Data is from Acemoglu, Johnson, and Robinson (2001) "The Colonial Origins of Comparative Development: An Empirical Investigation" 4 | use https://statalasso.github.io/dta/AJR.dta 5 | 6 | // We estimate the effect of institutions (avexpr) on income (logpgp95) 7 | // logpgp95 - log of GDP per capita in 1995 8 | // avexpr - average protection against expropriation risk, 1985-1995 9 | 10 | * Unconditional OLS estimate 11 | reg logpgp95 avexpr, robust 12 | 13 | * Conditional OLS estimate 14 | // We have 24 control variables (latitude, temperature, humidity, ethnical diversity, soil, commodities, etc.) 15 | // The data contains only 64 country-level observations 16 | reg logpgp95 avexpr lat_abst edes1975 avelf temp* humid* steplow-oilres, robust 17 | 18 | * Post-Lasso Double Selection Procedure 19 | // Let the data decide which control variables are important 20 | pdslasso logpgp95 avexpr (lat_abst edes1975 avelf temp* humid* steplow-oilres), robust nois 21 | 22 | 23 | * Useful Links: 24 | // https://statalasso.github.io/ 25 | // https://github.com/aahrens1 26 | // https://economics.mit.edu/files/4123 27 | -------------------------------------------------------------------------------- /Stata Example/pdslasso.ado: -------------------------------------------------------------------------------- 1 | *! pdslasso 1.0.01 30jan2018 2 | *!
authors aa/cbh/ms 3 | * wrapper for ivlasso 4 | 5 | program define pdslasso, eclass sortpreserve 6 | syntax [anything] [if] [in] , /// 7 | [ /// 8 | OLSOPTions(string) /// options passed to IV or OLS estimation 9 | * ] 10 | 11 | version 13 12 | ivlasso `anything' `if' `in', `options' cmdname(pdslasso) ivoptions(`olsoptions') 13 | 14 | ereturn local cmd pdslasso 15 | 16 | end 17 | 18 | -------------------------------------------------------------------------------- /Stata Example/rlasso.ado: -------------------------------------------------------------------------------- 1 | *! rlasso 1.0.06 10feb2018 2 | *! authors aa/cbh/ms 3 | 4 | * Updates (release date): 5 | * 1.0.05 (30jan2018) 6 | * First public release. 7 | * Added seed(.) option to rlasso/lassoutils to control rnd # seed for xdep & sup-score. 8 | * Fixed bug in DisplayCoefs (didn't accommodate both e(notpen) and e(pnotpen)). 9 | * Promoted to require version 13 or higher. 10 | * Added dots option. 11 | * Fixed displaynames bug (wrong dictionaries used for partialled-out vars). 12 | * Recoding of cons and demeaning flags. 13 | * partial and nocons no longer compatible. 14 | * Removed hdm version of sup-score stat. 15 | * Removed misc debug code. 16 | * 1.0.06 (xxx) 17 | * Support for Sergio Correia's FTOOLS FE transform (if installed). 18 | 19 | program rlasso, eclass sortpreserve 20 | 21 | version 13 22 | 23 | syntax [anything] [if] [in] [, /// 24 | displayall /// 25 | varwidth(int 17) /// 26 | VERsion /// 27 | supscore /// 28 | testonly /// 29 | * /// 30 | ] 31 | 32 | local lversion 1.0.05 // NOTE(review): file banner above says 1.0.06 -- confirm which version string is intended 33 | 34 | if "`version'" != "" { // Report program version number, then exit.
35 | di in gr "`lversion'" 36 | ereturn clear 37 | ereturn local version `lversion' 38 | exit 39 | } 40 | 41 | if ~replay() { // not replay so estimate 42 | _rlasso `anything' `if' `in', /// 43 | `options' `supscore' `testonly' 44 | } 45 | else if e(cmd)~="rlasso" { // replay, so check that rlasso results exist 46 | di as err "last estimates not found" 47 | exit 301 48 | } 49 | 50 | if "`e(method)'"~="" { 51 | DisplayCoefs, `displayall' varwidth(`varwidth') 52 | } 53 | 54 | // temp measure 55 | if e(supscore) < . { 56 | DisplaySupScore 57 | } 58 | 59 | end 60 | 61 | program _rlasso, eclass sortpreserve 62 | 63 | version 13 64 | 65 | syntax varlist(numeric fv ts min=2) [if] [in] [, /// 66 | /// specify options with varlists to be used by marksample/markout 67 | PNOTPen(varlist fv ts numeric) /// list of variables not penalised 68 | partial(string) /// string so that list can contain "_cons" 69 | fe /// do within-transformation 70 | NOCONStant /// 71 | CLuster(varlist max=1) /// penalty level/loadings allow for within-panel dependence & heterosk.
72 | pols /// post-lasso coefs in e(b) (default=lasso) 73 | prestd /// 74 | VERbose /// pass to lassoutils 75 | VVERbose /// pass to lassoutils 76 | dots /// 77 | displaynames_o(string) /// dictionary with names of vars as supplied in varlist 78 | displaynames_d(string) /// corresponding display names of vars 79 | pminus(int 0) /// overrides calculation of pminus 80 | debug /// used for debugging 81 | postall /// full coef vector in e(b) (default=selected only) 82 | testonly /// obtain supscore test only 83 | NOFTOOLS /// 84 | * /// additional options to be passed to lassoutils 85 | ] 86 | 87 | *** rlasso-specific 88 | // to distinguish between lasso2 and rlasso treatment of notpen, 89 | // rlasso option is called pnotpen 90 | // to keep lasso2 and rlasso code aligned, rename to notpen here 91 | // and at end of program save macros as pnotpen 92 | // temporary measure until lasso2 and rlasso code is merge 93 | local notpen `pnotpen' 94 | // supscore test flag 95 | local testonlyflag =("`testonly'"~="") 96 | * 97 | 98 | *** debug mode; create flag 99 | local debugflag =("`debug'"~="") 100 | * 101 | 102 | *** Record which observations have non-missing values 103 | marksample touse 104 | markout `touse' `varlist' `cluster' `ivar' 105 | sum `touse' if `touse', meanonly // will sum weight var when weights are used 106 | local N = r(N) 107 | * 108 | 109 | *** FEs. Create 1/0 flag. 110 | // Get panel id 111 | local feflag=("`fe'"~="") 112 | if `feflag' { 113 | cap _xt 114 | if _rc ~= 0 { 115 | di as err "Error: fe option requires data to be xtset" 116 | exit 459 117 | } 118 | else { 119 | local ivar `r(ivar)' 120 | } 121 | } 122 | * 123 | 124 | *** constant, partial, etc. 
125 | // conmodel: constant in original model 126 | // consflag: constant in transformed equation to estimate 127 | local consmodel =("`noconstant'"=="") & ~`feflag' // if fe, then consmodel=0 & partialcons="" 128 | local partialflag =("`partial'"~="") // =1 even if just cons being partialled out 129 | local prestdflag =("`prestd'"~="") 130 | // "_cons" allowed as an argument to partial(.) - remove it 131 | local partial : subinstr local partial "_cons" "", all word count(local pconscount) 132 | local notpen : subinstr local notpen "_cons" "", all word count(local notpenconscount) 133 | // Tell estimation code if cons has been partialled out or there isn't one in the first place 134 | if `feflag' | `partialflag' | `prestdflag' | (~`consmodel') { 135 | local consflag 0 136 | } 137 | else { 138 | local consflag 1 139 | } 140 | * 141 | 142 | *** create main varlist and tempvars 143 | // remove duplicates from varlist 144 | // _o list is vars with original names 145 | fvexpand `varlist' if `touse' 146 | local varlist_o `r(varlist)' 147 | // check for duplicates has to follow expand 148 | local dups : list dups varlist_o 149 | if "`dups'"~="" { 150 | di as text "Dropping duplicates: `dups'" 151 | } 152 | local varlist_o : list uniq varlist_o 153 | * 154 | 155 | *** Create separate _o varlists: Y, X, notpen, partial 156 | // Y, X 157 | local varY_o : word 1 of `varlist_o' 158 | local varX_o : list varlist_o - varY_o // incl notpen/partial 159 | // notpen 160 | fvexpand `notpen' if `touse' 161 | local notpen_o `r(varlist)' 162 | local dups : list dups notpen_o 163 | if "`dups'"~="" { 164 | di as text "Dropping duplicates: `dups'" 165 | } 166 | local notpen_o : list uniq notpen_o 167 | // partial 168 | fvexpand `partial' if `touse' 169 | local partial_o `r(varlist)' 170 | local dups : list dups partial_o 171 | if "`dups'"~="" { 172 | di as text "Dropping duplicates: `dups'" 173 | } 174 | local partial_o : list uniq partial_o 175 | // "model" = vars without partialled-out 
176 | local varXmodel_o : list varX_o - partial_o 177 | * 178 | 179 | *** syntax checks 180 | // check that notpen vars are in full list 181 | local checklist : list notpen_o - varX_o 182 | local checknum : word count `checklist' 183 | if `checknum' { 184 | di as err "syntax error - `checklist' in notpen(.) but not in list of regressors" 185 | exit 198 186 | } 187 | // check that partial vars are in full list 188 | local checklist : list partial_o - varX_o 189 | local checknum : word count `checklist' 190 | if `checknum' { 191 | di as err "syntax error - `checklist' in partial(.) but not in list of regressors" 192 | exit 198 193 | } 194 | // check that ivar (FE) is not a used variable 195 | if `feflag' { 196 | fvrevar `varY_o' `varX_o', list // list option means we get only base vars 197 | local vlist `r(varlist)' 198 | local checklist : list ivar - vlist 199 | local checknum : word count `checklist' 200 | if `checknum'==0 { 201 | di as err "syntax error - `ivar' is xtset variable and cannot be used in model" 202 | exit 198 203 | } 204 | } 205 | // other checks 206 | if `pconscount' & `feflag' { 207 | di as err "error: incompatible options, partial(_cons) and fe" 208 | exit 198 209 | } 210 | if "`partial'"~="" & "`noconstant'"~="" { 211 | di as err "error: incompatible options, partial and nocons" 212 | exit 198 213 | } 214 | if `feflag' & "`noconstant'"~="" { 215 | di as err "error: incompatible options, fe and nocons" 216 | exit 198 217 | } 218 | * 219 | 220 | *** Create _t varlists: Y, X, notpen, partial 221 | // _o list is vars with original names 222 | // _t list is temp vars if transform needed, original vars if not 223 | if `feflag' { // everything needs to be transformed including partial 224 | local temp_ct : word count `varlist_o' 225 | mata: s_maketemps(`temp_ct') 226 | local varlist_t `r(varlist)' 227 | } 228 | else if `partialflag' | `prestdflag' { // everything except partial_o needs to be transformed 229 | local varYXmodel_o `varY_o' `varXmodel_o' 
230 | local temp_ct : word count `varYXmodel_o' 231 | mata: s_maketemps(`temp_ct') 232 | local varYXmodel_t `r(varlist)' 233 | matchnames "`varlist_o'" "`varYXmodel_o'" "`varYXmodel_t'" 234 | local varlist_t `r(names)' 235 | } 236 | else { // no transformation needed but still need temps 237 | fvrevar `varlist_o' if `touse' // fvrevar creates temps only when needed 238 | local varlist_t `r(varlist)' 239 | } 240 | // dictionary is now varlist_o / varlist_t 241 | // now create separate _o and _t varlists using dictionary 242 | foreach vlist in varY varX varXmodel notpen partial { 243 | matchnames "``vlist'_o'" "`varlist_o'" "`varlist_t'" 244 | local `vlist'_t `r(names)' // corresponding tempnames; always need this because of possible fvs 245 | } 246 | * 247 | 248 | ******************* Display names *********************************************************** 249 | // may be called by another program with tempvars and display names for them 250 | // if display names option not used, use _o names as provided in rlasso command 251 | // if display names option used, use display names matched with _o names 252 | // if display names macros are empty, has no effect 253 | matchnames "`varY_o'" "`displaynames_o'" "`displaynames_d'" 254 | local varY_d `r(names)' 255 | matchnames "`varXmodel_o'" "`displaynames_o'" "`displaynames_d'" 256 | local varXmodel_d `r(names)' 257 | matchnames "`varX_o'" "`displaynames_o'" "`displaynames_d'" 258 | local varX_d `r(names)' 259 | matchnames "`notpen_o'" "`displaynames_o'" "`displaynames_d'" 260 | local notpen_d `r(names)' 261 | matchnames "`partial_o'" "`displaynames_o'" "`displaynames_d'" 262 | local partial_d `r(names)' 263 | * 264 | 265 | *** summary varlists and flags: 266 | * varY_o = dep var 267 | * varY_t = dep var, temp var 268 | * varX_o = full, expanded set of RHS, original names, includes partial 269 | * varX_t = as above but with temp names for all variables 270 | * varXmodel_o = full, expanded set of RHS, original names, 
excludes partial 271 | * varXmodel_t = as above but with temp names for all variables 272 | * notpen_o = full, expanded set of not-penalized 273 | * notpen_t = as above but with temp names for all variables 274 | 275 | // p is number of penalized vars in the model; follows convention in BCH papers 276 | // p is calculated in lassoutils/_rlasso as number of model vars excluding constant 277 | // here we calculate which of the model vars are unpenalized or omitted/base vars 278 | // to provide as `pminus' to lassoutils/_rlasso (unless provided by user) 279 | // do here so that code above is compatible with lasso2 280 | // use _o names / display names since they have info on whether var is omitted/base/etc. 281 | if ~`pminus' { 282 | foreach vn of local varXmodel_d { // display names 283 | _ms_parse_parts `vn' 284 | // increment pminus if model variable is MISSING 285 | if r(omit) { 286 | local ++pminus 287 | } 288 | } 289 | foreach vn of local notpen_d { // display names 290 | _ms_parse_parts `vn' 291 | // increment pminus if notpen variable is NOT MISSING 292 | if ~r(omit) { 293 | local ++pminus 294 | } 295 | } 296 | } 297 | // p0 here is total number of variables provided to model EXCLUDING constant 298 | local p0 : word count `varXmodel_o' 299 | local p =`p0'-`pminus' 300 | // warn 301 | if `p'<=0 { 302 | di as text "warning: no penalized regressors; results are OLS" 303 | } 304 | // now for error-checking below, p0 should INCLUDE constant unless partialled-out etc. 305 | local p0 =`p0'+`consflag' 306 | * 307 | 308 | ******************* FE, partialling out, standardization ************************************ 309 | // If FE: partial-out FEs from temp variables, then preserve, 310 | // then partial-out low-dim ctrls from temp variables 311 | // restore will restore all temp vars with only FEs partialled-out 312 | // If no FE: leave original variables unchanged. 313 | // partial-out low-dim ctrls from temp variables. 
314 | // if no FE/low-dim ctrls, no transform needed 315 | 316 | local dmflag =0 // initialize demeaned flag 317 | if `feflag' { // FE-transform all variables 318 | fvrevar `varY_o' `varX_o' if `touse' // in case any FV or TS vars in _o list 319 | local vlist `r(varlist)' 320 | lassoutils `vlist', /// call on _o list 321 | touse(`touse') /// 322 | tvarlist(`varY_t' `varX_t') /// overwrite/initialize these 323 | `noftools' /// 324 | fe(`ivar') // triggers branching to FE utility 325 | local N_g =r(N_g) // N_g will be empty if no FEs 326 | local noftools `r(noftools)' // either not installed or user option 327 | local dmflag=1 // data are now demeaned 328 | if `partialflag' { // And then partial out any additional vars 329 | preserve // preserve the original values of tempvars before partialling out 330 | lassoutils `varY_t' `varXmodel_t', /// _t vars have been created and filled so use here 331 | touse(`touse') /// don't need tvarlist because vars already created 332 | partial(`partial_t') /// _t vars have been created and filled so use here 333 | partialflag(`partialflag') /// triggers branching to partial utility 334 | dmflag(1) // FE => mean zero 335 | } 336 | if `prestdflag' { 337 | tempname prestdY prestdX 338 | lassoutils `varY_t', /// _t vars have been created and filled so use here 339 | touse(`touse') /// don't need tvarlist because vars already created 340 | std /// 341 | dmflag(1) // FE => data already mean zero 342 | mat `prestdY'=r(stdvec) 343 | lassoutils `varXmodel_t', /// 344 | touse(`touse') /// 345 | std /// 346 | dmflag(1) // FE => data already mean zero 347 | mat `prestdX'=r(stdvec) 348 | } 349 | } 350 | else if `partialflag' { // Just partial out 351 | fvrevar `varY_o' `varXmodel_o' if `touse' // in case any FV or TS vars in _o list 352 | local vlist `r(varlist)' 353 | fvrevar `partial_o' if `touse' // in case any FV or TS vars in _o list 354 | local pvlist `r(varlist)' 355 | lassoutils `vlist', /// call on _o list 356 | touse(`touse') /// 357 | 
partial(`pvlist') /// 358 | tvarlist(`varY_t' `varXmodel_t') /// overwrite/initialize these 359 | partialflag(`partialflag') /// triggers branching to partial utility 360 | dmflag(0) // data are not yet demeaned 361 | local dmflag =1 // data are now demeaned 362 | if `prestdflag' { 363 | tempname prestdY prestdX 364 | lassoutils `varY_t', /// _t vars have been created and filled so use here 365 | touse(`touse') /// don't need tvarlist because vars already created 366 | std /// 367 | dmflag(1) // partial => already mean zero 368 | mat `prestdY'=r(stdvec) 369 | lassoutils `varXmodel_t', /// 370 | touse(`touse') /// 371 | std /// 372 | dmflag(1) // partial => already mean zero 373 | mat `prestdX'=r(stdvec) 374 | } 375 | } 376 | else if `prestdflag' { 377 | tempname prestdY prestdX 378 | lassoutils `varY_o', /// call on _o list 379 | touse(`touse') /// 380 | std /// 381 | tvarlist(`varY_t') /// overwrite/initialize these 382 | consmodel(`consmodel') /// =1 => data should be demeaned 383 | dmflag(0) // data not yet mean zero 384 | mat `prestdY'=r(stdvec) 385 | fvrevar `varXmodel_o' if `touse' // in case any FV or TS vars in _o list 386 | local vlist `r(varlist)' 387 | lassoutils `vlist', /// call on _o list 388 | touse(`touse') /// 389 | std /// 390 | tvarlist(`varXmodel_t') /// overwrite/initialize these 391 | consmodel(`consmodel') /// =1 => data should be demeaned 392 | dmflag(0) // data not yet mean zero 393 | mat `prestdX'=r(stdvec) 394 | if `consmodel' { 395 | local dmflag =1 // if cons in model, data are now demeaned 396 | } 397 | } 398 | 399 | ************* Partialling/standardization END *********************************************** 400 | 401 | ************* Lasso estimation with transformed/partialled-out vars ************************* 402 | if "`verbose'`vverbose'`dots'"=="" { 403 | local quietly "quietly" // don't show lassoutils output 404 | } 405 | 406 | `quietly' lassoutils `varY_t', /// 407 | rlasso /// branch to _rlasso subroutine 408 | /// nocons, no 
penloads, etc. all assumed 409 | touse(`touse') /// 410 | xnames_o(`varXmodel_d') /// display names for lassoutils output 411 | xnames_t(`varXmodel_t') /// 412 | cluster(`cluster') /// 413 | notpen_o(`notpen_d') /// 414 | notpen_t(`notpen_t') /// 415 | consflag(`consflag') /// =0 if cons already partialled out or if no cons 416 | dmflag(`dmflag') /// =1 if data have been demeaned 417 | pminus(`pminus') /// 418 | stdy(`prestdY') /// 419 | stdx(`prestdX') /// 420 | `verbose' `vverbose' `dots' /// 421 | `testonly' /// 422 | `options' 423 | * 424 | 425 | ************* Finish up ******************************************************** 426 | *** e-return lasso estimation results 427 | tempname b beta betaOLS Ups sUps eUps 428 | tempname betaAll betaAllOLS 429 | tempname lambda slambda lambda0 rmse rmseOLS 430 | tempname c gamma gammad 431 | tempname supscore supscore_p supscore_cv supscore_gamma 432 | 433 | if ~`testonlyflag' { 434 | 435 | if "`cluster'" ~= "" { 436 | local N_clust =r(N_clust) 437 | } 438 | mat `beta' =r(beta) // may be empty! 439 | mat `betaOLS' =r(betaOLS) // may be empty! 
440 | mat `betaAll' =r(betaAll) 441 | mat `betaAllOLS' =r(betaAllOLS) 442 | mat `Ups' =r(Ups) 443 | mat `sUps' =r(sUps) 444 | mat `eUps' =r(eUps) 445 | scalar `lambda' =r(lambda) 446 | scalar `slambda' =r(slambda) 447 | scalar `lambda0' =r(lambda0) 448 | scalar `c' =r(c) 449 | scalar `gamma' =r(gamma) 450 | scalar `gammad' =r(gammad) 451 | scalar `rmse' =r(rmse) // Lasso RMSE 452 | scalar `rmseOLS' =r(rmseOLS) // post-Lasso RMSE 453 | local selected `r(selected)' // EXCL NOTPEN/CONS 454 | local selected0 `r(selected0)' // INCL NOTPEN, EXCL CONS 455 | local s =r(s) // EXCL NOTPEN/CONS; number of elements in selected 456 | local s0 =r(s0) // INCL NOTPEN, EXCL CONS; number of elements in selected0 457 | local clustvar `r(clustvar)' 458 | local robust `r(robust)' 459 | local center =r(center) 460 | local method `r(method)' // lasso or sqrt-lasso 461 | local niter =r(niter) 462 | local maxiter =r(maxiter) 463 | local nupsiter =r(nupsiter) 464 | local maxupsiter =r(maxupsiter) 465 | // these can be missings 466 | scalar `supscore' =r(supscore) 467 | scalar `supscore_p' =r(supscore_p) 468 | scalar `supscore_cv' =r(supscore_cv) 469 | scalar `supscore_gamma' =r(supscore_gamma) 470 | local ssnumsim =r(ssnumsim) 471 | 472 | // flag for empty beta (consflag=0 means rlasso didn't estimate a constant) 473 | local betaempty =(`s0'==0 & `consflag'==0) 474 | // error check 475 | if `betaempty' { 476 | if ~(colsof(`beta')==1 & `beta'[1,1]==.) { 477 | di as err "internal _rlasso error - beta should be empty (no vars estimated) but isn't 478 | exit 499 479 | } 480 | } 481 | // issue warning if lasso max iteration limit hit 482 | if `niter'==`maxiter' { 483 | di as text "Warning: reached max shooting iterations w/o achieving convergence." 484 | } 485 | // error check - p0s and ps should match 486 | if `p0'~=r(p0) { // number of all variables in betaAll INCL NOTPEN/CONS (if present or not partialled etc.) 
487 | di as err "internal _rlasso error - p0 count of model vars `p0' does not match returned value `r(p0)'" 488 | exit 499 489 | } 490 | if `p'~=r(p) { // number of penalized variables in model 491 | di as err "internal _rlasso error - p count of penalized vars `p' does not match returned value `r(p)'" 492 | exit 499 493 | } 494 | // fix depvar (rownames) of beta vectors to use _o (or _d if display names provided) not _t 495 | mat rownames `beta' = `varY_d' 496 | mat rownames `betaOLS' = `varY_d' 497 | mat rownames `betaAll' = `varY_d' 498 | mat rownames `betaAllOLS' = `varY_d' 499 | if ~`betaempty' { // cnames should stay empty if beta has a single missing value 500 | local cnames_o : colnames `beta' 501 | fvstrip `cnames_o' // colnames may insert b/n/o operators - remove 502 | local cnames_o `r(varlist)' 503 | matchnames "`cnames_o'" "`varlist_o'" "`varlist_t'" 504 | local cnames_t `r(names)' 505 | } 506 | else { 507 | local cnames_o 508 | local cnames_t 509 | } 510 | * 511 | 512 | *********** Get coeff estimates for partialled-out vars/std intercept. 
******************** 513 | if `feflag' & `partialflag' { // FE case and there are partialled-out notpen vars 514 | restore // Restores dataset with tempvars after FE transform but before notpen partialled out 515 | } 516 | if `partialflag' | (`prestdflag' & `consmodel') { // standardization removes constant so must enter for that 517 | if `feflag' { 518 | local depvar `varY_t' // use FE-transformed depvar and X vars 519 | local scorevars `cnames_t' 520 | } 521 | else { 522 | local depvar `varY_o' // use original depvar and X vars 523 | local scorevars `cnames_o' 524 | } 525 | lassoutils `depvar', /// 526 | unpartial /// 527 | touse(`touse') /// 528 | beta(`beta') /// 529 | scorevars(`scorevars') /// 530 | partial(`partial_t') /// 531 | names_o(`varX_d') /// dictionary 532 | names_t(`varX_t') /// dictionary 533 | consmodel(`consmodel') 534 | mat `beta' = r(b) 535 | mat `betaAll' = `betaAll', r(bpartial) 536 | lassoutils `depvar', /// 537 | unpartial /// 538 | touse(`touse') /// 539 | beta(`betaOLS') /// 540 | scorevars(`scorevars') /// 541 | partial(`partial_t') /// 542 | names_o(`varX_d') /// dictionary 543 | names_t(`varX_t') /// dictionary 544 | consmodel(`consmodel') 545 | mat `betaOLS' = r(b) 546 | mat `betaAllOLS' = `betaAllOLS', r(bpartial) 547 | // for unknown reasons, _ms_build_info doesn't add info here (e.g. 
"base") 548 | _ms_build_info `beta' if `touse' 549 | _ms_build_info `betaAll' if `touse' 550 | _ms_build_info `betaOLS' if `touse' 551 | _ms_build_info `betaAllOLS' if `touse' 552 | // finish by setting betaempty to 0 553 | local betaempty =0 554 | } 555 | * 556 | 557 | *** Prepare and post results 558 | if "`pols'"=="" & "`postall'"=="" { // selected lasso coefs by default 559 | mat `b' = `beta' 560 | } 561 | else if "`pols'"~="" & "`postall'"=="" { // selected post-lasso coefs 562 | mat `b' = `betaOLS' 563 | } 564 | else if "`pols'"=="" { // full lasso coef vector 565 | mat `b' = `betaAll' 566 | } 567 | else { // full post-lasso coef vector 568 | mat `b' = `betaAllOLS' 569 | } 570 | if `betaempty' & "`postall'"=="" { // no vars in b 571 | ereturn post , obs(`N') depname(`varY_d') esample(`touse') // display name 572 | } 573 | else { // b has some selected/nonpen/cons 574 | ereturn post `b', obs(`N') depname(`varY_d') esample(`touse') // display name 575 | } 576 | // additional returned results 577 | ereturn local noftools `noftools' 578 | ereturn local postall `postall' 579 | ereturn scalar niter =`niter' 580 | ereturn scalar maxiter =`maxiter' 581 | ereturn scalar nupsiter =`nupsiter' 582 | ereturn scalar maxupsiter =`maxupsiter' 583 | ereturn local robust `robust' 584 | ereturn local ivar `ivar' 585 | ereturn local selected `selected' // selected only 586 | ereturn local varXmodel `varXmodel_d' // display name 587 | ereturn local varX `varX_d' // display name 588 | if "`pols'"=="" { 589 | ereturn local estimator ols 590 | } 591 | else { 592 | ereturn local estimator postlasso 593 | } 594 | ereturn local method `method' 595 | ereturn local predict rlasso_p 596 | ereturn local cmd rlasso 597 | ereturn scalar center =`center' 598 | ereturn scalar cons =`consmodel' 599 | ereturn scalar lambda =`lambda' 600 | ereturn scalar lambda0 =`lambda0' 601 | ereturn scalar slambda =`slambda' 602 | ereturn scalar c =`c' 603 | ereturn scalar gamma =`gamma' 604 | ereturn scalar 
gammad =`gammad' 605 | 606 | if `supscore' < . { 607 | ereturn scalar ssnumsim =`ssnumsim' 608 | ereturn scalar supscore =`supscore' 609 | ereturn scalar supscore_p =`supscore_p' 610 | ereturn scalar supscore_cv =`supscore_cv' 611 | ereturn scalar supscore_gamma =`supscore_gamma' 612 | } 613 | 614 | if "`N_clust'" ~= "" { 615 | ereturn local clustvar `clustvar' 616 | ereturn scalar N_clust =`N_clust' 617 | } 618 | if "`N_g'" ~= "" { 619 | ereturn scalar N_g =`N_g' 620 | } 621 | ereturn scalar fe =`feflag' 622 | ereturn scalar rmse =`rmse' 623 | ereturn scalar rmseOLS =`rmseOLS' 624 | ereturn scalar pminus =`pminus' 625 | ereturn scalar p =`p' // number of all penalized vars; excludes omitteds etc. 626 | ereturn scalar s0 =`s0' // number of all estimated coefs (elements of beta) 627 | ereturn scalar s =`s' // number of selected 628 | 629 | ereturn matrix sUps =`sUps' 630 | ereturn matrix eUps =`eUps' 631 | ereturn matrix Ups =`Ups' 632 | ereturn matrix betaAllOLS =`betaAllOLS' 633 | ereturn matrix betaAll =`betaAll' 634 | ereturn matrix betaOLS =`betaOLS' 635 | ereturn matrix beta =`beta' 636 | 637 | // rlasso-specific: 638 | // selected0 and s0 included partialled-out. 639 | // If cons exists and was not partialled out, add to notpen and selected0. 640 | // Otherwise if cons exists and was partialled out, add to to partial list. 
	// ---- tail of the main rlasso estimation routine (definition begins before this chunk) ----
	// Constant bookkeeping for returned varlists: the constant is appended either to the
	// not-penalized list (it was estimated) or to the partialled-out list (it was removed
	// from the data before estimation).
	if `consmodel' & ~`partialflag' {
		local selected0		`selected0' _cons
		local notpen_d		`notpen_d' _cons				// display name
	}
	else if `consmodel' & `partialflag' {
		local partial_d		`partial_d' _cons				// display name
		local selected0		`selected0' `partial_d'			// display name
	}
	else if `partialflag' {
		local selected0		`selected0' `partial_d'			// display name
	}
	// remaining results
	ereturn local	selected0		`selected0'
	ereturn local	partial			`partial_d'				// display name
	ereturn scalar	partial_ct		=`: word count `partial_d''		// (display name) number of partialled-out INCLUDING CONSTANT
	ereturn scalar	s0				=`: word count `selected0''		// (update) selected or notpen, INCL CONS
	// rlasso-specific - save as "pnotpen" (vs lasso2 "notpen")
	ereturn local	pnotpen			`notpen_d'				// display name
	ereturn scalar	pnotpen_ct		=`: word count `notpen_d''		// (display name) number of notpen INCLUDING CONSTANT (if not partialled-out)
	*
	}
	else {

		// sup-score test only - no lasso results
		ereturn clear

		ereturn scalar	N				=r(N)
		ereturn scalar	N_clust			=r(N_clust)
		ereturn scalar	gamma			=r(gamma)
		ereturn scalar	c				=r(c)
		ereturn scalar	p				=`p'
		ereturn scalar	ssnumsim		=r(ssnumsim)
		ereturn scalar	supscore		=r(supscore)
		ereturn scalar	supscore_p		=r(supscore_p)
		ereturn scalar	supscore_cv		=r(supscore_cv)
		ereturn scalar	supscore_gamma	=r(supscore_gamma)

		ereturn local	cmd				rlasso
		ereturn scalar	cons			=`consmodel'

	}

end

// Display the CCK sup-score test of H0: beta=0.
// Reads e(supscore), e(supscore_p), e(supscore_cv) and e(supscore_gamma)
// left behind in e() by the estimation routine; produces display output only.
prog DisplaySupScore

	di
	di as text "{help rlasso##supscore:Sup-score} test H0: beta=0"
	di as text "CCK sup-score statistic" _col(25) as res %6.2f e(supscore) _c
	// p-value can be missing; in that case just terminate the line
	if e(supscore_p) < . {
		di as text _col(32) "p-value=" _col(39) as res %6.3f e(supscore_p)
	}
	else {
		di
	}
	di as text "CCK " as res 100*e(supscore_gamma) as text "% critical value" _c
	di as res _col(25) %6.2f e(supscore_cv) _col(32) as text "(asympt bound)"

end


// Display the table of lasso and post-estimation OLS coefficients
// from the results currently in e().
// Options:
//   displayall  - show the full coefficient vector e(betaAll)/e(betaAllOLS)
//                 instead of the selected-only e(beta)/e(betaOLS)
//   varwidth(#) - width of the variable-name column (default 17)
//   norecover   - suppress the block of partialled-out coefficients
// Used in rlasso and lasso2.
// version 2017-12-20
// updated 31dec17 to accommodate e(pnotpen)
prog DisplayCoefs

	syntax ,						///
		[							///
		displayall					///  full coef vector in display (default=selected only)
		varwidth(int 17)			///  width of variable-name column
		NORecover					///  do not display partialled-out coefficients
		]

	local cons			=e(cons)
	if ("`norecover'"=="") {
		local partial		`e(partial)'
		local partial_ct	=e(partial_ct)
	}
	else {
		local partial
		local partial_ct	=0
	}

	// varlists
	local selected		`e(selected)'
	fvstrip `selected'							// strip b/n/o operators for display
	local selected		`r(varlist)'
	// NOTE(review): no space between the two macros below - relies on at most one of
	// e(notpen) (lasso2) / e(pnotpen) (rlasso) being set at a time; confirm upstream.
	local notpen		`e(notpen)'`e(pnotpen)'
	fvstrip `notpen'
	local notpen		`r(varlist)'
	local selected0		`e(selected0)'
	fvstrip `selected0'
	local selected0		`r(varlist)'
	// coef vectors
	tempname beta betaOLS
	if "`displayall'"~="" {						//  there must be some vars specified even if nothing selected
		mat `beta'		=e(betaAll)
		mat `betaOLS'	=e(betaAllOLS)
		local col_ct	=colsof(`beta')
		local vlist		: colnames `beta'
		local vlistOLS	: colnames `betaOLS'	// NOTE(review): vlistOLS appears unused below
		local baselevels baselevels
	}
	else if e(k)>0 {							//  display only selected, but only if there are any
		mat `beta'		=e(beta)
		mat `betaOLS'	=e(betaOLS)
		local col_ct	=colsof(`beta')
		local vlist		: colnames `beta'
		local vlistOLS	: colnames `betaOLS'	// NOTE(review): vlistOLS appears unused below
	}
	else {										//  nothing selected, zero columns in beta
		local col_ct	=0
	}
	if e(k)>0 {
		// attach factor-variable info (base/omitted flags) for _ms_display
		_ms_build_info `beta' if e(sample)
		_ms_build_info `betaOLS' if e(sample)
	}

	*** (Re-)display coefficients including constant/partial
	// column-position helpers derived from the requested variable-name width
	local varwidth1		=`varwidth'+1
	local varwidth3		=`varwidth'+3			// NOTE(review): appears unused below
	local varwidth4		=`varwidth'+4
	local varwidthm7	=`varwidth'-7
	local varwidthm13	=`varwidth'-13
	di
	di as text "{hline `varwidth1'}{c TT}{hline 32}"
	// header line depends on estimation method
	if "`e(method)'"=="sqrt-lasso" {
		di as text _col(`varwidthm7') "Selected {c |} Sqrt-lasso   Post-est OLS"
	}
	else if "`e(method)'"=="ridge" {
		di as text _col(`varwidthm7') "Selected {c |}      Ridge   Post-est OLS"
	}
	else if "`e(method)'"=="elastic net" {
		di as text _col(`varwidthm7') "Selected {c |} Elastic net   Post-est OLS"
		di as text _col(`varwidthm7') "         {c |}" _c
		di as text " (alpha=" _c
		di as text %4.3f `e(alpha)' _c
		di as text ")"
	}
	else if "`e(method)'"=="lasso" {
		di as text _col(`varwidthm7') "Selected {c |}      Lasso   Post-est OLS"
	}
	else {
		di as err "internal DisplayCoefs error. unknown method."
		exit 1
	}
	di as text "{hline `varwidth1'}{c +}{hline 32}"
	local anynotpen = 0
	local i 1
	// columns after `lastcol' hold the partialled-out coefficients
	local lastcol = `col_ct' - `partial_ct'
	tokenize `vlist'							//  put elements of coef vector into macros 1, 2, ...
	while `i' <= `lastcol' {
		local vn ``i''
		fvstrip `vn'							//  get rid of o/b/n prefix for display purposes
		local vn `r(varlist)'
		_ms_display, element(`i') matrix(`beta') width(`varwidth') `baselevels'
		// in selected or notpen list?
		local isselnotpen	: list posof "`vn'" in selected0
		local isnotpen		: list posof "`vn'" in notpen
		local anynotpen		= `anynotpen' + `isnotpen'
		// note attached? base, empty, omitted
		qui _ms_display, element(`i') matrix(`beta')
		local note `r(note)'
		qui _ms_display, element(`i') matrix(`betaOLS')
		local noteOLS `r(note)'
		// if notpen, add footnote
		if `isnotpen' & "`note'"=="" {
			di as text "{helpb rlasso##notpen:*}" _c
		}
		if `isselnotpen' {
			// lasso coef
			if "`note'"=="" {
				di _col(`varwidth4') as res %15.7f el(`beta',1,`i') _c
			}
			else {
				di _col(`varwidth4') as text %15s "`note'" _c
			}
			// post-lasso coef - can be omitted if collinear
			if "`noteOLS'"=="" {
				di as res %15.7f el(`betaOLS',1,`i')
			}
			else {
				di as text %15s "`noteOLS'"
			}
		}
		else if "`note'"=="(omitted)" {
			// not selected
			di _col(`varwidth4') as text %15s "(not selected)" _c
			di as text %15s "(not selected)"
		}
		else {
			// other eg base var
			di as text %15s "`note'" _c
			di as text %15s "`noteOLS'"
		}
		local ++i
	}
	if `partial_ct' {
		di as text "{hline `varwidth1'}{c +}{hline 32}"
		// NOTE(review): help link points at lasso2##notpen although this file is rlasso;
		// program is shared between rlasso and lasso2 - confirm the intended anchor.
		di as text _col(`varwidthm13') "Partialled-out{help lasso2##notpen:*}{c |}"
		di as text "{hline `varwidth1'}{c +}{hline 32}"
		local i = `lastcol'+1
		while `i' <= `col_ct' {
			local vn ``i''
			fvstrip `vn'						//  get rid of o/b/n prefix for display purposes
			local vn `r(varlist)'
			_ms_display, element(`i') matrix(`beta') width(`varwidth') `baselevels'
			// note attached? base, empty, omitted
			qui _ms_display, element(`i') matrix(`beta')
			local note `r(note)'
			qui _ms_display, element(`i') matrix(`betaOLS')
			local noteOLS `r(note)'
			// lasso coef
			if "`note'"=="" {
				di _col(`varwidth4') as res %15.7f el(`beta',1,`i') _c
			}
			else {
				di _col(`varwidth4') as text %15s "`note'" _c
			}
			// post-lasso coef - can be omitted if collinear
			if "`noteOLS'"=="" {
				di as res %15.7f el(`betaOLS',1,`i')
			}
			else {
				di as text %15s "`noteOLS'"
			}
			local ++i
		}
	}
	di as text "{hline `varwidth1'}{c BT}{hline 32}"

	if `anynotpen' {
		di "{help rlasso##notpen:*Not penalized}"
	}

end

*************************** Stata utilities ******************************

// internal version of fvstrip 1.01 ms 24march2015
// takes varlist with possible FVs and strips out b/n/o notation
// returns results in r(varlist)
// optionally also omits omittable FVs
// expand calls fvexpand either on full varlist
// or (with onebyone option) on elements of varlist

program define fvstrip, rclass
	version 11.2
	syntax [anything] [if] , [ dropomit expand onebyone NOIsily ]
	if "`expand'"~="" {									//  force call to fvexpand
		if "`onebyone'"=="" {
			fvexpand `anything' `if'					//  single call to fvexpand
			local anything `r(varlist)'
		}
		else {
			foreach vn of local anything {
				fvexpand `vn' `if'						//  call fvexpand on items one-by-one
				local newlist	`newlist' `r(varlist)'
			}
			local anything	: list clean newlist
		}
	}
	foreach vn of local anything {						//  loop through varnames
		if "`dropomit'"~="" {							//  check & include only if
			_ms_parse_parts `vn'						//  not omitted (b. or o.)
			if ~`r(omit)' {
				local unstripped	`unstripped' `vn'	//  add to list only if not omitted
			}
		}
		else {											//  add varname to list even if
			local unstripped	`unstripped' `vn'		//  could be omitted (b. or o.)
		}
	}
	// Now create list with b/n/o stripped out
	foreach vn of local unstripped {
		local svn ""									//  initialize
		_ms_parse_parts `vn'
		if "`r(type)'"=="variable" & "`r(op)'"=="" {	//  simplest case - no change
			local svn	`vn'
		}
		else if "`r(type)'"=="variable" & "`r(op)'"=="o" {	//  next simplest case - o.varname => varname
			local svn	`r(name)'
		}
		else if "`r(type)'"=="variable" {				//  has other operators so strip o but leave .
			local op	`r(op)'
			local op	: subinstr local op "o" "", all
			local svn	`op'.`r(name)'
		}
		else if "`r(type)'"=="factor" {					//  simple factor variable
			local op	`r(op)'
			local op	: subinstr local op "b" "", all
			local op	: subinstr local op "n" "", all
			local op	: subinstr local op "o" "", all
			local svn	`op'.`r(name)'					//  operator + . + varname
		}
		else if "`r(type)'"=="interaction" {			//  multiple variables
			forvalues i=1/`r(k_names)' {
				local op	`r(op`i')'
				local op	: subinstr local op "b" "", all
				local op	: subinstr local op "n" "", all
				local op	: subinstr local op "o" "", all
				local opv	`op'.`r(name`i')'			//  operator + . + varname
				if `i'==1 {
					local svn	`opv'
				}
				else {
					local svn	`svn'#`opv'
				}
			}
		}
		else if "`r(type)'"=="product" {
			di as err "fvstrip error - type=product for `vn'"
			exit 198
		}
		else if "`r(type)'"=="error" {
			di as err "fvstrip error - type=error for `vn'"
			exit 198
		}
		else {
			di as err "fvstrip error - unknown type for `vn'"
			exit 198
		}
		local stripped `stripped' `svn'
	}
	local stripped	: list retokenize stripped			//  clean any extra spaces

	if "`noisily'"~="" {								//  for debugging etc.
		di as result "`stripped'"
	}

	return local varlist `stripped'						//  return results in r(varlist)
end

// Internal version of matchnames
// Sample syntax:
// matchnames "`varlist'" "`list1'" "`list2'"
// takes list in `varlist', looks up in `list1', returns entries in `list2', called r(names)
// names not found in `list1' are passed through unchanged
program define matchnames, rclass
	version 11.2
	args	varnames namelist1 namelist2

	local k1 : word count `namelist1'
	local k2 : word count `namelist2'

	// dictionary lists must be the same length
	if `k1' ~= `k2' {
		di as err "namelist error"
		exit 198
	}
	foreach vn in `varnames' {
		local i : list posof `"`vn'"' in namelist1
		if `i' > 0 {
			local newname : word `i' of `namelist2'
		}
		else {
			* Keep old name if not found in list
			local newname "`vn'"
		}
		local names "`names' `newname'"
	}
	local names	: list clean names
	return local names "`names'"
end

// Display varlist with specified indentation,
// wrapping at 80 columns and skipping base/omitted factor variables
program define Disp
	version 11.2
	syntax [anything] [, _col(integer 15) ]
	local maxlen = 80-`_col'		// room left on the line after the indent
	local len = 0
	local first = 1
	foreach vn in `anything' {
		* Don't display if base or omitted variable
		_ms_parse_parts `vn'
		if ~`r(omit)' {
			local vnlen		: length local vn
			if `len'+`vnlen' > `maxlen' {	// wrap to a new line
				di
				local first = 1
				local len = `vnlen'
			}
			else {
				local len = `len'+`vnlen'+1
			}
			if `first' {
				local first = 0
				di in gr _col(`_col') "`vn'" _c
			}
			else {
				di in gr " `vn'" _c
			}
		}
	}
	* Finish with a newline
	di
end

version 13
mata:

// Create p new temporary double variables (initialized in obs 1..1 block by st_addvar)
// and return their names, space-separated, in r(varlist).
void s_maketemps(real scalar p)
{
	(void) st_addvar("double", names=st_tempname(p), 1)
	st_global("r(varlist)",invtokens(names))
}


// END MATA SECTION
end

--------------------------------------------------------------------------------
/binder/environment.yml:
--------------------------------------------------------------------------------
name: r-environment
channels:
  - conda-forge
dependencies:
  - r-base=4.3
  - r-tidyverse
  - r-fbasics
  - r-corrplot
  - r-psych
  - r-glmnet
  - r-glmnetutils
  - r-grf
  - r-rpart
  - r-rpart.plot
  - r-randomforest
  - r-rlang
  - r-readr
  - r-devtools
  - r-reshape2
  - r-caret
  - r-plotmo
  - r-randomfieldsutils
  - r-rms
  - r-hdm
  - r-aer
  - r-lmtest
  - r-dplyr
  - r-sandwich
  - r-diagrammer
  - r-neuralnet
  - r-ISLR2
  - r-zeallot
  - r-nycflights13
--------------------------------------------------------------------------------