├── 1_Overview.pdf ├── 2_Regularized_Regression.pdf ├── 3_Trees_Forests.pdf ├── 4_Deep_Learning.pdf ├── 5_Unsupervised.pdf ├── 6_confounders_with_ML.pdf ├── 7_causal_forest.pdf ├── 8_optimal_policy.pdf ├── 9_reinforcement_learning.pdf ├── Example Lecture 3 └── deep_learning_example.R ├── Examples lecture 1 ├── Data │ ├── job_corps.csv │ ├── mylemon.csv │ ├── used_cars_test.csv │ └── used_cars_train.csv ├── examples_first_lecture.html └── examples_first_lecture.ipynb ├── Group Data Challenge 2025 ├── data_challenge.pdf ├── juice.csv ├── new_grocery.csv ├── orange_juice.html ├── orange_juice.ipynb └── orange_juice.r ├── Individual Home Assignment 2025 ├── grading_grid.pdf └── research_proposal.pdf ├── Literature ├── Athey_2017.pdf ├── Athey_et_al_2019.pdf ├── Belloni_et_al_2012.pdf ├── Belloni_et_al_2014a.pdf ├── Belloni_et_al_2014b.pdf ├── Cagala_et_al_2021.pdf ├── Chernozhukov_et_al_2017.pdf ├── Chetverikov_et_al_2020.pdf ├── Google flu trends.pdf ├── Mullainathan_Spiess_2017.pdf └── Semenova_Chernozhukov_2020.pdf ├── PC Lab 1 ├── help files │ └── glmnet_package.pdf ├── penalize_regression_tutorial.r ├── penalized_regression_solution.html ├── penalized_regression_solution.ipynb ├── penalized_regression_tutorial.ipynb ├── student-mat-test.Rdata └── student-mat-train.Rdata ├── PC Lab 2 ├── browser-sites.txt ├── browser_2006.csv ├── browser_new.csv ├── help files │ ├── grf.pdf │ └── rpart.pdf ├── trees_foests_solution.html ├── trees_foests_solution.ipynb ├── trees_foests_tutorial.ipynb └── trees_foests_tutorial.r ├── PC Lab 3 ├── help files │ ├── R_ K-Means Clustering.html │ └── R_ Principal Components Analysis.html ├── rollcall-members.Rdata ├── rollcall-votes.Rdata ├── unsupervised_solution.html ├── unsupervised_solution.ipynb ├── unsupervised_tutorial.ipynb └── unsupervised_tutorial.r ├── PC Lab 4 ├── help files │ ├── glmnet_package.pdf │ └── hdm_package.pdf ├── job_corps.csv ├── post_double_selection_solution.html ├── post_double_selection_solution.ipynb ├── 
post_double_selection_tutorial.ipynb └── post_double_selection_tutorial.r ├── PC Lab 5 ├── double_machine_learning_solution.html ├── double_machine_learning_solution.ipynb ├── double_machine_learning_tutorial.ipynb ├── double_machine_learning_tutorial.r ├── help files │ ├── glmnet_package.pdf │ └── grf_package.pdf └── job_corps.csv ├── PC Lab 6 ├── causal_forest.html ├── causal_forest.ipynb ├── causal_forest.r ├── fundraising.csv └── help files │ └── grf_package.pdf ├── PC Lab 7 ├── fundraising.csv ├── help files │ ├── grf_package.pdf │ └── rpart_package.pdf ├── optimal_policy_learning.html ├── optimal_policy_learning.ipynb └── optimal_policy_learning.r ├── README.md ├── Stata Example ├── ajr_example.do ├── ivlasso.ado ├── lassoutils.ado ├── pdslasso.ado └── rlasso.ado └── binder └── environment.yml /1_Overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/1_Overview.pdf -------------------------------------------------------------------------------- /2_Regularized_Regression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/2_Regularized_Regression.pdf -------------------------------------------------------------------------------- /3_Trees_Forests.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/3_Trees_Forests.pdf -------------------------------------------------------------------------------- /4_Deep_Learning.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/4_Deep_Learning.pdf -------------------------------------------------------------------------------- /5_Unsupervised.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/5_Unsupervised.pdf -------------------------------------------------------------------------------- /6_confounders_with_ML.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/6_confounders_with_ML.pdf -------------------------------------------------------------------------------- /7_causal_forest.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/7_causal_forest.pdf -------------------------------------------------------------------------------- /8_optimal_policy.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/8_optimal_policy.pdf -------------------------------------------------------------------------------- /9_reinforcement_learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/9_reinforcement_learning.pdf -------------------------------------------------------------------------------- /Example Lecture 3/deep_learning_example.R: -------------------------------------------------------------------------------- 1 | ### Lab: Deep Learning 2 | 3 | ## In this 
version of the Ch10 lab, we use the `luz` package, which interfaces to the 4 | ## `torch` package which in turn links to efficient 5 | ## `C++` code in the LibTorch library. 6 | 7 | ## This version of the lab was produced by Daniel Falbel and Sigrid 8 | ## Keydana, both data scientists at RStudio where these packages were 9 | ## produced. 10 | 11 | ## An advantage over our original `keras` implementation is that this 12 | ## version does not require a separate `python` installation. 13 | 14 | ########################################## 15 | ## Single Layer Network on Hitters Data ## 16 | ########################################## 17 | 18 | ## Load various packages 19 | library(ISLR2) 20 | library(glmnet) 21 | library(torch) 22 | library(luz) # high-level interface for torch 23 | library(torchvision) # for datasets and image transformation 24 | library(torchdatasets) # for datasets we are going to use 25 | library(zeallot) 26 | library(ggplot2) 27 | library(grf) 28 | 29 | ## Loading the dataset 30 | ## We use the example data with baseball player salaries from lecture 4 31 | Gitters <- na.omit(Hitters) 32 | n <- nrow(Gitters) 33 | print(paste("Number of observations:", n)) 34 | 35 | ## Define test sample 36 | set.seed(13) 37 | ntest <- trunc(n / 3) 38 | testid <- sample(1:n, ntest) 39 | 40 | 41 | ####################### 42 | ## Linear Regression ## 43 | ####################### 44 | lfit <- lm(Salary ~ ., data = Gitters[-testid, ]) 45 | summary(lfit) 46 | lpred <- predict(lfit, Gitters[testid, ]) 47 | print(paste("MAE:", mean(abs(Gitters$Salary[testid] - lpred)))) 48 | 49 | ## Define y and x as matrix 50 | x <- scale(model.matrix(Salary ~ .
- 1, data = Gitters)) 51 | print(paste("Number of controls:", ncol(x))) 52 | y <- Gitters$Salary 53 | 54 | ############ 55 | ## Lasso ## 56 | ########### 57 | cvfit <- cv.glmnet(x[-testid, ], y[-testid], type.measure = "mae") 58 | coef(cvfit) 59 | cpred <- predict(cvfit, x[testid, ], s = "lambda.min") 60 | print(paste("MAE:",mean(abs(y[testid] - cpred)))) 61 | 62 | ################### 63 | ## Random Forest ## 64 | ################### 65 | forest <- regression_forest(x[-testid, ], y[-testid]) 66 | fpred <- predict(forest, x[testid, ]) 67 | print(paste("MAE:",mean(abs(y[testid] - fpred$prediction)))) 68 | 69 | #################### 70 | ## Neural Network ## 71 | #################### 72 | 73 | torch_manual_seed(13) 74 | 75 | # single hidden layer 76 | # 10 hidden units 77 | # ReLU activation function 78 | # dropout layer, in which a random 40% of the 10 activations from the 79 | # previous layer are set to zero during each iteration of the stochastic 80 | # gradient descent algorithm 81 | # One output 82 | # linear output function 83 | modnn <- nn_module( 84 | initialize = function(input_size) { 85 | self$hidden <- nn_linear(input_size, 10) 86 | self$activation <- nn_relu() 87 | self$dropout <- nn_dropout(0.4) 88 | self$output <- nn_linear(10, 1) 89 | }, 90 | forward = function(x) { 91 | x %>% 92 | self$hidden() %>% 93 | self$activation() %>% 94 | self$dropout() %>% 95 | self$output() 96 | } 97 | ) 98 | 99 | # Specify optimisation algorithm 100 | # Here mse loss 101 | modnn <- modnn %>% 102 | setup( 103 | loss = nn_mse_loss(), 104 | optimizer = optim_rmsprop, 105 | metrics = list(luz_metric_mae()) 106 | ) %>% 107 | set_hparams(input_size = ncol(x)) 108 | 109 | # Train the neural network in 1500 iterations 110 | fitted <- modnn %>% 111 | fit( 112 | data = list(x[-testid, ], matrix(y[-testid], ncol = 1)), 113 | valid_data = list(x[testid, ], matrix(y[testid], ncol = 1)), 114 | epochs = 1500 115 | ) 116 | #plot(fitted) 117 | 118 | 119 | npred <- predict(fitted, x[testid, 
]) 120 | mean(abs(y[testid] - npred)) 121 | -------------------------------------------------------------------------------- /Examples lecture 1/Data/used_cars_test.csv: -------------------------------------------------------------------------------- 1 | "","first_price","mileage","age_car_years","diesel","other_car_owner","bmw_320","opel_astra","mercedes_c","vw_golf","vw_passat","pm_green","private_seller","guarantee","inspection","maintenance_cert","co2_em","euro_norm","mile_20","mile_30","mile_40","mile_50","mile_100","mile_150","mileage2","mileage3","mileage4","age_3","age_6","age_car_years2","age_car_years3","age_car_years4","dur_next_ins_0","dur_next_ins_1_2","new_inspection","euro_1","euro_2","euro_3","euro_4","euro_5","euro_6" 2 | "1",25.5,79.85,3.1,1,1,1,0,0,0,0,1,0,0,2,0,124,5,1,1,1,1,0,0,6376.0225,509125.41,40653664,1,0,9.6099997,29.791,92.352097,0,0,1,0,0,0,0,1,0 3 | "3",7.47,142.5,9.6,0,4,1,0,0,0,0,1,0,0,0,0,182,4,1,1,1,1,1,0,20306.25,2893640.5,412343776,1,1,92.160004,884.73602,8493.4658,1,0,0,0,0,0,1,0,0 4 | "10",20.882,76.85,3.3,1,1,0,0,0,0,1,1,0,0,1,1,135,5,1,1,1,1,0,0,5905.9224,453870.16,34879920,1,0,10.89,35.937,118.5921,0,1,0,0,0,0,0,1,0 5 | "12",11.389,143,7.6,1,1,1,0,0,0,0,1,0,0,1,1,131,4,1,1,1,1,1,0,20449,2924207,418161600,1,1,57.759998,438.97601,3336.2175,0,1,0,0,0,0,1,0,0 6 | "15",23.015,47.2,1.4,1,1,0,0,0,0,1,1,0,0,0,0,123,5,1,1,1,0,0,0,2227.8401,105154.05,4963271,0,0,1.96,2.744,3.8415999,1,0,0,0,0,0,0,1,0 7 | "17",25.26,42.495,2.9,1,1,1,0,0,0,0,1,0,0,0,1,123,5,1,1,1,0,0,0,1805.8251,76738.531,3261004,0,0,8.4099998,24.389,70.728104,1,0,0,0,0,0,0,1,0 8 | "18",19.029,29.4,2.3,1,1,0,0,0,0,1,1,0,0,1,1,122,5,1,0,0,0,0,0,864.35999,25412.184,747118.19,0,0,5.29,12.167,27.9841,0,1,0,0,0,0,0,1,0 9 | "24",17.339,54.936,4.4,1,1,0,0,0,1,0,0,0,0,0,1,139,5,1,1,1,1,0,0,3017.9641,165794.88,9108107,1,0,19.360001,85.183998,374.8096,1,0,0,0,0,0,0,1,0 10 | 
"26",8.96,75.19,7.4,1,4,0,0,0,1,0,1,0,0,1,1,137,4,1,1,1,1,0,0,5653.5361,425089.38,31962470,1,1,54.759998,405.224,2998.6577,0,1,0,0,0,0,1,0,0 11 | "27",13.65,71,4.7,1,2,0,0,0,1,0,1,1,0,0,1,125,5,1,1,1,1,0,0,5041,357911,25411680,1,0,22.09,103.823,487.96811,1,0,0,0,0,0,0,1,0 12 | "28",20.42,127.634,3.2,1,1,0,0,0,0,1,1,0,0,1,1,155,5,1,1,1,1,1,0,16290.438,2079213.8,265378368,1,0,10.24,32.768002,104.8576,0,1,0,0,0,0,0,1,0 13 | "29",15.36,22.931,3.2,0,0,0,1,0,0,0,1,0,0,1,1,144,5,1,0,0,0,0,0,525.83075,12057.825,276498,1,0,10.24,32.768002,104.8576,0,1,0,0,0,0,0,1,0 14 | "34",7,186.3,4.3,1,1,0,1,0,0,0,1,0,0,0,1,134,5,1,1,1,1,1,1,34707.691,6466042.5,1204623744,1,0,18.49,79.507004,341.8801,1,0,0,0,0,0,0,1,0 15 | "36",10.61,94,7.4,1,0,0,0,0,0,1,1,1,0,2,0,149,5,1,1,1,1,0,0,8836,830584,78074896,1,1,54.759998,405.224,2998.6577,0,0,1,0,0,0,0,1,0 16 | "40",6.9,184,8,1,0,0,0,0,1,0,1,1,0,2,0,159,4,1,1,1,1,1,1,33856,6229504,1146228736,1,1,64,512,4096,0,0,1,0,0,0,1,0,0 17 | "41",32.78,44.778,4.8,0,2,1,0,0,0,0,1,0,0,1,1,235,5,1,1,1,0,0,0,2005.0693,89782.992,4020302.8,1,0,23.040001,110.592,530.84161,0,1,0,0,0,0,0,1,0 18 | "42",14.53,69.028,3.6,1,1,0,0,0,1,0,1,0,0,1,1,119,5,1,1,1,1,0,0,4764.8647,328909.09,22703936,1,0,12.96,46.655998,167.96159,0,1,0,0,0,0,0,1,0 19 | "43",24.16,119,1.9,1,1,1,0,0,0,0,1,0,0,1,1,112,5,1,1,1,1,1,0,14161,1685159,200533920,0,0,3.6099999,6.8590002,13.0321,0,1,0,0,0,0,0,1,0 20 | "44",5.669,155.1,12.5,1,3,1,0,0,0,0,0,1,0,0,1,158,3,1,1,1,1,1,1,24056.01,3731087.3,578691648,1,1,156.25,1953.125,24414.063,1,0,0,0,0,1,0,0,0 21 | "45",22.46,54.49,4,1,2,0,0,0,0,1,1,0,0,0,0,151,5,1,1,1,1,0,0,2969.1602,161789.53,8815912,1,0,16,64,256,1,0,0,0,0,0,0,1,0 22 | "46",15.98,111.1,3.1,1,1,0,0,0,0,1,1,0,0,1,1,120,5,1,1,1,1,1,0,12343.21,1371330.6,152354832,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0 23 | "51",10.27,154.907,3.8,1,1,0,1,0,0,0,1,0,0,1,1,148,5,1,1,1,1,1,1,23996.178,3717176,575816576,1,0,14.44,54.872002,208.5136,0,1,0,0,0,0,0,1,0 24 | 
"52",22.48,69.97,3.2,1,1,1,0,0,0,0,1,0,0,2,0,123,5,1,1,1,1,0,0,4895.8008,342559.19,23968866,1,0,10.24,32.768002,104.8576,0,0,1,0,0,0,0,1,0 25 | "54",29.27,25.86,1.6,1,1,1,0,0,0,0,1,0,0,0,1,119,6,1,0,0,0,0,0,668.73962,17293.605,447212.66,0,0,2.5599999,4.0960002,6.5535998,1,0,0,0,0,0,0,0,1 26 | "56",13.39,62.715,3.5,1,2,0,1,0,0,0,1,0,0,0,0,119,5,1,1,1,1,0,0,3933.1711,246668.83,15469836,1,0,12.25,42.875,150.0625,1,0,0,0,0,0,0,1,0 27 | "60",11.86,75.9,2.2,1,1,0,1,0,0,0,1,0,0,1,1,113,5,1,1,1,1,0,0,5760.8101,437245.47,33186932,0,0,4.8400002,10.648,23.4256,0,1,0,0,0,0,0,1,0 28 | "63",7.2,155,6.3,1,1,0,0,0,1,0,1,0,0,0,1,137,4,1,1,1,1,1,1,24025,3723875,577200640,1,1,39.689999,250.047,1575.2961,1,0,0,0,0,0,1,0,0 29 | "66",5.63,171,10.4,1,0,0,0,0,0,1,1,0,0,1,1,156,4,1,1,1,1,1,1,29241,5000211,855036096,1,1,108.16,1124.864,11698.586,0,1,0,0,0,0,1,0,0 30 | "68",9.57,144,10.1,0,0,1,0,0,0,0,1,0,0,1,1,182,4,1,1,1,1,1,0,20736,2985984,429981696,1,1,102.01,1030.301,10406.04,0,1,0,0,0,0,1,0,0 31 | "71",13.61,20.85,1.4,0,1,0,1,0,0,0,1,0,0,1,1,134,5,1,0,0,0,0,0,434.7225,9063.9639,188983.66,0,0,1.96,2.744,3.8415999,0,1,0,0,0,0,0,1,0 32 | "72",20.119,160,4,1,2,0,0,1,0,0,0,1,0,0,1,128,5,1,1,1,1,1,1,25600,4096000,655360000,1,0,16,64,256,1,0,0,0,0,0,0,1,0 33 | "76",10.71,160.871,3.5,1,1,0,0,0,1,0,1,0,0,1,1,128,5,1,1,1,1,1,1,25879.479,4163257.5,669747392,1,0,12.25,42.875,150.0625,0,1,0,0,0,0,0,1,0 34 | "77",14.68,125,3.4,1,1,0,0,0,0,1,1,0,0,1,1,135,5,1,1,1,1,1,0,15625,1953125,244140624,1,0,11.56,39.304001,133.63361,0,1,0,0,0,0,0,1,0 35 | "80",22.335,18.704,1.2,1,1,0,0,0,0,1,1,0,0,1,1,125,6,0,0,0,0,0,0,349.83963,6543.4004,122387.76,0,0,1.4400001,1.728,2.0736001,0,1,0,0,0,0,0,0,1 36 | "81",8.29,163.44,6.3,1,1,0,0,0,1,0,1,0,0,0,1,122,4,1,1,1,1,1,1,26712.633,4365913,713564800,1,1,39.689999,250.047,1575.2961,1,0,0,0,0,0,1,0,0 37 | "82",17.48,150,3.3,1,1,1,0,0,0,0,0,0,0,2,1,124,5,1,1,1,1,1,1,22500,3375000,506249984,1,0,10.89,35.937,118.5921,0,0,1,0,0,0,0,1,0 38 | 
"83",15.84,19,1.7,0,0,0,0,0,1,0,1,0,0,0,1,124,5,0,0,0,0,0,0,361,6859,130321,0,0,2.8900001,4.9130001,8.3521004,1,0,0,0,0,0,0,1,0 39 | "86",14.99,91.735,5.1,1,1,0,0,0,0,1,1,0,0,1,1,120,5,1,1,1,1,0,0,8415.3105,771978.5,70817448,1,0,26.01,132.651,676.52008,0,1,0,0,0,0,0,1,0 40 | "88",15.27,109.6,2.4,1,1,0,1,0,0,0,1,0,1,0,1,154,5,1,1,1,1,1,0,12012.16,1316532.8,144291984,0,0,5.7600002,13.824,33.177601,1,0,0,0,0,0,0,1,0 41 | "91",6.09,140,6.7,1,2,0,0,0,1,0,1,0,0,0,1,137,4,1,1,1,1,1,0,19600,2744000,384160000,1,1,44.889999,300.763,2015.1121,1,0,0,0,0,0,1,0,0 42 | "92",29.25,31.988,1.5,1,1,0,0,1,0,0,1,0,0,0,0,128,5,1,1,0,0,0,0,1023.2321,32731.15,1047004,0,0,2.25,3.375,5.0625,1,0,0,0,0,0,0,1,0 43 | "94",11.71,199.95,8,0,2,0,0,1,0,0,1,0,0,0,1,229,4,1,1,1,1,1,1,39980.004,7994001.5,1598400640,1,1,64,512,4096,1,0,0,0,0,0,1,0,0 44 | "97",13.37,112,6.2,1,2,1,0,0,0,0,1,0,0,0,1,142,5,1,1,1,1,1,0,12544,1404928,157351936,1,1,38.439999,238.328,1477.6335,1,0,0,0,0,0,0,1,0 45 | "98",15.359,61.404,5.8,0,1,0,0,0,1,0,1,0,0,1,1,139,5,1,1,1,1,0,0,3770.4512,231520.78,14216302,1,0,33.639999,195.112,1131.6497,0,1,0,0,0,0,0,1,0 46 | "99",16.53,114.5,3.5,1,1,0,0,0,0,1,1,0,0,1,0,125,6,1,1,1,1,1,0,13110.25,1501123.6,171878656,1,0,12.25,42.875,150.0625,0,1,0,0,0,0,0,0,1 47 | "100",6.87,171.914,3.9,1,0,0,1,0,0,0,1,0,0,2,1,117,5,1,1,1,1,1,1,29554.424,5080819,873463936,1,0,15.21,59.319,231.3441,0,0,1,0,0,0,0,1,0 48 | "102",16.91,88.639,3.3,1,0,0,0,0,0,1,1,0,0,0,1,120,5,1,1,1,1,0,0,7856.8726,696425.31,61730444,1,0,10.89,35.937,118.5921,1,0,0,0,0,0,0,1,0 49 | "103",20.35,22.81,3.8,1,0,0,0,0,0,1,1,0,0,2,0,139,5,1,0,0,0,0,0,520.29608,11867.954,270708.03,1,0,14.44,54.872002,208.5136,0,0,1,0,0,0,0,1,0 50 | "104",8.05,120,8.9,1,3,0,0,0,0,1,1,0,0,0,0,177,4,1,1,1,1,1,0,14400,1728000,207360000,1,1,79.209999,704.96899,6274.2241,1,0,0,0,0,0,1,0,0 51 | "107",18.34,25.117,1.4,0,1,0,0,0,1,0,1,0,0,1,1,126,5,1,0,0,0,0,0,630.86371,15845.403,397989,0,0,1.96,2.744,3.8415999,0,1,0,0,0,0,0,1,0 52 | 
"110",22.72,42.8,1.5,1,1,0,0,0,0,1,1,0,0,1,1,120,5,1,1,1,0,0,0,1831.84,78402.75,3355637.8,0,0,2.25,3.375,5.0625,0,1,0,0,0,0,0,1,0 53 | "113",18.25,150.99,4.1,1,1,1,0,0,0,0,1,0,0,1,0,128,5,1,1,1,1,1,1,22797.98,3442267,519747904,1,0,16.809999,68.920998,282.57611,0,1,0,0,0,0,0,1,0 54 | "114",16.85,57,4.1,1,1,0,0,0,0,1,1,0,0,0,1,123,5,1,1,1,1,0,0,3249,185193,10556001,1,0,16.809999,68.920998,282.57611,1,0,0,0,0,0,0,1,0 55 | "120",35.7,42,1.1,1,1,0,0,1,0,0,1,0,0,1,1,108,6,1,1,1,0,0,0,1764,74088,3111696,0,0,1.21,1.331,1.4641,0,1,0,0,0,0,0,0,1 56 | "121",8.26,108,6.2,0,0,0,0,0,1,0,1,0,0,1,0,170,4,1,1,1,1,1,0,11664,1259712,136048896,1,1,38.439999,238.328,1477.6335,0,1,0,0,0,0,1,0,0 57 | "125",13.31,98,4.1,1,2,0,1,0,0,0,0,1,0,0,1,156,5,1,1,1,1,0,0,9604,941192,92236816,1,0,16.809999,68.920998,282.57611,1,0,0,0,0,0,0,1,0 58 | "126",17.045,134.7,3.9,1,1,0,0,0,0,1,1,0,1,0,1,120,5,1,1,1,1,1,0,18144.09,2444009,329208000,1,0,15.21,59.319,231.3441,1,0,0,0,0,0,0,1,0 59 | "131",14.74,37.703,1.4,0,1,0,1,0,0,0,1,0,0,1,0,139,6,1,1,0,0,0,0,1421.5162,53595.426,2020708.4,0,0,1.96,2.744,3.8415999,0,1,0,0,0,0,0,0,1 60 | "132",7.1,84.78,7,0,1,0,0,0,1,0,1,0,0,1,1,176,4,1,1,1,1,0,0,7187.6484,609368.81,51662288,1,1,49,343,2401,0,1,0,0,0,0,1,0,0 61 | "136",6.63,95.55,5.9,0,1,0,0,0,1,0,1,0,0,0,1,165,5,1,1,1,1,0,0,9129.8027,872352.63,83353296,1,0,34.810001,205.379,1211.7361,1,0,0,0,0,0,0,1,0 62 | "137",30.65,30.456,1.1,1,1,1,0,0,0,0,1,0,0,2,0,123,5,1,1,0,0,0,0,927.56793,28250.01,860382.25,0,0,1.21,1.331,1.4641,0,0,1,0,0,0,0,1,0 63 | "138",6.29,189.285,9.5,1,0,1,0,0,0,0,1,0,0,0,0,158,4,1,1,1,1,1,1,35828.813,6781856.5,1283703680,1,1,90.25,857.375,8145.0625,1,0,0,0,0,0,1,0,0 64 | "139",23.109,31.972,1.2,1,1,1,0,0,0,0,1,0,0,1,1,119,5,1,1,0,0,0,0,1022.2088,32682.059,1044910.8,0,0,1.4400001,1.728,2.0736001,0,1,0,0,0,0,0,1,0 65 | "140",11.56,110.8,4.7,1,1,1,0,0,0,0,1,0,0,1,0,109,5,1,1,1,1,1,0,12276.64,1360251.8,150715888,1,0,22.09,103.823,487.96811,0,1,0,0,0,0,0,1,0 66 | 
"141",29.06,29,1.6,1,0,1,0,0,0,0,1,0,0,0,1,129,6,1,0,0,0,0,0,841,24389,707281,0,0,2.5599999,4.0960002,6.5535998,1,0,0,0,0,0,0,0,1 67 | "143",16.26,88.315,3.5,1,1,0,0,0,0,1,1,0,0,1,1,123,6,1,1,1,1,0,0,7799.5391,688816.31,60832812,1,0,12.25,42.875,150.0625,0,1,0,0,0,0,0,0,1 68 | "144",21.12,72.656,2.4,1,1,0,0,1,0,0,1,0,0,1,1,124,5,1,1,1,1,0,0,5278.8945,383543.34,27866726,0,0,5.7600002,13.824,33.177601,0,1,0,0,0,0,0,1,0 69 | "145",22.389,30.681,4.4,1,2,1,0,0,0,0,1,0,0,1,1,142,4,1,1,0,0,0,0,941.32379,28880.754,886090.44,1,0,19.360001,85.183998,374.8096,0,1,0,0,0,0,1,0,0 70 | "147",12.65,150.3,3.1,1,1,0,0,0,1,0,1,0,0,1,1,109,5,1,1,1,1,1,1,22590.09,3395290.5,510312160,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0 71 | "148",17.38,131.672,4.2,1,1,0,0,1,0,0,1,0,0,0,1,134,5,1,1,1,1,1,0,17337.516,2282865.3,300589440,1,0,17.639999,74.087997,311.16959,1,0,0,0,0,0,0,1,0 72 | "150",18.34,93.472,4.9,1,2,0,0,0,0,1,1,0,1,1,1,125,5,1,1,1,1,0,0,8737.0146,816666.25,76335424,1,0,24.01,117.649,576.4801,0,1,0,0,0,0,0,1,0 73 | "152",10.75,95.223,4.3,1,1,0,0,0,1,0,1,0,0,1,1,109,5,1,1,1,1,0,0,9067.4199,863426.94,82218104,1,0,18.49,79.507004,341.8801,0,1,0,0,0,0,0,1,0 74 | "153",19.699,55.641,2,1,1,1,0,0,0,0,1,0,0,1,1,119,5,1,1,1,1,0,0,3095.9209,172260.14,9584726,0,0,4,8,16,0,1,0,0,0,0,0,1,0 75 | "154",21.354,69.998,3.2,1,1,0,0,0,0,1,1,0,0,2,1,120,5,1,1,1,1,0,0,4899.7202,342970.59,24007256,1,0,10.24,32.768002,104.8576,0,0,1,0,0,0,0,1,0 76 | "158",20.01,88.112,3.6,1,2,0,0,0,0,1,1,0,0,1,1,123,5,1,1,1,1,0,0,7763.7246,684077.31,60275420,1,0,12.96,46.655998,167.96159,0,1,0,0,0,0,0,1,0 77 | "159",10.97,174,7,1,0,1,0,0,0,0,1,0,0,1,1,150,5,1,1,1,1,1,1,30276,5268024,916636160,1,1,49,343,2401,0,1,0,0,0,0,0,1,0 78 | "160",26.62,59.998,3.2,1,1,1,0,0,0,0,1,0,0,2,1,125,5,1,1,1,1,0,0,3599.76,215978.41,12958272,1,0,10.24,32.768002,104.8576,0,0,1,0,0,0,0,1,0 79 | "161",7.23,173,8.8,0,2,1,0,0,0,0,1,0,0,1,1,182,4,1,1,1,1,1,1,29929,5177717,895745024,1,1,77.440002,681.47198,5996.9536,0,1,0,0,0,0,1,0,0 80 | 
"165",19.74,72.645,3.3,1,1,0,0,0,0,1,1,0,0,1,1,1,5,1,1,1,1,0,0,5277.2959,383369.16,27849854,1,0,10.89,35.937,118.5921,0,1,0,0,0,0,0,1,0 81 | "166",15.83,66.8,4.5,1,2,0,0,0,0,1,1,0,0,1,1,120,4,1,1,1,1,0,0,4462.2402,298077.63,19911586,1,0,20.25,91.125,410.0625,0,1,0,0,0,0,1,0,0 82 | "169",9.23,128,9.2,1,1,0,0,0,0,1,1,0,0,1,1,158,4,1,1,1,1,1,0,16384,2097152,268435456,1,1,84.639999,778.68799,7163.9297,0,1,0,0,0,0,1,0,0 83 | "170",14.26,128.2,3.4,1,1,0,0,0,0,1,1,0,1,1,1,121,5,1,1,1,1,1,0,16435.24,2106997.8,270117120,1,0,11.56,39.304001,133.63361,0,1,0,0,0,0,0,1,0 84 | "171",15.62,106,4.2,0,1,0,0,0,0,1,1,0,0,1,1,163,5,1,1,1,1,1,0,11236,1191016,126247696,1,0,17.639999,74.087997,311.16959,0,1,0,0,0,0,0,1,0 85 | "172",20.06,21.5,1.5,0,0,0,0,0,1,0,1,0,0,0,0,117,5,1,0,0,0,0,0,462.25,9938.375,213675.06,0,0,2.25,3.375,5.0625,1,0,0,0,0,0,0,1,0 86 | "173",4.79,133.1,7.5,1,2,0,1,0,0,0,1,0,0,0,1,146,4,1,1,1,1,1,0,17715.609,2357947.8,313842848,1,1,56.25,421.875,3164.0625,1,0,0,0,0,0,1,0,0 87 | "175",21.83,89.976,4.6,1,1,0,0,1,0,0,1,0,0,1,1,180,4,1,1,1,1,0,0,8095.6807,728416.94,65540044,1,0,21.16,97.335999,447.74561,0,1,0,0,0,0,1,0,0 88 | "178",14.12,124.471,4.4,1,1,0,0,0,0,1,1,0,0,1,1,120,5,1,1,1,1,1,0,15493.03,1928432.9,240033968,1,0,19.360001,85.183998,374.8096,0,1,0,0,0,0,0,1,0 89 | "180",5.92,132,7.7,1,0,0,0,0,1,0,0,1,0,0,1,137,4,1,1,1,1,1,0,17424,2299968,303595776,1,1,59.290001,456.53299,3515.3042,1,0,0,0,0,0,1,0,0 90 | "181",23.08,61.208,3.2,1,1,1,0,0,0,0,1,0,0,1,1,124,5,1,1,1,1,0,0,3746.4192,229310.83,14035657,1,0,10.24,32.768002,104.8576,0,1,0,0,0,0,0,1,0 91 | "182",10.48,173.51,2.8,1,1,0,1,0,0,0,1,0,0,1,1,129,5,1,1,1,1,1,1,30105.721,5223643.5,906354368,0,0,7.8400002,21.952,61.465599,0,1,0,0,0,0,0,1,0 92 | "183",13.955,163.9,5.5,1,2,0,0,0,0,1,0,1,0,0,1,159,5,1,1,1,1,1,1,26863.211,4402880,721632064,1,0,30.25,166.375,915.0625,1,0,0,0,0,0,0,1,0 93 | 
"184",11.94,177,4.3,1,2,0,0,0,0,1,1,0,0,0,1,121,5,1,1,1,1,1,1,31329,5545233,981506240,1,0,18.49,79.507004,341.8801,1,0,0,0,0,0,0,1,0 94 | "185",8.09,125,4.2,1,0,0,0,0,1,0,0,0,1,0,1,119,5,1,1,1,1,1,0,15625,1953125,244140624,1,0,17.639999,74.087997,311.16959,1,0,0,0,0,0,0,1,0 95 | "186",10.61,182.301,3.9,1,1,1,0,0,0,0,1,0,0,1,0,120,5,1,1,1,1,1,1,33233.656,6058528.5,1104475776,1,0,15.21,59.319,231.3441,0,1,0,0,0,0,0,1,0 96 | "187",18.7,164.618,2.9,1,1,0,0,0,0,1,1,0,1,1,1,125,5,1,1,1,1,1,1,27099.086,4460997.5,734360448,0,0,8.4099998,24.389,70.728104,0,1,0,0,0,0,0,1,0 97 | "188",10.91,91.54,4.8,0,2,0,1,0,0,0,1,0,0,0,1,144,5,1,1,1,1,0,0,8379.5713,767066,70217224,1,0,23.040001,110.592,530.84161,1,0,0,0,0,0,0,1,0 98 | "192",29.081,63.169,2.9,1,3,1,0,0,0,0,1,0,0,0,0,123,5,1,1,1,1,0,0,3990.3225,252064.69,15922674,0,0,8.4099998,24.389,70.728104,1,0,0,0,0,0,0,1,0 99 | "195",15.199,39,1.2,1,1,0,1,0,0,0,1,0,0,2,1,104,5,1,1,0,0,0,0,1521,59319,2313441,0,0,1.4400001,1.728,2.0736001,0,0,1,0,0,0,0,1,0 100 | "196",12.77,98,4.1,1,1,0,1,0,0,0,1,0,0,0,1,134,5,1,1,1,1,0,0,9604,941192,92236816,1,0,16.809999,68.920998,282.57611,1,0,0,0,0,0,0,1,0 101 | "199",16,14.104,1.2,0,1,0,0,0,1,0,1,0,0,1,1,119,5,0,0,0,0,0,0,198.92282,2805.6074,39570.285,0,0,1.4400001,1.728,2.0736001,0,1,0,0,0,0,0,1,0 102 | "200",17.08,147,4.2,1,1,1,0,0,0,0,1,0,0,1,1,140,5,1,1,1,1,1,0,21609,3176523,466948896,1,0,17.639999,74.087997,311.16959,0,1,0,0,0,0,0,1,0 103 | -------------------------------------------------------------------------------- /Examples lecture 1/Data/used_cars_train.csv: -------------------------------------------------------------------------------- 1 | 
"","first_price","mileage","age_car_years","diesel","other_car_owner","bmw_320","opel_astra","mercedes_c","vw_golf","vw_passat","pm_green","private_seller","guarantee","inspection","maintenance_cert","co2_em","euro_norm","mile_20","mile_30","mile_40","mile_50","mile_100","mile_150","mileage2","mileage3","mileage4","age_3","age_6","age_car_years2","age_car_years3","age_car_years4","dur_next_ins_0","dur_next_ins_1_2","new_inspection","euro_1","euro_2","euro_3","euro_4","euro_5","euro_6" 2 | "2",21.91,77.1,3.7,1,1,0,0,0,0,1,1,0,0,0,1,136,5,1,1,1,1,0,0,5944.4102,458314,35336012,1,0,13.69,50.653,187.41611,1,0,0,0,0,0,0,1,0 3 | "4",14.58,45.45,5,0,2,0,0,0,0,1,1,0,0,1,1,145,5,1,1,1,0,0,0,2065.7024,93886.18,4267127,1,0,25,125,625,0,1,0,0,0,0,0,1,0 4 | "5",17.98,183.5,3.6,1,1,1,0,0,0,0,1,0,0,1,1,124,5,1,1,1,1,1,1,33672.25,6178858,1133820416,1,0,12.96,46.655998,167.96159,0,1,0,0,0,0,0,1,0 5 | "6",19.03,74.85,3.1,1,1,0,0,0,0,1,1,0,0,1,1,125,5,1,1,1,1,0,0,5602.5225,419348.81,31388258,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0 6 | "7",10.969,174,6.8,1,1,0,0,1,0,0,1,0,0,1,1,154,5,1,1,1,1,1,1,30276,5268024,916636160,1,1,46.240002,314.43201,2138.1377,0,1,0,0,0,0,0,1,0 7 | "8",24.11,51.001,2.3,1,2,1,0,0,0,0,1,0,0,0,0,123,5,1,1,1,1,0,0,2601.1021,132658.8,6765731.5,0,0,5.29,12.167,27.9841,1,0,0,0,0,0,0,1,0 8 | "9",13.26,62,2.6,1,2,0,0,0,1,0,1,0,0,0,0,119,5,1,1,1,1,0,0,3844,238328,14776336,0,0,6.7600002,17.576,45.697601,1,0,0,0,0,0,0,1,0 9 | "11",23.2,16.901,1.4,1,1,1,0,0,0,0,1,0,0,0,1,119,5,0,0,0,0,0,0,285.6438,4827.666,81592.383,0,0,1.96,2.744,3.8415999,1,0,0,0,0,0,0,1,0 10 | "13",13.65,119.636,4.2,1,0,0,0,0,0,1,0,0,0,2,0,123,5,1,1,1,1,1,0,14312.772,1712322.9,204855456,1,0,17.639999,74.087997,311.16959,0,0,1,0,0,0,0,1,0 11 | "14",11.74,83,7.4,1,2,1,0,0,0,0,0,1,0,0,1,120,5,1,1,1,1,0,0,6889,571787,47458320,1,1,54.759998,405.224,2998.6577,1,0,0,0,0,0,0,1,0 12 | 
"16",12.07,46.36,7.1,0,1,1,0,0,0,0,1,0,0,1,1,147,4,1,1,1,0,0,0,2149.2495,99639.211,4619274,1,1,50.41,357.91101,2541.1682,0,1,0,0,0,0,1,0,0 13 | "19",16.79,18.4,1.1,1,1,0,0,0,1,0,1,0,1,1,1,92,6,0,0,0,0,0,0,338.56,6229.5039,114622.88,0,0,1.21,1.331,1.4641,0,1,0,0,0,0,0,0,1 14 | "20",8.18,110.375,9.4,1,2,0,0,0,0,1,1,0,0,0,1,156,4,1,1,1,1,1,0,12182.641,1344659,148416736,1,1,88.360001,830.58398,7807.4897,1,0,0,0,0,0,1,0,0 15 | "21",5.43,151,13,0,0,1,0,0,0,0,1,0,0,1,0,199,4,1,1,1,1,1,1,22801,3442951,519885600,1,1,169,2197,28561,0,1,0,0,0,0,1,0,0 16 | "22",16.719,94.435,3.3,1,1,0,0,0,0,1,1,0,0,2,1,123,5,1,1,1,1,0,0,8917.9688,842168.44,79530176,1,0,10.89,35.937,118.5921,0,0,1,0,0,0,0,1,0 17 | "23",22.42,84.89,4.3,1,1,0,0,1,0,0,1,0,0,0,1,124,5,1,1,1,1,0,0,7206.312,611743.81,51930936,1,0,18.49,79.507004,341.8801,1,0,0,0,0,0,0,1,0 18 | "25",8.82,83,4,1,0,0,1,0,0,0,1,0,0,2,1,119,5,1,1,1,1,0,0,6889,571787,47458320,1,0,16,64,256,0,0,1,0,0,0,0,1,0 19 | "30",11.24,95.2,2.9,1,1,0,1,0,0,0,1,0,1,1,1,120,5,1,1,1,1,0,0,9063.04,862801.44,82138696,0,0,8.4099998,24.389,70.728104,0,1,0,0,0,0,0,1,0 20 | "31",14.54,85.606,4.8,1,0,0,0,1,0,0,0,0,0,2,0,130,5,1,1,1,1,0,0,7328.3872,627353.94,53705260,1,0,23.040001,110.592,530.84161,0,0,1,0,0,0,0,1,0 21 | "32",29.78,69.89,3.4,1,0,1,0,0,0,0,1,0,0,0,1,123,5,1,1,1,1,0,0,4884.6123,341385.53,23859436,1,0,11.56,39.304001,133.63361,1,0,0,0,0,0,0,1,0 22 | "33",22.43,22.208,1,0,1,0,0,0,0,1,1,0,0,1,1,119,6,1,0,0,0,0,0,493.19525,10952.881,243241.56,0,0,1,1,1,0,1,0,0,0,0,0,0,1 23 | "35",9.27,97,3.5,1,1,0,1,0,0,0,1,0,1,0,1,120,5,1,1,1,1,0,0,9409,912673,88529280,1,0,12.25,42.875,150.0625,1,0,0,0,0,0,0,1,0 24 | "37",10.33,93.1,5.4,0,1,0,0,0,0,1,1,0,0,0,1,158,5,1,1,1,1,0,0,8667.6104,806954.5,75127464,1,0,29.16,157.464,850.3056,1,0,0,0,0,0,0,1,0 25 | "38",17.71,92.568,3.1,1,1,0,0,0,0,1,1,0,1,1,1,120,5,1,1,1,1,0,0,8568.835,793199.88,73424928,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0 26 | 
"39",10.9,129.781,5,1,0,0,0,0,0,1,0,0,0,2,0,120,5,1,1,1,1,1,0,16843.107,2185915.5,283690272,1,0,25,125,625,0,0,1,0,0,0,0,1,0 27 | "47",6.71,98.82,12,0,2,1,0,0,0,0,1,0,0,1,1,185,4,1,1,1,1,0,0,9765.3926,965016.06,95362888,1,1,144,1728,20736,0,1,0,0,0,0,1,0,0 28 | "48",21.8,76.5,4.6,1,2,0,0,1,0,0,1,0,1,1,1,136,5,1,1,1,1,0,0,5852.25,447697.13,34248832,1,0,21.16,97.335999,447.74561,0,1,0,0,0,0,0,1,0 29 | "49",20.24,17,1.3,1,1,0,0,0,1,0,1,0,0,2,0,119,5,0,0,0,0,0,0,289,4913,83521,0,0,1.6900001,2.197,2.8561001,0,0,1,0,0,0,0,1,0 30 | "50",13.26,126.248,3.1,1,1,0,0,0,1,0,1,0,0,1,1,109,5,1,1,1,1,1,0,15938.558,2012211,254037616,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0 31 | "53",6.49,159,7.3,1,2,0,0,0,1,0,1,0,0,0,1,122,4,1,1,1,1,1,1,25281,4019679,639128960,1,1,53.290001,389.017,2839.8242,1,0,0,0,0,0,1,0,0 32 | "55",24.01,96,3.1,1,0,1,0,0,0,0,1,0,0,2,0,119,5,1,1,1,1,0,0,9216,884736,84934656,1,0,9.6099997,29.791,92.352097,0,0,1,0,0,0,0,1,0 33 | "57",33.89,25.575,1.4,1,1,0,0,0,0,1,1,0,0,0,1,156,5,1,0,0,0,0,0,654.08063,16728.111,427821.47,0,0,1.96,2.744,3.8415999,1,0,0,0,0,0,0,1,0 34 | "58",18.85,81.804,2.6,1,1,0,0,0,0,1,1,0,0,1,1,113,5,1,1,1,1,0,0,6691.8945,547423.75,44781452,0,0,6.7600002,17.576,45.697601,0,1,0,0,0,0,0,1,0 35 | "59",6.55,65.24,10.1,0,1,1,0,0,0,0,1,0,0,0,1,182,4,1,1,1,1,0,0,4256.2578,277678.25,18115728,1,1,102.01,1030.301,10406.04,1,0,0,0,0,0,1,0,0 36 | "61",9.72,200,7.2,1,1,1,0,0,0,0,1,0,0,1,1,150,5,1,1,1,1,1,1,40000,8e+06,1.6e+09,1,1,51.84,373.24799,2687.3855,0,1,0,0,0,0,0,1,0 37 | "62",16.44,166,3.3,1,1,0,0,1,0,0,1,0,0,0,1,124,5,1,1,1,1,1,1,27556,4574296,759333120,1,0,10.89,35.937,118.5921,1,0,0,0,0,0,0,1,0 38 | "64",15.2,70,4.4,1,1,1,0,0,0,0,1,0,0,2,1,142,5,1,1,1,1,0,0,4900,343000,24010000,1,0,19.360001,85.183998,374.8096,0,0,1,0,0,0,0,1,0 39 | "65",14.04,51.5,4.9,0,2,0,0,1,0,0,1,0,0,1,1,164,5,1,1,1,1,0,0,2652.25,136590.88,7034430,1,0,24.01,117.649,576.4801,0,1,0,0,0,0,0,1,0 40 | 
"67",33.9,16.994,1.6,1,2,0,0,1,0,0,1,0,0,1,1,171,6,0,0,0,0,0,0,288.79605,4907.7998,83403.148,0,0,2.5599999,4.0960002,6.5535998,0,1,0,0,0,0,0,0,1 41 | "69",25.35,23.33,1,1,1,0,0,0,0,1,1,0,0,1,0,119,6,1,0,0,0,0,0,544.28888,12698.26,296250.41,0,0,1,1,1,0,1,0,0,0,0,0,0,1 42 | "70",9.049,199.98,7.6,1,1,0,0,0,0,1,1,0,0,0,1,189,4,1,1,1,1,1,1,39992,7997600,1599360128,1,1,57.759998,438.97601,3336.2175,1,0,0,0,0,0,1,0,0 43 | "73",12.68,143.4,7.7,1,0,0,0,1,0,0,1,0,0,0,1,161,4,1,1,1,1,1,0,20563.561,2948814.5,422860000,1,1,59.290001,456.53299,3515.3042,1,0,0,0,0,0,1,0,0 44 | "74",22.38,111.326,3.4,1,1,0,0,1,0,0,1,0,1,2,0,136,5,1,1,1,1,1,0,12393.479,1379716.4,153598304,1,0,11.56,39.304001,133.63361,0,0,1,0,0,0,0,1,0 45 | "75",5.5,129.651,7.7,0,2,0,1,0,0,0,1,0,0,1,1,163,4,1,1,1,1,1,0,16809.381,2179353.3,282555328,1,1,59.290001,456.53299,3515.3042,0,1,0,0,0,0,1,0,0 46 | "78",9.43,139.9,3.8,1,1,0,0,0,1,0,1,0,1,0,1,109,5,1,1,1,1,1,0,19572.01,2738124.3,383063584,1,0,14.44,54.872002,208.5136,1,0,0,0,0,0,0,1,0 47 | "79",14.82,189.2,3.8,1,1,0,0,1,0,0,1,0,0,0,1,133,5,1,1,1,1,1,1,35796.641,6772724.5,1281399424,1,0,14.44,54.872002,208.5136,1,0,0,0,0,0,0,1,0 48 | "84",19.59,24.976,1,0,1,0,0,0,1,0,1,0,0,1,0,126,6,1,0,0,0,0,0,623.8006,15580.043,389127.16,0,0,1,1,1,0,1,0,0,0,0,0,0,1 49 | "85",8.02,149,10,0,0,0,0,0,0,1,1,0,0,2,1,214,4,1,1,1,1,1,0,22201,3307949,492884416,1,1,100,1000,10000,0,0,1,0,0,0,1,0,0 50 | "87",13.17,65.77,5.1,1,3,0,0,0,1,0,1,0,0,1,1,109,5,1,1,1,1,0,0,4325.6929,284500.81,18711620,1,0,26.01,132.651,676.52008,0,1,0,0,0,0,0,1,0 51 | "89",16.85,13.55,1.7,0,1,0,1,0,0,0,0,0,0,2,0,137,5,0,0,0,0,0,0,183.60249,2487.814,33709.879,0,0,2.8900001,4.9130001,8.3521004,0,0,1,0,0,0,0,1,0 52 | "90",13.47,106.5,5.4,1,2,0,0,0,0,1,1,0,0,1,0,170,5,1,1,1,1,1,0,11342.25,1207949.6,128646632,1,0,29.16,157.464,850.3056,0,1,0,0,0,0,0,1,0 53 | "93",9.92,180,4.1,1,1,0,0,0,0,1,1,0,0,0,1,116,5,1,1,1,1,1,1,32400,5832000,1049760000,1,0,16.809999,68.920998,282.57611,1,0,0,0,0,0,0,1,0 54 | 
"95",13.12,100.898,8,1,1,0,0,0,0,1,1,0,0,1,1,177,4,1,1,1,1,1,0,10180.406,1027182.6,103640672,1,1,64,512,4096,0,1,0,0,0,0,1,0,0 55 | "96",21.91,21.336,1,1,1,0,0,0,1,0,1,0,1,1,1,117,6,1,0,0,0,0,0,455.22488,9712.6787,207229.7,0,0,1,1,1,0,1,0,0,0,0,0,0,1 56 | "101",18.62,108.697,3.1,1,1,0,0,0,0,1,1,0,0,1,1,120,5,1,1,1,1,1,0,11815.038,1284259.1,139595120,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0 57 | "105",17.15,89.414,3.1,1,2,0,0,0,0,1,1,0,1,1,1,135,5,1,1,1,1,0,0,7994.8633,714852.69,63917840,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0 58 | "106",24.36,96.35,3.3,1,1,0,0,1,0,0,1,0,0,1,1,128,5,1,1,1,1,0,0,9283.3223,894448.13,86180080,1,0,10.89,35.937,118.5921,0,1,0,0,0,0,0,1,0 59 | "108",12.059,123.862,8.7,0,0,1,0,0,0,0,1,0,0,0,1,182,4,1,1,1,1,1,0,15341.795,1900265.4,235370672,1,1,75.690002,658.50299,5728.9761,1,0,0,0,0,0,1,0,0 60 | "109",13.27,118.955,3.2,1,2,0,0,0,1,0,1,0,0,0,1,148,5,1,1,1,1,1,0,14150.292,1683248,200230768,1,0,10.24,32.768002,104.8576,1,0,0,0,0,0,0,1,0 61 | "111",15.319,125,2.4,1,1,0,0,0,0,1,1,0,0,0,1,135,5,1,1,1,1,1,0,15625,1953125,244140624,0,0,5.7600002,13.824,33.177601,1,0,0,0,0,0,0,1,0 62 | "112",17.34,112.601,3.2,1,1,0,0,1,0,0,1,0,0,1,1,127,5,1,1,1,1,1,0,12678.985,1427666.4,160756672,1,0,10.24,32.768002,104.8576,0,1,0,0,0,0,0,1,0 63 | "115",21.06,18.55,1,1,1,0,0,0,0,1,1,0,0,2,1,120,5,0,0,0,0,0,0,344.10251,6383.1016,118406.53,0,0,1,1,1,0,0,1,0,0,0,0,1,0 64 | "116",9.3,99.6,7.6,1,2,0,0,0,1,0,1,1,0,1,1,137,4,1,1,1,1,0,0,9920.1602,988047.94,98409576,1,1,57.759998,438.97601,3336.2175,0,1,0,0,0,0,1,0,0 65 | "117",9.62,56.979,7.4,1,1,0,0,0,1,0,1,0,0,1,1,137,4,1,1,1,1,0,0,3246.6064,184988.39,10540453,1,1,54.759998,405.224,2998.6577,0,1,0,0,0,0,1,0,0 66 | "118",10.889,189.3,3,1,0,0,0,0,0,1,1,0,0,2,1,121,5,1,1,1,1,1,1,35834.488,6783469,1284110720,1,0,9,27,81,0,0,1,0,0,0,0,1,0 67 | "119",18.21,78.95,3.2,1,0,0,0,0,0,1,1,0,0,2,0,122,5,1,1,1,1,0,0,6233.1025,492103.44,38851568,1,0,10.24,32.768002,104.8576,0,0,1,0,0,0,0,1,0 68 | 
"122",9.87,181,4.8,1,2,0,0,0,1,0,1,1,0,0,0,139,5,1,1,1,1,1,1,32761,5929741,1073283136,1,0,23.040001,110.592,530.84161,1,0,0,0,0,0,0,1,0 69 | "123",11.915,111.35,4.1,1,1,0,0,0,1,0,1,0,0,1,1,128,5,1,1,1,1,1,0,12398.822,1380608.9,153730800,1,0,16.809999,68.920998,282.57611,0,1,0,0,0,0,0,1,0 70 | "124",6.268,183,4.6,1,2,0,1,0,0,0,1,0,0,0,1,119,5,1,1,1,1,1,1,33489,6128487,1121513088,1,0,21.16,97.335999,447.74561,1,0,0,0,0,0,0,1,0 71 | "127",23.79,24.422,1.2,1,1,0,0,0,0,1,1,0,0,1,1,135,5,1,0,0,0,0,0,596.43408,14566.113,355733.63,0,0,1.4400001,1.728,2.0736001,0,1,0,0,0,0,0,1,0 72 | "128",12.02,89.498,3.2,1,1,0,0,0,1,0,1,0,0,1,1,125,5,1,1,1,1,0,0,8009.8921,716869.31,64158368,1,0,10.24,32.768002,104.8576,0,1,0,0,0,0,0,1,0 73 | "129",9.74,185,9,0,4,1,0,0,0,0,1,0,0,0,1,196,4,1,1,1,1,1,1,34225,6331625,1171350656,1,1,81,729,6561,1,0,0,0,0,0,1,0,0 74 | "130",5.109,105.098,7.4,1,0,0,1,0,0,0,1,0,0,1,0,149,4,1,1,1,1,1,0,11045.59,1160869.4,122005048,1,1,54.759998,405.224,2998.6577,0,1,0,0,0,0,1,0,0 75 | "133",16.38,87.317,5.1,0,1,1,0,0,0,0,1,0,0,2,1,159,5,1,1,1,1,0,0,7624.2583,665727.38,58129316,1,0,26.01,132.651,676.52008,0,0,1,0,0,0,0,1,0 76 | "134",13.26,173.136,7.8,0,1,0,0,0,0,1,1,0,1,0,1,204,4,1,1,1,1,1,1,29976.074,5189937.5,898565056,1,1,60.84,474.552,3701.5056,1,0,0,0,0,0,1,0,0 77 | "135",26.83,50,3,1,1,1,0,0,0,0,1,0,1,1,1,124,5,1,1,1,1,0,0,2500,125000,6250000,1,0,9,27,81,0,1,0,0,0,0,0,1,0 78 | "142",6.91,149.8,8.9,1,2,0,1,0,0,0,1,0,1,0,0,159,4,1,1,1,1,1,0,22440.039,3361518,503555392,1,1,79.209999,704.96899,6274.2241,1,0,0,0,0,0,1,0,0 79 | "146",15.4,87.385,4.3,1,2,0,0,0,0,1,1,0,0,1,1,116,5,1,1,1,1,0,0,7636.1382,667283.94,58310608,1,0,18.49,79.507004,341.8801,0,1,0,0,0,0,0,1,0 80 | "149",21.07,26.086,1,0,1,0,0,0,1,0,1,0,0,1,1,121,6,1,0,0,0,0,0,680.47937,17750.986,463052.22,0,0,1,1,1,0,1,0,0,0,0,0,0,1 81 | "151",15.93,106.02,4.3,1,1,0,0,1,0,0,1,0,0,2,1,133,5,1,1,1,1,1,0,11240.24,1191690.3,126343008,1,0,18.49,79.507004,341.8801,0,0,1,0,0,0,0,1,0 82 | 
"155",12.75,96,8.1,0,2,1,0,0,0,0,1,0,0,1,1,194,4,1,1,1,1,0,0,9216,884736,84934656,1,1,65.610001,531.44098,4304.6719,0,1,0,0,0,0,1,0,0 83 | "156",13.42,102.648,4.6,1,2,0,0,0,1,0,1,0,1,1,1,121,5,1,1,1,1,1,0,10536.612,1081562.1,111020192,1,0,21.16,97.335999,447.74561,0,1,0,0,0,0,0,1,0 84 | "157",1.2,158.2,17.6,0,0,0,0,0,0,1,1,0,0,2,0,216,3,1,1,1,1,1,1,25027.24,3959309.3,626362752,1,1,309.76001,5451.7759,95951.258,0,0,1,0,0,1,0,0,0 85 | "162",19.49,102.943,3.7,1,1,0,0,0,0,1,0,0,0,2,0,125,5,1,1,1,1,1,0,10597.262,1090913.9,112301944,1,0,13.69,50.653,187.41611,0,0,1,0,0,0,0,1,0 86 | "163",13.249,94.41,3.1,1,1,0,0,0,1,0,1,0,0,1,1,128,5,1,1,1,1,0,0,8913.248,841499.75,79445992,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0 87 | "164",34.149,20.217,1.2,1,1,0,0,0,0,1,1,0,1,1,1,119,6,1,0,0,0,0,0,408.72708,8263.2354,167057.83,0,0,1.4400001,1.728,2.0736001,0,1,0,0,0,0,0,0,1 88 | "167",25.78,33.235,2.7,1,1,1,0,0,0,0,1,0,0,0,1,124,5,1,1,0,0,0,0,1104.5652,36710.227,1220064.4,0,0,7.29,19.683001,53.1441,1,0,0,0,0,0,0,1,0 89 | "168",3.94,159,8.7,1,2,0,0,0,0,1,0,1,0,2,0,177,4,1,1,1,1,1,1,25281,4019679,639128960,1,1,75.690002,658.50299,5728.9761,0,0,1,0,0,0,1,0,0 90 | "174",17.46,119.95,4.2,1,1,0,0,0,0,1,1,0,0,1,1,139,5,1,1,1,1,1,0,14388.003,1725840.9,207014608,1,0,17.639999,74.087997,311.16959,0,1,0,0,0,0,0,1,0 91 | "176",29.499,18.238,1.1,1,1,0,0,1,0,0,1,0,1,1,0,108,6,0,0,0,0,0,0,332.62463,6066.4082,110639.16,0,0,1.21,1.331,1.4641,0,1,0,0,0,0,0,0,1 92 | "177",13.91,55.6,3.7,1,1,0,0,0,0,1,1,0,0,0,1,116,5,1,1,1,1,0,0,3091.3601,171879.61,9556507,1,0,13.69,50.653,187.41611,1,0,0,0,0,0,0,1,0 93 | "179",19.67,77,3.7,1,1,0,0,0,0,1,0,1,0,0,1,136,5,1,1,1,1,0,0,5929,456533,35153040,1,0,13.69,50.653,187.41611,1,0,0,0,0,0,0,1,0 94 | "189",8.5,41.326,5.4,1,1,0,1,0,0,0,1,0,0,0,1,149,5,1,1,1,0,0,0,1707.8383,70578.125,2916711.5,1,0,29.16,157.464,850.3056,1,0,0,0,0,0,0,1,0 95 | 
"190",21.73,53.71,7.7,0,0,0,0,1,0,0,1,0,1,0,1,235,4,1,1,1,1,0,0,2884.7642,154940.69,8321864,1,1,59.290001,456.53299,3515.3042,1,0,0,0,0,0,1,0,0 96 | "191",17.52,49.308,2.8,1,1,1,0,0,0,0,1,0,0,0,1,119,5,1,1,1,0,0,0,2431.2788,119881.5,5911117,0,0,7.8400002,21.952,61.465599,1,0,0,0,0,0,0,1,0 97 | "193",16.85,150,3.4,1,1,1,0,0,0,0,0,0,0,2,1,124,5,1,1,1,1,1,1,22500,3375000,506249984,1,0,11.56,39.304001,133.63361,0,0,1,0,0,0,0,1,0 98 | "194",9.72,123,4.2,1,1,0,0,0,1,0,1,0,0,0,1,109,5,1,1,1,1,1,0,15129,1860867,228886640,1,0,17.639999,74.087997,311.16959,1,0,0,0,0,0,0,1,0 99 | "197",27.68,20.489,1,1,1,1,0,0,0,0,1,0,1,1,0,127,5,1,0,0,0,0,0,419.79913,8601.2646,176231.3,0,0,1,1,1,0,1,0,0,0,0,0,1,0 100 | "198",9.49,175.1,7.8,1,2,1,0,0,0,0,1,1,0,0,1,146,4,1,1,1,1,1,1,30660.01,5368568,940036224,1,1,60.84,474.552,3701.5056,1,0,0,0,0,0,1,0,0 101 | -------------------------------------------------------------------------------- /Group Data Challenge 2025/data_challenge.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Group Data Challenge 2025/data_challenge.pdf -------------------------------------------------------------------------------- /Group Data Challenge 2025/orange_juice.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Wholesale Manager" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "**Author:**\n", 15 | "[Anthony Strittmatter](http://www.anthonystrittmatter.com)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "You manage a wholesale store. The data file juice.csv contains orange juice sales (sales) and prices (price) of different grocery stores that you deliver. 
Your product range contains three different orange juice brands: Tropicana, Minute Maid, and Dominicks. Some stores advertise/feature specific orange juice brands, which is indicated by the dummy variable feat. The data contains also the store ID (id). You deliver new grocery stores. The new stores sent you the file new grocery.csv, which\n", 23 | "contains the planned prices and advertisements for the different brands. Your job as wholesale manager is to predict the sales of the new grocery stores and deliver the right amount of orange juice." 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Load Packages and Data" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "[1] \"Packages and data successfully loaded.\"\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "######################## Load Packages and Data ########################\n", 48 | "\n", 49 | "# Load packages\n", 50 | "library(rpart)\n", 51 | "library(rpart.plot)\n", 52 | "library(grf)\n", 53 | "library(glmnet)\n", 54 | "\n", 55 | "# Load data\n", 56 | "juice <- read.csv(\"juice.csv\", sep = \",\")\n", 57 | "new_grocery <- read.csv(\"new_grocery.csv\", sep = \",\")\n", 58 | "\n", 59 | "print('Packages and data successfully loaded.')\n", 60 | "\n", 61 | "#############################################################################" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "## Inspect Data" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 2, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/html": [ 79 | "\n", 80 | "\n", 81 | "\n", 82 | "\t\n", 83 | "\t\n", 84 | "\t\n", 85 | "\t\n", 86 | "\t\n", 87 | "\t\n", 88 | "\n", 89 | "
Xidsalespricebrandfeat
1 1140 11970 2.47 minute.maid0
3 7182 30205 1.57 dominicks 1
4 1741 3521 2.55 minute.maid0
5 1725 11777 1.41 dominicks 0
6 7565 129151 2.05 minute.maid1
8 5617 7104 3.74 tropicana 0
\n" 90 | ], 91 | "text/latex": [ 92 | "\\begin{tabular}{r|llllll}\n", 93 | " X & id & sales & price & brand & feat\\\\\n", 94 | "\\hline\n", 95 | "\t 1 & 1140 & 11970 & 2.47 & minute.maid & 0 \\\\\n", 96 | "\t 3 & 7182 & 30205 & 1.57 & dominicks & 1 \\\\\n", 97 | "\t 4 & 1741 & 3521 & 2.55 & minute.maid & 0 \\\\\n", 98 | "\t 5 & 1725 & 11777 & 1.41 & dominicks & 0 \\\\\n", 99 | "\t 6 & 7565 & 129151 & 2.05 & minute.maid & 1 \\\\\n", 100 | "\t 8 & 5617 & 7104 & 3.74 & tropicana & 0 \\\\\n", 101 | "\\end{tabular}\n" 102 | ], 103 | "text/markdown": [ 104 | "\n", 105 | "| X | id | sales | price | brand | feat |\n", 106 | "|---|---|---|---|---|---|\n", 107 | "| 1 | 1140 | 11970 | 2.47 | minute.maid | 0 |\n", 108 | "| 3 | 7182 | 30205 | 1.57 | dominicks | 1 |\n", 109 | "| 4 | 1741 | 3521 | 2.55 | minute.maid | 0 |\n", 110 | "| 5 | 1725 | 11777 | 1.41 | dominicks | 0 |\n", 111 | "| 6 | 7565 | 129151 | 2.05 | minute.maid | 1 |\n", 112 | "| 8 | 5617 | 7104 | 3.74 | tropicana | 0 |\n", 113 | "\n" 114 | ], 115 | "text/plain": [ 116 | " X id sales price brand feat\n", 117 | "1 1 1140 11970 2.47 minute.maid 0 \n", 118 | "2 3 7182 30205 1.57 dominicks 1 \n", 119 | "3 4 1741 3521 2.55 minute.maid 0 \n", 120 | "4 5 1725 11777 1.41 dominicks 0 \n", 121 | "5 6 7565 129151 2.05 minute.maid 1 \n", 122 | "6 8 5617 7104 3.74 tropicana 0 " 123 | ] 124 | }, 125 | "metadata": {}, 126 | "output_type": "display_data" 127 | }, 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "[1] \"Old data: 9685 observations\"\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "######################## Describe Old Data ########################\n", 138 | "\n", 139 | "# Print first few rows of old data\n", 140 | "head(juice)\n", 141 | "\n", 142 | "# Number of observations\n", 143 | "print(paste0('Old data: ',nrow(juice),' observations'))\n", 144 | "\n", 145 | "######################################################################" 146 | ] 147 | }, 148 | { 149 | 
"cell_type": "code", 150 | "execution_count": 3, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/html": [ 156 | "\n", 157 | "\n", 158 | "\n", 159 | "\t\n", 160 | "\t\n", 161 | "\t\n", 162 | "\t\n", 163 | "\t\n", 164 | "\t\n", 165 | "\n", 166 | "
Xidpricebrandfeat
2 10171 1.81 dominicks 1
7 7489 NA tropicana 0
10 7559 3.29 tropicana 0
11 1236 1.77 minute.maid1
16 5361 1.53 dominicks 0
17 108 1.42 dominicks 0
\n" 167 | ], 168 | "text/latex": [ 169 | "\\begin{tabular}{r|lllll}\n", 170 | " X & id & price & brand & feat\\\\\n", 171 | "\\hline\n", 172 | "\t 2 & 10171 & 1.81 & dominicks & 1 \\\\\n", 173 | "\t 7 & 7489 & NA & tropicana & 0 \\\\\n", 174 | "\t 10 & 7559 & 3.29 & tropicana & 0 \\\\\n", 175 | "\t 11 & 1236 & 1.77 & minute.maid & 1 \\\\\n", 176 | "\t 16 & 5361 & 1.53 & dominicks & 0 \\\\\n", 177 | "\t 17 & 108 & 1.42 & dominicks & 0 \\\\\n", 178 | "\\end{tabular}\n" 179 | ], 180 | "text/markdown": [ 181 | "\n", 182 | "| X | id | price | brand | feat |\n", 183 | "|---|---|---|---|---|\n", 184 | "| 2 | 10171 | 1.81 | dominicks | 1 |\n", 185 | "| 7 | 7489 | NA | tropicana | 0 |\n", 186 | "| 10 | 7559 | 3.29 | tropicana | 0 |\n", 187 | "| 11 | 1236 | 1.77 | minute.maid | 1 |\n", 188 | "| 16 | 5361 | 1.53 | dominicks | 0 |\n", 189 | "| 17 | 108 | 1.42 | dominicks | 0 |\n", 190 | "\n" 191 | ], 192 | "text/plain": [ 193 | " X id price brand feat\n", 194 | "1 2 10171 1.81 dominicks 1 \n", 195 | "2 7 7489 NA tropicana 0 \n", 196 | "3 10 7559 3.29 tropicana 0 \n", 197 | "4 11 1236 1.77 minute.maid 1 \n", 198 | "5 16 5361 1.53 dominicks 0 \n", 199 | "6 17 108 1.42 dominicks 0 " 200 | ] 201 | }, 202 | "metadata": {}, 203 | "output_type": "display_data" 204 | }, 205 | { 206 | "name": "stdout", 207 | "output_type": "stream", 208 | "text": [ 209 | "[1] \"New data: 3262 observations\"\n" 210 | ] 211 | } 212 | ], 213 | "source": [ 214 | "######################## Describe Old Data ########################\n", 215 | "\n", 216 | "# Print first few rows of new data\n", 217 | "head(new_grocery)\n", 218 | "\n", 219 | "# Number of observations\n", 220 | "print(paste0('New data: ',nrow(new_grocery),' observations'))\n", 221 | "\n", 222 | "######################################################################" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "## Prepare Data" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | 
"execution_count": 4, 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "data": { 239 | "text/plain": [ 240 | " sales price missing minute.maid \n", 241 | " Min. : 63 Min. :0.000 Min. :0.00000 Min. :0.0000 \n", 242 | " 1st Qu.: 4800 1st Qu.:1.710 1st Qu.:0.00000 1st Qu.:0.0000 \n", 243 | " Median : 8256 Median :2.120 Median :0.00000 Median :0.0000 \n", 244 | " Mean : 17023 Mean :2.174 Mean :0.04801 Mean :0.3284 \n", 245 | " 3rd Qu.: 16896 3rd Qu.:2.720 3rd Qu.:0.00000 3rd Qu.:1.0000 \n", 246 | " Max. :716415 Max. :4.170 Max. :1.00000 Max. :1.0000 \n", 247 | " dominicks tropicana featured \n", 248 | " Min. :0.0000 Min. :0.000 Min. :0.0000 \n", 249 | " 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 \n", 250 | " Median :0.0000 Median :0.000 Median :0.0000 \n", 251 | " Mean :0.3405 Mean :0.331 Mean :0.2355 \n", 252 | " 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:0.0000 \n", 253 | " Max. :1.0000 Max. :1.000 Max. :1.0000 " 254 | ] 255 | }, 256 | "metadata": {}, 257 | "output_type": "display_data" 258 | }, 259 | { 260 | "name": "stdout", 261 | "output_type": "stream", 262 | "text": [ 263 | "[1] \"Data is prepared.\"\n" 264 | ] 265 | } 266 | ], 267 | "source": [ 268 | "######################## Data Preparation ########################\n", 269 | "\n", 270 | "# Generate dummy for missing prices\n", 271 | "missing <- (is.na(juice$price) == TRUE)\n", 272 | "new_missing <- (is.na(new_grocery$price) == TRUE)\n", 273 | "\n", 274 | "# Replace missing prices with zero\n", 275 | "juice$price[is.na(juice$price)] <-0\n", 276 | "new_grocery$price[is.na(new_grocery$price)] <-0\n", 277 | "\n", 278 | "# Generate Dummies for Brands\n", 279 | "brand_1 <- (juice$brand == \"minute.maid\")\n", 280 | "brand_2 <- (juice$brand == \"dominicks\")\n", 281 | "brand_3 <- (juice$brand == \"tropicana\")\n", 282 | "\n", 283 | "new_brand_1 <- (new_grocery$brand == \"minute.maid\")\n", 284 | "new_brand_2 <- (new_grocery$brand == \"dominicks\")\n", 285 | "new_brand_3 <- (new_grocery$brand == \"tropicana\")\n", 
286 | "\n", 287 | "# Generate outcome and control variables\n", 288 | "y <- as.matrix(juice$sales)\n", 289 | "colnames(y) <- c(\"sales\")\n", 290 | "\n", 291 | "x <- as.matrix(cbind(juice$price, missing, brand_1, brand_2, brand_3, juice$feat))\n", 292 | "colnames(x) <- c(\"price\", \"missing\", \"minute.maid\", \"dominicks\", \"tropicana\", \"featured\")\n", 293 | "\n", 294 | "new_x <- as.matrix(cbind(new_grocery$price, new_missing, new_brand_1, new_brand_2, new_brand_3, new_grocery$feat))\n", 295 | "colnames(new_x) <- c(\"price\", \"missing\", \"minute.maid\", \"dominicks\", \"tropicana\", \"featured\")\n", 296 | "\n", 297 | "# Descriptive statistics\n", 298 | "summary(cbind(y,x))\n", 299 | "\n", 300 | "print('Data is prepared.')\n", 301 | "\n", 302 | "#############################################################################" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "**$\\Rightarrow$ It is possible to add non-linear and interaction terms.**" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "## Generate Training and Test Sample" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 5, 322 | "metadata": {}, 323 | "outputs": [ 324 | { 325 | "name": "stdout", 326 | "output_type": "stream", 327 | "text": [ 328 | "[1] \"Training and test samples created.\"\n" 329 | ] 330 | } 331 | ], 332 | "source": [ 333 | "######################## Training and Test Samples ########################\n", 334 | "\n", 335 | "set.seed(???)\n", 336 | "\n", 337 | "# Generate variable with the rows in training data\n", 338 | "\n", 339 | "\n", 340 | "print('Training and test samples created.')\n", 341 | "\n", 342 | "#############################################################################" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "## Predict Orange Juice Prices in Training Sample and Assess 
Model in Test Sample" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 6, 355 | "metadata": {}, 356 | "outputs": [ 357 | { 358 | "name": "stdout", 359 | "output_type": "stream", 360 | "text": [ 361 | "[1] \"R-squared Penalized Regression: 0.278\"\n" 362 | ] 363 | } 364 | ], 365 | "source": [ 366 | "######################## LASSO, Ridge, Elastic Net ##############################\n", 367 | "\n", 368 | "set.seed(???)\n", 369 | "penalized.cv <- ???\n", 370 | "\n", 371 | "\n", 372 | "# Fitted values\n", 373 | "pred_penalized <- ???\n", 374 | "\n", 375 | "# Calculate the MSE\n", 376 | "MSE_penalized <- mean((y[-training_set] - pred_penalized[-training_set])^2)\n", 377 | "R2_penalized <- round(1- MSE_penalized/var(y[-training_set]), digits = 3)\n", 378 | "\n", 379 | "print(paste0(\"R-squared Penalized Regression: \", R2_penalized))\n", 380 | " \n", 381 | "################################################################" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 11, 387 | "metadata": {}, 388 | "outputs": [ 389 | { 390 | "name": "stdout", 391 | "output_type": "stream", 392 | "text": [ 393 | "[1] \"R-squared Tree: 0.365\"\n" 394 | ] 395 | } 396 | ], 397 | "source": [ 398 | "###################### Regression Tree #######################\n", 399 | "\n", 400 | "set.seed(???)\n", 401 | "# Prepare data for tree estimator\n", 402 | "outcome <- y[training_set]\n", 403 | "tree_data <- data.frame(outcome, x[training_set,])\n", 404 | "\n", 405 | "deep_tree <- ???\n", 406 | "\n", 407 | "# Optimal tree size\n", 408 | "op.index <- ???\n", 409 | "\n", 410 | "## Select the Tree that Minimises CV-MSE\n", 411 | "cp.vals <- ???\n", 412 | "\n", 413 | "# Prune the deep tree\n", 414 | "pruned_tree <- ???\n", 415 | "\n", 416 | "## Plot tree structure\n", 417 | "#rpart.plot(pruned_tree,digits=3)\n", 418 | "\n", 419 | "# Fitted values\n", 420 | "predtree <- ???\n", 421 | "\n", 422 | "# Calculate the MSE\n", 423 | "MSEtree <- 
mean((y[-training_set] - predtree[-training_set])^2)\n", 424 | "R2tree <- round(1- MSEtree/var(y[-training_set]), digits = 3)\n", 425 | "\n", 426 | "print(paste0(\"R-squared Tree: \", R2tree))\n", 427 | "\n", 428 | "################################################################" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 8, 434 | "metadata": {}, 435 | "outputs": [ 436 | { 437 | "name": "stdout", 438 | "output_type": "stream", 439 | "text": [ 440 | "[1] \"R-squared Forest: 0.411\"\n" 441 | ] 442 | } 443 | ], 444 | "source": [ 445 | "######################## Random Forest #######################\n", 446 | "\n", 447 | "set.seed(???)\n", 448 | "\n", 449 | "rep <- ??? # number of trees\n", 450 | "cov <- ??? # share of covariates\n", 451 | "frac <- ??? # fraction of subsample\n", 452 | "min_obs <- ??? # max. size of terminal leaves in trees\n", 453 | "\n", 454 | "# Build Forest\n", 455 | "forest <- ???\n", 456 | "\n", 457 | "# Fitted values\n", 458 | "predforest <- ???\n", 459 | "\n", 460 | "# Calculate MSE\n", 461 | "MSEforest <- mean((y[-training_set] - predforest[-training_set])^2)\n", 462 | "R2forest <- round(1- MSEforest/var(y[-training_set]), digits = 3)\n", 463 | "\n", 464 | "print(paste0(\"R-squared Forest: \", R2forest))\n", 465 | "\n", 466 | "################################################################" 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "metadata": {}, 472 | "source": [ 473 | "## Select Favorite Model and Extrapolate to New Data" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 9, 479 | "metadata": {}, 480 | "outputs": [ 481 | { 482 | "name": "stdout", 483 | "output_type": "stream", 484 | "text": [ 485 | "[1] \"Out-of-sample sales are predicted.\"\n" 486 | ] 487 | } 488 | ], 489 | "source": [ 490 | "######################## Out-of-Sample Prediction #######################\n", 491 | "\n", 492 | "# Fitted values\n", 493 | "new_prediction <- ???\n", 494 | "\n", 495 | 
"print('Out-of-sample sales are predicted.')\n", 496 | "\n", 497 | "###########################################################################" 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": {}, 503 | "source": [ 504 | "## Store Out-of-Sample Predictions" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 10, 510 | "metadata": {}, 511 | "outputs": [ 512 | { 513 | "name": "stdout", 514 | "output_type": "stream", 515 | "text": [ 516 | "[1] \"File is stored.\"\n", 517 | "[1] \"Send your results to anthony.strittmatter@unibas.ch\"\n" 518 | ] 519 | } 520 | ], 521 | "source": [ 522 | "######################## Store Results #######################\n", 523 | "\n", 524 | "id_new <- as.matrix(new_grocery$id)\n", 525 | "\n", 526 | "# Replace ??? with your group name\n", 527 | "write.csv(cbind(id_new,new_prediction),\"???.csv\")\n", 528 | "\n", 529 | "print('File is stored.')\n", 530 | "print('Send your results to anthony.strittmatter@unibas.ch')\n", 531 | "\n", 532 | "################################################################" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [] 541 | } 542 | ], 543 | "metadata": { 544 | "kernelspec": { 545 | "display_name": "R", 546 | "language": "R", 547 | "name": "ir" 548 | }, 549 | "language_info": { 550 | "codemirror_mode": "r", 551 | "file_extension": ".r", 552 | "mimetype": "text/x-r-source", 553 | "name": "R", 554 | "pygments_lexer": "r", 555 | "version": "3.6.1" 556 | } 557 | }, 558 | "nbformat": 4, 559 | "nbformat_minor": 4 560 | } 561 | -------------------------------------------------------------------------------- /Group Data Challenge 2025/orange_juice.r: -------------------------------------------------------------------------------- 1 | ######################## Load Packages and Data ######################## 2 | 3 | # Load packages 4 | library(rpart) 5 | library(rpart.plot) 6 | 
library(grf) 7 | library(glmnet) 8 | 9 | # Load data 10 | juice <- read.csv("juice.csv", sep = ",") 11 | new_grocery <- read.csv("new_grocery.csv", sep = ",") 12 | 13 | print('Packages and data successfully loaded.') 14 | 15 | ############################################################################# 16 | 17 | ######################## Describe Old Data ######################## 18 | 19 | # Print first few rows of old data 20 | head(juice) 21 | 22 | # Number of observations 23 | print(paste0('Old data: ',nrow(juice),' observations')) 24 | 25 | ###################################################################### 26 | 27 | ######################## Describe Old Data ######################## 28 | 29 | # Print first few rows of new data 30 | head(new_grocery) 31 | 32 | # Number of observations 33 | print(paste0('New data: ',nrow(new_grocery),' observations')) 34 | 35 | ###################################################################### 36 | 37 | ######################## Data Preparation ######################## 38 | 39 | # Generate dummy for missing prices 40 | missing <- (is.na(juice$price) == TRUE) 41 | new_missing <- (is.na(new_grocery$price) == TRUE) 42 | 43 | # Replace missing prices with zero 44 | juice$price[is.na(juice$price)] <-0 45 | new_grocery$price[is.na(new_grocery$price)] <-0 46 | 47 | # Generate Dummies for Brands 48 | brand_1 <- (juice$brand == "minute.maid") 49 | brand_2 <- (juice$brand == "dominicks") 50 | brand_3 <- (juice$brand == "tropicana") 51 | 52 | new_brand_1 <- (new_grocery$brand == "minute.maid") 53 | new_brand_2 <- (new_grocery$brand == "dominicks") 54 | new_brand_3 <- (new_grocery$brand == "tropicana") 55 | 56 | # Generate outcome and control variables 57 | y <- as.matrix(juice$sales) 58 | colnames(y) <- c("sales") 59 | 60 | x <- as.matrix(cbind(juice$price, missing, brand_1, brand_2, brand_3, juice$feat)) 61 | colnames(x) <- c("price", "missing", "minute.maid", "dominicks", "tropicana", "featured") 62 | 63 | new_x <- 
as.matrix(cbind(new_grocery$price, new_missing, new_brand_1, new_brand_2, new_brand_3, new_grocery$feat)) 64 | colnames(new_x) <- c("price", "missing", "minute.maid", "dominicks", "tropicana", "featured") 65 | 66 | # Descriptive statistics 67 | summary(cbind(y,x)) 68 | 69 | print('Data is prepared.') 70 | 71 | ############################################################################# 72 | 73 | ######################## Training and Test Samples ######################## 74 | 75 | set.seed(???) 76 | 77 | # Generate variable with the rows in training data 78 | 79 | 80 | print('Training and test samples created.') 81 | 82 | ############################################################################# 83 | 84 | ######################## LASSO, Ridge, Elastic Net ############################## 85 | 86 | set.seed(???) 87 | penalized.cv <- ??? 88 | 89 | 90 | # Fitted values 91 | pred_penalized <- ??? 92 | 93 | # Calculate the MSE 94 | MSE_penalized <- mean((y[-training_set] - pred_penalized[-training_set])^2) 95 | R2_penalized <- round(1- MSE_penalized/var(y[-training_set]), digits = 3) 96 | 97 | print(paste0("R-squared Penalized Regression: ", R2_penalized)) 98 | 99 | ################################################################ 100 | 101 | ###################### Regression Tree ####################### 102 | 103 | set.seed(???) 104 | # Prepare data for tree estimator 105 | outcome <- y[training_set] 106 | tree_data <- data.frame(outcome, x[training_set,]) 107 | 108 | deep_tree <- ??? 109 | 110 | # Optimal tree size 111 | op.index <- ??? 112 | 113 | ## Select the Tree that Minimises CV-MSE 114 | cp.vals <- ??? 115 | 116 | # Prune the deep tree 117 | pruned_tree <- ??? 118 | 119 | ## Plot tree structure 120 | #rpart.plot(pruned_tree,digits=3) 121 | 122 | # Fitted values 123 | predtree <- ??? 
124 | 125 | # Calculate the MSE 126 | MSEtree <- mean((y[-training_set] - predtree[-training_set])^2) 127 | R2tree <- round(1- MSEtree/var(y[-training_set]), digits = 3) 128 | 129 | print(paste0("R-squared Tree: ", R2tree)) 130 | 131 | ################################################################ 132 | 133 | ######################## Random Forest ####################### 134 | 135 | set.seed(???) 136 | 137 | rep <- ??? # number of trees 138 | cov <- ??? # share of covariates 139 | frac <- ??? # fraction of subsample 140 | min_obs <- ??? # max. size of terminal leaves in trees 141 | 142 | # Build Forest 143 | forest <- ??? 144 | 145 | # Fitted values 146 | predforest <- ??? 147 | 148 | # Calculate MSE 149 | MSEforest <- mean((y[-training_set] - predforest[-training_set])^2) 150 | R2forest <- round(1- MSEforest/var(y[-training_set]), digits = 3) 151 | 152 | print(paste0("R-squared Forest: ", R2forest)) 153 | 154 | ################################################################ 155 | 156 | ######################## Out-of-Sample Prediction ####################### 157 | 158 | # Fitted values 159 | new_prediction <- ??? 160 | 161 | print('Out-of-sample sales are predicted.') 162 | 163 | ########################################################################### 164 | 165 | ######################## Store Results ####################### 166 | 167 | id_new <- as.matrix(new_grocery$id) 168 | 169 | # Replace ??? 
with your group name 170 | write.csv(cbind(id_new,new_prediction),"???.csv") 171 | 172 | print('File is stored.') 173 | print('Send your results to anthony.strittmatter@unibas.ch') 174 | 175 | ################################################################ 176 | 177 | 178 | -------------------------------------------------------------------------------- /Individual Home Assignment 2025/grading_grid.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Individual Home Assignment 2025/grading_grid.pdf -------------------------------------------------------------------------------- /Individual Home Assignment 2025/research_proposal.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Individual Home Assignment 2025/research_proposal.pdf -------------------------------------------------------------------------------- /Literature/Athey_2017.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Athey_2017.pdf -------------------------------------------------------------------------------- /Literature/Athey_et_al_2019.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Athey_et_al_2019.pdf -------------------------------------------------------------------------------- /Literature/Belloni_et_al_2012.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Belloni_et_al_2012.pdf -------------------------------------------------------------------------------- /Literature/Belloni_et_al_2014a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Belloni_et_al_2014a.pdf -------------------------------------------------------------------------------- /Literature/Belloni_et_al_2014b.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Belloni_et_al_2014b.pdf -------------------------------------------------------------------------------- /Literature/Cagala_et_al_2021.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Cagala_et_al_2021.pdf -------------------------------------------------------------------------------- /Literature/Chernozhukov_et_al_2017.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Chernozhukov_et_al_2017.pdf -------------------------------------------------------------------------------- /Literature/Chetverikov_et_al_2020.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Chetverikov_et_al_2020.pdf -------------------------------------------------------------------------------- /Literature/Google flu 
trends.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Google flu trends.pdf -------------------------------------------------------------------------------- /Literature/Mullainathan_Spiess_2017.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Mullainathan_Spiess_2017.pdf -------------------------------------------------------------------------------- /Literature/Semenova_Chernozhukov_2020.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Semenova_Chernozhukov_2020.pdf -------------------------------------------------------------------------------- /PC Lab 1/help files/glmnet_package.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 1/help files/glmnet_package.pdf -------------------------------------------------------------------------------- /PC Lab 1/penalize_regression_tutorial.r: -------------------------------------------------------------------------------- 1 | ######################## Load Packages and Data ######################## 2 | 3 | # Load packages 4 | library(glmnet) 5 | library(corrplot) 6 | 7 | # Load data 8 | load("student-mat-train.Rdata") 9 | load("student-mat-test.Rdata") 10 | 11 | # Number of observations 12 | print(paste0('Training set: ',nrow(train),' obs')) 13 | print(paste0('Test set: ',nrow(test),' obs')) 14 | 15 | ########################################################################### 16 | 17 | 
######################## Correlation analysis ######################## 18 | cor <- round(cor(train[,c(1:25)]),2) # Variable 26 is the dependent variable 19 | corrplot(cor) 20 | 21 | ######################## Estimation of the linear regression ######################## 22 | 23 | ols <- lm(G3 ~ ., data = train) 24 | summary(ols) 25 | 26 | # Calculate the MSE 27 | test$predols <- predict(ols, newdata = test) 28 | 29 | predMSEols <- mean((test$G3 - test$predols)^2) 30 | print(predMSEols) 31 | 32 | ######################################################################################## 33 | 34 | ######################## OLS model ######################## 35 | 36 | ols_small <- lm(??? , data = train) 37 | 38 | # Calculate the MSE 39 | test$predols_small <- predict(ols_small, newdata = test) 40 | 41 | predMSEols_small <- mean((test$G3 - test$predols_small)^2) 42 | print(predMSEols_small) 43 | 44 | ######################## Lasso Path ######################## 45 | 46 | # We make a plot that shows how the Lasso coefficients change with lambda 47 | # glmnet is the standard R package for Lasso, Ridge, and Elastic Net 48 | # alpha is a parameter that allows us to specify a Lasso, Ridge, or Elastic Net model 49 | # alpha = 1 for Lasso; alpha = 0 for Ridge, 0 < alpha < 1 for Elastic Net 50 | # The control variables are train[,c(1:25)] 51 | # The outcome variable is train$G3 (math grades) 52 | 53 | # Estimate a Lasso model 54 | lasso <- glmnet(as.matrix(train[,c(1:25)]), train$G3, alpha = 1) # We save the model under the name "lasso" 55 | plot(lasso, xvar = "lambda", label = TRUE) 56 | 57 | ############################################################### 58 | 59 | ######################## Cross-Validation ######################## 60 | 61 | # Set starting value for replicability 62 | set.seed(27112019) 63 | 64 | # cv.glmnet performs a cross-validation to determine the optimal lambda value 65 | # type.measure specifies the measure we use to assess the model accuracy (here MSE) 66 |
nfolds specifies the number of cross-validation folds we use (here 5) 67 | 68 | # Cross-validate the Lasso 69 | lasso.cv <- cv.glmnet(as.matrix(train[,c(1:25)]), train$G3, type.measure = "mse", nfolds = 5, alpha = 1) 70 | 71 | # Plot the MSE for the different lambda values 72 | plot(lasso.cv) 73 | 74 | ##################################################################### 75 | 76 | ######################## Optimal Lambda Value ######################## 77 | 78 | # Print the optimal lambda value 79 | print(paste0("Optimal lambda that minimizes cross-validated MSE: ", lasso.cv$lambda.min)) 80 | print(paste0("Optimal lambda using one-standard-error-rule: ", lasso.cv$lambda.1se)) 81 | 82 | ######################################################################### 83 | 84 | ######################## Lasso Coefficients ######################## 85 | 86 | # Print Lasso coefficients 87 | print(coef(lasso.cv, s = "lambda.min")) 88 | 89 | # Save for later comparison 90 | coef_lasso1 <- coef(lasso.cv, s = "lambda.min") 91 | 92 | ####################################################################### 93 | 94 | ######################## Test Sample MSE ######################## 95 | 96 | # Estimate the fitted values of the Lasso model in the test sample 97 | # We use the model "lasso.cv" and the lambda value which we estimated in the training sample 98 | # The control variables "newx" are from the test sample 99 | 100 | # Fitted values 101 | test$predlasso <- predict(lasso.cv, newx = as.matrix(test[,c(1:25)]), s = lasso.cv$lambda.min) 102 | 103 | # Calculate the MSE 104 | predMSElasso <- mean((test$G3 - test$predlasso)^2) 105 | print(paste0("MSE: ", predMSElasso)) 106 | 107 | ##################################################################### 108 | 109 | ######################## Different Starting Value ######################## 110 | 111 | # Change the starting value 112 | set.seed(27112025) # 27112024 113 | 114 | # Re-estimate the Lasso model 115 | lasso.cv <- cv.glmnet(???) 
116 | 117 | # Store the coefficients 118 | coef_lasso2 <- coef(lasso.cv, s = ???) 119 | print(cbind(coef_lasso1, coef_lasso2)) 120 | 121 | # Calculate the fitted values 122 | test$predlasso2 <- predict(lasso.cv, newx = as.matrix(test[,c(1:25)]), s = lasso.cv$lambda.min) 123 | 124 | # Correlation between the fitted values of the two Lasso models 125 | cor_fit <- cor(test$predlasso,test$predlasso2) 126 | print(paste0("Correlation between fitted values: ", cor_fit)) 127 | 128 | ######################## Ridge Path ######################## 129 | 130 | # alpha = 0 specifies a Ridge model 131 | 132 | # Estimate the Ridge 133 | ridge <- glmnet(as.matrix(train[,c(1:25)]), train$G3, alpha = ???) 134 | 135 | # Plot the path of the Ridge coefficients 136 | plot(ridge, xvar = "lambda", label = TRUE) 137 | 138 | ############################################################### 139 | 140 | ######################## Cross-Validation ######################## 141 | 142 | # Set starting value 143 | set.seed(27112019) 144 | 145 | # Cross-validate the Ridge model 146 | ridge.cv <- cv.glmnet(???) 
147 | 148 | # Plot the MSE in the cross-validation samples 149 | plot(ridge.cv) 150 | 151 | ##################################################################### 152 | 153 | ######################## Optimal Lambda Value ######################## 154 | 155 | # Print the optimal lambda value 156 | print(paste0("Optimal lambda that minimizes cross-validated MSE: ", ???)) 157 | print(paste0("Optimal lambda using one-standard-error-rule: ", ???)) 158 | 159 | ######################################################################### 160 | 161 | ######################## Ridge Coefficients ######################## 162 | 163 | # Print Ridge coefficients 164 | print(coef(ridge.cv, s = "lambda.min")) 165 | 166 | # Save for later comparison 167 | coef_ridge <- coef(ridge.cv, s = "lambda.min") 168 | 169 | ####################################################################### 170 | 171 | ######################## Test Sample MSE ######################## 172 | 173 | # Estimate fitted values in test sample 174 | test$predridge <- predict(ridge, newx = ???, s = ???) 175 | 176 | # Calculate the MSE 177 | predMSEridge <- ??? 
178 | print(paste0("MSE: ", predMSEridge)) 179 | 180 | ################################################################### 181 | 182 | ######################## Compare Lasso and Ridge Coefficients ######################## 183 | 184 | # Pick the coefficients of Dalc and Walc 185 | comp <- cbind(coef(ols)[23:24], coef_lasso1[23:24], coef_lasso2[23:24], coef_ridge[23:24]) 186 | colnames(comp) <- c("OLS", "Lasso1", "Lasso2", "Ridge") 187 | print(comp) 188 | 189 | ######################################################################################### 190 | 191 | ######################## Compare the MSE ######################## 192 | 193 | # Print the MSE of the OLS, Lasso and Ridge models 194 | print(c(predMSEols, predMSElasso, predMSEridge)) 195 | 196 | #################################################################### 197 | 198 | ######################## Compare models ######################## 199 | 200 | # Visualize the predictions (Predicted vs Actual) 201 | plot(test$G3,test$predols,xlim=c(5,20),ylim=c(4,16), col= "darkgreen", xlab = "Actual Grades", ylab = "Predicted Grades" ) 202 | par(new=TRUE) 203 | plot(test$G3,test$predlasso,xlim=c(5,20),ylim=c(4,16), col= "blue", xlab = "", ylab = "" ) 204 | par(new=TRUE) 205 | plot(test$G3,test$predridge,xlim=c(5,20),ylim=c(4,16), col= "red", xlab = "", ylab = "" ) 206 | abline(a=0,b=1) 207 | legend(16, 9, c("OLS", "Lasso", "Ridge"), col = c("darkgreen", "blue", "red"), pch = c(21, 21, 21)) 208 | 209 | #################################################################### 210 | -------------------------------------------------------------------------------- /PC Lab 1/student-mat-test.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 1/student-mat-test.Rdata -------------------------------------------------------------------------------- /PC Lab 
1/student-mat-train.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 1/student-mat-train.Rdata -------------------------------------------------------------------------------- /PC Lab 2/browser-sites.txt: -------------------------------------------------------------------------------- 1 | atdmt.com 2 | yahoo.com 3 | whenu.com 4 | weatherbug.com 5 | msn.com 6 | google.com 7 | aol.com 8 | questionmarket.com 9 | googlesyndication.com-o02 10 | casalemedia.com 11 | mywebsearch.com 12 | myspace.com 13 | pointroll.com 14 | atwola.com 15 | yieldmanager.com 16 | live.com 17 | aim.com 18 | mediaplex.com 19 | precisionclick.com 20 | tribalfusion.com 21 | insightexpressai.com 22 | trafficmp.com 23 | ebay.com 24 | realmedia.com 25 | zedo.com 26 | advertising.com 27 | microsoft.com 28 | hotbar.com 29 | adrevolver.com 30 | ru4.com 31 | 180solutions.com 32 | nextag.com 33 | accuweather.com 34 | overture.com 35 | hotmail.com 36 | passport.com 37 | my-etrust.com 38 | starware.com 39 | relevantknowledge.com 40 | myway.com 41 | partner2profit.com 42 | ditto.com 43 | kanoodle.com 44 | ebayobjects.com 45 | mcafee.com 46 | comcast.net 47 | fastclick.net 48 | adbrite.com 49 | vpptechnologies.com 50 | specificclick.net 51 | serving-sys.com 52 | weather.com 53 | adserver.com 54 | licenseacquisition.org 55 | pogo.com 56 | go.com 57 | btgrab.com 58 | bellsouth.net 59 | intellisrv.net 60 | dell.com 61 | waol.exe 62 | cnn.com 63 | facebook.com 64 | incredibarvuz1.com 65 | burstnet.com 66 | adknowledge.com 67 | funwebproducts.com 68 | belnk.com 69 | netscape.com 70 | mysearch.com 71 | real.com 72 | liveperson.net 73 | adsonar.com 74 | passport.net 75 | euroclick.com 76 | m7z.net 77 | mywebface.com 78 | kazaa.com 79 | bestoffersnetworks.com 80 | vitalstream.com 81 | tacoda.net 82 | unicast.com 83 | offeroptimizer.com 84 | bankofamerica.com 
85 | acsd.exe 86 | gator.com 87 | quickbrowsersearch.com 88 | revsci.net 89 | personalweb.com 90 | rr.com 91 | msnusers.com 92 | zango.com 93 | earthlink.net 94 | mapquest.com 95 | falkag.net 96 | freeze.com 97 | amazon.com 98 | net-offers.net 99 | shopperreports.com 100 | dellfix.com 101 | plaxo.com 102 | ysbweb.com 103 | googleadservices.com 104 | qnsr.com 105 | revenue.net 106 | adultfriendfinder.com 107 | addynamix.com 108 | seekmo.com 109 | verizon.net 110 | cox.net 111 | metricsdirect.com 112 | akamai.net 113 | admarketplace.net 114 | amazon.com-o01 115 | aolacsd.exe 116 | opinionsquare.com 117 | interclick.com 118 | peoplepc.com 119 | go.com-o04 120 | realtechnetwork.net 121 | freezecoldcash.com 122 | ask.com 123 | contextweb.com 124 | intellitxt.com 125 | yceml.net 126 | about.com 127 | youtube.com 128 | wikipedia.org 129 | surfaccuracy.com 130 | windowsmedia.com 131 | craigslist.org 132 | hackerwatch.org 133 | foxsports.com 134 | spamblockerutility.com 135 | walmart.com 136 | navexcel.com 137 | partypoker.com 138 | wellsfargo.com 139 | travelzoo.com 140 | photobucket.com 141 | viewpoint.com 142 | nielsennetpanel.com 143 | mymailstamp.com 144 | windows.com 145 | optonline.net 146 | eguard.com 147 | aolcdn.com 148 | musicmatch.com 149 | qksz.net 150 | cometsystems.com 151 | netzero.net 152 | specificmedia.com 153 | paypal.com 154 | iwon.com 155 | monster.com-o01 156 | vmn.net 157 | juno.com 158 | information.com 159 | sysupdates.com 160 | 2o7.net 161 | adwave.com 162 | need2find.com 163 | target.com 164 | ebayrtm.com 165 | match.com 166 | bridgetrack.com 167 | comcastsupport.com 168 | rs6.net 169 | screensavers.com 170 | footprint.net 171 | sportsline.com 172 | adelphia.net 173 | smileycentral.com 174 | dlqm.net 175 | careerbuilder.com 176 | mlb.com 177 | searchignite.com 178 | wachovia.com 179 | expedia.com 180 | thinktarget.com 181 | authnow.com 182 | dotomi.com 183 | blogspot.com 184 | hpdjjs.com 185 | chase.com 186 | outerinfo.com 187 | nscpcdn.com 188 | 
vonage.com 189 | searchscout.com 190 | compuserve.com 191 | lycos.com 192 | xanga.com 193 | websearch.com 194 | azjmp.com 195 | tmcs.net-o01 196 | exitexchange.com 197 | toshibapc.com 198 | runescape.com 199 | weatherstudio.com 200 | imdb.com 201 | adecn.com 202 | bargain-buddy.net 203 | carsdirect.com 204 | mspaceads.com 205 | apple.com 206 | ups.com 207 | 88.80.5.21 208 | exct.net 209 | cingular.com 210 | foodnetwork.com 211 | go.com-o03 212 | excite.com 213 | capitalone.com 214 | imiclk.com 215 | overstock.com 216 | bloglines.com 217 | compfused.com 218 | morpheus.com 219 | foxnews.com 220 | marketwatch.com 221 | wamu.com 222 | monster.com 223 | adobe.com 224 | 888.com 225 | untd.com 226 | abetterinternet.com 227 | centralmedia.ws 228 | valuead.com 229 | targetsaver.com 230 | lynxtrack.com 231 | cartoonnetwork.com 232 | netflix.com 233 | chitika.net 234 | geocities.com 235 | qsrch.com 236 | drsnsrch.com 237 | autobytel.com 238 | web-nexus.net 239 | webservicehosts.com 240 | sharewareonline.com 241 | llnwd.net 242 | instantnavigation.com 243 | nick.com 244 | nfl.com 245 | oingo.com 246 | lightningcast.net 247 | altbill.com 248 | xolox.nl 249 | superpages.com 250 | classmates.com 251 | aavalue.com 252 | bluestreak.com 253 | southwest.com 254 | whitepages.com 255 | usps.com 256 | webhancer.com 257 | bbc.co.uk 258 | true.com 259 | bearshare.com 260 | citibank.com 261 | blackplanet.com 262 | pch.com 263 | att.net 264 | autoweb.com 265 | insightexpress.com 266 | charter.net 267 | alumnigroup.org 268 | verizonwireless.com 269 | fedex.com 270 | mobilesidewalk.com 271 | netteller.com 272 | webshots.com 273 | sprint.com 274 | orbitz.com 275 | bestbuy.com 276 | grandstreetinteractive.com 277 | paypopup.com 278 | cheaptickets.com 279 | dell4me.com 280 | new.net 281 | nytimes.com 282 | nyadmcncserve-05y06a.com 283 | aoltpspd.exe 284 | toprebates.com 285 | jcpenney.com 286 | geotrust.com 287 | travelocity.com 288 | qvc.com 289 | 4at1.com 290 | cpmstar.com 291 | bizrate.com 
292 | ticketmaster.com 293 | usbank.com 294 | tripod.com 295 | buy.com 296 | nascar.com 297 | aebn.net 298 | infospace.com 299 | wxbug.com 300 | contextuads.com 301 | bns1.net 302 | download.com 303 | gocyberlink.com 304 | 192.168.1.1 305 | dvlabs.com 306 | defamer.com 307 | tracking101.com 308 | accountonline.com 309 | hbmediapro.com 310 | usatoday.com 311 | bigfishgames.com 312 | neopets.com 313 | adoutput.com 314 | sbc.com 315 | noaa.gov 316 | lowermybills.com 317 | kmpads.com 318 | directtrack.com 319 | clicksor.com 320 | legacy.com 321 | eajmp.com 322 | nastydollars.com 323 | worldofwarcraft.com 324 | mirarsearch.com 325 | verizon.com 326 | miniclip.com 327 | iwin.com 328 | peel.com 329 | hgtv.com 330 | amaena.com 331 | sprintpcs.com 332 | shopping.com 333 | webmd.com 334 | clearchannel.com 335 | winamp.com 336 | reference.com 337 | interpolls.com 338 | americangreetings.com 339 | tmcs.net 340 | midtenmedia.com 341 | domainsponsor.com 342 | thunderdownloads.com 343 | akamaistream.net 344 | livejournal.com 345 | tx.us 346 | onlinerewardcenter.com 347 | msn.com-o18 348 | sony.com 349 | dogpile.com 350 | nba.com 351 | citysearch.com 352 | connextra.com 353 | nickjr.com 354 | t-mobile.com 355 | winfixer.com 356 | adlegend.com 357 | adsrevenue.net 358 | sears.com 359 | ap.org 360 | luna.net 361 | shockwave.com 362 | hsn.com 363 | fl.us 364 | mypoints.com 365 | mozilla.org 366 | aresgalaxy.org 367 | realtor.com 368 | addictinggames.com 369 | clickbooth.com 370 | amateurmatch.com 371 | worldnow.com 372 | surveys.com 373 | pa.us 374 | arcaderockstar.com 375 | coolsavings.com 376 | yournewsletters.net 377 | liquidmedianetworks.com 378 | everythinggirl.com 379 | perfectmatch.com 380 | stockgroup.com 381 | netster.com 382 | bidclix.com 383 | dropspam.com 384 | hp.com 385 | drivecleaner.com 386 | consumerpromotioncenter.com 387 | aolwbspd.exe 388 | americanexpress.com 389 | totaltalk.com 390 | wwe.com 391 | kontera.com 392 | gamehouse.com 393 | circuitcity.com 394 | 
yimg.com 395 | lightningcast.com 396 | edgefcs.net 397 | wunderground.com 398 | realarcade.com 399 | singlesnet.com 400 | azcentral.com 401 | yellowpages.com 402 | eharmony.com 403 | paviliondownload.com 404 | insightbb.com 405 | imageshack.us 406 | shopzilla.com 407 | ca.gov 408 | donotchangeme.com 409 | ca.us 410 | sourceforge.net 411 | washingtonpost.com 412 | adjuggler.com 413 | careercast.com 414 | bangbros1.com 415 | scripps.com-o01 416 | migente.com 417 | homedepot.com 418 | winantivirus.com 419 | irs.gov 420 | blockbuster.com 421 | kodakgallery.com 422 | nih.gov 423 | aol.com-o07 424 | icq.com 425 | wordcents.com 426 | drudgereport.com 427 | quizilla.com 428 | srch-results.com 429 | inqwire.com 430 | ign.com 431 | oinadserver.com 432 | azoogleads.com 433 | incredimail.com 434 | shopathome.com 435 | mtv.com 436 | fidelity.com 437 | bullseye-network.com 438 | flash-gear.com 439 | proficient.com 440 | autotrader.com 441 | charter.com 442 | healthology.com 443 | evite.com 444 | checkm8.com 445 | rsc01.net 446 | oasei.com 447 | heavy.com 448 | slotch.com 449 | passion.com 450 | nbc.com 451 | trafficmarketplace.com 452 | univision.com 453 | priceline.com 454 | flickr.com 455 | andale.com 456 | dealtime.com 457 | yfdirect.com 458 | entrepreneur.com 459 | go.com-o01 460 | webmd.com-o01 461 | sexsearch.com 462 | pornaccess.com 463 | gcion.com 464 | shoplocal.com 465 | kliptracker.com 466 | nationalcity.com 467 | bbeplayer.com 468 | videodome.com 469 | 204.95.60.12 470 | napster.com 471 | myweather.net 472 | msnbc.com 473 | linkexchange.com 474 | searchmarketing.com 475 | angelfire.com 476 | callwave.com 477 | sonnerie.net 478 | scout.com 479 | rivals.com 480 | altnet.com 481 | spynet.com 482 | macromedia.com 483 | ed.gov 484 | wannawatch.com 485 | frontiernet.net 486 | flycell.com 487 | edgesuite.net 488 | 89.com 489 | nc.us 490 | ticketmaster.com-o01 491 | flowgo.com 492 | cnet.com 493 | oddcast.com 494 | answers.com 495 | timeinc.net 496 | m5-systems.com 497 | 
guideforyou.com 498 | rn11.com 499 | lowes.com 500 | lifescript.com 501 | shop.com 502 | errorsafe.com 503 | cams.com 504 | macys.com 505 | aa.com 506 | addictingclips.com 507 | victoriassecret.com 508 | orchardbank.com 509 | bravenet.com 510 | imesh.com 511 | nextel.com 512 | screensandthemes.com 513 | suntrust.com 514 | discovercard.com 515 | nbads.com 516 | consumerincentiverewards.com 517 | valueclick.com 518 | google.com-o03 519 | cbs.com 520 | bannerspace.com 521 | technorati.com 522 | cjt1.net 523 | exactsearch.net 524 | munky.com 525 | cs.com 526 | kohls.com 527 | tagged.com 528 | babycenter.com 529 | ebaumsworld.com 530 | userplane.com 531 | mediaplazza.com 532 | netzerovoice.com 533 | gamespot.com 534 | keen.com 535 | bebo.com 536 | rsc02.net 537 | sysupdates2.com 538 | imlive.com 539 | oldnavy.com 540 | regalinteractive.com 541 | weightwatchers.com 542 | subsag.com 543 | aol.com-o08 544 | azlyrics.com 545 | freeringtonesnow.com 546 | freewebs.com 547 | toysrus.com 548 | hollywood.com 549 | findwhat.com 550 | local.com 551 | webroot.com 552 | tvguide.com 553 | ny.us 554 | resultsmaster.com 555 | jamster.com 556 | gms1.net 557 | switchboard.com 558 | nicheseek.com 559 | intelius.com 560 | hi5.com 561 | glispa.com 562 | gannettonline.com 563 | cstv.com 564 | adengage.com 565 | superbrewards.com 566 | videocodezone.com 567 | symantecliveupdate.com 568 | pbskids.org 569 | revresda.com 570 | americansingles.com 571 | ugo.com-o02 572 | job.com 573 | installshield.com 574 | eprize.net 575 | metacafe.com 576 | focalex.com 577 | cciads.us 578 | perfectgonzo.com 579 | kbb.com 580 | reunion.com 581 | eproof.com 582 | tripadvisor.com 583 | bellsouth.com 584 | search.com 585 | comcast.com 586 | ivillage.com 587 | sun.com 588 | regionsnet.com 589 | mininova.org 590 | beliefnet.com 591 | intellicast.com 592 | fastonlineusers.com 593 | gamespot.com-o01 594 | expedia.com-o01 595 | military.com 596 | musicnet.com 597 | 53.com 598 | oh.us 599 | itrack.it 600 | 
officedepot.com 601 | adultadworld.com 602 | univision.com-o01 603 | youravon.com 604 | blackboard.com 605 | yahoo.net 606 | casinolasvegas.com 607 | warnerbros.com 608 | delta.com 609 | go.com-o02 610 | deepnetexplorer.co.uk 611 | mozilla.com 612 | opentracker.net 613 | break.com 614 | catcha10.com 615 | hotels.com 616 | hallmark.com 617 | sportsbook.com 618 | mycheckfree.com 619 | ezboard.com 620 | pro-market.net 621 | mate1.com 622 | awempire.com 623 | jigzone.com 624 | bangbrosnetwork.com 625 | marketlinx.com 626 | tickle.com 627 | bbandt.com 628 | mercuras.com 629 | adtology2.com 630 | bluemountain.com 631 | freepornofreeporn.com 632 | internet-optimizer.com 633 | autotrader.com-o01 634 | blogger.com 635 | kraftfoods.com 636 | loveaccess.com 637 | shutterfly.com 638 | stopzilla.com 639 | xmradio.com 640 | ga.us 641 | ancestry.com 642 | honda.com 643 | fulltiltpoker.com 644 | il.us 645 | ibsys.com 646 | imixserver.com 647 | barnesandnoble.com 648 | pricegrabber.com 649 | constantcontact.com 650 | zonelabs.com 651 | pimpyourpro.com 652 | netflame.cc 653 | slide.com 654 | xnxx.com 655 | upromise.com 656 | livesexbar.com 657 | videosz.com 658 | freeweblayouts.net 659 | limewire.com 660 | ameritrade.com 661 | freelaptop4you.com 662 | nickarcade.com 663 | utkn.com 664 | nj.us 665 | 360i.com 666 | finestresults.com 667 | asseenontvnetwork.com 668 | typepad.com 669 | efax.com 670 | regions.com 671 | emachines.com 672 | playaudiomessage.com 673 | bofunk.com 674 | millsberry.com 675 | cpvfeed.com 676 | allrecipes.com 677 | clubpenguin.com 678 | eversave.com 679 | ppmdating.com 680 | lexico.com 681 | usaa.com 682 | directv.com 683 | postini.com 684 | secure-banking.com 685 | eyewonder.com 686 | boston.com 687 | ibanking-services.com 688 | astrology.com 689 | datinggold.com 690 | mlxchange.com 691 | travelhook.net 692 | custhelp.com 693 | mn.us 694 | zwire.com 695 | emarketmakers.com 696 | gamefaqs.com 697 | premiumproductsonline.com 698 | chrysler.com 699 | prodigy.net 
700 | tv.com 701 | windowsmedia.com-o04 702 | smashits.com 703 | 65.115.67.11 704 | snapfish.com 705 | commerceonlinebanking.com 706 | bbt.com 707 | linksynergy.com 708 | yahoo.com-o08 709 | freecodesource.com 710 | streamate.com 711 | freecreditreport.com 712 | intuit.com 713 | rapid-pass.net 714 | artistdirect.com 715 | servedbyadbutler.com 716 | sidestep.com 717 | adult.com 718 | alltel.net 719 | bcentral.com 720 | openbank.com 721 | nichedsites.com 722 | cars.com 723 | gm.com 724 | adshuffle.com 725 | freeslots.com 726 | blink.com 727 | candystand.com 728 | monstermarketplace.com 729 | columbiahouse.com 730 | pncbank.com 731 | discovery.com 732 | hsbcbillpay.com 733 | movietickets.com 734 | page-not-found.net 735 | fandango.com 736 | providianservices.com 737 | carad.com 738 | homestead.com 739 | realcastmedia.com 740 | webratsmusic.com 741 | scottrade.com 742 | cs102175.com 743 | fnismls.com 744 | shopperssavingcenter.com 745 | hit-now.com 746 | whatismyip.com 747 | costco.com 748 | bolt.com 749 | bmgmusic.com 750 | myhealthwealthandhappiness.com 751 | symantec.com 752 | forbes.com 753 | digitalcity.com 754 | live365.com 755 | firstadsolution.com 756 | linkconnector.com 757 | freepagegraphics.com 758 | imgfarm.com 759 | insightexpresserdd.com 760 | pcsecurityshield.com 761 | allposters.com-o01 762 | msnvideo.com 763 | miva.com 764 | jackpotmadness.com 765 | mbnanetaccess.com 766 | newcarinsider.com 767 | edmunds.com 768 | net-nucleus.com 769 | popcap.com 770 | alt.com 771 | staples.com 772 | ussearch.com 773 | bankone.com 774 | rootv.com 775 | citizensbankonline.com 776 | juggcrew.com 777 | navyfcu.org 778 | nordstrom.com 779 | webstat.com 780 | inklineglobal.com 781 | seeq.com 782 | onetruemedia.com 783 | paltalk.com 784 | sonypictures.com 785 | 204.181.57.155 786 | commerceonline.com 787 | friendster.com 788 | slate.com 789 | hermoment.com 790 | lovehappens.com 791 | mi.us 792 | kmart.com 793 | paidsurveys.com 794 | 123greetings.com 795 | blinko.com 796 | 
citizensbank.com 797 | sirius.com 798 | qrs1.net 799 | adbureau.net 800 | turn.com 801 | abcdistributing.com 802 | fundsxpress.com 803 | pichunter.com 804 | cbsnews.com 805 | 216.139.222.230 806 | anywho.com 807 | sedoparking.com 808 | householdbank.com 809 | treborwear.com 810 | evault.ws 811 | vh1.com 812 | financialcontent.com 813 | gap.com 814 | active.com 815 | exclusivegiftcards.com 816 | michigan.gov 817 | dada-mobile.net 818 | textplussolutions.com 819 | myriadmarket.com 820 | ifriends.net 821 | aptimus.com 822 | valueclick.net 823 | pennyweb.com 824 | blackpeoplemeet.com 825 | eltpath.com 826 | yahoo.com-o46 827 | sysprotect.com 828 | dadamobile.com 829 | cpxinteractive.com 830 | clickspring.net 831 | staples-deals.com 832 | myyearbook.com 833 | bravenetmedianetwork.com 834 | etrade.com 835 | marykayintouch.com 836 | 64.39.16.166 837 | moregamers.com 838 | redorbit.com 839 | tmz.com 840 | blogrolling.com 841 | checkfree.com 842 | samsclub.com 843 | va.us 844 | united.com 845 | certified-safe-downloads.com 846 | aimtoday.com 847 | toseeka.com 848 | bidz.com 849 | gamespy.com 850 | nylottery.org 851 | godaddy.com 852 | rsc03.net 853 | altavista.com 854 | ltdcommodities.com 855 | bhg.com 856 | opm.gov 857 | onlinemediaoutlet.com 858 | beboframe.com 859 | cafepress.com 860 | tarot.com 861 | webgavel.com 862 | rapmls.com 863 | ztod.com 864 | marriott.com 865 | walgreens.com 866 | rovion.com 867 | ultimatebet.com 868 | ea.com 869 | petfinder.com 870 | winsoftware.com 871 | literotica.com 872 | websourcedtraffic.com 873 | 032439.com 874 | marketbanker.com 875 | clearchannelmusic.com 876 | colonize.com 877 | searchfeed.com 878 | eimg.net 879 | shermanstravel.com 880 | key.com 881 | multi-pops.com 882 | yandex.ru 883 | us.com 884 | kinghost.com 885 | sublimedirectory.com 886 | gogotools.com 887 | camcrush.com 888 | trafficexplorer.com 889 | myfamily.com 890 | gay.com 891 | freegiftworld.com 892 | dexonline.com 893 | trade-in-value.com 894 | shopyourbargain.com 895 
| dyndns.org 896 | bizrate.com-o01 897 | xctrk.com 898 | webtoolcafe.com 899 | zappos.com 900 | wi.us 901 | toptvbytes.com 902 | 157.22.32.111 903 | hotfreelayouts.com 904 | registrydefender.com 905 | zap2it.com 906 | 64.136.28.49 907 | afy11.net 908 | 207.97.212.250 909 | invisionfree.com 910 | bravenet.com-o01 911 | gadgetcity.com 912 | army.mil 913 | yourgiftcards.com 914 | craigslist.com 915 | usairways.com 916 | drivelinemedia.com 917 | edline.net 918 | dayport.com 919 | axill.com 920 | smartbargains.com 921 | newgrounds.com 922 | 216.155.193.91 923 | providian.com 924 | statcounter.com 925 | ajc.com 926 | oprah.com 927 | slingo.com 928 | continental.com 929 | relevantchoice.com 930 | toontown.com 931 | thumbplay.com 932 | jacquielawson.com 933 | hotwire.com 934 | nwa.com 935 | atomz.com 936 | nsgalleries.com 937 | uclick.com 938 | mercurial.ca 939 | schwab.com 940 | nvero.net 941 | ediets.com 942 | ichotelsgroup.com 943 | 216.133.243.28 944 | aggregateknowledge.com 945 | topix.net 946 | flalottery.com 947 | dlv4.com 948 | mybloglog.com 949 | lanxtra.com 950 | away.com 951 | grab.com 952 | tipany.com 953 | quickbooks.com 954 | instream.com 955 | pbs.org 956 | findology.com 957 | business.com 958 | cmt.com 959 | myinsiderdeals.com 960 | imagine-msn.com 961 | nhl.com 962 | modern-singles.net 963 | addfreestats.com 964 | rent.com 965 | homegain.com 966 | freeones.com 967 | jetblue.com 968 | loanweb.com 969 | findarticles.com 970 | iwon.com-o04 971 | incredigames.com 972 | webkinz.com 973 | dealerconnection.com 974 | streamaudio.com 975 | grantmedia.com 976 | home123info.com 977 | exittracking.com 978 | worldsex.com 979 | yfdmedia.com 980 | automotive.com 981 | cursormania.com 982 | tradedoubler.com 983 | bedbathandbeyond.com 984 | equifax.com 985 | hotornot.com 986 | falkag.de 987 | chicagotribune.com 988 | airtran.com 989 | thebreastcancersite.com 990 | charmingshoppes.com 991 | ugo.com 992 | cox.com 993 | spicymint.com 994 | real.com-o01 995 | targetnet.com 996 
| effectivebrand.com 997 | dallascowboys.com 998 | leadgenetwork.com 999 | in.us 1000 | vistaprint.com 1001 | -------------------------------------------------------------------------------- /PC Lab 2/help files/grf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 2/help files/grf.pdf -------------------------------------------------------------------------------- /PC Lab 2/help files/rpart.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 2/help files/rpart.pdf -------------------------------------------------------------------------------- /PC Lab 2/trees_foests_tutorial.r: -------------------------------------------------------------------------------- 1 | ######################## Load Packages and Data ######################## 2 | 3 | # Load packages 4 | library(rpart) 5 | library(rpart.plot) 6 | library(grf) 7 | library(DiagrammeR) 8 | 9 | # Load data 10 | data_2006 <-read.csv("browser_2006.csv", sep = ",") 11 | data_new <-read.csv("browser_new.csv", sep = ",") 12 | 13 | # Data preparation 14 | y_2006 <- as.matrix(data_2006[,2]) 15 | x_2006 <- as.matrix(data_2006[,c(3:ncol(data_2006))]) 16 | id_2006 <- as.matrix(data_2006[,1]) 17 | x_new <- as.matrix(data_new[,c(2:ncol(data_new))]) 18 | id_new <- as.matrix(data_new[,1]) 19 | 20 | print('Packages and data successfully loaded.') 21 | 22 | ############################################################################# 23 | 24 | ######################## Average Spending ######################## 25 | 26 | spending <- round(???, digits=2) 27 | print(paste0("In 2006, the average spending is ", spending, " US-dollars")) 28 | 29 | #################################################################### 30 | 31 | 
######################## Online Time ######################## 32 | 33 | freq <- round(x_2006[id_2006==921,x_2006[id_2006==921,] == ???], digit = 0) 34 | page <- names(freq) 35 | 36 | print(paste0("Household 921 is most of the time on the webpage ", page)) 37 | print(paste0(freq, "% of the online time is the household on this webpage")) 38 | 39 | ################################################################ 40 | 41 | ######################## Log Transformation ######################## 42 | 43 | log_y_2006 = as.matrix(???) # take logarithm 44 | 45 | # Cumulative Distribution of Spending 46 | plot(ecdf(y_2006), xlab = "Spending in US-Dollars", sub = "(Truncated at 20,000 US-Dollars)", 47 | ylab = "cdf", main = "Distribution of Spending", xlim= c(0,20000)) 48 | 49 | # Cumulative Distribution of Log Spendiung 50 | plot(ecdf(log_y_2006), xlab = "log Spending", ylab = "cdf", main = "Distribution of Log Spending") 51 | 52 | ####################################################################### 53 | 54 | ######################## Training and Test Samples ######################## 55 | 56 | set.seed(1001) 57 | # Generate variable with the rows in training data 58 | size <- floor(0.5 * nrow(data_2006)) 59 | training_set <- sample(seq_len(nrow(data_2006)), size = size) 60 | 61 | print('Training and test samples created.') 62 | 63 | ############################################################################# 64 | 65 | ######################## Shallow Tree ######################## 66 | 67 | # Prepare data for tree estimator 68 | outcome <- log_y_2006[training_set] 69 | tree_data_2006 <- data.frame(outcome, x_2006[training_set,]) 70 | 71 | # Build shallow tree 72 | set.seed(1001) 73 | shallow_tree <- rpart(formula = outcome ~., data = tree_data_2006, method = "anova", xval = 10, 74 | y = TRUE, control = rpart.control(cp = 0.00002, minbucket=150)) 75 | # Note: 'minbucket=100' imposes the restriction that each terminal leave should contain at least 100 observations. 
76 | # The algorithm 'rpart' stops growing trees when either one leave has less than 100 observations or 77 | # the MSE gain of addidng one addidtional leave is below cp=0.00002. 78 | 79 | ## Plot tree structure 80 | rpart.plot(shallow_tree,digits=3) 81 | 82 | # bizrate.com 83 | # fedex.com 84 | 85 | ################################################################ 86 | 87 | ######################## Deep Tree ######################## 88 | set.seed(1001) 89 | deep_tree <- rpart(formula = outcome ~., data = tree_data_2006, ???) 90 | 91 | print('Relative CV-MSE for different tree sizes') 92 | print(deep_tree$cptable) 93 | 94 | # Plot CV-MSE 95 | plotcp(deep_tree) 96 | 97 | ############################################################# 98 | 99 | ######################## Optimal Tree Size ######################## 100 | 101 | op.index <- which.min(deep_tree$cptable[, "xerror"]) 102 | op.size <- deep_tree$cptable[op.index, "nsplit"] +1 103 | print(paste0("Optimal number final leaves: ", op.size)) 104 | 105 | ##################################################################### 106 | 107 | ######################## Pruned Tree ######################## 108 | 109 | # Select the Tree that Minimises CV-MSE 110 | # Get cp-value that corresponds to optimal tree size 111 | cp.vals <- deep_tree$cptable[op.index, "CP"] 112 | 113 | # Prune the deep tree 114 | pruned_tree <- prune(???, cp = cp.vals) 115 | 116 | ## Plot tree structure 117 | rpart.plot(pruned_tree,digits=3) 118 | 119 | # aggregateknowledge.com 120 | 121 | ################################################################ 122 | 123 | ######################## Out-of-Sample Performance ######################## 124 | 125 | # Predict log online spending 126 | pred_tree <- predict(???, newdata= as.data.frame(x_2006)) 127 | 128 | # Test sample data 129 | outcome_test <- log_y_2006[-training_set] 130 | pred_tree_test <- pred_tree[-training_set] 131 | 132 | # R-squared 133 | MSE_tree <- mean((outcome_test-pred_tree_test)^2) 134 | 
r2_tree <- round(1- MSE_tree/var(outcome_test), digits = 3) 135 | print(paste0("Test sample R-squared: ", r2_tree)) 136 | 137 | ############################################################################## 138 | 139 | ######################## Random Forest ######################## 140 | 141 | rep <- 1000 # number of trees 142 | cov <- 1/3 # share of covariates 143 | frac <- 1/2 # fraction of subsample 144 | min_obs <- 100 # max. size of terminal leaves in trees 145 | 146 | # Build Forest 147 | set.seed(10001) 148 | forest1 <- regression_forest(x_2006[training_set,],log_y_2006[training_set,], 149 | mtry = floor(cov*ncol(x_2006)), sample.fraction = frac, num.trees = rep, 150 | min.node.size = min_obs, honesty=FALSE) 151 | 152 | print('Forest is built.') 153 | 154 | ################################################################## 155 | 156 | ######################## Plot Example Tree ######################## 157 | 158 | # Plot a tree of the forest 159 | # Just an illustration, overall the forest contains 1000 trees 160 | tree <- get_tree(???,1) # here we select tree number 1 161 | plot(tree) 162 | 163 | ##################################################################### 164 | 165 | ######################## Variable Importance ######################## 166 | 167 | # Plot the variable importantance 168 | # First we consider only first split 169 | imp1 <- variable_importance(forest1, max.depth = 1) 170 | print(cbind(colnames(x_2006[,imp1>0.02]),imp1[imp1>0.02])) 171 | 172 | # Now we consider the first four splits 173 | imp2 <- round(variable_importance(forest1, decay.exponent = 2, max.depth = 4), digits = 3) 174 | print(cbind(colnames(x_2006[,imp2>0.02]),imp2[imp2>0.02])) 175 | 176 | ######################################################################## 177 | 178 | ######################## Out-of-Sample Performance ######################## 179 | 180 | # Prediction 181 | fit <- predict(???, newdata = x_2006[-training_set,])$predictions 182 | 183 | # R-squared 184 | 
SST <- mean(((log_y_2006[-training_set,])-mean((log_y_2006[-training_set,])))^2) 185 | MSE1 <- mean(((log_y_2006[-training_set,])-fit)^2) 186 | r2_1 <- round(1- MSE1/SST, digits = 3) 187 | print(paste0("Test sample R-squared: ", r2_1)) 188 | 189 | ############################################################################# 190 | 191 | ######################## Area Under the Curve (AUC) ######################## 192 | 193 | sizes <- c(1000,500,400,300, 200, 100, 50, 40,30,20,10, 5,4,3,2,1) # Select a grid of sample sizes 194 | # Prepare matrix to store results 195 | auc <- matrix(NA, nrow = length(sizes), ncol = 3) 196 | colnames(auc) <- c("Trees", "AUC", "Marginal AUC") 197 | auc[,1] <- sizes 198 | # Sum of Squares Total 199 | SST <- mean(((log_y_2006[-training_set,])-(mean(log_y_2006[-training_set,])))^2) 200 | 201 | set.seed(10001) # set starting value 202 | for (t in sizes){ 203 | # Estimate Forests 204 | forest <- regression_forest(x_2006[training_set,],(log_y_2006[training_set,]), mtry = floor(cov*ncol(x_2006)), 205 | sample.fraction = frac, num.trees = t, min.node.size = min_obs, honesty=FALSE) 206 | fit <- predict(forest, newdata = x_2006[-training_set,])$predictions # prediction in test sample 207 | auc[auc[,1]== t,2] <- 1- mean(((log_y_2006[-training_set,])-fit)^2)/SST # store R-squared 208 | } 209 | auc[,3] <- auc[,2] - rbind(as.matrix(auc[-1,2]),auc[nrow(auc),2]) 210 | 211 | # Marginal AUC 212 | plot(auc[,1],auc[,2],type = "o",xlab="Trees", ylab= "R-squared", main = "AUC") 213 | abline(a=0,b=0, col="red") 214 | 215 | ################################################################################ 216 | 217 | ######################## Deep Forest ######################## 218 | 219 | min_obs <- 5 220 | # Build Forest 221 | forest2 <- regression_forest(x_2006[training_set,],log_y_2006[training_set,], 222 | ???) 
223 | 224 | # Prediction 225 | fit <- predict(forest2, newdata = x_2006[-training_set,])$predictions 226 | 227 | # R-squared 228 | SST <- mean((log_y_2006[-training_set,]-mean(log_y_2006[-training_set,]))^2) 229 | MSE2 <- mean((log_y_2006[-training_set,]-fit)^2) 230 | r2_2 <- round(1- MSE2/SST, digits = 3) 231 | print(cbind(r2_1,r2_2)) 232 | 233 | # Plot tree 234 | tree <- get_tree(forest2, 34) 235 | plot(tree) 236 | 237 | ############################################################### 238 | 239 | ######################## Store Prediction for Hold-out-Sample ######################## 240 | 241 | # Hold-out-Sample Prediction 242 | fit_new <- predict(???, newdata = x_new)$predictions 243 | 244 | results <- as.matrix(cbind(id_new,fit_new)) # store ID's and predictions in oine matrix 245 | colnames(results) <- c("id","predictions") # label columns 246 | 247 | # Store results 248 | write.csv(results, "predictions.csv") 249 | 250 | print('Results for the hold-out-sample stored.') 251 | 252 | ######################################################################################### 253 | -------------------------------------------------------------------------------- /PC Lab 3/help files/R_ K-Means Clustering.html: -------------------------------------------------------------------------------- 1 | R: K-Means Clustering 2 | 3 | 4 | 5 | 6 |
kmeans {stats}R Documentation
7 | 8 |

9 | K-Means Clustering 10 |

11 | 12 |

Description

13 | 14 |

Perform k-means clustering on a data matrix. 15 |

16 | 17 | 18 |

Usage

19 | 20 |
 21 | kmeans(x, centers, iter.max = 10, nstart = 1,
 22 |        algorithm = c("Hartigan-Wong", "Lloyd", "Forgy",
 23 |                      "MacQueen"), trace=FALSE)
 24 | ## S3 method for class 'kmeans'
 25 | fitted(object, method = c("centers", "classes"), ...)
 26 | 
27 | 28 | 29 |

Arguments

30 | 31 | 32 | 33 | 38 | 39 | 44 | 45 | 48 | 49 | 53 | 54 | 59 | 60 | 64 | 65 | 71 | 72 | 78 | 79 | 82 |
x 34 |

numeric matrix of data, or an object that can be coerced to 35 | such a matrix (such as a numeric vector or a data frame with all 36 | numeric columns).

37 |
centers 40 |

either the number of clusters, say k, or a set of 41 | initial (distinct) cluster centres. If a number, a random set of 42 | (distinct) rows in x is chosen as the initial centres.

43 |
iter.max 46 |

the maximum number of iterations allowed.

47 |
nstart 50 |

if centers is a number, how many random sets 51 | should be chosen?

52 |
algorithm 55 |

character: may be abbreviated. Note that 56 | "Lloyd" and "Forgy" are alternative names for one 57 | algorithm.

58 |
object 61 |

an R object of class "kmeans", typically the 62 | result ob of ob <- kmeans(..).

63 |
method 66 |

character: may be abbreviated. "centers" causes 67 | fitted to return cluster centers (one for each input point) and 68 | "classes" causes fitted to return a vector of class 69 | assignments.

70 |
trace 73 |

logical or integer number, currently only used in the 74 | default method ("Hartigan-Wong"): if positive (or true), 75 | tracing information on the progress of the algorithm is 76 | produced. Higher values may produce more tracing information.

77 |
... 80 |

not used.

81 |
83 | 84 | 85 |

Details

86 | 87 |

The data given by x are clustered by the k-means method, 88 | which aims to partition the points into k groups such that the 89 | sum of squares from points to the assigned cluster centres is minimized. 90 | At the minimum, all cluster centres are at the mean of their Voronoi 91 | sets (the set of data points which are nearest to the cluster centre). 92 |

93 |

The algorithm of Hartigan and Wong (1979) is used by default. Note 94 | that some authors use k-means to refer to a specific algorithm 95 | rather than the general method: most commonly the algorithm given by 96 | MacQueen (1967) but sometimes that given by Lloyd (1957) and Forgy 97 | (1965). The Hartigan–Wong algorithm generally does a better job than 98 | either of those, but trying several random starts (nstart> 99 | 1) is often recommended. In rare cases, when some of the points 100 | (rows of x) are extremely close, the algorithm may not converge 101 | in the “Quick-Transfer” stage, signalling a warning (and 102 | returning ifault = 4). Slight 103 | rounding of the data may be advisable in that case. 104 |

105 |

For ease of programmatic exploration, k=1 is allowed, notably 106 | returning the center and withinss. 107 |

108 |

Except for the Lloyd–Forgy method, k clusters will always be 109 | returned if a number is specified. 110 | If an initial matrix of centres is supplied, it is possible that 111 | no point will be closest to one or more centres, which is currently 112 | an error for the Hartigan–Wong method. 113 |

114 | 115 | 116 |

Value

117 | 118 |

kmeans returns an object of class "kmeans" which has a 119 | print and a fitted method. It is a list with at least 120 | the following components: 121 |

122 | 123 | 124 | 130 | 131 | 134 | 135 | 138 | 139 | 143 | 144 | 148 | 149 | 153 | 154 | 157 | 158 | 161 | 162 | 166 |
cluster 125 | 126 |

A vector of integers (from 1:k) indicating the cluster to 127 | which each point is allocated. 128 |

129 |
centers 132 |

A matrix of cluster centres.

133 |
totss 136 |

The total sum of squares.

137 |
withinss 140 |

Vector of within-cluster sum of squares, 141 | one component per cluster.

142 |
tot.withinss 145 |

Total within-cluster sum of squares, 146 | i.e. sum(withinss).

147 |
betweenss 150 |

The between-cluster sum of squares, 151 | i.e. totss-tot.withinss.

152 |
size 155 |

The number of points in each cluster.

156 |
iter 159 |

The number of (outer) iterations.

160 |
ifault 163 |

integer: indicator of a possible algorithm problem 164 | – for experts.

165 |
167 | 168 | 169 |

References

170 | 171 |

Forgy, E. W. (1965). 172 | Cluster analysis of multivariate data: efficiency vs interpretability 173 | of classifications. 174 | Biometrics, 21, 768–769. 175 |

176 |

Hartigan, J. A. and Wong, M. A. (1979). 177 | Algorithm AS 136: A K-means clustering algorithm. 178 | Applied Statistics, 28, 100–108. 179 | doi: 10.2307/2346830. 180 |

181 |

Lloyd, S. P. (1957, 1982). 182 | Least squares quantization in PCM. 183 | Technical Note, Bell Laboratories. 184 | Published in 1982 in IEEE Transactions on Information Theory, 185 | 28, 128–137. 186 |

187 |

MacQueen, J. (1967). 188 | Some methods for classification and analysis of multivariate 189 | observations. 190 | In Proceedings of the Fifth Berkeley Symposium on Mathematical 191 | Statistics and Probability, 192 | eds L. M. Le Cam & J. Neyman, 193 | 1, pp. 281–297. 194 | Berkeley, CA: University of California Press. 195 |

196 | 197 | 198 |

Examples

199 | 200 |
201 | require(graphics)
202 | 
203 | # a 2-dimensional example
204 | x <- rbind(matrix(rnorm(100, sd = 0.3), ncol = 2),
205 |            matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2))
206 | colnames(x) <- c("x", "y")
207 | (cl <- kmeans(x, 2))
208 | plot(x, col = cl$cluster)
209 | points(cl$centers, col = 1:2, pch = 8, cex = 2)
210 | 
211 | # sum of squares
212 | ss <- function(x) sum(scale(x, scale = FALSE)^2)
213 | 
214 | ## cluster centers "fitted" to each obs.:
215 | fitted.x <- fitted(cl);  head(fitted.x)
216 | resid.x <- x - fitted(cl)
217 | 
218 | ## Equalities : ----------------------------------
219 | cbind(cl[c("betweenss", "tot.withinss", "totss")], # the same two columns
220 |          c(ss(fitted.x), ss(resid.x),    ss(x)))
221 | stopifnot(all.equal(cl$ totss,        ss(x)),
222 | 	  all.equal(cl$ tot.withinss, ss(resid.x)),
223 | 	  ## these three are the same:
224 | 	  all.equal(cl$ betweenss,    ss(fitted.x)),
225 | 	  all.equal(cl$ betweenss, cl$totss - cl$tot.withinss),
226 | 	  ## and hence also
227 | 	  all.equal(ss(x), ss(fitted.x) + ss(resid.x))
228 | 	  )
229 | 
230 | kmeans(x,1)$withinss # trivial one-cluster, (its W.SS == ss(x))
231 | 
232 | ## random starts do help here with too many clusters
233 | ## (and are often recommended anyway!):
234 | (cl <- kmeans(x, 5, nstart = 25))
235 | plot(x, col = cl$cluster)
236 | points(cl$centers, col = 1:5, pch = 8)
237 | 
238 | 239 |
[Package stats version 4.1.0 Index]
240 | 241 | -------------------------------------------------------------------------------- /PC Lab 3/help files/R_ Principal Components Analysis.html: -------------------------------------------------------------------------------- 1 | R: Principal Components Analysis 2 | 3 | 4 | 5 | 6 |
prcomp {stats}R Documentation
7 | 8 |

Principal Components Analysis

9 | 10 |

Description

11 | 12 |

Performs a principal components analysis on the given data matrix 13 | and returns the results as an object of class prcomp.

14 | 15 | 16 |

Usage

17 | 18 |
 19 | prcomp(x, ...)
 20 | 
 21 | ## S3 method for class 'formula'
 22 | prcomp(formula, data = NULL, subset, na.action, ...)
 23 | 
 24 | ## Default S3 method:
 25 | prcomp(x, retx = TRUE, center = TRUE, scale. = FALSE,
 26 |        tol = NULL, rank. = NULL, ...)
 27 | 
 28 | ## S3 method for class 'prcomp'
 29 | predict(object, newdata, ...)
 30 | 
31 | 32 | 33 |

Arguments

34 | 35 | 36 | 37 | 41 | 42 | 48 | 49 | 53 | 54 | 61 | 62 | 66 | 67 | 71 | 72 | 76 | 77 | 83 | 84 | 92 | 93 | 103 | 104 | 110 | 111 | 114 | 115 | 124 |
formula 38 |

a formula with no response variable, referring only to 39 | numeric variables.

40 |
data 43 |

an optional data frame (or similar: see 44 | model.frame) containing the variables in the 45 | formula formula. By default the variables are taken from 46 | environment(formula).

47 |
subset 50 |

an optional vector used to select rows (observations) of the 51 | data matrix x.

52 |
na.action 55 |

a function which indicates what should happen 56 | when the data contain NAs. The default is set by 57 | the na.action setting of options, and is 58 | na.fail if that is unset. The ‘factory-fresh’ 59 | default is na.omit.

60 |
... 63 |

arguments passed to or from other methods. If x is 64 | a formula one might specify scale. or tol.

65 |
x 68 |

a numeric or complex matrix (or data frame) which provides 69 | the data for the principal components analysis.

70 |
retx 73 |

a logical value indicating whether the rotated variables 74 | should be returned.

75 |
center 78 |

a logical value indicating whether the variables 79 | should be shifted to be zero centered. Alternately, a vector of 80 | length equal the number of columns of x can be supplied. 81 | The value is passed to scale.

82 |
scale. 85 |

a logical value indicating whether the variables should 86 | be scaled to have unit variance before the analysis takes 87 | place. The default is FALSE for consistency with S, but 88 | in general scaling is advisable. Alternatively, a vector of length 89 | equal the number of columns of x can be supplied. The 90 | value is passed to scale.

91 |
tol 94 |

a value indicating the magnitude below which components 95 | should be omitted. (Components are omitted if their 96 | standard deviations are less than or equal to tol times the 97 | standard deviation of the first component.) With the default null 98 | setting, no components are omitted (unless rank. is specified 99 | less than min(dim(x)).). Other settings for tol could be 100 | tol = 0 or tol = sqrt(.Machine$double.eps), which 101 | would omit essentially constant components.

102 |
rank. 105 |

optionally, a number specifying the maximal rank, i.e., 106 | maximal number of principal components to be used. Can be set as 107 | alternative or in addition to tol, useful notably when the 108 | desired rank is considerably smaller than the dimensions of the matrix.

109 |
object 112 |

object of class inheriting from "prcomp"

113 |
newdata 116 |

An optional data frame or matrix in which to look for 117 | variables with which to predict. If omitted, the scores are used. 118 | If the original fit used a formula or a data frame or a matrix with 119 | column names, newdata must contain columns with the same 120 | names. Otherwise it must contain the same number of columns, to be 121 | used in the same order. 122 |

123 |
125 | 126 | 127 |

Details

128 | 129 |

The calculation is done by a singular value decomposition of the 130 | (centered and possibly scaled) data matrix, not by using 131 | eigen on the covariance matrix. This 132 | is generally the preferred method for numerical accuracy. The 133 | print method for these objects prints the results in a nice 134 | format and the plot method produces a scree plot. 135 |

136 |

Unlike princomp, variances are computed with the usual 137 | divisor N - 1. 138 |

139 |

Note that scale = TRUE cannot be used if there are zero or 140 | constant (for center = TRUE) variables. 141 |

142 | 143 | 144 |

Value

145 | 146 |

prcomp returns a list with class "prcomp" 147 | containing the following components: 148 |

149 | 150 | 151 | 157 | 158 | 163 | 164 | 172 | 173 | 176 |
sdev 152 |

the standard deviations of the principal components 153 | (i.e., the square roots of the eigenvalues of the 154 | covariance/correlation matrix, though the calculation 155 | is actually done with the singular values of the data matrix).

156 |
rotation 159 |

the matrix of variable loadings (i.e., a matrix 160 | whose columns contain the eigenvectors). The function 161 | princomp returns this in the element loadings.

162 |
x 165 |

if retx is true the value of the rotated data (the 166 | centred (and scaled if requested) data multiplied by the 167 | rotation matrix) is returned. Hence, cov(x) is the 168 | diagonal matrix diag(sdev^2). For the formula method, 169 | napredict() is applied to handle the treatment of values 170 | omitted by the na.action.

171 |
center, scale 174 |

the centering and scaling used, or FALSE.

175 |
177 | 178 | 179 |

Note

180 | 181 |

The signs of the columns of the rotation matrix are arbitrary, and 182 | so may differ between different programs for PCA, and even between 183 | different builds of R. 184 |

185 | 186 | 187 |

References

188 | 189 |

Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) 190 | The New S Language. 191 | Wadsworth & Brooks/Cole. 192 |

193 |

Mardia, K. V., J. T. Kent, and J. M. Bibby (1979) 194 | Multivariate Analysis, London: Academic Press. 195 |

196 |

Venables, W. N. and B. D. Ripley (2002) 197 | Modern Applied Statistics with S, Springer-Verlag. 198 |

199 | 200 | 201 |

See Also

202 | 203 |

biplot.prcomp, screeplot, 204 | princomp, cor, cov, 205 | svd, eigen. 206 |

207 | 208 | 209 |

Examples

210 | 211 |
212 | C <- chol(S <- toeplitz(.9 ^ (0:31))) # Cov.matrix and its root
213 | all.equal(S, crossprod(C))
214 | set.seed(17)
215 | X <- matrix(rnorm(32000), 1000, 32)
216 | Z <- X %*% C  ## ==>  cov(Z) ~=  C'C = S
217 | all.equal(cov(Z), S, tol = 0.08)
218 | pZ <- prcomp(Z, tol = 0.1)
219 | summary(pZ) # only ~14 PCs (out of 32)
220 | ## or choose only 3 PCs more directly:
221 | pz3 <- prcomp(Z, rank. = 3)
222 | summary(pz3) # same numbers as the first 3 above
223 | stopifnot(ncol(pZ$rotation) == 14, ncol(pz3$rotation) == 3,
224 |           all.equal(pz3$sdev, pZ$sdev, tol = 1e-15)) # exactly equal typically
225 | 
226 | ## signs are random
227 | require(graphics)
228 | ## the variances of the variables in the
229 | ## USArrests data vary by orders of magnitude, so scaling is appropriate
230 | prcomp(USArrests)  # inappropriate
231 | prcomp(USArrests, scale = TRUE)
232 | prcomp(~ Murder + Assault + Rape, data = USArrests, scale = TRUE)
233 | plot(prcomp(USArrests))
234 | summary(prcomp(USArrests, scale = TRUE))
235 | biplot(prcomp(USArrests, scale = TRUE))
236 | 
237 | 
238 | 239 |
[Package stats version 4.0.2 Index]
240 | 241 | -------------------------------------------------------------------------------- /PC Lab 3/rollcall-members.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 3/rollcall-members.Rdata -------------------------------------------------------------------------------- /PC Lab 3/rollcall-votes.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 3/rollcall-votes.Rdata -------------------------------------------------------------------------------- /PC Lab 3/unsupervised_tutorial.r: -------------------------------------------------------------------------------- 1 | ######################## Load Data ######################## 2 | 3 | ### Load data 4 | load("rollcall-votes.Rdata") 5 | load("rollcall-members.Rdata") 6 | 7 | print('Data loaded.') 8 | 9 | ############################################################## 10 | 11 | print('# Counts of Democrats, Republicans and one special politician') 12 | table(members$party) 13 | 14 | print('# Shares of Democrats, Republicans and one special politician') 15 | round(table(members$party)/nrow(members),3) 16 | 17 | # Count missing votings for each politician and plot the counts 18 | missings <- rowSums(votes[,(1:ncol(votes))]==0) 19 | 20 | # No. politicians who always voted 21 | sum(missings == 0) 22 | 23 | # Shares of missing votings 24 | s_missings <- missings/(ncol(votes)-1) 25 | 26 | # Histogram with 100 bins 27 | hist(???, breaks = 100) 28 | 29 | # Counts - yes and nos 30 | yeas <- rowSums(votes[,(1:ncol(votes))]== ???) 31 | nays <- rowSums(votes[,(1:ncol(votes))]== ???) 
32 | 33 | # Plots - Party 34 | plot(yeas, nays, col = members$party) 35 | legend('topleft', legend = levels(members$party), col = 1:3, pch = 1) 36 | 37 | # PCA 38 | pr.out = prcomp(??? , center = TRUE, scale = TRUE) 39 | 40 | # No of principal components 41 | dim(pr.out$rotation)[2] 42 | 43 | # variance explained by each component 44 | pr.var = pr.out$sdev^2 45 | 46 | # Proportion of variance explained 47 | pve=pr.var/sum(pr.var) 48 | 49 | # Print first 10 PC 50 | pve[1:10] 51 | 52 | # Plot the first 10 PC 53 | barplot(pve[1:10], xlab=" Principal Component ", ylab=" Proportion of Variance Explained ", ylim=c(0,1)) 54 | barplot(cumsum(pve[1:10]), xlab=" Principal Component ", ylab ="Cumulative Proportion of Variance Explained ", ylim=c(0,1)) 55 | 56 | # Plot the first two principal components, color the party membership 57 | plot(pr.out$x[,1], pr.out$x[,2], xlab = "PC1", ylab = "PC2", col = members$party, main = "Top two PC directions") 58 | legend('bottomright', legend = levels(members$party), col = 1:3, pch = 1) 59 | 60 | ## Far right (very conservative) 61 | head(sort(???)) 62 | 63 | ## Far left (very liberal) 64 | head(sort(???, decreasing=???)) 65 | 66 | # PC 2 67 | head(sort(???)) 68 | # No clear pattern based on party and state information 69 | 70 | # Look at the largest loadings in PC2 to discern an interpretation. 71 | loadings <- pr.out$rotation 72 | loadings[order(abs(loadings[,2]), decreasing=TRUE)[1:5],2] 73 | 74 | # Analyze voting behavior 75 | table(votes[,1146]) 76 | table(votes[,658]) 77 | table(votes[,1090]) 78 | 79 | # Either everyone voted "yea" or missed the voting. 80 | # These votes all correspond to near-unanimous symbolic action. 81 | 82 | # Mystery Solved: the second PC is just attendance! 
83 | head(sort(rowSums(votes==0), decreasing=TRUE)) 84 | 85 | set.seed(11122019) 86 | 87 | # K-means clustering with 2 clusters 88 | km.out = kmeans(???, 2, nstart = 20) 89 | km.out$cluster 90 | 91 | # Tabulate party vs cluster 92 | table(members$party, km.out$cluster) 93 | 94 | # How to analyze the optimal number of clusters 95 | 96 | sse <- c() 97 | sse[1] <- Inf 98 | 99 | for (ind_cl in c(2:20)) { 100 | set.seed(3) 101 | km.out = kmeans (votes, ind_cl, nstart = 20) 102 | sse[ind_cl] = km.out$tot.withinss 103 | } 104 | 105 | plot(sse) 106 | # Optimum 4-5 clusters 107 | 108 | # Plot the 5 clusters on the PC components graph 109 | set.seed(3) 110 | km.out = kmeans (???, ???, nstart = 20) 111 | 112 | # Plot the first two principal components color the party membership 113 | plot(pr.out$x[,1], pr.out$x[,2], xlab = "PC1", ylab = "PC2", col = km.out$cluster, main = "Top two PC directions with 5 clusters") 114 | legend('bottomright', legend = c("Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4", "Cluster 5"), col = 1:5, pch = 1) 115 | 116 | # Analyzing how the number of starts work 117 | set.seed (3) 118 | print('With nstart = 1') 119 | km.out = kmeans (votes,6, nstart = ???) 120 | km.out$tot.withinss 121 | 122 | print('With nstart = 20') 123 | km.out =kmeans (votes,6, nstart = ???) 
124 | km.out$tot.withinss 125 | -------------------------------------------------------------------------------- /PC Lab 4/help files/glmnet_package.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 4/help files/glmnet_package.pdf -------------------------------------------------------------------------------- /PC Lab 4/help files/hdm_package.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 4/help files/hdm_package.pdf -------------------------------------------------------------------------------- /PC Lab 4/post_double_selection_tutorial.r: -------------------------------------------------------------------------------- 1 | ############################################################################## 2 | ######################## Load Packages and the Data ######################## 3 | ############################################################################## 4 | 5 | ### Load the packages 6 | library(fBasics) # use for descriptive statistics 7 | library(tidyverse) # use for handling data 8 | library(caret) # use for handling data 9 | library(lmtest) # use for heteroscedasticity robust standard errors 10 | library(sandwich) # use for heteroscedasticity robust standard errors 11 | library(hdm) # use for Lasso and Post-Double-Selection 12 | library(glmnet) # use for lasso and Elastic Net regularized Generalized Linear Models 13 | options(warn=-1) # supress warnings 14 | 15 | print('All packages successfully installed and loaded.') 16 | 17 | ### Load the Data 18 | set.seed(12345678) 19 | df <- read.csv("job_corps.csv",header=TRUE, sep=",") # load data from csv-file 20 | df <- df[sample(c(1:nrow(df)), size=3000, replace =F),] # Select a random subsample of 
3000 observations 21 | print('Data successfully loaded.') 22 | 23 | ############################################################################## 24 | 25 | ############################################################################## 26 | ######################## Descriptive Statistics ############################ 27 | ############################################################################## 28 | 29 | ## Table with Descriptive Statistics 30 | desc <- fBasics::basicStats(df) %>% t() %>% as.data.frame() %>% 31 | select(Mean, Stdev, Minimum, Maximum, nobs) 32 | print(round(desc, digits=2)) 33 | 34 | ############################################################################## 35 | 36 | ######################################################################### 37 | ######################## Univariate OLS Regression ##################### 38 | ######################################################################### 39 | 40 | ## Univariate OLS 41 | ols1 <- lm(EARNY4 ~ participation, data = df) 42 | summary(ols1) 43 | 44 | ## Store results 45 | results <- as.matrix(coef(summary(ols1))[2, c("Estimate", "Std. 
Error", "Pr(>|t|)")]) 46 | 47 | # Prepare matrix to store results 48 | res <- matrix(NA,nrow=3,ncol=5) 49 | colnames(res) <- c("Univariate OLS", "Multivariate OLS1", "Multivariate OLS2", 50 | "Multivariate OLS3", "Multivariate OLS4") 51 | rownames(res) <- rownames(results) 52 | res[,1] <- results 53 | 54 | print(round(res[,1], digits=2)) 55 | 56 | ######################################################################## 57 | 58 | ######################################################################## 59 | ######################## Standardized Differences ##################### 60 | ######################################################################## 61 | 62 | ## Means and standard deviations for the participants (D=1) 63 | desc_1 <- fBasics::basicStats(df[df$participation==1,]) %>% t() %>% as.data.frame() %>% select(Mean, Stdev) 64 | 65 | ## Means and standard deviations for the non-participants (D=0) 66 | desc_0 <- fBasics::basicStats(df[df$participation==0,]) %>% t() %>% as.data.frame() %>% select(Mean, Stdev) 67 | 68 | # Make table and add standardized differences 69 | desc <- cbind(desc_1[-c(1:3),],desc_0[-c(1:3),], 70 | 100*abs(desc_1[-c(1:3),1]-desc_0[-c(1:3),1])/sqrt(0.5*(desc_1[-c(1:3),2]^2+desc_0[-c(1:3),2]^2))) 71 | colnames(desc) <- c("D=1 Mean", "D=1 Std.Dev.", "D=0 Mean", "D=0 Std.Dev.", "Std.Diff.") 72 | print(round(desc, digits=2)) 73 | 74 | ######################################################################## 75 | 76 | ######################################################################### 77 | ######################## Multivariate OLS Regression ################### 78 | ######################################################################### 79 | 80 | ## Multivariate OLS 81 | ols2 <- lm(EARNY4 ~ participation + age_1 + age_3 + livespou + publich, data = df) 82 | summary(ols2) 83 | # Question: Why do we omit age_2? 84 | 85 | ## Store results 86 | results <- as.matrix(coef(summary(ols2))[2, c("Estimate", "Std. 
Error", "Pr(>|t|)")]) 87 | res[,2] <- results 88 | print(round(res[,c(1:2)], digits=2)) 89 | 90 | ## Relative change in the estimated effect 91 | print(paste0("Relative change in the estimated effect: ",round(100*(res[1,2]-res[1,1])/res[1,1], digits=1),"%")) 92 | 93 | ######################################################################## 94 | 95 | ######################################################################### 96 | 97 | ## Multivariate OLS 98 | ols3 <- lm(EARNY4 ~ ???, data = df) 99 | summary(ols3) 100 | 101 | ## Store results 102 | results <- as.matrix(coef(summary(ols3))[2, c("Estimate", "Std. Error", "Pr(>|t|)")]) 103 | res[,3] <- results 104 | print(round(res[,c(1:3)], digits=2)) 105 | 106 | ## Relative change in the estimated effect 107 | print(paste0("Relative change in the estimated effect: ",round(100*(res[1,3]-res[1,2])/res[1,2], digits=1),"%")) 108 | 109 | ######################################################################## 110 | 111 | ############################################################################### 112 | 113 | ## Generate first-order interactions between all control variables 114 | interactions <- t(apply(df[,-c(1,2,3,6,11)], 1, combn, 2, prod)) 115 | colnames(interactions) <- paste("Inter.V", combn(1:ncol(df[,-c(1,2,3,6,11)]), 2, paste, collapse="V"), sep="") 116 | print(paste0("Maximm number of interaction terms: ", ncol(interactions))) 117 | 118 | ## Merge basline characteristics with interaction terms 119 | df_merge <- as.data.frame(cbind(df[,-c(1,2,3,6,11)], interactions)) 120 | 121 | ## Eliminate collinear variables 122 | df2 = cor(df_merge) 123 | df2[is.na(df2)] <- 1 124 | hc = findCorrelation(df2, cutoff=0.8) # putt any value as a "cutoff" 125 | hc = sort(hc) 126 | df_int = cbind(df[,c(1,3)],df_merge[,-c(hc)]) 127 | print(paste0("Total number of control variables: ", ncol(df_int)-2)) 128 | 129 | ############################################################################### 130 | 131 | 
############################################################################### 132 | 133 | ## Multivariate OLS with all baseline characteristics and interaction terms 134 | ols4 <- lm(EARNY4 ~ ., data = df_int) 135 | 136 | ## Store results 137 | results <- as.matrix(coef(summary(ols4))[2, c("Estimate", "Std. Error", "Pr(>|t|)")]) 138 | res[,4] <- results 139 | print(round(res[,c(1:4)], digits=2)) 140 | 141 | ## Relative change in the estimated effect 142 | print(paste0("Relative change in the estimated effect: ",round(100*(res[1,4]-res[1,3])/res[1,3], digits=1),"%")) 143 | 144 | ######################################################################## 145 | 146 | ############################################################################### 147 | 148 | # Set starting value for replicability 149 | set.seed(123456) 150 | 151 | # Specify number of random variables 152 | cols <- 1000 153 | 154 | # Generate random variables 155 | redundant_x <- matrix(rnorm(nrow(df_int)*cols), nrow = nrow(df_int)) # We draw from a random standard normal distribution 156 | colnames(redundant_x) <- paste("Rand.", 1:cols, sep="") 157 | 158 | # Merge random variables with baseline characteritics and interaction terms 159 | df_rand <- as.data.frame(cbind(df_int, redundant_x)) 160 | print(paste0("Total number of control variables: ", ncol(df_rand)-2)) 161 | 162 | ############################################################################### 163 | 164 | ############################################################################### 165 | 166 | ## Multivariate OLS with all baseline characteristics, interaction terms, and random variables 167 | ols5 <- lm(EARNY4 ~ ., data = df_rand) 168 | 169 | ## Store results 170 | results <- as.matrix(coef(summary(ols5))[2, c("Estimate", "Std. 
Error", "Pr(>|t|)")]) 171 | res[,5] <- results 172 | print(round(res, digits=2)) 173 | 174 | ## Relative change in the estimated effect 175 | print(paste0("Relative change in the estimated effect: ",round(100*(res[1,5]-res[1,4])/res[1,4], digits=1),"%")) 176 | 177 | ######################################################################## 178 | 179 | ############################################################################### 180 | ########################### Earnings Equation ################################# 181 | ############################################################################### 182 | 183 | # Predict earnings 184 | N <- nrow(df) 185 | st1 <- rlasso(as.matrix(df[,c(4:ncol(df))]), as.matrix(df$EARNY4), 186 | penalty = list(homoscedastic = FALSE, c= 1.1, gamma = 0.1/log(N))) 187 | summary(st1) 188 | 189 | # Store selected variables 190 | n1<- names(st1$coefficients[(st1$coefficients != 0) == TRUE])[-1] 191 | 192 | ############################################################################### 193 | 194 | ############################################################################### 195 | ######################### Participation Probability ########################### 196 | ############################################################################### 197 | 198 | # Predict participation 199 | N <- nrow(df) 200 | st2 <- rlasso(as.matrix(df[,c(4:ncol(df))]), as.matrix(df$participation), 201 | penalty = list(homoscedastic = FALSE, c= 1.1, gamma = 0.1/log(N))) 202 | summary(st2) 203 | 204 | # Store selected variables 205 | n2<- names(st2$coefficients[(st2$coefficients != 0) == TRUE])[-1] 206 | 207 | ############################################################################### 208 | 209 | ############################################################################### 210 | ################################# Post-Lasso ################################## 211 | ############################################################################### 212 | 213 | # 
Take union of selected covariates 214 | selected_covariates <- c("participation", unique(c(n1, n2))) 215 | 216 | # Setup the formula of the linear regression model 217 | sumx <- paste(selected_covariates, collapse = " + ") 218 | linear <- paste("EARNY4",paste(sumx, sep=" + "), sep=" ~ ") 219 | linear <- as.formula(linear) 220 | 221 | # Post-Lasso regression 222 | ols <- lm(linear, data = df) 223 | summary(ols) 224 | 225 | # Heteroskedasticity robust standard errors 226 | #coeftest(ols, vcov = vcovHC(ols, type = "HC1")) 227 | 228 | ############################################################################### 229 | 230 | ############################################################################### 231 | ################## Estimate the Treatment Effect Directly ##################### 232 | ############################################################################### 233 | 234 | # Post-Double-Selection Procedure 235 | dsp <- rlassoEffect(as.matrix(df[,c(4:ncol(df))]), as.matrix(df$EARNY4) 236 | , as.matrix(df$participation), model = TRUE, penalty = list(homoscedastic = FALSE), method = "double selection") 237 | summary(dsp) 238 | 239 | ############################################################################### 240 | # Earning Equation 241 | ############################################################################### 242 | 243 | # Predict earnings 244 | 245 | # Store selected variables 246 | 247 | ############################################################################### 248 | # Participation Probability 249 | ############################################################################### 250 | 251 | # Predict participation 252 | 253 | # Store selected variables 254 | 255 | ############################################################################### 256 | # Post-Lasso Model 257 | ############################################################################### 258 | 259 | # Take union of selected covariates 260 | selected_covariates <- 
c("participation", unique(c(n1, n2))) 261 | 262 | # Setup the formula of the linear regression model 263 | sumx <- paste(selected_covariates, collapse = " + ") 264 | linear <- paste("EARNY4",paste(sumx, sep=" + "), sep=" ~ ") 265 | linear <- as.formula(linear) 266 | 267 | # Post-Lasso OLS regression 268 | ols <- lm(linear, data = df_rand) 269 | summary(ols) 270 | 271 | ############################################################################### 272 | 273 | #################################################################### 274 | ################# Cross-Validated Lasso ############################ 275 | #################################################################### 276 | 277 | set.seed(123456789) # Starting value 278 | 279 | # Cross-validated Lasso in earnings equation 280 | lasso_earn <- cv.glmnet(as.matrix(df_int[,c(3:ncol(df_int))]), as.matrix(df$EARNY4), 281 | alpha=1, nfolds = 10, type.measure = 'mse', standardize = TRUE) 282 | # alpha =1 is Lasso, alpha = 0 is Ridgde 283 | # nfolds - number of cross-validation folds 284 | # type.measure - measure for model accuracy 285 | 286 | plot(lasso_earn) 287 | 288 | #################################################################### 289 | 290 | #################################################################### 291 | 292 | # Plot Lasso coefficients 293 | coef(lasso_earn,s = lasso_earn$lambda.1se) 294 | # $lambda.min - Lambda that minimizes cross-validated MSE 295 | # $lambda.1se - Lambda of 1 standard error rule 296 | 297 | #################################################################### 298 | 299 | #################################################################### 300 | 301 | # Select covariates with non-zero coefficients 302 | coef <- predict(lasso_earn,s = lasso_earn$lambda.min, type = "nonzero") # 303 | colnames <- colnames(df_int[,c(3:ncol(df_int))]) 304 | n1 <- colnames[unlist(coef)] 305 | print(paste0("Number of Selected Variables Earnings Equation: ",length(n1))) 306 | print("Selected 
Variables:") 307 | print(n1) 308 | 309 | #################################################################### 310 | 311 | #################################################################### 312 | 313 | set.seed(123456789) # Starting value 314 | 315 | # Cross-validated Lasso in participation equation 316 | lasso_part <- cv.glmnet(???, 317 | alpha=1, nfolds = 10, type.measure = 'mse', standardize = TRUE) 318 | plot(lasso_part) 319 | 320 | #################################################################### 321 | 322 | #################################################################### 323 | 324 | # Select covariates with non-zero coefficients 325 | coef <- predict(???,s = ???, type = "nonzero") # 326 | colnames <- colnames(df_int[,c(3:ncol(df_int))]) 327 | print(paste0("Number of Selected Variables Participation Equation: ",length(n2))) 328 | print("Selected Variables:") 329 | print(n2) 330 | 331 | #################################################################### 332 | 333 | ############################################################################### 334 | # Post-Lasso Model 335 | ############################################################################### 336 | 337 | # Take union of selected covariates 338 | selected_covariates <- c(???) 
339 | 340 | # Setup the formula of the linear regression model 341 | sumx <- paste(selected_covariates, collapse = " + ") 342 | linear <- paste("EARNY4",paste(sumx, sep=" + "), sep=" ~ ") 343 | linear <- as.formula(linear) 344 | 345 | # Post-Lasso OLS regression 346 | ols <- lm(linear, data = df_int) 347 | summary(ols) 348 | 349 | ############################################################################### 350 | -------------------------------------------------------------------------------- /PC Lab 5/double_machine_learning_tutorial.r: -------------------------------------------------------------------------------- 1 | ############################################################################## 2 | ######################## Load Packages and the Data ######################## 3 | ############################################################################## 4 | 5 | ### Load the packages 6 | library(fBasics) # use for descriptive statistics 7 | library(tidyverse) # use for handling data 8 | library(DiagrammeR) # use for plotting trees 9 | library(lmtest) # use for heteroscedasticity robust standard errors 10 | library(sandwich) # use for heteroscedasticity robust standard errors 11 | library(grf) # use for generalized random forest 12 | library(glmnet) # use for lasso and Elastic Net regularized Generalized Linear Models 13 | options(warn=-1) # supress warnings 14 | 15 | print('All packages successfully installed and loaded.') 16 | 17 | ### Load the Data 18 | set.seed(12345678) 19 | df <- read.csv("job_corps.csv",header=TRUE, sep=",") # load data from csv-file 20 | df <- df[sample(c(1:nrow(df)), size=3000, replace =F),] # Select a random subsample of 3000 observations 21 | print('Data successfully loaded.') 22 | 23 | ############################################################################## 24 | 25 | ############################################################################## 26 | ######################## Descriptive Statistics ############################ 
27 | ############################################################################## 28 | 29 | ## Table with Descriptive Statistics 30 | desc <- fBasics::basicStats(df) %>% t() %>% as.data.frame() %>% 31 | select(Mean, Stdev, Minimum, Maximum, nobs) 32 | print(round(desc, digits=2)) 33 | 34 | ############################################################################## 35 | 36 | ############################################################################### 37 | ######################### Sample Splitting #################################### 38 | ############################################################################### 39 | 40 | # Set starting value 41 | set.seed(123456789) 42 | 43 | # Partition Samples for Cross-Fitting 44 | df_part <- modelr::resample_partition(df, c(obs_A = 0.5, obs_B = 0.5)) # Split sample in strata of equal size 45 | df_obs_A <- as.data.frame(df_part$obs_A) # Sample A 46 | df_obs_B <- as.data.frame(df_part$obs_B) # Sample B 47 | 48 | ## Generate Variables 49 | # Outcome variable 50 | earnings_obs_A <- as.matrix(df_obs_A[,1]) 51 | earnings_obs_B <- as.matrix(df_obs_B[,1]) 52 | 53 | # Treatment variable 54 | treat = 3 #Select treatment 2= offer to participate, 3 = actual participation 55 | treat_obs_A <- as.matrix(df_obs_A[,treat]) 56 | treat_obs_B <- as.matrix(df_obs_B[,treat]) 57 | 58 | # Covariates 59 | covariates_obs_A <- as.matrix(df_obs_A[,c(4:ncol(df_obs_A))]) 60 | covariates_obs_B <- as.matrix(df_obs_B[,c(4:ncol(df_obs_B))]) 61 | 62 | print('Sample partitioning ready.') 63 | 64 | ############################################################################## 65 | 66 | ############################################################################### 67 | ########### Conditional Potential Earnings under Non-Participation ############ 68 | ############################################################################### 69 | 70 | p = 1 # 1 for LASSO, 0 for Ridge 71 | 72 | # Set starting value 73 | set.seed(123456789) 74 | 75 | # Estimate 
Lasso among non-participants in Sample A 76 | # Use cross-validation to select optimal lambda value 77 | lasso_y0_A <- cv.glmnet(covariates_obs_A[treat_obs_A==0,], earnings_obs_A[treat_obs_A==0,], 78 | alpha=p, type.measure = 'mse') 79 | # Plot the cross-validated MSE 80 | plot(lasso_y0_A) 81 | 82 | # Extrapolate the fitted values to Sample B 83 | y0hat_B <- predict(lasso_y0_A, newx = covariates_obs_B, type = 'response', s = lasso_y0_A$lambda.min) 84 | 85 | # Estimate Lasso among non-participants in Sample B 86 | lasso_y0_B <- cv.glmnet(covariates_obs_B[treat_obs_B==0,], earnings_obs_B[treat_obs_B==0,], 87 | alpha=p, type.measure = 'mse') 88 | # Plot the cross-validated MSE 89 | plot(lasso_y0_B) 90 | 91 | # Extrapolate the fitted values to Sample A 92 | y0hat_A <- predict(lasso_y0_B, newx = covariates_obs_A, type = 'response', s= lasso_y0_B$lambda.min) 93 | 94 | # Merge fitted values of both samples 95 | y0hat <- rbind(y0hat_A,y0hat_B) 96 | 97 | ################################################################################# 98 | 99 | ############################################################################### 100 | ########### Conditional Potential Earnings under Participation ############ 101 | ############################################################################### 102 | 103 | p = 1 # 1 for LASSO, 0 for Ridge 104 | 105 | # Set starting value 106 | set.seed(123456789) 107 | 108 | # Estimate Lasso among participants in Sample A 109 | # Use cross-validation to select optimal lambda value 110 | lasso_y1_A <- cv.glmnet(covariates_obs_A[treat_obs_A==1,], earnings_obs_A[treat_obs_A==1,], 111 | alpha=p, type.measure = 'mse') 112 | plot(lasso_y1_A) 113 | 114 | # Extrapolate the fitted values to Sample B 115 | y1hat_B <- predict(lasso_y1_A, newx = covariates_obs_B, type = 'response', s = lasso_y1_A$lambda.min) 116 | 117 | # Estimate Lasso among participants in Sample B 118 | lasso_y1_B <- cv.glmnet(covariates_obs_B[treat_obs_B==1,], 
earnings_obs_B[treat_obs_B==1,], 119 | alpha=p, type.measure = 'mse') 120 | plot(lasso_y1_B) 121 | 122 | # Extrapolate the fitted values to Sample A 123 | y1hat_A <- predict(lasso_y1_B, newx = covariates_obs_A, type = 'response', s= lasso_y1_B$lambda.min) 124 | 125 | # Merge the fitted values of both samples 126 | y1hat <- rbind(y1hat_A,y1hat_B) 127 | 128 | ################################################################################# 129 | 130 | ############################################################################### 131 | ########################### Propensity Score ################################## 132 | ############################################################################### 133 | 134 | # Propensity Score 135 | p = 1 # 1 for LASSO, 0 for Ridge 136 | 137 | # Set starting value 138 | set.seed(123456789) 139 | 140 | # Estimate Logit-Lasso in Sample A 141 | # Use cross-validation to select optimal lambda value 142 | lasso_p_A <- cv.glmnet(covariates_obs_A, treat_obs_A, alpha=p, type.measure = 'mse', family="binomial") 143 | plot(lasso_p_A) 144 | 145 | # Extrapolate the fitted values to Sample B 146 | pscore_B <- predict(lasso_p_A, newx = covariates_obs_B, type = 'response', s= lasso_p_A$lambda.min) 147 | 148 | # Estimate Logit-Lasso in Sample B 149 | lasso_p_B <- cv.glmnet(covariates_obs_B, treat_obs_B, alpha=p, type.measure = 'mse', family="binomial") 150 | plot(lasso_p_B) 151 | 152 | # Extrapolate the fitted values to Sample A 153 | pscore_A <- predict(lasso_p_B, newx = covariates_obs_A, type = 'response', s= lasso_p_B$lambda.min) 154 | 155 | # Merge the fitted values of both samples 156 | pscore <- rbind(pscore_A,pscore_B) 157 | 158 | ############################################################################### 159 | 160 | ############################################################################### 161 | ################################### ATE Score ################################# 162 | 
############################################################################### 163 | 164 | # Merge earnings outcome of Sample A and B 165 | earnings_obs <- rbind(earnings_obs_A,earnings_obs_B) 166 | 167 | # Merge treatmente of Sample A and B 168 | treat_obs <- rbind(treat_obs_A,treat_obs_B) 169 | 170 | # Calculate the ATE score using the formula described above 171 | Y_ate_star = invisible(???) 172 | 173 | # Calculate ATE 174 | # It is the sample average of the ATE score 175 | ate <- round(mean(Y_ate_star), digits = 2) 176 | 177 | # Calculate the standard errors of the ATE 178 | # Square root of the quotient of variance of the ATE score and the sample size 179 | se_ate <- round(sqrt(var(Y_ate_star)/length(Y_ate_star)), digits = 2) 180 | 181 | 182 | print(paste0("Average Treatment Effect (ATE): ", ate)) 183 | print(paste0("Standard Error for ATE: ", se_ate)) 184 | 185 | ############################################################################### 186 | 187 | ############################################################################### 188 | ################################## ATET Score ################################# 189 | ############################################################################### 190 | 191 | ## Unconditional Treatment probability 192 | p = mean(pscore) 193 | 194 | # Calculate the ATET score using the formula described above 195 | Y_atet_star = invisible(???) 
196 | 197 | # Calculate ATET 198 | # It is the sample average of the ATET score 199 | atet <- round(mean(Y_atet_star), digits = 2) 200 | 201 | # Calculate the standard errors of the ATET 202 | # Square root of the quotient of variance of the ATET score and the sample size 203 | se_atet <- round(sqrt(var(Y_atet_star)/length(Y_atet_star)), digits = 2) 204 | 205 | print(paste0("Average Treatment Effect for Treated (ATET): ", atet)) 206 | print(paste0("Standard Error for ATET: ", se_atet)) 207 | 208 | ############################################################################### 209 | 210 | ############################################################################### 211 | ##################################### CATEs ################################### 212 | ############################################################################### 213 | 214 | # Merge covariates of Sample A and B 215 | covariates_obs <- rbind(covariates_obs_A,covariates_obs_B) 216 | 217 | # Generate a new data frame 218 | # Merge the ATE score and the covariates 219 | colnames(Y_ate_star) <- "y_star" 220 | Y_star <- as.data.frame(cbind(Y_ate_star,covariates_obs[,-c(3,8)])) 221 | 222 | # Estimate an OLS regression 223 | # Regress the ATE score on the covariates 224 | cates <- lm(y_star ~., Y_star) 225 | 226 | # Heteroskedasticity robust standard errors 227 | coeftest(cates, vcov = vcovHC(cates, type = "HC1")) 228 | 229 | ############################################################################### 230 | 231 | ############################################################################### 232 | 233 | # Calculate the predicted effect size for each observation 234 | fit <- predict(cates) 235 | 236 | # Count the observations with positive and negative effects 237 | print(paste0("Number of individuals with positive effects: ", length(fit[fit>=0]))) 238 | print(paste0("Number of individuals with negative effects: ", length(fit[fit<0]))) 239 | 240 | 
############################################################################### 241 | 242 | ############################################################################### 243 | ################ Plot Cumulative Distribution of CATEs ######################## 244 | ############################################################################### 245 | 246 | plot(ecdf(fit), col="blue", xlim = c(-100,150), xlab="Effect Size (in Dollars)", 247 | ylab="Cumulative Distribution", main="Cumulative Distibution of the CATEs") 248 | abline(v=0, col="red") 249 | 250 | ############################################################################### 251 | 252 | ############################################################################### 253 | ######################## Description of CATEs ################################# 254 | ############################################################################### 255 | 256 | ## Means and standard deviations for individuals with positive effects 257 | desc_1 <- fBasics::basicStats(Y_star[fit >= 0,-1]) %>% t() %>% as.data.frame() %>% select(Mean, Stdev) 258 | 259 | ## Means and standard deviations for individuals with negative effects 260 | desc_0 <- fBasics::basicStats(Y_star[fit < 0,-1]) %>% t() %>% as.data.frame() %>% select(Mean, Stdev) 261 | 262 | # Make table and add standardized differences 263 | desc <- cbind(desc_1,desc_0, 264 | 100*abs(desc_1[,1]-desc_0[,1])/sqrt(0.5*(desc_1[,2]^2+desc_0[,2]^2))) 265 | colnames(desc) <- c("Mean (Pos.)", "Std.Dev. (Pos.)", "Mean (Neg.)", "Std.Dev. 
(Neg.)", "Std.Diff.") 266 | print(round(desc, digits=2)) 267 | 268 | ############################################################################### 269 | 270 | ############################################################################### 271 | ########### Conditional Potential Earnings under Non-Participation ############ 272 | ############################################################################### 273 | 274 | # Set starting value 275 | set.seed(123456789) 276 | 277 | # Tuning parameters for forest 278 | trees = 1000 # number of trees in the forest 279 | frac = 0.5 # share of subsample used for each tree 280 | cov = floor(1/2*ncol(covariates_obs)) # number of covariates used for each tree 281 | min = 10 # minimum sample size in the terminal leaves of the trees 282 | 283 | # Estimate Random Forest among non-participants in Sample A 284 | forest_y0_A <- regression_forest(covariates_obs_A[treat_obs_A==0,], earnings_obs_A[treat_obs_A==0,], 285 | num.trees = trees, sample.fraction = frac, mtry = cov, min.node.size = min) 286 | 287 | # Extrapolate the fitted values to Sample B 288 | y0hat_B <- as.matrix(predict(forest_y0_A, newdata = covariates_obs_B)$predictions) 289 | 290 | print("Random Forest for Sample A estimated.") 291 | 292 | ################################################################################# 293 | 294 | ################################################################################# 295 | 296 | # Plot one tree from the random forest 297 | plot(tree <- get_tree(forest_y0_A, 1)) 298 | # the last number is the tree number 299 | # it can be varied from 1 to 1000 300 | 301 | ################################################################################# 302 | 303 | ################################################################################# 304 | 305 | # Count the splitting frequencies for each covariate 306 | split <- split_frequencies(forest_y0_A, max.depth = 4) 307 | # max.depth specifies the maximum tree depth we consider 308 | 
309 | # Label the results 310 | colnames(split) <- colnames(covariates_obs) 311 | rownames(split) <- c("Depth 1", "Depth 2", "Depth 3", "Depth 4") 312 | 313 | print(t(split)) 314 | 315 | ################################################################################# 316 | 317 | ################################################################################# 318 | 319 | # Estimate Random Forest among non-participants in Sample B 320 | forest_y0_B <- regression_forest(???) 321 | 322 | # Extrapolate the fitted values to Sample A 323 | y0hat_A <- as.matrix(predict(forest_y0_B, newdata = covariates_obs_A)$predictions) 324 | 325 | # Merge fitted values of both samples 326 | y0hat <- rbind(y0hat_A,y0hat_B) 327 | 328 | print("Random Forest for Sample B estimated.") 329 | 330 | ################################################################################# 331 | 332 | ############################################################################### 333 | ########################### Propensity Score ################################## 334 | ############################################################################### 335 | 336 | # Set starting value 337 | set.seed(123456789) 338 | 339 | # Tuning parameters for forest 340 | trees = 1000 341 | frac = 0.5 342 | cov = floor(1/2*ncol(covariates_obs)) 343 | min = 10 344 | 345 | # Estimate Random Forest in Sample A 346 | forest_p_A <- regression_forest(covariates_obs_A, treat_obs_A, 347 | num.trees = trees, sample.fraction = frac, mtry = cov, min.node.size = min) 348 | 349 | # Extrapolate the fitted values to Sample B 350 | pscore_B <- as.matrix(predict(forest_p_A, newdata = covariates_obs_B)$predictions) 351 | 352 | ############## 353 | 354 | # Estimate Random Forest in Sample B 355 | forest_p_B <- regression_forest(covariates_obs_B, treat_obs_B, 356 | num.trees = trees, sample.fraction = frac, mtry = cov, min.node.size = min) 357 | 358 | # Extrapolate the fitted values to Sample A 359 | pscore_A <- 
as.matrix(predict(forest_p_B, newdata = covariates_obs_A)$predictions) 360 | 361 | # Merge the fitted values of both samples 362 | pscore <- rbind(pscore_A,pscore_B) 363 | 364 | print("Propensity score is estimated.") 365 | 366 | ############################################################################### 367 | 368 | ############################################################################### 369 | ################################## ATET Score ################################# 370 | ############################################################################### 371 | 372 | ## Unconditional Treatment probability 373 | p = mean(pscore) 374 | 375 | # Calculate the ATET score using the formula described above 376 | Y_atet_star = invisible(treat_obs*(earnings_obs - y0hat)/p 377 | - (1-treat_obs)*pscore*(earnings_obs - y0hat)/(p*(1-pscore))) 378 | 379 | # Calculate ATET 380 | # It is the sample average of the ATET score 381 | atet <- round(mean(Y_atet_star), digits = 2) 382 | 383 | # Calculate the standard errors of the ATET 384 | # Square root of the quotient of variance of the ATET score and the sample size 385 | se_atet <- round(sqrt(var(Y_atet_star)/length(Y_atet_star)), digits = 2) 386 | 387 | print(paste0("Average Treatment Effect for Treated (ATET): ", atet)) 388 | print(paste0("Standard Error for ATET: ", se_atet)) 389 | 390 | ############################################################################### 391 | -------------------------------------------------------------------------------- /PC Lab 5/help files/glmnet_package.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 5/help files/glmnet_package.pdf -------------------------------------------------------------------------------- /PC Lab 5/help files/grf_package.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 5/help files/grf_package.pdf
--------------------------------------------------------------------------------
/PC Lab 6/causal_forest.r:
--------------------------------------------------------------------------------
######################## Load Packages ########################

# List of required packages
pkgs <- c('fBasics', 'corrplot', 'tidyverse', 'grf', 'plotmo')

# Load packages
for(pkg in pkgs){
    library(pkg, character.only = TRUE)
}
options(warn=-1) # suppress warnings

print('All packages successfully installed and loaded.')

##################################################################

######################## Load Data Frame ########################

# Load data frame
df <- read.csv("fundraising.csv",header=TRUE, sep=",")

# Outcome Variable
outcome <- c("char_giving")

# Treatment Variables
treatment <- c("treat")

# Covariates/Features
covariates <- c("amount_pre", "amount_lastpre", "amount_maxpre", "H_number_yearbefore", "H_ngifts",
                "H_littleask", "H_bigask", "H_nyears", "H_frequency", "H_medinc", "H_medinc_mdum",
                "H_Avg_years_ed", "H_Avg_years_ed_mdum")


all_variables <- c(outcome, treatment, covariates)

print('Data frame successfully loaded and sample selected.')

####################################################################

######################## Table with Descriptive Statistics ########################

desc <- fBasics::basicStats(df) %>% t() %>% as.data.frame() %>%
  select(Mean, Stdev, Minimum, Maximum, nobs)
print(round(desc, digits=2))

#####################################################################################

######################## Correlation Matrix ########################

# Columns 1-2 are outcome and treatment; correlate the covariates only
corr = cor(df[,-c(1:2)])
corrplot(corr, type = "upper", tl.col = "black")

######################################################################

######################## Partition the Samples ########################
set.seed(100239) # set starting value for random number generator

# Partition Hold-Out-Sample
df_part <- modelr::resample_partition(df, c(obs = 0.8, hold_out = 0.2))
df_obs <- as.data.frame(df_part$obs) # Training and estimation sample
df_hold_out <- as.data.frame(df_part$hold_out) # Hold-out-sample

print('Samples are partitioned.')

######################## Generate Variables ########################

# Outcome
giving_hold_out <- as.matrix(df_hold_out[,1])
giving_obs <- as.matrix(df_obs[,1])

# Treatment
treat_hold_out <- as.matrix(df_hold_out[,2])
treat_obs <- as.matrix(df_obs[,2])

# Covariates
covariates_hold_out <- as.matrix(df_hold_out[,c(3:ncol(df_hold_out))])
covariates_obs <- as.matrix(df_obs[,c(3:ncol(df_obs))])

print('The data is now ready for your analysis!')

#######################################################################

######################## Causal Forest ########################
set.seed(100244)

# Tuning parameters
min_tree = 100 # Minimum size of terminal leaves
num_trees = 1000 # Number of trees in forest
cov_frac = 1/2 # Fraction of covariates in each tree
sample_part= 0.5 # Fraction of sample used for each tree (subsampling)

# Causal Forest
cates <- causal_forest(covariates_obs, giving_obs, treat_obs,
                       sample.fraction = sample_part, mtry = floor(cov_frac*ncol(covariates_obs)),
                       num.trees = num_trees, min.node.size = min_tree,
                       honesty = TRUE, honesty.fraction = 0.5)

print('Forest is ready!')

###################################################################

#################################################################################

# Plot one tree from the random forest
plot(tree <- get_tree(cates, 1))
# the last number is the tree number
# it can be varied from 1 to 1000

#################################################################################

#################################################################################

# Count the splitting frequencies for each covariate
split <- split_frequencies(cates, max.depth = 4)
# max.depth specifies the maximum tree depth we consider

# Label the results
colnames(split) <- colnames(covariates_obs)
rownames(split) <- c("Depth 1", "Depth 2", "Depth 3", "Depth 4")

print(t(split))

#################################################################################

######################### ATE ###############################

average_treatment_effect(cates, target.sample = c("all"))

#############################################################

###############################################################################

# Calculate the predicted effect size for each observation
fit <- predict(cates, covariates_hold_out, estimate.variance = FALSE)$predictions

# Count the observations with positive and negative effects
print(paste0("Number of individuals with positive effects: ", length(fit[fit>=0])))
print(paste0("Number of individuals with negative effects: ", length(fit[fit<0])))

print(paste0("Share of individuals with positive effects: ", round(100*length(fit[fit>=0])/length(fit),digits=1), "%"))

###############################################################################

###############################################################################
################ Plot Cumulative Distribution of CATEs ########################
###############################################################################

plot(ecdf(fit), col="blue", xlim = c(-25,25), xlab="Effect Size (in Dollars)",
     ylab="Cumulative Distribution", main="Cumulative Distribution of the CATEs")
abline(v=0, col="red")

###############################################################################

###############################################################################
######################## Description of CATEs #################################
###############################################################################

## Means and standard deviations for individuals with positive effects
desc_1 <- fBasics::basicStats(covariates_hold_out[fit >= 0,]) %>% t() %>% as.data.frame() %>% select(Mean, Stdev)

## Means and standard deviations for individuals with negative effects
desc_0 <- fBasics::basicStats(covariates_hold_out[fit < 0,]) %>% t() %>% as.data.frame() %>% select(Mean, Stdev)

# Make table and add standardized differences
desc <- cbind(desc_1,desc_0,
              100*abs(desc_1[,1]-desc_0[,1])/sqrt(0.5*(desc_1[,2]^2+desc_0[,2]^2)))
colnames(desc) <- c("Mean (Pos.)", "Std.Dev. (Pos.)", "Mean (Neg.)", "Std.Dev. (Neg.)", "Std.Diff.")
print(round(desc, digits=2))

###############################################################################


--------------------------------------------------------------------------------
/PC Lab 6/help files/grf_package.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 6/help files/grf_package.pdf
--------------------------------------------------------------------------------
/PC Lab 7/help files/grf_package.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 7/help files/grf_package.pdf
--------------------------------------------------------------------------------
/PC Lab 7/help files/rpart_package.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 7/help files/rpart_package.pdf
--------------------------------------------------------------------------------
/PC Lab 7/optimal_policy_learning.r:
--------------------------------------------------------------------------------
######################## Load Packages ########################

# List of required packages
pkgs <- c('fBasics', 'corrplot', 'rpart', 'rpart.plot', 'tidyverse', 'grf', 'caret')

# Load packages
for(pkg in pkgs){
    library(pkg, character.only = TRUE)
}
options(warn=-1) # suppress warnings

print('All packages successfully installed and loaded.')

###################################################################

######################## Load Data Frame ########################

# Load data frame
df
<- read.csv("fundraising.csv",header=TRUE, sep=",")

# Outcome Variable
outcome <- c("char_giving")

# Treatment Variables
treatment <- c("treat")

# Covariates/Features
covariates <- c("amount_pre", "amount_lastpre", "amount_maxpre", "H_number_yearbefore", "H_ngifts",
                "H_littleask", "H_bigask", "H_nyears", "H_frequency", "H_medinc", "H_medinc_mdum",
                "H_Avg_years_ed", "H_Avg_years_ed_mdum")

all_variables <- c(outcome, treatment, covariates)

print('Data frame successfully loaded and sample selected.')

######################################################################

######################## Table with Descriptive Statistics ########################

desc <- fBasics::basicStats(df) %>% t() %>% as.data.frame() %>%
  select(Mean, Stdev, Minimum, Maximum, nobs)
print(round(desc, digits=2))

#####################################################################################

######################## Correlation Matrix ########################

corr = cor(df[,-c(1:2)])
corrplot(corr, type = "upper", tl.col = "black")

######################################################################

######################## Partition the Samples ########################
set.seed(100233) # set starting value for random number generator

# Partition Hold-Out-Sample
df_part <- modelr::resample_partition(df, c(obs = 0.8, hold_out = 0.2))
df_obs <- as.data.frame(df_part$obs) # Training and estimation sample
df_hold_out <- as.data.frame(df_part$hold_out) # Hold-out-sample

# Partition Samples for Cross-Fitting
df_part <- modelr::resample_partition(df_obs, c(obs_A = 0.5, obs_B = 0.5))
df_obs_A <- as.data.frame(df_part$obs_A) # Sample A
df_obs_B <- as.data.frame(df_part$obs_B) # Sample B

print('Samples are partitioned.')

######################## Generate Variables ########################

# Outcome
giving_hold_out <- as.matrix(df_hold_out[,1])
giving_obs <- as.matrix(df_obs[,1])
giving_obs_A <- as.matrix(df_obs_A[,1])
giving_obs_B <- as.matrix(df_obs_B[,1])

# Treatment
treat_hold_out <- as.matrix(df_hold_out[,2])
treat_obs <- as.matrix(df_obs[,2])
treat_obs_A <- as.matrix(df_obs_A[,2])
treat_obs_B <- as.matrix(df_obs_B[,2])

# Covariates
covariates_hold_out <- as.matrix(df_hold_out[,c(3:ncol(df_hold_out))])
covariates_obs <- as.matrix(df_obs[,c(3:ncol(df_obs))])
covariates_obs_A <- as.matrix(df_obs_A[,c(3:ncol(df_obs_A))])
covariates_obs_B <- as.matrix(df_obs_B[,c(3:ncol(df_obs_B))])

######################## Standardise Covariates ########################

# Scaling parameters are estimated on the full estimation sample and then
# applied to all subsamples so they share one common standardisation
preProcValues <- preProcess(covariates_obs, method = c("center", "scale"))
covariates_hold_out <- predict(preProcValues, covariates_hold_out)
covariates_obs <- predict(preProcValues, covariates_obs)
covariates_obs_A <- predict(preProcValues, covariates_obs_A)
covariates_obs_B <- predict(preProcValues, covariates_obs_B)

df_obs <- as.data.frame(cbind(giving_obs,treat_obs,covariates_obs))
df_obs_A <- as.data.frame(cbind(giving_obs_A,treat_obs_A,covariates_obs_A))
df_obs_B <- as.data.frame(cbind(giving_obs_B,treat_obs_B,covariates_obs_B))
covariates_hold_out <- as.data.frame(covariates_hold_out)

print('Covariates are standardised.')
print('The data is now ready for your analysis!')

###########################################################################

######################## Potential Outcomes ########################
set.seed(100243)

# Tuning parameters
min_tree = 20
# Number of trees is set to a very low value in order to increase the computational speed in this tutorial
num_trees = 100 # Use at least 1,000 trees
cov_frac = 1/3
sample_part= 0.5

# Build generalised random forest

# Use Sample A to predict Sample B
# Potential outcome under treatment
f_y1_A <- regression_forest(covariates_obs_A[treat_obs_A == 1,], giving_obs_A[treat_obs_A == 1, ],
                            sample.fraction = sample_part, mtry = floor(cov_frac*ncol(covariates_obs)),
                            num.trees = num_trees, min.node.size = min_tree,
                            honesty = TRUE, honesty.fraction = 0.5)
y1hat_B <- as.matrix(predict(f_y1_A, covariates_obs_B)$predictions)
y1hat_B_hold_out <- as.matrix(predict(f_y1_A, covariates_hold_out)$predictions)

# Potential outcome under non-treatment
f_y0_A <- regression_forest(covariates_obs_A[treat_obs_A == 0,], giving_obs_A[treat_obs_A == 0, ],
                            sample.fraction = sample_part, mtry = floor(cov_frac*ncol(covariates_obs)),
                            num.trees = num_trees, min.node.size = min_tree,
                            honesty = TRUE, honesty.fraction = 0.5)
y0hat_B <- as.matrix(predict(f_y0_A, covariates_obs_B)$predictions)
y0hat_B_hold_out <- as.matrix(predict(f_y0_A, covariates_hold_out)$predictions)

###########################################################################

# Use Sample B to predict Sample A
# Potential outcome under treatment
f_y1_B <- regression_forest(covariates_obs_B[treat_obs_B == 1,], giving_obs_B[treat_obs_B == 1, ],
                            sample.fraction = sample_part, mtry = floor(cov_frac*ncol(covariates_obs)),
                            num.trees = num_trees, min.node.size = min_tree,
                            honesty = TRUE, honesty.fraction = 0.5)
y1hat_A <- as.matrix(predict(f_y1_B, covariates_obs_A)$predictions)
y1hat_A_hold_out <- as.matrix(predict(f_y1_B, covariates_hold_out)$predictions)

# Potential outcome under non-treatment
f_y0_B <- regression_forest(covariates_obs_B[treat_obs_B == 0,], giving_obs_B[treat_obs_B == 0, ],
                            sample.fraction = sample_part, mtry = floor(cov_frac*ncol(covariates_obs)),
                            num.trees = num_trees, min.node.size = min_tree,
                            honesty = TRUE, honesty.fraction = 0.5)
y0hat_A <- as.matrix(predict(f_y0_B, covariates_obs_A)$predictions)
y0hat_A_hold_out <- as.matrix(predict(f_y0_B, covariates_hold_out)$predictions)

###########################################################################

# Merge the fitted values from samples A and B
y1hat <- rbind(y1hat_A,y1hat_B)
y0hat <- rbind(y0hat_A,y0hat_B)

y1hat_hold_out <- (y1hat_A_hold_out+y1hat_B_hold_out)/2
y0hat_hold_out <- (y0hat_A_hold_out+y0hat_B_hold_out)/2

print("Potential outcomes are estimated")

###########################################################################

######################## Propensity Score ########################
set.seed(100242)

# Tuning parameters
min_tree = 20
num_trees = 100 # Use at least 1,000 trees
cov_frac = 1/3
sample_part= 0.5

# Use Sample A to predict Sample B
f_p_A <- regression_forest(covariates_obs_A, treat_obs_A,
                           sample.fraction = sample_part, mtry = floor(cov_frac*ncol(covariates_obs)),
                           num.trees = num_trees, min.node.size = min_tree,
                           honesty = TRUE, honesty.fraction = 0.5)
pscore_B <- as.matrix(predict(f_p_A, covariates_obs_B)$predictions)
pscore_B_hold_out <- as.matrix(predict(f_p_A, covariates_hold_out)$predictions)

# Use Sample B to predict Sample A
f_p_B <- regression_forest(covariates_obs_B, treat_obs_B,
                           sample.fraction = sample_part, mtry = floor(cov_frac*ncol(covariates_obs)),
                           num.trees = num_trees, min.node.size = min_tree,
                           honesty = TRUE, honesty.fraction = 0.5)
pscore_A <- as.matrix(predict(f_p_B, covariates_obs_A)$predictions)
pscore_A_hold_out <- as.matrix(predict(f_p_B, covariates_hold_out)$predictions)

pscore <- rbind(pscore_A,pscore_B)
pscore_hold_out <- (pscore_A_hold_out+pscore_B_hold_out)/2

print("Propensity scores are estimated")

###########################################################################

######################## Average Treatment Effects (ATE) ########################

# Merge samples A and B
giving_obs <- rbind(giving_obs_A,giving_obs_B)
treat_obs <- rbind(treat_obs_A,treat_obs_B)

# Generate Modified Outcome (doubly robust / AIPW score)
Y_star = y1hat - y0hat + treat_obs*(giving_obs - y1hat)/pscore -
  (1-treat_obs)*(giving_obs - y0hat)/(1-pscore)

# Average Treatment Effect (ATE)
ATE <- round(mean(Y_star), digits=1)
print(paste0("Average Treatment Effect (ATE): ", ATE))

# Standard error
SD_ATE <- round(sqrt(var(Y_star)/length(Y_star)),digits=1)
print(paste0("Standard Error for ATE: ", SD_ATE))

####################################################################################

######################## Individualised Treatment Rules ########################

set.seed(1234567)

# Define transformed Variables
sign = sign(Y_star)
lambda = abs(Y_star)
Z <- factor(sign, labels = c("Don't", "Treat"))
df_obs <- rbind(df_obs_A,df_obs_B)

# Generate linear formula for tree
sumx <- paste(covariates, collapse = " + ")
linear <- paste("Z",paste(sumx, sep=" + "), sep=" ~ ")
linear <- as.formula(linear)

######################## Build a Shallow Tree ########################

# Tree
tree_1 <- rpart(formula = linear, # Predict sign of treatment
                data = df_obs,
                weights = lambda, # Larger absolute effect -> Higher weight
                method = "class",
                control = rpart.control(cp = 2.00e-10,maxdepth = 3, minbucket=10))

# Plot MSE in CV-Sample
rpart.plot(tree_1,digits=3)

# Predict policy rule to hold-out-sample
pi_tree1_hold_out = as.matrix(predict(tree_1, newdata=covariates_hold_out))

####################################################################################

############################# Build a Deeper Tree #################################

set.seed(1234567)

# Tree
tree_2 <- rpart(formula = linear, # Predict sign of treatment
                data = df_obs,
                weights = lambda, # Larger absolute effect --> Higher weight
                method = "class",
                control = rpart.control(cp = 2.00e-10, minbucket=10))

# Find optimal tree sizes
op.index_2 <- which.min(tree_2$cptable[, "xerror"])
print(paste0("Optimal number of splits: ", tree_2$cptable[op.index_2, "nsplit"]))

# Plot CV-Error
plotcp(tree_2, minline = TRUE)
abline(v = op.index_2, lty = "dashed")

######################## Select the Tree that Minimises CV-MSE ########################

# Get cp-value that corresponds to optimal tree sizes
cp.vals_2 <- tree_2$cptable[op.index_2, "CP"]

# Prune the trees
prune_tree_2 <- prune(tree_2, cp = cp.vals_2)

# Plot pruned tree
rpart.plot(prune_tree_2,digits=3, main = "Pruned Tree")

# Predict policy rule to hold-out-sample
pi_tree2_hold_out = as.matrix(predict(prune_tree_2, newdata=covariates_hold_out))

#########################################################################################

######################## Share of Treated ########################

# Rule based on shallow tree (ITR1)
rule_tree_1 <- as.numeric(pi_tree1_hold_out[,2]> .5)
# Rule based on deeper tree (ITR2)
rule_tree_2 <- as.numeric(pi_tree2_hold_out[,2]> .5)

print('Descriptives of Policy Rules')
desc <- fBasics::basicStats(cbind(rule_tree_1,rule_tree_2)) %>% t() %>% as.data.frame() %>%
  select(Mean, nobs)
print(round(desc, digits=5))

print('Correlation between the Policy Rules')
corr = cor(cbind(rule_tree_1,rule_tree_2))
print(corr)

#####################################################################

######################## Average Giving Under Policy Rule ########################

# Generate Modified Outcome
y_1_hold_out = y1hat_hold_out + treat_hold_out*(giving_hold_out - y1hat_hold_out)/pscore_hold_out
y_0_hold_out = y0hat_hold_out + (1-treat_hold_out)*(giving_hold_out - y0hat_hold_out)/(1-pscore_hold_out)

# Calculate expected average giving under the different policy rules
O_tree_1 <- round(mean(rule_tree_1*y_1_hold_out + (1-rule_tree_1)*y_0_hold_out), digits = 2)
O_tree_2 <- round(mean(rule_tree_2*y_1_hold_out + (1-rule_tree_2)*y_0_hold_out), digits = 2)

print('Average Givings Under')
print(paste0("Shallow Tree: ",O_tree_1))
print(paste0("Pruned Tree: ",O_tree_2))

#####################################################################################

######################## Policy Value Compared to Everybody is Treated ########################

#Modified Outcome
Y_star_hold_out = y_1_hold_out - y_0_hold_out

# Estimate Policy Value
tree_all <- round(mean((rule_tree_2-1)*Y_star_hold_out), digits = 2)
se_tree_all <- round(sqrt(var((rule_tree_2-1)*Y_star_hold_out)/length(Y_star_hold_out)), digits = 2)

print('Total Policy Value Compared to Everybody is Treated')
print(paste0("Average Gain of Pruned Tree: ", tree_all))
print(paste0("Standard Error: ", se_tree_all))

#round(mean(giving_hold_out[treat_hold_out==1,]), digits = 2)
#round(mean((rule_tree_2-1)*Y_star_hold_out)/mean(giving_hold_out[treat_hold_out==1,]), digits = 2)

################################################################################################

######################## Policy Value Compared to Nobody is Treated ########################

# Estimate Policy Value
tree_no <- round(mean(rule_tree_2*Y_star_hold_out), digits = 2)
se_tree_no <- round(sqrt(var(rule_tree_2*Y_star_hold_out)/length(Y_star_hold_out)), digits = 2)

print('Total Policy Value Compared to Nobody is Treated')
print(paste0("Average Gain of Pruned Tree: ", tree_no))
print(paste0("Standard Error: ", se_tree_no))

#round(mean(giving_hold_out[treat_hold_out==0,]), digits = 2)
#round(mean(rule_tree_2*Y_star_hold_out)/mean(giving_hold_out[treat_hold_out==0,]), digits = 2)

################################################################################################

######################## Policy Value Compared to Random Assignment ########################

# Estimate Policy Value
R1_tree_2 <- round(1/2*mean((2*rule_tree_2-1)*Y_star_hold_out), digits = 2)
se_tree_2 <- round(sqrt(1/4*var((2*rule_tree_2-1)*Y_star_hold_out)/length(Y_star_hold_out)), digits = 2)


print('Total Policy Value Compared to Random Assignment')
print(paste0("Average Gain of Pruned Tree: ", R1_tree_2))
print(paste0("Standard Error: ", se_tree_2))

#round((mean(giving_hold_out[treat_hold_out==1,])+mean(giving_hold_out[treat_hold_out==0,]))/2, digits = 2)
#round(1/2*mean((2*rule_tree_2-1)*Y_star_hold_out)/(mean(giving_hold_out[treat_hold_out==1,])+
#      mean(giving_hold_out[treat_hold_out==0,]))/2, digits = 2)

################################################################################################


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Machine-Learning-Course
Machine Learning for Economists and Business Analysts

[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/AStrittmatter/Machine-Learning-Course/HEAD) 5 | 6 | Machine learning estimation methods are gaining more and more popularity. Compared to conventional estimation methods, machine learning can solve statistical prediction tasks in a data-adaptive way. Furthermore, machine learning can deal with high-dimensional variable spaces in a relatively flexible way. Prediction methods are used in many different business and economic domains. Examples of prediction tasks are: the prediction of sales for a grocery store, such that logisticians can ship products before they are sold; or the prediction of the probability of becoming drug addicted later in life, such that drug prevention programs can be targeted at adolescents at high risk. 7 | 8 | Besides predictions, economists and managers are often interested in causal questions. Examples of causal questions are: What are the effects of tweets by Elon Musk on Bitcoin? What impact does lowering the central bank interest rate have on GDP? Does participation in training programs reduce the unemployment duration? Machine learning cannot give us an automatic answer to causal questions without using an empirical design. However, machine learning estimates can serve as input factors for these empirical designs. Furthermore, we can estimate heterogeneous effects with machine learning. 9 | 10 | The course covers different predictive and causal machine learning methods. A focus will be on the application of these methods in practical R programming sessions.
11 | 12 | Predictive Machine Learning: 13 | - Regularized Regression 14 | - Trees and Forests 15 | - Unsupervised Machine Learning 16 | 17 | Causal Machine Learning 18 | - Double Selection Procedure 19 | - Debiased Machine Learning 20 | - Causal Forests 21 | - Optimal Policy Learning 22 | - Reinforcement Learning 23 | -------------------------------------------------------------------------------- /Stata Example/ajr_example.do: -------------------------------------------------------------------------------- 1 | clear 2 | 3 | // Data is from Acemoglu, Johnson, and Robinson (2001) "The Colonial Origins of Comparative Development: An Empirical Investigation" 4 | use https://statalasso.github.io/dta/AJR.dta 5 | 6 | // We estimate the effect of institutions (avexpr) on income (logpgp95) 7 | // logpgp95 - log of GDP per capita in 1995 8 | // avexpr - average protection against expropriation risk, 1985-1995 9 | 10 | * Unconditional OLS estimate 11 | reg logpgp95 avexpr, robust 12 | 13 | * Conditional OLS estimate 14 | // We have 24 control variables (latitude, temperature, humidity, ethnical diversity, soil, commodities, etc.) 15 | // The data contains only 64 country-level observations 16 | reg logpgp95 avexpr lat_abst edes1975 avelf temp* humid* steplow-oilres, robust 17 | 18 | * Post-Lasso Double Selection Procedure 19 | // Let the data decide which control variables are important 20 | pdslasso logpgp95 avexpr (lat_abst edes1975 avelf temp* humid* steplow-oilres), robust nois 21 | 22 | 23 | * Useful Links: 24 | // https://statalasso.github.io/ 25 | // https://github.com/aahrens1 26 | // https://economics.mit.edu/files/4123 27 | -------------------------------------------------------------------------------- /Stata Example/pdslasso.ado: -------------------------------------------------------------------------------- 1 | *! pdslasso 1.0.01 30jan2018 2 | *!
authors aa/cbh/ms 3 | * wrapper for ivlasso 4 | 5 | program define pdslasso, eclass sortpreserve 6 | syntax [anything] [if] [in] , /// 7 | [ /// 8 | OLSOPTions(string) /// options passed to IV or OLS estimation 9 | * ] 10 | 11 | version 13 12 | ivlasso `anything' `if' `in', `options' cmdname(pdslasso) ivoptions(`olsoptions') 13 | 14 | ereturn local cmd pdslasso 15 | 16 | end 17 | 18 | -------------------------------------------------------------------------------- /Stata Example/rlasso.ado: -------------------------------------------------------------------------------- 1 | *! rlasso 1.0.06 10feb2018 2 | *! authors aa/cbh/ms 3 | 4 | * Updates (release date): 5 | * 1.0.05 (30jan2018) 6 | * First public release. 7 | * Added seed(.) option to rlasso/lassoutils to control rnd # seed for xdep & sup-score. 8 | * Fixed bug in DisplayCoefs (didn't accommodate both e(notpen) and e(pnotpen)). 9 | * Promoted to require version 13 or higher. 10 | * Added dots option. 11 | * Fixed displaynames bug (wrong dictionaries used for partialled-out vars). 12 | * Recoding of cons and demeaning flags. 13 | * partial and nocons no longer compatible. 14 | * Removed hdm version of sup-score stat. 15 | * Removed misc debug code. 16 | * 1.0.06 (xxx) 17 | * Support for Sergio Correia's FTOOLS FE transform (if installed). 18 | 19 | program rlasso, eclass sortpreserve 20 | 21 | version 13 22 | 23 | syntax [anything] [if] [in] [, /// 24 | displayall /// 25 | varwidth(int 17) /// 26 | VERsion /// 27 | supscore /// 28 | testonly /// 29 | * /// 30 | ] 31 | 32 | local lversion 1.0.05 // NOTE(review): file banner above says 1.0.06 -- confirm which version string is intended 33 | 34 | if "`version'" != "" { // Report program version number, then exit.
35 | di in gr "`lversion'" 36 | ereturn clear 37 | ereturn local version `lversion' 38 | exit 39 | } 40 | 41 | if ~replay() { // not replay so estimate 42 | _rlasso `anything' `if' `in', /// 43 | `options' `supscore' `testonly' 44 | } 45 | else if e(cmd)~="rlasso" { // replay, so check that rlasso results exist 46 | di as err "last estimates not found" 47 | exit 301 48 | } 49 | 50 | if "`e(method)'"~="" { 51 | DisplayCoefs, `displayall' varwidth(`varwidth') 52 | } 53 | 54 | // temp measure 55 | if e(supscore) < . { 56 | DisplaySupScore 57 | } 58 | 59 | end 60 | 61 | program _rlasso, eclass sortpreserve 62 | 63 | version 13 64 | 65 | syntax varlist(numeric fv ts min=2) [if] [in] [, /// 66 | /// specify options with varlists to be used by marksample/markout 67 | PNOTPen(varlist fv ts numeric) /// list of variables not penalised 68 | partial(string) /// string so that list can contain "_cons" 69 | fe /// do within-transformation 70 | NOCONStant /// 71 | CLuster(varlist max=1) /// penalty level/loadings allow for within-panel dependence & heterosk.
72 | pols /// post-lasso coefs in e(b) (default=lasso) 73 | prestd /// 74 | VERbose /// pass to lassoutils 75 | VVERbose /// pass to lassoutils 76 | dots /// 77 | displaynames_o(string) /// dictionary with names of vars as supplied in varlist 78 | displaynames_d(string) /// corresponding display names of vars 79 | pminus(int 0) /// overrides calculation of pminus 80 | debug /// used for debugging 81 | postall /// full coef vector in e(b) (default=selected only) 82 | testonly /// obtain supscore test only 83 | NOFTOOLS /// 84 | * /// additional options to be passed to lassoutils 85 | ] 86 | 87 | *** rlasso-specific 88 | // to distinguish between lasso2 and rlasso treatment of notpen, 89 | // rlasso option is called pnotpen 90 | // to keep lasso2 and rlasso code aligned, rename to notpen here 91 | // and at end of program save macros as pnotpen 92 | // temporary measure until lasso2 and rlasso code is merge 93 | local notpen `pnotpen' 94 | // supscore test flag 95 | local testonlyflag =("`testonly'"~="") 96 | * 97 | 98 | *** debug mode; create flag 99 | local debugflag =("`debug'"~="") 100 | * 101 | 102 | *** Record which observations have non-missing values 103 | marksample touse 104 | markout `touse' `varlist' `cluster' `ivar' 105 | sum `touse' if `touse', meanonly // will sum weight var when weights are used 106 | local N = r(N) 107 | * 108 | 109 | *** FEs. Create 1/0 flag. 110 | // Get panel id 111 | local feflag=("`fe'"~="") 112 | if `feflag' { 113 | cap _xt 114 | if _rc ~= 0 { 115 | di as err "Error: fe option requires data to be xtset" 116 | exit 459 117 | } 118 | else { 119 | local ivar `r(ivar)' 120 | } 121 | } 122 | * 123 | 124 | *** constant, partial, etc. 
125 | // conmodel: constant in original model 126 | // consflag: constant in transformed equation to estimate 127 | local consmodel =("`noconstant'"=="") & ~`feflag' // if fe, then consmodel=0 & partialcons="" 128 | local partialflag =("`partial'"~="") // =1 even if just cons being partialled out 129 | local prestdflag =("`prestd'"~="") 130 | // "_cons" allowed as an argument to partial(.) - remove it 131 | local partial : subinstr local partial "_cons" "", all word count(local pconscount) 132 | local notpen : subinstr local notpen "_cons" "", all word count(local notpenconscount) 133 | // Tell estimation code if cons has been partialled out or there isn't one in the first place 134 | if `feflag' | `partialflag' | `prestdflag' | (~`consmodel') { 135 | local consflag 0 136 | } 137 | else { 138 | local consflag 1 139 | } 140 | * 141 | 142 | *** create main varlist and tempvars 143 | // remove duplicates from varlist 144 | // _o list is vars with original names 145 | fvexpand `varlist' if `touse' 146 | local varlist_o `r(varlist)' 147 | // check for duplicates has to follow expand 148 | local dups : list dups varlist_o 149 | if "`dups'"~="" { 150 | di as text "Dropping duplicates: `dups'" 151 | } 152 | local varlist_o : list uniq varlist_o 153 | * 154 | 155 | *** Create separate _o varlists: Y, X, notpen, partial 156 | // Y, X 157 | local varY_o : word 1 of `varlist_o' 158 | local varX_o : list varlist_o - varY_o // incl notpen/partial 159 | // notpen 160 | fvexpand `notpen' if `touse' 161 | local notpen_o `r(varlist)' 162 | local dups : list dups notpen_o 163 | if "`dups'"~="" { 164 | di as text "Dropping duplicates: `dups'" 165 | } 166 | local notpen_o : list uniq notpen_o 167 | // partial 168 | fvexpand `partial' if `touse' 169 | local partial_o `r(varlist)' 170 | local dups : list dups partial_o 171 | if "`dups'"~="" { 172 | di as text "Dropping duplicates: `dups'" 173 | } 174 | local partial_o : list uniq partial_o 175 | // "model" = vars without partialled-out 
176 | local varXmodel_o : list varX_o - partial_o 177 | * 178 | 179 | *** syntax checks 180 | // check that notpen vars are in full list 181 | local checklist : list notpen_o - varX_o 182 | local checknum : word count `checklist' 183 | if `checknum' { 184 | di as err "syntax error - `checklist' in notpen(.) but not in list of regressors" 185 | exit 198 186 | } 187 | // check that partial vars are in full list 188 | local checklist : list partial_o - varX_o 189 | local checknum : word count `checklist' 190 | if `checknum' { 191 | di as err "syntax error - `checklist' in partial(.) but not in list of regressors" 192 | exit 198 193 | } 194 | // check that ivar (FE) is not a used variable 195 | if `feflag' { 196 | fvrevar `varY_o' `varX_o', list // list option means we get only base vars 197 | local vlist `r(varlist)' 198 | local checklist : list ivar - vlist 199 | local checknum : word count `checklist' 200 | if `checknum'==0 { 201 | di as err "syntax error - `ivar' is xtset variable and cannot be used in model" 202 | exit 198 203 | } 204 | } 205 | // other checks 206 | if `pconscount' & `feflag' { 207 | di as err "error: incompatible options, partial(_cons) and fe" 208 | exit 198 209 | } 210 | if "`partial'"~="" & "`noconstant'"~="" { 211 | di as err "error: incompatible options, partial and nocons" 212 | exit 198 213 | } 214 | if `feflag' & "`noconstant'"~="" { 215 | di as err "error: incompatible options, fe and nocons" 216 | exit 198 217 | } 218 | * 219 | 220 | *** Create _t varlists: Y, X, notpen, partial 221 | // _o list is vars with original names 222 | // _t list is temp vars if transform needed, original vars if not 223 | if `feflag' { // everything needs to be transformed including partial 224 | local temp_ct : word count `varlist_o' 225 | mata: s_maketemps(`temp_ct') 226 | local varlist_t `r(varlist)' 227 | } 228 | else if `partialflag' | `prestdflag' { // everything except partial_o needs to be transformed 229 | local varYXmodel_o `varY_o' `varXmodel_o' 
230 | local temp_ct : word count `varYXmodel_o' 231 | mata: s_maketemps(`temp_ct') 232 | local varYXmodel_t `r(varlist)' 233 | matchnames "`varlist_o'" "`varYXmodel_o'" "`varYXmodel_t'" 234 | local varlist_t `r(names)' 235 | } 236 | else { // no transformation needed but still need temps 237 | fvrevar `varlist_o' if `touse' // fvrevar creates temps only when needed 238 | local varlist_t `r(varlist)' 239 | } 240 | // dictionary is now varlist_o / varlist_t 241 | // now create separate _o and _t varlists using dictionary 242 | foreach vlist in varY varX varXmodel notpen partial { 243 | matchnames "``vlist'_o'" "`varlist_o'" "`varlist_t'" 244 | local `vlist'_t `r(names)' // corresponding tempnames; always need this because of possible fvs 245 | } 246 | * 247 | 248 | ******************* Display names *********************************************************** 249 | // may be called by another program with tempvars and display names for them 250 | // if display names option not used, use _o names as provided in rlasso command 251 | // if display names option used, use display names matched with _o names 252 | // if display names macros are empty, has no effect 253 | matchnames "`varY_o'" "`displaynames_o'" "`displaynames_d'" 254 | local varY_d `r(names)' 255 | matchnames "`varXmodel_o'" "`displaynames_o'" "`displaynames_d'" 256 | local varXmodel_d `r(names)' 257 | matchnames "`varX_o'" "`displaynames_o'" "`displaynames_d'" 258 | local varX_d `r(names)' 259 | matchnames "`notpen_o'" "`displaynames_o'" "`displaynames_d'" 260 | local notpen_d `r(names)' 261 | matchnames "`partial_o'" "`displaynames_o'" "`displaynames_d'" 262 | local partial_d `r(names)' 263 | * 264 | 265 | *** summary varlists and flags: 266 | * varY_o = dep var 267 | * varY_t = dep var, temp var 268 | * varX_o = full, expanded set of RHS, original names, includes partial 269 | * varX_t = as above but with temp names for all variables 270 | * varXmodel_o = full, expanded set of RHS, original names, 
excludes partial 271 | * varXmodel_t = as above but with temp names for all variables 272 | * notpen_o = full, expanded set of not-penalized 273 | * notpen_t = as above but with temp names for all variables 274 | 275 | // p is number of penalized vars in the model; follows convention in BCH papers 276 | // p is calculated in lassoutils/_rlasso as number of model vars excluding constant 277 | // here we calculate which of the model vars are unpenalized or omitted/base vars 278 | // to provide as `pminus' to lassoutils/_rlasso (unless provided by user) 279 | // do here so that code above is compatible with lasso2 280 | // use _o names / display names since they have info on whether var is omitted/base/etc. 281 | if ~`pminus' { 282 | foreach vn of local varXmodel_d { // display names 283 | _ms_parse_parts `vn' 284 | // increment pminus if model variable is MISSING 285 | if r(omit) { 286 | local ++pminus 287 | } 288 | } 289 | foreach vn of local notpen_d { // display names 290 | _ms_parse_parts `vn' 291 | // increment pminus if notpen variable is NOT MISSING 292 | if ~r(omit) { 293 | local ++pminus 294 | } 295 | } 296 | } 297 | // p0 here is total number of variables provided to model EXCLUDING constant 298 | local p0 : word count `varXmodel_o' 299 | local p =`p0'-`pminus' 300 | // warn 301 | if `p'<=0 { 302 | di as text "warning: no penalized regressors; results are OLS" 303 | } 304 | // now for error-checking below, p0 should INCLUDE constant unless partialled-out etc. 305 | local p0 =`p0'+`consflag' 306 | * 307 | 308 | ******************* FE, partialling out, standardization ************************************ 309 | // If FE: partial-out FEs from temp variables, then preserve, 310 | // then partial-out low-dim ctrls from temp variables 311 | // restore will restore all temp vars with only FEs partialled-out 312 | // If no FE: leave original variables unchanged. 313 | // partial-out low-dim ctrls from temp variables. 
314 | // if no FE/low-dim ctrls, no transform needed 315 | 316 | local dmflag =0 // initialize demeaned flag 317 | if `feflag' { // FE-transform all variables 318 | fvrevar `varY_o' `varX_o' if `touse' // in case any FV or TS vars in _o list 319 | local vlist `r(varlist)' 320 | lassoutils `vlist', /// call on _o list 321 | touse(`touse') /// 322 | tvarlist(`varY_t' `varX_t') /// overwrite/initialize these 323 | `noftools' /// 324 | fe(`ivar') // triggers branching to FE utility 325 | local N_g =r(N_g) // N_g will be empty if no FEs 326 | local noftools `r(noftools)' // either not installed or user option 327 | local dmflag=1 // data are now demeaned 328 | if `partialflag' { // And then partial out any additional vars 329 | preserve // preserve the original values of tempvars before partialling out 330 | lassoutils `varY_t' `varXmodel_t', /// _t vars have been created and filled so use here 331 | touse(`touse') /// don't need tvarlist because vars already created 332 | partial(`partial_t') /// _t vars have been created and filled so use here 333 | partialflag(`partialflag') /// triggers branching to partial utility 334 | dmflag(1) // FE => mean zero 335 | } 336 | if `prestdflag' { 337 | tempname prestdY prestdX 338 | lassoutils `varY_t', /// _t vars have been created and filled so use here 339 | touse(`touse') /// don't need tvarlist because vars already created 340 | std /// 341 | dmflag(1) // FE => data already mean zero 342 | mat `prestdY'=r(stdvec) 343 | lassoutils `varXmodel_t', /// 344 | touse(`touse') /// 345 | std /// 346 | dmflag(1) // FE => data already mean zero 347 | mat `prestdX'=r(stdvec) 348 | } 349 | } 350 | else if `partialflag' { // Just partial out 351 | fvrevar `varY_o' `varXmodel_o' if `touse' // in case any FV or TS vars in _o list 352 | local vlist `r(varlist)' 353 | fvrevar `partial_o' if `touse' // in case any FV or TS vars in _o list 354 | local pvlist `r(varlist)' 355 | lassoutils `vlist', /// call on _o list 356 | touse(`touse') /// 357 | 
partial(`pvlist') /// 358 | tvarlist(`varY_t' `varXmodel_t') /// overwrite/initialize these 359 | partialflag(`partialflag') /// triggers branching to partial utility 360 | dmflag(0) // data are not yet demeaned 361 | local dmflag =1 // data are now demeaned 362 | if `prestdflag' { 363 | tempname prestdY prestdX 364 | lassoutils `varY_t', /// _t vars have been created and filled so use here 365 | touse(`touse') /// don't need tvarlist because vars already created 366 | std /// 367 | dmflag(1) // partial => already mean zero 368 | mat `prestdY'=r(stdvec) 369 | lassoutils `varXmodel_t', /// 370 | touse(`touse') /// 371 | std /// 372 | dmflag(1) // partial => already mean zero 373 | mat `prestdX'=r(stdvec) 374 | } 375 | } 376 | else if `prestdflag' { 377 | tempname prestdY prestdX 378 | lassoutils `varY_o', /// call on _o list 379 | touse(`touse') /// 380 | std /// 381 | tvarlist(`varY_t') /// overwrite/initialize these 382 | consmodel(`consmodel') /// =1 => data should be demeaned 383 | dmflag(0) // data not yet mean zero 384 | mat `prestdY'=r(stdvec) 385 | fvrevar `varXmodel_o' if `touse' // in case any FV or TS vars in _o list 386 | local vlist `r(varlist)' 387 | lassoutils `vlist', /// call on _o list 388 | touse(`touse') /// 389 | std /// 390 | tvarlist(`varXmodel_t') /// overwrite/initialize these 391 | consmodel(`consmodel') /// =1 => data should be demeaned 392 | dmflag(0) // data not yet mean zero 393 | mat `prestdX'=r(stdvec) 394 | if `consmodel' { 395 | local dmflag =1 // if cons in model, data are now demeaned 396 | } 397 | } 398 | 399 | ************* Partialling/standardization END *********************************************** 400 | 401 | ************* Lasso estimation with transformed/partialled-out vars ************************* 402 | if "`verbose'`vverbose'`dots'"=="" { 403 | local quietly "quietly" // don't show lassoutils output 404 | } 405 | 406 | `quietly' lassoutils `varY_t', /// 407 | rlasso /// branch to _rlasso subroutine 408 | /// nocons, no 
penloads, etc. all assumed 409 | touse(`touse') /// 410 | xnames_o(`varXmodel_d') /// display names for lassoutils output 411 | xnames_t(`varXmodel_t') /// 412 | cluster(`cluster') /// 413 | notpen_o(`notpen_d') /// 414 | notpen_t(`notpen_t') /// 415 | consflag(`consflag') /// =0 if cons already partialled out or if no cons 416 | dmflag(`dmflag') /// =1 if data have been demeaned 417 | pminus(`pminus') /// 418 | stdy(`prestdY') /// 419 | stdx(`prestdX') /// 420 | `verbose' `vverbose' `dots' /// 421 | `testonly' /// 422 | `options' 423 | * 424 | 425 | ************* Finish up ******************************************************** 426 | *** e-return lasso estimation results 427 | tempname b beta betaOLS Ups sUps eUps 428 | tempname betaAll betaAllOLS 429 | tempname lambda slambda lambda0 rmse rmseOLS 430 | tempname c gamma gammad 431 | tempname supscore supscore_p supscore_cv supscore_gamma 432 | 433 | if ~`testonlyflag' { 434 | 435 | if "`cluster'" ~= "" { 436 | local N_clust =r(N_clust) 437 | } 438 | mat `beta' =r(beta) // may be empty! 439 | mat `betaOLS' =r(betaOLS) // may be empty! 
440 | mat `betaAll' =r(betaAll) 441 | mat `betaAllOLS' =r(betaAllOLS) 442 | mat `Ups' =r(Ups) 443 | mat `sUps' =r(sUps) 444 | mat `eUps' =r(eUps) 445 | scalar `lambda' =r(lambda) 446 | scalar `slambda' =r(slambda) 447 | scalar `lambda0' =r(lambda0) 448 | scalar `c' =r(c) 449 | scalar `gamma' =r(gamma) 450 | scalar `gammad' =r(gammad) 451 | scalar `rmse' =r(rmse) // Lasso RMSE 452 | scalar `rmseOLS' =r(rmseOLS) // post-Lasso RMSE 453 | local selected `r(selected)' // EXCL NOTPEN/CONS 454 | local selected0 `r(selected0)' // INCL NOTPEN, EXCL CONS 455 | local s =r(s) // EXCL NOTPEN/CONS; number of elements in selected 456 | local s0 =r(s0) // INCL NOTPEN, EXCL CONS; number of elements in selected0 457 | local clustvar `r(clustvar)' 458 | local robust `r(robust)' 459 | local center =r(center) 460 | local method `r(method)' // lasso or sqrt-lasso 461 | local niter =r(niter) 462 | local maxiter =r(maxiter) 463 | local nupsiter =r(nupsiter) 464 | local maxupsiter =r(maxupsiter) 465 | // these can be missings 466 | scalar `supscore' =r(supscore) 467 | scalar `supscore_p' =r(supscore_p) 468 | scalar `supscore_cv' =r(supscore_cv) 469 | scalar `supscore_gamma' =r(supscore_gamma) 470 | local ssnumsim =r(ssnumsim) 471 | 472 | // flag for empty beta (consflag=0 means rlasso didn't estimate a constant) 473 | local betaempty =(`s0'==0 & `consflag'==0) 474 | // error check 475 | if `betaempty' { 476 | if ~(colsof(`beta')==1 & `beta'[1,1]==.) { 477 | di as err "internal _rlasso error - beta should be empty (no vars estimated) but isn't 478 | exit 499 479 | } 480 | } 481 | // issue warning if lasso max iteration limit hit 482 | if `niter'==`maxiter' { 483 | di as text "Warning: reached max shooting iterations w/o achieving convergence." 484 | } 485 | // error check - p0s and ps should match 486 | if `p0'~=r(p0) { // number of all variables in betaAll INCL NOTPEN/CONS (if present or not partialled etc.) 
487 | di as err "internal _rlasso error - p0 count of model vars `p0' does not match returned value `r(p0)'" 488 | exit 499 489 | } 490 | if `p'~=r(p) { // number of penalized variables in model 491 | di as err "internal _rlasso error - p count of penalized vars `p' does not match returned value `r(p)'" 492 | exit 499 493 | } 494 | // fix depvar (rownames) of beta vectors to use _o (or _d if display names provided) not _t 495 | mat rownames `beta' = `varY_d' 496 | mat rownames `betaOLS' = `varY_d' 497 | mat rownames `betaAll' = `varY_d' 498 | mat rownames `betaAllOLS' = `varY_d' 499 | if ~`betaempty' { // cnames should stay empty if beta has a single missing value 500 | local cnames_o : colnames `beta' 501 | fvstrip `cnames_o' // colnames may insert b/n/o operators - remove 502 | local cnames_o `r(varlist)' 503 | matchnames "`cnames_o'" "`varlist_o'" "`varlist_t'" 504 | local cnames_t `r(names)' 505 | } 506 | else { 507 | local cnames_o 508 | local cnames_t 509 | } 510 | * 511 | 512 | *********** Get coeff estimates for partialled-out vars/std intercept. 
******************** 513 | if `feflag' & `partialflag' { // FE case and there are partialled-out notpen vars 514 | restore // Restores dataset with tempvars after FE transform but before notpen partialled out 515 | } 516 | if `partialflag' | (`prestdflag' & `consmodel') { // standardization removes constant so must enter for that 517 | if `feflag' { 518 | local depvar `varY_t' // use FE-transformed depvar and X vars 519 | local scorevars `cnames_t' 520 | } 521 | else { 522 | local depvar `varY_o' // use original depvar and X vars 523 | local scorevars `cnames_o' 524 | } 525 | lassoutils `depvar', /// 526 | unpartial /// 527 | touse(`touse') /// 528 | beta(`beta') /// 529 | scorevars(`scorevars') /// 530 | partial(`partial_t') /// 531 | names_o(`varX_d') /// dictionary 532 | names_t(`varX_t') /// dictionary 533 | consmodel(`consmodel') 534 | mat `beta' = r(b) 535 | mat `betaAll' = `betaAll', r(bpartial) 536 | lassoutils `depvar', /// 537 | unpartial /// 538 | touse(`touse') /// 539 | beta(`betaOLS') /// 540 | scorevars(`scorevars') /// 541 | partial(`partial_t') /// 542 | names_o(`varX_d') /// dictionary 543 | names_t(`varX_t') /// dictionary 544 | consmodel(`consmodel') 545 | mat `betaOLS' = r(b) 546 | mat `betaAllOLS' = `betaAllOLS', r(bpartial) 547 | // for unknown reasons, _ms_build_info doesn't add info here (e.g. 
"base") 548 | _ms_build_info `beta' if `touse' 549 | _ms_build_info `betaAll' if `touse' 550 | _ms_build_info `betaOLS' if `touse' 551 | _ms_build_info `betaAllOLS' if `touse' 552 | // finish by setting betaempty to 0 553 | local betaempty =0 554 | } 555 | * 556 | 557 | *** Prepare and post results 558 | if "`pols'"=="" & "`postall'"=="" { // selected lasso coefs by default 559 | mat `b' = `beta' 560 | } 561 | else if "`pols'"~="" & "`postall'"=="" { // selected post-lasso coefs 562 | mat `b' = `betaOLS' 563 | } 564 | else if "`pols'"=="" { // full lasso coef vector 565 | mat `b' = `betaAll' 566 | } 567 | else { // full post-lasso coef vector 568 | mat `b' = `betaAllOLS' 569 | } 570 | if `betaempty' & "`postall'"=="" { // no vars in b 571 | ereturn post , obs(`N') depname(`varY_d') esample(`touse') // display name 572 | } 573 | else { // b has some selected/nonpen/cons 574 | ereturn post `b', obs(`N') depname(`varY_d') esample(`touse') // display name 575 | } 576 | // additional returned results 577 | ereturn local noftools `noftools' 578 | ereturn local postall `postall' 579 | ereturn scalar niter =`niter' 580 | ereturn scalar maxiter =`maxiter' 581 | ereturn scalar nupsiter =`nupsiter' 582 | ereturn scalar maxupsiter =`maxupsiter' 583 | ereturn local robust `robust' 584 | ereturn local ivar `ivar' 585 | ereturn local selected `selected' // selected only 586 | ereturn local varXmodel `varXmodel_d' // display name 587 | ereturn local varX `varX_d' // display name 588 | if "`pols'"=="" { 589 | ereturn local estimator ols 590 | } 591 | else { 592 | ereturn local estimator postlasso 593 | } 594 | ereturn local method `method' 595 | ereturn local predict rlasso_p 596 | ereturn local cmd rlasso 597 | ereturn scalar center =`center' 598 | ereturn scalar cons =`consmodel' 599 | ereturn scalar lambda =`lambda' 600 | ereturn scalar lambda0 =`lambda0' 601 | ereturn scalar slambda =`slambda' 602 | ereturn scalar c =`c' 603 | ereturn scalar gamma =`gamma' 604 | ereturn scalar 
gammad =`gammad' 605 | 606 | if `supscore' < . { 607 | ereturn scalar ssnumsim =`ssnumsim' 608 | ereturn scalar supscore =`supscore' 609 | ereturn scalar supscore_p =`supscore_p' 610 | ereturn scalar supscore_cv =`supscore_cv' 611 | ereturn scalar supscore_gamma =`supscore_gamma' 612 | } 613 | 614 | if "`N_clust'" ~= "" { 615 | ereturn local clustvar `clustvar' 616 | ereturn scalar N_clust =`N_clust' 617 | } 618 | if "`N_g'" ~= "" { 619 | ereturn scalar N_g =`N_g' 620 | } 621 | ereturn scalar fe =`feflag' 622 | ereturn scalar rmse =`rmse' 623 | ereturn scalar rmseOLS =`rmseOLS' 624 | ereturn scalar pminus =`pminus' 625 | ereturn scalar p =`p' // number of all penalized vars; excludes omitteds etc. 626 | ereturn scalar s0 =`s0' // number of all estimated coefs (elements of beta) 627 | ereturn scalar s =`s' // number of selected 628 | 629 | ereturn matrix sUps =`sUps' 630 | ereturn matrix eUps =`eUps' 631 | ereturn matrix Ups =`Ups' 632 | ereturn matrix betaAllOLS =`betaAllOLS' 633 | ereturn matrix betaAll =`betaAll' 634 | ereturn matrix betaOLS =`betaOLS' 635 | ereturn matrix beta =`beta' 636 | 637 | // rlasso-specific: 638 | // selected0 and s0 included partialled-out. 639 | // If cons exists and was not partialled out, add to notpen and selected0. 640 | // Otherwise if cons exists and was partialled out, add to to partial list. 
	// ---- tail of the main rlasso estimation routine (definition begins before this chunk) ----
	// Constant bookkeeping for returned varlists: the constant is appended either to the
	// not-penalized list (it was estimated) or to the partialled-out list (it was removed
	// from the data before estimation).
	if `consmodel' & ~`partialflag' {
		local selected0		`selected0' _cons
		local notpen_d		`notpen_d' _cons				// display name
	}
	else if `consmodel' & `partialflag' {
		local partial_d		`partial_d' _cons				// display name
		local selected0		`selected0' `partial_d'			// display name
	}
	else if `partialflag' {
		local selected0		`selected0' `partial_d'			// display name
	}
	// remaining results
	ereturn local	selected0		`selected0'
	ereturn local	partial			`partial_d'				// display name
	ereturn scalar	partial_ct		=`: word count `partial_d''		// (display name) number of partialled-out INCLUDING CONSTANT
	ereturn scalar	s0				=`: word count `selected0''		// (update) selected or notpen, INCL CONS
	// rlasso-specific - save as "pnotpen" (vs lasso2 "notpen")
	ereturn local	pnotpen			`notpen_d'				// display name
	ereturn scalar	pnotpen_ct		=`: word count `notpen_d''		// (display name) number of notpen INCLUDING CONSTANT (if not partialled-out)
	*
	}
	else {

		// sup-score test only - no lasso results
		ereturn clear

		ereturn scalar	N				=r(N)
		ereturn scalar	N_clust			=r(N_clust)
		ereturn scalar	gamma			=r(gamma)
		ereturn scalar	c				=r(c)
		ereturn scalar	p				=`p'
		ereturn scalar	ssnumsim		=r(ssnumsim)
		ereturn scalar	supscore		=r(supscore)
		ereturn scalar	supscore_p		=r(supscore_p)
		ereturn scalar	supscore_cv		=r(supscore_cv)
		ereturn scalar	supscore_gamma	=r(supscore_gamma)

		ereturn local	cmd				rlasso
		ereturn scalar	cons			=`consmodel'

	}

end

// Display the CCK sup-score test of H0: beta=0.
// Reads e(supscore), e(supscore_p), e(supscore_cv) and e(supscore_gamma)
// left behind in e() by the estimation routine; produces display output only.
prog DisplaySupScore

	di
	di as text "{help rlasso##supscore:Sup-score} test H0: beta=0"
	di as text "CCK sup-score statistic" _col(25) as res %6.2f e(supscore) _c
	// p-value can be missing; in that case just terminate the line
	if e(supscore_p) < . {
		di as text _col(32) "p-value=" _col(39) as res %6.3f e(supscore_p)
	}
	else {
		di
	}
	di as text "CCK " as res 100*e(supscore_gamma) as text "% critical value" _c
	di as res _col(25) %6.2f e(supscore_cv) _col(32) as text "(asympt bound)"

end


// Display the table of lasso and post-estimation OLS coefficients
// from the results currently in e().
// Options:
//   displayall  - show the full coefficient vector e(betaAll)/e(betaAllOLS)
//                 instead of the selected-only e(beta)/e(betaOLS)
//   varwidth(#) - width of the variable-name column (default 17)
//   norecover   - suppress the block of partialled-out coefficients
// Used in rlasso and lasso2.
// version 2017-12-20
// updated 31dec17 to accommodate e(pnotpen)
prog DisplayCoefs

	syntax ,						///
		[							///
		displayall					///  full coef vector in display (default=selected only)
		varwidth(int 17)			///  width of variable-name column
		NORecover					///  do not display partialled-out coefficients
		]

	local cons			=e(cons)
	if ("`norecover'"=="") {
		local partial		`e(partial)'
		local partial_ct	=e(partial_ct)
	}
	else {
		local partial
		local partial_ct	=0
	}

	// varlists
	local selected		`e(selected)'
	fvstrip `selected'							// strip b/n/o operators for display
	local selected		`r(varlist)'
	// NOTE(review): no space between the two macros below - relies on at most one of
	// e(notpen) (lasso2) / e(pnotpen) (rlasso) being set at a time; confirm upstream.
	local notpen		`e(notpen)'`e(pnotpen)'
	fvstrip `notpen'
	local notpen		`r(varlist)'
	local selected0		`e(selected0)'
	fvstrip `selected0'
	local selected0		`r(varlist)'
	// coef vectors
	tempname beta betaOLS
	if "`displayall'"~="" {						//  there must be some vars specified even if nothing selected
		mat `beta'		=e(betaAll)
		mat `betaOLS'	=e(betaAllOLS)
		local col_ct	=colsof(`beta')
		local vlist		: colnames `beta'
		local vlistOLS	: colnames `betaOLS'	// NOTE(review): vlistOLS appears unused below
		local baselevels baselevels
	}
	else if e(k)>0 {							//  display only selected, but only if there are any
		mat `beta'		=e(beta)
		mat `betaOLS'	=e(betaOLS)
		local col_ct	=colsof(`beta')
		local vlist		: colnames `beta'
		local vlistOLS	: colnames `betaOLS'	// NOTE(review): vlistOLS appears unused below
	}
	else {										//  nothing selected, zero columns in beta
		local col_ct	=0
	}
	if e(k)>0 {
		// attach factor-variable info (base/omitted flags) for _ms_display
		_ms_build_info `beta' if e(sample)
		_ms_build_info `betaOLS' if e(sample)
	}

	*** (Re-)display coefficients including constant/partial
	// column-position helpers derived from the requested variable-name width
	local varwidth1		=`varwidth'+1
	local varwidth3		=`varwidth'+3			// NOTE(review): appears unused below
	local varwidth4		=`varwidth'+4
	local varwidthm7	=`varwidth'-7
	local varwidthm13	=`varwidth'-13
	di
	di as text "{hline `varwidth1'}{c TT}{hline 32}"
	// header line depends on estimation method
	if "`e(method)'"=="sqrt-lasso" {
		di as text _col(`varwidthm7') "Selected {c |} Sqrt-lasso   Post-est OLS"
	}
	else if "`e(method)'"=="ridge" {
		di as text _col(`varwidthm7') "Selected {c |}      Ridge   Post-est OLS"
	}
	else if "`e(method)'"=="elastic net" {
		di as text _col(`varwidthm7') "Selected {c |} Elastic net   Post-est OLS"
		di as text _col(`varwidthm7') "         {c |}" _c
		di as text " (alpha=" _c
		di as text %4.3f `e(alpha)' _c
		di as text ")"
	}
	else if "`e(method)'"=="lasso" {
		di as text _col(`varwidthm7') "Selected {c |}      Lasso   Post-est OLS"
	}
	else {
		di as err "internal DisplayCoefs error. unknown method."
		exit 1
	}
	di as text "{hline `varwidth1'}{c +}{hline 32}"
	local anynotpen = 0
	local i 1
	// columns after `lastcol' hold the partialled-out coefficients
	local lastcol = `col_ct' - `partial_ct'
	tokenize `vlist'							//  put elements of coef vector into macros 1, 2, ...
	while `i' <= `lastcol' {
		local vn ``i''
		fvstrip `vn'							//  get rid of o/b/n prefix for display purposes
		local vn `r(varlist)'
		_ms_display, element(`i') matrix(`beta') width(`varwidth') `baselevels'
		// in selected or notpen list?
		local isselnotpen	: list posof "`vn'" in selected0
		local isnotpen		: list posof "`vn'" in notpen
		local anynotpen		= `anynotpen' + `isnotpen'
		// note attached? base, empty, omitted
		qui _ms_display, element(`i') matrix(`beta')
		local note `r(note)'
		qui _ms_display, element(`i') matrix(`betaOLS')
		local noteOLS `r(note)'
		// if notpen, add footnote
		if `isnotpen' & "`note'"=="" {
			di as text "{helpb rlasso##notpen:*}" _c
		}
		if `isselnotpen' {
			// lasso coef
			if "`note'"=="" {
				di _col(`varwidth4') as res %15.7f el(`beta',1,`i') _c
			}
			else {
				di _col(`varwidth4') as text %15s "`note'" _c
			}
			// post-lasso coef - can be omitted if collinear
			if "`noteOLS'"=="" {
				di as res %15.7f el(`betaOLS',1,`i')
			}
			else {
				di as text %15s "`noteOLS'"
			}
		}
		else if "`note'"=="(omitted)" {
			// not selected
			di _col(`varwidth4') as text %15s "(not selected)" _c
			di as text %15s "(not selected)"
		}
		else {
			// other eg base var
			di as text %15s "`note'" _c
			di as text %15s "`noteOLS'"
		}
		local ++i
	}
	if `partial_ct' {
		di as text "{hline `varwidth1'}{c +}{hline 32}"
		// NOTE(review): help link points at lasso2##notpen although this file is rlasso;
		// program is shared between rlasso and lasso2 - confirm the intended anchor.
		di as text _col(`varwidthm13') "Partialled-out{help lasso2##notpen:*}{c |}"
		di as text "{hline `varwidth1'}{c +}{hline 32}"
		local i = `lastcol'+1
		while `i' <= `col_ct' {
			local vn ``i''
			fvstrip `vn'						//  get rid of o/b/n prefix for display purposes
			local vn `r(varlist)'
			_ms_display, element(`i') matrix(`beta') width(`varwidth') `baselevels'
			// note attached? base, empty, omitted
			qui _ms_display, element(`i') matrix(`beta')
			local note `r(note)'
			qui _ms_display, element(`i') matrix(`betaOLS')
			local noteOLS `r(note)'
			// lasso coef
			if "`note'"=="" {
				di _col(`varwidth4') as res %15.7f el(`beta',1,`i') _c
			}
			else {
				di _col(`varwidth4') as text %15s "`note'" _c
			}
			// post-lasso coef - can be omitted if collinear
			if "`noteOLS'"=="" {
				di as res %15.7f el(`betaOLS',1,`i')
			}
			else {
				di as text %15s "`noteOLS'"
			}
			local ++i
		}
	}
	di as text "{hline `varwidth1'}{c BT}{hline 32}"

	if `anynotpen' {
		di "{help rlasso##notpen:*Not penalized}"
	}

end

*************************** Stata utilities ******************************

// internal version of fvstrip 1.01 ms 24march2015
// takes varlist with possible FVs and strips out b/n/o notation
// returns results in r(varlist)
// optionally also omits omittable FVs
// expand calls fvexpand either on full varlist
// or (with onebyone option) on elements of varlist

program define fvstrip, rclass
	version 11.2
	syntax [anything] [if] , [ dropomit expand onebyone NOIsily ]
	if "`expand'"~="" {									//  force call to fvexpand
		if "`onebyone'"=="" {
			fvexpand `anything' `if'					//  single call to fvexpand
			local anything `r(varlist)'
		}
		else {
			foreach vn of local anything {
				fvexpand `vn' `if'						//  call fvexpand on items one-by-one
				local newlist	`newlist' `r(varlist)'
			}
			local anything	: list clean newlist
		}
	}
	foreach vn of local anything {						//  loop through varnames
		if "`dropomit'"~="" {							//  check & include only if
			_ms_parse_parts `vn'						//  not omitted (b. or o.)
			if ~`r(omit)' {
				local unstripped	`unstripped' `vn'	//  add to list only if not omitted
			}
		}
		else {											//  add varname to list even if
			local unstripped	`unstripped' `vn'		//  could be omitted (b. or o.)
		}
	}
	// Now create list with b/n/o stripped out
	foreach vn of local unstripped {
		local svn ""									//  initialize
		_ms_parse_parts `vn'
		if "`r(type)'"=="variable" & "`r(op)'"=="" {	//  simplest case - no change
			local svn	`vn'
		}
		else if "`r(type)'"=="variable" & "`r(op)'"=="o" {	//  next simplest case - o.varname => varname
			local svn	`r(name)'
		}
		else if "`r(type)'"=="variable" {				//  has other operators so strip o but leave .
			local op	`r(op)'
			local op	: subinstr local op "o" "", all
			local svn	`op'.`r(name)'
		}
		else if "`r(type)'"=="factor" {					//  simple factor variable
			local op	`r(op)'
			local op	: subinstr local op "b" "", all
			local op	: subinstr local op "n" "", all
			local op	: subinstr local op "o" "", all
			local svn	`op'.`r(name)'					//  operator + . + varname
		}
		else if "`r(type)'"=="interaction" {			//  multiple variables
			forvalues i=1/`r(k_names)' {
				local op	`r(op`i')'
				local op	: subinstr local op "b" "", all
				local op	: subinstr local op "n" "", all
				local op	: subinstr local op "o" "", all
				local opv	`op'.`r(name`i')'			//  operator + . + varname
				if `i'==1 {
					local svn	`opv'
				}
				else {
					local svn	`svn'#`opv'
				}
			}
		}
		else if "`r(type)'"=="product" {
			di as err "fvstrip error - type=product for `vn'"
			exit 198
		}
		else if "`r(type)'"=="error" {
			di as err "fvstrip error - type=error for `vn'"
			exit 198
		}
		else {
			di as err "fvstrip error - unknown type for `vn'"
			exit 198
		}
		local stripped `stripped' `svn'
	}
	local stripped	: list retokenize stripped			//  clean any extra spaces

	if "`noisily'"~="" {								//  for debugging etc.
		di as result "`stripped'"
	}

	return local varlist `stripped'						//  return results in r(varlist)
end

// Internal version of matchnames
// Sample syntax:
// matchnames "`varlist'" "`list1'" "`list2'"
// takes list in `varlist', looks up in `list1', returns entries in `list2', called r(names)
// names not found in `list1' are passed through unchanged
program define matchnames, rclass
	version 11.2
	args	varnames namelist1 namelist2

	local k1 : word count `namelist1'
	local k2 : word count `namelist2'

	// dictionary lists must be the same length
	if `k1' ~= `k2' {
		di as err "namelist error"
		exit 198
	}
	foreach vn in `varnames' {
		local i : list posof `"`vn'"' in namelist1
		if `i' > 0 {
			local newname : word `i' of `namelist2'
		}
		else {
			* Keep old name if not found in list
			local newname "`vn'"
		}
		local names "`names' `newname'"
	}
	local names	: list clean names
	return local names "`names'"
end

// Display varlist with specified indentation,
// wrapping at 80 columns and skipping base/omitted factor variables
program define Disp
	version 11.2
	syntax [anything] [, _col(integer 15) ]
	local maxlen = 80-`_col'		// room left on the line after the indent
	local len = 0
	local first = 1
	foreach vn in `anything' {
		* Don't display if base or omitted variable
		_ms_parse_parts `vn'
		if ~`r(omit)' {
			local vnlen		: length local vn
			if `len'+`vnlen' > `maxlen' {	// wrap to a new line
				di
				local first = 1
				local len = `vnlen'
			}
			else {
				local len = `len'+`vnlen'+1
			}
			if `first' {
				local first = 0
				di in gr _col(`_col') "`vn'" _c
			}
			else {
				di in gr " `vn'" _c
			}
		}
	}
	* Finish with a newline
	di
end

version 13
mata:

// Create p new temporary double variables (initialized in obs 1..1 block by st_addvar)
// and return their names, space-separated, in r(varlist).
void s_maketemps(real scalar p)
{
	(void) st_addvar("double", names=st_tempname(p), 1)
	st_global("r(varlist)",invtokens(names))
}


// END MATA SECTION
end

--------------------------------------------------------------------------------
/binder/environment.yml:
--------------------------------------------------------------------------------
name: r-environment
channels:
  - conda-forge
dependencies:
  - r-base=4.3
  - r-tidyverse
  - r-fbasics
  - r-corrplot
  - r-psych
  - r-glmnet
  - r-glmnetutils
  - r-grf
  - r-rpart
  - r-rpart.plot
  - r-randomforest
  - r-rlang
  - r-readr
  - r-devtools
  - r-reshape2
  - r-caret
  - r-plotmo
  - r-randomfieldsutils
  - r-rms
  - r-hdm
  - r-aer
  - r-lmtest
  - r-dplyr
  - r-sandwich
  - r-diagrammer
  - r-neuralnet
  - r-ISLR2
  - r-zeallot
  - r-nycflights13
--------------------------------------------------------------------------------