├── 1_Overview.pdf
├── 2_Regularized_Regression.pdf
├── 3_Trees_Forests.pdf
├── 4_Deep_Learning.pdf
├── 5_Unsupervised.pdf
├── 6_confounders_with_ML.pdf
├── 7_causal_forest.pdf
├── 8_optimal_policy.pdf
├── 9_reinforcement_learning.pdf
├── Example Lecture 3
└── deep_learning_example.R
├── Examples lecture 1
├── Data
│ ├── job_corps.csv
│ ├── mylemon.csv
│ ├── used_cars_test.csv
│ └── used_cars_train.csv
├── examples_first_lecture.html
└── examples_first_lecture.ipynb
├── Group Data Challenge 2025
├── data_challenge.pdf
├── juice.csv
├── new_grocery.csv
├── orange_juice.html
├── orange_juice.ipynb
└── orange_juice.r
├── Individual Home Assignment 2025
├── grading_grid.pdf
└── research_proposal.pdf
├── Literature
├── Athey_2017.pdf
├── Athey_et_al_2019.pdf
├── Belloni_et_al_2012.pdf
├── Belloni_et_al_2014a.pdf
├── Belloni_et_al_2014b.pdf
├── Cagala_et_al_2021.pdf
├── Chernozhukov_et_al_2017.pdf
├── Chetverikov_et_al_2020.pdf
├── Google flu trends.pdf
├── Mullainathan_Spiess_2017.pdf
└── Semenova_Chernozhukov_2020.pdf
├── PC Lab 1
├── help files
│ └── glmnet_package.pdf
├── penalize_regression_tutorial.r
├── penalized_regression_solution.html
├── penalized_regression_solution.ipynb
├── penalized_regression_tutorial.ipynb
├── student-mat-test.Rdata
└── student-mat-train.Rdata
├── PC Lab 2
├── browser-sites.txt
├── browser_2006.csv
├── browser_new.csv
├── help files
│ ├── grf.pdf
│ └── rpart.pdf
├── trees_foests_solution.html
├── trees_foests_solution.ipynb
├── trees_foests_tutorial.ipynb
└── trees_foests_tutorial.r
├── PC Lab 3
├── help files
│ ├── R_ K-Means Clustering.html
│ └── R_ Principal Components Analysis.html
├── rollcall-members.Rdata
├── rollcall-votes.Rdata
├── unsupervised_solution.html
├── unsupervised_solution.ipynb
├── unsupervised_tutorial.ipynb
└── unsupervised_tutorial.r
├── PC Lab 4
├── help files
│ ├── glmnet_package.pdf
│ └── hdm_package.pdf
├── job_corps.csv
├── post_double_selection_solution.html
├── post_double_selection_solution.ipynb
├── post_double_selection_tutorial.ipynb
└── post_double_selection_tutorial.r
├── PC Lab 5
├── double_machine_learning_solution.html
├── double_machine_learning_solution.ipynb
├── double_machine_learning_tutorial.ipynb
├── double_machine_learning_tutorial.r
├── help files
│ ├── glmnet_package.pdf
│ └── grf_package.pdf
└── job_corps.csv
├── PC Lab 6
├── causal_forest.html
├── causal_forest.ipynb
├── causal_forest.r
├── fundraising.csv
└── help files
│ └── grf_package.pdf
├── PC Lab 7
├── fundraising.csv
├── help files
│ ├── grf_package.pdf
│ └── rpart_package.pdf
├── optimal_policy_learning.html
├── optimal_policy_learning.ipynb
└── optimal_policy_learning.r
├── README.md
├── Stata Example
├── ajr_example.do
├── ivlasso.ado
├── lassoutils.ado
├── pdslasso.ado
└── rlasso.ado
└── binder
└── environment.yml
/1_Overview.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/1_Overview.pdf
--------------------------------------------------------------------------------
/2_Regularized_Regression.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/2_Regularized_Regression.pdf
--------------------------------------------------------------------------------
/3_Trees_Forests.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/3_Trees_Forests.pdf
--------------------------------------------------------------------------------
/4_Deep_Learning.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/4_Deep_Learning.pdf
--------------------------------------------------------------------------------
/5_Unsupervised.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/5_Unsupervised.pdf
--------------------------------------------------------------------------------
/6_confounders_with_ML.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/6_confounders_with_ML.pdf
--------------------------------------------------------------------------------
/7_causal_forest.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/7_causal_forest.pdf
--------------------------------------------------------------------------------
/8_optimal_policy.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/8_optimal_policy.pdf
--------------------------------------------------------------------------------
/9_reinforcement_learning.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/9_reinforcement_learning.pdf
--------------------------------------------------------------------------------
/Example Lecture 3/deep_learning_example.R:
--------------------------------------------------------------------------------
1 | ### Lab: Deep Learning
2 |
3 | ## In this version of the Ch10 lab, we use the `luz` package, which interfaces to the
4 | ## `torch` package which in turn links to efficient
5 | ## `C++` code in the LibTorch library.
6 |
7 | ## This version of the lab was produced by Daniel Falbel and Sigrid
8 | ## Keydana, both data scientists at RStudio where these packages were
9 | ## produced.
10 |
11 | ## An advantage over our original `keras` implementation is that this
12 | ## version does not require a separate `python` installation.
13 |
14 | ##########################################
15 | ## Single Layer Network on Hitters Data ##
16 | ##########################################
17 |
18 | ## Load various packages
19 | library(ISLR2)
20 | library(glmnet)
21 | library(torch)
22 | library(luz) # high-level interface for torch
23 | library(torchvision) # for datasets and image transformation
24 | library(torchdatasets) # for datasets we are going to use
25 | library(zeallot)
26 | library(ggplot2)
27 | library(grf)
28 |
29 | ## Loading the dataset
30 | ## We use the example data with baseball player salaries from lecture 4
31 | Gitters <- na.omit(Hitters)
32 | n <- nrow(Gitters)
33 | print(paste("Number of observations:", n))
34 |
35 | ## Define test sample: a random third of the observations
36 | set.seed(13)
37 | ntest <- trunc(n / 3)
38 | testid <- sample(1:n, ntest)
39 |
40 |
41 | #######################
42 | ## Linear Regression ##
43 | #######################
44 | lfit <- lm(Salary ~ ., data = Gitters[-testid, ])
45 | summary(lfit)
46 | lpred <- predict(lfit, Gitters[testid, ])
47 | print(paste("MAE:", mean(abs(Gitters$Salary[testid] - lpred)))) # test-set mean absolute error
48 |
49 | ## Define y and x as matrices (controls are standardized via scale())
50 | x <- scale(model.matrix(Salary ~ . - 1, data = Gitters))
51 | print(paste("Number of controls:", ncol(x)))
52 | y <- Gitters$Salary
53 |
54 | ###########
55 | ## Lasso ##
56 | ###########
57 | cvfit <- cv.glmnet(x[-testid, ], y[-testid], type.measure = "mae")
58 | coef(cvfit) # NOTE(review): coef() uses glmnet's default s ("lambda.1se"), but the prediction below uses "lambda.min" -- confirm this mismatch is intended
59 | cpred <- predict(cvfit, x[testid, ], s = "lambda.min")
60 | print(paste("MAE:",mean(abs(y[testid] - cpred))))
61 |
62 | ###################
63 | ## Random Forest ##
64 | ###################
65 | forest <- regression_forest(x[-testid, ], y[-testid])
66 | fpred <- predict(forest, x[testid, ])
67 | print(paste("MAE:",mean(abs(y[testid] - fpred$prediction)))) # NOTE(review): $prediction appears to rely on R's partial matching of grf's "predictions" column -- verify
68 |
69 | ####################
70 | ## Neural Network ##
71 | ####################
72 |
73 | torch_manual_seed(13)
74 |
75 | # single hidden layer
76 | # 10 hidden units
77 | # ReLU activation function
78 | # dropout layer, in which a random 40% of the 10 activations from the
79 | # previous layer are set to zero during each iteration of the stochastic
80 | # gradient descent algorithm
81 | # One output
82 | # linear output function
83 | modnn <- nn_module(
84 | initialize = function(input_size) {
85 | self$hidden <- nn_linear(input_size, 10)
86 | self$activation <- nn_relu()
87 | self$dropout <- nn_dropout(0.4)
88 | self$output <- nn_linear(10, 1)
89 | },
90 | forward = function(x) {
91 | x %>%
92 | self$hidden() %>%
93 | self$activation() %>%
94 | self$dropout() %>%
95 | self$output()
96 | }
97 | )
98 |
99 | # Specify the loss (MSE), the optimization algorithm (RMSprop),
100 | # and an MAE metric to monitor; set the module's input size
101 | modnn <- modnn %>%
102 | setup(
103 | loss = nn_mse_loss(),
104 | optimizer = optim_rmsprop,
105 | metrics = list(luz_metric_mae())
106 | ) %>%
107 | set_hparams(input_size = ncol(x))
108 |
109 | # Train the neural network for 1500 epochs, using the test set as validation data
110 | fitted <- modnn %>%
111 | fit(
112 | data = list(x[-testid, ], matrix(y[-testid], ncol = 1)),
113 | valid_data = list(x[testid, ], matrix(y[testid], ncol = 1)),
114 | epochs = 1500
115 | )
116 | #plot(fitted)
117 |
118 |
119 | npred <- predict(fitted, x[testid, ])
120 | mean(abs(y[testid] - npred)) # test-set MAE for the neural network
121 |
--------------------------------------------------------------------------------
/Examples lecture 1/Data/used_cars_test.csv:
--------------------------------------------------------------------------------
1 | "","first_price","mileage","age_car_years","diesel","other_car_owner","bmw_320","opel_astra","mercedes_c","vw_golf","vw_passat","pm_green","private_seller","guarantee","inspection","maintenance_cert","co2_em","euro_norm","mile_20","mile_30","mile_40","mile_50","mile_100","mile_150","mileage2","mileage3","mileage4","age_3","age_6","age_car_years2","age_car_years3","age_car_years4","dur_next_ins_0","dur_next_ins_1_2","new_inspection","euro_1","euro_2","euro_3","euro_4","euro_5","euro_6"
2 | "1",25.5,79.85,3.1,1,1,1,0,0,0,0,1,0,0,2,0,124,5,1,1,1,1,0,0,6376.0225,509125.41,40653664,1,0,9.6099997,29.791,92.352097,0,0,1,0,0,0,0,1,0
3 | "3",7.47,142.5,9.6,0,4,1,0,0,0,0,1,0,0,0,0,182,4,1,1,1,1,1,0,20306.25,2893640.5,412343776,1,1,92.160004,884.73602,8493.4658,1,0,0,0,0,0,1,0,0
4 | "10",20.882,76.85,3.3,1,1,0,0,0,0,1,1,0,0,1,1,135,5,1,1,1,1,0,0,5905.9224,453870.16,34879920,1,0,10.89,35.937,118.5921,0,1,0,0,0,0,0,1,0
5 | "12",11.389,143,7.6,1,1,1,0,0,0,0,1,0,0,1,1,131,4,1,1,1,1,1,0,20449,2924207,418161600,1,1,57.759998,438.97601,3336.2175,0,1,0,0,0,0,1,0,0
6 | "15",23.015,47.2,1.4,1,1,0,0,0,0,1,1,0,0,0,0,123,5,1,1,1,0,0,0,2227.8401,105154.05,4963271,0,0,1.96,2.744,3.8415999,1,0,0,0,0,0,0,1,0
7 | "17",25.26,42.495,2.9,1,1,1,0,0,0,0,1,0,0,0,1,123,5,1,1,1,0,0,0,1805.8251,76738.531,3261004,0,0,8.4099998,24.389,70.728104,1,0,0,0,0,0,0,1,0
8 | "18",19.029,29.4,2.3,1,1,0,0,0,0,1,1,0,0,1,1,122,5,1,0,0,0,0,0,864.35999,25412.184,747118.19,0,0,5.29,12.167,27.9841,0,1,0,0,0,0,0,1,0
9 | "24",17.339,54.936,4.4,1,1,0,0,0,1,0,0,0,0,0,1,139,5,1,1,1,1,0,0,3017.9641,165794.88,9108107,1,0,19.360001,85.183998,374.8096,1,0,0,0,0,0,0,1,0
10 | "26",8.96,75.19,7.4,1,4,0,0,0,1,0,1,0,0,1,1,137,4,1,1,1,1,0,0,5653.5361,425089.38,31962470,1,1,54.759998,405.224,2998.6577,0,1,0,0,0,0,1,0,0
11 | "27",13.65,71,4.7,1,2,0,0,0,1,0,1,1,0,0,1,125,5,1,1,1,1,0,0,5041,357911,25411680,1,0,22.09,103.823,487.96811,1,0,0,0,0,0,0,1,0
12 | "28",20.42,127.634,3.2,1,1,0,0,0,0,1,1,0,0,1,1,155,5,1,1,1,1,1,0,16290.438,2079213.8,265378368,1,0,10.24,32.768002,104.8576,0,1,0,0,0,0,0,1,0
13 | "29",15.36,22.931,3.2,0,0,0,1,0,0,0,1,0,0,1,1,144,5,1,0,0,0,0,0,525.83075,12057.825,276498,1,0,10.24,32.768002,104.8576,0,1,0,0,0,0,0,1,0
14 | "34",7,186.3,4.3,1,1,0,1,0,0,0,1,0,0,0,1,134,5,1,1,1,1,1,1,34707.691,6466042.5,1204623744,1,0,18.49,79.507004,341.8801,1,0,0,0,0,0,0,1,0
15 | "36",10.61,94,7.4,1,0,0,0,0,0,1,1,1,0,2,0,149,5,1,1,1,1,0,0,8836,830584,78074896,1,1,54.759998,405.224,2998.6577,0,0,1,0,0,0,0,1,0
16 | "40",6.9,184,8,1,0,0,0,0,1,0,1,1,0,2,0,159,4,1,1,1,1,1,1,33856,6229504,1146228736,1,1,64,512,4096,0,0,1,0,0,0,1,0,0
17 | "41",32.78,44.778,4.8,0,2,1,0,0,0,0,1,0,0,1,1,235,5,1,1,1,0,0,0,2005.0693,89782.992,4020302.8,1,0,23.040001,110.592,530.84161,0,1,0,0,0,0,0,1,0
18 | "42",14.53,69.028,3.6,1,1,0,0,0,1,0,1,0,0,1,1,119,5,1,1,1,1,0,0,4764.8647,328909.09,22703936,1,0,12.96,46.655998,167.96159,0,1,0,0,0,0,0,1,0
19 | "43",24.16,119,1.9,1,1,1,0,0,0,0,1,0,0,1,1,112,5,1,1,1,1,1,0,14161,1685159,200533920,0,0,3.6099999,6.8590002,13.0321,0,1,0,0,0,0,0,1,0
20 | "44",5.669,155.1,12.5,1,3,1,0,0,0,0,0,1,0,0,1,158,3,1,1,1,1,1,1,24056.01,3731087.3,578691648,1,1,156.25,1953.125,24414.063,1,0,0,0,0,1,0,0,0
21 | "45",22.46,54.49,4,1,2,0,0,0,0,1,1,0,0,0,0,151,5,1,1,1,1,0,0,2969.1602,161789.53,8815912,1,0,16,64,256,1,0,0,0,0,0,0,1,0
22 | "46",15.98,111.1,3.1,1,1,0,0,0,0,1,1,0,0,1,1,120,5,1,1,1,1,1,0,12343.21,1371330.6,152354832,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0
23 | "51",10.27,154.907,3.8,1,1,0,1,0,0,0,1,0,0,1,1,148,5,1,1,1,1,1,1,23996.178,3717176,575816576,1,0,14.44,54.872002,208.5136,0,1,0,0,0,0,0,1,0
24 | "52",22.48,69.97,3.2,1,1,1,0,0,0,0,1,0,0,2,0,123,5,1,1,1,1,0,0,4895.8008,342559.19,23968866,1,0,10.24,32.768002,104.8576,0,0,1,0,0,0,0,1,0
25 | "54",29.27,25.86,1.6,1,1,1,0,0,0,0,1,0,0,0,1,119,6,1,0,0,0,0,0,668.73962,17293.605,447212.66,0,0,2.5599999,4.0960002,6.5535998,1,0,0,0,0,0,0,0,1
26 | "56",13.39,62.715,3.5,1,2,0,1,0,0,0,1,0,0,0,0,119,5,1,1,1,1,0,0,3933.1711,246668.83,15469836,1,0,12.25,42.875,150.0625,1,0,0,0,0,0,0,1,0
27 | "60",11.86,75.9,2.2,1,1,0,1,0,0,0,1,0,0,1,1,113,5,1,1,1,1,0,0,5760.8101,437245.47,33186932,0,0,4.8400002,10.648,23.4256,0,1,0,0,0,0,0,1,0
28 | "63",7.2,155,6.3,1,1,0,0,0,1,0,1,0,0,0,1,137,4,1,1,1,1,1,1,24025,3723875,577200640,1,1,39.689999,250.047,1575.2961,1,0,0,0,0,0,1,0,0
29 | "66",5.63,171,10.4,1,0,0,0,0,0,1,1,0,0,1,1,156,4,1,1,1,1,1,1,29241,5000211,855036096,1,1,108.16,1124.864,11698.586,0,1,0,0,0,0,1,0,0
30 | "68",9.57,144,10.1,0,0,1,0,0,0,0,1,0,0,1,1,182,4,1,1,1,1,1,0,20736,2985984,429981696,1,1,102.01,1030.301,10406.04,0,1,0,0,0,0,1,0,0
31 | "71",13.61,20.85,1.4,0,1,0,1,0,0,0,1,0,0,1,1,134,5,1,0,0,0,0,0,434.7225,9063.9639,188983.66,0,0,1.96,2.744,3.8415999,0,1,0,0,0,0,0,1,0
32 | "72",20.119,160,4,1,2,0,0,1,0,0,0,1,0,0,1,128,5,1,1,1,1,1,1,25600,4096000,655360000,1,0,16,64,256,1,0,0,0,0,0,0,1,0
33 | "76",10.71,160.871,3.5,1,1,0,0,0,1,0,1,0,0,1,1,128,5,1,1,1,1,1,1,25879.479,4163257.5,669747392,1,0,12.25,42.875,150.0625,0,1,0,0,0,0,0,1,0
34 | "77",14.68,125,3.4,1,1,0,0,0,0,1,1,0,0,1,1,135,5,1,1,1,1,1,0,15625,1953125,244140624,1,0,11.56,39.304001,133.63361,0,1,0,0,0,0,0,1,0
35 | "80",22.335,18.704,1.2,1,1,0,0,0,0,1,1,0,0,1,1,125,6,0,0,0,0,0,0,349.83963,6543.4004,122387.76,0,0,1.4400001,1.728,2.0736001,0,1,0,0,0,0,0,0,1
36 | "81",8.29,163.44,6.3,1,1,0,0,0,1,0,1,0,0,0,1,122,4,1,1,1,1,1,1,26712.633,4365913,713564800,1,1,39.689999,250.047,1575.2961,1,0,0,0,0,0,1,0,0
37 | "82",17.48,150,3.3,1,1,1,0,0,0,0,0,0,0,2,1,124,5,1,1,1,1,1,1,22500,3375000,506249984,1,0,10.89,35.937,118.5921,0,0,1,0,0,0,0,1,0
38 | "83",15.84,19,1.7,0,0,0,0,0,1,0,1,0,0,0,1,124,5,0,0,0,0,0,0,361,6859,130321,0,0,2.8900001,4.9130001,8.3521004,1,0,0,0,0,0,0,1,0
39 | "86",14.99,91.735,5.1,1,1,0,0,0,0,1,1,0,0,1,1,120,5,1,1,1,1,0,0,8415.3105,771978.5,70817448,1,0,26.01,132.651,676.52008,0,1,0,0,0,0,0,1,0
40 | "88",15.27,109.6,2.4,1,1,0,1,0,0,0,1,0,1,0,1,154,5,1,1,1,1,1,0,12012.16,1316532.8,144291984,0,0,5.7600002,13.824,33.177601,1,0,0,0,0,0,0,1,0
41 | "91",6.09,140,6.7,1,2,0,0,0,1,0,1,0,0,0,1,137,4,1,1,1,1,1,0,19600,2744000,384160000,1,1,44.889999,300.763,2015.1121,1,0,0,0,0,0,1,0,0
42 | "92",29.25,31.988,1.5,1,1,0,0,1,0,0,1,0,0,0,0,128,5,1,1,0,0,0,0,1023.2321,32731.15,1047004,0,0,2.25,3.375,5.0625,1,0,0,0,0,0,0,1,0
43 | "94",11.71,199.95,8,0,2,0,0,1,0,0,1,0,0,0,1,229,4,1,1,1,1,1,1,39980.004,7994001.5,1598400640,1,1,64,512,4096,1,0,0,0,0,0,1,0,0
44 | "97",13.37,112,6.2,1,2,1,0,0,0,0,1,0,0,0,1,142,5,1,1,1,1,1,0,12544,1404928,157351936,1,1,38.439999,238.328,1477.6335,1,0,0,0,0,0,0,1,0
45 | "98",15.359,61.404,5.8,0,1,0,0,0,1,0,1,0,0,1,1,139,5,1,1,1,1,0,0,3770.4512,231520.78,14216302,1,0,33.639999,195.112,1131.6497,0,1,0,0,0,0,0,1,0
46 | "99",16.53,114.5,3.5,1,1,0,0,0,0,1,1,0,0,1,0,125,6,1,1,1,1,1,0,13110.25,1501123.6,171878656,1,0,12.25,42.875,150.0625,0,1,0,0,0,0,0,0,1
47 | "100",6.87,171.914,3.9,1,0,0,1,0,0,0,1,0,0,2,1,117,5,1,1,1,1,1,1,29554.424,5080819,873463936,1,0,15.21,59.319,231.3441,0,0,1,0,0,0,0,1,0
48 | "102",16.91,88.639,3.3,1,0,0,0,0,0,1,1,0,0,0,1,120,5,1,1,1,1,0,0,7856.8726,696425.31,61730444,1,0,10.89,35.937,118.5921,1,0,0,0,0,0,0,1,0
49 | "103",20.35,22.81,3.8,1,0,0,0,0,0,1,1,0,0,2,0,139,5,1,0,0,0,0,0,520.29608,11867.954,270708.03,1,0,14.44,54.872002,208.5136,0,0,1,0,0,0,0,1,0
50 | "104",8.05,120,8.9,1,3,0,0,0,0,1,1,0,0,0,0,177,4,1,1,1,1,1,0,14400,1728000,207360000,1,1,79.209999,704.96899,6274.2241,1,0,0,0,0,0,1,0,0
51 | "107",18.34,25.117,1.4,0,1,0,0,0,1,0,1,0,0,1,1,126,5,1,0,0,0,0,0,630.86371,15845.403,397989,0,0,1.96,2.744,3.8415999,0,1,0,0,0,0,0,1,0
52 | "110",22.72,42.8,1.5,1,1,0,0,0,0,1,1,0,0,1,1,120,5,1,1,1,0,0,0,1831.84,78402.75,3355637.8,0,0,2.25,3.375,5.0625,0,1,0,0,0,0,0,1,0
53 | "113",18.25,150.99,4.1,1,1,1,0,0,0,0,1,0,0,1,0,128,5,1,1,1,1,1,1,22797.98,3442267,519747904,1,0,16.809999,68.920998,282.57611,0,1,0,0,0,0,0,1,0
54 | "114",16.85,57,4.1,1,1,0,0,0,0,1,1,0,0,0,1,123,5,1,1,1,1,0,0,3249,185193,10556001,1,0,16.809999,68.920998,282.57611,1,0,0,0,0,0,0,1,0
55 | "120",35.7,42,1.1,1,1,0,0,1,0,0,1,0,0,1,1,108,6,1,1,1,0,0,0,1764,74088,3111696,0,0,1.21,1.331,1.4641,0,1,0,0,0,0,0,0,1
56 | "121",8.26,108,6.2,0,0,0,0,0,1,0,1,0,0,1,0,170,4,1,1,1,1,1,0,11664,1259712,136048896,1,1,38.439999,238.328,1477.6335,0,1,0,0,0,0,1,0,0
57 | "125",13.31,98,4.1,1,2,0,1,0,0,0,0,1,0,0,1,156,5,1,1,1,1,0,0,9604,941192,92236816,1,0,16.809999,68.920998,282.57611,1,0,0,0,0,0,0,1,0
58 | "126",17.045,134.7,3.9,1,1,0,0,0,0,1,1,0,1,0,1,120,5,1,1,1,1,1,0,18144.09,2444009,329208000,1,0,15.21,59.319,231.3441,1,0,0,0,0,0,0,1,0
59 | "131",14.74,37.703,1.4,0,1,0,1,0,0,0,1,0,0,1,0,139,6,1,1,0,0,0,0,1421.5162,53595.426,2020708.4,0,0,1.96,2.744,3.8415999,0,1,0,0,0,0,0,0,1
60 | "132",7.1,84.78,7,0,1,0,0,0,1,0,1,0,0,1,1,176,4,1,1,1,1,0,0,7187.6484,609368.81,51662288,1,1,49,343,2401,0,1,0,0,0,0,1,0,0
61 | "136",6.63,95.55,5.9,0,1,0,0,0,1,0,1,0,0,0,1,165,5,1,1,1,1,0,0,9129.8027,872352.63,83353296,1,0,34.810001,205.379,1211.7361,1,0,0,0,0,0,0,1,0
62 | "137",30.65,30.456,1.1,1,1,1,0,0,0,0,1,0,0,2,0,123,5,1,1,0,0,0,0,927.56793,28250.01,860382.25,0,0,1.21,1.331,1.4641,0,0,1,0,0,0,0,1,0
63 | "138",6.29,189.285,9.5,1,0,1,0,0,0,0,1,0,0,0,0,158,4,1,1,1,1,1,1,35828.813,6781856.5,1283703680,1,1,90.25,857.375,8145.0625,1,0,0,0,0,0,1,0,0
64 | "139",23.109,31.972,1.2,1,1,1,0,0,0,0,1,0,0,1,1,119,5,1,1,0,0,0,0,1022.2088,32682.059,1044910.8,0,0,1.4400001,1.728,2.0736001,0,1,0,0,0,0,0,1,0
65 | "140",11.56,110.8,4.7,1,1,1,0,0,0,0,1,0,0,1,0,109,5,1,1,1,1,1,0,12276.64,1360251.8,150715888,1,0,22.09,103.823,487.96811,0,1,0,0,0,0,0,1,0
66 | "141",29.06,29,1.6,1,0,1,0,0,0,0,1,0,0,0,1,129,6,1,0,0,0,0,0,841,24389,707281,0,0,2.5599999,4.0960002,6.5535998,1,0,0,0,0,0,0,0,1
67 | "143",16.26,88.315,3.5,1,1,0,0,0,0,1,1,0,0,1,1,123,6,1,1,1,1,0,0,7799.5391,688816.31,60832812,1,0,12.25,42.875,150.0625,0,1,0,0,0,0,0,0,1
68 | "144",21.12,72.656,2.4,1,1,0,0,1,0,0,1,0,0,1,1,124,5,1,1,1,1,0,0,5278.8945,383543.34,27866726,0,0,5.7600002,13.824,33.177601,0,1,0,0,0,0,0,1,0
69 | "145",22.389,30.681,4.4,1,2,1,0,0,0,0,1,0,0,1,1,142,4,1,1,0,0,0,0,941.32379,28880.754,886090.44,1,0,19.360001,85.183998,374.8096,0,1,0,0,0,0,1,0,0
70 | "147",12.65,150.3,3.1,1,1,0,0,0,1,0,1,0,0,1,1,109,5,1,1,1,1,1,1,22590.09,3395290.5,510312160,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0
71 | "148",17.38,131.672,4.2,1,1,0,0,1,0,0,1,0,0,0,1,134,5,1,1,1,1,1,0,17337.516,2282865.3,300589440,1,0,17.639999,74.087997,311.16959,1,0,0,0,0,0,0,1,0
72 | "150",18.34,93.472,4.9,1,2,0,0,0,0,1,1,0,1,1,1,125,5,1,1,1,1,0,0,8737.0146,816666.25,76335424,1,0,24.01,117.649,576.4801,0,1,0,0,0,0,0,1,0
73 | "152",10.75,95.223,4.3,1,1,0,0,0,1,0,1,0,0,1,1,109,5,1,1,1,1,0,0,9067.4199,863426.94,82218104,1,0,18.49,79.507004,341.8801,0,1,0,0,0,0,0,1,0
74 | "153",19.699,55.641,2,1,1,1,0,0,0,0,1,0,0,1,1,119,5,1,1,1,1,0,0,3095.9209,172260.14,9584726,0,0,4,8,16,0,1,0,0,0,0,0,1,0
75 | "154",21.354,69.998,3.2,1,1,0,0,0,0,1,1,0,0,2,1,120,5,1,1,1,1,0,0,4899.7202,342970.59,24007256,1,0,10.24,32.768002,104.8576,0,0,1,0,0,0,0,1,0
76 | "158",20.01,88.112,3.6,1,2,0,0,0,0,1,1,0,0,1,1,123,5,1,1,1,1,0,0,7763.7246,684077.31,60275420,1,0,12.96,46.655998,167.96159,0,1,0,0,0,0,0,1,0
77 | "159",10.97,174,7,1,0,1,0,0,0,0,1,0,0,1,1,150,5,1,1,1,1,1,1,30276,5268024,916636160,1,1,49,343,2401,0,1,0,0,0,0,0,1,0
78 | "160",26.62,59.998,3.2,1,1,1,0,0,0,0,1,0,0,2,1,125,5,1,1,1,1,0,0,3599.76,215978.41,12958272,1,0,10.24,32.768002,104.8576,0,0,1,0,0,0,0,1,0
79 | "161",7.23,173,8.8,0,2,1,0,0,0,0,1,0,0,1,1,182,4,1,1,1,1,1,1,29929,5177717,895745024,1,1,77.440002,681.47198,5996.9536,0,1,0,0,0,0,1,0,0
80 | "165",19.74,72.645,3.3,1,1,0,0,0,0,1,1,0,0,1,1,1,5,1,1,1,1,0,0,5277.2959,383369.16,27849854,1,0,10.89,35.937,118.5921,0,1,0,0,0,0,0,1,0
81 | "166",15.83,66.8,4.5,1,2,0,0,0,0,1,1,0,0,1,1,120,4,1,1,1,1,0,0,4462.2402,298077.63,19911586,1,0,20.25,91.125,410.0625,0,1,0,0,0,0,1,0,0
82 | "169",9.23,128,9.2,1,1,0,0,0,0,1,1,0,0,1,1,158,4,1,1,1,1,1,0,16384,2097152,268435456,1,1,84.639999,778.68799,7163.9297,0,1,0,0,0,0,1,0,0
83 | "170",14.26,128.2,3.4,1,1,0,0,0,0,1,1,0,1,1,1,121,5,1,1,1,1,1,0,16435.24,2106997.8,270117120,1,0,11.56,39.304001,133.63361,0,1,0,0,0,0,0,1,0
84 | "171",15.62,106,4.2,0,1,0,0,0,0,1,1,0,0,1,1,163,5,1,1,1,1,1,0,11236,1191016,126247696,1,0,17.639999,74.087997,311.16959,0,1,0,0,0,0,0,1,0
85 | "172",20.06,21.5,1.5,0,0,0,0,0,1,0,1,0,0,0,0,117,5,1,0,0,0,0,0,462.25,9938.375,213675.06,0,0,2.25,3.375,5.0625,1,0,0,0,0,0,0,1,0
86 | "173",4.79,133.1,7.5,1,2,0,1,0,0,0,1,0,0,0,1,146,4,1,1,1,1,1,0,17715.609,2357947.8,313842848,1,1,56.25,421.875,3164.0625,1,0,0,0,0,0,1,0,0
87 | "175",21.83,89.976,4.6,1,1,0,0,1,0,0,1,0,0,1,1,180,4,1,1,1,1,0,0,8095.6807,728416.94,65540044,1,0,21.16,97.335999,447.74561,0,1,0,0,0,0,1,0,0
88 | "178",14.12,124.471,4.4,1,1,0,0,0,0,1,1,0,0,1,1,120,5,1,1,1,1,1,0,15493.03,1928432.9,240033968,1,0,19.360001,85.183998,374.8096,0,1,0,0,0,0,0,1,0
89 | "180",5.92,132,7.7,1,0,0,0,0,1,0,0,1,0,0,1,137,4,1,1,1,1,1,0,17424,2299968,303595776,1,1,59.290001,456.53299,3515.3042,1,0,0,0,0,0,1,0,0
90 | "181",23.08,61.208,3.2,1,1,1,0,0,0,0,1,0,0,1,1,124,5,1,1,1,1,0,0,3746.4192,229310.83,14035657,1,0,10.24,32.768002,104.8576,0,1,0,0,0,0,0,1,0
91 | "182",10.48,173.51,2.8,1,1,0,1,0,0,0,1,0,0,1,1,129,5,1,1,1,1,1,1,30105.721,5223643.5,906354368,0,0,7.8400002,21.952,61.465599,0,1,0,0,0,0,0,1,0
92 | "183",13.955,163.9,5.5,1,2,0,0,0,0,1,0,1,0,0,1,159,5,1,1,1,1,1,1,26863.211,4402880,721632064,1,0,30.25,166.375,915.0625,1,0,0,0,0,0,0,1,0
93 | "184",11.94,177,4.3,1,2,0,0,0,0,1,1,0,0,0,1,121,5,1,1,1,1,1,1,31329,5545233,981506240,1,0,18.49,79.507004,341.8801,1,0,0,0,0,0,0,1,0
94 | "185",8.09,125,4.2,1,0,0,0,0,1,0,0,0,1,0,1,119,5,1,1,1,1,1,0,15625,1953125,244140624,1,0,17.639999,74.087997,311.16959,1,0,0,0,0,0,0,1,0
95 | "186",10.61,182.301,3.9,1,1,1,0,0,0,0,1,0,0,1,0,120,5,1,1,1,1,1,1,33233.656,6058528.5,1104475776,1,0,15.21,59.319,231.3441,0,1,0,0,0,0,0,1,0
96 | "187",18.7,164.618,2.9,1,1,0,0,0,0,1,1,0,1,1,1,125,5,1,1,1,1,1,1,27099.086,4460997.5,734360448,0,0,8.4099998,24.389,70.728104,0,1,0,0,0,0,0,1,0
97 | "188",10.91,91.54,4.8,0,2,0,1,0,0,0,1,0,0,0,1,144,5,1,1,1,1,0,0,8379.5713,767066,70217224,1,0,23.040001,110.592,530.84161,1,0,0,0,0,0,0,1,0
98 | "192",29.081,63.169,2.9,1,3,1,0,0,0,0,1,0,0,0,0,123,5,1,1,1,1,0,0,3990.3225,252064.69,15922674,0,0,8.4099998,24.389,70.728104,1,0,0,0,0,0,0,1,0
99 | "195",15.199,39,1.2,1,1,0,1,0,0,0,1,0,0,2,1,104,5,1,1,0,0,0,0,1521,59319,2313441,0,0,1.4400001,1.728,2.0736001,0,0,1,0,0,0,0,1,0
100 | "196",12.77,98,4.1,1,1,0,1,0,0,0,1,0,0,0,1,134,5,1,1,1,1,0,0,9604,941192,92236816,1,0,16.809999,68.920998,282.57611,1,0,0,0,0,0,0,1,0
101 | "199",16,14.104,1.2,0,1,0,0,0,1,0,1,0,0,1,1,119,5,0,0,0,0,0,0,198.92282,2805.6074,39570.285,0,0,1.4400001,1.728,2.0736001,0,1,0,0,0,0,0,1,0
102 | "200",17.08,147,4.2,1,1,1,0,0,0,0,1,0,0,1,1,140,5,1,1,1,1,1,0,21609,3176523,466948896,1,0,17.639999,74.087997,311.16959,0,1,0,0,0,0,0,1,0
103 |
--------------------------------------------------------------------------------
/Examples lecture 1/Data/used_cars_train.csv:
--------------------------------------------------------------------------------
1 | "","first_price","mileage","age_car_years","diesel","other_car_owner","bmw_320","opel_astra","mercedes_c","vw_golf","vw_passat","pm_green","private_seller","guarantee","inspection","maintenance_cert","co2_em","euro_norm","mile_20","mile_30","mile_40","mile_50","mile_100","mile_150","mileage2","mileage3","mileage4","age_3","age_6","age_car_years2","age_car_years3","age_car_years4","dur_next_ins_0","dur_next_ins_1_2","new_inspection","euro_1","euro_2","euro_3","euro_4","euro_5","euro_6"
2 | "2",21.91,77.1,3.7,1,1,0,0,0,0,1,1,0,0,0,1,136,5,1,1,1,1,0,0,5944.4102,458314,35336012,1,0,13.69,50.653,187.41611,1,0,0,0,0,0,0,1,0
3 | "4",14.58,45.45,5,0,2,0,0,0,0,1,1,0,0,1,1,145,5,1,1,1,0,0,0,2065.7024,93886.18,4267127,1,0,25,125,625,0,1,0,0,0,0,0,1,0
4 | "5",17.98,183.5,3.6,1,1,1,0,0,0,0,1,0,0,1,1,124,5,1,1,1,1,1,1,33672.25,6178858,1133820416,1,0,12.96,46.655998,167.96159,0,1,0,0,0,0,0,1,0
5 | "6",19.03,74.85,3.1,1,1,0,0,0,0,1,1,0,0,1,1,125,5,1,1,1,1,0,0,5602.5225,419348.81,31388258,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0
6 | "7",10.969,174,6.8,1,1,0,0,1,0,0,1,0,0,1,1,154,5,1,1,1,1,1,1,30276,5268024,916636160,1,1,46.240002,314.43201,2138.1377,0,1,0,0,0,0,0,1,0
7 | "8",24.11,51.001,2.3,1,2,1,0,0,0,0,1,0,0,0,0,123,5,1,1,1,1,0,0,2601.1021,132658.8,6765731.5,0,0,5.29,12.167,27.9841,1,0,0,0,0,0,0,1,0
8 | "9",13.26,62,2.6,1,2,0,0,0,1,0,1,0,0,0,0,119,5,1,1,1,1,0,0,3844,238328,14776336,0,0,6.7600002,17.576,45.697601,1,0,0,0,0,0,0,1,0
9 | "11",23.2,16.901,1.4,1,1,1,0,0,0,0,1,0,0,0,1,119,5,0,0,0,0,0,0,285.6438,4827.666,81592.383,0,0,1.96,2.744,3.8415999,1,0,0,0,0,0,0,1,0
10 | "13",13.65,119.636,4.2,1,0,0,0,0,0,1,0,0,0,2,0,123,5,1,1,1,1,1,0,14312.772,1712322.9,204855456,1,0,17.639999,74.087997,311.16959,0,0,1,0,0,0,0,1,0
11 | "14",11.74,83,7.4,1,2,1,0,0,0,0,0,1,0,0,1,120,5,1,1,1,1,0,0,6889,571787,47458320,1,1,54.759998,405.224,2998.6577,1,0,0,0,0,0,0,1,0
12 | "16",12.07,46.36,7.1,0,1,1,0,0,0,0,1,0,0,1,1,147,4,1,1,1,0,0,0,2149.2495,99639.211,4619274,1,1,50.41,357.91101,2541.1682,0,1,0,0,0,0,1,0,0
13 | "19",16.79,18.4,1.1,1,1,0,0,0,1,0,1,0,1,1,1,92,6,0,0,0,0,0,0,338.56,6229.5039,114622.88,0,0,1.21,1.331,1.4641,0,1,0,0,0,0,0,0,1
14 | "20",8.18,110.375,9.4,1,2,0,0,0,0,1,1,0,0,0,1,156,4,1,1,1,1,1,0,12182.641,1344659,148416736,1,1,88.360001,830.58398,7807.4897,1,0,0,0,0,0,1,0,0
15 | "21",5.43,151,13,0,0,1,0,0,0,0,1,0,0,1,0,199,4,1,1,1,1,1,1,22801,3442951,519885600,1,1,169,2197,28561,0,1,0,0,0,0,1,0,0
16 | "22",16.719,94.435,3.3,1,1,0,0,0,0,1,1,0,0,2,1,123,5,1,1,1,1,0,0,8917.9688,842168.44,79530176,1,0,10.89,35.937,118.5921,0,0,1,0,0,0,0,1,0
17 | "23",22.42,84.89,4.3,1,1,0,0,1,0,0,1,0,0,0,1,124,5,1,1,1,1,0,0,7206.312,611743.81,51930936,1,0,18.49,79.507004,341.8801,1,0,0,0,0,0,0,1,0
18 | "25",8.82,83,4,1,0,0,1,0,0,0,1,0,0,2,1,119,5,1,1,1,1,0,0,6889,571787,47458320,1,0,16,64,256,0,0,1,0,0,0,0,1,0
19 | "30",11.24,95.2,2.9,1,1,0,1,0,0,0,1,0,1,1,1,120,5,1,1,1,1,0,0,9063.04,862801.44,82138696,0,0,8.4099998,24.389,70.728104,0,1,0,0,0,0,0,1,0
20 | "31",14.54,85.606,4.8,1,0,0,0,1,0,0,0,0,0,2,0,130,5,1,1,1,1,0,0,7328.3872,627353.94,53705260,1,0,23.040001,110.592,530.84161,0,0,1,0,0,0,0,1,0
21 | "32",29.78,69.89,3.4,1,0,1,0,0,0,0,1,0,0,0,1,123,5,1,1,1,1,0,0,4884.6123,341385.53,23859436,1,0,11.56,39.304001,133.63361,1,0,0,0,0,0,0,1,0
22 | "33",22.43,22.208,1,0,1,0,0,0,0,1,1,0,0,1,1,119,6,1,0,0,0,0,0,493.19525,10952.881,243241.56,0,0,1,1,1,0,1,0,0,0,0,0,0,1
23 | "35",9.27,97,3.5,1,1,0,1,0,0,0,1,0,1,0,1,120,5,1,1,1,1,0,0,9409,912673,88529280,1,0,12.25,42.875,150.0625,1,0,0,0,0,0,0,1,0
24 | "37",10.33,93.1,5.4,0,1,0,0,0,0,1,1,0,0,0,1,158,5,1,1,1,1,0,0,8667.6104,806954.5,75127464,1,0,29.16,157.464,850.3056,1,0,0,0,0,0,0,1,0
25 | "38",17.71,92.568,3.1,1,1,0,0,0,0,1,1,0,1,1,1,120,5,1,1,1,1,0,0,8568.835,793199.88,73424928,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0
26 | "39",10.9,129.781,5,1,0,0,0,0,0,1,0,0,0,2,0,120,5,1,1,1,1,1,0,16843.107,2185915.5,283690272,1,0,25,125,625,0,0,1,0,0,0,0,1,0
27 | "47",6.71,98.82,12,0,2,1,0,0,0,0,1,0,0,1,1,185,4,1,1,1,1,0,0,9765.3926,965016.06,95362888,1,1,144,1728,20736,0,1,0,0,0,0,1,0,0
28 | "48",21.8,76.5,4.6,1,2,0,0,1,0,0,1,0,1,1,1,136,5,1,1,1,1,0,0,5852.25,447697.13,34248832,1,0,21.16,97.335999,447.74561,0,1,0,0,0,0,0,1,0
29 | "49",20.24,17,1.3,1,1,0,0,0,1,0,1,0,0,2,0,119,5,0,0,0,0,0,0,289,4913,83521,0,0,1.6900001,2.197,2.8561001,0,0,1,0,0,0,0,1,0
30 | "50",13.26,126.248,3.1,1,1,0,0,0,1,0,1,0,0,1,1,109,5,1,1,1,1,1,0,15938.558,2012211,254037616,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0
31 | "53",6.49,159,7.3,1,2,0,0,0,1,0,1,0,0,0,1,122,4,1,1,1,1,1,1,25281,4019679,639128960,1,1,53.290001,389.017,2839.8242,1,0,0,0,0,0,1,0,0
32 | "55",24.01,96,3.1,1,0,1,0,0,0,0,1,0,0,2,0,119,5,1,1,1,1,0,0,9216,884736,84934656,1,0,9.6099997,29.791,92.352097,0,0,1,0,0,0,0,1,0
33 | "57",33.89,25.575,1.4,1,1,0,0,0,0,1,1,0,0,0,1,156,5,1,0,0,0,0,0,654.08063,16728.111,427821.47,0,0,1.96,2.744,3.8415999,1,0,0,0,0,0,0,1,0
34 | "58",18.85,81.804,2.6,1,1,0,0,0,0,1,1,0,0,1,1,113,5,1,1,1,1,0,0,6691.8945,547423.75,44781452,0,0,6.7600002,17.576,45.697601,0,1,0,0,0,0,0,1,0
35 | "59",6.55,65.24,10.1,0,1,1,0,0,0,0,1,0,0,0,1,182,4,1,1,1,1,0,0,4256.2578,277678.25,18115728,1,1,102.01,1030.301,10406.04,1,0,0,0,0,0,1,0,0
36 | "61",9.72,200,7.2,1,1,1,0,0,0,0,1,0,0,1,1,150,5,1,1,1,1,1,1,40000,8e+06,1.6e+09,1,1,51.84,373.24799,2687.3855,0,1,0,0,0,0,0,1,0
37 | "62",16.44,166,3.3,1,1,0,0,1,0,0,1,0,0,0,1,124,5,1,1,1,1,1,1,27556,4574296,759333120,1,0,10.89,35.937,118.5921,1,0,0,0,0,0,0,1,0
38 | "64",15.2,70,4.4,1,1,1,0,0,0,0,1,0,0,2,1,142,5,1,1,1,1,0,0,4900,343000,24010000,1,0,19.360001,85.183998,374.8096,0,0,1,0,0,0,0,1,0
39 | "65",14.04,51.5,4.9,0,2,0,0,1,0,0,1,0,0,1,1,164,5,1,1,1,1,0,0,2652.25,136590.88,7034430,1,0,24.01,117.649,576.4801,0,1,0,0,0,0,0,1,0
40 | "67",33.9,16.994,1.6,1,2,0,0,1,0,0,1,0,0,1,1,171,6,0,0,0,0,0,0,288.79605,4907.7998,83403.148,0,0,2.5599999,4.0960002,6.5535998,0,1,0,0,0,0,0,0,1
41 | "69",25.35,23.33,1,1,1,0,0,0,0,1,1,0,0,1,0,119,6,1,0,0,0,0,0,544.28888,12698.26,296250.41,0,0,1,1,1,0,1,0,0,0,0,0,0,1
42 | "70",9.049,199.98,7.6,1,1,0,0,0,0,1,1,0,0,0,1,189,4,1,1,1,1,1,1,39992,7997600,1599360128,1,1,57.759998,438.97601,3336.2175,1,0,0,0,0,0,1,0,0
43 | "73",12.68,143.4,7.7,1,0,0,0,1,0,0,1,0,0,0,1,161,4,1,1,1,1,1,0,20563.561,2948814.5,422860000,1,1,59.290001,456.53299,3515.3042,1,0,0,0,0,0,1,0,0
44 | "74",22.38,111.326,3.4,1,1,0,0,1,0,0,1,0,1,2,0,136,5,1,1,1,1,1,0,12393.479,1379716.4,153598304,1,0,11.56,39.304001,133.63361,0,0,1,0,0,0,0,1,0
45 | "75",5.5,129.651,7.7,0,2,0,1,0,0,0,1,0,0,1,1,163,4,1,1,1,1,1,0,16809.381,2179353.3,282555328,1,1,59.290001,456.53299,3515.3042,0,1,0,0,0,0,1,0,0
46 | "78",9.43,139.9,3.8,1,1,0,0,0,1,0,1,0,1,0,1,109,5,1,1,1,1,1,0,19572.01,2738124.3,383063584,1,0,14.44,54.872002,208.5136,1,0,0,0,0,0,0,1,0
47 | "79",14.82,189.2,3.8,1,1,0,0,1,0,0,1,0,0,0,1,133,5,1,1,1,1,1,1,35796.641,6772724.5,1281399424,1,0,14.44,54.872002,208.5136,1,0,0,0,0,0,0,1,0
48 | "84",19.59,24.976,1,0,1,0,0,0,1,0,1,0,0,1,0,126,6,1,0,0,0,0,0,623.8006,15580.043,389127.16,0,0,1,1,1,0,1,0,0,0,0,0,0,1
49 | "85",8.02,149,10,0,0,0,0,0,0,1,1,0,0,2,1,214,4,1,1,1,1,1,0,22201,3307949,492884416,1,1,100,1000,10000,0,0,1,0,0,0,1,0,0
50 | "87",13.17,65.77,5.1,1,3,0,0,0,1,0,1,0,0,1,1,109,5,1,1,1,1,0,0,4325.6929,284500.81,18711620,1,0,26.01,132.651,676.52008,0,1,0,0,0,0,0,1,0
51 | "89",16.85,13.55,1.7,0,1,0,1,0,0,0,0,0,0,2,0,137,5,0,0,0,0,0,0,183.60249,2487.814,33709.879,0,0,2.8900001,4.9130001,8.3521004,0,0,1,0,0,0,0,1,0
52 | "90",13.47,106.5,5.4,1,2,0,0,0,0,1,1,0,0,1,0,170,5,1,1,1,1,1,0,11342.25,1207949.6,128646632,1,0,29.16,157.464,850.3056,0,1,0,0,0,0,0,1,0
53 | "93",9.92,180,4.1,1,1,0,0,0,0,1,1,0,0,0,1,116,5,1,1,1,1,1,1,32400,5832000,1049760000,1,0,16.809999,68.920998,282.57611,1,0,0,0,0,0,0,1,0
54 | "95",13.12,100.898,8,1,1,0,0,0,0,1,1,0,0,1,1,177,4,1,1,1,1,1,0,10180.406,1027182.6,103640672,1,1,64,512,4096,0,1,0,0,0,0,1,0,0
55 | "96",21.91,21.336,1,1,1,0,0,0,1,0,1,0,1,1,1,117,6,1,0,0,0,0,0,455.22488,9712.6787,207229.7,0,0,1,1,1,0,1,0,0,0,0,0,0,1
56 | "101",18.62,108.697,3.1,1,1,0,0,0,0,1,1,0,0,1,1,120,5,1,1,1,1,1,0,11815.038,1284259.1,139595120,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0
57 | "105",17.15,89.414,3.1,1,2,0,0,0,0,1,1,0,1,1,1,135,5,1,1,1,1,0,0,7994.8633,714852.69,63917840,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0
58 | "106",24.36,96.35,3.3,1,1,0,0,1,0,0,1,0,0,1,1,128,5,1,1,1,1,0,0,9283.3223,894448.13,86180080,1,0,10.89,35.937,118.5921,0,1,0,0,0,0,0,1,0
59 | "108",12.059,123.862,8.7,0,0,1,0,0,0,0,1,0,0,0,1,182,4,1,1,1,1,1,0,15341.795,1900265.4,235370672,1,1,75.690002,658.50299,5728.9761,1,0,0,0,0,0,1,0,0
60 | "109",13.27,118.955,3.2,1,2,0,0,0,1,0,1,0,0,0,1,148,5,1,1,1,1,1,0,14150.292,1683248,200230768,1,0,10.24,32.768002,104.8576,1,0,0,0,0,0,0,1,0
61 | "111",15.319,125,2.4,1,1,0,0,0,0,1,1,0,0,0,1,135,5,1,1,1,1,1,0,15625,1953125,244140624,0,0,5.7600002,13.824,33.177601,1,0,0,0,0,0,0,1,0
62 | "112",17.34,112.601,3.2,1,1,0,0,1,0,0,1,0,0,1,1,127,5,1,1,1,1,1,0,12678.985,1427666.4,160756672,1,0,10.24,32.768002,104.8576,0,1,0,0,0,0,0,1,0
63 | "115",21.06,18.55,1,1,1,0,0,0,0,1,1,0,0,2,1,120,5,0,0,0,0,0,0,344.10251,6383.1016,118406.53,0,0,1,1,1,0,0,1,0,0,0,0,1,0
64 | "116",9.3,99.6,7.6,1,2,0,0,0,1,0,1,1,0,1,1,137,4,1,1,1,1,0,0,9920.1602,988047.94,98409576,1,1,57.759998,438.97601,3336.2175,0,1,0,0,0,0,1,0,0
65 | "117",9.62,56.979,7.4,1,1,0,0,0,1,0,1,0,0,1,1,137,4,1,1,1,1,0,0,3246.6064,184988.39,10540453,1,1,54.759998,405.224,2998.6577,0,1,0,0,0,0,1,0,0
66 | "118",10.889,189.3,3,1,0,0,0,0,0,1,1,0,0,2,1,121,5,1,1,1,1,1,1,35834.488,6783469,1284110720,1,0,9,27,81,0,0,1,0,0,0,0,1,0
67 | "119",18.21,78.95,3.2,1,0,0,0,0,0,1,1,0,0,2,0,122,5,1,1,1,1,0,0,6233.1025,492103.44,38851568,1,0,10.24,32.768002,104.8576,0,0,1,0,0,0,0,1,0
68 | "122",9.87,181,4.8,1,2,0,0,0,1,0,1,1,0,0,0,139,5,1,1,1,1,1,1,32761,5929741,1073283136,1,0,23.040001,110.592,530.84161,1,0,0,0,0,0,0,1,0
69 | "123",11.915,111.35,4.1,1,1,0,0,0,1,0,1,0,0,1,1,128,5,1,1,1,1,1,0,12398.822,1380608.9,153730800,1,0,16.809999,68.920998,282.57611,0,1,0,0,0,0,0,1,0
70 | "124",6.268,183,4.6,1,2,0,1,0,0,0,1,0,0,0,1,119,5,1,1,1,1,1,1,33489,6128487,1121513088,1,0,21.16,97.335999,447.74561,1,0,0,0,0,0,0,1,0
71 | "127",23.79,24.422,1.2,1,1,0,0,0,0,1,1,0,0,1,1,135,5,1,0,0,0,0,0,596.43408,14566.113,355733.63,0,0,1.4400001,1.728,2.0736001,0,1,0,0,0,0,0,1,0
72 | "128",12.02,89.498,3.2,1,1,0,0,0,1,0,1,0,0,1,1,125,5,1,1,1,1,0,0,8009.8921,716869.31,64158368,1,0,10.24,32.768002,104.8576,0,1,0,0,0,0,0,1,0
73 | "129",9.74,185,9,0,4,1,0,0,0,0,1,0,0,0,1,196,4,1,1,1,1,1,1,34225,6331625,1171350656,1,1,81,729,6561,1,0,0,0,0,0,1,0,0
74 | "130",5.109,105.098,7.4,1,0,0,1,0,0,0,1,0,0,1,0,149,4,1,1,1,1,1,0,11045.59,1160869.4,122005048,1,1,54.759998,405.224,2998.6577,0,1,0,0,0,0,1,0,0
75 | "133",16.38,87.317,5.1,0,1,1,0,0,0,0,1,0,0,2,1,159,5,1,1,1,1,0,0,7624.2583,665727.38,58129316,1,0,26.01,132.651,676.52008,0,0,1,0,0,0,0,1,0
76 | "134",13.26,173.136,7.8,0,1,0,0,0,0,1,1,0,1,0,1,204,4,1,1,1,1,1,1,29976.074,5189937.5,898565056,1,1,60.84,474.552,3701.5056,1,0,0,0,0,0,1,0,0
77 | "135",26.83,50,3,1,1,1,0,0,0,0,1,0,1,1,1,124,5,1,1,1,1,0,0,2500,125000,6250000,1,0,9,27,81,0,1,0,0,0,0,0,1,0
78 | "142",6.91,149.8,8.9,1,2,0,1,0,0,0,1,0,1,0,0,159,4,1,1,1,1,1,0,22440.039,3361518,503555392,1,1,79.209999,704.96899,6274.2241,1,0,0,0,0,0,1,0,0
79 | "146",15.4,87.385,4.3,1,2,0,0,0,0,1,1,0,0,1,1,116,5,1,1,1,1,0,0,7636.1382,667283.94,58310608,1,0,18.49,79.507004,341.8801,0,1,0,0,0,0,0,1,0
80 | "149",21.07,26.086,1,0,1,0,0,0,1,0,1,0,0,1,1,121,6,1,0,0,0,0,0,680.47937,17750.986,463052.22,0,0,1,1,1,0,1,0,0,0,0,0,0,1
81 | "151",15.93,106.02,4.3,1,1,0,0,1,0,0,1,0,0,2,1,133,5,1,1,1,1,1,0,11240.24,1191690.3,126343008,1,0,18.49,79.507004,341.8801,0,0,1,0,0,0,0,1,0
82 | "155",12.75,96,8.1,0,2,1,0,0,0,0,1,0,0,1,1,194,4,1,1,1,1,0,0,9216,884736,84934656,1,1,65.610001,531.44098,4304.6719,0,1,0,0,0,0,1,0,0
83 | "156",13.42,102.648,4.6,1,2,0,0,0,1,0,1,0,1,1,1,121,5,1,1,1,1,1,0,10536.612,1081562.1,111020192,1,0,21.16,97.335999,447.74561,0,1,0,0,0,0,0,1,0
84 | "157",1.2,158.2,17.6,0,0,0,0,0,0,1,1,0,0,2,0,216,3,1,1,1,1,1,1,25027.24,3959309.3,626362752,1,1,309.76001,5451.7759,95951.258,0,0,1,0,0,1,0,0,0
85 | "162",19.49,102.943,3.7,1,1,0,0,0,0,1,0,0,0,2,0,125,5,1,1,1,1,1,0,10597.262,1090913.9,112301944,1,0,13.69,50.653,187.41611,0,0,1,0,0,0,0,1,0
86 | "163",13.249,94.41,3.1,1,1,0,0,0,1,0,1,0,0,1,1,128,5,1,1,1,1,0,0,8913.248,841499.75,79445992,1,0,9.6099997,29.791,92.352097,0,1,0,0,0,0,0,1,0
87 | "164",34.149,20.217,1.2,1,1,0,0,0,0,1,1,0,1,1,1,119,6,1,0,0,0,0,0,408.72708,8263.2354,167057.83,0,0,1.4400001,1.728,2.0736001,0,1,0,0,0,0,0,0,1
88 | "167",25.78,33.235,2.7,1,1,1,0,0,0,0,1,0,0,0,1,124,5,1,1,0,0,0,0,1104.5652,36710.227,1220064.4,0,0,7.29,19.683001,53.1441,1,0,0,0,0,0,0,1,0
89 | "168",3.94,159,8.7,1,2,0,0,0,0,1,0,1,0,2,0,177,4,1,1,1,1,1,1,25281,4019679,639128960,1,1,75.690002,658.50299,5728.9761,0,0,1,0,0,0,1,0,0
90 | "174",17.46,119.95,4.2,1,1,0,0,0,0,1,1,0,0,1,1,139,5,1,1,1,1,1,0,14388.003,1725840.9,207014608,1,0,17.639999,74.087997,311.16959,0,1,0,0,0,0,0,1,0
91 | "176",29.499,18.238,1.1,1,1,0,0,1,0,0,1,0,1,1,0,108,6,0,0,0,0,0,0,332.62463,6066.4082,110639.16,0,0,1.21,1.331,1.4641,0,1,0,0,0,0,0,0,1
92 | "177",13.91,55.6,3.7,1,1,0,0,0,0,1,1,0,0,0,1,116,5,1,1,1,1,0,0,3091.3601,171879.61,9556507,1,0,13.69,50.653,187.41611,1,0,0,0,0,0,0,1,0
93 | "179",19.67,77,3.7,1,1,0,0,0,0,1,0,1,0,0,1,136,5,1,1,1,1,0,0,5929,456533,35153040,1,0,13.69,50.653,187.41611,1,0,0,0,0,0,0,1,0
94 | "189",8.5,41.326,5.4,1,1,0,1,0,0,0,1,0,0,0,1,149,5,1,1,1,0,0,0,1707.8383,70578.125,2916711.5,1,0,29.16,157.464,850.3056,1,0,0,0,0,0,0,1,0
95 | "190",21.73,53.71,7.7,0,0,0,0,1,0,0,1,0,1,0,1,235,4,1,1,1,1,0,0,2884.7642,154940.69,8321864,1,1,59.290001,456.53299,3515.3042,1,0,0,0,0,0,1,0,0
96 | "191",17.52,49.308,2.8,1,1,1,0,0,0,0,1,0,0,0,1,119,5,1,1,1,0,0,0,2431.2788,119881.5,5911117,0,0,7.8400002,21.952,61.465599,1,0,0,0,0,0,0,1,0
97 | "193",16.85,150,3.4,1,1,1,0,0,0,0,0,0,0,2,1,124,5,1,1,1,1,1,1,22500,3375000,506249984,1,0,11.56,39.304001,133.63361,0,0,1,0,0,0,0,1,0
98 | "194",9.72,123,4.2,1,1,0,0,0,1,0,1,0,0,0,1,109,5,1,1,1,1,1,0,15129,1860867,228886640,1,0,17.639999,74.087997,311.16959,1,0,0,0,0,0,0,1,0
99 | "197",27.68,20.489,1,1,1,1,0,0,0,0,1,0,1,1,0,127,5,1,0,0,0,0,0,419.79913,8601.2646,176231.3,0,0,1,1,1,0,1,0,0,0,0,0,1,0
100 | "198",9.49,175.1,7.8,1,2,1,0,0,0,0,1,1,0,0,1,146,4,1,1,1,1,1,1,30660.01,5368568,940036224,1,1,60.84,474.552,3701.5056,1,0,0,0,0,0,1,0,0
101 |
--------------------------------------------------------------------------------
/Group Data Challenge 2025/data_challenge.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Group Data Challenge 2025/data_challenge.pdf
--------------------------------------------------------------------------------
/Group Data Challenge 2025/orange_juice.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Wholesale Manager"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "**Author:**\n",
15 | "[Anthony Strittmatter](http://www.anthonystrittmatter.com)"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {},
21 | "source": [
22 | "You manage a wholesale store. The data file juice.csv contains orange juice sales (sales) and prices (price) of the different grocery stores that you deliver to. Your product range contains three different orange juice brands: Tropicana, Minute Maid, and Dominicks. Some stores advertise/feature specific orange juice brands, which is indicated by the dummy variable feat. The data also contains the store ID (id). You will deliver to new grocery stores. The new stores sent you the file new_grocery.csv, which\n",
23 | "contains the planned prices and advertisements for the different brands. Your job as wholesale manager is to predict the sales of the new grocery stores and deliver the right amount of orange juice."
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "## Load Packages and Data"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "name": "stdout",
40 | "output_type": "stream",
41 | "text": [
42 | "[1] \"Packages and data successfully loaded.\"\n"
43 | ]
44 | }
45 | ],
46 | "source": [
47 | "######################## Load Packages and Data ########################\n",
48 | "\n",
49 | "# Load packages\n",
50 | "library(rpart)\n",
51 | "library(rpart.plot)\n",
52 | "library(grf)\n",
53 | "library(glmnet)\n",
54 | "\n",
55 | "# Load data\n",
56 | "juice <- read.csv(\"juice.csv\", sep = \",\")\n",
57 | "new_grocery <- read.csv(\"new_grocery.csv\", sep = \",\")\n",
58 | "\n",
59 | "print('Packages and data successfully loaded.')\n",
60 | "\n",
61 | "#############################################################################"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "## Inspect Data"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 2,
74 | "metadata": {},
75 | "outputs": [
76 | {
77 | "data": {
78 | "text/html": [
79 | "
\n",
80 | "X id sales price brand feat \n",
81 | "\n",
82 | "\t1 1140 11970 2.47 minute.maid 0 \n",
83 | "\t3 7182 30205 1.57 dominicks 1 \n",
84 | "\t4 1741 3521 2.55 minute.maid 0 \n",
85 | "\t5 1725 11777 1.41 dominicks 0 \n",
86 | "\t6 7565 129151 2.05 minute.maid 1 \n",
87 | "\t8 5617 7104 3.74 tropicana 0 \n",
88 | " \n",
89 | "
\n"
90 | ],
91 | "text/latex": [
92 | "\\begin{tabular}{r|llllll}\n",
93 | " X & id & sales & price & brand & feat\\\\\n",
94 | "\\hline\n",
95 | "\t 1 & 1140 & 11970 & 2.47 & minute.maid & 0 \\\\\n",
96 | "\t 3 & 7182 & 30205 & 1.57 & dominicks & 1 \\\\\n",
97 | "\t 4 & 1741 & 3521 & 2.55 & minute.maid & 0 \\\\\n",
98 | "\t 5 & 1725 & 11777 & 1.41 & dominicks & 0 \\\\\n",
99 | "\t 6 & 7565 & 129151 & 2.05 & minute.maid & 1 \\\\\n",
100 | "\t 8 & 5617 & 7104 & 3.74 & tropicana & 0 \\\\\n",
101 | "\\end{tabular}\n"
102 | ],
103 | "text/markdown": [
104 | "\n",
105 | "| X | id | sales | price | brand | feat |\n",
106 | "|---|---|---|---|---|---|\n",
107 | "| 1 | 1140 | 11970 | 2.47 | minute.maid | 0 |\n",
108 | "| 3 | 7182 | 30205 | 1.57 | dominicks | 1 |\n",
109 | "| 4 | 1741 | 3521 | 2.55 | minute.maid | 0 |\n",
110 | "| 5 | 1725 | 11777 | 1.41 | dominicks | 0 |\n",
111 | "| 6 | 7565 | 129151 | 2.05 | minute.maid | 1 |\n",
112 | "| 8 | 5617 | 7104 | 3.74 | tropicana | 0 |\n",
113 | "\n"
114 | ],
115 | "text/plain": [
116 | " X id sales price brand feat\n",
117 | "1 1 1140 11970 2.47 minute.maid 0 \n",
118 | "2 3 7182 30205 1.57 dominicks 1 \n",
119 | "3 4 1741 3521 2.55 minute.maid 0 \n",
120 | "4 5 1725 11777 1.41 dominicks 0 \n",
121 | "5 6 7565 129151 2.05 minute.maid 1 \n",
122 | "6 8 5617 7104 3.74 tropicana 0 "
123 | ]
124 | },
125 | "metadata": {},
126 | "output_type": "display_data"
127 | },
128 | {
129 | "name": "stdout",
130 | "output_type": "stream",
131 | "text": [
132 | "[1] \"Old data: 9685 observations\"\n"
133 | ]
134 | }
135 | ],
136 | "source": [
137 | "######################## Describe Old Data ########################\n",
138 | "\n",
139 | "# Print first few rows of old data\n",
140 | "head(juice)\n",
141 | "\n",
142 | "# Number of observations\n",
143 | "print(paste0('Old data: ',nrow(juice),' observations'))\n",
144 | "\n",
145 | "######################################################################"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 3,
151 | "metadata": {},
152 | "outputs": [
153 | {
154 | "data": {
155 | "text/html": [
156 | "\n",
157 | "X id price brand feat \n",
158 | "\n",
159 | "\t 2 10171 1.81 dominicks 1 \n",
160 | "\t 7 7489 NA tropicana 0 \n",
161 | "\t10 7559 3.29 tropicana 0 \n",
162 | "\t11 1236 1.77 minute.maid 1 \n",
163 | "\t16 5361 1.53 dominicks 0 \n",
164 | "\t17 108 1.42 dominicks 0 \n",
165 | " \n",
166 | "
\n"
167 | ],
168 | "text/latex": [
169 | "\\begin{tabular}{r|lllll}\n",
170 | " X & id & price & brand & feat\\\\\n",
171 | "\\hline\n",
172 | "\t 2 & 10171 & 1.81 & dominicks & 1 \\\\\n",
173 | "\t 7 & 7489 & NA & tropicana & 0 \\\\\n",
174 | "\t 10 & 7559 & 3.29 & tropicana & 0 \\\\\n",
175 | "\t 11 & 1236 & 1.77 & minute.maid & 1 \\\\\n",
176 | "\t 16 & 5361 & 1.53 & dominicks & 0 \\\\\n",
177 | "\t 17 & 108 & 1.42 & dominicks & 0 \\\\\n",
178 | "\\end{tabular}\n"
179 | ],
180 | "text/markdown": [
181 | "\n",
182 | "| X | id | price | brand | feat |\n",
183 | "|---|---|---|---|---|\n",
184 | "| 2 | 10171 | 1.81 | dominicks | 1 |\n",
185 | "| 7 | 7489 | NA | tropicana | 0 |\n",
186 | "| 10 | 7559 | 3.29 | tropicana | 0 |\n",
187 | "| 11 | 1236 | 1.77 | minute.maid | 1 |\n",
188 | "| 16 | 5361 | 1.53 | dominicks | 0 |\n",
189 | "| 17 | 108 | 1.42 | dominicks | 0 |\n",
190 | "\n"
191 | ],
192 | "text/plain": [
193 | " X id price brand feat\n",
194 | "1 2 10171 1.81 dominicks 1 \n",
195 | "2 7 7489 NA tropicana 0 \n",
196 | "3 10 7559 3.29 tropicana 0 \n",
197 | "4 11 1236 1.77 minute.maid 1 \n",
198 | "5 16 5361 1.53 dominicks 0 \n",
199 | "6 17 108 1.42 dominicks 0 "
200 | ]
201 | },
202 | "metadata": {},
203 | "output_type": "display_data"
204 | },
205 | {
206 | "name": "stdout",
207 | "output_type": "stream",
208 | "text": [
209 | "[1] \"New data: 3262 observations\"\n"
210 | ]
211 | }
212 | ],
213 | "source": [
214 | "######################## Describe New Data ########################\n",
215 | "\n",
216 | "# Print first few rows of new data\n",
217 | "head(new_grocery)\n",
218 | "\n",
219 | "# Number of observations\n",
220 | "print(paste0('New data: ',nrow(new_grocery),' observations'))\n",
221 | "\n",
222 | "######################################################################"
223 | ]
224 | },
225 | {
226 | "cell_type": "markdown",
227 | "metadata": {},
228 | "source": [
229 | "## Prepare Data"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": 4,
235 | "metadata": {},
236 | "outputs": [
237 | {
238 | "data": {
239 | "text/plain": [
240 | " sales price missing minute.maid \n",
241 | " Min. : 63 Min. :0.000 Min. :0.00000 Min. :0.0000 \n",
242 | " 1st Qu.: 4800 1st Qu.:1.710 1st Qu.:0.00000 1st Qu.:0.0000 \n",
243 | " Median : 8256 Median :2.120 Median :0.00000 Median :0.0000 \n",
244 | " Mean : 17023 Mean :2.174 Mean :0.04801 Mean :0.3284 \n",
245 | " 3rd Qu.: 16896 3rd Qu.:2.720 3rd Qu.:0.00000 3rd Qu.:1.0000 \n",
246 | " Max. :716415 Max. :4.170 Max. :1.00000 Max. :1.0000 \n",
247 | " dominicks tropicana featured \n",
248 | " Min. :0.0000 Min. :0.000 Min. :0.0000 \n",
249 | " 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 \n",
250 | " Median :0.0000 Median :0.000 Median :0.0000 \n",
251 | " Mean :0.3405 Mean :0.331 Mean :0.2355 \n",
252 | " 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:0.0000 \n",
253 | " Max. :1.0000 Max. :1.000 Max. :1.0000 "
254 | ]
255 | },
256 | "metadata": {},
257 | "output_type": "display_data"
258 | },
259 | {
260 | "name": "stdout",
261 | "output_type": "stream",
262 | "text": [
263 | "[1] \"Data is prepared.\"\n"
264 | ]
265 | }
266 | ],
267 | "source": [
268 | "######################## Data Preparation ########################\n",
269 | "\n",
270 | "# Generate dummy for missing prices\n",
271 | "missing <- (is.na(juice$price) == TRUE)\n",
272 | "new_missing <- (is.na(new_grocery$price) == TRUE)\n",
273 | "\n",
274 | "# Replace missing prices with zero\n",
275 | "juice$price[is.na(juice$price)] <-0\n",
276 | "new_grocery$price[is.na(new_grocery$price)] <-0\n",
277 | "\n",
278 | "# Generate Dummies for Brands\n",
279 | "brand_1 <- (juice$brand == \"minute.maid\")\n",
280 | "brand_2 <- (juice$brand == \"dominicks\")\n",
281 | "brand_3 <- (juice$brand == \"tropicana\")\n",
282 | "\n",
283 | "new_brand_1 <- (new_grocery$brand == \"minute.maid\")\n",
284 | "new_brand_2 <- (new_grocery$brand == \"dominicks\")\n",
285 | "new_brand_3 <- (new_grocery$brand == \"tropicana\")\n",
286 | "\n",
287 | "# Generate outcome and control variables\n",
288 | "y <- as.matrix(juice$sales)\n",
289 | "colnames(y) <- c(\"sales\")\n",
290 | "\n",
291 | "x <- as.matrix(cbind(juice$price, missing, brand_1, brand_2, brand_3, juice$feat))\n",
292 | "colnames(x) <- c(\"price\", \"missing\", \"minute.maid\", \"dominicks\", \"tropicana\", \"featured\")\n",
293 | "\n",
294 | "new_x <- as.matrix(cbind(new_grocery$price, new_missing, new_brand_1, new_brand_2, new_brand_3, new_grocery$feat))\n",
295 | "colnames(new_x) <- c(\"price\", \"missing\", \"minute.maid\", \"dominicks\", \"tropicana\", \"featured\")\n",
296 | "\n",
297 | "# Descriptive statistics\n",
298 | "summary(cbind(y,x))\n",
299 | "\n",
300 | "print('Data is prepared.')\n",
301 | "\n",
302 | "#############################################################################"
303 | ]
304 | },
305 | {
306 | "cell_type": "markdown",
307 | "metadata": {},
308 | "source": [
309 | "**$\\Rightarrow$ It is possible to add non-linear and interaction terms.**"
310 | ]
311 | },
312 | {
313 | "cell_type": "markdown",
314 | "metadata": {},
315 | "source": [
316 | "## Generate Training and Test Sample"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": 5,
322 | "metadata": {},
323 | "outputs": [
324 | {
325 | "name": "stdout",
326 | "output_type": "stream",
327 | "text": [
328 | "[1] \"Training and test samples created.\"\n"
329 | ]
330 | }
331 | ],
332 | "source": [
333 | "######################## Training and Test Samples ########################\n",
334 | "\n",
335 | "set.seed(???)\n",
336 | "\n",
337 | "# Generate variable with the rows in training data\n",
338 | "\n",
339 | "\n",
340 | "print('Training and test samples created.')\n",
341 | "\n",
342 | "#############################################################################"
343 | ]
344 | },
345 | {
346 | "cell_type": "markdown",
347 | "metadata": {},
348 | "source": [
349 | "## Predict Orange Juice Sales in Training Sample and Assess Model in Test Sample"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": 6,
355 | "metadata": {},
356 | "outputs": [
357 | {
358 | "name": "stdout",
359 | "output_type": "stream",
360 | "text": [
361 | "[1] \"R-squared Penalized Regression: 0.278\"\n"
362 | ]
363 | }
364 | ],
365 | "source": [
366 | "######################## LASSO, Ridge, Elastic Net ##############################\n",
367 | "\n",
368 | "set.seed(???)\n",
369 | "penalized.cv <- ???\n",
370 | "\n",
371 | "\n",
372 | "# Fitted values\n",
373 | "pred_penalized <- ???\n",
374 | "\n",
375 | "# Calculate the MSE\n",
376 | "MSE_penalized <- mean((y[-training_set] - pred_penalized[-training_set])^2)\n",
377 | "R2_penalized <- round(1- MSE_penalized/var(y[-training_set]), digits = 3)\n",
378 | "\n",
379 | "print(paste0(\"R-squared Penalized Regression: \", R2_penalized))\n",
380 | " \n",
381 | "################################################################"
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": 11,
387 | "metadata": {},
388 | "outputs": [
389 | {
390 | "name": "stdout",
391 | "output_type": "stream",
392 | "text": [
393 | "[1] \"R-squared Tree: 0.365\"\n"
394 | ]
395 | }
396 | ],
397 | "source": [
398 | "###################### Regression Tree #######################\n",
399 | "\n",
400 | "set.seed(???)\n",
401 | "# Prepare data for tree estimator\n",
402 | "outcome <- y[training_set]\n",
403 | "tree_data <- data.frame(outcome, x[training_set,])\n",
404 | "\n",
405 | "deep_tree <- ???\n",
406 | "\n",
407 | "# Optimal tree size\n",
408 | "op.index <- ???\n",
409 | "\n",
410 | "## Select the Tree that Minimises CV-MSE\n",
411 | "cp.vals <- ???\n",
412 | "\n",
413 | "# Prune the deep tree\n",
414 | "pruned_tree <- ???\n",
415 | "\n",
416 | "## Plot tree structure\n",
417 | "#rpart.plot(pruned_tree,digits=3)\n",
418 | "\n",
419 | "# Fitted values\n",
420 | "predtree <- ???\n",
421 | "\n",
422 | "# Calculate the MSE\n",
423 | "MSEtree <- mean((y[-training_set] - predtree[-training_set])^2)\n",
424 | "R2tree <- round(1- MSEtree/var(y[-training_set]), digits = 3)\n",
425 | "\n",
426 | "print(paste0(\"R-squared Tree: \", R2tree))\n",
427 | "\n",
428 | "################################################################"
429 | ]
430 | },
431 | {
432 | "cell_type": "code",
433 | "execution_count": 8,
434 | "metadata": {},
435 | "outputs": [
436 | {
437 | "name": "stdout",
438 | "output_type": "stream",
439 | "text": [
440 | "[1] \"R-squared Forest: 0.411\"\n"
441 | ]
442 | }
443 | ],
444 | "source": [
445 | "######################## Random Forest #######################\n",
446 | "\n",
447 | "set.seed(???)\n",
448 | "\n",
449 | "rep <- ??? # number of trees\n",
450 | "cov <- ??? # share of covariates\n",
451 | "frac <- ??? # fraction of subsample\n",
452 | "min_obs <- ??? # min. size of terminal leaves in trees\n",
453 | "\n",
454 | "# Build Forest\n",
455 | "forest <- ???\n",
456 | "\n",
457 | "# Fitted values\n",
458 | "predforest <- ???\n",
459 | "\n",
460 | "# Calculate MSE\n",
461 | "MSEforest <- mean((y[-training_set] - predforest[-training_set])^2)\n",
462 | "R2forest <- round(1- MSEforest/var(y[-training_set]), digits = 3)\n",
463 | "\n",
464 | "print(paste0(\"R-squared Forest: \", R2forest))\n",
465 | "\n",
466 | "################################################################"
467 | ]
468 | },
469 | {
470 | "cell_type": "markdown",
471 | "metadata": {},
472 | "source": [
473 | "## Select Favorite Model and Extrapolate to New Data"
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "execution_count": 9,
479 | "metadata": {},
480 | "outputs": [
481 | {
482 | "name": "stdout",
483 | "output_type": "stream",
484 | "text": [
485 | "[1] \"Out-of-sample sales are predicted.\"\n"
486 | ]
487 | }
488 | ],
489 | "source": [
490 | "######################## Out-of-Sample Prediction #######################\n",
491 | "\n",
492 | "# Fitted values\n",
493 | "new_prediction <- ???\n",
494 | "\n",
495 | "print('Out-of-sample sales are predicted.')\n",
496 | "\n",
497 | "###########################################################################"
498 | ]
499 | },
500 | {
501 | "cell_type": "markdown",
502 | "metadata": {},
503 | "source": [
504 | "## Store Out-of-Sample Predictions"
505 | ]
506 | },
507 | {
508 | "cell_type": "code",
509 | "execution_count": 10,
510 | "metadata": {},
511 | "outputs": [
512 | {
513 | "name": "stdout",
514 | "output_type": "stream",
515 | "text": [
516 | "[1] \"File is stored.\"\n",
517 | "[1] \"Send your results to anthony.strittmatter@unibas.ch\"\n"
518 | ]
519 | }
520 | ],
521 | "source": [
522 | "######################## Store Results #######################\n",
523 | "\n",
524 | "id_new <- as.matrix(new_grocery$id)\n",
525 | "\n",
526 | "# Replace ??? with your group name\n",
527 | "write.csv(cbind(id_new,new_prediction),\"???.csv\")\n",
528 | "\n",
529 | "print('File is stored.')\n",
530 | "print('Send your results to anthony.strittmatter@unibas.ch')\n",
531 | "\n",
532 | "################################################################"
533 | ]
534 | },
535 | {
536 | "cell_type": "code",
537 | "execution_count": null,
538 | "metadata": {},
539 | "outputs": [],
540 | "source": []
541 | }
542 | ],
543 | "metadata": {
544 | "kernelspec": {
545 | "display_name": "R",
546 | "language": "R",
547 | "name": "ir"
548 | },
549 | "language_info": {
550 | "codemirror_mode": "r",
551 | "file_extension": ".r",
552 | "mimetype": "text/x-r-source",
553 | "name": "R",
554 | "pygments_lexer": "r",
555 | "version": "3.6.1"
556 | }
557 | },
558 | "nbformat": 4,
559 | "nbformat_minor": 4
560 | }
561 |
--------------------------------------------------------------------------------
/Group Data Challenge 2025/orange_juice.r:
--------------------------------------------------------------------------------
1 | ######################## Load Packages and Data ########################
2 |
3 | # Load packages
4 | library(rpart)
5 | library(rpart.plot)
6 | library(grf)
7 | library(glmnet)
8 |
9 | # Load data
10 | juice <- read.csv("juice.csv", sep = ",")
11 | new_grocery <- read.csv("new_grocery.csv", sep = ",")
12 |
13 | print('Packages and data successfully loaded.')
14 |
15 | #############################################################################
16 |
17 | ######################## Describe Old Data ########################
18 |
19 | # Print first few rows of old data
20 | head(juice)
21 |
22 | # Number of observations
23 | print(paste0('Old data: ',nrow(juice),' observations'))
24 |
25 | ######################################################################
26 |
27 | ######################## Describe New Data ########################
28 |
29 | # Print first few rows of new data
30 | head(new_grocery)
31 |
32 | # Number of observations
33 | print(paste0('New data: ',nrow(new_grocery),' observations'))
34 |
35 | ######################################################################
36 |
37 | ######################## Data Preparation ########################
38 |
39 | # Generate dummy for missing prices
40 | missing <- (is.na(juice$price) == TRUE)
41 | new_missing <- (is.na(new_grocery$price) == TRUE)
42 |
43 | # Replace missing prices with zero
44 | juice$price[is.na(juice$price)] <-0
45 | new_grocery$price[is.na(new_grocery$price)] <-0
46 |
47 | # Generate Dummies for Brands
48 | brand_1 <- (juice$brand == "minute.maid")
49 | brand_2 <- (juice$brand == "dominicks")
50 | brand_3 <- (juice$brand == "tropicana")
51 |
52 | new_brand_1 <- (new_grocery$brand == "minute.maid")
53 | new_brand_2 <- (new_grocery$brand == "dominicks")
54 | new_brand_3 <- (new_grocery$brand == "tropicana")
55 |
56 | # Generate outcome and control variables
57 | y <- as.matrix(juice$sales)
58 | colnames(y) <- c("sales")
59 |
60 | x <- as.matrix(cbind(juice$price, missing, brand_1, brand_2, brand_3, juice$feat))
61 | colnames(x) <- c("price", "missing", "minute.maid", "dominicks", "tropicana", "featured")
62 |
63 | new_x <- as.matrix(cbind(new_grocery$price, new_missing, new_brand_1, new_brand_2, new_brand_3, new_grocery$feat))
64 | colnames(new_x) <- c("price", "missing", "minute.maid", "dominicks", "tropicana", "featured")
65 |
66 | # Descriptive statistics
67 | summary(cbind(y,x))
68 |
69 | print('Data is prepared.')
70 |
71 | #############################################################################
72 |
73 | ######################## Training and Test Samples ########################
74 |
75 | set.seed(???)
76 |
77 | # Generate variable with the rows in training data
78 |
79 |
80 | print('Training and test samples created.')
81 |
82 | #############################################################################
83 |
84 | ######################## LASSO, Ridge, Elastic Net ##############################
85 |
86 | set.seed(???)
87 | penalized.cv <- ???
88 |
89 |
90 | # Fitted values
91 | pred_penalized <- ???
92 |
93 | # Calculate the MSE
94 | MSE_penalized <- mean((y[-training_set] - pred_penalized[-training_set])^2)
95 | R2_penalized <- round(1- MSE_penalized/var(y[-training_set]), digits = 3)
96 |
97 | print(paste0("R-squared Penalized Regression: ", R2_penalized))
98 |
99 | ################################################################
100 |
101 | ###################### Regression Tree #######################
102 |
103 | set.seed(???)
104 | # Prepare data for tree estimator
105 | outcome <- y[training_set]
106 | tree_data <- data.frame(outcome, x[training_set,])
107 |
108 | deep_tree <- ???
109 |
110 | # Optimal tree size
111 | op.index <- ???
112 |
113 | ## Select the Tree that Minimises CV-MSE
114 | cp.vals <- ???
115 |
116 | # Prune the deep tree
117 | pruned_tree <- ???
118 |
119 | ## Plot tree structure
120 | #rpart.plot(pruned_tree,digits=3)
121 |
122 | # Fitted values
123 | predtree <- ???
124 |
125 | # Calculate the MSE
126 | MSEtree <- mean((y[-training_set] - predtree[-training_set])^2)
127 | R2tree <- round(1- MSEtree/var(y[-training_set]), digits = 3)
128 |
129 | print(paste0("R-squared Tree: ", R2tree))
130 |
131 | ################################################################
132 |
133 | ######################## Random Forest #######################
134 |
135 | set.seed(???)
136 |
137 | rep <- ??? # number of trees
138 | cov <- ??? # share of covariates
139 | frac <- ??? # fraction of subsample
140 | min_obs <- ??? # min. size of terminal leaves in trees
141 |
142 | # Build Forest
143 | forest <- ???
144 |
145 | # Fitted values
146 | predforest <- ???
147 |
148 | # Calculate MSE
149 | MSEforest <- mean((y[-training_set] - predforest[-training_set])^2)
150 | R2forest <- round(1- MSEforest/var(y[-training_set]), digits = 3)
151 |
152 | print(paste0("R-squared Forest: ", R2forest))
153 |
154 | ################################################################
155 |
156 | ######################## Out-of-Sample Prediction #######################
157 |
158 | # Fitted values
159 | new_prediction <- ???
160 |
161 | print('Out-of-sample sales are predicted.')
162 |
163 | ###########################################################################
164 |
165 | ######################## Store Results #######################
166 |
167 | id_new <- as.matrix(new_grocery$id)
168 |
169 | # Replace ??? with your group name
170 | write.csv(cbind(id_new,new_prediction),"???.csv")
171 |
172 | print('File is stored.')
173 | print('Send your results to anthony.strittmatter@unibas.ch')
174 |
175 | ################################################################
176 |
177 |
178 |
--------------------------------------------------------------------------------
/Individual Home Assignment 2025/grading_grid.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Individual Home Assignment 2025/grading_grid.pdf
--------------------------------------------------------------------------------
/Individual Home Assignment 2025/research_proposal.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Individual Home Assignment 2025/research_proposal.pdf
--------------------------------------------------------------------------------
/Literature/Athey_2017.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Athey_2017.pdf
--------------------------------------------------------------------------------
/Literature/Athey_et_al_2019.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Athey_et_al_2019.pdf
--------------------------------------------------------------------------------
/Literature/Belloni_et_al_2012.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Belloni_et_al_2012.pdf
--------------------------------------------------------------------------------
/Literature/Belloni_et_al_2014a.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Belloni_et_al_2014a.pdf
--------------------------------------------------------------------------------
/Literature/Belloni_et_al_2014b.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Belloni_et_al_2014b.pdf
--------------------------------------------------------------------------------
/Literature/Cagala_et_al_2021.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Cagala_et_al_2021.pdf
--------------------------------------------------------------------------------
/Literature/Chernozhukov_et_al_2017.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Chernozhukov_et_al_2017.pdf
--------------------------------------------------------------------------------
/Literature/Chetverikov_et_al_2020.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Chetverikov_et_al_2020.pdf
--------------------------------------------------------------------------------
/Literature/Google flu trends.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Google flu trends.pdf
--------------------------------------------------------------------------------
/Literature/Mullainathan_Spiess_2017.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Mullainathan_Spiess_2017.pdf
--------------------------------------------------------------------------------
/Literature/Semenova_Chernozhukov_2020.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/Literature/Semenova_Chernozhukov_2020.pdf
--------------------------------------------------------------------------------
/PC Lab 1/help files/glmnet_package.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 1/help files/glmnet_package.pdf
--------------------------------------------------------------------------------
/PC Lab 1/penalize_regression_tutorial.r:
--------------------------------------------------------------------------------
1 | ######################## Load Packages and Data ########################
2 |
3 | # Load packages: glmnet (penalized regression), corrplot (correlation plots)
4 | library(glmnet)
5 | library(corrplot)
6 |
7 | # Load the training and test samples (data frames 'train' and 'test')
8 | load("student-mat-train.Rdata")
9 | load("student-mat-test.Rdata")
10 |
11 | # Report the number of observations in each sample
12 | print(paste0('Training set: ',nrow(train),' obs'))
13 | print(paste0('Test set: ',nrow(test),' obs'))
14 |
15 | ###########################################################################
16 |
17 | ######################## Correlation analysis ########################
18 | cor <- round(cor(train[,c(1:25)]),2) # Column 26 (G3, the math grade) is the dependent variable; note this assignment shadows the name 'cor' with a matrix
19 | corrplot(cor)
20 |
21 | ######################## Estimation of the linear regression ########################
22 |
23 | ols <- lm(G3 ~ ., data = train)
24 | summary(ols)
25 |
26 | # Out-of-sample fit: predict on the test sample and compute the MSE
27 | test$predols <- predict(ols, newdata = test)
28 |
29 | predMSEols <- mean((test$G3 - test$predols)^2)
30 | print(predMSEols)
31 |
32 | ########################################################################################
33 |
34 | ######################## OLS model ########################
35 | # TODO (student): replace ??? with a formula for a smaller OLS model, e.g. G3 ~ <subset of regressors>
36 | ols_small <- lm(??? , data = train)
37 |
38 | # Calculate the MSE of the smaller model on the test sample
39 | test$predols_small <- predict(ols_small, newdata = test)
40 |
41 | predMSEols_small <- mean((test$G3 - test$predols_small)^2)
42 | print(predMSEols_small)
43 |
44 | ######################## Lasso Path ########################
45 |
46 | # We make a plot that shows how the Lasso coefficients change with lambda
47 | # glmnet is the standard R package for Lasso, Ridge, and Elastic Net
48 | # alpha is a parameter that selects a Lasso, Ridge, or Elastic Net model
49 | # alpha = 1 for Lasso; alpha = 0 for Ridge, 0 < alpha < 1 for Elastic Net
50 | # The control variables are train[,c(1:25)]
51 | # The outcome variable is train$G3 (math grades)
52 |
53 | # Estimate a Lasso model
54 | lasso <- glmnet(as.matrix(train[,c(1:25)]), train$G3, alpha = 1) # We save the model under the name "lasso"
55 | plot(lasso, xvar = "lambda", label = TRUE)
56 |
57 | ###############################################################
58 |
59 | ######################## Cross-Validation ########################
60 |
61 | # Set starting value for replicability
62 | set.seed(27112019)
63 |
64 | # cv.glmnet performs a cross-validation to determine the optimal lambda value
65 | # type.measure specifies the measure we use to assess the model accuracy (here MSE)
66 | # nfolds specifies the number of cross-validation folds we use (here 5)
67 |
68 | # Cross-validate the Lasso
69 | lasso.cv <- cv.glmnet(as.matrix(train[,c(1:25)]), train$G3, type.measure = "mse", nfolds = 5, alpha = 1)
70 |
71 | # Plot the MSE for the different lambda values
72 | plot(lasso.cv)
73 |
74 | #####################################################################
75 |
76 | ######################## Optimal Lambda Value ########################
77 |
78 | # Print the optimal lambda value (lambda.min) and the more conservative one-SE choice (lambda.1se)
79 | print(paste0("Optimal lambda that minimizes cross-validated MSE: ", lasso.cv$lambda.min))
80 | print(paste0("Optimal lambda using one-standard-error-rule: ", lasso.cv$lambda.1se))
81 |
82 | #########################################################################
83 |
84 | ######################## Lasso Coefficients ########################
85 |
86 | # Print Lasso coefficients at the MSE-minimizing lambda
87 | print(coef(lasso.cv, s = "lambda.min"))
88 |
89 | # Save for later comparison
90 | coef_lasso1 <- coef(lasso.cv, s = "lambda.min")
91 |
92 | #######################################################################
93 |
94 | ######################## Test Sample MSE ########################
95 |
96 | # Estimate the fitted values of the Lasso model in the test sample
97 | # We use the model "lasso.cv" and the lambda value which we estimated in the training sample
98 | # The control variables "newx" are from the test sample
99 |
100 | # Fitted values
101 | test$predlasso <- predict(lasso.cv, newx = as.matrix(test[,c(1:25)]), s = lasso.cv$lambda.min)
102 |
103 | # Calculate the MSE
104 | predMSElasso <- mean((test$G3 - test$predlasso)^2)
105 | print(paste0("MSE: ", predMSElasso))
106 |
107 | #####################################################################
108 |
109 | ######################## Different Starting Value ########################
110 |
111 | # Change the starting value (different seed => different CV folds)
112 | set.seed(27112025) # 27112024
113 |
114 | # Re-estimate the Lasso model (TODO student: replace ??? with the same cv.glmnet arguments as in line 69)
115 | lasso.cv <- cv.glmnet(???)
116 |
117 | # Store the coefficients (TODO student: replace ??? with the chosen lambda rule, e.g. "lambda.min")
118 | coef_lasso2 <- coef(lasso.cv, s = ???)
119 | print(cbind(coef_lasso1, coef_lasso2))
120 |
121 | # Calculate the fitted values
122 | test$predlasso2 <- predict(lasso.cv, newx = as.matrix(test[,c(1:25)]), s = lasso.cv$lambda.min)
123 |
124 | # Correlation between the fitted values of the two Lasso models
125 | cor_fit <- cor(test$predlasso,test$predlasso2)
126 | print(paste0("Correlation between fitted values: ", cor_fit))
127 |
128 | ######################## Ridge Path ########################
129 |
130 | # alpha = 0 specifies a Ridge model
131 |
132 | # Estimate the Ridge (TODO student: set alpha to the Ridge value stated above)
133 | ridge <- glmnet(as.matrix(train[,c(1:25)]), train$G3, alpha = ???)
134 |
135 | # Plot the path of the Ridge coefficients
136 | plot(ridge, xvar = "lambda", label = TRUE)
137 |
138 | ###############################################################
139 |
140 | ######################## Cross-Validation ########################
141 |
142 | # Set starting value
143 | set.seed(27112019)
144 |
145 | # Cross-validate the Ridge model (TODO student: replace ??? with the cv.glmnet arguments, using the Ridge alpha)
146 | ridge.cv <- cv.glmnet(???)
147 |
148 | # Plot the MSE in the cross-validation samples
149 | plot(ridge.cv)
150 |
151 | #####################################################################
152 |
153 | ######################## Optimal Lambda Value ########################
154 |
155 | # Print the optimal lambda values (TODO student: replace ??? with the lambda entries stored in ridge.cv)
156 | print(paste0("Optimal lambda that minimizes cross-validated MSE: ", ???))
157 | print(paste0("Optimal lambda using one-standard-error-rule: ", ???))
158 |
159 | #########################################################################
160 |
161 | ######################## Ridge Coefficients ########################
162 |
163 | # Print Ridge coefficients at the MSE-minimizing lambda
164 | print(coef(ridge.cv, s = "lambda.min"))
165 |
166 | # Save for later comparison
167 | coef_ridge <- coef(ridge.cv, s = "lambda.min")
168 |
169 | #######################################################################
170 |
171 | ######################## Test Sample MSE ########################
172 |
173 | # Estimate fitted values in test sample (TODO student: fill newx with the test-sample regressors and s with the cross-validated lambda)
174 | test$predridge <- predict(ridge, newx = ???, s = ???)
175 |
176 | # Calculate the MSE (TODO student: mean squared prediction error, analogous to the OLS and Lasso blocks above)
177 | predMSEridge <- ???
178 | print(paste0("MSE: ", predMSEridge))
179 |
180 | ###################################################################
181 |
182 | ######################## Compare Lasso and Ridge Coefficients ########################
183 |
184 | # Pick the coefficients of Dalc and Walc (entries 23-24 of each coefficient vector; presumably workday/weekend alcohol consumption -- verify indices against the coefficient names)
185 | comp <- cbind(coef(ols)[23:24], coef_lasso1[23:24], coef_lasso2[23:24], coef_ridge[23:24])
186 | colnames(comp) <- c("OLS", "Lasso1", "Lasso2", "Ridge")
187 | print(comp)
188 |
189 | #########################################################################################
190 |
191 | ######################## Compare the MSE ########################
192 |
193 | # Print the MSE of the OLS, Lasso and Ridge models
194 | print(c(predMSEols, predMSElasso, predMSEridge))
195 |
196 | ####################################################################
197 |
198 | ######################## Compare models ########################
199 |
200 | # Visualize the predictions (Predicted vs Actual); par(new=TRUE) overlays the three scatter plots on shared axes
201 | plot(test$G3,test$predols,xlim=c(5,20),ylim=c(4,16), col= "darkgreen", xlab = "Actual Grades", ylab = "Predicted Grades" )
202 | par(new=TRUE)
203 | plot(test$G3,test$predlasso,xlim=c(5,20),ylim=c(4,16), col= "blue", xlab = "", ylab = "" )
204 | par(new=TRUE)
205 | plot(test$G3,test$predridge,xlim=c(5,20),ylim=c(4,16), col= "red", xlab = "", ylab = "" )
206 | abline(a=0,b=1)
207 | legend(16, 9, c("OLS", "Lasso", "Ridge"), col = c("darkgreen", "blue", "red"), pch = c(21, 21, 21))
208 |
209 | ####################################################################
210 |
--------------------------------------------------------------------------------
/PC Lab 1/student-mat-test.Rdata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 1/student-mat-test.Rdata
--------------------------------------------------------------------------------
/PC Lab 1/student-mat-train.Rdata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 1/student-mat-train.Rdata
--------------------------------------------------------------------------------
/PC Lab 2/browser-sites.txt:
--------------------------------------------------------------------------------
1 | atdmt.com
2 | yahoo.com
3 | whenu.com
4 | weatherbug.com
5 | msn.com
6 | google.com
7 | aol.com
8 | questionmarket.com
9 | googlesyndication.com-o02
10 | casalemedia.com
11 | mywebsearch.com
12 | myspace.com
13 | pointroll.com
14 | atwola.com
15 | yieldmanager.com
16 | live.com
17 | aim.com
18 | mediaplex.com
19 | precisionclick.com
20 | tribalfusion.com
21 | insightexpressai.com
22 | trafficmp.com
23 | ebay.com
24 | realmedia.com
25 | zedo.com
26 | advertising.com
27 | microsoft.com
28 | hotbar.com
29 | adrevolver.com
30 | ru4.com
31 | 180solutions.com
32 | nextag.com
33 | accuweather.com
34 | overture.com
35 | hotmail.com
36 | passport.com
37 | my-etrust.com
38 | starware.com
39 | relevantknowledge.com
40 | myway.com
41 | partner2profit.com
42 | ditto.com
43 | kanoodle.com
44 | ebayobjects.com
45 | mcafee.com
46 | comcast.net
47 | fastclick.net
48 | adbrite.com
49 | vpptechnologies.com
50 | specificclick.net
51 | serving-sys.com
52 | weather.com
53 | adserver.com
54 | licenseacquisition.org
55 | pogo.com
56 | go.com
57 | btgrab.com
58 | bellsouth.net
59 | intellisrv.net
60 | dell.com
61 | waol.exe
62 | cnn.com
63 | facebook.com
64 | incredibarvuz1.com
65 | burstnet.com
66 | adknowledge.com
67 | funwebproducts.com
68 | belnk.com
69 | netscape.com
70 | mysearch.com
71 | real.com
72 | liveperson.net
73 | adsonar.com
74 | passport.net
75 | euroclick.com
76 | m7z.net
77 | mywebface.com
78 | kazaa.com
79 | bestoffersnetworks.com
80 | vitalstream.com
81 | tacoda.net
82 | unicast.com
83 | offeroptimizer.com
84 | bankofamerica.com
85 | acsd.exe
86 | gator.com
87 | quickbrowsersearch.com
88 | revsci.net
89 | personalweb.com
90 | rr.com
91 | msnusers.com
92 | zango.com
93 | earthlink.net
94 | mapquest.com
95 | falkag.net
96 | freeze.com
97 | amazon.com
98 | net-offers.net
99 | shopperreports.com
100 | dellfix.com
101 | plaxo.com
102 | ysbweb.com
103 | googleadservices.com
104 | qnsr.com
105 | revenue.net
106 | adultfriendfinder.com
107 | addynamix.com
108 | seekmo.com
109 | verizon.net
110 | cox.net
111 | metricsdirect.com
112 | akamai.net
113 | admarketplace.net
114 | amazon.com-o01
115 | aolacsd.exe
116 | opinionsquare.com
117 | interclick.com
118 | peoplepc.com
119 | go.com-o04
120 | realtechnetwork.net
121 | freezecoldcash.com
122 | ask.com
123 | contextweb.com
124 | intellitxt.com
125 | yceml.net
126 | about.com
127 | youtube.com
128 | wikipedia.org
129 | surfaccuracy.com
130 | windowsmedia.com
131 | craigslist.org
132 | hackerwatch.org
133 | foxsports.com
134 | spamblockerutility.com
135 | walmart.com
136 | navexcel.com
137 | partypoker.com
138 | wellsfargo.com
139 | travelzoo.com
140 | photobucket.com
141 | viewpoint.com
142 | nielsennetpanel.com
143 | mymailstamp.com
144 | windows.com
145 | optonline.net
146 | eguard.com
147 | aolcdn.com
148 | musicmatch.com
149 | qksz.net
150 | cometsystems.com
151 | netzero.net
152 | specificmedia.com
153 | paypal.com
154 | iwon.com
155 | monster.com-o01
156 | vmn.net
157 | juno.com
158 | information.com
159 | sysupdates.com
160 | 2o7.net
161 | adwave.com
162 | need2find.com
163 | target.com
164 | ebayrtm.com
165 | match.com
166 | bridgetrack.com
167 | comcastsupport.com
168 | rs6.net
169 | screensavers.com
170 | footprint.net
171 | sportsline.com
172 | adelphia.net
173 | smileycentral.com
174 | dlqm.net
175 | careerbuilder.com
176 | mlb.com
177 | searchignite.com
178 | wachovia.com
179 | expedia.com
180 | thinktarget.com
181 | authnow.com
182 | dotomi.com
183 | blogspot.com
184 | hpdjjs.com
185 | chase.com
186 | outerinfo.com
187 | nscpcdn.com
188 | vonage.com
189 | searchscout.com
190 | compuserve.com
191 | lycos.com
192 | xanga.com
193 | websearch.com
194 | azjmp.com
195 | tmcs.net-o01
196 | exitexchange.com
197 | toshibapc.com
198 | runescape.com
199 | weatherstudio.com
200 | imdb.com
201 | adecn.com
202 | bargain-buddy.net
203 | carsdirect.com
204 | mspaceads.com
205 | apple.com
206 | ups.com
207 | 88.80.5.21
208 | exct.net
209 | cingular.com
210 | foodnetwork.com
211 | go.com-o03
212 | excite.com
213 | capitalone.com
214 | imiclk.com
215 | overstock.com
216 | bloglines.com
217 | compfused.com
218 | morpheus.com
219 | foxnews.com
220 | marketwatch.com
221 | wamu.com
222 | monster.com
223 | adobe.com
224 | 888.com
225 | untd.com
226 | abetterinternet.com
227 | centralmedia.ws
228 | valuead.com
229 | targetsaver.com
230 | lynxtrack.com
231 | cartoonnetwork.com
232 | netflix.com
233 | chitika.net
234 | geocities.com
235 | qsrch.com
236 | drsnsrch.com
237 | autobytel.com
238 | web-nexus.net
239 | webservicehosts.com
240 | sharewareonline.com
241 | llnwd.net
242 | instantnavigation.com
243 | nick.com
244 | nfl.com
245 | oingo.com
246 | lightningcast.net
247 | altbill.com
248 | xolox.nl
249 | superpages.com
250 | classmates.com
251 | aavalue.com
252 | bluestreak.com
253 | southwest.com
254 | whitepages.com
255 | usps.com
256 | webhancer.com
257 | bbc.co.uk
258 | true.com
259 | bearshare.com
260 | citibank.com
261 | blackplanet.com
262 | pch.com
263 | att.net
264 | autoweb.com
265 | insightexpress.com
266 | charter.net
267 | alumnigroup.org
268 | verizonwireless.com
269 | fedex.com
270 | mobilesidewalk.com
271 | netteller.com
272 | webshots.com
273 | sprint.com
274 | orbitz.com
275 | bestbuy.com
276 | grandstreetinteractive.com
277 | paypopup.com
278 | cheaptickets.com
279 | dell4me.com
280 | new.net
281 | nytimes.com
282 | nyadmcncserve-05y06a.com
283 | aoltpspd.exe
284 | toprebates.com
285 | jcpenney.com
286 | geotrust.com
287 | travelocity.com
288 | qvc.com
289 | 4at1.com
290 | cpmstar.com
291 | bizrate.com
292 | ticketmaster.com
293 | usbank.com
294 | tripod.com
295 | buy.com
296 | nascar.com
297 | aebn.net
298 | infospace.com
299 | wxbug.com
300 | contextuads.com
301 | bns1.net
302 | download.com
303 | gocyberlink.com
304 | 192.168.1.1
305 | dvlabs.com
306 | defamer.com
307 | tracking101.com
308 | accountonline.com
309 | hbmediapro.com
310 | usatoday.com
311 | bigfishgames.com
312 | neopets.com
313 | adoutput.com
314 | sbc.com
315 | noaa.gov
316 | lowermybills.com
317 | kmpads.com
318 | directtrack.com
319 | clicksor.com
320 | legacy.com
321 | eajmp.com
322 | nastydollars.com
323 | worldofwarcraft.com
324 | mirarsearch.com
325 | verizon.com
326 | miniclip.com
327 | iwin.com
328 | peel.com
329 | hgtv.com
330 | amaena.com
331 | sprintpcs.com
332 | shopping.com
333 | webmd.com
334 | clearchannel.com
335 | winamp.com
336 | reference.com
337 | interpolls.com
338 | americangreetings.com
339 | tmcs.net
340 | midtenmedia.com
341 | domainsponsor.com
342 | thunderdownloads.com
343 | akamaistream.net
344 | livejournal.com
345 | tx.us
346 | onlinerewardcenter.com
347 | msn.com-o18
348 | sony.com
349 | dogpile.com
350 | nba.com
351 | citysearch.com
352 | connextra.com
353 | nickjr.com
354 | t-mobile.com
355 | winfixer.com
356 | adlegend.com
357 | adsrevenue.net
358 | sears.com
359 | ap.org
360 | luna.net
361 | shockwave.com
362 | hsn.com
363 | fl.us
364 | mypoints.com
365 | mozilla.org
366 | aresgalaxy.org
367 | realtor.com
368 | addictinggames.com
369 | clickbooth.com
370 | amateurmatch.com
371 | worldnow.com
372 | surveys.com
373 | pa.us
374 | arcaderockstar.com
375 | coolsavings.com
376 | yournewsletters.net
377 | liquidmedianetworks.com
378 | everythinggirl.com
379 | perfectmatch.com
380 | stockgroup.com
381 | netster.com
382 | bidclix.com
383 | dropspam.com
384 | hp.com
385 | drivecleaner.com
386 | consumerpromotioncenter.com
387 | aolwbspd.exe
388 | americanexpress.com
389 | totaltalk.com
390 | wwe.com
391 | kontera.com
392 | gamehouse.com
393 | circuitcity.com
394 | yimg.com
395 | lightningcast.com
396 | edgefcs.net
397 | wunderground.com
398 | realarcade.com
399 | singlesnet.com
400 | azcentral.com
401 | yellowpages.com
402 | eharmony.com
403 | paviliondownload.com
404 | insightbb.com
405 | imageshack.us
406 | shopzilla.com
407 | ca.gov
408 | donotchangeme.com
409 | ca.us
410 | sourceforge.net
411 | washingtonpost.com
412 | adjuggler.com
413 | careercast.com
414 | bangbros1.com
415 | scripps.com-o01
416 | migente.com
417 | homedepot.com
418 | winantivirus.com
419 | irs.gov
420 | blockbuster.com
421 | kodakgallery.com
422 | nih.gov
423 | aol.com-o07
424 | icq.com
425 | wordcents.com
426 | drudgereport.com
427 | quizilla.com
428 | srch-results.com
429 | inqwire.com
430 | ign.com
431 | oinadserver.com
432 | azoogleads.com
433 | incredimail.com
434 | shopathome.com
435 | mtv.com
436 | fidelity.com
437 | bullseye-network.com
438 | flash-gear.com
439 | proficient.com
440 | autotrader.com
441 | charter.com
442 | healthology.com
443 | evite.com
444 | checkm8.com
445 | rsc01.net
446 | oasei.com
447 | heavy.com
448 | slotch.com
449 | passion.com
450 | nbc.com
451 | trafficmarketplace.com
452 | univision.com
453 | priceline.com
454 | flickr.com
455 | andale.com
456 | dealtime.com
457 | yfdirect.com
458 | entrepreneur.com
459 | go.com-o01
460 | webmd.com-o01
461 | sexsearch.com
462 | pornaccess.com
463 | gcion.com
464 | shoplocal.com
465 | kliptracker.com
466 | nationalcity.com
467 | bbeplayer.com
468 | videodome.com
469 | 204.95.60.12
470 | napster.com
471 | myweather.net
472 | msnbc.com
473 | linkexchange.com
474 | searchmarketing.com
475 | angelfire.com
476 | callwave.com
477 | sonnerie.net
478 | scout.com
479 | rivals.com
480 | altnet.com
481 | spynet.com
482 | macromedia.com
483 | ed.gov
484 | wannawatch.com
485 | frontiernet.net
486 | flycell.com
487 | edgesuite.net
488 | 89.com
489 | nc.us
490 | ticketmaster.com-o01
491 | flowgo.com
492 | cnet.com
493 | oddcast.com
494 | answers.com
495 | timeinc.net
496 | m5-systems.com
497 | guideforyou.com
498 | rn11.com
499 | lowes.com
500 | lifescript.com
501 | shop.com
502 | errorsafe.com
503 | cams.com
504 | macys.com
505 | aa.com
506 | addictingclips.com
507 | victoriassecret.com
508 | orchardbank.com
509 | bravenet.com
510 | imesh.com
511 | nextel.com
512 | screensandthemes.com
513 | suntrust.com
514 | discovercard.com
515 | nbads.com
516 | consumerincentiverewards.com
517 | valueclick.com
518 | google.com-o03
519 | cbs.com
520 | bannerspace.com
521 | technorati.com
522 | cjt1.net
523 | exactsearch.net
524 | munky.com
525 | cs.com
526 | kohls.com
527 | tagged.com
528 | babycenter.com
529 | ebaumsworld.com
530 | userplane.com
531 | mediaplazza.com
532 | netzerovoice.com
533 | gamespot.com
534 | keen.com
535 | bebo.com
536 | rsc02.net
537 | sysupdates2.com
538 | imlive.com
539 | oldnavy.com
540 | regalinteractive.com
541 | weightwatchers.com
542 | subsag.com
543 | aol.com-o08
544 | azlyrics.com
545 | freeringtonesnow.com
546 | freewebs.com
547 | toysrus.com
548 | hollywood.com
549 | findwhat.com
550 | local.com
551 | webroot.com
552 | tvguide.com
553 | ny.us
554 | resultsmaster.com
555 | jamster.com
556 | gms1.net
557 | switchboard.com
558 | nicheseek.com
559 | intelius.com
560 | hi5.com
561 | glispa.com
562 | gannettonline.com
563 | cstv.com
564 | adengage.com
565 | superbrewards.com
566 | videocodezone.com
567 | symantecliveupdate.com
568 | pbskids.org
569 | revresda.com
570 | americansingles.com
571 | ugo.com-o02
572 | job.com
573 | installshield.com
574 | eprize.net
575 | metacafe.com
576 | focalex.com
577 | cciads.us
578 | perfectgonzo.com
579 | kbb.com
580 | reunion.com
581 | eproof.com
582 | tripadvisor.com
583 | bellsouth.com
584 | search.com
585 | comcast.com
586 | ivillage.com
587 | sun.com
588 | regionsnet.com
589 | mininova.org
590 | beliefnet.com
591 | intellicast.com
592 | fastonlineusers.com
593 | gamespot.com-o01
594 | expedia.com-o01
595 | military.com
596 | musicnet.com
597 | 53.com
598 | oh.us
599 | itrack.it
600 | officedepot.com
601 | adultadworld.com
602 | univision.com-o01
603 | youravon.com
604 | blackboard.com
605 | yahoo.net
606 | casinolasvegas.com
607 | warnerbros.com
608 | delta.com
609 | go.com-o02
610 | deepnetexplorer.co.uk
611 | mozilla.com
612 | opentracker.net
613 | break.com
614 | catcha10.com
615 | hotels.com
616 | hallmark.com
617 | sportsbook.com
618 | mycheckfree.com
619 | ezboard.com
620 | pro-market.net
621 | mate1.com
622 | awempire.com
623 | jigzone.com
624 | bangbrosnetwork.com
625 | marketlinx.com
626 | tickle.com
627 | bbandt.com
628 | mercuras.com
629 | adtology2.com
630 | bluemountain.com
631 | freepornofreeporn.com
632 | internet-optimizer.com
633 | autotrader.com-o01
634 | blogger.com
635 | kraftfoods.com
636 | loveaccess.com
637 | shutterfly.com
638 | stopzilla.com
639 | xmradio.com
640 | ga.us
641 | ancestry.com
642 | honda.com
643 | fulltiltpoker.com
644 | il.us
645 | ibsys.com
646 | imixserver.com
647 | barnesandnoble.com
648 | pricegrabber.com
649 | constantcontact.com
650 | zonelabs.com
651 | pimpyourpro.com
652 | netflame.cc
653 | slide.com
654 | xnxx.com
655 | upromise.com
656 | livesexbar.com
657 | videosz.com
658 | freeweblayouts.net
659 | limewire.com
660 | ameritrade.com
661 | freelaptop4you.com
662 | nickarcade.com
663 | utkn.com
664 | nj.us
665 | 360i.com
666 | finestresults.com
667 | asseenontvnetwork.com
668 | typepad.com
669 | efax.com
670 | regions.com
671 | emachines.com
672 | playaudiomessage.com
673 | bofunk.com
674 | millsberry.com
675 | cpvfeed.com
676 | allrecipes.com
677 | clubpenguin.com
678 | eversave.com
679 | ppmdating.com
680 | lexico.com
681 | usaa.com
682 | directv.com
683 | postini.com
684 | secure-banking.com
685 | eyewonder.com
686 | boston.com
687 | ibanking-services.com
688 | astrology.com
689 | datinggold.com
690 | mlxchange.com
691 | travelhook.net
692 | custhelp.com
693 | mn.us
694 | zwire.com
695 | emarketmakers.com
696 | gamefaqs.com
697 | premiumproductsonline.com
698 | chrysler.com
699 | prodigy.net
700 | tv.com
701 | windowsmedia.com-o04
702 | smashits.com
703 | 65.115.67.11
704 | snapfish.com
705 | commerceonlinebanking.com
706 | bbt.com
707 | linksynergy.com
708 | yahoo.com-o08
709 | freecodesource.com
710 | streamate.com
711 | freecreditreport.com
712 | intuit.com
713 | rapid-pass.net
714 | artistdirect.com
715 | servedbyadbutler.com
716 | sidestep.com
717 | adult.com
718 | alltel.net
719 | bcentral.com
720 | openbank.com
721 | nichedsites.com
722 | cars.com
723 | gm.com
724 | adshuffle.com
725 | freeslots.com
726 | blink.com
727 | candystand.com
728 | monstermarketplace.com
729 | columbiahouse.com
730 | pncbank.com
731 | discovery.com
732 | hsbcbillpay.com
733 | movietickets.com
734 | page-not-found.net
735 | fandango.com
736 | providianservices.com
737 | carad.com
738 | homestead.com
739 | realcastmedia.com
740 | webratsmusic.com
741 | scottrade.com
742 | cs102175.com
743 | fnismls.com
744 | shopperssavingcenter.com
745 | hit-now.com
746 | whatismyip.com
747 | costco.com
748 | bolt.com
749 | bmgmusic.com
750 | myhealthwealthandhappiness.com
751 | symantec.com
752 | forbes.com
753 | digitalcity.com
754 | live365.com
755 | firstadsolution.com
756 | linkconnector.com
757 | freepagegraphics.com
758 | imgfarm.com
759 | insightexpresserdd.com
760 | pcsecurityshield.com
761 | allposters.com-o01
762 | msnvideo.com
763 | miva.com
764 | jackpotmadness.com
765 | mbnanetaccess.com
766 | newcarinsider.com
767 | edmunds.com
768 | net-nucleus.com
769 | popcap.com
770 | alt.com
771 | staples.com
772 | ussearch.com
773 | bankone.com
774 | rootv.com
775 | citizensbankonline.com
776 | juggcrew.com
777 | navyfcu.org
778 | nordstrom.com
779 | webstat.com
780 | inklineglobal.com
781 | seeq.com
782 | onetruemedia.com
783 | paltalk.com
784 | sonypictures.com
785 | 204.181.57.155
786 | commerceonline.com
787 | friendster.com
788 | slate.com
789 | hermoment.com
790 | lovehappens.com
791 | mi.us
792 | kmart.com
793 | paidsurveys.com
794 | 123greetings.com
795 | blinko.com
796 | citizensbank.com
797 | sirius.com
798 | qrs1.net
799 | adbureau.net
800 | turn.com
801 | abcdistributing.com
802 | fundsxpress.com
803 | pichunter.com
804 | cbsnews.com
805 | 216.139.222.230
806 | anywho.com
807 | sedoparking.com
808 | householdbank.com
809 | treborwear.com
810 | evault.ws
811 | vh1.com
812 | financialcontent.com
813 | gap.com
814 | active.com
815 | exclusivegiftcards.com
816 | michigan.gov
817 | dada-mobile.net
818 | textplussolutions.com
819 | myriadmarket.com
820 | ifriends.net
821 | aptimus.com
822 | valueclick.net
823 | pennyweb.com
824 | blackpeoplemeet.com
825 | eltpath.com
826 | yahoo.com-o46
827 | sysprotect.com
828 | dadamobile.com
829 | cpxinteractive.com
830 | clickspring.net
831 | staples-deals.com
832 | myyearbook.com
833 | bravenetmedianetwork.com
834 | etrade.com
835 | marykayintouch.com
836 | 64.39.16.166
837 | moregamers.com
838 | redorbit.com
839 | tmz.com
840 | blogrolling.com
841 | checkfree.com
842 | samsclub.com
843 | va.us
844 | united.com
845 | certified-safe-downloads.com
846 | aimtoday.com
847 | toseeka.com
848 | bidz.com
849 | gamespy.com
850 | nylottery.org
851 | godaddy.com
852 | rsc03.net
853 | altavista.com
854 | ltdcommodities.com
855 | bhg.com
856 | opm.gov
857 | onlinemediaoutlet.com
858 | beboframe.com
859 | cafepress.com
860 | tarot.com
861 | webgavel.com
862 | rapmls.com
863 | ztod.com
864 | marriott.com
865 | walgreens.com
866 | rovion.com
867 | ultimatebet.com
868 | ea.com
869 | petfinder.com
870 | winsoftware.com
871 | literotica.com
872 | websourcedtraffic.com
873 | 032439.com
874 | marketbanker.com
875 | clearchannelmusic.com
876 | colonize.com
877 | searchfeed.com
878 | eimg.net
879 | shermanstravel.com
880 | key.com
881 | multi-pops.com
882 | yandex.ru
883 | us.com
884 | kinghost.com
885 | sublimedirectory.com
886 | gogotools.com
887 | camcrush.com
888 | trafficexplorer.com
889 | myfamily.com
890 | gay.com
891 | freegiftworld.com
892 | dexonline.com
893 | trade-in-value.com
894 | shopyourbargain.com
895 | dyndns.org
896 | bizrate.com-o01
897 | xctrk.com
898 | webtoolcafe.com
899 | zappos.com
900 | wi.us
901 | toptvbytes.com
902 | 157.22.32.111
903 | hotfreelayouts.com
904 | registrydefender.com
905 | zap2it.com
906 | 64.136.28.49
907 | afy11.net
908 | 207.97.212.250
909 | invisionfree.com
910 | bravenet.com-o01
911 | gadgetcity.com
912 | army.mil
913 | yourgiftcards.com
914 | craigslist.com
915 | usairways.com
916 | drivelinemedia.com
917 | edline.net
918 | dayport.com
919 | axill.com
920 | smartbargains.com
921 | newgrounds.com
922 | 216.155.193.91
923 | providian.com
924 | statcounter.com
925 | ajc.com
926 | oprah.com
927 | slingo.com
928 | continental.com
929 | relevantchoice.com
930 | toontown.com
931 | thumbplay.com
932 | jacquielawson.com
933 | hotwire.com
934 | nwa.com
935 | atomz.com
936 | nsgalleries.com
937 | uclick.com
938 | mercurial.ca
939 | schwab.com
940 | nvero.net
941 | ediets.com
942 | ichotelsgroup.com
943 | 216.133.243.28
944 | aggregateknowledge.com
945 | topix.net
946 | flalottery.com
947 | dlv4.com
948 | mybloglog.com
949 | lanxtra.com
950 | away.com
951 | grab.com
952 | tipany.com
953 | quickbooks.com
954 | instream.com
955 | pbs.org
956 | findology.com
957 | business.com
958 | cmt.com
959 | myinsiderdeals.com
960 | imagine-msn.com
961 | nhl.com
962 | modern-singles.net
963 | addfreestats.com
964 | rent.com
965 | homegain.com
966 | freeones.com
967 | jetblue.com
968 | loanweb.com
969 | findarticles.com
970 | iwon.com-o04
971 | incredigames.com
972 | webkinz.com
973 | dealerconnection.com
974 | streamaudio.com
975 | grantmedia.com
976 | home123info.com
977 | exittracking.com
978 | worldsex.com
979 | yfdmedia.com
980 | automotive.com
981 | cursormania.com
982 | tradedoubler.com
983 | bedbathandbeyond.com
984 | equifax.com
985 | hotornot.com
986 | falkag.de
987 | chicagotribune.com
988 | airtran.com
989 | thebreastcancersite.com
990 | charmingshoppes.com
991 | ugo.com
992 | cox.com
993 | spicymint.com
994 | real.com-o01
995 | targetnet.com
996 | effectivebrand.com
997 | dallascowboys.com
998 | leadgenetwork.com
999 | in.us
1000 | vistaprint.com
1001 |
--------------------------------------------------------------------------------
/PC Lab 2/help files/grf.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 2/help files/grf.pdf
--------------------------------------------------------------------------------
/PC Lab 2/help files/rpart.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 2/help files/rpart.pdf
--------------------------------------------------------------------------------
/PC Lab 2/trees_foests_tutorial.r:
--------------------------------------------------------------------------------
1 | ######################## Load Packages and Data ########################
2 |
3 | # Load packages
4 | library(rpart)
5 | library(rpart.plot)
6 | library(grf)
7 | library(DiagrammeR)
8 |
9 | # Load data
10 | data_2006 <-read.csv("browser_2006.csv", sep = ",")
11 | data_new <-read.csv("browser_new.csv", sep = ",")
12 |
13 | # Data preparation
14 | y_2006 <- as.matrix(data_2006[,2])
15 | x_2006 <- as.matrix(data_2006[,c(3:ncol(data_2006))])
16 | id_2006 <- as.matrix(data_2006[,1])
17 | x_new <- as.matrix(data_new[,c(2:ncol(data_new))])
18 | id_new <- as.matrix(data_new[,1])
19 |
20 | print('Packages and data successfully loaded.')
21 |
22 | #############################################################################
23 |
24 | ######################## Average Spending ########################
25 |
26 | spending <- round(???, digits=2)
27 | print(paste0("In 2006, the average spending is ", spending, " US-dollars"))
28 |
29 | ####################################################################
30 |
31 | ######################## Online Time ########################
32 |
33 | freq <- round(x_2006[id_2006==921,x_2006[id_2006==921,] == ???], digit = 0)
34 | page <- names(freq)
35 |
36 | print(paste0("Household 921 is most of the time on the webpage ", page))
37 | print(paste0(freq, "% of the online time is the household on this webpage"))
38 |
39 | ################################################################
40 |
41 | ######################## Log Transformation ########################
42 |
43 | log_y_2006 = as.matrix(???) # take logarithm
44 |
45 | # Cumulative Distribution of Spending
46 | plot(ecdf(y_2006), xlab = "Spending in US-Dollars", sub = "(Truncated at 20,000 US-Dollars)",
47 | ylab = "cdf", main = "Distribution of Spending", xlim= c(0,20000))
48 |
49 | # Cumulative Distribution of Log Spending
50 | plot(ecdf(log_y_2006), xlab = "log Spending", ylab = "cdf", main = "Distribution of Log Spending")
51 |
52 | #######################################################################
53 |
54 | ######################## Training and Test Samples ########################
55 |
56 | set.seed(1001)
57 | # Generate variable with the rows in training data
58 | size <- floor(0.5 * nrow(data_2006))
59 | training_set <- sample(seq_len(nrow(data_2006)), size = size)
60 |
61 | print('Training and test samples created.')
62 |
63 | #############################################################################
64 |
65 | ######################## Shallow Tree ########################
66 |
67 | # Prepare data for tree estimator
68 | outcome <- log_y_2006[training_set]
69 | tree_data_2006 <- data.frame(outcome, x_2006[training_set,])
70 |
71 | # Build shallow tree
72 | set.seed(1001)
73 | shallow_tree <- rpart(formula = outcome ~., data = tree_data_2006, method = "anova", xval = 10,
74 | y = TRUE, control = rpart.control(cp = 0.00002, minbucket=150))
75 | # Note: 'minbucket=150' imposes the restriction that each terminal leaf should contain at least 150 observations.
76 | # The algorithm 'rpart' stops growing trees when either one leaf has fewer than 150 observations or
77 | # the MSE gain of adding one additional leaf is below cp=0.00002.
78 |
79 | ## Plot tree structure
80 | rpart.plot(shallow_tree,digits=3)
81 |
82 | # bizrate.com
83 | # fedex.com
84 |
85 | ################################################################
86 |
87 | ######################## Deep Tree ########################
88 | set.seed(1001)
89 | deep_tree <- rpart(formula = outcome ~., data = tree_data_2006, ???)
90 |
91 | print('Relative CV-MSE for different tree sizes')
92 | print(deep_tree$cptable)
93 |
94 | # Plot CV-MSE
95 | plotcp(deep_tree)
96 |
97 | #############################################################
98 |
99 | ######################## Optimal Tree Size ########################
100 |
101 | op.index <- which.min(deep_tree$cptable[, "xerror"])
102 | op.size <- deep_tree$cptable[op.index, "nsplit"] +1
103 | print(paste0("Optimal number final leaves: ", op.size))
104 |
105 | #####################################################################
106 |
107 | ######################## Pruned Tree ########################
108 |
109 | # Select the Tree that Minimises CV-MSE
110 | # Get cp-value that corresponds to optimal tree size
111 | cp.vals <- deep_tree$cptable[op.index, "CP"]
112 |
113 | # Prune the deep tree
114 | pruned_tree <- prune(???, cp = cp.vals)
115 |
116 | ## Plot tree structure
117 | rpart.plot(pruned_tree,digits=3)
118 |
119 | # aggregateknowledge.com
120 |
121 | ################################################################
122 |
123 | ######################## Out-of-Sample Performance ########################
124 |
125 | # Predict log online spending
126 | pred_tree <- predict(???, newdata= as.data.frame(x_2006))
127 |
128 | # Test sample data
129 | outcome_test <- log_y_2006[-training_set]
130 | pred_tree_test <- pred_tree[-training_set]
131 |
132 | # R-squared
133 | MSE_tree <- mean((outcome_test-pred_tree_test)^2)
134 | r2_tree <- round(1- MSE_tree/var(outcome_test), digits = 3)
135 | print(paste0("Test sample R-squared: ", r2_tree))
136 |
137 | ##############################################################################
138 |
139 | ######################## Random Forest ########################
140 |
141 | rep <- 1000 # number of trees
142 | cov <- 1/3 # share of covariates
143 | frac <- 1/2 # fraction of subsample
144 | min_obs <- 100 # min. size of terminal leaves in trees
145 |
146 | # Build Forest
147 | set.seed(10001)
148 | forest1 <- regression_forest(x_2006[training_set,],log_y_2006[training_set,],
149 | mtry = floor(cov*ncol(x_2006)), sample.fraction = frac, num.trees = rep,
150 | min.node.size = min_obs, honesty=FALSE)
151 |
152 | print('Forest is built.')
153 |
154 | ##################################################################
155 |
156 | ######################## Plot Example Tree ########################
157 |
158 | # Plot a tree of the forest
159 | # Just an illustration, overall the forest contains 1000 trees
160 | tree <- get_tree(???,1) # here we select tree number 1
161 | plot(tree)
162 |
163 | #####################################################################
164 |
165 | ######################## Variable Importance ########################
166 |
167 | # Plot the variable importance
168 | # First we consider only first split
169 | imp1 <- variable_importance(forest1, max.depth = 1)
170 | print(cbind(colnames(x_2006[,imp1>0.02]),imp1[imp1>0.02]))
171 |
172 | # Now we consider the first four splits
173 | imp2 <- round(variable_importance(forest1, decay.exponent = 2, max.depth = 4), digits = 3)
174 | print(cbind(colnames(x_2006[,imp2>0.02]),imp2[imp2>0.02]))
175 |
176 | ########################################################################
177 |
178 | ######################## Out-of-Sample Performance ########################
179 |
180 | # Prediction
181 | fit <- predict(???, newdata = x_2006[-training_set,])$predictions
182 |
183 | # R-squared
184 | SST <- mean(((log_y_2006[-training_set,])-mean((log_y_2006[-training_set,])))^2)
185 | MSE1 <- mean(((log_y_2006[-training_set,])-fit)^2)
186 | r2_1 <- round(1- MSE1/SST, digits = 3)
187 | print(paste0("Test sample R-squared: ", r2_1))
188 |
189 | #############################################################################
190 |
191 | ######################## Area Under the Curve (AUC) ########################
192 |
193 | sizes <- c(1000,500,400,300, 200, 100, 50, 40,30,20,10, 5,4,3,2,1) # Select a grid of numbers of trees
194 | # Prepare matrix to store results
195 | auc <- matrix(NA, nrow = length(sizes), ncol = 3)
196 | colnames(auc) <- c("Trees", "AUC", "Marginal AUC")
197 | auc[,1] <- sizes
198 | # Sum of Squares Total
199 | SST <- mean(((log_y_2006[-training_set,])-(mean(log_y_2006[-training_set,])))^2)
200 |
201 | set.seed(10001) # set starting value
202 | for (t in sizes){
203 | # Estimate Forests
204 | forest <- regression_forest(x_2006[training_set,],(log_y_2006[training_set,]), mtry = floor(cov*ncol(x_2006)),
205 | sample.fraction = frac, num.trees = t, min.node.size = min_obs, honesty=FALSE)
206 | fit <- predict(forest, newdata = x_2006[-training_set,])$predictions # prediction in test sample
207 | auc[auc[,1]== t,2] <- 1- mean(((log_y_2006[-training_set,])-fit)^2)/SST # store R-squared
208 | }
209 | auc[,3] <- auc[,2] - rbind(as.matrix(auc[-1,2]),auc[nrow(auc),2])
210 |
211 | # Plot AUC (R-squared by number of trees)
212 | plot(auc[,1],auc[,2],type = "o",xlab="Trees", ylab= "R-squared", main = "AUC")
213 | abline(a=0,b=0, col="red")
214 |
215 | ################################################################################
216 |
217 | ######################## Deep Forest ########################
218 |
219 | min_obs <- 5
220 | # Build Forest
221 | forest2 <- regression_forest(x_2006[training_set,],log_y_2006[training_set,],
222 | ???)
223 |
224 | # Prediction
225 | fit <- predict(forest2, newdata = x_2006[-training_set,])$predictions
226 |
227 | # R-squared
228 | SST <- mean((log_y_2006[-training_set,]-mean(log_y_2006[-training_set,]))^2)
229 | MSE2 <- mean((log_y_2006[-training_set,]-fit)^2)
230 | r2_2 <- round(1- MSE2/SST, digits = 3)
231 | print(cbind(r2_1,r2_2))
232 |
233 | # Plot tree
234 | tree <- get_tree(forest2, 34)
235 | plot(tree)
236 |
237 | ###############################################################
238 |
239 | ######################## Store Prediction for Hold-out-Sample ########################
240 |
241 | # Hold-out-Sample Prediction
242 | fit_new <- predict(???, newdata = x_new)$predictions
243 |
244 | results <- as.matrix(cbind(id_new,fit_new)) # store IDs and predictions in one matrix
245 | colnames(results) <- c("id","predictions") # label columns
246 |
247 | # Store results
248 | write.csv(results, "predictions.csv")
249 |
250 | print('Results for the hold-out-sample stored.')
251 |
252 | #########################################################################################
253 |
--------------------------------------------------------------------------------
/PC Lab 3/help files/R_ K-Means Clustering.html:
--------------------------------------------------------------------------------
1 | R: K-Means Clustering
2 |
3 |
4 |
5 |
6 | kmeans {stats} R Documentation
7 |
8 |
9 | K-Means Clustering
10 |
11 |
12 | Description
13 |
14 | Perform k-means clustering on a data matrix.
15 |
16 |
17 |
18 | Usage
19 |
20 |
21 | kmeans(x, centers, iter.max = 10, nstart = 1,
22 | algorithm = c("Hartigan-Wong", "Lloyd", "Forgy",
23 | "MacQueen"), trace=FALSE)
24 | ## S3 method for class 'kmeans'
25 | fitted(object, method = c("centers", "classes"), ...)
26 |
27 |
28 |
29 | Arguments
30 |
31 |
32 | x
33 |
34 | numeric matrix of data, or an object that can be coerced to
35 | such a matrix (such as a numeric vector or a data frame with all
36 | numeric columns).
37 |
38 | centers
39 |
40 | either the number of clusters, say k , or a set of
41 | initial (distinct) cluster centres. If a number, a random set of
42 | (distinct) rows in x
is chosen as the initial centres.
43 |
44 | iter.max
45 |
46 | the maximum number of iterations allowed.
47 |
48 | nstart
49 |
50 | if centers
is a number, how many random sets
51 | should be chosen?
52 |
53 | algorithm
54 |
55 | character: may be abbreviated. Note that
56 | "Lloyd"
and "Forgy"
are alternative names for one
57 | algorithm.
58 |
59 | object
60 |
61 | an R object of class "kmeans", typically the
62 | result ob of ob <- kmeans(..).
63 |
64 | method
65 |
66 | character: may be abbreviated. "centers"
causes
67 | fitted
to return cluster centers (one for each input point) and
68 | "classes"
causes fitted
to return a vector of class
69 | assignments.
70 |
71 | trace
72 |
73 | logical or integer number, currently only used in the
74 | default method ("Hartigan-Wong"
): if positive (or true),
75 | tracing information on the progress of the algorithm is
76 | produced. Higher values may produce more tracing information.
77 |
78 | ...
79 |
80 | not used.
81 |
82 |
83 |
84 |
85 | Details
86 |
87 | The data given by x
are clustered by the k -means method,
88 | which aims to partition the points into k groups such that the
89 | sum of squares from points to the assigned cluster centres is minimized.
90 | At the minimum, all cluster centres are at the mean of their Voronoi
91 | sets (the set of data points which are nearest to the cluster centre).
92 |
93 | The algorithm of Hartigan and Wong (1979) is used by default. Note
94 | that some authors use k -means to refer to a specific algorithm
95 | rather than the general method: most commonly the algorithm given by
96 | MacQueen (1967) but sometimes that given by Lloyd (1957) and Forgy
97 | (1965). The Hartigan–Wong algorithm generally does a better job than
98 | either of those, but trying several random starts (nstart > 1)
99 | is often recommended. In rare cases, when some of the points
100 | (rows of x
) are extremely close, the algorithm may not converge
101 | in the “Quick-Transfer” stage, signalling a warning (and
102 | returning ifault = 4
). Slight
103 | rounding of the data may be advisable in that case.
104 |
105 | For ease of programmatic exploration, k=1 is allowed, notably
106 | returning the center and withinss
.
107 |
108 | Except for the Lloyd–Forgy method, k clusters will always be
109 | returned if a number is specified.
110 | If an initial matrix of centres is supplied, it is possible that
111 | no point will be closest to one or more centres, which is currently
112 | an error for the Hartigan–Wong method.
113 |
114 |
115 |
116 | Value
117 |
118 | kmeans
returns an object of class "kmeans"
which has a
119 | print
and a fitted
method. It is a list with at least
120 | the following components:
121 |
122 |
123 | cluster
124 |
125 |
126 | A vector of integers (from 1:k
) indicating the cluster to
127 | which each point is allocated.
128 |
129 |
130 | centers
131 |
132 | A matrix of cluster centres.
133 |
134 | totss
135 |
136 | The total sum of squares.
137 |
138 | withinss
139 |
140 | Vector of within-cluster sum of squares,
141 | one component per cluster.
142 |
143 | tot.withinss
144 |
145 | Total within-cluster sum of squares,
146 | i.e. sum(withinss)
.
147 |
148 | betweenss
149 |
150 | The between-cluster sum of squares,
151 | i.e. totss-tot.withinss
.
152 |
153 | size
154 |
155 | The number of points in each cluster.
156 |
157 | iter
158 |
159 | The number of (outer) iterations.
160 |
161 | ifault
162 |
163 | integer: indicator of a possible algorithm problem
164 | – for experts.
165 |
166 |
167 |
168 |
169 | References
170 |
171 | Forgy, E. W. (1965).
172 | Cluster analysis of multivariate data: efficiency vs interpretability
173 | of classifications.
174 | Biometrics , 21 , 768–769.
175 |
176 | Hartigan, J. A. and Wong, M. A. (1979).
177 | Algorithm AS 136: A K-means clustering algorithm.
178 | Applied Statistics , 28 , 100–108.
179 | doi: 10.2307/2346830 .
180 |
181 | Lloyd, S. P. (1957, 1982).
182 | Least squares quantization in PCM.
183 | Technical Note, Bell Laboratories.
184 | Published in 1982 in IEEE Transactions on Information Theory ,
185 | 28 , 128–137.
186 |
187 | MacQueen, J. (1967).
188 | Some methods for classification and analysis of multivariate
189 | observations.
190 | In Proceedings of the Fifth Berkeley Symposium on Mathematical
191 | Statistics and Probability ,
192 | eds L. M. Le Cam & J. Neyman,
193 | 1 , pp. 281–297.
194 | Berkeley, CA: University of California Press.
195 |
196 |
197 |
198 | Examples
199 |
200 |
201 | require(graphics)
202 |
203 | # a 2-dimensional example
204 | x <- rbind(matrix(rnorm(100, sd = 0.3), ncol = 2),
205 | matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2))
206 | colnames(x) <- c("x", "y")
207 | (cl <- kmeans(x, 2))
208 | plot(x, col = cl$cluster)
209 | points(cl$centers, col = 1:2, pch = 8, cex = 2)
210 |
211 | # sum of squares
212 | ss <- function(x) sum(scale(x, scale = FALSE)^2)
213 |
214 | ## cluster centers "fitted" to each obs.:
215 | fitted.x <- fitted(cl); head(fitted.x)
216 | resid.x <- x - fitted(cl)
217 |
218 | ## Equalities : ----------------------------------
219 | cbind(cl[c("betweenss", "tot.withinss", "totss")], # the same two columns
220 | c(ss(fitted.x), ss(resid.x), ss(x)))
221 | stopifnot(all.equal(cl$ totss, ss(x)),
222 | all.equal(cl$ tot.withinss, ss(resid.x)),
223 | ## these three are the same:
224 | all.equal(cl$ betweenss, ss(fitted.x)),
225 | all.equal(cl$ betweenss, cl$totss - cl$tot.withinss),
226 | ## and hence also
227 | all.equal(ss(x), ss(fitted.x) + ss(resid.x))
228 | )
229 |
230 | kmeans(x,1)$withinss # trivial one-cluster, (its W.SS == ss(x))
231 |
232 | ## random starts do help here with too many clusters
233 | ## (and are often recommended anyway!):
234 | (cl <- kmeans(x, 5, nstart = 25))
235 | plot(x, col = cl$cluster)
236 | points(cl$centers, col = 1:5, pch = 8)
237 |
238 |
239 | [Package
stats version 4.1.0
Index ]
240 |
241 |
--------------------------------------------------------------------------------
/PC Lab 3/help files/R_ Principal Components Analysis.html:
--------------------------------------------------------------------------------
1 | R: Principal Components Analysis
2 |
3 |
4 |
5 |
6 | prcomp {stats} R Documentation
7 |
8 | Principal Components Analysis
9 |
10 | Description
11 |
12 | Performs a principal components analysis on the given data matrix
13 | and returns the results as an object of class prcomp
.
14 |
15 |
16 | Usage
17 |
18 |
19 | prcomp(x, ...)
20 |
21 | ## S3 method for class 'formula'
22 | prcomp(formula, data = NULL, subset, na.action, ...)
23 |
24 | ## Default S3 method:
25 | prcomp(x, retx = TRUE, center = TRUE, scale. = FALSE,
26 | tol = NULL, rank. = NULL, ...)
27 |
28 | ## S3 method for class 'prcomp'
29 | predict(object, newdata, ...)
30 |
31 |
32 |
33 | Arguments
34 |
35 |
36 | formula
37 |
38 | a formula with no response variable, referring only to
39 | numeric variables.
40 |
41 | data
42 |
43 | an optional data frame (or similar: see
44 | model.frame
) containing the variables in the
45 | formula formula
. By default the variables are taken from
46 | environment(formula)
.
47 |
48 | subset
49 |
50 | an optional vector used to select rows (observations) of the
51 | data matrix x
.
52 |
53 | na.action
54 |
55 | a function which indicates what should happen
56 | when the data contain NA
s. The default is set by
57 | the na.action
setting of options
, and is
58 | na.fail
if that is unset. The ‘factory-fresh’
59 | default is na.omit
.
60 |
61 | ...
62 |
63 | arguments passed to or from other methods. If x
is
64 | a formula one might specify scale.
or tol
.
65 |
66 | x
67 |
68 | a numeric or complex matrix (or data frame) which provides
69 | the data for the principal components analysis.
70 |
71 | retx
72 |
73 | a logical value indicating whether the rotated variables
74 | should be returned.
75 |
76 | center
77 |
78 | a logical value indicating whether the variables
79 | should be shifted to be zero centered. Alternately, a vector of
80 | length equal the number of columns of x
can be supplied.
81 | The value is passed to scale
.
82 |
83 | scale.
84 |
85 | a logical value indicating whether the variables should
86 | be scaled to have unit variance before the analysis takes
87 | place. The default is FALSE
for consistency with S, but
88 | in general scaling is advisable. Alternatively, a vector of length
89 | equal the number of columns of x
can be supplied. The
90 | value is passed to scale
.
91 |
92 | tol
93 |
94 | a value indicating the magnitude below which components
95 | should be omitted. (Components are omitted if their
96 | standard deviations are less than or equal to tol
times the
97 | standard deviation of the first component.) With the default null
98 | setting, no components are omitted (unless rank.
is specified
99 | less than min(dim(x))
.). Other settings for tol could be
100 | tol = 0
or tol = sqrt(.Machine$double.eps)
, which
101 | would omit essentially constant components.
102 |
103 | rank.
104 |
105 | optionally, a number specifying the maximal rank, i.e.,
106 | maximal number of principal components to be used. Can be set as
107 | alternative or in addition to tol
, useful notably when the
108 | desired rank is considerably smaller than the dimensions of the matrix.
109 |
110 | object
111 |
112 | object of class inheriting from "prcomp"
113 |
114 | newdata
115 |
116 | An optional data frame or matrix in which to look for
117 | variables with which to predict. If omitted, the scores are used.
118 | If the original fit used a formula or a data frame or a matrix with
119 | column names, newdata
must contain columns with the same
120 | names. Otherwise it must contain the same number of columns, to be
121 | used in the same order.
122 |
123 |
124 |
125 |
126 |
127 | Details
128 |
129 | The calculation is done by a singular value decomposition of the
130 | (centered and possibly scaled) data matrix, not by using
131 | eigen
on the covariance matrix. This
132 | is generally the preferred method for numerical accuracy. The
133 | print
method for these objects prints the results in a nice
134 | format and the plot
method produces a scree plot.
135 |
136 | Unlike princomp
, variances are computed with the usual
137 | divisor N - 1 .
138 |
139 | Note that scale = TRUE
cannot be used if there are zero or
140 | constant (for center = TRUE
) variables.
141 |
142 |
143 |
144 | Value
145 |
146 | prcomp
returns a list with class "prcomp"
147 | containing the following components:
148 |
149 |
150 | sdev
151 |
152 | the standard deviations of the principal components
153 | (i.e., the square roots of the eigenvalues of the
154 | covariance/correlation matrix, though the calculation
155 | is actually done with the singular values of the data matrix).
156 |
157 | rotation
158 |
159 | the matrix of variable loadings (i.e., a matrix
160 | whose columns contain the eigenvectors). The function
161 | princomp
returns this in the element loadings
.
162 |
163 | x
164 |
165 | if retx
is true the value of the rotated data (the
166 | centred (and scaled if requested) data multiplied by the
167 | rotation
matrix) is returned. Hence, cov(x)
is the
168 | diagonal matrix diag(sdev^2)
. For the formula method,
169 | napredict ()
is applied to handle the treatment of values
170 | omitted by the na.action
.
171 |
172 | center, scale
173 |
174 | the centering and scaling used, or FALSE
.
175 |
176 |
177 |
178 |
179 | Note
180 |
181 | The signs of the columns of the rotation matrix are arbitrary, and
182 | so may differ between different programs for PCA, and even between
183 | different builds of R .
184 |
185 |
186 |
187 | References
188 |
189 | Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988)
190 | The New S Language .
191 | Wadsworth & Brooks/Cole.
192 |
193 | Mardia, K. V., J. T. Kent, and J. M. Bibby (1979)
194 | Multivariate Analysis , London: Academic Press.
195 |
196 | Venables, W. N. and B. D. Ripley (2002)
197 | Modern Applied Statistics with S , Springer-Verlag.
198 |
199 |
200 |
201 | See Also
202 |
203 | biplot.prcomp
, screeplot
,
204 | princomp
, cor
, cov
,
205 | svd
, eigen
.
206 |
207 |
208 |
209 | Examples
210 |
211 |
212 | C <- chol(S <- toeplitz(.9 ^ (0:31))) # Cov.matrix and its root
213 | all.equal(S, crossprod(C))
214 | set.seed(17)
215 | X <- matrix(rnorm(32000), 1000, 32)
216 | Z <- X %*% C ## ==> cov(Z) ~= C'C = S
217 | all.equal(cov(Z), S, tol = 0.08)
218 | pZ <- prcomp(Z, tol = 0.1)
219 | summary(pZ) # only ~14 PCs (out of 32)
220 | ## or choose only 3 PCs more directly:
221 | pz3 <- prcomp(Z, rank. = 3)
222 | summary(pz3) # same numbers as the first 3 above
223 | stopifnot(ncol(pZ$rotation) == 14, ncol(pz3$rotation) == 3,
224 | all.equal(pz3$sdev, pZ$sdev, tol = 1e-15)) # exactly equal typically
225 |
226 | ## signs are random
227 | require(graphics)
228 | ## the variances of the variables in the
229 | ## USArrests data vary by orders of magnitude, so scaling is appropriate
230 | prcomp(USArrests) # inappropriate
231 | prcomp(USArrests, scale = TRUE)
232 | prcomp(~ Murder + Assault + Rape, data = USArrests, scale = TRUE)
233 | plot(prcomp(USArrests))
234 | summary(prcomp(USArrests, scale = TRUE))
235 | biplot(prcomp(USArrests, scale = TRUE))
236 |
237 |
238 |
239 | [Package
stats version 4.0.2
Index ]
240 |
241 |
--------------------------------------------------------------------------------
/PC Lab 3/rollcall-members.Rdata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 3/rollcall-members.Rdata
--------------------------------------------------------------------------------
/PC Lab 3/rollcall-votes.Rdata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 3/rollcall-votes.Rdata
--------------------------------------------------------------------------------
/PC Lab 3/unsupervised_tutorial.r:
--------------------------------------------------------------------------------
1 | ######################## Load Data ########################
2 |
3 | ### Load data
4 | load("rollcall-votes.Rdata")
5 | load("rollcall-members.Rdata")
6 |
7 | print('Data loaded.')
8 |
9 | ##############################################################
10 |
11 | print('# Counts of Democrats, Republicans and one special politician')
12 | table(members$party)
13 |
14 | print('# Shares of Democrats, Republicans and one special politician')
15 | round(table(members$party)/nrow(members),3)
16 |
17 | # Count missing votes for each politician and plot the counts
18 | missings <- rowSums(votes[,(1:ncol(votes))]==0)
19 |
20 | # No. politicians who always voted
21 | sum(missings == 0)
22 |
23 | # Shares of missing votes
24 | s_missings <- missings/(ncol(votes)-1)
25 |
26 | # Histogram with 100 bins
27 | hist(???, breaks = 100)
28 |
29 | # Counts - yeas and nays
30 | yeas <- rowSums(votes[,(1:ncol(votes))]== ???)
31 | nays <- rowSums(votes[,(1:ncol(votes))]== ???)
32 |
33 | # Plots - Party
34 | plot(yeas, nays, col = members$party)
35 | legend('topleft', legend = levels(members$party), col = 1:3, pch = 1)
36 |
37 | # PCA
38 | pr.out = prcomp(??? , center = TRUE, scale = TRUE)
39 |
40 | # No of principal components
41 | dim(pr.out$rotation)[2]
42 |
43 | # variance explained by each component
44 | pr.var = pr.out$sdev^2
45 |
46 | # Proportion of variance explained
47 | pve=pr.var/sum(pr.var)
48 |
49 | # Print first 10 PC
50 | pve[1:10]
51 |
52 | # Plot the first 10 PC
53 | barplot(pve[1:10], xlab=" Principal Component ", ylab=" Proportion of Variance Explained ", ylim=c(0,1))
54 | barplot(cumsum(pve[1:10]), xlab=" Principal Component ", ylab ="Cumulative Proportion of Variance Explained ", ylim=c(0,1))
55 |
56 | # Plot the first two principal components, color the party membership
57 | plot(pr.out$x[,1], pr.out$x[,2], xlab = "PC1", ylab = "PC2", col = members$party, main = "Top two PC directions")
58 | legend('bottomright', legend = levels(members$party), col = 1:3, pch = 1)
59 |
60 | ## Far right (very conservative)
61 | head(sort(???))
62 |
63 | ## Far left (very liberal)
64 | head(sort(???, decreasing=???))
65 |
66 | # PC 2
67 | head(sort(???))
68 | # No clear pattern based on party and state information
69 |
70 | # Look at the largest loadings in PC2 to discern an interpretation.
71 | loadings <- pr.out$rotation
72 | loadings[order(abs(loadings[,2]), decreasing=TRUE)[1:5],2]
73 |
74 | # Analyze voting behavior
75 | table(votes[,1146])
76 | table(votes[,658])
77 | table(votes[,1090])
78 |
79 | # Either everyone voted "yea" or missed the voting.
80 | # These votes all correspond to near-unanimous symbolic action.
81 |
82 | # Mystery Solved: the second PC is just attendance!
83 | head(sort(rowSums(votes==0), decreasing=TRUE))
84 |
85 | set.seed(11122019)
86 |
87 | # K-means clustering with 2 clusters
88 | km.out = kmeans(???, 2, nstart = 20)
89 | km.out$cluster
90 |
91 | # Tabulate party vs cluster
92 | table(members$party, km.out$cluster)
93 |
94 | # How to analyze the optimal number of clusters
95 |
96 | sse <- c()
97 | sse[1] <- Inf
98 |
99 | for (ind_cl in c(2:20)) {
100 | set.seed(3)
101 | km.out = kmeans (votes, ind_cl, nstart = 20)
102 | sse[ind_cl] = km.out$tot.withinss
103 | }
104 |
105 | plot(sse)
106 | # Optimum 4-5 clusters
107 |
108 | # Plot the 5 clusters on the PC components graph
109 | set.seed(3)
110 | km.out = kmeans (???, ???, nstart = 20)
111 |
112 | # Plot the first two principal components color the party membership
113 | plot(pr.out$x[,1], pr.out$x[,2], xlab = "PC1", ylab = "PC2", col = km.out$cluster, main = "Top two PC directions with 5 clusters")
114 | legend('bottomright', legend = c("Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4", "Cluster 5"), col = 1:5, pch = 1)
115 |
116 | # Analyzing how the number of starts work
117 | set.seed (3)
118 | print('With nstart = 1')
119 | km.out = kmeans (votes,6, nstart = ???)
120 | km.out$tot.withinss
121 |
122 | print('With nstart = 20')
123 | km.out =kmeans (votes,6, nstart = ???)
124 | km.out$tot.withinss
125 |
--------------------------------------------------------------------------------
/PC Lab 4/help files/glmnet_package.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 4/help files/glmnet_package.pdf
--------------------------------------------------------------------------------
/PC Lab 4/help files/hdm_package.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 4/help files/hdm_package.pdf
--------------------------------------------------------------------------------
/PC Lab 4/post_double_selection_tutorial.r:
--------------------------------------------------------------------------------
1 | ##############################################################################
2 | ######################## Load Packages and the Data ########################
3 | ##############################################################################
4 |
5 | ### Load the packages
6 | library(fBasics) # use for descriptive statistics
7 | library(tidyverse) # use for handling data
8 | library(caret) # use for handling data
9 | library(lmtest) # use for heteroscedasticity robust standard errors
10 | library(sandwich) # use for heteroscedasticity robust standard errors
11 | library(hdm) # use for Lasso and Post-Double-Selection
12 | library(glmnet) # use for lasso and Elastic Net regularized Generalized Linear Models
13 | options(warn=-1) # suppress warnings
14 |
15 | print('All packages successfully installed and loaded.')
16 |
17 | ### Load the Data
18 | set.seed(12345678)
19 | df <- read.csv("job_corps.csv",header=TRUE, sep=",") # load data from csv-file
20 | df <- df[sample(c(1:nrow(df)), size=3000, replace =F),] # Select a random subsample of 3000 observations
21 | print('Data successfully loaded.')
22 |
23 | ##############################################################################
24 |
25 | ##############################################################################
26 | ######################## Descriptive Statistics ############################
27 | ##############################################################################
28 |
29 | ## Table with Descriptive Statistics
30 | desc <- fBasics::basicStats(df) %>% t() %>% as.data.frame() %>%
31 | select(Mean, Stdev, Minimum, Maximum, nobs)
32 | print(round(desc, digits=2))
33 |
34 | ##############################################################################
35 |
36 | #########################################################################
37 | ######################## Univariate OLS Regression #####################
38 | #########################################################################
39 |
40 | ## Univariate OLS
41 | ols1 <- lm(EARNY4 ~ participation, data = df)
42 | summary(ols1)
43 |
44 | ## Store results
45 | results <- as.matrix(coef(summary(ols1))[2, c("Estimate", "Std. Error", "Pr(>|t|)")])
46 |
47 | # Prepare matrix to store results
48 | res <- matrix(NA,nrow=3,ncol=5)
49 | colnames(res) <- c("Univariate OLS", "Multivariate OLS1", "Multivariate OLS2",
50 | "Multivariate OLS3", "Multivariate OLS4")
51 | rownames(res) <- rownames(results)
52 | res[,1] <- results
53 |
54 | print(round(res[,1], digits=2))
55 |
56 | ########################################################################
57 |
58 | ########################################################################
59 | ######################## Standardized Differences #####################
60 | ########################################################################
61 |
62 | ## Means and standard deviations for the participants (D=1)
63 | desc_1 <- fBasics::basicStats(df[df$participation==1,]) %>% t() %>% as.data.frame() %>% select(Mean, Stdev)
64 |
65 | ## Means and standard deviations for the non-participants (D=0)
66 | desc_0 <- fBasics::basicStats(df[df$participation==0,]) %>% t() %>% as.data.frame() %>% select(Mean, Stdev)
67 |
68 | # Make table and add standardized differences
69 | desc <- cbind(desc_1[-c(1:3),],desc_0[-c(1:3),],
70 | 100*abs(desc_1[-c(1:3),1]-desc_0[-c(1:3),1])/sqrt(0.5*(desc_1[-c(1:3),2]^2+desc_0[-c(1:3),2]^2)))
71 | colnames(desc) <- c("D=1 Mean", "D=1 Std.Dev.", "D=0 Mean", "D=0 Std.Dev.", "Std.Diff.")
72 | print(round(desc, digits=2))
73 |
74 | ########################################################################
75 |
76 | #########################################################################
77 | ######################## Multivariate OLS Regression ###################
78 | #########################################################################
79 |
80 | ## Multivariate OLS
81 | ols2 <- lm(EARNY4 ~ participation + age_1 + age_3 + livespou + publich, data = df)
82 | summary(ols2)
83 | # Question: Why do we omit age_2?
84 |
85 | ## Store results
86 | results <- as.matrix(coef(summary(ols2))[2, c("Estimate", "Std. Error", "Pr(>|t|)")])
87 | res[,2] <- results
88 | print(round(res[,c(1:2)], digits=2))
89 |
90 | ## Relative change in the estimated effect
91 | print(paste0("Relative change in the estimated effect: ",round(100*(res[1,2]-res[1,1])/res[1,1], digits=1),"%"))
92 |
93 | ########################################################################
94 |
95 | #########################################################################
96 |
97 | ## Multivariate OLS
98 | ols3 <- lm(EARNY4 ~ ???, data = df)
99 | summary(ols3)
100 |
101 | ## Store results
102 | results <- as.matrix(coef(summary(ols3))[2, c("Estimate", "Std. Error", "Pr(>|t|)")])
103 | res[,3] <- results
104 | print(round(res[,c(1:3)], digits=2))
105 |
106 | ## Relative change in the estimated effect
107 | print(paste0("Relative change in the estimated effect: ",round(100*(res[1,3]-res[1,2])/res[1,2], digits=1),"%"))
108 |
109 | ########################################################################
110 |
111 | ###############################################################################
112 |
113 | ## Generate first-order interactions between all control variables
114 | interactions <- t(apply(df[,-c(1,2,3,6,11)], 1, combn, 2, prod))
115 | colnames(interactions) <- paste("Inter.V", combn(1:ncol(df[,-c(1,2,3,6,11)]), 2, paste, collapse="V"), sep="")
116 | print(paste0("Maximum number of interaction terms: ", ncol(interactions)))
117 |
118 | ## Merge baseline characteristics with interaction terms
119 | df_merge <- as.data.frame(cbind(df[,-c(1,2,3,6,11)], interactions))
120 |
121 | ## Eliminate collinear variables
122 | df2 = cor(df_merge)
123 | df2[is.na(df2)] <- 1
124 | hc = findCorrelation(df2, cutoff=0.8) # put any value as a "cutoff"
125 | hc = sort(hc)
126 | df_int = cbind(df[,c(1,3)],df_merge[,-c(hc)])
127 | print(paste0("Total number of control variables: ", ncol(df_int)-2))
128 |
129 | ###############################################################################
130 |
131 | ###############################################################################
132 |
133 | ## Multivariate OLS with all baseline characteristics and interaction terms
134 | ols4 <- lm(EARNY4 ~ ., data = df_int)
135 |
136 | ## Store results
137 | results <- as.matrix(coef(summary(ols4))[2, c("Estimate", "Std. Error", "Pr(>|t|)")])
138 | res[,4] <- results
139 | print(round(res[,c(1:4)], digits=2))
140 |
141 | ## Relative change in the estimated effect
142 | print(paste0("Relative change in the estimated effect: ",round(100*(res[1,4]-res[1,3])/res[1,3], digits=1),"%"))
143 |
144 | ########################################################################
145 |
146 | ###############################################################################
147 |
148 | # Set starting value for replicability
149 | set.seed(123456)
150 |
151 | # Specify number of random variables
152 | cols <- 1000
153 |
154 | # Generate random variables
155 | redundant_x <- matrix(rnorm(nrow(df_int)*cols), nrow = nrow(df_int)) # We draw from a random standard normal distribution
156 | colnames(redundant_x) <- paste("Rand.", 1:cols, sep="")
157 |
158 | # Merge random variables with baseline characteristics and interaction terms
159 | df_rand <- as.data.frame(cbind(df_int, redundant_x))
160 | print(paste0("Total number of control variables: ", ncol(df_rand)-2))
161 |
162 | ###############################################################################
163 |
164 | ###############################################################################
165 |
166 | ## Multivariate OLS with all baseline characteristics, interaction terms, and random variables
167 | ols5 <- lm(EARNY4 ~ ., data = df_rand)
168 |
169 | ## Store results
170 | results <- as.matrix(coef(summary(ols5))[2, c("Estimate", "Std. Error", "Pr(>|t|)")])
171 | res[,5] <- results
172 | print(round(res, digits=2))
173 |
174 | ## Relative change in the estimated effect
175 | print(paste0("Relative change in the estimated effect: ",round(100*(res[1,5]-res[1,4])/res[1,4], digits=1),"%"))
176 |
177 | ########################################################################
178 |
179 | ###############################################################################
180 | ########################### Earnings Equation #################################
181 | ###############################################################################
182 |
183 | # Predict earnings
184 | N <- nrow(df)
185 | st1 <- rlasso(as.matrix(df[,c(4:ncol(df))]), as.matrix(df$EARNY4),
186 | penalty = list(homoscedastic = FALSE, c= 1.1, gamma = 0.1/log(N)))
187 | summary(st1)
188 |
189 | # Store selected variables
190 | n1<- names(st1$coefficients[(st1$coefficients != 0) == TRUE])[-1]
191 |
192 | ###############################################################################
193 |
194 | ###############################################################################
195 | ######################### Participation Probability ###########################
196 | ###############################################################################
197 |
198 | # Predict participation
199 | N <- nrow(df)
200 | st2 <- rlasso(as.matrix(df[,c(4:ncol(df))]), as.matrix(df$participation),
201 | penalty = list(homoscedastic = FALSE, c= 1.1, gamma = 0.1/log(N)))
202 | summary(st2)
203 |
204 | # Store selected variables
205 | n2<- names(st2$coefficients[(st2$coefficients != 0) == TRUE])[-1]
206 |
207 | ###############################################################################
208 |
209 | ###############################################################################
210 | ################################# Post-Lasso ##################################
211 | ###############################################################################
212 |
213 | # Take union of selected covariates
214 | selected_covariates <- c("participation", unique(c(n1, n2)))
215 |
216 | # Setup the formula of the linear regression model
217 | sumx <- paste(selected_covariates, collapse = " + ")
218 | linear <- paste("EARNY4",paste(sumx, sep=" + "), sep=" ~ ")
219 | linear <- as.formula(linear)
220 |
221 | # Post-Lasso regression
222 | ols <- lm(linear, data = df)
223 | summary(ols)
224 |
225 | # Heteroskedasticity robust standard errors
226 | #coeftest(ols, vcov = vcovHC(ols, type = "HC1"))
227 |
228 | ###############################################################################
229 |
230 | ###############################################################################
231 | ################## Estimate the Treatment Effect Directly #####################
232 | ###############################################################################
233 |
234 | # Post-Double-Selection Procedure
235 | dsp <- rlassoEffect(as.matrix(df[,c(4:ncol(df))]), as.matrix(df$EARNY4)
236 | , as.matrix(df$participation), model = TRUE, penalty = list(homoscedastic = FALSE), method = "double selection")
237 | summary(dsp)
238 |
239 | ###############################################################################
240 | # Earning Equation
241 | ###############################################################################
242 |
243 | # Predict earnings
244 |
245 | # Store selected variables
246 |
247 | ###############################################################################
248 | # Participation Probability
249 | ###############################################################################
250 |
251 | # Predict participation
252 |
253 | # Store selected variables
254 |
255 | ###############################################################################
256 | # Post-Lasso Model
257 | ###############################################################################
258 |
259 | # Take union of selected covariates
260 | selected_covariates <- c("participation", unique(c(n1, n2)))
261 |
262 | # Setup the formula of the linear regression model
263 | sumx <- paste(selected_covariates, collapse = " + ")
264 | linear <- paste("EARNY4",paste(sumx, sep=" + "), sep=" ~ ")
265 | linear <- as.formula(linear)
266 |
267 | # Post-Lasso OLS regression
268 | ols <- lm(linear, data = df_rand)
269 | summary(ols)
270 |
271 | ###############################################################################
272 |
273 | ####################################################################
274 | ################# Cross-Validated Lasso ############################
275 | ####################################################################
276 |
277 | set.seed(123456789) # Starting value
278 |
279 | # Cross-validated Lasso in earnings equation
280 | lasso_earn <- cv.glmnet(as.matrix(df_int[,c(3:ncol(df_int))]), as.matrix(df$EARNY4),
281 | alpha=1, nfolds = 10, type.measure = 'mse', standardize = TRUE)
282 | # alpha = 1 is Lasso, alpha = 0 is Ridge
283 | # nfolds - number of cross-validation folds
284 | # type.measure - measure for model accuracy
285 |
286 | plot(lasso_earn)
287 |
288 | ####################################################################
289 |
290 | ####################################################################
291 |
292 | # Plot Lasso coefficients
293 | coef(lasso_earn,s = lasso_earn$lambda.1se)
294 | # $lambda.min - Lambda that minimizes cross-validated MSE
295 | # $lambda.1se - Lambda of 1 standard error rule
296 |
297 | ####################################################################
298 |
299 | ####################################################################
300 |
301 | # Select covariates with non-zero coefficients
302 | coef <- predict(lasso_earn,s = lasso_earn$lambda.min, type = "nonzero") #
303 | colnames <- colnames(df_int[,c(3:ncol(df_int))])
304 | n1 <- colnames[unlist(coef)]
305 | print(paste0("Number of Selected Variables Earnings Equation: ",length(n1)))
306 | print("Selected Variables:")
307 | print(n1)
308 |
309 | ####################################################################
310 |
311 | ####################################################################
312 |
313 | set.seed(123456789) # Starting value
314 |
315 | # Cross-validated Lasso in participation equation
316 | lasso_part <- cv.glmnet(???,
317 | alpha=1, nfolds = 10, type.measure = 'mse', standardize = TRUE)
318 | plot(lasso_part)
319 |
320 | ####################################################################
321 |
322 | ####################################################################
323 |
324 | # Select covariates with non-zero coefficients
325 | coef <- predict(???,s = ???, type = "nonzero") #
326 | colnames <- colnames(df_int[,c(3:ncol(df_int))])
327 | print(paste0("Number of Selected Variables Participation Equation: ",length(n2)))
328 | print("Selected Variables:")
329 | print(n2)
330 |
331 | ####################################################################
332 |
333 | ###############################################################################
334 | # Post-Lasso Model
335 | ###############################################################################
336 |
337 | # Take union of selected covariates
338 | selected_covariates <- c(???)
339 |
340 | # Setup the formula of the linear regression model
341 | sumx <- paste(selected_covariates, collapse = " + ")
342 | linear <- paste("EARNY4",paste(sumx, sep=" + "), sep=" ~ ")
343 | linear <- as.formula(linear)
344 |
345 | # Post-Lasso OLS regression
346 | ols <- lm(linear, data = df_int)
347 | summary(ols)
348 |
349 | ###############################################################################
350 |
--------------------------------------------------------------------------------
/PC Lab 5/double_machine_learning_tutorial.r:
--------------------------------------------------------------------------------
1 | ##############################################################################
2 | ######################## Load Packages and the Data ########################
3 | ##############################################################################
4 |
5 | ### Load the packages
6 | library(fBasics) # use for descriptive statistics
7 | library(tidyverse) # use for handling data
8 | library(DiagrammeR) # use for plotting trees
9 | library(lmtest) # use for heteroscedasticity robust standard errors
10 | library(sandwich) # use for heteroscedasticity robust standard errors
11 | library(grf) # use for generalized random forest
12 | library(glmnet) # use for lasso and Elastic Net regularized Generalized Linear Models
13 | options(warn=-1) # suppress warnings
14 |
15 | print('All packages successfully installed and loaded.')
16 |
17 | ### Load the Data
18 | set.seed(12345678)
19 | df <- read.csv("job_corps.csv",header=TRUE, sep=",") # load data from csv-file
20 | df <- df[sample(c(1:nrow(df)), size=3000, replace =F),] # Select a random subsample of 3000 observations
21 | print('Data successfully loaded.')
22 |
23 | ##############################################################################
24 |
25 | ##############################################################################
26 | ######################## Descriptive Statistics ############################
27 | ##############################################################################
28 |
29 | ## Table with Descriptive Statistics
30 | desc <- fBasics::basicStats(df) %>% t() %>% as.data.frame() %>%
31 | select(Mean, Stdev, Minimum, Maximum, nobs)
32 | print(round(desc, digits=2))
33 |
34 | ##############################################################################
35 |
36 | ###############################################################################
37 | ######################### Sample Splitting ####################################
38 | ###############################################################################
39 |
40 | # Set starting value
41 | set.seed(123456789)
42 |
43 | # Partition Samples for Cross-Fitting
44 | df_part <- modelr::resample_partition(df, c(obs_A = 0.5, obs_B = 0.5)) # Split sample in strata of equal size
45 | df_obs_A <- as.data.frame(df_part$obs_A) # Sample A
46 | df_obs_B <- as.data.frame(df_part$obs_B) # Sample B
47 |
48 | ## Generate Variables
49 | # Outcome variable
50 | earnings_obs_A <- as.matrix(df_obs_A[,1])
51 | earnings_obs_B <- as.matrix(df_obs_B[,1])
52 |
53 | # Treatment variable
54 | treat = 3 # Select treatment: 2 = offer to participate, 3 = actual participation
55 | treat_obs_A <- as.matrix(df_obs_A[,treat])
56 | treat_obs_B <- as.matrix(df_obs_B[,treat])
57 |
58 | # Covariates
59 | covariates_obs_A <- as.matrix(df_obs_A[,c(4:ncol(df_obs_A))])
60 | covariates_obs_B <- as.matrix(df_obs_B[,c(4:ncol(df_obs_B))])
61 |
62 | print('Sample partitioning ready.')
63 |
64 | ##############################################################################
65 |
66 | ###############################################################################
67 | ########### Conditional Potential Earnings under Non-Participation ############
68 | ###############################################################################
69 |
70 | p = 1 # 1 for LASSO, 0 for Ridge
71 |
72 | # Set starting value
73 | set.seed(123456789)
74 |
75 | # Estimate Lasso among non-participants in Sample A
76 | # Use cross-validation to select optimal lambda value
77 | lasso_y0_A <- cv.glmnet(covariates_obs_A[treat_obs_A==0,], earnings_obs_A[treat_obs_A==0,],
78 | alpha=p, type.measure = 'mse')
79 | # Plot the cross-validated MSE
80 | plot(lasso_y0_A)
81 |
82 | # Extrapolate the fitted values to Sample B
83 | y0hat_B <- predict(lasso_y0_A, newx = covariates_obs_B, type = 'response', s = lasso_y0_A$lambda.min)
84 |
85 | # Estimate Lasso among non-participants in Sample B
86 | lasso_y0_B <- cv.glmnet(covariates_obs_B[treat_obs_B==0,], earnings_obs_B[treat_obs_B==0,],
87 | alpha=p, type.measure = 'mse')
88 | # Plot the cross-validated MSE
89 | plot(lasso_y0_B)
90 |
91 | # Extrapolate the fitted values to Sample A
92 | y0hat_A <- predict(lasso_y0_B, newx = covariates_obs_A, type = 'response', s= lasso_y0_B$lambda.min)
93 |
94 | # Merge fitted values of both samples
95 | y0hat <- rbind(y0hat_A,y0hat_B)
96 |
97 | #################################################################################
98 |
99 | ###############################################################################
100 | ########### Conditional Potential Earnings under Participation ############
101 | ###############################################################################
102 |
103 | p = 1 # 1 for LASSO, 0 for Ridge
104 |
105 | # Set starting value
106 | set.seed(123456789)
107 |
108 | # Estimate Lasso among participants in Sample A
109 | # Use cross-validation to select optimal lambda value
110 | lasso_y1_A <- cv.glmnet(covariates_obs_A[treat_obs_A==1,], earnings_obs_A[treat_obs_A==1,],
111 | alpha=p, type.measure = 'mse')
112 | plot(lasso_y1_A)
113 |
114 | # Extrapolate the fitted values to Sample B
115 | y1hat_B <- predict(lasso_y1_A, newx = covariates_obs_B, type = 'response', s = lasso_y1_A$lambda.min)
116 |
117 | # Estimate Lasso among participants in Sample B
118 | lasso_y1_B <- cv.glmnet(covariates_obs_B[treat_obs_B==1,], earnings_obs_B[treat_obs_B==1,],
119 | alpha=p, type.measure = 'mse')
120 | plot(lasso_y1_B)
121 |
122 | # Extrapolate the fitted values to Sample A
123 | y1hat_A <- predict(lasso_y1_B, newx = covariates_obs_A, type = 'response', s= lasso_y1_B$lambda.min)
124 |
125 | # Merge the fitted values of both samples
126 | y1hat <- rbind(y1hat_A,y1hat_B)
127 |
128 | #################################################################################
129 |
130 | ###############################################################################
131 | ########################### Propensity Score ##################################
132 | ###############################################################################
133 |
134 | # Propensity Score
135 | p = 1 # 1 for LASSO, 0 for Ridge
136 |
137 | # Set starting value
138 | set.seed(123456789)
139 |
140 | # Estimate Logit-Lasso in Sample A
141 | # Use cross-validation to select optimal lambda value
142 | lasso_p_A <- cv.glmnet(covariates_obs_A, treat_obs_A, alpha=p, type.measure = 'mse', family="binomial")
143 | plot(lasso_p_A)
144 |
145 | # Extrapolate the fitted values to Sample B
146 | pscore_B <- predict(lasso_p_A, newx = covariates_obs_B, type = 'response', s= lasso_p_A$lambda.min)
147 |
148 | # Estimate Logit-Lasso in Sample B
149 | lasso_p_B <- cv.glmnet(covariates_obs_B, treat_obs_B, alpha=p, type.measure = 'mse', family="binomial")
150 | plot(lasso_p_B)
151 |
152 | # Extrapolate the fitted values to Sample A
153 | pscore_A <- predict(lasso_p_B, newx = covariates_obs_A, type = 'response', s= lasso_p_B$lambda.min)
154 |
155 | # Merge the fitted values of both samples
156 | pscore <- rbind(pscore_A,pscore_B)
157 |
158 | ###############################################################################
159 |
160 | ###############################################################################
161 | ################################### ATE Score #################################
162 | ###############################################################################
163 |
164 | # Merge earnings outcome of Sample A and B
165 | earnings_obs <- rbind(earnings_obs_A,earnings_obs_B)
166 |
167 | # Merge treatment of Sample A and B
168 | treat_obs <- rbind(treat_obs_A,treat_obs_B)
169 |
170 | # Calculate the ATE score using the formula described above
171 | Y_ate_star = invisible(???)
172 |
173 | # Calculate ATE
174 | # It is the sample average of the ATE score
175 | ate <- round(mean(Y_ate_star), digits = 2)
176 |
177 | # Calculate the standard errors of the ATE
178 | # Square root of the quotient of variance of the ATE score and the sample size
179 | se_ate <- round(sqrt(var(Y_ate_star)/length(Y_ate_star)), digits = 2)
180 |
181 |
182 | print(paste0("Average Treatment Effect (ATE): ", ate))
183 | print(paste0("Standard Error for ATE: ", se_ate))
184 |
185 | ###############################################################################
186 |
187 | ###############################################################################
188 | ################################## ATET Score #################################
189 | ###############################################################################
190 |
191 | ## Unconditional Treatment probability
192 | p = mean(pscore)
193 |
194 | # Calculate the ATET score using the formula described above
195 | Y_atet_star = invisible(???)
196 |
197 | # Calculate ATET
198 | # It is the sample average of the ATET score
199 | atet <- round(mean(Y_atet_star), digits = 2)
200 |
201 | # Calculate the standard errors of the ATET
202 | # Square root of the quotient of variance of the ATET score and the sample size
203 | se_atet <- round(sqrt(var(Y_atet_star)/length(Y_atet_star)), digits = 2)
204 |
205 | print(paste0("Average Treatment Effect for Treated (ATET): ", atet))
206 | print(paste0("Standard Error for ATET: ", se_atet))
207 |
208 | ###############################################################################
209 |
210 | ###############################################################################
211 | ##################################### CATEs ###################################
212 | ###############################################################################
213 |
214 | # Merge covariates of Sample A and B
215 | covariates_obs <- rbind(covariates_obs_A,covariates_obs_B)
216 |
217 | # Generate a new data frame
218 | # Merge the ATE score and the covariates
219 | colnames(Y_ate_star) <- "y_star"
220 | Y_star <- as.data.frame(cbind(Y_ate_star,covariates_obs[,-c(3,8)]))
221 |
222 | # Estimate an OLS regression
223 | # Regress the ATE score on the covariates
224 | cates <- lm(y_star ~., Y_star)
225 |
226 | # Heteroskedasticity robust standard errors
227 | coeftest(cates, vcov = vcovHC(cates, type = "HC1"))
228 |
229 | ###############################################################################
230 |
231 | ###############################################################################
232 |
233 | # Calculate the predicted effect size for each observation
234 | fit <- predict(cates)
235 |
236 | # Count the observations with positive and negative effects
237 | print(paste0("Number of individuals with positive effects: ", length(fit[fit>=0])))
238 | print(paste0("Number of individuals with negative effects: ", length(fit[fit<0])))
239 |
240 | ###############################################################################
241 |
242 | ###############################################################################
243 | ################ Plot Cumulative Distribution of CATEs ########################
244 | ###############################################################################
245 |
246 | plot(ecdf(fit), col="blue", xlim = c(-100,150), xlab="Effect Size (in Dollars)",
247 | ylab="Cumulative Distribution", main="Cumulative Distribution of the CATEs")
248 | abline(v=0, col="red")
249 |
250 | ###############################################################################
251 |
252 | ###############################################################################
253 | ######################## Description of CATEs #################################
254 | ###############################################################################
255 |
256 | ## Means and standard deviations for individuals with positive effects
257 | desc_1 <- fBasics::basicStats(Y_star[fit >= 0,-1]) %>% t() %>% as.data.frame() %>% select(Mean, Stdev)
258 |
259 | ## Means and standard deviations for individuals with negative effects
260 | desc_0 <- fBasics::basicStats(Y_star[fit < 0,-1]) %>% t() %>% as.data.frame() %>% select(Mean, Stdev)
261 |
262 | # Make table and add standardized differences
263 | desc <- cbind(desc_1,desc_0,
264 | 100*abs(desc_1[,1]-desc_0[,1])/sqrt(0.5*(desc_1[,2]^2+desc_0[,2]^2)))
265 | colnames(desc) <- c("Mean (Pos.)", "Std.Dev. (Pos.)", "Mean (Neg.)", "Std.Dev. (Neg.)", "Std.Diff.")
266 | print(round(desc, digits=2))
267 |
268 | ###############################################################################
269 |
270 | ###############################################################################
271 | ########### Conditional Potential Earnings under Non-Participation ############
272 | ###############################################################################
273 |
274 | # Set starting value
275 | set.seed(123456789)
276 |
277 | # Tuning parameters for forest
278 | trees = 1000 # number of trees in the forest
279 | frac = 0.5 # share of subsample used for each tree
280 | cov = floor(1/2*ncol(covariates_obs)) # number of covariates used for each tree
281 | min = 10 # minimum sample size in the terminal leaves of the trees
282 |
283 | # Estimate Random Forest among non-participants in Sample A
284 | forest_y0_A <- regression_forest(covariates_obs_A[treat_obs_A==0,], earnings_obs_A[treat_obs_A==0,],
285 | num.trees = trees, sample.fraction = frac, mtry = cov, min.node.size = min)
286 |
287 | # Extrapolate the fitted values to Sample B
288 | y0hat_B <- as.matrix(predict(forest_y0_A, newdata = covariates_obs_B)$predictions)
289 |
290 | print("Random Forest for Sample A estimated.")
291 |
292 | #################################################################################
293 |
294 | #################################################################################
295 |
296 | # Plot one tree from the random forest
297 | plot(tree <- get_tree(forest_y0_A, 1))
298 | # the last number is the tree number
299 | # it can be varied from 1 to 1000
300 |
301 | #################################################################################
302 |
303 | #################################################################################
304 |
305 | # Count the splitting frequencies for each covariate
306 | split <- split_frequencies(forest_y0_A, max.depth = 4)
307 | # max.depth specifies the maximum tree depth we consider
308 |
309 | # Label the results
310 | colnames(split) <- colnames(covariates_obs)
311 | rownames(split) <- c("Depth 1", "Depth 2", "Depth 3", "Depth 4")
312 |
313 | print(t(split))
314 |
315 | #################################################################################
316 |
317 | #################################################################################
318 |
319 | # Estimate Random Forest among non-participants in Sample B
320 | forest_y0_B <- regression_forest(???)
321 |
322 | # Extrapolate the fitted values to Sample A
323 | y0hat_A <- as.matrix(predict(forest_y0_B, newdata = covariates_obs_A)$predictions)
324 |
325 | # Merge fitted values of both samples
326 | y0hat <- rbind(y0hat_A,y0hat_B)
327 |
328 | print("Random Forest for Sample B estimated.")
329 |
330 | #################################################################################
331 |
332 | ###############################################################################
333 | ########################### Propensity Score ##################################
334 | ###############################################################################
335 |
336 | # Set starting value
337 | set.seed(123456789)
338 |
339 | # Tuning parameters for forest
340 | trees = 1000
341 | frac = 0.5
342 | cov = floor(1/2*ncol(covariates_obs))
343 | min = 10
344 |
345 | # Estimate Random Forest in Sample A
346 | forest_p_A <- regression_forest(covariates_obs_A, treat_obs_A,
347 | num.trees = trees, sample.fraction = frac, mtry = cov, min.node.size = min)
348 |
349 | # Extrapolate the fitted values to Sample B
350 | pscore_B <- as.matrix(predict(forest_p_A, newdata = covariates_obs_B)$predictions)
351 |
352 | ##############
353 |
354 | # Estimate Random Forest in Sample B
355 | forest_p_B <- regression_forest(covariates_obs_B, treat_obs_B,
356 | num.trees = trees, sample.fraction = frac, mtry = cov, min.node.size = min)
357 |
358 | # Extrapolate the fitted values to Sample A
359 | pscore_A <- as.matrix(predict(forest_p_B, newdata = covariates_obs_A)$predictions)
360 |
361 | # Merge the fitted values of both samples
362 | pscore <- rbind(pscore_A,pscore_B)
363 |
364 | print("Propensity score is estimated.")
365 |
366 | ###############################################################################
367 |
368 | ###############################################################################
369 | ################################## ATET Score #################################
370 | ###############################################################################
371 |
372 | ## Unconditional treatment probability (mean of the cross-fitted propensity scores)
373 | p = mean(pscore)
374 |
375 | # ATET score (formula above); plain parentheses replace the former no-op invisible() wrapper and keep the two-line expression as one statement
376 | Y_atet_star = (treat_obs*(earnings_obs - y0hat)/p
377 | - (1-treat_obs)*pscore*(earnings_obs - y0hat)/(p*(1-pscore)))
378 |
379 | # Calculate ATET
380 | # It is the sample average of the ATET score
381 | atet <- round(mean(Y_atet_star), digits = 2)
382 |
383 | # Standard error of the ATET:
384 | # square root of (variance of the ATET score / sample size)
385 | se_atet <- round(sqrt(var(Y_atet_star)/length(Y_atet_star)), digits = 2)
386 |
387 | print(paste0("Average Treatment Effect for Treated (ATET): ", atet))
388 | print(paste0("Standard Error for ATET: ", se_atet))
389 |
390 | ###############################################################################
391 |
--------------------------------------------------------------------------------
/PC Lab 5/help files/glmnet_package.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 5/help files/glmnet_package.pdf
--------------------------------------------------------------------------------
/PC Lab 5/help files/grf_package.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 5/help files/grf_package.pdf
--------------------------------------------------------------------------------
/PC Lab 6/causal_forest.r:
--------------------------------------------------------------------------------
1 | ######################## Load Packages ########################
2 |
3 | # List of required packages
4 | pkgs <- c('fBasics', 'corrplot', 'tidyverse', 'grf', 'plotmo')
5 |
6 | # Load each package; character.only = TRUE lets library() take the name from a variable
7 | for(pkg in pkgs){
8 | library(pkg, character.only = TRUE)
9 | }
10 | options(warn=-1) # suppress warnings globally for the whole session (hides all warnings, not only package ones)
11 |
12 | print('All packages successfully installed and loaded.')
13 |
14 | ##################################################################
15 |
16 | ######################## Load Data Frame ########################
17 |
18 | # Load data frame (expects fundraising.csv in the working directory)
19 | df <- read.csv("fundraising.csv",header=TRUE, sep=",")
20 |
21 | # Outcome variable: charitable giving
22 | outcome <- c("char_giving")
23 |
24 | # Treatment variable
25 | treatment <- c("treat")
26 |
27 | # Covariates/Features (pre-treatment giving history and household characteristics -- TODO confirm against the codebook)
28 | covariates <- c("amount_pre", "amount_lastpre", "amount_maxpre", "H_number_yearbefore", "H_ngifts",
29 | "H_littleask", "H_bigask", "H_nyears", "H_frequency", "H_medinc", "H_medinc_mdum",
30 | "H_Avg_years_ed", "H_Avg_years_ed_mdum")
31 |
32 |
33 | all_variables <- c(outcome, treatment, covariates)
34 |
35 | print('Data frame successfully loaded and sample selected.')
36 |
37 | ####################################################################
38 |
39 | ######################## Table with Descriptive Statistics ########################
40 |
41 | desc <- fBasics::basicStats(df) %>% t() %>% as.data.frame() %>%
42 | select(Mean, Stdev, Minimum, Maximum, nobs)
43 | print(round(desc, digits=2))
44 |
45 | #####################################################################################
46 |
47 | ######################## Correlation Matrix ########################
48 |
49 | corr = cor(df[,-c(1:2)]) # correlations among covariates only (drop outcome and treatment columns)
50 | corrplot(corr, type = "upper", tl.col = "black")
51 |
52 | ######################################################################
53 |
54 | ######################## Partition the Samples ########################
55 | set.seed(100239) # set starting value for random number generator
56 |
57 | # Partition Hold-Out-Sample (approx. 80% training/estimation, 20% hold-out)
58 | df_part <- modelr::resample_partition(df, c(obs = 0.8, hold_out = 0.2))
59 | df_obs <- as.data.frame(df_part$obs) # Training and estimation sample
60 | df_hold_out <- as.data.frame(df_part$hold_out) # Hold-out-sample
61 |
62 | print('Samples are partitioned.')
63 |
64 | ######################## Generate Variables ########################
65 |
66 | # Outcome (column 1; assumes df columns are ordered outcome, treatment, covariates -- TODO confirm against fundraising.csv)
67 | giving_hold_out <- as.matrix(df_hold_out[,1])
68 | giving_obs <- as.matrix(df_obs[,1])
69 |
70 | # Treatment (column 2)
71 | treat_hold_out <- as.matrix(df_hold_out[,2])
72 | treat_obs <- as.matrix(df_obs[,2])
73 |
74 | # Covariates (columns 3 onwards)
75 | covariates_hold_out <- as.matrix(df_hold_out[,c(3:ncol(df_hold_out))])
76 | covariates_obs <- as.matrix(df_obs[,c(3:ncol(df_obs))])
77 |
78 | print('The data is now ready for your analysis!')
79 |
80 | #######################################################################
81 |
82 | ######################## Causal Forest ########################
83 | set.seed(100244)
84 |
85 | # Tuning parameters
86 | min_tree = 100 # Minimum size of terminal leaves
87 | num_trees = 1000 # Number of trees in forest
88 | cov_frac = 1/2 # Fraction of covariates in each tree
89 | sample_part= 0.5 # Fraction of sample used for each tree (subsampling)
90 |
91 | # Causal Forest (honest estimation: honesty.fraction = 0.5 reserves half of each subsample for within-leaf estimation)
92 | cates <- causal_forest(covariates_obs, giving_obs, treat_obs,
93 | sample.fraction = sample_part, mtry = floor(cov_frac*ncol(covariates_obs)),
94 | num.trees = num_trees, min.node.size = min_tree,
95 | honesty = TRUE, honesty.fraction = 0.5)
96 |
97 | print('Forest is ready!')
98 |
99 | ###################################################################
100 |
101 | #################################################################################
102 |
103 | # Plot one tree from the causal forest
104 | plot(tree <- get_tree(cates, 1))
105 | # the last number is the tree number
106 | # it can be varied from 1 to 1000 (num_trees)
107 |
108 | #################################################################################
109 |
110 | #################################################################################
111 |
112 | # Count the splitting frequencies for each covariate
113 | split <- split_frequencies(cates, max.depth = 4)
114 | # max.depth specifies the maximum tree depth we consider
115 |
116 | # Label the results (rows: tree depth, columns: covariates)
117 | colnames(split) <- colnames(covariates_obs)
118 | rownames(split) <- c("Depth 1", "Depth 2", "Depth 3", "Depth 4")
119 |
120 | print(t(split))
121 |
122 | #################################################################################
123 |
124 | ######################### ATE ###############################
125 |
126 | average_treatment_effect(cates, target.sample = c("all")) # ATE over the full estimation sample
127 |
128 | #############################################################
129 |
130 | ###############################################################################
131 |
132 | # Predict the treatment effect (CATE) for each hold-out observation
133 | fit <- predict(cates, covariates_hold_out, estimate.variance = FALSE)$predictions
134 |
135 | # Count the observations with positive (incl. zero) and negative predicted effects
136 | print(paste0("Number of individuals with positive effects: ", length(fit[fit>=0])))
137 | print(paste0("Number of individuals with negative effects: ", length(fit[fit<0])))
138 |
139 | print(paste0("Share of individuals with positive effects: ", round(100*length(fit[fit>=0])/length(fit),digits=1), "%"))
140 |
141 | ###############################################################################
142 |
143 | ###############################################################################
144 | ################ Plot Cumulative Distribution of CATEs ########################
145 | ###############################################################################
146 |
147 | plot(ecdf(fit), col="blue", xlim = c(-25,25), xlab="Effect Size (in Dollars)",
148 | ylab="Cumulative Distribution", main="Cumulative Distribution of the CATEs")
149 | abline(v=0, col="red") # reference line: boundary between negative and positive predicted effects
150 |
151 | ###############################################################################
152 |
153 | ###############################################################################
154 | ######################## Description of CATEs #################################
155 | ###############################################################################
156 |
157 | ## Means and standard deviations for individuals with positive (incl. zero) predicted effects
158 | desc_1 <- fBasics::basicStats(covariates_hold_out[fit >= 0,]) %>% t() %>% as.data.frame() %>% select(Mean, Stdev)
159 |
160 | ## Means and standard deviations for individuals with negative predicted effects
161 | desc_0 <- fBasics::basicStats(covariates_hold_out[fit < 0,]) %>% t() %>% as.data.frame() %>% select(Mean, Stdev)
162 |
163 | # Make table and add standardized differences (in %, scaled by the pooled standard deviation)
164 | desc <- cbind(desc_1,desc_0,
165 | 100*abs(desc_1[,1]-desc_0[,1])/sqrt(0.5*(desc_1[,2]^2+desc_0[,2]^2)))
166 | colnames(desc) <- c("Mean (Pos.)", "Std.Dev. (Pos.)", "Mean (Neg.)", "Std.Dev. (Neg.)", "Std.Diff.")
167 | print(round(desc, digits=2))
168 |
169 | ###############################################################################
170 |
171 |
172 |
--------------------------------------------------------------------------------
/PC Lab 6/help files/grf_package.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 6/help files/grf_package.pdf
--------------------------------------------------------------------------------
/PC Lab 7/help files/grf_package.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 7/help files/grf_package.pdf
--------------------------------------------------------------------------------
/PC Lab 7/help files/rpart_package.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AStrittmatter/Machine-Learning-Course/56776d035024d27ca2a34c08d8496eba512298a5/PC Lab 7/help files/rpart_package.pdf
--------------------------------------------------------------------------------
/PC Lab 7/optimal_policy_learning.r:
--------------------------------------------------------------------------------
1 | ######################## Load Packages ########################
2 |
3 | # List of required packages
4 | pkgs <- c('fBasics', 'corrplot', 'rpart', 'rpart.plot', 'tidyverse', 'grf', 'caret')
5 |
6 | # Load each package; character.only = TRUE lets library() take the name from a variable
7 | for(pkg in pkgs){
8 | library(pkg, character.only = TRUE)
9 | }
10 | options(warn=-1) # suppress warnings globally for the whole session
11 |
12 | print('All packages successfully installed and loaded.')
13 |
14 | ###################################################################
15 |
16 | ######################## Load Data Frame ########################
17 |
18 | # Load data frame (expects fundraising.csv in the working directory)
19 | df <- read.csv("fundraising.csv",header=TRUE, sep=",")
20 |
21 | # Outcome variable: charitable giving
22 | outcome <- c("char_giving")
23 |
24 | # Treatment variable
25 | treatment <- c("treat")
26 |
27 | # Covariates/Features (pre-treatment giving history and household characteristics -- TODO confirm against the codebook)
28 | covariates <- c("amount_pre", "amount_lastpre", "amount_maxpre", "H_number_yearbefore", "H_ngifts",
29 | "H_littleask", "H_bigask", "H_nyears", "H_frequency", "H_medinc", "H_medinc_mdum",
30 | "H_Avg_years_ed", "H_Avg_years_ed_mdum")
31 |
32 | all_variables <- c(outcome, treatment, covariates)
33 |
34 | print('Data frame successfully loaded and sample selected.')
35 |
36 | ######################################################################
37 |
38 | ######################## Table with Descriptive Statistics ########################
39 |
40 | desc <- fBasics::basicStats(df) %>% t() %>% as.data.frame() %>%
41 | select(Mean, Stdev, Minimum, Maximum, nobs)
42 | print(round(desc, digits=2))
43 |
44 | #####################################################################################
45 |
46 | ######################## Correlation Matrix ########################
47 |
48 | corr = cor(df[,-c(1:2)]) # correlations among covariates only (drop outcome and treatment columns)
49 | corrplot(corr, type = "upper", tl.col = "black")
50 |
51 | ######################################################################
52 |
53 | ######################## Partition the Samples ########################
54 | set.seed(100233) # set starting value for random number generator
55 |
56 | # Partition Hold-Out-Sample (approx. 80% training/estimation, 20% hold-out)
57 | df_part <- modelr::resample_partition(df, c(obs = 0.8, hold_out = 0.2))
58 | df_obs <- as.data.frame(df_part$obs) # Training and estimation sample
59 | df_hold_out <- as.data.frame(df_part$hold_out) # Hold-out-sample
60 |
61 | # Partition Samples for Cross-Fitting (50/50 split of the training sample)
62 | df_part <- modelr::resample_partition(df_obs, c(obs_A = 0.5, obs_B = 0.5))
63 | df_obs_A <- as.data.frame(df_part$obs_A) # Sample A
64 | df_obs_B <- as.data.frame(df_part$obs_B) # Sample B
65 |
66 | print('Samples are partitioned.')
67 |
68 | ######################## Generate Variables ########################
69 |
70 | # Outcome (column 1; assumes df columns are ordered outcome, treatment, covariates -- TODO confirm against fundraising.csv)
71 | giving_hold_out <- as.matrix(df_hold_out[,1])
72 | giving_obs <- as.matrix(df_obs[,1])
73 | giving_obs_A <- as.matrix(df_obs_A[,1])
74 | giving_obs_B <- as.matrix(df_obs_B[,1])
75 |
76 | # Treatment (column 2)
77 | treat_hold_out <- as.matrix(df_hold_out[,2])
78 | treat_obs <- as.matrix(df_obs[,2])
79 | treat_obs_A <- as.matrix(df_obs_A[,2])
80 | treat_obs_B <- as.matrix(df_obs_B[,2])
81 |
82 | # Covariates (columns 3 onwards)
83 | covariates_hold_out <- as.matrix(df_hold_out[,c(3:ncol(df_hold_out))])
84 | covariates_obs <- as.matrix(df_obs[,c(3:ncol(df_obs))])
85 | covariates_obs_A <- as.matrix(df_obs_A[,c(3:ncol(df_obs_A))])
86 | covariates_obs_B <- as.matrix(df_obs_B[,c(3:ncol(df_obs_B))])
87 |
88 | ######################## Standardise Covariates ########################
89 |
90 | preProcValues <- preProcess(covariates_obs, method = c("center", "scale")) # scaling parameters estimated on the training sample only
91 | covariates_hold_out <- predict(preProcValues, covariates_hold_out)
92 | covariates_obs <- predict(preProcValues, covariates_obs)
93 | covariates_obs_A <- predict(preProcValues, covariates_obs_A)
94 | covariates_obs_B <- predict(preProcValues, covariates_obs_B)
95 |
96 | df_obs <- as.data.frame(cbind(giving_obs,treat_obs,covariates_obs))
97 | df_obs_A <- as.data.frame(cbind(giving_obs_A,treat_obs_A,covariates_obs_A))
98 | df_obs_B <- as.data.frame(cbind(giving_obs_B,treat_obs_B,covariates_obs_B))
99 | covariates_hold_out <- as.data.frame(covariates_hold_out)
100 |
101 | print('Covariates are standardised.')
102 | print('The data is now ready for your analysis!')
103 |
104 | ###########################################################################
105 |
106 | ######################## Potential Outcomes ########################
107 | set.seed(100243)
108 |
109 | # Tuning parameters
110 | min_tree = 20 # minimum terminal node size
111 | # Number of trees is set to a very low value in order to increase the computational speed in this tutorial
112 | num_trees = 100 # Use at least 1,000 trees
113 | cov_frac = 1/3 # fraction of covariates tried per split
114 | sample_part= 0.5 # subsample fraction per tree
115 |
116 | # Build generalised random forests (cross-fitting: each half-sample predicts the other)
117 |
118 | # Use Sample A to predict Sample B
119 | # Potential outcome under treatment (forest fitted on treated observations of Sample A only)
120 | f_y1_A <- regression_forest(covariates_obs_A[treat_obs_A == 1,], giving_obs_A[treat_obs_A == 1, ],
121 | sample.fraction = sample_part, mtry = floor(cov_frac*ncol(covariates_obs)),
122 | num.trees = num_trees, min.node.size = min_tree,
123 | honesty = TRUE, honesty.fraction = 0.5)
124 | y1hat_B <- as.matrix(predict(f_y1_A, covariates_obs_B)$predictions)
125 | y1hat_B_hold_out <- as.matrix(predict(f_y1_A, covariates_hold_out)$predictions)
126 |
127 | # Potential outcome under non-treatment (forest fitted on control observations of Sample A only)
128 | f_y0_A <- regression_forest(covariates_obs_A[treat_obs_A == 0,], giving_obs_A[treat_obs_A == 0, ],
129 | sample.fraction = sample_part, mtry = floor(cov_frac*ncol(covariates_obs)),
130 | num.trees = num_trees, min.node.size = min_tree,
131 | honesty = TRUE, honesty.fraction = 0.5)
132 | y0hat_B <- as.matrix(predict(f_y0_A, covariates_obs_B)$predictions)
133 | y0hat_B_hold_out <- as.matrix(predict(f_y0_A, covariates_hold_out)$predictions)
134 |
135 | ###########################################################################
136 |
137 | # Use Sample B to predict Sample A
138 | # Potential outcome under treatment
139 | f_y1_B <- regression_forest(covariates_obs_B[treat_obs_B == 1,], giving_obs_B[treat_obs_B == 1, ],
140 | sample.fraction = sample_part, mtry = floor(cov_frac*ncol(covariates_obs)),
141 | num.trees = num_trees, min.node.size = min_tree,
142 | honesty = TRUE, honesty.fraction = 0.5)
143 | y1hat_A <- as.matrix(predict(f_y1_B, covariates_obs_A)$predictions)
144 | y1hat_A_hold_out <- as.matrix(predict(f_y1_B, covariates_hold_out)$predictions)
145 |
146 | # Potential outcome under non-treatment
147 | f_y0_B <- regression_forest(covariates_obs_B[treat_obs_B == 0,], giving_obs_B[treat_obs_B == 0, ],
148 | sample.fraction = sample_part, mtry = floor(cov_frac*ncol(covariates_obs)),
149 | num.trees = num_trees, min.node.size = min_tree,
150 | honesty = TRUE, honesty.fraction = 0.5)
151 | y0hat_A <- as.matrix(predict(f_y0_B, covariates_obs_A)$predictions)
152 | y0hat_A_hold_out <- as.matrix(predict(f_y0_B, covariates_hold_out)$predictions)
153 |
154 | ###########################################################################
155 |
156 | # Merge the cross-fitted values from samples A and B (row order: A first, then B)
157 | y1hat <- rbind(y1hat_A,y1hat_B)
158 | y0hat <- rbind(y0hat_A,y0hat_B)
159 |
160 | y1hat_hold_out <- (y1hat_A_hold_out+y1hat_B_hold_out)/2 # hold-out prediction: average of the two fold models
161 | y0hat_hold_out <- (y0hat_A_hold_out+y0hat_B_hold_out)/2 # hold-out prediction: average of the two fold models
162 |
163 | print("Potential outcomes are estimated")
164 |
165 | ###########################################################################
166 |
167 | ######################## Propensity Score ########################
168 | set.seed(100242)
169 |
170 | # Tuning parameters
171 | min_tree = 20 # minimum terminal node size
172 | num_trees = 100 # Use at least 1,000 trees
173 | cov_frac = 1/3 # fraction of covariates tried per split
174 | sample_part= 0.5 # subsample fraction per tree
175 |
176 | # Use Sample A to predict Sample B (regression forest on the binary treatment; fitted values serve as propensity scores)
177 | f_p_A <- regression_forest(covariates_obs_A, treat_obs_A,
178 | sample.fraction = sample_part, mtry = floor(cov_frac*ncol(covariates_obs)),
179 | num.trees = num_trees, min.node.size = min_tree,
180 | honesty = TRUE, honesty.fraction = 0.5)
181 | pscore_B <- as.matrix(predict(f_p_A, covariates_obs_B)$predictions)
182 | pscore_B_hold_out <- as.matrix(predict(f_p_A, covariates_hold_out)$predictions)
183 |
184 | # Use Sample B to predict Sample A
185 | f_p_B <- regression_forest(covariates_obs_B, treat_obs_B,
186 | sample.fraction = sample_part, mtry = floor(cov_frac*ncol(covariates_obs)),
187 | num.trees = num_trees, min.node.size = min_tree,
188 | honesty = TRUE, honesty.fraction = 0.5)
189 | pscore_A <- as.matrix(predict(f_p_B, covariates_obs_A)$predictions)
190 | pscore_A_hold_out <- as.matrix(predict(f_p_B, covariates_hold_out)$predictions)
191 |
192 | pscore <- rbind(pscore_A,pscore_B) # row order (A then B) matches the stacked potential outcomes
193 | pscore_hold_out <- (pscore_A_hold_out+pscore_B_hold_out)/2 # hold-out prediction: average of the two fold models
194 |
195 | print("Propensity scores are estimated")
196 |
197 | ###########################################################################
198 |
199 | ######################## Average Treatment Effects (ATE) ########################
200 |
201 | # Merge samples A and B (same row order as the stacked y1hat/y0hat/pscore)
202 | giving_obs <- rbind(giving_obs_A,giving_obs_B)
203 | treat_obs <- rbind(treat_obs_A,treat_obs_B)
204 |
205 | # Doubly-robust (AIPW) score; plain parentheses replace the former no-op invisible() wrapper and keep the two-line expression as one statement
206 | Y_star = (y1hat - y0hat + treat_obs*(giving_obs - y1hat)/pscore
207 | - (1-treat_obs)*(giving_obs - y0hat)/(1-pscore))
208 |
209 | # Average Treatment Effect (ATE)
210 | ATE <- round(mean(Y_star), digits=1)
211 | print(paste0("Average Treatment Effect (ATE): ", ATE))
212 |
213 | # Standard error: sqrt(variance of the score / sample size)
214 | SD_ATE <- round(sqrt(var(Y_star)/length(Y_star)),digits=1)
215 | print(paste0("Standard Error for ATE: ", SD_ATE))
216 |
217 | ####################################################################################
218 |
219 | ######################## Individualised Treatment Rules ########################
220 |
221 | set.seed(1234567)
222 |
223 | # Define transformed variables: classify by the sign of the score, weight by its magnitude
224 | sign = sign(Y_star)
225 | lambda = abs(Y_star)
226 | Z <- factor(sign, labels = c("Don't", "Treat")) # NOTE(review): assumes sign takes exactly two values (-1/1); a score of exactly 0 would add a third level and break the labels
227 | df_obs <- rbind(df_obs_A,df_obs_B)
228 |
229 | # Generate the linear formula for the classification tree
230 | sumx <- paste(covariates, collapse = " + ")
231 | linear <- paste("Z",paste(sumx, sep=" + "), sep=" ~ ")
232 | linear <- as.formula(linear)
233 |
234 | ######################## Build a Shallow Tree ########################
235 |
236 | # Tree (depth limited to 3)
237 | tree_1 <- rpart(formula = linear, # Predict sign of treatment
238 | data = df_obs,
239 | weights = lambda, # Larger absolute effect -> Higher weight
240 | method = "class",
241 | control = rpart.control(cp = 2.00e-10,maxdepth = 3, minbucket=10))
242 |
243 | # Plot the shallow policy tree (NOTE: the original comment "Plot MSE in CV-Sample" was inaccurate)
244 | rpart.plot(tree_1,digits=3)
245 |
246 | # Apply the policy rule to the hold-out sample (class probabilities per observation)
247 | pi_tree1_hold_out = as.matrix(predict(tree_1, newdata=covariates_hold_out))
248 |
249 | ####################################################################################
250 |
251 | ############################# Build a Deeper Tree #################################
252 |
253 | set.seed(1234567)
254 |
255 | # Tree (no depth limit; pruned below via cross-validation)
256 | tree_2 <- rpart(formula = linear, # Predict sign of treatment
257 | data = df_obs,
258 | weights = lambda, # Larger absolute effect --> Higher weight
259 | method = "class",
260 | control = rpart.control(cp = 2.00e-10, minbucket=10))
261 |
262 | # Find optimal tree size: the cp-table row with the lowest cross-validated error
263 | op.index_2 <- which.min(tree_2$cptable[, "xerror"])
264 | print(paste0("Optimal number of splits: ", tree_2$cptable[op.index_2, "nsplit"]))
265 |
266 | # Plot CV-Error
267 | plotcp(tree_2, minline = TRUE)
268 | abline(v = op.index_2, lty = "dashed")
269 |
270 | ######################## Select the Tree that Minimises CV-MSE ########################
271 |
272 | # Get cp-value that corresponds to optimal tree sizes
273 | cp.vals_2 <- tree_2$cptable[op.index_2, "CP"]
274 |
275 | # Prune the tree back to the optimal complexity
276 | prune_tree_2 <- prune(tree_2, cp = cp.vals_2)
277 |
278 | # Plot pruned tree
279 | rpart.plot(prune_tree_2,digits=3, main = "Pruned Tree")
280 |
281 | # Apply the policy rule to the hold-out sample (class probabilities per observation)
282 | pi_tree2_hold_out = as.matrix(predict(prune_tree_2, newdata=covariates_hold_out))
283 |
284 | #########################################################################################
285 |
286 | ######################## Share of Treated ########################
287 |
288 | # Rule based on shallow tree (ITR1)
289 | rule_tree_1 <- as.numeric(pi_tree1_hold_out[,2]> .5) # treat if predicted probability of class "Treat" (column 2 -- presumably, by factor-level order) exceeds 0.5
290 | # Rule based on deeper tree (ITR2)
291 | rule_tree_2 <- as.numeric(pi_tree2_hold_out[,2]> .5)
292 |
293 | print('Descriptives of Policy Rules')
294 | desc <- fBasics::basicStats(cbind(rule_tree_1,rule_tree_2)) %>% t() %>% as.data.frame() %>%
295 | select(Mean, nobs)
296 | print(round(desc, digits=5))
297 |
298 | print('Correlation between the Policy Rules')
299 | corr = cor(cbind(rule_tree_1,rule_tree_2))
300 | print(corr)
301 |
302 | #####################################################################
303 |
304 | ######################## Average Giving Under Policy Rule ########################
305 |
306 | # Doubly-robust potential-outcome scores for the hold-out sample (former invisible() wrappers removed: they had no effect on a plain assignment)
307 | y_1_hold_out = y1hat_hold_out + treat_hold_out*(giving_hold_out - y1hat_hold_out)/pscore_hold_out
308 | y_0_hold_out = y0hat_hold_out + (1-treat_hold_out)*(giving_hold_out - y0hat_hold_out)/(1-pscore_hold_out)
309 |
310 | # Calculate expected average giving under the different policy rules
311 | O_tree_1 <- round(mean(rule_tree_1*y_1_hold_out + (1-rule_tree_1)*y_0_hold_out), digits = 2)
312 | O_tree_2 <- round(mean(rule_tree_2*y_1_hold_out + (1-rule_tree_2)*y_0_hold_out), digits = 2)
313 |
314 | print('Average Givings Under')
315 | print(paste0("Shallow Tree: ",O_tree_1))
316 | print(paste0("Pruned Tree: ",O_tree_2))
317 |
318 | #####################################################################################
319 |
320 | ######################## Policy Value Compared to Everybody is Treated ########################
321 |
322 | # Modified outcome: hold-out score for the individual treatment effect
323 | Y_star_hold_out = y_1_hold_out - y_0_hold_out
324 |
325 | # Estimate Policy Value
326 | tree_all <- round(mean((rule_tree_2-1)*Y_star_hold_out), digits = 2) # (rule-1) is 0 for treated-by-rule, -1 otherwise
327 | se_tree_all <- round(sqrt(var((rule_tree_2-1)*Y_star_hold_out)/length(Y_star_hold_out)), digits = 2)
328 |
329 | print('Total Policy Value Compared to Everybody is Treated')
330 | print(paste0("Average Gain of Pruned Tree: ", tree_all))
331 | print(paste0("Standard Error: ", se_tree_all))
332 |
333 | #round(mean(giving_hold_out[treat_hold_out==1,]), digits = 2)
334 | #round(mean((rule_tree_2-1)*Y_star_hold_out)/mean(giving_hold_out[treat_hold_out==1,]), digits = 2)
335 |
336 | ################################################################################################
337 |
338 | ######################## Policy Value Compared to Nobody is Treated ########################
339 |
340 | # Estimate Policy Value
341 | tree_no <- round(mean(rule_tree_2*Y_star_hold_out), digits = 2) # rule is 1 for treated-by-rule, 0 otherwise
342 | se_tree_no <- round(sqrt(var(rule_tree_2*Y_star_hold_out)/length(Y_star_hold_out)), digits = 2)
343 |
344 | print('Total Policy Value Compared to Nobody is Treated')
345 | print(paste0("Average Gain of Pruned Tree: ", tree_no))
346 | print(paste0("Standard Error: ", se_tree_no))
347 |
348 | #round(mean(giving_hold_out[treat_hold_out==0,]), digits = 2)
349 | #round(mean(rule_tree_2*Y_star_hold_out)/mean(giving_hold_out[treat_hold_out==0,]), digits = 2)
350 |
351 | ################################################################################################
352 |
353 | ######################## Policy Value Compared to Random Assignment ########################
354 |
355 | # Estimate Policy Value
356 | R1_tree_2 <- round(1/2*mean((2*rule_tree_2-1)*Y_star_hold_out), digits = 2) # (2*rule-1) is +1 for treated-by-rule, -1 otherwise
357 | se_tree_2 <- round(sqrt(1/4*var((2*rule_tree_2-1)*Y_star_hold_out)/length(Y_star_hold_out)), digits = 2)
358 |
359 |
360 | print('Total Policy Value Compared to Random Assignment')
361 | print(paste0("Average Gain of Pruned Tree: ", R1_tree_2))
362 | print(paste0("Standard Error: ", se_tree_2))
363 |
364 | #round((mean(giving_hold_out[treat_hold_out==1,])+mean(giving_hold_out[treat_hold_out==0,]))/2, digits = 2)
365 | #round(1/2*mean((2*rule_tree_2-1)*Y_star_hold_out)/(mean(giving_hold_out[treat_hold_out==1,])+
366 | # mean(giving_hold_out[treat_hold_out==0,]))/2, digits = 2)
367 |
368 | ################################################################################################
369 |
370 |
371 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Machine-Learning-Course
2 | Machine Learning for Economists and Business Analysts
3 |
4 | [](https://mybinder.org/v2/gh/AStrittmatter/Machine-Learning-Course/HEAD)
5 |
6 | Machine learning estimation methods are gaining more and more popularity. Compared to conventional estimation methods, machine learning can solve statistical prediction tasks in a data-adaptive way. Furthermore, machine learning can deal with high-dimensional variable spaces in a relatively flexible way. Prediction methods are used in many different business and economic domains. Examples of prediction tasks are: The prediction of sales for a grocery store, such that logisticians can ship products before they are sold. The prediction of the probability of becoming drug addicted later in life, such that drug prevention programs can be targeted at adolescents with high risk.
7 |
8 | Besides predictions, economists and managers are often interested in causal questions. Examples of causal questions are: What are the effects of tweets by Elon Musk on Bitcoins? What impact does lowering the central bank interest rate have on GDP? Does participation in training programs reduce the unemployment duration? Machine learning cannot give us an automatic answer to causal questions without using an empirical design. However, machine learning estimates can serve as input factors for these empirical designs. Furthermore, we can estimate heterogeneous effects with machine learning.
9 |
10 | The course covers different predictive and causal machine learning methods. A focus will be on the application of these methods in practical R programming sessions.
11 |
12 | Predictive Machine Learning:
13 | - Regularized Regression
14 | - Trees and Forests
15 | - Unsupervised Machine Learning
16 |
17 | Causal Machine Learning
18 | - Double Selection Procedure
19 | - Debiased Machine Learning
20 | - Causal Forests
21 | - Optimal Policy Learning
22 | - Reinforcement Learning
23 |
--------------------------------------------------------------------------------
/Stata Example/ajr_example.do:
--------------------------------------------------------------------------------
1 | clear
2 |
3 | // Data is from Acemoglu, Johnson, and Robinson (2001) "The Colonial Origins of Comparative Development: An Empirical Investigation"
4 | use https://statalasso.github.io/dta/AJR.dta
5 |
6 | // We estimate the effect of institutions (avexpr) on income (logpgp95)
7 | // logpgp95 - log of GDP per capita in 1995
8 | // avexpr - average protection against expropriation risk, 1985-1995
9 |
10 | * Unconditional OLS estimate
11 | reg logpgp95 avexpr, robust
12 |
13 | * Conditional OLS estimate
14 | // We have 24 control variables (latitude, temperature, humidity, ethnical diversity, soil, commodities, etc.)
15 | // The data contains only 64 country-level observations
16 | reg logpgp95 avexpr lat_abst edes1975 avelf temp* humid* steplow-oilres, robust
17 |
18 | * Post-Lasso Double Selection Procedure
19 | // Let the data decide which control variables are important
20 | pdslasso logpgp95 avexpr (lat_abst edes1975 avelf temp* humid* steplow-oilres), robust nois
21 |
22 |
23 | * Useful Links:
24 | // https://statalasso.github.io/
25 | // https://github.com/aahrens1
26 | // https://economics.mit.edu/files/4123
27 |
--------------------------------------------------------------------------------
/Stata Example/pdslasso.ado:
--------------------------------------------------------------------------------
1 | *! pdslasso 1.0.01 30jan2018
2 | *! authors aa/cbh/ms
3 | * thin wrapper for ivlasso: forwards all arguments and relabels the stored command name as pdslasso
4 |
5 | program define pdslasso, eclass sortpreserve
6 | syntax [anything] [if] [in] , ///
7 | [ ///
8 | OLSOPTions(string) /// options passed to IV or OLS estimation
9 | * ]
10 |
11 | version 13
12 | ivlasso `anything' `if' `in', `options' cmdname(pdslasso) ivoptions(`olsoptions')
13 |
14 | ereturn local cmd pdslasso
15 |
16 | end
17 |
--------------------------------------------------------------------------------
/Stata Example/rlasso.ado:
--------------------------------------------------------------------------------
1 | *! rlasso 1.0.06 10feb2018
2 | *! authors aa/cbh/ms
3 |
4 | * Updates (release date):
5 | * 1.0.05 (30jan2018)
6 | * First public release.
7 | * Added seed(.) option to rlasso/lassoutils to control rnd # seed for xdep & sup-score.
8 | * Fixed bug in DisplayCoefs (didn't accommodate both e(notpen) and e(pnotpen)).
9 | * Promoted to require version 13 or higher.
10 | * Added dots option.
11 | * Fixed displaynames bug (wrong dictionaries used for partialled-out vars).
12 | * Recoding of cons and demeaning flags.
13 | * partial and nocons no longer compatible.
14 | * Removed hdm version of sup-score stat.
15 | * Removed misc debug code.
16 | * 1.0.06 (xxx)
17 | * Support for Sergio Correia's FTOOLS FE transform (if installed).
18 |
19 | program rlasso, eclass sortpreserve
20 |
21 | version 13
22 |
23 | syntax [anything] [if] [in] [, ///
24 | displayall ///
25 | varwidth(int 17) ///
26 | VERsion ///
27 | supscore ///
28 | testonly ///
29 | * ///
30 | ]
31 |
32 | local lversion 1.0.06 // keep in sync with the *! banner above (was 1.0.05, stale vs. banner 1.0.06)
33 |
34 | if "`version'" != "" { // Report program version number, then exit.
35 | di in gr "`lversion'"
36 | ereturn clear
37 | ereturn local version `lversion'
38 | exit
39 | }
40 |
41 | if ~replay() { // not replay so estimate
42 | _rlasso `anything' `if' `in', ///
43 | `options' `supscore' `testonly'
44 | }
45 | else if e(cmd)~="rlasso" { // replay, so check that rlasso results exist
46 | di as err "last estimates not found"
47 | exit 301
48 | }
49 |
50 | if "`e(method)'"~="" {
51 | DisplayCoefs, `displayall' varwidth(`varwidth')
52 | }
53 |
54 | // temp measure
55 | if e(supscore) < . {
56 | DisplaySupScore
57 | }
58 |
59 | end
60 |
// _rlasso: main estimation routine. Marks the sample, sets up original (_o),
// temporary (_t) and display (_d) varlists, applies FE/partialling/
// standardization transforms via lassoutils, calls the rlasso branch of
// lassoutils to estimate, and posts results in e().
program _rlasso, eclass sortpreserve

	version 13

	syntax varlist(numeric fv ts min=2) [if] [in] [,		///
		/// specify options with varlists to be used by marksample/markout
		PNOTPen(varlist fv ts numeric)				/// list of variables not penalised
		partial(string)						/// string so that list can contain "_cons"
		fe							/// do within-transformation
		NOCONStant						///
		CLuster(varlist max=1)					/// penalty level/loadings allow for within-panel dependence & heterosk.
		pols							/// post-lasso coefs in e(b) (default=lasso)
		prestd							///
		VERbose							/// pass to lassoutils
		VVERbose						/// pass to lassoutils
		dots							///
		displaynames_o(string)					/// dictionary with names of vars as supplied in varlist
		displaynames_d(string)					/// corresponding display names of vars
		pminus(int 0)						/// overrides calculation of pminus
		debug							/// used for debugging
		postall							/// full coef vector in e(b) (default=selected only)
		testonly						/// obtain supscore test only
		NOFTOOLS						///
		*							/// additional options to be passed to lassoutils
		]

	*** rlasso-specific
	// to distinguish between lasso2 and rlasso treatment of notpen,
	// rlasso option is called pnotpen
	// to keep lasso2 and rlasso code aligned, rename to notpen here
	// and at end of program save macros as pnotpen
	// temporary measure until lasso2 and rlasso code is merged
	local notpen `pnotpen'
	// supscore test flag
	local testonlyflag	=("`testonly'"~="")
	*

	*** debug mode; create flag
	local debugflag		=("`debug'"~="")
	*

	*** FEs. Create 1/0 flag.
	// Get panel id
	// bug fix: this section must run BEFORE marksample/markout below,
	// otherwise `ivar' is empty when markout is called and observations
	// with a missing panel id are not excluded from the sample.
	local feflag=("`fe'"~="")
	if `feflag' {
		cap _xt
		if _rc ~= 0 {
			di as err "Error: fe option requires data to be xtset"
			exit 459
		}
		else {
			local ivar	`r(ivar)'
		}
	}
	*

	*** Record which observations have non-missing values
	marksample touse
	markout `touse' `varlist' `cluster' `ivar'
	sum `touse' if `touse', meanonly		//  will sum weight var when weights are used
	local N		= r(N)
	*

	*** constant, partial, etc.
	// conmodel: constant in original model
	// consflag: constant in transformed equation to estimate
	local consmodel		=("`noconstant'"=="") & ~`feflag'	//  if fe, then consmodel=0 & partialcons=""
	local partialflag	=("`partial'"~="")			//  =1 even if just cons being partialled out
	local prestdflag	=("`prestd'"~="")
	// "_cons" allowed as an argument to partial(.) - remove it
	local partial		: subinstr local partial "_cons" "", all word count(local pconscount)
	local notpen		: subinstr local notpen "_cons" "", all word count(local notpenconscount)
	// Tell estimation code if cons has been partialled out or there isn't one in the first place
	if `feflag' | `partialflag' | `prestdflag' | (~`consmodel') {
		local consflag	0
	}
	else {
		local consflag	1
	}
	*

	*** create main varlist and tempvars
	// remove duplicates from varlist
	// _o list is vars with original names
	fvexpand `varlist' if `touse'
	local varlist_o		`r(varlist)'
	// check for duplicates has to follow expand
	local dups			: list dups varlist_o
	if "`dups'"~="" {
		di as text "Dropping duplicates: `dups'"
	}
	local varlist_o		: list uniq varlist_o
	*

	*** Create separate _o varlists: Y, X, notpen, partial
	// Y, X
	local varY_o		: word 1 of `varlist_o'
	local varX_o		: list varlist_o - varY_o		//  incl notpen/partial
	// notpen
	fvexpand `notpen' if `touse'
	local notpen_o		`r(varlist)'
	local dups			: list dups notpen_o
	if "`dups'"~="" {
		di as text "Dropping duplicates: `dups'"
	}
	local notpen_o		: list uniq notpen_o
	// partial
	fvexpand `partial' if `touse'
	local partial_o		`r(varlist)'
	local dups			: list dups partial_o
	if "`dups'"~="" {
		di as text "Dropping duplicates: `dups'"
	}
	local partial_o		: list uniq partial_o
	// "model" = vars without partialled-out
	local varXmodel_o	: list varX_o - partial_o
	*

	*** syntax checks
	// check that notpen vars are in full list
	local checklist	: list notpen_o - varX_o
	local checknum	: word count `checklist'
	if `checknum' {
		di as err "syntax error - `checklist' in notpen(.) but not in list of regressors"
		exit 198
	}
	// check that partial vars are in full list
	local checklist	: list partial_o - varX_o
	local checknum	: word count `checklist'
	if `checknum' {
		di as err "syntax error - `checklist' in partial(.) but not in list of regressors"
		exit 198
	}
	// check that ivar (FE) is not a used variable
	if `feflag' {
		fvrevar `varY_o' `varX_o', list			//  list option means we get only base vars
		local vlist	`r(varlist)'
		local checklist	: list ivar - vlist
		local checknum	: word count `checklist'
		// empty difference => ivar appears among the model vars
		if `checknum'==0 {
			di as err "syntax error - `ivar' is xtset variable and cannot be used in model"
			exit 198
		}
	}
	// other checks
	if `pconscount' & `feflag' {
		di as err "error: incompatible options, partial(_cons) and fe"
		exit 198
	}
	if "`partial'"~="" & "`noconstant'"~="" {
		di as err "error: incompatible options, partial and nocons"
		exit 198
	}
	if `feflag' & "`noconstant'"~="" {
		di as err "error: incompatible options, fe and nocons"
		exit 198
	}
	*

	*** Create _t varlists: Y, X, notpen, partial
	// _o list is vars with original names
	// _t list is temp vars if transform needed, original vars if not
	if `feflag' {							//  everything needs to be transformed including partial
		local temp_ct	: word count `varlist_o'
		mata: s_maketemps(`temp_ct')
		local varlist_t	`r(varlist)'
	}
	else if `partialflag' | `prestdflag' {				//  everything except partial_o needs to be transformed
		local varYXmodel_o	`varY_o' `varXmodel_o'
		local temp_ct	: word count `varYXmodel_o'
		mata: s_maketemps(`temp_ct')
		local varYXmodel_t	`r(varlist)'
		matchnames "`varlist_o'" "`varYXmodel_o'" "`varYXmodel_t'"
		local varlist_t	`r(names)'
	}
	else {								//  no transformation needed but still need temps
		fvrevar `varlist_o' if `touse'				//  fvrevar creates temps only when needed
		local varlist_t	`r(varlist)'
	}
	// dictionary is now varlist_o / varlist_t
	// now create separate _o and _t varlists using dictionary
	foreach vlist in varY varX varXmodel notpen partial {
		matchnames "``vlist'_o'" "`varlist_o'" "`varlist_t'"
		local `vlist'_t	`r(names)'				//  corresponding tempnames; always need this because of possible fvs
	}
	*

	******************* Display names ***********************************************************
	// may be called by another program with tempvars and display names for them
	// if display names option not used, use _o names as provided in rlasso command
	// if display names option used, use display names matched with _o names
	// if display names macros are empty, has no effect
	matchnames "`varY_o'" "`displaynames_o'" "`displaynames_d'"
	local varY_d		`r(names)'
	matchnames "`varXmodel_o'" "`displaynames_o'" "`displaynames_d'"
	local varXmodel_d	`r(names)'
	matchnames "`varX_o'" "`displaynames_o'" "`displaynames_d'"
	local varX_d		`r(names)'
	matchnames "`notpen_o'" "`displaynames_o'" "`displaynames_d'"
	local notpen_d		`r(names)'
	matchnames "`partial_o'" "`displaynames_o'" "`displaynames_d'"
	local partial_d		`r(names)'
	*

	*** summary varlists and flags:
	*	varY_o		= dep var
	*	varY_t		= dep var, temp var
	*	varX_o		= full, expanded set of RHS, original names, includes partial
	*	varX_t		= as above but with temp names for all variables
	*	varXmodel_o	= full, expanded set of RHS, original names, excludes partial
	*	varXmodel_t	= as above but with temp names for all variables
	*	notpen_o	= full, expanded set of not-penalized
	*	notpen_t	= as above but with temp names for all variables

	// p is number of penalized vars in the model; follows convention in BCH papers
	// p is calculated in lassoutils/_rlasso as number of model vars excluding constant
	// here we calculate which of the model vars are unpenalized or omitted/base vars
	// to provide as `pminus' to lassoutils/_rlasso (unless provided by user)
	// do here so that code above is compatible with lasso2
	// use _o names / display names since they have info on whether var is omitted/base/etc.
	if ~`pminus' {
		foreach vn of local varXmodel_d {			//  display names
			_ms_parse_parts `vn'
			// increment pminus if model variable is MISSING
			if r(omit) {
				local ++pminus
			}
		}
		foreach vn of local notpen_d {				//  display names
			_ms_parse_parts `vn'
			// increment pminus if notpen variable is NOT MISSING
			if ~r(omit) {
				local ++pminus
			}
		}
	}
	// p0 here is total number of variables provided to model EXCLUDING constant
	local p0	: word count `varXmodel_o'
	local p		=`p0'-`pminus'
	// warn
	if `p'<=0 {
		di as text "warning: no penalized regressors; results are OLS"
	}
	// now for error-checking below, p0 should INCLUDE constant unless partialled-out etc.
	local p0	=`p0'+`consflag'
	*

	******************* FE, partialling out, standardization ************************************
	// If FE:  partial-out FEs from temp variables, then preserve,
	//         then partial-out low-dim ctrls from temp variables
	//         restore will restore all temp vars with only FEs partialled-out
	// If no FE: leave original variables unchanged.
	//         partial-out low-dim ctrls from temp variables.
	// if no FE/low-dim ctrls, no transform needed

	local dmflag	=0					//  initialize demeaned flag
	if `feflag' {						//  FE-transform all variables
		fvrevar `varY_o' `varX_o' if `touse'		//  in case any FV or TS vars in _o list
		local vlist	`r(varlist)'
		lassoutils `vlist',				///  call on _o list
			touse(`touse')				///
			tvarlist(`varY_t' `varX_t')		///  overwrite/initialize these
			`noftools'				///
			fe(`ivar')				//   triggers branching to FE utility
		local N_g	=r(N_g)				//  N_g will be empty if no FEs
		local noftools	`r(noftools)'			//  either not installed or user option
		local dmflag=1					//  data are now demeaned
		if `partialflag' {				//  And then partial out any additional vars
			preserve				//  preserve the original values of tempvars before partialling out
			lassoutils `varY_t' `varXmodel_t',	///  _t vars have been created and filled so use here
				touse(`touse')			///  don't need tvarlist because vars already created
				partial(`partial_t')		///  _t vars have been created and filled so use here
				partialflag(`partialflag')	///  triggers branching to partial utility
				dmflag(1)			//   FE => mean zero
		}
		if `prestdflag' {
			tempname prestdY prestdX
			lassoutils `varY_t',			///  _t vars have been created and filled so use here
				touse(`touse')			///  don't need tvarlist because vars already created
				std				///
				dmflag(1)			//   FE => data already mean zero
			mat `prestdY'=r(stdvec)
			lassoutils `varXmodel_t',		///
				touse(`touse')			///
				std				///
				dmflag(1)			//   FE => data already mean zero
			mat `prestdX'=r(stdvec)
		}
	}
	else if `partialflag' {					//  Just partial out
		fvrevar `varY_o' `varXmodel_o' if `touse'	//  in case any FV or TS vars in _o list
		local vlist	`r(varlist)'
		fvrevar `partial_o' if `touse'			//  in case any FV or TS vars in _o list
		local pvlist	`r(varlist)'
		lassoutils `vlist',				///  call on _o list
			touse(`touse')				///
			partial(`pvlist')			///
			tvarlist(`varY_t' `varXmodel_t')	///  overwrite/initialize these
			partialflag(`partialflag')		///  triggers branching to partial utility
			dmflag(0)				//   data are not yet demeaned
		local dmflag	=1				//  data are now demeaned
		if `prestdflag' {
			tempname prestdY prestdX
			lassoutils `varY_t',			///  _t vars have been created and filled so use here
				touse(`touse')			///  don't need tvarlist because vars already created
				std				///
				dmflag(1)			//   partial => already mean zero
			mat `prestdY'=r(stdvec)
			lassoutils `varXmodel_t',		///
				touse(`touse')			///
				std				///
				dmflag(1)			//   partial => already mean zero
			mat `prestdX'=r(stdvec)
		}
	}
	else if `prestdflag' {
		tempname prestdY prestdX
		lassoutils `varY_o',				///  call on _o list
			touse(`touse')				///
			std					///
			tvarlist(`varY_t')			///  overwrite/initialize these
			consmodel(`consmodel')			///  =1 => data should be demeaned
			dmflag(0)				//   data not yet mean zero
		mat `prestdY'=r(stdvec)
		fvrevar `varXmodel_o' if `touse'		//  in case any FV or TS vars in _o list
		local vlist	`r(varlist)'
		lassoutils `vlist',				///  call on _o list
			touse(`touse')				///
			std					///
			tvarlist(`varXmodel_t')			///  overwrite/initialize these
			consmodel(`consmodel')			///  =1 => data should be demeaned
			dmflag(0)				//   data not yet mean zero
		mat `prestdX'=r(stdvec)
		if `consmodel' {
			local dmflag	=1			//  if cons in model, data are now demeaned
		}
	}

	************* Partialling/standardization END ***********************************************

	************* Lasso estimation with transformed/partialled-out vars *************************
	if "`verbose'`vverbose'`dots'"=="" {
		local quietly "quietly"				//  don't show lassoutils output
	}

	`quietly' lassoutils `varY_t',				///
		rlasso						///  branch to _rlasso subroutine
								///  nocons, no penloads, etc. all assumed
		touse(`touse')					///
		xnames_o(`varXmodel_d')				///  display names for lassoutils output
		xnames_t(`varXmodel_t')				///
		cluster(`cluster')				///
		notpen_o(`notpen_d')				///
		notpen_t(`notpen_t')				///
		consflag(`consflag')				///  =0 if cons already partialled out or if no cons
		dmflag(`dmflag')				///  =1 if data have been demeaned
		pminus(`pminus')				///
		stdy(`prestdY')					///
		stdx(`prestdX')					///
		`verbose' `vverbose' `dots'			///
		`testonly'					///
		`options'
	*

	************* Finish up ********************************************************
	*** e-return lasso estimation results
	tempname b beta betaOLS Ups sUps eUps
	tempname betaAll betaAllOLS
	tempname lambda slambda lambda0 rmse rmseOLS
	tempname c gamma gammad
	tempname supscore supscore_p supscore_cv supscore_gamma

	if ~`testonlyflag' {

		if "`cluster'" ~= "" {
			local N_clust		=r(N_clust)
		}
		mat `beta'		=r(beta)		//  may be empty!
		mat `betaOLS'		=r(betaOLS)		//  may be empty!
		mat `betaAll'		=r(betaAll)
		mat `betaAllOLS'	=r(betaAllOLS)
		mat `Ups'		=r(Ups)
		mat `sUps'		=r(sUps)
		mat `eUps'		=r(eUps)
		scalar `lambda'		=r(lambda)
		scalar `slambda'	=r(slambda)
		scalar `lambda0'	=r(lambda0)
		scalar `c'		=r(c)
		scalar `gamma'		=r(gamma)
		scalar `gammad'		=r(gammad)
		scalar `rmse'		=r(rmse)		//  Lasso RMSE
		scalar `rmseOLS'	=r(rmseOLS)		//  post-Lasso RMSE
		local selected		`r(selected)'		//  EXCL NOTPEN/CONS
		local selected0		`r(selected0)'		//  INCL NOTPEN, EXCL CONS
		local s			=r(s)			//  EXCL NOTPEN/CONS; number of elements in selected
		local s0		=r(s0)			//  INCL NOTPEN, EXCL CONS; number of elements in selected0
		local clustvar		`r(clustvar)'
		local robust		`r(robust)'
		local center		=r(center)
		local method		`r(method)'		//  lasso or sqrt-lasso
		local niter		=r(niter)
		local maxiter		=r(maxiter)
		local nupsiter		=r(nupsiter)
		local maxupsiter	=r(maxupsiter)
		// these can be missings
		scalar `supscore'	=r(supscore)
		scalar `supscore_p'	=r(supscore_p)
		scalar `supscore_cv'	=r(supscore_cv)
		scalar `supscore_gamma'	=r(supscore_gamma)
		local ssnumsim		=r(ssnumsim)

		// flag for empty beta (consflag=0 means rlasso didn't estimate a constant)
		local betaempty		=(`s0'==0 & `consflag'==0)
		// error check
		if `betaempty' {
			if ~(colsof(`beta')==1 & `beta'[1,1]==.) {
				// bug fix: error message was missing its closing quote
				di as err "internal _rlasso error - beta should be empty (no vars estimated) but isn't"
				exit 499
			}
		}
		// issue warning if lasso max iteration limit hit
		if `niter'==`maxiter' {
			di as text "Warning: reached max shooting iterations w/o achieving convergence."
		}
		// error check - p0s and ps should match
		if `p0'~=r(p0) {				//  number of all variables in betaAll INCL NOTPEN/CONS (if present or not partialled etc.)
			di as err "internal _rlasso error - p0 count of model vars `p0' does not match returned value `r(p0)'"
			exit 499
		}
		if `p'~=r(p) {					//  number of penalized variables in model
			di as err "internal _rlasso error - p count of penalized vars `p' does not match returned value `r(p)'"
			exit 499
		}
		// fix depvar (rownames) of beta vectors to use _o (or _d if display names provided) not _t
		mat rownames `beta'		= `varY_d'
		mat rownames `betaOLS'		= `varY_d'
		mat rownames `betaAll'		= `varY_d'
		mat rownames `betaAllOLS'	= `varY_d'
		if ~`betaempty' {				//  cnames should stay empty if beta has a single missing value
			local cnames_o	: colnames `beta'
			fvstrip `cnames_o'			//  colnames may insert b/n/o operators - remove
			local cnames_o	`r(varlist)'
			matchnames "`cnames_o'" "`varlist_o'" "`varlist_t'"
			local cnames_t	`r(names)'
		}
		else {
			local cnames_o
			local cnames_t
		}
		*

		*********** Get coeff estimates for partialled-out vars/std intercept. ********************
		if `feflag' & `partialflag' {			//  FE case and there are partialled-out notpen vars
			restore					//  Restores dataset with tempvars after FE transform but before notpen partialled out
		}
		if `partialflag' | (`prestdflag' & `consmodel') {	//  standardization removes constant so must enter for that
			if `feflag' {
				local depvar	`varY_t'	//  use FE-transformed depvar and X vars
				local scorevars	`cnames_t'
			}
			else {
				local depvar	`varY_o'	//  use original depvar and X vars
				local scorevars	`cnames_o'
			}
			lassoutils `depvar',			///
				unpartial			///
				touse(`touse')			///
				beta(`beta')			///
				scorevars(`scorevars')		///
				partial(`partial_t')		///
				names_o(`varX_d')		///  dictionary
				names_t(`varX_t')		///  dictionary
				consmodel(`consmodel')
			mat `beta'	= r(b)
			mat `betaAll'	= `betaAll', r(bpartial)
			lassoutils `depvar',			///
				unpartial			///
				touse(`touse')			///
				beta(`betaOLS')			///
				scorevars(`scorevars')		///
				partial(`partial_t')		///
				names_o(`varX_d')		///  dictionary
				names_t(`varX_t')		///  dictionary
				consmodel(`consmodel')
			mat `betaOLS'	= r(b)
			mat `betaAllOLS'= `betaAllOLS', r(bpartial)
			// for unknown reasons, _ms_build_info doesn't add info here (e.g. "base")
			_ms_build_info `beta' if `touse'
			_ms_build_info `betaAll' if `touse'
			_ms_build_info `betaOLS' if `touse'
			_ms_build_info `betaAllOLS' if `touse'
			// finish by setting betaempty to 0
			local betaempty	=0
		}
		*

		*** Prepare and post results
		if "`pols'"=="" & "`postall'"=="" {		//  selected lasso coefs by default
			mat `b' = `beta'
		}
		else if "`pols'"~="" & "`postall'"=="" {	//  selected post-lasso coefs
			mat `b' = `betaOLS'
		}
		else if "`pols'"=="" {				//  full lasso coef vector
			mat `b' = `betaAll'
		}
		else {						//  full post-lasso coef vector
			mat `b' = `betaAllOLS'
		}
		if `betaempty' & "`postall'"=="" {		//  no vars in b
			ereturn post        , obs(`N') depname(`varY_d') esample(`touse')	//  display name
		}
		else {						//  b has some selected/nonpen/cons
			ereturn post `b', obs(`N') depname(`varY_d') esample(`touse')		//  display name
		}
		// additional returned results
		ereturn local noftools		`noftools'
		ereturn local postall		`postall'
		ereturn scalar niter		=`niter'
		ereturn scalar maxiter		=`maxiter'
		ereturn scalar nupsiter		=`nupsiter'
		ereturn scalar maxupsiter	=`maxupsiter'
		ereturn local robust		`robust'
		ereturn local ivar		`ivar'
		ereturn local selected		`selected'	//  selected only
		ereturn local varXmodel		`varXmodel_d'	//  display name
		ereturn local varX		`varX_d'	//  display name
		// bug fix: default e(b) holds lasso coefficients (see pols option
		// comment "(default=lasso)"), so report estimator "lasso" not "ols"
		if "`pols'"=="" {
			ereturn local estimator lasso
		}
		else {
			ereturn local estimator postlasso
		}
		ereturn local method		`method'
		ereturn local predict		rlasso_p
		ereturn local cmd		rlasso
		ereturn scalar center		=`center'
		ereturn scalar cons		=`consmodel'
		ereturn scalar lambda		=`lambda'
		ereturn scalar lambda0		=`lambda0'
		ereturn scalar slambda		=`slambda'
		ereturn scalar c		=`c'
		ereturn scalar gamma		=`gamma'
		ereturn scalar gammad		=`gammad'

		if `supscore' < . {
			ereturn scalar ssnumsim		=`ssnumsim'
			ereturn scalar supscore		=`supscore'
			ereturn scalar supscore_p	=`supscore_p'
			ereturn scalar supscore_cv	=`supscore_cv'
			ereturn scalar supscore_gamma	=`supscore_gamma'
		}

		if "`N_clust'" ~= "" {
			ereturn local clustvar	`clustvar'
			ereturn scalar N_clust	=`N_clust'
		}
		if "`N_g'" ~= "" {
			ereturn scalar N_g	=`N_g'
		}
		ereturn scalar fe		=`feflag'
		ereturn scalar rmse		=`rmse'
		ereturn scalar rmseOLS		=`rmseOLS'
		ereturn scalar pminus		=`pminus'
		ereturn scalar p		=`p'		//  number of all penalized vars; excludes omitteds etc.
		ereturn scalar s0		=`s0'		//  number of all estimated coefs (elements of beta)
		ereturn scalar s		=`s'		//  number of selected

		ereturn matrix sUps		=`sUps'
		ereturn matrix eUps		=`eUps'
		ereturn matrix Ups		=`Ups'
		ereturn matrix betaAllOLS	=`betaAllOLS'
		ereturn matrix betaAll		=`betaAll'
		ereturn matrix betaOLS		=`betaOLS'
		ereturn matrix beta		=`beta'

		// rlasso-specific:
		// selected0 and s0 included partialled-out.
		// If cons exists and was not partialled out, add to notpen and selected0.
		// Otherwise if cons exists and was partialled out, add to to partial list.
		if `consmodel' & ~`partialflag' {
			local selected0	`selected0' _cons
			local notpen_d	`notpen_d' _cons	//  display name
		}
		else if `consmodel' & `partialflag' {
			local partial_d	`partial_d' _cons	//  display name
			local selected0	`selected0' `partial_d'	//  display name
		}
		else if `partialflag' {
			local selected0	`selected0' `partial_d'	//  display name
		}
		// remaining results
		ereturn local selected0		`selected0'
		ereturn local partial		`partial_d'	//  display name
		ereturn scalar partial_ct	=`: word count `partial_d''	//  (display name) number of partialled-out INCLUDING CONSTANT
		ereturn scalar s0		=`: word count `selected0''	//  (update) selected or notpen, INCL CONS
		// rlasso-specific - save as "pnotpen" (vs lasso2 "notpen")
		ereturn local pnotpen		`notpen_d'	//  display name
		ereturn scalar pnotpen_ct	=`: word count `notpen_d''	//  (display name) number of notpen INCLUDING CONSTANT (if not partialled-out)
		*
	}
	else {

		// sup-score test only - no lasso results
		ereturn clear

		ereturn scalar N		=r(N)
		ereturn scalar N_clust		=r(N_clust)
		ereturn scalar gamma		=r(gamma)
		ereturn scalar c		=r(c)
		ereturn scalar p		=`p'
		ereturn scalar ssnumsim		=r(ssnumsim)
		ereturn scalar supscore		=r(supscore)
		ereturn scalar supscore_p	=r(supscore_p)
		ereturn scalar supscore_cv	=r(supscore_cv)
		ereturn scalar supscore_gamma	=r(supscore_gamma)

		ereturn local cmd		rlasso
		ereturn scalar cons		=`consmodel'

	}

end
684 |
// Display the CCK sup-score test of H0: beta=0 using results left in e()
// by _rlasso: e(supscore), e(supscore_p), e(supscore_cv), e(supscore_gamma).
// Display-only; changes no data and returns nothing.
prog DisplaySupScore

	di
	di as text "{help rlasso##supscore:Sup-score} test H0: beta=0"
	// _c leaves the line open so the p-value (if any) prints on the same line
	di as text "CCK sup-score statistic" _col(25) as res %6.2f e(supscore) _c
	// p-value may be missing (see "these can be missings" in _rlasso)
	if e(supscore_p) < . {
		di as text _col(32) "p-value=" _col(39) as res %6.3f e(supscore_p)
	}
	else {
		di	//  no p-value; just terminate the pending line
	}
	// e(supscore_gamma) is a fraction; print as a percentage
	di as text "CCK " as res 100*e(supscore_gamma) as text "% critical value" _c
	di as res _col(25) %6.2f e(supscore_cv) _col(32) as text "(asympt bound)"

end
700 |
701 |
702 | // Used in rlasso and lasso2.
703 | // version 2017-12-20
704 | // updated 31dec17 to accommodate e(pnotpen)
// Display the coefficient table (lasso and post-estimation OLS columns) from
// results in e(). Shared by rlasso and lasso2. By default shows selected
// coefficients only; displayall shows the full vector; norecover suppresses
// the partialled-out block.
prog DisplayCoefs

	syntax ,				///
		[				///
		displayall			///  full coef vector in display (default=selected only)
		varwidth(int 17)		///
		NORecover			///
		]

	local cons	=e(cons)
	if ("`norecover'"=="") {
		local partial		`e(partial)'
		local partial_ct	=e(partial_ct)
	}
	else {
		local partial
		local partial_ct	=0
	}

	// varlists
	local selected	`e(selected)'
	fvstrip `selected'
	local selected	`r(varlist)'
	// bug fix: separate the two macros with a space; lasso2 saves e(notpen)
	// and rlasso saves e(pnotpen), and without the space the last name of
	// one list would be glued to the first name of the other if both exist
	local notpen	`e(notpen)' `e(pnotpen)'
	fvstrip `notpen'
	local notpen	`r(varlist)'
	local selected0	`e(selected0)'
	fvstrip `selected0'
	local selected0	`r(varlist)'
	// coef vectors
	tempname beta betaOLS
	if "`displayall'"~="" {			//  there must be some vars specified even if nothing selected
		mat `beta'	=e(betaAll)
		mat `betaOLS'	=e(betaAllOLS)
		local col_ct	=colsof(`beta')
		local vlist	: colnames `beta'
		local vlistOLS	: colnames `betaOLS'
		local baselevels baselevels
	}
	else if e(k)>0 {			//  display only selected, but only if there are any
						//  NOTE(review): if e(k) is missing, missing>0 is true in Stata,
						//  so this branch is also taken - presumably intentional; verify
		mat `beta'	=e(beta)
		mat `betaOLS'	=e(betaOLS)
		local col_ct	=colsof(`beta')
		local vlist	: colnames `beta'
		local vlistOLS	: colnames `betaOLS'
	}
	else {					//  nothing selected, zero columns in beta
		local col_ct	=0
	}
	if e(k)>0 {
		_ms_build_info `beta' if e(sample)
		_ms_build_info `betaOLS' if e(sample)
	}

	*** (Re-)display coefficients including constant/partial
	// derived column positions for the table layout
	local varwidth1		=`varwidth'+1
	local varwidth3		=`varwidth'+3
	local varwidth4		=`varwidth'+4
	local varwidthm7	=`varwidth'-7
	local varwidthm13	=`varwidth'-13
	di
	di as text "{hline `varwidth1'}{c TT}{hline 32}"
	if "`e(method)'"=="sqrt-lasso" {
		di as text _col(`varwidthm7') "Selected {c |} Sqrt-lasso   Post-est OLS"
	}
	else if "`e(method)'"=="ridge" {
		di as text _col(`varwidthm7') "Selected {c |}      Ridge   Post-est OLS"
	}
	else if "`e(method)'"=="elastic net" {
		di as text _col(`varwidthm7') "Selected {c |} Elastic net  Post-est OLS"
		di as text _col(`varwidthm7') "         {c |}" _c
		di as text " (alpha=" _c
		di as text %4.3f `e(alpha)' _c
		di as text ")"
	}
	else if "`e(method)'"=="lasso" {
		di as text _col(`varwidthm7') "Selected {c |}      Lasso   Post-est OLS"
	}
	else {
		di as err "internal DisplayCoefs error. unknown method."
		exit 1
	}
	di as text "{hline `varwidth1'}{c +}{hline 32}"
	local anynotpen = 0
	local i 1
	// columns beyond lastcol hold partialled-out vars (displayed separately below)
	local lastcol = `col_ct' - `partial_ct'
	tokenize `vlist'				//  put elements of coef vector into macros 1, 2, ...
	while `i' <= `lastcol' {
		local vn ``i''
		fvstrip `vn'				//  get rid of o/b/n prefix for display purposes
		local vn `r(varlist)'
		_ms_display, element(`i') matrix(`beta') width(`varwidth') `baselevels'
		// in selected or notpen list?
		local isselnotpen	: list posof "`vn'" in selected0
		local isnotpen		: list posof "`vn'" in notpen
		local anynotpen		= `anynotpen' + `isnotpen'
		// note attached? base, empty, omitted
		qui _ms_display, element(`i') matrix(`beta')
		local note `r(note)'
		qui _ms_display, element(`i') matrix(`betaOLS')
		local noteOLS `r(note)'
		// if notpen, add footnote
		if `isnotpen' & "`note'"=="" {
			di as text "{helpb rlasso##notpen:*}" _c
		}
		if `isselnotpen' {
			// lasso coef
			if "`note'"=="" {
				di _col(`varwidth4') as res %15.7f el(`beta',1,`i') _c
			}
			else {
				di _col(`varwidth4') as text %15s "`note'" _c
			}
			// post-lasso coef - can be omitted if collinear
			if "`noteOLS'"=="" {
				di as res %15.7f el(`betaOLS',1,`i')
			}
			else {
				di as text %15s "`noteOLS'"
			}
		}
		else if "`note'"=="(omitted)" {
			// not selected
			di _col(`varwidth4') as text %15s "(not selected)" _c
			di as text %15s "(not selected)"
		}
		else {
			// other eg base var
			di as text %15s "`note'" _c
			di as text %15s "`noteOLS'"
		}
		local ++i
	}
	if `partial_ct' {
		di as text "{hline `varwidth1'}{c +}{hline 32}"
		// consistency fix: footnote marker now links to rlasso##notpen,
		// matching the "*Not penalized" footnote link below (was lasso2##notpen)
		di as text _col(`varwidthm13') "Partialled-out{help rlasso##notpen:*}{c |}"
		di as text "{hline `varwidth1'}{c +}{hline 32}"
		local i = `lastcol'+1
		while `i' <= `col_ct' {
			local vn ``i''
			fvstrip `vn'			//  get rid of o/b/n prefix for display purposes
			local vn `r(varlist)'
			_ms_display, element(`i') matrix(`beta') width(`varwidth') `baselevels'
			// note attached? base, empty, omitted
			qui _ms_display, element(`i') matrix(`beta')
			local note `r(note)'
			qui _ms_display, element(`i') matrix(`betaOLS')
			local noteOLS `r(note)'
			// lasso coef
			if "`note'"=="" {
				di _col(`varwidth4') as res %15.7f el(`beta',1,`i') _c
			}
			else {
				di _col(`varwidth4') as text %15s "`note'" _c
			}
			// post-lasso coef - can be omitted if collinear
			if "`noteOLS'"=="" {
				di as res %15.7f el(`betaOLS',1,`i')
			}
			else {
				di as text %15s "`noteOLS'"
			}
			local ++i
		}
	}
	di as text "{hline `varwidth1'}{c BT}{hline 32}"

	if `anynotpen' {
		di "{help rlasso##notpen:*Not penalized}"
	}

end
877 |
878 | *************************** Stata utilities ******************************
879 |
880 | // internal version of fvstrip 1.01 ms 24march2015
881 | // takes varlist with possible FVs and strips out b/n/o notation
// returns results in r(varlist)
883 | // optionally also omits omittable FVs
884 | // expand calls fvexpand either on full varlist
885 | // or (with onebyone option) on elements of varlist
886 |
// fvstrip: takes a varlist with possible factor variables and strips the
// b/n/o operators, returning the cleaned list in r(varlist).
// dropomit  - omit omittable (b./o.) terms from the result
// expand    - call fvexpand first (onebyone: expand each element separately)
// noisily   - echo the result (debugging)
program define fvstrip, rclass
	version 11.2
	syntax [anything] [if] , [ dropomit expand onebyone NOIsily ]
	if "`expand'"~="" {						//  force call to fvexpand
		if "`onebyone'"=="" {
			fvexpand `anything' `if'			//  single call to fvexpand
			local anything `r(varlist)'
		}
		else {
			foreach vn of local anything {
				fvexpand `vn' `if'			//  call fvexpand on items one-by-one
				local newlist	`newlist' `r(varlist)'
			}
			local anything	: list clean newlist
		}
	}
	foreach vn of local anything {					//  loop through varnames
		if "`dropomit'"~="" {					//  check & include only if
			_ms_parse_parts `vn'				//  not omitted (b. or o.)
			if ~`r(omit)' {
				local unstripped	`unstripped' `vn'	//  add to list only if not omitted
			}
		}
		else {							//  add varname to list even if
			local unstripped	`unstripped' `vn'	//  could be omitted (b. or o.)
		}
	}
	// Now create list with b/n/o stripped out
	foreach vn of local unstripped {
		local svn ""						//  initialize
		_ms_parse_parts `vn'
		if "`r(type)'"=="variable" & "`r(op)'"=="" {		//  simplest case - no change
			local svn	`vn'
		}
		else if "`r(type)'"=="variable" & "`r(op)'"=="o" {	//  next simplest case - o.varname => varname
			local svn	`r(name)'
		}
		else if "`r(type)'"=="variable" {			//  has other operators so strip o but leave .
			local op	`r(op)'
			local op	: subinstr local op "o" "", all
			local svn	`op'.`r(name)'
		}
		else if "`r(type)'"=="factor" {				//  simple factor variable
			local op	`r(op)'
			local op	: subinstr local op "b" "", all
			local op	: subinstr local op "n" "", all
			local op	: subinstr local op "o" "", all
			local svn	`op'.`r(name)'			//  operator + . + varname
		}
		else if "`r(type)'"=="interaction" {			//  multiple variables (idiom fix: space after "if")
			forvalues i=1/`r(k_names)' {
				local op	`r(op`i')'
				local op	: subinstr local op "b" "", all
				local op	: subinstr local op "n" "", all
				local op	: subinstr local op "o" "", all
				local opv	`op'.`r(name`i')'	//  operator + . + varname
				if `i'==1 {
					local svn	`opv'
				}
				else {
					local svn	`svn'#`opv'
				}
			}
		}
		else if "`r(type)'"=="product" {
			di as err "fvstrip error - type=product for `vn'"
			exit 198
		}
		else if "`r(type)'"=="error" {
			di as err "fvstrip error - type=error for `vn'"
			exit 198
		}
		else {
			di as err "fvstrip error - unknown type for `vn'"
			exit 198
		}
		local stripped `stripped' `svn'
	}
	local stripped	: list retokenize stripped		//  clean any extra spaces

	if "`noisily'"~="" {					//  for debugging etc.
		di as result "`stripped'"
	}

	return local varlist `stripped'				//  return results in r(varlist)
end
973 |
974 | // Internal version of matchnames
975 | // Sample syntax:
976 | // matchnames "`varlist'" "`list1'" "`list2'"
977 | // takes list in `varlist', looks up in `list1', returns entries in `list2', called r(names)
978 | program define matchnames, rclass
979 | version 11.2
980 | args varnames namelist1 namelist2
981 | // Translate each name in `varnames' via the lookup pair `namelist1' -> `namelist2'; result in r(names)
982 | local n1 : word count `namelist1'
983 | local n2 : word count `namelist2'
984 | // The two lookup lists must align one-to-one
985 | if `n1' != `n2' {
986 | di as err "namelist error"
987 | exit 198
988 | }
989 | foreach v in `varnames' {
990 | local pos : list posof `"`v'"' in namelist1
991 | if `pos' == 0 {
992 | * Name absent from lookup list - pass it through unchanged
993 | local out "`v'"
994 | }
995 | else {
996 | local out : word `pos' of `namelist2'
997 | }
998 | local names "`names' `out'"
999 | }
1000 | local names : list clean names
1001 | return local names "`names'"
1002 | end
1003 |
1004 | // Display varlist with specified indentation
1005 | program define Disp
1006 | version 11.2
1007 | syntax [anything] [, _col(integer 15) ]
1008 | // Show a varlist indented to column `_col', wrapping before column 80
1009 | local maxlen = 80-`_col'
1010 | local used = 0
1011 | local atstart = 1
1012 | foreach v in `anything' {
1013 | * Skip base (b.) and omitted (o.) variables entirely
1014 | _ms_parse_parts `v'
1015 | if !`r(omit)' {
1016 | local wlen : length local v
1017 | if `used'+`wlen' <= `maxlen' {
1018 | local used = `used'+`wlen'+1
1019 | }
1020 | else {
1021 | di
1022 | local atstart = 1
1023 | local used = `wlen'
1024 | }
1025 | if `atstart' {
1026 | local atstart = 0
1027 | di in gr _col(`_col') "`v'" _c
1028 | }
1029 | else {
1030 | di in gr " `v'" _c
1031 | }
1032 | }
1033 | }
1034 | di
1035 | end
1036 |
1037 | version 13
1038 | mata:
1039 | 
1040 | void s_maketemps(real scalar p)	// create p temporary double variables in the current dataset
1041 | {
1042 | (void) st_addvar("double", names=st_tempname(p), 1)	// 3rd arg nofill=1: skip initializing the new vars to missing (see Mata st_addvar() docs)
1043 | st_global("r(varlist)",invtokens(names))	// hand the space-separated tempvar names back to Stata in r(varlist)
1044 | }
1045 | 
1046 | 
1047 | // END MATA SECTION
1048 | end
1049 |
--------------------------------------------------------------------------------
/binder/environment.yml:
--------------------------------------------------------------------------------
1 | name: r-environment
2 | channels:
3 | - conda-forge
4 | dependencies:
5 | - r-base=4.3
6 | - r-tidyverse
7 | - r-fbasics
8 | - r-corrplot
9 | - r-psych
10 | - r-glmnet
11 | - r-glmnetutils
12 | - r-grf
13 | - r-rpart
14 | - r-rpart.plot
15 | - r-randomforest
16 | - r-rlang
17 | - r-readr
18 | - r-devtools
19 | - r-reshape2
20 | - r-caret
21 | - r-plotmo
22 | - r-randomfieldsutils
23 | - r-rms
24 | - r-hdm
25 | - r-aer
26 | - r-lmtest
27 | - r-dplyr
28 | - r-sandwich
29 | - r-diagrammer
30 | - r-neuralnet
31 | - r-islr2
32 | - r-zeallot
33 | - r-nycflights13
--------------------------------------------------------------------------------