├── .gitignore ├── data ├── BLP.dta ├── jtpa.dta ├── sipp1991.dta └── cattaneo2.dta ├── lddml.mlib ├── stata.toc ├── ddml_eq.sthlp ├── ddml_fiv.sthlp ├── ddml_sample.sthlp ├── ddml_ate.sthlp ├── ddml_crossfit.sthlp ├── ddml_late.sthlp ├── cert ├── ddml_cert_all.do ├── DoubleML_results.csv ├── ddml_cert_crossfit.do ├── ddml_cert_helpfiles.do ├── ddml_cert_crossfit.log ├── ddml_cert_fiv.do ├── ddml_cert_partial_iv.do ├── qddml_cert.do ├── ddml_cert_interactiveiv.do └── ddml_cert_partial.do ├── README.md ├── _ddml_nnls_p.ado ├── replace_header.R ├── LICENSE ├── _ddml_drop.ado ├── ddml_example_interactiveiv_pystacked_basic.sthlp ├── ddml_example_describe.sthlp ├── ddml_example_interactive_pystacked_basic.sthlp ├── ddml_example_partial_pystacked_basic.sthlp ├── ddml.pkg ├── ddml_export.sthlp ├── ddml_example_partial_pystacked_multitreat.sthlp ├── ddml_example_partialiv_pystacked_basic.sthlp ├── ddml_example_interactiveiv_pystacked_detailed.sthlp ├── ddml_example_flexiv_anylearner_basic.sthlp ├── ddml_example_partialiv_anylearner_basic.sthlp ├── ddml_example_overlap.sthlp ├── ddml_example_fcluster.sthlp ├── ddml_example_export.sthlp ├── _ddml_export.ado ├── _ddml_save.ado ├── ddml_describe.sthlp ├── _ddml_copy.ado ├── ddml_example_interactive_pystacked_detailed.sthlp ├── ddml_install_ref_auth.ihlp ├── ddml_overlap.sthlp ├── ddml_interactive.sthlp ├── ddml_interactiveiv.sthlp ├── _ddml_use.ado ├── ddml_partial.sthlp ├── ddml_examples.sthlp ├── _ddml_overlap.ado ├── ddml_extract.sthlp ├── ddml_example_stacking.sthlp ├── ddml_example_partial_anylearner_detailed.sthlp ├── _ddml_allcombos.ado ├── ddml_iv.sthlp ├── ddml.sthlp ├── ddml_example_flexiv_anylearner_detailed.sthlp ├── ddml_example_partial_pystacked_detailed.sthlp ├── ddml_init.sthlp ├── ddml_example_extract.sthlp ├── _ddml_sample.ado ├── ddml_overview.sthlp ├── ddml_stacking.sthlp ├── ddml_estimate.sthlp └── crossfit.sthlp /.gitignore: -------------------------------------------------------------------------------- 1 | .RData 2 | .DS_Store 3 | -------------------------------------------------------------------------------- /data/BLP.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aahrens1/ddml/HEAD/data/BLP.dta -------------------------------------------------------------------------------- /lddml.mlib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aahrens1/ddml/HEAD/lddml.mlib -------------------------------------------------------------------------------- /stata.toc: -------------------------------------------------------------------------------- 1 | v 3 2 | ddml: Package for Double Debiased Machine Learning 3 | -------------------------------------------------------------------------------- /data/jtpa.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aahrens1/ddml/HEAD/data/jtpa.dta -------------------------------------------------------------------------------- /data/sipp1991.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aahrens1/ddml/HEAD/data/sipp1991.dta -------------------------------------------------------------------------------- /data/cattaneo2.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aahrens1/ddml/HEAD/data/cattaneo2.dta 
-------------------------------------------------------------------------------- /ddml_eq.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {smcl} 4 | INCLUDE help ddml_init.sthlp 5 | -------------------------------------------------------------------------------- /ddml_fiv.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {smcl} 4 | INCLUDE help ddml_iv.sthlp 5 | -------------------------------------------------------------------------------- /ddml_sample.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {smcl} 4 | INCLUDE help ddml_init.sthlp 5 | -------------------------------------------------------------------------------- /ddml_ate.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {smcl} 4 | INCLUDE help ddml_interactive.sthlp 5 | -------------------------------------------------------------------------------- /ddml_crossfit.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {smcl} 4 | INCLUDE help ddml_estimate.sthlp 5 | -------------------------------------------------------------------------------- /ddml_late.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {smcl} 4 | INCLUDE help ddml_interactiveiv.sthlp 5 | -------------------------------------------------------------------------------- /cert/ddml_cert_all.do: -------------------------------------------------------------------------------- 1 | // wrapper to execute all ddml-related cert scripts 2 | 3 | adopath ++ "/Users/kahrens/MyProjects/ddml" 4 | cd "/Users/kahrens/MyProjects/ddml/cert" 5 | 6 | // general 7 | do ddml_cert 8 | 9 | // detailed 10 | do ddml_cert_crossfit 11 | do ddml_cert_fiv 12 | do ddml_cert_interactive 13 | do ddml_cert_interactiveiv 14 | do ddml_cert_misc 15 | do ddml_cert_partial 16 | do ddml_cert_partial_iv 17 | 18 | // other 19 | do ddml_cert_helpfiles 20 | do qddml_cert 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Installation 2 | 3 | To install `ddml` from GitHub: 4 | 5 | ``` 6 | net install ddml, from(https://raw.githubusercontent.com/aahrens1/ddml/master) replace 7 | ``` 8 | 9 | To install the development version (not recommended): 10 | 11 | ``` 12 | net install ddml, from(https://raw.githubusercontent.com/aahrens1/ddml/dev) replace 13 | ``` 14 | 15 | To install an archived version, use for example: 16 | 17 | ``` 18 | net install ddml, from(https://raw.githubusercontent.com/aahrens1/ddml/v1.1) replace 19 | ``` -------------------------------------------------------------------------------- /_ddml_nnls_p.ado: -------------------------------------------------------------------------------- 1 | *! ddml v1.4.4 2 | *! last edited: 30aug2024 3 | *!
authors: aa/ms 4 | 5 | program define _ddml_nnls_p 6 | version 14 7 | 8 | syntax newvarname [if] [in] , [ xb r] 9 | 10 | marksample touse, novarlist 11 | 12 | local nopts : word count `xb' `r' 13 | if `nopts' >1 { 14 | display "{err}only one statistic may be specified" 15 | exit 498 16 | } 17 | 18 | if `nopts' == 0 { 19 | local xb xb 20 | display "expected xb" 21 | } 22 | 23 | if "`xb'" != "" { 24 | _predict `typlist' `varlist' if `touse' , xb 25 | } 26 | else { 27 | tempvar xbv 28 | quietly _predict double `xbv' if `touse' , xb 29 | generate `typlist' `varlist' = `e(depvar)' - `xbv' if `touse' 30 | } 31 | 32 | end 33 | -------------------------------------------------------------------------------- /cert/DoubleML_results.csv: -------------------------------------------------------------------------------- 1 | application,model,coef,se 2 | cattaneo2,interactive ATE,-231.81039024064955,23.482048842943414 3 | cattaneo2,interactive ATTE,-222.65166505529044,23.53672662923363 4 | cattaneo2,partial,-222.32395689212981,22.5510985563888 5 | jtpa,interactive IV,1767.3305120095997,513.3541241525984 6 | jtpa,partial IV,1737.9958309658205,512.0937971974686 7 | jtpa,interactive ATE,1108.204663037216,322.32681300406495 8 | jtpa,interactive ATTE,1125.4739812937125,322.94628781706905 9 | jtpa,partial,1090.0380191788256,321.60166095638897 10 | 401k,partial,5210.2049827541305,1087.5610153975133 11 | 401k,interactive ATE,2633.680392184123,2771.16190395582 12 | 401k,interactive ATTE,563.3847575207645,6963.070346807339 13 | 401k,interactive IV,3829.545322331364,4028.8655878563945 14 | 401k,partial IV,7515.858485085211,1567.248041400544 15 | -------------------------------------------------------------------------------- /replace_header.R: -------------------------------------------------------------------------------- 1 | setwd("/Users/kahrens/MyProjects/ddml") 2 | 3 | library("stringr") 4 | 5 | all.files <- dir() 6 | all.files <- all.files[str_detect(all.files,"ado")] 7 | 8 | for (i in 1:length(all.files)) { 9 | 10 | txt <- readLines(all.files[i]) 11 | txt[1] <- "*! ddml v1.4.4" 12 | txt[2] <- "*! last edited: 30aug2024" 13 | writeLines(txt,con=all.files[i]) 14 | 15 | } 16 | 17 | all.files <- dir() 18 | all.files <- all.files[str_detect(all.files,"sthlp")] 19 | 20 | for (i in 1:length(all.files)) { 21 | 22 | txt <- readLines(all.files[i]) 23 | for (j in 1:20) { 24 | if(!is.na(txt[j])) txt[j] <- str_replace(txt[j],"version 17aug2023\\}","version 30aug2024\\}") 25 | if(!is.na(txt[j])) txt[j] <- str_replace(txt[j],"\\{right: v1.4.3\\}","\\{right: v1.4.4\\}") 26 | } 27 | writeLines(txt,con=all.files[i]) 28 | 29 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Achim Ahrens 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /cert/ddml_cert_crossfit.do: -------------------------------------------------------------------------------- 1 | clear all 2 | 3 | if ("`c(username)'"=="kahrens") { 4 | adopath + "/Users/kahrens/MyProjects/ddml" 5 | adopath + "/Users/kahrens/MyProjects/pystacked" 6 | } 7 | 8 | cap cd "/Users/kahrens/MyProjects/ddml/cert" 9 | cap cd "C:\LocalStore\ecomes\Documents\GitHub\ddml\cert" 10 | 11 | cap log close 12 | log using "ddml_cert_crossfit", replace text 13 | 14 | which ddml 15 | mata: whichddml() 16 | which crossfit 17 | 18 | use http://fmwww.bc.edu/repec/bocode/j/jtpa.dta, clear 19 | global X sex age married black hispanic 20 | 21 | set seed 42 22 | crossfit, estring(reg earnings $X) gen(yhat) kfolds(3) 23 | sum earnings yhat_1 24 | 25 | set seed 42 26 | crossfit, estring(pystacked earnings $X) gen(yhat) kfolds(3) 27 | sum earnings yhat* 28 | 29 | set seed 42 30 | crossfit, estring(reg earnings $X) gen(yhat) kfolds(3) reps(5) 31 | sum earnings yhat* 32 | 33 | // check that norandom is equivalent to provided fold identifier 34 | count 35 | gen fid = _n<=(r(N)/2) 36 | set seed 42 37 | crossfit, estring(reg earnings $X) gen(noran) kfolds(2) norandom 38 | set seed 42 39 | crossfit, estring(reg earnings $X) gen(foldv) foldvar(fid) 40 | assert noran==foldv 41 | 42 | log close 43 | -------------------------------------------------------------------------------- /_ddml_drop.ado: -------------------------------------------------------------------------------- 1 | *! ddml v1.4.4 2 | *! last edited: 30aug2024 3 | *! 
authors: aa/ms 4 | 5 | program _ddml_drop, eclass 6 | version 16 7 | 8 | syntax , mname(name) // will already have verified that mname is a valid ddml mStruct 9 | 10 | *** extract details of estimation 11 | mata: model_chars(`mname') 12 | local nreps = r(nreps) 13 | local numeqnD = r(numeqnD) 14 | local numeqnZ = r(numeqnZ) 15 | 16 | // fold IDs 17 | forvalues m=1/`nreps' { 18 | local fidlist `fidlist' `mname'_fid_`m' 19 | } 20 | 21 | // collect names of Y variables 22 | local vlist `r(Y)' `r(Y_L)' 23 | 24 | // collect names of D variables 25 | forvalues i=1/`numeqnD' { 26 | local vlist `vlist' `r(D`i')' `r(D`i'_L)' `r(D`i'_h)' 27 | } 28 | 29 | // collect names of Z variables 30 | forvalues i=1/`numeqnZ' { 31 | local vlist `vlist' `r(Z`i')' `r(Z`i'_L)' 32 | } 33 | 34 | // add rep numbers 35 | foreach vn in `vlist' { 36 | forvalues m=1/`nreps' { 37 | local vreplist `vreplist' `vn'_`m' 38 | } 39 | } 40 | 41 | // drop vars may not exist, so use capture 42 | foreach vn in `vreplist' { 43 | cap confirm variable `vn', exact 44 | if _rc==0 drop `vn' 45 | } 46 | 47 | *** drop id, fold id, sample var 48 | cap drop `mname'_id 49 | cap drop `mname'_sample* 50 | cap drop `mname'_fid* // multiple folds 51 | 52 | *** drop model struct 53 | cap mata: mata drop `mname' 54 | 55 | end 56 | -------------------------------------------------------------------------------- /ddml_example_interactiveiv_pystacked_basic.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 3august2023}{...} 3 | {smcl} 4 | {pstd}{ul:Interactive IV model (LATE) - Basic example with {help pystacked}} 5 | 6 | {pstd}We use {help pystacked} with two base learners for each reduced form equation.{p_end} 7 | 8 | {phang2}. {stata "use http://fmwww.bc.edu/repec/bocode/j/jtpa.dta, clear"}{p_end} 9 | {phang2}. {stata "global Y earnings"}{p_end} 10 | {phang2}. {stata "global D training"}{p_end} 11 | {phang2}. {stata "global Z assignmt"}{p_end} 12 | {phang2}. {stata "global X sex age married black hispanic"}{p_end} 13 | 14 | {pstd}Drop observations where treatment=1 even though assignment=0. 15 | (Up to the user how to handle such observations; 16 | {opt ddml} can handle these cases, 17 | and we drop them here only to illustrate how this is reflected in the stacking weights.){p_end} 18 | 19 | {phang2}. {stata "tab $D $Z"}{p_end} 20 | {phang2}. {stata "drop if $D==1 & $Z==0"}{p_end} 21 | 22 | {pstd}Set the seed, initialize, cross-fit and estimate:{p_end} 23 | 24 | {phang2}. {stata "set seed 42"}{p_end} 25 | {phang2}. {stata "ddml init interactiveiv"}{p_end} 26 | {phang2}. {stata "ddml E[Y|X,Z]: pystacked $Y c.($X)##c.($X), type(reg) m(ols lassocv)"}{p_end} 27 | {phang2}. {stata "ddml E[D|X,Z]: pystacked $D c.($X)##c.($X), type(class) m(logit lassocv)"}{p_end} 28 | {phang2}. {stata "ddml E[Z|X]: pystacked $Z c.($X)##c.($X), type(class) m(logit lassocv)"}{p_end} 29 | {phang2}. {stata "ddml crossfit"}{p_end} 30 | {phang2}. {stata "ddml estimate"}{p_end} 31 | 32 | {pstd}Report the stacking weights. 33 | Note that the weights for the non-existent case (treatment=1, assignment=0) are missing.{p_end} 34 | 35 | {phang2}. {stata "ddml extract, show(stweights)"}{p_end} 36 | -------------------------------------------------------------------------------- /ddml_example_describe.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *!
version 30aug2024}{...} 3 | {smcl} 4 | {pstd}{ul:ddml describe utility - Basic example with {help pystacked}}{p_end} 5 | 6 | {pstd}Load the data, define global macros, set the seed and initialize the model. 7 | Use 2-fold cross-fitting with two repetitions (resamples). 8 | Use {help pystacked}'s default learners as the supervised learners.{p_end} 9 | 10 | {phang2}. {stata "use https://github.com/aahrens1/ddml/raw/master/data/sipp1991.dta, clear"}{p_end} 11 | {phang2}. {stata "global Y net_tfa"}{p_end} 12 | {phang2}. {stata "global D e401"}{p_end} 13 | {phang2}. {stata "global X tw age inc fsize educ db marr twoearn pira hown"}{p_end} 14 | {phang2}. {stata "set seed 42"}{p_end} 15 | {phang2}. {stata "ddml init partial, kfolds(2) reps(2)"}{p_end} 16 | {phang2}. {stata "ddml E[Y|X]: pystacked $Y $X"}{p_end} 17 | {phang2}. {stata "ddml E[D|X]: pystacked $D $X"}{p_end} 18 | {phang2}. {stata "ddml crossfit"}{p_end} 19 | {phang2}. {stata "ddml estimate"}{p_end} 20 | 21 | {pstd}The default of {opt ddml describe} is to report a brief summary.{p_end} 22 | 23 | {phang2}. {stata "ddml describe"}{p_end} 24 | 25 | {pstd}Options: report details of the total and cross-fit samples, 26 | learners (including the estimation strings), 27 | cross-fit results, 28 | and estimation results.{p_end} 29 | 30 | {phang2}. {stata "ddml describe, sample"}{p_end} 31 | {phang2}. {stata "ddml describe, learners"}{p_end} 32 | {phang2}. {stata "ddml describe, crossfit"}{p_end} 33 | {phang2}. {stata "ddml describe, estimates"}{p_end} 34 | 35 | {pstd}The {opt all} option is equivalent to specifying all 4 options.{p_end} 36 | 37 | {phang2}. {stata "ddml describe, all"}{p_end} 38 | {phang2}. {stata "ddml describe, sample learners crossfit estimates"}{p_end} 39 | -------------------------------------------------------------------------------- /ddml_example_interactive_pystacked_basic.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {smcl} 4 | {pstd}{ul:Interactive model - Basic example with {help pystacked}}{p_end} 5 | 6 | {pstd}We need to estimate the conditional expectations of E[Y|X,D=0], E[Y|X,D=1] and E[D|X]. 7 | The first two conditional expectations are added jointly. 8 | We use 5 cross-fit folds and 2 resamplings 9 | (more resamplings would be advisable; we use 2 in this example so the code runs faster). 10 | We specify two supervised learners: linear regression and gradient boosted 11 | trees, stacked using {help pystacked}. 12 | We use {help pystacked}'s 2nd syntax and stack using the single-best learner 13 | (rather than the default constrained least squares). 14 | Note that we use gradient boosted regression trees for E[Y|X,D], 15 | but gradient boosted classification trees for E[D|X].{p_end} 16 | 17 | {phang2}. {stata "webuse cattaneo2, clear"}{p_end} 18 | {phang2}. {stata "global Y bweight"}{p_end} 19 | {phang2}. {stata "global D mbsmoke"}{p_end} 20 | {phang2}. {stata "global X prenatal1 mmarried fbaby mage medu"}{p_end} 21 | {phang2}. {stata "set seed 42"}{p_end} 22 | {phang2}. {stata "ddml init interactive, kfolds(5) reps(2)"}{p_end} 23 | {phang2}. {stata "ddml E[Y|X,D]: pystacked $Y $X || method(ols) || method(gradboost) || , type(reg) finalest(singlebest)"}{p_end} 24 | {phang2}. {stata "ddml E[D|X]: pystacked $D $X || method(logit) || method(gradboost) || , type(class) finalest(singlebest)"}{p_end} 25 | {phang2}.
{stata "ddml crossfit"}{p_end} 26 | 27 | {pstd}{opt ddml estimate} reports the ATE (average treatment effect) by default:{p_end} 28 | 29 | {phang2}. {stata "ddml estimate"}{p_end} 30 | 31 | {pstd}Request the ATET (average treatment effect on the treated) instead:{p_end} 32 | 33 | {phang2}. {stata "ddml estimate, atet"}{p_end} 34 | -------------------------------------------------------------------------------- /cert/ddml_cert_helpfiles.do: -------------------------------------------------------------------------------- 1 | // do file to run all clickable examples in ddml help files 2 | // this file requires the Stata program loghelp 3 | // install from SSC if available or from github: 4 | // net install loghelp, /// 5 | // from(https://raw.githubusercontent.com/markeschaffer/stata-utilities/master) /// 6 | // replace 7 | 8 | clear all 9 | 10 | cap log close 11 | log using "ddml_helpfiles_cert", replace smcl 12 | which ddml 13 | mata: whichddml() 14 | which qddml 15 | which crossfit 16 | which pystacked 17 | log close 18 | 19 | foreach sthlpfile in /// 20 | /* ddml_example_partial_pystacked_basic.sthlp /// 21 | ddml_example_partial_pystacked_detailed.sthlp /// 22 | ddml_example_partial_anylearner_detailed.sthlp /// 23 | ddml_example_partial_pystacked_multitreat.sthlp /// 24 | ddml_example_interactive_pystacked_basic.sthlp /// 25 | ddml_example_interactive_pystacked_detailed.sthlp /// 26 | ddml_example_partialiv_pystacked_basic.sthlp /// 27 | ddml_example_partialiv_anylearner_basic.sthlp /// 28 | ddml_example_flexiv_anylearner_basic.sthlp /// 29 | ddml_example_flexiv_anylearner_detailed.sthlp /// 30 | ddml_example_interactiveiv_pystacked_basic.sthlp /// 31 | ddml_example_interactiveiv_pystacked_detailed.sthlp /// 32 | ddml_example_extract.sthlp /// 33 | ddml_example_stacking.sthlp /// 34 | ddml_example_describe.sthlp /// 35 | ddml_example_export.sthlp /// 36 | ddml_example_overlap.sthlp /// 37 | ddml_example_fcluster.sthlp /// 38 | qddml.sthlp */ /// 39 | crossfit.sthlp /// 40 | { 41 | 42 | log using "ddml_helpfiles_cert", append smcl 43 | di 44 | di "Help file: `sthlpfile'" 45 | di 46 | log close 47 | 48 | findfile `sthlpfile' 49 | loghelp using ddml_helpfiles_cert, /// 50 | inputname(`r(fn)') append smcl 51 | 52 | } 53 | -------------------------------------------------------------------------------- /ddml_example_partial_pystacked_basic.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {smcl} 4 | {pstd}{ul:Partially-linear model - Basic example with {help pystacked}}{p_end} 5 | 6 | {pstd}Load the data, define global macros, set the seed and initialize the model. 7 | Use 2-fold cross-fitting with two repetitions (resamples) 8 | Use {help pystacked}'s default learners as the supervised learners: OLS, cross-validated lasso, and gradient boosting. 9 | NB: The model specification and results will be stored on a Mata object 10 | with the default name "m0".{p_end} 11 | 12 | {phang2}. {stata "use https://github.com/aahrens1/ddml/raw/master/data/sipp1991.dta, clear"}{p_end} 13 | {phang2}. {stata "global Y net_tfa"}{p_end} 14 | {phang2}. {stata "global D e401"}{p_end} 15 | {phang2}. {stata "global X tw age inc fsize educ db marr twoearn pira hown"}{p_end} 16 | {phang2}. {stata "set seed 42"}{p_end} 17 | {phang2}. {stata "ddml init partial, kfolds(2) reps(2)"}{p_end} 18 | {phang2}. {stata "ddml E[Y|X]: pystacked $Y $X"}{p_end} 19 | {phang2}. {stata "ddml E[D|X]: pystacked $D $X"}{p_end} 20 | {phang2}. 
{stata "ddml crossfit"}{p_end} 21 | {phang2}. {stata "ddml estimate"}{p_end} 22 | 23 | {pstd}Replicate the {opt ddml estimate} results for the 1st cross-fit estimation (resample 1) by hand, 24 | using the estimated conditional expectations generated by {opt ddml} and {help pystacked}; 25 | "_1" means resample 1. 26 | Compare using {opt ddml estimate, replay}.{p_end} 27 | 28 | {phang2}. {stata "cap drop Yresid"}{p_end} 29 | {phang2}. {stata "cap drop Dresid"}{p_end} 30 | {phang2}. {stata "gen double Yresid = $Y - Y1_pystacked_1"}{p_end} 31 | {phang2}. {stata "gen double Dresid = $D - D1_pystacked_1"}{p_end} 32 | {phang2}. {stata "regress Yresid Dresid"}{p_end} 33 | {phang2}. {stata "ddml estimate, mname(m0) spec(st) rep(1) notable replay"}{p_end} 34 | -------------------------------------------------------------------------------- /ddml.pkg: -------------------------------------------------------------------------------- 1 | v 3 2 | d ddml: Package for Double Debiased Machine Learning 3 | f _ddml_allcombos.ado 4 | f _ddml_copy.ado 5 | f _ddml_crossfit.ado 6 | f _ddml_describe.ado 7 | f _ddml_drop.ado 8 | f _ddml_estimate_ate_late.ado 9 | f _ddml_estimate_linear.ado 10 | f _ddml_export.ado 11 | f _ddml_extract.ado 12 | f _ddml_nnls.ado 13 | f _ddml_nnls_p.ado 14 | f _ddml_overlap.ado 15 | f _ddml_sample.ado 16 | f _ddml_save.ado 17 | f _ddml_use.ado 18 | f crossfit.ado 19 | f ddml.ado 20 | f lddml.mlib 21 | f crossfit.sthlp 22 | f ddml.sthlp 23 | f ddml_ate.sthlp 24 | f ddml_crossfit.sthlp 25 | f ddml_describe.sthlp 26 | f ddml_eq.sthlp 27 | f ddml_estimate.sthlp 28 | f ddml_example_describe.sthlp 29 | f ddml_example_export.sthlp 30 | f ddml_example_extract.sthlp 31 | f ddml_example_fcluster.sthlp 32 | f ddml_example_flexiv_anylearner_basic.sthlp 33 | f ddml_example_flexiv_anylearner_detailed.sthlp 34 | f ddml_example_interactive_pystacked_basic.sthlp 35 | f ddml_example_interactive_pystacked_detailed.sthlp 36 | f ddml_example_interactiveiv_pystacked_basic.sthlp 37 | f ddml_example_interactiveiv_pystacked_detailed.sthlp 38 | f ddml_example_overlap.sthlp 39 | f ddml_example_partial_anylearner_detailed.sthlp 40 | f ddml_example_partial_pystacked_basic.sthlp 41 | f ddml_example_partial_pystacked_detailed.sthlp 42 | f ddml_example_partial_pystacked_multitreat.sthlp 43 | f ddml_example_partialiv_anylearner_basic.sthlp 44 | f ddml_example_partialiv_pystacked_basic.sthlp 45 | f ddml_example_stacking.sthlp 46 | f ddml_examples.sthlp 47 | f ddml_export.sthlp 48 | f ddml_extract.sthlp 49 | f ddml_fiv.sthlp 50 | f ddml_init.sthlp 51 | f ddml_install_ref_auth.ihlp 52 | f ddml_interactive.sthlp 53 | f ddml_interactiveiv.sthlp 54 | f ddml_iv.sthlp 55 | f ddml_late.sthlp 56 | f ddml_overlap.sthlp 57 | f ddml_overview.sthlp 58 | f ddml_partial.sthlp 59 | f ddml_sample.sthlp 60 | f ddml_stacking.sthlp 61 | f qddml.ado 62 | f qddml.sthlp 63 | 64 | -------------------------------------------------------------------------------- /ddml_export.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! 
version 30aug2024}{...} 3 | {viewerjumpto "Syntax" "ddml_export##syntax"}{...} 4 | {viewerjumpto "Examples" "ddml_export##examples"}{...} 5 | {viewerjumpto "Installation" "ddml_export##installation"}{...} 6 | {viewerjumpto "References" "ddml_export##references"}{...} 7 | {viewerjumpto "Authors" "ddml_export##authors"}{...} 8 | {vieweralsosee "ddml main page" "ddml"}{...} 9 | {vieweralsosee "Other" "ddml_export##also_see"}{...} 10 | {hline} 11 | {cmd:help ddml export}{right: v1.4.4} 12 | {hline} 13 | 14 | {title:ddml export utility for Double Debiased Machine Learning} 15 | 16 | {p2colset 5 19 21 2}{...} 17 | {p2col:{hi: ddml} {hline 2}}Stata package for Double Debiased Machine Learning{p_end} 18 | {p2colreset}{...} 19 | 20 | {pstd} 21 | {opt ddml} implements algorithms for causal inference aided by supervised 22 | machine learning as proposed in 23 | {it:Double/debiased machine learning for treatment and structural parameters} 24 | (Econometrics Journal, 2018). Five different models are supported, allowing for 25 | binary or continuous treatment variables and endogeneity, high-dimensional 26 | controls and/or instrumental variables. 27 | 28 | {pstd} 29 | {opt ddml export} saves the estimated conditional expectations, cross-fold identifiers, etc. 30 | to a CSV file. 31 | 32 | {marker syntax}{...} 33 | {title:Syntax} 34 | 35 | {p 8 14}{cmd:ddml export} 36 | {cmd:using} {it:filename} [ , {opt mname(name)} 37 | {opt addvars(varlist)} ] 38 | 39 | {synoptset 20}{...} 40 | {synopthdr:options} 41 | {synoptline} 42 | {synopt:{opt mname(name)}} 43 | name of the DDML model. Allows running multiple DDML 44 | models simultaneously. Defaults to {it:m0}. 45 | {p_end} 46 | {synopt:{opt addvars(varlist)}} 47 | additional Stata variables to include with {opt ddml} variables. 48 | {p_end} 49 | {synoptline} 50 | {p2colreset}{...} 51 | {pstd} 52 | 53 | 54 | {marker examples}{...} 55 | {title:Examples} 56 | 57 | {smcl} 58 | INCLUDE help ddml_example_export.sthlp 59 | 60 | 61 | {smcl} 62 | INCLUDE help ddml_install_ref_auth 63 | -------------------------------------------------------------------------------- /ddml_example_partial_pystacked_multitreat.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {smcl} 4 | {pstd}{ul:Partially-linear model - Multiple treatments with {help pystacked}} 5 | 6 | {pstd}We can also run the partially-linear model with multiple treatments. 7 | In this simple example, we estimate the effect of both 401k eligibility 8 | {cmd:e401} and education {cmd:educ}. 9 | Note that we remove {cmd:educ} from the set of controls. 10 | We again use {help pystacked} as the single learner provided to {opt ddml}; 11 | the two base learners, OLS and random forest, are provided via {help pystacked}. 12 | We use the simplified syntax supported by {help pystacked}.{p_end} 13 | 14 | {phang2}. {stata "use https://github.com/aahrens1/ddml/raw/master/data/sipp1991.dta, clear"}{p_end} 15 | {phang2}. {stata "global Y net_tfa"}{p_end} 16 | {phang2}. {stata "global D1 e401"}{p_end} 17 | {phang2}. {stata "global D2 educ"}{p_end} 18 | {phang2}. {stata "global X tw age inc fsize db marr twoearn pira hown"}{p_end} 19 | {phang2}. {stata "set seed 42"}{p_end} 20 | 21 | {pstd}Initialize the model.{p_end} 22 | 23 | {phang2}. {stata "ddml init partial, kfolds(2)"}{p_end} 24 | 25 | {pstd}Add learners. Note that we add learners with both {cmd:$D1} and 26 | {cmd:$D2} as the dependent variable.{p_end} 27 | 28 | {phang2}.
{stata "ddml E[Y|X]: pystacked $Y $X, type(reg) methods(ols rf)"}{p_end} 29 | {phang2}. {stata "ddml E[D|X]: pystacked $D1 $X, type(reg) methods(ols rf)"}{p_end} 30 | {phang2}. {stata "ddml E[D|X]: pystacked $D2 $X, type(reg) methods(ols rf)"}{p_end} 31 | 32 | {pstd}Cross-fitting.{p_end} 33 | 34 | {phang2}. {stata "ddml crossfit"}{p_end} 35 | 36 | {pstd}Estimation.{p_end} 37 | 38 | {phang2}. {stata "ddml estimate, robust"}{p_end} 39 | 40 | {pstd}Because we have used {help pystacked} as the single {opt ddml} learner, 41 | we can access the saved {opt pystacked} information. 42 | Here we use the {opt pystacked} option to get the stacking weights and MSEs by cross-fit fold:{p_end} 43 | 44 | {phang2}. {stata "ddml extract, show(pystacked)"}{p_end} 45 | -------------------------------------------------------------------------------- /ddml_example_partialiv_pystacked_basic.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {smcl} 4 | {pstd}{ul:Partially-linear IV model - Basic example with {help pystacked}} 5 | 6 | {pstd}The model has three conditional expectations: E[Y|X], E[D|X] and E[Z|X]. 7 | For each reduced form equation, we use {help pystacked}'s default learners: 8 | OLS, cross-validated lasso, and gradient boosting. 9 | Since the data set is very small, we consider 30 cross-fitting folds. 10 | NB: The model specification and results will be stored on a Mata object 11 | with the default name "m0".{p_end} 12 | 13 | {phang2}. {stata "use https://statalasso.github.io/dta/AJR.dta, clear"}{p_end} 14 | {phang2}. {stata "global Y logpgp95"}{p_end} 15 | {phang2}. {stata "global D avexpr"}{p_end} 16 | {phang2}. {stata "global Z logem4"}{p_end} 17 | {phang2}. {stata "global X lat_abst edes1975 avelf temp* humid* steplow-oilres"}{p_end} 18 | {phang2}. {stata "set seed 42"}{p_end} 19 | {phang2}. {stata "ddml init iv, kfolds(30)"}{p_end} 20 | {phang2}. {stata "ddml E[Y|X]: pystacked $Y $X"}{p_end} 21 | {phang2}. {stata "ddml E[D|X]: pystacked $D $X"}{p_end} 22 | {phang2}. {stata "ddml E[Z|X]: pystacked $Z $X"}{p_end} 23 | {phang2}. {stata "ddml crossfit"}{p_end} 24 | {phang2}. {stata "ddml estimate"}{p_end} 25 | 26 | {pstd}Replicate the {opt ddml estimate} results for the 1st cross-fit estimation (resample 1) by hand, 27 | using the estimated conditional expectations generated by {opt ddml} and {help pystacked}; 28 | "_1" means resample 1. 29 | Compare using {opt ddml estimate, replay}.{p_end} 30 | 31 | {phang2}. {stata "cap drop Yresid"}{p_end} 32 | {phang2}. {stata "cap drop Dresid"}{p_end} 33 | {phang2}. {stata "cap drop Zresid"}{p_end} 34 | {phang2}. {stata "gen double Yresid = $Y - Y1_pystacked_1"}{p_end} 35 | {phang2}. {stata "gen double Dresid = $D - D1_pystacked_1"}{p_end} 36 | {phang2}. {stata "gen double Zresid = $Z - Z1_pystacked_1"}{p_end} 37 | {phang2}. {stata "ivreg Yresid (Dresid=Zresid)"}{p_end} 38 | {phang2}. {stata "ddml estimate, mname(m0) spec(st) rep(1) notable replay"}{p_end} 39 | 40 | -------------------------------------------------------------------------------- /ddml_example_interactiveiv_pystacked_detailed.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {smcl} 4 | {pstd}{ul:Interactive IV model (LATE) - Detailed example with {help pystacked}} 5 | 6 | {pstd}Preparation: we load the data, define global macros and set the seed.{p_end} 7 | 8 | {phang2}. 
{stata "use http://fmwww.bc.edu/repec/bocode/j/jtpa.dta, clear"}{p_end} 9 | {phang2}. {stata "global Y earnings"}{p_end} 10 | {phang2}. {stata "global D training"}{p_end} 11 | {phang2}. {stata "global Z assignmt"}{p_end} 12 | {phang2}. {stata "global X sex age married black hispanic"}{p_end} 13 | {phang2}. {stata "set seed 42"}{p_end} 14 | 15 | {pstd}We initialize the model.{p_end} 16 | 17 | {phang2}. {stata "ddml init interactiveiv, kfolds(5)"}{p_end} 18 | 19 | {pstd}We use {helpb pystacked} with two base learners for each reduced form equation. 20 | Note that E[Y|X,Z] is a regression problem, 21 | whereas E[D|X,Z] and E[Z|X] are classification problems.{p_end} 22 | 23 | {phang2}. {stata "ddml E[Y|X,Z]: pystacked $Y c.($X)# #c($X), type(reg) m(ols lassocv)"}{p_end} 24 | {phang2}. {stata "ddml E[D|X,Z]: pystacked $D c.($X)# #c($X), type(class) m(logit lassocv)"}{p_end} 25 | {phang2}. {stata "ddml E[Z|X]: pystacked $Z c.($X)# #c($X), type(class) m(logit lassocv)"}{p_end} 26 | 27 | {pstd}Cross-fitting and estimation, with short-stacking implemented via {opt ddml}.{p_end} 28 | 29 | {phang2}. {stata "ddml crossfit, shortstack"}{p_end} 30 | {phang2}. {stata "ddml estimate, robust"}{p_end} 31 | 32 | {pstd}Compare the short-stacking estimation above with standard (within-cross-fit-fold) stacking:{p_end} 33 | 34 | {phang2}. {stata "ddml estimate, spec(st) rep(1) replay notable"}{p_end} 35 | 36 | {pstd}Short-stacking is typically considerably faster than standard stacking. 37 | We can estimate using short-stacking only by specifying the {opt nostd} option when cross-fitting. 38 | We re-set the seed for comparability.{p_end} 39 | 40 | {phang2}. {stata "set seed 42"}{p_end} 41 | {phang2}. {stata "ddml crossfit, shortstack nostdstack"}{p_end} 42 | {phang2}. {stata "ddml estimate, robust"}{p_end} 43 | -------------------------------------------------------------------------------- /ddml_example_flexiv_anylearner_basic.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {smcl} 4 | {pstd}{ul:Flexible partially-linear IV model - Basic example with {help pystacked}} 5 | 6 | {pstd}First load the data, define global macros, set the seed and initialize the model. 7 | We add learners for E[Y|X] in the usual way. 8 | We illustrate with single {help pystacked} estimations, 9 | but the procedure applies to all learners.{p_end} 10 | 11 | {phang2}. {stata "use https://github.com/aahrens1/ddml/raw/master/data/BLP.dta, clear"}{p_end} 12 | {phang2}. {stata "global Y share"}{p_end} 13 | {phang2}. {stata "global D price"}{p_end} 14 | {phang2}. {stata "global X hpwt air mpd space"}{p_end} 15 | {phang2}. {stata "global Z sum*"}{p_end} 16 | {phang2}. {stata "set seed 42"}{p_end} 17 | {phang2}. {stata "ddml init fiv"}{p_end} 18 | 19 | {pstd}Adding learners for E[Y|X] is the same as for other {opt ddml} linear models:{p_end} 20 | 21 | {phang2}. {stata "ddml E[Y|X]: pystacked $Y $X, type(reg)"}{p_end} 22 | 23 | {pstd}Adding learners for E[D|Z,X] and E[D|X] in the {opt fiv} model is different 24 | from how it's done in the {opt partialiv} model. 25 | The reason for this is that the estimation of E[D|X] 26 | depends on the estimation of E[D|X,Z].{p_end} 27 | 28 | {pstd}When adding learners for E[D|Z,X], 29 | we need to provide a name for each learners using {opt learner(name)}. 30 | Here we use the name "Dhat_pys".{p_end} 31 | 32 | {phang2}. 
{stata "ddml E[D|Z,X], learner(Dhat_pys): pystacked $D $X $Z, type(reg)"}{p_end} 33 | 34 | {pstd}When adding learners for E[D|X], we explicitly refer to the name of the learner from 35 | the previous step (here, "Dhat_pys"). 36 | We also provide the name of the treatment variable ({cmd:vname($D)}), 37 | and we use the placeholder {cmd:{D}} in place of the dependent variable.{p_end} 38 | 39 | {phang2}. {stata "ddml E[D|X], learner(Dhat_pys) vname($D): pystacked {D} $X, type(reg)"}{p_end} 40 | 41 | {pstd}The crossfit and estimation commands with the {opt fiv} model are standard.{p_end} 42 | 43 | {phang2}. {stata "ddml crossfit"}{p_end} 44 | {phang2}. {stata "ddml estimate"}{p_end} 45 | -------------------------------------------------------------------------------- /ddml_example_partialiv_anylearner_basic.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {smcl} 4 | {pstd}{ul:Partially-linear IV model - Basic example with various learners} 5 | 6 | {pstd}Preparation: we load the data, define global macros and set the seed.{p_end} 7 | 8 | {phang2}. {stata "use https://statalasso.github.io/dta/AJR.dta, clear"}{p_end} 9 | {phang2}. {stata "global Y logpgp95"}{p_end} 10 | {phang2}. {stata "global D avexpr"}{p_end} 11 | {phang2}. {stata "global Z logem4"}{p_end} 12 | {phang2}. {stata "global X lat_abst edes1975 avelf temp* humid* steplow-oilres"}{p_end} 13 | {phang2}. {stata "set seed 42"}{p_end} 14 | 15 | {pstd}Preparation: we load the data, define global macros and set the seed. Since the 16 | data set is very small, we consider 30 cross-fitting folds.{p_end} 17 | 18 | {phang2}. {stata "ddml init iv, kfolds(30)"}{p_end} 19 | 20 | {pstd}The partially linear IV model has three conditional expectations: 21 | E[Y|X], E[D|X] and E[Z|X]. For each reduced form equation, we use two learners: 22 | OLS and random forest. 23 | To illustrate how {opt ddml} works with other packages, 24 | instead of a single call to {opt pystacked} specifying two base learners 25 | we specify Stata's {help regress} and {help rforest} by Zou and Schonlau as the two learners. 26 | We need to add the option {opt vtype(none)} for {help rforest} to 27 | work with {cmd:ddml} since {help rforest}'s {cmd:predict} command doesn't 28 | support variable types.{p_end} 29 | 30 | {phang2}. {stata "ddml E[Y|X]: reg $Y $X"}{p_end} 31 | {phang2}. {stata "ddml E[Y|X], vtype(none): rforest $Y $X, type(reg)"}{p_end} 32 | {phang2}. {stata "ddml E[D|X]: reg $D $X"}{p_end} 33 | {phang2}. {stata "ddml E[D|X], vtype(none): rforest $D $X, type(reg)"}{p_end} 34 | {phang2}. {stata "ddml E[Z|X]: reg $Z $X"}{p_end} 35 | {phang2}. {stata "ddml E[Z|X], vtype(none): rforest $Z $X, type(reg)"}{p_end} 36 | 37 | {pstd}Cross-fitting and estimation; report all combinations of estimated conditional expectations.{p_end} 38 | 39 | {phang2}. {stata "ddml crossfit, shortstack"}{p_end} 40 | {phang2}. {stata "ddml estimate, robust allcombos"}{p_end} 41 | 42 | -------------------------------------------------------------------------------- /ddml_example_overlap.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {smcl} 4 | {pstd}{ul:ddml overlap utility - Overlap plots with interactive models (ATE etc.)}{p_end} 5 | 6 | {pstd}We use the default of 5 cross-fit folds 7 | and specify 2 resamplings with 2 supervised learners: 8 | linear regression and gradient boosted trees, stacked using {help pystacked}. 
9 | Note that we use gradient boosted regression trees for E[Y|X,D], 10 | but gradient boosted classification trees for E[D|X].{p_end} 11 | 12 | {phang2}. {stata "webuse cattaneo2, clear"}{p_end} 13 | {phang2}. {stata "global Y bweight"}{p_end} 14 | {phang2}. {stata "global D mbsmoke"}{p_end} 15 | {phang2}. {stata "global X prenatal1 mmarried fbaby mage medu"}{p_end} 16 | {phang2}. {stata "set seed 42"}{p_end} 17 | {phang2}. {stata "ddml init interactive, reps(2)"}{p_end} 18 | {phang2}. {stata "ddml E[Y|X,D]: pystacked $Y $X || method(ols) || method(gradboost) || , type(reg)"}{p_end} 19 | {phang2}. {stata "ddml E[D|X]: pystacked $D $X || method(logit) || method(gradboost) || , type(class)"}{p_end} 20 | {phang2}. {stata "ddml crossfit"}{p_end} 21 | {phang2}. {stata "ddml estimate"}{p_end} 22 | 23 | {pstd}Default behavior of {opt ddml overlap} is to use all cross-fit resamples 24 | and plot the stacked (ensemble) learner generated by {help pystacked}:{p_end} 25 | 26 | {phang2}. {stata "ddml overlap"}{p_end} 27 | 28 | {pstd}Use just resample 1:{p_end} 29 | 30 | {phang2}. {stata "ddml overlap, replist(1)"}{p_end} 31 | 32 | {pstd}Overlap plots for the predicted values of 33 | the separate logit (#1) and gradboost (#2) learners:{p_end} 34 | 35 | {phang2}. {stata "ddml overlap, pslist(D1_pystacked_L1 D1_pystacked_L2)"}{p_end} 36 | 37 | {pstd}Save the overlap plot using the default triangle kernel, 38 | generate an overlap plot using the Epanechnikov kernel, 39 | and combine the two into a single graph:{p_end} 40 | 41 | {phang2}. {stata "ddml overlap, name(triangle, replace) title(Propensity score - triangle kernel)"}{p_end} 42 | {phang2}. {stata "ddml overlap, kernel(epanechnikov) name(epanechnikov, replace) title(Propensity score - epanechnikov kernel)"}{p_end} 43 | {phang2}. {stata "graph combine triangle epanechnikov"}{p_end} 44 | -------------------------------------------------------------------------------- /ddml_example_fcluster.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {smcl} 4 | {pstd}{ul:Cluster sampling with cross-fit folds - Basic example with {help pystacked}}{p_end} 5 | 6 | {pstd}Load the data, define global macros and set the seed.{p_end} 7 | 8 | {phang2}. {stata "webuse nlsw88, clear"}{p_end} 9 | {phang2}. {stata "gen lwage = ln(wage)"}{p_end} 10 | {phang2}. {stata "global Y lwage"}{p_end} 11 | {phang2}. {stata "global D union"}{p_end} 12 | {phang2}. {stata "global X age-c_city hours-tenure"}{p_end} 13 | {phang2}. {stata "set seed 42"}{p_end} 14 | 15 | {pstd}Initialize the model. 16 | The {opt fcluster(industry)} ("fold-cluster") option tells {opt ddml} 17 | to ensure that clusters (here, identified by the variable {opt industry}) 18 | are not split across cross-fit folds, i.e., each cluster appears in only one cross-fit fold. 19 | Here we specify 2 cross-fit folds, 20 | so all observations for each cluster will appear in either fold 1 or in fold 2. 21 | NB: This example is somewhat artificial, because there are only 12 clusters (industries).{p_end} 22 | 23 | {phang2}. {stata "ddml init partial, kfolds(2) fcluster(industry)"}{p_end} 24 | {phang2}. {stata "tab industry m0_fid_1"}{p_end} 25 | 26 | {pstd}Since there are 12 clusters defined by {opt industry}, 27 | we could achieve the same cross-fit split either by specifying {opt kfolds(12)} with {opt fcluster(industry)}, 28 | or by using {opt industry} itself as the fold identifier by specifying {opt foldvar(industry)}.
29 | (NB: The split is the same but the fold numbering is different.){p_end} 30 | 31 | {phang2}. {stata "ddml init partial, foldvar(industry)"}{p_end} 32 | {phang2}. {stata "tab industry m0_fid_1"}{p_end} 33 | 34 | {phang2}. {stata "ddml init partial, kfolds(12) fcluster(industry)"}{p_end} 35 | {phang2}. {stata "tab industry m0_fid_1"}{p_end} 36 | 37 | {pstd}Estimation is standard, 38 | but to obtain cluster-robust SEs the covariance estimator 39 | needs to be requested with {opt ddml estimate}:{p_end} 40 | 41 | {phang2}. {stata "ddml E[Y|X]: pystacked $Y $X"}{p_end} 42 | {phang2}. {stata "ddml E[D|X]: pystacked $D $X"}{p_end} 43 | {phang2}. {stata "ddml crossfit"}{p_end} 44 | {phang2}. {stata "ddml estimate, cluster(industry)"}{p_end} 45 | -------------------------------------------------------------------------------- /ddml_example_export.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {smcl} 4 | {pstd}{ul:ddml export utility - Basic example with {help pystacked}}{p_end} 5 | 6 | {pstd}Load the data, define global macros, set the seed and initialize the model. 7 | Use 2-fold cross-fitting with two repetitions (resamples). 8 | Use {help pystacked}'s default learners as the supervised learners. 9 | We explicitly name the model to be estimated as m_sip1991; 10 | this is the name of the Mata global containing the model details. 11 | The default behavior of {opt ddml} is to prefix the created sample and fold indicators, 12 | but we specify the {opt prefix} option with {help ddml init} 13 | so that all created variables including the estimated conditional expectations are prefixed. 14 | {p_end} 15 | 16 | {phang2}. {stata "use https://github.com/aahrens1/ddml/raw/master/data/sipp1991.dta, clear"}{p_end} 17 | {phang2}. {stata "global Y net_tfa"}{p_end} 18 | {phang2}. {stata "global D e401"}{p_end} 19 | {phang2}. {stata "global X tw age inc fsize educ db marr twoearn pira hown"}{p_end} 20 | {phang2}. {stata "set seed 42"}{p_end} 21 | {phang2}. {stata "ddml init partial, kfolds(2) reps(2) mname(m_sip1991) prefix"}{p_end} 22 | {phang2}. {stata "ddml E[Y|X], mname(m_sip1991): pystacked $Y $X"}{p_end} 23 | {phang2}. {stata "ddml E[D|X], mname(m_sip1991): pystacked $D $X"}{p_end} 24 | {phang2}. {stata "ddml crossfit, mname(m_sip1991)"}{p_end} 25 | {phang2}. {stata "ddml estimate, mname(m_sip1991)"}{p_end} 26 | 27 | {pstd}It will often be a good idea to include an ID variable that identifies the observation number. 28 | This dataset doesn't include an ID variable, so we create one.{p_end} 29 | 30 | {phang2}. {stata "gen long m_sip1991_id = _n"}{p_end} 31 | 32 | {pstd}To include the ID variable with everything else, we use the {opt addvars(.)} option. 33 | The data will be exported to a CSV file called "m_ddml_sip1991.csv".{p_end} 34 | 35 | {phang2}. {stata "ddml export using m_ddml_sip1991.csv, mname(m_sip1991) replace addvars(m_sip1991_id)"}{p_end} 36 | 37 | {pstd}Lists of saved {opt ddml} estimated conditional expectations with and without resample numbers, 38 | saved in r(.) macros:{p_end} 39 | 40 | {phang2}. {stata `"di "`r(vreplist)'""'}{p_end} 41 | {phang2}. {stata `"di "`r(vlist)'""'}{p_end} 42 | -------------------------------------------------------------------------------- /_ddml_export.ado: -------------------------------------------------------------------------------- 1 | *! ddml v1.4.4 2 | *! last edited: 30aug2024 3 | *!
authors: aa/ms 4 | 5 | program _ddml_export, rclass 6 | version 16 7 | 8 | syntax , mname(name) fname(string) [ addvars(varlist) * ] 9 | 10 | // blank eqn - declare this way so that it's a struct and not transmorphic 11 | // used multiple times below 12 | tempname eqn 13 | mata: `eqn' = init_eStruct() 14 | 15 | *** extract details of estimation 16 | mata: model_chars(`mname') 17 | local nreps = r(nreps) 18 | local numeqnD = r(numeqnD) 19 | local numeqnZ = r(numeqnZ) 20 | 21 | // fold IDs 22 | forvalues m=1/`nreps' { 23 | local fidlist `fidlist' `mname'_fid_`m' 24 | } 25 | 26 | // check 27 | if r(crossfitted)==0 { 28 | di as err "error - model not yet crossfitted, no variables to export" 29 | exit 198 30 | } 31 | 32 | // collect names of Y variables 33 | local vlist `r(Y)' `r(Y_L)' 34 | 35 | // collect names of D variables 36 | forvalues i=1/`numeqnD' { 37 | local vlist `vlist' `r(D`i')' `r(D`i'_L)' `r(D`i'_h)' 38 | } 39 | // collect names of Z variables 40 | forvalues i=1/`numeqnZ' { 41 | local vlist `vlist' `r(Z`i')' `r(Z`i'_L)' 42 | } 43 | 44 | // add rep numbers 45 | foreach vn in `vlist' { 46 | forvalues m=1/`nreps' { 47 | local vreplist `vreplist' `vn'_`m' 48 | } 49 | } 50 | 51 | // some vars may not exist (e.g. pystacked with no std stacking); remove these 52 | foreach vn in `vreplist' { 53 | cap confirm variable `vn', exact 54 | if _rc==0 local tlist `tlist' `vn' 55 | } 56 | local vreplist `tlist' 57 | 58 | // preserve, drop unneeded vars, rename, export, restore 59 | preserve 60 | 61 | keep `addvars' `mname'_sample* `fidlist' `vreplist' 62 | order `addvars' `mname'_sample* `fidlist' `vreplist' 63 | export delimited using "`fname'", `options' 64 | 65 | restore 66 | 67 | fvexpand `addvars' `mname'_sample* `fidlist' `vreplist' 68 | 69 | local numvars : word count `r(varlist)' 70 | return scalar numvars = `numvars' 71 | return local vlist `vlist' 72 | return local vreplist `vreplist' 73 | 74 | end 75 | 76 | ******************************************************************************** 77 | *** Mata section *** 78 | ******************************************************************************** 79 | 80 | mata: 81 | 82 | 83 | end 84 | -------------------------------------------------------------------------------- /_ddml_save.ado: -------------------------------------------------------------------------------- 1 | *! ddml v1.4.4 2 | *! last edited: 30aug2024 3 | *! 
authors: aa/ms 4 | 5 | program _ddml_save 6 | version 13 7 | 8 | syntax , mname(name) fname(string) [ replace ] 9 | 10 | // blank eqn - declare this way so that it's a struct and not transmorphic 11 | tempname eqn 12 | mata: `eqn' = init_eStruct() 13 | 14 | // locals used below 15 | mata: st_local("model",`mname'.model) 16 | 17 | mata: st_local("nameY",`mname'.nameY) 18 | mata: st_local("nameD",invtokens(`mname'.nameD)) 19 | mata: st_local("nameZ",invtokens(`mname'.nameZ)) 20 | local numeqnD : word count `nameD' 21 | local numeqnZ : word count `nameZ' 22 | 23 | mata: `eqn' = (`mname'.eqnAA).get("`nameY'") 24 | mata: st_local("vtlistY",invtokens(`eqn'.vtlist)) 25 | local vtlist `vtlistY' 26 | 27 | if `numeqnD' { 28 | foreach var of varlist `nameD' { 29 | mata: `eqn' = (`mname'.eqnAA).get("`var'") 30 | mata: st_local("lieflag",strofreal(`eqn'.lieflag)) 31 | mata: st_local("vtlistD",invtokens(`eqn'.vtlist)) 32 | local vtlist `vtlist' `vtlistD' 33 | mata: st_local("lieflag",strofreal(`eqn'.lieflag)) 34 | if `lieflag' { 35 | foreach vn in `vtlistD' { 36 | local vtlistD_h `vtlistD_h' `vn'_h 37 | } 38 | local vtlist `vtlist' `vtlistD_h' 39 | } 40 | } 41 | } 42 | 43 | if `numeqnZ' { 44 | foreach var of varlist `nameZ' { 45 | mata: `eqn' = (`mname'.eqnAA).get("`var'") 46 | mata: st_local("vtlistZ",invtokens(`eqn'.vtlist)) 47 | local vtlist `vtlist' `vtlistZ' 48 | } 49 | } 50 | 51 | // Add wildcards, add prefixed variables, then unabbreviate 52 | foreach var in `vtlist' { 53 | local evtlist `evtlist' `var'* 54 | } 55 | local evtlist `mname'_* `evtlist' 56 | unab evtlist : `evtlist' 57 | 58 | // insert onto model struct 59 | mata: `mname'.strDatavars = "`evtlist'" 60 | mata: `mname'.matDatavars = st_data(., "`evtlist'") 61 | 62 | if "`replace'"~="" { 63 | // Mata function to delete file 64 | mata: unlink("`fname'") 65 | } 66 | mata: save_model("`fname'",`mname') 67 | 68 | // clear from model struct 69 | mata: `mname'.strDatavars = "" 70 | mata: `mname'.matDatavars = J(0,0,.) 71 | 72 | end 73 | 74 | mata: 75 | 76 | void save_model( string scalar fname, 77 | struct mStruct m) 78 | { 79 | fh = fopen(fname,"w") 80 | fputmatrix(fh,m) 81 | fclose(fh) 82 | } 83 | 84 | end 85 | -------------------------------------------------------------------------------- /ddml_describe.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {viewerjumpto "Syntax" "ddml_describe##syntax"}{...} 4 | {viewerjumpto "Examples" "ddml_describe##examples"}{...} 5 | {viewerjumpto "Installation" "ddml_describe##installation"}{...} 6 | {viewerjumpto "References" "ddml_describe##references"}{...} 7 | {viewerjumpto "Authors" "ddml_describe##authors"}{...} 8 | {vieweralsosee "ddml main page" "ddml"}{...} 9 | {vieweralsosee "Other" "ddml_describe##also_see"}{...} 10 | {hline} 11 | {cmd:help ddml describe}{right: v1.4.4} 12 | {hline} 13 | 14 | {title:ddml describe utility for Double Debiased Machine Learning} 15 | 16 | {pstd} 17 | {opt ddml} implements algorithms for causal inference aided by supervised 18 | machine learning as proposed in 19 | {it:Double/debiased machine learning for treatment and structural parameters} 20 | (Econometrics Journal, 2018). Five different models are supported, allowing for 21 | binary or continuous treatment variables and endogeneity, high-dimensional 22 | controls and/or instrumental variables. 23 | 24 | {pstd} 25 | {opt ddml describe} provides information about the model setup and/or results in detail.
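{pstd}For instance, a quick check of a cross-fitted model stored under the default name {it:m0} (a minimal illustration; the Examples section below gives a complete walk-through):{p_end} {phang2}. {stata "ddml describe, all"}{p_end}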
26 | 27 | {marker syntax}{...} 28 | {title:Syntax} 29 | 30 | {p 8 14}{cmd:ddml describe} 31 | [ , {opt mname(name)} 32 | {opt sample} 33 | {opt learners} 34 | {opt crossfit} 35 | {opt estimates} 36 | {opt all} ] 37 | 38 | {synoptset 20}{...} 39 | {synopthdr:options} 40 | {synoptline} 41 | {synopt:{opt mname(name)}} 42 | name of the DDML model. Allows running multiple DDML 43 | models simultaneously. Defaults to {it:m0}. 44 | {p_end} 45 | {synopt:{opt sample}} 46 | information about the estimation sample, folds, etc. 47 | {p_end} 48 | {synopt:{opt learners}} 49 | information about the different learners used to estimate conditional expectations. 50 | {p_end} 51 | {synopt:{opt crossfit}} 52 | information about results of the cross-fitting step. 53 | {p_end} 54 | {synopt:{opt estimates}} 55 | information about the estimation results. 56 | {p_end} 57 | {synopt:{opt all}} 58 | equivalent to {opt sample} + {opt learners} + {opt crossfit} + {opt estimates}. 59 | {p_end} 60 | {synoptline} 61 | {p2colreset}{...} 62 | {pstd} 63 | 64 | 65 | {marker examples}{...} 66 | {title:Examples} 67 | 68 | {smcl} 69 | INCLUDE help ddml_example_describe.sthlp 70 | 71 | 72 | {smcl} 73 | INCLUDE help ddml_install_ref_auth 74 | -------------------------------------------------------------------------------- /_ddml_copy.ado: -------------------------------------------------------------------------------- 1 | *! ddml v1.4.4 2 | *! last edited: 30aug2024 3 | *! authors: aa/ms 4 | 5 | program _ddml_copy 6 | version 13 7 | 8 | syntax , mname(name) newmname(name) 9 | 10 | // blank eqn - declare this way so that it's a struct and not transmorphic 11 | // used multiple times below 12 | tempname eqn 13 | mata: `eqn' = init_eqnStruct() 14 | 15 | tempfile fname 16 | mata: save_model("`fname'",`mname') 17 | 18 | mata: `newmname' = use_model("`fname'") 19 | 20 | mata: st_local("numeqns",strofreal(cols(`mname'.eqnlist))) 21 | 22 | *** create id and fold id 23 | cap drop `newmname'_id 24 | cap drop `newmname'_fid 25 | mata: st_numscalar("r(nobs)",rows(`mname'.id)) 26 | if r(nobs) > _N { 27 | set obs `r(nobs)' 28 | } 29 | qui gen double `newmname'_id = . 30 | qui gen byte `newmname'_sample = . 31 | mata: st_store( ., ("`newmname'_id", "`newmname'_sample"), (`newmname'.idSample)) 32 | // id and sample variables always exist, fold ID may not 33 | mata: st_numscalar("r(ncols)",cols(`newmname'.idFold)) 34 | if r(ncols) > 0 { 35 | qui gen double `newmname'_fid = . 36 | mata: st_store( ., ("`newmname'_fid"), (`newmname'.idFold[.,2])) 37 | } 38 | 39 | *** loop through equations and create Stata variables 40 | // note that variables may not exist 41 | forvalues i=1/`numeqns' { 42 | mata: `eqn'=*(`newmname'.eqnlist[1,`i']) 43 | mata: st_local("vtilde",`eqn'.Vtilde) 44 | cap drop `newmname'_`vtilde' 45 | mata: st_numscalar("r(ncols)",cols(`eqn'.idVtilde)) 46 | if r(ncols) > 0 { 47 | qui gen double `newmname'_`vtilde' = .
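// copy the values saved in the eqn struct (column 2 of idVtilde) into the new variable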
48 | mata: st_store( ., ("`newmname'_`vtilde'"), (`eqn'.idVtilde)[.,2]) 49 | } 50 | } 51 | 52 | end 53 | 54 | ******************************************************************************** 55 | *** Mata section *** 56 | ******************************************************************************** 57 | 58 | mata: 59 | 60 | struct eqnStruct init_eqnStruct() 61 | { 62 | struct eqnStruct scalar e 63 | return(e) 64 | } 65 | 66 | void save_model( string scalar fname, 67 | struct ddmlStruct m) 68 | { 69 | fh = fopen(fname,"w") 70 | fputmatrix(fh,m) 71 | fclose(fh) 72 | } 73 | 74 | struct ddmlStruct use_model( string scalar fname) 75 | { 76 | struct ddmlStruct scalar m 77 | fh = fopen(fname,"r") 78 | m = fgetmatrix(fh,1) // nonzero second argument required for "strict" 79 | fclose(fh) 80 | return(m) 81 | } 82 | 83 | end 84 | -------------------------------------------------------------------------------- /ddml_example_interactive_pystacked_detailed.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {smcl} 4 | {pstd}{ul:Interactive model - Detailed example with {help pystacked}}{p_end} 5 | 6 | {pstd}Preparation: we load the data, define global macros and set the seed.{p_end} 7 | 8 | {phang2}. {stata "webuse cattaneo2, clear"}{p_end} 9 | {phang2}. {stata "global Y bweight"}{p_end} 10 | {phang2}. {stata "global D mbsmoke"}{p_end} 11 | {phang2}. {stata "global X prenatal1 mmarried fbaby mage medu"}{p_end} 12 | {phang2}. {stata "set seed 42"}{p_end} 13 | 14 | {pstd}We use 5 folds and 5 resamplings; that is, 15 | we estimate the model 5 times using randomly chosen folds.{p_end} 16 | 17 | {phang2}. {stata "ddml init interactive, kfolds(5) reps(5)"}{p_end} 18 | 19 | {pstd}We need to estimate the conditional expectations of E[Y|X,D=0], 20 | E[Y|X,D=1] and E[D|X]. The first two conditional expectations 21 | are added jointly.{p_end} 22 | {pstd}We consider two supervised learners: linear regression and gradient boosted 23 | trees, stacked using {helpb pystacked}. 24 | Note that we use gradient boosted regression trees for E[Y|X,D], but 25 | gradient boosted classification trees for E[D|X].{p_end} 26 | 27 | {phang2}. {stata "ddml E[Y|X,D]: pystacked $Y $X, type(reg) methods(ols gradboost)"}{p_end} 28 | {phang2}. {stata "ddml E[D|X]: pystacked $D $X, type(class) methods(logit gradboost)"}{p_end} 29 | 30 | {pstd}Cross-fitting and short-stacking:{p_end} 31 | 32 | {phang2}. {stata "ddml crossfit, shortstack"}{p_end} 33 | 34 | {pstd}In the final estimation step, we can estimate 35 | the average treatment effect (the default), 36 | the average treatment effect on the treated ({opt atet}), 37 | or the average treatment effect on the untreated ({opt ateu}).{p_end} 38 | 39 | {phang2}. {stata "ddml estimate"}{p_end} 40 | {phang2}. {stata "ddml estimate, atet"}{p_end} 41 | {phang2}. {stata "ddml estimate, ateu"}{p_end} 42 | 43 | {pstd}Recall that we have specified 5 resampling iterations ({opt reps(5)}). 44 | By default, the median over short-stacked resampling iterations is shown. 45 | At the bottom, a table of summary statistics over resampling iterations is shown. 46 | To display the mean over standard stacking results, i.e., 47 | the results where the weights derive from {helpb pystacked} and vary by cross-fit fold, 48 | we use {opt ddml estimate, replay} with {opt spec(st)} and {opt rep(mn)}.{p_end} 49 | 50 | {phang2}.
{stata "ddml estimate, spec(st) rep(mn) notable replay"}{p_end} 51 | 52 | {pstd}Generate an overlap plot using {opt ddml overlap}:{p_end} 53 | 54 | {phang2}. {stata "ddml overlap"}{p_end} 55 | 56 | {pstd}Report the standard stacking and short-stacking weights:{p_end} 57 | 58 | {phang2}. {stata "ddml extract, show(stweights)"}{p_end} 59 | {phang2}. {stata "ddml extract, show(ssweights)"}{p_end} 60 | -------------------------------------------------------------------------------- /ddml_install_ref_auth.ihlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 27july2023}{...} 3 | {smcl} 4 | {marker installation}{title:Installation} 5 | 6 | {pstd} 7 | To verify that {opt ddml} and {opt qddml} are correctly installed, 8 | click on or type {stata "whichpkg ddml"} 9 | (which requires {helpb whichpkg} 10 | to be installed; {stata "ssc install whichpkg"}). 11 | 12 | {pstd} 13 | To get the latest stable versions of {opt ddml} and {opt qddml} from our website, 14 | check the installation instructions at {browse "https://statalasso.github.io/docs/ddml/installation/"}. 15 | We update the stable website version more frequently than the SSC version. 16 | 17 | 18 | {marker references}{title:References} 19 | 20 | {marker Ahrens2023pystacked}{...} 21 | {pstd} 22 | Ahrens, A., Hansen, C. B., & Schaffer, M. E. (2023). 23 | pystacked: Stacking generalization and machine learning in Stata. 24 | The Stata Journal, 23(4), 909-931. 25 | {browse "https://doi.org/10.1177/1536867X231212426"} 26 | 27 | {marker Ahrens2024stacking}{...} 28 | {pstd} 29 | Ahrens, A., Hansen, C. B., Schaffer, M. E., & Wiemann, T. (2024a). 30 | Model averaging and double machine learning. 31 | arXiv:2401.01645. 32 | {browse "https://arxiv.org/abs/2401.01645"} 33 | 34 | {marker Ahrens2024ddml}{...} 35 | {pstd} 36 | Ahrens, A., Hansen, C. B., Schaffer, M. E., & Wiemann, T. (2024b). 37 | ddml: Double/debiased machine learning in Stata. 38 | {it:The Stata Journal}, 24(1), 3-45. 39 | {browse "https://doi.org/10.1177/1536867X241233641"} 40 | 41 | {marker Chern2018}{...} 42 | {pstd} 43 | Chernozhukov, V., Chetverikov, D., Demirer, M., 44 | Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), 45 | Double/debiased machine learning for 46 | treatment and structural parameters. 47 | {it:The Econometrics Journal}, 21: C1-C68. 48 | {browse "https://doi.org/10.1111/ectj.12097"} 49 | 50 | {marker Hastie2009}{...} 51 | {pstd} 52 | Hastie, T., Tibshirani, R., & Friedman, J. (2009). 53 | The elements of statistical learning: data mining, inference, 54 | and prediction. Springer Science & Business Media. 55 | 56 | {marker Wolpert1992}{...} 57 | {pstd} 58 | Wolpert, David H. Stacked generalization. 59 | {it:Neural networks} 5.2 (1992): 241-259. 60 | {browse "https://doi.org/10.1016/S0893-6080(05)80023-1"} 61 | 62 | 63 | {marker authors}{title:Authors} 64 | 65 | {pstd} 66 | Achim Ahrens, Public Policy Group, ETH Zurich, Switzerland {break} 67 | achim.ahrens@gess.ethz.ch 68 | 69 | {pstd} 70 | Christian B. 
Hansen, University of Chicago, USA {break} 71 | Christian.Hansen@chicagobooth.edu 72 | 73 | {pstd} 74 | Mark E Schaffer, Heriot-Watt University, UK {break} 75 | m.e.schaffer@hw.ac.uk 76 | 77 | {pstd} 78 | Thomas Wiemann, University of Chicago, USA {break} 79 | wiemann@uchicago.edu 80 | 81 | 82 | {marker also_see}{title:Also see (if installed)} 83 | 84 | {pstd} 85 | Help: 86 | {helpb pystacked}, 87 | {helpb lasso2}, 88 | {helpb cvlasso}, 89 | {helpb rlasso}, 90 | {helpb ivlasso}, 91 | {helpb pdslasso}.{p_end} 92 | -------------------------------------------------------------------------------- /ddml_overlap.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {viewerjumpto "Syntax" "ddml_overlap##syntax"}{...} 4 | {viewerjumpto "Examples" "ddml_overlap##examples"}{...} 5 | {viewerjumpto "Installation" "ddml_overlap##installation"}{...} 6 | {viewerjumpto "References" "ddml_overlap##references"}{...} 7 | {viewerjumpto "Authors" "ddml_overlap##authors"}{...} 8 | {vieweralsosee "ddml main page" "ddml"}{...} 9 | {vieweralsosee "ddml interactive" "ddml interactive"}{...} 10 | {vieweralsosee "Other" "ddml_overlap##also_see"}{...} 11 | {hline} 12 | {cmd:help ddml overlap}{right: v1.4.4} 13 | {hline} 14 | 15 | {title:ddml overlap commands for Double Debiased Machine Learning} 16 | 17 | {pstd} 18 | {opt ddml} implements algorithms for causal inference aided by supervised 19 | machine learning as proposed in 20 | {it:Double/debiased machine learning for treatment and structural parameters} 21 | (Econometrics Journal, 2018). Five different models are supported, allowing for 22 | binary or continuous treatment variables and endogeneity, high-dimensional 23 | controls and/or instrumental variables. 24 | 25 | {pstd} 26 | {cmd:ddml overlap} reports overlap plots following estimation of the {opt ddml} {it:interactive} and {it:interactiveiv} models. 27 | One overlap (line) plot of propensity scores is reported for each treatment variable learner; 28 | by default, propensity scores for all crossfit samples are plotted. 29 | Overlap plots for the treatment variables are combined using {helpb graph combine}. 30 | 31 | {marker syntax}{...} 32 | {title:Syntax} 33 | 34 | {p 8 14}{cmd:ddml overlap} [ {opt mname(name)} {opt replist(numlist)} {opt pslist(namelist)} {opt n(integer)} {opt kernel(name)} 35 | {opt name(name [, replace])} {opt title(string)} {opt subtitle(string)} {opt lopt0(string)} 36 | {opt lopt1(string)}{bind: ]} 37 | 38 | {synoptset 20}{...} 39 | {synopthdr:Options} 40 | {synoptline} 41 | {synopt:{opt mname(name)}} 42 | name of the DDML model. Defaults to {it:m0}. 43 | {p_end} 44 | {synopt:{opt replist(numlist)}} 45 | list of crossfitting resamples to plot. Defaults to all. 46 | {p_end} 47 | {synopt:{opt pslist(namelist)}} 48 | varnames of propensity scores to plot (excluding the resample number). Defaults to all. 49 | {p_end} 50 | {synopt:{opt n(integer)}} 51 | see {helpb teffects overlap}. 52 | {p_end} 53 | {synopt:{opt kernel(name)}} 54 | see {helpb teffects overlap}. 55 | {p_end} 56 | {synopt:{opt name(name)}} 57 | see {helpb graph combine}. 58 | {p_end} 59 | {synopt:{opt title(string)}} 60 | see {helpb graph combine}. 61 | {p_end} 62 | {synopt:{opt subtitle(string)}} 63 | see {helpb graph combine}. 64 | {p_end} 65 | {synopt:{opt lopt0(string)}} 66 | options for line plot of untreated; default is solid/navy; see {helpb line}. 
67 | {p_end} 68 | {synopt:{opt lopt1(string)}} 69 | options for line plot of treated; default is short dash/dark orange; see {helpb line}. 70 | {p_end} 71 | {synoptline} 72 | {p2colreset}{...} 73 | {pstd} 74 | 75 | 76 | {title:Examples} 77 | 78 | {smcl} 79 | INCLUDE help ddml_example_overlap.sthlp 80 | 81 | 82 | {smcl} 83 | INCLUDE help ddml_install_ref_auth 84 | -------------------------------------------------------------------------------- /ddml_interactive.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {viewerjumpto "Examples" "ddml_interactive##examples"}{...} 4 | {viewerjumpto "Installation" "ddml_interactive##installation"}{...} 5 | {viewerjumpto "References" "ddml_interactive##references"}{...} 6 | {viewerjumpto "Authors" "ddml_interactive##authors"}{...} 7 | {vieweralsosee "ddml main page" "ddml"}{...} 8 | {vieweralsosee "Other" "ddml_interactive##also_see"}{...} 9 | {hline} 10 | {cmd:help ddml interactive}{right: v1.4.4} 11 | {hline} 12 | 13 | {title:ddml - estimation of the interactive (ATE, ATET) model in Double Debiased Machine Learning} 14 | 15 | {pstd} 16 | {opt ddml} implements algorithms for causal inference aided by supervised 17 | machine learning as proposed in 18 | {it:Double/debiased machine learning for treatment and structural parameters} 19 | (Econometrics Journal, 2018). Five different models are supported, allowing for 20 | binary or continuous treatment variables and endogeneity, high-dimensional 21 | controls and/or instrumental variables. 22 | 23 | {pstd} 24 | {opt ddml} supports a variety of different ML programs, including 25 | but not limited to {help pystacked} and {help lassopack}. 26 | {help pystacked} is the recommended way to specify multiple learners in {opt ddml}, 27 | and {opt ddml} has integrated support for various features provided by {help pystacked}. 28 | 29 | {pstd} 30 | The {opt ddml} package also includes the wrapper program {help qddml}, 31 | which uses a simplified one-line syntax, but offers less flexibility. 32 | 33 | {pstd} 34 | This help file illustrates usage of the {ul:interactive model} 35 | used to obtain estimates of the ATE (average treatment effect) 36 | and ATET (average treatment effect on the treated). 37 | For examples of other models, 38 | follow the links in the main {help ddml:ddml help file}. 39 | 40 | {pstd} 41 | We use {it:Y} to denote the outcome variable, 42 | {it:X} to denote confounders, and 43 | {it:D} to denote the treatment variable(s) of interest. 44 | 45 | {pstd} 46 | {ul:Interactive model} [{it:interactive}] 47 | 48 | Y = g(X,D) + U 49 | D = m(X) + V 50 | 51 | {pstd} 52 | which (compared to the {help ddml partial:partially-linear model}) 53 | relaxes the assumption that X and D are separable. 54 | D is a binary treatment variable. 55 | We estimate, using a supervised machine 56 | learner, the following conditional expectations: 57 | {p_end} 58 | {phang2}1. E[Y|X,D=0] and E[Y|X,D=1], jointly added using {cmd:ddml E[Y|X,D]}{p_end} 59 | {phang2}2. E[D|X], added using {cmd:ddml E[D|X]}{p_end} 60 | 61 | 62 | {marker examples}{...} 63 | {title:Examples} 64 | 65 | {pstd} 66 | Below we demonstrate the use of {cmd:ddml} for the interactive model. 67 | Note that estimation models are chosen for demonstration purposes only and 68 | may be kept simple to allow you to run the code quickly. 69 | 70 | {pstd}{help ddml interactive##pystacked_basic:1. 
Basic example of the interactive model (ATE, ATET) with pystacked}{p_end} 71 | {pstd}{help ddml interactive##pystacked_detailed:2. Detailed example of the interactive model (ATE, ATET) with pystacked}{p_end} 72 | 73 | {marker pystacked_basic} 74 | {smcl} 75 | INCLUDE help ddml_example_interactive_pystacked_basic.sthlp 76 | 77 | {marker pystacked_detailed} 78 | {smcl} 79 | INCLUDE help ddml_example_interactive_pystacked_detailed.sthlp 80 | 81 | 82 | {smcl} 83 | INCLUDE help ddml_install_ref_auth 84 | -------------------------------------------------------------------------------- /ddml_interactiveiv.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {viewerjumpto "Examples" "ddml_interactiveiv##examples"}{...} 4 | {viewerjumpto "Installation" "ddml_interactiveiv##installation"}{...} 5 | {viewerjumpto "References" "ddml_interactiveiv##references"}{...} 6 | {viewerjumpto "Authors" "ddml_interactiveiv##authors"}{...} 7 | {vieweralsosee "ddml main page" "ddml"}{...} 8 | {vieweralsosee "Other" "ddml_interactiveiv##also_see"}{...} 9 | {hline} 10 | {cmd:help ddml interactiveiv}{right: v1.4.4} 11 | {hline} 12 | 13 | {title:ddml - estimation of the interactive IV (LATE) model in Double Debiased Machine Learning} 14 | 15 | {pstd} 16 | {opt ddml} implements algorithms for causal inference aided by supervised 17 | machine learning as proposed in 18 | {it:Double/debiased machine learning for treatment and structural parameters} 19 | (Econometrics Journal, 2018). Five different models are supported, allowing for 20 | binary or continuous treatment variables and endogeneity, high-dimensional 21 | controls and/or instrumental variables. 22 | 23 | {pstd} 24 | {opt ddml} supports a variety of different ML programs, including 25 | but not limited to {help pystacked} and {help lassopack}. 26 | {help pystacked} is the recommended way to specify multiple learners in {opt ddml}, 27 | and {opt ddml} has integrated support for various features provided by {help pystacked}. 28 | 29 | {pstd} 30 | The {opt ddml} package also includes the wrapper program {help qddml}, 31 | which uses a simplified one-line syntax, but offers less flexibility. 32 | 33 | {pstd} 34 | This help file illustrates usage of the {ul:interactive IV model} 35 | used to obtain estimates of the LATE (local average treatment effect). 36 | For examples of other models, 37 | follow the links in the main {help ddml:ddml help file}. 38 | 39 | {pstd} 40 | We use {it:Y} to denote the outcome variable, 41 | {it:X} to denote confounders, {it:Z} to denote a binary instrument, and 42 | {it:D} to denote the binary treatment variable of interest. 43 | 44 | {pstd} 45 | {ul:Interactive IV model} [{it:interactiveiv}] 46 | 47 | Y = g(Z,X) + U 48 | D = h(Z,X) + V 49 | Z = m(X) + E 50 | 51 | {pstd} 52 | where the aim is to estimate the local average treatment effect (LATE). 53 | We estimate, using a supervised machine 54 | learner, the following conditional expectations: 55 | {p_end} 56 | {phang2}1. E[Y|X,Z=0] and E[Y|X,Z=1], jointly added using {cmd:ddml E[Y|X,Z]}{p_end} 57 | {phang2}2. E[D|X,Z=0] and E[D|X,Z=1], jointly added using {cmd:ddml E[D|X,Z]}{p_end} 58 | {phang2}3. E[Z|X], added using {cmd:ddml E[Z|X]}{p_end} 59 | 60 | 61 | {marker examples}{...} 62 | {title:Examples} 63 | 64 | {pstd} 65 | Below we demonstrate the use of {cmd:ddml} for the interactive IV model. 66 | Note that estimation models are chosen for demonstration purposes only and 67 | may be kept simple to allow you to run the code quickly. 
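{pstd}As a quick orientation before the full examples, here is a minimal sketch of the complete sequence 
(illustrative only: $Y, $D, $Z and $X are placeholder globals for the outcome, binary treatment, 
binary instrument and controls, and the learner settings shown are assumptions, not recommendations):{p_end}

{phang2}. ddml init interactiveiv{p_end}
{phang2}. ddml E[Y|X,Z]: pystacked $Y $X, type(reg){p_end}
{phang2}. ddml E[D|X,Z]: pystacked $D $X, type(class){p_end}
{phang2}. ddml E[Z|X]: pystacked $Z $X, type(class){p_end}
{phang2}. ddml crossfit{p_end}
{phang2}. ddml estimate{p_end}

{pstd}The linked examples below run this same sequence on real data and discuss the output in detail.{p_end}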
68 | 69 | {pstd}{help ddml interactiveiv##pystacked_basic:1. Basic example of the interactive IV model (LATE) with pystacked}{p_end} 70 | {pstd}{help ddml interactiveiv##pystacked_detailed:2. Detailed example of the interactive IV model (LATE) with pystacked}{p_end} 71 | 72 | {marker pystacked_basic} 73 | {smcl} 74 | INCLUDE help ddml_example_interactiveiv_pystacked_basic.sthlp 75 | 76 | {marker pystacked_detailed} 77 | {smcl} 78 | INCLUDE help ddml_example_interactiveiv_pystacked_detailed.sthlp 79 | 80 | 81 | {smcl} 82 | INCLUDE help ddml_install_ref_auth 83 | -------------------------------------------------------------------------------- /_ddml_use.ado: -------------------------------------------------------------------------------- 1 | *! ddml v1.4.4 2 | *! last edited: 30aug2024 3 | *! authors: aa/ms 4 | 5 | program _ddml_use 6 | version 13 7 | 8 | syntax , mname(name) fname(string) [ replace ] 9 | 10 | // blank eqn - declare this way so that it's a struct and not transmorphic 11 | // used multiple times below 12 | tempname eqn 13 | mata: `eqn' = init_eStruct() 14 | 15 | // does model already exist? 16 | mata: st_local("isnull",strofreal(findexternal("`mname'")==NULL)) 17 | 18 | if `isnull' | "`replace'"=="replace" { 19 | mata: `mname' = use_model("`fname'") 20 | } 21 | else { 22 | di as err "error - `mname' already exists in Mata memory; use -replace- option" 23 | exit 198 24 | } 25 | 26 | /* 27 | *** extract details of estimation 28 | 29 | // model 30 | mata: st_local("model",`mname'.model) 31 | di "Model: `model'" 32 | mata: st_local("numeqns",strofreal(cols(`mname'.eqnlist))) 33 | mata: st_local("numeqnsY",strofreal(cols(`mname'.nameYtilde))) 34 | mata: st_local("numeqnsD",strofreal(cols(`mname'.nameDtilde))) 35 | mata: st_local("numeqnsZ",strofreal(cols(`mname'.nameZtilde))) 36 | mata: st_local("nameY",`mname'.nameY) 37 | mata: st_local("listYtilde",invtokens(`mname'.nameYtilde)) 38 | di "Number of Y estimating equations: `numeqnsY'" 39 | if `numeqnsD' { 40 | mata: st_local("listD",invtokens(`mname'.nameD)) 41 | mata: st_local("listDtilde",invtokens(`mname'.nameDtilde)) 42 | di "Number of D estimating equations: `numeqnsD'" 43 | } 44 | if `numeqnsZ' { 45 | mata: st_local("listZ",invtokens(`mname'.nameZ)) 46 | mata: st_local("listZtilde",invtokens(`mname'.nameZtilde)) 47 | di "Number of Z estimating equations: `numeqnsZ'" 48 | } 49 | 50 | *** create id and fold id 51 | cap drop `mname'_id 52 | cap drop `mname'_fid 53 | mata: st_numscalar("r(nobs)",rows(`mname'.id)) 54 | if r(nobs) > _N { 55 | set obs `r(nobs)' 56 | } 57 | qui gen double `mname'_id = . 58 | // id variable always exists, fold ID may not 59 | mata: st_numscalar("r(ncols)",cols(`mname'.idFold)) 60 | if r(ncols) > 0 { 61 | qui gen double `mname'_fid = . 62 | mata: st_store( ., ("`mname'_id", "`mname'_fid"), (`mname'.idFold)) 63 | } 64 | else { 65 | mata: st_store( ., ("`mname'_id"), (`mname'.id)) 66 | } 67 | 68 | *** loop through equations and create Stata variables 69 | // note that variables may not exist 70 | forvalues i=1/`numeqns' { 71 | mata: `eqn'=*(`mname'.eqnlist[1,`i']) 72 | mata: st_local("vtilde",`eqn'.Vtilde) 73 | cap drop `mname'_`vtilde' 74 | mata: st_numscalar("r(ncols)",cols(`eqn'.idVtilde)) 75 | if r(ncols) > 0 { 76 | qui gen double `mname'_`vtilde' = . 
77 | mata: st_store( ., ("`mname'_`vtilde'"), (`eqn'.idVtilde)[.,2]) 78 | } 79 | } 80 | */ 81 | 82 | end 83 | 84 | ******************************************************************************** 85 | *** Mata section *** 86 | ******************************************************************************** 87 | 88 | mata: 89 | 90 | struct mStruct use_model(string scalar fname) 91 | { 92 | struct mStruct scalar m 93 | 94 | if (!fileexists(fname)) { 95 | errprintf("file %s not found\n", fname) 96 | exit(601) 97 | } 98 | 99 | fh = fopen(fname,"r") 100 | m = fgetmatrix(fh,1) // nonzero second argument required for "strict" 101 | fclose(fh) 102 | return(m) 103 | } 104 | 105 | end 106 | -------------------------------------------------------------------------------- /ddml_partial.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {viewerjumpto "Examples" "ddml_partial##examples"}{...} 4 | {viewerjumpto "Installation" "ddml_partial##installation"}{...} 5 | {viewerjumpto "References" "ddml_partial##references"}{...} 6 | {viewerjumpto "Authors" "ddml_partial##authors"}{...} 7 | {vieweralsosee "ddml main page" "ddml"}{...} 8 | {vieweralsosee "Other" "ddml_partial##also_see"}{...} 9 | {hline} 10 | {cmd:help ddml partial}{right: v1.4.4} 11 | {hline} 12 | 13 | {title:ddml - estimation of the partially-linear model in Double Debiased Machine Learning} 14 | 15 | {pstd} 16 | {opt ddml} implements algorithms for causal inference aided by supervised 17 | machine learning as proposed in 18 | {it:Double/debiased machine learning for treatment and structural parameters} 19 | (Econometrics Journal, 2018). Five different models are supported, allowing for 20 | binary or continuous treatment variables and endogeneity, high-dimensional 21 | controls and/or instrumental variables. 22 | 23 | {pstd} 24 | {opt ddml} supports a variety of different ML programs, including 25 | but not limited to {help pystacked} and {help lassopack}. 26 | {help pystacked} is the recommended way to specify multiple learners in {opt ddml}, 27 | and {opt ddml} has integrated support for various features provided by {help pystacked}. 28 | 29 | {pstd} 30 | The {opt ddml} package also includes the wrapper program {help qddml}, 31 | which uses a simplified one-line syntax, but offers less flexibility. 32 | 33 | {pstd} 34 | This help file illustrates usage of the {ul:partially-linear model}. 35 | For examples of other models, 36 | follow the links in the main {help ddml:ddml help file}. 37 | 38 | {pstd} 39 | We use {it:Y} to denote the outcome variable, 40 | {it:X} to denote confounders, and 41 | {it:D} to denote the treatment variable(s) of interest. 42 | 43 | {pstd} 44 | {ul:Partially-linear model} [{it:partial}] 45 | 46 | Y = {it:a}.D + g(X) + U 47 | D = m(X) + V 48 | 49 | {pstd} 50 | where the aim is to estimate {it:a} while controlling for X. To this end, 51 | we estimate the conditional expectations 52 | E[Y|X] and E[D|X] using a supervised machine learner. 53 | 54 | 55 | {marker examples}{...} 56 | {title:Examples} 57 | 58 | {pstd} 59 | Below we demonstrate the use of {cmd:ddml} for the partially-linear model. 60 | Note that estimation models are chosen for demonstration purposes only and 61 | may be kept simple to allow you to run the code quickly. 62 | 63 | {pstd}{help ddml partial##pystacked_basic:1. Basic example of the partially-linear model with pystacked}{p_end} 64 | {pstd}{help ddml partial##pystacked_detailed:2. 
Detailed example of the partially-linear model with pystacked}{p_end} 65 | {pstd}{help ddml partial##anylearner_detailed:3. Detailed general example of the partially-linear model with any learner(s)}{p_end} 66 | {pstd}{help ddml partial##pystacked_multitreat:4. Estimating the partially-linear model with multiple treatments}{p_end} 67 | 68 | {marker pystacked_basic} 69 | {smcl} 70 | INCLUDE help ddml_example_partial_pystacked_basic.sthlp 71 | 72 | {marker pystacked_detailed} 73 | {smcl} 74 | INCLUDE help ddml_example_partial_pystacked_detailed.sthlp 75 | 76 | {marker anylearner_detailed} 77 | {smcl} 78 | INCLUDE help ddml_example_partial_anylearner_detailed.sthlp 79 | 80 | {marker pystacked_multitreat} 81 | {smcl} 82 | INCLUDE help ddml_example_partial_pystacked_multitreat.sthlp 83 | 84 | 85 | {smcl} 86 | INCLUDE help ddml_install_ref_auth 87 | 88 | -------------------------------------------------------------------------------- /ddml_examples.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {vieweralsosee "ddml main page" "ddml"}{...} 4 | {vieweralsosee "Other" "ddml_examples##also_see"}{...} 5 | {hline} 6 | {cmd:help ddml examples}{right: v1.4.4} 7 | {hline} 8 | 9 | {title:ddml examples for Double Debiased Machine Learning} 10 | 11 | {pstd} 12 | {opt ddml} implements algorithms for causal inference aided by supervised 13 | machine learning as proposed in 14 | {it:Double/debiased machine learning for treatment and structural parameters} 15 | (Econometrics Journal, 2018). Five different models are supported, allowing for 16 | binary or continuous treatment variables and endogeneity, high-dimensional 17 | controls and/or instrumental variables. 18 | 19 | {pstd} 20 | Below is a list of links to all {opt ddml} examples. 21 | All the examples in the help files have clickable links and can be run by the user. 
22 | 23 | {pstd}{ul:Partially-linear model}:{p_end} 24 | {pstd}{help ddml_example_partial_pystacked_basic:Basic example of the partially-linear model with pystacked}{p_end} 25 | {pstd}{help ddml_example_partial_pystacked_detailed:Detailed example of the partially-linear model with pystacked}{p_end} 26 | {pstd}{help ddml_example_partial_anylearner_detailed:Detailed general example of the partially-linear model with any learner(s)}{p_end} 27 | {pstd}{help ddml_example_partial_pystacked_multitreat:Estimating the partially-linear model with multiple treatments}{p_end} 28 | 29 | {pstd}{ul:Interactive model (ATE, ATET)}:{p_end} 30 | {pstd}{help ddml_example_interactive_pystacked_basic:Basic example of the interactive model (ATE, ATET) with pystacked}{p_end} 31 | {pstd}{help ddml_example_interactive_pystacked_detailed:Detailed example of the interactive model (ATE, ATET) with pystacked}{p_end} 32 | 33 | {pstd}{ul:Partially-linear IV model}:{p_end} 34 | {pstd}{help ddml_example_partialiv_pystacked_basic:Basic example of the partially-linear IV model with pystacked}{p_end} 35 | {pstd}{help ddml_example_partialiv_anylearner_basic:Basic example of the partially-linear IV model with any learner(s)}{p_end} 36 | 37 | {pstd}{ul:Flexible partially-linear IV model}:{p_end} 38 | {pstd}{help ddml_example_flexiv_anylearner_basic:Basic example of the flexible partially-linear IV model with any learner(s)}{p_end} 39 | {pstd}{help ddml_example_flexiv_anylearner_detailed:Detailed example of the flexible partially-linear IV model with any learner(s)}{p_end} 40 | 41 | {pstd}{ul:Interactive IV model (LATE)}:{p_end} 42 | {pstd}{help ddml_example_interactiveiv_pystacked_basic:Basic example of the interactive IV model (LATE) with pystacked}{p_end} 43 | {pstd}{help ddml_example_interactiveiv_pystacked_detailed:Detailed example of the interactive IV model (LATE) with pystacked}{p_end} 44 | 45 | {pstd}{ul:Stacking regression with ddml}:{p_end} 46 | {pstd}{help ddml_example_stacking:Detailed discussion of stacking with ddml plus examples}{p_end} 47 | 48 | {pstd}{ul:ddml utilities}:{p_end} 49 | {pstd}{help ddml_example_extract:Extracting stored information from ddml associative arrays}{p_end} 50 | {pstd}{help ddml_example_describe:Describe the model setup and/or results}{p_end} 51 | {pstd}{help ddml_example_export:Save estimated conditional expectations etc. to a csv file}{p_end} 52 | {pstd}{help ddml_example_overlap:Overlap plots for interactive models}{p_end} 53 | 54 | {pstd}{ul:Cluster cross-fitting with ddml}:{p_end} 55 | {pstd}{help ddml_example_fcluster:Cluster sampling with cross-fit folds}{p_end} 56 | 57 | 58 | {smcl} 59 | INCLUDE help ddml_install_ref_auth 60 | -------------------------------------------------------------------------------- /_ddml_overlap.ado: -------------------------------------------------------------------------------- 1 | *! ddml v1.4.4 2 | *! last edited: 30aug2024 3 | *! 
authors: aa/ms 4 | 5 | program define _ddml_overlap 6 | version 16 7 | syntax , [ /// 8 | mname(name) /// 9 | replist(numlist integer min=1) /// list of resamples 10 | pslist(namelist) /// list of propensity scores excl resample 11 | n(integer 0) /// number of points (default = N) 12 | kernel(name) /// default = triangle 13 | lopt0(string) /// line options for d=0 14 | lopt1(string) /// line options for d=1 15 | title(string) /// title for combined graph 16 | subtitle(string) /// subtitle for combined graph 17 | name(string) /// name of combined graph; can include ", replace" 18 | *] 19 | 20 | // blank eqn - declare this way so that it's a struct and not transmorphic 21 | tempname eqn 22 | mata: `eqn' = init_eStruct() 23 | 24 | mata: st_local("model",`mname'.model) 25 | mata: st_local("crossfitted",strofreal(`mname'.crossfitted)) // flag for crossfitting results available 26 | mata: st_local("nreps",strofreal(`mname'.nreps)) 27 | mata: st_local("nameY",`mname'.nameY) 28 | mata: st_local("nameD",invtokens(`mname'.nameD)) 29 | mata: st_local("nameZ",invtokens((`mname'.nameZ))) 30 | local numeqnD : word count `nameD' 31 | local numeqnZ : word count `nameZ' 32 | 33 | if "`model'"~="interactive" & "`model'"~="interactiveiv" { 34 | di as err "error - overlap supported only for interactive or interactiveiv (LATE) models" 35 | exit 198 36 | } 37 | if "`model'"=="interactive" & `numeqnD'>1 { 38 | di as err "error - only one treatment variable currently supported in interactive model" 39 | exit 198 40 | } 41 | if `crossfitted'==0 { 42 | di as err "error - model not crossfitted" 43 | exit 198 44 | } 45 | 46 | // default title 47 | if "`title'"=="" & "`model'"=="interactive" { 48 | local title "Propensity scores by treatment group" 49 | } 50 | else if "`title'"=="" { 51 | local title "Propensity scores by assignment group" 52 | } 53 | 54 | // default replist, graph subtitle 55 | if "`replist'"=="" { 56 | local replist 1/`nreps' 57 | if `nreps'>1 & "`subtitle'"=="" { 58 | local subtitle "all crossfit samples" 59 | } 60 | } 61 | else if "`subtitle'"=="" { 62 | local subtitle "reps=`replist'" 63 | } 64 | 65 | // default list of propensity scores (prefixes) 66 | if "`pslist'"=="" { 67 | // eqn has info about learners 68 | if "`model'"=="interactive" { 69 | mata: `eqn' = (`mname'.eqnAA).get("`nameD'") 70 | } 71 | else { 72 | mata: `eqn' = (`mname'.eqnAA).get("`nameZ'") 73 | } 74 | mata: st_local("pslist",invtokens(`eqn'.vtlist)) 75 | } 76 | // labels for propensity scores 77 | if "`model'"=="interactive" { 78 | local vlab0 "D=0" 79 | local vlab1 "D=1" 80 | } 81 | else { 82 | local vlab0 "Z=0" 83 | local vlab1 "Z=1" 84 | } 85 | 86 | // default number of points 87 | if `n'==0 { 88 | qui count if `mname'_sample 89 | local n=r(N) 90 | } 91 | // default kernel 92 | if "`kernel'"=="" { 93 | local kernel triangle 94 | } 95 | // default line options 96 | if "`lopt0'"=="" { 97 | local lopt0 lpattern(solid) lcolor(navy) 98 | } 99 | if "`lopt1'"=="" { 100 | local lopt1 lpattern(shortdash) lcolor(dkorange) 101 | } 102 | 103 | // loop through propensity scores 104 | foreach dtilde in `pslist' { 105 | // gname is individual dtilde graph 106 | local gname `dtilde' 107 | // reset gcmd local 108 | local gcmd 109 | // loop through resamples 110 | foreach r of numlist `replist' { 111 | tempvar x0`r' x1`r' ps0`r' ps1`r' ps`r' 112 | qui gen `ps`r'' = `dtilde'_`r' 113 | kdensity `ps`r'' if `nameD'==0, kernel(`kernel') n(`n') nograph gen(`x0`r'' `ps0`r'') 114 | kdensity `ps`r'' if `nameD'==1, kernel(`kernel') n(`n') nograph 
gen(`x1`r'' `ps1`r'') 115 | local gcmd `gcmd' /// 116 | (line `ps0`r'' `x0`r'', `lopt0') /// 117 | (line `ps1`r'' `x1`r'', `lopt1') 118 | } 119 | label var `ps01' "`vlab0'" 120 | label var `ps11' "`vlab1'" 121 | twoway `gcmd', /// 122 | title("`dtilde'") /// 123 | xtitle("Propensity score") /// 124 | ytitle("Density") /// 125 | legend(order(1 2)) /// 126 | nodraw /// 127 | name(`gname', replace) 128 | } 129 | graph combine `pslist', title("`title'") subtitle("`subtitle'") name(`name') 130 | // drop separate graphs 131 | cap graph drop `pslist' 132 | 133 | end 134 | -------------------------------------------------------------------------------- /cert/ddml_cert_crossfit.log: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------------------------------------- 2 | name: 3 | log: /Users/kahrens/MyProjects/ddml/cert/ddml_cert_crossfit.log 4 | log type: text 5 | opened on: 30 Aug 2024, 17:23:19 6 | 7 | . 8 | . which ddml 9 | /Users/kahrens/MyProjects/ddml/ddml.ado 10 | *! ddml v1.4.4 11 | *! last edited: 30aug2024 12 | *! authors: aa/ms 13 | 14 | . mata: whichddml() 15 | 16 | Mata library for ddml and related programs, 17 | compiled 29 Aug 2024 under Stata 15.1 born 03 Feb 2020. 18 | authors AA/MS 19 | 20 | . which crossfit 21 | /Users/kahrens/MyProjects/ddml/crossfit.ado 22 | *! ddml v1.4.4 23 | *! last edited: 30aug2024 24 | *! authors: aa/ms 25 | 26 | . 27 | . use http://fmwww.bc.edu/repec/bocode/j/jtpa.dta, clear 28 | 29 | . global X sex age married black hispanic 30 | 31 | . 32 | . set seed 42 33 | 34 | . crossfit, estring(reg earnings $X) gen(yhat) kfolds(3) 35 | Cross-fitting fold 1 2 3 ...completed cross-fitting 36 | 37 | . sum earnings yhat_1 38 | 39 | Variable | Obs Mean Std. dev. Min Max 40 | -------------+--------------------------------------------------------- 41 | earnings | 11,204 15815.29 16767.05 0 155760 42 | yhat_1 | 11,204 15812.1 3774.255 6123.432 23867.51 43 | 44 | . 45 | . set seed 42 46 | 47 | . crossfit, estring(pystacked earnings $X) gen(yhat) kfolds(3) 48 | calling pystacked on full sample with noestimate option... 49 | N=11204 50 | number of learners = 3 51 | Base learners: ols lassocv gradboost 52 | Cross-fitting fold 1 2 3 ...completed cross-fitting 53 | 54 | . sum earnings yhat* 55 | 56 | Variable | Obs Mean Std. dev. Min Max 57 | -------------+--------------------------------------------------------- 58 | earnings | 11,204 15815.29 16767.05 0 155760 59 | yhat_L1_1 | 11,204 15812.1 3774.255 6123.432 23867.51 60 | yhat_L2_1 | 11,204 15812.11 3768.449 6145.505 23851.63 61 | yhat_L3_1 | 11,204 15810.33 4477.315 76.66996 31094.56 62 | yhat_1 | 11,204 15810.67 4147.55 2560.445 28327.05 63 | 64 | . 65 | . set seed 42 66 | 67 | . crossfit, estring(reg earnings $X) gen(yhat) kfolds(3) reps(5) 68 | Resample 1... 69 | Cross-fitting fold 1 2 3 ...completed cross-fitting 70 | Resample 2... 71 | Cross-fitting fold 1 2 3 ...completed cross-fitting 72 | Resample 3... 73 | Cross-fitting fold 1 2 3 ...completed cross-fitting 74 | Resample 4... 75 | Cross-fitting fold 1 2 3 ...completed cross-fitting 76 | Resample 5... 77 | Cross-fitting fold 1 2 3 ...completed cross-fitting 78 | 79 | . sum earnings yhat* 80 | 81 | Variable | Obs Mean Std. dev. 
Min Max 82 | -------------+--------------------------------------------------------- 83 | earnings | 11,204 15815.29 16767.05 0 155760 84 | yhat_L1_1 | 11,204 15812.1 3774.255 6123.432 23867.51 85 | yhat_L2_1 | 11,204 15812.11 3768.449 6145.505 23851.63 86 | yhat_L3_1 | 11,204 15810.33 4477.315 76.66996 31094.56 87 | yhat_1 | 11,204 15812.1 3774.255 6123.432 23867.51 88 | -------------+--------------------------------------------------------- 89 | yhat_2 | 11,204 15817.76 3772.003 5660.687 24048.23 90 | yhat_3 | 11,204 15811.39 3776.824 5764.323 24005.49 91 | yhat_4 | 11,204 15818.01 3777.579 5495.782 24012 92 | yhat_5 | 11,204 15811.81 3777.137 5751.744 23775.22 93 | 94 | . 95 | . // check that norandom is equivalent to provided fold identifier 96 | . count 97 | 11,204 98 | 99 | . gen fid = _n<=(r(N)/2) 100 | 101 | . set seed 42 102 | 103 | . crossfit, estring(reg earnings $X) gen(noran) kfolds(2) norandom 104 | Cross-fitting fold 1 2 ...completed cross-fitting 105 | 106 | . set seed 42 107 | 108 | . crossfit, estring(reg earnings $X) gen(foldv) foldvar(fid) 109 | Cross-fitting fold 1 2 ...completed cross-fitting 110 | 111 | . assert noran==foldv 112 | 113 | . 114 | . log close 115 | name: 116 | log: /Users/kahrens/MyProjects/ddml/cert/ddml_cert_crossfit.log 117 | log type: text 118 | closed on: 30 Aug 2024, 17:23:25 119 | --------------------------------------------------------------------------------------------------------------- 120 | -------------------------------------------------------------------------------- /ddml_extract.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 3aug2023}{...} 3 | {viewerjumpto "Syntax" "ddml_extract##syntax"}{...} 4 | {viewerjumpto "Options" "ddml_extract##options"}{...} 5 | {viewerjumpto "Examples" "ddml_extract##examples"}{...} 6 | {viewerjumpto "Installation" "ddml_extract##installation"}{...} 7 | {viewerjumpto "References" "ddml_extract##references"}{...} 8 | {viewerjumpto "Authors" "ddml_extract##authors"}{...} 9 | {vieweralsosee "ddml main page" "ddml"}{...} 10 | {vieweralsosee "Other" "ddml_extract##also_see"}{...} 11 | {hline} 12 | {cmd:help ddml extract}{right: v1.4.4} 13 | {hline} 14 | 15 | {title:ddml extract utility for Double Debiased Machine Learning} 16 | 17 | {p2colset 5 19 21 2}{...} 18 | {p2col:{hi: ddml} {hline 2}}Stata package for Double Debiased Machine Learning{p_end} 19 | {p2colreset}{...} 20 | 21 | {pstd} 22 | {opt ddml} implements algorithms for causal inference aided by supervised 23 | machine learning as proposed in 24 | {it:Double/debiased machine learning for treatment and structural parameters} 25 | (Econometrics Journal, 2018). Five different models are supported, allowing for 26 | binary or continuous treatment variables and endogeneity, high-dimensional 27 | controls and/or instrumental variables. 28 | 29 | {p2colset 5 19 21 2}{...} 30 | {p2col:{hi: ddml extract}} Stata extract utility for Double Debiased Machine Learning{p_end} 31 | {p2colreset}{...} 32 | 33 | {pstd} 34 | Please check the {helpb ddml extract##examples:examples} provided at the end of the help file. 
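{pstd}For instance (a minimal sketch; it assumes a model with the default name {it:m0} has already been 
cross-fitted with the {opt shortstack} option), the short-stacking weights can be displayed with:{p_end}

{phang2}. ddml extract, show(ssweights){p_end}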
35 | 36 | {marker syntax}{...} 37 | {title:Syntax} 38 | 39 | {p 8 14}{cmd:ddml extract} [ {it:object_name} , {opt mname(name)} {opt show(display_item)} {opt ename(name)} {opt vname(varname)} 40 | {opt stata} {opt keys} {opt key1(string)} {opt key2(string)} {opt key3(string)} {opt subkey1(string)} {opt subkey2(string)}{bind: ]} 41 | 42 | {pstd} 43 | {it:display_item} can be {it:stweights}, {it:ssweights}, {it:pweights}, {it:weights}, {it:pystacked}, {it:mse} or {it:n}; see the show options below. 44 | {cmd:ddml} stores many internal results on associative arrays. 45 | These can be retrieved using the different key options. 46 | 47 | {marker options}{...} 48 | {title:Options} 49 | 50 | {synoptset 20}{...} 51 | {synopthdr:main options} 52 | {synoptline} 53 | {synopt:{opt mname(name)}} 54 | Name of the DDML model; a Mata object. Defaults to {it:m0}. 55 | {p_end} 56 | {synopt:{opt vname(name)}} 57 | Name of a Y, D or Z variable corresponding to a DDML equation. 58 | {p_end} 59 | {synopt:{opt ename(name)}} 60 | Name of a DDML equation struct; a Mata object. 61 | Use with {helpb crossfit} or with a DDML eStruct that has been separately extracted. 62 | {p_end} 63 | {synopt:{opt stata}} 64 | Saves extracted {it:object_name} in a Stata r(.) macro (default is to leave it as Mata object). 65 | NB: does not apply to {opt show(display_item)} (see below). 66 | {p_end} 67 | {synoptline} 68 | {p2colreset}{...} 69 | {pstd} 70 | 71 | {synoptset 20}{...} 72 | {synopthdr:show options} 73 | {synoptline} 74 | {synopt:{opt show(stweights)}} 75 | Extracts standard stacking ({opt pystacked}) weights. 76 | {p_end} 77 | {synopt:{opt show(ssweights)}} 78 | Extracts {opt shortstack} weights. 79 | {p_end} 80 | {synopt:{opt show(pweights)}} 81 | Extracts {opt poolstack} weights. 82 | {p_end} 83 | {synopt:{opt show(weights)}} 84 | Extracts all available weights: standard, short-stacked, pool-stacked. 85 | {p_end} 86 | {synopt:{opt show(pystacked)}} 87 | Extracts detailed {opt pystacked} weights and learner MSEs, including a breakdown by cross-fit fold. 88 | The MSEs are cross-validation MSEs and correspond to the predictions used to obtain the stacking weights; 89 | see {helpb pystacked:help pystacked}. 90 | {p_end} 91 | {synopt:{opt show(mse)}} 92 | Extracts OOS MSEs by crossfitting fold. 93 | {p_end} 94 | {synopt:{opt show(n)}} 95 | Extracts sample size by crossfitting fold. 96 | {p_end} 97 | {synoptline} 98 | {p2colreset}{...} 99 | {pstd} 100 | 101 | {synoptset 20}{...} 102 | {synopthdr:key options} 103 | {synoptline} 104 | {synopt:{opt keys}} 105 | List all keys on the relevant associative array. 106 | {p_end} 107 | {synopt:{opt key1(string)}} 108 | Associative array key #1. 109 | {p_end} 110 | {synopt:{opt key2(string)}} 111 | Associative array key #2. 112 | {p_end} 113 | {synopt:{opt key3(string)}} 114 | Associative array key #3. 115 | {p_end} 116 | {synopt:{opt subkey1(string)}} 117 | Associative array subkey #1. 118 | {p_end} 119 | {synopt:{opt subkey2(string)}} 120 | Associative array subkey #2. 121 | {p_end} 122 | {synoptline} 123 | {p2colreset}{...} 124 | {pstd} 125 | 126 | 127 | {marker examples}{...} 128 | {title:Examples} 129 | 130 | {smcl} 131 | INCLUDE help ddml_example_extract.sthlp 132 | 133 | 134 | {smcl} 135 | INCLUDE help ddml_install_ref_auth 136 | -------------------------------------------------------------------------------- /ddml_example_stacking.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! 
version 30aug2024}{...} 3 | {smcl} 4 | {pstd}{ul:Partially-linear model with {help pystacked} and stacking}:{p_end} 5 | 6 | {pstd}Preparation: load the data, define global macros, set the seed and initialize the model.{p_end} 7 | 8 | {phang2}. {stata "use https://github.com/aahrens1/ddml/raw/master/data/sipp1991.dta, clear"}{p_end} 9 | {phang2}. {stata "global Y net_tfa"}{p_end} 10 | {phang2}. {stata "global D e401"}{p_end} 11 | {phang2}. {stata "global X tw age inc fsize educ db marr twoearn pira hown"}{p_end} 12 | {phang2}. {stata "set seed 42"}{p_end} 13 | {phang2}. {stata "ddml init partial, kfolds(2) reps(2)"}{p_end} 14 | 15 | {pstd}Add supervised machine learners for estimating conditional expectations. 16 | For simplicity, we use {help pystacked}'s default learners: 17 | OLS, cross-validated lasso, and gradient boosting.{p_end} 18 | 19 | {phang2}. {stata "ddml E[Y|X]: pystacked $Y $X"}{p_end} 20 | {phang2}. {stata "ddml E[D|X]: pystacked $D $X"}{p_end} 21 | 22 | {pstd} Cross-fitting and estimation: 23 | The learners are iteratively fitted on the training data to obtain the estimated conditional expectations, 24 | and then the causal coefficient of interest is estimated along with heteroskedasticity-consistent SEs. 25 | Note that the initial stacking is specified at the {help ddml crossfit:cross-fitting} stage. 26 | In addition to the standard stacking done by {helpb pystacked}, 27 | also request short-stacking and pooled-stacking to be done by {opt ddml}.{p_end} 28 | 29 | {phang2}. {stata "ddml crossfit, shortstack poolstack"}{p_end} 30 | {phang2}. {stata "ddml estimate, robust"}{p_end} 31 | 32 | {pstd}Examine the standard ({cmd:pystacked}) stacking weights as well as 33 | the {opt ddml} short-stacking and pooled-stacking weights.{p_end} 34 | 35 | {phang2}. {stata "ddml extract, show(stweights)"}{p_end} 36 | {phang2}. {stata "ddml extract, show(ssweights)"}{p_end} 37 | {phang2}. {stata "ddml extract, show(psweights)"}{p_end} 38 | 39 | {pstd} Re-stack without cross-fitting, using the single-best learner 40 | instead of the default constrained nonlinear least squares. 41 | We do this using the {help ddml estimate} command. 42 | Since no stacking method is specified, 43 | restacking will be done for all three methods.{p_end} 44 | 45 | {phang2}. {stata "ddml estimate, robust finalest(singlebest)"}{p_end} 46 | 47 | {pstd} As above, but request short-stacking only at the cross-fitting stage. 48 | Note the speed improvement.{p_end} 49 | 50 | {phang2}. {stata "ddml crossfit, shortstack nostdstack"}{p_end} 51 | {phang2}. {stata "ddml estimate, robust"}{p_end} 52 | 53 | {pstd} Re-stack the above without cross-fitting, using OLS as the final estimator. 54 | Use the option {opt shortstack} since only these results are re-stacked.{p_end} 55 | 56 | {phang2}. {stata "ddml estimate, robust shortstack finalest(ols)"}{p_end} 57 | 58 | 59 | {pstd}{ul:Extended example with specified {help pystacked} learners and settings}:{p_end} 60 | 61 | {pstd}Same example as above, but specify the base learners explicitly. 62 | We again make use of {help pystacked} integration, 63 | so there is a single call to {help pystacked} for each conditional expectation. 64 | The first learner in the stacked ensemble is OLS. 65 | We also use cross-validated lasso, ridge and two random forests with different settings. 66 | The settings are stored in macros for readability.{p_end} 67 | 68 | {phang2}. 
{stata "ddml init partial, kfolds(2) reps(2)"}{p_end} 69 | {phang2}. {stata "global rflow max_features(5) min_samples_leaf(1) max_samples(.7)"}{p_end} 70 | {phang2}. {stata "global rfhigh max_features(5) min_samples_leaf(10) max_samples(.7)"}{p_end} 71 | {phang2}. {stata "ddml E[Y|X]: pystacked $Y $X || method(ols) || method(lassocv) || method(ridgecv) || method(rf) opt($rflow) || method(rf) opt($rfhigh), type(reg)"}{p_end} 72 | {phang2}. {stata "ddml E[D|X]: pystacked $D $X || method(ols) || method(lassocv) || method(ridgecv) || method(rf) opt($rflow) || method(rf) opt($rfhigh), type(reg)"}{p_end} 73 | 74 | {pstd}Note: Options before ":" and after the first comma refer to {cmd:ddml}. 75 | Options that come after the final comma refer to the estimation command. 76 | Make sure to not confuse the two types of options.{p_end} 77 | 78 | {pstd}The learners are iteratively fitted on the training data. 79 | In addition to the standard stacking done by {helpb pystacked}, 80 | also request short-stacking to be done by {opt ddml}. 81 | Finally, estimate the coefficients of interest.{p_end} 82 | 83 | {phang2}. {stata "ddml crossfit, shortstack"}{p_end} 84 | {phang2}. {stata "ddml estimate, robust"}{p_end} 85 | 86 | {pstd}Examine the standard ({cmd:pystacked}) stacking weights as well as the {opt ddml} short-stacking weights.{p_end} 87 | 88 | {phang2}. {stata "ddml extract, show(stweights)"}{p_end} 89 | {phang2}. {stata "ddml extract, show(ssweights)"}{p_end} 90 | -------------------------------------------------------------------------------- /ddml_example_partial_anylearner_detailed.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {smcl} 4 | {pstd}{ul:Partially-linear model - Detailed general example with multiple learners} 5 | 6 | {pstd}Here we used {opt ddml} to add learners. This allows use of learners not supported by, 7 | or as alternatives to, those available via {help pystacked}. 8 | It is also possible to use {help pystacked} as a standalone learner in this way.{p_end} 9 | 10 | {pstd}Preparation: load the data and define the globals. 11 | Use the name "m1" for this new estimation, 12 | to distinguish it from any model estimated previously that uses the default name "m0". 13 | This enables having multiple estimations available for comparison. 14 | We also use the {opt prefix} option of {help ddml init} 15 | so that all the estimated conditional expectations will be prefixed with the model name, 16 | i.e., the names of all created variables will start with "m1". 17 | This avoids overwriting any variables created for some other model using default naming. 18 | Also specify 5 resamplings.{p_end} 19 | 20 | {phang2}. {stata "use https://github.com/aahrens1/ddml/raw/master/data/sipp1991.dta, clear"}{p_end} 21 | {phang2}. {stata "global Y net_tfa"}{p_end} 22 | {phang2}. {stata "global D e401"}{p_end} 23 | {phang2}. {stata "global X tw age inc fsize educ db marr twoearn pira hown"}{p_end} 24 | {phang2}. {stata "set seed 42"}{p_end} 25 | {phang2}. {stata "ddml init partial, kfolds(2) reps(5) mname(m1) prefix"}{p_end} 26 | 27 | {pstd}We add supervised machine learners for estimating the conditional expectation E[Y|X]. 28 | In each step, we add the {opt mname(m1)} option to ensure that the learners are added to correct model.{p_end} 29 | 30 | {pstd} We first add simple linear regression.{p_end} 31 | 32 | {phang2}. 
{stata "ddml E[Y|X], mname(m1): reg $Y $X"}{p_end} 33 | 34 | {pstd}We can add more than one learner per reduced form equation. 35 | Here, we add a random forest learner. 36 | We do this using {help pystacked} to implement a single learner.{p_end} 37 | 38 | {phang2}. {stata "ddml E[Y|X], mname(m1): pystacked $Y $X, type(reg) method(rf)"}{p_end} 39 | 40 | {pstd}We do the same for the conditional expectation E[D|X].{p_end} 41 | 42 | {phang2}. {stata "ddml E[D|X], mname(m1): reg $D $X"}{p_end} 43 | {phang2}. {stata "ddml E[D|X], mname(m1): pystacked $D $X, type(reg) method(rf)"}{p_end} 44 | 45 | {pstd}Check if learners were correctly added:{p_end} 46 | 47 | {phang2}. {stata "ddml desc, mname(m1) learners"}{p_end} 48 | 49 | {pstd}Cross-fitting and estimation. 50 | Since we added two learners for each of our two reduced form equations, 51 | there are four possible specifications. 52 | By default, the result shown corresponds to the specification 53 | with the lowest out-of-sample MSPE:{p_end} 54 | 55 | {phang2}. {stata "ddml crossfit, mname(m1)"}{p_end} 56 | {phang2}. {stata "ddml estimate, mname(m1) robust"}{p_end} 57 | 58 | {pstd}To estimate all four specifications, we use the {cmd:allcombos} option:{p_end} 59 | 60 | {phang2}. {stata "ddml estimate, mname(m1) robust allcombos"}{p_end} 61 | 62 | {pstd}After having estimated all specifications, we can retrieve 63 | specific results. Here we use the specification relying on OLS for both 64 | estimating both E[Y|X] and E[D|X], from the 4th cross-fit split ({opt rep(4))}. 65 | (Note: Working interactively, the simplest way to do this 66 | is to click on the hyperlink in the summary table in the {opt ddml estimate} output above.) 67 | The {opt notable} option suppresses the summary table:{p_end} 68 | 69 | {phang2}. {stata "ddml estimate, mname(m1) spec(1) rep(4) replay notable"}{p_end} 70 | 71 | {pstd}You could manually retrieve the same point estimate by 72 | cacluating the orthogonalized versions of {opt net_tfa} and {opt e401} 73 | from the 4th cross-fit estimation and then using {help regress}. 74 | Recall that we used the {opt prefix} option with {help ddml init}, 75 | so the variable names start with "m1".{p_end} 76 | 77 | {phang2}. {stata "cap drop Yresid"}{p_end} 78 | {phang2}. {stata "cap drop Dresid"}{p_end} 79 | {phang2}. {stata "gen double Yresid = $Y - m1_Y1_reg_4"}{p_end} 80 | {phang2}. {stata "gen double Dresid = $D - m1_D1_reg_4"}{p_end} 81 | {phang2}. {stata "regress Yresid Dresid, robust"}{p_end} 82 | 83 | {pstd}You can also compare the estimated conditional expectations graphically:{p_end} 84 | 85 | {phang2}. {stata "twoway (scatter $Y m1_Y2_pystacked_4) "}{p_end} 86 | 87 | {pstd}To describe the ddml model setup or results in detail, 88 | you can use {cmd: ddml describe} with the relevant option ({opt sample}, {opt learners}, {opt crossfit}, {opt estimates}), 89 | or just describe them all with the {opt all} option:{p_end} 90 | 91 | {phang2}. {stata "ddml describe, mname(m1) all"}{p_end} 92 | 93 | {pstd}If there is a previously-estimated {opt ddml} model called "m0", 94 | we can load it using {opt ddml estimate} with the {opt mname(m0)} and {opt replay} options and compare.{p_end} 95 | 96 | {phang2}. {stata "ddml estimate, mname(m0) replay"}{p_end} 97 | -------------------------------------------------------------------------------- /_ddml_allcombos.ado: -------------------------------------------------------------------------------- 1 | *! ddml v1.4.4 2 | *! last edited: 30aug2024 3 | *! 
authors: aa/ms 4 | 5 | program define _ddml_allcombos, rclass 6 | version 16 7 | 8 | syntax anything , [ putlast(string) /// 9 | debug /// 10 | ypos(int 1) /// position of Y variables 11 | DPOS_start(int 2) /// position of D variables 12 | ZPOS_start(int 0) /// position of Z variables 13 | sep(string) /// 14 | addprefix(string) /// 15 | ] 16 | if ("`sep'"=="") { 17 | local sep - 18 | } 19 | 20 | tokenize "`anything'" , parse("`sep'") 21 | 22 | // obtain all combinations 23 | tempname out 24 | mata: st_rclear() 25 | mata: `out' = get_combos("`anything'","`sep'") 26 | return scalar ncombos = `r(ncombos)' 27 | local ncols = `r(ncols)' 28 | 29 | // determine end position for D and Z vars 30 | if `zpos_start'>0 { 31 | local dpos_end = `zpos_start'-1 32 | local zpos_end = `ncols' 33 | } 34 | else { 35 | local dpos_end = `ncols' 36 | } 37 | 38 | // put one specific order at the end (intended for optimal model) 39 | mata: `out' = put_last(`out',"`putlast'") 40 | if ("`debug'"!="") { 41 | di as text "_ddml_all_combos:" 42 | mata: `out' 43 | } 44 | 45 | // save all Y in one list separated by `sep' 46 | mata: mat_to_string(`out'[,`ypos'],"`sep'","`addprefix'") 47 | return local ystr `r(str)' 48 | 49 | // save D variables in list separated by `sep' 50 | if (`dpos_start'>0) { 51 | mata: mat_to_string(`out'[,`dpos_start'..`dpos_end'],"`sep'","`addprefix'") 52 | return local dstr `r(str)' 53 | } 54 | // save Z variables in list separated by `sep' 55 | if (`zpos_start'>0) { 56 | mata: mat_to_string(`out'[,`zpos_start'..`zpos_end'],"`sep'","`addprefix'") 57 | return local zstr `r(str)' 58 | } 59 | 60 | // one string per column 61 | mata: mat_to_colstring(`out',"`sep'","`addprefix'") 62 | return scalar nvars = `r(k)' 63 | forvalues i = 1(1)`r(k)' { 64 | return local colstr`i' `r(colstr`i')' 65 | } 66 | 67 | // clear 68 | mata: mata drop `out' 69 | end 70 | 71 | mata: 72 | 73 | // put one specific combination at the last place 74 | // intended for optimal combination, which should be estimated at the end 75 | string matrix put_last(string matrix mat, 76 | string scalar last) 77 | { 78 | last = tokens(last) 79 | 80 | // check if cols match; otherwise do nothing 81 | if (cols(last)>0 & cols(last)==cols(mat)) { 82 | 83 | // find match 84 | is = rowsum(mat :== last) :== cols(mat) 85 | 86 | // check if there is one match; otherwise do nothing 87 | if (sum(is)==1) { 88 | 89 | // split in two and row bind 90 | mat0 = select(mat,is:==0) 91 | mat1 = select(mat,is:==1) 92 | mat = (mat0\mat1) 93 | } 94 | } 95 | 96 | return(mat) 97 | } 98 | 99 | // obtain full matrix of all combinations 100 | string matrix get_combos(string scalar input,string scalar sep) 101 | { 102 | 103 | input = tokens(input,sep) 104 | 105 | for (i=1; i<=cols(input); i=i+2) { 106 | 107 | vars = input[1,i] 108 | vars = ustrtrim(vars) 109 | 110 | if (i==1) { 111 | 112 | out = tokens(vars)' 113 | 114 | } 115 | else { 116 | 117 | // put next set of variables into mata vector 118 | x = tokens(vars)' 119 | 120 | // save dimensions 121 | orows = rows(out) 122 | xrows = rows(x) 123 | 124 | // duplicate rows 125 | out = Jsort(out,xrows) 126 | x = J(orows,1,x) 127 | 128 | // column bind 129 | out = (out,x) 130 | 131 | } 132 | 133 | } 134 | 135 | st_numscalar("r(ncombos)",rows(out)) 136 | st_numscalar("r(ncols)",cols(out)) 137 | 138 | return(out) 139 | 140 | } 141 | 142 | // matrix to one string where combinations are seperated by "|" 143 | void mat_to_string(string matrix inmat, string scalar sep,string scalar prefix) 144 | { 145 | 146 | if (prefix!="") { 147 
| inmat = prefix :+ inmat 148 | } 149 | 150 | r = rows(inmat) 151 | for (i=1;i<=r;i++) { 152 | 153 | the_row = inmat[i,] 154 | 155 | // put in string 156 | if (i==1) { 157 | str = invtokens(the_row) 158 | } 159 | else { 160 | str = str + " " + sep + " " + invtokens(the_row) 161 | } 162 | } 163 | 164 | st_global("r(str)",str) 165 | 166 | } 167 | 168 | // matrix to one string per column 169 | void mat_to_colstring(string matrix inmat,string scalar sep,string scalar prefix) 170 | { 171 | 172 | if (prefix!="") { 173 | inmat = prefix :+ inmat 174 | } 175 | 176 | k = cols(inmat) 177 | st_numscalar("r(k)",k) 178 | for (j=1;j<=k;j++) { 179 | 180 | str = invtokens(inmat[,j]'," "+sep+" ") 181 | st_global("r(colstr"+strofreal(j)+")",str) 182 | 183 | } 184 | 185 | } 186 | 187 | // replicate elements of a vector, while maintaining order 188 | string matrix Jsort(string matrix mat, 189 | real scalar rep) 190 | { 191 | r = rows(mat) 192 | for (i=1;i<=r;i++) { 193 | if (i==1) { 194 | out = J(rep,1,mat[i,]) 195 | } 196 | else { 197 | out = (out\J(rep,1,mat[i,])) 198 | } 199 | } 200 | return(out) 201 | } 202 | 203 | end 204 | 205 | -------------------------------------------------------------------------------- /ddml_iv.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {viewerjumpto "Examples" "ddml_iv##examples"}{...} 4 | {viewerjumpto "Installation" "ddml_iv##installation"}{...} 5 | {viewerjumpto "References" "ddml_iv##references"}{...} 6 | {viewerjumpto "Authors" "ddml_iv##authors"}{...} 7 | {vieweralsosee "ddml main page" "ddml"}{...} 8 | {vieweralsosee "Other" "ddml_iv##also_see"}{...} 9 | {hline} 10 | {cmd:help ddml iv, help ddml fiv}{right: v1.4.4} 11 | {hline} 12 | 13 | {title:ddml - estimation of partially-linear IV models in Double Debiased Machine Learning} 14 | 15 | {pstd} 16 | {opt ddml} implements algorithms for causal inference aided by supervised 17 | machine learning as proposed in 18 | {it:Double/debiased machine learning for treatment and structural parameters} 19 | (Econometrics Journal, 2018). Five different models are supported, allowing for 20 | binary or continuous treatment variables and endogeneity, high-dimensional 21 | controls and/or instrumental variables. 22 | 23 | {pstd} 24 | {opt ddml} supports a variety of different ML programs, including 25 | but not limited to {help pystacked} and {help lassopack}. 26 | {help pystacked} is the recommended way to specify multiple learners in {opt ddml}, 27 | and {opt ddml} has integrated support for various features provided by {help pystacked}. 28 | 29 | {pstd} 30 | The {opt ddml} package also includes the wrapper program {help qddml}, 31 | which uses a simplified one-line syntax, but offers less flexibility. 32 | 33 | {pstd} 34 | This help file illustrates usage of the {ul:partially-linear IV model} 35 | and the {ul:flexible partially-linear IV model}. 36 | For examples of other models, 37 | follow the links in the main {help ddml:ddml help file}. 38 | 39 | {pstd} 40 | We use {it:Y} to denote the outcome variable, 41 | {it:X} to denote confounders, 42 | {it:Z} to denote instrumental variable(s), and 43 | {it:D} to denote the treatment variable(s) of interest. 44 | 45 | {pstd} 46 | {ul:Partially-linear IV model} [{it:iv}] 47 | 48 | Y = {it:a}.D + g(X) + U 49 | Z = m(X) + V 50 | 51 | {pstd} 52 | where the aim is to estimate {it:a}. 53 | We estimate the conditional expectations E[Y|X], 54 | E[D|X] and E[Z|X] using supervised machine learners. 
55 | Note that the instrument set Z is low-dimensional. 56 | 57 | {pstd} 58 | {ul:Flexible partially-linear IV model} [{it:fiv}] 59 | 60 | Y = {it:a}.D + g(X) + U 61 | D = m(Z) + g(X) + V 62 | 63 | {pstd} 64 | where the estimand of interest is {it:a}. 65 | We estimate the conditional expectations 66 | E[Y|X], 67 | E[D^|X] and D^:=E[D|Z,X] using supervised machine learners. 68 | The instrument is then formed as D^-E^[D^|X] where E^[D^|X] denotes 69 | the estimate of E[D^|X]. 70 | 71 | {pstd} 72 | Note: "{D}" is a placeholder that is used because the last step (estimation of E[D|X]) 73 | uses the fitted values from estimating E[D|X,Z], 74 | as in, for example: ddml E[D|X], learner(Dhat_pystacked) vname($D): pystacked {D} $X, type(reg). 75 | 76 | {pstd} 77 | {ul:Which IV model?} 78 | 79 | {pstd} 80 | The flexible partially-linear IV model allows for approximation of optimal instruments 81 | as in Belloni et al. ({help ddml iv##BCCH2012:2012}), 82 | but relies on a stronger independence assumption than the partially-linear IV model. 83 | Specifically, the partially-linear IV model uses an orthogonality condition, 84 | 85 | E[Cov(U,Z|X)] = 0 86 | 87 | {pstd} 88 | whereas the flexible partially-linear IV model uses the conditional mean independence condition 89 | 90 | E[U|Z,X] = 0 91 | 92 | {pstd} 93 | Note that the generated instruments are valid only under this stronger conditional mean independence condition. 94 | Also note that (unlike the standard partially-linear IV model above), 95 | the flexible partially-linear IV model can accommodate both low- and high-dimensional instrument sets Z. 96 | 97 | 98 | {marker examples}{...} 99 | {title:Examples} 100 | 101 | {pstd} 102 | Below we demonstrate the use of {cmd:ddml} for partially-linear IV models. 103 | Note that estimation models are chosen for demonstration purposes only and 104 | may be kept simple to allow you to run the code quickly. 105 | 106 | {pstd}{help ddml iv##iv_pystacked_basic:1. Basic example of the partially-linear IV model with pystacked}{p_end} 107 | {pstd}{help ddml iv##iv_anylearner_basic:2. Basic example of the partially-linear IV model with any learner(s)}{p_end} 108 | {pstd}{help ddml iv##fiv_anylearner_basic:3. Basic example of the flexible partially-linear IV model with any learner(s)}{p_end} 109 | {pstd}{help ddml iv##fiv_anylearner_detailed:4. Detailed example of the flexible partially-linear IV model with any learner(s)}{p_end} 110 | 111 | {marker iv_pystacked_basic} 112 | {smcl} 113 | INCLUDE help ddml_example_partialiv_pystacked_basic.sthlp 114 | 115 | {marker iv_anylearner_basic} 116 | {smcl} 117 | INCLUDE help ddml_example_partialiv_anylearner_basic.sthlp 118 | 119 | {marker fiv_anylearner_basic} 120 | {smcl} 121 | INCLUDE help ddml_example_flexiv_anylearner_basic.sthlp 122 | 123 | {marker fiv_anylearner_detailed} 124 | {smcl} 125 | INCLUDE help ddml_example_flexiv_anylearner_detailed.sthlp 126 | 127 | 128 | {smcl} 129 | INCLUDE help ddml_install_ref_auth 130 | -------------------------------------------------------------------------------- /ddml.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! 
version 6aug2023}{...} 3 | {viewerjumpto "Links to detailed help files" "ddml##help"}{...} 4 | {viewerjumpto "Models" "ddml##models"}{...} 5 | {viewerjumpto "Estimation steps" "ddml##estimation"}{...} 6 | {viewerjumpto "Compatible programs and pystacked" "ddml##compatibility"}{...} 7 | {viewerjumpto "Basic example" "ddml##example"}{...} 8 | {viewerjumpto "Installation" "ddml##installation"}{...} 9 | {viewerjumpto "References" "ddml##references"}{...} 10 | {viewerjumpto "Authors" "ddml##authors"}{...} 11 | {vieweralsosee "Also see" "ddml##also_see"}{...} 12 | {hline} 13 | {cmd:help ddml}{right: v1.4.4} 14 | {hline} 15 | 16 | {title:ddml - Stata package for Double Debiased Machine Learning} 17 | 18 | {pstd} 19 | {opt ddml} implements algorithms for causal inference aided by supervised 20 | machine learning as proposed in 21 | {it:Double/debiased machine learning for treatment and structural parameters} 22 | ({helpb ddml##Chern2018:Chernozhukov et al., Econometrics Journal, 2018}). Five different models are supported, allowing for 23 | binary or continuous treatment variables and endogeneity, high-dimensional 24 | controls and/or instrumental variables. 25 | {opt ddml} also implements the stacking approaches discussed in {helpb ddml##Ahrens2024stacking:Ahrens et al. (2024a)}. 26 | For a companion paper, see {helpb ddml##Ahrens2024ddml:Ahrens et al. (2024b)}. 27 | 28 | {pstd} 29 | {opt ddml} supports a variety of different ML programs, including 30 | but not limited to {help pystacked} and {help lassopack}. 31 | {help pystacked} is the recommended way to specify multiple learners in {opt ddml}, 32 | and {opt ddml} has integrated support for various features provided by {help pystacked}. 33 | 34 | {pstd} 35 | The {opt ddml} package includes the wrapper program {help qddml}, 36 | which uses a simplified one-line syntax, 37 | but offers less flexibility. 38 | 39 | {pstd} 40 | {opt ddml} and {opt qddml} rely on {help crossfit}, which can be used as a standalone program. 41 | 42 | 43 | {title:Contents} 44 | 45 | {p 2}{help ddml##help:Links to detailed help files}{p_end} 46 | {p 2}{help ddml##models:Supported models in ddml}{p_end} 47 | {p 2}{help ddml##estimation:Main steps in estimation using ddml}{p_end} 48 | {p 2}{help ddml##compatibility:Compatible programs and pystacked integration}{p_end} 49 | {p 2}{help ddml##example:Basic example: the partially-linear model with pystacked}{p_end} 50 | {p 2}{help ddml##installation:Installation}{p_end} 51 | {p 2}{help ddml##references:References}{p_end} 52 | {p 2}{help ddml##authors:Authors}{p_end} 53 | 54 | 55 | {marker help}{...} 56 | {title:Follow links below to detailed help} 57 | 58 | {p2colset 5 25 25 0}{...} 59 | {p 2}Help files: qddml{p_end} 60 | {p2col:{help qddml}} One-step DDML estimation using {help qddml}.{p_end} 61 | 62 | {p 2}Help files: detailed help files including syntax/options for main steps in using {opt ddml}{p_end} 63 | {p2col:{help ddml init}} 1. Initialize {opt ddml} and select model.{p_end} 64 | {p2col:{help ddml eq}} 2. Add supervised ML programs for estimating conditional expectations.{p_end} 65 | {p2col:{help ddml crossfit}} 3. Cross-fitting to estimate conditional expectations.{p_end} 66 | {p2col:{help ddml estimate}} 4. 
Estimate causal model and report/post results.{p_end} 67 | {p2col:{help ddml stacking}} Overview of stacking in {opt ddml} and {help pystacked}{p_end} 68 | 69 | {marker examples}{...} 70 | {p 2}Help files: detailed help files and examples for models supported by {opt ddml}{p_end} 71 | {p2col:{help ddml partial}} Partially-linear model{p_end} 72 | {p2col:{help ddml iv}} Partially-linear IV model{p_end} 73 | {p2col:{help ddml fiv}} Flexible partially-linear IV model{p_end} 74 | {p2col:{help ddml interactive}} Interactive model - ATE and ATET estimation{p_end} 75 | {p2col:{help ddml interactiveiv}} Interactive IV model - LATE estimation{p_end} 76 | {p2col:{help ddml examples}} Clickable list of all ddml examples{p_end} 77 | 78 | {p 2}Help files: auxiliary programs{p_end} 79 | {p2col:{help ddml describe}} Report information about the model setup and/or results.{p_end} 80 | {p2col:{help ddml extract}} Report information about saved results, e.g. stacking weights.{p_end} 81 | {p2col:{help ddml sample}} Add cross-fitting repetitions to an existing model.{p_end} 82 | {p2col:{help ddml export}} Save the {opt ddml} estimated conditional expectations to a csv file.{p_end} 83 | {p2col:{help ddml overlap}} (interactive models only) Generate overlap plots for propensity-score-based models.{p_end} 84 | {p2col:{help crossfit}} Use {opt crossfit} as a standalone program for cross-fitting and cross-validation.{p_end} 85 | 86 | {marker overview} 87 | {smcl} 88 | INCLUDE help ddml_overview.sthlp 89 | 90 | 91 | {marker example}{...} 92 | {title:Example} 93 | 94 | {pstd}A basic example of how to use {opt ddml} is below. 95 | For a clickable list of all examples in the package, see {help ddml examples:help ddml examples}.{p_end} 96 | 97 | {smcl} 98 | INCLUDE help ddml_example_partial_pystacked_basic.sthlp 99 | 100 | 101 | {smcl} 102 | INCLUDE help ddml_install_ref_auth.ihlp 103 | 104 | -------------------------------------------------------------------------------- /cert/ddml_cert_fiv.do: -------------------------------------------------------------------------------- 1 | clear all 2 | 3 | if ("`c(username)'"=="kahrens") { 4 | adopath + "/Users/kahrens/MyProjects/ddml" 5 | adopath + "/Users/kahrens/MyProjects/pystacked" 6 | } 7 | 8 | cap cd "/Users/kahrens/MyProjects/ddml/cert" 9 | cap cd "C:\LocalStore\ecomes\Documents\GitHub\ddml\cert" 10 | 11 | cap log close 12 | log using "ddml_cert_fiv", replace text 13 | 14 | which ddml 15 | mata: whichddml() 16 | 17 | use https://github.com/aahrens1/ddml/raw/master/data/BLP.dta, clear 18 | 19 | // necessary programs for cert; script exits with error if not installed 20 | findfile pystacked.ado 21 | 22 | set seed 123 23 | 24 | ******************************************************************************** 25 | **** Flexible IV **** 26 | ******************************************************************************** 27 | 28 | global Y share 29 | global D price 30 | global X hpwt air mpd space 31 | global Z sum* 32 | 33 | // single learner = pystacked 34 | ddml init fiv, kfolds(2) reps(2) 35 | ddml E[Y|X]: pystacked $Y $X, type(reg) 36 | ddml E[D|Z,X], learner(Dhat_pystacked): pystacked $D $X $Z, type(reg) 37 | ddml E[D|X], learner(Dhat_pystacked) vname($D): pystacked {D} $X, type(reg) 38 | ddml crossfit 39 | ddml estimate 40 | ddml estimate, mname(m0) spec(st) rep(1) replay notable 41 | ddml extract, show(stweights) 42 | ddml extract, show(pystacked) 43 | *** append, estimate, replay 44 | ddml sample, append(1) 45 | ddml crossfit 46 | ddml estimate 47 | *** replay 48 | ddml
estimate, mname(m0) spec(st) rep(1) replay notable 49 | ddml estimate, mname(m0) spec(st) rep(2) replay notable 50 | ddml estimate, mname(m0) spec(st) rep(mn) replay notable 51 | ddml estimate, mname(m0) spec(st) rep(md) replay notable 52 | 53 | // multiple learners = pystacked+gradboost, pystacked+lassocv, pystacked+rf 54 | ddml init fiv, kfolds(2) reps(2) 55 | ddml E[Y|X]: pystacked $Y $X, type(reg) m(gradboost) 56 | ddml E[Y|X]: pystacked $Y $X, type(reg) m(lassocv) 57 | ddml E[Y|X]: pystacked $Y $X, type(reg) m(rf) 58 | ddml E[D|Z,X], learner(Dhat_psgradboost): pystacked $D $X $Z, type(reg) m(gradboost) 59 | ddml E[D|X], learner(Dhat_psgradboost) vname($D): pystacked {D} $X, type(reg) m(gradboost) 60 | ddml E[D|Z,X], learner(Dhat_pslassocv): pystacked $D $X $Z, type(reg) m(lassocv) 61 | ddml E[D|X], learner(Dhat_pslassocv) vname($D): pystacked {D} $X, type(reg) m(lassocv) 62 | ddml E[D|Z,X], learner(Dhat_rf): pystacked $D $X $Z, type(reg) m(rf) 63 | ddml E[D|X], learner(Dhat_rf) vname($D): pystacked {D} $X, type(reg) m(rf) 64 | ddml crossfit 65 | ddml estimate 66 | ddml estimate, mname(m0) spec(mse) rep(1) replay notable 67 | *** append, estimate, replay 68 | ddml sample, append(1) 69 | ddml crossfit 70 | ddml estimate 71 | *** replay 72 | ddml estimate, mname(m0) spec(mse) rep(1) replay notable 73 | ddml estimate, mname(m0) spec(mse) rep(2) replay notable 74 | ddml estimate, mname(m0) spec(mse) rep(mn) replay notable 75 | ddml estimate, mname(m0) spec(mse) rep(md) replay notable 76 | 77 | 78 | // as above, with shortstacking 79 | // multiple learners = pystacked+gradboost, pystacked+lassocv, pystacked+rf 80 | ddml init fiv, kfolds(2) reps(2) 81 | ddml E[Y|X]: pystacked $Y $X, type(reg) m(gradboost) 82 | ddml E[Y|X]: pystacked $Y $X, type(reg) m(lassocv) 83 | ddml E[Y|X]: pystacked $Y $X, type(reg) m(rf) 84 | ddml E[D|Z,X], learner(Dhat_psgradboost): pystacked $D $X $Z, type(reg) m(gradboost) 85 | ddml E[D|X], learner(Dhat_psgradboost) vname($D): pystacked {D} $X, type(reg) m(gradboost) 86 | ddml E[D|Z,X], learner(Dhat_pslassocv): pystacked $D $X $Z, type(reg) m(lassocv) 87 | ddml E[D|X], learner(Dhat_pslassocv) vname($D): pystacked {D} $X, type(reg) m(lassocv) 88 | ddml E[D|Z,X], learner(Dhat_rf): pystacked $D $X $Z, type(reg) m(rf) 89 | ddml E[D|X], learner(Dhat_rf) vname($D): pystacked {D} $X, type(reg) m(rf) 90 | ddml crossfit, shortstack 91 | ddml estimate 92 | ddml estimate, mname(m0) spec(ss) rep(1) replay notable 93 | ddml extract, show(ssweights) 94 | *** append, estimate, replay 95 | ddml sample, append(1) 96 | ddml crossfit, shortstack 97 | ddml estimate 98 | ddml extract, show(ssweights) 99 | *** replay 100 | ddml estimate, mname(m0) spec(mse) rep(1) replay notable 101 | ddml estimate, mname(m0) spec(mse) rep(2) replay notable 102 | ddml estimate, mname(m0) spec(mse) rep(mn) replay notable 103 | ddml estimate, mname(m0) spec(mse) rep(md) replay notable 104 | ddml estimate, mname(m0) spec(ss) rep(1) replay notable 105 | ddml estimate, mname(m0) spec(ss) rep(2) replay notable 106 | ddml estimate, mname(m0) spec(ss) rep(mn) replay notable 107 | ddml estimate, mname(m0) spec(ss) rep(md) replay notable 108 | *** allcombos 109 | ddml estimate, allcombos 110 | forvalues i=1/27 { 111 | ddml estimate, mname(m0) spec(`i') rep(1) replay notable 112 | } 113 | ddml estimate, mname(m0) spec(mse) rep(1) replay notable 114 | ddml estimate, mname(m0) spec(mse) rep(2) replay notable 115 | ddml estimate, mname(m0) spec(mse) rep(mn) replay notable 116 | ddml estimate, mname(m0) spec(mse) 
rep(md) replay notable 117 | ddml estimate, mname(m0) spec(ss) rep(1) replay notable 118 | ddml estimate, mname(m0) spec(ss) rep(2) replay notable 119 | ddml estimate, mname(m0) spec(ss) rep(mn) replay notable 120 | ddml estimate, mname(m0) spec(ss) rep(md) replay notable 121 | 122 | // poolstacking not supported with multiple calls to pystacked 123 | 124 | log close 125 | -------------------------------------------------------------------------------- /ddml_example_flexiv_anylearner_detailed.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {smcl} 4 | {pstd}{ul:Flexible partially-linear IV model - Detailed example with {help pystacked}} 5 | 6 | {pstd}Here we illustrate how to do standard stacking and short-stacking with the flexible IV model.{p_end} 7 | 8 | {pstd}Note: Support for {help pystacked} integration is relatively limited for the flexible IV model. 9 | In particular, short-stacking requires that individual learners appear in separate {help pystacked} commands, 10 | pooled stacking is not available, and re-stacking with {opt ddml estimate} is also not available.{p_end} 11 | 12 | {pstd}First we illustrate how to do standard stacking with {help pystacked}. 13 | To start, we load the data, define global macros, set the seed and initialize the model.{p_end} 14 | 15 | {phang2}. {stata "use https://github.com/aahrens1/ddml/raw/master/data/BLP.dta, clear"}{p_end} 16 | {phang2}. {stata "global Y share"}{p_end} 17 | {phang2}. {stata "global D price"}{p_end} 18 | {phang2}. {stata "global X hpwt air mpd space"}{p_end} 19 | {phang2}. {stata "global Z sum*"}{p_end} 20 | {phang2}. {stata "set seed 42"}{p_end} 21 | {phang2}. {stata "ddml init fiv"}{p_end} 22 | 23 | {pstd}Adding learners for E[Y|X] is the same as for other {opt ddml} linear models:{p_end} 24 | 25 | {phang2}. {stata "ddml E[Y|X]: pystacked $Y $X, type(reg)"}{p_end} 26 | 27 | {pstd}Adding learners for E[D|Z,X] and E[D|X] in the {opt fiv} model is different 28 | from how it's done in the {opt partialiv} model. 29 | The reason for this is that the estimation of E[D|X] 30 | depends on the estimation of E[D|Z,X].{p_end} 31 | 32 | {pstd}When adding learners for E[D|Z,X], 33 | we need to provide a name for each learner using {opt learner(name)}. 34 | Here we use the name "Dhat_pys".{p_end} 35 | 36 | {phang2}. {stata "ddml E[D|Z,X], learner(Dhat_pys): pystacked $D $X $Z, type(reg)"}{p_end} 37 | 38 | {pstd}When adding learners for E[D|X], we explicitly refer to the name of the learner from 39 | the previous step (here, "Dhat_pys"). 40 | We also provide the name of the treatment variable ({cmd:vname($D)}), 41 | and we use the placeholder {cmd:{D}} in place of the dependent variable.{p_end} 42 | 43 | {phang2}. {stata "ddml E[D|X], learner(Dhat_pys) vname($D): pystacked {D} $X, type(reg)"}{p_end} 44 | 45 | {pstd}The crossfit and estimation commands with the {opt fiv} model are standard.{p_end} 46 | 47 | {phang2}. {stata "ddml crossfit"}{p_end} 48 | {phang2}. {stata "ddml estimate, robust"}{p_end} 49 | 50 | {pstd}The stacking weights and other results from {help pystacked} are available via {help ddml extract}. 51 | Note this is only the case if {help pystacked} is the single learner in each equation.{p_end} 52 | 53 | {phang2}. {stata "ddml extract, show(stweights)"}{p_end} 54 | {phang2}. {stata "ddml extract, show(pystacked)"}{p_end} 55 | 56 | {pstd}To replicate what {cmd:ddml} does in the background:{p_end} 57 | 58 | {phang2}. 
{stata "cap drop Ytilde"}{p_end} 59 | {phang2}. {stata "cap drop Dtilde"}{p_end} 60 | {phang2}. {stata "cap drop Zopt"}{p_end} 61 | {phang2}. {stata "gen double Ytilde = $Y - Y1_pystacked_1"}{p_end} 62 | {phang2}. {stata "gen double Dtilde = $D - Dhat_pys_h_1"}{p_end} 63 | {phang2}. {stata "gen double Zopt = Dhat_pys_1 - Dhat_pys_h_1"}{p_end} 64 | {phang2}. {stata "ivreg Ytilde (Dtilde=Zopt), robust"}{p_end} 65 | 66 | {pstd}Next we illustrate how to do short-stacking with the flexible IV model. 67 | We again use {help pystacked}, but the procedure applies to any set of learners. 68 | Here we use the same learners as in the standard stacking estimation with {help pystacked} above, 69 | in order to facilitate direct comparison of the two sets of results. 70 | We begin by re-initializing the model.{p_end} 71 | 72 | {phang2}. {stata "set seed 42"}{p_end} 73 | {phang2}. {stata "ddml init fiv"}{p_end} 74 | 75 | {pstd}We add learners for E[Y|X] in the usual way, 76 | but we need to specify each {help pystacked} learner in a separate equation.{p_end} 77 | 78 | {phang2}. {stata "ddml E[Y|X]: pystacked $Y $X, type(reg) m(ols)"}{p_end} 79 | {phang2}. {stata "ddml E[Y|X]: pystacked $Y $X, type(reg) m(lassocv)"}{p_end} 80 | {phang2}. {stata "ddml E[Y|X]: pystacked $Y $X, type(reg) m(gradboost)"}{p_end} 81 | 82 | {pstd} 83 | As above, when adding learners for E[D|Z,X], 84 | we need to provide a name for each learner using {opt learner(name)}.{p_end} 85 | 86 | {phang2}. {stata "ddml E[D|Z,X], learner(Dhat_ols): pystacked $D $X $Z, type(reg) m(ols)"}{p_end} 87 | {phang2}. {stata "ddml E[D|Z,X], learner(Dhat_lassocv): pystacked $D $X $Z, type(reg) m(lassocv)"}{p_end} 88 | {phang2}. {stata "ddml E[D|Z,X], learner(Dhat_gradboost): pystacked $D $X $Z, type(reg) m(gradboost)"}{p_end} 89 | 90 | {pstd}Again as above, when adding learners for E[D|X], 91 | we explicitly refer to the learner from the previous step, 92 | the name of the treatment variable ({cmd:vname($D)}), 93 | and the placeholder {cmd:{D}} in place of the dependent variable.{p_end} 94 | 95 | {phang2}. {stata "ddml E[D|X], learner(Dhat_ols) vname($D): pystacked {D} $X, type(reg) m(ols)"}{p_end} 96 | {phang2}. {stata "ddml E[D|X], learner(Dhat_lassocv) vname($D): pystacked {D} $X, type(reg) m(lassocv)"}{p_end} 97 | {phang2}. {stata "ddml E[D|X], learner(Dhat_gradboost) vname($D): pystacked {D} $X, type(reg) m(gradboost)"}{p_end} 98 | 99 | {pstd}Short-stacking is requested when cross-fitting. 100 | Short-stacking weights can be examined using {help ddml extract}.{p_end} 101 | 102 | {phang2}. {stata "ddml crossfit, shortstack"}{p_end} 103 | {phang2}. {stata "ddml estimate, robust"}{p_end} 104 | {phang2}. {stata "ddml extract, show(ssweights)"}{p_end} 105 | -------------------------------------------------------------------------------- /ddml_example_partial_pystacked_detailed.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {smcl} 4 | {pstd}{ul:Partially-linear model - Detailed example with stacking regression using {help pystacked}} 5 | 6 | {pstd}Preparation: we load the data, define global macros and set the seed.{p_end} 7 | 8 | {phang2}. {stata "use https://github.com/aahrens1/ddml/raw/master/data/sipp1991.dta, clear"}{p_end} 9 | {phang2}. {stata "global Y net_tfa"}{p_end} 10 | {phang2}. {stata "global D e401"}{p_end} 11 | {phang2}. {stata "global X tw age inc fsize educ db marr twoearn pira hown"}{p_end} 12 | {phang2}. 
{stata "set seed 42"}{p_end} 13 | 14 | {pstd}We next initialize the ddml estimation and select the model. 15 | {it:partial} refers to the partially-linear model. 16 | The model will be stored on a Mata object with the default name "m0" 17 | unless otherwise specified using the {opt mname(name)} option.{p_end} 18 | 19 | {pstd}We set the number of random folds to 2 so that 20 | the model runs quickly. The default is {opt kfolds(5)}. We recommend 21 | considering at least 5-10 folds and even more if your sample size is small.{p_end} 22 | 23 | {pstd}We recommend re-running the model multiple times on 24 | different random folds; see option {opt reps(integer)}. 25 | Here we set the number of repetitions to 2, again only so that the model runs quickly.{p_end} 26 | 27 | {phang2}. {stata "ddml init partial, kfolds(2) reps(2)"}{p_end} 28 | 29 | {pstd}Stacking regression is a simple and powerful method for 30 | combining predictions from multiple learners. 31 | Here we use {help pystacked} with the partially-linear model, 32 | but it can be used with any model supported by {cmd:ddml}.{p_end} 33 | 34 | {pstd}Note: the additional support provided by {opt ddml} for {help pystacked} (see {help ddml##pystacked:above}) 35 | is available only if, as in this example, {help pystacked} is the only learner for each conditional expectation. 36 | Multiple learners are provided to {help pystacked}, not directly to {opt ddml}.{p_end} 37 | 38 | {pstd}Add supervised machine learners for estimating conditional expectations. 39 | The first learner in the stacked ensemble is OLS. 40 | We also use cross-validated lasso, ridge and two random forests with different settings, 41 | which we save in the following macros:{p_end} 42 | 43 | {phang2}. {stata "global rflow max_features(5) min_samples_leaf(1) max_samples(.7)"}{p_end} 44 | {phang2}. {stata "global rfhigh max_features(5) min_samples_leaf(10) max_samples(.7)"}{p_end} 45 | 46 | {phang2}. {stata "ddml E[Y|X]: pystacked $Y $X || method(ols) || method(lassocv) || method(ridgecv) || method(rf) opt($rflow) || method(rf) opt($rfhigh), type(reg)"}{p_end} 47 | {phang2}. {stata "ddml E[D|X]: pystacked $D $X || method(ols) || method(lassocv) || method(ridgecv) || method(rf) opt($rflow) || method(rf) opt($rfhigh), type(reg)"}{p_end} 48 | 49 | {pstd}Note: Options before ":" and after the first comma refer to {cmd:ddml}. 50 | Options that come after the final comma refer to the estimation command. 51 | Make sure not to confuse the two types of options.{p_end} 52 | 53 | {pstd}Check if learners were correctly added:{p_end} 54 | 55 | {phang2}. {stata "ddml desc, learners"}{p_end} 56 | 57 | {pstd} Cross-fitting: The learners are iteratively fitted on the training data. 58 | This step may take a while, depending on the number of learners, repetitions, folds, etc. 59 | In addition to the standard stacking done by {help pystacked}, 60 | also request short-stacking to be done by {opt ddml}. 61 | Whereas stacking relies on (out-of-sample) cross-validated predicted values 62 | to obtain the relative weights for the base learners, 63 | short-stacking uses the (out-of-sample) cross-fitted predicted values.{p_end} 64 | 65 | {phang2}. {stata "ddml crossfit, shortstack"}{p_end} 66 | 67 | {pstd}Finally, we estimate the coefficients of interest.{p_end} 68 | 69 | {phang2}. {stata "ddml estimate, robust"}{p_end} 70 | 71 | {pstd}Examine the standard ({cmd:pystacked}) stacking weights as well as the {opt ddml} short-stacking weights.{p_end} 72 | 73 | {phang2}. 
{stata "ddml extract, show(stweights)"}{p_end} 74 | {phang2}. {stata "ddml extract, show(ssweights)"}{p_end} 75 | 76 | {pstd}Replicate the {opt ddml estimate} short-stacking results for resample 2 by hand, 77 | using the estimated conditional expectations generated by {opt ddml}, 78 | and compare using {opt ddml estimate, replay}:{p_end} 79 | 80 | {phang2}. {stata "cap drop Yresid"}{p_end} 81 | {phang2}. {stata "cap drop Dresid"}{p_end} 82 | {phang2}. {stata "gen double Yresid = $Y - Y_net_tfa_ss_2"}{p_end} 83 | {phang2}. {stata "gen double Dresid = $D - D_e401_ss_2"}{p_end} 84 | {phang2}. {stata "regress Yresid Dresid, robust"}{p_end} 85 | {phang2}. {stata "ddml estimate, mname(m0) spec(ss) rep(2) notable replay"}{p_end} 86 | 87 | {pstd}Obtain the estimated coefficient using ridge - the 3rd {help pystacked} learner - 88 | as the only learner for the 2nd cross-fit estimation (resample 2), 89 | using the estimated conditional expectations generated by {opt ddml} and {help pystacked}. 90 | This can be done using {opt ddml estimate} with the {opt y(.)} and {opt d(.)} options: 91 | "L3" means the 3rd learner and "_2" means resample 2. 92 | Then replicate by hand.{p_end} 93 | 94 | {phang2}. {stata "ddml estimate, y(Y1_pystacked_L3_2) d(D1_pystacked_L3_2) robust"}{p_end} 95 | {phang2}. {stata "cap drop Yresid"}{p_end} 96 | {phang2}. {stata "cap drop Dresid"}{p_end} 97 | {phang2}. {stata "gen double Yresid = $Y - Y1_pystacked_L3_2"}{p_end} 98 | {phang2}. {stata "gen double Dresid = $D - D1_pystacked_L3_2"}{p_end} 99 | {phang2}. {stata "regress Yresid Dresid, robust"}{p_end} 100 | -------------------------------------------------------------------------------- /cert/ddml_cert_partial_iv.do: -------------------------------------------------------------------------------- 1 | clear all 2 | 3 | if ("`c(username)'"=="kahrens") { 4 | adopath + "/Users/kahrens/MyProjects/ddml" 5 | adopath + "/Users/kahrens/MyProjects/pystacked" 6 | } 7 | 8 | cap cd "/Users/kahrens/MyProjects/ddml/cert" 9 | cap cd "C:\LocalStore\ecomes\Documents\GitHub\ddml\cert" 10 | 11 | cap log close 12 | log using "ddml_cert_partial_iv", replace text 13 | 14 | which ddml 15 | mata: whichddml() 16 | 17 | use https://statalasso.github.io/dta/AJR.dta, clear 18 | 19 | // necessary programs for cert; script exits with error if not installed 20 | findfile pystacked.ado 21 | 22 | set seed 123 23 | 24 | ******************************************************************************** 25 | **** Partially-linear IV model. 
**** 26 | ******************************************************************************** 27 | 28 | global Y logpgp95 29 | global X edes1975 temp* humid* steplow-oilres 30 | global D avexpr 31 | global Z1 lat_abst 32 | global Z2 logem4 33 | 34 | *** pystacked, no SS 35 | 36 | *** initialise ddml and select model; 37 | ddml init iv, kfolds(2) reps(2) 38 | ddml E[Y|X]: pystacked $Y $X , type(reg) 39 | ddml E[D|X]: pystacked $D $X , type(reg) 40 | ddml E[Z|X]: pystacked $Z1 $X , type(reg) 41 | ddml E[Z|X]: pystacked $Z2 $X , type(reg) 42 | ddml crossfit 43 | ddml estimate, robust 44 | *** replay 45 | ddml estimate, mname(m0) spec(st) rep(1) replay notable 46 | *** append, estimate, replay 47 | ddml sample, append(1) 48 | ddml crossfit 49 | ddml estimate 50 | *** replay 51 | ddml estimate, mname(m0) spec(st) rep(1) replay notable 52 | ddml estimate, mname(m0) spec(st) rep(2) replay notable 53 | ddml estimate, mname(m0) spec(st) rep(mn) replay notable 54 | ddml estimate, mname(m0) spec(st) rep(md) replay notable 55 | 56 | *** pystacked, SS 57 | 58 | *** initialise ddml and select model; 59 | ddml init iv, kfolds(2) reps(2) 60 | ddml E[Y|X]: pystacked $Y $X , type(reg) 61 | ddml E[D|X]: pystacked $D $X , type(reg) 62 | ddml E[Z|X]: pystacked $Z1 $X , type(reg) 63 | ddml E[Z|X]: pystacked $Z2 $X , type(reg) 64 | ddml crossfit, shortstack poolstack 65 | *** estimation of parameter of interest 66 | ddml estimate, robust 67 | *** replay 68 | ddml estimate, mname(m0) spec(st) rep(1) replay notable 69 | ddml estimate, mname(m0) spec(ss) rep(1) replay notable 70 | ddml estimate, mname(m0) spec(ps) rep(1) replay notable 71 | *** append, estimate, replay 72 | ddml sample, append(1) 73 | ddml crossfit, shortstack poolstack 74 | ddml estimate 75 | *** replay 76 | ddml estimate, mname(m0) spec(st) rep(1) replay notable 77 | ddml estimate, mname(m0) spec(st) rep(2) replay notable 78 | ddml estimate, mname(m0) spec(st) rep(mn) replay notable 79 | ddml estimate, mname(m0) spec(st) rep(md) replay notable 80 | ddml estimate, mname(m0) spec(ss) rep(1) replay notable 81 | ddml estimate, mname(m0) spec(ss) rep(2) replay notable 82 | ddml estimate, mname(m0) spec(ss) rep(mn) replay notable 83 | ddml estimate, mname(m0) spec(ss) rep(md) replay notable 84 | ddml estimate, mname(m0) spec(ps) rep(1) replay notable 85 | ddml estimate, mname(m0) spec(ps) rep(2) replay notable 86 | ddml estimate, mname(m0) spec(ps) rep(mn) replay notable 87 | ddml estimate, mname(m0) spec(ps) rep(md) replay notable 88 | 89 | *** multiple learners, no SS 90 | ddml init iv, kfolds(2) reps(2) 91 | ddml E[Y|X]: pystacked $Y $X , type(reg) 92 | ddml E[Y|X]: reg $Y $X 93 | ddml E[D|X]: pystacked $D $X , type(reg) 94 | ddml E[D|X]: reg $D $X 95 | ddml E[Z|X]: pystacked $Z1 $X , type(reg) 96 | ddml E[Z|X]: reg $Z1 $X 97 | ddml crossfit 98 | ddml estimate, robust 99 | ddml estimate, mname(m0) spec(mse) rep(1) replay notable 100 | *** allcombos 101 | ddml estimate, allcombos 102 | forvalues i=1/4 { 103 | ddml estimate, mname(m0) spec(`i') rep(1) replay notable 104 | } 105 | *** append, estimate, replay 106 | ddml sample, append(1) 107 | ddml crossfit 108 | ddml estimate 109 | ddml estimate, mname(m0) spec(mse) rep(1) replay notable 110 | ddml estimate, mname(m0) spec(mse) rep(2) replay notable 111 | ddml estimate, mname(m0) spec(mse) rep(mn) replay notable 112 | ddml estimate, mname(m0) spec(mse) rep(md) replay notable 113 | *** allcombos 114 | ddml estimate, allcombos 115 | forvalues i=1/4 { 116 | forvalues r=1/2 { 117 | ddml estimate, mname(m0) 
spec(`i') rep(`r') replay notable 118 | } 119 | } 120 | ddml estimate, mname(m0) spec(mse) rep(mn) replay notable 121 | ddml estimate, mname(m0) spec(mse) rep(md) replay notable 122 | 123 | *** multiple learners, SS 124 | ddml init iv, kfolds(2) reps(2) 125 | ddml E[Y|X]: pystacked $Y $X , type(reg) 126 | ddml E[Y|X]: reg $Y $X 127 | ddml E[D|X]: pystacked $D $X , type(reg) 128 | ddml E[D|X]: reg $D $X 129 | ddml E[Z|X]: pystacked $Z1 $X , type(reg) 130 | ddml E[Z|X]: reg $Z1 $X 131 | ddml crossfit, shortstack 132 | ddml estimate, robust 133 | ddml estimate, mname(m0) spec(mse) rep(1) replay notable 134 | ddml estimate, mname(m0) spec(ss) rep(1) replay notable 135 | *** allcombos 136 | ddml estimate, allcombos 137 | forvalues i=1/4 { 138 | ddml estimate, mname(m0) spec(`i') rep(1) replay notable 139 | } 140 | ddml estimate, mname(m0) spec(ss) rep(1) replay notable 141 | *** append, estimate, replay 142 | ddml sample, append(1) 143 | ddml crossfit, shortstack 144 | ddml estimate 145 | ddml estimate, mname(m0) spec(mse) rep(1) replay notable 146 | ddml estimate, mname(m0) spec(mse) rep(2) replay notable 147 | ddml estimate, mname(m0) spec(mse) rep(mn) replay notable 148 | ddml estimate, mname(m0) spec(mse) rep(md) replay notable 149 | ddml estimate, mname(m0) spec(ss) rep(1) replay notable 150 | ddml estimate, mname(m0) spec(ss) rep(2) replay notable 151 | ddml estimate, mname(m0) spec(ss) rep(mn) replay notable 152 | ddml estimate, mname(m0) spec(ss) rep(md) replay notable 153 | *** allcombos 154 | ddml estimate, allcombos 155 | forvalues i=1/4 { 156 | forvalues r=1/2 { 157 | ddml estimate, mname(m0) spec(`i') rep(`r') replay notable 158 | } 159 | } 160 | ddml estimate, mname(m0) spec(mse) rep(1) replay notable 161 | ddml estimate, mname(m0) spec(mse) rep(2) replay notable 162 | ddml estimate, mname(m0) spec(mse) rep(mn) replay notable 163 | ddml estimate, mname(m0) spec(mse) rep(md) replay notable 164 | ddml estimate, mname(m0) spec(ss) rep(1) replay notable 165 | ddml estimate, mname(m0) spec(ss) rep(2) replay notable 166 | ddml estimate, mname(m0) spec(ss) rep(mn) replay notable 167 | ddml estimate, mname(m0) spec(ss) rep(md) replay notable 168 | 169 | log close 170 | -------------------------------------------------------------------------------- /ddml_init.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {viewerjumpto "Syntax" "ddml_init##syntax"}{...} 4 | {viewerjumpto "Options" "ddml_init##options"}{...} 5 | {viewerjumpto "Installation" "ddml_init##installation"}{...} 6 | {viewerjumpto "References" "ddml_init##references"}{...} 7 | {viewerjumpto "Authors" "ddml_init##authors"}{...} 8 | {vieweralsosee "ddml main page" "ddml"}{...} 9 | {vieweralsosee "Other" "ddml_init##also_see"}{...} 10 | {hline} 11 | {cmd:help ddml init, ddml eq, ddml sample}{right: v1.4.4} 12 | {hline} 13 | 14 | {title:ddml init, eq and sample commands for Double Debiased Machine Learning} 15 | 16 | {pstd} 17 | {opt ddml} implements algorithms for causal inference aided by supervised 18 | machine learning as proposed in 19 | {it:Double/debiased machine learning for treatment and structural parameters} 20 | (Econometrics Journal, 2018). Five different models are supported, allowing for 21 | binary or continuous treatment variables and endogeneity, high-dimensional 22 | controls and/or instrumental variables. 
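{pstd} The three commands documented here fit into the following typical calling sequence: {opt ddml init} to choose the model, one or more {cmd:ddml} {it:eq} commands to register learners, then cross-fitting and estimation. A minimal sketch for the partially-linear model (variable names and learner settings are illustrative): {phang2}{cmd:. ddml init partial, kfolds(5)}{p_end} {phang2}{cmd:. ddml E[Y|X]: pystacked Y X1-X10, type(reg)}{p_end} {phang2}{cmd:. ddml E[D|X]: pystacked D X1-X10, type(reg)}{p_end} {phang2}{cmd:. ddml crossfit}{p_end} {phang2}{cmd:. ddml estimate, robust}{p_end}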
23 | 24 | {pstd} 25 | {opt ddml init} {it:model} initializes the model, 26 | where {it:model} is either {it:partial}, {it:iv}, {it:interactive}, {it:fiv}, or {it:interactiveiv}. 27 | 28 | {pstd} 29 | {cmd: ddml eq: command} adds supervised ML programs for estimating conditional expectations, 30 | where {it:eq} is the conditional expectation to be estimated (e.g., {it:E[Y|X]}) 31 | and {it:command} is a supported supervised ML program. 32 | 33 | {pstd} 34 | {opt ddml sample} adds cross-fitting repetitions to an existing and possibly already-estimated model. 35 | 36 | 37 | {marker syntax}{...} 38 | {title:Syntax} 39 | 40 | {p 8 14}{cmd:ddml init} 41 | {it:model} [if] [in] 42 | [ , {opt mname(name)} 43 | {opt prefix} 44 | {opt kfolds(integer)} 45 | {opt fcluster(varname)} 46 | {opt foldvar(varlist)} 47 | {opt reps(integer)} 48 | {opt norandom} 49 | {opt tabfold} 50 | {opt vars(varlist)}{bind: ]} 51 | 52 | {pstd} 53 | where {it:model} is either {it:partial}, {it:iv}, {it:interactive}, {it:fiv}, or {it:interactiveiv}. 54 | 55 | {p 8 14}{cmd:ddml} {it:eq} 56 | [ , {opt mname(name)} 57 | {opt vname(varname)} 58 | {opt l:earner(varname)} 59 | {opt vtype(string)} 60 | {opt predopt(string)}{bind: ] :} 61 | {it:command} {it:depvar} {it:vars} [ , {it:cmdopt}{bind: ]} 62 | 63 | {pstd} 64 | where, depending on the model chosen in Step 1, 65 | {it:eq} is either 66 | {it:E[Y|X]}, {it:E[Y|D,X]}, {it:E[Y|X,Z]}, {it:E[D|X]}, {it:E[D|X,Z]}, or {it:E[Z|X]}. 67 | {it:command} is a supported supervised ML program (e.g. {helpb pystacked} or {helpb cvlasso}). 68 | 69 | {pstd} 70 | Note: Options before ":" and after the first comma refer to {cmd:ddml}. 71 | Options that come after ":" and the final comma refer to the estimation command. 72 | {p_end} 73 | 74 | {p 8 14}{cmd:ddml sample} [ , {opt append}[{cmd:(}{it:integer}{cmd:)}] {opt foldvar(varlist)} {bind: ]} 75 | 76 | {pstd} 77 | adds cross-fitting repetitions to an existing and possibly already-estimated model, 78 | where the number of additional repetitions is indicated either by {opt append(#)} 79 | or by {opt append} and the cross-fit fold identifiers in {opt foldvar(varlist)}. 80 | 81 | 82 | {marker options}{...} 83 | {synoptset 20}{...} 84 | {synopthdr:init options} 85 | {synoptline} 86 | {synopt:{opt mname(name)}} 87 | name of the DDML model. Allows running multiple DDML 88 | models simultaneously. Defaults to {it:m0}. 89 | {p_end} 90 | {synopt:{opt prefix}} 91 | tells {opt ddml} to prefix the names of all created variables 92 | with the name of the DDML model. 93 | Default is to prefix only the created sample and fold ID variables. 94 | {p_end} 95 | {synopt:{opt kfolds(integer)}} 96 | number of cross-fitting folds. The default is 5. 97 | {p_end} 98 | {synopt:{opt fcluster(varname)}} 99 | cluster identifiers for cluster randomization of random folds. 100 | {p_end} 101 | {synopt:{opt foldvar(varlist)}} 102 | integer variable with user-specified cross-fitting folds (one per cross-fitting repetition). 103 | {p_end} 104 | {synopt:{opt norandom}} 105 | use observations in existing order instead of randomizing before splitting into folds; 106 | if multiple resamples, applies to first resample only; 107 | ignored if user-defined fold variables are provided in {opt foldvar(varlist)}. 108 | {p_end} 109 | {synopt:{opt reps(integer)}} 110 | cross-fitting repetitions, i.e., how often the cross-fitting procedure is 111 | repeated on randomly generated folds. 112 | {p_end} 113 | {synopt:{opt tabfold}} 114 | prints a table with frequency of observations by fold.
115 | {p_end} 116 | {synopt:{opt vars(varlist)}} 117 | tells {opt ddml} that the variables in {it:varlist} are used in the estimation. 118 | Useful if you want the fold split to take account of 119 | observations dropped because of missing values. 120 | {p_end} 121 | {synoptline} 122 | {p2colreset}{...} 123 | {pstd} 124 | 125 | {synoptset 20}{...} 126 | {synopthdr:equation options} 127 | {synoptline} 128 | {synopt:{opt mname(name)}} 129 | name of the DDML model. Defaults to {it:m0}. 130 | {p_end} 131 | {synopt:{opt vname(varname)}} 132 | name of the dependent variable in the reduced form estimation. 133 | This is usually inferred from the command line but is mandatory 134 | for the {it:fiv} model. 135 | {p_end} 136 | {synopt:{opt l:earner(varname)}} 137 | optional name of the variable to be created. 138 | {p_end} 139 | {synopt:{opt vtype(string)}} 140 | (rarely used) optional variable type of the variable to be created. Defaults to {it:double}. 141 | {it:none} can be used to leave the type field blank 142 | (required when using {cmd:ddml} with {helpb rforest}). 143 | {p_end} 144 | {synopt:{opt predopt(string)}} 145 | (rarely used) {cmd:predict} option to be used to get predicted values. 146 | Typical values could be {opt xb} or {opt pr}. Default is 147 | blank. 148 | {p_end} 149 | {synoptline} 150 | {p2colreset}{...} 151 | {pstd} 152 | 153 | {synoptset 20}{...} 154 | {synopthdr:sample options} 155 | {synoptline} 156 | {synopt:{opt mname(name)}} 157 | name of the DDML model. Defaults to {it:m0}. 158 | {p_end} 159 | {synopt:{opt append(#)}} 160 | number of additional resamples to cross-fit. 161 | {p_end} 162 | {synopt:{opt append}} 163 | when no number of resamples to append is provided, it is inferred from the list of fold ID variables in {opt foldvar(varlist)}. 164 | {p_end} 165 | {synopt:{opt foldvar(varlist)}} 166 | integer variable with user-specified cross-fitting folds (one per cross-fitting repetition). 167 | {p_end} 168 | {synoptline} 169 | {p2colreset}{...} 170 | {pstd} 171 | 172 | 173 | {smcl} 174 | INCLUDE help ddml_install_ref_auth 175 | -------------------------------------------------------------------------------- /ddml_example_extract.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 31july2023}{...} 3 | {smcl} 4 | {pstd}{ul:ddml extract utility: Extracting stored information from ddml associative arrays} 5 | 6 | {pstd}The examples below use the partially-linear model 7 | and stacking regression using {helpb pystacked}. 8 | We also request short-stacking. 9 | The model name is the default name "m0". 10 | For simplicity we use {helpb pystacked}'s default learners and settings. 11 | {p_end} 12 | 13 | {pstd}Preparation and estimation:{p_end} 14 | 15 | {phang2}. {stata "use https://github.com/aahrens1/ddml/raw/master/data/sipp1991.dta, clear"}{p_end} 16 | {phang2}. {stata "global X tw age inc fsize educ db marr twoearn pira hown"}{p_end} 17 | {phang2}. {stata "set seed 42"}{p_end} 18 | {phang2}. {stata "ddml init partial, kfolds(3) reps(5)"}{p_end} 19 | {phang2}. {stata "ddml E[Y|X]: pystacked net_tfa $X, type(reg)"}{p_end} 20 | {phang2}. {stata "ddml E[D|X]: pystacked e401 $X, type(reg)"}{p_end} 21 | {phang2}. {stata "ddml crossfit, shortstack"}{p_end} 22 | {phang2}. {stata "ddml estimate, robust"}{p_end} 23 | 24 | {pstd}{ul:{opt show(something)} option examples}{p_end} 25 | 26 | {pstd}{opt show} option examples: report standard (pystacked) and short-stacked weights.
27 | Standard stacking weights displayed here are mean weights across cross-fit folds.{p_end} 28 | 29 | {phang2}. {stata "ddml extract, show(stweights)"}{p_end} 30 | {phang2}. {stata "ddml extract, show(ssweights)"}{p_end} 31 | 32 | {pstd}The {opt show} option leaves results in r(.) macros.{p_end} 33 | 34 | {phang2}. {stata "mat list r(Y_net_tfa_ss)"}{p_end} 35 | {phang2}. {stata "mat list r(D_e401_ss)"}{p_end} 36 | 37 | {pstd}{opt show} option examples: examine the learner weights and MSEs by fold reported by {cmd:pystacked}.{p_end} 38 | 39 | {phang2}. {stata "ddml extract, show(pystacked)"}{p_end} 40 | 41 | {pstd}{ul:List keys examples}{p_end} 42 | 43 | {pstd}List keys of associative arrays used in model m0. 44 | Associative array m0.eqnAA is an "equation AA" and has one key, 45 | which is the name of the variable for which conditional expectations are estimated. 46 | Associative array m0.estAA is an "estimation AA" and has two keys. 47 | The objects stored on this AA are either estimation results, 48 | AAs that have sets of estimation results, or objects with information about the estimations.{p_end} 49 | 50 | {phang2}. {stata "ddml extract, keys"}{p_end} 51 | 52 | {pstd}List keys relating to equation for D variable, e401. 53 | Keys for two associative arrays are reported. 54 | Associative array e401.lrnAA is a "learner AA" and has two keys; it stores e.g. an estimation specification. 55 | Associative array e401.resAA is a "results AA" and has three keys; it stores e.g. estimation results.{p_end} 56 | 57 | {phang2}. {stata "ddml extract, keys vname(e401)"}{p_end} 58 | 59 | {pstd}{ul:Working with model estimation results}{p_end} 60 | 61 | {pstd}Extract the estimated beta for the short-stack specification ("ss"), resample 2. 62 | Provide the keys for the AA with the results for the specification and resampling, 63 | and the subkeys for this AA to obtain the posted beta.{p_end} 64 | 65 | {phang2}. {stata "ddml extract, key1(ss) key2(2) subkey1(b) subkey2(post)"}{p_end} 66 | 67 | {pstd}As above, but store as a Mata object "bmat". 68 | This is done by providing this name after "ddml extract".{p_end} 69 | 70 | {phang2}. {stata "ddml extract bmat, key1(ss) key2(2) subkey1(b) subkey2(post)"}{p_end} 71 | {phang2}. {stata "mata: bmat"}{p_end} 72 | 73 | {pstd}By default, the object is saved as a Mata object. 74 | To save as a Stata matrix r(bmat), use the {opt stata} option:{p_end} 75 | 76 | {phang2}. {stata "ddml extract bmat, key1(ss) key2(2) subkey1(b) subkey2(post) stata"}{p_end} 77 | {phang2}. {stata "mat list r(bmat)"}{p_end} 78 | 79 | {pstd}More examples of the above, relating to specification ss and 80 | various resamples or the mean/median across resamples. 81 | (The list of available results was already displayed above by {stata "ddml extract, keys"}.){p_end} 82 | 83 | {phang2}. {stata "ddml extract, key1(ss) key2(1) subkey1(D_e401_ss_mse) subkey2(scalar)"}{p_end} 84 | {phang2}. {stata "ddml extract, key1(ss) key2(2) subkey1(D_e401_ss_mse_folds) subkey2(matrix)"}{p_end} 85 | {phang2}. {stata "ddml extract, key1(ss) key2(mn) subkey1(V) subkey2(post)"}{p_end} 86 | {phang2}. {stata "ddml extract, key1(ss) key2(md) subkey1(title) subkey2(local)"}{p_end} 87 | 88 | {pstd}{ul:Working with equation estimation results}{p_end} 89 | 90 | {pstd}Display information stored on learner AA e401.lrnAA 91 | about the specification of conditional expectations for variable e401.{p_end} 92 | 93 | {phang2}. {stata "ddml extract, vname(e401) key1(D1_pystacked) key2(est_main)"}{p_end} 94 | {phang2}. 
{stata "ddml extract, vname(e401) key1(D1_pystacked) key2(stack_base_est)"}{p_end} 95 | 96 | {pstd}Display information stored on results AA e401.resAA 97 | about the estimation results for resamplings 1 and 2.{p_end} 98 | 99 | {phang2}. {stata "ddml extract, vname(e401) key1(D1_pystacked) key2(MSE_folds) key3(1)"}{p_end} 100 | {phang2}. {stata "ddml extract, vname(e401) key1(D1_pystacked) key2(MSE_folds) key3(2)"}{p_end} 101 | {phang2}. {stata "ddml extract, vname(e401) key1(D1_pystacked) key2(stack_weights) key3(1)"}{p_end} 102 | {phang2}. {stata "ddml extract, vname(e401) key1(D1_pystacked) key2(stack_weights) key3(2)"}{p_end} 103 | 104 | {pstd}{ul:Working directly with an equation associative array}{p_end} 105 | 106 | {pstd}Extract the associative AA for the estimation of conditional expectations for variable e401. 107 | Store it as a Mata object called AA_e401. 108 | Note: the {cmd:crossfit} command returns an equation associative array, 109 | so this step is unnecessary when using this command.{p_end} 110 | 111 | {phang2}. {stata "ddml extract AA_e401, vname(e401)"}{p_end} 112 | {phang2}. {stata "mata: AA_e401"}{p_end} 113 | 114 | {pstd}Examples of working with this equation associative array. 115 | Note that the {opt ename} option must be used.{p_end} 116 | 117 | {phang2}. {stata "ddml extract, ename(AA_e401) key1(D1_pystacked) key2(MSE) key3(1)"}{p_end} 118 | {phang2}. {stata "ddml extract, ename(AA_e401) key1(D1_pystacked) key2(MSE) key3(2)"}{p_end} 119 | 120 | {pstd}{ul:Using Mata's associative array commands}{p_end} 121 | 122 | {pstd}If preferred, Mata's associative array commands can be used directly. 123 | Note that all keys are strings.{p_end} 124 | 125 | {phang2}. {stata "mata: m0.estAA.keys()"}{p_end} 126 | {phang2}. {stata `"mata: AA_e1_r2 = (m0.estAA).get(("ss","2"))"'}{p_end} 127 | {phang2}. {stata "mata: AA_e1_r2.keys()"}{p_end} 128 | {phang2}. {stata `"mata: AA_e1_r2.get(("b","post"))"'}{p_end} 129 | -------------------------------------------------------------------------------- /_ddml_sample.ado: -------------------------------------------------------------------------------- 1 | *! ddml v1.4.4 2 | *! last edited: 30aug2024 3 | *! 
authors: aa/ms 4 | 5 | program _ddml_sample, sortpreserve // sortpreserve needed for fold IDs that respect clustering 6 | version 16 7 | 8 | syntax [if] [in] , mname(name) [ /// 9 | foldvar(varlist) /// optional list of variables indicating folds, one per rep 10 | reps(integer 0) /// default=1 below 11 | APPEND1 /// option abbrev is "append", allowing both "append" and "append(#)" 12 | append(integer 0) /// 13 | NORANDOM /// first fold ID uses obs in existing order 14 | vars(varlist) /// 15 | kfolds(integer 0) /// default=5 below 16 | tabfold /// 17 | ] 18 | 19 | // incompatible options 20 | if "`foldvar'"~="" & `reps' { 21 | di as err "error - incompatible options, foldvar(`foldvar') and reps(`reps')" 22 | exit 198 23 | } 24 | if "`foldvar'"~="" & `kfolds' { 25 | di as err "error - incompatible options, foldvar(`foldvar') and kfolds(`kfolds')" 26 | exit 198 27 | } 28 | if `reps' & `append' { 29 | di as err "error - incompatible options, append(`append') and reps(`reps')" 30 | exit 198 31 | } 32 | if "`append1'"~="" & `append' { 33 | di as err "error - incompatible options, append and append(`append')" 34 | exit 198 35 | } 36 | if "`append1'"~="" & "`foldvar'"=="" { 37 | di as err "error - append option with no argument requires foldvar(varlist)" 38 | exit 198 39 | } 40 | 41 | // syntax checks and defaults 42 | if `kfolds'<0 { 43 | di as err "error - invalid kfolds(`kfolds'); must be an integer > 1" 44 | exit 198 45 | } 46 | else if `kfolds'==0 & "`foldvar'"=="" { 47 | // default number of folds unless foldvar is provided 48 | local kfolds=5 49 | } 50 | if `reps'<0 { 51 | di as err "error - invalid reps(`reps'); must be an integer > 0" 52 | exit 198 53 | } 54 | else if `reps'==0 & "`foldvar'"=="" { 55 | // default number of reps unless foldvar is provided 56 | local kfolds =5 57 | local reps =1 58 | } 59 | else if `reps'==0 { 60 | local reps : word count `foldvar' 61 | } 62 | 63 | 64 | // update append macro; append1 macro not needed after this 65 | if "`append1'"~="" { 66 | // update append macro to have number of appended resamples from #foldvars 67 | local append : word count `foldvar' 68 | } 69 | 70 | // if appending, reps and kfolds = current setting for model 71 | if `append' { 72 | mata: st_local("reps", strofreal(`mname'.nreps)) 73 | mata: st_local("kfolds", strofreal(`mname'.kfolds)) 74 | } 75 | 76 | // clear all results (crossfits and estimation) or just estimations 77 | if `append'==0 { 78 | // clear any preexisting equation results from the model struct 79 | mata: clear_model_results(`mname') 80 | } 81 | else { 82 | // keep crossfit equations but clear estimation results 83 | mata: clear_model_estimation(`mname') 84 | } 85 | // set reps, firstrep, lastrep 86 | if `append' { 87 | // if appending, reps=prev reps setting for model, firstrep=reps+1, lastrep=reps+append 88 | local firstrep = `reps'+1 89 | local lastrep = `reps'+`append' 90 | } 91 | else { 92 | // will create full set of foldvars for all resamples 93 | local firstrep = 1 94 | local lastrep = `reps' 95 | } 96 | 97 | // estimation sample 98 | marksample touse 99 | if "`vars'" ~= "" & `append'==0 { 100 | // set sample indicator to 0 if obs have missings, unless appending to existing model 101 | fvunab vars : `vars' 102 | markout `touse' `vars' 103 | // add list of vars to model struct 104 | mata: `mname'.strDatavars = "`vars'" 105 | } 106 | if `append' { 107 | // replace touse macro with pre-existing sample variable 108 | local touse `mname'_sample 109 | } 110 | 111 | // tempvar fclustid is either defined using
fclustvar or is equal to _n. 112 | tempvar fclustid 113 | mata: st_local("fclustvar", `mname'.fclustvar) 114 | if "`fclustvar'"=="" { 115 | qui gen double `fclustid'=_n 116 | } 117 | else { 118 | qui egen double `fclustid' = group(`fclustvar') 119 | } 120 | 121 | *** gen folds 122 | // create foldvar 123 | // Stata name will be mname_fid with _m as rep extension 124 | 125 | // delete existing foldvars, unless appending to existing model 126 | if `append'==0 cap drop `mname'_fid* 127 | 128 | if "`foldvar'"=="" { 129 | forvalues m=`firstrep'/`lastrep' { 130 | *** gen folds 131 | tempvar uni cuni tag 132 | // tag one ob per fold cluster; if no clustering, all obs are tagged 133 | qui egen `tag' = tag(`fclustid') if `mname'_sample 134 | if `m'==1 & "`norandom'"~="" { 135 | qui gen `uni' = _n if `mname'_sample & `tag' 136 | local labtext "Fold ID (original order), rep `m'" 137 | } 138 | else { 139 | qui gen double `uni' = runiform() if `mname'_sample & `tag' 140 | local labtext "Fold ID (randomly generated), rep `m'" 141 | } 142 | qui cumul `uni' if `mname'_sample, gen(`cuni') 143 | // create equal-sized folds (#obs or #cluster) 144 | qui egen long `mname'_fid_`m' = cut(`uni'), group(`kfolds') 145 | sort `fclustid' `tag' 146 | // propagate random uniforms (last ob in fcluster) within fclusters 147 | qui by `fclustid': replace `mname'_fid_`m'=`mname'_fid_`m'[_N] if `mname'_sample 148 | qui replace `mname'_fid_`m' = `mname'_fid_`m' + 1 149 | label var `mname'_fid_`m' "`labtext'" 150 | } 151 | } 152 | else { 153 | local pos = 1 154 | forvalues m=`firstrep'/`lastrep' { 155 | local vname : word `pos' of `foldvar' 156 | // check that fold var is legit 157 | cap count if `vname' < . & `touse' 158 | if _rc > 0 { 159 | di as err "error - fold variable `vname' does not exist or is not a valid identifier" 160 | exit 198 161 | } 162 | qui count if `vname'==. & `touse' 163 | if r(N)>0 { 164 | di as res "note - fold variable missing for some observations" 165 | di as res "these observations will be excluded from the estimation sample" 166 | qui replace `touse' = 0 if `vname'==.
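// `touse' is the working estimation-sample marker; once all user-supplied // fold variables have been processed, the model's sample indicator // `mname'_sample is updated to match `touse' (see below)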
167 | } 168 | qui egen long `mname'_fid_`m' = group(`vname') 169 | label var `mname'_fid_`m' "(based on `vname')" 170 | // enforce that the number of folds is the same for all fold vars 171 | qui tab `mname'_fid_`m' 172 | if `m'==1 { 173 | // initialize kfolds for checking 174 | local kfolds = r(r) 175 | } 176 | else { 177 | if r(r)~=`kfolds' { 178 | di as err "error - fold variables must have same number of folds" 179 | exit 198 180 | } 181 | } 182 | local ++pos 183 | } 184 | // update sample indicator to account for missing fold vars 185 | qui replace `mname'_sample = `touse' 186 | } 187 | 188 | // update model struct 189 | if `append' { 190 | local reps = `reps'+`append' 191 | } 192 | else { 193 | mata: `mname'.kfolds = `kfolds' 194 | } 195 | mata: `mname'.nreps = `reps' 196 | 197 | forvalues m=1/`reps' { 198 | if ("`tabfold'"!="") { 199 | di 200 | di "Overview of frequencies by fold (sample `m'):" 201 | tab `mname'_fid_`m' if `mname'_sample 202 | di 203 | } 204 | } 205 | 206 | 207 | end 208 | 209 | mata: 210 | 211 | struct eqnStruct init_eqnStruct() 212 | { 213 | struct eqnStruct scalar e 214 | return(e) 215 | } 216 | 217 | end 218 | -------------------------------------------------------------------------------- /cert/qddml_cert.do: -------------------------------------------------------------------------------- 1 | clear all 2 | 3 | if ("`c(username)'"=="kahrens") { 4 | adopath + "/Users/kahrens/MyProjects/pystacked" 5 | cd "/Users/kahrens/MyProjects/ddml/cert" 6 | } 7 | 8 | cap log close 9 | log using "qddml_cert", replace text 10 | 11 | * global tol = 0.0001 12 | which ddml 13 | which pystacked 14 | 15 | **** Partially linear model w/pystacked integration 16 | 17 | use https://github.com/aahrens1/ddml/raw/master/data/sipp1991.dta, clear 18 | global Y net_tfa 19 | global D e401 20 | global X tw age inc fsize educ db marr twoearn pira hown 21 | 22 | // basic qddml usage with no std stacking and default settings 23 | set seed 42 24 | qddml $Y $D ($X), kfolds(2) model(partial) 25 | 26 | // replication example 27 | set seed 42 28 | ddml init partial, kfolds(2) 29 | ddml E[Y|X]: pystacked $Y $X, type(reg) 30 | ddml E[D|X]: pystacked $D $X, type(reg) 31 | // ddml reports only standard stacking by default 32 | ddml crossfit, shortstack 33 | ddml estimate 34 | global b1_ss = _b[$D] 35 | ddml estimate, mname(m0) spec(st) rep(1) notable replay 36 | global b1_std = _b[$D] 37 | 38 | set seed 42 39 | // qddml reports only short-stacking by default 40 | qddml $Y $D ($X), kfolds(2) model(partial) stdstack shortstack mname(m0q) 41 | global b2_ss = _b[$D] 42 | ddml estimate, mname(m0q) spec(st) rep(1) notable replay 43 | global b2_std = _b[$D] 44 | 45 | assert reldif($b1_ss, $b2_ss) < 10e-10 46 | assert reldif($b1_std,$b2_std) < 10e-10 47 | 48 | **** Partially linear IV model, rforest 49 | 50 | use https://statalasso.github.io/dta/AJR.dta, clear 51 | global Y logpgp95 52 | global D avexpr 53 | global Z logem4 54 | global X lat_abst edes1975 avelf temp* humid* steplow-oilres 55 | 56 | set seed 42 57 | ddml init iv, kfolds(30) 58 | ddml E[Y|X], vtype(none): rforest $Y $X, type(reg) 59 | ddml E[D|X], vtype(none): rforest $D $X, type(reg) 60 | ddml E[Z|X], vtype(none): rforest $Z $X, type(reg) 61 | ddml crossfit 62 | ddml estimate, robust 63 | global b1 = _b[$D] 64 | 65 | set seed 42 66 | qddml $Y ($X) ($D=$Z), kfolds(30) model(iv) mname(m0q) /// 67 | cmd(rforest) cmdopt(type(reg)) vtype(none) robust 68 | global b2 = _b[$D] 69 | 70 | assert reldif($b1,$b2) < 10e-10 71 | 72 | **** Interactive model--ATE 
and ATET estimation w/pystacked integration 73 | 74 | webuse cattaneo2, clear 75 | keep in 1/1000 76 | global Y bweight 77 | global D mbsmoke 78 | global X mage prenatal1 mmarried fbaby medu 79 | global pystacked_y_options type(reg) method(rf gradboost) 80 | global pystacked_d_options type(class) method(rf gradboost) 81 | 82 | // basic qddml usage with no std stacking and default settings 83 | set seed 42 84 | qddml $Y $D ($X), kfolds(2) model(interactive) 85 | 86 | // replication example 87 | set seed 42 88 | ddml init interactive, kfolds(2) reps(2) 89 | ddml E[Y|X,D]: pystacked $Y $X, $pystacked_y_options 90 | ddml E[D|X]: pystacked $D $X, $pystacked_d_options 91 | // ddml reports only standard stacking by default 92 | ddml crossfit, shortstack 93 | ddml estimate 94 | global b1_md_ss = _b[$D] 95 | ddml estimate, mname(m0) spec(st) rep(md) notable replay 96 | global b1_md_std = _b[$D] 97 | ddml estimate, atet 98 | global b1_atet_md_ss = _b[$D] 99 | ddml estimate, mname(m0) spec(st) rep(md) notable replay 100 | global b1_atet_md_std = _b[$D] 101 | 102 | set seed 42 103 | // qddml reports only short-stacking by default 104 | qddml $Y $D ($X), kfolds(2) reps(2) model(interactive) /// 105 | mname(m0q) /// 106 | stdstack shortstack /// 107 | pystacked_y($pystacked_y_options) /// 108 | pystacked_d($pystacked_d_options) 109 | global b2_md_ss = _b[$D] 110 | ddml estimate, mname(m0q) spec(st) rep(md) notable replay 111 | global b2_md_std = _b[$D] 112 | ddml estimate, atet mname(m0q) 113 | global b2_atet_md_ss = _b[$D] 114 | ddml estimate, mname(m0q) spec(st) rep(md) notable replay 115 | global b2_atet_md_std = _b[$D] 116 | 117 | assert reldif($b1_md_ss, $b2_md_ss) < 10e-10 118 | assert reldif($b1_md_std, $b2_md_std) < 10e-10 119 | assert reldif($b1_atet_md_ss, $b2_atet_md_ss) < 10e-10 120 | assert reldif($b1_atet_md_std, $b2_atet_md_std) < 10e-10 121 | 122 | **** Interactive IV model w/pystacked integration 123 | 124 | use http://fmwww.bc.edu/repec/bocode/j/jtpa.dta, clear 125 | global Y earnings 126 | global D training 127 | global Z assignmt 128 | global X sex age married black hispanic 129 | global pystacked_y_options type(reg) m(lassocv) 130 | global pystacked_d_options type(class) m(lassocv) 131 | global pystacked_z_options type(class) m(lassocv) 132 | 133 | set seed 42 134 | ddml init interactiveiv, kfolds(5) 135 | ddml E[Y|X,Z]: pystacked $Y c.($X)##c.($X), $pystacked_y_options 136 | ddml E[D|X,Z]: pystacked $D c.($X)##c.($X), $pystacked_d_options 137 | ddml E[Z|X]: pystacked $Z c.($X)##c.($X), $pystacked_z_options 138 | ddml crossfit 139 | ddml estimate 140 | global b1 = _b[$D] 141 | 142 | set seed 42 143 | qddml $Y (c.($X)##c.($X)) ($D=$Z), kfolds(5) model(interactiveiv) /// 144 | mname(m0q) /// 145 | pystacked_y($pystacked_y_options) /// 146 | pystacked_d($pystacked_d_options) /// 147 | pystacked_z($pystacked_z_options) 148 | global b2 = _b[$D] 149 | 150 | assert reldif($b1,$b2) < 10e-10 151 | 152 | **** FIV 153 | 154 | use https://github.com/aahrens1/ddml/raw/master/data/BLP.dta, clear 155 | global Y share 156 | global D price 157 | global X hpwt air mpd space 158 | global Z sum* 159 | 160 | // standard stacking by pystacked 161 | set seed 42 162 | ddml init fiv 163 | ddml E[Y|X]: pystacked $Y $X, type(reg) 164 | ddml E[D|Z,X], learner(Dhat_pystacked): pystacked $D $X $Z, type(reg) 165 | ddml E[D|X], learner(Dhat_pystacked) vname($D): pystacked {D} $X, type(reg) 166 | ddml crossfit 167 | ddml estimate 168 | global b1 = _b[$D] 169 | 170 | set seed 42 171 | // enforce standard
stacking using the cmd(.) option 172 | qddml $Y ($X) ($D=$Z), model(fiv) cmd(pystacked) cmdopt(type(reg)) 173 | global b2 = _b[$D] 174 | 175 | assert reldif($b1,$b2) < 10e-10 176 | 177 | ddml extract, show(pystacked) 178 | ddml extract, show(stweights) 179 | 180 | // short-stacking only 181 | use https://github.com/aahrens1/ddml/raw/master/data/BLP.dta, clear 182 | global Y share 183 | global D price 184 | global X hpwt air mpd space 185 | global Z sum* 186 | 187 | set seed 42 188 | ddml init fiv 189 | ddml E[Y|X], learner(Y1_ols): pystacked $Y $X, type(reg) m(ols) 190 | ddml E[Y|X], learner(Y2_lassocv): pystacked $Y $X, type(reg) m(lassocv) 191 | ddml E[Y|X], learner(Y3_gradboost): pystacked $Y $X, type(reg) m(gradboost) 192 | ddml E[D|Z,X], learner(D1_ols): pystacked $D $X $Z, type(reg) m(ols) 193 | ddml E[D|X], learner(D1_ols) vname($D): pystacked {D} $X, type(reg) m(ols) 194 | ddml E[D|Z,X], learner(D2_lassocv): pystacked $D $X $Z, type(reg) m(lassocv) 195 | ddml E[D|X], learner(D2_lassocv) vname($D): pystacked {D} $X, type(reg) m(lassocv) 196 | ddml E[D|Z,X], learner(D3_gradboost): pystacked $D $X $Z, type(reg) m(gradboost) 197 | ddml E[D|X], learner(D3_gradboost) vname($D): pystacked {D} $X, type(reg) m(gradboost) 198 | ddml crossfit, shortstack 199 | ddml estimate 200 | global b1 = _b[$D] 201 | ddml extract, show(ssweights) 202 | mat Yss1 = r(Y_share_ss) 203 | mat Dss1 = r(D_price_ss) 204 | 205 | set seed 42 206 | // short-stacking only by default 207 | qddml $Y ($X) ($D=$Z), model(fiv) 208 | global b2 = _b[$D] 209 | ddml extract, show(ssweights) 210 | mat Yss2 = r(Y_share_ss) 211 | mat Dss2 = r(D_price_ss) 212 | 213 | assert reldif($b1,$b2) < 10e-10 214 | assert mreldif(Yss1,Yss2) < 10e-10 215 | assert mreldif(Dss1,Dss2) < 10e-10 216 | 217 | log close 218 | -------------------------------------------------------------------------------- /cert/ddml_cert_interactiveiv.do: -------------------------------------------------------------------------------- 1 | clear all 2 | 3 | if ("`c(username)'"=="kahrens") { 4 | adopath + "/Users/kahrens/MyProjects/ddml" 5 | adopath + "/Users/kahrens/MyProjects/pystacked" 6 | } 7 | 8 | cap cd "/Users/kahrens/MyProjects/ddml/cert" 9 | cap cd "C:\LocalStore\ecomes\Documents\GitHub\ddml\cert" 10 | 11 | cap log close 12 | log using "ddml_cert_interactiveiv", replace text 13 | 14 | which ddml, all 15 | mata: whichddml() 16 | which pystacked, all 17 | 18 | use "http://fmwww.bc.edu/repec/bocode/j/jtpa.dta",clear 19 | keep in 1/5000 20 | 21 | set seed 123 22 | 23 | ******************************************************************************** 24 | *** interactiveiv **** 25 | ******************************************************************************** 26 | 27 | gen lnearnings = log(earnings) 28 | global Y lnearnings 29 | global D training 30 | global Z assignmt 31 | global X sex-age4554 32 | 33 | *** pystacked, no SS 34 | 35 | *** initialise ddml and select model; 36 | ddml init interactiveiv, kfolds(2) reps(2) 37 | ddml E[Y|X,Z]: pystacked $Y $X, type(reg) method(ols gradboost) 38 | ddml E[D|X,Z]: pystacked $D $X, type(class) method(logit gradboost) 39 | ddml E[Z|X]: pystacked $Z $X, type(class) method(logit gradboost) 40 | ddml crossfit 41 | ddml estimate 42 | *** replay 43 | ddml estimate, mname(m0) spec(st) rep(1) replay notable 44 | *** append, estimate, replay 45 | ddml sample, append(1) 46 | ddml crossfit 47 | ddml estimate 48 | *** replay 49 | ddml estimate, mname(m0) spec(st) rep(1) replay notable 50 | ddml estimate, mname(m0) spec(st) rep(2) 
replay notable 51 | ddml estimate, mname(m0) spec(st) rep(mn) replay notable 52 | ddml estimate, mname(m0) spec(st) rep(md) replay notable 53 | *** ddml extract 54 | ddml extract, show(weights) 55 | ddml extract, show(pystacked) 56 | 57 | *** pystacked, SS 58 | 59 | *** initialise ddml and select model; 60 | ddml init interactiveiv, kfolds(2) reps(2) 61 | ddml E[Y|X,Z]: pystacked $Y $X, type(reg) method(ols gradboost) 62 | ddml E[D|X,Z]: pystacked $D $X, type(class) method(logit gradboost) 63 | ddml E[Z|X]: pystacked $Z $X, type(class) method(logit gradboost) 64 | ddml crossfit, shortstack poolstack 65 | ddml estimate 66 | *** replay 67 | ddml estimate, mname(m0) spec(st) rep(1) replay notable 68 | ddml estimate, mname(m0) spec(ss) rep(1) replay notable 69 | ddml estimate, mname(m0) spec(ps) rep(1) replay notable 70 | *** ddml extract 71 | ddml extract, show(weights) 72 | ddml extract, show(pystacked) 73 | 74 | *** append, estimate, replay 75 | ddml sample, append(1) 76 | ddml crossfit, shortstack poolstack 77 | ddml estimate 78 | *** replay 79 | ddml estimate, mname(m0) spec(st) rep(1) replay notable 80 | ddml estimate, mname(m0) spec(st) rep(2) replay notable 81 | ddml estimate, mname(m0) spec(st) rep(mn) replay notable 82 | ddml estimate, mname(m0) spec(st) rep(md) replay notable 83 | ddml estimate, mname(m0) spec(ss) rep(1) replay notable 84 | ddml estimate, mname(m0) spec(ss) rep(2) replay notable 85 | ddml estimate, mname(m0) spec(ss) rep(mn) replay notable 86 | ddml estimate, mname(m0) spec(ss) rep(md) replay notable 87 | ddml estimate, mname(m0) spec(ps) rep(1) replay notable 88 | ddml estimate, mname(m0) spec(ps) rep(2) replay notable 89 | ddml estimate, mname(m0) spec(ps) rep(mn) replay notable 90 | ddml estimate, mname(m0) spec(ps) rep(md) replay notable 91 | *** ddml extract 92 | ddml extract, show(weights) 93 | ddml extract, show(pystacked) 94 | 95 | *** multiple learners, no SS 96 | 97 | *** initialise ddml and select model; 98 | ddml init interactiveiv, kfolds(2) reps(2) 99 | ddml E[Y|X,Z]: pystacked $Y $X, type(reg) method(gradboost) 100 | ddml E[Y|X,Z]: reg $Y $X 101 | ddml E[D|X,Z]: pystacked $D $X, type(class) method(gradboost) 102 | ddml E[D|X,Z]: logit $D $X 103 | ddml E[Z|X]: pystacked $Z $X, type(class) method(gradboost) 104 | ddml E[Z|X]: logit $Z $X 105 | ddml crossfit 106 | ddml estimate 107 | ddml estimate, mname(m0) spec(mse) rep(1) replay notable 108 | *** allcombos 109 | ddml estimate, allcombos 110 | forvalues i=1/8 { 111 | ddml estimate, mname(m0) spec(`i') rep(1) replay notable 112 | } 113 | *** append, estimate, replay 114 | ddml sample, append(1) 115 | ddml crossfit 116 | ddml estimate 117 | ddml estimate, mname(m0) spec(mse) rep(1) replay notable 118 | ddml estimate, mname(m0) spec(mse) rep(2) replay notable 119 | ddml estimate, mname(m0) spec(mse) rep(mn) replay notable 120 | ddml estimate, mname(m0) spec(mse) rep(md) replay notable 121 | *** allcombos 122 | ddml estimate, allcombos 123 | forvalues i=1/4 { 124 | forvalues r=1/2 { 125 | ddml estimate, mname(m0) spec(`i') rep(`r') replay notable 126 | } 127 | } 128 | ddml estimate, mname(m0) spec(mse) rep(mn) replay notable 129 | ddml estimate, mname(m0) spec(mse) rep(md) replay notable 130 | 131 | *** multiple learners, SS 132 | 133 | *** initialise ddml and select model; 134 | ddml init interactiveiv, kfolds(2) reps(2) 135 | ddml E[Y|X,Z]: pystacked $Y $X, type(reg) method(gradboost) 136 | ddml E[Y|X,Z]: reg $Y $X 137 | ddml E[D|X,Z]: pystacked $D $X, type(class) method(gradboost) 138 | ddml E[D|X,Z]: 
logit $D $X 139 | ddml E[Z|X]: pystacked $Z $X, type(class) method(gradboost) 140 | ddml E[Z|X]: logit $Z $X 141 | ddml crossfit, shortstack 142 | ddml estimate 143 | ddml estimate, mname(m0) spec(mse) rep(1) replay notable 144 | ddml estimate, mname(m0) spec(ss) rep(1) replay notable 145 | *** allcombos 146 | ddml estimate, allcombos 147 | forvalues i=1/8 { 148 | ddml estimate, mname(m0) spec(`i') rep(1) replay notable 149 | } 150 | ddml estimate, mname(m0) spec(ss) rep(1) replay notable 151 | *** append, estimate, replay 152 | ddml sample, append(1) 153 | ddml crossfit, shortstack 154 | ddml estimate 155 | ddml estimate, mname(m0) spec(mse) rep(1) replay notable 156 | ddml estimate, mname(m0) spec(mse) rep(2) replay notable 157 | ddml estimate, mname(m0) spec(mse) rep(mn) replay notable 158 | ddml estimate, mname(m0) spec(mse) rep(md) replay notable 159 | ddml estimate, mname(m0) spec(ss) rep(1) replay notable 160 | ddml estimate, mname(m0) spec(ss) rep(2) replay notable 161 | ddml estimate, mname(m0) spec(ss) rep(mn) replay notable 162 | ddml estimate, mname(m0) spec(ss) rep(md) replay notable 163 | *** allcombos 164 | ddml estimate, allcombos 165 | forvalues i=1/8 { 166 | forvalues r=1/2 { 167 | ddml estimate, mname(m0) spec(`i') rep(`r') replay notable 168 | } 169 | } 170 | ddml estimate, mname(m0) spec(mse) rep(1) replay notable 171 | ddml estimate, mname(m0) spec(mse) rep(2) replay notable 172 | ddml estimate, mname(m0) spec(mse) rep(mn) replay notable 173 | ddml estimate, mname(m0) spec(mse) rep(md) replay notable 174 | ddml estimate, mname(m0) spec(ss) rep(1) replay notable 175 | ddml estimate, mname(m0) spec(ss) rep(2) replay notable 176 | ddml estimate, mname(m0) spec(ss) rep(mn) replay notable 177 | ddml estimate, mname(m0) spec(ss) rep(md) replay notable 178 | *** ddml extract 179 | ddml extract, show(ssweights) 180 | 181 | *** ddml overlap 182 | ddml init interactiveiv, kfolds(2) reps(2) 183 | ddml E[Y|X,Z]: pystacked $Y $X, type(reg) method(ols gradboost) 184 | ddml E[D|X,Z]: pystacked $D $X, type(class) method(logit gradboost) 185 | ddml E[Z|X]: pystacked $Z $X, type(class) method(logit gradboost) 186 | ddml crossfit 187 | ddml estimate 188 | ddml overlap 189 | ddml overlap, replist(1) 190 | ddml overlap, pslist(Z1_pystacked_L1 Z1_pystacked_L2) 191 | ddml overlap, name(triangle, replace) /// 192 | title("Propensity score: triangle kernel") 193 | ddml overlap, kernel(epanechnikov) name(epanechnikov, replace) /// 194 | title("Propensity score: epanechnikov kernel") 195 | 196 | log close 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | -------------------------------------------------------------------------------- /ddml_overview.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {marker models}{...} 4 | {title:Models} 5 | 6 | {pstd} 7 | Throughout we use {it:Y} to denote the outcome variable, 8 | {it:X} to denote confounders, 9 | {it:Z} to denote instrumental variable(s), and 10 | {it:D} to denote the treatment variable(s) of interest. 11 | 12 | {pstd} 13 | {ul:Partially-linear model} [{it:partial}] 14 | 15 | Y = {it:a}.D + g(X) + U 16 | D = m(X) + V 17 | 18 | {pstd} 19 | where the aim is to estimate {it:a} while controlling for X. To this end, 20 | we estimate the conditional expectations 21 | E[Y|X] and E[D|X] using a supervised machine learner. 
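
{pstd}
For example, a minimal sketch of this workflow
(using the AJR data and variables that also appear in this package's certification scripts;
the learner choices are illustrative only):

{phang2}. {stata "use https://statalasso.github.io/dta/AJR.dta, clear"}{p_end}
{phang2}. {stata "ddml init partial, kfolds(2)"}{p_end}
{phang2}. {stata "ddml E[Y|X]: pystacked logpgp95 lat_abst edes1975 avelf, type(reg)"}{p_end}
{phang2}. {stata "ddml E[D|X]: pystacked avexpr lat_abst edes1975 avelf, type(reg)"}{p_end}
{phang2}. {stata "ddml crossfit"}{p_end}
{phang2}. {stata "ddml estimate, robust"}{p_end}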
22 | 
23 | {pstd}
24 | {ul:Interactive model} [{it:interactive}]
25 | 
26 | Y = g(X,D) + U
27 | D = m(X) + V
28 | 
29 | {pstd}
30 | which relaxes the assumption that X and D are separable.
31 | D is a binary treatment variable,
32 | and we aim to estimate the average treatment effect (ATE)
33 | or average treatment effect on the treated (ATET).
34 | We estimate the conditional expectations E[D|X], as well as
35 | E[Y|X,D=0] and E[Y|X,D=1] (jointly added using {cmd:ddml E[Y|X,D]}).
36 | 
37 | {pstd}
38 | {ul:Partially-linear IV model} [{it:iv}]
39 | 
40 | Y = {it:a}.D + g(X) + U
41 | Z = m(X) + V
42 | 
43 | {pstd}
44 | where the aim is to estimate {it:a}.
45 | We estimate the conditional expectations E[Y|X],
46 | E[D|X] and E[Z|X] using a supervised machine
47 | learner.
48 | 
49 | {pstd}
50 | {ul:Flexible partially-linear IV model} [{it:fiv}]
51 | 
52 | Y = {it:a}.D + g(X) + U
53 | D = m(Z) + g(X) + V
54 | 
55 | {pstd}
56 | where the estimand of interest is {it:a}.
57 | We estimate the conditional expectations
58 | E[Y|X],
59 | D^:=E[D|Z,X] and E[D^|X] using a supervised machine
60 | learner. The instrument is then formed as D^-E^[D^|X], where E^[D^|X] denotes
61 | the estimate of E[D^|X].
62 | 
63 | {pstd}
64 | Note: "{D}" is a placeholder that is used because the last step (estimation of E[D|X])
65 | uses the fitted values from estimating E[D|X,Z].
66 | 
67 | {pstd}
68 | {ul:Interactive IV model} [{it:interactiveiv}]
69 | 
70 | Y = g(Z,X) + U
71 | D = h(Z,X) + V
72 | Z = m(X) + E
73 | 
74 | {pstd}
75 | where the aim is to estimate the local average treatment effect (LATE).
76 | We estimate, using a supervised machine
77 | learner, the following conditional expectations:
78 | E[Y|X,Z=0] and E[Y|X,Z=1] (jointly added using {cmd:ddml E[Y|X,Z]});
79 | E[D|X,Z=0] and E[D|X,Z=1] (jointly added using {cmd:ddml E[D|X,Z]});
80 | and E[Z|X].
81 | 
82 | 
83 | {marker estimation}{...}
84 | {title:Main steps when estimating with ddml}
85 | 
86 | {pstd}Estimation with {cmd:ddml} proceeds in four steps.
87 | 
88 | {pstd}
89 | {ul:Step 1.} Initialize {cmd:ddml} and select model:
90 | 
91 | {p 8 14}{cmd:ddml init}
92 | {it:model} [if] [in]
93 | [ , {opt mname(name)} {opt kfolds(integer)}
94 | {opt fcluster(varname)}
95 | {opt foldvar(varlist)} {opt reps(integer)}
96 | {opt norandom} {opt tabfold} {opt vars(varlist)}{bind: ]}
97 | 
98 | {pstd}
99 | where {it:model} is one of {it:partial}, {it:iv}, {it:interactive}, {it:fiv} or {it:interactiveiv};
100 | see {help ddml##models:model descriptions}.
101 | 
102 | {pstd}
103 | {ul:Step 2.} Add supervised ML programs for estimating conditional expectations:
104 | 
105 | {p 8 14}{cmd:ddml} {it:eq}
106 | [ , {opt mname(name)} {opt vname(varname)} {opt l:earner(varname)}
107 | {opt vtype(string)}
108 | {opt predopt(string)}{bind: ] :}
109 | {it:command} {it:depvar} {it:vars} [ , {it:cmdopt}{bind: ]}
110 | 
111 | {pstd}
112 | where, depending on the model chosen in Step 1,
113 | {it:eq} is one of
114 | {it:E[Y|X]}, {it:E[Y|D,X]}, {it:E[Y|X,Z]}, {it:E[D|X]}, {it:E[D|X,Z]} or {it:E[Z|X]}.
115 | {it:command} is a supported supervised ML program (e.g. {help pystacked} or {help cvlasso}).
116 | See {help ddml##compatibility:supported programs}.
117 | 
118 | {pstd}
119 | Note: Options before ":" and after the first comma refer to {cmd:ddml}.
120 | Options that come after the final comma refer to the estimation command.
121 | {p_end}
122 | 
123 | {pstd}
124 | {ul:Step 3.} Cross-fitting:
125 | 
126 | {p 8 14}{cmd:ddml crossfit} [ , {opt mname(name)} {opt shortstack}{bind: ]}
127 | 
128 | {pstd}
129 | This step implements the cross-fitting algorithm. Each learner is fitted iteratively on training folds and out-of-sample predicted values are obtained.
130 | 
131 | {pstd}
132 | {ul:Step 4.} Estimate causal effects:
133 | 
134 | {p 8 14}{cmd:ddml estimate} [ , {opt mname(name)} {cmdab:r:obust} {opt cluster(varname)} {opt vce(type)} {opt atet} {opt ateu} {opt trim(real)}{bind: ]}
135 | 
136 | {pstd}
137 | The {cmd:ddml estimate} command returns treatment effect estimates for all combinations of learners
138 | added in Step 2.
139 | 
140 | {pstd}
141 | {ul:Optional.} Report/post selected results:
142 | 
143 | {p 8 14}{cmd:ddml estimate} [ , {opt mname(name)} {opt spec(integer or string)} {opt rep(integer or string)} {opt allcombos} {opt not:able} {opt replay} {bind: ]}
144 | 
145 | {pstd}
146 | {marker auxiliary}{...}
147 | {ul:Optional.} Retrieve information from {cmd:ddml}:
148 | 
149 | {p 8 14}{cmd:ddml extract} [ {it:object_name} , {opt mname(name)} {opt show(display_item)} {opt ename(name)} {opt vname(varname)}
150 | {opt stata} {opt keys} {opt key1(string)} {opt key2(string)} {opt key3(string)} {opt subkey1(string)}
151 | {opt subkey2(string)}{bind: ]}
152 | 
153 | {pstd}
154 | {it:display_item} can be {it:stweights}, {it:ssweights}, {it:psweights}, {it:weights}, {it:mse}, {it:n}, or {it:pystacked}.
155 | {cmd:ddml} stores many internal results on associative arrays.
156 | See {help ddml extract} for details.
157 | 
158 | {pstd}
159 | For full details and further options, follow the links to the detailed help files {help ddml##help:above}.
160 | 
161 | 
162 | {marker compatibility}{...}
163 | {title:Compatible programs}
164 | 
165 | {pstd}
166 | {marker general}{...}
167 | {opt ddml} is compatible with a large set of user-written Stata commands.
168 | It has been tested with
169 | 
170 | {p 7 9 0}
171 | - the {help pystacked} package (see {help pystacked} and {help ddml##pystacked:below}).
172 | Note that {help pystacked} requires Stata 16.
173 | 
174 | {p 7 9 0}
175 | - {help lassopack} for regularized regression (see {help lasso2}, {help cvlasso}, {help rlasso}).
176 | 
177 | {p 7 9 0}
178 | - {help rforest} by Zou & Schonlau. Note that {cmd:rforest} requires the option
179 | {cmd:vtype(none)}.
180 | 
181 | {p 7 9 0}
182 | - {help svmachines} by Guenther & Schonlau.
183 | 
184 | {pstd}
185 | Beyond these, it is compatible with any Stata program that
186 | 
187 | {p 7 9 0}
188 | - uses the standard "{it:reg y x}" syntax,
189 | 
190 | {p 7 9 0}
191 | - supports {it:if}-conditions,
192 | 
193 | {p 7 9 0}
194 | - and comes with {help predict} post-estimation programs.
195 | 
196 | {marker pystacked}{...}
197 | {pstd}
198 | {help pystacked} implements stacking regression ({help pystacked##Wolpert1992:Wolpert, 1992})
199 | via Stata's Python integration in combination with
200 | {browse "https://scikit-learn.org/stable/index.html":scikit-learn}'s
201 | {browse "https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingRegressor.html":sklearn.ensemble.StackingRegressor} and
202 | {browse "https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingClassifier.html":sklearn.ensemble.StackingClassifier}.
203 | Stacking is a way of combining multiple supervised
204 | machine learners (the "base" or "level-0" learners) into
205 | a meta learner.
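
{pstd}
For example (a sketch; {cmd:$Y} and {cmd:$X} stand in for the outcome and controls),
a single {help pystacked} call can specify several base learners at once:

{phang2}. {stata "ddml E[Y|X]: pystacked $Y $X, type(reg) method(ols lassocv gradboost)"}{p_end}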
206 | 207 | {pstd}{help pystacked} is the recommended way to specify multiple learners in {opt ddml}. 208 | {help pystacked} provides a fast way of estimating all the learners in a single call to one program, 209 | and {opt ddml} has integrated support for various features provided by {help pystacked}. 210 | {opt ddml} will store the predicted values of the specified base learners as well as the combined ("stacked") predicted values. 211 | It also stores the stacking weights used by {help pystacked} along with the {opt ddml} short-stacking weights. 212 | See {help ddml stacking} for more details. 213 | -------------------------------------------------------------------------------- /ddml_stacking.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {viewerjumpto "Stacking" "ddml_stacking##stacking"}{...} 4 | {viewerjumpto "Standard stacking with pystacked" "ddml_stacking##std_stack"}{...} 5 | {viewerjumpto "Pooled stacking" "ddml_stacking##pool_stack"}{...} 6 | {viewerjumpto "Short-stacking" "ddml_stacking##short_stack"}{...} 7 | {viewerjumpto "Re-stacking" "ddml_stacking##restack"}{...} 8 | {viewerjumpto "Retrieving stacking weights" "ddml_stacking##stack_weights"}{...} 9 | {viewerjumpto "Examples" "ddml_stacking##examples"}{...} 10 | {viewerjumpto "Installation" "ddml_stacking##installation"}{...} 11 | {viewerjumpto "References" "ddml_stacking##references"}{...} 12 | {viewerjumpto "Authors" "ddml_stacking##authors"}{...} 13 | {vieweralsosee "ddml main page" "ddml"}{...} 14 | {vieweralsosee "Other" "ddml_stacking##also_see"}{...} 15 | {hline} 16 | {cmd:help ddml stacking}{right: v1.4.4} 17 | {hline} 18 | 19 | {title:ddml - Stata package for Double Debiased Machine Learning} 20 | 21 | {pstd} 22 | {opt ddml} implements algorithms for causal inference aided by supervised 23 | machine learning as proposed in 24 | {it:Double/debiased machine learning for treatment and structural parameters} 25 | (Econometrics Journal, 2018). Five different models are supported, allowing for 26 | binary or continuous treatment variables and endogeneity, high-dimensional 27 | controls and/or instrumental variables. 28 | 29 | {pstd}Stacking regression is a simple and powerful method for 30 | combining predictions from multiple learners. 31 | {help pystacked} is the recommended way to specify multiple learners in {opt ddml}, 32 | and {opt ddml} has integrated support for various features provided by {help pystacked}. 33 | This help file provides an overview of how to implement stacking 34 | when estimating using {opt ddml}. 35 | 36 | 37 | {marker stacking}{...} 38 | {title:Stacking} 39 | 40 | {pstd} 41 | Stacking regression ({help ddml stacking##Wolpert1992:Wolpert, 1992}) 42 | is a way of combining predictions from multiple base ("level-0") learners 43 | into a final prediction. 44 | A final estimator ("level-1") is used to combine the base predictions. 45 | A common approach is to use the cross-validated (out-of-sample, OOS) predictions 46 | of the base learners to obtain the weights for combining the learners. 47 | 48 | {pstd} 49 | Three ways of pairing stacking with DDML are supported: 50 | {it:standard stacking}, provided via the {help pystacked} package; 51 | {it:pooled stacking}, a variant of standard stacking; 52 | and {it:short-stacking}, a version of stacking specific to double-debiased machine learning. 
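
{pstd}
As a quick orientation (a sketch, assuming {help pystacked} is the single learner in each equation):
standard stacking is performed by {help pystacked} within each cross-fit fold,
while pooled stacking and short-stacking are requested at the cross-fitting stage:

{phang2}. {stata "ddml crossfit, shortstack poolstack"}{p_end}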
53 | 
54 | {marker pystacked}{...}
55 | {pstd}{help pystacked} is the recommended way to specify multiple learners in {opt ddml}.
56 | {help pystacked} provides a fast way of estimating all the learners in a single call to one program,
57 | and {opt ddml} has integrated support for various features provided by {help pystacked}.
58 | {opt ddml} will store the predicted values of the specified base learners as well as the combined ("stacked") predicted values.
59 | It also stores the standard and pooled stacking weights used by {help pystacked}
60 | along with the {opt ddml} short-stacking weights.
61 | 
62 | {pstd}{bf:Important}: For these features to be available, {help pystacked} needs to be the only learner for each conditional expectation.
63 | Multiple learners must be specified in the call to {help pystacked}; see the examples below.
64 | {help pystacked} can be provided directly to {opt ddml} as one of several learners for a conditional expectation,
65 | but in this case the extra features for {help pystacked} will not be available.
66 | 
67 | {pstd}Note: some of the {opt ddml} stacking options available via {help pystacked} integration
68 | are not available for the flexible IV model.
69 | See this {help ddml_example_flexiv_anylearner_detailed:help file} for examples and discussion
70 | of how to stack and short-stack when using the flexible IV model.{p_end}
71 | 
72 | 
73 | {marker std_stack}{...}
74 | {title:Standard stacking}
75 | 
76 | {pstd}
77 | Standard stacking is implemented via the {help pystacked} package.
78 | This is done by specifying {help pystacked} as the learner for a conditional expectation;
79 | {help pystacked} in turn estimates using the user-specified base learners,
80 | and stacks them to get the stacked (ensemble) prediction.
81 | This is done in the context of {opt ddml}'s cross-fitting algorithm,
82 | meaning that for k-fold cross-fitting, stacking is done k times,
83 | once for each of the cross-fit folds.
84 | 
85 | {pstd}
86 | The {help pystacked} base learners are specified at the {help ddml eq} stage,
87 | when the supervised ML learners are added.
88 | This is also where other {help pystacked} options can be specified, e.g.,
89 | the final ("level-1") estimator used to combine the predictions.
90 | The {help pystacked} default final estimator for stacking
91 | regression is non-negative least squares (NNLS) without an intercept
92 | and with the constraint that weights sum to one.
93 | See {help pystacked} for alternative final estimators.
94 | 
95 | 
96 | {marker pool_stack}{...}
97 | {title:Pooled stacking}
98 | 
99 | {pstd}
100 | Pooled stacking is a variant of standard stacking that implements additional regularization
101 | via the {help ddml crossfit:cross-fitting} step of {opt ddml}.
102 | Pooled stacking is done once, after all cross-fitting has been done
103 | and a full set of all cross-validated OOS predictions has been obtained.
104 | This means that a single set of stacking weights is used to obtain all OOS cross-fit predictions.
105 | This is in contrast to standard stacking and k-fold cross-fitting,
106 | where k different sets of stacking weights are estimated and used to obtain the k OOS cross-fit predictions.
107 | Pooled stacking is specified at the {help ddml crossfit} stage using the {opt poolstack} option.
108 | Pooled stacking is available only in conjunction with standard stacking.
109 | The default final estimator is the same as with {help pystacked},
110 | and can be changed using the {opt psfinalest(estimator)} option.
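
{pstd}
For example (a sketch, assuming learners have already been added with {help ddml eq} using {help pystacked}):

{phang2}. {stata "ddml crossfit, poolstack"}{p_end}
{phang2}. {stata "ddml estimate"}{p_end}
{phang2}. {stata "ddml estimate, spec(ps) rep(1) replay notable"}{p_end}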
111 | 
112 | {pstd}Note: all final estimators available with {help pystacked} are also available for pooled stacking.
113 | However, the current version of {help pystacked} generates the necessary cross-validated OOS predicted values
114 | only if the standard stacking final estimator used by {help pystacked}
115 | is either {opt nnls1} (the default), {opt ls1}, {opt ols}, {opt ridge} or {opt singlebest}.
116 | Hence when using pooled stacking,
117 | the standard stacking final estimator specified with {help pystacked} needs to be one of these.
118 | 
119 | 
120 | {marker short_stack}{...}
121 | {title:Short-stacking}
122 | 
123 | {pstd}
124 | Short-stacking is a form of stacking specific to double debiased machine learning and cross-fitting.
125 | Short-stacking uses the cross-fitted predicted values to obtain
126 | the stacked prediction (a weighted average of the multiple base learner predictions).
127 | It is computationally faster (often much faster) than
128 | either standard stacking or pooled stacking available via {help pystacked}.
129 | Short-stacking also does not require use of {help pystacked};
130 | the predictions of any base learners specified by the user can be short-stacked.
131 | Short-stacking is specified at the {help ddml crossfit} stage using the {opt shortstack} option.
132 | The default final estimator is the same as with {help pystacked},
133 | and can be changed using the {opt finalest(estimator)} option.
134 | 
135 | {pstd}
136 | Because short-stacking is typically much faster than standard or pooled stacking,
137 | users may wish to use short-stacking as the only stacking method.
138 | This can be done efficiently in combination with {help pystacked}.
139 | To do this, (1) use {help pystacked} as the single learner in each equation;
140 | (2) at the cross-fitting stage, specify the {opt shortstack} and {cmdab:nostd:stack} options.
141 | This causes {help pystacked} to estimate the base learners
142 | without the computationally-costly stacking step in each cross-fit fold.
143 | 
144 | 
145 | {marker restack}{...}
146 | {title:Re-stacking after cross-fitting}
147 | 
148 | {pstd}
149 | Users have the option of re-stacking the base learner predictions using a different final estimator
150 | without having to re-cross-fit/re-estimate the entire model.
151 | This is done by specifying the stacking method and final estimator
152 | at the {help ddml estimate:ddml estimate} step.
153 | This feature is available only if {help pystacked} is the single learner in every equation.
154 | 
155 | 
156 | {marker stack_weights}{...}
157 | {title:Stacking weights}
158 | 
159 | {pstd}
160 | The weights used for standard stacking, pooled stacking and short-stacking
161 | can be inspected after estimation using {help ddml extract}
162 | with the {opt show(stweights)}, {opt show(psweights)} and/or {opt show(ssweights)} options, respectively.
163 | In the case of standard stacking, the mean weights across cross-fit folds are displayed;
164 | to display the standard stacking weights for all k folds along with the separate learner MSEs,
165 | use the {opt show(pystacked)} option.
166 | 
167 | 
168 | {marker examples}{...}
169 | {title:Examples}
170 | 
171 | {pstd}
172 | See {help ddml init:help ddml init} for details of model initialization and learner specification options.
173 | 
174 | {pstd}Note: the additional support provided by {opt ddml} for {help pystacked} (see above)
175 | is available only if {help pystacked} is the sole learner for each conditional expectation.
176 | Multiple learners are provided to {help pystacked}, not directly to {opt ddml}.{p_end}
177 | 
178 | 
179 | {smcl}
180 | INCLUDE help ddml_example_stacking.sthlp
181 | 
182 | 
183 | {smcl}
184 | INCLUDE help ddml_install_ref_auth
185 | 
--------------------------------------------------------------------------------
/ddml_estimate.sthlp:
--------------------------------------------------------------------------------
 1 | {smcl}
 2 | {* *! version 30aug2024}{...}
 3 | {viewerjumpto "Syntax" "ddml_estimate##syntax"}{...}
 4 | {viewerjumpto "Cross-fit options" "ddml_estimate##crossfit"}{...}
 5 | {viewerjumpto "Estimation options" "ddml_estimate##estimation"}{...}
 6 | {viewerjumpto "Replay options" "ddml_estimate##replay"}{...}
 7 | {viewerjumpto "User-specified variables" "ddml_estimate##userspec"}{...}
 8 | {viewerjumpto "Installation" "ddml_estimate##installation"}{...}
 9 | {viewerjumpto "References" "ddml_estimate##references"}{...}
10 | {viewerjumpto "Authors" "ddml_estimate##authors"}{...}
11 | {vieweralsosee "ddml main page" "ddml"}{...}
12 | {vieweralsosee "Other" "ddml_estimate##also_see"}{...}
13 | {hline}
14 | {cmd:help ddml crossfit, ddml estimate}{right: v1.4.4}
15 | {hline}
16 | 
17 | {title:ddml crossfit and estimate commands for Double Debiased Machine Learning}
18 | 
19 | {pstd}
20 | {opt ddml} implements algorithms for causal inference aided by supervised
21 | machine learning as proposed in
22 | {it:Double/debiased machine learning for treatment and structural parameters}
23 | (Econometrics Journal, 2018). Five different models are supported, allowing for
24 | binary or continuous treatment variables and endogeneity, high-dimensional
25 | controls and/or instrumental variables.
26 | 
27 | {pstd}
28 | {opt ddml crossfit} implements the cross-fitting algorithm.
29 | Each learner is fitted iteratively on training folds and out-of-sample predicted values are obtained.
30 | 
31 | {pstd}
32 | {opt ddml estimate} estimates the model using the conditional expectations obtained from the cross-fitting step.
33 | 
34 | {pstd}
35 | Before cross-fitting, the model must be defined using {help ddml init} and the learners specified using {help ddml eq}.
36 | 
37 | {pstd}
38 | See the help for {help ddml stacking} for a detailed discussion and examples of stacking with {opt ddml}.
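
{pstd}
For example, a typical sequence (a sketch; the {help ddml init} and {help ddml eq} steps are assumed to have been run already):

{phang2}. {stata "ddml crossfit, shortstack"}{p_end}
{phang2}. {stata "ddml estimate, robust"}{p_end}
{phang2}. {stata "ddml estimate, spec(ss) rep(1) replay notable"}{p_end}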
39 | 
40 | 
41 | {marker syntax}{...}
42 | {title:Syntax}
43 | 
44 | {p 8 14}{cmd:ddml crossfit} [ , {opt mname(name)} {opt shortstack} {opt poolstack} {cmdab:NOSTD:stack} {opt finalest(name)}{bind: ]}
45 | 
46 | {p 8 14}{cmd:ddml estimate} [ , {opt mname(name)} {cmdab:r:obust} {opt cluster(varname)} {opt vce(type)}
47 | {opt atet} {opt ateu} {opt trim(real)}
48 | {opt shortstack} {opt poolstack} {opt stdstack} {opt finalest(name)}{bind: ]}
49 | 
50 | {p 8 14}Replay options (available after model estimation):
51 | 
52 | {p 8 14}{cmd:ddml estimate} [ , {opt mname(name)} {opt spec(integer or string)} {opt rep(integer or string)} {opt allcombos} {opt not:able} {opt replay}{bind: ]}
53 | 
54 | {p 8 14}Using a user-specified combination of {help ddml crossfit} conditional expectations (models {opt partial}, {opt partialiv}, {opt fiv}):
55 | 
56 | {p 8 14}{cmd:ddml estimate} , {opt y(varname)} {opt d(varlist)} [ {opt z(varlist)} {opt dh(varname)} {opt foldvar(varname)} {opt mname(name)} {cmdab:r:obust} {opt cluster(varname)} {opt vce(type)}{bind: ]}
57 | 
58 | {p 8 14}Using a user-specified combination of {help ddml crossfit} conditional expectations (models {opt interactive}, {opt interactiveiv}):
59 | 
60 | {p 8 14}{cmd:ddml estimate} , {opt y0(varname)} {opt y1(varname)} [ {opt d(varname)} {opt d0(varname)} {opt d1(varname)} {opt z(varname)} {opt mname(name)} {cmdab:r:obust} {opt cluster(varname)} {opt vce(type)}{bind: ]}
61 | 
62 | 
63 | {marker crossfit}{...}
64 | {synoptset 20}{...}
65 | {synopthdr:Cross-fitting}
66 | {synoptline}
67 | {synopt:{opt mname(name)}}
68 | name of the DDML model. Defaults to {it:m0}.
69 | {p_end}
70 | {synopt:{opt shortstack}} asks for short-stacking to be used.
71 | Short-stacking uses the
72 | cross-fitted predicted values to obtain a weighted average
73 | of the multiple base learners.
74 | It is computationally faster (often much faster) than standard stacking
75 | when {help pystacked} is used to specify the base learners.
76 | {p_end}
77 | {synopt:{opt poolstack}} is available as an alternative to standard stacking
78 | when {help pystacked} is used to specify the base learners.
79 | Pooled-stacking adds additional regularization
80 | by obtaining a single set of stacking weights
81 | from the full set of out-of-sample base learner predicted values
82 | (in contrast to {help pystacked}, which stacks each cross-fit fold separately).
83 | {p_end}
84 | {synopt:{opt nostdstack}} is used in conjunction with short-stacking and {help pystacked}.
85 | It tells {help pystacked} to generate the base learner predictions without
86 | the computationally-expensive additional step of obtaining the stacking weights.
87 | This option should be used if short-stacking is the only stacking method needed.
88 | {p_end}
89 | {synopt:{opt finalest(name)}} sets the final estimator for all stacking methods;
90 | the default is the {help pystacked} default of non-negative least squares.
91 | See {help pystacked} for alternative stacking final estimators.
92 | NB: use of this option is incompatible with use of the {opt finalest(.)} option
93 | when {help pystacked} is the learner specified in an equation using {help ddml eq};
94 | use {opt finalest} in one or the other, or neither (the default), but not both.
95 | {p_end}
96 | {synoptline}
97 | {p2colreset}{...}
98 | {pstd}
99 | 
100 | {marker estimation}{...}
101 | {synoptset 20}{...}
102 | {synopthdr:Estimation}
103 | {synoptline}
104 | {synopt:{opt mname(name)}}
105 | name of the DDML model.
Defaults to {it:m0}.
106 | {p_end}
107 | {synopt:{cmdab:r:obust}}
108 | report SEs that are robust to the
109 | presence of arbitrary heteroskedasticity.
110 | {p_end}
111 | {synopt:{opt cluster(varname)}}
112 | select cluster-robust variance-covariance estimator, clustered on {it:varname}.
113 | {p_end}
114 | {synopt:{opt vce(type)}}
115 | select variance-covariance estimator, e.g. {cmd:vce(hc3)} or {cmd:vce(cluster id)}; see {help regress##vcetype:here}.
116 | {p_end}
117 | {synopt:{cmdab:noc:onstant}}
118 | suppress constant term ({it:partial}, {it:iv}, {it:fiv} models only). Since the residualized outcome
119 | and treatment may not be exactly mean-zero in finite samples, {cmd:ddml} includes the constant by
120 | default in the estimation stage of partially linear models.
121 | {p_end}
122 | {synopt:{cmdab:showc:onstant}}
123 | display constant term in summary estimation output table ({it:partial}, {it:iv}, {it:fiv} models only).
124 | {p_end}
125 | {synopt:{opt atet}}
126 | report average treatment effect on the treated (default is ATE).
127 | {p_end}
128 | {synopt:{opt ateu}}
129 | report average treatment effect on the untreated (default is ATE).
130 | {p_end}
131 | {synopt:{opt trim(real)}}
132 | trimming of propensity scores for the Interactive and Interactive IV models. The default is 0.01
133 | (that is, values below 0.01 and above 0.99 are set
134 | to 0.01 and 0.99, respectively).
135 | {p_end}
136 | {synopt:{opt shortstack}} requests re-stacking of the short-stacking results
137 | using the final estimator specified with {opt ssfinalest(.)} or {opt finalest(.)};
138 | this option is available only if {help pystacked} is the single learner for each equation.
139 | Re-stacking is fast because it doesn't require re-cross-fitting.
140 | {p_end}
141 | {synopt:{opt poolstack}} requests re-stacking of the pooled stacking results
142 | using the final estimator specified with {opt psfinalest(.)} or {opt finalest(.)};
143 | this option is available only if {help pystacked} is the single learner for each equation.
144 | Re-stacking is fast because it doesn't require re-cross-fitting.
145 | {p_end}
146 | {synopt:{opt stdstack}} requests re-stacking of the standard stacking results
147 | using the final estimator specified with {opt stdfinalest(.)} or {opt finalest(.)};
148 | this option is available only if {help pystacked} is the single learner for each equation.
149 | Re-stacking is fast because it doesn't require re-cross-fitting.
150 | {p_end}
151 | {synopt:{opt finalest(name)}} sets the final estimator for all stacking methods;
152 | the default is the {help pystacked} default of non-negative least squares.
153 | See {help pystacked} for alternative stacking final estimators.
154 | {p_end}
155 | {p2colreset}{...}
156 | {pstd}
157 | 
158 | {marker replay}{...}
159 | {synoptset 20}{...}
160 | {synopthdr:Replay}
161 | {synoptline}
162 | {synopt:{opt spec(integer/string)}}
163 | select specification. This can either be the specification number,
164 | {it:mse} for minimum-MSE specification (the default with non-{help pystacked} multiple learners),
165 | or {it:st}, {it:ss} or {it:ps} for standard stacking (via {help pystacked}), short-stacking or pooled stacking, respectively.
166 | {p_end}
167 | {synopt:{opt rep(integer/string)}}
168 | select resampling iteration. This can either be the cross-fit repetition number, {it:mn} for mean aggregation or {it:md} for median aggregation (the default).
169 | {p_end}
170 | {synopt:{opt allcombos}}
171 | estimates all possible specifications.
By default, only the min-MSE, short-stacking or pooled-stacking
172 | specification is estimated and displayed.
173 | {p_end}
174 | {synopt:{opt replay}}
175 | used in combination with {opt spec()} and {opt rep()} to display and return estimation results.
176 | {p_end}
177 | {synoptline}
178 | {p2colreset}{...}
179 | {pstd}
180 | 
181 | {marker userspec}{...}
182 | {synoptset 20}{...}
183 | {synopthdr:User-specified vars}
184 | {synoptline}
185 | {synopt:{opt y(varname)}}
186 | estimated conditional expectation of dependent variable (models {opt partial}, {opt partialiv}, {opt fiv})
187 | {p_end}
188 | {synopt:{opt d(varname)}}
189 | estimated conditional expectation of causal variable of interest (models {opt partial}, {opt partialiv}, {opt fiv})
190 | {p_end}
191 | {synopt:{opt z(varname)}}
192 | estimated conditional expectation of instrumental variables (models {opt partialiv}, {opt interactiveiv})
193 | {p_end}
194 | {synopt:{opt dh(varname)}}
195 | estimated optimal IV = E[D|X,Z] - E[D^|X] (model {opt fiv} only)
196 | {p_end}
197 | {synopt:{opt y0(varname)}}
198 | estimated E[Y|X,D=0] (model {opt interactive}) or E[Y|X,Z=0] (model {opt interactiveiv})
199 | {p_end}
200 | {synopt:{opt y1(varname)}}
201 | estimated E[Y|X,D=1] (model {opt interactive}) or E[Y|X,Z=1] (model {opt interactiveiv})
202 | {p_end}
203 | {synopt:{opt d0(varname)}}
204 | estimated E[D|X,Z=0] (model {opt interactiveiv})
205 | {p_end}
206 | {synopt:{opt d1(varname)}}
207 | estimated E[D|X,Z=1] (model {opt interactiveiv})
208 | {p_end}
209 | {synopt:{opt foldvar(varname)}}
210 | cross-fit fold ID variable; required with {opt interactive} model and ATET or ATEU
211 | {p_end}
212 | 
213 | 
214 | {smcl}
215 | INCLUDE help ddml_install_ref_auth
216 | 
--------------------------------------------------------------------------------
/cert/ddml_cert_partial.do:
--------------------------------------------------------------------------------
 1 | clear all
 2 | 
 3 | if ("`c(username)'"=="kahrens") {
 4 | 	adopath + "/Users/kahrens/MyProjects/ddml"
 5 | 	adopath + "/Users/kahrens/MyProjects/pystacked"
 6 | }
 7 | 
 8 | cap cd "/Users/kahrens/MyProjects/ddml/cert"
 9 | cap cd "C:\LocalStore\ecomes\Documents\GitHub\ddml\cert"
10 | 
11 | cap log close
12 | log using "ddml_cert_partial", replace text
13 | 
14 | which ddml
15 | mata: whichddml()
16 | 
17 | use https://statalasso.github.io/dta/AJR.dta, clear
18 | 
19 | // necessary programs for cert; script exits with error if not installed
20 | findfile pystacked.ado
21 | 
22 | set seed 123
23 | 
24 | ********************************************************************************
25 | **** Partially-linear model ****
26 | ********************************************************************************
27 | 
28 | global Y logpgp95
29 | global X lat_abst edes1975 temp* humid* steplow-oilres
30 | global D1 avexpr
31 | global D2 democ1
32 | global D3 avelf
33 | 
34 | *** pystacked, no SS
35 | 
36 | *** initialise ddml and select model;
37 | ddml init partial, kfolds(2) reps(2)
38 | ddml E[Y|X]: pystacked $Y $X , type(reg)
39 | ddml E[D|X]: pystacked $D1 $X , type(reg)
40 | ddml E[D|X]: pystacked $D2 $X , type(reg)
41 | ddml crossfit
42 | ddml estimate, robust
43 | *** replay
44 | ddml estimate, mname(m0) spec(st) rep(1) replay notable
45 | *** append, estimate, replay
46 | ddml sample, append(1)
47 | ddml crossfit
48 | ddml estimate
49 | *** replay
50 | ddml estimate, mname(m0) spec(st) rep(1) replay notable
51 | ddml estimate, mname(m0) spec(st) rep(2) replay notable
52 | ddml estimate, mname(m0) spec(st)
rep(mn) replay notable 53 | ddml estimate, mname(m0) spec(st) rep(md) replay notable 54 | 55 | *** pystacked, SS and PS 56 | 57 | *** initialise ddml and select model; 58 | ddml init partial, kfolds(2) reps(2) 59 | ddml E[Y|X]: pystacked $Y $X , type(reg) 60 | ddml E[D|X]: pystacked $D1 $X , type(reg) 61 | ddml E[D|X]: pystacked $D2 $X , type(reg) 62 | ddml crossfit, shortstack poolstack 63 | *** estimation of parameter of interest 64 | ddml estimate, robust 65 | *** replay 66 | ddml estimate, mname(m0) spec(st) rep(1) replay notable 67 | ddml estimate, mname(m0) spec(ss) rep(1) replay notable 68 | ddml estimate, mname(m0) spec(ps) rep(1) replay notable 69 | *** append, estimate, replay 70 | ddml sample, append(1) 71 | ddml crossfit, shortstack poolstack 72 | ddml estimate 73 | *** replay 74 | ddml estimate, mname(m0) spec(st) rep(1) replay notable 75 | ddml estimate, mname(m0) spec(st) rep(2) replay notable 76 | ddml estimate, mname(m0) spec(st) rep(mn) replay notable 77 | ddml estimate, mname(m0) spec(st) rep(md) replay notable 78 | ddml estimate, mname(m0) spec(ss) rep(1) replay notable 79 | ddml estimate, mname(m0) spec(ss) rep(2) replay notable 80 | ddml estimate, mname(m0) spec(ss) rep(mn) replay notable 81 | ddml estimate, mname(m0) spec(ss) rep(md) replay notable 82 | ddml estimate, mname(m0) spec(ps) rep(1) replay notable 83 | ddml estimate, mname(m0) spec(ps) rep(2) replay notable 84 | ddml estimate, mname(m0) spec(ps) rep(mn) replay notable 85 | ddml estimate, mname(m0) spec(ps) rep(md) replay notable 86 | 87 | *** multiple learners, no SS 88 | ddml init partial, kfolds(2) reps(2) 89 | ddml E[Y|X]: pystacked $Y $X , type(reg) 90 | ddml E[Y|X]: reg $Y $X 91 | ddml E[D|X]: pystacked $D1 $X , type(reg) 92 | ddml E[D|X]: reg $D1 $X 93 | ddml crossfit 94 | ddml estimate, robust 95 | ddml estimate, mname(m0) spec(mse) rep(1) replay notable 96 | *** allcombos 97 | ddml estimate, allcombos 98 | forvalues i=1/4 { 99 | ddml estimate, mname(m0) spec(`i') rep(1) replay notable 100 | } 101 | *** append, estimate, replay 102 | ddml sample, append(1) 103 | ddml crossfit 104 | ddml estimate 105 | ddml estimate, mname(m0) spec(mse) rep(1) replay notable 106 | ddml estimate, mname(m0) spec(mse) rep(2) replay notable 107 | ddml estimate, mname(m0) spec(mse) rep(mn) replay notable 108 | ddml estimate, mname(m0) spec(mse) rep(md) replay notable 109 | *** allcombos 110 | ddml estimate, allcombos 111 | forvalues i=1/4 { 112 | forvalues r=1/2 { 113 | ddml estimate, mname(m0) spec(`i') rep(`r') replay notable 114 | } 115 | } 116 | ddml estimate, mname(m0) spec(mse) rep(mn) replay notable 117 | ddml estimate, mname(m0) spec(mse) rep(md) replay notable 118 | 119 | *** multiple learners, SS 120 | ddml init partial, kfolds(2) reps(2) 121 | ddml E[Y|X]: pystacked $Y $X , type(reg) 122 | ddml E[Y|X]: reg $Y $X 123 | ddml E[D|X]: pystacked $D1 $X , type(reg) 124 | ddml E[D|X]: reg $D1 $X 125 | ddml crossfit, shortstack 126 | ddml estimate, robust 127 | ddml estimate, mname(m0) spec(mse) rep(1) replay notable 128 | ddml estimate, mname(m0) spec(ss) rep(1) replay notable 129 | *** allcombos 130 | ddml estimate, allcombos 131 | forvalues i=1/4 { 132 | ddml estimate, mname(m0) spec(`i') rep(1) replay notable 133 | } 134 | ddml estimate, mname(m0) spec(ss) rep(1) replay notable 135 | *** append, estimate, replay 136 | ddml sample, append(1) 137 | ddml crossfit, shortstack 138 | ddml estimate 139 | ddml estimate, mname(m0) spec(mse) rep(1) replay notable 140 | ddml estimate, mname(m0) spec(mse) rep(2) replay notable 
141 | ddml estimate, mname(m0) spec(mse) rep(mn) replay notable 142 | ddml estimate, mname(m0) spec(mse) rep(md) replay notable 143 | ddml estimate, mname(m0) spec(ss) rep(1) replay notable 144 | ddml estimate, mname(m0) spec(ss) rep(2) replay notable 145 | ddml estimate, mname(m0) spec(ss) rep(mn) replay notable 146 | ddml estimate, mname(m0) spec(ss) rep(md) replay notable 147 | *** allcombos 148 | ddml estimate, allcombos 149 | forvalues i=1/4 { 150 | forvalues r=1/2 { 151 | ddml estimate, mname(m0) spec(`i') rep(`r') replay notable 152 | } 153 | } 154 | ddml estimate, mname(m0) spec(mse) rep(1) replay notable 155 | ddml estimate, mname(m0) spec(mse) rep(2) replay notable 156 | ddml estimate, mname(m0) spec(mse) rep(mn) replay notable 157 | ddml estimate, mname(m0) spec(mse) rep(md) replay notable 158 | ddml estimate, mname(m0) spec(ss) rep(1) replay notable 159 | ddml estimate, mname(m0) spec(ss) rep(2) replay notable 160 | ddml estimate, mname(m0) spec(ss) rep(mn) replay notable 161 | ddml estimate, mname(m0) spec(ss) rep(md) replay notable 162 | 163 | ******************************************************************************** 164 | **** Restacking with ddml estimate **** 165 | ******************************************************************************** 166 | 167 | // with pystacked: 168 | 169 | // setup 170 | set seed 123 171 | ddml init partial, kfolds(2) reps(1) 172 | ddml E[Y|X]: pystacked $Y $X , type(reg) 173 | ddml E[D|X]: pystacked $D1 $X , type(reg) 174 | 175 | // initial = std stack, shortstack, poolstack 176 | set seed 123 177 | ddml crossfit, shortstack poolstack 178 | ddml estimate 179 | // save for later comparison 180 | ddml extract, show(weights) 181 | mat ssw_nnls1 = r(Y_logpgp95_ss) 182 | mat psw_nnls1 = r(Y_logpgp95_ps) 183 | mat stsw_nnls1 = r(Y1_pystacked_w_mn) 184 | // restack shortstack 185 | ddml estimate, shortstack ssfinalest(singlebest) 186 | ddml extract, show(weights) 187 | mat ssw = r(Y_logpgp95_ss) 188 | assert el(ssw,1,2)==0 189 | assert el(ssw,2,2)==1 190 | assert el(ssw,3,2)==0 191 | mat psw = r(Y_logpgp95_ps) 192 | assert el(psw,1,2)~=0 193 | mat stsw = r(Y1_pystacked_w_mn) 194 | assert el(stsw,1,2)~=0 195 | // restack poolstack 196 | ddml estimate, poolstack psfinalest(singlebest) 197 | ddml extract, show(weights) 198 | mat psw = r(Y_logpgp95_ps) 199 | assert el(psw,1,2)==0 200 | assert el(psw,2,2)==0 201 | assert el(psw,3,2)==1 202 | mat stsw = r(Y1_pystacked_w_mn) 203 | assert el(stsw,1,2)~=0 204 | // restack standard stack 205 | ddml estimate, stdstack stdfinalest(singlebest) 206 | ddml extract, show(weights) 207 | mat stsw = r(Y1_pystacked_w_mn) 208 | assert el(stsw,1,2)==0 209 | assert el(stsw,2,2)==0.5 210 | assert el(stsw,3,2)==0.5 211 | // all restacked to nnls1 212 | ddml estimate, finalest(nnls1) 213 | ddml extract, show(weights) 214 | mat ssw = r(Y_logpgp95_ss) 215 | mat psw = r(Y_logpgp95_ps) 216 | mat stsw = r(Y1_pystacked_w_mn) 217 | assert mreldif(ssw,ssw_nnls1) < 10e-10 218 | assert mreldif(psw,psw_nnls1) < 10e-10 219 | // slight differences, possible float/double issue 220 | assert mreldif(stsw,stsw_nnls1) < 10e-6 221 | 222 | // initial = std stack, shortstack, poolstack 223 | set seed 123 224 | ddml crossfit, shortstack poolstack 225 | ddml estimate 226 | ddml extract, show(weights) 227 | // restack shortstack + poolstack 228 | ddml estimate, shortstack ssfinalest(singlebest) poolstack psfinalest(singlebest) 229 | ddml extract, show(weights) 230 | mat ssw = r(Y_logpgp95_ss) 231 | assert el(ssw,1,2)==0 232 | assert 
el(ssw,2,2)==1 233 | assert el(ssw,3,2)==0 234 | mat psw = r(Y_logpgp95_ps) 235 | assert el(psw,1,2)==0 236 | assert el(psw,2,2)==0 237 | assert el(psw,3,2)==1 238 | 239 | // initial = std stack only 240 | set seed 123 241 | ddml crossfit 242 | ddml estimate 243 | ddml extract, show(weights) 244 | // restack - add shortstack 245 | ddml estimate, shortstack 246 | ddml extract, show(weights) 247 | 248 | // initial = std stack only 249 | set seed 123 250 | ddml crossfit 251 | ddml estimate 252 | ddml extract, show(weights) 253 | // restack - add shortstack 254 | ddml estimate, shortstack 255 | ddml extract, show(weights) 256 | 257 | // initial = std stack only 258 | set seed 123 259 | ddml crossfit 260 | ddml estimate 261 | ddml extract, show(weights) 262 | // restack - add shortstack with singlebest 263 | ddml estimate, shortstack ssfinalest(singlebest) 264 | ddml extract, show(weights) 265 | mat ssw = r(Y_logpgp95_ss) 266 | assert el(ssw,1,2)==0 267 | assert el(ssw,2,2)==1 268 | assert el(ssw,3,2)==0 269 | 270 | // initial = shortstack, no standard stack 271 | set seed 123 272 | ddml crossfit, shortstack nostdstack 273 | ddml estimate 274 | ddml extract, show(weights) 275 | // restack - shortstack with singlebest 276 | ddml estimate, shortstack ssfinalest(singlebest) 277 | ddml extract, show(weights) 278 | mat ssw = r(Y_logpgp95_ss) 279 | assert el(ssw,1,2)==0 280 | assert el(ssw,2,2)==1 281 | assert el(ssw,3,2)==0 282 | 283 | // without pystacked: not currently supported 284 | 285 | // setup 286 | set seed 123 287 | ddml init partial, kfolds(2) reps(1) 288 | ddml E[Y|X]: pystacked $Y $X , type(reg) 289 | ddml E[Y|X]: reg $Y $X 290 | ddml E[D|X]: pystacked $D1 $X , type(reg) 291 | ddml E[D|X]: reg $D1 $X 292 | 293 | // no shortstack 294 | set seed 123 295 | ddml crossfit 296 | ddml estimate, allcombos 297 | // restack - add shortstack 298 | cap noi ddml estimate, shortstack 299 | assert _rc==198 300 | 301 | log close 302 | -------------------------------------------------------------------------------- /crossfit.sthlp: -------------------------------------------------------------------------------- 1 | {smcl} 2 | {* *! version 30aug2024}{...} 3 | {viewerjumpto "Syntax" "crossfit##syntax"}{...} 4 | {viewerjumpto "Summary" "crossfit##summary"}{...} 5 | {viewerjumpto "Compatible programs" "crossfit##compatibility"}{...} 6 | {viewerjumpto "Examples" "crossfit##examples"}{...} 7 | {viewerjumpto "Saved results" "crossfit##results"}{...} 8 | {viewerjumpto "References" "crossfit##references"}{...} 9 | {viewerjumpto "Authors" "crossfit##authors"}{...} 10 | {vieweralsosee "ddml main page" "ddml"}{...} 11 | {vieweralsosee "ddml crossfit" "ddml crossfit"}{...} 12 | {vieweralsosee "ddml stacking" "ddml stacking"}{...} 13 | {vieweralsosee "Other" "crossfit##also_see"}{...} 14 | {hline} 15 | {cmd:help crossfit}{right: v1.4.4} 16 | {hline} 17 | 18 | 19 | {title:crossfit - Stata program for cross-fitting} 20 | 21 | {pstd} 22 | {opt crossfit} fits a supervised machine learner on K-1 folds 23 | and returns the out-of-sample predicted values for the holdout fold. 24 | This is done iteratively to obtain out-of-sample ("cross-fitted") fitted values for the whole sample. 25 | 26 | {pstd} 27 | {opt crossfit} is an auxiliary program that is internally used by 28 | {help ddml} and {help qddml}, but can be used for other purposes. 
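
{pstd}
For example (a sketch; see the Examples section below for more):

{phang2}. {stata "use http://fmwww.bc.edu/repec/bocode/j/jtpa.dta, clear"}{p_end}
{phang2}. {stata "crossfit, estring(reg earnings sex age married) gen(yhat) kfolds(3)"}{p_end}
{phang2}. {stata "di r(mse)"}{p_end}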
29 | 
30 | 
31 | {marker syntax}{...}
32 | {title:Syntax}
33 | 
34 | {p 8 14 2}
35 | {cmd:crossfit} ,
36 | {opt estring(string)}
37 | {opt g:enerate(stubname)}
38 | [{opt kfolds(integer)}
39 | {opt foldvar(varlist)}
40 | {opt norandom}
41 | {opt reps(integer)}
42 | {opt vtype(string)}]
43 | 
44 | {synoptset 20}{...}
45 | {synopthdr:Option}
46 | {synoptline}
47 | {synopt:{opt estring(string)}}
48 | An estimation string, e.g. "reg y x1 x2", that will be
49 | repeatedly invoked. See note on compatible programs
50 | {help ddml##compatibility:here}.
51 | {p_end}
52 | {synopt:{opt g:enerate(stubname)}}
53 | Name of the new variable to be created;
54 | the resample number is appended to the end of the variable name.
55 | Note that if the variable (including the resample number) already exists, it is overwritten.
56 | {p_end}
57 | {synopt:{opt kfolds(integer)}}
58 | Number of randomly drawn folds; ignored if {opt foldvar(varlist)} is specified; default=5.
59 | {p_end}
60 | {synopt:{opt foldvar(varlist)}}
61 | Integer variable(s) with user-specified cross-fitting folds; one foldvar per resample.
62 | {p_end}
63 | {synopt:{opt norandom}}
64 | Use observations in existing order instead of randomizing before splitting into folds;
65 | if multiple resamples, applies to first resample only;
66 | ignored if user-defined fold variables are provided in {opt foldvar(varlist)}.
67 | {p_end}
68 | {synopt:{opt reps(integer)}}
69 | Number of resampling iterations, i.e., how often the cross-fitting procedure is
70 | repeated on randomly generated folds;
71 | ignored if {opt foldvar(varlist)} is specified;
72 | default=1.
73 | {p_end}
74 | {synopt:{opt vtype(string)}}
75 | Variable type of the variable to be created. Defaults to {it:double}.
76 | {it:none} can be used to leave the type field blank.
77 | {p_end}
78 | 
79 | 
80 | {marker summary}{...}
81 | {title:Summary}
82 | 
83 | {pstd}
84 | {opt crossfit} fits a supervised machine learner on K-1 folds
85 | and returns the out-of-sample predicted values for the holdout fold.
86 | This process is repeated so that each fold serves once as the holdout fold
87 | for which predictions are created.
88 | At the end of the cross-fitting, a full set of predictions is available
89 | in the new variable specified by the {opt generate} option.
90 | The "supervised machine learner" can be any Stata estimator
91 | that supports standard postestimation prediction.
92 | {p_end}
93 | 
94 | {pstd}
95 | {opt crossfit}'s default is to generate a single random split into folds.
96 | This can be overridden by specifying user-defined fold variables,
97 | or by the {opt norandom} option (indicating that the split uses the data in the existing order).
98 | {p_end}
99 | 
100 | {pstd}
101 | {opt crossfit} allows multiple resampling,
102 | meaning that the procedure is applied repeatedly
103 | using multiple fold variables that indicate different fold splits.
104 | This can be done via the {opt reps} option,
105 | or by providing multiple user-defined fold variables.
106 | The resample number is appended to the name of each generated prediction variable.
107 | {p_end}
108 | 
109 | {pstd}
110 | The output of {opt crossfit} can be seen as the intermediate step
111 | of standard K-fold cross-validation.
112 | In a typical cross-validation exercise, a search is conducted across a range of specifications (e.g. values for a tuning parameter).
113 | The prediction errors for the holdout folds are assembled for each specification,
114 | and the specification with the best prediction performance (e.g.
smallest mean squared prediction error) is chosen.
115 | A simple example of how to use {opt crossfit} to do this is below.
116 | {p_end}
117 | 
118 | {pstd}
119 | {opt crossfit} has integrated support for {opt pystacked} (see the help for {help pystacked} if installed).
120 | {help pystacked} is a front-end for the {browse "https://scikit-learn.org/stable/index.html":scikit-learn}
121 | implementation of stacking regression.
122 | Stacking is a way of combining multiple supervised
123 | machine learners (the "base" or "level-0" learners) into
124 | an ensemble or "meta" learner.
125 | When used in conjunction with {opt crossfit}, the predictions of the {help pystacked} base learners
126 | are generated along with the ensemble predicted values.
127 | {p_end}
128 | 
129 | 
130 | {marker compatibility}{...}
131 | {title:Compatible programs}
132 | 
133 | {pstd}
134 | See {help ddml##compatibility:here}.
135 | 
136 | 
137 | {marker examples}{...}
138 | {title:Examples}
139 | 
140 | {phang2}. {stata "use http://fmwww.bc.edu/repec/bocode/j/jtpa.dta, clear"}{p_end}
141 | {phang2}. {stata "global X sex age married black hispanic"}{p_end}
142 | {phang2}. {stata "set seed 42"}{p_end}
143 | 
144 | {pstd}Note that the variable created is called yhat_1 because the number of resamples defaults to 1.{p_end}
145 | 
146 | {phang2}. {stata "crossfit, estring(reg earnings $X) gen(yhat) kfolds(3)"}{p_end}
147 | {phang2}. {stata "sum earnings yhat_1"}{p_end}
148 | 
149 | {pstd}As above but using 5 resamples.{p_end}
150 | 
151 | {phang2}. {stata "crossfit, estring(reg earnings $X) gen(yhat) kfolds(3) reps(5)"}{p_end}
152 | {phang2}. {stata "sum earnings yhat*"}{p_end}
153 | 
154 | {pstd}As above but using {help pystacked}.
155 | The default base learners are OLS, CV-lasso and gradient boosting.{p_end}
156 | 
157 | {phang2}. {stata "crossfit, estring(pystacked earnings $X) gen(yhat) kfolds(3) reps(5)"}{p_end}
158 | {phang2}. {stata "sum earnings yhat*"}{p_end}
159 | 
160 | {pstd}A simple example of 3-fold cross-validation with 5 resamples using {opt crossfit}.
161 | The example uses {opt lasso2} from {opt lassopack}; click on {stata "ssc install lassopack"} to install.
162 | We estimate using the following values of the lambda parameter: 2000, 1000, 500, 250.
163 | Each time we call {opt crossfit} to obtain the predicted values.
164 | These could be used after cross-fitting to calculate the MSPE (mean squared prediction error),
165 | but the MSPE is one of the returned results of {opt crossfit} so we just report that.
166 | The specification that minimizes the MSPE for all 5 resamples is lambda=250.
167 | {p_end}
168 | 
169 | {phang2}. {stata "crossfit, estring(lasso2 earnings $X, lglmnet lambda(2000)) gen(yhat2000) kfolds(3) reps(5)"}{p_end}
170 | {phang2}. {stata "mat list r(mse_list)"}{p_end}
171 | {phang2}. {stata "crossfit, estring(lasso2 earnings $X, lglmnet lambda(1000)) gen(yhat1000) kfolds(3) reps(5)"}{p_end}
172 | {phang2}. {stata "mat list r(mse_list)"}{p_end}
173 | {phang2}. {stata "crossfit, estring(lasso2 earnings $X, lglmnet lambda(500)) gen(yhat500) kfolds(3) reps(5)"}{p_end}
174 | {phang2}. {stata "mat list r(mse_list)"}{p_end}
175 | {phang2}. {stata "crossfit, estring(lasso2 earnings $X, lglmnet lambda(250)) gen(yhat250) kfolds(3) reps(5)"}{p_end}
176 | {phang2}. {stata "mat list r(mse_list)"}{p_end}
177 | 
178 | {pstd}When used as a standalone program, {opt crossfit} leaves behind in Mata an eStruct ("equation struct") called "crossfit".
179 | This object contains information about the estimation, stored on associative arrays. 180 | The utility {help ddml extract} can be used to extract this information. 181 | The example below shows how to list the AA keys 182 | and how to extract the {help pystacked} stacking weights for resample 2. 183 | Rows are base learners; columns are the weights for each learner.{p_end} 184 | 185 | {phang2}. {stata "mata: mata desc crossfit"}{p_end} 186 | {phang2}. {stata "ddml extract, ename(crossfit) keys"}{p_end} 187 | {phang2}. {stata "ddml extract, ename(crossfit) key1(yhat) key2(stack_base_est)"}{p_end} 188 | {phang2}. {stata "ddml extract, ename(crossfit) key1(yhat) key2(stack_weights) key3(2)"}{p_end} 189 | 190 | 191 | {marker results}{title:Saved results} 192 | 193 | {p}{opt crossfit} saves the following results in {cmd:r()}: 194 | 195 | Scalars 196 | {col 4}{opt r(N)}{col 25}Number of observations. 197 | {col 4}{opt r(mse)}{col 25}Mean squared prediction error in the last resample. 198 | 199 | Macros 200 | {col 4}{opt r(cmd_list)}{col 25}Estimation command 201 | 202 | Matrices 203 | {col 4}{opt r(N_list)}{col 25}Sample size; rows are resamples. 204 | {col 4}{opt r(mse_list)}{col 25}MSPE; rows are resamples. 205 | {col 4}{opt r(N_folds_list)}{col 25}Sample size by fold; rows are resamples. 206 | {col 4}{opt r(mse_folds_list)}{col 25}MSPE by fold; rows are resamples. 207 | 208 | 209 | {marker references}{title:References} 210 | 211 | {phang} 212 | Ahrens, A., Hansen, C.B. and M.E. Schaffer. 2020. 213 | lassopack: model selection and prediction with regularized regression in Stata. 214 | {it:The Stata Journal}, 20(1):176-235. 215 | {browse "https://journals.sagepub.com/doi/abs/10.1177/1536867X20909697"}. 216 | Working paper version: {browse "https://arxiv.org/abs/1901.05397"}.{p_end} 217 | 218 | {phang} 219 | Chernozhukov, V., Chetverikov, D., Demirer, M., 220 | Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), 221 | Double/debiased machine learning for 222 | treatment and structural parameters. 223 | {it:The Econometrics Journal}, 21: C1-C68. {browse "https://doi.org/10.1111/ectj.12097"} 224 | 225 | 226 | {marker authors}{title:Authors} 227 | 228 | {pstd} 229 | Achim Ahrens, Public Policy Group, ETH Zurich, Switzerland {break} 230 | achim.ahrens@gess.ethz.ch 231 | 232 | {pstd} 233 | Christian B. Hansen, University of Chicago, USA {break} 234 | Christian.Hansen@chicagobooth.edu 235 | 236 | {pstd} 237 | Mark E Schaffer, Heriot-Watt University, UK {break} 238 | m.e.schaffer@hw.ac.uk 239 | 240 | {pstd} 241 | Thomas Wiemann, University of Chicago, USA {break} 242 | wiemann@uchicago.edu 243 | 244 | 245 | {title:Also see (if installed)} 246 | 247 | {pstd} 248 | Help: {help ddml}, {help qddml}, {help pystacked}, {help lasso2}, {help cvlasso}.{p_end} 249 | --------------------------------------------------------------------------------