├── boba
    ├── output
    │   ├── __init__.py
    │   └── csvmerger.py
    ├── __init__.py
    ├── util.py
    ├── baseparser.py
    ├── lang.py
    ├── graphanalyzer.py
    ├── conditionparser.py
    ├── graphparser.py
    ├── cli.py
    ├── blocksyntaxparser.py
    ├── codeparser.py
    ├── wrangler.py
    ├── bobarun.py
    └── adg.py
├── example
    ├── simple
    │   ├── output
    │   │   ├── post_exe.sh
    │   │   ├── pre_exe.sh
    │   │   ├── lang.json
    │   │   ├── summary.csv
    │   │   ├── code
    │   │   │   ├── universe_4.py
    │   │   │   ├── universe_5.py
    │   │   │   ├── universe_6.py
    │   │   │   ├── universe_1.py
    │   │   │   ├── universe_2.py
    │   │   │   └── universe_3.py
    │   │   └── overview.json
    │   ├── script.py
    │   ├── gen_data.py
    │   ├── template.py
    │   └── data.csv
    ├── mortgage
    │   ├── after_execute.sh
    │   ├── visualizer_config.json
    │   ├── visualizer_config_monitor.json
    │   └── template.R
    ├── hurricane
    │   ├── after_execute.sh
    │   ├── install.R
    │   ├── data_wrangling
    │   │   ├── debug_count.py
    │   │   ├── wrangle.py
    │   │   └── data_jung.csv
    │   ├── stacking_weights.R
    │   ├── visualizer_config.json
    │   ├── README.md
    │   ├── data.csv
    │   ├── reproduce
    │   │   ├── repro_marginalize.R
    │   │   └── repro_bootstrap.R
    │   ├── repro.R
    │   ├── boba_util.R
    │   └── template.R
    ├── fertility_r
    │   ├── spec.json
    │   └── template.R
    ├── simple_cont
    │   ├── script.py
    │   ├── gen_data.py
    │   ├── template.py
    │   └── data.csv
    ├── reading
    │   ├── r
    │   │   ├── brms_test.R
    │   │   ├── install.R
    │   │   └── template.R
    │   ├── script.r
    │   └── python
    │       ├── script.py
    │       └── template.py
    └── fertility
        ├── script.py
        └── template.py
├── MANIFEST.in
├── test
    ├── __init__.py
    ├── specs
    │   ├── script-no-graph-empty.py
    │   ├── spec-good.json
    │   ├── script-no-graph.py
    │   ├── script1.py
    │   ├── script2.py
    │   ├── script2-dup.py
    │   ├── script2-syntax.py
    │   ├── script-inline-constraints.py
    │   ├── script1-bad-graph.py
    │   ├── script1-cyclic-graph.py
    │   ├── script2-dup-var.py
    │   ├── script1-good.py
    │   ├── script2-block-param.py
    │   ├── script3-2.py
    │   ├── script4-3.py
    │   ├── script3-7.py
    │   ├── script3-5.py
    │   ├── script3-6.py
    │   ├── script3-3.py
    │   ├── script4-2.py
    │   ├── script3-4.py
    │   ├── script4-1.py
    │   ├── script3-1.py
    │   ├── continuous-err.json
    │   └── continuous.json
    ├── test_c
    │   ├── lang.json
    │   └── template.c
    ├── test_lang.py
    ├── test_graph_parser.py
    ├── test_block_syntax_parser.py
    ├── test_graph_analyzer.py
    └── test_constraint_parser.py
├── requirements.txt
├── deploy.sh
├── requirements_dev.txt
├── .travis.yml
├── tox.ini
├── setup.cfg
├── .gitignore
├── setup.py
├── LICENSE
├── HISTORY.rst
├── tutorial
    ├── cli.rst
    └── simple.md
└── README.rst

/boba/output/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/example/simple/output/post_exe.sh:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/example/simple/output/pre_exe.sh:
--------------------------------------------------------------------------------
cp ../data.csv ./code/
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include README.rst
include LICENSE
include HISTORY.rst
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

"""Unit test package for boba."""
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
scikit-learn>=0.21.2
scipy>=1.6.0
six>=1.12.0
statsmodels>=0.12.2
--------------------------------------------------------------------------------
/deploy.sh:
--------------------------------------------------------------------------------
rm -rf boba.egg-info/
rm -rf build/
rm -rf dist/
python3 setup.py sdist bdist_wheel
--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------
pip>=19.2.3
bumpversion>=0.5.3
wheel>=0.32.1
tox>=3.14.0
twine>=1.12.1
--------------------------------------------------------------------------------
/test/specs/script-no-graph-empty.py:
--------------------------------------------------------------------------------
if __name__ == '__main__':
    a = {{a}}
    b = a * 2
    print(b)
--------------------------------------------------------------------------------
/example/simple/output/lang.json:
--------------------------------------------------------------------------------
{"python": {"ext": ["py"], "run": ["python", "{{script_name}}"]}, "r": {"ext": ["R", "r"], "run": ["Rscript", "{{script_name}}"]}}
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
dist: xenial
language: python
python:
  - "3.6"
  - "3.7"
install:
  - pip install -U tox-travis
script:
  - tox
--------------------------------------------------------------------------------
/example/mortgage/after_execute.sh:
--------------------------------------------------------------------------------
cd ./multiverse
boba merge estimate_{}.csv -b ./results --out estimate.csv
boba merge uncertainty_{}.csv -b ./results --out uncertainty.csv
--------------------------------------------------------------------------------
/boba/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

"""Author and execute multiverse analysis"""

__author__ = """Yang Liu"""
__email__ = 'yliu0@uw.edu'
__version__ = '1.1.2'
--------------------------------------------------------------------------------
/test/specs/spec-good.json:
--------------------------------------------------------------------------------
{
  "graph": ["A->B->C"],
  "decisions": [
    {"var": "a", "options": [2, 2.5, 3], "desc": "outlier" },
    {"var": "b", "options": [0, 1] }
  ]
}
--------------------------------------------------------------------------------
/test/test_c/lang.json:
--------------------------------------------------------------------------------
{
  "c" : {
    "ext" : ["c"],
    "compile" : ["gcc", "-o", "{{universe_name}}", "{{script_name}}"],
    "run" : ["./{{universe_name}}"]
  }
}
--------------------------------------------------------------------------------
/test/specs/script-no-graph.py:
--------------------------------------------------------------------------------
# --- (BOBA_CONFIG)
{
  "decisions": [
    {"var": "a", "options": [1]}
  ]
}
# --- (END)
if __name__ == '__main__':
    a = {{a}}
    b = a * 2
    print(b)
--------------------------------------------------------------------------------
/test/specs/script1.py:
--------------------------------------------------------------------------------
# --- (A)
if __name__ == '__main__':
    a = 1
    b = 2

    # --- (B)
    b = b + 2 * a

    if b > 1:
        # --- (C)
        b = -b
    else:
        b = 2 * b
--------------------------------------------------------------------------------
/test/specs/script2.py:
--------------------------------------------------------------------------------
if __name__ == '__main__':
    a = 1
    b = 2

    # --- (A)
    b = b + 2 * a

    if b > 1:
        # --- (B)
        b = -b
    # --- (C)
    else:
        b = 2 * b
--------------------------------------------------------------------------------
/example/hurricane/after_execute.sh:
--------------------------------------------------------------------------------
cd ./multiverse
boba merge estimate_{}.csv -b ./results --out estimate.csv
boba merge uncertainty_{}.csv -b ./results --out uncertainty.csv
boba merge null_{}.csv -b ./results --out null.csv

Rscript stacking_weights.R
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
[tox]
envlist = py36, py37
recreate = True

[travis]
python =
    3.7: py37
    3.6: py36

[testenv]
setenv =
    PYTHONPATH = {toxinidir}

commands =
    python -m unittest discover test
--------------------------------------------------------------------------------
/test/test_c/template.c:
--------------------------------------------------------------------------------
# --- (BOBA_CONFIG)
{
  "lang": "lang.json"
}
# --- (END)

#include <stdio.h>
int main() {
    printf("hello from universe ");
    printf("%d", {{id=1,2,3}});
    printf("\n");
    return 0;
}
--------------------------------------------------------------------------------
/example/simple/output/summary.csv:
--------------------------------------------------------------------------------
Filename,Code Path,cutoff,A
universe_1.py,_start->A->B,2,iqr
universe_2.py,_start->A->B,2.5,iqr
universe_3.py,_start->A->B,3,iqr
universe_4.py,_start->A->B,2,std
universe_5.py,_start->A->B,2.5,std
universe_6.py,_start->A->B,3,std
--------------------------------------------------------------------------------
/test/specs/script2-dup.py:
--------------------------------------------------------------------------------
""" This script should fail to parse due to a duplicated block id """

if __name__ == '__main__':
    a = 1
    b = 2

    # --- (A)
    b = b + 2 * a

    if b > 1:
        # --- (A)
        b = -b
    # --- (C)
    else:
        b = 2 * b
--------------------------------------------------------------------------------
/test/specs/script2-syntax.py:
--------------------------------------------------------------------------------
""" This script will fail to parse due to invalid block definition syntax """

if __name__ == '__main__':
    a = 1
    b = 2

    # --- A
    b = b + 2 * a

    if b > 1:
        # --- B
        b = -b
    # --- C
    else:
        b = 2 * b
--------------------------------------------------------------------------------
/test/specs/script-inline-constraints.py:
--------------------------------------------------------------------------------
""" Test inline constraints """

if __name__ == '__main__':
    # --- (A) a1
    a = 1

    # --- (A) a2
    a = 2

    # --- (B) b1 @if A == a1
    b = 1

    # --- (B) b2 @if A == a2
    b = 2

    # --- (C)
    print(a * b)
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[bumpversion]
current_version = 1.1.2
commit = False
tag = False

[bumpversion:file:setup.py]
search = version='{current_version}'
replace = version='{new_version}'

[bumpversion:file:boba/__init__.py]
search = __version__ = '{current_version}'
replace = __version__ = '{new_version}'
--------------------------------------------------------------------------------
/test/specs/script1-bad-graph.py:
--------------------------------------------------------------------------------
# --- (BOBA_CONFIG)
{
  "graph": ["A->B->C", "A->"],
  "decisions": {}
}

# --- (A)
if __name__ == '__main__':
    a = 1
    b = 2

    # --- (B)
    b = b + 2 * a

    if b > 1:
        # --- (C)
        b = -b
    else:
        b = 2 * b
--------------------------------------------------------------------------------
/test/specs/script1-cyclic-graph.py:
--------------------------------------------------------------------------------
# --- (BOBA_CONFIG)
{
  "graph": ["A->B->C->A"],
  "decisions": {}
}

# --- (A)
if __name__ == '__main__':
    a = 1
    b = 2

    # --- (B)
    b = b + 2 * a

    if b > 1:
        # --- (C)
        b = -b
    else:
        b = 2 * b
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea/
__pycache__
*.pyc
multiverse/
durante_etal_2013_study2.txt
MTurk_ratings_femeninity_of_hurricanes.csv
data_updated.csv
multiverse_analysis.R
amk-notes.txt
venv/
env/
prototype/

# example
example/sampling

# packaging
*.egg-info/
build/
dist/

.tox/
--------------------------------------------------------------------------------
/test/specs/script2-dup-var.py:
--------------------------------------------------------------------------------
""" Should fail to parse because a block and a variable
have the same name."""

# --- (BOBA_CONFIG)
{"decisions": [
  {"var": "a", "options": [1, 2]}
]}
# --- (END)

if __name__ == '__main__':
    # --- (a)
    a = {{a}}

    # --- (b) b1
    b = 1

    # --- (b) b2
    b = 2

    # --- (c)
    print(a * b)
--------------------------------------------------------------------------------
/boba/util.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-


class Colors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


def print_fail(msg):
    print(Colors.FAIL + msg + Colors.ENDC)

def print_warn(msg):
    print(Colors.WARNING + msg + Colors.ENDC)
--------------------------------------------------------------------------------
/test/specs/script1-good.py:
--------------------------------------------------------------------------------
# --- (BOBA_CONFIG)
{
  "graph": ["A->B->C"],
  "decisions": [
    {"var": "a", "options": [2, 2.5, 3], "desc": "outlier" },
    {"var": "b", "options": [0, 1] }
  ]
}

# --- (A)
if __name__ == '__main__':
    a = 1
    b = 2

    # --- (B)
    b = b + 2 * a

    if b > 1:
        # --- (C)
        b = -b
    else:
        b = 2 * b
--------------------------------------------------------------------------------
/test/specs/script2-block-param.py:
--------------------------------------------------------------------------------
""" Test the block-level parameter syntax """

# --- (BOBA_CONFIG)
{"decisions": [
  {"var": "b", "options": [1, 2]}
]}
# --- (END)

if __name__ == '__main__':
    # --- (A) a1
    a = {{b}}

    # --- (A) a2
    a = 2

    # --- (B) b1
    b = 1

    # --- (B) b2
    b = 2

    # --- (B) b3
    b = 3

    # --- (C)
    print(a * b)
--------------------------------------------------------------------------------
/test/specs/script3-2.py:
--------------------------------------------------------------------------------
""" Test constraints """

# --- (BOBA_CONFIG)
{
  "decisions": [
    {"var": "b", "options": [0, 1]}
  ],
  "constraints": [
    {"block": "B", "condition": "A == a2"}
  ]
}
# --- (END)

if __name__ == '__main__':
    # --- (A) a1
    a = {{b}}

    # --- (A) a2
    a = 2

    # --- (B) b1
    b = 1

    # --- (B) b2
    b = 2

    # --- (B) b3
    b = 3

    # --- (C)
    print(a * b)
--------------------------------------------------------------------------------
/example/hurricane/install.R:
--------------------------------------------------------------------------------
# create user library if it does not exist
repo = "http://cran.us.r-project.org"
lib = Sys.getenv("R_LIBS_USER")
dir.create(lib)

# install required packages
if(!require(readr)) install.packages("readr", lib, repos=repo)
if(!require(MASS)) install.packages("MASS", lib, repos=repo)
if(!require(tidyverse)) install.packages("tidyverse", lib, repos=repo)
if(!require(broom.mixed)) install.packages("broom.mixed", lib, repos=repo)
if(!require(caret)) install.packages("caret", lib, repos=repo)
--------------------------------------------------------------------------------
/example/hurricane/data_wrangling/debug_count.py:
--------------------------------------------------------------------------------
# a helper script to count which universes are missing
# for debug purposes

import os

TOTAL = 864

fs = []
for f in os.listdir(os.path.join(os.getcwd(), 'multiverse/results/')):
    name, ext = os.path.splitext(f)
    if ext == '.txt':
        fs.append(int(name.split('_')[1]))

fs.sort()

j = 0
res = []
for i in range(TOTAL):
    if i != fs[j] - 1:
        res.append(i + 1)
    else:
        j += 1

print(res)
--------------------------------------------------------------------------------
/test/specs/script4-3.py:
--------------------------------------------------------------------------------
""" Test ADG and linked decisions """

# --- (BOBA_CONFIG)
{
  "decisions": [
    {"var": "a", "options": [0, 1] },
    {"var": "b", "options": ["0", "1"]}
  ],
  "constraints": [
    {"link": ["a", "b"]}
  ]
}
# --- (END)

if __name__ == '__main__':
    # --- (A)
    a = {{a}}

    # --- (B) b1
    b = 1 + {{b}}

    # --- (B) b2
    b = 2 + {{b}}

    # --- (C)
    print(a * b)

    # --- (D)
    print(a + b)
--------------------------------------------------------------------------------
/example/fertility_r/spec.json:
--------------------------------------------------------------------------------
{
  "decisions": [
    {"var": "fertility_bounds", "options": [
      "c(7, 14, 17, 25, 17, 25)",
      "c(6, 14, 17, 27, 17, 27)",
      "c(9, 17, 18, 25, 18, 25)",
      "c(8, 14, 1, 7, 15, 28)",
      "c(9, 17, 1, 8, 18, 28)"
    ]},
    {"var": "relationship_bounds", "options": [
      "c(2, 3)", "c(1, 2)", "c(1, 3)"
    ]}
  ],
  "outputs": [
    {"name": "p-value", "value": "summar$coefficients[4, 4]"}
  ],
  "before_execute": "cp ../durante_etal_2013_study1.txt ./code/"
}
--------------------------------------------------------------------------------
/test/specs/script3-7.py:
--------------------------------------------------------------------------------
""" Test constraints """

# --- (BOBA_CONFIG)
{
  "decisions": [
    {"var": "a", "options": [0, 1, 2, 3, 4] },
    {"var": "b", "options": ["0", "1", "2", "3", "4"]}
  ],
  "constraints": [
    {"link": ["a", "b"]}
  ]
}
# --- (END)

if __name__ == '__main__':
    # --- (A)
    a = {{a}}

    # --- (B) b1
    b = 1 + {{b}}

    # --- (B) b2
    b = 2 + {{b}}

    # --- (C)
    print(a * b)

    # --- (D)
    print(a + b)
--------------------------------------------------------------------------------
/test/specs/script3-5.py:
--------------------------------------------------------------------------------
""" Test constraints """

# --- (BOBA_CONFIG)
{
  "decisions": [
    {"var": "a", "options": ["if", "else"]},
    {"var": "b", "options": [0, 1] }
  ],
  "constraints": [
    {"block": "C", "skippable": true, "condition": "a == if"}
  ]
}
# --- (END)

if __name__ == '__main__':
    # --- (A)
    a = {{a}}

    # --- (B) b1
    b = 1 + {{b}}

    # --- (B) b2
    b = 2 + {{b}}

    # --- (C)
    print(a * b)

    # --- (D)
    print(a + b)
--------------------------------------------------------------------------------
/test/specs/script3-6.py:
--------------------------------------------------------------------------------
""" Test constraints """

# --- (BOBA_CONFIG)
{
  "graph": ["A->B->C->D"],
  "decisions": [
    {"var": "a", "options": ["if", "else"]},
    {"var": "b", "options": [0, 1.5] }
  ],
  "constraints": [
    {"block": "D", "condition": "a == if and B == b1"}
  ]
}
# --- (END)

if __name__ == '__main__':
    # --- (A)
    a = {{a}}

    # --- (B) b1
    b = 1 + {{b}}

    # --- (B) b2
    b = 2 + {{b}}

    # --- (C)
    print(a * b)

    # --- (D)
    print(a + b)
--------------------------------------------------------------------------------
/test/specs/script3-3.py:
--------------------------------------------------------------------------------
""" Test constraints """

# --- (BOBA_CONFIG)
{
  "graph": ["A->B->C", "B->D"],
  "decisions": [
    {"var": "a", "options": ["if", "else"]},
    {"var": "b", "options": [0, 1] }
  ],
  "constraints": [
    {"block": "C", "condition": "B == b1"},
    {"block": "D", "condition": "B == b2"}
  ]
}
# --- (END)

if __name__ == '__main__':
    # --- (A)
    a = {{a}}

    # --- (B) b1
    b = 1 + {{b}}

    # --- (B) b2
    b = 2 + {{b}}

    # --- (C)
    print(a * b)

    # --- (D)
    print(a + b)
--------------------------------------------------------------------------------
/test/specs/script4-2.py:
--------------------------------------------------------------------------------
""" Test ADG and code graph """

# --- (BOBA_CONFIG)
{
  "graph": ["A->B->C", "B->D"],
  "decisions": [
    {"var": "a", "options": [2, 2.5, 3], "desc": "outlier" },
    {"var": "b", "options": [0, 1] },
    {"var": "c", "options": [[1, 2], [3, 4]]}
  ]
}
# --- (END)

if __name__ == '__main__':
    # --- (A) a1
    a = {{a}}

    # --- (A) a2
    a = {{a}}

    # --- (B) b1
    b = {{b}}

    # --- (B) b2
    b = 2

    # --- (B) b3
    b = 3

    # --- (C)
    print(a * b)

    # --- (D)
--------------------------------------------------------------------------------
/test/specs/script3-4.py:
--------------------------------------------------------------------------------
""" Test constraints """

# --- (BOBA_CONFIG)
{
  "graph": ["A->B->C->D"],
  "decisions": [
    {"var": "a", "options": ["if", "else"]},
    {"var": "b", "options": [0, 1] }
  ],
  "constraints": [
    {"variable": "b", "option": 1, "condition": "a.index == 0"},
    {"variable": "b", "option": 0, "condition": "a == else"}
  ]
}
# --- (END)

if __name__ == '__main__':
    # --- (A)
    a = {{a}}

    # --- (B) b1
    b = 1 + {{b}}

    # --- (B) b2
    b = 2 + {{b}}

    # --- (C)
    print(a * b)

    # --- (D)
    print(a + b)
--------------------------------------------------------------------------------
/example/simple/output/code/universe_4.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # remove outliers based on std
    df = df[np.abs(df.y - df.y.mean()) <= (2 * df.y.std())]

    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s F2: {:.3f}'.format(lm.rsquared_adj))
--------------------------------------------------------------------------------
/example/simple/output/code/universe_5.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # remove outliers based on std
    df = df[np.abs(df.y - df.y.mean()) <= (2.5 * df.y.std())]

    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s F2: {:.3f}'.format(lm.rsquared_adj))
--------------------------------------------------------------------------------
/example/simple/output/code/universe_6.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # remove outliers based on std
    df = df[np.abs(df.y - df.y.mean()) <= (3 * df.y.std())]

    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s F2: {:.3f}'.format(lm.rsquared_adj))
--------------------------------------------------------------------------------
/example/mortgage/visualizer_config.json:
--------------------------------------------------------------------------------
{
  "files": [
    {"id": "est", "path": "estimate.csv"},
    {"id": "unc", "path": "uncertainty.csv"},
    {"id": "fit", "path": "raw/disagg_pred_{}.csv", "multi": true}
  ],
  "schema": {
    "point_estimate": {"file": "est", "field": "estimate"},
    "p_value": {"file": "est", "field": "p.value"},
    "fit": {"file": "est", "field": "NRMSE"},
    "uncertainty": {"file": "unc", "field": "estimate"},
    "prediction": {"file": "fit"}
  },
  "labels": {
    "dataset": "mortgage",
    "x_axis": "Coefficient on female",
    "x_axis_fit": "Approved",
    "x_range": [-3, 8]
  }
}
--------------------------------------------------------------------------------
/example/simple/script.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # remove outliers
    # discard rows outside 2 x std
    df = df[np.abs(df.y - df.y.mean()) <= (2 * df.y.std())]

    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s F2: {:.3f}'.format(lm.rsquared_adj))
--------------------------------------------------------------------------------
/example/simple_cont/script.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # remove outliers
    # discard rows outside 2 x std
    df = df[np.abs(df.y - df.y.mean()) <= (2 * df.y.std())]

    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s F2: {:.3f}'.format(lm.rsquared_adj))
--------------------------------------------------------------------------------
/example/hurricane/stacking_weights.R:
--------------------------------------------------------------------------------
library(rstan)
library(readr)
library(tidyverse)

dir = './results'
fs = list.files(dir, pattern='^loglik')
if (length(fs) < 1) {
  stop('No matching files found, pattern: loglik_*.csv')
}

dfs = lapply(fs, function (f) {
  read_csv(file.path(dir, f), col_types='d')
})
uids = lapply(fs, function (f) {
  res = strsplit(strsplit(f, '_')[[1]][2], '\\.')[[1]][1]
  return(strtoi(res))
})

m = bind_cols(dfs) %>% as.matrix
weights = loo::stacking_weights(m)
res = enframe(c(weights)) %>%
  select(-name) %>%
  add_column(unlist(uids)) %>%
  rename(weights = 1, uid = 2)
write_csv(res, './weights.csv')
--------------------------------------------------------------------------------
/example/mortgage/visualizer_config_monitor.json:
--------------------------------------------------------------------------------
{
  "files": [
    {"id": "est", "path": "estimate.csv"},
    {"id": "unc", "path": "uncertainty.csv"},
    {"id": "fit", "path": "results/disagg_fit_{}.csv", "multi": true}
  ],
  "schema": {
    "point_estimate": {"file": "est", "field": "estimate"},
    "p_value": {"file": "est", "field": "p.value"},
    "fit": {"file": "est", "field": "R2_flipped"},
    "uncertainty": {"file": "unc", "field": "estimate"},
    "prediction": {"file": "fit"}
  },
  "labels": {
    "dataset": "mortgage",
    "x_axis": "Coefficient on female",
    "x_axis_fit": "Approved",
    "x_range": [-3, 8],
    "fit_range": [0, 1]
  }
}
--------------------------------------------------------------------------------
/test/specs/script4-1.py:
--------------------------------------------------------------------------------
""" Test ADG """

# --- (BOBA_CONFIG)
{
  "decisions": [
    {"var": "a", "options": [2, 2.5, 3], "desc": "outlier" },
    {"var": "b", "options": [0, 1] },
    {"var": "c", "options": [[1, 2], [3, 4]]}
  ],
  "constraints": [
    {"block": "B", "option": "b1", "condition": "A == a1"},
    {"block": "B", "option": "b2", "condition": "A == a2"}
  ]
}
# --- (END)

if __name__ == '__main__':
    # --- (A) a1
    a = 1

    # --- (A) a2
    a = 2

    # --- (B) b1
    b = {{b}}

    # --- (B) b2
    b = 2

    # --- (B) b3
    b = 3

    # --- (C)
    print(a * b)
--------------------------------------------------------------------------------
/test/specs/script3-1.py:
--------------------------------------------------------------------------------
""" Test constraints """

# --- (BOBA_CONFIG)
{
  "decisions": [
    {"var": "a", "options": [2, 2.5, 3], "desc": "outlier" },
    {"var": "b", "options": [0, 1] },
    {"var": "c", "options": [[1, 2], [3, 4]]}
  ],
  "constraints": [
    {"block": "B", "option": "b1", "condition": "A == a1"},
    {"block": "B", "option": "b2", "condition": "A == a2"}
  ]
}
# --- (END)

if __name__ == '__main__':
    # --- (A) a1
    a = {{b}}

    # --- (A) a2
    a = 2

    # --- (B) b1
    b = 1

    # --- (B) b2
    b = 2

    # --- (B) b3
    b = 3

    # --- (C)
    print(a * b)
--------------------------------------------------------------------------------
/example/simple/gen_data.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import numpy as np
import pandas as pd

# create a synthetic dataset and save to data.csv
if __name__ == '__main__':
    # create a linear series y = 10 + 0.5 * x plus random Gaussian noise
    n = 100
    x = np.random.uniform(0, 5, n)
    y = 10 + 0.5 * x + np.random.normal(0, 0.2, n)

    # make outliers
    mean = np.mean(y)
    sd = np.std(y)
    cutoff = [2.4, 2.9, 3.4]
    for i in range(len(cutoff)):
        y[i * 2] = mean + cutoff[i] * sd
        y[i * 2 + 1] = mean - cutoff[i] * sd

    # save file
    df = pd.DataFrame(np.column_stack((x, y)), columns=['x', 'y'])
    df.to_csv('data.csv', index=False)
--------------------------------------------------------------------------------
/example/simple/output/code/universe_1.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # remove outliers based on iqr
    iqr = np.subtract(*np.percentile(df.y, [75, 25]))
    median = np.median(df.y)
    df = df[abs(df.y - median) <= 2 * iqr]

    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s F2: {:.3f}'.format(lm.rsquared_adj))
--------------------------------------------------------------------------------
/example/simple/output/code/universe_2.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # remove outliers based on iqr
    iqr = np.subtract(*np.percentile(df.y, [75, 25]))
    median = np.median(df.y)
    df = df[abs(df.y - median) <= 2.5 * iqr]

    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s F2: {:.3f}'.format(lm.rsquared_adj))
--------------------------------------------------------------------------------
/example/simple/output/code/universe_3.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # remove outliers based on iqr
    iqr = np.subtract(*np.percentile(df.y, [75, 25]))
    median = np.median(df.y)
    df = df[abs(df.y - median) <= 3 * iqr]

    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s F2: {:.3f}'.format(lm.rsquared_adj))
--------------------------------------------------------------------------------
/example/simple_cont/gen_data.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import numpy as np
import pandas as pd

# create a synthetic dataset and save to data.csv
if __name__ == '__main__':
    # create a linear series y = 10 + 0.5 * x plus random Gaussian noise
    n = 100
    x = np.random.uniform(0, 5, n)
    y = 10 + 0.5 * x + np.random.normal(0, 0.2, n)

    # make outliers
    mean = np.mean(y)
    sd = np.std(y)
    cutoff = [2.4, 2.9, 3.4]
    for i in range(len(cutoff)):
        y[i * 2] = mean + cutoff[i] * sd
        y[i * 2 + 1] = mean - cutoff[i] * sd

    # save file
    df = pd.DataFrame(np.column_stack((x, y)), columns=['x', 'y'])
    df.to_csv('data.csv', index=False)
--------------------------------------------------------------------------------
/example/simple/output/overview.json:
--------------------------------------------------------------------------------
{
  "decisions": [
    {
      "options": [
        2,
        2.5,
        3
      ],
      "var": "cutoff"
    },
    {
      "options": [
        "std",
        "iqr"
      ],
      "var": "A"
    }
  ],
  "graph": {
    "edges": [
      {
        "source": 1,
        "target": 0,
        "type": "order"
      }
    ],
    "nodes": [
      {
        "id": 0,
        "name": "cutoff"
      },
      {
        "id": 1,
        "name": "A"
      }
    ]
  },
  "visualizer": {
    "files": [
      {
        "id": "est",
        "path": "estimates.csv"
      }
    ],
    "schema": {
      "point_estimate": {
        "field": "estimate",
        "file": "est"
      }
    }
  }
}
--------------------------------------------------------------------------------
/example/hurricane/visualizer_config.json:
--------------------------------------------------------------------------------
{
  "files": [
    {"id": "est", "path": "estimate.csv"},
    {"id": "unc", "path": "uncertainty.csv"},
    {"id": "nul", "path": "null.csv"},
    {"id": "wei", "path": "weights.csv"},
    {"id": "fit", "path": "results/disagg_fit_{}.csv", "multi": true}
  ],
  "schema": {
    "point_estimate": {"file": "est", "field": "expected_diff"},
    "fit": {"file": "est", "field": "NRMSE"},
    "uncertainty": {"file": "unc", "field": "expected_diff"},
    "prediction": {"file": "fit", "transform": "math.log2({} + 1)"},
    "null_distribution": {"field": "expected_diff", "file": "nul"},
    "stacking_weight": {"field": "weights", "file": "wei"}
  },
  "labels": {
    "dataset": "hurricane",
    "x_axis": "Expected Deaths: Female - Male",
    "x_axis_fit": "Log2(Death + 1)",
    "fit_range": [0, 1],
    "x_range": [-10, 50],
    "x_range_outer": [-120, 300]
  }
}
--------------------------------------------------------------------------------
/example/simple/template.py:
--------------------------------------------------------------------------------
# --- (BOBA_CONFIG)
{"before_execute": "cp ../data.csv ./code/"}
# --- (END)
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # --- (A) std
    # remove outliers based on std
    df = df[np.abs(df.y - df.y.mean()) <= ({{cutoff=2,2.5,3}} * df.y.std())]

    # --- (A) iqr
    # remove outliers based on iqr
    iqr = np.subtract(*np.percentile(df.y, [75, 25]))
    median = np.median(df.y)
    df = df[abs(df.y - median) <= {{cutoff}} * iqr]

    # --- (B)
    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s F2: {:.3f}'.format(lm.rsquared_adj))
--------------------------------------------------------------------------------
/example/reading/r/brms_test.R:
--------------------------------------------------------------------------------
library(brms)

# read data
zinb <- read.csv("http://stats.idre.ucla.edu/stat/data/fish.csv")
zinb$camper <- factor(zinb$camper, labels = c("no", "yes"))
head(zinb)

# fit model
fit_zinb1 <- brm(count ~ persons + child + camper, data = zinb,
                 family = zero_inflated_poisson("log"))

# view results
summary(fit_zinb1)
pdf(file="out.pdf")
plot(fit_zinb1, pars = c("persons", "child", "camper"))
marginal_effects(fit_zinb1)

# get the full STAN log, for debugging purposes
# library(rstan)
# mc <- make_stancode(count ~ persons + child + camper, data = zinb, family = zero_inflated_poisson("log"))
# stan_model(model_code = mc, verbose = TRUE)

# Compilation error on macOS Mojave
# These shell commands worked for me:
# xcode-select --install
# open /Library/Developer/CommandLineTools/Packages/macOS_SDK_headers_for_macOS_10.14.pkg

# read fitted model
path = 'some_file.rds'
fit <- suppressWarnings(try(readRDS(path), silent = TRUE))
summary(fit)
--------------------------------------------------------------------------------
/example/reading/r/install.R:
--------------------------------------------------------------------------------
# create user library if it does not exist
repo = "http://cran.us.r-project.org"
lib = Sys.getenv("R_LIBS_USER")
dir.create(lib)

# configure C++ toolchain on Linux in order to use RStan
# https://github.com/stan-dev/rstan/wiki/Installing-RStan-on-Linux
dotR <- file.path(Sys.getenv("HOME"), ".R")
if (!file.exists(dotR)) dir.create(dotR)
M <- file.path(dotR, "Makevars")
if (!file.exists(M)) file.create(M)
cat("\nCXX14FLAGS=-O3 -march=native -mtune=native -fPIC",
    "CXX14=g++", # or clang++ but you may need a version postfix
    file = M, sep = "\n", append = TRUE)

# install required packages
if(!require(readr)) install.packages("readr", lib, repos=repo)
if(!require(lmerTest)) install.packages("lmerTest", lib, repos=repo)
if(!require(brms)) install.packages("brms", lib, repos=repo)
if(!require(car)) install.packages("car", lib, repos=repo)
if(!require(psych)) install.packages("psych", lib, repos=repo)
if(!require(scales)) install.packages("scales", lib, repos=repo)
if(!require(ordinal)) install.packages("ordinal", lib, repos=repo)
--------------------------------------------------------------------------------
/example/simple_cont/template.py:
--------------------------------------------------------------------------------
# --- (BOBA_CONFIG)
{
  "decisions": [
    {"var": "cutoff",
     "options": [
       {
         "seed" : 0,
         "sample" : "uniform",
         "count" : 50,
         "min" : 1.0,
         "max" : 3.0
       }
     ]
    }
  ],
  "before_execute": "cp ../data.csv ./code/"
}
# --- (END)

#!/usr/bin/env python3
import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # --- (A) std
    # remove outliers based on std
    df = df[np.abs(df.y - df.y.mean()) <= ({{cutoff}} * df.y.std())]

    # --- (A) iqr
    # remove outliers based on iqr
    iqr = np.subtract(*np.percentile(df.y, [75, 25]))
    median = np.median(df.y)
    df = df[abs(df.y - median) <= {{cutoff}} * iqr]

    # --- (B)
    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s F2: {:.3f}'.format(lm.rsquared_adj))
--------------------------------------------------------------------------------
/example/reading/script.r:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript

library(readr)
library(lmerTest)
library(car)
library(psych)
library(scales)

speed_data <- read_csv('data.csv')

#calculate reading speed in WPM
speed_data$speed <- speed_data$num_words/(speed_data$adjust_rt/60000)

#remove retake participants
speed_data <- subset(speed_data, retake != 1)

#remove outliers
iqr = IQR(speed_data[speed_data$dyslexia_bin == 0,]$speed,na.rm=TRUE)
cutoff_high = median(speed_data$speed) +3*iqr #3*iqr=645, cutoff_high = 928

#-------remove trials based on speed-------
result_analysis <- speed_data[! speed_data$speed > cutoff_high, ]
result_analysis <- result_analysis[ ! result_analysis$speed < 10,]

#-------remove smartphone users-------
length(unique(subset(result_analysis$uuid, result_analysis$device=='smartphone')))
#remove 64 smartphone users, 363 trials
result_analysis <- result_analysis[! result_analysis$device == 'smartphone',]

#-------remove trials based on comprehension < 2/3-------
result_analysis <- result_analysis[ ! result_analysis$correct_rate < .6,]
#remove 111 trials

result_analysis$log_speed <- log(result_analysis$speed)

#dyslexia in three groups
model <- lmer(log_speed ~ img_width + num_words + page_condition*as.factor(dyslexia) + age + english_native + (1 | uuid), data = result_analysis)
AIC(model)
summary(model)
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from setuptools import setup, find_packages

with open("README.rst", "r") as fh:
    readme = fh.read()

with open('HISTORY.rst') as history_file:
    history = history_file.read()

requirements = ['Click>=6.0', 'dataclasses>=0.6', 'pandas>=1.0.1']

setup_requirements = []

test_requirements = []

setup(
    author="Yang Liu",
    author_email='yliu0@uw.edu',
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: BSD License',
        'Natural Language :: English',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
    ],
    description="Author and execute multiverse analysis",
    entry_points={
        'console_scripts': [
            'boba=boba.cli:main',
        ],
    },
    install_requires=requirements,
    license="BSD license",
    long_description=readme + '\n\n' + history,
    include_package_data=True,
    keywords='multiverse analysis',
    name='boba',
    packages=find_packages(include=['boba', 'boba.*']),
    setup_requires=setup_requirements,
    test_suite='tests',
    tests_require=test_requirements,
    url='https://github.com/uwdata/boba',
    version='1.1.2',
    zip_safe=False,
)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2019, University of Washington Interactive Data Lab.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors
   may be used to endorse or promote products derived from this software
   without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/HISTORY.rst:
--------------------------------------------------------------------------------
=======
History
=======

1.1.2 (2021-04-25)
==================

* Remove the dependency on boba-visualizer

1.1.1 (2021-04-25)
==================

* Support the Boba monitor

1.1.0 (2020-10-07)
==================

* Add support for arbitrary language
* Various bug fixes

1.0.0 (2020-07-31)
==================

* Support continuous placeholder variable
* Support running the multiverse across multiple processes
* Improve boba run, such that it is not dependent on the OS
* Various bug fixes
* Integrate boba visualizer

0.1.4 (2020-04-19)
==================

* Combine JSON spec with the template
* Support inline definition for placeholder variables
* Support inline constraint at block declaration
* Support linked decisions
* Infer ADG from specification
* Update examples
* Various bug fixes

0.1.3 (2019-11-30)
==================

* Revise authoring syntax to support decision blocks and constraints
* Revise CLI, with separate commands to compile and to run
* Improve execution
* Add the hurricane example

0.1.2 (2019-09-19)
==================

* Fix bugs

0.1.1 (2019-09-19)
==================

* Support R
* Improve CLI options
* Support a built-in variable {{_n}}, which represents the universe number
* Support "before_execute" and "after_execute" hooks in the JSON spec
* Update examples

0.1.0 (2019-08-26)
==================

* First release on PyPI.
--------------------------------------------------------------------------------
/boba/baseparser.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import re
import sys
from dataclasses import dataclass


@dataclass
class Token:
    type: str
    value: str


class ParseError(SyntaxError):
    pass


class BaseParser:

    def __init__(self, line):
        self.line = line
        self.i = 0
        self.row = 0
        self.col = 0
        self.current = None

    @staticmethod
    def _is_whitespace(char):
        return any(c == char for c in ' \t\n')

    @staticmethod
    def _is_id_start(ch):
        return bool(re.match('[a-zA-Z]', ch))

    @staticmethod
    def _is_id(ch):
        return bool(re.match('[_a-zA-Z0-9]', ch))

    @staticmethod
    def _is_digit(ch):
        return bool(re.match('[0-9]', ch))

    def _next_char(self):
        ch = self.line[self.i]
        self.i += 1
        if ch == '\n':
            self.row += 1
            self.col = 0
        else:
            self.col += 1
        return ch

    def _peek_char(self):
        return self.line[self.i]

    def _is_end(self):
        return self.i >= len(self.line)

    def _read_while(self, fun, max_len=sys.maxsize):
        s = ''
        while not self._is_end() and fun(self._peek_char()) and len(s) < max_len:
            s += self._next_char()
        return s

    def _peek(self):
        if not self.current:
            self.current = self._read_next()
        return self.current

    def _next(self):
        tmp = self.current
        self.current = None
        return tmp or self._read_next()

    def _read_next(self):
        pass
--------------------------------------------------------------------------------
/test/test_lang.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

# Ugly hack to allow import from the root folder
import shutil
import sys
import os
sys.path.insert(0, os.path.abspath('..'))

import unittest
from unittest.mock import patch
import io
from boba.cli import compile, run


def run_click(fn, args):
    """ run a click function """

    stdout = sys.stdout
    null = open(os.devnull, 'w')
    sys.stdout = null
    try:
        fn(args)
    except SystemExit as e:
        if e.code != 0:
            raise RuntimeError('nonzero exit code: ' + str(e))

    sys.stdout = stdout
    null.close()

class TestLang(unittest.TestCase):
    def test_c(self):
        folder = 'test/test_c'
        script = os.path.join(folder, 'template.c')
        out = folder
        multiverse = os.path.join(folder, 'multiverse')

        run_click(compile, ['-s', script, '--out', folder])

        file_base = os.path.join(out, 'multiverse/code/universe_')
        ext = '.c'
        for i in range(1, 4):
            f = file_base + str(i) + ext
            if not os.path.isfile(file_base + str(i) + ext):
                self.fail('did not generate universe ' + f)

        run_click(run, ['--dir', multiverse, '-a'])

        file_base = os.path.join(out, 'multiverse/boba_logs/log_')
        ext = '.txt'
        for i in range(1, 4):
            fn = file_base + str(i) + ext
            if not os.path.isfile(fn):
                self.fail('did not generate log ' + str(i))

            with open(fn) as f:
                read = f.read()
                if read != 'hello from universe ' + str(i) + '\n':
                    self.fail('universe generated unexpected output "' + read + '"')

        shutil.rmtree(multiverse)
--------------------------------------------------------------------------------
/boba/lang.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import os

PY = 'python'
R = 'r'

script = '{{script_name}}'
compiled = '{{universe_name}}'

DEFAULT_LANGS = {
    'python' : {
        'ext' : ['py'],
        'run' : ['python', script]
    },
    'r' : {
        'ext' : ['R', 'r'],
        'run' : ['Rscript', script]
    }
}

class LangError(NameError):
    pass


class Lang:
    def __init__(self, script, lang=None, supported_langs=None):
        self.supported_langs = DEFAULT_LANGS
        if supported_langs:
            for l in supported_langs:
                self.supported_langs[l] = supported_langs[l]

        self.script = script
        self.name, self.ext = os.path.splitext(script)
        self.lang = self._infer_lang(lang)

    def _infer_lang(self, lang):
        if lang:
            lang = lang.strip().lower()
            if lang not in self.supported_langs:
                raise LangError('Error: language "{}" is not supported'.format(lang))

            return lang, self.supported_langs[lang]
        else:
            for lang, lang_properties in self.supported_langs.items():
                if self.ext[1:] in lang_properties['ext']:
                    return lang, lang_properties

            raise LangError('Error: cannot infer language from file extension ' + self.ext)

    def _format_cmd(self, cmd):
        return cmd.strip().replace(script, self.script).replace(compiled, self.name)

    def get_ext(self):
        return self.ext

    def get_cmd(self):
        cmd = []
        if 'compile' in self.lang[1]:
            cmd.append([self._format_cmd(x) for x in self.lang[1]['compile']])

        cmd.append([self._format_cmd(x) for x in self.lang[1]['run']])
        return cmd

    def is_r(self):
        return self.lang[0] == R

    def is_python(self):
        return self.lang[0] == PY
--------------------------------------------------------------------------------
/example/hurricane/data_wrangling/wrangle.py:
--------------------------------------------------------------------------------
# We augmented the hurricane dataset by Jung et al. via the following steps:
# (1) Add entries for two hurricanes, Katrina and Audrey
# (2) Update normalized damage for all hurricanes, as adjusted to 2019 dollar values
# (3) Retrieve the highest wind speed for all hurricanes
# (4) Replace the femininity ratings for all hurricanes
# Normalized damage was retrieved at: http://www.icatdamageestimator.com/commonsearch
# The ratings for (4) are provided by Uri Simonson


import pandas as pd
import numpy as np
from scipy.stats.stats import pearsonr

# read csv
jung = pd.read_csv('data_jung.csv')
df = pd.read_csv('data_updated.csv')
ratings = pd.read_csv('MTurk_ratings_femeninity_of_hurricanes.csv')

# take the average of ratings and store in a dictionary keyed by name
rs = dict()
for c in ratings:
    if c.startswith('Q1'):
        # the first row is also a header, extract name from the question
        name = ratings[c][0].split('-')[-1]
        # take the average of ratings, excluding the first row
        rs[name] = np.mean(ratings[c][1:].astype('int32'))

# fill in the ratings to our updated dataset
for i in df.index:
    name = df.at[i, 'Name']
    df.at[i, 'MasFem'] = rs[name]
    df.at[i, 'Gender_MF'] = 1 if rs[name] > 6 else 0
df.Gender_MF = df.Gender_MF.astype('int32')

# check the correlation between original and updated damage
dff = df[(df.Name != 'Audrey') & (df.Name != 'Katrina')]
r = pearsonr(jung.NDAM, dff.NDAM)
print('Correlation of normalized damage: {}'.format(r[0]))

# check the correlation between original and updated gender ratings
r = pearsonr(jung.MasFem, dff.MasFem)
print('Correlation of gender ratings: {}'.format(r[0]))
r = pearsonr(jung.Gender_MF, dff.Gender_MF)
print('Correlation of binary gender flag: {}'.format(r[0]))

# results:
# Correlation of normalized damage: 0.942
# Correlation of gender ratings: 0.981
# Correlation of binary gender flag: 0.951

# save
df.to_csv('./data.csv', index=False)
--------------------------------------------------------------------------------
/boba/output/csvmerger.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os
import re
import pandas as pd
import boba.util as util

STR_MAX = 1024


class CSVMerger:
    def __init__(self, pattern, base, out, delimiter=','):
        self.pattern = pattern
        self.base = base
        self.out = out
        self.delimiter = delimiter

    def _fn_func(self, i):
        return self.pattern.format(i)

    def _to_regex(self):
        """ Convert the string pattern to regex. """
        i = self.pattern.find('{}')
        if i < 0:
            util.print_fail('Invalid pattern: {}'.format(self.pattern))
            exit(1)

        rg = re.compile('^' + re.escape(self.pattern[:i]) + r'(\d+)' +
                        re.escape(self.pattern[i+2:]))
        return rg
The indices are sorted.""" 35 | idx = [] 36 | for f in os.listdir(self.base): 37 | m = re.match(self._to_regex(), f) 38 | if m: 39 | idx.append(int(m.group(1))) 40 | idx.sort() 41 | return idx 42 | 43 | def merge(self): 44 | """ Merge the CSV files into one file """ 45 | result = pd.DataFrame() 46 | for i in self.get_files(): 47 | # read the file, keeping every column as a string 48 | df = pd.read_csv(os.path.join(self.base, self._fn_func(i)), 49 | delimiter=self.delimiter, 50 | converters={c: str for c in range(STR_MAX)}) 51 | n = len(list(df.columns)) 52 | 53 | # augment 54 | df['uid'] = i 55 | 56 | # rearrange columns so that uid comes first 57 | cols = list(df.columns) 58 | cols = cols[n:] + cols[:n] 59 | df = df[cols] 60 | 61 | # merge with previous results 62 | result = pd.concat([result, df], axis=0, sort=False) 63 | 64 | return result 65 | 66 | def main(self): 67 | res = self.merge() 68 | res.to_csv(self.out, index=False) 69 | -------------------------------------------------------------------------------- /example/reading/python/script.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import statsmodels.formula.api as smf 6 | 7 | if __name__ == '__main__': 8 | # read data 9 | df = pd.read_csv('data.csv') 10 | 11 | # calculate reading speed in WPM 12 | df['speed'] = df.apply(lambda row: row.num_words / row.adjust_rt * 60000, 13 | axis=1) 14 | 15 | # remove retake participants 16 | df = df[df.retake != 1] 17 | 18 | # remove outliers based on reading speed 19 | iqr = np.subtract(*np.percentile(df.speed, [75, 25])) 20 | cutoff_high = np.median(df.speed) + 3 * iqr 21 | df = df[df.speed <= cutoff_high] 22 | df = df[df.speed >= 10] 23 | 24 | # remove smartphone users 25 | df = df[~df.device.isin(['smartphone'])] 26 | 27 | # drop NA rows 28 | df = df.dropna() 29 | 30 | # log-normalize speed 31 | df['log_speed'] = np.log(df.speed) 32 | 33 | # make dyslexia a categorical variable 34 | df.dyslexia = df.dyslexia.astype('category') 35 | 36 | # wrangle education level 37 | edu_order = ['pre-high school', 'high school', 'professional school', 38 | 'college', 'graduate school', 'PhD', 'postdoctoral'] 39 | tp = pd.CategoricalDtype(categories=edu_order, ordered=True) 40 | df['edu_level'] = df.education.astype(tp).cat.codes 41 | 42 | # check correlation between IVs 43 | ivs = df[['img_width', 'num_words', 'page_condition', 'age']] 44 | print(ivs.corr(), '\n') 45 | print(pd.crosstab(df.english_native, df.dyslexia, normalize='columns'), '\n') 46 | print(pd.crosstab(df.device, df.dyslexia, normalize='columns'), '\n') 47 | 48 | # fit a multinomial logit model to accuracy 49 | df['acc'] = 3 - pd.Categorical(df.correct_rate).codes 50 | print(df.groupby('acc').size(), '\n') 51 | fml = 'acc ~ page_condition*dyslexia_bin' 52 | model = smf.mnlogit(fml, df).fit() 53 | print(model.summary(), '\n') 54 | 55 | # remove trials based on comprehension < 2/3 56 | df = df[df.correct_rate > 0.6] 57 | 58 | # fit a linear mixed effects model 59 | fml = 'log_speed ~ img_width + num_words + page_condition*dyslexia' \ 60 | '+ age + english_native' 61 | model = smf.mixedlm(fml, df, groups=df.uuid).fit() 62 | print(model.summary()) 63 | -------------------------------------------------------------------------------- /test/test_graph_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Ugly hack to allow import from the root folder 4 | import sys 5 | import os 6 | 
sys.path.insert(0, os.path.abspath('..')) 7 | 8 | import unittest 9 | from boba.graphparser import GraphParser, Edge, ParseError 10 | 11 | 12 | class TestParser(unittest.TestCase): 13 | def test_good_specs(self): 14 | spec = ['A -> B -> C1', 'B->C2'] 15 | nodes, edges = GraphParser(spec).parse() 16 | self.assertSetEqual(nodes, {'A', 'B', 'C1', 'C2'}) 17 | exp_edges = {Edge('A', 'B'), Edge('B', 'C1'), Edge('B', 'C2')} 18 | self.assertSetEqual(edges, exp_edges) 19 | 20 | def test_weird_specs(self): 21 | spec = ['a->a->a->a b '] 22 | nds, eds = GraphParser(spec).parse() 23 | self.assertSetEqual(nds, {'a', 'b'}) 24 | self.assertSetEqual(eds, {Edge('a', 'a')}) 25 | 26 | spec = ['a b', 'c'] 27 | nds, eds = GraphParser(spec).parse() 28 | self.assertSetEqual(nds, {'a', 'b', 'c'}) 29 | self.assertSetEqual(eds, set()) 30 | 31 | spec = ['a->b c->b'] 32 | nds, eds = GraphParser(spec).parse() 33 | self.assertSetEqual(nds, {'a', 'b', 'c'}) 34 | self.assertSetEqual(eds, {Edge('a', 'b'), Edge('c', 'b')}) 35 | 36 | def test_syntax_error(self): 37 | spec = ['my_first_node -> my_second_node'] 38 | nds, eds = GraphParser(spec).parse() 39 | self.assertSetEqual(nds, {'my_first_node', 'my_second_node'}) 40 | self.assertSetEqual(eds, {Edge('my_first_node', 'my_second_node')}) 41 | 42 | spec = ['_start -> _end'] 43 | with self.assertRaisesRegex(ParseError, '(?i)cannot handle character'): 44 | GraphParser(spec).parse() 45 | 46 | spec = ['-> B'] 47 | with self.assertRaisesRegex(ParseError, '(?i)source node'): 48 | GraphParser(spec).parse() 49 | 50 | spec = ['A -> B ->'] 51 | with self.assertRaisesRegex(ParseError, '(?i)target node'): 52 | GraphParser(spec).parse() 53 | 54 | spec = ['A - B'] 55 | with self.assertRaises(ParseError): 56 | GraphParser(spec).parse() 57 | 58 | spec = ['A->B->C, B->D'] 59 | with self.assertRaises(ParseError): 60 | GraphParser(spec).parse() 61 | 62 | 63 | if __name__ == '__main__': 64 | unittest.main() 65 | -------------------------------------------------------------------------------- /example/fertility/script.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import statsmodels.api as sm 6 | import statsmodels.formula.api as smf 7 | 8 | if __name__ == '__main__': 9 | # read data file 10 | df = pd.read_csv('durante_etal_2013_study1.txt', delimiter='\t') 11 | 12 | # remove NA 13 | df = df.dropna(subset=['rel1', 'rel2', 'rel3']) 14 | 15 | # create religiosity score 16 | df['rel_comp'] = np.around((df.rel1 + df.rel2 + df.rel3) / 3, decimals=2) 17 | 18 | # next menstrual onset (nmo) assessment 19 | df.last_period_start = pd.to_datetime(df.last_period_start) 20 | df.period_before_last_start = pd.to_datetime(df.period_before_last_start) 21 | df.date_testing = pd.to_datetime(df.date_testing) 22 | 23 | # first nmo option: based on computed cycle length 24 | computed = df.last_period_start - df.period_before_last_start 25 | next_onset = df.last_period_start + computed 26 | 27 | # second nmo option: based on reported cycle length 28 | df = df.dropna(subset=['reported_cycle_length']) 29 | next_onset2 = df.last_period_start + df.reported_cycle_length.apply( 30 | lambda a: pd.Timedelta(days=a)) 31 | 32 | # compute cycle day 33 | df['cycle_day'] = pd.Timedelta('28 days') - (next_onset - df.date_testing) 34 | df.cycle_day = (df.cycle_day / np.timedelta64(1, 'D')).astype(int) 35 | df.cycle_day = np.clip(df.cycle_day, 1, 28) 36 | 37 | # fertility assessment 38 | high_bounds = [6, 14] 39 | 
low_bounds = [17, 27] 40 | df.loc[(high_bounds[0] <= df.cycle_day) & (df.cycle_day <= high_bounds[1]), 41 | 'fertility'] = 'High' 42 | df.loc[(low_bounds[0] <= df.cycle_day) & (df.cycle_day <= low_bounds[1]), 43 | 'fertility'] = 'Low' 44 | 45 | # relationship status assessment 46 | # single = response options 1 and 2; relationship = response options 3 and 4 47 | df.loc[df.relationship <= 2, 'relationship_status'] = 'Single' 48 | df.loc[df.relationship > 2, 'relationship_status'] = 'Relationship' 49 | 50 | # exclusion based on cycle length 51 | df = df[(df.reported_cycle_length >= 25) & 52 | (df.reported_cycle_length <= 35)] 53 | 54 | # exclusion based on certainty ratings 55 | df = df[(df.sure1 >= 6) & (df.sure2 >= 6)] 56 | 57 | # perform an ANOVA on the processed data set 58 | lm = smf.ols('rel_comp ~ relationship_status * fertility', data=df).fit() 59 | print(lm.summary(), '\n') 60 | table = sm.stats.anova_lm(lm, typ=2) 61 | print(table) 62 | -------------------------------------------------------------------------------- /example/hurricane/README.md: -------------------------------------------------------------------------------- 1 | # Multiverse of the Hurricane Dataset 2 | 3 | In this example, we implemented the specification curve analysis on Jung's hurricane study, 4 | following the seminal paper of Simonsohn et al. 5 | 6 | Useful URLs: 7 | - Specification curve paper by Simonsohn et al.: 8 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2694998 9 | 10 | - The appendix of the specification curve paper: 11 | http://urisohn.com/sohn_files/wp/wordpress/wp-content/uploads/Supplement-Specification-Curve-2019-10-29.pdf 12 | 13 | - STATA code implementing the specification curve analysis: 14 | http://urisohn.com/sohn_files/files/Specification%20Curve.zip 15 | 16 | - Hurricane paper by Jung et al.: 17 | https://doi.org/10.1073/pnas.1402786111 18 | 19 | - Supporting material of the hurricane paper: 20 | https://www.pnas.org/content/suppl/2014/05/30/1402786111.DCSupplemental 21 | 22 | 23 | ### Augmenting the Dataset 24 | 25 | Following the description in Uri Simonsohn's STATA code, we augmented the original 26 | hurricane dataset via the following steps: 27 | 28 | - We added the two outliers excluded in Jung's study: Katrina and Audrey. 29 | - We replaced the femininity ratings (MasFem) with the average ratings from 32 MTurkers, 30 | collected using the same scale as described in Jung's paper. Uri Simonsohn kindly 31 | provided the MTurk ratings to us. 32 | - Accordingly, we also updated the binary gender indicator, so a femininity rating higher 33 | than 6 is categorized as female. 34 | - We updated the normalized damage to 2019 dollar values, using the same website as 35 | Jung et al.: http://www.icatdamageestimator.com/commonsearch 36 | - We added a column of highest wind speed (mph) using Wikipedia as the data source. 37 | 38 | ### Notes 39 | 40 | 1. The multiverse specification has two versions: `template.R`, which follows the decision definitions 41 | on page 4 of Simonsohn's appendix and creates 864 universes, and `repro.R`, which replicates Simonsohn's 42 | code implementation to create 1728 universes. The difference is due to separating the first decision (of size 6) 43 | into a cross product of two decisions (3x4), thus doubling the size of the final multiverse. 44 | 45 | 2. As we used a slightly different dataset from the one used by Jung et al., we did not obtain the same result when using the original specification in Jung's study. 46 | 47 | 3. 
About 40 universes, all fitting a negative binomial model, will fail because 48 | of this error: 49 | ``` 50 | Error in glm.fitter(x = X, y = Y, w = w, etastart = eta, offset = offset, : 51 | NA/NaN/Inf in 'x' 52 | Calls: glm.nb -> glm.fitter 53 | ``` 54 | The helper script `debug_count.py` outputs which universes failed. 55 | -------------------------------------------------------------------------------- /example/mortgage/template.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # --- (BOBA_CONFIG) 3 | { 4 | "decisions": [ 5 | {"var": "black", "options": ["+ black", ""]}, 6 | {"var": "housing_expense_ratio", "options": ["+ housing_expense_ratio", ""]}, 7 | {"var": "self_employed", "options": ["+ self_employed", ""]}, 8 | {"var": "married", "options": ["+ married", ""]}, 9 | {"var": "bad_history", "options": ["+ bad_history", ""]}, 10 | {"var": "PI_ratio", "options": ["+ PI_ratio", ""]}, 11 | {"var": "loan_to_value", "options": ["+ loan_to_value", ""]}, 12 | {"var": "denied_PMI", "options": ["+ denied_PMI", ""]} 13 | ], 14 | "before_execute": "cp ../mortgage.csv ./ && rm -rf results && mkdir results", 15 | "after_execute": "cd .. && sh after_execute.sh", 16 | "visualizer": "visualizer_config.json" 17 | } 18 | # --- (END) 19 | 20 | suppressPackageStartupMessages(library(readr)) 21 | suppressPackageStartupMessages(library(tidyverse)) 22 | suppressPackageStartupMessages(library(broom.mixed)) 23 | source('../../../hurricane/boba_util.R') # FIXME 24 | 25 | # read data 26 | df <- read_csv('../mortgage.csv', 27 | col_types = cols(.default = col_double())) %>% 28 | mutate( 29 | accept_scaled = accept * 100 30 | ) %>% 31 | # here we drop all NAs for simplicity, but we will drop up to 7 more data 32 | # points in some models, which may cause discrepancies with Young et al. 
33 | drop_na() 34 | 35 | # linear regression 36 | model <- lm(accept_scaled ~ female {{black}} {{housing_expense_ratio}} 37 | {{self_employed}} {{married}} {{bad_history}} {{PI_ratio}} 38 | {{loan_to_value}} {{denied_PMI}}, data = df) 39 | 40 | # print summary to console 41 | smr <- summary(model) 42 | smr 43 | 44 | # cross validation 45 | fit <- cross_validation(df, model, 'accept_scaled') 46 | # normalize using max - min, because IQR is zero 47 | nrmse <- fit / (max(df$accept_scaled) - min(df$accept_scaled)) 48 | 49 | # wrangle results 50 | result <- tidy(model, conf.int = TRUE) %>% 51 | filter(term == 'female') %>% 52 | add_column( 53 | NRMSE = nrmse, 54 | R2_flipped = 1 - pmax(pmin(smr$adj.r.squared, 1), 0) 55 | ) 56 | 57 | # get predictions 58 | disagg_fit <- pointwise_predict(model, df) %>% 59 | select( 60 | observed = accept_scaled, 61 | expected = fit 62 | ) 63 | 64 | # get uncertainty in coefficient for female as draws from sampling distribution 65 | uncertainty <- sampling_distribution(model, 'female') %>% 66 | dplyr::select(estimate = coef) 67 | 68 | # output 69 | write_csv(result, '../results/estimate_{{_n}}.csv') 70 | write_csv(disagg_fit, '../results/disagg_fit_{{_n}}.csv') 71 | write_csv(uncertainty, '../results/uncertainty_{{_n}}.csv') 72 | -------------------------------------------------------------------------------- /boba/graphanalyzer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | class InvalidGraphError(NameError): 5 | pass 6 | 7 | 8 | class GraphAnalyzer: 9 | def __init__(self, nodes, edges): 10 | self.nodes = nodes 11 | self.edges = GraphAnalyzer._convert_edges(edges) 12 | self.paths = [] 13 | 14 | @staticmethod 15 | def _convert_edges(edges): 16 | d = {} 17 | for e in edges: 18 | if e.start in d: 19 | d[e.start].append(e.end) 20 | else: 21 | d[e.start] = [e.end] 22 | return d 23 | 24 | def _throw(self, msg): 25 | msg = 'In analyzing graph structure:\n\t' + msg 26 | raise InvalidGraphError(msg) 27 | 28 | def _all_ending_nodes(self): 29 | """ nodes that have at least one incoming edge """ 30 | flat = [item for lst in self.edges.values() for item in lst] 31 | return set(flat) 32 | 33 | def _get_source(self): 34 | """ nodes that have no incoming edges """ 35 | return self.nodes.difference(self._all_ending_nodes()) 36 | 37 | def _get_target(self): 38 | """ nodes that have no outgoing edges """ 39 | return self.nodes.difference(set(self.edges.keys())) 40 | 41 | def _all_paths_recur(self, a, b, visited, path): 42 | """ a recursive helper to collect all paths from a to b """ 43 | # mark the current node as visited and add to path 44 | visited.add(a) 45 | path.append(a) 46 | 47 | # if current node is the same as target, the path is done 48 | if a == b: 49 | self.paths.append(list(path))  # append a copy of the current path 50 | else: 51 | if a in self.edges: 52 | for n in self.edges[a]: 53 | if n not in visited: 54 | self._all_paths_recur(n, b, visited, path) 55 | 56 | # remove current node from path and mark it as unvisited 57 | path.pop() 58 | visited.discard(a) 59 | 60 | def _all_paths(self, s, t): 61 | """ get all paths from s to t """ 62 | visited = set() 63 | path = [] 64 | self._all_paths_recur(s, t, visited, path) 65 | 66 | def _construct_paths(self): 67 | ss = self._get_source() 68 | ts = self._get_target() 69 | 70 | if len(ss) == 0: 71 | self._throw('Cannot find any starting node') 72 | if len(ts) == 0: 73 | self._throw('Cannot find any ending node') 74 | 75 | for s in ss: 76 | for t in ts: 77 | self._all_paths(s, t) 78 
| 79 | def analyze(self): 80 | if len(self.nodes) == 0: 81 | return [] 82 | 83 | self._construct_paths() 84 | return self.paths 85 | -------------------------------------------------------------------------------- /boba/conditionparser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .baseparser import BaseParser, ParseError 3 | from dataclasses import dataclass 4 | from enum import Enum 5 | 6 | 7 | class TokenType(Enum): 8 | var = 1 9 | index_var = 2 10 | number = 3 11 | 12 | 13 | @dataclass 14 | class ParsedToken: 15 | value: str 16 | type: TokenType 17 | 18 | 19 | class ConditionParser(BaseParser): 20 | """ A class for parsing the condition string """ 21 | def __init__(self, line): 22 | super(ConditionParser, self).__init__(line) 23 | self.parsed_code = '' 24 | self.parsed_decs = [] 25 | 26 | def parse(self): 27 | while not self._is_end(): 28 | self._read_next() 29 | return self.parsed_code, self.parsed_decs 30 | 31 | @staticmethod 32 | def _is_keyword(w): 33 | return w == 'and' or w == 'or' 34 | 35 | @staticmethod 36 | def _is_operator(ch): 37 | return ch in ['=', '(', ')', '!', '>', '<'] 38 | 39 | def _throw(self, msg): 40 | msg = 'At character {} of "{}":\n\t{}'.format(self.i + 1, self.line, msg) 41 | raise ParseError(msg) 42 | 43 | def _maybe_read_index(self): 44 | # we only want to parse the LHS of == 45 | if len(self.parsed_decs) % 2 == 1: 46 | return False 47 | 48 | if not self._is_end() and self._peek_char() == '.': 49 | # try to parse .index 50 | self._next_char() 51 | v = self._read_while(self._is_id) 52 | if v == 'index': 53 | return True 54 | else: 55 | msg = 'Expected ".index", got ".{}"'.format(v) 56 | self._throw(msg) 57 | 58 | return False 59 | 60 | def _read_next(self): 61 | self.parsed_code += self._read_while(BaseParser._is_whitespace) 62 | if self._is_end(): 63 | return 64 | 65 | ch = self._peek_char() 66 | if self._is_id_start(ch): 67 | w = self._read_while(self._is_id) 68 | if ConditionParser._is_keyword(w): 69 | self.parsed_code += w 70 | return 71 | 72 | tk = ParsedToken(w, TokenType.var) 73 | if self._maybe_read_index(): 74 | tk.type = TokenType.index_var 75 | 76 | self.parsed_decs.append(tk) 77 | self.parsed_code += '{}' 78 | elif self._is_digit(ch): 79 | w = self._read_while(self._is_digit) 80 | if not self._is_end() and self._peek_char() == '.': # read decimal 81 | w += self._next_char() + self._read_while(self._is_digit) 82 | 83 | self.parsed_decs.append(ParsedToken(w, TokenType.number)) 84 | self.parsed_code += '{}' 85 | elif self._is_operator(ch): 86 | w = self._read_while(ConditionParser._is_operator) 87 | self.parsed_code += w 88 | else: 89 | msg = 'Cannot handle character "{}".'.format(ch) 90 | self._throw(msg) 91 | -------------------------------------------------------------------------------- /test/specs/continuous-err.json: -------------------------------------------------------------------------------- 1 | { 2 | "0" : { 3 | "decisions": [ 4 | {"var": "err", "options": [{"sample" : "uniform"}] , "desc" : "check 'count' omission"} 5 | ] 6 | }, 7 | "1" : { 8 | "decisions": [ 9 | {"var": "err", "options": [{"sample" : "lognormal"}] , "desc" : "check 'count' omission"} 10 | ] 11 | }, 12 | "2" : { 13 | "decisions": [ 14 | {"var": "err", "options": [{"sample" : "normal"}] , "desc" : "check 'count' omission"} 15 | ] 16 | }, 17 | "3" : { 18 | "decisions": [ 19 | {"var": "err", "options": [{"sample" : "uniform", "count" : 5}] , "desc" : "check required variable omission"} 20 | ] 21 | }, 
22 | "4" : { 23 | "decisions": [ 24 | {"var": "err", "options": [{"sample" : "uniform", "count" : 5, "min" : 0.0}] , "desc" : "check required variable omission"} 25 | ] 26 | }, 27 | "5" : { 28 | "decisions": [ 29 | {"var": "err", "options": [{"sample" : "uniform", "count" : 5, "max" : 0.0}] , "desc" : "check required variable omission"} 30 | ] 31 | }, 32 | "6" : { 33 | "decisions": [ 34 | {"var": "err", "options": [{"sample" : "uniform", "count" : 5, "min" : true, "max" : 5.0}] , "desc" : "check bad type for variables"} 35 | ] 36 | }, 37 | "7" : { 38 | "decisions": [ 39 | {"var": "err", "options": [{"sample" : "uniform", "count" : 5, "min" : 1.0, "max" : true}] , "desc" : "check bad type for variables"} 40 | ] 41 | }, 42 | "8" : { 43 | "decisions": [ 44 | {"var": "err", "options": [{"sample" : "lognormal", "count" : 5, "exclusive" : 1.0}] , "desc" : "check bad type for variables"} 45 | ] 46 | }, 47 | "9" : { 48 | "decisions": [ 49 | {"var": "err", "options": [{"sample" : "lognormal", "count" : 5, "mean" : "mean"}] , "desc" : "check bad type for variables"} 50 | ] 51 | }, 52 | "10" : { 53 | "decisions": [ 54 | {"var": "err", "options": [{"sample" : "normal", "count" : 5, "range" : "range"}] , "desc" : "check bad type for variables"} 55 | ] 56 | }, 57 | "11" : { 58 | "decisions": [ 59 | {"var": "err", "options": [{"sample" : "normal", "count" : 5, "range" : ["range", "range"]}] , "desc" : "check bad type for variables"} 60 | ] 61 | }, 62 | "12" : { 63 | "decisions": [ 64 | {"var": "err", "options": [{"sample" : "normal", "count" : 5, "range" : [0.0, 1.0, 2.0]}] , "desc" : "check bad type for variables"} 65 | ] 66 | }, 67 | "13" : { 68 | "decisions": [ 69 | {"var": "err", "options": [{"sample" : "normal", "count" : 5, "range" : [1.0, 0.0]}] , "desc" : "check bad type for variables"} 70 | ] 71 | }, 72 | "14" : { 73 | "decisions": [ 74 | {"var": "err", "options": [{"sample" : "normal", "count" : 5, "std_dev" : 1.0, "range" : [1.0, 0.0]}] , "desc" : "check bad type for variables"} 75 | ] 76 | } 77 | } -------------------------------------------------------------------------------- /tutorial/cli.rst: -------------------------------------------------------------------------------- 1 | === 2 | CLI 3 | === 4 | 5 | You might invoke the command line tool via:: 6 | boba [options] 7 | 8 | Available commands: 9 | - compile 10 | - run 11 | - merge 12 | 13 | General options: 14 | 15 | ``--version`` 16 | Show version and exit. 17 | 18 | Compile 19 | ======= 20 | The compile command parses the template script and the JSON spec to generate 21 | executable universe scripts. It has the following options: 22 | 23 | ``--script, -s`` 24 | **default: ./template.py** (optional) 25 | 26 | The path to your template script. 27 | 28 | ``--out`` 29 | **default: .** (optional) 30 | 31 | The output directory to hold generated universe scripts, summary table, etc. 32 | 33 | ``--lang`` 34 | (optional) 35 | 36 | Language of your analysis script. We support python and R, and require a 37 | configuration file for any other languages. 38 | If not specified, we will infer it from the file extension. 39 | 40 | ``--help`` 41 | Show help message and exit. 42 | 43 | Run 44 | === 45 | The run command executes the generated universe scripts. You could use it to 46 | run the entire multiverse, a single universe, or a subset of universes. To run 47 | all universes, use:: 48 | 49 | boba run --all 50 | 51 | To run a single universe, provide its identifying number as the argument. 
For 52 | example, if you want to run universe_1.py, use:: 53 | 54 | boba run 1 55 | 56 | To run a range of universes, for example universe_1 through universe_5, use:: 57 | 58 | boba run 1 --thru 5 59 | 60 | In addition, the run command accepts the following options: 61 | 62 | ``--dir`` 63 | **default: ./multiverse** (optional) 64 | 65 | Determines the path to the multiverse directory. It should point to a directory 66 | that contains the *summary.csv* file and the *code* subfolder. 67 | 68 | ``--jobs`` 69 | **default: 1** (optional) 70 | 71 | Determines the number of processes that can run at a time. If *jobs* is set 72 | to 0, it becomes the number of cores on the machine. 73 | 74 | ``--batch_size`` 75 | **default: see below** (optional) 76 | 77 | Determines the number of universes that will be run in sequence in each 78 | process. Let :math:`N` denote the number of universes; the default is 79 | :math:`\sqrt{N}` or :math:`N/\mathit{jobs} + 1`, whichever is smaller. For example, with :math:`N = 100` and 4 jobs, the default batch size is :math:`\min(10, 26) = 10`. 80 | 81 | Merge 82 | ===== 83 | The merge command combines CSV outputs from individual universes into one file. 84 | This command works well if you used the built-in ``{{_n}}`` variable to output 85 | a separate CSV per universe. 86 | 87 | The command has a required argument: the filename pattern of individual outputs 88 | where the universe id is replaced by ``{}``. For example, if your output 89 | files are output_1.csv, output_2.csv, and so on, your pattern 90 | should be ``output_{}.csv``. 91 | 92 | In addition, the command has the following options: 93 | 94 | ``--base, -b`` 95 | **default: ./multiverse/results** (optional) 96 | 97 | Path to the directory containing the universe outputs. 98 | 99 | ``--out`` 100 | **default: ./multiverse/merged.csv** (optional) 101 | 102 | Path to the merged file that will be created by this command. 103 | 104 | ``--delimiter`` 105 | **default: ,** (optional) 106 | 107 | CSV delimiter. 
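As an end-to-end illustration (filenames here are hypothetical): if each universe wrote output_1.csv, output_2.csv, and so on into the default results folder, the merged table could be produced with::

    boba merge output_{}.csv --out ./multiverse/merged.csv

The merged file gains a leading *uid* column recording which universe each row came from.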
108 | -------------------------------------------------------------------------------- /test/specs/continuous.json: -------------------------------------------------------------------------------- 1 | { 2 | "decisions": [ 3 | {"var": "A", "options": [{"sample" : "uniform", "count" : 10, "seed" : 0, "min" : 0.0, "max" : 5.0}] , 4 | "desc" : "uniform continuous variable expansion"}, 5 | 6 | {"var": "B", "options": [{"sample" : "lognormal", "count" : 10, "seed" : 0, "mean" : 0.0, "std_dev" : 5.0}] , 7 | "desc" : "lognormal continuous variable expansion"}, 8 | 9 | {"var": "C", "options": [{"sample" : "normal", "count" : 10, "seed" : 0, "mean" : 0.0, "std_dev" : 5.0}] , 10 | "desc" : "normal continuous variable expansion"}, 11 | 12 | {"var": "D", "options": [{"sample" : "uniform", "count" : 10, "seed" : 0, "min" : 0.0, "max" : 5.0}, 17.0] , 13 | "desc" : "uniform continuous variable expansion with additional constants"}, 14 | 15 | {"var": "E", "options": [{"sample" : "lognormal", "count" : 10, "seed" : 0, "mean" : 0.0, "std_dev" : 5.0}, 0.0, 1.0, 2.0] , 16 | "desc" : "lognormal continuous variable expansion with additional constants"}, 17 | 18 | {"var": "F", "options": [{"sample" : "normal", "count" : 10, "seed" : 0, "mean" : 0.0, "std_dev" : 5.0}, 0.0, 1.0, 2.0, 3.0, 4.0] , 19 | "desc" : "normal continuous variable expansion with additional constants"}, 20 | 21 | {"var": "G", "options": [{"sample" : "uniform", "count" : 3, "seed" : 0, "min" : 0.0, "max" : 5.0}, 22 | {"sample" : "lognormal", "count" : 3, "seed" : 0, "mean" : 0.0, "std_dev" : 5.0}, 23 | {"sample" : "normal", "count" : 3, "seed" : 0, "mean" : 0.0, "std_dev" : 5.0}] , 24 | "desc" : "multiple continuous variable expansions"}, 25 | 26 | {"var": "H", "options": [{"sample" : "uniform", "count" : 4, "seed" : 0, "min": 0.0, "max": 5.0}, 27 | {"sample" : "uniform", "count" : 4, "seed" : 1, "min": 10.0, "max": 15.0}] , 28 | "desc" : "multiple continuous variable expansions"}, 29 | 30 | {"var": "I", "options": [{"sample" : "uniform", "count" : 3, "seed" : 0, "min": 0.0, "max": 5.0}, -1.1, 31 | {"sample" : "uniform", "count" : 3, "seed" : 1, "min": 10.0, "max": 15.0}, 32 | 0.0, 1.0, 2.0, 3.1415] , 33 | "desc" : "multiple continuous variable expansions with additional constants"}, 34 | 35 | {"var": "J", "options": [{"sample" : "normal", "count" : 5, "seed" : 0, "mean" : 0.0, "std_dev" : 5.0, "range" : [0.0, 2.5]}, 36 | {"sample" : "lognormal", "count" : 5, "seed" : 0, "mean" : 0.0, "std_dev" : 5.0, "range" : [0.0, 2.5]}] , 37 | "desc" : "check range"}, 38 | 39 | {"var": "K", "options": [{"sample" : "lognormal", "count" : 5, "seed" : 0}, 40 | {"sample" : "lognormal", "count" : 5, "seed" : 0, "mean" : 0.0, "std_dev" : 1.0}] , 41 | "desc" : "check unrequired variable omission"}, 42 | 43 | {"var": "L", "options": [{"sample" : "normal", "count" : 5, "seed" : 0}, 44 | {"sample" : "normal", "count" : 5, "seed" : 0, "mean" : 0.0, "std_dev" : 1.0}] , 45 | "desc" : "check unrequired variable omission"} 46 | ] 47 | } 48 | -------------------------------------------------------------------------------- /boba/graphparser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from dataclasses import dataclass 4 | from .baseparser import BaseParser, Token, ParseError 5 | 6 | 7 | @dataclass(frozen=True) 8 | class Edge: 9 | start: str 10 | end: str 11 | 12 | 13 | class GraphParser(BaseParser): 14 | def __init__(self, graph_spec): 15 | line = '\n'.join(graph_spec) 16 | super(GraphParser, 
self).__init__(line) 17 | 18 | self.spec = graph_spec 19 | self.nodes = set() 20 | self.edges = set() 21 | 22 | def _prep_err(self, msg): 23 | return 'At character {} of "{}":\n\t{}'.format(self.col+1, self.spec[self.row], msg) 24 | 25 | def _read_next(self): 26 | self._read_while(GraphParser._is_whitespace) 27 | if self._is_end(): 28 | return Token('eof', '') 29 | 30 | ch = self._peek_char() 31 | if ch == '-': 32 | return self._read_edge() 33 | elif GraphParser._is_id_start(ch): 34 | return self._read_node() 35 | else: 36 | raise ParseError(self._prep_err('Cannot handle character "{}"'.format(ch))) 37 | 38 | def _read_edge(self): 39 | val = self._next_char() 40 | ch = self._peek_char() 41 | if ch != '>': 42 | raise ParseError(self._prep_err('Expected "->", got "-{}"'.format(ch))) 43 | val += self._next_char() 44 | return Token('edge', val) 45 | 46 | def _read_node(self): 47 | nd = self._read_while(GraphParser._is_id) 48 | return Token('node', nd) 49 | 50 | def parse(self): 51 | prev_node = None 52 | 53 | while True: 54 | tk = self._next() 55 | 56 | if tk.type == 'node': 57 | self.nodes.add(tk.value) 58 | prev_node = tk.value 59 | if tk.type == 'edge': 60 | if not prev_node: 61 | raise ParseError(self._prep_err('Cannot find a source node')) 62 | nx = self._peek() 63 | if nx.type != 'node': 64 | raise ParseError(self._prep_err('Cannot find a target node')) 65 | self.edges.add(Edge(prev_node, nx.value)) 66 | if tk.type == 'eof': 67 | break 68 | 69 | return self.nodes, self.edges 70 | 71 | def replace_graph(self, decs): 72 | """ Replace the block-level decision nodes in the graph with option nodes.""" 73 | # replace nodes 74 | nds = [] 75 | for nd in self.nodes: 76 | tmp = decs[nd] if nd in decs else [nd] 77 | nds.extend(tmp) 78 | 79 | # replace edges 80 | egs = [] 81 | for eg in self.edges: 82 | ss = decs[eg.start] if eg.start in decs else [eg.start] 83 | es = decs[eg.end] if eg.end in decs else [eg.end] 84 | egs.extend([Edge(s, e) for s in ss for e in es]) 85 | 86 | self.nodes = set(nds) 87 | self.edges = set(egs) 88 | return self.nodes, self.edges 89 | 90 | def create_default_graph(self, nodes): 91 | """ 92 | Create the default graph, which is a linear flow of blocks, with the 93 | same order as they appear in the template script. 94 | :param nodes: A list of unique blocks. 
95 | :return: nodes and edges 96 | """ 97 | self.nodes = set(nodes) 98 | self.edges = set() 99 | for i in range(len(nodes) - 1): 100 | self.edges.add(Edge(nodes[i], nodes[i + 1])) 101 | return self.nodes, self.edges 102 | -------------------------------------------------------------------------------- /example/fertility_r/template.R: -------------------------------------------------------------------------------- 1 | # --- (BOBA_CONFIG) 2 | { 3 | "decisions": [ 4 | {"var": "fertility_bounds", "options": [ 5 | "c(7, 14, 17, 25, 17, 25)", 6 | "c(6, 14, 17, 27, 17, 27)", 7 | "c(9, 17, 18, 25, 18, 25)", 8 | "c(8, 14, 1, 7, 15, 28)", 9 | "c(9, 17, 1, 8, 18, 28)" 10 | ]}, 11 | {"var": "relationship_bounds", "options": [ 12 | "c(2, 3)", "c(1, 2)", "c(1, 3)" 13 | ]} 14 | ], 15 | "outputs": [ 16 | {"name": "p-value", "value": "summar$coefficients[4, 4]"} 17 | ], 18 | "before_execute": "cp ../durante_etal_2013_study1.txt ./code/" 19 | } 20 | # --- (END) 21 | 22 | #read in raw data from Study 1 23 | df <- read.csv2("durante_etal_2013_study1.txt", sep = "") 24 | 25 | # create religiosity score 26 | df$RelComp <- round(rowMeans(cbind(df$Rel1, df$Rel2, df$Rel3), na.rm = TRUE), digits = 2) 27 | 28 | # next menstrual onset (nmo) assessment 29 | Sys.setenv(TZ="Europe/Berlin") # suppress time zone warning 30 | df$DateTesting <- as.Date(df$DateTesting, format = "%m/%d/%y") 31 | df$StartDateofLastPeriod <- as.Date(df$StartDateofLastPeriod, format = "%m/%d/%y") 32 | df$StartDateofPeriodBeforeLast <- as.Date(df$StartDateofPeriodBeforeLast, 33 | format = "%m/%d/%y") 34 | df$ComputedCycleLength <- df$StartDateofLastPeriod - df$StartDateofPeriodBeforeLast 35 | 36 | # --- (NMO) computed 37 | # first nmo option: based on computed cycle length 38 | df$NextMenstrualOnset <- df$StartDateofLastPeriod + df$ComputedCycleLength 39 | 40 | # --- (NMO) reported 41 | # second nmo option: based on reported cycle length 42 | df$NextMenstrualOnset <- df$StartDateofLastPeriod + df$ReportedCycleLength 43 | 44 | # # --- (NMO) estimate 45 | # # third nmo option: based on reported estimate of next menstrual onset 46 | # # note: this is not available in study one 47 | # df$NextMenstrualOnset <- df$StartDateNext 48 | 49 | # --- (ECL) computed @if NMO != reported 50 | # exclusion based on computed cycle length 51 | df <- df[!(df$ComputedCycleLength < 25 | df$ComputedCycleLength > 35), ] 52 | 53 | # --- (ECL) reported @if NMO != computed 54 | # exclusion based on reported cycle length 55 | df <- df[!(df$ReportedCycleLength < 25 | df$ReportedCycleLength > 35), ] 56 | 57 | # --- (ECL) none 58 | # include all cycle lengths 59 | 60 | # --- (A) 61 | # compute cycle day 62 | df$DaysBeforeNextOnset <- df$NextMenstrualOnset - df$DateTesting 63 | df$CycleDay <- 28 - df$DaysBeforeNextOnset 64 | df$CycleDay <- ifelse(df$CycleDay <1, 1, df$CycleDay) 65 | df$CycleDay <- ifelse(df$CycleDay > 28, 28, df$CycleDay) 66 | 67 | # fertility assessment 68 | bounds = {{fertility_bounds}} 69 | df$Fertility <- rep(NA, dim(df)[1]) # create fertility variable 70 | df$Fertility[df$CycleDay >= bounds[1] & df$CycleDay <= bounds[2]] <- "High" 71 | df$Fertility[df$CycleDay >= bounds[3] & df$CycleDay <= bounds[4]] <- "Low" 72 | df$Fertility[df$CycleDay >= bounds[5] & df$CycleDay <= bounds[6]] <- "Low" 73 | 74 | # relationship status assessment 75 | rel.bounds = {{relationship_bounds}} 76 | df$RelationshipStatus[df$Relationship <= rel.bounds[1]] <- "Single" 77 | df$RelationshipStatus[df$Relationship >= rel.bounds[2]] <- "Relationship" 78 | 79 | # --- (EC) certainty 80 | # 
exclusion based on certainty ratings 81 | df <- df[!(df$Sure1 < 6 | df$Sure2 < 6), ] 82 | 83 | # --- (EC) none 84 | # include all certainty ratings 85 | 86 | # --- (B) 87 | # perform an ANOVA on the processed data set 88 | df$Fertility <- factor(df$Fertility) 89 | df$RelationshipStatus <- factor(df$RelationshipStatus) 90 | an = lm("RelComp~Fertility*RelationshipStatus", df) 91 | summar <- summary(an) 92 | # the p-value of the fertility x relationship interaction 93 | summar$coefficients[4, 4] 94 | -------------------------------------------------------------------------------- /example/reading/r/template.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # --- (BOBA_CONFIG) 3 | { 4 | "graph": [ 5 | "RC->LM1->O1", 6 | "RC->LM2->O2", 7 | "OLR1->O1", 8 | "OLR2->O2" 9 | ], 10 | "decisions": [ 11 | {"var": "brmsfamily", "options": ["shifted_lognormal", "lognormal"]} 12 | ], 13 | "outputs": [ 14 | {"name": "aic/waic", "value": "aic"} 15 | ], 16 | "before_execute": "cp ../../data.csv ./code/ && mkdir results" 17 | } 18 | # --- (END) 19 | 20 | library(readr) 21 | library(lmerTest) 22 | library(car) 23 | library(psych) 24 | library(scales) 25 | library(brms) 26 | library(ordinal) 27 | 28 | speed_data <- read_csv('data.csv') 29 | 30 | # calculate reading speed in WPM 31 | speed_data$speed <- speed_data$num_words/(speed_data$adjust_rt/60000) 32 | 33 | # remove retake participants 34 | speed_data <- subset(speed_data, retake != 1) 35 | 36 | # remove outliers 37 | iqr = IQR(speed_data[speed_data$dyslexia_bin == 0,]$speed,na.rm=TRUE) 38 | cutoff_high = median(speed_data$speed) +3*iqr #3*iqr=645, cutoff_high = 928 39 | 40 | # remove trials based on speed 41 | result_analysis <- speed_data[! speed_data$speed > cutoff_high, ] 42 | result_analysis <- result_analysis[ ! result_analysis$speed < 10,] 43 | 44 | # remove smartphone users 45 | # removed 64 smartphone users, 363 trials 46 | result_analysis <- result_analysis[! result_analysis$device == 'smartphone',] 47 | 48 | # wrangle variables 49 | result_analysis$log_speed <- log(result_analysis$speed) 50 | result_analysis$dyslexia = as.factor(result_analysis$dyslexia) 51 | result_analysis$correct_num = round(result_analysis$correct_rate * 3, 0) 52 | result_analysis$acc = result_analysis$correct_num + 1 53 | result_analysis$correct_num = as.factor(result_analysis$correct_num) 54 | 55 | # --- (RC) 56 | # remove trials based on comprehension < 2/3 57 | # removed 111 trials 58 | result_analysis <- result_analysis[ ! 
result_analysis$correct_rate < .6,] 59 | 60 | # --- (LM1) 61 | # fit linear mixed model 62 | model <- lmer(log_speed ~ page_condition*dyslexia + img_width + num_words + age + english_native + (1 | uuid), 63 | data = result_analysis) 64 | print.odds = FALSE 65 | 66 | # --- (OLR1) 67 | # fit ordinal logistic regression using accuracy as DV 68 | model <- clmm(correct_num ~ page_condition*dyslexia + num_words + age + english_native + (1 | uuid), 69 | data=result_analysis) 70 | print.odds = TRUE 71 | 72 | # --- (LM2) 73 | # fit Bayesian model 74 | model <- brm(speed ~ page_condition*dyslexia + img_width + num_words + age + english_native + (1 | uuid), 75 | data = result_analysis, family = {{brmsfamily}}(), file = '../results/brmsfit_{{_n}}', 76 | save_all_pars = TRUE, silent = TRUE, refresh = 0, seed = 0, 77 | chains = 4, cores = 4, iter = 1000) 78 | 79 | # --- (OLR2) 80 | # fit Bayesian model to accuracy 81 | model <- brm(acc ~ page_condition*dyslexia + num_words + age + english_native + (1 | uuid), 82 | data = result_analysis, family = cumulative(), file = '../results/brmsfit_{{_n}}', 83 | save_all_pars = TRUE, silent = TRUE, refresh = 0, seed = 0, 84 | chains = 4, cores = 4, iter = 1000) 85 | 86 | # --- (O1) 87 | aic = AIC(model) 88 | sink('../results/summary_{{_n}}.txt') 89 | summary(model) 90 | 91 | if(print.odds){ 92 | print("Odds ratio:") 93 | exp(coef(model)) 94 | } 95 | 96 | # --- (O2) 97 | # evaluate fit 98 | aic = waic(model)$waic 99 | 100 | # output results 101 | sink('../results/summary_{{_n}}.txt') 102 | summary(model) 103 | sink() 104 | pdf(file="../results/plots_{{_n}}.pdf") 105 | plot(model) 106 | marginal_effects(model) 107 | dev.off() -------------------------------------------------------------------------------- /example/reading/python/template.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import statsmodels.formula.api as smf 6 | 7 | # --- (BOBA_CONFIG) 8 | { 9 | "graph": [ 10 | "B1->C", 11 | "B2->C", 12 | "C->D1->F1", 13 | "C->D2->F1", 14 | "C->F2" 15 | ], 16 | "decisions": [ 17 | {"var": "sample_size", "options": [1284, 2568]}, 18 | {"var": "rt", "options": ["adjust_rt", "rt"] }, 19 | {"var": "bad_device", "options": [[], ["smartphone"], ["smartphone", "tablet"]]}, 20 | {"var": "dyslexia", "options": ["dyslexia", "dyslexia_bin"] }, 21 | {"var": "min_wpm", "options": [10, 150] }, 22 | {"var": "formula", "options": [ 23 | "log_speed ~ page_condition*dyslexia", 24 | "log_speed ~ page_condition*dyslexia + img_width + num_words + age + english_native", 25 | "log_speed ~ page_condition*dyslexia + img_width + num_words + age + english_native + device + edu_level" 26 | ]} 27 | ], 28 | "before_execute": "cp ../../data.csv ./code/" 29 | } 30 | # --- (END) 31 | 32 | if __name__ == '__main__': 33 | # read data 34 | df = pd.read_csv('./data.csv') 35 | 36 | # take the first N participants to simulate stopping condition 37 | df = df[:{{sample_size}}] 38 | 39 | # calculate reading speed in WPM 40 | df['speed'] = df.apply(lambda row: row.num_words / row['{{rt}}'] * 60000, 41 | axis=1) 42 | 43 | # convert education level into an ordinal variable 44 | edu_order = ['pre-high school', 'high school', 'professional school', 45 | 'college', 'graduate school', 'PhD', 'postdoctoral'] 46 | tp = pd.CategoricalDtype(categories=edu_order, ordered=True) 47 | df['edu_level'] = df.education.astype(tp).cat.codes 48 | 49 | # remove retake participants 50 | df = df[df.retake != 1] 51 | 52 | # 
remove users of excluded devices 53 | df = df[~df.device.isin({{bad_device}})] 54 | 55 | # remove outliers based on reading speed 56 | # --- (B1) 57 | # remove reading speed outside median + 3 x iqr 58 | iqr = np.subtract(*np.percentile(df.speed, [75, 25])) 59 | cutoff_high = np.median(df.speed) + 3 * iqr 60 | 61 | # --- (B2) 62 | # remove reading speed outside mean + 2 x std 63 | cutoff_high = np.mean(df.speed) + 2 * np.std(df.speed) 64 | 65 | # --- (C) 66 | cutoff_low = {{min_wpm}} 67 | df = df[df.speed <= cutoff_high] 68 | df = df[df.speed >= cutoff_low] 69 | 70 | # drop NA rows 71 | df = df.dropna() 72 | 73 | # log-normalize speed 74 | df['log_speed'] = np.log(df.speed) 75 | 76 | # decision: whether to bin dyslexia or not 77 | df.dyslexia = df['{{dyslexia}}'] 78 | 79 | # make dyslexia a categorical variable 80 | df.dyslexia = df.dyslexia.astype('category') 81 | 82 | # remove trials based on comprehension < 2/3 83 | # --- (D1) 84 | # just remove trials 85 | df = df[df.correct_rate > 0.6] 86 | 87 | # --- (D2) 88 | # drop entire participants 89 | bad_uuid = set() 90 | for i, row in df.iterrows(): 91 | if row.correct_rate < 0.6: 92 | bad_uuid.add(str(row.uuid)) 93 | df = df[~df.uuid.isin(bad_uuid)] 94 | 95 | # --- (F1) 96 | # fit a linear mixed effects model 97 | fml = '{{formula}}' 98 | model = smf.mixedlm(fml, df, groups=df.uuid).fit() 99 | print(model.summary()) 100 | 101 | # --- (F2) 102 | # fit a multinomial logit model to accuracy 103 | df['acc'] = 3 - pd.Categorical(df.correct_rate).codes 104 | fml = 'acc ~ page_condition*dyslexia_bin' 105 | model = smf.mnlogit(fml, df).fit() 106 | print(model.summary()) 107 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ==== 2 | boba 3 | ==== 4 | 5 | Author and visualize multiverse analysis with ease. 6 | 7 | Boba has a domain-specific language (Boba DSL) for writing multiverse specifications, 8 | and a visual analysis interface (`Boba Visualizer`_) for exploring multiverse outcomes. 9 | Boba comes with a command line tool to parse your DSL specification and generate 10 | universe scripts, execute all scripts with a single command, 11 | merge outputs into a table, and invoke the visualizer. 12 | 13 | - works with both python and R, with more scripting languages to come 14 | - handles simple parameter substitution as well as complex code flow dependency 15 | - offers interactive visualizations for exploring consequential decisions, uncertainty, model fit, and more 16 | 17 | .. _Boba Visualizer: https://github.com/uwdata/boba-visualizer 18 | .. image:: https://badge.fury.io/py/boba.svg 19 | :target: https://badge.fury.io/py/boba 20 | .. image:: https://travis-ci.org/uwdata/boba.svg?branch=master 21 | :target: https://travis-ci.org/uwdata/boba 22 | .. image:: https://img.shields.io/badge/License-BSD%203--Clause-blue.svg 23 | :target: https://opensource.org/licenses/BSD-3-Clause 24 | .. image:: https://img.shields.io/pypi/pyversions/boba 25 | 26 | Installation 27 | ============ 28 | 29 | You can download and install the latest version of this software from the 30 | Python package index (PyPI):: 31 | 32 | pip install --upgrade boba 33 | pip install --upgrade boba-visualizer 34 | 35 | Usage 36 | ===== 37 | 38 | To author your multiverse, please refer to the specification rules_. 39 | Here is a `simple example`_ to get you started! 
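As a quick taste of the syntax, a minimal template declares decisions in a JSON block and references them with ``{{...}}`` placeholders::

    # --- (BOBA_CONFIG)
    {"decisions": [{"var": "cutoff", "options": [2, 2.5, 3]}]}
    # --- (END)
    df = df[df.y < {{cutoff}}]

Compiling this template yields one universe script per option, with ``{{cutoff}}`` replaced by 2, 2.5, and 3 in turn. The names here (``cutoff``, ``df.y``) are illustrative only; see the linked examples for real specifications.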
40 | 41 | 42 | To parse your specification and generate actual scripts, invoke boba and pass 43 | in the path to your template script and your JSON spec:: 44 | 45 | boba compile --script template.py 46 | 47 | To execute the multiverse, namely running all the generated scripts, use:: 48 | 49 | boba run --all 50 | 51 | To start the Boba Visualizer after getting the intermediate output files, use:: 52 | 53 | boba-server 54 | 55 | For more command line options, see `CLI`_. 56 | For more information about the Boba Visualizer, see this project_. 57 | 58 | .. _rules: https://github.com/uwdata/boba/blob/master/tutorial/rules.md 59 | .. _simple example: https://github.com/uwdata/boba/blob/master/tutorial/simple.md 60 | .. _more complex example: https://github.com/uwdata/boba/blob/master/tutorial/fertility.md 61 | .. _CLI: https://github.com/uwdata/boba/blob/master/tutorial/cli.rst 62 | .. _project: https://github.com/uwdata/boba-visualizer 63 | 64 | Examples 65 | ======== 66 | 67 | - A `simple example`_ to walk you through the basics 68 | - A `more complex example`_ using `Steegen's multiverse analysis`_ and `Durante's fertility dataset`_. 69 | - Another multiverse example_, based on the `specification curve paper`_ by Simonsohn et al. 70 | 71 | .. _reading speed dataset: https://github.com/QishengLi/CHI2019_Reader_View 72 | .. _analysis: https://github.com/uwdata/boba/tree/master/example/reading 73 | .. _example: https://github.com/uwdata/boba/tree/master/example/hurricane 74 | .. _specification curve paper: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2694998 75 | .. _Steegen's multiverse analysis: https://journals.sagepub.com/doi/pdf/10.1177/1745691616658637 76 | .. _Durante's fertility dataset: https://osf.io/zj68b/ 77 | 78 | Citation 79 | ======== 80 | 81 | If you are interested in this work, please see our research paper_ and consider citing our work:: 82 | 83 | @misc{liu2020boba, 84 | title={Boba: Authoring and visualizing multiverse analyses}, 85 | author={Yang Liu and Alex Kale and Tim Althoff and Jeffrey Heer}, 86 | year={2020}, 87 | eprint={2007.05551}, 88 | archivePrefix={arXiv}, 89 | primaryClass={cs.HC} 90 | } 91 | 92 | .. 
_paper: https://arxiv.org/abs/2007.05551 -------------------------------------------------------------------------------- /example/fertility/template.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import statsmodels.api as sm 6 | import statsmodels.formula.api as smf 7 | # --- (BOBA_CONFIG) 8 | { 9 | "graph": [ 10 | "NMO1->ECL1->A", 11 | "NMO2->ECL2->A", 12 | "NMO1->A", 13 | "NMO2->A", 14 | "A->B", 15 | "A->EC->B" 16 | ], 17 | "decisions": [ 18 | {"var": "fertility_bounds", "options": [ 19 | [[7, 14], [17, 25], [17, 25]], 20 | [[6, 14], [17, 27], [17, 27]], 21 | [[9, 17], [18, 25], [18, 25]], 22 | [[8, 14], [1, 7], [15, 28]], 23 | [[9, 17], [1, 8], [18, 28]] 24 | ]}, 25 | {"var": "relationship_bounds", 26 | "options": [[2, 3], [1, 2], [1, 3]]} 27 | ], 28 | "before_execute": "cp ../durante_etal_2013_study1.txt ./code/" 29 | } 30 | # --- (END) 31 | 32 | if __name__ == '__main__': 33 | # read data file 34 | df = pd.read_csv('durante_etal_2013_study1.txt', delimiter='\t') 35 | 36 | # remove NA 37 | df = df.dropna(subset=['rel1', 'rel2', 'rel3']) 38 | 39 | # create religiosity score 40 | df['rel_comp'] = np.around((df.rel1 + df.rel2 + df.rel3) / 3, decimals=2) 41 | 42 | # next menstrual onset (nmo) assessment 43 | df.last_period_start = pd.to_datetime(df.last_period_start) 44 | df.period_before_last_start = pd.to_datetime(df.period_before_last_start) 45 | df.date_testing = pd.to_datetime(df.date_testing) 46 | 47 | # --- (NMO1) 48 | # first nmo option: based on computed cycle length 49 | cl = df.last_period_start - df.period_before_last_start 50 | next_onset = df.last_period_start + cl 51 | df['computed_cycle_length'] = (cl / np.timedelta64(1, 'D')).astype(int) 52 | 53 | # --- (NMO2) 54 | # second nmo option: based on reported cycle length 55 | df = df.dropna(subset=['reported_cycle_length']) 56 | next_onset = df.last_period_start + df.reported_cycle_length.apply( 57 | lambda a: pd.Timedelta(days=a)) 58 | 59 | # --- (ECL1) 60 | # exclusion based on computed cycle length 61 | df = df[(df.computed_cycle_length >= 25) & (df.computed_cycle_length <= 35)] 62 | 63 | # --- (ECL2) 64 | # exclusion based on reported cycle length 65 | df = df[(df.reported_cycle_length >= 25) & (df.reported_cycle_length <= 35)] 66 | 67 | # --- (A) 68 | # compute cycle day 69 | df['cycle_day'] = pd.Timedelta('28 days') - (next_onset - df.date_testing) 70 | df.cycle_day = (df.cycle_day / np.timedelta64(1, 'D')).astype(int) 71 | df.cycle_day = np.clip(df.cycle_day, 1, 28) 72 | 73 | # fertility assessment 74 | high_bounds = {{fertility_bounds}}[0] 75 | low_bounds1 = {{fertility_bounds}}[1] 76 | low_bounds2 = {{fertility_bounds}}[2] 77 | df.loc[(high_bounds[0] <= df.cycle_day) & (df.cycle_day <= high_bounds[1]), 78 | 'fertility'] = 'High' 79 | df.loc[(low_bounds1[0] <= df.cycle_day) & (df.cycle_day <= low_bounds1[1]), 80 | 'fertility'] = 'Low' 81 | df.loc[(low_bounds2[0] <= df.cycle_day) & (df.cycle_day <= low_bounds2[1]), 82 | 'fertility'] = 'Low' 83 | 84 | # relationship status assessment 85 | # single = response options 1 and 2; relationship = response options 3 and 4 86 | df.loc[df.relationship <= {{relationship_bounds}}[0], 87 | 'relationship_status'] = 'Single' 88 | df.loc[df.relationship >= {{relationship_bounds}}[1], 89 | 'relationship_status'] = 'Relationship' 90 | 91 | # --- (EC) 92 | # exclusion based on certainty ratings 93 | df = df[(df.sure1 >= 6) & (df.sure2 >= 6)] 94 | 95 | # --- (B) 96 | # 
perform an ANOVA on the processed data set 97 | lm = smf.ols('rel_comp ~ relationship_status * fertility', data=df).fit() 98 | table = sm.stats.anova_lm(lm, typ=2) 99 | print(table) 100 | -------------------------------------------------------------------------------- /example/simple/data.csv: -------------------------------------------------------------------------------- 1 | x,y 2 | 4.583918298882086,13.079990304560285 3 | 0.30056854367327024,9.330556404593688 4 | 1.2698385703363135,13.470556335806807 5 | 2.4609400820343947,8.939990373347166 6 | 1.8866002121101615,13.861122367053326 7 | 0.32484042349493036,8.549424342100647 8 | 2.659870058057336,11.153192706623981 9 | 3.507132111497367,12.053098856383919 10 | 1.4381551638392343,10.828392583697664 11 | 3.931908974207081,11.808371158699313 12 | 1.3697559846527085,10.269450827872694 13 | 0.40406774336411655,10.242678602433266 14 | 1.3121428182320516,10.920125678505766 15 | 3.2788086733310413,11.493353333480904 16 | 2.935555345300246,11.34157864056587 17 | 0.17777224273899328,10.235834507052907 18 | 4.364812565880889,12.105686642360926 19 | 4.821388372116607,12.400326143190883 20 | 2.6717615513891824,11.144537234275228 21 | 0.4348374823413731,9.98469118257374 22 | 3.0964700345313814,11.21974014049093 23 | 1.7839201289999962,10.933488206767475 24 | 4.266818769719739,12.407242753086068 25 | 0.22052654472462352,10.59736596480809 26 | 0.6563787765526785,10.545771192487592 27 | 3.416610857323886,11.816243419817742 28 | 0.0802800394904396,9.826632245420347 29 | 3.661445614546677,11.638833441675093 30 | 1.684436444392035,10.927248777686911 31 | 2.007584105581906,10.864779349017715 32 | 3.0486016633164277,11.1478208385455 33 | 2.947939514747561,11.22047543782475 34 | 4.506414990794167,12.225569684786457 35 | 1.244018216385217,10.252740673398135 36 | 3.289282623798896,12.001629281830962 37 | 0.05067012803057991,10.013217827293415 38 | 1.6263180752938915,10.8378561530127 39 | 3.0231972920148027,11.76765653315177 40 | 2.829254821027227,11.420580285395145 41 | 3.7576233536426518,11.936002317485991 42 | 3.89420722703682,11.905725812894074 43 | 3.438191772991841,11.602232855572659 44 | 3.0895148411312547,11.661065677835904 45 | 3.7153200790225007,11.559953421078564 46 | 2.5880794649558654,11.39308245523173 47 | 0.9075578496290465,10.501341853452686 48 | 1.2871348832314604,10.612709938566729 49 | 2.150181020660253,11.08618852377759 50 | 4.227392912911492,12.160532976236091 51 | 4.347151369108863,12.10185461276485 52 | 1.2900203678061488,10.495187063648077 53 | 3.0507113334582687,11.70386024252094 54 | 4.472572013339909,12.442609294633272 55 | 0.27507220431418455,9.92961086741772 56 | 4.95255795141159,12.172037894638441 57 | 2.187690851944044,10.879864925834301 58 | 3.3030659113066942,11.557920082823808 59 | 4.453069421373162,12.193117922890561 60 | 2.561837519566263,11.536414504912928 61 | 0.8310782502405201,10.427354286565777 62 | 2.319393910459352,10.943113864300257 63 | 1.8495795260884962,11.044041724265039 64 | 4.369809683527437,12.316214488898776 65 | 4.7286559063267255,12.732221564219854 66 | 2.132724499371575,11.129810065327995 67 | 0.6316333581952194,10.013186502776936 68 | 1.7825762599908446,10.905235687487785 69 | 0.09938029432887963,10.017546152657244 70 | 1.9839363708410773,10.950687015270212 71 | 2.147324686723473,11.418973562688278 72 | 3.8727660667095263,11.754094323525093 73 | 4.6491124436662385,12.293169674588764 74 | 3.9636955957971582,11.731018926883527 75 | 1.2831687251881223,10.775362213956697 76 | 3.027460297930971,11.783257068467368 
77 | 2.4194658972220817,11.338930051249813 78 | 1.933243097657601,10.646858847439713 79 | 4.6500127533476086,12.519913675054616 80 | 0.9279426196129131,10.061009331065804 81 | 0.7088392759937251,10.211592165570067 82 | 1.6972816274117375,10.609295205879185 83 | 0.7291005720188815,10.146174335915244 84 | 1.5135446237931238,10.827942182777722 85 | 0.7863892515036303,10.440251364931768 86 | 0.7369591945983167,10.126055811688161 87 | 0.4484281422642783,10.238798585570631 88 | 0.6476846910638945,10.670529311857813 89 | 4.17949792663603,11.97731742079653 90 | 0.9426370844467991,10.284665332731757 91 | 0.3850137632566658,10.28472156012984 92 | 4.264099492071075,12.119492507525234 93 | 3.1938080241767333,11.728495313517408 94 | 3.0934454156259017,11.501534794005355 95 | 4.556323953281537,12.4798273798697 96 | 2.5318064635878756,11.483549596279689 97 | 4.510113468263804,12.130591650717289 98 | 2.667606009197513,11.20696909368087 99 | 4.197221426103427,12.406283640395031 100 | 0.6870158534517201,10.01897145287496 101 | 3.541509668147845,12.09678789495433 102 | -------------------------------------------------------------------------------- /example/simple_cont/data.csv: -------------------------------------------------------------------------------- 1 | x,y 2 | 4.583918298882086,13.079990304560285 3 | 0.30056854367327024,9.330556404593688 4 | 1.2698385703363135,13.470556335806807 5 | 2.4609400820343947,8.939990373347166 6 | 1.8866002121101615,13.861122367053326 7 | 0.32484042349493036,8.549424342100647 8 | 2.659870058057336,11.153192706623981 9 | 3.507132111497367,12.053098856383919 10 | 1.4381551638392343,10.828392583697664 11 | 3.931908974207081,11.808371158699313 12 | 1.3697559846527085,10.269450827872694 13 | 0.40406774336411655,10.242678602433266 14 | 1.3121428182320516,10.920125678505766 15 | 3.2788086733310413,11.493353333480904 16 | 2.935555345300246,11.34157864056587 17 | 0.17777224273899328,10.235834507052907 18 | 4.364812565880889,12.105686642360926 19 | 4.821388372116607,12.400326143190883 20 | 2.6717615513891824,11.144537234275228 21 | 0.4348374823413731,9.98469118257374 22 | 3.0964700345313814,11.21974014049093 23 | 1.7839201289999962,10.933488206767475 24 | 4.266818769719739,12.407242753086068 25 | 0.22052654472462352,10.59736596480809 26 | 0.6563787765526785,10.545771192487592 27 | 3.416610857323886,11.816243419817742 28 | 0.0802800394904396,9.826632245420347 29 | 3.661445614546677,11.638833441675093 30 | 1.684436444392035,10.927248777686911 31 | 2.007584105581906,10.864779349017715 32 | 3.0486016633164277,11.1478208385455 33 | 2.947939514747561,11.22047543782475 34 | 4.506414990794167,12.225569684786457 35 | 1.244018216385217,10.252740673398135 36 | 3.289282623798896,12.001629281830962 37 | 0.05067012803057991,10.013217827293415 38 | 1.6263180752938915,10.8378561530127 39 | 3.0231972920148027,11.76765653315177 40 | 2.829254821027227,11.420580285395145 41 | 3.7576233536426518,11.936002317485991 42 | 3.89420722703682,11.905725812894074 43 | 3.438191772991841,11.602232855572659 44 | 3.0895148411312547,11.661065677835904 45 | 3.7153200790225007,11.559953421078564 46 | 2.5880794649558654,11.39308245523173 47 | 0.9075578496290465,10.501341853452686 48 | 1.2871348832314604,10.612709938566729 49 | 2.150181020660253,11.08618852377759 50 | 4.227392912911492,12.160532976236091 51 | 4.347151369108863,12.10185461276485 52 | 1.2900203678061488,10.495187063648077 53 | 3.0507113334582687,11.70386024252094 54 | 4.472572013339909,12.442609294633272 55 | 0.27507220431418455,9.92961086741772 56 | 
4.95255795141159,12.172037894638441 56 | 2.187690851944044,10.879864925834301 57 | 3.3030659113066942,11.557920082823808 58 | 4.453069421373162,12.193117922890561 59 | 2.561837519566263,11.536414504912928 60 | 0.8310782502405201,10.427354286565777 61 | 2.319393910459352,10.943113864300257 62 | 1.8495795260884962,11.044041724265039 63 | 4.369809683527437,12.316214488898776 64 | 4.7286559063267255,12.732221564219854 65 | 2.132724499371575,11.129810065327995 66 | 0.6316333581952194,10.013186502776936 67 | 1.7825762599908446,10.905235687487785 68 | 0.09938029432887963,10.017546152657244 69 | 1.9839363708410773,10.950687015270212 70 | 2.147324686723473,11.418973562688278 71 | 3.8727660667095263,11.754094323525093 72 | 4.6491124436662385,12.293169674588764 73 | 3.9636955957971582,11.731018926883527 74 | 1.2831687251881223,10.775362213956697 75 | 3.027460297930971,11.783257068467368 76 | 2.4194658972220817,11.338930051249813 77 | 1.933243097657601,10.646858847439713 78 | 4.6500127533476086,12.519913675054616 79 | 0.9279426196129131,10.061009331065804 80 | 0.7088392759937251,10.211592165570067 81 | 1.6972816274117375,10.609295205879185 82 | 0.7291005720188815,10.146174335915244 83 | 1.5135446237931238,10.827942182777722 84 | 0.7863892515036303,10.440251364931768 85 | 0.7369591945983167,10.126055811688161 86 | 0.4484281422642783,10.238798585570631 87 | 0.6476846910638945,10.670529311857813 88 | 4.17949792663603,11.97731742079653 89 | 0.9426370844467991,10.284665332731757 90 | 0.3850137632566658,10.28472156012984 91 | 4.264099492071075,12.119492507525234 92 | 3.1938080241767333,11.728495313517408 93 | 3.0934454156259017,11.501534794005355 94 | 4.556323953281537,12.4798273798697 95 | 2.5318064635878756,11.483549596279689 96 | 4.510113468263804,12.130591650717289 97 | 2.667606009197513,11.20696909368087 98 | 4.197221426103427,12.406283640395031 99 | 0.6870158534517201,10.01897145287496 100 | 3.541509668147845,12.09678789495433 101 | -------------------------------------------------------------------------------- /boba/cli.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Console script.""" 4 | import click 5 | import shutil 6 | import os 7 | import pandas as pd 8 | from .parser import Parser 9 | from .output.csvmerger import CSVMerger 10 | from .bobarun import BobaRun 11 | 12 | 13 | @click.command() 14 | @click.option('--script', '-s', help='Path to template script', 15 | default='./template.py', show_default=True) 16 | @click.option('--out', help='Output directory', 17 | default='.', show_default=True) 18 | @click.option('--lang', help='Language, can be python/R [default: inferred from file extension]', 19 | default=None) 20 | def compile(script, out, lang): 21 | """Generate multiverse analysis from specifications.""" 22 | 23 | check_path(script) 24 | 25 | click.echo('Creating multiverse from {}'.format(script)) 26 | ps = Parser(script, out, lang) 27 | ps.main() 28 | 29 | ex = """To execute the multiverse, run the following commands: 30 | cd {} && boba run --all 31 | """.format(os.path.join(out, 'multiverse')) 32 | click.secho('Success!', fg='green') 33 | click.secho(ex, fg='green') 34 | 35 | 36 | def check_path(p): 37 | """Check if the path exists""" 38 | if not os.path.exists(p): 39 | msg = 'Error: Path "{}" does not exist.'.format(p) 40 | print_help(msg) 41 | 42 | 43 | def print_help(err=''): 44 | """Show help message and exit.""" 45 | ctx = click.get_current_context() 46 | click.echo(ctx.get_help()) 47 | 48 | if err: 49 | 
click.echo('\n' + err) 50 | ctx.exit() 51 | 52 | 53 | @click.command() 54 | @click.argument('num', nargs=1, default=-1) 55 | @click.option('--all', '-a', 'run_all', is_flag=True, 56 | help='Execute all universes') 57 | @click.option('--thru', default=-1, help='Run until this universe number') 58 | @click.option('--jobs', default=1, help='The number of universes that can be running at a time.') 59 | @click.option('--batch_size', default=0, help='The approximate number of universes a processor will run in a row.') 60 | @click.option('--dir', 'folder', help='Multiverse directory', 61 | default='./multiverse', show_default=True) 62 | def run(folder, run_all, num, thru, jobs, batch_size): 63 | """ Execute the generated universe scripts. 64 | 65 | Run all universes: boba run --all 66 | 67 | Run a single universe, for example universe_1: boba run 1 68 | 69 | Run a range of universes for example 1 through 5: boba run 1 --thru 5 70 | """ 71 | 72 | check_path(folder) 73 | 74 | df = pd.read_csv(folder + '/summary.csv') 75 | num_universes = df.shape[0] 76 | 77 | if not run_all: 78 | if thru == -1: 79 | thru = num 80 | if num < 1: 81 | print_help() 82 | if thru < num: 83 | print_help('The thru parameter cannot be less than the num parameter.') 84 | if num > num_universes or thru > num_universes: 85 | print_help(f'There are only {num_universes} universes.') 86 | 87 | br = BobaRun(folder, jobs, batch_size) 88 | br.run_from_cli(run_all, num, thru) 89 | 90 | 91 | @click.command() 92 | @click.argument('pattern', nargs=1) 93 | @click.option('--base', '-b', default='./multiverse/results', 94 | show_default=True, help='Folder containing the universe outputs') 95 | @click.option('--out', default='./multiverse/merged.csv', 96 | show_default=True, help='Name of the merged file') 97 | @click.option('--delimiter', default=',', show_default=True, 98 | help='CSV delimiter') 99 | def merge(pattern, base, out, delimiter): 100 | """ 101 | Merge CSV outputs from individual universes into one file. 
102 | 103 | Required argument: 104 | the filename pattern of individual outputs where the universe id is 105 | replaced by {}, for example output_{}.csv 106 | """ 107 | 108 | check_path(base) 109 | CSVMerger(pattern, base, out, delimiter).main() 110 | 111 | 112 | @click.group() 113 | @click.version_option() 114 | def main(): 115 | pass 116 | 117 | 118 | main.add_command(compile) 119 | main.add_command(run) 120 | main.add_command(merge) 121 | 122 | if __name__ == "__main__": 123 | main() 124 | -------------------------------------------------------------------------------- /test/test_block_syntax_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Ugly hack to allow import from the root folder 4 | import sys 5 | import os 6 | sys.path.insert(0, os.path.abspath('..')) 7 | 8 | import unittest 9 | from boba.blocksyntaxparser import BlockSyntaxParser, ParseError 10 | 11 | 12 | class TestBlockParser(unittest.TestCase): 13 | 14 | def test_steps(self): 15 | line = '# --- (A) remove_outlier' 16 | self.assertTrue(BlockSyntaxParser.can_parse(line)) 17 | bp = BlockSyntaxParser(line) 18 | self.assertEqual(bp.i, 0) 19 | bp._read_next() 20 | self.assertEqual(bp.i, 5) 21 | bp._read_next() 22 | self.assertEqual(bp.i, 9) 23 | bp._read_next() 24 | self.assertEqual(bp.i, len(line)) 25 | bp._read_next() 26 | self.assertEqual(bp.i, len(line)) 27 | 28 | def test_can_parse(self): 29 | self.assertTrue(BlockSyntaxParser.can_parse(' # --- comment')) 30 | self.assertTrue(BlockSyntaxParser.can_parse(' # ---comment ')) 31 | self.assertFalse(BlockSyntaxParser.can_parse('#--- comment')) 32 | self.assertFalse(BlockSyntaxParser.can_parse('# --')) 33 | 34 | def test_syntax(self): 35 | line = '# --- (A) remove_outlier' 36 | self.assertTrue(BlockSyntaxParser.can_parse(line)) 37 | bid, par, opt, cond = BlockSyntaxParser(line).parse() 38 | self.assertEqual(bid, 'A:remove_outlier') 39 | self.assertEqual(par, 'A') 40 | self.assertEqual(opt, 'remove_outlier') 41 | 42 | line = '# --- (A) remove outlier' 43 | with self.assertRaises(ParseError): 44 | BlockSyntaxParser(line).parse() 45 | 46 | line = '# --- ((A)) name' 47 | with self.assertRaises(ParseError): 48 | BlockSyntaxParser(line).parse() 49 | 50 | line = '# --- ( A)' 51 | bid, par, opt, cond = BlockSyntaxParser(line).parse() 52 | self.assertEqual(bid, 'A') 53 | self.assertEqual(par, '') 54 | self.assertEqual(opt, '') 55 | 56 | def test_condition(self): 57 | line = '# --- (A) a1 @if B == b1' 58 | bid, par, opt, cond = BlockSyntaxParser(line).parse() 59 | self.assertEqual(bid, 'A:a1') 60 | self.assertEqual(par, 'A') 61 | self.assertEqual(opt, 'a1') 62 | self.assertEqual(cond['block'], 'A') 63 | self.assertEqual(cond['option'], 'a1') 64 | self.assertEqual(cond['condition'], 'B == b1') 65 | 66 | line = '# --- (A) @if B == b1' 67 | bid, par, opt, cond = BlockSyntaxParser(line).parse() 68 | self.assertEqual(cond['block'], 'A') 69 | self.assertEqual(cond['condition'], 'B == b1') 70 | self.assertNotIn('option', cond) 71 | 72 | line = '# --- (A) remove outlier @if B == b1' 73 | with self.assertRaises(ParseError): 74 | BlockSyntaxParser(line).parse() 75 | 76 | def test_whitespace(self): 77 | line = '\t\t# --- (A) name' 78 | bid, par, opt, cond = BlockSyntaxParser(line).parse() 79 | self.assertEqual(par, 'A') 80 | self.assertEqual(opt, 'name') 81 | 82 | line = ' # --- (A) name \t' 83 | bid, par, opt, cond = BlockSyntaxParser(line).parse() 84 | self.assertEqual(par, 'A') 85 | self.assertEqual(opt, 'name') 86 | 87 
| line = '# ---(A)socrowded' 88 | bid, par, opt, cond = BlockSyntaxParser(line).parse() 89 | self.assertEqual(par, 'A') 90 | self.assertEqual(opt, 'socrowded') 91 | 92 | def test_id_syntax(self): 93 | line = '# --- (C1)' 94 | bid, par, opt, cond = BlockSyntaxParser(line).parse() 95 | self.assertEqual(bid, 'C1') 96 | 97 | line = '# --- (aXa) ' 98 | bid, par, opt, cond = BlockSyntaxParser(line).parse() 99 | self.assertEqual(bid, 'aXa') 100 | 101 | line = '# --- (my_variable) \t' 102 | bid, par, opt, cond = BlockSyntaxParser(line).parse() 103 | self.assertEqual(bid, 'my_variable') 104 | 105 | # ID must start with a letter 106 | line = '# --- (12)' 107 | with self.assertRaisesRegex(ParseError, '(?i)invalid identifier'): 108 | BlockSyntaxParser(line).parse() 109 | 110 | line = '# --- (_start)' 111 | with self.assertRaisesRegex(ParseError, '(?i)invalid identifier'): 112 | BlockSyntaxParser(line).parse() 113 | 114 | 115 | if __name__ == '__main__': 116 | unittest.main() 117 | -------------------------------------------------------------------------------- /boba/blocksyntaxparser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .baseparser import BaseParser, ParseError 4 | 5 | kw = '# ---' 6 | 7 | 8 | class BlockSyntaxParser(BaseParser): 9 | """ 10 | Parse the metadata of a code block, which must have the structure: 11 | # --- (ID) option @if condition 12 | option is optional, but including it will mark the block as a parameter. 13 | @if is optional and it creates a procedural dependency constraint on this 14 | block and this option (if any). 15 | """ 16 | 17 | def __init__(self, line): 18 | super(BlockSyntaxParser, self).__init__(line) 19 | 20 | self.state = 0 21 | self.parsed_id = '' 22 | self.parsed_parameter = '' 23 | self.parsed_option = '' 24 | self.parsed_condition = '' 25 | 26 | @staticmethod 27 | def can_parse(line): 28 | return line.lstrip().startswith(kw) 29 | 30 | @staticmethod 31 | def _is_operator_start(ch): 32 | return ch == '@' 33 | 34 | @staticmethod 35 | def _is_condition(word): 36 | return word == 'if' 37 | 38 | def parse(self): 39 | while not self._is_end(): 40 | self._read_next() 41 | return self.parsed_id, self.parsed_parameter, self.parsed_option,\ 42 | self.parsed_condition 43 | 44 | def _read_next(self): 45 | self._read_while(BlockSyntaxParser._is_whitespace) 46 | if self._is_end(): 47 | return 48 | 49 | if self.state == 0: 50 | self._read_kw() 51 | elif self.state == 1: 52 | self._read_id() 53 | elif self.state == 2: 54 | self._maybe_read_option() 55 | elif self.state == 3: 56 | self._read_condition() 57 | else: 58 | # we've read anything we can handle but haven't reached the end 59 | s = self._remaining().strip() 60 | self._throw('Cannot handle "{}"'.format(s)) 61 | 62 | def _end(self): 63 | self.i = len(self.line) # stop parsing 64 | 65 | def _throw(self, msg): 66 | err = 'At character {} of "{}":\n\t{}'.format(self.i+1, self.line, msg) 67 | raise ParseError(err) 68 | 69 | def _remaining(self): 70 | return self.line[self.i:] 71 | 72 | def _read_kw(self): 73 | if self._remaining().startswith(kw): 74 | self.i += len(kw) 75 | self.state += 1 76 | else: 77 | self._throw('expected {}'.format(kw)) 78 | 79 | def _read_id(self): 80 | """ Read the thing inside the parenthesis. 
""" 81 | # open paren 82 | if self._peek_char() != '(': 83 | self._throw('Cannot find "("') 84 | self._next_char() 85 | self._read_while(self._is_whitespace) 86 | 87 | # read the actual identifier 88 | ch = self._peek_char() 89 | if not self._is_id_start(ch): 90 | self._throw('Invalid identifier start character {}'.format(ch)) 91 | 92 | self.parsed_id = self._read_while(self._is_id) 93 | self._read_while(self._is_whitespace) 94 | 95 | # close paren 96 | if self._peek_char() != ')': 97 | self._throw('Cannot find ")"') 98 | self._next_char() 99 | self.state += 1 100 | 101 | def _maybe_read_option(self): 102 | """ Read the option, if there is any. """ 103 | self._read_while(self._is_whitespace) 104 | 105 | # check if the next word is maybe an option 106 | if not self._is_id_start(self._peek_char()): 107 | self.state += 1 108 | return 109 | 110 | # option follows the same naming convention as ID 111 | opt = self._read_while(self._is_id) 112 | if opt != '': 113 | self.parsed_parameter = self.parsed_id 114 | self.parsed_option = opt 115 | self.parsed_id += ':' + self.parsed_option 116 | 117 | self._read_while(self._is_whitespace) 118 | self.state += 1 119 | 120 | def _read_condition(self): 121 | """ Read condition. """ 122 | self._read_while(self._is_whitespace) 123 | 124 | # check if the next char is indeed an operator 125 | if not BlockSyntaxParser._is_operator_start(self._peek_char()): 126 | self.state += 1 127 | return 128 | 129 | # read @if 130 | self._next_char() 131 | w = self._read_while(self._is_id) 132 | if not BlockSyntaxParser._is_condition(w): 133 | self._throw('Cannot handle @{}'.format(w)) 134 | 135 | # read whatever remains as the condition 136 | s = self._remaining().strip() 137 | self._end() 138 | self.state += 1 139 | 140 | # construct the condition 141 | bl = self.parsed_parameter if self.parsed_option else self.parsed_id 142 | self.parsed_condition = {'block': bl, 'condition': s} 143 | if self.parsed_option: 144 | self.parsed_condition['option'] = self.parsed_option 145 | -------------------------------------------------------------------------------- /example/hurricane/data.csv: -------------------------------------------------------------------------------- 1 | Year,Name,MasFem,MinPressure_before,Minpressure_Updated_2014,Gender_MF,Category,alldeaths,HighestWindSpeed,NDAM,Elapsed Yrs,Source 2 | 1950,Easy,5.40625,958,960,0,3,2,125,2380,63,MWR 3 | 1950,King,1.59375,955,955,0,4,4,134,7220,63,MWR 4 | 1952,Able,2.96875,985,985,0,1,3,125,210,61,MWR 5 | 1953,Barbara,8.625,987,987,1,1,1,75,78,60,MWR 6 | 1953,Florence,7.875,985,985,1,1,0,115,21,60,MWR 7 | 1954,Carol,8.53125,960,960,1,3,60,115,24962,59,MWR 8 | 1954,Edna,7.625,954,954,1,3,20,125,4010,59,MWR 9 | 1954,Hazel,8.21875,938,938,1,4,20,134,36450,59,MWR 10 | 1955,Connie,8.0,962,962,1,3,0,137,2710,58,MWR 11 | 1955,Diane,8.875,987,987,1,1,200,103,52990,58,MWR 12 | 1955,Ione,6.21875,960,960,1,3,7,140,8410,58,MWR 13 | 1956,Flossy,7.21875,975,975,1,2,15,90,2060,57,MWR 14 | 1957,Audrey,8.59375,946,946,1,3,416,127,4750,56,wiki 15 | 1958,Helene,8.8125,946,946,1,3,1,150,778,55,MWR 16 | 1959,Debra,8.4375,984,984,1,1,0,85,620,54,MWR 17 | 1959,Gracie,9.53125,950,950,1,3,22,140,710,54,MWR 18 | 1960,Donna,8.875,930,930,1,4,50,143,78260,53,http://www.nhc.noaa.gov/pdf/NWS-TPC-5.pdf 19 | 1960,Ethel,7.0625,981,981,1,1,0,115,45,53,MWR 20 | 1961,Carla,8.625,931,931,1,4,46,174,22270,52,MWR 21 | 1963,Cindy,8.9375,996,996,1,1,3,80,410,50,MWR 22 | 1964,Cleo,7.125,968,968,1,2,3,155,8750,49,MWR 23 | 1964,Dora,8.28125,966,966,1,2,5,134,22720,49,MWR 24 | 
1964,Hilda,7.65625,950,950,1,3,37,150,3620,49,MWR 25 | 1964,Isbell,8.3125,974,974,1,2,3,125,1120,49,MWR 26 | 1965,Betsy,9.0,948,948,1,3,75,155,21250,48,MWR 27 | 1966,Alma,7.15625,982,982,1,2,6,125,1120,47,MWR 28 | 1966,Inez,6.46875,983,983,1,1,3,150,120,47,MWR 29 | 1967,Beulah,6.34375,950,950,1,3,15,162,7010,46,MWR 30 | 1968,Gladys,7.65625,977,977,1,2,3,145,1170,45,MWR 31 | 1969,Camille,8.875,909,909,1,5,256,174,28520,44,MWR 32 | 1970,Celia,9.21875,945,945,1,3,22,127,9050,43,WIKI (http://en.wikipedia.org/wiki/Hurricane_Celia) 33 | 1971,Edith,7.65625,978,978,1,2,0,160,380,42,MWR 34 | 1971,Fern,6.71875,979,979,1,1,2,90,690,42,MWR 35 | 1971,Ginger,8.65625,995,995,1,1,0,110,270,42,MWR 36 | 1972,Agnes,7.03125,980,980,1,1,117,87,26440,41,MWR 37 | 1974,Carmen,7.78125,952,952,1,3,1,149,1530,39,MWR 38 | 1975,Eloise,8.4375,955,955,1,3,21,125,8500,38,MWR 39 | 1976,Belle,9.78125,980,980,1,1,5,120,720,37,MWR 40 | 1977,Babe,7.90625,995,995,1,1,0,75,88,36,MWR 41 | 1979,Bob,1.71875,986,986,0,1,1,75,90,34,MWR 42 | 1979,David,1.5625,970,970,0,2,15,174,3840,34,MWR 43 | 1979,Frederic,2.0625,946,946,0,3,5,134,17170,34,MWR 44 | 1980,Allen,2.03125,945,945,0,3,2,190,3040,33,MWR 45 | 1983,Alicia,9.34375,962,962,1,3,21,115,22330,30,MWR 46 | 1984,Diana,9.21875,949,949,1,2,3,134,620,29,MWR 47 | 1985,Bob,1.71875,1002,1003,0,1,0,75,190,28,MWR 48 | 1985,Danny,2.90625,987,987,0,1,1,90,200,28,MWR 49 | 1985,Elena,8.71875,959,959,1,3,4,127,5360,28,MWR 50 | 1985,Gloria,9.1875,942,942,1,3,8,143,3920,28,MWR 51 | 1985,Juan,1.875,971,971,0,1,12,103,6140,28,MWR 52 | 1985,Kate,9.1875,967,967,1,2,5,121,1800,28,MWR 53 | 1986,Bonnie,9.25,990,990,1,1,3,115,7,27,MWR 54 | 1986,Charley,4.0,990,990,0,1,5,149,79,27,MWR 55 | 1987,Floyd,2.40625,993,993,0,1,0,75,1,26,MWR 56 | 1988,Florence,7.875,984,984,1,1,1,80,2,25,MWR 57 | 1989,Chantal,8.3125,986,986,1,1,13,80,390,24,MWR 58 | 1989,Hugo,2.0,934,934,0,4,21,162,27430,24,MWR 59 | 1989,Jerry,2.78125,983,983,0,1,3,85,320,24,MWR 60 | 1991,Bob,1.71875,962,962,0,2,15,115,4690,22,MWR 61 | 1992,Andrew,1.78125,922,922,0,5,62,174,90250,21,MWR 62 | 1993,Emily,10.03125,960,961,1,3,3,162,130,20,MWR 63 | 1995,Erin,7.125,973,973,1,2,6,100,2240,18,MWR 64 | 1995,Opal,7.65625,942,942,1,3,9,149,16510,18,MWR 65 | 1996,Bertha,7.375,974,974,1,2,8,115,1020,17,MWR 66 | 1996,Fran,7.0625,954,954,1,3,26,121,18930,17,MWR 67 | 1997,Danny,2.90625,984,984,0,1,10,80,270,16,MWR 68 | 1998,Bonnie,9.25,964,964,1,2,3,115,2410,15,MWR 69 | 1998,Earl,1.875,987,987,0,1,3,87,220,15,MWR 70 | 1998,Georges,3.375,964,964,0,2,1,155,4860,15,MWR 71 | 1999,Bret,2.90625,951,951,0,3,0,145,120,14,MWR 72 | 1999,Floyd,2.40625,956,956,0,2,56,155,16030,14,MWR 73 | 1999,Irene,8.5625,987,964,1,1,8,109,1940,14,MWR 74 | 2002,Lili,9.59375,963,963,1,1,2,143,1610,11,MWR 75 | 2003,Claudette,8.71875,979,979,1,1,3,90,330,10,MWR 76 | 2003,Isabel,9.625,957,957,1,2,51,168,11010,10,MWR 77 | 2004,Alex,4.0625,972,972,0,1,1,120,7,9,MWR 78 | 2004,Charley,4.0,941,941,0,4,10,149,37180,9,MWR 79 | 2004,Frances,6.03125,960,960,1,2,7,143,19990,9,MWR 80 | 2004,Gaston,2.40625,985,985,0,1,8,75,240,9,MWR 81 | 2004,Ivan,2.09375,946,946,0,3,25,168,36910,9,MWR 82 | 2004,Jeanne,7.90625,950,950,1,3,5,121,16800,9,MWR 83 | 2005,Cindy,8.9375,991,991,1,1,1,75,420,8,MWR 84 | 2005,Dennis,2.0,946,946,0,3,15,149,3930,8,MWR 85 | 2005,Katrina,9.46875,902,902,1,3,1833,174,148240,8,MWR 86 | 2005,Ophelia,9.125,982,982,1,1,1,115,130,8,MWR 87 | 2005,Rita,8.4375,937,937,1,3,62,177,23110,8,MWR 88 | 2005,Wilma,8.375,950,950,1,3,5,183,33410,8,MWR 89 | 2007,Humberto,2.21875,985,985,0,1,1,85,63,6,MWR 
90 | 2008,Dolly,10.0625,963,967,1,1,1,99,1940,5,MWR 91 | 2008,Gustav,2.09375,951,954,0,2,52,155,7900,5,MWR 92 | 2008,Ike,2.21875,935,950,0,2,84,143,44260,5,MWR 93 | 2011,Irene,8.5625,952,952,1,1,41,121,17160,2,MWR 94 | 2012,Isaac,2.0,965,966,0,1,5,81,3430,1,MWR 95 | 2012,Sandy,7.9375,945,942,1,2,159,115,80090,1,MWR 96 | -------------------------------------------------------------------------------- /test/test_graph_analyzer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Ugly hack to allow import from the root folder 4 | import sys 5 | import os 6 | sys.path.insert(0, os.path.abspath('..')) 7 | 8 | import unittest 9 | from boba.graphparser import GraphParser 10 | from boba.graphanalyzer import GraphAnalyzer, InvalidGraphError 11 | 12 | 13 | class TestGraphAnalyzer(unittest.TestCase): 14 | 15 | def cp_helper(self, spec, expected): 16 | nodes, edges = GraphParser(spec).parse() 17 | ga = GraphAnalyzer(nodes, edges) 18 | ga._construct_paths() 19 | expected = set([','.join(p) for p in expected]) 20 | actual = set([','.join(p) for p in ga.paths]) 21 | self.assertSetEqual(actual, expected) 22 | 23 | def test_construct_paths(self): 24 | # normal 25 | spec = ['a->b->c', 'b->d'] 26 | expected = [['a', 'b', 'c'], ['a', 'b', 'd']] 27 | self.cp_helper(spec, expected) 28 | 29 | # single node 30 | spec = ['a'] 31 | expected = [['a']] 32 | self.cp_helper(spec, expected) 33 | 34 | # multiple sources and targets 35 | spec = ['a->b->c', 'a2->b->c2'] 36 | expected = [['a', 'b', 'c'], ['a2', 'b', 'c2'], ['a', 'b', 'c2'], 37 | ['a2', 'b', 'c']] 38 | self.cp_helper(spec, expected) 39 | 40 | # disconnected 41 | spec = ['a->b->c', 'e->f'] 42 | expected = [['a', 'b', 'c'], ['e', 'f']] 43 | self.cp_helper(spec, expected) 44 | 45 | # cyclic 46 | spec = ['a->b->c->a'] 47 | nodes, edges = GraphParser(spec).parse() 48 | ga = GraphAnalyzer(nodes, edges) 49 | with self.assertRaises(InvalidGraphError): 50 | ga._construct_paths() 51 | 52 | def source_helper(self, spec, exp_source, exp_target): 53 | nodes, edges = GraphParser(spec).parse() 54 | ga = GraphAnalyzer(nodes, edges) 55 | self.assertSetEqual(ga._get_source(), exp_source) 56 | self.assertSetEqual(ga._get_target(), exp_target) 57 | 58 | def test_get_source_and_target(self): 59 | # normal 60 | spec = ['a->b->d', 'a->b->c'] 61 | source = {'a'} 62 | target = {'d', 'c'} 63 | self.source_helper(spec, source, target) 64 | 65 | # disconnected 66 | spec = ['a->b->d', 'c->e'] 67 | source = {'a', 'c'} 68 | target = {'d', 'e'} 69 | self.source_helper(spec, source, target) 70 | 71 | # a single node 72 | spec = ['a'] 73 | source = {'a'} 74 | target = {'a'} 75 | self.source_helper(spec, source, target) 76 | 77 | # cyclic 78 | spec = ['a->b->c->d->a'] 79 | source = set() 80 | target = set() 81 | self.source_helper(spec, source, target) 82 | 83 | # complex 84 | spec = ['a->b->d', 'a->c->b->d', 'c->a->d'] 85 | source = set() 86 | target = {'d'} 87 | self.source_helper(spec, source, target) 88 | 89 | def path_helper(self, spec, s, t, expected): 90 | nodes, edges = GraphParser(spec).parse() 91 | ga = GraphAnalyzer(nodes, edges) 92 | ga._all_paths(s, t) 93 | expected = set([','.join(p) for p in expected]) 94 | actual = set([','.join(p) for p in ga.paths]) 95 | self.assertSetEqual(actual, expected) 96 | 97 | def test_get_path(self): 98 | """ test if the program correctly gets all paths from s to t""" 99 | 100 | spec = ['a->b->c', 'b->d', 'e'] 101 | start = 'a' 102 | stop = 'e' 103 | expected = [] 104 | 
self.path_helper(spec, start, stop, expected) 105 | 106 | spec = ['a->b->c', 'b->d', 'e'] 107 | start = 'a' 108 | stop = 'c' 109 | expected = [['a', 'b', 'c']] 110 | self.path_helper(spec, start, stop, expected) 111 | 112 | spec = ['a->b->c', 'b->d', 'e'] 113 | start = 'b' 114 | stop = 'd' 115 | expected = [['b', 'd']] 116 | self.path_helper(spec, start, stop, expected) 117 | 118 | # a single node 119 | spec = ['a'] 120 | start = 'a' 121 | stop = 'a' 122 | expected = [['a']] 123 | self.path_helper(spec, start, stop, expected) 124 | 125 | # graph with a merged branch 126 | spec = ['a->b->c', 'b->d', 'c->e d->e'] 127 | start = 'a' 128 | stop = 'e' 129 | expected = [['a', 'b', 'c', 'e'], ['a', 'b', 'd', 'e']] 130 | self.path_helper(spec, start, stop, expected) 131 | 132 | # graph with a cycle 133 | spec = ['a->b->c->b', 'b->d', 'e'] 134 | start = 'a' 135 | stop = 'c' 136 | expected = [['a', 'b', 'c']] 137 | self.path_helper(spec, start, stop, expected) 138 | 139 | # a complicated graph 140 | spec = ['a->b->d', 'a->c->b->d', 'c->a->d'] 141 | start = 'c' 142 | stop = 'd' 143 | expected = [['c', 'b', 'd'], ['c', 'a', 'd'], ['c', 'a', 'b', 'd']] 144 | self.path_helper(spec, start, stop, expected) 145 | 146 | 147 | if __name__ == '__main__': 148 | unittest.main() 149 | -------------------------------------------------------------------------------- /example/hurricane/reproduce/repro_marginalize.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Replicate prior work's results using their marginalization approach 3 | # --- (BOBA_CONFIG) 4 | { 5 | "decisions": [ 6 | {"var": "outliers", "options": [ 7 | "c()", 8 | "c('Katrina')", 9 | "c('Katrina', 'Audrey')" 10 | ]}, 11 | {"var": "leverage_points", "options": [ 12 | "c()", 13 | "c('Sandy')", 14 | "c('Sandy', 'Andrew')", 15 | "c('Sandy', 'Andrew', 'Donna')" 16 | ]}, 17 | {"var": "feminity", "options": ["female", "masfem"]}, 18 | {"var": "feminity_prediction_levels", "options": ["c(0, 1)", "c(2.53, 8.29)"]}, 19 | {"var": "damage", "options": ["dam", "log_dam"]}, 20 | {"var": "predictors", "options": [ 21 | "feminity * damage", 22 | "feminity + damage + pressure + feminity:damage + feminity:pressure", 23 | "feminity + damage + zwin + feminity:damage + feminity:zwin", 24 | "feminity + damage + zcat + feminity:damage + feminity:zcat", 25 | "feminity + damage + z3 + feminity:damage + feminity:z3", 26 | "feminity + damage + z3" 27 | ]}, 28 | {"var": "covariates", "options": [ 29 | "", 30 | "+ year:damage", 31 | "+ post:damage" 32 | ]}, 33 | {"var": "predictor_list", "options": [ 34 | "damage", 35 | "damage, pressure", 36 | "damage, zwin", 37 | "damage, zcat", 38 | "damage, z3", 39 | "damage, z3" 40 | ]}, 41 | {"var": "covariate_list", "options": [ 42 | "", 43 | ", year", 44 | ", post" 45 | ]}, 46 | {"var": "back_transform", "options": [ 47 | "exp(mu + sigma^2/2) - 1", 48 | "mu" 49 | ]}, 50 | {"var": "df", "options": [ 51 | "inference$df", 52 | "df.residual(model)" 53 | ]} 54 | ], 55 | "constraints": [ 56 | {"link": ["feminity", "feminity_prediction_levels"]}, 57 | {"link": ["Model", "back_transform", "df"]}, 58 | {"link": ["predictors", "predictor_list"]}, 59 | {"link": ["covariates", "covariate_list"]} 60 | ], 61 | "before_execute": "cp ../data.csv ./ && rm -rf results && mkdir results" 62 | } 63 | # --- (END) 64 | 65 | library(readr) 66 | library(MASS) 67 | library(modelr) 68 | library(tidyverse) 69 | library(broom.mixed) 70 | library(tidybayes) 71 | 72 | # a function for post-processing 
predicted means and standard deviations into expected number of deaths 73 | pred2expectation <- function(mu, sigma) { 74 | return({{back_transform}}) 75 | } 76 | 77 | # read and process data 78 | df <- read_csv('../data.csv', 79 | col_types = cols( 80 | Year = col_integer(), 81 | Category = col_integer(), 82 | Gender_MF = col_integer(), 83 | alldeaths = col_integer() 84 | )) %>% 85 | # rename some variables 86 | dplyr::select( 87 | year = Year, 88 | name = Name, 89 | dam = NDAM, 90 | death = alldeaths, 91 | female = Gender_MF, 92 | masfem = MasFem, 93 | category = Category, 94 | pressure = Minpressure_Updated_2014, 95 | wind = HighestWindSpeed 96 | ) %>% 97 | # create new variables 98 | mutate( 99 | log_death = log(death + 1), 100 | log_dam = log(dam), 101 | post = ifelse(year>1979, 1, 0), 102 | zdam = scale(dam), 103 | zcat = as.numeric(scale(category)), 104 | zmin = -scale(pressure), 105 | zwin = as.numeric(scale(wind)), 106 | z3 = as.numeric((zmin + zcat + zwin) / 3) 107 | ) %>% 108 | # remove outliers 109 | filter(!(name %in% {{outliers}})) %>% 110 | filter(!(name %in% {{leverage_points}})) %>% 111 | # operationalize feminity 112 | mutate( 113 | feminity = {{feminity}}, 114 | damage = {{damage}} 115 | ) 116 | 117 | # --- (Model) ols_regression 118 | # OLS regression with log(deaths+1) as the dependent variable 119 | model <- lm(log_death ~ {{predictors}} {{covariates}}, data = df) 120 | 121 | # --- (Model) negative_binomial 122 | # Negative binomial with deaths as the dependent variable 123 | model <- glm.nb(death ~ {{predictors}} {{covariates}}, data = df) 124 | 125 | # --- (O) 126 | # create a data frame where covariates are at their means 127 | dmeans <- df %>% 128 | summarise_at(vars({{predictor_list}} {{covariate_list}}), mean) %>% 129 | group_by({{predictor_list}} {{covariate_list}}) %>% 130 | data_grid(feminity = {{feminity_prediction_levels}})%>% 131 | ungroup() 132 | 133 | # predict 134 | pred <- predict(model, dmeans, se.fit = TRUE, type = "response") 135 | expectation <- dmeans %>% 136 | mutate( 137 | fit = pred$fit, 138 | sigma = sigma(model), 139 | expected_deaths = pred2expectation(fit, sigma) 140 | )%>% 141 | compare_levels(expected_deaths, by = feminity) %>% 142 | ungroup() %>% 143 | dplyr::select(expected_diff = expected_deaths) 144 | 145 | # get predictive check for original dataset from model 146 | pred <- predict(model, df, type = "response") 147 | disagg_fit <- df %>% 148 | mutate( 149 | fit = pred, # get fitted predictions 150 | sigma = sigma(model), # get residual standard deviation 151 | pred_deaths = pred2expectation(fit, sigma) # transform to deaths 152 | ) %>% 153 | dplyr::select( 154 | observed = death, 155 | expected = pred_deaths 156 | ) 157 | 158 | # output 159 | write_csv(expectation, '../results/estimate_{{_n}}.csv') 160 | write_csv(disagg_fit, '../results/disagg_fit_{{_n}}.csv') 161 | -------------------------------------------------------------------------------- /boba/codeparser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from dataclasses import dataclass, field 4 | from typing import List 5 | import json 6 | 7 | from .blocksyntaxparser import BlockSyntaxParser, ParseError 8 | 9 | 10 | @dataclass 11 | class Block: 12 | """ 13 | A class for code blocks. 14 | 15 | id: unique identifier. For decision block, id is parameter:option. 16 | parameter: parameter name, if the block is a decision. 17 | option: option name, if the block is a decision. 
18 | chunks: code broken up at the boundaries of placeholder variables. 19 | """ 20 | 21 | id: str = '' 22 | parameter: str = '' 23 | option: str = '' 24 | chunks: List = field(default_factory=lambda: []) 25 | 26 | 27 | @dataclass 28 | class Chunk: 29 | """A class for code chunks. 30 | A code chunk contains at most one placeholder variable. 31 | 32 | variable: the corresponding placeholder variable, if any. 33 | code: the code template preceding the variable or the block boundary. 34 | """ 35 | variable: str = '' 36 | code: str = '' 37 | 38 | 39 | class CodeParser: 40 | def __init__(self): 41 | self.blocks = {} 42 | self.order = [] 43 | 44 | self.raw_spec = '' 45 | self.spec = {} 46 | 47 | self.inline_constraints = [] 48 | self.inline_vars = [] 49 | self.used_vars = set() 50 | 51 | @staticmethod 52 | def _get_block_name(block): 53 | """Get the ID of the block, ignoring options.""" 54 | return block.id if block.parameter == '' else block.parameter 55 | 56 | def _add_block(self, block): 57 | """Add a block to our data structure.""" 58 | # handle config block 59 | if block.id == 'BOBA_CONFIG': 60 | self.raw_spec += block.chunks[0].code 61 | return 62 | if block.id == 'END': 63 | block.id = '' 64 | if len(self.order): 65 | self.blocks[self.order[-1]].chunks += block.chunks 66 | return 67 | 68 | # ignore empty block 69 | if block.id == '' and block.chunks[0].code == '': 70 | return 71 | 72 | # handle unnamed block 73 | if block.id == '': 74 | block.id = '_start' if len(self.blocks) == 0 else '_end' 75 | 76 | # check if id exists 77 | if block.id in self.blocks: 78 | err = 'Duplicated code block ID "{}"'.format(block.id) 79 | raise ParseError(err) 80 | 81 | # add to data structure 82 | self.blocks[block.id] = block 83 | bn = CodeParser._get_block_name(block) 84 | if bn not in self.order: 85 | self.order.append(bn) 86 | 87 | def get_block_names(self): 88 | """ 89 | Get the ID of all blocks, ignoring options 90 | :return: a set of unique names 91 | """ 92 | blocks = set() 93 | for b in self.blocks: 94 | bl = self.blocks[b] 95 | blocks.add(CodeParser._get_block_name(bl)) 96 | return blocks 97 | 98 | def get_decisions(self): 99 | """ 100 | Get a dict of all block-level decisions, where the key is the parameter 101 | and the value is a list of block ids (namely, parameter:option). 
102 | :return: 103 | """ 104 | decs = {} 105 | for b in self.blocks: 106 | bl = self.blocks[b] 107 | if bl.parameter: 108 | p = bl.parameter 109 | if p in decs: 110 | decs[p].append(bl.id) 111 | else: 112 | decs[p] = [bl.id] 113 | return decs 114 | 115 | def parse(self, dec_parser, f): 116 | """ Make a pass over the template, parsing block declarations and 117 | placeholder variables inside the code.""" 118 | code = '' 119 | bl = Block() 120 | 121 | for line in f: 122 | if BlockSyntaxParser.can_parse(line): 123 | # end of the previous block 124 | bl.chunks.append(Chunk('', code)) 125 | code = '' 126 | self._add_block(bl) 127 | 128 | # parse the metadata and create a new block 129 | bp_id, par, opt, cond = BlockSyntaxParser(line).parse() 130 | bl = Block(bp_id, par, opt, []) 131 | 132 | # store inline constraints, if any 133 | if cond: 134 | self.inline_constraints.append(cond) 135 | else: 136 | # match decision variables 137 | try: 138 | vs, codes = dec_parser.parse_code(line) 139 | if len(vs): 140 | # store inline variables 141 | self.used_vars.update(vs) 142 | 143 | # chop into more chunks 144 | # combine first chunk with previous code 145 | bl.chunks.append(Chunk(vs[0], code + codes[0])) 146 | for i in range(1, len(vs)): 147 | bl.chunks.append(Chunk(vs[i], codes[i])) 148 | 149 | # remaining code after the last matched variable 150 | code = codes[-1] 151 | else: 152 | code += line 153 | except ParseError as e: 154 | msg = 'At line "{}"\n\t{}'.format(line, e.args[0]) 155 | raise ParseError(msg) 156 | 157 | # add the last block 158 | bl.chunks.append(Chunk('', code)) 159 | self._add_block(bl) 160 | 161 | # parse the spec 162 | try: 163 | self.spec = json.loads(self.raw_spec) if self.raw_spec else {} 164 | except ValueError as e: 165 | msg = self.raw_spec + '\n' + e.args[0] 166 | msg += '\nBoba config is not valid JSON' 167 | raise ParseError(msg) 168 | -------------------------------------------------------------------------------- /example/hurricane/repro.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # --- (BOBA_CONFIG) 3 | { 4 | "decisions": [ 5 | {"var": "outliers", "options": [ 6 | "c()", 7 | "c('Katrina')", 8 | "c('Katrina', 'Audrey')" 9 | ]}, 10 | {"var": "leverage_points", "options": [ 11 | "c()", 12 | "c('Sandy')", 13 | "c('Sandy', 'Andrew')", 14 | "c('Sandy', 'Andrew', 'Donna')" 15 | ]}, 16 | {"var": "feminity", "options": ["female", "masfem"]}, 17 | {"var": "damage", "options": ["dam", "log_dam"]}, 18 | {"var": "predictors", "options": [ 19 | "feminity * damage", 20 | "feminity + damage + pressure + feminity:damage + feminity:pressure", 21 | "feminity + damage + zwin + feminity:damage + feminity:zwin", 22 | "feminity + damage + zcat + feminity:damage + feminity:zcat", 23 | "feminity + damage + z3 + feminity:damage + feminity:z3", 24 | "feminity + damage + z3" 25 | ]}, 26 | {"var": "covariates", "options": [ 27 | "", 28 | "+ year:damage", 29 | "+ post:damage" 30 | ]}, 31 | {"var": "back_transform", "options": [ 32 | "exp(mu + sigma^2/2) - 1", 33 | "mu" 34 | ]} 35 | ], 36 | "constraints": [ 37 | {"link": ["Model", "back_transform"]} 38 | ], 39 | "before_execute": "cp ../data.csv ./ && rm -rf results && mkdir results", 40 | "after_execute": "cp ../stacking_weights.R ./", 41 | "visualizer": "visualizer_config.json" 42 | } 43 | # --- (END) 44 | 45 | suppressPackageStartupMessages(library(readr)) 46 | suppressPackageStartupMessages(library(MASS)) 47 | suppressPackageStartupMessages(library(modelr)) 48 | 
suppressPackageStartupMessages(library(tidyverse)) 49 | suppressPackageStartupMessages(library(broom.mixed)) 50 | suppressPackageStartupMessages(library(tidybayes)) 51 | source('../../boba_util.R') #fixme 52 | 53 | # a function for post-processing predicted means and standard deviations into expected number of deaths 54 | pred2expectation <- function(mu, sigma) { 55 | return({{back_transform}}) 56 | } 57 | 58 | # get expectation per data point 59 | compute_exp <- function (model, df) { 60 | disagg_fit <- pointwise_predict(model, df) %>% 61 | mutate(expected = pred2expectation(fit, sigma)) 62 | return(disagg_fit) 63 | } 64 | 65 | # read and process data 66 | full <- read_csv('../data.csv', 67 | col_types = cols( 68 | Year = col_integer(), 69 | Category = col_integer(), 70 | Gender_MF = col_integer(), 71 | alldeaths = col_integer() 72 | )) %>% 73 | # rename some variables 74 | dplyr::select( 75 | year = Year, 76 | name = Name, 77 | dam = NDAM, 78 | death = alldeaths, 79 | female = Gender_MF, 80 | masfem = MasFem, 81 | category = Category, 82 | pressure = Minpressure_Updated_2014, 83 | wind = HighestWindSpeed 84 | ) %>% 85 | # create new variables 86 | mutate( 87 | id = row_number(), 88 | log_death = log(death + 1), 89 | log_dam = log(dam), 90 | post = ifelse(year>1979, 1, 0), 91 | zdam = scale(dam), 92 | zcat = as.numeric(scale(category)), 93 | zmin = -scale(pressure), 94 | zwin = as.numeric(scale(wind)), 95 | z3 = as.numeric((zmin + zcat + zwin) / 3) 96 | ) %>% 97 | # operationalize feminity 98 | mutate( 99 | feminity = {{feminity}}, 100 | damage = {{damage}} 101 | ) 102 | 103 | df <- full %>% 104 | # remove outliers 105 | filter(!(name %in% {{outliers}})) %>% 106 | filter(!(name %in% {{leverage_points}})) 107 | 108 | # --- (Model) ols_regression 109 | # OLS regression with log(deaths+1) as the dependent variable 110 | model <- lm(log_death ~ {{predictors}} {{covariates}}, data = df) 111 | 112 | # --- (Model) negative_binomial 113 | # Negative binomial with deaths as the dependent variable 114 | model <- glm.nb(death ~ {{predictors}} {{covariates}}, data = df) 115 | 116 | # --- (O) 117 | # cross validation 118 | fit <- cross_validation(df, model, "death", 119 | func = function (m, d) compute_exp(m, d)$expected) 120 | nrmse = fit / (max(df$death) - min(df$death)) 121 | 122 | # stacking 123 | loglik <- df %>% 124 | add_column(loglik = stacking(df, model)) %>% 125 | dplyr::select(id, loglik) %>% 126 | right_join(full, by='id') 127 | # add missing log likelihood 128 | if (nrow(loglik) != nrow(df)) { 129 | idx <- filter(loglik, is.na(loglik)) 130 | loglik$loglik[idx$id] <- compute_loglik(model, idx) 131 | } 132 | loglik <- dplyr::select(loglik, loglik) 133 | 134 | # permutation test 135 | null.dist <- permutation_test(df, model, c("female", "masfem", "feminity"), N = 100, 136 | func = function (m, d) margins(compute_exp(m, d), "female", "expected")$expected) %>% 137 | dplyr::select(expected_diff = value) 138 | 139 | # get prediction 140 | disagg_fit <- compute_exp(model, df) 141 | 142 | # aggregate fitted effect of female storm name 143 | expectation <- margins(disagg_fit, "female", "expected") %>% 144 | dplyr::select(expected_diff = expected) %>% 145 | add_column(NRMSE = nrmse) # add cross validation metric 146 | 147 | # propagate uncertainty in fit to model predictions 148 | uncertainty <- disagg_fit %>% 149 | mutate( 150 | .draw = list(1:200), # generate list of draw numbers 151 | t = map(df, ~rt(200, .)), # simulate draws from t distribution to transform into means 152 | x = map(df, 
~rchisq(200, .)) # simulate draws from chi-squared distribution to transform into sigmas 153 | ) %>% 154 | unnest(cols = c(".draw", "t", "x")) %>% 155 | mutate( 156 | mu = t * se.fit + fit, # scale and shift t to get a sampling distribution of means 157 | sigma = sqrt(df * se.residual^2 / x), # scale and take inverse of x to get a sampling distribution of sigmas 158 | expected_deaths = pred2expectation(mu, sigma) 159 | ) %>% 160 | group_by(.draw, female) %>% # group by predictor(s) of interest 161 | summarize(expected_deaths = mean(expected_deaths)) %>% # marginalize across other predictors 162 | compare_levels(expected_deaths, by = female) %>% 163 | ungroup() %>% 164 | dplyr::select(expected_diff = expected_deaths) 165 | 166 | # only output relevant fields in disagg_fit 167 | disagg_fit <- disagg_fit %>% 168 | dplyr::select( 169 | observed = death, 170 | expected = expected 171 | ) 172 | 173 | # output 174 | write_csv(expectation, '../results/estimate_{{_n}}.csv') 175 | write_csv(disagg_fit, '../results/disagg_fit_{{_n}}.csv') 176 | write_csv(uncertainty, '../results/uncertainty_{{_n}}.csv') 177 | write_csv(null.dist, '../results/null_{{_n}}.csv') 178 | write_csv(loglik, '../results/loglik_{{_n}}.csv') 179 | -------------------------------------------------------------------------------- /boba/wrangler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import shutil 5 | import csv 6 | import json 7 | from dataclasses import dataclass 8 | from .baseparser import ParseError 9 | 10 | 11 | @dataclass 12 | class Output: 13 | name: str 14 | value: str 15 | 16 | DIR_SCRIPT = 'code/' 17 | DIR_LOG = 'boba_logs/' 18 | LOG_EXT = '.txt' 19 | 20 | def get_universe_name(universe_id): 21 | """ Get the name of a universe """ 22 | return 'universe_' + str(universe_id) 23 | 24 | 25 | def get_universe_script(universe_id, lang_extension): 26 | """ Get the file name of a universe script """ 27 | return get_universe_name(universe_id) + lang_extension 28 | 29 | 30 | def get_universe_id_from_script(universe_script): 31 | """ Get the id of a universe given the universe script """ 32 | return int(universe_script.split('.')[0].split('_')[1]) 33 | 34 | 35 | def get_universe_log(universe_id): 36 | """ Get the file name of a universe log """ 37 | return 'log_' + str(universe_id) + LOG_EXT 38 | 39 | 40 | def get_universe_error_log(universe_id): 41 | """ Get the file name of a universe error log """ 42 | return 'error_' + str(universe_id) + LOG_EXT 43 | 44 | 45 | class Wrangler: 46 | """Handles outputs.""" 47 | def __init__(self, spec, lang, out): 48 | self.spec = spec 49 | self.lang = lang 50 | self.out = out 51 | self.fn = os.path.abspath(os.path.join(out, 'summary.csv')) 52 | 53 | self.outputs = {} 54 | self.col = 0 # output column number, will be set by parser 55 | self.counter = 0 56 | 57 | self.pre_exe = '' 58 | self.post_exe = '' 59 | 60 | self._read_spec() 61 | 62 | @staticmethod 63 | def _read_json_safe(obj, field): 64 | if field not in obj: 65 | raise ParseError('Cannot find "{}" in json'.format(field)) 66 | return obj[field] 67 | 68 | @staticmethod 69 | def _read_optional(obj, field, df): 70 | return obj[field] if field in obj else df 71 | 72 | def _read_spec(self): 73 | """Read misc fields from the JSON spec.""" 74 | sp = self._read_optional(self.spec, 'outputs', []) 75 | for d in sp: 76 | name = str(self._read_json_safe(d, 'name')) 77 | value = str(self._read_json_safe(d, 'value')) 78 | o = Output(name, value) 79 | self.outputs[name] = o 80 | 81 | self.pre_exe = self._read_optional(self.spec, 'before_execute', '') 82 | self.post_exe = self._read_optional(self.spec, 'after_execute', '') 83 | 
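    # A note on the spec shape _read_spec consumes (a sketch; the field values
    # here are illustrative, not taken from any shipped template):
    #   {
    #     "outputs": [{"name": "p_value", "value": "result.pvalue"}],
    #     "before_execute": "cp ../data.csv ./",
    #     "after_execute": "boba merge output_{}.csv"
    #   }
    # Each "outputs" entry becomes an Output(name, value) record, and the two
    # *_execute strings are written verbatim to pre_exe.sh / post_exe.sh by
    # write_pre_exe / write_post_exe below.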
84 | def _codegen_r(self): 85 | """Generate output code for R scripts.""" 86 | if len(self.outputs) == 0: 87 | return '' 88 | 89 | # read csv 90 | code = '\n\n# wrangle outputs\n' \ 91 | 'df <- read.csv2("{}", sep = ",", stringsAsFactors = FALSE, ' \ 92 | 'check.names=FALSE)'\ 93 | .format(self.fn) 94 | 95 | # record outputs 96 | ns = self.get_outputs() 97 | col = self.col + 1 98 | row = self.counter 99 | for n in ns: 100 | code += '\ndf[{}, {}] = {}'.format(row, col, self.outputs[n].value) 101 | col += 1 102 | 103 | # write csv 104 | code += '\nwrite.csv(df, file="{}", row.names=FALSE)'.format(self.fn) 105 | code += '\n' 106 | 107 | return code 108 | 109 | def _codegen_python(self): 110 | if len(self.outputs) == 0: 111 | return '' 112 | 113 | return ''  # TODO: output wrangling for Python scripts is not implemented yet; return an empty string so _gen_code always yields a str 114 | 115 | def _gen_code(self): 116 | """Generate output code to be appended to the end of the script.""" 117 | if self.lang.is_r(): 118 | return self._codegen_r() 119 | if self.lang.is_python(): 120 | return self._codegen_python() 121 | return '' 122 | 123 | def write_pre_exe(self): 124 | fn_pre_exec = os.path.join(self.out, 'pre_exe.sh') 125 | with open(fn_pre_exec, 'w') as f: 126 | f.write(self.pre_exe) 127 | 128 | def write_post_exe(self): 129 | fn_post_exec = os.path.join(self.out, 'post_exe.sh') 130 | with open(fn_post_exec, 'w') as f: 131 | f.write(self.post_exe) 132 | 133 | def write_lang(self): 134 | lang = os.path.join(self.out, 'lang.json') 135 | with open(lang, 'w') as f: 136 | json.dump(self.lang.supported_langs, f) 137 | 138 | 139 | def write_universe(self, code): 140 | """Write the generated code to a universe file.""" 141 | 142 | self.counter += 1 143 | fn = get_universe_script(self.counter, self.lang.get_ext()) 144 | 145 | # replace the reserved keyword _n 146 | code = code.replace('{{_n}}', str(self.counter)) 147 | 148 | # append output code 149 | code += self._gen_code() 150 | 151 | # write file 152 | with open(os.path.join(self.out, DIR_SCRIPT, fn), 'w') as f: 153 | f.write(code) 154 | f.flush() 155 | 156 | return fn 157 | 158 | def write_summary(self, rows): 159 | """Write the summary CSV file""" 160 | with open(self.fn, 'w', newline='') as f: 161 | wrt = csv.writer(f) 162 | for row in rows: 163 | wrt.writerow(row) 164 | 165 | def write_overview_json(self, res): 166 | """ Write the overview.json file""" 167 | # append visualizer block 168 | default_config = { 169 | "files": [{"id": "est", "path": "estimates.csv"}], 170 | "schema": {"point_estimate": {"file": "est", "field": "estimate"}} 171 | } 172 | vis = Wrangler._read_optional(self.spec, 'visualizer', None) 173 | 174 | # if it is a string, read config file 175 | if isinstance(vis, str): 176 | try: 177 | with open(vis) as f: 178 | vis = json.load(f) 179 | except (IOError, json.JSONDecodeError) as e: 180 | print(e) 181 | print('Cannot read the visualizer config, using the default') 182 | vis = default_config 183 | # if user does not specify the config, use the default 184 | vis = default_config if vis is None else vis 185 | res['visualizer'] = vis 186 | 187 | with open(os.path.join(self.out, 'overview.json'), 'w') as f: 188 | obj = json.dumps(res, indent=2, sort_keys=True) 189 | f.write(obj) 190 | 191 | def create_dir(self): 192 | """Create output directories.""" 193 | if os.path.exists(self.out): 194 | shutil.rmtree(self.out) 195 | os.makedirs(self.out) 196 | os.makedirs(os.path.join(self.out, 
DIR_SCRIPT)) 197 | 198 | def get_outputs(self): 199 | """Get a sorted list of output names.""" 200 | return sorted(list(self.outputs.keys())) 201 | -------------------------------------------------------------------------------- /example/hurricane/data_wrangling/data_jung.csv: -------------------------------------------------------------------------------- 1 | Year,Name,MasFem,MinPressure_before,Minpressure_Updated_2014,Gender_MF,Category,alldeaths,NDAM,Elapsed Yrs,Source,ZMasFem,ZMinPressure_A,ZNDAM 2 | 1950,Easy,6.77778,958,960,1,3,2,1590,63,MWR,-0.00094,-0.35636,-0.43913 3 | 1950,King,1.38889,955,955,0,3,4,5350,63,MWR,-1.67076,-0.51125,-0.14843 4 | 1952,Able,3.83333,985,985,0,1,3,150,61,MWR,-0.91331,1.03765,-0.55047 5 | 1953,Barbara,9.83333,987,987,1,1,1,58,60,MWR,0.94587,1.14091,-0.55758 6 | 1953,Florence,8.33333,985,985,1,1,0,15,60,MWR,0.48108,1.03765,-0.56090 7 | 1954,Carol,8.11111,960,960,1,3,60,19321,59,MWR,0.41222,-0.25310,0.93174 8 | 1954,Edna,8.55556,954,954,1,3,20,3230,59,MWR,0.54993,-0.56288,-0.31234 9 | 1954,Hazel,9.44444,938,938,1,4,20,24260,59,MWR,0.82537,-1.38896,1.31360 10 | 1955,Connie,8.50000,962,962,1,3,0,2030,58,MWR,0.53272,-0.14984,-0.40511 11 | 1955,Diane,9.88889,987,987,1,1,200,14730,58,MWR,0.96309,1.14091,0.57679 12 | 1955,Ione,5.94444,960,960,0,3,7,6200,58,MWR,-0.25916,-0.25310,-0.08271 13 | 1956,Flossy,7.00000,975,975,1,2,15,1540,57,MWR,0.06792,0.52135,-0.44300 14 | 1958,Helene,9.88889,946,946,1,3,1,540,55,MWR,0.96309,-0.97592,-0.52031 15 | 1959,Debra,9.88889,984,984,1,1,0,430,54,MWR,0.96309,0.98602,-0.52882 16 | 1959,Gracie,9.77778,950,950,1,3,22,510,54,MWR,0.92866,-0.76940,-0.52263 17 | 1960,Donna,9.27778,930,930,1,4,50,53270,53,http://www.nhc.noaa.gov/pdf/NWS-TPC-5.pdf,0.77372,-1.80199,3.55651 18 | 1960,Ethel,8.72222,981,981,1,1,0,35,53,MWR,0.60158,0.83113,-0.55936 19 | 1961,Carla,9.50000,931,931,1,4,46,15850,52,MWR,0.84258,-1.75036,0.66338 20 | 1963,Cindy,9.94444,996,996,1,1,3,300,50,MWR,0.98030,1.60558,-0.53887 21 | 1964,Cleo,7.94444,968,968,1,2,3,6450,49,MWR,0.36057,0.15994,-0.06338 22 | 1964,Dora,9.33333,966,966,1,2,5,16260,49,MWR,0.79094,0.05668,0.69508 23 | 1964,Hilda,8.83333,950,950,1,3,37,2770,49,MWR,0.63601,-0.76940,-0.34790 24 | 1964,Isbell,9.44444,974,974,1,2,3,800,49,MWR,0.82537,0.46972,-0.50021 25 | 1965,Betsy,8.33333,948,948,1,3,75,20000,48,MWR,0.48108,-0.87266,0.98424 26 | 1966,Alma,8.77778,982,982,1,2,6,730,47,MWR,0.61879,0.88276,-0.50562 27 | 1966,Inez,8.27778,983,983,1,1,3,99,47,MWR,0.46386,0.93439,-0.55441 28 | 1967,Beulah,7.27778,950,950,1,3,15,5060,46,MWR,0.15400,-0.76940,-0.17085 29 | 1968,Gladys,8.94444,977,977,1,2,3,800,45,MWR,0.67044,0.62461,-0.50021 30 | 1969,Camille,9.05556,909,909,1,5,256,23040,44,MWR,0.70487,-2.88622,1.21928 31 | 1970,Celia,9.44444,945,945,1,3,22,6870,43,WIKI (http://en.wikipedia.org/wiki/Hurricane_Celia),0.82537,-1.02755,-0.03091 32 | 1971,Edith,8.50000,978,978,1,2,0,300,42,MWR,0.53272,0.67624,-0.53887 33 | 1971,Fern,7.38889,979,979,1,1,2,500,42,MWR,0.18843,0.72787,-0.52341 34 | 1971,Ginger,10.00000,995,995,1,1,0,200,42,MWR,0.99752,1.55395,-0.54660 35 | 1972,Agnes,8.66667,980,980,1,1,117,20430,41,MWR,0.58436,0.77950,1.01748 36 | 1974,Carmen,8.72222,952,952,1,3,1,1180,39,MWR,0.60158,-0.66614,-0.47083 37 | 1975,Eloise,8.94444,955,955,1,3,21,6190,38,MWR,0.67044,-0.51125,-0.08348 38 | 1976,Belle,10.44445,980,980,1,1,5,570,37,MWR,1.13523,0.77950,-0.51799 39 | 1977,Babe,6.88889,995,995,1,1,0,66,36,MWR,0.03349,1.55395,-0.55696 40 | 1979,Bob,1.66667,986,986,0,1,1,70,34,MWR,-1.58468,1.08928,-0.55665 41 | 
1979,David,1.72222,970,970,0,2,15,2700,34,MWR,-1.56747,0.26320,-0.35331 42 | 1979,Frederic,2.50000,946,946,0,3,5,12770,34,MWR,-1.32647,-0.97592,0.42525 43 | 1980,Allen,2.66667,945,945,0,3,2,2130,33,MWR,-1.27482,-1.02755,-0.39738 44 | 1983,Alicia,9.83333,962,962,1,3,21,10400,30,MWR,0.94587,-0.14984,0.24201 45 | 1984,Diana,9.94444,949,949,1,2,3,410,29,MWR,0.98030,-0.82103,-0.53036 46 | 1985,Bob,1.66667,1002,1003,0,1,0,130,28,MWR,-1.58468,1.91536,-0.55201 47 | 1985,Danny,2.22222,987,987,0,1,1,160,28,MWR,-1.41254,1.14091,-0.54969 48 | 1985,Elena,9.72222,959,959,1,3,4,4180,28,MWR,0.91144,-0.30473,-0.23889 49 | 1985,Gloria,9.50000,942,942,1,3,8,3020,28,MWR,0.84258,-1.18244,-0.32857 50 | 1985,Juan,1.94444,971,971,0,1,12,4730,28,MWR,-1.49861,0.31483,-0.19636 51 | 1985,Kate,9.66667,967,967,1,2,5,1310,28,MWR,0.89423,0.10831,-0.46078 52 | 1986,Bonnie,9.38889,990,990,1,1,3,6,27,MWR,0.80815,1.29580,-0.56160 53 | 1986,Charley,2.88889,990,990,0,1,5,58,27,MWR,-1.20596,1.29580,-0.55758 54 | 1987,Floyd,1.83333,993,993,0,1,0,1,26,MWR,-1.53304,1.45069,-0.56199 55 | 1988,Florence,8.33333,984,984,1,1,1,4,25,MWR,0.48108,0.98602,-0.56175 56 | 1989,Chantal,9.05556,986,986,1,1,13,290,24,MWR,0.70487,1.08928,-0.53964 57 | 1989,Hugo,2.88889,934,934,0,4,21,20020,24,MWR,-1.20596,-1.59547,0.98578 58 | 1989,Jerry,2.33333,983,983,0,1,3,230,24,MWR,-1.37811,0.93439,-0.54428 59 | 1991,Bob,1.66667,962,962,0,2,15,3620,22,MWR,-1.58468,-0.14984,-0.28218 60 | 1992,Andrew,2.22222,922,922,0,5,62,66730,21,MWR,-1.41254,-2.21503,4.59717 61 | 1993,Emily,9.83333,960,961,1,3,3,96,20,MWR,0.94587,-0.25310,-0.55464 62 | 1995,Erin,7.22222,973,973,1,2,6,1650,18,MWR,0.13678,0.41809,-0.43449 63 | 1995,Opal,8.50000,942,942,1,3,9,7550,18,MWR,0.53272,-1.18244,0.02167 64 | 1996,Bertha,8.50000,974,974,1,2,8,700,17,MWR,0.53272,0.46972,-0.50794 65 | 1996,Fran,7.16667,954,954,1,3,26,8260,17,MWR,0.11957,-0.56288,0.07656 66 | 1997,Danny,2.22222,984,984,0,1,10,200,16,MWR,-1.41254,0.98602,-0.54660 67 | 1998,Bonnie,9.38889,964,964,1,2,3,1650,15,MWR,0.80815,-0.04658,-0.43449 68 | 1998,Earl,1.88889,987,987,0,1,3,160,15,MWR,-1.51583,1.14091,-0.54969 69 | 1998,Georges,2.27778,964,964,0,2,1,3870,15,MWR,-1.39532,-0.04658,-0.26285 70 | 1999,Bret,2.33333,951,951,0,3,0,94,14,MWR,-1.37811,-0.71777,-0.55480 71 | 1999,Floyd,1.83333,956,956,0,2,56,8130,14,MWR,-1.53304,-0.45962,0.06651 72 | 1999,Irene,9.27778,987,964,1,1,8,1430,14,MWR,0.77372,1.14091,-0.45150 73 | 2002,Lili,10.33333,963,963,1,1,2,1260,11,MWR,1.10080,-0.09821,-0.46465 74 | 2003,Claudette,9.16667,979,979,1,1,3,250,10,MWR,0.73930,0.72787,-0.54274 75 | 2003,Isabel,9.38889,957,957,1,2,51,4980,10,MWR,0.80815,-0.40799,-0.17703 76 | 2004,Alex,4.16667,972,972,0,1,1,5,9,MWR,-0.81003,0.36646,-0.56168 77 | 2004,Charley,2.88889,941,941,0,4,10,20510,9,MWR,-1.20596,-1.23407,1.02367 78 | 2004,Frances,6.00000,960,960,1,2,7,12620,9,MWR,-0.24194,-0.25310,0.41365 79 | 2004,Gaston,2.66667,985,985,0,1,8,170,9,MWR,-1.27482,1.03765,-0.54892 80 | 2004,Ivan,1.05556,946,946,0,3,25,18590,9,MWR,-1.77405,-0.97592,0.87522 81 | 2004,Jeanne,8.50000,950,950,1,3,5,10210,9,MWR,0.53272,-0.76940,0.22732 82 | 2005,Cindy,9.94444,991,991,1,1,1,350,8,MWR,0.98030,1.34743,-0.53500 83 | 2005,Dennis,2.44444,946,946,0,3,15,2650,8,MWR,-1.34368,-0.97592,-0.35718 84 | 2005,Ophelia,9.16667,982,982,1,1,1,91,8,MWR,0.73930,0.88276,-0.55503 85 | 2005,Rita,9.50000,937,937,1,3,62,10690,8,MWR,0.84258,-1.44059,0.26443 86 | 2005,Wilma,8.61111,950,950,1,3,5,25960,8,MWR,0.56715,-0.76940,1.44504 87 | 
2007,Humberto,2.38889,985,985,0,1,1,51,6,MWR,-1.36089,1.03765,-0.55812 88 | 2008,Dolly,9.83333,963,967,1,1,1,1110,5,MWR,0.94587,-0.09821,-0.47624 89 | 2008,Gustav,1.72222,951,954,0,2,52,4360,5,MWR,-1.56747,-0.71777,-0.22497 90 | 2008,Ike,1.88889,935,950,0,2,84,20370,5,MWR,-1.51583,-1.54384,1.01284 91 | 2011,Irene,9.27778,952,952,1,1,41,7110,2,MWR,0.77372,-0.66614,-0.01235 92 | 2012,Isaac,1.94444,965,966,0,1,5,24000,1,MWR,-1.49861,0.00505,1.29350 93 | 2012,Sandy,9.00000,945,942,1,2,159,75000,1,MWR,0.68765,-1.02755,5.23657 -------------------------------------------------------------------------------- /example/hurricane/boba_util.R: -------------------------------------------------------------------------------- 1 | # check if we support the model type 2 | # @param model The fitted model object 3 | is_supported <- function (model) { 4 | ms <- c('lm', 'lmerMod', 'negbin', 'aov') 5 | return(class(model)[1] %in% ms) 6 | } 7 | 8 | # get model predictions per data point 9 | # @param model The fitted model object 10 | # @param df The dataframe that the model will predict on 11 | pointwise_predict <- function (model, df) { 12 | if (!is_supported(model)) { 13 | stop(paste('Unsupported model type', class(model)[1])) 14 | } 15 | 16 | # fixme: lmerMod does not have se.fit 17 | pred <- predict(model, df, se.fit = TRUE, type = "response") 18 | disagg_fit <- df %>% 19 | mutate( 20 | fit = pred$fit, # inferential fits 21 | se.fit = pred$se.fit, # standard errors of predicted means 22 | df = df.residual(model), # residual degrees of freedom 23 | sigma = sigma(model), # residual standard deviation 24 | se.residual = sqrt(sum(residuals(model)^2) / df) # residual standard errors 25 | ) 26 | return(disagg_fit) 27 | } 28 | 29 | # split the train/test set in a k-fold cross validation 30 | # returns a dataframe with k rows (k is the num of folds) and two columns 31 | # - train: a list of training indices for the k-th fold 32 | # - test: a list of testing indices for the k-th fold 33 | # @param n The total number of rows 34 | cv_split <- function (n, folds = 5) { 35 | l = n %/% folds 36 | rest = n - folds * l 37 | 38 | lengths <- ifelse(1:folds <= rest, l + 1, l) 39 | f_sum <- function(x, n) sum(head(x,n)) 40 | indices <- lapply(1:folds, function (i) { 41 | i1 = f_sum(lengths, i - 1) + 1 42 | i2 = i1 + lengths[i] - 1 43 | 44 | if (i1 > 1) { 45 | if (i2+1 < n) { 46 | i_train = c(1:(i1-1), (i2+1):n) 47 | } else { 48 | i_train = 1:(i1-1) 49 | } 50 | } else { 51 | i_train = (i2+1):n 52 | } 53 | i_test = c(i1:i2) 54 | return(list(i_train, i_test)) 55 | }) 56 | 57 | indices <- as.data.frame(do.call(rbind, indices)) 58 | colnames(indices) <- c("train", "test") 59 | return(indices) 60 | } 61 | 62 | # perform k-fold cross validation 63 | # @param df The dataframe 64 | # @param model The fitted model 65 | # @param y The column name for the observed variable in df 66 | # @param folds The number of folds 67 | # @param func A function returning the fitted y vector from a model and a dataset 68 | cross_validation <- function (df, model, y, folds = 5, func = NULL) { 69 | mse = 0 70 | indices = cv_split(nrow(df), folds = folds) 71 | for (i in c(1:nrow(indices))) { 72 | d_train = df[indices$train[[i]], ] 73 | d_test = df[indices$test[[i]], ] 74 | 75 | m1 <- update(model, . 
~ ., data = d_train) 76 | if (!is.null(func)) { 77 | expected <- func(m1, d_test) 78 | } else { 79 | # fixme: lmerMod needs to set allow.new.levels = TRUE 80 | expected <- pointwise_predict(m1, d_test)$fit 81 | } 82 | 83 | mse = mse + sum((d_test[[y]] - expected)^2) 84 | } 85 | 86 | mse = sqrt(mse / nrow(df)) 87 | return(mse) 88 | } 89 | 90 | # marginalize model predictions 91 | # @param df The dataframe containing individual model fits 92 | # @param term The predictor of interest 93 | # @param y The value field to aggregate 94 | margins <- function (df, term, y = "fit") { 95 | expectation <- df %>% 96 | group_by(!! sym(term)) %>% # group by predictor(s) of interest 97 | summarize(expected = weighted.mean(!! sym(y))) %>% # marginalize across other predictors 98 | compare_levels(expected, by = !! sym(term)) %>% 99 | ungroup() 100 | return(expectation) 101 | } 102 | 103 | # get the sampling distribution 104 | # @param model The fitted model 105 | # @param term The predictor of interest 106 | # @param type Type of result (response or model coefficient) 107 | # @param draws The number of draws 108 | sampling_distribution <- function (model, term, type="coef", draws=200) { 109 | if (!is_supported(model)) { 110 | stop(paste('Unsupported model type', class(model)[1])) 111 | } 112 | ts = c('coef', 'coefficient', 'resp', 'response') 113 | if (!(type %in% ts)) { 114 | stop(paste('Unsupported type', type)) 115 | } 116 | 117 | if (type == "coef" || type == "coefficient") { 118 | uncertainty <- tidy(model, conf.int = TRUE) %>% 119 | filter(term == !! term) %>% 120 | mutate( 121 | df = df.residual(model), # get model degrees of freedom 122 | .draw = list(1:draws), # generate list of draw numbers 123 | t = map(df, ~rt(draws, .)) # simulate draws as t-scores 124 | ) %>% 125 | unnest(cols = c(".draw", "t")) %>% 126 | mutate(coef = t * std.error + estimate) 127 | } 128 | 129 | if (type == "resp" || type == "response") { 130 | # todo 131 | } 132 | 133 | return(uncertainty) 134 | } 135 | 136 | # permutation test to get the null distribution 137 | # @param df The dataframe 138 | # @param model The fitted model 139 | # @param terms A character vector of terms to be shuffled 140 | # @param func A function returning the point estimate from a model and a dataset 141 | # @param N The number of iterations 142 | permutation_test <- function (df, model, terms, func = NULL, N=200) { 143 | # ensure we have the same random samples across universe runs 144 | set.seed(3040) 145 | 146 | res = lapply(1:N, function (i) { 147 | # shuffle 148 | pm <- df[sample(nrow(df)), ] %>% 149 | dplyr::select(any_of(terms)) 150 | 151 | df2 = df %>% dplyr::select(-any_of(terms)) %>% 152 | bind_cols(pm) 153 | 154 | # fit the model 155 | m1 <- update(model, . 
~ ., data = df2) 156 | 157 | # point estimate 158 | if (!is.null(func)) { 159 | expected <- func(m1, df2) 160 | } else { 161 | # fixme: lmerMod need to set allow.new.levels = TRUE 162 | expected <- margins(pointwise_predict(m1, df2), terms[1])$expected 163 | } 164 | 165 | return(expected) 166 | }) 167 | 168 | # remove seed because set seed is global 169 | rm(.Random.seed, envir=.GlobalEnv) 170 | 171 | return(enframe(unlist(res))) 172 | } 173 | 174 | # get the pointwise log likelihood 175 | # @param model The fitted model 176 | # @param d_test The dataframe 177 | # @private 178 | compute_loglik <- function (model, d_test) { 179 | mu <- predict(model, d_test, type = "response") 180 | sigma <- sigma(model) 181 | y <- as.list(attr(terms(model), "variables"))[[2]] 182 | return(log(dnorm(d_test[[y]], mu, sigma)+1e-307)) 183 | } 184 | 185 | # get the pointwise log likelihood for stacking 186 | # @param df The dataframe 187 | # @param model The fitted model 188 | stacking <- function (df, model) { 189 | indices = cv_split(nrow(df), folds = 5) 190 | pointwise_density <- c() 191 | 192 | for (i in c(1:nrow(indices))) { 193 | d_train = df[indices$train[[i]], ] 194 | d_test = df[indices$test[[i]], ] 195 | 196 | m1 <- update(model, . ~ ., data = d_train) 197 | pointwise_density <- append(pointwise_density, compute_loglik(m1, d_test)) 198 | } 199 | 200 | return(pointwise_density) 201 | } 202 | -------------------------------------------------------------------------------- /boba/bobarun.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import pandas as pd 3 | import os 4 | import json 5 | import multiprocessing as mp 6 | from subprocess import PIPE 7 | from .lang import Lang 8 | from .wrangler import * 9 | 10 | 11 | class BobaRun: 12 | def __init__(self, folder, jobs=1, batch_size=0): 13 | # attributes 14 | self.folder = folder 15 | self.dir_log = os.path.join(folder, DIR_LOG) 16 | self.file_log = os.path.join(self.dir_log, 'logs.csv') 17 | self.pool = None 18 | self.exit_code = [] 19 | 20 | # read summary 21 | data = pd.read_csv(self.folder + '/summary.csv') 22 | self.size = data.shape[0] 23 | 24 | # multiprocessing attributes 25 | if jobs == 0: 26 | jobs = mp.cpu_count() 27 | if batch_size == 0: 28 | batch_size = min(int(self.size**0.5), int(self.size / jobs) + 1) 29 | self.jobs = jobs 30 | self.batch_size = batch_size 31 | 32 | # language 33 | fn = data['Filename'].to_list()[0] 34 | try: 35 | with open(self.folder + '/lang.json', 'r') as f: 36 | self.lang = Lang(fn, supported_langs=json.load(f)) 37 | except IOError: 38 | self.lang = Lang(fn) 39 | 40 | 41 | def run_multiverse(self, universes=[], resume=False): 42 | """ 43 | Run the multiverse. 
44 | 
45 |         Parameters:
46 |         - universes: a list of universe ids to run
47 |         - resume: skip log initialization and the pre-execute hook; the caller
48 |           must ensure that these steps have been done before calling
49 |         """
50 |         # do not allow simultaneous runs
51 |         if self.is_running():
52 |             return
53 | 
54 |         # initialize process pool
55 |         self.pool = mp.Pool(self.jobs)
56 | 
57 |         # by default, run all universes
58 |         if not len(universes):
59 |             universes = list(range(1, self.size + 1))
60 | 
61 |         if not resume:
62 |             # before execute
63 |             self.run_commands_in_folder('pre_exe.sh')
64 | 
65 |             # initialize the log folder and log file
66 |             self.exit_code = []
67 |             if os.path.exists(self.dir_log):
68 |                 shutil.rmtree(self.dir_log)
69 |             os.makedirs(self.dir_log)
70 | 
71 |             with open(self.file_log, 'w') as log:
72 |                 log.write('uid,exit_code\n')
73 | 
74 |         # callback that is run for each retrieved result.
75 |         # FIXME: if stopped, the last batch will not invoke the callback
76 |         def check_result(r):
77 |             self.exit_code += [[res[0], res[1]] for res in r]
78 |             # write the results to our logs
79 |             with open(self.file_log, 'a') as f_log:
80 |                 for res in r:
81 |                     f_log.write(f'{res[0]},{res[1]}\n')
82 | 
83 |         # run each batch of universes as a separate task
84 |         while len(universes):
85 |             batch = []
86 |             while len(universes) and len(batch) < self.batch_size:
87 |                 u = get_universe_script(universes.pop(0), self.lang.get_ext())
88 |                 batch.append(u)
89 | 
90 |             self.pool.apply_async(run_batch_of_universes,
91 |                                   args=(self.folder, batch, self.lang.supported_langs),
92 |                                   callback=check_result)
93 | 
94 |         # collect all the results
95 |         self.pool.close()
96 |         self.pool.join()
97 | 
98 |         # after execute
99 |         self.run_commands_in_folder('post_exe.sh')
100 |         self.pool = None
101 | 
102 | 
103 |     def resume_multiverse(self, universes=[]):
104 |         """
105 |         Resume the multiverse by skipping scripts that have already run in the
106 |         universe list.
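
        Parameters:
        - universes: a list of universe ids to run; defaults to all universes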
107 |         """
108 |         # if the log file is missing, run everything
109 |         if not os.path.exists(self.file_log):
110 |             return self.run_multiverse(universes)
111 | 
112 |         # default argument
113 |         if not len(universes):
114 |             universes = list(range(1, self.size + 1))
115 | 
116 |         # recover previous progress from log file
117 |         df = pd.read_csv(self.file_log)
118 |         self.exit_code = df.values.tolist()
119 | 
120 |         # skip scripts that are already run
121 |         lookup = set(df['uid'].tolist())
122 |         universes = [u for u in universes if u not in lookup]
123 |         self.run_multiverse(universes, resume=True)
124 | 
125 | 
126 |     def stop(self):
127 |         """ Stop all outstanding work in the pool """
128 |         if self.pool is not None:
129 |             print('Terminating')
130 |             # stop all workers
131 |             # note that everything after pool.join() will still run
132 |             self.pool.terminate()
133 | 
134 | 
135 |     def is_running(self):
136 |         """ Whether the multiverse is currently running """
137 |         return self.pool is not None
138 | 
139 | 
140 |     def run_from_cli(self, run_all=True, num=1, thru=-1):
141 |         """ Entry point of the boba run CLI """
142 |         # get the id of all the universes we want to run
143 |         thru = num if thru == -1 else thru
144 |         start = 1 if run_all else num
145 |         end = self.size if run_all else thru
146 |         universes = list(range(start, end + 1))
147 | 
148 |         # run
149 |         self.run_multiverse(universes)
150 | 
151 | 
152 |     def run_commands_in_folder(self, file_with_commands):
153 |         """ Run each shell command in the given file, from the multiverse folder """
154 |         cwd = os.getcwd()
155 |         os.chdir(self.folder)
156 |         with open(file_with_commands) as f:
157 |             for line in f.readlines():
158 |                 os.system(line)
159 |         os.chdir(cwd)
160 | 
161 | 
162 |     def run_after_execute(self):
163 |         self.run_commands_in_folder('post_exe.sh')
164 | 
165 | 
166 | # these two functions can't be in the class because multiprocessing
167 | # does not know how to properly serialize functions in classes
168 | def run_batch_of_universes(folder, universes, supported_langs):
169 |     """ Run a batch of universes """
170 |     batch = []
171 |     for universe in universes:
172 |         batch.append(run_universe(folder, universe, supported_langs))
173 | 
174 |     return batch
175 | 
176 | 
177 | def run_universe(folder, script, supported_langs):
178 |     """ Run one universe """
179 |     cmds = Lang(script, supported_langs=supported_langs).get_cmd()
180 | 
181 |     universe_id = get_universe_id_from_script(script)
182 |     universe_name_fmt = '[' + get_universe_name(universe_id) + ']'
183 |     for cmd in cmds:
184 |         out = subprocess.Popen(cmd, cwd=os.path.join(folder, DIR_SCRIPT),
185 |                                stdout=PIPE, stderr=PIPE)
186 | 
187 |         log_dir = os.path.join(folder, DIR_LOG)
188 |         with open(os.path.join(log_dir, get_universe_log(universe_id)), 'w') as log:
189 |             while True:
190 |                 # blocks here until the next line is available
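                # an empty read plus a finished process (poll() not None) means EOF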
191 |                 output = out.stdout.readline().decode('utf-8')
192 |                 if output == '' and out.poll() is not None:
193 |                     break
194 |                 if output:
195 |                     print(universe_name_fmt + " " + output, end='')
196 |                     log.write(output)
197 |             rc = out.poll()
198 | 
199 |         err = out.communicate()[1]
200 |         err_decoded = err.decode('utf-8')
201 |         if err_decoded != '':
202 |             with open(os.path.join(log_dir, get_universe_error_log(universe_id)), 'w') as err_log:
203 |                 err_log.write(err_decoded)
204 | 
205 |             print(universe_name_fmt + ' error:\n' + err_decoded, end='')
206 |             break
207 | 
208 |     return universe_id, out.returncode
209 | 
--------------------------------------------------------------------------------
/example/hurricane/template.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | # --- (BOBA_CONFIG)
3 | {
4 |   "decisions": [
5 |     {"var": "outliers", "options": [
6 |       "c()",
7 |       "c('Katrina')",
8 |       "c('Katrina', 'Audrey')",
9 |       "c('Katrina', 'Audrey', 'Sandy')",
10 |       "c('Katrina', 'Audrey', 'Sandy', 'Andrew')",
11 |       "c('Katrina', 'Audrey', 'Sandy', 'Andrew', 'Donna')"
12 |     ]},
13 |     {"var": "feminity", "options": ["female", "masfem"]},
14 |     {"var": "feminity_prediction_levels", "options": ["c(0, 1)", "c(2.53, 8.29)"]},
15 |     {"var": "damage", "options": ["dam", "log_dam"]},
16 |     {"var": "predictors", "options": [
17 |       "feminity * damage",
18 |       "feminity + damage + pressure + feminity:damage + feminity:pressure",
19 |       "feminity + damage + zwin + feminity:damage + feminity:zwin",
20 |       "feminity + damage + zcat + feminity:damage + feminity:zcat",
21 |       "feminity + damage + z3 + feminity:damage + feminity:z3",
22 |       "feminity + damage + z3"
23 |     ]},
24 |     {"var": "covariates", "options": [
25 |       "",
26 |       "+ year:damage",
27 |       "+ post:damage"
28 |     ]},
29 |     {"var": "back_transform", "options": [
30 |       "exp(mu + sigma^2/2) - 1",
31 |       "mu",
32 |       "exp(mu + sigma^2/2) - 1"
33 |     ]},
34 |     {"var": "df", "options": [
35 |       "pred$df",
36 |       "df.residual(model)",
37 |       "pred$df"
38 |     ]}
39 |   ],
40 |   "constraints": [
41 |     {"link": ["feminity", "feminity_prediction_levels"]},
42 |     {"link": ["Model", "back_transform", "df"]}
43 |   ],
44 |   "before_execute": "cp ../data.csv ./ && rm -rf results && mkdir results",
45 |   "visualizer": "visualizer_config.json"
46 | }
47 | # --- (END)
48 | 
49 | library(readr)
50 | library(MASS)
51 | library(modelr)
52 | library(tidyverse)
53 | library(broom.mixed)
54 | library(tidybayes)
55 | 
56 | # a function for post-processing predicted means and standard deviations into expected number of deaths
57 | pred2expectation <- function(mu, sigma) {
58 |   return({{back_transform}})
59 | }
60 | 
61 | # a custom function for cross validation
62 | cross <- function (df, func, fml, folds = 5) {
63 |   l = nrow(df) %/% folds
64 |   mse = 0
65 |   for (i in c(1:folds)) {
66 |     # properly splitting train/test
67 |     i1 = l*(i-1)+1
68 |     i2 = l*i
69 |     d_test = df[i1:i2, ]
70 |     if (i1 > 1) {
71 |       if (i2+1 < nrow(df)) {
72 |         d_train = rbind(df[1:(i1-1), ], df[(i2+1):nrow(df), ])
73 |       } else {
74 |         d_train = df[1:(i1-1), ]
75 |       }
76 |     } else {
77 |       d_train = df[(i2+1):nrow(df), ]
78 |     }
79 | 
80 |     model <- func(fml, data = d_train)
81 |     mu <- predict(model, d_test, type = "response")
82 |     sigma <- sigma(model)
83 |     expected_deaths <- pred2expectation(mu, sigma)
84 | 
85 |     mse = mse + sum((d_test$death - expected_deaths)^2)
86 |   }
87 | 
88 |   mse = sqrt(mse / nrow(df))
89 |   return(mse)
90 | }
91 | 
92 | # read and process data
93 | df <- read_csv('../data.csv',
94 |   col_types = cols(
95 |     Year = col_integer(),
96 |     Category = col_integer(),
97 |     Gender_MF = col_integer(),
98 |     alldeaths = col_integer()
99 |   )) %>%
100 |   # rename some variables
101 |   dplyr::select(
102 |     year = Year,
103 |     name = Name,
104 |     dam = NDAM,
105 |     death = alldeaths,
106 |     female = Gender_MF,
107 |     masfem = MasFem,
108 |     category = Category,
109 |     pressure = Minpressure_Updated_2014,
110 |     wind = HighestWindSpeed
111 |   ) %>%
112 |   # create new variables
113 |   mutate(
114 |     log_death = log(death + 1),
115 |     log_dam = log(dam),
116 |     post = ifelse(year>1979, 1, 0),
117 |     zdam = scale(dam),
118 |     zcat = as.numeric(scale(category)),
119 |     zmin = -scale(pressure),
120 |     zwin = as.numeric(scale(wind)),
121 |     z3 = as.numeric((zmin + zcat + zwin) / 3)
122 |   ) %>%
123 |   # remove outliers
124 |   filter(!(name %in% {{outliers}})) %>%
125 |   # operationalize feminity
126 |   mutate(
127 |     feminity = {{feminity}},
128 |     damage = {{damage}}
129 |   )
130 | 
131 | # --- (Model) ols_regression
132 | # OLS regression with log(deaths+1) as the dependent variable
133 | model <- lm(log_death ~ {{predictors}} {{covariates}}, data = df)
134 | fit = cross(df, lm, log_death ~ {{predictors}} {{covariates}}) # cross validation
135 | 
136 | # --- (Model) negative_binomial
137 | # Negative binomial with deaths as the dependent variable
138 | model <- glm.nb(death ~ {{predictors}} {{covariates}}, data = df)
139 | fit = cross(df, glm.nb, death ~ {{predictors}} {{covariates}}) # cross validation
140 | 
141 | # --- (Model) anova
142 | # ANOVA with log(deaths+1) as the dependent variable
143 | model <- aov(log_death ~ {{predictors}} {{covariates}}, data = df)
144 | fit = cross(df, aov, log_death ~ {{predictors}} {{covariates}}) # cross validation
145 | 
146 | # --- (O)
147 | # normalize RMSE
148 | nrmse = fit / (max(df$death) - min(df$death))
149 | 
150 | # get prediction
151 | pred <- predict(model, se.fit = TRUE, type = "response")
152 | disagg_fit <- df %>%
153 |   mutate(
154 |     fit = pred$fit, # add fitted predictions and standard errors to dataframe
155 |     se.fit = pred$se.fit,
156 |     df = {{df}}, # get degrees of freedom
157 |     sigma = sigma(model), # get residual standard deviation
158 |     se.residual = sqrt(sum(residuals(model)^2) / df) # get residual standard errors
159 |   )
160 | 
161 | # aggregate fitted effect of female storm name
162 | expectation <- disagg_fit %>%
163 |   mutate(expected_deaths = pred2expectation(fit, sigma)) %>%
164 |   group_by(female) %>% # group by predictor(s) of interest
165 |   summarize(expected_deaths = weighted.mean(expected_deaths)) %>% # marginalize across other predictors
166 |   compare_levels(expected_deaths, by = female) %>%
167 |   ungroup() %>%
168 |   dplyr::select(expected_diff = expected_deaths) %>%
169 |   add_column(NRMSE = nrmse) # add cross validation metric
170 | 
171 | # propagate uncertainty in fit to model predictions
172 | uncertainty <- disagg_fit %>%
173 |   mutate(
174 |     .draw = list(1:5000), # generate list of draw numbers
175 |     t = map(df, ~rt(5000, .)), # simulate draws from t distribution to transform into means
176 |     x = map(df, ~rchisq(5000, .)) # simulate draws from chi-squared distribution to transform into sigmas
177 |   ) %>%
178 |   unnest(cols = c(".draw", "t", "x")) %>%
179 |   mutate(
180 |     mu = t * se.fit + fit, # scale and shift t to get a sampling distribution of means
181 |     sigma = sqrt(df * se.residual^2 / x), # scale and take inverse of x to get a sampling distribution of sigmas
182 |     expected_deaths = pred2expectation(mu, sigma)
183 |   ) %>%
184 |   group_by(.draw, female) %>% # group by predictor(s) of interest
185 |   summarize(expected_deaths = mean(expected_deaths)) %>% # marginalize across other predictors
186 |   compare_levels(expected_deaths, by = female) %>%
187 |   ungroup() %>%
188 |   dplyr::select(expected_diff = expected_deaths)
189 | 
190 | # only output relevant fields in disagg_fit
191 | disagg_fit <- disagg_fit %>%
192 |   mutate(expected_deaths = pred2expectation(fit, sigma)) %>%
193 |   dplyr::select(
194 |     observed = death,
195 |     expected = expected_deaths
196 |   )
197 | 
198 | # output
199 | write_csv(expectation, '../results/estimate_{{_n}}.csv')
200 | write_csv(disagg_fit, '../results/disagg_fit_{{_n}}.csv')
201 | write_csv(uncertainty, '../results/uncertainty_{{_n}}.csv')
202 | 
--------------------------------------------------------------------------------
/test/test_constraint_parser.py:
--------------------------------------------------------------------------------
1 | # Ugly hack to allow import from the root folder
2 | import sys
3 | import os
4 | sys.path.insert(0, os.path.abspath('..'))
5 | 
6 | import unittest
7 | from boba.constraintparser import ConstraintParser, ParseError
8 | from boba.conditionparser import ConditionParser, TokenType
9 | from boba.parser import Parser
10 | 
11 | 
12 | def abs_path(rel_path):
13 |     return os.path.join(os.path.dirname(__file__), rel_path)
14 | 
15 | 
16 | def read_wrapper(spec, ps):
17 |     ConstraintParser(spec).read_constraints(ps.code_parser, ps.dec_parser)
18 | 
19 | 
20 | class TestConstraintParser(unittest.TestCase):
21 | 
22 |     def test_read_json(self):
23 |         base = abs_path('./specs/')
24 |         ps = Parser(base+'script3-1.py')
25 |         cp = ConstraintParser(ps.spec)
26 |         cs = cp.read_constraints(ps.code_parser, ps.dec_parser)
27 |         self.assertEqual(len(cs), 2)
28 | 
29 |     def test_link(self):
30 |         base = abs_path('./specs/')
31 |         ps = Parser(base + 'script3-7.py')
32 |         cp = ConstraintParser(ps.spec)
33 |         cs = cp.read_constraints(ps.code_parser, ps.dec_parser)
34 |         self.assertEqual(len(cs), 10)
35 | 
36 |     def test_condition_parser(self):
37 |         cond = ''
38 |         ConditionParser(cond).parse()
39 | 
40 |         cond = 'a == b'
41 |         _, decs = ConditionParser(cond).parse()
42 |         self.assertListEqual(['a', 'b'], [d.value for d in decs])
43 | 
44 |         cond = 'a.index == 1'
45 |         _, decs = ConditionParser(cond).parse()
46 |         self.assertListEqual(['a', '1'], [d.value for d in decs])
47 |         self.assertListEqual([TokenType.index_var, TokenType.number],
48 |                              [d.type for d in decs])
49 | 
50 |         cond = 'a = 2.5'
51 |         _, decs = ConditionParser(cond).parse()
52 |         self.assertListEqual(['a', '2.5'], [d.value for d in decs])
53 |         self.assertListEqual([TokenType.var, TokenType.number],
54 |                              [d.type for d in decs])
55 | 
56 |         cond = 'a.index == b.index'  # .index not allowed on RHS, should fail
57 |         with self.assertRaises(ParseError):
58 |             ConditionParser(cond).parse()
59 | 
60 |         cond = '1 2 a b 4'  # we do not check other semantics, so this still parses
61 |         ConditionParser(cond).parse()
62 | 
63 |     def test_eval(self):
64 |         """ Evaluation of various conditions """
65 |         # expr and expr
66 |         base = abs_path('./specs/')
67 |         ps = Parser(base + 'script3-6.py', base)
68 |         ps.main(verbose=False)
69 |         self.assertEqual(ps.wrangler.counter, 2)
70 | 
71 |         # expr or expr
72 |         ps.spec['constraints'] = [{"block": "D", "condition": "a == if or B == b1"}]
73 |         ps._parse_constraints()
74 |         ps.main(verbose=False)
75 |         self.assertEqual(ps.wrangler.counter, 6)
76 | 
77 |         # expr and (expr or expr)
78 |         ps.spec['constraints'] = [{"block": "D", "condition": "a == if and (B == b1 or B == b2)"}]
79 |         ps._parse_constraints()
80 |         ps.main(verbose=False)
81 |         self.assertEqual(ps.wrangler.counter, 4)
82 | 
83 |         # testing !=
84 |         ps.spec['constraints'] = [{"block": "D", "condition": "a != if"}]
85 |         ps._parse_constraints()
86 |         ps.main(verbose=False)
87 |         self.assertEqual(ps.wrangler.counter, 4)
88 | 
89 |         # testing >=
90 |         ps.spec['constraints'] = [{"block": "D", "condition": "a.index >= 1"}]
91 |         ps._parse_constraints()
92 |         ps.main(verbose=False)
93 |         self.assertEqual(ps.wrangler.counter, 4)
94 | 
95 |         # testing index
96 |         ps.spec['constraints'] = [{"block": "D", "condition": "b.index == 1"}]
97 |         ps._parse_constraints()
98 |         ps.main(verbose=False)
99 |         self.assertEqual(ps.wrangler.counter, 4)
100 | 
101 |         # testing option with integer type
102 |         ps.spec['constraints'] = [{"block": "D", "condition": "b == 0"}]
103 |         ps._parse_constraints()
104 |         ps.main(verbose=False)
105 |         self.assertEqual(ps.wrangler.counter, 4)
106 | 
107 |         # testing option with float type
108 |         ps.spec['constraints'] = [{"block": "D", "condition": "b == 1.5"}]
109 |         ps._parse_constraints()
110 |         ps.main(verbose=False)
111 |         self.assertEqual(ps.wrangler.counter, 4)
112 | 
113 |         # testing unmade decision
114 |         ps.spec['constraints'] = [{"block": "A", "condition": "b.index == 0"}]
115 |         ps._parse_constraints()
116 |         ps.main(verbose=False)
117 |         self.assertEqual(ps.wrangler.counter, 0)
118 | 
119 |         # testing if the decision is made when the block depends on a variable
120 |         # inside the block
121 |         ps.spec['constraints'] = [{"block": "B", "condition": "b.index == 0"}]
122 |         ps._parse_constraints()
123 |         ps.main(verbose=False)
124 |         self.assertEqual(ps.wrangler.counter, 0)
125 | 
126 |     def test_condition_syntax(self):
127 |         """ Does the condition code contain a Python syntax error? """
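        # conditions with bad syntax should raise ParseError when the spec is read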
""" 128 | 129 | base = abs_path('./specs/') 130 | ps = Parser(base+'script3-1.py', base) 131 | 132 | spec = {'constraints': [{'block': 'A', 'condition': 'B=b1'}]} 133 | with self.assertRaises(ParseError): 134 | read_wrapper(spec, ps) 135 | 136 | spec = {'constraints': [{'block': 'A', 'condition': 'B b1'}]} 137 | with self.assertRaises(ParseError): 138 | read_wrapper(spec, ps) 139 | 140 | spec = {'constraints': [{'block': 'A', 'condition': 'B == 2.5'}]} 141 | read_wrapper(spec, ps) 142 | 143 | def test_json_syntax(self): 144 | """ Test various possibilities to specify constraints in JSON """ 145 | 146 | base = abs_path('./specs/') 147 | ps = Parser(base+'script3-1.py', base) 148 | 149 | # empty - should parse 150 | spec = {} 151 | read_wrapper(spec, ps) 152 | 153 | # empty array - should parse 154 | spec = {'constraints': []} 155 | read_wrapper(spec, ps) 156 | 157 | # empty element - should fail 158 | spec = {'constraints': [{}]} 159 | with self.assertRaises(ParseError): 160 | read_wrapper(spec, ps) 161 | 162 | # no matching block - should fail 163 | spec = {'constraints': [{'block': 'a'}]} 164 | with self.assertRaises(ParseError): 165 | read_wrapper(spec, ps) 166 | 167 | # no matching variable - should fail 168 | spec = {'constraints': [{'variable': 'c'}]} 169 | with self.assertRaises(ParseError): 170 | read_wrapper(spec, ps) 171 | 172 | # loner option - should fail 173 | spec = {'constraints': [{'option': 'a1'}]} 174 | with self.assertRaises(ParseError): 175 | read_wrapper(spec, ps) 176 | 177 | # loner block - should parse 178 | spec = {'constraints': [{'block': 'A', 'condition': 'B==b1'}]} 179 | read_wrapper(spec, ps) 180 | 181 | # block and option - should parse 182 | spec = {'constraints': [{'block': 'A', 'option': 'a1', 'condition': 'B==b1'}]} 183 | read_wrapper(spec, ps) 184 | 185 | # variable and option - should parse 186 | spec = {'constraints': [{'variable': 'a', 'option': '2.5', 'condition': 'B==b1'}]} 187 | read_wrapper(spec, ps) 188 | 189 | # weird option - should parse 190 | # fixme: {'option': '[1,2]'} will fail 191 | spec = {'constraints': [{'variable': 'c', 'option': '[1, 2]', 'condition': 'B==b1'}]} 192 | read_wrapper(spec, ps) 193 | 194 | # variables in condition do not match - should fail 195 | spec = {'constraints': [{'block': 'A', 'condition': 'H==b1'}]} 196 | with self.assertRaises(ParseError): 197 | read_wrapper(spec, ps) 198 | 199 | # variables in condition do not match - should fail 200 | spec = {'constraints': [{'block': 'A', 'condition': 'H.index==1'}]} 201 | with self.assertRaises(ParseError): 202 | read_wrapper(spec, ps) 203 | 204 | 205 | if __name__ == '__main__': 206 | unittest.main() 207 | -------------------------------------------------------------------------------- /tutorial/simple.md: -------------------------------------------------------------------------------- 1 | # Getting started 2 | 3 | In this tutorial, we will walk you through a simple analysis scenario to 4 | demonstrate how you might write and execute multiverse using our tool. 5 | 6 | ### A simple analysis script 7 | 8 | Let's say we have the following analysis script that reads a data file, removes 9 | outliers, and fits a linear model. 
10 | 
11 | ```python
12 | import pandas as pd
13 | import numpy as np
14 | import statsmodels.api as sm
15 | 
16 | if __name__ == '__main__':
17 |     # read data file
18 |     df = pd.read_csv('data.csv')
19 | 
20 |     # remove outliers
21 |     # discard rows outside 2 x std
22 |     df = df[np.abs(df.y - df.y.mean()) <= (2 * df.y.std())]
23 | 
24 |     # fit a simple ordinary least squares model
25 |     x = sm.add_constant(df.x)
26 |     lm = sm.OLS(df.y, x).fit()
27 | ```
28 | 
29 | ### Placeholder variable
30 | 
31 | Suppose the threshold for removing outliers is pretty subjective; you can
32 | justify removing data points outside 2, 2.5, or 3 standard deviations of the
33 | mean. Would the prediction change if you adopted a different threshold? To test
34 | this, you might insert a decision point and ask the tool to output a
35 | separate script for each possible threshold configuration. To insert a decision,
36 | first add a placeholder variable `{{var_name}}` to the above code:
37 | 
38 | ```python
39 | df = df[np.abs(df.y - df.y.mean()) <= ({{cutoff}} * df.y.std())]
40 | ```
41 | 
42 | Then, in a separate JSON file, you could list the possible options this
43 | placeholder variable can take on:
44 | 
45 | ```json
46 | {
47 |   "decisions": [
48 |     {"var": "cutoff", "options": [2, 2.5, 3] }
49 |   ]
50 | }
51 | ```
52 | 
53 | Now, calling the tool with the file path to your script and the JSON file will
54 | output three Python scripts. Each script is a universe where you choose a different cutoff
55 | value for removing outliers; for example, one of the universes is exactly the
56 | same as the analysis script we started with. The tool also outputs a summary
57 | table to let you know which parameter value is used by which file:
58 | 
59 | |Filename     |Code Path|cutoff|
60 | |-------------|---------|------|
61 | |universe_1.py|_start   |2     |
62 | |universe_2.py|_start   |2.5   |
63 | |universe_3.py|_start   |3     |
64 | 
65 | (The table contains an unfamiliar column "Code Path", which we will explain in
66 | a minute!)
67 | 
68 | If you specify multiple decisions, we will output **all combinations** of
69 | possible alternatives. Namely, the number of output scripts will be the
70 | cross-product of the number of options for each decision; for example, two decisions with three and two options respectively yield 3 x 2 = 6 scripts.
71 | 
72 | ### Code blocks
73 | 
74 | Your decision point can be more complex than replacing the value of a variable.
75 | For example, instead of removing data points outside some standard deviations
76 | of the mean, it is also reasonable to remove data points outside some IQRs of
77 | the median.
78 | 
79 | ```python
80 | iqr = np.subtract(*np.percentile(df.y, [75, 25]))
81 | median = np.median(df.y)
82 | df = df[abs(df.y - median) <= 3 * iqr]
83 | ```
84 | As you can see, this alternative requires a few lines to implement; it is no
85 | longer a straightforward value substitution. You can of course write the entire
86 | block of code as a string into the options array, but that would be really
87 | cumbersome.
88 | 
89 | You might instead consider using code blocks. Instead of a
90 | linear flow from start to end, your code can consist of blocks, similar to
91 | cells in a Jupyter notebook or an R Markdown document. To specify a code block, simply insert
92 | a comment line with the syntax `# --- (ID) option` immediately
93 | before the starting line of the block. The lines of code between this
94 | declaration and the next (or the end of the file) form a block
95 | named `ID`. We will go ahead and insert three such comments into
96 | our script:
97 | 
98 | ```python
99 | import pandas as pd
100 | import numpy as np
101 | import statsmodels.api as sm
102 | 
103 | if __name__ == '__main__':
104 |     # read data file
105 |     df = pd.read_csv('../data.csv')
106 | 
107 |     # --- (A) std
108 |     # remove outliers based on std
109 |     df = df[np.abs(df.y - df.y.mean()) <= ({{cutoff}} * df.y.std())]
110 | 
111 |     # --- (A) iqr
112 |     # remove outliers based on iqr
113 |     iqr = np.subtract(*np.percentile(df.y, [75, 25]))
114 |     median = np.median(df.y)
115 |     df = df[abs(df.y - median) <= 3 * iqr]
116 | 
117 |     # --- (B)
118 |     # fit a simple ordinary least squares model
119 |     x = sm.add_constant(df.x)
120 |     lm = sm.OLS(df.y, x).fit()
121 | 
122 |     # display results
123 |     print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
124 |     print('AIC: {:.2f}'.format(lm.aic))
125 |     print('Cohen\'s f2: {:.3f}'.format(lm.rsquared_adj))
126 | ```
127 | 
128 | These three comments break the code into **four** blocks. All lines before
129 | `# --- (A) std` belong to the first, unnamed block. All lines between `# --- (A) std`
130 | and `# --- (A) iqr` belong to block "A" with option "std". All lines between `# --- (A) iqr` and `# --- (B)` belong to block "A" with option "iqr". All lines between
131 | `# --- (B)` and the end of the file belong to block "B".
132 | 
133 | Note that we have two types of blocks: some blocks, such as `(A) std` and
134 | `(A) iqr`, specify an *option* after the parenthesis.
135 | Such blocks are called *decision blocks*; the same ID can take on different
136 | options, not unlike a placeholder variable. Other blocks, such as `(B)`, are
137 | normal blocks that do not act like a decision point.
138 | 
139 | We now need to tell boba the relationship between the blocks.
140 | We want to remove outliers before fitting the model, so the order of the blocks
141 | should be A followed by B. Note that while A has two options `std` and `iqr`,
142 | we only use `A` in the graph and boba will choose different options in
143 | different universes. Let's specify the relationship of the blocks as a directed
144 | graph in the JSON file:
145 | 
146 | ```json
147 | {
148 |   "graph": ["A->B"],
149 |   "decisions": [
150 |     {"var": "cutoff", "options": [2, 2.5, 3] }
151 |   ]
152 | }
153 | ```
154 | The graph is optional, with the default being a linear path of all the blocks
155 | according to their order in the template script. In this example, we could
156 | omit the graph and still get the same result.
157 | 
158 | Now, calling the program with our updated script and JSON will generate 4
159 | universes where the following values and code paths are chosen:
160 | 
161 | |Filename     |Code Path   |cutoff|(A)|
162 | |-------------|------------|------|---|
163 | |universe_1.py|_start->A->B|2     |std|
164 | |universe_2.py|_start->A->B|2.5   |std|
165 | |universe_3.py|_start->A->B|3     |std|
166 | |universe_4.py|_start->A->B|      |iqr|
167 | 
168 | Since we did not use the parameter `cutoff` in our outlier removal code
169 | involving IQR, our multiverse does not expand the parameter `cutoff` when
170 | block `A` takes the option `iqr`.
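
To make this concrete, here is roughly what the `iqr` universe
(`universe_4.py` in the table above) would contain. This is a sketch we
assembled by hand from the template; the script boba actually generates may
differ slightly in formatting:

```python
import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('../data.csv')

    # remove outliers based on iqr
    iqr = np.subtract(*np.percentile(df.y, [75, 25]))
    median = np.median(df.y)
    df = df[abs(df.y - median) <= 3 * iqr]

    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s f2: {:.3f}'.format(lm.rsquared_adj))
```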
If we change the code in IQR to be:
171 | 
172 | ```python
173 | df = df[abs(df.y - median) <= {{cutoff}} * iqr]
174 | ```
175 | 
176 | We will get 6 universes:
177 | 
178 | |Filename     |Code Path   |cutoff|(A)|
179 | |-------------|------------|------|---|
180 | |universe_1.py|_start->A->B|2     |iqr|
181 | |universe_2.py|_start->A->B|2.5   |iqr|
182 | |universe_3.py|_start->A->B|3     |iqr|
183 | |universe_4.py|_start->A->B|2     |std|
184 | |universe_5.py|_start->A->B|2.5   |std|
185 | |universe_6.py|_start->A->B|3     |std|
186 | 
187 | Take a look at the generated Python scripts
188 | [here](https://github.com/uwdata/boba/tree/master/example/simple/output/code).
189 | 
190 | (You may notice that all code paths in the above table are the same. In a more
191 | complex analysis, we might produce differing code paths by creating
192 | branches in the directed graph. We will cover
193 | advanced usage of the graph in a later tutorial.)
194 | 
195 | ### Executing the multiverse
196 | After you are happy with the generated scripts, you might want to execute them
197 | all to compute the results. Boba has a command for executing universes:
198 | 
199 | ```bash
200 | boba run --all
201 | ```
202 | It will run **all** the scripts for you! Before you do this, you might want
203 | to run one script, or simply look at a few scripts, to ensure that
204 | the generated code does not have syntax errors, etc. To run a selected range
205 | of universes, for example universes 1 through 3, do:
206 | 
207 | ```bash
208 | boba run 1 --thru 3
209 | ```
210 | 
211 | ### Try it yourself!
212 | 
213 | The code and data of this example are available [here](https://github.com/uwdata/boba/tree/master/example/simple).
214 | To run the example, clone this repository and run the following commands:
215 | 
216 | ```bash
217 | pip install -e .
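# then install boba's runtime dependencies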
218 | pip install -r requirements.txt
219 | cd example/simple
220 | boba compile
221 | ```
--------------------------------------------------------------------------------
/example/hurricane/reproduce/repro_bootstrap.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | # Use bootstrapping to get uncertainty distribution
3 | # Issue: majority of glm.nb models fail to fit on bootstrapped data
4 | # --- (BOBA_CONFIG)
5 | {
6 |   "decisions": [
7 |     {"var": "outliers", "options": [
8 |       "c()",
9 |       "c('Katrina')",
10 |       "c('Katrina', 'Audrey')"
11 |     ]},
12 |     {"var": "leverage_points", "options": [
13 |       "c()",
14 |       "c('Sandy')",
15 |       "c('Sandy', 'Andrew')",
16 |       "c('Sandy', 'Andrew', 'Donna')"
17 |     ]},
18 |     {"var": "feminity", "options": ["female", "masfem"]},
19 |     {"var": "damage", "options": ["dam", "log_dam"]},
20 |     {"var": "predictors", "options": [
21 |       "feminity * damage",
22 |       "feminity + damage + pressure + feminity:damage + feminity:pressure",
23 |       "feminity + damage + zwin + feminity:damage + feminity:zwin",
24 |       "feminity + damage + zcat + feminity:damage + feminity:zcat",
25 |       "feminity + damage + z3 + feminity:damage + feminity:z3",
26 |       "feminity + damage + z3"
27 |     ]},
28 |     {"var": "covariates", "options": [
29 |       "",
30 |       "+ year:damage",
31 |       "+ post:damage"
32 |     ]},
33 |     {"var": "back_transform", "options": [
34 |       "exp(mu + sigma^2/2) - 1",
35 |       "mu"
36 |     ]},
37 |     {"var": "model_prefix", "options": ["lm(log_death", "glm.nb(death"]},
38 |     {"var": "df", "options": [
39 |       "pred$df",
40 |       "df.residual(model)"
41 |     ]}
42 |   ],
43 |   "constraints": [
44 |     {"link": ["Model", "back_transform", "df", "model_prefix"]}
45 |   ],
46 |   "before_execute": "cp ../data.csv ./ && rm -rf results && mkdir results"
47 | }
48 | # --- (END)
49 | 
50 | library(readr)
51 | library(MASS)
52 | library(modelr)
53 | library(tidyverse)
54 | library(broom.mixed)
55 | library(tidybayes)
56 | library(boot)
57 | 
58 | # a function for post-processing predicted means and standard deviations into expected number of deaths
59 | pred2expectation <- function(mu, sigma) {
60 |   return({{back_transform}})
61 | }
62 | 
63 | # a custom function for cross validation
64 | cross <- function (df, func, fml, folds = 5) {
65 |   l = nrow(df) %/% folds
66 |   mse = 0
67 |   for (i in c(1:folds)) {
68 |     # properly splitting train/test
69 |     i1 = l*(i-1)+1
70 |     i2 = l*i
71 |     d_test = df[i1:i2, ]
72 |     if (i1 > 1) {
73 |       if (i2+1 < nrow(df)) {
74 |         d_train = rbind(df[1:(i1-1), ], df[(i2+1):nrow(df), ])
75 |       } else {
76 |         d_train = df[1:(i1-1), ]
77 |       }
78 |     } else {
79 |       d_train = df[(i2+1):nrow(df), ]
80 |     }
81 | 
82 |     model <- func(fml, data = d_train)
83 |     mu <- predict(model, d_test, type = "response")
84 |     sigma <- sigma(model)
85 |     expected_deaths <- pred2expectation(mu, sigma)
86 | 
87 |     mse = mse + sum((d_test$death - expected_deaths)^2)
88 |   }
89 | 
90 |   mse = sqrt(mse / nrow(df))
91 |   return(mse)
92 | }
93 | 
94 | # read and process data
95 | df <- read_csv('../data.csv',
96 |   col_types = cols(
97 |     Year = col_integer(),
98 |     Category = col_integer(),
99 |     Gender_MF = col_integer(),
100 |     alldeaths = col_integer()
101 |   )) %>%
102 |   # rename some variables
103 |   dplyr::select(
104 |     year = Year,
105 |     name = Name,
106 |     dam = NDAM,
107 |     death = alldeaths,
108 |     female = Gender_MF,
109 |     masfem = MasFem,
110 |     category = Category,
111 |     pressure = Minpressure_Updated_2014,
112 |     wind = HighestWindSpeed
113 |   ) %>%
114 |   # create new variables
115 |   mutate(
116 |     log_death = log(death + 1),
117 |     log_dam = log(dam),
118 |     post = ifelse(year>1979, 1, 0),
119 |     zdam = scale(dam),
120 |     zcat = as.numeric(scale(category)),
121 |     zmin = -scale(pressure),
122 |     zwin = as.numeric(scale(wind)),
123 |     z3 = as.numeric((zmin + zcat + zwin) / 3)
124 |   ) %>%
125 |   # remove outliers
126 |   filter(!(name %in% {{outliers}})) %>%
127 |   filter(!(name %in% {{leverage_points}})) %>%
128 |   # operationalize feminity
129 |   mutate(
130 |     feminity = {{feminity}},
131 |     damage = {{damage}}
132 |   )
133 | 
134 | # --- (Model) ols_regression
135 | # OLS regression with log(deaths+1) as the dependent variable
136 | model <- lm(log_death ~ {{predictors}} {{covariates}}, data = df)
137 | fit = cross(df, lm, log_death ~ {{predictors}} {{covariates}}) # cross validation
138 | 
139 | # --- (Model) negative_binomial
140 | # Negative binomial with deaths as the dependent variable
141 | model <- glm.nb(death ~ {{predictors}} {{covariates}}, data = df)
142 | fit = cross(df, glm.nb, death ~ {{predictors}} {{covariates}}) # cross validation
143 | 
144 | # --- (O)
145 | # normalize RMSE
146 | nrmse = fit / (max(df$death) - min(df$death))
147 | 
148 | # get prediction
149 | pred <- predict(model, se.fit = TRUE, type = "response")
150 | disagg_fit <- df %>%
151 |   mutate(
152 |     fit = pred$fit, # add inferential fits and standard errors to dataframe
153 |     se.fit = pred$se.fit,
154 |     df = {{df}}, # get degrees of freedom
155 |     sigma = sigma(model), # get residual standard deviation
156 |     se.residual = sqrt(sum(residuals(model)^2) / df) # get residual standard errors
157 |   )
158 | 
159 | # aggregate fitted effect of female storm name
160 | expectation <- disagg_fit %>%
161 |   mutate(expected_deaths = pred2expectation(fit, sigma)) %>%
162 |   group_by(female) %>% # group by predictor(s) of interest
163 |   summarize(expected_deaths = weighted.mean(expected_deaths)) %>% # marginalize across other predictors
164 |   compare_levels(expected_deaths, by = female) %>%
165 |   ungroup() %>%
166 |   dplyr::select(expected_diff = expected_deaths) %>%
167 |   add_column(NRMSE = nrmse) # add cross validation metric
168 | 
169 | # bootstrap function
170 | rsq <- function(data, indices) {
171 |   d <- data[indices,] # allows boot to select sample
172 |   model <- {{model_prefix}} ~ {{predictors}} {{covariates}} , data = d)
173 |   pred <- predict(model, se.fit = TRUE, type = "response")
174 |   disagg_fit <- d %>%
175 |     mutate(
176 |       fit = pred$fit, # add inferential fits and standard errors to dataframe
177 |       sigma = sigma(model) # get residual standard deviation
178 |     )
179 |   expectation <- disagg_fit %>%
180 |     mutate(expected_deaths = pred2expectation(fit, sigma)) %>%
181 |     group_by(female) %>% # group by predictor(s) of interest
182 |     summarize(expected_deaths = weighted.mean(expected_deaths)) %>% # marginalize across other predictors
183 |     compare_levels(expected_deaths, by = female) %>%
184 |     ungroup()
185 |   return(expectation$expected_deaths)
186 | }
187 | 
188 | # bootstrap
189 | bootstrap <- boot(data=df, statistic=rsq, R=200)
190 | bootstrap <- tidy(bootstrap$t) %>%
191 |   select(expected_diff=x)
192 | 
193 | # propagate uncertainty in fit to model predictions
194 | uncertainty <- disagg_fit %>%
195 |   mutate(
196 |     .draw = list(1:200), # generate list of draw numbers
197 |     t = map(df, ~rt(200, .)), # simulate draws from t distribution to transform into means
198 |     x = map(df, ~rchisq(200, .)) # simulate draws from chi-squared distribution to transform into sigmas
199 |   ) %>%
200 |   unnest(cols = c(".draw", "t", "x")) %>%
201 |   mutate(
202 |     mu = t * se.fit + fit, # scale and shift t to get a sampling distribution of means
203 |     sigma = sqrt(df * se.residual^2 / x), # scale and take inverse of x to get a sampling distribution of sigmas
204 |     expected_deaths = pred2expectation(mu, sigma)
205 |   ) %>%
206 |   group_by(.draw, female) %>% # group by predictor(s) of interest
207 |   summarize(expected_deaths = mean(expected_deaths)) %>% # marginalize across other predictors
208 |   compare_levels(expected_deaths, by = female) %>%
209 |   ungroup() %>%
210 |   dplyr::select(expected_diff = expected_deaths)
211 | 
212 | # only output relevant fields in disagg_fit
213 | disagg_fit <- disagg_fit %>%
214 |   mutate(expected_deaths = pred2expectation(fit, sigma)) %>%
215 |   dplyr::select(
216 |     observed = death,
217 |     expected = expected_deaths
218 |   )
219 | 
220 | # visualize
221 | library(ggplot2)
222 | ggsave(qplot(expected_diff, data=bootstrap, geom="histogram"), file='../results/bootstrap_{{_n}}.pdf')
223 | ggsave(qplot(expected_diff, data=uncertainty, geom="histogram"), file='../results/uncertainty_{{_n}}.pdf')
224 | 
225 | # output
226 | write_csv(expectation, '../results/estimate_{{_n}}.csv')
227 | write_csv(disagg_fit, '../results/disagg_fit_{{_n}}.csv')
228 | write_csv(uncertainty, '../results/uncertainty_{{_n}}.csv')
229 | write_csv(bootstrap, '../results/bootstrap_{{_n}}.csv')
--------------------------------------------------------------------------------
/boba/adg.py:
--------------------------------------------------------------------------------
1 | 
2 | class ADG:
3 |     """ For creating the analysis decision graph (ADG). """
4 |     def __init__(self):
5 |         self.nodes = set()
6 |         self.edges = {}
7 |         self.proc_edges = {}
8 | 
9 |         self._graph_nodes = set()
10 |         self._graph_edges = {}
11 |         self._links = []  # linked decisions
12 |         self._constraint_proc = set()  # procedural deps from constraints
13 |         self._decs = set()  # all decisions
14 | 
15 |     @staticmethod
16 |     def _convert_edges(edges):
17 |         d = {}
18 |         for e in edges:
19 |             ADG._add_edge(d, e.start, e.end)
20 |         return d
21 | 
22 |     @staticmethod
23 |     def _add_edge(res, start, end):
24 |         if start not in res:
25 |             res[start] = [end]
26 |         elif end not in res[start]:
27 |             res[start].append(end)
28 | 
29 |     @staticmethod
30 |     def _all_ending_nodes(edges):
31 |         """ nodes that have at least one incoming edge(s) """
32 |         flat = [item for lst in edges.values() for item in lst]
33 |         return set(flat)
34 | 
35 |     @staticmethod
36 |     def _get_source(nodes, edges):
37 |         """ nodes that have no incoming edges """
38 |         return nodes.difference(ADG._all_ending_nodes(edges))
39 | 
40 |     @staticmethod
41 |     def _get_target(nodes, edges):
42 |         """ nodes that have no outgoing edges """
43 |         return nodes.difference(set(edges.keys()))
44 | 
45 |     @staticmethod
46 |     def _group_by(lst, func):
47 |         res = {}
48 |         for item in lst:
49 |             k = func(item)
50 |             ADG._add_edge(res, k, item)
51 |         return res
52 | 
53 |     @staticmethod
54 |     def _bn(name):
55 |         """ Get the block decision name """
56 |         return name.split('-')[0].split(':')[0]
57 | 
58 |     def _merge_one(self, prev, cur):
59 |         groups = ADG._group_by(cur, ADG._bn)
60 |         if prev:
61 |             self.nodes.add(prev)
62 |             for k in groups.keys():
63 |                 ADG._add_edge(self.edges, prev, k)
64 |                 self.nodes.add(k)
65 |         return groups
66 | 
67 |     def _merge(self):
68 |         """ Merge alternatives """
69 |         src = ADG._get_source(self._graph_nodes, self._graph_edges)
70 |         groups = self._merge_one(None, src)
71 |         nds = list(groups.keys())
72 |         i = 0
73 |         while len(nds):
74 |             nd = nds.pop()
75 | 
76 |             # look up the alternatives, then restore the correct node id
77 |             alts = groups[nd]
78 |             nd = nd.split('-')[0]
79 | 
80 |             # find the children of all alts of this node and perform merge
81 |             cur = [self._graph_edges[n] for n in alts if n in self._graph_edges]
82 |             cur = [item for sublist in cur for item in sublist]
83 |             # print(nd, set(cur))
84 |             gp = self._merge_one(nd, set(cur))
85 | 
86 |             # if the child node is already in groups, give it a different id
87 |             for g in gp.copy():
88 |                 val = gp[g]
89 |                 key = '{}-{}'.format(g, i) if g in groups else g
90 |                 i += 1 if g in groups else 0
91 |                 del gp[g]
92 |                 gp[key] = val
93 | 
94 |             # update the loop
95 |             groups.update(gp)
96 |             nds.extend(gp.keys())
97 | 
98 |         # print(self.nodes, self.edges)
99 | 
100 |     def _prune_recur(self, node, nodes, edges):
101 |         """ Recursive helper for prune. Make sure the graph has no cycles! """
102 |         # leaf node
103 |         if node not in self.edges:
104 |             return [node] if node in self._decs else None
105 | 
106 |         clean = []
107 |         # recursively prune children
108 |         for nd in self.edges[node]:
109 |             ret = self._prune_recur(nd, nodes, edges)
110 |             if ret:
111 |                 clean.extend(ret)
112 |             elif len(self.edges[node]) > 1:  # preserve branches
113 |                 clean.append(nd)
114 | 
115 |         # skip if not decision, else add to edges
116 |         if node in self._decs:
117 |             nodes.update(clean)
118 |             nodes.add(node)
119 |             for nd in clean:
120 |                 ADG._add_edge(edges, node, nd)
121 |             return [node]
122 |         else:
123 |             return clean
124 | 
125 |     def _prune(self):
126 |         """ Remove non-decision nodes """
127 |         edges = {}
128 |         nodes = set()
129 |         src = ADG._get_source(self.nodes, self.edges)
130 |         for s in src:
131 |             self._prune_recur(s, nodes, edges)
132 | 
133 |         # replace nodes and edges
134 |         self.nodes = nodes
135 |         self.edges = edges
136 | 
137 |     def _get_linked_vars(self, blocks):
138 |         """ Get linked placeholders """
139 |         bd = set([blocks[b].parameter for b in blocks if blocks[b].parameter])
140 |         res = set()
141 |         for l in self._links:
142 |             bls = [b for b in l if b in bd]
143 |             if len(bls):
144 |                 # skip all vars if they are linked with blocks
145 |                 res.update(set(l).difference(set(bls)))
146 |             else:
147 |                 # otherwise, skip all vars except the first
148 |                 res.update(l[1:])
149 |         return res
150 | 
151 |     def set_graph(self, nodes, edges):
152 |         """ Set code graph """
153 |         self._graph_nodes = nodes
154 |         self._graph_edges = ADG._convert_edges(edges)
155 | 
156 |     def set_constraints(self, links, proc):
157 |         """ Save the intermediate data from constraint parser """
158 |         self._constraint_proc = proc
159 |         self._links = links
160 | 
161 |     def create(self, blocks):
162 |         """ Create the ADG """
163 |         # abort if ADG has already been created
164 |         if len(self.nodes):
165 |             return
166 | 
167 |         # add placeholder vars to the code graph
168 |         decs = []
169 |         for bl in blocks:
170 |             # get the variables associated with a block
171 |             vs = [chunk.variable for chunk in blocks[bl].chunks
172 |                   if chunk.variable != '']
173 |             decs.extend(vs)
174 | 
175 |             # remove linked vars
176 |             linked = self._get_linked_vars(blocks)
177 |             vs = [v for v in vs if v not in linked]
178 | 
179 |             # remove duplicates within this block, preserving order
180 |             # (a dict's keys are unique and keep insertion order)
181 |             tmp = dict.fromkeys(vs)
182 |             vs = list(tmp)
183 | 
184 |             # skip variables that have appeared in previous blocks
185 |             # fixme
186 |             gp = ADG._group_by(self._graph_nodes, ADG._bn)
187 |             vs = [v for v in vs if v not in gp
188 |                   or gp[v][0].split('-')[1].split(':')[0] == ADG._bn(bl)]
189 | 
190 |             # name the placeholders differently as distinct nodes for now
191 |             vs = ['{}-{}'.format(v, bl) for v in vs]
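            # e.g. a placeholder 'cutoff' in block 'A' becomes the node 'cutoff-A'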
192 |             self._graph_nodes.update(vs)
193 | 
194 |             # move children of block to the last var
195 |             vs = [bl] + vs
196 |             last = vs[len(vs) - 1]
197 |             if bl in self._graph_edges:
198 |                 temp = self._graph_edges[bl]
199 |                 self._graph_edges[bl] = []
200 |                 self._graph_edges[last] = temp
201 | 
202 |             # add edges between vars
203 |             for i in range(len(vs) - 1):
204 |                 ADG._add_edge(self._graph_edges, vs[i], vs[i + 1])
205 | 
206 |         # save all decisions, including placeholders and decision blocks
207 |         bd = set([blocks[b].parameter for b in blocks if blocks[b].parameter])
208 |         self._decs = set(decs).union(bd)
209 | 
210 |         # infer ADG from the graph
211 |         self._merge()
212 |         self._prune()
213 | 
214 |         # any branch should be a procedural branch
215 |         for s in self.edges:
216 |             t = self.edges[s]
217 |             if len(t) > 1:
218 |                 self.proc_edges[s] = t
219 |         # add the procedural deps from constraint
220 |         for proc in self._constraint_proc:
221 |             s = proc.split('-')[0]
222 |             e = proc.split('-')[1]
223 |             ADG._add_edge(self.proc_edges, s, e)
224 | 
225 |         # todo: remove linked blocks if they don't have procedural branches
226 | 
227 |     def get_used_decs(self):
228 |         """ Get the decisions that are used in the ADG """
229 |         return [n for n in self.nodes if n in self._decs]
230 | 
231 |     def output(self):
232 |         """ Output the graph object as JSON for the server """
233 |         nodes = []
234 |         edges = []
235 | 
236 |         # nodes
237 |         i = 0
238 |         lookup = {}
239 |         for n in self.nodes:
240 |             nodes.append({"id": i, "name": n})
241 |             lookup[n] = i
242 |             i += 1
243 | 
244 |         # first add procedural edges
245 |         done = set()
246 |         for s in self.proc_edges:
247 |             ts = self.proc_edges[s]
248 |             for t in ts:
249 |                 done.add('{}->{}'.format(s, t))
250 |                 edges.append({"source": lookup[s], "target": lookup[t],
251 |                               "type": "procedural"})
252 | 
253 |         # add order edges, skip those already added
254 |         for s in self.edges:
255 |             ts = self.edges[s]
256 |             for t in ts:
257 |                 if '{}->{}'.format(s, t) not in done:
258 |                     edges.append({"source": lookup[s], "target": lookup[t],
259 |                                   "type": "order"})
260 | 
261 |         return {"graph": {"nodes": nodes, "edges": edges}}
262 | 
--------------------------------------------------------------------------------