├── boba
    ├── output
    │   ├── __init__.py
    │   └── csvmerger.py
    ├── __init__.py
    ├── util.py
    ├── baseparser.py
    ├── lang.py
    ├── graphanalyzer.py
    ├── conditionparser.py
    ├── graphparser.py
    ├── cli.py
    ├── blocksyntaxparser.py
    ├── codeparser.py
    ├── wrangler.py
    ├── bobarun.py
    └── adg.py
├── example
    ├── simple
    │   ├── output
    │   │   ├── post_exe.sh
    │   │   ├── pre_exe.sh
    │   │   ├── lang.json
    │   │   ├── summary.csv
    │   │   ├── code
    │   │   │   ├── universe_4.py
    │   │   │   ├── universe_5.py
    │   │   │   ├── universe_6.py
    │   │   │   ├── universe_1.py
    │   │   │   ├── universe_2.py
    │   │   │   └── universe_3.py
    │   │   └── overview.json
    │   ├── script.py
    │   ├── gen_data.py
    │   ├── template.py
    │   └── data.csv
    ├── mortgage
    │   ├── after_execute.sh
    │   ├── visualizer_config.json
    │   ├── visualizer_config_monitor.json
    │   └── template.R
    ├── hurricane
    │   ├── after_execute.sh
    │   ├── install.R
    │   ├── data_wrangling
    │   │   ├── debug_count.py
    │   │   ├── wrangle.py
    │   │   └── data_jung.csv
    │   ├── stacking_weights.R
    │   ├── visualizer_config.json
    │   ├── README.md
    │   ├── data.csv
    │   ├── reproduce
    │   │   ├── repro_marginalize.R
    │   │   └── repro_bootstrap.R
    │   ├── repro.R
    │   ├── boba_util.R
    │   └── template.R
    ├── fertility_r
    │   ├── spec.json
    │   └── template.R
    ├── simple_cont
    │   ├── script.py
    │   ├── gen_data.py
    │   ├── template.py
    │   └── data.csv
    ├── reading
    │   ├── r
    │   │   ├── brms_test.R
    │   │   ├── install.R
    │   │   └── template.R
    │   ├── script.r
    │   └── python
    │       ├── script.py
    │       └── template.py
    └── fertility
        ├── script.py
        └── template.py
├── MANIFEST.in
├── test
    ├── __init__.py
    ├── specs
    │   ├── script-no-graph-empty.py
    │   ├── spec-good.json
    │   ├── script-no-graph.py
    │   ├── script1.py
    │   ├── script2.py
    │   ├── script2-dup.py
    │   ├── script2-syntax.py
    │   ├── script-inline-constraints.py
    │   ├── script1-bad-graph.py
    │   ├── script1-cyclic-graph.py
    │   ├── script2-dup-var.py
    │   ├── script1-good.py
    │   ├── script2-block-param.py
    │   ├── script3-2.py
    │   ├── script4-3.py
    │   ├── script3-7.py
    │   ├── script3-5.py
    │   ├── script3-6.py
    │   ├── script3-3.py
    │   ├── script4-2.py
    │   ├── script3-4.py
    │   ├── script4-1.py
    │   ├── script3-1.py
    │   ├── continuous-err.json
    │   └── continuous.json
    ├── test_c
    │   ├── lang.json
    │   └── template.c
    ├── test_lang.py
    ├── test_graph_parser.py
    ├── test_block_syntax_parser.py
    ├── test_graph_analyzer.py
    └── test_constraint_parser.py
├── requirements.txt
├── deploy.sh
├── requirements_dev.txt
├── .travis.yml
├── tox.ini
├── setup.cfg
├── .gitignore
├── setup.py
├── LICENSE
├── HISTORY.rst
├── tutorial
    ├── cli.rst
    └── simple.md
└── README.rst

/boba/output/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/example/simple/output/post_exe.sh:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/example/simple/output/pre_exe.sh:
--------------------------------------------------------------------------------
cp ../data.csv ./code/
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include README.rst
include LICENSE
include HISTORY.rst
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

"""Unit test package for boba."""
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
scikit-learn>=0.21.2
scipy>=1.6.0
six>=1.12.0
statsmodels>=0.12.2
--------------------------------------------------------------------------------
/deploy.sh:
--------------------------------------------------------------------------------
rm -rf boba.egg-info/
rm -rf build/
rm -rf dist/
python3 setup.py sdist bdist_wheel
--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------
pip>=19.2.3
bumpversion>=0.5.3
wheel>=0.32.1
tox>=3.14.0
twine>=1.12.1
--------------------------------------------------------------------------------
/test/specs/script-no-graph-empty.py:
--------------------------------------------------------------------------------
if __name__ == '__main__':
    a = {{a}}
    b = a * 2
    print(b)
--------------------------------------------------------------------------------
/example/simple/output/lang.json:
--------------------------------------------------------------------------------
{"python": {"ext": ["py"], "run": ["python", "{{script_name}}"]}, "r": {"ext": ["R", "r"], "run": ["Rscript", "{{script_name}}"]}}
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
dist: xenial
language: python
python:
  - "3.6"
  - "3.7"
install:
  - pip install -U tox-travis
script:
  - tox
--------------------------------------------------------------------------------
/example/mortgage/after_execute.sh:
--------------------------------------------------------------------------------
cd ./multiverse
boba merge estimate_{}.csv -b ./results --out estimate.csv
boba merge uncertainty_{}.csv -b ./results --out uncertainty.csv
--------------------------------------------------------------------------------
/boba/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

"""Author and execute multiverse analysis"""

__author__ = """Yang Liu"""
__email__ = 'yliu0@uw.edu'
__version__ = '1.1.2'
--------------------------------------------------------------------------------
/test/specs/spec-good.json:
--------------------------------------------------------------------------------
{
  "graph": ["A->B->C"],
  "decisions": [
    {"var": "a", "options": [2, 2.5, 3], "desc": "outlier" },
    {"var": "b", "options": [0, 1] }
  ]
}
--------------------------------------------------------------------------------
/test/test_c/lang.json:
--------------------------------------------------------------------------------
{
  "c" : {
    "ext" : ["c"],
    "compile" : ["gcc", "-o", "{{universe_name}}", "{{script_name}}"],
    "run" : ["./{{universe_name}}"]
  }
}
--------------------------------------------------------------------------------
/test/specs/script-no-graph.py:
--------------------------------------------------------------------------------
# --- (BOBA_CONFIG)
{
  "decisions": [
    {"var": "a", "options": [1]}
  ]
}
# --- (END)
if __name__ == '__main__':
    a = {{a}}
    b = a * 2
    print(b)
--------------------------------------------------------------------------------
/test/specs/script1.py:
--------------------------------------------------------------------------------
# --- (A)
if __name__ == '__main__':
    a = 1
    b = 2

    # --- (B)
    b = b + 2 * a

    if b > 1:
        # --- (C)
        b = -b
    else:
        b = 2 * b
--------------------------------------------------------------------------------
/test/specs/script2.py:
--------------------------------------------------------------------------------
if __name__ == '__main__':
    a = 1
    b = 2

    # --- (A)
    b = b + 2 * a

    if b > 1:
        # --- (B)
        b = -b
    # --- (C)
    else:
        b = 2 * b
--------------------------------------------------------------------------------
/example/hurricane/after_execute.sh:
--------------------------------------------------------------------------------
cd ./multiverse
boba merge estimate_{}.csv -b ./results --out estimate.csv
boba merge uncertainty_{}.csv -b ./results --out uncertainty.csv
boba merge null_{}.csv -b ./results --out null.csv

Rscript stacking_weights.R
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
[tox]
envlist = py36, py37
recreate = True

[travis]
python =
    3.7: py37
    3.6: py36

[testenv]
setenv =
    PYTHONPATH = {toxinidir}

commands =
    python -m unittest discover test
--------------------------------------------------------------------------------
/test/test_c/template.c:
--------------------------------------------------------------------------------
# --- (BOBA_CONFIG)
{
  "lang": "lang.json"
}
# --- (END)

#include <stdio.h>
int main() {
    printf("hello from universe ");
    printf("%d", {{id=1,2,3}});
    printf("\n");
    return 0;
}
--------------------------------------------------------------------------------
/example/simple/output/summary.csv:
--------------------------------------------------------------------------------
Filename,Code Path,cutoff,A
universe_1.py,_start->A->B,2,iqr
universe_2.py,_start->A->B,2.5,iqr
universe_3.py,_start->A->B,3,iqr
universe_4.py,_start->A->B,2,std
universe_5.py,_start->A->B,2.5,std
universe_6.py,_start->A->B,3,std
--------------------------------------------------------------------------------
/test/specs/script2-dup.py:
--------------------------------------------------------------------------------
""" This script should fail to parse due to a duplicated block id """

if __name__ == '__main__':
    a = 1
    b = 2

    # --- (A)
    b = b + 2 * a

    if b > 1:
        # --- (A)
        b = -b
    # --- (C)
    else:
        b = 2 * b
--------------------------------------------------------------------------------
/test/specs/script2-syntax.py:
--------------------------------------------------------------------------------
""" This script will fail to parse due to invalid block definition syntax """

if __name__ == '__main__':
    a = 1
    b = 2

    # --- A
    b = b + 2 * a

    if b > 1:
        # --- B
        b = -b
    # --- C
    else:
        b = 2 * b
--------------------------------------------------------------------------------
/test/specs/script-inline-constraints.py:
--------------------------------------------------------------------------------
""" Test inline constraints """

if __name__ == '__main__':
    # --- (A) a1
    a = 1

    # --- (A) a2
    a = 2

    # --- (B) b1 @if A == a1
    b = 1

    # --- (B) b2 @if A == a2
    b = 2

    # --- (C)
    print(a * b)
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[bumpversion]
current_version = 1.1.2
commit = False
tag = False

[bumpversion:file:setup.py]
search = version='{current_version}'
replace = version='{new_version}'

[bumpversion:file:boba/__init__.py]
search = __version__ = '{current_version}'
replace = __version__ = '{new_version}'
--------------------------------------------------------------------------------
/test/specs/script1-bad-graph.py:
--------------------------------------------------------------------------------
# --- (BOBA_CONFIG)
{
  "graph": ["A->B->C", "A->"],
  "decisions": {}
}

# --- (A)
if __name__ == '__main__':
    a = 1
    b = 2

    # --- (B)
    b = b + 2 * a

    if b > 1:
        # --- (C)
        b = -b
    else:
        b = 2 * b
--------------------------------------------------------------------------------
/test/specs/script1-cyclic-graph.py:
--------------------------------------------------------------------------------
# --- (BOBA_CONFIG)
{
  "graph": ["A->B->C->A"],
  "decisions": {}
}

# --- (A)
if __name__ == '__main__':
    a = 1
    b = 2

    # --- (B)
    b = b + 2 * a

    if b > 1:
        # --- (C)
        b = -b
    else:
        b = 2 * b
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea/
__pycache__
*.pyc
multiverse/
durante_etal_2013_study2.txt
MTurk_ratings_femeninity_of_hurricanes.csv
data_updated.csv
multiverse_analysis.R
amk-notes.txt
venv/
env/
prototype/

# example
example/sampling

# packaging
*.egg-info/
build/
dist/

.tox/
--------------------------------------------------------------------------------
/test/specs/script2-dup-var.py:
--------------------------------------------------------------------------------
""" Should fail to parse because a block and a variable
have the same name."""

# --- (BOBA_CONFIG)
{"decisions": [
  {"var": "a", "options": [1, 2]}
]}
# --- (END)

if __name__ == '__main__':
    # --- (a)
    a = {{a}}

    # --- (b) b1
    b = 1

    # --- (b) b2
    b = 2

    # --- (c)
    print(a * b)
--------------------------------------------------------------------------------
/boba/util.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-


class Colors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


def print_fail(msg):
    print(Colors.FAIL + msg + Colors.ENDC)

def print_warn(msg):
    print(Colors.WARNING + msg + Colors.ENDC)
--------------------------------------------------------------------------------
/test/specs/script1-good.py:
--------------------------------------------------------------------------------
# --- (BOBA_CONFIG)
{
  "graph": ["A->B->C"],
  "decisions": [
    {"var": "a", "options": [2, 2.5, 3], "desc": "outlier" },
    {"var": "b", "options": [0, 1] }
  ]
}

# --- (A)
if __name__ == '__main__':
    a = 1
    b = 2

    # --- (B)
    b = b + 2 * a

    if b > 1:
        # --- (C)
        b = -b
    else:
        b = 2 * b
--------------------------------------------------------------------------------
/test/specs/script2-block-param.py:
--------------------------------------------------------------------------------
""" Test the block-level parameter syntax """

# --- (BOBA_CONFIG)
{"decisions": [
  {"var": "b", "options": [1, 2]}
]}
# --- (END)

if __name__ == '__main__':
    # --- (A) a1
    a = {{b}}

    # --- (A) a2
    a = 2

    # --- (B) b1
    b = 1

    # --- (B) b2
    b = 2

    # --- (B) b3
    b = 3

    # --- (C)
    print(a * b)
--------------------------------------------------------------------------------
/test/specs/script3-2.py:
--------------------------------------------------------------------------------
""" Test constraints """

# --- (BOBA_CONFIG)
{
  "decisions": [
    {"var": "b", "options": [0, 1]}
  ],
  "constraints": [
    {"block": "B", "condition": "A == a2"}
  ]
}
# --- (END)

if __name__ == '__main__':
    # --- (A) a1
    a = {{b}}

    # --- (A) a2
    a = 2

    # --- (B) b1
    b = 1

    # --- (B) b2
    b = 2

    # --- (B) b3
    b = 3

    # --- (C)
    print(a * b)
--------------------------------------------------------------------------------
/example/hurricane/install.R:
--------------------------------------------------------------------------------
# create user library if it does not exist
repo = "http://cran.us.r-project.org"
lib = Sys.getenv("R_LIBS_USER")
dir.create(lib)

# install required packages
if(!require(readr)) install.packages("readr", lib, repos=repo)
if(!require(MASS)) install.packages("MASS", lib, repos=repo)
if(!require(tidyverse)) install.packages("tidyverse", lib, repos=repo)
if(!require(broom.mixed)) install.packages("broom.mixed", lib, repos=repo)
if(!require(caret)) install.packages("caret", lib, repos=repo)
--------------------------------------------------------------------------------
/example/hurricane/data_wrangling/debug_count.py:
--------------------------------------------------------------------------------
# a helper script to count which universes are missing
# for debug purposes

import os

TOTAL = 864

fs = []
for f in os.listdir(os.path.join(os.getcwd(), 'multiverse/results/')):
    name, ext = os.path.splitext(f)
    if ext == '.txt':
        fs.append(int(name.split('_')[1]))

fs.sort()

j = 0
res = []
for i in range(TOTAL):
    if i != fs[j] - 1:
        res.append(i + 1)
    else:
        j += 1

print(res)
--------------------------------------------------------------------------------
/test/specs/script4-3.py:
--------------------------------------------------------------------------------
""" Test ADG and linked decisions """

# --- (BOBA_CONFIG)
{
  "decisions": [
    {"var": "a", "options": [0, 1] },
    {"var": "b", "options": ["0", "1"]}
  ],
  "constraints": [
    {"link": ["a", "b"]}
  ]
}
# --- (END)

if __name__ == '__main__':
    # --- (A)
    a = {{a}}

    # --- (B) b1
    b = 1 + {{b}}

    # --- (B) b2
    b = 2 + {{b}}

    # --- (C)
    print(a * b)

    # --- (D)
    print(a + b)
--------------------------------------------------------------------------------
/example/fertility_r/spec.json:
--------------------------------------------------------------------------------
{
  "decisions": [
    {"var": "fertility_bounds", "options": [
      "c(7, 14, 17, 25, 17, 25)",
      "c(6, 14, 17, 27, 17, 27)",
      "c(9, 17, 18, 25, 18, 25)",
      "c(8, 14, 1, 7, 15, 28)",
      "c(9, 17, 1, 8, 18, 28)"
    ]},
    {"var": "relationship_bounds", "options": [
      "c(2, 3)", "c(1, 2)", "c(1, 3)"
    ]}
  ],
  "outputs": [
    {"name": "p-value", "value": "summar$coefficients[4, 4]"}
  ],
  "before_execute": "cp ../durante_etal_2013_study1.txt ./code/"
}
--------------------------------------------------------------------------------
/test/specs/script3-7.py:
--------------------------------------------------------------------------------
""" Test constraints """

# --- (BOBA_CONFIG)
{
  "decisions": [
    {"var": "a", "options": [0, 1, 2, 3, 4] },
    {"var": "b", "options": ["0", "1", "2", "3", "4"]}
  ],
  "constraints": [
    {"link": ["a", "b"]}
  ]
}
# --- (END)

if __name__ == '__main__':
    # --- (A)
    a = {{a}}

    # --- (B) b1
    b = 1 + {{b}}

    # --- (B) b2
    b = 2 + {{b}}

    # --- (C)
    print(a * b)

    # --- (D)
    print(a + b)
--------------------------------------------------------------------------------
/test/specs/script3-5.py:
--------------------------------------------------------------------------------
""" Test constraints """

# --- (BOBA_CONFIG)
{
  "decisions": [
    {"var": "a", "options": ["if", "else"]},
    {"var": "b", "options": [0, 1] }
  ],
  "constraints": [
    {"block": "C", "skippable": true, "condition": "a == if"}
  ]
}
# --- (END)

if __name__ == '__main__':
    # --- (A)
    a = {{a}}

    # --- (B) b1
    b = 1 + {{b}}

    # --- (B) b2
    b = 2 + {{b}}

    # --- (C)
    print(a * b)

    # --- (D)
    print(a + b)
--------------------------------------------------------------------------------
/test/specs/script3-6.py:
--------------------------------------------------------------------------------
""" Test constraints """

# --- (BOBA_CONFIG)
{
  "graph": ["A->B->C->D"],
  "decisions": [
    {"var": "a", "options": ["if", "else"]},
    {"var": "b", "options": [0, 1.5] }
  ],
  "constraints": [
    {"block": "D", "condition": "a == if and B == b1"}
  ]
}
# --- (END)

if __name__ == '__main__':
    # --- (A)
    a = {{a}}

    # --- (B) b1
    b = 1 + {{b}}

    # --- (B) b2
    b = 2 + {{b}}

    # --- (C)
    print(a * b)

    # --- (D)
    print(a + b)
--------------------------------------------------------------------------------
/test/specs/script3-3.py:
--------------------------------------------------------------------------------
""" Test constraints """

# --- (BOBA_CONFIG)
{
  "graph": ["A->B->C", "B->D"],
  "decisions": [
    {"var": "a", "options": ["if", "else"]},
    {"var": "b", "options": [0, 1] }
  ],
  "constraints": [
    {"block": "C", "condition": "B == b1"},
    {"block": "D", "condition": "B == b2"}
  ]
}
# --- (END)

if __name__ == '__main__':
    # --- (A)
    a = {{a}}

    # --- (B) b1
    b = 1 + {{b}}

    # --- (B) b2
    b = 2 + {{b}}

    # --- (C)
    print(a * b)

    # --- (D)
    print(a + b)
--------------------------------------------------------------------------------
/test/specs/script4-2.py:
--------------------------------------------------------------------------------
""" Test ADG and code graph """

# --- (BOBA_CONFIG)
{
  "graph": ["A->B->C", "B->D"],
  "decisions": [
    {"var": "a", "options": [2, 2.5, 3], "desc": "outlier" },
    {"var": "b", "options": [0, 1] },
    {"var": "c", "options": [[1, 2], [3, 4]]}
  ]
}
# --- (END)

if __name__ == '__main__':
    # --- (A) a1
    a = {{a}}

    # --- (A) a2
    a = {{a}}

    # --- (B) b1
    b = {{b}}

    # --- (B) b2
    b = 2

    # --- (B) b3
    b = 3

    # --- (C)
    print(a * b)

    # --- (D)
--------------------------------------------------------------------------------
/test/specs/script3-4.py:
--------------------------------------------------------------------------------
""" Test constraints """

# --- (BOBA_CONFIG)
{
  "graph": ["A->B->C->D"],
  "decisions": [
    {"var": "a", "options": ["if", "else"]},
    {"var": "b", "options": [0, 1] }
  ],
  "constraints": [
    {"variable": "b", "option": 1, "condition": "a.index == 0"},
    {"variable": "b", "option": 0, "condition": "a == else"}
  ]
}
# --- (END)

if __name__ == '__main__':
    # --- (A)
    a = {{a}}

    # --- (B) b1
    b = 1 + {{b}}

    # --- (B) b2
    b = 2 + {{b}}

    # --- (C)
    print(a * b)

    # --- (D)
    print(a + b)
--------------------------------------------------------------------------------
/example/simple/output/code/universe_4.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # remove outliers based on std
    df = df[np.abs(df.y - df.y.mean()) <= (2 * df.y.std())]

    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s F2: {:.3f}'.format(lm.rsquared_adj))
--------------------------------------------------------------------------------
/example/simple/output/code/universe_5.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # remove outliers based on std
    df = df[np.abs(df.y - df.y.mean()) <= (2.5 * df.y.std())]

    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s F2: {:.3f}'.format(lm.rsquared_adj))
--------------------------------------------------------------------------------
/example/simple/output/code/universe_6.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # remove outliers based on std
    df = df[np.abs(df.y - df.y.mean()) <= (3 * df.y.std())]

    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s F2: {:.3f}'.format(lm.rsquared_adj))
--------------------------------------------------------------------------------
/example/mortgage/visualizer_config.json:
--------------------------------------------------------------------------------
{
  "files": [
    {"id": "est", "path": "estimate.csv"},
    {"id": "unc", "path": "uncertainty.csv"},
    {"id": "fit", "path": "raw/disagg_pred_{}.csv", "multi": true}
  ],
  "schema": {
    "point_estimate": {"file": "est", "field": "estimate"},
    "p_value": {"file": "est", "field": "p.value"},
    "fit": {"file": "est", "field": "NRMSE"},
    "uncertainty": {"file": "unc", "field": "estimate"},
    "prediction": {"file": "fit"}
  },
  "labels": {
    "dataset": "mortgage",
    "x_axis": "Coefficient on female",
    "x_axis_fit": "Approved",
    "x_range": [-3, 8]
  }
}
--------------------------------------------------------------------------------
/example/simple/script.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # remove outliers
    # discard rows outside 2 x std
    df = df[np.abs(df.y - df.y.mean()) <= (2 * df.y.std())]

    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s F2: {:.3f}'.format(lm.rsquared_adj))
--------------------------------------------------------------------------------
/example/simple_cont/script.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # remove outliers
    # discard rows outside 2 x std
    df = df[np.abs(df.y - df.y.mean()) <= (2 * df.y.std())]

    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s F2: {:.3f}'.format(lm.rsquared_adj))
--------------------------------------------------------------------------------
/example/hurricane/stacking_weights.R:
--------------------------------------------------------------------------------
library(rstan)
library(readr)
library(tidyverse)

dir = './results'
fs = list.files(dir, pattern='^loglik')
if (length(fs) < 1) {
  stop('No matching files found, pattern: loglik_*.csv')
}

dfs = lapply(fs, function (f) {
  read_csv(file.path(dir, f), col_types='d')
})
uids = lapply(fs, function (f) {
  res = strsplit(strsplit(f, '_')[[1]][2], '\\.')[[1]][1]
  return(strtoi(res))
})

m = bind_cols(dfs) %>% as.matrix
weights = loo::stacking_weights(m)
res = enframe(c(weights)) %>%
  select(-name) %>%
  add_column(unlist(uids)) %>%
  rename(weights = 1, uid = 2)
write_csv(res, './weights.csv')
--------------------------------------------------------------------------------
/example/mortgage/visualizer_config_monitor.json:
--------------------------------------------------------------------------------
{
  "files": [
    {"id": "est", "path": "estimate.csv"},
    {"id": "unc", "path": "uncertainty.csv"},
    {"id": "fit", "path": "results/disagg_fit_{}.csv", "multi": true}
  ],
  "schema": {
    "point_estimate": {"file": "est", "field": "estimate"},
    "p_value": {"file": "est", "field": "p.value"},
    "fit": {"file": "est", "field": "R2_flipped"},
    "uncertainty": {"file": "unc", "field": "estimate"},
    "prediction": {"file": "fit"}
  },
  "labels": {
    "dataset": "mortgage",
    "x_axis": "Coefficient on female",
    "x_axis_fit": "Approved",
    "x_range": [-3, 8],
    "fit_range": [0, 1]
  }
}
--------------------------------------------------------------------------------
/test/specs/script4-1.py:
--------------------------------------------------------------------------------
""" Test ADG """

# --- (BOBA_CONFIG)
{
  "decisions": [
    {"var": "a", "options": [2, 2.5, 3], "desc": "outlier" },
    {"var": "b", "options": [0, 1] },
    {"var": "c", "options": [[1, 2], [3, 4]]}
  ],
  "constraints": [
    {"block": "B", "option": "b1", "condition": "A == a1"},
    {"block": "B", "option": "b2", "condition": "A == a2"}
  ]
}
# --- (END)

if __name__ == '__main__':
    # --- (A) a1
    a = 1

    # --- (A) a2
    a = 2

    # --- (B) b1
    b = {{b}}

    # --- (B) b2
    b = 2

    # --- (B) b3
    b = 3

    # --- (C)
    print(a * b)
--------------------------------------------------------------------------------
/test/specs/script3-1.py:
--------------------------------------------------------------------------------
""" Test constraints """

# --- (BOBA_CONFIG)
{
  "decisions": [
    {"var": "a", "options": [2, 2.5, 3], "desc": "outlier" },
    {"var": "b", "options": [0, 1] },
    {"var": "c", "options": [[1, 2], [3, 4]]}
  ],
  "constraints": [
    {"block": "B", "option": "b1", "condition": "A == a1"},
    {"block": "B", "option": "b2", "condition": "A == a2"}
  ]
}
# --- (END)

if __name__ == '__main__':
    # --- (A) a1
    a = {{b}}

    # --- (A) a2
    a = 2

    # --- (B) b1
    b = 1

    # --- (B) b2
    b = 2

    # --- (B) b3
    b = 3

    # --- (C)
    print(a * b)
--------------------------------------------------------------------------------
/example/simple/gen_data.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import numpy as np
import pandas as pd

# create a synthetic dataset and save to data.csv
if __name__ == '__main__':
    # create a linear series y = 10 + 0.5 * x plus random Gaussian noise
    n = 100
    x = np.random.uniform(0, 5, n)
    y = 10 + 0.5 * x + np.random.normal(0, 0.2, n)

    # make outliers
    mean = np.mean(y)
    sd = np.std(y)
    cutoff = [2.4, 2.9, 3.4]
    for i in range(len(cutoff)):
        y[i * 2] = mean + cutoff[i] * sd
        y[i * 2 + 1] = mean - cutoff[i] * sd

    # save file
    df = pd.DataFrame(np.column_stack((x, y)), columns=['x', 'y'])
    df.to_csv('data.csv', index=False)
--------------------------------------------------------------------------------
/example/simple/output/code/universe_1.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # remove outliers based on iqr
    iqr = np.subtract(*np.percentile(df.y, [75, 25]))
    median = np.median(df.y)
    df = df[abs(df.y - median) <= 2 * iqr]

    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s F2: {:.3f}'.format(lm.rsquared_adj))
--------------------------------------------------------------------------------
/example/simple/output/code/universe_2.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # remove outliers based on iqr
    iqr = np.subtract(*np.percentile(df.y, [75, 25]))
    median = np.median(df.y)
    df = df[abs(df.y - median) <= 2.5 * iqr]

    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s F2: {:.3f}'.format(lm.rsquared_adj))
--------------------------------------------------------------------------------
/example/simple/output/code/universe_3.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # remove outliers based on iqr
    iqr = np.subtract(*np.percentile(df.y, [75, 25]))
    median = np.median(df.y)
    df = df[abs(df.y - median) <= 3 * iqr]

    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s F2: {:.3f}'.format(lm.rsquared_adj))
--------------------------------------------------------------------------------
/example/simple_cont/gen_data.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import numpy as np
import pandas as pd

# create a synthetic dataset and save to data.csv
if __name__ == '__main__':
    # create a linear series y = 10 + 0.5 * x plus random Gaussian noise
    n = 100
    x = np.random.uniform(0, 5, n)
    y = 10 + 0.5 * x + np.random.normal(0, 0.2, n)

    # make outliers
    mean = np.mean(y)
    sd = np.std(y)
    cutoff = [2.4, 2.9, 3.4]
    for i in range(len(cutoff)):
        y[i * 2] = mean + cutoff[i] * sd
        y[i * 2 + 1] = mean - cutoff[i] * sd

    # save file
    df = pd.DataFrame(np.column_stack((x, y)), columns=['x', 'y'])
    df.to_csv('data.csv', index=False)
--------------------------------------------------------------------------------
/example/simple/output/overview.json:
--------------------------------------------------------------------------------
{
  "decisions": [
    {
      "options": [
        2,
        2.5,
        3
      ],
      "var": "cutoff"
    },
    {
      "options": [
        "std",
        "iqr"
      ],
      "var": "A"
    }
  ],
  "graph": {
    "edges": [
      {
        "source": 1,
        "target": 0,
        "type": "order"
      }
    ],
    "nodes": [
      {
        "id": 0,
        "name": "cutoff"
      },
      {
        "id": 1,
        "name": "A"
      }
    ]
  },
  "visualizer": {
    "files": [
      {
        "id": "est",
        "path": "estimates.csv"
      }
    ],
    "schema": {
      "point_estimate": {
        "field": "estimate",
        "file": "est"
      }
    }
  }
}
--------------------------------------------------------------------------------
/example/hurricane/visualizer_config.json:
--------------------------------------------------------------------------------
{
  "files": [
    {"id": "est", "path": "estimate.csv"},
    {"id": "unc", "path": "uncertainty.csv"},
    {"id": "nul", "path": "null.csv"},
    {"id": "wei", "path": "weights.csv"},
    {"id": "fit", "path": "results/disagg_fit_{}.csv", "multi": true}
  ],
  "schema": {
    "point_estimate": {"file": "est", "field": "expected_diff"},
    "fit": {"file": "est", "field": "NRMSE"},
    "uncertainty": {"file": "unc", "field": "expected_diff"},
    "prediction": {"file": "fit", "transform": "math.log2({} + 1)"},
    "null_distribution": {"field": "expected_diff", "file": "nul"},
    "stacking_weight": {"field": "weights", "file": "wei"}
  },
  "labels": {
    "dataset": "hurricane",
    "x_axis": "Expected Deaths: Female - Male",
    "x_axis_fit": "Log2(Death + 1)",
    "fit_range": [0, 1],
    "x_range": [-10, 50],
    "x_range_outer": [-120, 300]
  }
}
--------------------------------------------------------------------------------
/example/simple/template.py:
--------------------------------------------------------------------------------
# --- (BOBA_CONFIG)
{"before_execute": "cp ../data.csv ./code/"}
# --- (END)
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # --- (A) std
    # remove outliers based on std
    df = df[np.abs(df.y - df.y.mean()) <= ({{cutoff=2,2.5,3}} * df.y.std())]

    # --- (A) iqr
    # remove outliers based on iqr
    iqr = np.subtract(*np.percentile(df.y, [75, 25]))
    median = np.median(df.y)
    df = df[abs(df.y - median) <= {{cutoff}} * iqr]

    # --- (B)
    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s F2: {:.3f}'.format(lm.rsquared_adj))
--------------------------------------------------------------------------------
/example/reading/r/brms_test.R:
--------------------------------------------------------------------------------
library(brms)

# read data
zinb <- read.csv("http://stats.idre.ucla.edu/stat/data/fish.csv")
zinb$camper <- factor(zinb$camper, labels = c("no", "yes"))
head(zinb)

# fit model
fit_zinb1 <- brm(count ~ persons + child + camper, data = zinb,
                 family = zero_inflated_poisson("log"))

# view results
summary(fit_zinb1)
pdf(file="out.pdf")
plot(fit_zinb1, pars = c("persons", "child", "camper"))
marginal_effects(fit_zinb1)

# get the full STAN log, for debugging purposes
# library(rstan)
# mc <- make_stancode(count ~ persons + child + camper, data = zinb, family = zero_inflated_poisson("log"))
# stan_model(model_code = mc, verbose = TRUE)

# Compilation error on macOS Mojave
# These shell commands worked for me:
# xcode-select --install
# open /Library/Developer/CommandLineTools/Packages/macOS_SDK_headers_for_macOS_10.14.pkg

# read fitted model
path = 'some_file.rds'
fit <- suppressWarnings(try(readRDS(path), silent = TRUE))
summary(fit)
--------------------------------------------------------------------------------
/example/reading/r/install.R:
--------------------------------------------------------------------------------
# create user library if it does not exist
repo = "http://cran.us.r-project.org"
lib = Sys.getenv("R_LIBS_USER")
dir.create(lib)

# configure C++ toolchain on Linux in order to use RStan
# https://github.com/stan-dev/rstan/wiki/Installing-RStan-on-Linux
dotR <- file.path(Sys.getenv("HOME"), ".R")
if (!file.exists(dotR)) dir.create(dotR)
M <- file.path(dotR, "Makevars")
if (!file.exists(M)) file.create(M)
cat("\nCXX14FLAGS=-O3 -march=native -mtune=native -fPIC",
    "CXX14=g++", # or clang++ but you may need a version postfix
    file = M, sep = "\n", append = TRUE)

# install required packages
if(!require(readr)) install.packages("readr", lib, repos=repo)
if(!require(lmerTest)) install.packages("lmerTest", lib, repos=repo)
if(!require(brms)) install.packages("brms", lib, repos=repo)
if(!require(car)) install.packages("car", lib, repos=repo)
if(!require(psych)) install.packages("psych", lib, repos=repo)
if(!require(scales)) install.packages("scales", lib, repos=repo)
if(!require(ordinal)) install.packages("ordinal", lib, repos=repo)
--------------------------------------------------------------------------------
/example/simple_cont/template.py:
--------------------------------------------------------------------------------
# --- (BOBA_CONFIG)
{
  "decisions": [
    {"var": "cutoff",
     "options": [
       {
         "seed" : 0,
         "sample" : "uniform",
         "count" : 50,
         "min" : 1.0,
         "max" : 3.0
       }
     ]
    }
  ],
  "before_execute": "cp ../data.csv ./code/"
}
# --- (END)

#!/usr/bin/env python3
import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # --- (A) std
    # remove outliers based on std
    df = df[np.abs(df.y - df.y.mean()) <= ({{cutoff}} * df.y.std())]

    # --- (A) iqr
    # remove outliers based on iqr
    iqr = np.subtract(*np.percentile(df.y, [75, 25]))
    median = np.median(df.y)
    df = df[abs(df.y - median) <= {{cutoff}} * iqr]

    # --- (B)
    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s F2: {:.3f}'.format(lm.rsquared_adj))
--------------------------------------------------------------------------------
/example/reading/script.r:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript

library(readr)
library(lmerTest)
library(car)
library(psych)
library(scales)

speed_data <- read_csv('data.csv')

#calculate reading speed in WPM
speed_data$speed <- speed_data$num_words/(speed_data$adjust_rt/60000)

#remove retake participants
speed_data <- subset(speed_data, retake != 1)

#remove outliers
iqr = IQR(speed_data[speed_data$dyslexia_bin == 0,]$speed,na.rm=TRUE)
cutoff_high = median(speed_data$speed) +3*iqr #3*iqr=645, cutoff_high = 928

#-------remove trials based on speed-------
result_analysis <- speed_data[! speed_data$speed > cutoff_high, ]
result_analysis <- result_analysis[ ! result_analysis$speed < 10,]

#-------remove smartphone users-------
length(unique(subset(result_analysis$uuid, result_analysis$device=='smartphone')))
#remove 64 smartphone users, 363 trials
result_analysis <- result_analysis[! result_analysis$device == 'smartphone',]

#-------remove trials based on comprehension < 2/3-------
result_analysis <- result_analysis[ ! result_analysis$correct_rate < .6,]
#remove 111 trials

result_analysis$log_speed <- log(result_analysis$speed)

#dyslexia in three groups
model <- lmer(log_speed ~ img_width + num_words + page_condition*as.factor(dyslexia) + age + english_native + (1 | uuid), data = result_analysis)
AIC(model)
summary(model)
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from setuptools import setup, find_packages

with open("README.rst", "r") as fh:
    readme = fh.read()

with open('HISTORY.rst') as history_file:
    history = history_file.read()

requirements = ['Click>=6.0', 'dataclasses>=0.6', 'pandas>=1.0.1']

setup_requirements = []

test_requirements = []

setup(
    author="Yang Liu",
    author_email='yliu0@uw.edu',
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: BSD License',
        'Natural Language :: English',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
    ],
    description="Author and execute multiverse analysis",
    entry_points={
        'console_scripts': [
            'boba=boba.cli:main',
        ],
    },
    install_requires=requirements,
    license="BSD license",
    long_description=readme + '\n\n' + history,
    include_package_data=True,
    keywords='multiverse analysis',
    name='boba',
    packages=find_packages(include=['boba', 'boba.*']),
    setup_requires=setup_requirements,
    test_suite='tests',
    tests_require=test_requirements,
    url='https://github.com/uwdata/boba',
    version='1.1.2',
    zip_safe=False,
)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2019, University of Washington Interactive Data Lab.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors
   may be used to endorse or promote products derived from this software
   without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/HISTORY.rst:
--------------------------------------------------------------------------------
=======
History
=======

1.1.2 (2021-04-25)
==================

* Remove the dependency on boba-visualizer

1.1.1 (2021-04-25)
==================

* Support the Boba monitor

1.1.0 (2020-10-07)
==================

* Add support for arbitrary language
* Various bug fixes

1.0.0 (2020-07-31)
==================

* Support continuous placeholder variable
* Support running the multiverse across multiple processes
* Improve boba run, such that it is not dependent on the OS
* Various bug fixes
* Integrate boba visualizer

0.1.4 (2020-04-19)
==================

* Combine JSON spec with the template
* Support inline definition for placeholder variables
* Support inline constraint at block declaration
* Support linked decisions
* Infer ADG from specification
* Update examples
* Various bug fixes

0.1.3 (2019-11-30)
==================

* Revise authoring syntax to support decision blocks and constraints
* Revise CLI, with separate commands to compile and to run
* Improve execution
* Add the hurricane example

0.1.2 (2019-09-19)
==================

* Fix bugs

0.1.1 (2019-09-19)
==================

* Support R
* Improve CLI options
* Support a built-in variable {{_n}}, which represents the universe number
* Support "before_execute" and "after_execute" hooks in the JSON spec
* Update examples

0.1.0 (2019-08-26)
==================

* First release on PyPI.
--------------------------------------------------------------------------------
/boba/baseparser.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import re
import sys
from dataclasses import dataclass


@dataclass
class Token:
    type: str
    value: str


class ParseError(SyntaxError):
    pass


class BaseParser:

    def __init__(self, line):
        self.line = line
        self.i = 0
        self.row = 0
        self.col = 0
        self.current = None

    @staticmethod
    def _is_whitespace(char):
        return any(c == char for c in ' \t\n')

    @staticmethod
    def _is_id_start(ch):
        return bool(re.match('[a-zA-Z]', ch))

    @staticmethod
    def _is_id(ch):
        return bool(re.match('[_a-zA-Z0-9]', ch))

    @staticmethod
    def _is_digit(ch):
        return bool(re.match('[0-9]', ch))

    def _next_char(self):
        ch = self.line[self.i]
        self.i += 1
        if ch == '\n':
            self.row += 1
            self.col = 0
        else:
            self.col += 1
        return ch

    def _peek_char(self):
        return self.line[self.i]

    def _is_end(self):
        return self.i >= len(self.line)

    def _read_while(self, fun, max_len=sys.maxsize):
        s = ''
        while not self._is_end() and fun(self._peek_char()) and len(s) < max_len:
            s += self._next_char()
        return s

    def _peek(self):
        if not self.current:
            self.current = self._read_next()
        return self.current

    def _next(self):
        tmp = self.current
        self.current = None
        return tmp or self._read_next()

    def _read_next(self):
        pass
--------------------------------------------------------------------------------
/test/test_lang.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

# Ugly hack to allow import from the root folder
import shutil
import sys
import os
sys.path.insert(0, os.path.abspath('..'))

import unittest
from unittest.mock import patch
import io
from boba.cli import compile, run


def run_click(fn, args):
    """ run a click function """

    stdout = sys.stdout
    null = open(os.devnull, 'w')
    sys.stdout = null
    try:
        fn(args)
    except SystemExit as e:
        if e.code != 0:
            raise RuntimeError('nonzero exit code: ' + str(e))

    sys.stdout = stdout
    null.close()

class TestLang(unittest.TestCase):
    def test_c(self):
        folder = 'test/test_c'
        script = os.path.join(folder, 'template.c')
        out = folder
        multiverse = os.path.join(folder, 'multiverse')

        run_click(compile, ['-s', script, '--out', folder])

        file_base = os.path.join(out, 'multiverse/code/universe_')
        ext = '.c'
        for i in range(1, 4):
            f = file_base + str(i) + ext
            if not os.path.isfile(file_base + str(i) + ext):
                self.fail('did not generate universe ' + f)

        run_click(run, ['--dir', multiverse, '-a'])

        file_base = os.path.join(out, 'multiverse/boba_logs/log_')
        ext = '.txt'
        for i in range(1, 4):
            fn = file_base + str(i) + ext
            if not os.path.isfile(fn):
                self.fail('did not generate log ' + str(i))

            with open(fn) as f:
                read = f.read()
                if read != 'hello from universe ' + str(i) + '\n':
                    self.fail('universe generated unexpected output "' + read + '"')

        shutil.rmtree(multiverse)
--------------------------------------------------------------------------------
/boba/lang.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import os

PY = 'python'
R = 'r'

script = '{{script_name}}'
compiled = '{{universe_name}}'

DEFAULT_LANGS = {
    'python' : {
        'ext' : ['py'],
        'run' : ['python', script]
    },
    'r' : {
        'ext' : ['R', 'r'],
        'run' : ['Rscript', script]
    }
}

class LangError(NameError):
    pass


class Lang:
    def __init__(self, script, lang=None, supported_langs=None):
        self.supported_langs = DEFAULT_LANGS
        if supported_langs:
            for l in supported_langs:
                self.supported_langs[l] = supported_langs[l]

        self.script = script
        self.name, self.ext = os.path.splitext(script)
        self.lang = self._infer_lang(lang)

    def _infer_lang(self, lang):
        if lang:
            lang = lang.strip().lower()
            if lang not in self.supported_langs:
                raise LangError('Error: language "{}" is not supported'.format(lang))

            return lang, self.supported_langs[lang]
        else:
            for lang, lang_properties in self.supported_langs.items():
                if self.ext[1:] in lang_properties['ext']:
                    return lang, lang_properties

            raise LangError('Error: cannot infer language from file extension ' + self.ext)

    def _format_cmd(self, cmd):
        return cmd.strip().replace(script, self.script).replace(compiled, self.name)

    def get_ext(self):
        return self.ext

    def get_cmd(self):
        cmd = []
        if 'compile' in self.lang[1]:
            cmd.append([self._format_cmd(x) for x in self.lang[1]['compile']])

        cmd.append([self._format_cmd(x) for x in self.lang[1]['run']])
        return cmd

    def is_r(self):
        return self.lang[0] == R

    def is_python(self):
        return self.lang[0] == PY
--------------------------------------------------------------------------------
/example/hurricane/data_wrangling/wrangle.py:
--------------------------------------------------------------------------------
# We augmented the hurricane dataset by Jung et al. via the following steps:
# (1) Add entries for two hurricanes, Katrina and Audrey
# (2) Update normalized damage for all hurricanes, as adjusted to 2019 dollar values
# (3) Retrieve the highest wind speed for all hurricanes
# (4) Replace the femininity ratings for all hurricanes
# Normalized damage was retrieved at: http://www.icatdamageestimator.com/commonsearch
# The ratings for (4) are provided by Uri Simonson


import pandas as pd
import numpy as np
from scipy.stats.stats import pearsonr

# read csv
jung = pd.read_csv('data_jung.csv')
df = pd.read_csv('data_updated.csv')
ratings = pd.read_csv('MTurk_ratings_femeninity_of_hurricanes.csv')

# take the average of ratings and store in a dictionary keyed by name
rs = dict()
for c in ratings:
    if c.startswith('Q1'):
        # the first row is also a header, extract name from the question
        name = ratings[c][0].split('-')[-1]
        # take the average of ratings, excluding the first row
        rs[name] = np.mean(ratings[c][1:].astype('int32'))

# fill in the ratings to our updated dataset
for i in df.index:
    name = df.at[i, 'Name']
    df.at[i, 'MasFem'] = rs[name]
    df.at[i, 'Gender_MF'] = 1 if rs[name] > 6 else 0
df.Gender_MF = df.Gender_MF.astype('int32')

# check the correlation between original and updated damage
dff = df[(df.Name != 'Audrey') & (df.Name != 'Katrina')]
r = pearsonr(jung.NDAM, dff.NDAM)
print('Correlation of normalized damage: {}'.format(r[0]))

# check the correlation between original and updated gender ratings
r = pearsonr(jung.MasFem, dff.MasFem)
print('Correlation of gender ratings: {}'.format(r[0]))
r = pearsonr(jung.Gender_MF, dff.Gender_MF)
print('Correlation of binary gender flag: {}'.format(r[0]))

# results:
# Correlation of normalized damage: 0.942
# Correlation of gender ratings: 0.981
# Correlation of binary gender flag: 0.951

# save
df.to_csv('./data.csv', index=False)
--------------------------------------------------------------------------------
/boba/output/csvmerger.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os
import re
import pandas as pd
import boba.util as util

STR_MAX = 1024


class CSVMerger:
    def __init__(self, pattern, base, out, delimiter=','):
        self.pattern = pattern
        self.base = base
        self.out = out
        self.delimiter = delimiter

    def _fn_func(self, i):
        return self.pattern.format(i)

    def _to_regex(self):
        """ Convert the string pattern to regex. """
        i = self.pattern.find('{}')
        if i < 0:
            util.print_fail('Invalid pattern: {}'.format(self.pattern))
            exit(1)

        rg = re.compile('^' + re.escape(self.pattern[:i]) + r'(\d+)' +
                        re.escape(self.pattern[i+2:]))
        return rg
The indices are sorted.""" 35 | idx = [] 36 | for f in os.listdir(self.base): 37 | m = re.match(self._to_regex(), f) 38 | if m: 39 | idx.append(int(m.group(1))) 40 | idx.sort() 41 | return idx 42 | 43 | def merge(self): 44 | """ Merge the CSV files into one file """ 45 | result = pd.DataFrame() 46 | for i in self.get_files(): 47 | # read the file, keeping every column as a string 48 | df = pd.read_csv(os.path.join(self.base, self._fn_func(i)), 49 | delimiter=self.delimiter, 50 | converters={c: str for c in range(STR_MAX)}) 51 | n = len(list(df.columns)) 52 | 53 | # augment 54 | df['uid'] = i 55 | 56 | # rearrange columns so that uid comes first 57 | cols = list(df.columns) 58 | cols = cols[n:] + cols[:n] 59 | df = df[cols] 60 | 61 | # merge with previous results 62 | result = pd.concat([result, df], axis=0, sort=False) 63 | 64 | return result 65 | 66 | def main(self): 67 | res = self.merge() 68 | res.to_csv(self.out, index=False) 69 | -------------------------------------------------------------------------------- /example/reading/python/script.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import statsmodels.formula.api as smf 6 | 7 | if __name__ == '__main__': 8 | # read data 9 | df = pd.read_csv('data.csv') 10 | 11 | # calculate reading speed in WPM 12 | df['speed'] = df.apply(lambda row: row.num_words / row.adjust_rt * 60000, 13 | axis=1) 14 | 15 | # remove retake participants 16 | df = df[df.retake != 1] 17 | 18 | # remove outliers based on reading speed 19 | iqr = np.subtract(*np.percentile(df.speed, [75, 25])) 20 | cutoff_high = np.median(df.speed) + 3 * iqr 21 | df = df[df.speed <= cutoff_high] 22 | df = df[df.speed >= 10] 23 | 24 | # remove smartphone users 25 | df = df[~df.device.isin(['smartphone'])] 26 | 27 | # drop NA rows 28 | df = df.dropna() 29 | 30 | # log-normalize speed 31 | df['log_speed'] = np.log(df.speed) 32 | 33 | # make dyslexia a categorical variable 34 | df.dyslexia = df.dyslexia.astype('category') 35 | 36 | # wrangle education level 37 | edu_order = ['pre-high school', 'high school', 'professional school', 38 | 'college', 'graduate school', 'PhD', 'postdoctoral'] 39 | tp = pd.CategoricalDtype(categories=edu_order, ordered=True) 40 | df['edu_level'] = df.education.astype(tp).cat.codes 41 | 42 | # check correlation between IVs 43 | ivs = df[['img_width', 'num_words', 'page_condition', 'age']] 44 | print(ivs.corr(), '\n') 45 | print(pd.crosstab(df.english_native, df.dyslexia, normalize='columns'), '\n') 46 | print(pd.crosstab(df.device, df.dyslexia, normalize='columns'), '\n') 47 | 48 | # fit a multinomial logit model to accuracy 49 | df['acc'] = 3 - pd.Categorical(df.correct_rate).codes 50 | print(df.groupby('acc').size(), '\n') 51 | fml = 'acc ~ page_condition*dyslexia_bin' 52 | model = smf.mnlogit(fml, df).fit() 53 | print(model.summary(), '\n') 54 | 55 | # remove trials based on comprehension < 2/3 56 | df = df[df.correct_rate > 0.6] 57 | 58 | # fit a linear mixed effects model 59 | fml = 'log_speed ~ img_width + num_words + page_condition*dyslexia' \ 60 | '+ age + english_native' 61 | model = smf.mixedlm(fml, df, groups=df.uuid).fit() 62 | print(model.summary()) 63 | -------------------------------------------------------------------------------- /test/test_graph_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Ugly hack to allow import from the root folder 4 | import sys 5 | import os 6 | 
sys.path.insert(0, os.path.abspath('..')) 7 | 8 | import unittest 9 | from boba.graphparser import GraphParser, Edge, ParseError 10 | 11 | 12 | class TestParser(unittest.TestCase): 13 | def test_good_specs(self): 14 | spec = ['A -> B -> C1', 'B->C2'] 15 | nodes, edges = GraphParser(spec).parse() 16 | self.assertSetEqual(nodes, {'A', 'B', 'C1', 'C2'}) 17 | exp_edges = {Edge('A', 'B'), Edge('B', 'C1'), Edge('B', 'C2')} 18 | self.assertSetEqual(edges, exp_edges) 19 | 20 | def test_weird_specs(self): 21 | spec = ['a->a->a->a b '] 22 | nds, eds = GraphParser(spec).parse() 23 | self.assertSetEqual(nds, {'a', 'b'}) 24 | self.assertSetEqual(eds, {Edge('a', 'a')}) 25 | 26 | spec = ['a b', 'c'] 27 | nds, eds = GraphParser(spec).parse() 28 | self.assertSetEqual(nds, {'a', 'b', 'c'}) 29 | self.assertSetEqual(eds, set()) 30 | 31 | spec = ['a->b c->b'] 32 | nds, eds = GraphParser(spec).parse() 33 | self.assertSetEqual(nds, {'a', 'b', 'c'}) 34 | self.assertSetEqual(eds, {Edge('a', 'b'), Edge('c', 'b')}) 35 | 36 | def test_syntax_error(self): 37 | spec = ['my_first_node -> my_second_node'] 38 | nds, eds = GraphParser(spec).parse() 39 | self.assertSetEqual(nds, {'my_first_node', 'my_second_node'}) 40 | self.assertSetEqual(eds, {Edge('my_first_node', 'my_second_node')}) 41 | 42 | spec = ['_start -> _end'] 43 | with self.assertRaisesRegex(ParseError, '(?i)cannot handle character'): 44 | GraphParser(spec).parse() 45 | 46 | spec = ['-> B'] 47 | with self.assertRaisesRegex(ParseError, '(?i)source node'): 48 | GraphParser(spec).parse() 49 | 50 | spec = ['A -> B ->'] 51 | with self.assertRaisesRegex(ParseError, '(?i)target node'): 52 | GraphParser(spec).parse() 53 | 54 | spec = ['A - B'] 55 | with self.assertRaises(ParseError): 56 | GraphParser(spec).parse() 57 | 58 | spec = ['A->B->C, B->D'] 59 | with self.assertRaises(ParseError): 60 | GraphParser(spec).parse() 61 | 62 | 63 | if __name__ == '__main__': 64 | unittest.main() 65 | -------------------------------------------------------------------------------- /example/fertility/script.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import statsmodels.api as sm 6 | import statsmodels.formula.api as smf 7 | 8 | if __name__ == '__main__': 9 | # read data file 10 | df = pd.read_csv('durante_etal_2013_study1.txt', delimiter='\t') 11 | 12 | # remove NA 13 | df = df.dropna(subset=['rel1', 'rel2', 'rel3']) 14 | 15 | # create religiosity score 16 | df['rel_comp'] = np.around((df.rel1 + df.rel2 + df.rel3) / 3, decimals=2) 17 | 18 | # next menstrual onset (nmo) assessment 19 | df.last_period_start = pd.to_datetime(df.last_period_start) 20 | df.period_before_last_start = pd.to_datetime(df.period_before_last_start) 21 | df.date_testing = pd.to_datetime(df.date_testing) 22 | 23 | # first nmo option: based on computed cycle length 24 | computed = df.last_period_start - df.period_before_last_start 25 | next_onset = df.last_period_start + computed 26 | 27 | # second nmo option: based on reported cycle length 28 | df = df.dropna(subset=['reported_cycle_length']) 29 | next_onset2 = df.last_period_start + df.reported_cycle_length.apply( 30 | lambda a: pd.Timedelta(days=a)) 31 | 32 | # compute cycle day 33 | df['cycle_day'] = pd.Timedelta('28 days') - (next_onset - df.date_testing) 34 | df.cycle_day = (df.cycle_day / np.timedelta64(1, 'D')).astype(int) 35 | df.cycle_day = np.clip(df.cycle_day, 1, 28) 36 | 37 | # fertility assessment 38 | high_bounds = [6, 14] 39 | 
low_bounds = [17, 27] 40 | df.loc[(high_bounds[0] <= df.cycle_day) & (df.cycle_day <= high_bounds[1]), 41 | 'fertility'] = 'High' 42 | df.loc[(low_bounds[0] <= df.cycle_day) & (df.cycle_day <= low_bounds[1]), 43 | 'fertility'] = 'Low' 44 | 45 | # relationship status assessment 46 | # single = response options 1 and 2; relationship = response options 3 and 4 47 | df.loc[df.relationship <= 2, 'relationship_status'] = 'Single' 48 | df.loc[df.relationship > 2, 'relationship_status'] = 'Relationship' 49 | 50 | # exclusion based on cycle length 51 | df = df[(df.reported_cycle_length >= 25) & 52 | (df.reported_cycle_length <= 35)] 53 | 54 | # exclusion based on certainty ratings 55 | df = df[(df.sure1 >= 6) & (df.sure2 >= 6)] 56 | 57 | # perform an ANOVA on the processed data set 58 | lm = smf.ols('rel_comp ~ relationship_status * fertility', data=df).fit() 59 | print(lm.summary(), '\n') 60 | table = sm.stats.anova_lm(lm, typ=2) 61 | print(table) 62 | -------------------------------------------------------------------------------- /example/hurricane/README.md: -------------------------------------------------------------------------------- 1 | # Multiverse of the Hurricane Dataset 2 | 3 | In this example, we implemented the specification curve analysis on Jung's hurricane study, 4 | following the seminal paper of Simonsohn et al. 5 | 6 | Useful URLs: 7 | - Specification curve paper by Simonsohn et al.: 8 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2694998 9 | 10 | - The appendix of the specification curve paper: 11 | http://urisohn.com/sohn_files/wp/wordpress/wp-content/uploads/Supplement-Specification-Curve-2019-10-29.pdf 12 | 13 | - STATA code implementing the specification curve analysis: 14 | http://urisohn.com/sohn_files/files/Specification%20Curve.zip 15 | 16 | - Hurricane paper by Jung et al.: 17 | https://doi.org/10.1073/pnas.1402786111 18 | 19 | - Supporting material of the hurricane paper: 20 | https://www.pnas.org/content/suppl/2014/05/30/1402786111.DCSupplemental 21 | 22 | 23 | ### Augmenting the Dataset 24 | 25 | Following the description in Uri Simonsohn's STATA code, we augmented the original 26 | hurricane dataset via the following steps: 27 | 28 | - We added the two outliers excluded in Jung's study: Katrina and Audrey. 29 | - We replaced the femininity ratings (MasFem) with the average ratings from 32 MTurkers, 30 | collected using the same scale as described in Jung's paper. Uri Simonsohn kindly 31 | provided the MTurk ratings to us. 32 | - Accordingly, we also updated the binary gender indicator, so a femininity rating higher 33 | than 6 is categorized as female. 34 | - We updated the normalized damage to 2019 dollar values, using the same website as 35 | Jung et al.: http://www.icatdamageestimator.com/commonsearch 36 | - We added a column of highest wind speed (mph) using Wikipedia as the data source. 37 | 38 | ### Notes 39 | 40 | 1. The multiverse specification has two versions: `template.R`, which follows the decision definitions 41 | on page 4 of Simonsohn's appendix and creates 864 universes, and `repro.R`, which replicates Simonsohn's 42 | code implementation to create 1728 universes. The difference is due to separating the first decision (of size 6) 43 | into a cross product of two decisions (3x4), thus doubling the size of the final multiverse. 44 | 45 | 2. As we used a slightly different dataset from the one used by Jung et al., we did not obtain the same result when using the original specification in Jung's study. 46 | 47 | 3. 
About 40 universes, all fitting a negative binomial model, will fail because 48 | of this error: 49 | ``` 50 | Error in glm.fitter(x = X, y = Y, w = w, etastart = eta, offset = offset, : 51 | NA/NaN/Inf in 'x' 52 | Calls: glm.nb -> glm.fitter 53 | ``` 54 | The helper script `debug_count.py` outputs which universes failed. 55 | -------------------------------------------------------------------------------- /example/mortgage/template.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # --- (BOBA_CONFIG) 3 | { 4 | "decisions": [ 5 | {"var": "black", "options": ["+ black", ""]}, 6 | {"var": "housing_expense_ratio", "options": ["+ housing_expense_ratio", ""]}, 7 | {"var": "self_employed", "options": ["+ self_employed", ""]}, 8 | {"var": "married", "options": ["+ married", ""]}, 9 | {"var": "bad_history", "options": ["+ bad_history", ""]}, 10 | {"var": "PI_ratio", "options": ["+ PI_ratio", ""]}, 11 | {"var": "loan_to_value", "options": ["+ loan_to_value", ""]}, 12 | {"var": "denied_PMI", "options": ["+ denied_PMI", ""]} 13 | ], 14 | "before_execute": "cp ../mortgage.csv ./ && rm -rf results && mkdir results", 15 | "after_execute": "cd .. && sh after_execute.sh", 16 | "visualizer": "visualizer_config.json" 17 | } 18 | # --- (END) 19 | 20 | suppressPackageStartupMessages(library(readr)) 21 | suppressPackageStartupMessages(library(tidyverse)) 22 | suppressPackageStartupMessages(library(broom.mixed)) 23 | source('../../../hurricane/boba_util.R') # FIXME 24 | 25 | # read data 26 | df <- read_csv('../mortgage.csv', 27 | col_types = cols(.default = col_double())) %>% 28 | mutate( 29 | accept_scaled = accept * 100 30 | ) %>% 31 | # here we drop all NAs for simplicity, but we will drop up to 7 more data 32 | # points in some models, which may cause discrepancies with Young et al. 
33 | drop_na() 34 | 35 | # linear regression 36 | model <- lm(accept_scaled ~ female {{black}} {{housing_expense_ratio}} 37 | {{self_employed}} {{married}} {{bad_history}} {{PI_ratio}} 38 | {{loan_to_value}} {{denied_PMI}}, data = df) 39 | 40 | # print summary to console 41 | smr <- summary(model) 42 | smr 43 | 44 | # cross validation 45 | fit <- cross_validation(df, model, 'accept_scaled') 46 | # normalize using max - min, because IQR is zero 47 | nrmse <- fit / (max(df$accept_scaled) - min(df$accept_scaled)) 48 | 49 | # wrangle results 50 | result <- tidy(model, conf.int = TRUE) %>% 51 | filter(term == 'female') %>% 52 | add_column( 53 | NRMSE = nrmse, 54 | R2_flipped = 1 - pmax(pmin(smr$adj.r.squared, 1), 0) 55 | ) 56 | 57 | # get predictions 58 | disagg_fit <- pointwise_predict(model, df) %>% 59 | select( 60 | observed = accept_scaled, 61 | expected = fit 62 | ) 63 | 64 | # get uncertainty in coefficient for female as draws from sampling distribution 65 | uncertainty <- sampling_distribution(model, 'female') %>% 66 | dplyr::select(estimate = coef) 67 | 68 | # output 69 | write_csv(result, '../results/estimate_{{_n}}.csv') 70 | write_csv(disagg_fit, '../results/disagg_fit_{{_n}}.csv') 71 | write_csv(uncertainty, '../results/uncertainty_{{_n}}.csv') 72 | -------------------------------------------------------------------------------- /boba/graphanalyzer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | class InvalidGraphError(NameError): 5 | pass 6 | 7 | 8 | class GraphAnalyzer: 9 | def __init__(self, nodes, edges): 10 | self.nodes = nodes 11 | self.edges = GraphAnalyzer._convert_edges(edges) 12 | self.paths = [] 13 | 14 | @staticmethod 15 | def _convert_edges(edges): 16 | d = {} 17 | for e in edges: 18 | if e.start in d: 19 | d[e.start].append(e.end) 20 | else: 21 | d[e.start] = [e.end] 22 | return d 23 | 24 | def _throw(self, msg): 25 | msg = 'In analyzing graph structure:\n\t' + msg 26 | raise InvalidGraphError(msg) 27 | 28 | def _all_ending_nodes(self): 29 | """ nodes that have at least one incoming edge """ 30 | flat = [item for lst in self.edges.values() for item in lst] 31 | return set(flat) 32 | 33 | def _get_source(self): 34 | """ nodes that have no incoming edges """ 35 | return self.nodes.difference(self._all_ending_nodes()) 36 | 37 | def _get_target(self): 38 | """ nodes that have no outgoing edges """ 39 | return self.nodes.difference(set(self.edges.keys())) 40 | 41 | def _all_paths_recur(self, a, b, visited, path): 42 | """ a recursive helper to collect all paths from a to b """ 43 | # mark the current node as visited and add to path 44 | visited.add(a) 45 | path.append(a) 46 | 47 | # if current node is the same as target, the path is done 48 | if a == b: 49 | self.paths.append(list(path))  # append a copy of the current path 50 | else: 51 | if a in self.edges: 52 | for n in self.edges[a]: 53 | if n not in visited: 54 | self._all_paths_recur(n, b, visited, path) 55 | 56 | # remove current node from path and mark it as unvisited 57 | path.pop() 58 | visited.discard(a) 59 | 60 | def _all_paths(self, s, t): 61 | """ get all paths from s to t """ 62 | visited = set() 63 | path = [] 64 | self._all_paths_recur(s, t, visited, path) 65 | 66 | def _construct_paths(self): 67 | ss = self._get_source() 68 | ts = self._get_target() 69 | 70 | if len(ss) == 0: 71 | self._throw('Cannot find any starting node') 72 | if len(ts) == 0: 73 | self._throw('Cannot find any ending node') 74 | 75 | for s in ss: 76 | for t in ts: 77 | self._all_paths(s, t) 78 
| 79 | def analyze(self): 80 | if len(self.nodes) == 0: 81 | return [] 82 | 83 | self._construct_paths() 84 | return self.paths 85 | -------------------------------------------------------------------------------- /boba/conditionparser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .baseparser import BaseParser, ParseError 3 | from dataclasses import dataclass 4 | from enum import Enum 5 | 6 | 7 | class TokenType(Enum): 8 | var = 1 9 | index_var = 2 10 | number = 3 11 | 12 | 13 | @dataclass 14 | class ParsedToken: 15 | value: str 16 | type: TokenType 17 | 18 | 19 | class ConditionParser(BaseParser): 20 | """ A class for parsing the condition string """ 21 | def __init__(self, line): 22 | super(ConditionParser, self).__init__(line) 23 | self.parsed_code = '' 24 | self.parsed_decs = [] 25 | 26 | def parse(self): 27 | while not self._is_end(): 28 | self._read_next() 29 | return self.parsed_code, self.parsed_decs 30 | 31 | @staticmethod 32 | def _is_keyword(w): 33 | return w == 'and' or w == 'or' 34 | 35 | @staticmethod 36 | def _is_operator(ch): 37 | return ch in ['=', '(', ')', '!', '>', '<'] 38 | 39 | def _throw(self, msg): 40 | msg = 'At character {} of "{}":\n\t{}'.format(self.i + 1, self.line, msg) 41 | raise ParseError(msg) 42 | 43 | def _maybe_read_index(self): 44 | # we only want to parse the LHS of == 45 | if len(self.parsed_decs) % 2 == 1: 46 | return False 47 | 48 | if not self._is_end() and self._peek_char() == '.': 49 | # try to parse .index 50 | self._next_char() 51 | v = self._read_while(self._is_id) 52 | if v == 'index': 53 | return True 54 | else: 55 | msg = 'Expected ".index", got ".{}"'.format(v) 56 | self._throw(msg) 57 | 58 | return False 59 | 60 | def _read_next(self): 61 | self.parsed_code += self._read_while(BaseParser._is_whitespace) 62 | if self._is_end(): 63 | return 64 | 65 | ch = self._peek_char() 66 | if self._is_id_start(ch): 67 | w = self._read_while(self._is_id) 68 | if ConditionParser._is_keyword(w): 69 | self.parsed_code += w 70 | return 71 | 72 | tk = ParsedToken(w, TokenType.var) 73 | if self._maybe_read_index(): 74 | tk.type = TokenType.index_var 75 | 76 | self.parsed_decs.append(tk) 77 | self.parsed_code += '{}' 78 | elif self._is_digit(ch): 79 | w = self._read_while(self._is_digit) 80 | if not self._is_end() and self._peek_char() == '.': # read decimal 81 | w += self._next_char() + self._read_while(self._is_digit) 82 | 83 | self.parsed_decs.append(ParsedToken(w, TokenType.number)) 84 | self.parsed_code += '{}' 85 | elif self._is_operator(ch): 86 | w = self._read_while(ConditionParser._is_operator) 87 | self.parsed_code += w 88 | else: 89 | msg = 'Cannot handle character "{}".'.format(ch) 90 | self._throw(msg) 91 | -------------------------------------------------------------------------------- /test/specs/continuous-err.json: -------------------------------------------------------------------------------- 1 | { 2 | "0" : { 3 | "decisions": [ 4 | {"var": "err", "options": [{"sample" : "uniform"}] , "desc" : "check 'count' omission"} 5 | ] 6 | }, 7 | "1" : { 8 | "decisions": [ 9 | {"var": "err", "options": [{"sample" : "lognormal"}] , "desc" : "check 'count' omission"} 10 | ] 11 | }, 12 | "2" : { 13 | "decisions": [ 14 | {"var": "err", "options": [{"sample" : "normal"}] , "desc" : "check 'count' omission"} 15 | ] 16 | }, 17 | "3" : { 18 | "decisions": [ 19 | {"var": "err", "options": [{"sample" : "uniform", "count" : 5}] , "desc" : "check required variable omission"} 20 | ] 21 | }, 
22 | "4" : { 23 | "decisions": [ 24 | {"var": "err", "options": [{"sample" : "uniform", "count" : 5, "min" : 0.0}] , "desc" : "check required variable omission"} 25 | ] 26 | }, 27 | "5" : { 28 | "decisions": [ 29 | {"var": "err", "options": [{"sample" : "uniform", "count" : 5, "max" : 0.0}] , "desc" : "check required variable omission"} 30 | ] 31 | }, 32 | "6" : { 33 | "decisions": [ 34 | {"var": "err", "options": [{"sample" : "uniform", "count" : 5, "min" : true, "max" : 5.0}] , "desc" : "check bad type for variables"} 35 | ] 36 | }, 37 | "7" : { 38 | "decisions": [ 39 | {"var": "err", "options": [{"sample" : "uniform", "count" : 5, "min" : 1.0, "max" : true}] , "desc" : "check bad type for variables"} 40 | ] 41 | }, 42 | "8" : { 43 | "decisions": [ 44 | {"var": "err", "options": [{"sample" : "lognormal", "count" : 5, "exclusive" : 1.0}] , "desc" : "check bad type for variables"} 45 | ] 46 | }, 47 | "9" : { 48 | "decisions": [ 49 | {"var": "err", "options": [{"sample" : "lognormal", "count" : 5, "mean" : "mean"}] , "desc" : "check bad type for variables"} 50 | ] 51 | }, 52 | "10" : { 53 | "decisions": [ 54 | {"var": "err", "options": [{"sample" : "normal", "count" : 5, "range" : "range"}] , "desc" : "check bad type for variables"} 55 | ] 56 | }, 57 | "11" : { 58 | "decisions": [ 59 | {"var": "err", "options": [{"sample" : "normal", "count" : 5, "range" : ["range", "range"]}] , "desc" : "check bad type for variables"} 60 | ] 61 | }, 62 | "12" : { 63 | "decisions": [ 64 | {"var": "err", "options": [{"sample" : "normal", "count" : 5, "range" : [0.0, 1.0, 2.0]}] , "desc" : "check bad type for variables"} 65 | ] 66 | }, 67 | "13" : { 68 | "decisions": [ 69 | {"var": "err", "options": [{"sample" : "normal", "count" : 5, "range" : [1.0, 0.0]}] , "desc" : "check bad type for variables"} 70 | ] 71 | }, 72 | "14" : { 73 | "decisions": [ 74 | {"var": "err", "options": [{"sample" : "normal", "count" : 5, "std_dev" : 1.0, "range" : [1.0, 0.0]}] , "desc" : "check bad type for variables"} 75 | ] 76 | } 77 | } -------------------------------------------------------------------------------- /tutorial/cli.rst: -------------------------------------------------------------------------------- 1 | === 2 | CLI 3 | === 4 | 5 | You might invoke the command line tool via:: 6 | boba [options] 7 | 8 | Available commands: 9 | - compile 10 | - run 11 | - merge 12 | 13 | General options: 14 | 15 | ``--version`` 16 | Show version and exit. 17 | 18 | Compile 19 | ======= 20 | The compile command parses the template script and the JSON spec to generate 21 | executable universe scripts. It has the following options: 22 | 23 | ``--script, -s`` 24 | **default: ./template.py** (optional) 25 | 26 | The path to your template script. 27 | 28 | ``--out`` 29 | **default: .** (optional) 30 | 31 | The output directory to hold generated universe scripts, summary table, etc. 32 | 33 | ``--lang`` 34 | (optional) 35 | 36 | Language of your analysis script. We support python and R, and require a 37 | configuration file for any other languages. 38 | If not specified, we will infer it from the file extension. 39 | 40 | ``--help`` 41 | Show help message and exit. 42 | 43 | Run 44 | === 45 | The run command executes the generated universe scripts. You could use it to 46 | run the entire multiverse, a single universe, or a subset of universes. To run 47 | all universes, use:: 48 | 49 | boba run --all 50 | 51 | To run a single universe, provide its identifying number as the argument. 
For 52 | example, if you want to run universe_1.py, use:: 53 | 54 | boba run 1 55 | 56 | To run a range of universes, for example universe_1 through universe_5, use:: 57 | 58 | boba run 1 --thru 5 59 | 60 | In addition, the run command accepts the following options: 61 | 62 | ``--dir`` 63 | **default: ./multiverse** (optional) 64 | 65 | Determines the path to the multiverse directory. It should point to a directory 66 | that contains the *summary.csv* file and the *code* subfolder. 67 | 68 | ``--jobs`` 69 | **default: 1** (optional) 70 | 71 | Determines the number of processes that can run at a time. If *jobs* is set 72 | to 0, it becomes the number of cores on the machine. 73 | 74 | ``--batch_size`` 75 | **default: see below** (optional) 76 | 77 | Determines the number of universes that will be run in sequence in each 78 | process. Let :math:`N` denote the number of universes; the default is 79 | :math:`\sqrt{N}` or :math:`N/\mathit{jobs} + 1`, whichever is smaller. For example, with :math:`N = 100` and 4 jobs, the default batch size is :math:`\min(10, 26) = 10`. 80 | 81 | Merge 82 | ===== 83 | The merge command combines CSV outputs from individual universes into one file. 84 | This command works well if you used the built-in ``{{_n}}`` variable to output 85 | a separate CSV per universe. 86 | 87 | The command has a required argument: the filename pattern of individual outputs 88 | where the universe id is replaced by ``{}``. For example, if your output 89 | files are output_1.csv, output_2.csv, and so on, your pattern 90 | should be ``output_{}.csv``. 91 | 92 | In addition, the command has the following options: 93 | 94 | ``--base, -b`` 95 | **default: ./multiverse/results** (optional) 96 | 97 | Path to the directory containing the universe outputs. 98 | 99 | ``--out`` 100 | **default: ./multiverse/merged.csv** (optional) 101 | 102 | Path to the merged file that will be created by this command. 103 | 104 | ``--delimiter`` 105 | **default: ,** (optional) 106 | 107 | CSV delimiter. 
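As an end-to-end illustration (filenames here are hypothetical): if each universe wrote output_1.csv, output_2.csv, and so on into the default results folder, the merged table could be produced with::

    boba merge output_{}.csv --out ./multiverse/merged.csv

The merged file gains a leading *uid* column recording which universe each row came from.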
108 | -------------------------------------------------------------------------------- /test/specs/continuous.json: -------------------------------------------------------------------------------- 1 | { 2 | "decisions": [ 3 | {"var": "A", "options": [{"sample" : "uniform", "count" : 10, "seed" : 0, "min" : 0.0, "max" : 5.0}] , 4 | "desc" : "uniform continuous variable expansion"}, 5 | 6 | {"var": "B", "options": [{"sample" : "lognormal", "count" : 10, "seed" : 0, "mean" : 0.0, "std_dev" : 5.0}] , 7 | "desc" : "lognormal continuous variable expansion"}, 8 | 9 | {"var": "C", "options": [{"sample" : "normal", "count" : 10, "seed" : 0, "mean" : 0.0, "std_dev" : 5.0}] , 10 | "desc" : "normal continuous variable expansion"}, 11 | 12 | {"var": "D", "options": [{"sample" : "uniform", "count" : 10, "seed" : 0, "min" : 0.0, "max" : 5.0}, 17.0] , 13 | "desc" : "uniform continuous variable expansion with additional constants"}, 14 | 15 | {"var": "E", "options": [{"sample" : "lognormal", "count" : 10, "seed" : 0, "mean" : 0.0, "std_dev" : 5.0}, 0.0, 1.0, 2.0] , 16 | "desc" : "lognormal continuous variable expansion with additional constants"}, 17 | 18 | {"var": "F", "options": [{"sample" : "normal", "count" : 10, "seed" : 0, "mean" : 0.0, "std_dev" : 5.0}, 0.0, 1.0, 2.0, 3.0, 4.0] , 19 | "desc" : "normal continuous variable expansion with additional constants"}, 20 | 21 | {"var": "G", "options": [{"sample" : "uniform", "count" : 3, "seed" : 0, "min" : 0.0, "max" : 5.0}, 22 | {"sample" : "lognormal", "count" : 3, "seed" : 0, "mean" : 0.0, "std_dev" : 5.0}, 23 | {"sample" : "normal", "count" : 3, "seed" : 0, "mean" : 0.0, "std_dev" : 5.0}] , 24 | "desc" : "multiple continuous variable expansions"}, 25 | 26 | {"var": "H", "options": [{"sample" : "uniform", "count" : 4, "seed" : 0, "min": 0.0, "max": 5.0}, 27 | {"sample" : "uniform", "count" : 4, "seed" : 1, "min": 10.0, "max": 15.0}] , 28 | "desc" : "multiple continuous variable expansions"}, 29 | 30 | {"var": "I", "options": [{"sample" : "uniform", "count" : 3, "seed" : 0, "min": 0.0, "max": 5.0}, -1.1, 31 | {"sample" : "uniform", "count" : 3, "seed" : 1, "min": 10.0, "max": 15.0}, 32 | 0.0, 1.0, 2.0, 3.1415] , 33 | "desc" : "multiple continuous variable expansions with additional constants"}, 34 | 35 | {"var": "J", "options": [{"sample" : "normal", "count" : 5, "seed" : 0, "mean" : 0.0, "std_dev" : 5.0, "range" : [0.0, 2.5]}, 36 | {"sample" : "lognormal", "count" : 5, "seed" : 0, "mean" : 0.0, "std_dev" : 5.0, "range" : [0.0, 2.5]}] , 37 | "desc" : "check range"}, 38 | 39 | {"var": "K", "options": [{"sample" : "lognormal", "count" : 5, "seed" : 0}, 40 | {"sample" : "lognormal", "count" : 5, "seed" : 0, "mean" : 0.0, "std_dev" : 1.0}] , 41 | "desc" : "check unrequired variable omission"}, 42 | 43 | {"var": "L", "options": [{"sample" : "normal", "count" : 5, "seed" : 0}, 44 | {"sample" : "normal", "count" : 5, "seed" : 0, "mean" : 0.0, "std_dev" : 1.0}] , 45 | "desc" : "check unrequired variable omission"} 46 | ] 47 | } 48 | -------------------------------------------------------------------------------- /boba/graphparser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from dataclasses import dataclass 4 | from .baseparser import BaseParser, Token, ParseError 5 | 6 | 7 | @dataclass(frozen=True) 8 | class Edge: 9 | start: str 10 | end: str 11 | 12 | 13 | class GraphParser(BaseParser): 14 | def __init__(self, graph_spec): 15 | line = '\n'.join(graph_spec) 16 | super(GraphParser, 
self).__init__(line) 17 | 18 | self.spec = graph_spec 19 | self.nodes = set() 20 | self.edges = set() 21 | 22 | def _prep_err(self, msg): 23 | return 'At character {} of "{}":\n\t{}'.format(self.col+1, self.spec[self.row], msg) 24 | 25 | def _read_next(self): 26 | self._read_while(GraphParser._is_whitespace) 27 | if self._is_end(): 28 | return Token('eof', '') 29 | 30 | ch = self._peek_char() 31 | if ch == '-': 32 | return self._read_edge() 33 | elif GraphParser._is_id_start(ch): 34 | return self._read_node() 35 | else: 36 | raise ParseError(self._prep_err('Cannot handle character "{}"'.format(ch))) 37 | 38 | def _read_edge(self): 39 | val = self._next_char() 40 | ch = self._peek_char() 41 | if ch != '>': 42 | raise ParseError(self._prep_err('Expected "->", got "-{}"'.format(ch))) 43 | val += self._next_char() 44 | return Token('edge', val) 45 | 46 | def _read_node(self): 47 | nd = self._read_while(GraphParser._is_id) 48 | return Token('node', nd) 49 | 50 | def parse(self): 51 | prev_node = None 52 | 53 | while True: 54 | tk = self._next() 55 | 56 | if tk.type == 'node': 57 | self.nodes.add(tk.value) 58 | prev_node = tk.value 59 | if tk.type == 'edge': 60 | if not prev_node: 61 | raise ParseError(self._prep_err('Cannot find a source node')) 62 | nx = self._peek() 63 | if nx.type != 'node': 64 | raise ParseError(self._prep_err('Cannot find a target node')) 65 | self.edges.add(Edge(prev_node, nx.value)) 66 | if tk.type == 'eof': 67 | break 68 | 69 | return self.nodes, self.edges 70 | 71 | def replace_graph(self, decs): 72 | """ Replace the block-level decision nodes in the graph with option nodes.""" 73 | # replace nodes 74 | nds = [] 75 | for nd in self.nodes: 76 | tmp = decs[nd] if nd in decs else [nd] 77 | nds.extend(tmp) 78 | 79 | # replace edges 80 | egs = [] 81 | for eg in self.edges: 82 | ss = decs[eg.start] if eg.start in decs else [eg.start] 83 | es = decs[eg.end] if eg.end in decs else [eg.end] 84 | egs.extend([Edge(s, e) for s in ss for e in es]) 85 | 86 | self.nodes = set(nds) 87 | self.edges = set(egs) 88 | return self.nodes, self.edges 89 | 90 | def create_default_graph(self, nodes): 91 | """ 92 | Create the default graph, which is a linear flow of blocks, with the 93 | same order as they appear in the template script. 94 | :param nodes: A list of unique blocks. 
95 | :return: nodes and edges 96 | """ 97 | self.nodes = set(nodes) 98 | self.edges = set() 99 | for i in range(len(nodes) - 1): 100 | self.edges.add(Edge(nodes[i], nodes[i + 1])) 101 | return self.nodes, self.edges 102 | -------------------------------------------------------------------------------- /example/fertility_r/template.R: -------------------------------------------------------------------------------- 1 | # --- (BOBA_CONFIG) 2 | { 3 | "decisions": [ 4 | {"var": "fertility_bounds", "options": [ 5 | "c(7, 14, 17, 25, 17, 25)", 6 | "c(6, 14, 17, 27, 17, 27)", 7 | "c(9, 17, 18, 25, 18, 25)", 8 | "c(8, 14, 1, 7, 15, 28)", 9 | "c(9, 17, 1, 8, 18, 28)" 10 | ]}, 11 | {"var": "relationship_bounds", "options": [ 12 | "c(2, 3)", "c(1, 2)", "c(1, 3)" 13 | ]} 14 | ], 15 | "outputs": [ 16 | {"name": "p-value", "value": "summar$coefficients[4, 4]"} 17 | ], 18 | "before_execute": "cp ../durante_etal_2013_study1.txt ./code/" 19 | } 20 | # --- (END) 21 | 22 | #read in raw data from Study 1 23 | df <- read.csv2("durante_etal_2013_study1.txt", sep = "") 24 | 25 | # create religiosity score 26 | df$RelComp <- round(rowMeans(cbind(df$Rel1, df$Rel2, df$Rel3), na.rm = TRUE), digits = 2) 27 | 28 | # next menstrual onset (nmo) assessment 29 | Sys.setenv(TZ="Europe/Berlin") # suppress time zone warning 30 | df$DateTesting <- as.Date(df$DateTesting, format = "%m/%d/%y") 31 | df$StartDateofLastPeriod <- as.Date(df$StartDateofLastPeriod, format = "%m/%d/%y") 32 | df$StartDateofPeriodBeforeLast <- as.Date(df$StartDateofPeriodBeforeLast, 33 | format = "%m/%d/%y") 34 | df$ComputedCycleLength <- df$StartDateofLastPeriod - df$StartDateofPeriodBeforeLast 35 | 36 | # --- (NMO) computed 37 | # first nmo option: based on computed cycle length 38 | df$NextMenstrualOnset <- df$StartDateofLastPeriod + df$ComputedCycleLength 39 | 40 | # --- (NMO) reported 41 | # second nmo option: based on reported cycle length 42 | df$NextMenstrualOnset <- df$StartDateofLastPeriod + df$ReportedCycleLength 43 | 44 | # # --- (NMO) estimate 45 | # # third nmo option: based on reported estimate of next menstrual onset 46 | # # note: this is not available in study one 47 | # df$NextMenstrualOnset <- df$StartDateNext 48 | 49 | # --- (ECL) computed @if NMO != reported 50 | # exclusion based on computed cycle length 51 | df <- df[!(df$ComputedCycleLength < 25 | df$ComputedCycleLength > 35), ] 52 | 53 | # --- (ECL) reported @if NMO != computed 54 | # exclusion based on reported cycle length 55 | df <- df[!(df$ReportedCycleLength < 25 | df$ReportedCycleLength > 35), ] 56 | 57 | # --- (ECL) none 58 | # include all cycle lengths 59 | 60 | # --- (A) 61 | # compute cycle day 62 | df$DaysBeforeNextOnset <- df$NextMenstrualOnset - df$DateTesting 63 | df$CycleDay <- 28 - df$DaysBeforeNextOnset 64 | df$CycleDay <- ifelse(df$CycleDay <1, 1, df$CycleDay) 65 | df$CycleDay <- ifelse(df$CycleDay > 28, 28, df$CycleDay) 66 | 67 | # fertility assessment 68 | bounds = {{fertility_bounds}} 69 | df$Fertility <- rep(NA, dim(df)[1]) # create fertility variable 70 | df$Fertility[df$CycleDay >= bounds[1] & df$CycleDay <= bounds[2]] <- "High" 71 | df$Fertility[df$CycleDay >= bounds[3] & df$CycleDay <= bounds[4]] <- "Low" 72 | df$Fertility[df$CycleDay >= bounds[5] & df$CycleDay <= bounds[6]] <- "Low" 73 | 74 | # relationship status assessment 75 | rel.bounds = {{relationship_bounds}} 76 | df$RelationshipStatus[df$Relationship <= rel.bounds[1]] <- "Single" 77 | df$RelationshipStatus[df$Relationship >= rel.bounds[2]] <- "Relationship" 78 | 79 | # --- (EC) certainty 80 | # 
exclusion based on certainty ratings 81 | df <- df[!(df$Sure1 < 6 | df$Sure2 < 6), ] 82 | 83 | # --- (EC) none 84 | # include all certainty ratings 85 | 86 | # --- (B) 87 | # perform an ANOVA on the processed data set 88 | df$Fertility <- factor(df$Fertility) 89 | df$RelationshipStatus <- factor(df$RelationshipStatus) 90 | an = lm("RelComp~Fertility*RelationshipStatus", df) 91 | summar <- summary(an) 92 | # the p-value of the fertility x relationship interaction 93 | summar$coefficients[4, 4] 94 | -------------------------------------------------------------------------------- /example/reading/r/template.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # --- (BOBA_CONFIG) 3 | { 4 | "graph": [ 5 | "RC->LM1->O1", 6 | "RC->LM2->O2", 7 | "OLR1->O1", 8 | "OLR2->O2" 9 | ], 10 | "decisions": [ 11 | {"var": "brmsfamily", "options": ["shifted_lognormal", "lognormal"]} 12 | ], 13 | "outputs": [ 14 | {"name": "aic/waic", "value": "aic"} 15 | ], 16 | "before_execute": "cp ../../data.csv ./code/ && mkdir results" 17 | } 18 | # --- (END) 19 | 20 | library(readr) 21 | library(lmerTest) 22 | library(car) 23 | library(psych) 24 | library(scales) 25 | library(brms) 26 | library(ordinal) 27 | 28 | speed_data <- read_csv('data.csv') 29 | 30 | # calculate reading speed in WPM 31 | speed_data$speed <- speed_data$num_words/(speed_data$adjust_rt/60000) 32 | 33 | # remove retake participants 34 | speed_data <- subset(speed_data, retake != 1) 35 | 36 | # remove outliers 37 | iqr = IQR(speed_data[speed_data$dyslexia_bin == 0,]$speed,na.rm=TRUE) 38 | cutoff_high = median(speed_data$speed) +3*iqr #3*iqr=645, cutoff_high = 928 39 | 40 | # remove trials based on speed 41 | result_analysis <- speed_data[! speed_data$speed > cutoff_high, ] 42 | result_analysis <- result_analysis[ ! result_analysis$speed < 10,] 43 | 44 | # remove smartphone users 45 | # removed 64 smartphone users, 363 trials 46 | result_analysis <- result_analysis[! result_analysis$device == 'smartphone',] 47 | 48 | # wrangle variables 49 | result_analysis$log_speed <- log(result_analysis$speed) 50 | result_analysis$dyslexia = as.factor(result_analysis$dyslexia) 51 | result_analysis$correct_num = round(result_analysis$correct_rate * 3, 0) 52 | result_analysis$acc = result_analysis$correct_num + 1 53 | result_analysis$correct_num = as.factor(result_analysis$correct_num) 54 | 55 | # --- (RC) 56 | # remove trials based on comprehension < 2/3 57 | # removed 111 trials 58 | result_analysis <- result_analysis[ ! 
result_analysis$correct_rate < .6,] 59 | 60 | # --- (LM1) 61 | # fit linear mixed model 62 | model <- lmer(log_speed ~ page_condition*dyslexia + img_width + num_words + age + english_native + (1 | uuid), 63 | data = result_analysis) 64 | print.odds = FALSE 65 | 66 | # --- (OLR1) 67 | # fit ordinal logistic regression using accuracy as DV 68 | model <- clmm(correct_num ~ page_condition*dyslexia + num_words + age + english_native + (1 | uuid), 69 | data=result_analysis) 70 | print.odds = TRUE 71 | 72 | # --- (LM2) 73 | # fit Bayesian model 74 | model <- brm(speed ~ page_condition*dyslexia + img_width + num_words + age + english_native + (1 | uuid), 75 | data = result_analysis, family = {{brmsfamily}}(), file = '../results/brmsfit_{{_n}}', 76 | save_all_pars = TRUE, silent = TRUE, refresh = 0, seed = 0, 77 | chains = 4, cores = 4, iter = 1000) 78 | 79 | # --- (OLR2) 80 | # fit Bayesian model to accuracy 81 | model <- brm(acc ~ page_condition*dyslexia + num_words + age + english_native + (1 | uuid), 82 | data = result_analysis, family = cumulative(), file = '../results/brmsfit_{{_n}}', 83 | save_all_pars = TRUE, silent = TRUE, refresh = 0, seed = 0, 84 | chains = 4, cores = 4, iter = 1000) 85 | 86 | # --- (O1) 87 | aic = AIC(model) 88 | sink('../results/summary_{{_n}}.txt') 89 | summary(model) 90 | 91 | if(print.odds){ 92 | print("Odds ratio:") 93 | exp(coef(model)) 94 | } 95 | 96 | # --- (O2) 97 | # evaluate fit 98 | aic = waic(model)$waic 99 | 100 | # output results 101 | sink('../results/summary_{{_n}}.txt') 102 | summary(model) 103 | sink() 104 | pdf(file="../results/plots_{{_n}}.pdf") 105 | plot(model) 106 | marginal_effects(model) 107 | dev.off() -------------------------------------------------------------------------------- /example/reading/python/template.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import statsmodels.formula.api as smf 6 | 7 | # --- (BOBA_CONFIG) 8 | { 9 | "graph": [ 10 | "B1->C", 11 | "B2->C", 12 | "C->D1->F1", 13 | "C->D2->F1", 14 | "C->F2" 15 | ], 16 | "decisions": [ 17 | {"var": "sample_size", "options": [1284, 2568]}, 18 | {"var": "rt", "options": ["adjust_rt", "rt"] }, 19 | {"var": "bad_device", "options": [[], ["smartphone"], ["smartphone", "tablet"]]}, 20 | {"var": "dyslexia", "options": ["dyslexia", "dyslexia_bin"] }, 21 | {"var": "min_wpm", "options": [10, 150] }, 22 | {"var": "formula", "options": [ 23 | "log_speed ~ page_condition*dyslexia", 24 | "log_speed ~ page_condition*dyslexia + img_width + num_words + age + english_native", 25 | "log_speed ~ page_condition*dyslexia + img_width + num_words + age + english_native + device + edu_level" 26 | ]} 27 | ], 28 | "before_execute": "cp ../../data.csv ./code/" 29 | } 30 | # --- (END) 31 | 32 | if __name__ == '__main__': 33 | # read data 34 | df = pd.read_csv('./data.csv') 35 | 36 | # take the first N participants to simulate stopping condition 37 | df = df[:{{sample_size}}] 38 | 39 | # calculate reading speed in WPM 40 | df['speed'] = df.apply(lambda row: row.num_words / row['{{rt}}'] * 60000, 41 | axis=1) 42 | 43 | # convert education level into an ordinal variable 44 | edu_order = ['pre-high school', 'high school', 'professional school', 45 | 'college', 'graduate school', 'PhD', 'postdoctoral'] 46 | tp = pd.CategoricalDtype(categories=edu_order, ordered=True) 47 | df['edu_level'] = df.education.astype(tp).cat.codes 48 | 49 | # remove retake participants 50 | df = df[df.retake != 1] 51 | 52 | # 
remove users of excluded devices 53 | df = df[~df.device.isin({{bad_device}})] 54 | 55 | # remove outliers based on reading speed 56 | # --- (B1) 57 | # remove reading speed outside median + 3 x iqr 58 | iqr = np.subtract(*np.percentile(df.speed, [75, 25])) 59 | cutoff_high = np.median(df.speed) + 3 * iqr 60 | 61 | # --- (B2) 62 | # remove reading speed outside mean + 2 x std 63 | cutoff_high = np.mean(df.speed) + 2 * np.std(df.speed) 64 | 65 | # --- (C) 66 | cutoff_low = {{min_wpm}} 67 | df = df[df.speed <= cutoff_high] 68 | df = df[df.speed >= cutoff_low] 69 | 70 | # drop NA rows 71 | df = df.dropna() 72 | 73 | # log-normalize speed 74 | df['log_speed'] = np.log(df.speed) 75 | 76 | # decision: whether to bin dyslexia or not 77 | df.dyslexia = df['{{dyslexia}}'] 78 | 79 | # make dyslexia a categorical variable 80 | df.dyslexia = df.dyslexia.astype('category') 81 | 82 | # remove trials based on comprehension < 2/3 83 | # --- (D1) 84 | # just remove trials 85 | df = df[df.correct_rate > 0.6] 86 | 87 | # --- (D2) 88 | # drop entire participants 89 | bad_uuid = set() 90 | for i, row in df.iterrows(): 91 | if row.correct_rate < 0.6: 92 | bad_uuid.add(str(row.uuid)) 93 | df = df[~df.uuid.isin(bad_uuid)] 94 | 95 | # --- (F1) 96 | # fit a linear mixed effects model 97 | fml = '{{formula}}' 98 | model = smf.mixedlm(fml, df, groups=df.uuid).fit() 99 | print(model.summary()) 100 | 101 | # --- (F2) 102 | # fit a multinomial logit model to accuracy 103 | df['acc'] = 3 - pd.Categorical(df.correct_rate).codes 104 | fml = 'acc ~ page_condition*dyslexia_bin' 105 | model = smf.mnlogit(fml, df).fit() 106 | print(model.summary()) 107 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ==== 2 | boba 3 | ==== 4 | 5 | Author and visualize multiverse analysis with ease. 6 | 7 | Boba has a domain-specific language (Boba DSL) for writing multiverse specifications, 8 | and a visual analysis interface (`Boba Visualizer`_) for exploring multiverse outcomes. 9 | Boba comes with a command line tool to parse your DSL specification and generate 10 | universe scripts, execute all scripts with a single command, 11 | merge outputs into a table, and invoke the visualizer. 12 | 13 | - works with both python and R, with more scripting languages to come 14 | - handles simple parameter substitution as well as complex code flow dependency 15 | - offers interactive visualizations for exploring consequential decisions, uncertainty, model fit, and more 16 | 17 | .. _Boba Visualizer: https://github.com/uwdata/boba-visualizer 18 | .. image:: https://badge.fury.io/py/boba.svg 19 | :target: https://badge.fury.io/py/boba 20 | .. image:: https://travis-ci.org/uwdata/boba.svg?branch=master 21 | :target: https://travis-ci.org/uwdata/boba 22 | .. image:: https://img.shields.io/badge/License-BSD%203--Clause-blue.svg 23 | :target: https://opensource.org/licenses/BSD-3-Clause 24 | .. image:: https://img.shields.io/pypi/pyversions/boba 25 | 26 | Installation 27 | ============ 28 | 29 | You can download and install the latest version of this software from the 30 | Python package index (PyPI):: 31 | 32 | pip install --upgrade boba 33 | pip install --upgrade boba-visualizer 34 | 35 | Usage 36 | ===== 37 | 38 | To author your multiverse, please refer to the specification rules_. 39 | Here is a `simple example`_ to get you started! 
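As a quick taste of the syntax, a minimal template declares decisions in a JSON block and references them with ``{{...}}`` placeholders::

    # --- (BOBA_CONFIG)
    {"decisions": [{"var": "cutoff", "options": [2, 2.5, 3]}]}
    # --- (END)
    df = df[df.y < {{cutoff}}]

Compiling this template yields one universe script per option, with ``{{cutoff}}`` replaced by 2, 2.5, and 3 in turn. The names here (``cutoff``, ``df.y``) are illustrative only; see the linked examples for real specifications.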
40 | 41 | 42 | To parse your specification and generate actual scripts, invoke boba and pass 43 | in the path to your template script and your JSON spec:: 44 | 45 | boba compile --script template.py 46 | 47 | To execute the multiverse, namely running all the generated scripts, use:: 48 | 49 | boba run --all 50 | 51 | To start the Boba Visualizer after getting the intermediate output files, use:: 52 | 53 | boba-server 54 | 55 | For more command line options, see `CLI`_. 56 | For more information about the Boba Visualizer, see this project_. 57 | 58 | .. _rules: https://github.com/uwdata/boba/blob/master/tutorial/rules.md 59 | .. _simple example: https://github.com/uwdata/boba/blob/master/tutorial/simple.md 60 | .. _more complex example: https://github.com/uwdata/boba/blob/master/tutorial/fertility.md 61 | .. _CLI: https://github.com/uwdata/boba/blob/master/tutorial/cli.rst 62 | .. _project: https://github.com/uwdata/boba-visualizer 63 | 64 | Examples 65 | ======== 66 | 67 | - A `simple example`_ to walk you through the basics 68 | - A `more complex example`_ using `Steegen's multiverse analysis`_ and `Durante's fertility dataset`_. 69 | - Another multiverse example_, based on the `specification curve paper`_ by Simonsohn et al. 70 | 71 | .. _reading speed dataset: https://github.com/QishengLi/CHI2019_Reader_View 72 | .. _analysis: https://github.com/uwdata/boba/tree/master/example/reading 73 | .. _example: https://github.com/uwdata/boba/tree/master/example/hurricane 74 | .. _specification curve paper: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2694998 75 | .. _Steegen's multiverse analysis: https://journals.sagepub.com/doi/pdf/10.1177/1745691616658637 76 | .. _Durante's fertility dataset: https://osf.io/zj68b/ 77 | 78 | Citation 79 | ======== 80 | 81 | If you are interested in this work, please see our research paper_ and consider citing our work:: 82 | 83 | @misc{liu2020boba, 84 | title={Boba: Authoring and visualizing multiverse analyses}, 85 | author={Yang Liu and Alex Kale and Tim Althoff and Jeffrey Heer}, 86 | year={2020}, 87 | eprint={2007.05551}, 88 | archivePrefix={arXiv}, 89 | primaryClass={cs.HC} 90 | } 91 | 92 | .. 
_paper: https://arxiv.org/abs/2007.05551 -------------------------------------------------------------------------------- /example/fertility/template.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import statsmodels.api as sm 6 | import statsmodels.formula.api as smf 7 | # --- (BOBA_CONFIG) 8 | { 9 | "graph": [ 10 | "NMO1->ECL1->A", 11 | "NMO2->ECL2->A", 12 | "NMO1->A", 13 | "NMO2->A", 14 | "A->B", 15 | "A->EC->B" 16 | ], 17 | "decisions": [ 18 | {"var": "fertility_bounds", "options": [ 19 | [[7, 14], [17, 25], [17, 25]], 20 | [[6, 14], [17, 27], [17, 27]], 21 | [[9, 17], [18, 25], [18, 25]], 22 | [[8, 14], [1, 7], [15, 28]], 23 | [[9, 17], [1, 8], [18, 28]] 24 | ]}, 25 | {"var": "relationship_bounds", 26 | "options": [[2, 3], [1, 2], [1, 3]]} 27 | ], 28 | "before_execute": "cp ../durante_etal_2013_study1.txt ./code/" 29 | } 30 | # --- (END) 31 | 32 | if __name__ == '__main__': 33 | # read data file 34 | df = pd.read_csv('durante_etal_2013_study1.txt', delimiter='\t') 35 | 36 | # remove NA 37 | df = df.dropna(subset=['rel1', 'rel2', 'rel3']) 38 | 39 | # create religiosity score 40 | df['rel_comp'] = np.around((df.rel1 + df.rel2 + df.rel3) / 3, decimals=2) 41 | 42 | # next menstrual onset (nmo) assessment 43 | df.last_period_start = pd.to_datetime(df.last_period_start) 44 | df.period_before_last_start = pd.to_datetime(df.period_before_last_start) 45 | df.date_testing = pd.to_datetime(df.date_testing) 46 | 47 | # --- (NMO1) 48 | # first nmo option: based on computed cycle length 49 | cl = df.last_period_start - df.period_before_last_start 50 | next_onset = df.last_period_start + cl 51 | df['computed_cycle_length'] = (cl / np.timedelta64(1, 'D')).astype(int) 52 | 53 | # --- (NMO2) 54 | # second nmo option: based on reported cycle length 55 | df = df.dropna(subset=['reported_cycle_length']) 56 | next_onset = df.last_period_start + df.reported_cycle_length.apply( 57 | lambda a: pd.Timedelta(days=a)) 58 | 59 | # --- (ECL1) 60 | # exclusion based on computed cycle length 61 | df = df[(df.computed_cycle_length >= 25) & (df.computed_cycle_length <= 35)] 62 | 63 | # --- (ECL2) 64 | # exclusion based on reported cycle length 65 | df = df[(df.reported_cycle_length >= 25) & (df.reported_cycle_length <= 35)] 66 | 67 | # --- (A) 68 | # compute cycle day 69 | df['cycle_day'] = pd.Timedelta('28 days') - (next_onset - df.date_testing) 70 | df.cycle_day = (df.cycle_day / np.timedelta64(1, 'D')).astype(int) 71 | df.cycle_day = np.clip(df.cycle_day, 1, 28) 72 | 73 | # fertility assessment 74 | high_bounds = {{fertility_bounds}}[0] 75 | low_bounds1 = {{fertility_bounds}}[1] 76 | low_bounds2 = {{fertility_bounds}}[2] 77 | df.loc[(high_bounds[0] <= df.cycle_day) & (df.cycle_day <= high_bounds[1]), 78 | 'fertility'] = 'High' 79 | df.loc[(low_bounds1[0] <= df.cycle_day) & (df.cycle_day <= low_bounds1[1]), 80 | 'fertility'] = 'Low' 81 | df.loc[(low_bounds2[0] <= df.cycle_day) & (df.cycle_day <= low_bounds2[1]), 82 | 'fertility'] = 'Low' 83 | 84 | # relationship status assessment 85 | # single = response options 1 and 2; relationship = response options 3 and 4 86 | df.loc[df.relationship <= {{relationship_bounds}}[0], 87 | 'relationship_status'] = 'Single' 88 | df.loc[df.relationship >= {{relationship_bounds}}[1], 89 | 'relationship_status'] = 'Relationship' 90 | 91 | # --- (EC) 92 | # exclusion based on certainty ratings 93 | df = df[(df.sure1 >= 6) & (df.sure2 >= 6)] 94 | 95 | # --- (B) 96 | # 
perform an ANOVA on the processed data set 97 | lm = smf.ols('rel_comp ~ relationship_status * fertility', data=df).fit() 98 | table = sm.stats.anova_lm(lm, typ=2) 99 | print(table) 100 | -------------------------------------------------------------------------------- /example/simple/data.csv: -------------------------------------------------------------------------------- 1 | x,y 2 | 4.583918298882086,13.079990304560285 3 | 0.30056854367327024,9.330556404593688 4 | 1.2698385703363135,13.470556335806807 5 | 2.4609400820343947,8.939990373347166 6 | 1.8866002121101615,13.861122367053326 7 | 0.32484042349493036,8.549424342100647 8 | 2.659870058057336,11.153192706623981 9 | 3.507132111497367,12.053098856383919 10 | 1.4381551638392343,10.828392583697664 11 | 3.931908974207081,11.808371158699313 12 | 1.3697559846527085,10.269450827872694 13 | 0.40406774336411655,10.242678602433266 14 | 1.3121428182320516,10.920125678505766 15 | 3.2788086733310413,11.493353333480904 16 | 2.935555345300246,11.34157864056587 17 | 0.17777224273899328,10.235834507052907 18 | 4.364812565880889,12.105686642360926 19 | 4.821388372116607,12.400326143190883 20 | 2.6717615513891824,11.144537234275228 21 | 0.4348374823413731,9.98469118257374 22 | 3.0964700345313814,11.21974014049093 23 | 1.7839201289999962,10.933488206767475 24 | 4.266818769719739,12.407242753086068 25 | 0.22052654472462352,10.59736596480809 26 | 0.6563787765526785,10.545771192487592 27 | 3.416610857323886,11.816243419817742 28 | 0.0802800394904396,9.826632245420347 29 | 3.661445614546677,11.638833441675093 30 | 1.684436444392035,10.927248777686911 31 | 2.007584105581906,10.864779349017715 32 | 3.0486016633164277,11.1478208385455 33 | 2.947939514747561,11.22047543782475 34 | 4.506414990794167,12.225569684786457 35 | 1.244018216385217,10.252740673398135 36 | 3.289282623798896,12.001629281830962 37 | 0.05067012803057991,10.013217827293415 38 | 1.6263180752938915,10.8378561530127 39 | 3.0231972920148027,11.76765653315177 40 | 2.829254821027227,11.420580285395145 41 | 3.7576233536426518,11.936002317485991 42 | 3.89420722703682,11.905725812894074 43 | 3.438191772991841,11.602232855572659 44 | 3.0895148411312547,11.661065677835904 45 | 3.7153200790225007,11.559953421078564 46 | 2.5880794649558654,11.39308245523173 47 | 0.9075578496290465,10.501341853452686 48 | 1.2871348832314604,10.612709938566729 49 | 2.150181020660253,11.08618852377759 50 | 4.227392912911492,12.160532976236091 51 | 4.347151369108863,12.10185461276485 52 | 1.2900203678061488,10.495187063648077 53 | 3.0507113334582687,11.70386024252094 54 | 4.472572013339909,12.442609294633272 55 | 0.27507220431418455,9.92961086741772 56 | 4.95255795141159,12.172037894638441 57 | 2.187690851944044,10.879864925834301 58 | 3.3030659113066942,11.557920082823808 59 | 4.453069421373162,12.193117922890561 60 | 2.561837519566263,11.536414504912928 61 | 0.8310782502405201,10.427354286565777 62 | 2.319393910459352,10.943113864300257 63 | 1.8495795260884962,11.044041724265039 64 | 4.369809683527437,12.316214488898776 65 | 4.7286559063267255,12.732221564219854 66 | 2.132724499371575,11.129810065327995 67 | 0.6316333581952194,10.013186502776936 68 | 1.7825762599908446,10.905235687487785 69 | 0.09938029432887963,10.017546152657244 70 | 1.9839363708410773,10.950687015270212 71 | 2.147324686723473,11.418973562688278 72 | 3.8727660667095263,11.754094323525093 73 | 4.6491124436662385,12.293169674588764 74 | 3.9636955957971582,11.731018926883527 75 | 1.2831687251881223,10.775362213956697 76 | 3.027460297930971,11.783257068467368 
77 | 2.4194658972220817,11.338930051249813 78 | 1.933243097657601,10.646858847439713 79 | 4.6500127533476086,12.519913675054616 80 | 0.9279426196129131,10.061009331065804 81 | 0.7088392759937251,10.211592165570067 82 | 1.6972816274117375,10.609295205879185 83 | 0.7291005720188815,10.146174335915244 84 | 1.5135446237931238,10.827942182777722 85 | 0.7863892515036303,10.440251364931768 86 | 0.7369591945983167,10.126055811688161 87 | 0.4484281422642783,10.238798585570631 88 | 0.6476846910638945,10.670529311857813 89 | 4.17949792663603,11.97731742079653 90 | 0.9426370844467991,10.284665332731757 91 | 0.3850137632566658,10.28472156012984 92 | 4.264099492071075,12.119492507525234 93 | 3.1938080241767333,11.728495313517408 94 | 3.0934454156259017,11.501534794005355 95 | 4.556323953281537,12.4798273798697 96 | 2.5318064635878756,11.483549596279689 97 | 4.510113468263804,12.130591650717289 98 | 2.667606009197513,11.20696909368087 99 | 4.197221426103427,12.406283640395031 100 | 0.6870158534517201,10.01897145287496 101 | 3.541509668147845,12.09678789495433 102 | -------------------------------------------------------------------------------- /example/simple_cont/data.csv: -------------------------------------------------------------------------------- 1 | x,y 2 | 4.583918298882086,13.079990304560285 3 | 0.30056854367327024,9.330556404593688 4 | 1.2698385703363135,13.470556335806807 5 | 2.4609400820343947,8.939990373347166 6 | 1.8866002121101615,13.861122367053326 7 | 0.32484042349493036,8.549424342100647 8 | 2.659870058057336,11.153192706623981 9 | 3.507132111497367,12.053098856383919 10 | 1.4381551638392343,10.828392583697664 11 | 3.931908974207081,11.808371158699313 12 | 1.3697559846527085,10.269450827872694 13 | 0.40406774336411655,10.242678602433266 14 | 1.3121428182320516,10.920125678505766 15 | 3.2788086733310413,11.493353333480904 16 | 2.935555345300246,11.34157864056587 17 | 0.17777224273899328,10.235834507052907 18 | 4.364812565880889,12.105686642360926 19 | 4.821388372116607,12.400326143190883 20 | 2.6717615513891824,11.144537234275228 21 | 0.4348374823413731,9.98469118257374 22 | 3.0964700345313814,11.21974014049093 23 | 1.7839201289999962,10.933488206767475 24 | 4.266818769719739,12.407242753086068 25 | 0.22052654472462352,10.59736596480809 26 | 0.6563787765526785,10.545771192487592 27 | 3.416610857323886,11.816243419817742 28 | 0.0802800394904396,9.826632245420347 29 | 3.661445614546677,11.638833441675093 30 | 1.684436444392035,10.927248777686911 31 | 2.007584105581906,10.864779349017715 32 | 3.0486016633164277,11.1478208385455 33 | 2.947939514747561,11.22047543782475 34 | 4.506414990794167,12.225569684786457 35 | 1.244018216385217,10.252740673398135 36 | 3.289282623798896,12.001629281830962 37 | 0.05067012803057991,10.013217827293415 38 | 1.6263180752938915,10.8378561530127 39 | 3.0231972920148027,11.76765653315177 40 | 2.829254821027227,11.420580285395145 41 | 3.7576233536426518,11.936002317485991 42 | 3.89420722703682,11.905725812894074 43 | 3.438191772991841,11.602232855572659 44 | 3.0895148411312547,11.661065677835904 45 | 3.7153200790225007,11.559953421078564 46 | 2.5880794649558654,11.39308245523173 47 | 0.9075578496290465,10.501341853452686 48 | 1.2871348832314604,10.612709938566729 49 | 2.150181020660253,11.08618852377759 50 | 4.227392912911492,12.160532976236091 51 | 4.347151369108863,12.10185461276485 52 | 1.2900203678061488,10.495187063648077 53 | 3.0507113334582687,11.70386024252094 54 | 4.472572013339909,12.442609294633272 55 | 0.27507220431418455,9.92961086741772 56 | 
4.95255795141159,12.172037894638441 56 | 2.187690851944044,10.879864925834301 57 | 3.3030659113066942,11.557920082823808 58 | 4.453069421373162,12.193117922890561 59 | 2.561837519566263,11.536414504912928 60 | 0.8310782502405201,10.427354286565777 61 | 2.319393910459352,10.943113864300257 62 | 1.8495795260884962,11.044041724265039 63 | 4.369809683527437,12.316214488898776 64 | 4.7286559063267255,12.732221564219854 65 | 2.132724499371575,11.129810065327995 66 | 0.6316333581952194,10.013186502776936 67 | 1.7825762599908446,10.905235687487785 68 | 0.09938029432887963,10.017546152657244 69 | 1.9839363708410773,10.950687015270212 70 | 2.147324686723473,11.418973562688278 71 | 3.8727660667095263,11.754094323525093 72 | 4.6491124436662385,12.293169674588764 73 | 3.9636955957971582,11.731018926883527 74 | 1.2831687251881223,10.775362213956697 75 | 3.027460297930971,11.783257068467368 76 | 2.4194658972220817,11.338930051249813 77 | 1.933243097657601,10.646858847439713 78 | 4.6500127533476086,12.519913675054616 79 | 0.9279426196129131,10.061009331065804 80 | 0.7088392759937251,10.211592165570067 81 | 1.6972816274117375,10.609295205879185 82 | 0.7291005720188815,10.146174335915244 83 | 1.5135446237931238,10.827942182777722 84 | 0.7863892515036303,10.440251364931768 85 | 0.7369591945983167,10.126055811688161 86 | 0.4484281422642783,10.238798585570631 87 | 0.6476846910638945,10.670529311857813 88 | 4.17949792663603,11.97731742079653 89 | 0.9426370844467991,10.284665332731757 90 | 0.3850137632566658,10.28472156012984 91 | 4.264099492071075,12.119492507525234 92 | 3.1938080241767333,11.728495313517408 93 | 3.0934454156259017,11.501534794005355 94 | 4.556323953281537,12.4798273798697 95 | 2.5318064635878756,11.483549596279689 96 | 4.510113468263804,12.130591650717289 97 | 2.667606009197513,11.20696909368087 98 | 4.197221426103427,12.406283640395031 99 | 0.6870158534517201,10.01897145287496 100 | 3.541509668147845,12.09678789495433 101 | -------------------------------------------------------------------------------- /boba/cli.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Console script.""" 4 | import click 5 | import shutil 6 | import os 7 | import pandas as pd 8 | from .parser import Parser 9 | from .output.csvmerger import CSVMerger 10 | from .bobarun import BobaRun 11 | 12 | 13 | @click.command() 14 | @click.option('--script', '-s', help='Path to template script', 15 | default='./template.py', show_default=True) 16 | @click.option('--out', help='Output directory', 17 | default='.', show_default=True) 18 | @click.option('--lang', help='Language, can be python/R [default: inferred from file extension]', 19 | default=None) 20 | def compile(script, out, lang): 21 | """Generate multiverse analysis from specifications.""" 22 | 23 | check_path(script) 24 | 25 | click.echo('Creating multiverse from {}'.format(script)) 26 | ps = Parser(script, out, lang) 27 | ps.main() 28 | 29 | ex = """To execute the multiverse, run the following commands: 30 | cd {} && boba run --all 31 | """.format(os.path.join(out, 'multiverse')) 32 | click.secho('Success!', fg='green') 33 | click.secho(ex, fg='green') 34 | 35 | 36 | def check_path(p): 37 | """Check if the path exists""" 38 | if not os.path.exists(p): 39 | msg = 'Error: Path "{}" does not exist.'.format(p) 40 | print_help(msg) 41 | 42 | 43 | def print_help(err=''): 44 | """Show help message and exit.""" 45 | ctx = click.get_current_context() 46 | click.echo(ctx.get_help()) 47 | 48 | if err: 49 | 
click.echo('\n' + err) 50 | ctx.exit() 51 | 52 | 53 | @click.command() 54 | @click.argument('num', nargs=1, default=-1) 55 | @click.option('--all', '-a', 'run_all', is_flag=True, 56 | help='Execute all universes') 57 | @click.option('--thru', default=-1, help='Run until this universe number') 58 | @click.option('--jobs', default=1, help='The number of universes that can be running at a time.') 59 | @click.option('--batch_size', default=0, help='The approximate number of universes a processor will run in a row.') 60 | @click.option('--dir', 'folder', help='Multiverse directory', 61 | default='./multiverse', show_default=True) 62 | def run(folder, run_all, num, thru, jobs, batch_size): 63 | """ Execute the generated universe scripts. 64 | 65 | Run all universes: boba run --all 66 | 67 | Run a single universe, for example universe_1: boba run 1 68 | 69 | Run a range of universes for example 1 through 5: boba run 1 --thru 5 70 | """ 71 | 72 | check_path(folder) 73 | 74 | df = pd.read_csv(folder + '/summary.csv') 75 | num_universes = df.shape[0] 76 | 77 | if not run_all: 78 | if thru == -1: 79 | thru = num 80 | if num < 1: 81 | print_help() 82 | if thru < num: 83 | print_help('The thru parameter cannot be less than the num parameter.') 84 | if num > num_universes or thru > num_universes: 85 | print_help(f'There are only {num_universes} universes.') 86 | 87 | br = BobaRun(folder, jobs, batch_size) 88 | br.run_from_cli(run_all, num, thru) 89 | 90 | 91 | @click.command() 92 | @click.argument('pattern', nargs=1) 93 | @click.option('--base', '-b', default='./multiverse/results', 94 | show_default=True, help='Folder containing the universe outputs') 95 | @click.option('--out', default='./multiverse/merged.csv', 96 | show_default=True, help='Name of the merged file') 97 | @click.option('--delimiter', default=',', show_default=True, 98 | help='CSV delimiter') 99 | def merge(pattern, base, out, delimiter): 100 | """ 101 | Merge CSV outputs from individual universes into one file. 
102 | 103 | Required argument: 104 | the filename pattern of individual outputs where the universe id is 105 | replaced by {}, for example output_{}.csv 106 | """ 107 | 108 | check_path(base) 109 | CSVMerger(pattern, base, out, delimiter).main() 110 | 111 | 112 | @click.group() 113 | @click.version_option() 114 | def main(): 115 | pass 116 | 117 | 118 | main.add_command(compile) 119 | main.add_command(run) 120 | main.add_command(merge) 121 | 122 | if __name__ == "__main__": 123 | main() 124 | -------------------------------------------------------------------------------- /test/test_block_syntax_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Ugly hack to allow import from the root folder 4 | import sys 5 | import os 6 | sys.path.insert(0, os.path.abspath('..')) 7 | 8 | import unittest 9 | from boba.blocksyntaxparser import BlockSyntaxParser, ParseError 10 | 11 | 12 | class TestBlockParser(unittest.TestCase): 13 | 14 | def test_steps(self): 15 | line = '# --- (A) remove_outlier' 16 | self.assertTrue(BlockSyntaxParser.can_parse(line)) 17 | bp = BlockSyntaxParser(line) 18 | self.assertEqual(bp.i, 0) 19 | bp._read_next() 20 | self.assertEqual(bp.i, 5) 21 | bp._read_next() 22 | self.assertEqual(bp.i, 9) 23 | bp._read_next() 24 | self.assertEqual(bp.i, len(line)) 25 | bp._read_next() 26 | self.assertEqual(bp.i, len(line)) 27 | 28 | def test_can_parse(self): 29 | self.assertTrue(BlockSyntaxParser.can_parse(' # --- comment')) 30 | self.assertTrue(BlockSyntaxParser.can_parse(' # ---comment ')) 31 | self.assertFalse(BlockSyntaxParser.can_parse('#--- comment')) 32 | self.assertFalse(BlockSyntaxParser.can_parse('# --')) 33 | 34 | def test_syntax(self): 35 | line = '# --- (A) remove_outlier' 36 | self.assertTrue(BlockSyntaxParser.can_parse(line)) 37 | bid, par, opt, cond = BlockSyntaxParser(line).parse() 38 | self.assertEqual(bid, 'A:remove_outlier') 39 | self.assertEqual(par, 'A') 40 | self.assertEqual(opt, 'remove_outlier') 41 | 42 | line = '# --- (A) remove outlier' 43 | with self.assertRaises(ParseError): 44 | BlockSyntaxParser(line).parse() 45 | 46 | line = '# --- ((A)) name' 47 | with self.assertRaises(ParseError): 48 | BlockSyntaxParser(line).parse() 49 | 50 | line = '# --- ( A)' 51 | bid, par, opt, cond = BlockSyntaxParser(line).parse() 52 | self.assertEqual(bid, 'A') 53 | self.assertEqual(par, '') 54 | self.assertEqual(opt, '') 55 | 56 | def test_condition(self): 57 | line = '# --- (A) a1 @if B == b1' 58 | bid, par, opt, cond = BlockSyntaxParser(line).parse() 59 | self.assertEqual(bid, 'A:a1') 60 | self.assertEqual(par, 'A') 61 | self.assertEqual(opt, 'a1') 62 | self.assertEqual(cond['block'], 'A') 63 | self.assertEqual(cond['option'], 'a1') 64 | self.assertEqual(cond['condition'], 'B == b1') 65 | 66 | line = '# --- (A) @if B == b1' 67 | bid, par, opt, cond = BlockSyntaxParser(line).parse() 68 | self.assertEqual(cond['block'], 'A') 69 | self.assertEqual(cond['condition'], 'B == b1') 70 | self.assertNotIn('option', cond) 71 | 72 | line = '# --- (A) remove outlier @if B == b1' 73 | with self.assertRaises(ParseError): 74 | BlockSyntaxParser(line).parse() 75 | 76 | def test_whitespace(self): 77 | line = '\t\t# --- (A) name' 78 | bid, par, opt, cond = BlockSyntaxParser(line).parse() 79 | self.assertEqual(par, 'A') 80 | self.assertEqual(opt, 'name') 81 | 82 | line = ' # --- (A) name \t' 83 | bid, par, opt, cond = BlockSyntaxParser(line).parse() 84 | self.assertEqual(par, 'A') 85 | self.assertEqual(opt, 'name') 86 | 87 
| line = '# ---(A)socrowded' 88 | bid, par, opt, cond = BlockSyntaxParser(line).parse() 89 | self.assertEqual(par, 'A') 90 | self.assertEqual(opt, 'socrowded') 91 | 92 | def test_id_syntax(self): 93 | line = '# --- (C1)' 94 | bid, par, opt, cond = BlockSyntaxParser(line).parse() 95 | self.assertEqual(bid, 'C1') 96 | 97 | line = '# --- (aXa) ' 98 | bid, par, opt, cond = BlockSyntaxParser(line).parse() 99 | self.assertEqual(bid, 'aXa') 100 | 101 | line = '# --- (my_variable) \t' 102 | bid, par, opt, cond = BlockSyntaxParser(line).parse() 103 | self.assertEqual(bid, 'my_variable') 104 | 105 | # ID must start with a letter 106 | line = '# --- (12)' 107 | with self.assertRaisesRegex(ParseError, '(?i)invalid identifier'): 108 | BlockSyntaxParser(line).parse() 109 | 110 | line = '# --- (_start)' 111 | with self.assertRaisesRegex(ParseError, '(?i)invalid identifier'): 112 | BlockSyntaxParser(line).parse() 113 | 114 | 115 | if __name__ == '__main__': 116 | unittest.main() 117 | -------------------------------------------------------------------------------- /boba/blocksyntaxparser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .baseparser import BaseParser, ParseError 4 | 5 | kw = '# ---' 6 | 7 | 8 | class BlockSyntaxParser(BaseParser): 9 | """ 10 | Parse the metadata of a code block, which must have the structure: 11 | # --- (ID) option @if condition 12 | option is optional, but including it will mark the block as a parameter. 13 | @if is optional and it creates a procedural dependency constraint on this 14 | block and this option (if any). 15 | """ 16 | 17 | def __init__(self, line): 18 | super(BlockSyntaxParser, self).__init__(line) 19 | 20 | self.state = 0 21 | self.parsed_id = '' 22 | self.parsed_parameter = '' 23 | self.parsed_option = '' 24 | self.parsed_condition = '' 25 | 26 | @staticmethod 27 | def can_parse(line): 28 | return line.lstrip().startswith(kw) 29 | 30 | @staticmethod 31 | def _is_operator_start(ch): 32 | return ch == '@' 33 | 34 | @staticmethod 35 | def _is_condition(word): 36 | return word == 'if' 37 | 38 | def parse(self): 39 | while not self._is_end(): 40 | self._read_next() 41 | return self.parsed_id, self.parsed_parameter, self.parsed_option,\ 42 | self.parsed_condition 43 | 44 | def _read_next(self): 45 | self._read_while(BlockSyntaxParser._is_whitespace) 46 | if self._is_end(): 47 | return 48 | 49 | if self.state == 0: 50 | self._read_kw() 51 | elif self.state == 1: 52 | self._read_id() 53 | elif self.state == 2: 54 | self._maybe_read_option() 55 | elif self.state == 3: 56 | self._read_condition() 57 | else: 58 | # we've read anything we can handle but haven't reached the end 59 | s = self._remaining().strip() 60 | self._throw('Cannot handle "{}"'.format(s)) 61 | 62 | def _end(self): 63 | self.i = len(self.line) # stop parsing 64 | 65 | def _throw(self, msg): 66 | err = 'At character {} of "{}":\n\t{}'.format(self.i+1, self.line, msg) 67 | raise ParseError(err) 68 | 69 | def _remaining(self): 70 | return self.line[self.i:] 71 | 72 | def _read_kw(self): 73 | if self._remaining().startswith(kw): 74 | self.i += len(kw) 75 | self.state += 1 76 | else: 77 | self._throw('expected {}'.format(kw)) 78 | 79 | def _read_id(self): 80 | """ Read the thing inside the parenthesis. 
""" 81 | # open paren 82 | if self._peek_char() != '(': 83 | self._throw('Cannot find "("') 84 | self._next_char() 85 | self._read_while(self._is_whitespace) 86 | 87 | # read the actual identifier 88 | ch = self._peek_char() 89 | if not self._is_id_start(ch): 90 | self._throw('Invalid identifier start character {}'.format(ch)) 91 | 92 | self.parsed_id = self._read_while(self._is_id) 93 | self._read_while(self._is_whitespace) 94 | 95 | # close paren 96 | if self._peek_char() != ')': 97 | self._throw('Cannot find ")"') 98 | self._next_char() 99 | self.state += 1 100 | 101 | def _maybe_read_option(self): 102 | """ Read the option, if there is any. """ 103 | self._read_while(self._is_whitespace) 104 | 105 | # check if the next word is maybe an option 106 | if not self._is_id_start(self._peek_char()): 107 | self.state += 1 108 | return 109 | 110 | # option follows the same naming convention as ID 111 | opt = self._read_while(self._is_id) 112 | if opt != '': 113 | self.parsed_parameter = self.parsed_id 114 | self.parsed_option = opt 115 | self.parsed_id += ':' + self.parsed_option 116 | 117 | self._read_while(self._is_whitespace) 118 | self.state += 1 119 | 120 | def _read_condition(self): 121 | """ Read condition. """ 122 | self._read_while(self._is_whitespace) 123 | 124 | # check if the next char is indeed an operator 125 | if not BlockSyntaxParser._is_operator_start(self._peek_char()): 126 | self.state += 1 127 | return 128 | 129 | # read @if 130 | self._next_char() 131 | w = self._read_while(self._is_id) 132 | if not BlockSyntaxParser._is_condition(w): 133 | self._throw('Cannot handle @{}'.format(w)) 134 | 135 | # read whatever remains as the condition 136 | s = self._remaining().strip() 137 | self._end() 138 | self.state += 1 139 | 140 | # construct the condition 141 | bl = self.parsed_parameter if self.parsed_option else self.parsed_id 142 | self.parsed_condition = {'block': bl, 'condition': s} 143 | if self.parsed_option: 144 | self.parsed_condition['option'] = self.parsed_option 145 | -------------------------------------------------------------------------------- /example/hurricane/data.csv: -------------------------------------------------------------------------------- 1 | Year,Name,MasFem,MinPressure_before,Minpressure_Updated_2014,Gender_MF,Category,alldeaths,HighestWindSpeed,NDAM,Elapsed Yrs,Source 2 | 1950,Easy,5.40625,958,960,0,3,2,125,2380,63,MWR 3 | 1950,King,1.59375,955,955,0,4,4,134,7220,63,MWR 4 | 1952,Able,2.96875,985,985,0,1,3,125,210,61,MWR 5 | 1953,Barbara,8.625,987,987,1,1,1,75,78,60,MWR 6 | 1953,Florence,7.875,985,985,1,1,0,115,21,60,MWR 7 | 1954,Carol,8.53125,960,960,1,3,60,115,24962,59,MWR 8 | 1954,Edna,7.625,954,954,1,3,20,125,4010,59,MWR 9 | 1954,Hazel,8.21875,938,938,1,4,20,134,36450,59,MWR 10 | 1955,Connie,8.0,962,962,1,3,0,137,2710,58,MWR 11 | 1955,Diane,8.875,987,987,1,1,200,103,52990,58,MWR 12 | 1955,Ione,6.21875,960,960,1,3,7,140,8410,58,MWR 13 | 1956,Flossy,7.21875,975,975,1,2,15,90,2060,57,MWR 14 | 1957,Audrey,8.59375,946,946,1,3,416,127,4750,56,wiki 15 | 1958,Helene,8.8125,946,946,1,3,1,150,778,55,MWR 16 | 1959,Debra,8.4375,984,984,1,1,0,85,620,54,MWR 17 | 1959,Gracie,9.53125,950,950,1,3,22,140,710,54,MWR 18 | 1960,Donna,8.875,930,930,1,4,50,143,78260,53,http://www.nhc.noaa.gov/pdf/NWS-TPC-5.pdf 19 | 1960,Ethel,7.0625,981,981,1,1,0,115,45,53,MWR 20 | 1961,Carla,8.625,931,931,1,4,46,174,22270,52,MWR 21 | 1963,Cindy,8.9375,996,996,1,1,3,80,410,50,MWR 22 | 1964,Cleo,7.125,968,968,1,2,3,155,8750,49,MWR 23 | 1964,Dora,8.28125,966,966,1,2,5,134,22720,49,MWR 24 | 
1964,Hilda,7.65625,950,950,1,3,37,150,3620,49,MWR 25 | 1964,Isbell,8.3125,974,974,1,2,3,125,1120,49,MWR 26 | 1965,Betsy,9.0,948,948,1,3,75,155,21250,48,MWR 27 | 1966,Alma,7.15625,982,982,1,2,6,125,1120,47,MWR 28 | 1966,Inez,6.46875,983,983,1,1,3,150,120,47,MWR 29 | 1967,Beulah,6.34375,950,950,1,3,15,162,7010,46,MWR 30 | 1968,Gladys,7.65625,977,977,1,2,3,145,1170,45,MWR 31 | 1969,Camille,8.875,909,909,1,5,256,174,28520,44,MWR 32 | 1970,Celia,9.21875,945,945,1,3,22,127,9050,43,WIKI (http://en.wikipedia.org/wiki/Hurricane_Celia) 33 | 1971,Edith,7.65625,978,978,1,2,0,160,380,42,MWR 34 | 1971,Fern,6.71875,979,979,1,1,2,90,690,42,MWR 35 | 1971,Ginger,8.65625,995,995,1,1,0,110,270,42,MWR 36 | 1972,Agnes,7.03125,980,980,1,1,117,87,26440,41,MWR 37 | 1974,Carmen,7.78125,952,952,1,3,1,149,1530,39,MWR 38 | 1975,Eloise,8.4375,955,955,1,3,21,125,8500,38,MWR 39 | 1976,Belle,9.78125,980,980,1,1,5,120,720,37,MWR 40 | 1977,Babe,7.90625,995,995,1,1,0,75,88,36,MWR 41 | 1979,Bob,1.71875,986,986,0,1,1,75,90,34,MWR 42 | 1979,David,1.5625,970,970,0,2,15,174,3840,34,MWR 43 | 1979,Frederic,2.0625,946,946,0,3,5,134,17170,34,MWR 44 | 1980,Allen,2.03125,945,945,0,3,2,190,3040,33,MWR 45 | 1983,Alicia,9.34375,962,962,1,3,21,115,22330,30,MWR 46 | 1984,Diana,9.21875,949,949,1,2,3,134,620,29,MWR 47 | 1985,Bob,1.71875,1002,1003,0,1,0,75,190,28,MWR 48 | 1985,Danny,2.90625,987,987,0,1,1,90,200,28,MWR 49 | 1985,Elena,8.71875,959,959,1,3,4,127,5360,28,MWR 50 | 1985,Gloria,9.1875,942,942,1,3,8,143,3920,28,MWR 51 | 1985,Juan,1.875,971,971,0,1,12,103,6140,28,MWR 52 | 1985,Kate,9.1875,967,967,1,2,5,121,1800,28,MWR 53 | 1986,Bonnie,9.25,990,990,1,1,3,115,7,27,MWR 54 | 1986,Charley,4.0,990,990,0,1,5,149,79,27,MWR 55 | 1987,Floyd,2.40625,993,993,0,1,0,75,1,26,MWR 56 | 1988,Florence,7.875,984,984,1,1,1,80,2,25,MWR 57 | 1989,Chantal,8.3125,986,986,1,1,13,80,390,24,MWR 58 | 1989,Hugo,2.0,934,934,0,4,21,162,27430,24,MWR 59 | 1989,Jerry,2.78125,983,983,0,1,3,85,320,24,MWR 60 | 1991,Bob,1.71875,962,962,0,2,15,115,4690,22,MWR 61 | 1992,Andrew,1.78125,922,922,0,5,62,174,90250,21,MWR 62 | 1993,Emily,10.03125,960,961,1,3,3,162,130,20,MWR 63 | 1995,Erin,7.125,973,973,1,2,6,100,2240,18,MWR 64 | 1995,Opal,7.65625,942,942,1,3,9,149,16510,18,MWR 65 | 1996,Bertha,7.375,974,974,1,2,8,115,1020,17,MWR 66 | 1996,Fran,7.0625,954,954,1,3,26,121,18930,17,MWR 67 | 1997,Danny,2.90625,984,984,0,1,10,80,270,16,MWR 68 | 1998,Bonnie,9.25,964,964,1,2,3,115,2410,15,MWR 69 | 1998,Earl,1.875,987,987,0,1,3,87,220,15,MWR 70 | 1998,Georges,3.375,964,964,0,2,1,155,4860,15,MWR 71 | 1999,Bret,2.90625,951,951,0,3,0,145,120,14,MWR 72 | 1999,Floyd,2.40625,956,956,0,2,56,155,16030,14,MWR 73 | 1999,Irene,8.5625,987,964,1,1,8,109,1940,14,MWR 74 | 2002,Lili,9.59375,963,963,1,1,2,143,1610,11,MWR 75 | 2003,Claudette,8.71875,979,979,1,1,3,90,330,10,MWR 76 | 2003,Isabel,9.625,957,957,1,2,51,168,11010,10,MWR 77 | 2004,Alex,4.0625,972,972,0,1,1,120,7,9,MWR 78 | 2004,Charley,4.0,941,941,0,4,10,149,37180,9,MWR 79 | 2004,Frances,6.03125,960,960,1,2,7,143,19990,9,MWR 80 | 2004,Gaston,2.40625,985,985,0,1,8,75,240,9,MWR 81 | 2004,Ivan,2.09375,946,946,0,3,25,168,36910,9,MWR 82 | 2004,Jeanne,7.90625,950,950,1,3,5,121,16800,9,MWR 83 | 2005,Cindy,8.9375,991,991,1,1,1,75,420,8,MWR 84 | 2005,Dennis,2.0,946,946,0,3,15,149,3930,8,MWR 85 | 2005,Katrina,9.46875,902,902,1,3,1833,174,148240,8,MWR 86 | 2005,Ophelia,9.125,982,982,1,1,1,115,130,8,MWR 87 | 2005,Rita,8.4375,937,937,1,3,62,177,23110,8,MWR 88 | 2005,Wilma,8.375,950,950,1,3,5,183,33410,8,MWR 89 | 2007,Humberto,2.21875,985,985,0,1,1,85,63,6,MWR 
90 | 2008,Dolly,10.0625,963,967,1,1,1,99,1940,5,MWR 91 | 2008,Gustav,2.09375,951,954,0,2,52,155,7900,5,MWR 92 | 2008,Ike,2.21875,935,950,0,2,84,143,44260,5,MWR 93 | 2011,Irene,8.5625,952,952,1,1,41,121,17160,2,MWR 94 | 2012,Isaac,2.0,965,966,0,1,5,81,3430,1,MWR 95 | 2012,Sandy,7.9375,945,942,1,2,159,115,80090,1,MWR 96 | -------------------------------------------------------------------------------- /test/test_graph_analyzer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Ugly hack to allow import from the root folder 4 | import sys 5 | import os 6 | sys.path.insert(0, os.path.abspath('..')) 7 | 8 | import unittest 9 | from boba.graphparser import GraphParser 10 | from boba.graphanalyzer import GraphAnalyzer, InvalidGraphError 11 | 12 | 13 | class TestGraphAnalyzer(unittest.TestCase): 14 | 15 | def cp_helper(self, spec, expected): 16 | nodes, edges = GraphParser(spec).parse() 17 | ga = GraphAnalyzer(nodes, edges) 18 | ga._construct_paths() 19 | expected = set([','.join(p) for p in expected]) 20 | actual = set([','.join(p) for p in ga.paths]) 21 | self.assertSetEqual(actual, expected) 22 | 23 | def test_construct_paths(self): 24 | # normal 25 | spec = ['a->b->c', 'b->d'] 26 | expected = [['a', 'b', 'c'], ['a', 'b', 'd']] 27 | self.cp_helper(spec, expected) 28 | 29 | # single node 30 | spec = ['a'] 31 | expected = [['a']] 32 | self.cp_helper(spec, expected) 33 | 34 | # multiple sources and targets 35 | spec = ['a->b->c', 'a2->b->c2'] 36 | expected = [['a', 'b', 'c'], ['a2', 'b', 'c2'], ['a', 'b', 'c2'], 37 | ['a2', 'b', 'c']] 38 | self.cp_helper(spec, expected) 39 | 40 | # disconnected 41 | spec = ['a->b->c', 'e->f'] 42 | expected = [['a', 'b', 'c'], ['e', 'f']] 43 | self.cp_helper(spec, expected) 44 | 45 | # cyclic 46 | spec = ['a->b->c->a'] 47 | nodes, edges = GraphParser(spec).parse() 48 | ga = GraphAnalyzer(nodes, edges) 49 | with self.assertRaises(InvalidGraphError): 50 | ga._construct_paths() 51 | 52 | def source_helper(self, spec, exp_source, exp_target): 53 | nodes, edges = GraphParser(spec).parse() 54 | ga = GraphAnalyzer(nodes, edges) 55 | self.assertSetEqual(ga._get_source(), exp_source) 56 | self.assertSetEqual(ga._get_target(), exp_target) 57 | 58 | def test_get_source_and_target(self): 59 | # normal 60 | spec = ['a->b->d', 'a->b->c'] 61 | source = {'a'} 62 | target = {'d', 'c'} 63 | self.source_helper(spec, source, target) 64 | 65 | # disconnected 66 | spec = ['a->b->d', 'c->e'] 67 | source = {'a', 'c'} 68 | target = {'d', 'e'} 69 | self.source_helper(spec, source, target) 70 | 71 | # a single node 72 | spec = ['a'] 73 | source = {'a'} 74 | target = {'a'} 75 | self.source_helper(spec, source, target) 76 | 77 | # cyclic 78 | spec = ['a->b->c->d->a'] 79 | source = set() 80 | target = set() 81 | self.source_helper(spec, source, target) 82 | 83 | # complex 84 | spec = ['a->b->d', 'a->c->b->d', 'c->a->d'] 85 | source = set() 86 | target = {'d'} 87 | self.source_helper(spec, source, target) 88 | 89 | def path_helper(self, spec, s, t, expected): 90 | nodes, edges = GraphParser(spec).parse() 91 | ga = GraphAnalyzer(nodes, edges) 92 | ga._all_paths(s, t) 93 | expected = set([','.join(p) for p in expected]) 94 | actual = set([','.join(p) for p in ga.paths]) 95 | self.assertSetEqual(actual, expected) 96 | 97 | def test_get_path(self): 98 | """ test if the program correctly gets all paths from s to t""" 99 | 100 | spec = ['a->b->c', 'b->d', 'e'] 101 | start = 'a' 102 | stop = 'e' 103 | expected = [] 104 | 
self.path_helper(spec, start, stop, expected) 105 | 106 | spec = ['a->b->c', 'b->d', 'e'] 107 | start = 'a' 108 | stop = 'c' 109 | expected = [['a', 'b', 'c']] 110 | self.path_helper(spec, start, stop, expected) 111 | 112 | spec = ['a->b->c', 'b->d', 'e'] 113 | start = 'b' 114 | stop = 'd' 115 | expected = [['b', 'd']] 116 | self.path_helper(spec, start, stop, expected) 117 | 118 | # a single node 119 | spec = ['a'] 120 | start = 'a' 121 | stop = 'a' 122 | expected = [['a']] 123 | self.path_helper(spec, start, stop, expected) 124 | 125 | # graph with a merged branch 126 | spec = ['a->b->c', 'b->d', 'c->e d->e'] 127 | start = 'a' 128 | stop = 'e' 129 | expected = [['a', 'b', 'c', 'e'], ['a', 'b', 'd', 'e']] 130 | self.path_helper(spec, start, stop, expected) 131 | 132 | # graph with a cycle 133 | spec = ['a->b->c->b', 'b->d', 'e'] 134 | start = 'a' 135 | stop = 'c' 136 | expected = [['a', 'b', 'c']] 137 | self.path_helper(spec, start, stop, expected) 138 | 139 | # a complicated graph 140 | spec = ['a->b->d', 'a->c->b->d', 'c->a->d'] 141 | start = 'c' 142 | stop = 'd' 143 | expected = [['c', 'b', 'd'], ['c', 'a', 'd'], ['c', 'a', 'b', 'd']] 144 | self.path_helper(spec, start, stop, expected) 145 | 146 | 147 | if __name__ == '__main__': 148 | unittest.main() 149 | -------------------------------------------------------------------------------- /example/hurricane/reproduce/repro_marginalize.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Replicate prior work's results using their marginalization approach 3 | # --- (BOBA_CONFIG) 4 | { 5 | "decisions": [ 6 | {"var": "outliers", "options": [ 7 | "c()", 8 | "c('Katrina')", 9 | "c('Katrina', 'Audrey')" 10 | ]}, 11 | {"var": "leverage_points", "options": [ 12 | "c()", 13 | "c('Sandy')", 14 | "c('Sandy', 'Andrew')", 15 | "c('Sandy', 'Andrew', 'Donna')" 16 | ]}, 17 | {"var": "feminity", "options": ["female", "masfem"]}, 18 | {"var": "feminity_prediction_levels", "options": ["c(0, 1)", "c(2.53, 8.29)"]}, 19 | {"var": "damage", "options": ["dam", "log_dam"]}, 20 | {"var": "predictors", "options": [ 21 | "feminity * damage", 22 | "feminity + damage + pressure + feminity:damage + feminity:pressure", 23 | "feminity + damage + zwin + feminity:damage + feminity:zwin", 24 | "feminity + damage + zcat + feminity:damage + feminity:zcat", 25 | "feminity + damage + z3 + feminity:damage + feminity:z3", 26 | "feminity + damage + z3" 27 | ]}, 28 | {"var": "covariates", "options": [ 29 | "", 30 | "+ year:damage", 31 | "+ post:damage" 32 | ]}, 33 | {"var": "predictor_list", "options": [ 34 | "damage", 35 | "damage, pressure", 36 | "damage, zwin", 37 | "damage, zcat", 38 | "damage, z3", 39 | "damage, z3" 40 | ]}, 41 | {"var": "covariate_list", "options": [ 42 | "", 43 | ", year", 44 | ", post" 45 | ]}, 46 | {"var": "back_transform", "options": [ 47 | "exp(mu + sigma^2/2) - 1", 48 | "mu" 49 | ]}, 50 | {"var": "df", "options": [ 51 | "inference$df", 52 | "df.residual(model)" 53 | ]} 54 | ], 55 | "constraints": [ 56 | {"link": ["feminity", "feminity_prediction_levels"]}, 57 | {"link": ["Model", "back_transform", "df"]}, 58 | {"link": ["predictors", "predictor_list"]}, 59 | {"link": ["covariates", "covariate_list"]} 60 | ], 61 | "before_execute": "cp ../data.csv ./ && rm -rf results && mkdir results" 62 | } 63 | # --- (END) 64 | 65 | library(readr) 66 | library(MASS) 67 | library(modelr) 68 | library(tidyverse) 69 | library(broom.mixed) 70 | library(tidybayes) 71 | 72 | # a function for post-processing 
predicted means and standard deviations into expected number of deaths 73 | pred2expectation <- function(mu, sigma) { 74 | return({{back_transform}}) 75 | } 76 | 77 | # read and process data 78 | df <- read_csv('../data.csv', 79 | col_types = cols( 80 | Year = col_integer(), 81 | Category = col_integer(), 82 | Gender_MF = col_integer(), 83 | alldeaths = col_integer() 84 | )) %>% 85 | # rename some variables 86 | dplyr::select( 87 | year = Year, 88 | name = Name, 89 | dam = NDAM, 90 | death = alldeaths, 91 | female = Gender_MF, 92 | masfem = MasFem, 93 | category = Category, 94 | pressure = Minpressure_Updated_2014, 95 | wind = HighestWindSpeed 96 | ) %>% 97 | # create new variables 98 | mutate( 99 | log_death = log(death + 1), 100 | log_dam = log(dam), 101 | post = ifelse(year>1979, 1, 0), 102 | zdam = scale(dam), 103 | zcat = as.numeric(scale(category)), 104 | zmin = -scale(pressure), 105 | zwin = as.numeric(scale(wind)), 106 | z3 = as.numeric((zmin + zcat + zwin) / 3) 107 | ) %>% 108 | # remove outliers 109 | filter(!(name %in% {{outliers}})) %>% 110 | filter(!(name %in% {{leverage_points}})) %>% 111 | # operationalize feminity 112 | mutate( 113 | feminity = {{feminity}}, 114 | damage = {{damage}} 115 | ) 116 | 117 | # --- (Model) ols_regression 118 | # OLS regression with log(deaths+1) as the dependent variable 119 | model <- lm(log_death ~ {{predictors}} {{covariates}}, data = df) 120 | 121 | # --- (Model) negative_binomial 122 | # Negative binomial with deaths as the dependent variable 123 | model <- glm.nb(death ~ {{predictors}} {{covariates}}, data = df) 124 | 125 | # --- (O) 126 | # create a data frame where covariates are at their means 127 | dmeans <- df %>% 128 | summarise_at(vars({{predictor_list}} {{covariate_list}}), mean) %>% 129 | group_by({{predictor_list}} {{covariate_list}}) %>% 130 | data_grid(feminity = {{feminity_prediction_levels}})%>% 131 | ungroup() 132 | 133 | # predict 134 | pred <- predict(model, dmeans, se.fit = TRUE, type = "response") 135 | expectation <- dmeans %>% 136 | mutate( 137 | fit = pred$fit, 138 | sigma = sigma(model), 139 | expected_deaths = pred2expectation(fit, sigma) 140 | )%>% 141 | compare_levels(expected_deaths, by = feminity) %>% 142 | ungroup() %>% 143 | dplyr::select(expected_diff = expected_deaths) 144 | 145 | # get predictive check for original dataset from model 146 | pred <- predict(model, df, type = "response") 147 | disagg_fit <- df %>% 148 | mutate( 149 | fit = pred, # get fitted predictions 150 | sigma = sigma(model), # get residual standard deviation 151 | pred_deaths = pred2expectation(fit, sigma) # transform to deaths 152 | ) %>% 153 | dplyr::select( 154 | observed = death, 155 | expected = pred_deaths 156 | ) 157 | 158 | # output 159 | write_csv(expectation, '../results/estimate_{{_n}}.csv') 160 | write_csv(disagg_fit, '../results/disagg_fit_{{_n}}.csv') 161 | -------------------------------------------------------------------------------- /boba/codeparser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from dataclasses import dataclass, field 4 | from typing import List 5 | import json 6 | 7 | from .blocksyntaxparser import BlockSyntaxParser, ParseError 8 | 9 | 10 | @dataclass 11 | class Block: 12 | """ 13 | A class for code blocks. 14 | 15 | id: unique identifier. For decision block, id is parameter:option. 16 | parameter: parameter name, if the block is a decision. 17 | option: option name, if the block is a decision. 
18 | chunks: code broken up at the boundaries of placeholder variables. 19 | """ 20 | 21 | id: str = '' 22 | parameter: str = '' 23 | option: str = '' 24 | chunks: List = field(default_factory=lambda: []) 25 | 26 | 27 | @dataclass 28 | class Chunk: 29 | """A class for code chunks. 30 | A code chunk contains at most one placeholder variable. 31 | 32 | variable: the corresponding placeholder variable, if any. 33 | code: the code template preceding the variable or the block boundary. 34 | """ 35 | variable: str = '' 36 | code: str = '' 37 | 38 | 39 | class CodeParser: 40 | def __init__(self): 41 | self.blocks = {} 42 | self.order = [] 43 | 44 | self.raw_spec = '' 45 | self.spec = {} 46 | 47 | self.inline_constraints = [] 48 | self.inline_vars = [] 49 | self.used_vars = set() 50 | 51 | @staticmethod 52 | def _get_block_name(block): 53 | """Get the ID of the block, ignoring options.""" 54 | return block.id if block.parameter == '' else block.parameter 55 | 56 | def _add_block(self, block): 57 | """Add a block to our data structure.""" 58 | # handle config block 59 | if block.id == 'BOBA_CONFIG': 60 | self.raw_spec += block.chunks[0].code 61 | return 62 | if block.id == 'END': 63 | block.id = '' 64 | if len(self.order): 65 | self.blocks[self.order[-1]].chunks += block.chunks 66 | return 67 | 68 | # ignore empty block 69 | if block.id == '' and block.chunks[0].code == '': 70 | return 71 | 72 | # handle unnamed block 73 | if block.id == '': 74 | block.id = '_start' if len(self.blocks) == 0 else '_end' 75 | 76 | # check if id exists 77 | if block.id in self.blocks: 78 | err = 'Duplicated code block ID "{}"'.format(block.id) 79 | raise ParseError(err) 80 | 81 | # add to data structure 82 | self.blocks[block.id] = block 83 | bn = CodeParser._get_block_name(block) 84 | if bn not in self.order: 85 | self.order.append(bn) 86 | 87 | def get_block_names(self): 88 | """ 89 | Get the ID of all blocks, ignoring options 90 | :return: a set of unique names 91 | """ 92 | blocks = set() 93 | for b in self.blocks: 94 | bl = self.blocks[b] 95 | blocks.add(CodeParser._get_block_name(bl)) 96 | return blocks 97 | 98 | def get_decisions(self): 99 | """ 100 | Get a dict of all block-level decisions, where the key is the parameter 101 | and the value is a list of block ids (namely, parameter:option). 
102 | :return: 103 | """ 104 | decs = {} 105 | for b in self.blocks: 106 | bl = self.blocks[b] 107 | if bl.parameter: 108 | p = bl.parameter 109 | if p in decs: 110 | decs[p].append(bl.id) 111 | else: 112 | decs[p] = [bl.id] 113 | return decs 114 | 115 | def parse(self, dec_parser, f): 116 | """ Make a pass over the template, parsing block declarations and 117 | placeholder variables inside the code.""" 118 | code = '' 119 | bl = Block() 120 | 121 | for line in f: 122 | if BlockSyntaxParser.can_parse(line): 123 | # end of the previous block 124 | bl.chunks.append(Chunk('', code)) 125 | code = '' 126 | self._add_block(bl) 127 | 128 | # parse the metadata and create a new block 129 | bp_id, par, opt, cond = BlockSyntaxParser(line).parse() 130 | bl = Block(bp_id, par, opt, []) 131 | 132 | # store inline constraints, if any 133 | if cond: 134 | self.inline_constraints.append(cond) 135 | else: 136 | # match decision variables 137 | try: 138 | vs, codes = dec_parser.parse_code(line) 139 | if len(vs): 140 | # store inline variables 141 | self.used_vars.update(vs) 142 | 143 | # chop into more chunks 144 | # combine first chunk with previous code 145 | bl.chunks.append(Chunk(vs[0], code + codes[0])) 146 | for i in range(1, len(vs)): 147 | bl.chunks.append(Chunk(vs[i], codes[i])) 148 | 149 | # remaining code after the last matched variable 150 | code = codes[-1] 151 | else: 152 | code += line 153 | except ParseError as e: 154 | msg = 'At line "{}"\n\t{}'.format(line, e.args[0]) 155 | raise ParseError(msg) 156 | 157 | # add the last block 158 | bl.chunks.append(Chunk('', code)) 159 | self._add_block(bl) 160 | 161 | # parse the spec 162 | try: 163 | self.spec = json.loads(self.raw_spec) if self.raw_spec else {} 164 | except ValueError as e: 165 | msg = self.raw_spec + '\n' + e.args[0] 166 | msg += '\nBoba config is not valid JSON' 167 | raise ParseError(msg) 168 | -------------------------------------------------------------------------------- /example/hurricane/repro.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # --- (BOBA_CONFIG) 3 | { 4 | "decisions": [ 5 | {"var": "outliers", "options": [ 6 | "c()", 7 | "c('Katrina')", 8 | "c('Katrina', 'Audrey')" 9 | ]}, 10 | {"var": "leverage_points", "options": [ 11 | "c()", 12 | "c('Sandy')", 13 | "c('Sandy', 'Andrew')", 14 | "c('Sandy', 'Andrew', 'Donna')" 15 | ]}, 16 | {"var": "feminity", "options": ["female", "masfem"]}, 17 | {"var": "damage", "options": ["dam", "log_dam"]}, 18 | {"var": "predictors", "options": [ 19 | "feminity * damage", 20 | "feminity + damage + pressure + feminity:damage + feminity:pressure", 21 | "feminity + damage + zwin + feminity:damage + feminity:zwin", 22 | "feminity + damage + zcat + feminity:damage + feminity:zcat", 23 | "feminity + damage + z3 + feminity:damage + feminity:z3", 24 | "feminity + damage + z3" 25 | ]}, 26 | {"var": "covariates", "options": [ 27 | "", 28 | "+ year:damage", 29 | "+ post:damage" 30 | ]}, 31 | {"var": "back_transform", "options": [ 32 | "exp(mu + sigma^2/2) - 1", 33 | "mu" 34 | ]} 35 | ], 36 | "constraints": [ 37 | {"link": ["Model", "back_transform"]} 38 | ], 39 | "before_execute": "cp ../data.csv ./ && rm -rf results && mkdir results", 40 | "after_execute": "cp ../stacking_weights.R ./", 41 | "visualizer": "visualizer_config.json" 42 | } 43 | # --- (END) 44 | 45 | suppressPackageStartupMessages(library(readr)) 46 | suppressPackageStartupMessages(library(MASS)) 47 | suppressPackageStartupMessages(library(modelr)) 48 | 
suppressPackageStartupMessages(library(tidyverse)) 49 | suppressPackageStartupMessages(library(broom.mixed)) 50 | suppressPackageStartupMessages(library(tidybayes)) 51 | source('../../boba_util.R') #fixme 52 | 53 | # a function for post-processing predicted means and standard deviations into expected number of deaths 54 | pred2expectation <- function(mu, sigma) { 55 | return({{back_transform}}) 56 | } 57 | 58 | # get expectation per data point 59 | compute_exp <- function (model, df) { 60 | disagg_fit <- pointwise_predict(model, df) %>% 61 | mutate(expected = pred2expectation(fit, sigma)) 62 | return(disagg_fit) 63 | } 64 | 65 | # read and process data 66 | full <- read_csv('../data.csv', 67 | col_types = cols( 68 | Year = col_integer(), 69 | Category = col_integer(), 70 | Gender_MF = col_integer(), 71 | alldeaths = col_integer() 72 | )) %>% 73 | # rename some variables 74 | dplyr::select( 75 | year = Year, 76 | name = Name, 77 | dam = NDAM, 78 | death = alldeaths, 79 | female = Gender_MF, 80 | masfem = MasFem, 81 | category = Category, 82 | pressure = Minpressure_Updated_2014, 83 | wind = HighestWindSpeed 84 | ) %>% 85 | # create new variables 86 | mutate( 87 | id = row_number(), 88 | log_death = log(death + 1), 89 | log_dam = log(dam), 90 | post = ifelse(year>1979, 1, 0), 91 | zdam = scale(dam), 92 | zcat = as.numeric(scale(category)), 93 | zmin = -scale(pressure), 94 | zwin = as.numeric(scale(wind)), 95 | z3 = as.numeric((zmin + zcat + zwin) / 3) 96 | ) %>% 97 | # operationalize feminity 98 | mutate( 99 | feminity = {{feminity}}, 100 | damage = {{damage}} 101 | ) 102 | 103 | df <- full %>% 104 | # remove outliers 105 | filter(!(name %in% {{outliers}})) %>% 106 | filter(!(name %in% {{leverage_points}})) 107 | 108 | # --- (Model) ols_regression 109 | # OLS regression with log(deaths+1) as the dependent variable 110 | model <- lm(log_death ~ {{predictors}} {{covariates}}, data = df) 111 | 112 | # --- (Model) negative_binomial 113 | # Negative binomial with deaths as the dependent variable 114 | model <- glm.nb(death ~ {{predictors}} {{covariates}}, data = df) 115 | 116 | # --- (O) 117 | # cross validation 118 | fit <- cross_validation(df, model, "death", 119 | func = function (m, d) compute_exp(m, d)$expected) 120 | nrmse = fit / (max(df$death) - min(df$death)) 121 | 122 | # stacking 123 | loglik <- df %>% 124 | add_column(loglik = stacking(df, model)) %>% 125 | dplyr::select(id, loglik) %>% 126 | right_join(full, by='id') 127 | # add missing log likelihood 128 | if (nrow(loglik) != nrow(df)) { 129 | idx <- filter(loglik, is.na(loglik)) 130 | loglik$loglik[idx$id] <- compute_loglik(model, idx) 131 | } 132 | loglik <- dplyr::select(loglik, loglik) 133 | 134 | # permutation test 135 | null.dist <- permutation_test(df, model, c("female", "masfem", "feminity"), N = 100, 136 | func = function (m, d) margins(compute_exp(m, d), "female", "expected")$expected) %>% 137 | dplyr::select(expected_diff = value) 138 | 139 | # get prediction 140 | disagg_fit <- compute_exp(model, df) 141 | 142 | # aggregate fitted effect of female storm name 143 | expectation <- margins(disagg_fit, "female", "expected") %>% 144 | dplyr::select(expected_diff = expected) %>% 145 | add_column(NRMSE = nrmse) # add cross validation metric 146 | 147 | # propagate uncertainty in fit to model predictions 148 | uncertainty <- disagg_fit %>% 149 | mutate( 150 | .draw = list(1:200), # generate list of draw numbers 151 | t = map(df, ~rt(200, .)), # simulate draws from t distribution to transform into means 152 | x = map(df, 
~rchisq(200, .)) # simulate draws from chi-squared distribution to transform into sigmas 153 | ) %>% 154 | unnest(cols = c(".draw", "t", "x")) %>% 155 | mutate( 156 | mu = t * se.fit + fit, # scale and shift t to get a sampling distribution of means 157 | sigma = sqrt(df * se.residual^2 / x), # scale and take inverse of x to get a sampling distribution of sigmas 158 | expected_deaths = pred2expectation(mu, sigma) 159 | ) %>% 160 | group_by(.draw, female) %>% # group by predictor(s) of interest 161 | summarize(expected_deaths = mean(expected_deaths)) %>% # marginalize across other predictors 162 | compare_levels(expected_deaths, by = female) %>% 163 | ungroup() %>% 164 | dplyr::select(expected_diff = expected_deaths) 165 | 166 | # only output relevant fields in disagg_fit 167 | disagg_fit <- disagg_fit %>% 168 | dplyr::select( 169 | observed = death, 170 | expected = expected 171 | ) 172 | 173 | # output 174 | write_csv(expectation, '../results/estimate_{{_n}}.csv') 175 | write_csv(disagg_fit, '../results/disagg_fit_{{_n}}.csv') 176 | write_csv(uncertainty, '../results/uncertainty_{{_n}}.csv') 177 | write_csv(null.dist, '../results/null_{{_n}}.csv') 178 | write_csv(loglik, '../results/loglik_{{_n}}.csv') 179 | -------------------------------------------------------------------------------- /boba/wrangler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import shutil 5 | import csv 6 | import json 7 | from dataclasses import dataclass 8 | from .baseparser import ParseError 9 | 10 | 11 | @dataclass 12 | class Output: 13 | name: str 14 | value: str 15 | 16 | DIR_SCRIPT = 'code/' 17 | DIR_LOG = 'boba_logs/' 18 | LOG_EXT = '.txt' 19 | 20 | def get_universe_name(universe_id): 21 | """ Get the name of a universe """ 22 | return 'universe_' + str(universe_id) 23 | 24 | 25 | def get_universe_script(universe_id, lang_extension): 26 | """ Get the file name of a universe script """ 27 | return get_universe_name(universe_id) + lang_extension 28 | 29 | 30 | def get_universe_id_from_script(universe_script): 31 | """ Get the id of a universe given the universe script """ 32 | return int(universe_script.split('.')[0].split('_')[1]) 33 | 34 | 35 | def get_universe_log(universe_id): 36 | """ Get the file name of a universe log """ 37 | return 'log_' + str(universe_id) + LOG_EXT 38 | 39 | 40 | def get_universe_error_log(universe_id): 41 | """ Get the file name of a universe error log """ 42 | return 'error_' + str(universe_id) + LOG_EXT 43 | 44 | 45 | class Wrangler: 46 | """Handles outputs.""" 47 | def __init__(self, spec, lang, out): 48 | self.spec = spec 49 | self.lang = lang 50 | self.out = out 51 | self.fn = os.path.abspath(os.path.join(out, 'summary.csv')) 52 | 53 | self.outputs = {} 54 | self.col = 0 # output column number, will be set by parser 55 | self.counter = 0 56 | 57 | self.pre_exe = '' 58 | self.post_exe = '' 59 | 60 | self._read_spec() 61 | 62 | @staticmethod 63 | def _read_json_safe(obj, field): 64 | if field not in obj: 65 | raise ParseError('Cannot find "{}" in json'.format(field)) 66 | return obj[field] 67 | 68 | @staticmethod 69 | def _read_optional(obj, field, df): 70 | return obj[field] if field in obj else df 71 | 72 | def _read_spec(self): 73 | """Read misc fields from the JSON spec.""" 74 | sp = self._read_optional(self.spec, 'outputs', []) 75 | for d in sp: 76 | name = str(self._read_json_safe(d, 'name')) 77 | value = str(self._read_json_safe(d, 'value')) 78 | o = Output(name, value) 79 | self.outputs[name] = o 80 | 81 | self.pre_exe = self._read_optional(self.spec, 'before_execute', '') 82 | self.post_exe = self._read_optional(self.spec, 'after_execute', '') 83 | 
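    # A note on the spec shape _read_spec consumes (a sketch; the field values
    # here are illustrative, not taken from any shipped template):
    #   {
    #     "outputs": [{"name": "p_value", "value": "result.pvalue"}],
    #     "before_execute": "cp ../data.csv ./",
    #     "after_execute": "boba merge output_{}.csv"
    #   }
    # Each "outputs" entry becomes an Output(name, value) record, and the two
    # *_execute strings are written verbatim to pre_exe.sh / post_exe.sh by
    # write_pre_exe / write_post_exe below.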
84 | def _codegen_r(self): 85 | """Generate output code for R scripts.""" 86 | if len(self.outputs) == 0: 87 | return '' 88 | 89 | # read csv 90 | code = '\n\n# wrangle outputs\n' \ 91 | 'df <- read.csv2("{}", sep = ",", stringsAsFactors = FALSE, ' \ 92 | 'check.names=FALSE)'\ 93 | .format(self.fn) 94 | 95 | # record outputs 96 | ns = self.get_outputs() 97 | col = self.col + 1 98 | row = self.counter 99 | for n in ns: 100 | code += '\ndf[{}, {}] = {}'.format(row, col, self.outputs[n].value) 101 | col += 1 102 | 103 | # write csv 104 | code += '\nwrite.csv(df, file="{}", row.names=FALSE)'.format(self.fn) 105 | code += '\n' 106 | 107 | return code 108 | 109 | def _codegen_python(self): 110 | if len(self.outputs) == 0: 111 | return '' 112 | 113 | return ''  # TODO: output wrangling for Python scripts is not implemented yet; return an empty string so _gen_code always yields a str 114 | 115 | def _gen_code(self): 116 | """Generate output code to be appended to the end of the script.""" 117 | if self.lang.is_r(): 118 | return self._codegen_r() 119 | if self.lang.is_python(): 120 | return self._codegen_python() 121 | return '' 122 | 123 | def write_pre_exe(self): 124 | fn_pre_exec = os.path.join(self.out, 'pre_exe.sh') 125 | with open(fn_pre_exec, 'w') as f: 126 | f.write(self.pre_exe) 127 | 128 | def write_post_exe(self): 129 | fn_post_exec = os.path.join(self.out, 'post_exe.sh') 130 | with open(fn_post_exec, 'w') as f: 131 | f.write(self.post_exe) 132 | 133 | def write_lang(self): 134 | lang = os.path.join(self.out, 'lang.json') 135 | with open(lang, 'w') as f: 136 | json.dump(self.lang.supported_langs, f) 137 | 138 | 139 | def write_universe(self, code): 140 | """Write the generated code to a universe file.""" 141 | 142 | self.counter += 1 143 | fn = get_universe_script(self.counter, self.lang.get_ext()) 144 | 145 | # replace the reserved keyword _n 146 | code = code.replace('{{_n}}', str(self.counter)) 147 | 148 | # append output code 149 | code += self._gen_code() 150 | 151 | # write file 152 | with open(os.path.join(self.out, DIR_SCRIPT, fn), 'w') as f: 153 | f.write(code) 154 | f.flush() 155 | 156 | return fn 157 | 158 | def write_summary(self, rows): 159 | """Write the summary CSV file""" 160 | with open(self.fn, 'w', newline='') as f: 161 | wrt = csv.writer(f) 162 | for row in rows: 163 | wrt.writerow(row) 164 | 165 | def write_overview_json(self, res): 166 | """ Write the overview.json file""" 167 | # append visualizer block 168 | default_config = { 169 | "files": [{"id": "est", "path": "estimates.csv"}], 170 | "schema": {"point_estimate": {"file": "est", "field": "estimate"}} 171 | } 172 | vis = Wrangler._read_optional(self.spec, 'visualizer', None) 173 | 174 | # if it is a string, read config file 175 | if isinstance(vis, str): 176 | try: 177 | with open(vis) as f: 178 | vis = json.load(f) 179 | except (IOError, json.JSONDecodeError) as e: 180 | print(e) 181 | print('Cannot read the visualizer config, using the default') 182 | vis = default_config 183 | # if user does not specify the config, use the default 184 | vis = default_config if vis is None else vis 185 | res['visualizer'] = vis 186 | 187 | with open(os.path.join(self.out, 'overview.json'), 'w') as f: 188 | obj = json.dumps(res, indent=2, sort_keys=True) 189 | f.write(obj) 190 | 191 | def create_dir(self): 192 | """Create output directories.""" 193 | if os.path.exists(self.out): 194 | shutil.rmtree(self.out) 195 | os.makedirs(self.out) 196 | os.makedirs(os.path.join(self.out, 
DIR_SCRIPT)) 197 | 198 | def get_outputs(self): 199 | """Get a sorted list of output names.""" 200 | return sorted(list(self.outputs.keys())) 201 | -------------------------------------------------------------------------------- /example/hurricane/data_wrangling/data_jung.csv: -------------------------------------------------------------------------------- 1 | Year,Name,MasFem,MinPressure_before,Minpressure_Updated_2014,Gender_MF,Category,alldeaths,NDAM,Elapsed Yrs,Source,ZMasFem,ZMinPressure_A,ZNDAM 2 | 1950,Easy,6.77778,958,960,1,3,2,1590,63,MWR,-0.00094,-0.35636,-0.43913 3 | 1950,King,1.38889,955,955,0,3,4,5350,63,MWR,-1.67076,-0.51125,-0.14843 4 | 1952,Able,3.83333,985,985,0,1,3,150,61,MWR,-0.91331,1.03765,-0.55047 5 | 1953,Barbara,9.83333,987,987,1,1,1,58,60,MWR,0.94587,1.14091,-0.55758 6 | 1953,Florence,8.33333,985,985,1,1,0,15,60,MWR,0.48108,1.03765,-0.56090 7 | 1954,Carol,8.11111,960,960,1,3,60,19321,59,MWR,0.41222,-0.25310,0.93174 8 | 1954,Edna,8.55556,954,954,1,3,20,3230,59,MWR,0.54993,-0.56288,-0.31234 9 | 1954,Hazel,9.44444,938,938,1,4,20,24260,59,MWR,0.82537,-1.38896,1.31360 10 | 1955,Connie,8.50000,962,962,1,3,0,2030,58,MWR,0.53272,-0.14984,-0.40511 11 | 1955,Diane,9.88889,987,987,1,1,200,14730,58,MWR,0.96309,1.14091,0.57679 12 | 1955,Ione,5.94444,960,960,0,3,7,6200,58,MWR,-0.25916,-0.25310,-0.08271 13 | 1956,Flossy,7.00000,975,975,1,2,15,1540,57,MWR,0.06792,0.52135,-0.44300 14 | 1958,Helene,9.88889,946,946,1,3,1,540,55,MWR,0.96309,-0.97592,-0.52031 15 | 1959,Debra,9.88889,984,984,1,1,0,430,54,MWR,0.96309,0.98602,-0.52882 16 | 1959,Gracie,9.77778,950,950,1,3,22,510,54,MWR,0.92866,-0.76940,-0.52263 17 | 1960,Donna,9.27778,930,930,1,4,50,53270,53,http://www.nhc.noaa.gov/pdf/NWS-TPC-5.pdf,0.77372,-1.80199,3.55651 18 | 1960,Ethel,8.72222,981,981,1,1,0,35,53,MWR,0.60158,0.83113,-0.55936 19 | 1961,Carla,9.50000,931,931,1,4,46,15850,52,MWR,0.84258,-1.75036,0.66338 20 | 1963,Cindy,9.94444,996,996,1,1,3,300,50,MWR,0.98030,1.60558,-0.53887 21 | 1964,Cleo,7.94444,968,968,1,2,3,6450,49,MWR,0.36057,0.15994,-0.06338 22 | 1964,Dora,9.33333,966,966,1,2,5,16260,49,MWR,0.79094,0.05668,0.69508 23 | 1964,Hilda,8.83333,950,950,1,3,37,2770,49,MWR,0.63601,-0.76940,-0.34790 24 | 1964,Isbell,9.44444,974,974,1,2,3,800,49,MWR,0.82537,0.46972,-0.50021 25 | 1965,Betsy,8.33333,948,948,1,3,75,20000,48,MWR,0.48108,-0.87266,0.98424 26 | 1966,Alma,8.77778,982,982,1,2,6,730,47,MWR,0.61879,0.88276,-0.50562 27 | 1966,Inez,8.27778,983,983,1,1,3,99,47,MWR,0.46386,0.93439,-0.55441 28 | 1967,Beulah,7.27778,950,950,1,3,15,5060,46,MWR,0.15400,-0.76940,-0.17085 29 | 1968,Gladys,8.94444,977,977,1,2,3,800,45,MWR,0.67044,0.62461,-0.50021 30 | 1969,Camille,9.05556,909,909,1,5,256,23040,44,MWR,0.70487,-2.88622,1.21928 31 | 1970,Celia,9.44444,945,945,1,3,22,6870,43,WIKI (http://en.wikipedia.org/wiki/Hurricane_Celia),0.82537,-1.02755,-0.03091 32 | 1971,Edith,8.50000,978,978,1,2,0,300,42,MWR,0.53272,0.67624,-0.53887 33 | 1971,Fern,7.38889,979,979,1,1,2,500,42,MWR,0.18843,0.72787,-0.52341 34 | 1971,Ginger,10.00000,995,995,1,1,0,200,42,MWR,0.99752,1.55395,-0.54660 35 | 1972,Agnes,8.66667,980,980,1,1,117,20430,41,MWR,0.58436,0.77950,1.01748 36 | 1974,Carmen,8.72222,952,952,1,3,1,1180,39,MWR,0.60158,-0.66614,-0.47083 37 | 1975,Eloise,8.94444,955,955,1,3,21,6190,38,MWR,0.67044,-0.51125,-0.08348 38 | 1976,Belle,10.44445,980,980,1,1,5,570,37,MWR,1.13523,0.77950,-0.51799 39 | 1977,Babe,6.88889,995,995,1,1,0,66,36,MWR,0.03349,1.55395,-0.55696 40 | 1979,Bob,1.66667,986,986,0,1,1,70,34,MWR,-1.58468,1.08928,-0.55665 41 | 
1979,David,1.72222,970,970,0,2,15,2700,34,MWR,-1.56747,0.26320,-0.35331 42 | 1979,Frederic,2.50000,946,946,0,3,5,12770,34,MWR,-1.32647,-0.97592,0.42525 43 | 1980,Allen,2.66667,945,945,0,3,2,2130,33,MWR,-1.27482,-1.02755,-0.39738 44 | 1983,Alicia,9.83333,962,962,1,3,21,10400,30,MWR,0.94587,-0.14984,0.24201 45 | 1984,Diana,9.94444,949,949,1,2,3,410,29,MWR,0.98030,-0.82103,-0.53036 46 | 1985,Bob,1.66667,1002,1003,0,1,0,130,28,MWR,-1.58468,1.91536,-0.55201 47 | 1985,Danny,2.22222,987,987,0,1,1,160,28,MWR,-1.41254,1.14091,-0.54969 48 | 1985,Elena,9.72222,959,959,1,3,4,4180,28,MWR,0.91144,-0.30473,-0.23889 49 | 1985,Gloria,9.50000,942,942,1,3,8,3020,28,MWR,0.84258,-1.18244,-0.32857 50 | 1985,Juan,1.94444,971,971,0,1,12,4730,28,MWR,-1.49861,0.31483,-0.19636 51 | 1985,Kate,9.66667,967,967,1,2,5,1310,28,MWR,0.89423,0.10831,-0.46078 52 | 1986,Bonnie,9.38889,990,990,1,1,3,6,27,MWR,0.80815,1.29580,-0.56160 53 | 1986,Charley,2.88889,990,990,0,1,5,58,27,MWR,-1.20596,1.29580,-0.55758 54 | 1987,Floyd,1.83333,993,993,0,1,0,1,26,MWR,-1.53304,1.45069,-0.56199 55 | 1988,Florence,8.33333,984,984,1,1,1,4,25,MWR,0.48108,0.98602,-0.56175 56 | 1989,Chantal,9.05556,986,986,1,1,13,290,24,MWR,0.70487,1.08928,-0.53964 57 | 1989,Hugo,2.88889,934,934,0,4,21,20020,24,MWR,-1.20596,-1.59547,0.98578 58 | 1989,Jerry,2.33333,983,983,0,1,3,230,24,MWR,-1.37811,0.93439,-0.54428 59 | 1991,Bob,1.66667,962,962,0,2,15,3620,22,MWR,-1.58468,-0.14984,-0.28218 60 | 1992,Andrew,2.22222,922,922,0,5,62,66730,21,MWR,-1.41254,-2.21503,4.59717 61 | 1993,Emily,9.83333,960,961,1,3,3,96,20,MWR,0.94587,-0.25310,-0.55464 62 | 1995,Erin,7.22222,973,973,1,2,6,1650,18,MWR,0.13678,0.41809,-0.43449 63 | 1995,Opal,8.50000,942,942,1,3,9,7550,18,MWR,0.53272,-1.18244,0.02167 64 | 1996,Bertha,8.50000,974,974,1,2,8,700,17,MWR,0.53272,0.46972,-0.50794 65 | 1996,Fran,7.16667,954,954,1,3,26,8260,17,MWR,0.11957,-0.56288,0.07656 66 | 1997,Danny,2.22222,984,984,0,1,10,200,16,MWR,-1.41254,0.98602,-0.54660 67 | 1998,Bonnie,9.38889,964,964,1,2,3,1650,15,MWR,0.80815,-0.04658,-0.43449 68 | 1998,Earl,1.88889,987,987,0,1,3,160,15,MWR,-1.51583,1.14091,-0.54969 69 | 1998,Georges,2.27778,964,964,0,2,1,3870,15,MWR,-1.39532,-0.04658,-0.26285 70 | 1999,Bret,2.33333,951,951,0,3,0,94,14,MWR,-1.37811,-0.71777,-0.55480 71 | 1999,Floyd,1.83333,956,956,0,2,56,8130,14,MWR,-1.53304,-0.45962,0.06651 72 | 1999,Irene,9.27778,987,964,1,1,8,1430,14,MWR,0.77372,1.14091,-0.45150 73 | 2002,Lili,10.33333,963,963,1,1,2,1260,11,MWR,1.10080,-0.09821,-0.46465 74 | 2003,Claudette,9.16667,979,979,1,1,3,250,10,MWR,0.73930,0.72787,-0.54274 75 | 2003,Isabel,9.38889,957,957,1,2,51,4980,10,MWR,0.80815,-0.40799,-0.17703 76 | 2004,Alex,4.16667,972,972,0,1,1,5,9,MWR,-0.81003,0.36646,-0.56168 77 | 2004,Charley,2.88889,941,941,0,4,10,20510,9,MWR,-1.20596,-1.23407,1.02367 78 | 2004,Frances,6.00000,960,960,1,2,7,12620,9,MWR,-0.24194,-0.25310,0.41365 79 | 2004,Gaston,2.66667,985,985,0,1,8,170,9,MWR,-1.27482,1.03765,-0.54892 80 | 2004,Ivan,1.05556,946,946,0,3,25,18590,9,MWR,-1.77405,-0.97592,0.87522 81 | 2004,Jeanne,8.50000,950,950,1,3,5,10210,9,MWR,0.53272,-0.76940,0.22732 82 | 2005,Cindy,9.94444,991,991,1,1,1,350,8,MWR,0.98030,1.34743,-0.53500 83 | 2005,Dennis,2.44444,946,946,0,3,15,2650,8,MWR,-1.34368,-0.97592,-0.35718 84 | 2005,Ophelia,9.16667,982,982,1,1,1,91,8,MWR,0.73930,0.88276,-0.55503 85 | 2005,Rita,9.50000,937,937,1,3,62,10690,8,MWR,0.84258,-1.44059,0.26443 86 | 2005,Wilma,8.61111,950,950,1,3,5,25960,8,MWR,0.56715,-0.76940,1.44504 87 | 
2007,Humberto,2.38889,985,985,0,1,1,51,6,MWR,-1.36089,1.03765,-0.55812 88 | 2008,Dolly,9.83333,963,967,1,1,1,1110,5,MWR,0.94587,-0.09821,-0.47624 89 | 2008,Gustav,1.72222,951,954,0,2,52,4360,5,MWR,-1.56747,-0.71777,-0.22497 90 | 2008,Ike,1.88889,935,950,0,2,84,20370,5,MWR,-1.51583,-1.54384,1.01284 91 | 2011,Irene,9.27778,952,952,1,1,41,7110,2,MWR,0.77372,-0.66614,-0.01235 92 | 2012,Isaac,1.94444,965,966,0,1,5,24000,1,MWR,-1.49861,0.00505,1.29350 93 | 2012,Sandy,9.00000,945,942,1,2,159,75000,1,MWR,0.68765,-1.02755,5.23657 -------------------------------------------------------------------------------- /example/hurricane/boba_util.R: -------------------------------------------------------------------------------- 1 | # check if we support the model type 2 | # @param model The fitted model object 3 | is_supported <- function (model) { 4 | ms <- c('lm', 'lmerMod', 'negbin', 'aov') 5 | return(class(model)[1] %in% ms) 6 | } 7 | 8 | # get model predictions per data point 9 | # @param model The fitted model object 10 | # @param df The dataframe that the model will predict on 11 | pointwise_predict <- function (model, df) { 12 | if (!is_supported(model)) { 13 | stop(paste('Unsupported model type', class(model)[1])) 14 | } 15 | 16 | # fixme: lmerMod does not have se.fit 17 | pred <- predict(model, df, se.fit = TRUE, type = "response") 18 | disagg_fit <- df %>% 19 | mutate( 20 | fit = pred$fit, # inferential fits 21 | se.fit = pred$se.fit, # standard errors of predicted means 22 | df = df.residual(model), # residual degrees of freedom 23 | sigma = sigma(model), # residual standard deviation 24 | se.residual = sqrt(sum(residuals(model)^2) / df) # residual standard errors 25 | ) 26 | return(disagg_fit) 27 | } 28 | 29 | # split the train/test set in a k-fold cross validation 30 | # returns a dataframe with k rows (k is the num of folds) and two columns 31 | # - train: a list of training indices for the k-th fold 32 | # - test: a list of testing indices for the k-th fold 33 | # @param n The total number of rows 34 | cv_split <- function (n, folds = 5) { 35 | l = n %/% folds 36 | rest = n - folds * l 37 | 38 | lengths <- ifelse(1:folds <= rest, l + 1, l) 39 | f_sum <- function(x, n) sum(head(x,n)) 40 | indices <- lapply(1:folds, function (i) { 41 | i1 = f_sum(lengths, i - 1) + 1 42 | i2 = i1 + lengths[i] - 1 43 | 44 | if (i1 > 1) { 45 | if (i2+1 < n) { 46 | i_train = c(1:(i1-1), (i2+1):n) 47 | } else { 48 | i_train = 1:(i1-1) 49 | } 50 | } else { 51 | i_train = (i2+1):n 52 | } 53 | i_test = c(i1:i2) 54 | return(list(i_train, i_test)) 55 | }) 56 | 57 | indices <- as.data.frame(do.call(rbind, indices)) 58 | colnames(indices) <- c("train", "test") 59 | return(indices) 60 | } 61 | 62 | # perform k-fold cross validation 63 | # @param df The dataframe 64 | # @param model The fitted model 65 | # @param y The column name for the observed variable in df 66 | # @param folds The number of folds 67 | # @param func A function returning the fitted y vector from a model and a dataset 68 | cross_validation <- function (df, model, y, folds = 5, func = NULL) { 69 | mse = 0 70 | indices = cv_split(nrow(df), folds = folds) 71 | for (i in c(1:nrow(indices))) { 72 | d_train = df[indices$train[[i]], ] 73 | d_test = df[indices$test[[i]], ] 74 | 75 | m1 <- update(model, . 
~ ., data = d_train) 76 | if (!is.null(func)) { 77 | expected <- func(m1, d_test) 78 | } else { 79 | # fixme: lmerMod needs to set allow.new.levels = TRUE 80 | expected <- pointwise_predict(m1, d_test)$fit 81 | } 82 | 83 | mse = mse + sum((d_test[[y]] - expected)^2) 84 | } 85 | 86 | mse = sqrt(mse / nrow(df)) 87 | return(mse) 88 | } 89 | 90 | # marginalize model predictions 91 | # @param df The dataframe containing individual model fits 92 | # @param term The predictor of interest 93 | # @param y The value field to aggregate 94 | margins <- function (df, term, y = "fit") { 95 | expectation <- df %>% 96 | group_by(!! sym(term)) %>% # group by predictor(s) of interest 97 | summarize(expected = weighted.mean(!! sym(y))) %>% # marginalize across other predictors 98 | compare_levels(expected, by = !! sym(term)) %>% 99 | ungroup() 100 | return(expectation) 101 | } 102 | 103 | # get the sampling distribution 104 | # @param model The fitted model 105 | # @param term The predictor of interest 106 | # @param type Type of result (response or model coefficient) 107 | # @param draws The number of draws 108 | sampling_distribution <- function (model, term, type="coef", draws=200) { 109 | if (!is_supported(model)) { 110 | stop(paste('Unsupported model type', class(model)[1])) 111 | } 112 | ts = c('coef', 'coefficient', 'resp', 'response') 113 | if (!(type %in% ts)) { 114 | stop(paste('Unsupported type', type)) 115 | } 116 | 117 | if (type == "coef" || type == "coefficient") { 118 | uncertainty <- tidy(model, conf.int = TRUE) %>% 119 | filter(term == !! term) %>% 120 | mutate( 121 | df = df.residual(model), # get model degrees of freedom 122 | .draw = list(1:draws), # generate list of draw numbers 123 | t = map(df, ~rt(draws, .)) # simulate draws as t-scores 124 | ) %>% 125 | unnest(cols = c(".draw", "t")) %>% 126 | mutate(coef = t * std.error + estimate) 127 | } 128 | 129 | if (type == "resp" || type == "response") { 130 | # todo 131 | } 132 | 133 | return(uncertainty) 134 | } 135 | 136 | # permutation test to get the null distribution 137 | # @param df The dataframe 138 | # @param model The fitted model 139 | # @param terms A character vector of terms to be shuffled 140 | # @param func A function returning the point estimate from a model and a dataset 141 | # @param N The number of iterations 142 | permutation_test <- function (df, model, terms, func = NULL, N=200) { 143 | # ensure we have the same random samples across universe runs 144 | set.seed(3040) 145 | 146 | res = lapply(1:N, function (i) { 147 | # shuffle 148 | pm <- df[sample(nrow(df)), ] %>% 149 | dplyr::select(any_of(terms)) 150 | 151 | df2 = df %>% dplyr::select(-any_of(terms)) %>% 152 | bind_cols(pm) 153 | 154 | # fit the model 155 | m1 <- update(model, . 
~ ., data = df2) 156 | 157 | # point estimate 158 | if (!is.null(func)) { 159 | expected <- func(m1, df2) 160 | } else { 161 | # fixme: lmerMod need to set allow.new.levels = TRUE 162 | expected <- margins(pointwise_predict(m1, df2), terms[1])$expected 163 | } 164 | 165 | return(expected) 166 | }) 167 | 168 | # remove seed because set seed is global 169 | rm(.Random.seed, envir=.GlobalEnv) 170 | 171 | return(enframe(unlist(res))) 172 | } 173 | 174 | # get the pointwise log likelihood 175 | # @param model The fitted model 176 | # @param d_test The dataframe 177 | # @private 178 | compute_loglik <- function (model, d_test) { 179 | mu <- predict(model, d_test, type = "response") 180 | sigma <- sigma(model) 181 | y <- as.list(attr(terms(model), "variables"))[[2]] 182 | return(log(dnorm(d_test[[y]], mu, sigma)+1e-307)) 183 | } 184 | 185 | # get the pointwise log likelihood for stacking 186 | # @param df The dataframe 187 | # @param model The fitted model 188 | stacking <- function (df, model) { 189 | indices = cv_split(nrow(df), folds = 5) 190 | pointwise_density <- c() 191 | 192 | for (i in c(1:nrow(indices))) { 193 | d_train = df[indices$train[[i]], ] 194 | d_test = df[indices$test[[i]], ] 195 | 196 | m1 <- update(model, . ~ ., data = d_train) 197 | pointwise_density <- append(pointwise_density, compute_loglik(m1, d_test)) 198 | } 199 | 200 | return(pointwise_density) 201 | } 202 | -------------------------------------------------------------------------------- /boba/bobarun.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import pandas as pd 3 | import os 4 | import json 5 | import multiprocessing as mp 6 | from subprocess import PIPE 7 | from .lang import Lang 8 | from .wrangler import * 9 | 10 | 11 | class BobaRun: 12 | def __init__(self, folder, jobs=1, batch_size=0): 13 | # attributes 14 | self.folder = folder 15 | self.dir_log = os.path.join(folder, DIR_LOG) 16 | self.file_log = os.path.join(self.dir_log, 'logs.csv') 17 | self.pool = None 18 | self.exit_code = [] 19 | 20 | # read summary 21 | data = pd.read_csv(self.folder + '/summary.csv') 22 | self.size = data.shape[0] 23 | 24 | # multiprocessing attributes 25 | if jobs == 0: 26 | jobs = mp.cpu_count() 27 | if batch_size == 0: 28 | batch_size = min(int(self.size**0.5), int(self.size / jobs) + 1) 29 | self.jobs = jobs 30 | self.batch_size = batch_size 31 | 32 | # language 33 | fn = data['Filename'].to_list()[0] 34 | try: 35 | with open(self.folder + '/lang.json', 'r') as f: 36 | self.lang = Lang(fn, supported_langs=json.load(f)) 37 | except IOError: 38 | self.lang = Lang(fn) 39 | 40 | 41 | def run_multiverse(self, universes=[], resume=False): 42 | """ 43 | Run the multiverse. 
44 | 
45 |         Parameters:
46 |         - universes: a list of universe ids to run
47 |         - resume: skip log initialization and the pre-execute hook; the caller
48 |           must ensure that these steps have been done before calling
49 |         """
50 |         # do not allow simultaneous runs
51 |         if self.is_running():
52 |             return
53 | 
54 |         # initialize process pool
55 |         self.pool = mp.Pool(self.jobs)
56 | 
57 |         # by default, run all universes
58 |         if not len(universes):
59 |             universes = list(range(1, self.size + 1))
60 | 
61 |         if not resume:
62 |             # before execute
63 |             self.run_commands_in_folder('pre_exe.sh')
64 | 
65 |             # initialize the log folder and log file
66 |             self.exit_code = []
67 |             if os.path.exists(self.dir_log):
68 |                 shutil.rmtree(self.dir_log)
69 |             os.makedirs(self.dir_log)
70 | 
71 |             with open(self.file_log, 'w') as log:
72 |                 log.write('uid,exit_code\n')
73 | 
74 |         # callback that is run for each retrieved result.
75 |         # FIXME: if stopped, the last batch will not invoke the callback
76 |         def check_result(r):
77 |             self.exit_code += [[res[0], res[1]] for res in r]
78 |             # write the results to our logs
79 |             with open(self.file_log, 'a') as f_log:
80 |                 for res in r:
81 |                     f_log.write(f'{res[0]},{res[1]}\n')
82 | 
83 |         # run each batch of universes as a separate task
84 |         while len(universes):
85 |             batch = []
86 |             while len(universes) and len(batch) < self.batch_size:
87 |                 u = get_universe_script(universes.pop(0), self.lang.get_ext())
88 |                 batch.append(u)
89 | 
90 |             self.pool.apply_async(run_batch_of_universes,
91 |                                   args=(self.folder, batch, self.lang.supported_langs),
92 |                                   callback=check_result)
93 | 
94 |         # collect all the results
95 |         self.pool.close()
96 |         self.pool.join()
97 | 
98 |         # after execute
99 |         self.run_commands_in_folder('post_exe.sh')
100 |         self.pool = None
101 | 
102 | 
103 |     def resume_multiverse(self, universes=[]):
104 |         """
105 |         Resume the multiverse by skipping scripts that have already run in the
106 |         universe list.
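
        Parameters:
        - universes: a list of universe ids to run; defaults to all universes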
107 |         """
108 |         # if the log file is missing, run everything
109 |         if not os.path.exists(self.file_log):
110 |             return self.run_multiverse(universes)
111 | 
112 |         # default argument
113 |         if not len(universes):
114 |             universes = list(range(1, self.size + 1))
115 | 
116 |         # recover previous progress from log file
117 |         df = pd.read_csv(self.file_log)
118 |         self.exit_code = df.values.tolist()
119 | 
120 |         # skip scripts that are already run
121 |         lookup = set(df['uid'].tolist())
122 |         universes = [u for u in universes if u not in lookup]
123 |         self.run_multiverse(universes, resume=True)
124 | 
125 | 
126 |     def stop(self):
127 |         """ Stop all outstanding work in the pool """
128 |         if self.pool is not None:
129 |             print('Terminating')
130 |             # stop all workers
131 |             # note that everything after pool.join() will still run
132 |             self.pool.terminate()
133 | 
134 | 
135 |     def is_running(self):
136 |         """ Whether the multiverse is currently running """
137 |         return self.pool is not None
138 | 
139 | 
140 |     def run_from_cli(self, run_all=True, num=1, thru=-1):
141 |         """ Entry point of the boba run CLI """
142 |         # get the id of all the universes we want to run
143 |         thru = num if thru == -1 else thru
144 |         start = 1 if run_all else num
145 |         end = self.size if run_all else thru
146 |         universes = list(range(start, end + 1))
147 | 
148 |         # run
149 |         self.run_multiverse(universes)
150 | 
151 | 
152 |     def run_commands_in_folder(self, file_with_commands):
153 |         """ Run each shell command in the given file, from the multiverse folder """
154 |         cwd = os.getcwd()
155 |         os.chdir(self.folder)
156 |         with open(file_with_commands) as f:
157 |             for line in f.readlines():
158 |                 os.system(line)
159 |         os.chdir(cwd)
160 | 
161 | 
162 |     def run_after_execute(self):
163 |         self.run_commands_in_folder('post_exe.sh')
164 | 
165 | 
166 | # these two functions can't be in the class because multiprocessing
167 | # does not know how to properly serialize functions in classes
168 | def run_batch_of_universes(folder, universes, supported_langs):
169 |     """ Run a batch of universes """
170 |     batch = []
171 |     for universe in universes:
172 |         batch.append(run_universe(folder, universe, supported_langs))
173 | 
174 |     return batch
175 | 
176 | 
177 | def run_universe(folder, script, supported_langs):
178 |     """ Run one universe """
179 |     cmds = Lang(script, supported_langs=supported_langs).get_cmd()
180 | 
181 |     universe_id = get_universe_id_from_script(script)
182 |     universe_name_fmt = '[' + get_universe_name(universe_id) + ']'
183 |     for cmd in cmds:
184 |         out = subprocess.Popen(cmd, cwd=os.path.join(folder, DIR_SCRIPT),
185 |                                stdout=PIPE, stderr=PIPE)
186 | 
187 |         log_dir = os.path.join(folder, DIR_LOG)
188 |         with open(os.path.join(log_dir, get_universe_log(universe_id)), 'w') as log:
189 |             while True:
190 |                 # blocks here until the next line is available
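                # an empty read plus a finished process (poll() not None) means EOF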
191 |                 output = out.stdout.readline().decode('utf-8')
192 |                 if output == '' and out.poll() is not None:
193 |                     break
194 |                 if output:
195 |                     print(universe_name_fmt + " " + output, end='')
196 |                     log.write(output)
197 |             rc = out.poll()
198 | 
199 |         err = out.communicate()[1]
200 |         err_decoded = err.decode('utf-8')
201 |         if err_decoded != '':
202 |             with open(os.path.join(log_dir, get_universe_error_log(universe_id)), 'w') as err_log:
203 |                 err_log.write(err_decoded)
204 | 
205 |             print(universe_name_fmt + ' error:\n' + err_decoded, end='')
206 |             break
207 | 
208 |     return universe_id, out.returncode
209 | 
--------------------------------------------------------------------------------
/example/hurricane/template.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | # --- (BOBA_CONFIG)
3 | {
4 |   "decisions": [
5 |     {"var": "outliers", "options": [
6 |       "c()",
7 |       "c('Katrina')",
8 |       "c('Katrina', 'Audrey')",
9 |       "c('Katrina', 'Audrey', 'Sandy')",
10 |       "c('Katrina', 'Audrey', 'Sandy', 'Andrew')",
11 |       "c('Katrina', 'Audrey', 'Sandy', 'Andrew', 'Donna')"
12 |     ]},
13 |     {"var": "feminity", "options": ["female", "masfem"]},
14 |     {"var": "feminity_prediction_levels", "options": ["c(0, 1)", "c(2.53, 8.29)"]},
15 |     {"var": "damage", "options": ["dam", "log_dam"]},
16 |     {"var": "predictors", "options": [
17 |       "feminity * damage",
18 |       "feminity + damage + pressure + feminity:damage + feminity:pressure",
19 |       "feminity + damage + zwin + feminity:damage + feminity:zwin",
20 |       "feminity + damage + zcat + feminity:damage + feminity:zcat",
21 |       "feminity + damage + z3 + feminity:damage + feminity:z3",
22 |       "feminity + damage + z3"
23 |     ]},
24 |     {"var": "covariates", "options": [
25 |       "",
26 |       "+ year:damage",
27 |       "+ post:damage"
28 |     ]},
29 |     {"var": "back_transform", "options": [
30 |       "exp(mu + sigma^2/2) - 1",
31 |       "mu",
32 |       "exp(mu + sigma^2/2) - 1"
33 |     ]},
34 |     {"var": "df", "options": [
35 |       "pred$df",
36 |       "df.residual(model)",
37 |       "pred$df"
38 |     ]}
39 |   ],
40 |   "constraints": [
41 |     {"link": ["feminity", "feminity_prediction_levels"]},
42 |     {"link": ["Model", "back_transform", "df"]}
43 |   ],
44 |   "before_execute": "cp ../data.csv ./ && rm -rf results && mkdir results",
45 |   "visualizer": "visualizer_config.json"
46 | }
47 | # --- (END)
48 | 
49 | library(readr)
50 | library(MASS)
51 | library(modelr)
52 | library(tidyverse)
53 | library(broom.mixed)
54 | library(tidybayes)
55 | 
56 | # a function for post-processing predicted means and standard deviations into expected number of deaths
57 | pred2expectation <- function(mu, sigma) {
58 |   return({{back_transform}})
59 | }
60 | 
61 | # a custom function for cross validation
62 | cross <- function (df, func, fml, folds = 5) {
63 |   l = nrow(df) %/% folds
64 |   mse = 0
65 |   for (i in c(1:folds)) {
66 |     # properly splitting train/test
67 |     i1 = l*(i-1)+1
68 |     i2 = l*i
69 |     d_test = df[i1:i2, ]
70 |     if (i1 > 1) {
71 |       if (i2+1 < nrow(df)) {
72 |         d_train = rbind(df[1:(i1-1), ], df[(i2+1):nrow(df), ])
73 |       } else {
74 |         d_train = df[1:(i1-1), ]
75 |       }
76 |     } else {
77 |       d_train = df[(i2+1):nrow(df), ]
78 |     }
79 | 
80 |     model <- func(fml, data = d_train)
81 |     mu <- predict(model, d_test, type = "response")
82 |     sigma <- sigma(model)
83 |     expected_deaths <- pred2expectation(mu, sigma)
84 | 
85 |     mse = mse + sum((d_test$death - expected_deaths)^2)
86 |   }
87 | 
88 |   mse = sqrt(mse / nrow(df))
89 |   return(mse)
90 | }
91 | 
92 | # read and process data
93 | df <- read_csv('../data.csv',
94 |   col_types = cols(
95 |     Year = col_integer(),
96 |     Category = col_integer(),
97 |     Gender_MF = col_integer(),
98 |     alldeaths = col_integer()
99 |   )) %>%
100 |   # rename some variables
101 |   dplyr::select(
102 |     year = Year,
103 |     name = Name,
104 |     dam = NDAM,
105 |     death = alldeaths,
106 |     female = Gender_MF,
107 |     masfem = MasFem,
108 |     category = Category,
109 |     pressure = Minpressure_Updated_2014,
110 |     wind = HighestWindSpeed
111 |   ) %>%
112 |   # create new variables
113 |   mutate(
114 |     log_death = log(death + 1),
115 |     log_dam = log(dam),
116 |     post = ifelse(year>1979, 1, 0),
117 |     zdam = scale(dam),
118 |     zcat = as.numeric(scale(category)),
119 |     zmin = -scale(pressure),
120 |     zwin = as.numeric(scale(wind)),
121 |     z3 = as.numeric((zmin + zcat + zwin) / 3)
122 |   ) %>%
123 |   # remove outliers
124 |   filter(!(name %in% {{outliers}})) %>%
125 |   # operationalize feminity
126 |   mutate(
127 |     feminity = {{feminity}},
128 |     damage = {{damage}}
129 |   )
130 | 
131 | # --- (Model) ols_regression
132 | # OLS regression with log(deaths+1) as the dependent variable
133 | model <- lm(log_death ~ {{predictors}} {{covariates}}, data = df)
134 | fit = cross(df, lm, log_death ~ {{predictors}} {{covariates}}) # cross validation
135 | 
136 | # --- (Model) negative_binomial
137 | # Negative binomial with deaths as the dependent variable
138 | model <- glm.nb(death ~ {{predictors}} {{covariates}}, data = df)
139 | fit = cross(df, glm.nb, death ~ {{predictors}} {{covariates}}) # cross validation
140 | 
141 | # --- (Model) anova
142 | # ANOVA with log(deaths+1) as the dependent variable
143 | model <- aov(log_death ~ {{predictors}} {{covariates}}, data = df)
144 | fit = cross(df, aov, log_death ~ {{predictors}} {{covariates}}) # cross validation
145 | 
146 | # --- (O)
147 | # normalize RMSE
148 | nrmse = fit / (max(df$death) - min(df$death))
149 | 
150 | # get prediction
151 | pred <- predict(model, se.fit = TRUE, type = "response")
152 | disagg_fit <- df %>%
153 |   mutate(
154 |     fit = pred$fit, # add fitted predictions and standard errors to dataframe
155 |     se.fit = pred$se.fit,
156 |     df = {{df}}, # get degrees of freedom
157 |     sigma = sigma(model), # get residual standard deviation
158 |     se.residual = sqrt(sum(residuals(model)^2) / df) # get residual standard errors
159 |   )
160 | 
161 | # aggregate fitted effect of female storm name
162 | expectation <- disagg_fit %>%
163 |   mutate(expected_deaths = pred2expectation(fit, sigma)) %>%
164 |   group_by(female) %>% # group by predictor(s) of interest
165 |   summarize(expected_deaths = weighted.mean(expected_deaths)) %>% # marginalize across other predictors
166 |   compare_levels(expected_deaths, by = female) %>%
167 |   ungroup() %>%
168 |   dplyr::select(expected_diff = expected_deaths) %>%
169 |   add_column(NRMSE = nrmse) # add cross validation metric
170 | 
171 | # propagate uncertainty in fit to model predictions
172 | uncertainty <- disagg_fit %>%
173 |   mutate(
174 |     .draw = list(1:5000), # generate list of draw numbers
175 |     t = map(df, ~rt(5000, .)), # simulate draws from t distribution to transform into means
176 |     x = map(df, ~rchisq(5000, .)) # simulate draws from chi-squared distribution to transform into sigmas
177 |   ) %>%
178 |   unnest(cols = c(".draw", "t", "x")) %>%
179 |   mutate(
180 |     mu = t * se.fit + fit, # scale and shift t to get a sampling distribution of means
181 |     sigma = sqrt(df * se.residual^2 / x), # scale and take inverse of x to get a sampling distribution of sigmas
182 |     expected_deaths = pred2expectation(mu, sigma)
183 |   ) %>%
184 |   group_by(.draw, female) %>% # group by predictor(s) of interest
185 |   summarize(expected_deaths = mean(expected_deaths)) %>% # marginalize across other predictors
186 |   compare_levels(expected_deaths, by = female) %>%
187 |   ungroup() %>%
188 |   dplyr::select(expected_diff = expected_deaths)
189 | 
190 | # only output relevant fields in disagg_fit
191 | disagg_fit <- disagg_fit %>%
192 |   mutate(expected_deaths = pred2expectation(fit, sigma)) %>%
193 |   dplyr::select(
194 |     observed = death,
195 |     expected = expected_deaths
196 |   )
197 | 
198 | # output
199 | write_csv(expectation, '../results/estimate_{{_n}}.csv')
200 | write_csv(disagg_fit, '../results/disagg_fit_{{_n}}.csv')
201 | write_csv(uncertainty, '../results/uncertainty_{{_n}}.csv')
202 | 
--------------------------------------------------------------------------------
/test/test_constraint_parser.py:
--------------------------------------------------------------------------------
1 | # Ugly hack to allow import from the root folder
2 | import sys
3 | import os
4 | sys.path.insert(0, os.path.abspath('..'))
5 | 
6 | import unittest
7 | from boba.constraintparser import ConstraintParser, ParseError
8 | from boba.conditionparser import ConditionParser, TokenType
9 | from boba.parser import Parser
10 | 
11 | 
12 | def abs_path(rel_path):
13 |     return os.path.join(os.path.dirname(__file__), rel_path)
14 | 
15 | 
16 | def read_wrapper(spec, ps):
17 |     ConstraintParser(spec).read_constraints(ps.code_parser, ps.dec_parser)
18 | 
19 | 
20 | class TestConstraintParser(unittest.TestCase):
21 | 
22 |     def test_read_json(self):
23 |         base = abs_path('./specs/')
24 |         ps = Parser(base+'script3-1.py')
25 |         cp = ConstraintParser(ps.spec)
26 |         cs = cp.read_constraints(ps.code_parser, ps.dec_parser)
27 |         self.assertEqual(len(cs), 2)
28 | 
29 |     def test_link(self):
30 |         base = abs_path('./specs/')
31 |         ps = Parser(base + 'script3-7.py')
32 |         cp = ConstraintParser(ps.spec)
33 |         cs = cp.read_constraints(ps.code_parser, ps.dec_parser)
34 |         self.assertEqual(len(cs), 10)
35 | 
36 |     def test_condition_parser(self):
37 |         cond = ''
38 |         ConditionParser(cond).parse()
39 | 
40 |         cond = 'a == b'
41 |         _, decs = ConditionParser(cond).parse()
42 |         self.assertListEqual(['a', 'b'], [d.value for d in decs])
43 | 
44 |         cond = 'a.index == 1'
45 |         _, decs = ConditionParser(cond).parse()
46 |         self.assertListEqual(['a', '1'], [d.value for d in decs])
47 |         self.assertListEqual([TokenType.index_var, TokenType.number],
48 |                              [d.type for d in decs])
49 | 
50 |         cond = 'a = 2.5'
51 |         _, decs = ConditionParser(cond).parse()
52 |         self.assertListEqual(['a', '2.5'], [d.value for d in decs])
53 |         self.assertListEqual([TokenType.var, TokenType.number],
54 |                              [d.type for d in decs])
55 | 
56 |         cond = 'a.index == b.index'  # .index not allowed on RHS, should fail
57 |         with self.assertRaises(ParseError):
58 |             ConditionParser(cond).parse()
59 | 
60 |         cond = '1 2 a b 4'  # we do not check other semantics, so this still parses
61 |         ConditionParser(cond).parse()
62 | 
63 |     def test_eval(self):
64 |         """ Evaluation of various conditions """
65 |         # expr and expr
66 |         base = abs_path('./specs/')
67 |         ps = Parser(base + 'script3-6.py', base)
68 |         ps.main(verbose=False)
69 |         self.assertEqual(ps.wrangler.counter, 2)
70 | 
71 |         # expr or expr
72 |         ps.spec['constraints'] = [{"block": "D", "condition": "a == if or B == b1"}]
73 |         ps._parse_constraints()
74 |         ps.main(verbose=False)
75 |         self.assertEqual(ps.wrangler.counter, 6)
76 | 
77 |         # expr and (expr or expr)
78 |         ps.spec['constraints'] = [{"block": "D", "condition": "a == if and (B == b1 or B == b2)"}]
79 |         ps._parse_constraints()
80 |         ps.main(verbose=False)
81 |         self.assertEqual(ps.wrangler.counter, 4)
82 | 
83 |         # testing !=
84 |         ps.spec['constraints'] = [{"block": "D", "condition": "a != if"}]
85 |         ps._parse_constraints()
86 |         ps.main(verbose=False)
87 |         self.assertEqual(ps.wrangler.counter, 4)
88 | 
89 |         # testing >=
90 |         ps.spec['constraints'] = [{"block": "D", "condition": "a.index >= 1"}]
91 |         ps._parse_constraints()
92 |         ps.main(verbose=False)
93 |         self.assertEqual(ps.wrangler.counter, 4)
94 | 
95 |         # testing index
96 |         ps.spec['constraints'] = [{"block": "D", "condition": "b.index == 1"}]
97 |         ps._parse_constraints()
98 |         ps.main(verbose=False)
99 |         self.assertEqual(ps.wrangler.counter, 4)
100 | 
101 |         # testing option with integer type
102 |         ps.spec['constraints'] = [{"block": "D", "condition": "b == 0"}]
103 |         ps._parse_constraints()
104 |         ps.main(verbose=False)
105 |         self.assertEqual(ps.wrangler.counter, 4)
106 | 
107 |         # testing option with float type
108 |         ps.spec['constraints'] = [{"block": "D", "condition": "b == 1.5"}]
109 |         ps._parse_constraints()
110 |         ps.main(verbose=False)
111 |         self.assertEqual(ps.wrangler.counter, 4)
112 | 
113 |         # testing unmade decision
114 |         ps.spec['constraints'] = [{"block": "A", "condition": "b.index == 0"}]
115 |         ps._parse_constraints()
116 |         ps.main(verbose=False)
117 |         self.assertEqual(ps.wrangler.counter, 0)
118 | 
119 |         # testing if the decision is made when the block depends on a variable
120 |         # inside the block
121 |         ps.spec['constraints'] = [{"block": "B", "condition": "b.index == 0"}]
122 |         ps._parse_constraints()
123 |         ps.main(verbose=False)
124 |         self.assertEqual(ps.wrangler.counter, 0)
125 | 
126 |     def test_condition_syntax(self):
127 |         """ Does the condition code contain a Python syntax error? """
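        # conditions with bad syntax should raise ParseError when the spec is read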
""" 128 | 129 | base = abs_path('./specs/') 130 | ps = Parser(base+'script3-1.py', base) 131 | 132 | spec = {'constraints': [{'block': 'A', 'condition': 'B=b1'}]} 133 | with self.assertRaises(ParseError): 134 | read_wrapper(spec, ps) 135 | 136 | spec = {'constraints': [{'block': 'A', 'condition': 'B b1'}]} 137 | with self.assertRaises(ParseError): 138 | read_wrapper(spec, ps) 139 | 140 | spec = {'constraints': [{'block': 'A', 'condition': 'B == 2.5'}]} 141 | read_wrapper(spec, ps) 142 | 143 | def test_json_syntax(self): 144 | """ Test various possibilities to specify constraints in JSON """ 145 | 146 | base = abs_path('./specs/') 147 | ps = Parser(base+'script3-1.py', base) 148 | 149 | # empty - should parse 150 | spec = {} 151 | read_wrapper(spec, ps) 152 | 153 | # empty array - should parse 154 | spec = {'constraints': []} 155 | read_wrapper(spec, ps) 156 | 157 | # empty element - should fail 158 | spec = {'constraints': [{}]} 159 | with self.assertRaises(ParseError): 160 | read_wrapper(spec, ps) 161 | 162 | # no matching block - should fail 163 | spec = {'constraints': [{'block': 'a'}]} 164 | with self.assertRaises(ParseError): 165 | read_wrapper(spec, ps) 166 | 167 | # no matching variable - should fail 168 | spec = {'constraints': [{'variable': 'c'}]} 169 | with self.assertRaises(ParseError): 170 | read_wrapper(spec, ps) 171 | 172 | # loner option - should fail 173 | spec = {'constraints': [{'option': 'a1'}]} 174 | with self.assertRaises(ParseError): 175 | read_wrapper(spec, ps) 176 | 177 | # loner block - should parse 178 | spec = {'constraints': [{'block': 'A', 'condition': 'B==b1'}]} 179 | read_wrapper(spec, ps) 180 | 181 | # block and option - should parse 182 | spec = {'constraints': [{'block': 'A', 'option': 'a1', 'condition': 'B==b1'}]} 183 | read_wrapper(spec, ps) 184 | 185 | # variable and option - should parse 186 | spec = {'constraints': [{'variable': 'a', 'option': '2.5', 'condition': 'B==b1'}]} 187 | read_wrapper(spec, ps) 188 | 189 | # weird option - should parse 190 | # fixme: {'option': '[1,2]'} will fail 191 | spec = {'constraints': [{'variable': 'c', 'option': '[1, 2]', 'condition': 'B==b1'}]} 192 | read_wrapper(spec, ps) 193 | 194 | # variables in condition do not match - should fail 195 | spec = {'constraints': [{'block': 'A', 'condition': 'H==b1'}]} 196 | with self.assertRaises(ParseError): 197 | read_wrapper(spec, ps) 198 | 199 | # variables in condition do not match - should fail 200 | spec = {'constraints': [{'block': 'A', 'condition': 'H.index==1'}]} 201 | with self.assertRaises(ParseError): 202 | read_wrapper(spec, ps) 203 | 204 | 205 | if __name__ == '__main__': 206 | unittest.main() 207 | -------------------------------------------------------------------------------- /tutorial/simple.md: -------------------------------------------------------------------------------- 1 | # Getting started 2 | 3 | In this tutorial, we will walk you through a simple analysis scenario to 4 | demonstrate how you might write and execute multiverse using our tool. 5 | 6 | ### A simple analysis script 7 | 8 | Let's say we have the following analysis script that reads a data file, removes 9 | outliers, and fits a linear model. 
10 | 
11 | ```python
12 | import pandas as pd
13 | import numpy as np
14 | import statsmodels.api as sm
15 | 
16 | if __name__ == '__main__':
17 |     # read data file
18 |     df = pd.read_csv('data.csv')
19 | 
20 |     # remove outliers
21 |     # discard rows outside 2 x std
22 |     df = df[np.abs(df.y - df.y.mean()) <= (2 * df.y.std())]
23 | 
24 |     # fit a simple ordinary least squares model
25 |     x = sm.add_constant(df.x)
26 |     lm = sm.OLS(df.y, x).fit()
27 | ```
28 | 
29 | ### Placeholder variable
30 | 
31 | Suppose the threshold for removing outliers is pretty subjective; you can
32 | justify removing data points outside 2, 2.5, or 3 standard deviations of the
33 | mean. Would the prediction change if you adopted a different threshold? To test
34 | this, you might insert a decision point and ask the tool to output a
35 | separate script for each possible threshold configuration. To insert a decision,
36 | first add a placeholder variable `{{var_name}}` to the above code:
37 | 
38 | ```python
39 | df = df[np.abs(df.y - df.y.mean()) <= ({{cutoff}} * df.y.std())]
40 | ```
41 | 
42 | Then, in a separate JSON file, you could list the possible options this
43 | placeholder variable can take on:
44 | 
45 | ```json
46 | {
47 |   "decisions": [
48 |     {"var": "cutoff", "options": [2, 2.5, 3] }
49 |   ]
50 | }
51 | ```
52 | 
53 | Now, calling the tool with the file path to your script and the JSON file will
54 | output three Python scripts. Each script is a universe where you choose a different cutoff
55 | value for removing outliers; for example, one of the universes is exactly the
56 | same as the analysis script we started with. The tool also outputs a summary
57 | table to let you know which parameter value is used by which file:
58 | 
59 | |Filename     |Code Path|cutoff|
60 | |-------------|---------|------|
61 | |universe_1.py|_start   |2     |
62 | |universe_2.py|_start   |2.5   |
63 | |universe_3.py|_start   |3     |
64 | 
65 | (The table contains an unfamiliar column "Code Path", which we will explain in
66 | a minute!)
67 | 
68 | If you specify multiple decisions, we will output **all combinations** of
69 | possible alternatives. Namely, the number of output scripts will be the
70 | cross-product of the number of options for each decision; for example, two decisions with three and two options respectively yield 3 x 2 = 6 scripts.
71 | 
72 | ### Code blocks
73 | 
74 | Your decision point can be more complex than replacing the value of a variable.
75 | For example, instead of removing data points outside some standard deviations
76 | of the mean, it is also reasonable to remove data points outside some IQRs of
77 | the median.
78 | 
79 | ```python
80 | iqr = np.subtract(*np.percentile(df.y, [75, 25]))
81 | median = np.median(df.y)
82 | df = df[abs(df.y - median) <= 3 * iqr]
83 | ```
84 | As you can see, this alternative requires a few lines to implement; it is no
85 | longer a straightforward value substitution. You can of course write the entire
86 | block of code as a string into the options array, but that would be really
87 | cumbersome.
88 | 
89 | You might instead consider using code blocks. Instead of a
90 | linear flow from start to end, your code can consist of blocks, similar to
91 | cells in a Jupyter notebook or an R Markdown document. To specify a code block, simply insert
92 | a comment line with the syntax `# --- (ID) option` immediately
93 | before the starting line of the block. The lines of code between this
94 | declaration and the next (or the end of the file) form a block
95 | named `ID`. We will go ahead and insert three such comments into
96 | our script:
97 | 
98 | ```python
99 | import pandas as pd
100 | import numpy as np
101 | import statsmodels.api as sm
102 | 
103 | if __name__ == '__main__':
104 |     # read data file
105 |     df = pd.read_csv('../data.csv')
106 | 
107 |     # --- (A) std
108 |     # remove outliers based on std
109 |     df = df[np.abs(df.y - df.y.mean()) <= ({{cutoff}} * df.y.std())]
110 | 
111 |     # --- (A) iqr
112 |     # remove outliers based on iqr
113 |     iqr = np.subtract(*np.percentile(df.y, [75, 25]))
114 |     median = np.median(df.y)
115 |     df = df[abs(df.y - median) <= 3 * iqr]
116 | 
117 |     # --- (B)
118 |     # fit a simple ordinary least squares model
119 |     x = sm.add_constant(df.x)
120 |     lm = sm.OLS(df.y, x).fit()
121 | 
122 |     # display results
123 |     print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
124 |     print('AIC: {:.2f}'.format(lm.aic))
125 |     print('Cohen\'s f2: {:.3f}'.format(lm.rsquared_adj))
126 | ```
127 | 
128 | These three comments break the code into **four** blocks. All lines before
129 | `# --- (A) std` belong to the first, unnamed block. All lines between `# --- (A) std`
130 | and `# --- (A) iqr` belong to block "A" with option "std". All lines between `# --- (A) iqr` and `# --- (B)` belong to block "A" with option "iqr". All lines between
131 | `# --- (B)` and the end of the file belong to block "B".
132 | 
133 | Note that we have two types of blocks: some blocks, such as `(A) std` and
134 | `(A) iqr`, specify an *option* after the parenthesis.
135 | Such blocks are called *decision blocks*; the same ID can take on different
136 | options, not unlike a placeholder variable. Other blocks, such as `(B)`, are
137 | normal blocks that do not act like a decision point.
138 | 
139 | We now need to tell boba the relationship between the blocks.
140 | We want to remove outliers before fitting the model, so the order of the blocks
141 | should be A followed by B. Note that while A has two options `std` and `iqr`,
142 | we only use `A` in the graph and boba will choose different options in
143 | different universes. Let's specify the relationship of the blocks as a directed
144 | graph in the JSON file:
145 | 
146 | ```json
147 | {
148 |   "graph": ["A->B"],
149 |   "decisions": [
150 |     {"var": "cutoff", "options": [2, 2.5, 3] }
151 |   ]
152 | }
153 | ```
154 | The graph is optional, with the default being a linear path of all the blocks
155 | according to their order in the template script. In this example, we could
156 | omit the graph and still get the same result.
157 | 
158 | Now, calling the program with our updated script and JSON will generate 4
159 | universes where the following values and code paths are chosen:
160 | 
161 | |Filename     |Code Path   |cutoff|(A)|
162 | |-------------|------------|------|---|
163 | |universe_1.py|_start->A->B|2     |std|
164 | |universe_2.py|_start->A->B|2.5   |std|
165 | |universe_3.py|_start->A->B|3     |std|
166 | |universe_4.py|_start->A->B|      |iqr|
167 | 
168 | Since we did not use the parameter `cutoff` in our outlier removal code
169 | involving IQR, our multiverse does not expand the parameter `cutoff` when
170 | block `A` takes the option `iqr`.
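
To make this concrete, here is roughly what the `iqr` universe
(`universe_4.py` in the table above) would contain. This is a sketch we
assembled by hand from the template; the script boba actually generates may
differ slightly in formatting:

```python
import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('../data.csv')

    # remove outliers based on iqr
    iqr = np.subtract(*np.percentile(df.y, [75, 25]))
    median = np.median(df.y)
    df = df[abs(df.y - median) <= 3 * iqr]

    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s f2: {:.3f}'.format(lm.rsquared_adj))
```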
If we change the code in IQR to be:
171 | 
172 | ```python
173 | df = df[abs(df.y - median) <= {{cutoff}} * iqr]
174 | ```
175 | 
176 | We will get 6 universes:
177 | 
178 | |Filename     |Code Path   |cutoff|(A)|
179 | |-------------|------------|------|---|
180 | |universe_1.py|_start->A->B|2     |iqr|
181 | |universe_2.py|_start->A->B|2.5   |iqr|
182 | |universe_3.py|_start->A->B|3     |iqr|
183 | |universe_4.py|_start->A->B|2     |std|
184 | |universe_5.py|_start->A->B|2.5   |std|
185 | |universe_6.py|_start->A->B|3     |std|
186 | 
187 | Take a look at the generated Python scripts
188 | [here](https://github.com/uwdata/boba/tree/master/example/simple/output/code).
189 | 
190 | (You may notice that all code paths in the above table are the same. In a more
191 | complex analysis, we might produce differing code paths by creating
192 | branches in the directed graph. We will cover
193 | advanced usage of the graph in a later tutorial.)
194 | 
195 | ### Executing the multiverse
196 | After you are happy with the generated scripts, you might want to execute them
197 | all to compute the results. Boba has a command for executing universes:
198 | 
199 | ```bash
200 | boba run --all
201 | ```
202 | It will run **all** the scripts for you! Before you do this, you might want
203 | to run one script, or simply look at a few scripts, to ensure that
204 | the generated code does not have syntax errors, etc. To run a selected range
205 | of universes, for example universes 1 through 3, do:
206 | 
207 | ```bash
208 | boba run 1 --thru 3
209 | ```
210 | 
211 | ### Try it yourself!
212 | 
213 | The code and data of this example are available [here](https://github.com/uwdata/boba/tree/master/example/simple).
214 | To run the example, clone this repository and run the following commands:
215 | 
216 | ```bash
217 | pip install -e .
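# then install boba's runtime dependencies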
218 | pip install -r requirements.txt
219 | cd example/simple
220 | boba compile
221 | ```
--------------------------------------------------------------------------------
/example/hurricane/reproduce/repro_bootstrap.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | # Use bootstrapping to get uncertainty distribution
3 | # Issue: majority of glm.nb models fail to fit on bootstrapped data
4 | # --- (BOBA_CONFIG)
5 | {
6 |   "decisions": [
7 |     {"var": "outliers", "options": [
8 |       "c()",
9 |       "c('Katrina')",
10 |       "c('Katrina', 'Audrey')"
11 |     ]},
12 |     {"var": "leverage_points", "options": [
13 |       "c()",
14 |       "c('Sandy')",
15 |       "c('Sandy', 'Andrew')",
16 |       "c('Sandy', 'Andrew', 'Donna')"
17 |     ]},
18 |     {"var": "feminity", "options": ["female", "masfem"]},
19 |     {"var": "damage", "options": ["dam", "log_dam"]},
20 |     {"var": "predictors", "options": [
21 |       "feminity * damage",
22 |       "feminity + damage + pressure + feminity:damage + feminity:pressure",
23 |       "feminity + damage + zwin + feminity:damage + feminity:zwin",
24 |       "feminity + damage + zcat + feminity:damage + feminity:zcat",
25 |       "feminity + damage + z3 + feminity:damage + feminity:z3",
26 |       "feminity + damage + z3"
27 |     ]},
28 |     {"var": "covariates", "options": [
29 |       "",
30 |       "+ year:damage",
31 |       "+ post:damage"
32 |     ]},
33 |     {"var": "back_transform", "options": [
34 |       "exp(mu + sigma^2/2) - 1",
35 |       "mu"
36 |     ]},
37 |     {"var": "model_prefix", "options": ["lm(log_death", "glm.nb(death"]},
38 |     {"var": "df", "options": [
39 |       "pred$df",
40 |       "df.residual(model)"
41 |     ]}
42 |   ],
43 |   "constraints": [
44 |     {"link": ["Model", "back_transform", "df", "model_prefix"]}
45 |   ],
46 |   "before_execute": "cp ../data.csv ./ && rm -rf results && mkdir results"
47 | }
48 | # --- (END)
49 | 
50 | library(readr)
51 | library(MASS)
52 | library(modelr)
53 | library(tidyverse)
54 | library(broom.mixed)
55 | library(tidybayes)
56 | library(boot)
57 | 
58 | # a function for post-processing predicted means and standard deviations into expected number of deaths
59 | pred2expectation <- function(mu, sigma) {
60 |   return({{back_transform}})
61 | }
62 | 
63 | # a custom function for cross validation
64 | cross <- function (df, func, fml, folds = 5) {
65 |   l = nrow(df) %/% folds
66 |   mse = 0
67 |   for (i in c(1:folds)) {
68 |     # properly splitting train/test
69 |     i1 = l*(i-1)+1
70 |     i2 = l*i
71 |     d_test = df[i1:i2, ]
72 |     if (i1 > 1) {
73 |       if (i2+1 < nrow(df)) {
74 |         d_train = rbind(df[1:(i1-1), ], df[(i2+1):nrow(df), ])
75 |       } else {
76 |         d_train = df[1:(i1-1), ]
77 |       }
78 |     } else {
79 |       d_train = df[(i2+1):nrow(df), ]
80 |     }
81 | 
82 |     model <- func(fml, data = d_train)
83 |     mu <- predict(model, d_test, type = "response")
84 |     sigma <- sigma(model)
85 |     expected_deaths <- pred2expectation(mu, sigma)
86 | 
87 |     mse = mse + sum((d_test$death - expected_deaths)^2)
88 |   }
89 | 
90 |   mse = sqrt(mse / nrow(df))
91 |   return(mse)
92 | }
93 | 
94 | # read and process data
95 | df <- read_csv('../data.csv',
96 |   col_types = cols(
97 |     Year = col_integer(),
98 |     Category = col_integer(),
99 |     Gender_MF = col_integer(),
100 |     alldeaths = col_integer()
101 |   )) %>%
102 |   # rename some variables
103 |   dplyr::select(
104 |     year = Year,
105 |     name = Name,
106 |     dam = NDAM,
107 |     death = alldeaths,
108 |     female = Gender_MF,
109 |     masfem = MasFem,
110 |     category = Category,
111 |     pressure = Minpressure_Updated_2014,
112 |     wind = HighestWindSpeed
113 |   ) %>%
114 |   # create new variables
115 |   mutate(
116 |     log_death = log(death + 1),
117 |     log_dam = log(dam),
118 |     post = ifelse(year>1979, 1, 0),
119 |     zdam = scale(dam),
120 |     zcat = as.numeric(scale(category)),
121 |     zmin = -scale(pressure),
122 |     zwin = as.numeric(scale(wind)),
123 |     z3 = as.numeric((zmin + zcat + zwin) / 3)
124 |   ) %>%
125 |   # remove outliers
126 |   filter(!(name %in% {{outliers}})) %>%
127 |   filter(!(name %in% {{leverage_points}})) %>%
128 |   # operationalize feminity
129 |   mutate(
130 |     feminity = {{feminity}},
131 |     damage = {{damage}}
132 |   )
133 | 
134 | # --- (Model) ols_regression
135 | # OLS regression with log(deaths+1) as the dependent variable
136 | model <- lm(log_death ~ {{predictors}} {{covariates}}, data = df)
137 | fit = cross(df, lm, log_death ~ {{predictors}} {{covariates}}) # cross validation
138 | 
139 | # --- (Model) negative_binomial
140 | # Negative binomial with deaths as the dependent variable
141 | model <- glm.nb(death ~ {{predictors}} {{covariates}}, data = df)
142 | fit = cross(df, glm.nb, death ~ {{predictors}} {{covariates}}) # cross validation
143 | 
144 | # --- (O)
145 | # normalize RMSE
146 | nrmse = fit / (max(df$death) - min(df$death))
147 | 
148 | # get prediction
149 | pred <- predict(model, se.fit = TRUE, type = "response")
150 | disagg_fit <- df %>%
151 |   mutate(
152 |     fit = pred$fit, # add inferential fits and standard errors to dataframe
153 |     se.fit = pred$se.fit,
154 |     df = {{df}}, # get degrees of freedom
155 |     sigma = sigma(model), # get residual standard deviation
156 |     se.residual = sqrt(sum(residuals(model)^2) / df) # get residual standard errors
157 |   )
158 | 
159 | # aggregate fitted effect of female storm name
160 | expectation <- disagg_fit %>%
161 |   mutate(expected_deaths = pred2expectation(fit, sigma)) %>%
162 |   group_by(female) %>% # group by predictor(s) of interest
163 |   summarize(expected_deaths = weighted.mean(expected_deaths)) %>% # marginalize across other predictors
164 |   compare_levels(expected_deaths, by = female) %>%
165 |   ungroup() %>%
166 |   dplyr::select(expected_diff = expected_deaths) %>%
167 |   add_column(NRMSE = nrmse) # add cross validation metric
168 | 
169 | # bootstrap function
170 | rsq <- function(data, indices) {
171 |   d <- data[indices,] # allows boot to select sample
172 |   model <- {{model_prefix}} ~ {{predictors}} {{covariates}} , data = d)
173 |   pred <- predict(model, se.fit = TRUE, type = "response")
174 |   disagg_fit <- d %>%
175 |     mutate(
176 |       fit = pred$fit, # add inferential fits and standard errors to dataframe
177 |       sigma = sigma(model) # get residual standard deviation
178 |     )
179 |   expectation <- disagg_fit %>%
180 |     mutate(expected_deaths = pred2expectation(fit, sigma)) %>%
181 |     group_by(female) %>% # group by predictor(s) of interest
182 |     summarize(expected_deaths = weighted.mean(expected_deaths)) %>% # marginalize across other predictors
183 |     compare_levels(expected_deaths, by = female) %>%
184 |     ungroup()
185 |   return(expectation$expected_deaths)
186 | }
187 | 
188 | # bootstrap
189 | bootstrap <- boot(data=df, statistic=rsq, R=200)
190 | bootstrap <- tidy(bootstrap$t) %>%
191 |   select(expected_diff=x)
192 | 
193 | # propagate uncertainty in fit to model predictions
194 | uncertainty <- disagg_fit %>%
195 |   mutate(
196 |     .draw = list(1:200), # generate list of draw numbers
197 |     t = map(df, ~rt(200, .)), # simulate draws from t distribution to transform into means
198 |     x = map(df, ~rchisq(200, .)) # simulate draws from chi-squared distribution to transform into sigmas
199 |   ) %>%
200 |   unnest(cols = c(".draw", "t", "x")) %>%
201 |   mutate(
202 |     mu = t * se.fit + fit, # scale and shift t to get a sampling distribution of means
203 |     sigma = sqrt(df * se.residual^2 / x), # scale and take inverse of x to get a sampling distribution of sigmas
204 |     expected_deaths = pred2expectation(mu, sigma)
205 |   ) %>%
206 |   group_by(.draw, female) %>% # group by predictor(s) of interest
207 |   summarize(expected_deaths = mean(expected_deaths)) %>% # marginalize across other predictors
208 |   compare_levels(expected_deaths, by = female) %>%
209 |   ungroup() %>%
210 |   dplyr::select(expected_diff = expected_deaths)
211 | 
212 | # only output relevant fields in disagg_fit
213 | disagg_fit <- disagg_fit %>%
214 |   mutate(expected_deaths = pred2expectation(fit, sigma)) %>%
215 |   dplyr::select(
216 |     observed = death,
217 |     expected = expected_deaths
218 |   )
219 | 
220 | # visualize
221 | library(ggplot2)
222 | ggsave(qplot(expected_diff, data=bootstrap, geom="histogram"), file='../results/bootstrap_{{_n}}.pdf')
223 | ggsave(qplot(expected_diff, data=uncertainty, geom="histogram"), file='../results/uncertainty_{{_n}}.pdf')
224 | 
225 | # output
226 | write_csv(expectation, '../results/estimate_{{_n}}.csv')
227 | write_csv(disagg_fit, '../results/disagg_fit_{{_n}}.csv')
228 | write_csv(uncertainty, '../results/uncertainty_{{_n}}.csv')
229 | write_csv(bootstrap, '../results/bootstrap_{{_n}}.csv')
--------------------------------------------------------------------------------
/boba/adg.py:
--------------------------------------------------------------------------------
1 | 
2 | class ADG:
3 |     """ For creating the analysis decision graph (ADG). """
4 |     def __init__(self):
5 |         self.nodes = set()
6 |         self.edges = {}
7 |         self.proc_edges = {}
8 | 
9 |         self._graph_nodes = set()
10 |         self._graph_edges = {}
11 |         self._links = []  # linked decisions
12 |         self._constraint_proc = set()  # procedural deps from constraints
13 |         self._decs = set()  # all decisions
14 | 
15 |     @staticmethod
16 |     def _convert_edges(edges):
17 |         d = {}
18 |         for e in edges:
19 |             ADG._add_edge(d, e.start, e.end)
20 |         return d
21 | 
22 |     @staticmethod
23 |     def _add_edge(res, start, end):
24 |         if start not in res:
25 |             res[start] = [end]
26 |         elif end not in res[start]:
27 |             res[start].append(end)
28 | 
29 |     @staticmethod
30 |     def _all_ending_nodes(edges):
31 |         """ nodes that have at least one incoming edge(s) """
32 |         flat = [item for lst in edges.values() for item in lst]
33 |         return set(flat)
34 | 
35 |     @staticmethod
36 |     def _get_source(nodes, edges):
37 |         """ nodes that have no incoming edges """
38 |         return nodes.difference(ADG._all_ending_nodes(edges))
39 | 
40 |     @staticmethod
41 |     def _get_target(nodes, edges):
42 |         """ nodes that have no outgoing edges """
43 |         return nodes.difference(set(edges.keys()))
44 | 
45 |     @staticmethod
46 |     def _group_by(lst, func):
47 |         res = {}
48 |         for item in lst:
49 |             k = func(item)
50 |             ADG._add_edge(res, k, item)
51 |         return res
52 | 
53 |     @staticmethod
54 |     def _bn(name):
55 |         """ Get the block decision name """
56 |         return name.split('-')[0].split(':')[0]
57 | 
58 |     def _merge_one(self, prev, cur):
59 |         groups = ADG._group_by(cur, ADG._bn)
60 |         if prev:
61 |             self.nodes.add(prev)
62 |             for k in groups.keys():
63 |                 ADG._add_edge(self.edges, prev, k)
64 |                 self.nodes.add(k)
65 |         return groups
66 | 
67 |     def _merge(self):
68 |         """ Merge alternatives """
69 |         src = ADG._get_source(self._graph_nodes, self._graph_edges)
70 |         groups = self._merge_one(None, src)
71 |         nds = list(groups.keys())
72 |         i = 0
73 |         while len(nds):
74 |             nd = nds.pop()
75 | 
76 |             # look up the alternatives, then restore the correct node id
77 |             alts = groups[nd]
78 |             nd = nd.split('-')[0]
79 | 
80 |             # find the children of all alts of this node and perform merge
81 |             cur = [self._graph_edges[n] for n in alts if n in self._graph_edges]
82 |             cur = [item for sublist in cur for item in sublist]
83 |             # print(nd, set(cur))
84 |             gp = self._merge_one(nd, set(cur))
85 | 
86 |             # if the child node is already in groups, give it a different id
87 |             for g in gp.copy():
88 |                 val = gp[g]
89 |                 key = '{}-{}'.format(g, i) if g in groups else g
90 |                 i += 1 if g in groups else 0
91 |                 del gp[g]
92 |                 gp[key] = val
93 | 
94 |             # update the loop
95 |             groups.update(gp)
96 |             nds.extend(gp.keys())
97 | 
98 |         # print(self.nodes, self.edges)
99 | 
100 |     def _prune_recur(self, node, nodes, edges):
101 |         """ Recursive helper for prune. Make sure the graph has no cycles! """
102 |         # leaf node
103 |         if node not in self.edges:
104 |             return [node] if node in self._decs else None
105 | 
106 |         clean = []
107 |         # recursively prune children
108 |         for nd in self.edges[node]:
109 |             ret = self._prune_recur(nd, nodes, edges)
110 |             if ret:
111 |                 clean.extend(ret)
112 |             elif len(self.edges[node]) > 1:  # preserve branches
113 |                 clean.append(nd)
114 | 
115 |         # skip if not decision, else add to edges
116 |         if node in self._decs:
117 |             nodes.update(clean)
118 |             nodes.add(node)
119 |             for nd in clean:
120 |                 ADG._add_edge(edges, node, nd)
121 |             return [node]
122 |         else:
123 |             return clean
124 | 
125 |     def _prune(self):
126 |         """ Remove non-decision nodes """
127 |         edges = {}
128 |         nodes = set()
129 |         src = ADG._get_source(self.nodes, self.edges)
130 |         for s in src:
131 |             self._prune_recur(s, nodes, edges)
132 | 
133 |         # replace nodes and edges
134 |         self.nodes = nodes
135 |         self.edges = edges
136 | 
137 |     def _get_linked_vars(self, blocks):
138 |         """ Get linked placeholders """
139 |         bd = set([blocks[b].parameter for b in blocks if blocks[b].parameter])
140 |         res = set()
141 |         for l in self._links:
142 |             bls = [b for b in l if b in bd]
143 |             if len(bls):
144 |                 # skip all vars if they are linked with blocks
145 |                 res.update(set(l).difference(set(bls)))
146 |             else:
147 |                 # otherwise, skip all vars except the first
148 |                 res.update(l[1:])
149 |         return res
150 | 
151 |     def set_graph(self, nodes, edges):
152 |         """ Set code graph """
153 |         self._graph_nodes = nodes
154 |         self._graph_edges = ADG._convert_edges(edges)
155 | 
156 |     def set_constraints(self, links, proc):
157 |         """ Save the intermediate data from constraint parser """
158 |         self._constraint_proc = proc
159 |         self._links = links
160 | 
161 |     def create(self, blocks):
162 |         """ Create the ADG """
163 |         # abort if ADG has already been created
164 |         if len(self.nodes):
165 |             return
166 | 
167 |         # add placeholder vars to the code graph
168 |         decs = []
169 |         for bl in blocks:
170 |             # get the variables associated with a block
171 |             vs = [chunk.variable for chunk in blocks[bl].chunks
172 |                   if chunk.variable != '']
173 |             decs.extend(vs)
174 | 
175 |             # remove linked vars
176 |             linked = self._get_linked_vars(blocks)
177 |             vs = [v for v in vs if v not in linked]
178 | 
179 |             # remove duplicates within this block, preserving order
180 |             # (a dict's keys are unique and keep insertion order)
181 |             tmp = dict.fromkeys(vs)
182 |             vs = list(tmp)
183 | 
184 |             # skip variables that have appeared in previous blocks
185 |             # fixme
186 |             gp = ADG._group_by(self._graph_nodes, ADG._bn)
187 |             vs = [v for v in vs if v not in gp
188 |                   or gp[v][0].split('-')[1].split(':')[0] == ADG._bn(bl)]
189 | 
190 |             # name the placeholders differently as distinct nodes for now
191 |             vs = ['{}-{}'.format(v, bl) for v in vs]
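            # e.g. a placeholder 'cutoff' in block 'A' becomes the node 'cutoff-A'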
192 |             self._graph_nodes.update(vs)
193 | 
194 |             # move children of block to the last var
195 |             vs = [bl] + vs
196 |             last = vs[len(vs) - 1]
197 |             if bl in self._graph_edges:
198 |                 temp = self._graph_edges[bl]
199 |                 self._graph_edges[bl] = []
200 |                 self._graph_edges[last] = temp
201 | 
202 |             # add edges between vars
203 |             for i in range(len(vs) - 1):
204 |                 ADG._add_edge(self._graph_edges, vs[i], vs[i + 1])
205 | 
206 |         # save all decisions, including placeholders and decision blocks
207 |         bd = set([blocks[b].parameter for b in blocks if blocks[b].parameter])
208 |         self._decs = set(decs).union(bd)
209 | 
210 |         # infer ADG from the graph
211 |         self._merge()
212 |         self._prune()
213 | 
214 |         # any branch should be a procedural branch
215 |         for s in self.edges:
216 |             t = self.edges[s]
217 |             if len(t) > 1:
218 |                 self.proc_edges[s] = t
219 |         # add the procedural deps from constraint
220 |         for proc in self._constraint_proc:
221 |             s = proc.split('-')[0]
222 |             e = proc.split('-')[1]
223 |             ADG._add_edge(self.proc_edges, s, e)
224 | 
225 |         # todo: remove linked blocks if they don't have procedural branches
226 | 
227 |     def get_used_decs(self):
228 |         """ Get the decisions that are used in the ADG """
229 |         return [n for n in self.nodes if n in self._decs]
230 | 
231 |     def output(self):
232 |         """ Output the graph object as JSON for the server """
233 |         nodes = []
234 |         edges = []
235 | 
236 |         # nodes
237 |         i = 0
238 |         lookup = {}
239 |         for n in self.nodes:
240 |             nodes.append({"id": i, "name": n})
241 |             lookup[n] = i
242 |             i += 1
243 | 
244 |         # first add procedural edges
245 |         done = set()
246 |         for s in self.proc_edges:
247 |             ts = self.proc_edges[s]
248 |             for t in ts:
249 |                 done.add('{}->{}'.format(s, t))
250 |                 edges.append({"source": lookup[s], "target": lookup[t],
251 |                               "type": "procedural"})
252 | 
253 |         # add order edges, skip those already added
254 |         for s in self.edges:
255 |             ts = self.edges[s]
256 |             for t in ts:
257 |                 if '{}->{}'.format(s, t) not in done:
258 |                     edges.append({"source": lookup[s], "target": lookup[t],
259 |                                   "type": "order"})
260 | 
261 |         return {"graph": {"nodes": nodes, "edges": edges}}
262 | 
--------------------------------------------------------------------------------