├── .gitignore ├── LICENSE ├── README.md ├── experiments-for-plots.sh ├── experiments.sh ├── poster-preferences-implicit-in-the-state-of-the-world.pdf ├── setup.py └── src ├── __init__.py ├── envs ├── __init__.py ├── apples.py ├── apples_spec.py ├── batteries.py ├── batteries_spec.py ├── env.py ├── room.py ├── room_spec.py ├── tests │ ├── apples_test.py │ ├── batteries_test.py │ ├── env_test.py │ ├── room_test.py │ └── train_test.py ├── train.py └── train_spec.py ├── plotting.py ├── relative_reachability.py ├── rlsp.py ├── run.py ├── sampling.py ├── utils.py └── value_iter.py /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.swp 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Center for Human-Compatible AI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reward Learning by Simulating the Past 2 | 3 | This is the code accompanying the paper "Preferences Implicit in the State of the World". [Paper](https://arxiv.org/abs/1902.04198), [blog post](https://bair.berkeley.edu/blog/2019/02/11/learning_preferences/), [poster](https://github.com/HumanCompatibleAI/rlsp/blob/master/poster-preferences-implicit-in-the-state-of-the-world.pdf). 4 | 5 | Tests can be run with `python setup.py test`. 6 | 7 | Instructions for running the experiments can be found in `experiments.sh`. The script `experiments-for-plots.sh` generates the plots from the paper. 8 | -------------------------------------------------------------------------------- /experiments-for-plots.sh: -------------------------------------------------------------------------------- 1 | # Script to generate the plots in the paper. This script creates a "results" folder and writes the experiment 2 | # outputs into it. The plots are then generated in the "results" folder using the "src/plotting.py" script. 3 | # Running this script takes several (3-6) hours. 4 | 5 | ############### 6 | # Section 5.4 # 7 | ############### 8 | 9 | # Robustness to the choice of Alice's planning horizon T. 10 | mkdir -p results/horizon 11 | 12 | # room env 13 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 1 -o results/horizon -x 20 -d true_reward,final_reward 14 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 2 -o results/horizon -x 20 -d true_reward,final_reward 15 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 3 -o results/horizon -x 20 -d true_reward,final_reward 16 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 5 -o results/horizon -x 20 -d true_reward,final_reward 17 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 10 -o results/horizon -x 20 -d true_reward,final_reward 18 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 20 -o results/horizon -x 20 -d true_reward,final_reward 19 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 30 -o results/horizon -x 20 -d true_reward,final_reward 20 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 50 -o results/horizon -x 20 -d true_reward,final_reward 21 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 100 -o results/horizon -x 20 -d true_reward,final_reward 22 | 23 | # train env 24 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 1 -o results/horizon -x 20 -d true_reward,final_reward 25 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 2 -o results/horizon -x 20 -d true_reward,final_reward 26 | python src/run.py -e train -p default -c additive -i rlsp 
-f True -s 0 -l 0.001 -u True -m 1000 -T 3 -o results/horizon -x 20 -d true_reward,final_reward 27 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 5 -o results/horizon -x 20 -d true_reward,final_reward 28 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 10 -o results/horizon -x 20 -d true_reward,final_reward 29 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 20 -o results/horizon -x 20 -d true_reward,final_reward 30 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 30 -o results/horizon -x 20 -d true_reward,final_reward 31 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 50 -o results/horizon -x 20 -d true_reward,final_reward 32 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 100 -o results/horizon -x 20 -d true_reward,final_reward 33 | 34 | # apples env 35 | python src/run.py -e apples -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 1 -o results/horizon -x 20 -d true_reward,final_reward 36 | python src/run.py -e apples -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 2 -o results/horizon -x 20 -d true_reward,final_reward 37 | python src/run.py -e apples -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 3 -o results/horizon -x 20 -d true_reward,final_reward 38 | python src/run.py -e apples -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 5 -o results/horizon -x 20 -d true_reward,final_reward 39 | python src/run.py -e apples -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 10 -o results/horizon -x 20 -d true_reward,final_reward 40 | python src/run.py -e apples -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 20 -o results/horizon -x 20 -d true_reward,final_reward 41 | python src/run.py -e apples -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 30 -o results/horizon -x 20 -d true_reward,final_reward 42 | python src/run.py -e apples -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 50 -o results/horizon -x 20 -d true_reward,final_reward 43 | python src/run.py -e apples -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 100 -o results/horizon -x 20 -d true_reward,final_reward 44 | 45 | # batteries env 46 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 1 -o results/horizon -x 20 -d true_reward,final_reward 47 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 2 -o results/horizon -x 20 -d true_reward,final_reward 48 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 3 -o results/horizon -x 20 -d true_reward,final_reward 49 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 5 -o results/horizon -x 20 -d true_reward,final_reward 50 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 10 -o results/horizon -x 20 -d true_reward,final_reward 51 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 20 -o results/horizon -x 20 -d true_reward,final_reward 52 | python src/run.py 
-e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 30 -o results/horizon -x 20 -d true_reward,final_reward 53 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 50 -o results/horizon -x 20 -d true_reward,final_reward 54 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 100 -o results/horizon -x 20 -d true_reward,final_reward 55 | 56 | 57 | ############## 58 | # Appendix D # 59 | ############## 60 | 61 | # Option -c additive stands for the Additive method, and -c bayesian for the Bayesian method 62 | # The -k parameter controls the standard deviation (set to 0.5 by default) 63 | mkdir -p results/additive-vs-bayesian 64 | 65 | # room env additive 66 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 0.05 -o results/additive-vs-bayesian -d true_reward,final_reward 67 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 0.1 -o results/additive-vs-bayesian -d true_reward,final_reward 68 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 0.2 -o results/additive-vs-bayesian -d true_reward,final_reward 69 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 0.3 -o results/additive-vs-bayesian -d true_reward,final_reward 70 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 0.5 -o results/additive-vs-bayesian -d true_reward,final_reward 71 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 1 -o results/additive-vs-bayesian -d true_reward,final_reward 72 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 2 -o results/additive-vs-bayesian -d true_reward,final_reward 73 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 3 -o results/additive-vs-bayesian -d true_reward,final_reward 74 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 5 -o results/additive-vs-bayesian -d true_reward,final_reward 75 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 10 -o results/additive-vs-bayesian -d true_reward,final_reward 76 | 77 | # train env additive 78 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 0.05 -o results/additive-vs-bayesian -d true_reward,final_reward 79 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 0.1 -o results/additive-vs-bayesian -d true_reward,final_reward 80 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 0.2 -o results/additive-vs-bayesian -d true_reward,final_reward 81 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 0.3 -o results/additive-vs-bayesian -d true_reward,final_reward 82 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 0.5 -o results/additive-vs-bayesian -d true_reward,final_reward 83 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 1 -o results/additive-vs-bayesian -d true_reward,final_reward 84 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 
8 -k 2 -o results/additive-vs-bayesian -d true_reward,final_reward 85 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 3 -o results/additive-vs-bayesian -d true_reward,final_reward 86 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 5 -o results/additive-vs-bayesian -d true_reward,final_reward 87 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 10 -o results/additive-vs-bayesian -d true_reward,final_reward 88 | 89 | # batteries env additive 90 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 0.05 -o results/additive-vs-bayesian -d true_reward,final_reward 91 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 0.1 -o results/additive-vs-bayesian -d true_reward,final_reward 92 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 0.2 -o results/additive-vs-bayesian -d true_reward,final_reward 93 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 0.3 -o results/additive-vs-bayesian -d true_reward,final_reward 94 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 0.5 -o results/additive-vs-bayesian -d true_reward,final_reward 95 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 1 -o results/additive-vs-bayesian -d true_reward,final_reward 96 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 2 -o results/additive-vs-bayesian -d true_reward,final_reward 97 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 3 -o results/additive-vs-bayesian -d true_reward,final_reward 98 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 5 -o results/additive-vs-bayesian -d true_reward,final_reward 99 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 10 -o results/additive-vs-bayesian -d true_reward,final_reward 100 | 101 | # room env bayesian 102 | python src/run.py -e room -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 0.05 -o results/additive-vs-bayesian -d true_reward,final_reward 103 | python src/run.py -e room -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 0.1 -o results/additive-vs-bayesian -d true_reward,final_reward 104 | python src/run.py -e room -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 0.2 -o results/additive-vs-bayesian -d true_reward,final_reward 105 | python src/run.py -e room -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 0.3 -o results/additive-vs-bayesian -d true_reward,final_reward 106 | python src/run.py -e room -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 0.5 -o results/additive-vs-bayesian -d true_reward,final_reward 107 | python src/run.py -e room -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 1 -o results/additive-vs-bayesian -d true_reward,final_reward 108 | python src/run.py -e room -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 2 -o results/additive-vs-bayesian -d true_reward,final_reward 109 | python src/run.py -e room -p default -c bayesian -i rlsp -f True -s 0 -l 
0.001 -m 1000 -T 10 -k 3 -o results/additive-vs-bayesian -d true_reward,final_reward 110 | python src/run.py -e room -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 5 -o results/additive-vs-bayesian -d true_reward,final_reward 111 | python src/run.py -e room -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 10 -o results/additive-vs-bayesian -d true_reward,final_reward 112 | 113 | # train env bayesian 114 | python src/run.py -e train -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 0.05 -o results/additive-vs-bayesian -d true_reward,final_reward 115 | python src/run.py -e train -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 0.1 -o results/additive-vs-bayesian -d true_reward,final_reward 116 | python src/run.py -e train -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 0.2 -o results/additive-vs-bayesian -d true_reward,final_reward 117 | python src/run.py -e train -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 0.3 -o results/additive-vs-bayesian -d true_reward,final_reward 118 | python src/run.py -e train -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 0.5 -o results/additive-vs-bayesian -d true_reward,final_reward 119 | python src/run.py -e train -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 1 -o results/additive-vs-bayesian -d true_reward,final_reward 120 | python src/run.py -e train -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 2 -o results/additive-vs-bayesian -d true_reward,final_reward 121 | python src/run.py -e train -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 3 -o results/additive-vs-bayesian -d true_reward,final_reward 122 | python src/run.py -e train -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 5 -o results/additive-vs-bayesian -d true_reward,final_reward 123 | python src/run.py -e train -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 10 -o results/additive-vs-bayesian -d true_reward,final_reward 124 | 125 | # batteries env bayesian 126 | python src/run.py -e batteries -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 0.05 -o results/additive-vs-bayesian -d true_reward,final_reward 127 | python src/run.py -e batteries -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 0.1 -o results/additive-vs-bayesian -d true_reward,final_reward 128 | python src/run.py -e batteries -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 0.2 -o results/additive-vs-bayesian -d true_reward,final_reward 129 | python src/run.py -e batteries -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 0.3 -o results/additive-vs-bayesian -d true_reward,final_reward 130 | python src/run.py -e batteries -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 0.5 -o results/additive-vs-bayesian -d true_reward,final_reward 131 | python src/run.py -e batteries -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 1 -o results/additive-vs-bayesian -d true_reward,final_reward 132 | python src/run.py -e batteries -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 2 -o results/additive-vs-bayesian -d true_reward,final_reward 133 | python src/run.py -e batteries -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 3 -o results/additive-vs-bayesian -d true_reward,final_reward 134 | python src/run.py -e batteries -p default -c 
bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 5 -o results/additive-vs-bayesian -d true_reward,final_reward 135 | python src/run.py -e batteries -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 10 -o results/additive-vs-bayesian -d true_reward,final_reward 136 | 137 | 138 | ###################### 139 | # Generate the plots # 140 | ###################### 141 | python src/plotting.py 142 | -------------------------------------------------------------------------------- /experiments.sh: -------------------------------------------------------------------------------- 1 | # Commands for the experiments in the paper. These will write to stdout, and are meant to be run individually. 2 | # Most experiments should run in seconds, though some can take minutes (especially with the sampling algorithm). 3 | 4 | ############### 5 | # Section 5.2 # 6 | ############### 7 | 8 | # Comparison to baselines (Table 1 and Figure 2) 9 | 10 | # Room: Specified reward, deviation, reachability, RLSP 11 | python src/run.py -e room -p default -c additive -i spec -d true_reward,final_reward -T 7 -x 20 12 | python src/run.py -e room -p default -c additive -i deviation -d true_reward,final_reward -T 7 -x 20 -w 0.5 13 | python src/run.py -e room -p default -c additive -i reachability -d true_reward,final_reward -T 7 -x 20 14 | python src/run.py -e room -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 7 -x 20 15 | 16 | # Train: 17 | python src/run.py -e train -p default -c additive -i spec -d true_reward,final_reward -T 8 -x 20 18 | python src/run.py -e train -p default -c additive -i deviation -d true_reward,final_reward -T 8 -x 20 -w 0.5 19 | python src/run.py -e train -p default -c additive -i reachability -d true_reward,final_reward -T 8 -x 20 20 | python src/run.py -e train -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 8 -x 20 21 | 22 | # Apples: 23 | python src/run.py -e apples -p default -c additive -i spec -d true_reward,final_reward -T 11 -x 20 24 | python src/run.py -e apples -p default -c additive -i deviation -d true_reward,final_reward -T 11 -x 20 -w 0.5 25 | python src/run.py -e apples -p default -c additive -i reachability -d true_reward,final_reward -T 11 -x 20 26 | python src/run.py -e apples -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 11 -x 20 27 | 28 | # Batteries, easy: 29 | python src/run.py -e batteries -p easy -c additive -i spec -d true_reward,final_reward -T 11 -x 20 30 | python src/run.py -e batteries -p easy -c additive -i deviation -d true_reward,final_reward -T 11 -x 20 -w 0.5 31 | python src/run.py -e batteries -p easy -c additive -i reachability -d true_reward,final_reward -T 11 -x 20 32 | python src/run.py -e batteries -p easy -c additive -i rlsp -d true_reward,final_reward -s 0 -T 11 -x 20 33 | 34 | # Batteries, hard: 35 | python src/run.py -e batteries -p default -c additive -i spec -d true_reward,final_reward -T 11 -x 20 36 | python src/run.py -e batteries -p default -c additive -i deviation -d true_reward,final_reward -T 11 -x 20 -w 0.5 37 | python src/run.py -e batteries -p default -c additive -i reachability -d true_reward,final_reward -T 11 -x 20 38 | python src/run.py -e batteries -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 11 -x 20 39 | 40 | # Far away vase: 41 | python src/run.py -e room -p bad -c additive -i spec -d true_reward,final_reward -T 5 -x 20 42 | python src/run.py -e room -p bad -c additive -i deviation -d true_reward,final_reward -T 5 -x 20 -w 0.5 43 | 
python src/run.py -e room -p bad -c additive -i reachability -d true_reward,final_reward -T 5 -x 20 44 | python src/run.py -e room -p bad -c additive -i rlsp -d true_reward,final_reward -s 0 -T 5 -x 20 45 | 46 | ############### 47 | # Section 5.3 # 48 | ############### 49 | 50 | # Comparison between knowing s_{-T} and using a uniform distribution over s_{-T} 51 | # The commands for the case where s_{-T} is known are the same as in Section 5.2; for the uniform distribution over s_{-T} we simply add -u True 52 | 53 | python src/run.py -e room -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 7 -x 20 54 | python src/run.py -e room -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 7 -x 20 -u True 55 | python src/run.py -e train -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 8 -x 20 56 | python src/run.py -e train -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 8 -x 20 -u True 57 | python src/run.py -e apples -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 11 -x 20 58 | python src/run.py -e apples -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 11 -x 20 -u True 59 | python src/run.py -e batteries -p easy -c additive -i rlsp -d true_reward,final_reward -s 0 -T 11 -x 20 60 | python src/run.py -e batteries -p easy -c additive -i rlsp -d true_reward,final_reward -s 0 -T 11 -x 20 -u True 61 | python src/run.py -e batteries -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 11 -x 20 62 | python src/run.py -e batteries -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 11 -x 20 -u True 63 | python src/run.py -e room -p bad -c additive -i rlsp -d true_reward,final_reward -s 0 -T 5 -x 20 64 | python src/run.py -e room -p bad -c additive -i rlsp -d true_reward,final_reward -s 0 -T 5 -x 20 -u True 65 | 66 | ############### 67 | # Section 5.4 # 68 | ############### 69 | 70 | # Robustness to the choice of Alice's planning horizon T. 71 | # Simply take the RLSP commands from before and try different values of T, for example: 72 | python src/run.py -e room -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 20 -x 20 73 | python src/run.py -e apples -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 20 -x 20 74 | 75 | # It is also possible to run with multiple values of T and collect the results in an output file; see src/run.py for details. For example:
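# The loop below is an illustrative sketch only (not part of the original script): it reuses the Section 5.4 horizon commands from experiments-for-plots.sh to sweep over T and collect the outputs under results/horizon. The exact flags and the output behaviour of src/run.py with -o are assumed to match that script.
mkdir -p results/horizon; for T in 1 2 3 5 10 20 30 50 100; do python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T $T -o results/horizon -x 20 -d true_reward,final_reward; done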
76 | 77 | ############## 78 | # Appendix C # 79 | ############## 80 | 81 | # MCMC sampling 82 | # Simply replace -i rlsp with -i sampling: 83 | python src/run.py -e room -p default -c additive -i sampling -d true_reward,final_reward -s 0,1,2,3,4 -T 7 -x 20 84 | python src/run.py -e train -p default -c additive -i sampling -d true_reward,final_reward -s 0,1,2,3,4 -T 8 -x 20 85 | python src/run.py -e apples -p default -c additive -i sampling -d true_reward,final_reward -s 0,1,2,3,4 -T 11 -x 20 86 | python src/run.py -e batteries -p easy -c additive -i sampling -d true_reward,final_reward -s 0,1,2,3,4 -T 11 -x 20 87 | python src/run.py -e batteries -p default -c additive -i sampling -d true_reward,final_reward -s 0,1,2,3,4 -T 11 -x 20 88 | python src/run.py -e room -p bad -c additive -i sampling -d true_reward,final_reward -s 0,1,2,3,4 -T 5 -x 20 89 | 90 | ############## 91 | # Appendix D # 92 | ############## 93 | 94 | # Use -c additive for the Additive method, and -c bayesian for the Bayesian method 95 | # Use the -k parameter to control the standard deviation (set to 0.5 by default) 96 | # Note that since the Apples environment has no specified reward, the -c option has no effect on it. 97 | python src/run.py -e room -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 7 -x 20 -k 1 98 | python src/run.py -e room -p default -c bayesian -i rlsp -d true_reward,final_reward -s 0 -T 7 -x 20 -k 1 99 | -------------------------------------------------------------------------------- /poster-preferences-implicit-in-the-state-of-the-world.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCompatibleAI/rlsp/cacae643752a02b2be092870df2ce3de8d674144/poster-preferences-implicit-in-the-state-of-the-world.pdf -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name='rlsp', 5 | version=1.0, 6 | description='Reward Learning by Simulating the Past', 7 | author='Rohin Shah, Dmitrii Krasheninnikov, Jordan Alexander, et al', 8 | author_email='rohinmshah@berkeley.edu', 9 | python_requires='>=3.6.0', 10 | url='https://github.com/HumanCompatibleAI/rlsp', 11 | packages=find_packages('src'), 12 | package_dir={'': 'src'}, 13 | install_requires=[ 14 | 'numpy>=1.13', 15 | 'scipy>=0.19', 16 | ], 17 | test_suite='nose.collector', 18 | tests_require=['nose', 'nose-cover3'], 19 | include_package_data=True, 20 | license='MIT', 21 | classifiers=[ 22 | # Trove classifiers 23 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 24 | 'License :: OSI Approved :: MIT License', 25 | 'Programming Language :: Python', 26 | 'Programming Language :: Python :: 3', 27 | 'Programming Language :: Python :: 3.6', 28 | ], 29 | ) 30 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCompatibleAI/rlsp/cacae643752a02b2be092870df2ce3de8d674144/src/__init__.py -------------------------------------------------------------------------------- /src/envs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCompatibleAI/rlsp/cacae643752a02b2be092870df2ce3de8d674144/src/envs/__init__.py 
-------------------------------------------------------------------------------- /src/envs/apples.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from copy import copy, deepcopy 3 | from itertools import product 4 | 5 | from envs.env import Env, Direction 6 | 7 | 8 | class ApplesState(object): 9 | ''' 10 | state of the environment; describes positions of all objects in the env. 11 | ''' 12 | def __init__(self, agent_pos, tree_states, bucket_states, carrying_apple): 13 | """ 14 | agent_pos: (orientation, x, y) tuple for the agent's location 15 | tree_states: Dictionary mapping (x, y) tuples to booleans, True if the tree currently has an apple. 16 | bucket_states: Dictionary mapping (x, y) tuples to integers, the number of apples in the bucket. 17 | carrying_apple: Boolean, True if carrying an apple, False otherwise. 18 | """ 19 | self.agent_pos = agent_pos 20 | self.tree_states = tree_states 21 | self.bucket_states = bucket_states 22 | self.carrying_apple = carrying_apple 23 | 24 | def __eq__(self, other): 25 | return isinstance(other, ApplesState) and \ 26 | self.agent_pos == other.agent_pos and \ 27 | self.tree_states == other.tree_states and \ 28 | self.bucket_states == other.bucket_states and \ 29 | self.carrying_apple == other.carrying_apple 30 | 31 | def __hash__(self): 32 | def get_vals(dictionary): 33 | return tuple([dictionary[loc] for loc in sorted(dictionary.keys())]) 34 | return hash(self.agent_pos + get_vals(self.tree_states) + get_vals(self.bucket_states) + (self.carrying_apple,)) 35 | 36 | 37 | class ApplesEnv(Env): 38 | def __init__(self, spec, compute_transitions=True): 39 | """ 40 | height: Integer, height of the grid. Y coordinates are in [0, height). 41 | width: Integer, width of the grid. X coordinates are in [0, width). 42 | init_state: ApplesState, initial state of the environment 43 | apple_regen_probability: Float, probability that a picked tree regrows its apple on a given timestep 44 | bucket_capacity: Integer, maximum number of apples a bucket can hold 45 | tree_locations: List of (x, y) tuples, locations of trees 46 | bucket_locations: List of (x, y) tuples, locations of buckets 47 | s: ApplesState, Current state 48 | nA: Integer, number of actions 49 | """ 50 | self.height = spec.height 51 | self.width = spec.width 52 | self.apple_regen_probability = spec.apple_regen_probability 53 | self.bucket_capacity = spec.bucket_capacity 54 | self.init_state = deepcopy(spec.init_state) 55 | self.include_location_features = spec.include_location_features 56 | 57 | self.tree_locations = list(self.init_state.tree_states.keys()) 58 | self.bucket_locations = list(self.init_state.bucket_states.keys()) 59 | used_locations = set(self.tree_locations + self.bucket_locations) 60 | self.possible_agent_locations = list(filter( 61 | lambda pos: pos not in used_locations, 62 | product(range(self.width), range(self.height)))) 63 | 64 | self.num_trees = len(self.tree_locations) 65 | self.num_buckets = len(self.bucket_locations) 66 | 67 | self.default_action = Direction.get_number_from_direction(Direction.STAY) 68 | self.nA = 6 69 | self.num_features = len(self.s_to_f(self.init_state)) 70 | 71 | self.reset() 72 | 73 | if compute_transitions: 74 | states = self.enumerate_states() 75 | self.make_transition_matrices( 76 | states, range(self.nA), self.nS, self.nA) 77 | self.make_f_matrix(self.nS, self.num_features) 78 | 79 | 80 | def enumerate_states(self): 81 | all_agent_positions = filter( 82 | lambda pos: (pos[1], pos[2]) in self.possible_agent_locations, 83 | product(range(4), range(self.width), range(self.height))) 84 | all_tree_states = map( 85 | lambda tree_vals: 
dict(zip(self.tree_locations, tree_vals)), 86 | product([True, False], repeat=self.num_trees)) 87 | all_bucket_states = map( 88 | lambda bucket_vals: dict(zip(self.bucket_locations, bucket_vals)), 89 | product(range(self.bucket_capacity + 1), repeat=self.num_buckets)) 90 | all_states = map( 91 | lambda x: ApplesState(*x), 92 | product(all_agent_positions, all_tree_states, all_bucket_states, [True, False])) 93 | 94 | state_num = {} 95 | for state in all_states: 96 | if state not in state_num: 97 | state_num[state] = len(state_num) 98 | 99 | self.state_num = state_num 100 | self.num_state = {v: k for k, v in self.state_num.items()} 101 | self.nS = len(state_num) 102 | 103 | return state_num.keys() 104 | 105 | def get_num_from_state(self, state): 106 | return self.state_num[state] 107 | 108 | def get_state_from_num(self, num): 109 | return self.num_state[num] 110 | 111 | 112 | def s_to_f(self, s): 113 | ''' 114 | Returns features of the state: 115 | - Number of apples in buckets 116 | - Number of apples on trees 117 | - Whether the agent is carrying an apple 118 | - For each other location, whether the agent is on that location 119 | ''' 120 | num_bucket_apples = sum(s.bucket_states.values()) 121 | num_tree_apples = sum(map(int, s.tree_states.values())) 122 | carrying_apple = int(s.carrying_apple) 123 | agent_pos = s.agent_pos[1], s.agent_pos[2] # Drop orientation 124 | features = [num_bucket_apples, num_tree_apples, carrying_apple] 125 | if self.include_location_features: 126 | features = features + [int(agent_pos == pos) for pos in self.possible_agent_locations] 127 | return np.array(features) 128 | 129 | 130 | def get_next_states(self, state, action): 131 | '''returns the next state given a state and an action''' 132 | action = int(action) 133 | orientation, x, y = state.agent_pos 134 | new_orientation, new_x, new_y = state.agent_pos 135 | new_tree_states = deepcopy(state.tree_states) 136 | new_bucket_states = deepcopy(state.bucket_states) 137 | new_carrying_apple = state.carrying_apple 138 | 139 | if action == Direction.get_number_from_direction(Direction.STAY): 140 | pass 141 | elif action < len(Direction.ALL_DIRECTIONS): 142 | new_orientation = action 143 | move_x, move_y = Direction.move_in_direction_number((x, y), action) 144 | # New position is legal 145 | if (0 <= move_x < self.width and \ 146 | 0 <= move_y < self.height and \ 147 | (move_x, move_y) in self.possible_agent_locations): 148 | new_x, new_y = move_x, move_y 149 | else: 150 | # Move only changes orientation, which we already handled 151 | pass 152 | elif action == 5: 153 | obj_pos = Direction.move_in_direction_number((x, y), orientation) 154 | if state.carrying_apple: 155 | # We always drop the apple 156 | new_carrying_apple = False 157 | # If we're facing a bucket, it goes there 158 | if obj_pos in new_bucket_states: 159 | prev_apples = new_bucket_states[obj_pos] 160 | new_bucket_states[obj_pos] = min(prev_apples + 1, self.bucket_capacity) 161 | elif obj_pos in new_tree_states and new_tree_states[obj_pos]: 162 | new_carrying_apple = True 163 | new_tree_states[obj_pos] = False 164 | else: 165 | # Interact while holding nothing and not facing a tree. 
166 | pass 167 | else: 168 | raise ValueError('Invalid action {}'.format(action)) 169 | 170 | new_pos = new_orientation, new_x, new_y 171 | 172 | def make_state(prob_apples_tuple): 173 | prob, tree_apples = prob_apples_tuple 174 | trees = dict(zip(self.tree_locations, tree_apples)) 175 | s = ApplesState(new_pos, trees, new_bucket_states, new_carrying_apple) 176 | return (prob, s, 0) 177 | 178 | # For apple regeneration, don't regenerate apples that were just picked, 179 | # so use the apple booleans from the original state 180 | old_tree_apples = [state.tree_states[loc] for loc in self.tree_locations] 181 | new_tree_apples = [new_tree_states[loc] for loc in self.tree_locations] 182 | return list(map(make_state, self.regen_apples(old_tree_apples, new_tree_apples))) 183 | 184 | def regen_apples(self, old_tree_apples, new_tree_apples): 185 | if len(old_tree_apples) == 0: 186 | yield (1, []) 187 | return 188 | for prob, apples in self.regen_apples(old_tree_apples[1:], new_tree_apples[1:]): 189 | if old_tree_apples[0]: 190 | yield prob, [new_tree_apples[0]] + apples 191 | else: 192 | yield prob * self.apple_regen_probability, [True] + apples 193 | yield prob * (1 - self.apple_regen_probability), [False] + apples 194 | 195 | 196 | def print_state(self, state): 197 | '''Renders the state.''' 198 | h, w = self.height, self.width 199 | canvas = np.zeros(tuple([2*h-1, 2*w+1]), dtype='int8') 200 | 201 | # cell borders 202 | for y in range(1, canvas.shape[0], 2): 203 | canvas[y, :] = 1 204 | for x in range(0, canvas.shape[1], 2): 205 | canvas[:, x] = 2 206 | 207 | # trees 208 | for (x, y), has_apple in state.tree_states.items(): 209 | canvas[2*y, 2*x+1] = 3 if has_apple else 4 210 | 211 | for x, y in self.bucket_locations: 212 | canvas[2*y, 2*x+1] = 5 213 | 214 | # agent 215 | orientation, x, y = state.agent_pos 216 | canvas[2*y, 2*x+1] = 6 217 | 218 | black_color = '\x1b[0m' 219 | purple_background_color = '\x1b[0;35;85m' 220 | 221 | for line in canvas: 222 | for char_num in line: 223 | if char_num==0: 224 | print('\u2003', end='') 225 | elif char_num==1: 226 | print('─', end='') 227 | elif char_num==2: 228 | print('│', end='') 229 | elif char_num==3: 230 | print('\x1b[0;32;85m█'+black_color , end='') 231 | elif char_num==4: 232 | print('\033[91m█'+black_color, end='') 233 | elif char_num==5: 234 | print('\033[93m█'+black_color, end='') 235 | elif char_num==6: 236 | orientation_char = self.get_orientation_char(orientation) 237 | agent_color = '\x1b[1;42;42m' if state.carrying_apple else '\x1b[0m' 238 | print(agent_color+orientation_char+black_color, end='') 239 | print('') 240 | 241 | def get_orientation_char(self, orientation): 242 | DIRECTION_TO_CHAR = { 243 | Direction.NORTH: '↑', 244 | Direction.SOUTH: '↓', 245 | Direction.WEST: '←', 246 | Direction.EAST: '→', 247 | Direction.STAY: '*' 248 | } 249 | direction = Direction.get_direction_from_number(orientation) 250 | return DIRECTION_TO_CHAR[direction] 251 | -------------------------------------------------------------------------------- /src/envs/apples_spec.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from envs.apples import ApplesState 3 | from envs.env import Direction 4 | 5 | class ApplesSpec(object): 6 | def __init__(self, height, width, init_state, apple_regen_probability, 7 | bucket_capacity, include_location_features): 8 | """See ApplesEnv.__init__ in apples.py for details.""" 9 | self.height = height 10 | self.width = width 11 | self.init_state = init_state 12 | 
self.apple_regen_probability = apple_regen_probability 13 | self.bucket_capacity = bucket_capacity 14 | self.include_location_features = include_location_features 15 | 16 | 17 | # In the diagrams below, T is a tree, B is a bucket, A is the 18 | # agent. Each tuple is of the form (spec, current state, task R, true R). 19 | 20 | APPLES_PROBLEMS = { 21 | # ----- 22 | # |T T| 23 | # | | 24 | # | B | 25 | # | | 26 | # |A T| 27 | # ----- 28 | # After 11 actions (riuiruuildi), it looks like this: 29 | # ----- 30 | # |T T| 31 | # | A | 32 | # | B | 33 | # | | 34 | # | T| 35 | # ----- 36 | # Where the agent has picked the right trees once and put the fruit in the 37 | # basket. 38 | 'default': ( 39 | ApplesSpec(5, 3, 40 | ApplesState(agent_pos=(0, 0, 2), 41 | tree_states={(0, 0): True, (2, 0): True, (2, 4): True}, 42 | bucket_states={(1, 2): 0}, 43 | carrying_apple=False), 44 | apple_regen_probability = 0.1, 45 | bucket_capacity=10, 46 | include_location_features=True), 47 | ApplesState(agent_pos=(Direction.get_number_from_direction(Direction.SOUTH), 48 | 1, 1), 49 | tree_states={(0, 0): True, (2, 0): False, (2, 4): True}, 50 | bucket_states={(1, 2): 2}, 51 | carrying_apple=False), 52 | np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 53 | np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) 54 | ) 55 | } 56 | -------------------------------------------------------------------------------- /src/envs/batteries.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from copy import copy, deepcopy 3 | from itertools import product 4 | 5 | from envs.env import DeterministicEnv, Direction 6 | 7 | 8 | class BatteriesState(object): 9 | ''' 10 | state of the environment; describes positions of all objects in the env. 11 | ''' 12 | def __init__(self, agent_pos, train_pos, train_life, battery_present, carrying_battery): 13 | """ 14 | agent_pos: (x, y) tuple for the agent's location 15 | train_pos: (x, y) tuple for the train's location; train_life: Integer, timesteps left before the train stops unless it is recharged with a battery 16 | battery_present: Dictionary mapping (x, y) tuples to booleans, True if a battery is at that location; carrying_battery: Boolean, True if the agent is carrying a battery 17 | """ 18 | self.agent_pos = agent_pos 19 | self.train_pos = train_pos 20 | self.train_life = train_life 21 | self.battery_present = battery_present 22 | self.carrying_battery = carrying_battery 23 | 24 | def is_valid(self): 25 | pos = self.agent_pos 26 | # Can't be standing on a battery and not carrying a battery 27 | if pos in self.battery_present and self.battery_present[pos] and not self.carrying_battery: 28 | return False 29 | return True 30 | 31 | def __eq__(self, other): 32 | return isinstance(other, BatteriesState) and \ 33 | self.agent_pos == other.agent_pos and \ 34 | self.train_pos == other.train_pos and \ 35 | self.train_life == other.train_life and \ 36 | self.battery_present == other.battery_present and \ 37 | self.carrying_battery == other.carrying_battery 38 | 39 | def __hash__(self): 40 | def get_vals(dictionary): 41 | return tuple([dictionary[loc] for loc in sorted(dictionary.keys())]) 42 | return hash(self.agent_pos + self.train_pos + (self.train_life,) + get_vals(self.battery_present) + (self.carrying_battery,)) 43 | 44 | 45 | class BatteriesEnv(DeterministicEnv): 46 | def __init__(self, spec, compute_transitions=True): 47 | """ 48 | height: Integer, height of the grid. Y coordinates are in [0, height). 49 | width: Integer, width of the grid. X coordinates are in [0, width). 
50 | init_state: BatteriesState, initial state of the environment 51 | battery_locations: List of (x, y) tuples, locations of batteries 52 | num_batteries: Integer, number of batteries 53 | train_transition: Dictionary mapping (x, y) tuples to (x, y) tuples, giving the train's cyclic route 54 | feature_locations: List of (x, y) tuples, locations of features 55 | s: BatteriesState, Current state 56 | nA: Integer, number of actions 57 | """ 58 | self.height = spec.height 59 | self.width = spec.width 60 | self.init_state = deepcopy(spec.init_state) 61 | self.battery_locations = sorted(list(self.init_state.battery_present.keys())) 62 | self.num_batteries = len(self.battery_locations) 63 | self.feature_locations = list(spec.feature_locations) 64 | self.train_transition = spec.train_transition 65 | self.train_locations = list(self.train_transition.keys()) 66 | assert set(self.train_locations) == set(self.train_transition.values()) 67 | 68 | self.default_action = Direction.get_number_from_direction(Direction.STAY) 69 | self.nA = 5 70 | self.num_features = len(self.s_to_f(self.init_state)) 71 | 72 | self.reset() 73 | 74 | if compute_transitions: 75 | states = self.enumerate_states() 76 | self.make_transition_matrices( 77 | states, range(self.nA), self.nS, self.nA) 78 | self.make_f_matrix(self.nS, self.num_features) 79 | 80 | 81 | def enumerate_states(self): 82 | state_num = {} 83 | all_agent_positions = product(range(self.width), range(self.height)) 84 | all_battery_states = map( 85 | lambda battery_vals: dict(zip(self.battery_locations, battery_vals)), 86 | product([True, False], repeat=self.num_batteries)) 87 | all_states = map( 88 | lambda x: BatteriesState(*x), 89 | product(all_agent_positions, self.train_locations, range(10), all_battery_states, [True, False])) 90 | all_states = filter(lambda state: state.is_valid(), all_states) 91 | 92 | state_num = {} 93 | for state in all_states: 94 | if state not in state_num: 95 | state_num[state] = len(state_num) 96 | 97 | self.state_num = state_num 98 | self.num_state = {v: k for k, v in self.state_num.items()} 99 | self.nS = len(state_num) 100 | 101 | return state_num.keys() 102 | 103 | def get_num_from_state(self, state): 104 | return self.state_num[state] 105 | 106 | def get_state_from_num(self, num): 107 | return self.num_state[num] 108 | 109 | 110 | def s_to_f(self, s): 111 | ''' 112 | Returns features of the state: 113 | - Number of batteries still present in the environment 114 | - Whether the train has run out of power (train_life == 0) 115 | - For each train location, whether the train is at that location 116 | - For each feature location, whether the agent is on that location 117 | ''' 118 | num_batteries = list(s.battery_present.values()).count(True) 119 | train_dead_feature = int(s.train_life == 0) 120 | train_pos_features = [int(s.train_pos == pos) for pos in self.train_locations] 121 | loc_features = [int(s.agent_pos == fpos) for fpos in self.feature_locations] 122 | features = train_pos_features + loc_features 123 | features = [num_batteries, train_dead_feature] + features 124 | return np.array(features) 125 | 126 | 127 | def get_next_state(self, state, action): 128 | '''returns the next state given a state and an action''' 129 | action = int(action) 130 | new_x, new_y = Direction.move_in_direction_number(state.agent_pos, action) 131 | # New position is still in bounds: 132 | if not (0 <= new_x < self.width and 0 <= new_y < self.height): 133 | new_x, new_y = state.agent_pos 134 | new_agent_pos = new_x, new_y 135 | 136 | new_train_pos, new_train_life = state.train_pos, state.train_life 137 | new_battery_present = deepcopy(state.battery_present) 138 
| new_carrying_battery = state.carrying_battery 139 | if new_agent_pos == state.train_pos and state.carrying_battery: 140 | new_train_life = 10 141 | new_carrying_battery = False 142 | 143 | if new_train_life > 0: 144 | new_train_pos = self.train_transition[state.train_pos] 145 | new_train_life -= 1 146 | 147 | if new_agent_pos in state.battery_present and state.battery_present[new_agent_pos] and not state.carrying_battery: 148 | new_carrying_battery = True 149 | new_battery_present[new_agent_pos] = False 150 | 151 | result = BatteriesState(new_agent_pos, new_train_pos, new_train_life, new_battery_present, new_carrying_battery) 152 | return result 153 | 154 | 155 | def print_state(self, state): 156 | '''Renders the state.''' 157 | h, w = self.height, self.width 158 | grid = [[' '] * w for _ in range(h)] 159 | x, y = state.agent_pos 160 | grid[y][x] = 'A' 161 | x, y = state.train_pos 162 | grid[y][x] = 'T' 163 | for (x, y), val in state.battery_present.items(): 164 | if val: 165 | grid[y][x] = 'B' 166 | print('\n'.join(['|'.join(row) for row in grid])) 167 | 168 | print('carrying_battery: ', state.carrying_battery) 169 | -------------------------------------------------------------------------------- /src/envs/batteries_spec.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from envs.batteries import BatteriesState 3 | 4 | class BatteriesSpec(object): 5 | def __init__(self, height, width, init_state, feature_locations, train_transition): 6 | """See BatteriesEnv.__init__ in batteries.py for details.""" 7 | self.height = height 8 | self.width = width 9 | self.init_state = init_state 10 | self.feature_locations = feature_locations 11 | self.train_transition = train_transition 12 | 13 | 14 | def get_problem(version): 15 | # In the diagram below, G is a goal location, B is a battery, A is the 16 | # agent, and T is the train. 17 | # Each tuple is of the form (spec, current state, task R, true R). 
18 | # ------- 19 | # |B G | 20 | # | TT | 21 | # | TTG| 22 | # | | 23 | # |A B| 24 | # ------- 25 | spec = BatteriesSpec( 26 | 5, 5, 27 | BatteriesState((0, 4), (2, 1), 8, 28 | {(0, 0): True, (4, 4): True}, 29 | False), 30 | [(2, 0), (4, 2)], 31 | { 32 | (2, 1): (3, 1), 33 | (3, 1): (3, 2), 34 | (3, 2): (2, 2), 35 | (2, 2): (2, 1) 36 | }) 37 | final_state = BatteriesState((2, 0), (3, 2), 8, 38 | {(0, 0): False, (4, 4): True}, 39 | False) 40 | train_weight = -1 if version == 'easy' else 0 41 | task_reward = np.array([0, train_weight, 0, 0, 0, 0, 0, 1]) 42 | true_reward = np.array([0, -1, 0, 0, 0, 0, 0, 1]) 43 | return (spec, final_state, task_reward, true_reward) 44 | 45 | 46 | BATTERIES_PROBLEMS = { 47 | 'default': get_problem('default'), 48 | 'easy': get_problem('easy') 49 | } 50 | -------------------------------------------------------------------------------- /src/envs/env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import defaultdict 3 | from copy import deepcopy 4 | from scipy.sparse import lil_matrix 5 | 6 | 7 | class Env(object): 8 | def __init__(self): 9 | raise ValueError('Cannot instantiate abstract class Env') 10 | 11 | def is_deterministic(self): 12 | return False 13 | 14 | def get_initial_state_distribution(self, known_initial_state=True): 15 | if known_initial_state: 16 | p_0 = np.zeros(self.nS) 17 | p_0[self.get_num_from_state(self.init_state)] = 1 18 | else: 19 | p_0 = np.ones(self.nS) / self.nS 20 | return p_0 21 | 22 | def make_transition_matrices(self, states_iter, actions_iter, nS, nA): 23 | """ 24 | states_iter: ITERATOR of states (i.e. can only be used once) 25 | actions_iter: ITERATOR of actions (i.e. can only be used once) 26 | """ 27 | P = {} 28 | T_matrix = lil_matrix((nS * nA, nS)) 29 | baseline_matrix = lil_matrix((nS, nS)) 30 | actions = list(actions_iter) 31 | for state in states_iter: 32 | state_id = self.get_num_from_state(state) 33 | P[state_id] = {} 34 | for action in actions: 35 | next_s = self.get_next_states(state, action) 36 | next_s = [(p, self.get_num_from_state(s), r) for p, s, r in next_s] 37 | P[state_id][action] = next_s 38 | state_action_index = state_id * nA + action 39 | for prob, next_state_id, _ in next_s: 40 | T_matrix[state_action_index, next_state_id] = prob 41 | if action == self.default_action: 42 | baseline_matrix[state_id, next_state_id] = prob 43 | self.P = P 44 | self.T_matrix = T_matrix.tocsr() 45 | self.T_matrix_transpose = T_matrix.transpose().tocsr() 46 | self.baseline_matrix_transpose = baseline_matrix.transpose().tocsr() 47 | 48 | 49 | def make_f_matrix(self, nS, num_features): 50 | self.f_matrix = np.zeros((nS, num_features)) 51 | for state_id in self.P.keys(): 52 | state = self.get_state_from_num(state_id) 53 | self.f_matrix[state_id, :] = self.s_to_f(state) 54 | 55 | 56 | def reset(self, state=None): 57 | if state is None: state = self.init_state 58 | self.timestep = 0 59 | self.s = deepcopy(state) 60 | 61 | def state_step(self, action, state=None): 62 | if state == None: state = self.s 63 | next_states = self.get_next_states(state, action) 64 | probabilities = [p for p, _, _ in next_states] 65 | idx = np.random.choice(np.arange(len(next_states)), p=probabilities) 66 | return next_states[idx][1] 67 | 68 | def step(self, action, r_vec=None): 69 | """ 70 | given an action, takes a step from self.s, updates self.s and returns: 71 | - the observation (features of the next state) 72 | - the associated reward 73 | - done, the indicator of 
completed episode 74 | - info 75 | """ 76 | self.s = self.state_step(action) 77 | self.timestep+=1 78 | 79 | obs = self.s_to_f(self.s) 80 | reward = 0 if r_vec is None else np.array(obs.T @ r_vec) 81 | done = False 82 | info = defaultdict(lambda : '') 83 | return np.array(obs, dtype='float32'), reward, np.array(done, dtype='bool'), info 84 | 85 | 86 | class DeterministicEnv(Env): 87 | def __init__(self): 88 | raise ValueError('Cannot instantiate abstract class DeterministicEnv') 89 | 90 | def is_deterministic(self): 91 | return True 92 | 93 | def make_transition_matrices(self, states_iter, actions_iter, nS, nA): 94 | """ 95 | states_iter: ITERATOR of states (i.e. can only be used once) 96 | actions_iter: ITERATOR of actions (i.e. can only be used once) 97 | nS: Number of states 98 | nA: Number of actions 99 | """ 100 | Env.make_transition_matrices(self, states_iter, actions_iter, nS, nA) 101 | self._make_deterministic_transition_matrix(nS, nA) 102 | self._make_deterministic_transition_transpose_matrix(nS, nA) 103 | 104 | 105 | def get_next_states(self, state, action): 106 | return [(1, self.get_next_state(state, action), 0)] 107 | 108 | def state_step(self, action, state=None): 109 | if state == None: state = self.s 110 | return self.get_next_state(state, action) 111 | 112 | def _make_deterministic_transition_matrix(self, nS, nA): 113 | """Create self.deterministic_T, a matrix with index S,A -> S' """ 114 | self.deterministic_T = np.zeros((nS, nA), dtype='int32') 115 | for s in range(nS): 116 | for a in range(nA): 117 | self.deterministic_T[s,a]=self.P[s][a][0][1] 118 | 119 | def _make_deterministic_transition_transpose_matrix(self, nS, nA): 120 | """Create self.deterministic_transpose, a matrix with index S,A -> S', with the inverse dynamics """ 121 | self.deterministic_transpose = np.zeros((nS, nA), dtype='int32') 122 | for s in range(nS): 123 | for a in range(nA): 124 | self.deterministic_transpose[self.P[s][a][0][1],a]=s 125 | 126 | 127 | class Direction(object): 128 | """A class that contains the five actions available in Gridworlds. 129 | 130 | Includes definitions of the actions as well as utility functions for 131 | manipulating them or applying them. 132 | """ 133 | NORTH = (0, -1) 134 | SOUTH = (0, 1) 135 | EAST = (1, 0) 136 | WEST = (-1, 0) 137 | STAY = (0, 0) 138 | INDEX_TO_DIRECTION = [NORTH, SOUTH, EAST, WEST, STAY] 139 | DIRECTION_TO_INDEX = { a:i for i, a in enumerate(INDEX_TO_DIRECTION) } 140 | ALL_DIRECTIONS = INDEX_TO_DIRECTION 141 | 142 | @staticmethod 143 | def move_in_direction(point, direction): 144 | """Takes a step in the given direction and returns the new point. 145 | 146 | point: Tuple (x, y) representing a point in the x-y plane. 147 | direction: One of the Directions. 
148 | """ 149 | x, y = point 150 | dx, dy = direction 151 | return (x + dx, y + dy) 152 | 153 | @staticmethod 154 | def move_in_direction_number(point, num): 155 | direction = Direction.get_direction_from_number(num) 156 | return Direction.move_in_direction(point, direction) 157 | 158 | @staticmethod 159 | def get_number_from_direction(direction): 160 | return Direction.DIRECTION_TO_INDEX[direction] 161 | 162 | @staticmethod 163 | def get_direction_from_number(number): 164 | return Direction.INDEX_TO_DIRECTION[number] 165 | 166 | -------------------------------------------------------------------------------- /src/envs/room.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from copy import deepcopy 3 | from itertools import product 4 | 5 | from envs.env import DeterministicEnv, Direction 6 | 7 | 8 | class RoomState(object): 9 | ''' 10 | state of the environment; describes positions of all objects in the env. 11 | ''' 12 | def __init__(self, agent_pos, vase_states): 13 | """ 14 | agent_pos: (x, y) tuple for the agent's location 15 | vase_states: Dictionary mapping (x, y) tuples to booleans, where True 16 | means that the vase is intact 17 | """ 18 | self.agent_pos = agent_pos 19 | self.vase_states = vase_states 20 | 21 | def __eq__(self, other): 22 | return isinstance(other, RoomState) and \ 23 | self.agent_pos == other.agent_pos and \ 24 | self.vase_states == other.vase_states 25 | 26 | def __hash__(self): 27 | def get_vals(dictionary): 28 | return tuple([dictionary[loc] for loc in sorted(dictionary.keys())]) 29 | return hash(self.agent_pos + get_vals(self.vase_states)) 30 | 31 | 32 | class RoomEnv(DeterministicEnv): 33 | def __init__(self, spec, compute_transitions=True): 34 | """ 35 | height: Integer, height of the grid. Y coordinates are in [0, height). 36 | width: Integer, width of the grid. X coordinates are in [0, width). 
37 | init_state: RoomState, initial state of the environment 38 | vase_locations: List of (x, y) tuples, locations of vases 39 | num_vases: Integer, number of vases 40 | carpet_locations: Set of (x, y) tuples, locations of carpets 41 | feature_locations: List of (x, y) tuples, locations of features 42 | s: RoomState, Current state 43 | nA: Integer, number of actions 44 | """ 45 | self.height = spec.height 46 | self.width = spec.width 47 | self.init_state = deepcopy(spec.init_state) 48 | self.vase_locations = list(self.init_state.vase_states.keys()) 49 | self.num_vases = len(self.vase_locations) 50 | self.carpet_locations = set(spec.carpet_locations) 51 | self.feature_locations = list(spec.feature_locations) 52 | 53 | self.default_action = Direction.get_number_from_direction(Direction.STAY) 54 | self.nA = 5 55 | self.num_features = len(self.s_to_f(self.init_state)) 56 | 57 | self.reset() 58 | 59 | if compute_transitions: 60 | states = self.enumerate_states() 61 | self.make_transition_matrices( 62 | states, range(self.nA), self.nS, self.nA) 63 | self.make_f_matrix(self.nS, self.num_features) 64 | 65 | 66 | def enumerate_states(self): 67 | state_num = {} 68 | 69 | # Possible vase states 70 | for vase_intact_bools in product([True, False], repeat=self.num_vases): 71 | vase_states = dict(zip(self.vase_locations, vase_intact_bools)) 72 | # Possible agent positions 73 | for y in range(self.height): 74 | for x in range(self.width): 75 | pos = (x, y) 76 | if pos in vase_states and vase_states[pos]: 77 | # Can't have the agent on an intact vase 78 | continue 79 | state = RoomState(pos, vase_states) 80 | if state not in state_num: 81 | state_num[state] = len(state_num) 82 | 83 | self.state_num = state_num 84 | self.num_state = {v: k for k, v in self.state_num.items()} 85 | self.nS = len(state_num) 86 | 87 | return state_num.keys() 88 | 89 | def get_num_from_state(self, state): 90 | return self.state_num[state] 91 | 92 | def get_state_from_num(self, num): 93 | return self.num_state[num] 94 | 95 | 96 | def s_to_f(self, s): 97 | ''' 98 | Returns features of the state: 99 | - Number of broken vases 100 | - Whether the agent is on a carpet 101 | - For each feature location, whether the agent is on that location 102 | ''' 103 | num_broken_vases = list(s.vase_states.values()).count(False) 104 | carpet_feature = int(s.agent_pos in self.carpet_locations) 105 | features = [int(s.agent_pos == fpos) for fpos in self.feature_locations] 106 | features = [num_broken_vases, carpet_feature] + features 107 | return np.array(features) 108 | 109 | 110 | def get_next_state(self, state, action): 111 | '''returns the next state given a state and an action''' 112 | action = int(action) 113 | new_x, new_y = Direction.move_in_direction_number(state.agent_pos, action) 114 | # New position is still in bounds: 115 | if not (0 <= new_x < self.width and 0 <= new_y < self.height): 116 | new_x, new_y = state.agent_pos 117 | new_agent_pos = new_x, new_y 118 | new_vase_states = deepcopy(state.vase_states) 119 | if new_agent_pos in new_vase_states: 120 | new_vase_states[new_agent_pos] = False # Break the vase 121 | return RoomState(new_agent_pos, new_vase_states) 122 | 123 | 124 | def print_state(self, state): 125 | '''Renders the state.''' 126 | h, w = self.height, self.width 127 | canvas = np.zeros(tuple([2*h-1, 3*w+1]), dtype='int8') 128 | 129 | # cell borders 130 | for y in range(1, canvas.shape[0], 2): 131 | canvas[y, :] = 1 132 | for x in range(0, canvas.shape[1], 3): 133 | canvas[:, x] = 2 134 | 135 | # vases 136 | for x, y in 
self.vase_locations: 137 | if state.vase_states[(x, y)]: 138 | canvas[2*y, 3*x+1] = 4 139 | else: 140 | canvas[2*y, 3*x+1] = 6 141 | 142 | # agent 143 | x, y = state.agent_pos 144 | canvas[2*y, 3*x + 2] = 3 145 | 146 | black_color = '\x1b[0m' 147 | purple_background_color = '\x1b[0;35;85m' 148 | 149 | for line in canvas: 150 | for char_num in line: 151 | if char_num==0: 152 | print('\u2003', end='') 153 | elif char_num==1: 154 | print('─', end='') 155 | elif char_num==2: 156 | print('│', end='') 157 | elif char_num==3: 158 | print('\x1b[0;33;85m█'+black_color, end='') 159 | elif char_num==4: 160 | print('\x1b[0;32;85m█'+black_color , end='') 161 | elif char_num==5: 162 | print(purple_background_color+'█'+black_color, end='') 163 | elif char_num==6: 164 | print('\033[91m█'+black_color, end='') 165 | print('') 166 | -------------------------------------------------------------------------------- /src/envs/room_spec.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from envs.room import RoomState 3 | 4 | class RoomSpec(object): 5 | def __init__(self, height, width, init_state, carpet_locations, feature_locations): 6 | """See RoomEnv.__init__ in room.py for details.""" 7 | self.height = height 8 | self.width = width 9 | self.init_state = init_state 10 | self.carpet_locations = carpet_locations 11 | self.feature_locations = feature_locations 12 | 13 | 14 | 15 | # In the diagrams below, G is a goal location, V is a vase, C is a carpet, A is 16 | # the agent. Each tuple is of the form (spec, current state, task R, true R). 17 | 18 | ROOM_PROBLEMS = { 19 | # ------- 20 | # | G | 21 | # |GCVC | 22 | # | A | 23 | # ------- 24 | 'default': ( 25 | RoomSpec(3, 5, 26 | RoomState((2, 2), {(2, 1): True}), 27 | [(1, 1), (3, 1)], 28 | [(0, 1), (2, 0)]), 29 | RoomState((2, 0), {(2, 1): True}), 30 | np.array([0, 0, 1, 0]), 31 | np.array([-1, 0, 1, 0]) 32 | ), 33 | # ------- 34 | # |G VG| 35 | # | | 36 | # |A C | 37 | # ------- 38 | 'bad': ( 39 | RoomSpec(3, 5, 40 | RoomState((0, 2), {(3, 0): True}), 41 | [(3, 2)], 42 | [(0, 0), (4, 0)]), 43 | RoomState((0, 0), {(3, 0): True}), 44 | np.array([0, 0, 0, 1]), 45 | np.array([-1, 0, 0, 1]) 46 | ) 47 | } 48 | -------------------------------------------------------------------------------- /src/envs/tests/apples_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from envs.apples import ApplesState, ApplesEnv 4 | from envs.env import Direction 5 | 6 | 7 | class TestApplesSpec(object): 8 | def __init__(self): 9 | """Test spec for the Apples environment. 10 | 11 | T is a tree, B is a bucket, C is a carpet, A is the agent. 
12 | ----- 13 | |T T| 14 | | | 15 | |AB | 16 | ----- 17 | """ 18 | self.height = 3 19 | self.width = 5 20 | self.init_state = ApplesState( 21 | agent_pos=(0, 0, 2), 22 | tree_states={(0, 0): True, (2, 0): True}, 23 | bucket_states={(1, 2): 0}, 24 | carrying_apple=False) 25 | # Use a power of 2, to avoid rounding issues 26 | self.apple_regen_probability = 1.0 / 4 27 | self.bucket_capacity = 10 28 | self.include_location_features = True 29 | 30 | 31 | class TestApplesEnv(unittest.TestCase): 32 | def check_trajectory(self, env, trajectory): 33 | state = env.s 34 | for action, prob, next_state in trajectory: 35 | actual_next_states = env.get_next_states(state, action) 36 | self.assertEqual(sum([p for p, _, _ in actual_next_states]), 1.0) 37 | self.assertIn((prob, next_state, 0), actual_next_states) 38 | state = next_state 39 | 40 | def test_trajectories(self): 41 | u, d, l, r, s = map( 42 | Direction.get_number_from_direction, 43 | [Direction.NORTH, Direction.SOUTH, Direction.WEST, Direction.EAST, Direction.STAY]) 44 | i = 5 # interact action 45 | 46 | def make_state(agent_pos, tree1, tree2, bucket, carrying_apple): 47 | tree_states = { (0, 0): tree1, (2, 0): tree2 } 48 | bucket_state = { (1, 2): bucket } 49 | return ApplesState(agent_pos, tree_states, bucket_state, carrying_apple) 50 | 51 | apples_env = ApplesEnv(TestApplesSpec(), compute_transitions=False) 52 | self.check_trajectory(apples_env, [ 53 | (u, 1.0, make_state((u, 0, 1), True, True, 0, False)), 54 | (i, 1.0, make_state((u, 0, 1), False, True, 0, True)), 55 | (r, 3.0/4, make_state((r, 1, 1), False, True, 0, True)), 56 | (d, 3.0/4, make_state((d, 1, 1), False, True, 0, True)), 57 | (i, 3.0/4, make_state((d, 1, 1), False, True, 1, False)), 58 | (u, 3.0/4, make_state((u, 1, 0), False, True, 1, False)), 59 | (r, 3.0/4, make_state((r, 1, 0), False, True, 1, False)), 60 | (i, 3.0/4, make_state((r, 1, 0), False, False, 1, True)), 61 | (d, 9.0/16, make_state((d, 1, 1), False, False, 1, True)), 62 | (i, 3.0/16, make_state((d, 1, 1), True, False, 2, False)), 63 | (s, 1.0/4, make_state((d, 1, 1), True, True, 2, False)), 64 | ]) 65 | 66 | if __name__ == '__main__': 67 | unittest.main() 68 | -------------------------------------------------------------------------------- /src/envs/tests/batteries_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from envs.batteries import BatteriesState, BatteriesEnv 4 | from envs.env import Direction 5 | 6 | 7 | class TestBatteriesSpec(object): 8 | def __init__(self): 9 | """Test spec for the Batteries environment. 10 | 11 | G is a goal location, B is a battery, A is the agent, and T is the train. 
12 | ------- 13 | |B G | 14 | | TT | 15 | | TTG| 16 | | | 17 | |A B| 18 | ------- 19 | """ 20 | self.height = 5 21 | self.width = 5 22 | self.init_state = BatteriesState((0, 4), (2, 1), 8, 23 | {(0, 0): True, (4, 4): True}, 24 | False) 25 | self.feature_locations = [(2, 0), (4, 2)] 26 | self.train_transition = { 27 | (2, 1): (3, 1), 28 | (3, 1): (3, 2), 29 | (3, 2): (2, 2), 30 | (2, 2): (2, 1) 31 | } 32 | 33 | 34 | class TestBatteriesEnv(unittest.TestCase): 35 | def check_trajectory(self, env, trajectory): 36 | state = env.s 37 | for action, next_state in trajectory: 38 | self.assertEqual(env.state_step(action, state), next_state) 39 | self.assertEqual(env.state_step(action), next_state) 40 | features, reward, done, info = env.step(action) 41 | self.assertEqual(env.s, next_state) 42 | state = next_state 43 | 44 | def test_trajectories(self): 45 | batteries_env = BatteriesEnv(TestBatteriesSpec(), compute_transitions=False) 46 | u, d, l, r, s = map( 47 | Direction.get_number_from_direction, 48 | [Direction.NORTH, Direction.SOUTH, Direction.WEST, Direction.EAST, Direction.STAY]) 49 | 50 | def make_state(agent, train, life, battery_vals, carrying_battery): 51 | battery_present = dict(zip([(0, 0), (4, 4)], battery_vals)) 52 | return BatteriesState(agent, train, life, battery_present, carrying_battery) 53 | 54 | self.check_trajectory(batteries_env, [ 55 | (u, make_state((0, 3), (3, 1), 7, [True, True], False)), 56 | (u, make_state((0, 2), (3, 2), 6, [True, True], False)), 57 | (u, make_state((0, 1), (2, 2), 5, [True, True], False)), 58 | (u, make_state((0, 0), (2, 1), 4, [False, True], True)), 59 | (r, make_state((1, 0), (3, 1), 3, [False, True], True)), 60 | (r, make_state((2, 0), (3, 2), 2, [False, True], True)), 61 | (s, make_state((2, 0), (2, 2), 1, [False, True], True)), 62 | (s, make_state((2, 0), (2, 1), 0, [False, True], True)), 63 | (d, make_state((2, 1), (3, 1), 9, [False, True], False)), 64 | (u, make_state((2, 0), (3, 2), 8, [False, True], False)), 65 | ]) 66 | 67 | 68 | if __name__ == '__main__': 69 | unittest.main() 70 | -------------------------------------------------------------------------------- /src/envs/tests/env_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from envs.env import Direction 4 | 5 | class TestDirection(unittest.TestCase): 6 | def test_direction_number_conversion(self): 7 | all_directions = Direction.ALL_DIRECTIONS 8 | all_numbers = [] 9 | 10 | for direction in Direction.ALL_DIRECTIONS: 11 | number = Direction.get_number_from_direction(direction) 12 | direction_again = Direction.get_direction_from_number(number) 13 | self.assertEqual(direction, direction_again) 14 | all_numbers.append(number) 15 | 16 | # Check that all directions are distinct 17 | num_directions = len(all_directions) 18 | self.assertEqual(len(set(all_directions)), num_directions) 19 | # Check that the numbers are 0, 1, ... num_directions - 1 20 | self.assertEqual(set(all_numbers), set(range(num_directions))) 21 | 22 | 23 | if __name__ == '__main__': 24 | unittest.main() 25 | -------------------------------------------------------------------------------- /src/envs/tests/room_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from envs.room import RoomState, RoomEnv 4 | from envs.env import Direction 5 | 6 | 7 | class TestRoomSpec(object): 8 | def __init__(self): 9 | """Test spec for the Room environment. 
10 | 11 | G is a goal location, V is a vase, C is a carpet, A is the agent. 12 | ------- 13 | |G G G| 14 | | CVC | 15 | | A | 16 | ------- 17 | """ 18 | self.height = 3 19 | self.width = 5 20 | self.init_state = RoomState((2, 2), {(2, 1): True}) 21 | self.carpet_locations = [(1, 1), (3, 1)] 22 | self.feature_locations = [(0, 0), (2, 0), (4, 0)] 23 | 24 | 25 | class TestRoomEnv(unittest.TestCase): 26 | def setUp(self): 27 | self.room = RoomEnv(TestRoomSpec(), compute_transitions=False) 28 | u, d, l, r = map( 29 | Direction.get_number_from_direction, 30 | [Direction.NORTH, Direction.SOUTH, Direction.WEST, Direction.EAST]) 31 | 32 | self.trajectory1 = [ 33 | (l, RoomState((1, 2), {(2, 1): True})), 34 | (u, RoomState((1, 1), {(2, 1): True})), 35 | (u, RoomState((1, 0), {(2, 1): True})), 36 | (r, RoomState((2, 0), {(2, 1): True})) 37 | ] 38 | self.trajectory2 = [ 39 | (u, RoomState((2, 1), {(2, 1): False})), 40 | (u, RoomState((2, 0), {(2, 1): False})) 41 | ] 42 | self.trajectory3 = [ 43 | (r, RoomState((3, 2), {(2, 1): True})), 44 | (u, RoomState((3, 1), {(2, 1): True})), 45 | (l, RoomState((2, 1), {(2, 1): False})), 46 | (d, RoomState((2, 2), {(2, 1): False})) 47 | ] 48 | 49 | def check_trajectory(self, env, trajectory, reset=True): 50 | if reset: 51 | env.reset() 52 | 53 | state = env.s 54 | for action, next_state in trajectory: 55 | self.assertEqual(env.state_step(action, state), next_state) 56 | self.assertEqual(env.state_step(action), next_state) 57 | features, reward, done, info = env.step(action) 58 | self.assertEqual(env.s, next_state) 59 | state = next_state 60 | 61 | def test_trajectories(self): 62 | self.check_trajectory(self.room, self.trajectory1, reset=False) 63 | self.check_trajectory(self.room, self.trajectory2) 64 | self.check_trajectory(self.room, self.trajectory3) 65 | 66 | if __name__ == '__main__': 67 | unittest.main() 68 | -------------------------------------------------------------------------------- /src/envs/tests/train_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from envs.train import TrainState, TrainEnv 4 | from envs.env import Direction 5 | 6 | 7 | class TestTrainSpec(object): 8 | def __init__(self): 9 | """Test spec for the Train environment. 10 | 11 | G is a goal location, V is a vase, C is a carpet, A is the agent. 
12 | ------- 13 | | G C| 14 | | TT | 15 | | VTTG| 16 | | | 17 | |A | 18 | ------- 19 | """ 20 | self.height = 5 21 | self.width = 5 22 | self.init_state = TrainState((0, 4), {(1, 2): True}, (2, 1), True) 23 | self.carpet_locations = [(4, 0)] 24 | self.feature_locations = [(2, 0), (4, 2)], 25 | self.train_transition = { 26 | (2, 1): (3, 1), 27 | (3, 1): (3, 2), 28 | (3, 2): (2, 2), 29 | (2, 2): (2, 1) 30 | } 31 | 32 | 33 | class TestTrainEnv(unittest.TestCase): 34 | def check_trajectory(self, env, trajectory): 35 | state = env.s 36 | for action, next_state in trajectory: 37 | self.assertEqual(env.state_step(action, state), next_state) 38 | self.assertEqual(env.state_step(action), next_state) 39 | features, reward, done, info = env.step(action) 40 | self.assertEqual(env.s, next_state) 41 | state = next_state 42 | 43 | def test_trajectories(self): 44 | train_env = TrainEnv(TestTrainSpec(), compute_transitions=False) 45 | u, d, l, r, s = map( 46 | Direction.get_number_from_direction, 47 | [Direction.NORTH, Direction.SOUTH, Direction.WEST, Direction.EAST, Direction.STAY]) 48 | 49 | self.check_trajectory(train_env, [ 50 | (u, TrainState((0, 3), {(1, 2): True}, (3, 1), True)), 51 | (u, TrainState((0, 2), {(1, 2): True}, (3, 2), True)), 52 | (u, TrainState((0, 1), {(1, 2): True}, (2, 2), True)), 53 | (r, TrainState((1, 1), {(1, 2): True}, (2, 1), True)), 54 | (u, TrainState((1, 0), {(1, 2): True}, (3, 1), True)), 55 | (r, TrainState((2, 0), {(1, 2): True}, (3, 2), True)), 56 | (s, TrainState((2, 0), {(1, 2): True}, (2, 2), True)), 57 | (s, TrainState((2, 0), {(1, 2): True}, (2, 1), True)), 58 | ]) 59 | 60 | train_env.reset() 61 | self.check_trajectory(train_env, [ 62 | (u, TrainState((0, 3), {(1, 2): True}, (3, 1), True)), 63 | (r, TrainState((1, 3), {(1, 2): True}, (3, 2), True)), 64 | (r, TrainState((2, 3), {(1, 2): True}, (2, 2), True)), 65 | ]) 66 | 67 | train_env.reset() 68 | self.check_trajectory(train_env, [ 69 | (r, TrainState((1, 4), {(1, 2): True}, (3, 1), True)), 70 | (r, TrainState((2, 4), {(1, 2): True}, (3, 2), True)), 71 | (r, TrainState((3, 4), {(1, 2): True}, (2, 2), True)), 72 | (u, TrainState((3, 3), {(1, 2): True}, (2, 1), True)), 73 | (u, TrainState((3, 2), {(1, 2): True}, (3, 1), True)), 74 | (s, TrainState((3, 2), {(1, 2): True}, (3, 2), False)), 75 | (s, TrainState((3, 2), {(1, 2): True}, (3, 2), False)), 76 | (u, TrainState((3, 1), {(1, 2): True}, (3, 2), False)), 77 | (l, TrainState((2, 1), {(1, 2): True}, (3, 2), False)), 78 | ]) 79 | 80 | if __name__ == '__main__': 81 | unittest.main() 82 | -------------------------------------------------------------------------------- /src/envs/train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from copy import copy, deepcopy 3 | from itertools import product 4 | 5 | from envs.env import DeterministicEnv, Direction 6 | 7 | 8 | class TrainState(object): 9 | ''' 10 | state of the environment; describes positions of all objects in the env. 
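    A minimal construction sketch, mirroring the 'default' problem in
    train_spec.py:

        state = TrainState(agent_pos=(0, 4),
                           vase_states={(1, 2): True},  # vase at (1, 2) is intact
                           train_pos=(2, 1),
                           train_intact=True)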
11 | ''' 12 | def __init__(self, agent_pos, vase_states, train_pos, train_intact): 13 | """ 14 | agent_pos: (x, y) tuple for the agent's location 15 | vase_states: Dictionary mapping (x, y) tuples to booleans, where True 16 | means that the vase is intact 17 | """ 18 | self.agent_pos = agent_pos 19 | self.vase_states = vase_states 20 | self.train_pos = train_pos 21 | self.train_intact = train_intact 22 | 23 | def is_valid(self): 24 | pos = self.agent_pos 25 | # Can't be standing on the vase and have the vase intact 26 | if pos in self.vase_states and self.vase_states[pos]: 27 | return False 28 | # Can't be standing on the train and have the train intact 29 | if pos == self.train_pos and self.train_intact: 30 | return False 31 | return True 32 | 33 | def __eq__(self, other): 34 | return isinstance(other, TrainState) and \ 35 | self.agent_pos == other.agent_pos and \ 36 | self.vase_states == other.vase_states and \ 37 | self.train_pos == other.train_pos and \ 38 | self.train_intact == other.train_intact 39 | 40 | def __hash__(self): 41 | def get_vals(dictionary): 42 | return tuple([dictionary[loc] for loc in sorted(dictionary.keys())]) 43 | return hash(self.agent_pos + get_vals(self.vase_states) + self.train_pos + (self.train_intact,)) 44 | 45 | 46 | class TrainEnv(DeterministicEnv): 47 | def __init__(self, spec, compute_transitions=True): 48 | """ 49 | height: Integer, height of the grid. Y coordinates are in [0, height). 50 | width: Integer, width of the grid. X coordinates are in [0, width). 51 | init_state: TrainState, initial state of the environment 52 | vase_locations: List of (x, y) tuples, locations of vases 53 | num_vases: Integer, number of vases 54 | carpet_locations: Set of (x, y) tuples, locations of carpets 55 | feature_locations: List of (x, y) tuples, locations of features 56 | s: TrainState, Current state 57 | nA: Integer, number of actions 58 | """ 59 | self.height = spec.height 60 | self.width = spec.width 61 | self.init_state = deepcopy(spec.init_state) 62 | self.vase_locations = list(self.init_state.vase_states.keys()) 63 | self.num_vases = len(self.vase_locations) 64 | self.carpet_locations = set(spec.carpet_locations) 65 | self.feature_locations = list(spec.feature_locations) 66 | self.train_transition = spec.train_transition 67 | self.train_locations = list(self.train_transition.keys()) 68 | assert set(self.train_locations) == set(self.train_transition.values()) 69 | 70 | self.default_action = Direction.get_number_from_direction(Direction.STAY) 71 | self.nA = 5 72 | self.num_features = len(self.s_to_f(self.init_state)) 73 | 74 | self.reset() 75 | 76 | if compute_transitions: 77 | states = self.enumerate_states() 78 | self.make_transition_matrices( 79 | states, range(self.nA), self.nS, self.nA) 80 | self.make_f_matrix(self.nS, self.num_features) 81 | 82 | 83 | def enumerate_states(self): 84 | state_num = {} 85 | all_agent_positions = product(range(self.width), range(self.height)) 86 | all_vase_states = map( 87 | lambda vase_vals: dict(zip(self.vase_locations, vase_vals)), 88 | product([True, False], repeat=self.num_vases)) 89 | all_states = map( 90 | lambda x: TrainState(*x), 91 | product(all_agent_positions, all_vase_states, self.train_locations, [True, False])) 92 | all_states = filter(lambda state: state.is_valid(), all_states) 93 | 94 | state_num = {} 95 | for state in all_states: 96 | if state not in state_num: 97 | state_num[state] = len(state_num) 98 | 99 | self.state_num = state_num 100 | self.num_state = {v: k for k, v in self.state_num.items()} 101 | self.nS 
= len(state_num) 102 | 103 | return state_num.keys() 104 | 105 | def get_num_from_state(self, state): 106 | return self.state_num[state] 107 | 108 | def get_state_from_num(self, num): 109 | return self.num_state[num] 110 | 111 | 112 | def s_to_f(self, s): 113 | ''' 114 | Returns features of the state: 115 | - Number of broken vases 116 | - Whether the agent is on a carpet 117 | - For each feature location, whether the agent is on that location 118 | ''' 119 | num_broken_vases = list(s.vase_states.values()).count(False) 120 | carpet_feature = int(s.agent_pos in self.carpet_locations) 121 | train_intact_feature = int(not s.train_intact) 122 | train_pos_features = [int(s.train_pos == pos) for pos in self.train_locations] 123 | loc_features = [int(s.agent_pos == fpos) for fpos in self.feature_locations] 124 | features = train_pos_features + loc_features 125 | features = [num_broken_vases, carpet_feature, train_intact_feature] + features 126 | return np.array(features) 127 | 128 | 129 | def get_next_state(self, state, action): 130 | '''returns the next state given a state and an action''' 131 | action = int(action) 132 | new_x, new_y = Direction.move_in_direction_number(state.agent_pos, action) 133 | # New position is still in bounds: 134 | if not (0 <= new_x < self.width and 0 <= new_y < self.height): 135 | new_x, new_y = state.agent_pos 136 | new_agent_pos = new_x, new_y 137 | new_vase_states = deepcopy(state.vase_states) 138 | new_train_pos, new_train_intact = state.train_pos, state.train_intact 139 | if state.train_intact: 140 | new_train_pos = self.train_transition[state.train_pos] 141 | 142 | # Break the vase and train if appropriate 143 | if new_agent_pos in new_vase_states: 144 | new_vase_states[new_agent_pos] = False 145 | if new_agent_pos == new_train_pos: 146 | new_train_intact = False 147 | return TrainState(new_agent_pos, new_vase_states, new_train_pos, new_train_intact) 148 | 149 | 150 | def print_state(self, state): 151 | '''Renders the state.''' 152 | h, w = self.height, self.width 153 | canvas = np.zeros(tuple([2*h-1, 3*w+1]), dtype='int8') 154 | 155 | # cell borders 156 | for y in range(1, canvas.shape[0], 2): 157 | canvas[y, :] = 1 158 | for x in range(0, canvas.shape[1], 3): 159 | canvas[:, x] = 2 160 | 161 | # vases 162 | for x, y in self.vase_locations: 163 | if state.vase_states[(x, y)]: 164 | canvas[2*y, 3*x+1] = 4 165 | else: 166 | canvas[2*y, 3*x+1] = 6 167 | 168 | # agent 169 | x, y = state.agent_pos 170 | canvas[2*y, 3*x + 2] = 3 171 | 172 | # train 173 | x, y = state.train_pos 174 | if state.train_intact: 175 | canvas[2*y, 3*x + 1] = 5 176 | else: 177 | canvas[2*y, 3*x + 1] = 6 178 | 179 | 180 | 181 | black_color = '\x1b[0m' 182 | purple_background_color = '\x1b[0;35;85m' 183 | 184 | for line in canvas: 185 | for char_num in line: 186 | if char_num==0: 187 | print('\u2003', end='') 188 | elif char_num==1: 189 | print('─', end='') 190 | elif char_num==2: 191 | print('│', end='') 192 | elif char_num==3: 193 | print('\x1b[0;33;85m█'+black_color, end='') 194 | elif char_num==4: 195 | print('\x1b[0;32;85m█'+black_color , end='') 196 | elif char_num==5: 197 | print(purple_background_color+'█'+black_color, end='') 198 | elif char_num==6: 199 | print('\033[91m█'+black_color, end='') 200 | print('') 201 | -------------------------------------------------------------------------------- /src/envs/train_spec.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from envs.train import TrainState 3 | 4 | class 
TrainSpec(object): 5 | def __init__(self, height, width, init_state, carpet_locations, feature_locations, train_transition): 6 | """See TrainEnv.__init__ in train.py for details.""" 7 | self.height = height 8 | self.width = width 9 | self.init_state = init_state 10 | self.carpet_locations = carpet_locations 11 | self.feature_locations = feature_locations 12 | self.train_transition = train_transition 13 | 14 | 15 | 16 | # In the diagrams below, G is a goal location, V is a vase, C is a carpet, A is 17 | # the agent, and T is the train. 18 | # Each tuple is of the form (spec, current state, task R, true R). 19 | 20 | TRAIN_PROBLEMS = { 21 | # ------- 22 | # | G C| 23 | # | TT | 24 | # | VTTG| 25 | # | | 26 | # |A | 27 | # ------- 28 | 'default': ( 29 | TrainSpec(5, 5, 30 | TrainState((0, 4), {(1, 2): True}, (2, 1), True), 31 | [(4, 0)], 32 | [(2, 0), (4, 2)], 33 | { 34 | (2, 1): (3, 1), 35 | (3, 1): (3, 2), 36 | (3, 2): (2, 2), 37 | (2, 2): (2, 1) 38 | }), 39 | TrainState((2, 0), {(1, 2): True}, (2, 2), True), 40 | np.array([0, 0, 0, 0, 0, 0, 0, 0, 1]), 41 | np.array([-1, 0, -1, 0, 0, 0, 0, 0, 1]) 42 | ) 43 | } 44 | -------------------------------------------------------------------------------- /src/plotting.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from matplotlib.backends.backend_pdf import PdfPages 6 | 7 | def get_stats(algorithm, env, spec, comb, param_tuned, path, temp_index=0): 8 | results_list=[] 9 | for file in os.listdir(path): 10 | if algorithm in file and env in file and spec in file and comb in file and "-"+param_tuned in file: 11 | with open(os.path.join(path, file), 'rt') as f: 12 | reader = csv.reader(f) 13 | # the first line is names of returned items, e g [seed, true_r, final_r] 14 | list_results = list(reader)[1::] 15 | list_rewards = [] 16 | for res in list_results: 17 | s = res[1] 18 | s = s.replace(']', '').replace('[', '').replace(' ', '').split(',') 19 | list_rewards.append(float(s[temp_index])) 20 | list_rewards = np.asarray(list_rewards) 21 | 22 | param_val = file.split('-'+param_tuned+'=', 1)[-1] 23 | param_val = param_val.split('-')[0] 24 | 25 | results_list.append([float(param_val), np.mean(list_rewards), np.std(list_rewards)]) 26 | results_list = np.asarray(results_list) 27 | # return a list sorted by the value of param_tuned 28 | return results_list[results_list[:,0].argsort()] 29 | 30 | 31 | def plot_params_one_subplot(stats_list_per_env, ax, color_list, env_names, 32 | y_min, y_max, comb, title=None, current_subplot=0): 33 | ticks_string=[] 34 | for i in stats_list_per_env[0][0][:,0]: 35 | tick = str(i) 36 | if tick[len(tick)-2::]=='.0': 37 | tick = tick[0:len(tick)-2] 38 | ticks_string.append(tick) 39 | 40 | for j, stats_list in enumerate(stats_list_per_env): 41 | stats_stack = np.vstack(stats_list) 42 | 43 | for i in range(len(stats_list)): 44 | c = color_list[i] 45 | stats = stats_list[i] 46 | 47 | ax.set_ylim(y_min, y_max) 48 | ax.scatter(np.log2(stats[:,0]), stats[:,1], color=c, edgecolor=c, s=40, label=comb[i]+env_names[i]) 49 | ax.plot(np.log2(stats[:,0]), stats[:,1], color=c) 50 | 51 | plt.tick_params(axis='both', labelsize=12) 52 | ax.tick_params(axis='both', labelsize='large') 53 | plt.xticks(np.log2(stats[::2,0]), ticks_string[0::2]) 54 | 55 | if current_subplot==0: 56 | plt.ylabel("Fraction of max R", fontsize=17) 57 | handles, labels = ax.get_legend_handles_labels() 58 | # sort both labels and handles by labels 59 
| labels, handles = zip(*sorted(zip(labels, handles), key=lambda t: t[0])) 60 | plt.legend(handles, labels, loc="best", fontsize=12, handletextpad=-0.4) 61 | 62 | # xlabel only for the middle subplot when plotting additive vs bayesian 63 | if current_subplot==1: 64 | plt.xlabel("Standard deviation", fontsize=21) 65 | 66 | if title is not None: plt.title(title, fontsize=24) 67 | 68 | 69 | def plot_params_multiple_subplots(env_lists_per_t, titles_list, y_min=0.45, y_max=1.05): 70 | fig = plt.figure(figsize=(5*len(env_lists_per_t), 3.4)) 71 | for j, stats_list in enumerate(env_lists_per_t): 72 | ax = plt.subplot(1, len(env_lists_per_t), j+1) 73 | plot_params_one_subplot(stats_list, ax, 74 | color_list=['blue', 'orange', '#5177d6', '#ffe500', 'deepskyblue', 'coral'], 75 | env_names=[' room', ' room', ' train', ' train', ' batteries', ' batteries'], 76 | comb=['Bayesian,', 'Additive,','Bayesian,', 'Additive,', 'Bayesian,', 'Additive,'], 77 | title=titles_list[j], current_subplot=j, y_min=y_min, y_max=y_max) 78 | fig.subplots_adjust(top=1.1) 79 | plt.tight_layout() 80 | 81 | pp = PdfPages('./results/additive-vs-bayesian.pdf') 82 | pp.savefig() 83 | pp.close() 84 | 85 | 86 | if __name__ == "__main__": 87 | ############### 88 | # Appendix D # 89 | ############### 90 | # plot Additive vs Bayesian 91 | 92 | # temperature=0 (rational agent) 93 | avb_stats_list_per_env_t0 = [[get_stats("rlsp", "room", "default", "bayesian", "k", "./results/additive-vs-bayesian", temp_index=0), 94 | get_stats("rlsp", "room", "default", "additive", "k", "./results/additive-vs-bayesian", temp_index=0), 95 | get_stats("rlsp", "train", "default", "bayesian", "k", "./results/additive-vs-bayesian", temp_index=0), 96 | get_stats("rlsp", "train", "default", "additive", "k", "./results/additive-vs-bayesian", temp_index=0), 97 | get_stats("rlsp", "batteries", "default", "bayesian", "k", "./results/additive-vs-bayesian", temp_index=0), 98 | get_stats("rlsp", "batteries", "default", "additive", "k", "./results/additive-vs-bayesian", temp_index=0)]] 99 | # temperature=0.1 100 | avb_stats_list_per_env_t01 = [[get_stats("rlsp", "room", "default", "bayesian", "k", "./results/additive-vs-bayesian", temp_index=1), 101 | get_stats("rlsp", "room", "default", "additive", "k", "./results/additive-vs-bayesian", temp_index=1), 102 | get_stats("rlsp", "train", "default", "bayesian", "k", "./results/additive-vs-bayesian", temp_index=1), 103 | get_stats("rlsp", "train", "default", "additive", "k", "./results/additive-vs-bayesian", temp_index=1), 104 | get_stats("rlsp", "batteries", "default", "bayesian", "k", "./results/additive-vs-bayesian", temp_index=1), 105 | get_stats("rlsp", "batteries", "default", "additive", "k", "./results/additive-vs-bayesian", temp_index=1)]] 106 | # temperature=1 107 | avb_stats_list_per_env_t1 = [[get_stats("rlsp", "room", "default", "bayesian", "k", "./results/additive-vs-bayesian", temp_index=2), 108 | get_stats("rlsp", "room", "default", "additive", "k", "./results/additive-vs-bayesian", temp_index=2), 109 | get_stats("rlsp", "train", "default", "bayesian", "k", "./results/additive-vs-bayesian", temp_index=2), 110 | get_stats("rlsp", "train", "default", "additive", "k", "./results/additive-vs-bayesian", temp_index=2), 111 | get_stats("rlsp", "batteries", "default", "bayesian", "k", "./results/additive-vs-bayesian", temp_index=2), 112 | get_stats("rlsp", "batteries", "default", "additive", "k", "./results/additive-vs-bayesian", temp_index=2)]] 113 | 114 | env_lists_per_t = [avb_stats_list_per_env_t0, 
avb_stats_list_per_env_t01, avb_stats_list_per_env_t1] 115 | titles_list = ['temperature = 0','temperature = 0.1','temperature = 1'] 116 | 117 | plt.rcParams["font.family"] = "Times New Roman" 118 | plot_params_multiple_subplots(env_lists_per_t, titles_list=titles_list) 119 | 120 | ############### 121 | # Section 5.4 # 122 | ############### 123 | # plot robustness to the choice of Alice's planning horizon 124 | 125 | # temperature=0 (rational agent). This is the stat we're plotting. To plot boltzmann-rational agents, replace 126 | # "stats_list_per_env_t0" by the stat corresponding to the temperature you want to plot in the cell below. 127 | h_stats_list_per_env_t0 = [[get_stats("rlsp", "train", "default", "additive", "T", "./results/horizon"), 128 | get_stats("rlsp", "room", "default", "additive", "T", "./results/horizon"), 129 | get_stats("rlsp", "batteries", "default", "additive", "T", "./results/horizon"), 130 | get_stats("rlsp", "apples", "default", "additive", "T", "./results/horizon")]] 131 | # temperature=0.1 132 | h_stats_list_per_env_t01 = [[get_stats("rlsp", "room", "default", "additive", "T", "./results/horizon", temp_index=1), 133 | get_stats("rlsp", "train", "default", "additive", "T", "./results/horizon", temp_index=1), 134 | get_stats("rlsp", "batteries", "default", "additive", "T", "./results/horizon", temp_index=1), 135 | get_stats("rlsp", "apples", "default", "additive", "T", "./results/horizon", temp_index=1)]] 136 | # temperature=1 137 | h_stats_list_per_env_t1 = [[get_stats("rlsp", "room", "default", "additive", "T", "./results/horizon", temp_index=2), 138 | get_stats("rlsp", "train", "default", "additive", "T", "./results/horizon", temp_index=2), 139 | get_stats("rlsp", "batteries", "default", "additive", "T", "./results/horizon", temp_index=2), 140 | get_stats("rlsp", "apples", "default", "additive", "T", "./results/horizon", temp_index=2)]] 141 | 142 | fig = plt.figure(figsize=(4.0, 2.6)) 143 | ax = plt.subplot(1, 1, 1) 144 | plot_params_one_subplot(h_stats_list_per_env_t0, ax, y_min=0.45, y_max=1.05, 145 | env_names=['train', 'room', 'batteries', 'apples'], 146 | comb=['','','',''], 147 | color_list=['green', 'orange', '#5177d6', 'firebrick']) 148 | plt.xlabel("Horizon", fontsize=17) 149 | ax.legend(bbox_to_anchor=(1, 1.051), fontsize=12, handletextpad=-0.4, borderpad=0.1) 150 | plt.tight_layout() 151 | 152 | pp = PdfPages('./results/horizon_t0.pdf') 153 | pp.savefig() 154 | pp.close() 155 | -------------------------------------------------------------------------------- /src/relative_reachability.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def relative_reachability_penalty(mdp, horizon, start): 4 | """ 5 | Calculates the undiscounted relative reachability penalty for each state in an mdp, compared to the starting state baseline. 
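    Concretely, as implemented below: for a baseline state b and a candidate
    state s, the unnormalized penalty is

        d(s; b) = sum_{s'} max(0, coverage(b, s') - coverage(s, s')),

    which is averaged over the baseline state distribution at each timestep
    and then normalized by the overall maximum, yielding one penalty vector
    over states per timestep.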
6 | 7 | Based on the algorithm described in: https://arxiv.org/pdf/1806.01186.pdf 8 | """ 9 | coverage = get_coverage(mdp, horizon) 10 | distributions = baseline_state_distributions(mdp, horizon, start) 11 | 12 | def penalty(state): 13 | return np.sum(np.maximum(coverage[state, :] - coverage, 0), axis=1) 14 | 15 | def penalty_for_baseline_distribution(dist): 16 | return sum((dist[state] * penalty(state) for state in range(mdp.nS) if dist[state] != 0)) 17 | 18 | r_r = np.array(list(map(penalty_for_baseline_distribution, distributions))) 19 | if np.amax(r_r) == 0: 20 | return np.zeros_like(r_r) 21 | return r_r / np.amax(r_r) 22 | 23 | def get_coverage(mdp, horizon): 24 | coverage = np.identity(mdp.nS) 25 | for i in range(horizon): 26 | # coverage(s0, sk) = \max_{a0} \sum_{s1} P(s1 | s0, a) * coverage(s1, sk) 27 | action_coverage = mdp.T_matrix.dot(coverage) 28 | action_coverage = action_coverage.reshape((mdp.nS, mdp.nA, mdp.nS)) 29 | coverage = np.amax(action_coverage, axis=1) 30 | return coverage 31 | 32 | def baseline_state_distributions(mdp, horizon, start): 33 | distribution = np.zeros(mdp.nS) 34 | distribution[start] = 1 35 | distributions = [ distribution ] 36 | for _ in range(horizon - 1): 37 | distribution = mdp.baseline_matrix_transpose.dot(distribution) 38 | distributions.append(distribution) 39 | return distributions 40 | -------------------------------------------------------------------------------- /src/rlsp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.optimize import check_grad 3 | 4 | from value_iter import value_iter 5 | from utils import norm_distr, laplace_distr, printoptions 6 | 7 | 8 | def compute_g(mdp, policy, p_0, T, d_last_step_list, expected_features_list): 9 | nS, nA, nF = mdp.nS, mdp.nA, mdp.num_features 10 | 11 | # base case 12 | G = np.zeros((nS, nF)) 13 | # recursive case 14 | for t in range(T-1): 15 | # G(s') = \sum_{s, a} p(a | s) p(s' | s, a) [ p(s) g(s, a) + G_prev[s] ] 16 | # p(s) is given by d_last_step_list[t] 17 | # g(s, a) = f(s) - F(s) + \sum_{s'} p(s' | s, a) F(s') 18 | # Distribute the addition to get three different terms: 19 | # First term: p(s) [f(s') - F(s')] 20 | # Second term: p(s) \sum_{s2} p(s2 | s, a) F(s2) 21 | # Third term: G_prev[s] 22 | g_first = mdp.f_matrix - expected_features_list[t] 23 | g_second = mdp.T_matrix.dot(expected_features_list[t+1]) 24 | g_second = g_second.reshape((nS, nA, nF)) 25 | g_total = np.expand_dims(g_first, axis=1) + g_second 26 | 27 | prob_s_a = np.expand_dims(d_last_step_list[t].reshape(nS), axis=1) * policy[t] 28 | 29 | G_value = np.expand_dims(prob_s_a, axis=2) * g_total 30 | G_value = mdp.T_matrix_transpose.dot(G_value.reshape((nS * nA, nF))) 31 | 32 | G_recurse = np.expand_dims(policy[t], axis=-1) * np.expand_dims(G, axis=1) 33 | G_recurse = mdp.T_matrix_transpose.dot(G_recurse.reshape((nS * nA, nF))) 34 | 35 | G = G_value + G_recurse 36 | 37 | return G 38 | 39 | 40 | def compute_d_last_step(mdp, policy, p_0, T, gamma=1, verbose=False, return_all=False): 41 | """Computes the last-step occupancy measure""" 42 | D, d_last_step_list = p_0, [p_0] 43 | for t in range(T-1): 44 | # D(s') = \sum_{s, a} D_prev(s) * p(a | s) * p(s' | s, a) 45 | state_action_prob = np.expand_dims(D, axis=1) * policy[t] 46 | D = mdp.T_matrix_transpose.dot(state_action_prob.flatten()) 47 | 48 | if verbose is True: print(D) 49 | if return_all: d_last_step_list.append(D) 50 | 51 | return (D, d_last_step_list) if return_all else D 52 | 53 | def 
compute_feature_expectations(mdp, policy, p_0, T): 54 | nS, nA, nF = mdp.nS, mdp.nA, mdp.num_features 55 | expected_features = mdp.f_matrix 56 | expected_feature_list = [expected_features] 57 | for t in range(T-2, -1, -1): 58 | # F(s) = f(s) + \sum_{a, s'} p(a | s) * p(s' | s, a) * F(s') 59 | future_features = mdp.T_matrix.dot(expected_features).reshape((nS, nA, nF)) 60 | future_features = future_features * np.expand_dims(policy[t], axis=2) 61 | expected_features = mdp.f_matrix + np.sum(future_features, axis=1) 62 | expected_feature_list.append(expected_features) 63 | return expected_features, expected_feature_list[::-1] 64 | 65 | 66 | def rlsp(mdp, s_current, p_0, horizon, temp=1, epochs=1, learning_rate=0.2, 67 | r_prior=None, r_vec=None, threshold=1e-3, check_grad_flag=False): 68 | """The RLSP algorithm""" 69 | def compute_grad(r_vec): 70 | # Compute the Boltzmann rational policy \pi_{s,a} = \exp(Q_{s,a} - V_s) 71 | policy = value_iter(mdp, 1, mdp.f_matrix @ r_vec, horizon, temp) 72 | d_last_step, d_last_step_list = compute_d_last_step( 73 | mdp, policy, p_0, horizon, return_all=True) 74 | if d_last_step[s_current] == 0: 75 | print('Error in om_method: No feasible trajectories!') 76 | return r_vec 77 | 78 | expected_features, expected_features_list = compute_feature_expectations( 79 | mdp, policy, p_0, horizon) 80 | 81 | G = compute_g(mdp, policy, p_0, horizon, d_last_step_list, expected_features_list) 82 | # Compute the gradient 83 | dL_dr_vec = G[s_current] / d_last_step[s_current] 84 | # Gradient of the prior 85 | if r_prior!= None: dL_dr_vec += r_prior.logdistr_grad(r_vec) 86 | return dL_dr_vec 87 | 88 | def compute_log_likelihood(r_vec): 89 | policy = value_iter(mdp, 1, mdp.f_matrix @ r_vec, horizon, temp) 90 | d_last_step = compute_d_last_step(mdp, policy, p_0, horizon) 91 | log_likelihood = np.log(d_last_step[s_current]) 92 | if r_prior!= None: log_likelihood += np.sum(r_prior.logpdf(r_vec)) 93 | return log_likelihood 94 | 95 | def get_grad(_): 96 | """dummy function for use with check_grad()""" 97 | return dL_dr_vec 98 | 99 | if r_vec is None: 100 | r_vec = 0.01*np.random.randn(mdp.f_matrix.shape[1]) 101 | print('Initial reward vector: {}'.format(r_vec)) 102 | 103 | if check_grad_flag: grad_error_list=[] 104 | 105 | for i in range(epochs): 106 | dL_dr_vec = compute_grad(r_vec) 107 | if check_grad_flag: 108 | grad_error_list.append(check_grad(compute_log_likelihood, get_grad, r_vec)) 109 | 110 | # Gradient ascent 111 | r_vec = r_vec + learning_rate * dL_dr_vec 112 | 113 | # with printoptions(precision=4, suppress=True): 114 | # print('Epoch {}; Reward vector: {}'.format(i, r_vec)) 115 | # if check_grad_flag: print('grad error: {}'.format(grad_error_list[-1])) 116 | 117 | if np.linalg.norm(dL_dr_vec) < threshold: 118 | if check_grad_flag: 119 | print() 120 | print('Max grad error: {}'.format(np.amax(np.asarray(grad_error_list)))) 121 | print('Median grad error: {}'.format(np.median(np.asarray(grad_error_list)))) 122 | break 123 | 124 | return r_vec 125 | -------------------------------------------------------------------------------- /src/run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import csv 3 | import datetime 4 | import numpy as np 5 | import os 6 | import sys 7 | 8 | from scipy.stats import uniform as uniform_distr 9 | 10 | from envs.apples import ApplesEnv, ApplesState 11 | from envs.apples_spec import APPLES_PROBLEMS 12 | from envs.batteries import BatteriesEnv, BatteriesState 13 | from envs.batteries_spec 
import BATTERIES_PROBLEMS 14 | from envs.room import RoomEnv, RoomState 15 | from envs.room_spec import ROOM_PROBLEMS 16 | from envs.train import TrainEnv, TrainState 17 | from envs.train_spec import TRAIN_PROBLEMS 18 | 19 | from relative_reachability import relative_reachability_penalty 20 | from rlsp import rlsp 21 | from sampling import sample_from_posterior 22 | from utils import norm_distr, laplace_distr, printoptions 23 | from value_iter import value_iter, evaluate_policy 24 | 25 | 26 | def print_rollout(env, start_state, policies, last_steps_printed, horizon): 27 | if last_steps_printed == 0: 28 | last_steps_printed = horizon 29 | 30 | env.reset(start_state) 31 | print("Executing the policy from state:") 32 | env.print_state(env.s); print() 33 | print('Last {} of the {} rolled out steps:'.format( 34 | last_steps_printed, horizon)) 35 | 36 | for i in range(horizon-1): 37 | s_num = env.get_num_from_state(env.s) 38 | a = np.random.choice(env.nA, p=policies[i][s_num,:]) 39 | env.step(a) 40 | 41 | if i>=(horizon-last_steps_printed-1): 42 | env.print_state(env.s); print() 43 | 44 | 45 | def forward_rl(env, r_planning, r_true, h=40, temp=0, last_steps_printed=0, 46 | current_s_num=None, weight=1, penalize_deviation=False, 47 | relative_reachability=False, print_level=1): 48 | '''Given an env and R, runs soft VI for h steps and rolls out the resulting policy''' 49 | current_state = env.get_state_from_num(current_s_num) 50 | r_s = env.f_matrix @ r_planning 51 | time_dependent_reward = False 52 | 53 | if penalize_deviation: 54 | diff = env.f_matrix - env.s_to_f(current_state).T 55 | r_s -= weight * np.linalg.norm(diff, axis=1) 56 | if relative_reachability: 57 | time_dependent_reward = True 58 | r_r = relative_reachability_penalty(env, h, current_s_num) 59 | r_s = np.expand_dims(r_s, 0) - weight * r_r 60 | 61 | # For evaluation, plan optimally instead of Boltzmann-rationally 62 | policies = value_iter(env, 1, r_s, h, temperature=temp, time_dependent_reward=time_dependent_reward) 63 | 64 | # For print level >= 1, print a rollout 65 | if print_level >= 1: 66 | print_rollout(env, current_state, policies, last_steps_printed, h) 67 | 68 | return evaluate_policy(env, policies, current_s_num, 1, env.f_matrix @ r_true, h) 69 | 70 | 71 | PROBLEMS = { 72 | 'room': ROOM_PROBLEMS, 73 | 'apples': APPLES_PROBLEMS, 74 | 'train': TRAIN_PROBLEMS, 75 | 'batteries': BATTERIES_PROBLEMS 76 | } 77 | 78 | ENV_CLASSES = { 79 | 'room': RoomEnv, 80 | 'apples': ApplesEnv, 81 | 'train': TrainEnv, 82 | 'batteries': BatteriesEnv 83 | } 84 | 85 | 86 | def get_problem_parameters(env_name, problem_name): 87 | if env_name not in ENV_CLASSES: 88 | raise ValueError('Environment {} is not one of {}'.format( 89 | env_name, list(ENV_CLASSES.keys()))) 90 | if problem_name not in PROBLEMS[env_name]: 91 | raise ValueError('Problem spec {} is not one of {}'.format( 92 | problem_name, list(PROBLEMS[env_name].keys()))) 93 | 94 | spec, cur_state, r_task, r_true = PROBLEMS[env_name][problem_name] 95 | env = ENV_CLASSES[env_name](spec) 96 | return env, env.get_num_from_state(cur_state), r_task, r_true 97 | 98 | 99 | def get_r_prior(prior, reward_center, std): 100 | if prior == "gaussian": 101 | return norm_distr(reward_center, std) 102 | elif prior == "laplace": 103 | return laplace_distr(reward_center, std) 104 | elif prior == "uniform": 105 | return None 106 | else: 107 | raise ValueError('Unknown prior {}'.format(prior)) 108 | 109 | 110 | def experiment_wrapper(env_name='vases', 111 | problem_spec='default', 112 | 
inference_algorithm='rlsp', 113 | combination_algorithm='additive', 114 | prior='gaussian', 115 | horizon=20, 116 | evaluation_horizon=0, 117 | temperature=1, 118 | learning_rate=.1, 119 | inferred_weight=1, 120 | epochs=200, 121 | uniform_prior=False, 122 | measures=['final_reward'], 123 | n_samples=10000, 124 | mcmc_burn_in=1000, 125 | step_size=.01, 126 | seed=0, 127 | std=0.5, 128 | print_level=1, 129 | soft_forward_rl=False, 130 | reward_constant=1.0): 131 | # Check the parameters so that we fail fast 132 | assert inference_algorithm in ['rlsp', 'sampling', 'deviation', 'reachability', 'spec'] 133 | assert combination_algorithm in ['additive', 'bayesian'] 134 | assert prior in ['gaussian', 'laplace', 'uniform'] 135 | assert all((measure in ['true_reward', 'final_reward'] for measure in measures)) 136 | 137 | if evaluation_horizon==0: 138 | evaluation_horizon = horizon 139 | 140 | if combination_algorithm == 'bayesian': 141 | assert inference_algorithm in ['rlsp', 'sampling'] 142 | 143 | np.random.seed(seed) 144 | env, s_current, r_task, r_true = get_problem_parameters(env_name, problem_spec) 145 | 146 | if print_level >= 1: 147 | print('Initial state:') 148 | env.print_state(env.init_state) 149 | print() 150 | 151 | p_0 = env.get_initial_state_distribution(known_initial_state=not uniform_prior) 152 | 153 | deviation = inference_algorithm == "deviation" 154 | reachability = inference_algorithm == "reachability" 155 | reward_center = r_task if combination_algorithm == "bayesian" else np.zeros(env.num_features) 156 | r_prior = get_r_prior(prior, reward_center, std) 157 | 158 | # Infer reward by observing the world state 159 | if inference_algorithm == "rlsp": 160 | r_inferred = rlsp(env, s_current, p_0, horizon, temperature, epochs, learning_rate, r_prior) 161 | elif inference_algorithm == "sampling": 162 | r_samples = sample_from_posterior( 163 | env, s_current, p_0, horizon, temperature, n_samples, step_size, 164 | r_prior, gamma=1, print_level=print_level) 165 | r_inferred = np.mean(r_samples[mcmc_burn_in::], axis=0) 166 | elif inference_algorithm in ["deviation", "reachability", "spec"]: 167 | r_inferred = None 168 | else: 169 | raise ValueError('Unknown inference algorithm: {}'.format(inference_algorithm)) 170 | 171 | if print_level >= 1 and r_inferred is not None: 172 | with printoptions(precision=4, suppress=True): 173 | print(); print('Inferred reward vector: ', r_inferred) 174 | 175 | # Run forward RL to evaluate 176 | def evaluate(forward_rl_temp): 177 | if combination_algorithm == "additive": 178 | r_final = r_task 179 | if r_inferred is not None: 180 | r_final = r_task + inferred_weight * r_inferred 181 | true_reward_obtained = forward_rl(env, r_final, r_true, temp=forward_rl_temp, h=evaluation_horizon, current_s_num=s_current, weight=inferred_weight, penalize_deviation=deviation, relative_reachability=reachability, print_level=print_level) 182 | elif combination_algorithm == "bayesian": 183 | assert r_inferred is not None 184 | assert (not deviation) and (not reachability) 185 | r_final = r_inferred 186 | true_reward_obtained = forward_rl(env, r_final, r_true, temp=forward_rl_temp, h=evaluation_horizon, current_s_num=s_current, penalize_deviation=False, relative_reachability=False, print_level=print_level) 187 | else: 188 | raise ValueError('Unknown combination algorithm: {}'.format(combination_algorithm)) 189 | 190 | best_possible_reward = forward_rl(env, r_true, r_true, temp=forward_rl_temp, h=evaluation_horizon, current_s_num=s_current, print_level=0) 191 | 192 | # Add 
the reward constant in 193 | true_reward_obtained += reward_constant * evaluation_horizon 194 | best_possible_reward += reward_constant * evaluation_horizon 195 | 196 | def get_measure(measure): 197 | if measure == 'final_reward': 198 | return r_final 199 | elif measure == 'true_reward': 200 | return true_reward_obtained * 1.0 / best_possible_reward 201 | else: 202 | raise ValueError('Unknown measure {}'.format(measure)) 203 | 204 | return [get_measure(measure) for measure in measures] 205 | 206 | if soft_forward_rl: 207 | return [evaluate(temp) for temp in [0, 0.1, 0.5, 1, 5, 10]] 208 | else: 209 | return [evaluate(0.0)] 210 | 211 | 212 | 213 | # The command line parameters that should be included in the filename of the 214 | # file summarizing the results. 215 | PARAMETERS = [ 216 | ('-e', '--env_name', 'room', None, 217 | 'Environment to run: one of [vases, boxes, room, apples, train, batteries]'), 218 | ('-p', '--problem_spec', 'default', None, 219 | 'The name of the problem specification to solve.'), 220 | ('-i', '--inference_algorithm', 'spec', None, 221 | 'Frame condition inference algorithm: one of [rlsp, sampling, deviation, reachability, spec].'), 222 | ('-c', '--combination_algorithm', 'additive', None, 223 | 'How to combine the task reward and inferred reward for forward RL: one of [additive, bayesian]. bayesian only has an effect if algorithm is rlsp or sampling.'), 224 | ('-r', '--prior', 'gaussian', None, 225 | 'Prior on the inferred reward function: one of [gaussian, laplace, uniform]. Centered at zero if combination_algorithm is additive, and at the task reward if combination_algorithm is bayesian. Only has an effect if inference_algorithm is rlsp or sampling.'), 226 | ('-T', '--horizon', '20', int, 227 | 'Number of timesteps we assume the human has been acting.'), 228 | ('-x', '--evaluation_horizon', '0', int, 229 | 'Number of timesteps we act after inferring the reward.'), 230 | ('-t', '--temperature', '1.0', float, 231 | 'Boltzmann rationality constant for the human. Note this is temperature, which is the inverse of beta.'), 232 | ('-l', '--learning_rate', '0.1', float, 233 | 'Learning rate for gradient descent. Applies when inference_algorithm is rlsp.'), 234 | ('-w', '--inferred_weight', '1', float, 235 | 'Weight for the inferred reward when adding task and inferred rewards. Applies if combination_algorithm is additive.'), 236 | ('-m', '--epochs', '50', int, 237 | 'Number of gradient descent steps to take.'), 238 | ('-u', '--uniform_prior', 'False', lambda x: x != 'False', 239 | 'Whether to use a uniform prior over initial states, or to know the initial state. Either true or false.'), 240 | ('-d', '--dependent_vars', 'final_reward', None, 241 | 'Dependent variables to measure and report'), 242 | ('-n', '--n_samples', '10000', int, 243 | 'Number of samples to generate with MCMC'), 244 | ('-b', '--mcmc_burn_in', '1000', int, 245 | 'Number of samples to ignore at the start'), 246 | ('-z', '--step_size', '0.01', float, 247 | 'Step size for computing neighbor reward functions. 
Only has an effect if inference_algorithm is sampling.'), 248 | ('-s', '--seed', '0', int, 249 | 'Random seed.'), 250 | ('-k', '--std', '0.5', float, 251 | 'Standard deviation for the prior'), 252 | ('-v', '--print_level', '1', int, 253 | 'Level of verbosity.'), 254 | ('-f', '--soft_forward_rl', 'False', lambda x: x != 'False', 255 | 'Evaluate with a range of temperatures for soft VI for forward RL if true, else evaluate with hard VI for forward RL'), 256 | ('-q', '--reward_constant', '1.0', float, 257 | 'Living reward provided when evaluating performance.'), 258 | ] 259 | 260 | # Writing output for experiments 261 | def get_filename(args): 262 | # Drop the '--' in front of the names 263 | param_short_names = [name[1:] for name, _, _, _, _ in PARAMETERS] 264 | param_names = [name[2:] for _, name, _, _, _ in PARAMETERS] 265 | param_values = [args.__dict__[name] for name in param_names] 266 | 267 | filename = '{}-' + '={}-'.join(param_short_names) + '={}.csv' 268 | #time_str = str(datetime.datetime.now()).replace(':', '-').replace('.', '-').replace(' ', '-') 269 | time_str = 'res' 270 | filename = filename.format(time_str, *param_values) 271 | return args.output_folder + '/' + filename 272 | 273 | def write_output(results, indep_var, indep_vals, dependent_vars, args): 274 | with open(get_filename(args), 'w', newline='') as csvfile: 275 | writer = csv.DictWriter(csvfile, fieldnames=[indep_var] + dependent_vars) 276 | writer.writeheader() 277 | for indep_val, result in zip(indep_vals, results): 278 | row = {} 279 | row[indep_var] = indep_val 280 | for dependent_var, dependent_val in zip(dependent_vars, result): 281 | row[dependent_var] = dependent_val 282 | writer.writerow(row) 283 | 284 | 285 | # Command-line arguments 286 | def parse_args(args=None): 287 | parser = argparse.ArgumentParser() 288 | for name, long_name, default, _, help_str in PARAMETERS: 289 | parser.add_argument(name, long_name, type=str, default=default, help=help_str) 290 | 291 | # Parameters that shouldn't be included in the filename. 
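    # Illustrative sketch (the values below are just the defaults listed above):
    # a call along the lines of
    #     python src/run.py -e room -i rlsp -T 20 -o results
    # writes its summary CSV to results/res-e=room-p=default-i=rlsp-...-q=1.0.csv,
    # i.e. one "<short flag>=<value>" segment per entry in PARAMETERS, which is
    # what plotting.get_stats parses to recover the swept parameter value.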
292 | parser.add_argument('-o', '--output_folder', type=str, default='', 293 | help='Output folder') 294 | return parser.parse_args(args) 295 | 296 | 297 | def setup_experiment(args): 298 | indep_vars_dict, control_vars_dict = {}, {} 299 | 300 | for _, var, _, fn, _ in PARAMETERS: 301 | var = var[2:] 302 | if var == 'dependent_vars': continue 303 | if fn is None: fn = lambda x: x 304 | 305 | vals = [fn(x) for x in args.__dict__[var].split(',')] 306 | if len(vals) > 1: 307 | indep_vars_dict[var] = vals 308 | else: 309 | control_vars_dict[var] = vals[0] 310 | 311 | return indep_vars_dict, control_vars_dict, args.dependent_vars.split(',') 312 | 313 | 314 | def main(): 315 | if sys.platform == "win32": 316 | import colorama; colorama.init() 317 | 318 | args = parse_args() 319 | print(args) 320 | indep_vars_dict, control_vars_dict, dependent_vars = setup_experiment(args) 321 | # print(indep_vars_dict, control_vars_dict, dependent_vars) 322 | # For now, restrict to zero or one independent variables, but it 323 | # could be generalized to two variables 324 | if len(indep_vars_dict) == 0: 325 | indep_var = 'N/A' 326 | indep_vals = ['N/A'] 327 | results = [[] for _ in range(len(dependent_vars))] 328 | for condition_result in experiment_wrapper(measures=dependent_vars, **control_vars_dict): 329 | for i, result in enumerate(condition_result): 330 | results[i].append(result) 331 | results = [results] 332 | elif len(indep_vars_dict) == 1: 333 | indep_var = next(iter(indep_vars_dict.keys())) 334 | indep_vals = indep_vars_dict[indep_var] 335 | results = [] 336 | for indep_val in indep_vals: 337 | curr_results = [[] for _ in range(len(dependent_vars))] 338 | experiment_args = control_vars_dict.copy() 339 | experiment_args[indep_var] = indep_val 340 | experiment_args['measures'] = dependent_vars 341 | for condition_result in experiment_wrapper(**experiment_args): 342 | for i, result in enumerate(condition_result): 343 | curr_results[i].append(result) 344 | results.append(curr_results) 345 | else: 346 | raise ValueError('Can only support up to one independent variable (that is, a flag with multiple comma-separated values)') 347 | 348 | if args.output_folder == '' or os.path.isfile(get_filename(args)): 349 | print(results) 350 | else: 351 | write_output(results, indep_var, indep_vals, dependent_vars, args) 352 | 353 | 354 | if __name__ == '__main__': 355 | main() 356 | -------------------------------------------------------------------------------- /src/sampling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from math import exp 3 | 4 | from value_iter import value_iter 5 | from rlsp import compute_d_last_step 6 | 7 | 8 | def sample_from_posterior( 9 | env, s_current, p_0, h, temp, n_samples, step_size, r_prior, gamma=1, 10 | print_level=1): 11 | """ 12 | Algorithm similar to BIRL that uses the last-step OM of a Boltzmann rational 13 | policy instead of the BIRL likelihood. Samples the reward from the posterior 14 | p(r | s_T, r_spec) \propto p(s_T | \theta) * p(r | r_spec). 15 | 16 | This is Algorithm 1 in Appendix C of the paper. 
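    Each step proposes r' ~ Normal(r, step_size) and accepts it with
    probability min(1, exp(log_p(r') - log_p(r))), where log_p is the
    unnormalized log posterior (log of the last-step occupancy at s_current
    plus the log prior).

    A minimal usage sketch with the run.py defaults, assuming env, s_current,
    p_0 and r_prior have been set up as in run.py:

        samples = sample_from_posterior(env, s_current, p_0, h=20, temp=1.0,
                                        n_samples=10000, step_size=0.01,
                                        r_prior=r_prior)
        r_inferred = np.mean(samples[1000:], axis=0)  # drop the MCMC burn-in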
17 |     """
18 | 
19 |     def log_last_step_om(policy):
20 |         d_last_step = compute_d_last_step(env, policy, p_0, h)
21 |         return np.log(d_last_step[s_current])
22 | 
23 |     def log_probability(r_vec, verbose=False):
24 |         pi = value_iter(env, gamma, env.f_matrix @ r_vec, h, temp)
25 |         log_p = log_last_step_om(pi)
26 | 
27 |         log_prior = 0
28 |         if r_prior is not None:
29 |             log_prior = np.sum(r_prior.logpdf(r_vec))
30 | 
31 |         if verbose:
32 |             print('Log prior: {}\nLog prob: {}\nTotal: {}'.format(
33 |                 log_prior, log_p, log_p + log_prior))
34 |         return log_p + log_prior
35 | 
36 |     times_accepted = 0
37 |     samples = []
38 | 
39 |     if r_prior is None:
40 |         r = .01*np.random.randn(env.num_features)
41 |     else:
42 |         r = 0.1 * r_prior.rvs()
43 | 
44 |     if print_level >= 1:
45 |         print('Initial reward: {}'.format(r))
46 | 
47 |     # probability of the initial reward
48 |     log_p = log_probability(r, verbose=(print_level >= 1))
49 | 
50 |     while len(samples) < n_samples:
51 |         verbose = (print_level >= 1) and (len(samples) % 200 == 199)
52 |         if verbose:
53 |             print('\nGenerating sample {}'.format(len(samples) + 1))
54 | 
55 |         r_prime = np.random.normal(r, step_size)
56 |         log_p_1 = log_probability(r_prime, verbose=verbose)
57 | 
58 |         # Metropolis-Hastings step: accept or reject the proposal.
59 |         # If we reject, the new sample is the previous sample.
60 |         acceptance_probability = exp(min(0, log_p_1 - log_p))  # min(0, .) caps the ratio at 1 and avoids overflow in exp
61 |         if np.random.uniform() < acceptance_probability:
62 |             times_accepted += 1
63 |             r, log_p = r_prime, log_p_1
64 |         samples.append(r)
65 | 
66 | 
67 |         if verbose:
68 |             # Acceptance probability should not be very high or very low
69 |             print('Acceptance probability is {}'.format(acceptance_probability))
70 | 
71 |     if print_level >= 1:
72 |         print('Done! Acceptance rate: {}'.format(times_accepted/n_samples))
73 |     return samples
74 | 
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 | import contextlib
2 | import numpy as np
3 | from scipy.stats import norm, laplace
4 | 
5 | 
6 | class norm_distr(object):
7 |     def __init__(self, mu, sigma=1):
8 |         self.mu = mu
9 |         self.sigma = sigma
10 |         self.distribution = norm(loc=mu, scale=sigma)
11 | 
12 |     def rvs(self):
13 |         '''sample'''
14 |         return self.distribution.rvs()
15 | 
16 |     def pdf(self, x):
17 |         return self.distribution.pdf(x)
18 | 
19 |     def logpdf(self, x):
20 |         return self.distribution.logpdf(x)
21 | 
22 |     def logdistr_grad(self, x):
23 |         return (self.mu-x)/(self.sigma**2)  # gradient of the Gaussian log pdf with respect to x
24 | 
25 | 
26 | class laplace_distr(object):
27 |     def __init__(self, mu, b=1):
28 |         self.mu = mu
29 |         self.b = b
30 |         self.distribution = laplace(loc=mu, scale=b)
31 | 
32 |     def rvs(self):
33 |         '''sample'''
34 |         return self.distribution.rvs()
35 | 
36 |     def pdf(self, x):
37 |         return self.distribution.pdf(x)
38 | 
39 |     def logpdf(self, x):
40 |         return self.distribution.logpdf(x)
41 | 
42 |     def logdistr_grad(self, x):
43 |         return (self.mu-x)/(np.fabs(x-self.mu)*self.b)  # -sign(x-mu)/b, the gradient of the Laplace log pdf
44 | 
45 | 
46 | @contextlib.contextmanager
47 | def printoptions(*args, **kwargs):
48 |     original = np.get_printoptions()
49 |     np.set_printoptions(*args, **kwargs)
50 |     try:
51 |         yield
52 |     finally:
53 |         np.set_printoptions(**original)
54 | 
--------------------------------------------------------------------------------
/src/value_iter.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | def value_iter(mdp, gamma, r, horizon, temperature=1, threshold=1e-10, time_dependent_reward=False):
5 |     """
6 |     Finds the optimal state and state-action value functions via value
7 |     iteration with the "soft" max-ent Bellman backup:
8 | 
9 |     Q_{sa} = r_s + gamma * \sum_{s'} p(s'|s,a)V_{s'}
10 |     V'_s = temperature * log(\sum_a exp(Q_{sa}/temperature))
11 | 
12 |     Computes the Boltzmann rational policy
13 |     \pi_{s,a} = exp((Q_{s,a} - V_s)/temperature).
14 | 
15 |     Parameters
16 |     ----------
17 |     mdp : object
18 |         Instance of the Env class (see envs/env.py).
19 | 
20 |     gamma : float
21 |         Discount factor; 0<=gamma<=1.
22 |     r : 1D numpy array
23 |         Reward vector with length equal to the number of states in the MDP,
24 |         or a list of horizon such vectors if time_dependent_reward is True.
25 |     horizon : int
26 |         Horizon for the finite horizon version of value iteration.
27 |     temperature : float
28 |         Rationality constant; temperature 0 recovers standard (hard-max) value iteration.
29 |     threshold : float
30 |         Convergence threshold (unused in this finite-horizon implementation).
31 | 
32 |     Returns
33 |     -------
34 |     List of 2D numpy arrays
35 |         The time-dependent Boltzmann rational policy for the reward r:
36 |         a list of length horizon-1 in which each policies[t] has shape
37 |         (mdp.nS, mdp.nA) and policies[t][s,a] is the probability of
38 |         taking action a in state s at time t.
39 |         (The state values V and state-action values Q are computed
40 |         during the backward pass but are not returned; only the
41 |         resulting policy is.)
42 | 
43 |     """
44 |     nS, nA = mdp.nS, mdp.nA
45 |     # Functions for computing the policy
46 |     expt = lambda x: np.exp(x/temperature)
47 |     tlog = lambda x: temperature * np.log(x)
48 | 
49 |     if not time_dependent_reward:
50 |         r = [r] * horizon # Fast, since we aren't making copies
51 | 
52 |     policies = []
53 |     V = np.copy(r[horizon-1])
54 |     for t in range(horizon-2, -1, -1):
55 |         future_values = mdp.T_matrix.dot(V).reshape((nS, nA))
56 |         Q = np.expand_dims(r[t], axis=1) + gamma * future_values
57 | 
58 |         if temperature==0:
59 |             V = Q.max(axis=1)
60 |             # Argmax to find the action number, then index into np.eye to
61 |             # one hot encode. Note this will deterministically break ties
62 |             # towards the smaller action.
63 |             policy = np.eye(nA)[np.argmax(Q, axis=1)]
64 |         else:
65 |             # ∀ s: V_s = temperature * log(\sum_a exp(Q_sa/temperature))
66 |             # ∀ s,a: policy_{s,a} = exp((Q_{s,a} - V_s)/temperature)
67 |             V = softmax(Q, temperature)
68 |             policy = expt(Q - np.expand_dims(V, axis=1))
69 | 
70 | 
71 |         policies.append(policy)
72 | 
73 |         if gamma==1:
74 |             # When \gamma=1, the backup operator is equivariant under adding
75 |             # a constant to all entries of V, so we can translate min(V)
76 |             # to be 0 at each step of the softmax value iteration without
77 |             # changing the policy it converges to, and this fixes the problem
78 |             # where log(nA) keeps getting added at each iteration.
79 |             V = V - np.amin(V)
80 | 
81 |     return policies[::-1]
82 | 
83 | 
84 | def evaluate_policy(mdp, policy, start, gamma, r, horizon):
85 |     """Expected cumulative discounted reward of the time-dependent policy from the start state."""
86 |     V = r
87 |     for t in range(horizon-2, -1, -1):
88 |         future_values = mdp.T_matrix.dot(V).reshape((mdp.nS, mdp.nA))
89 |         Q = np.expand_dims(r, axis=1) + gamma * future_values
90 |         V = np.sum(policy[t] * Q, axis=1)
91 |     return V[start]
92 | 
93 | 
94 | def softmax(x, t=1):
95 |     """
96 |     Numerically stable computation of t*log(\sum_j^n exp(x_j / t))
97 | 
98 |     If the input is a 1D numpy array, computes its soft maximum:
99 |         output = t*log(\sum_j^n exp(x_j / t)).
100 |     If the input is a 2D numpy array, computes the soft maximum of each of the rows:
101 |         output_i = t*log(\sum_j^n exp(x_{ij} / t))
102 | 
103 |     Parameters
104 |     ----------
105 |     x : 1D or 2D numpy array
106 |     t : non-negative float, the temperature; t=0 gives the ordinary (hard) max.
107 | 
108 |     Returns
109 |     -------
110 |     1D numpy array
111 |         shape = (n,), where:
112 |             n = 1 if x was 1D, or
113 |             n is the number of rows (=x.shape[0]) if x was 2D.
114 |     """
115 |     assert t>=0
116 |     if len(x.shape) == 1: x = x.reshape((1,-1))
117 |     if t == 0: return np.amax(x, axis=1)
118 |     if x.shape[1] == 1: return x
119 | 
120 |     def softmax_2_arg(x1,x2, t):
121 |         """
122 |         Numerically stable computation of t*log(exp(x1/t) + exp(x2/t))
123 | 
124 |         Parameters
125 |         ----------
126 |         x1 : numpy array of shape (n,)
127 |         x2 : numpy array of shape (n,)
128 | 
129 |         Returns
130 |         -------
131 |         numpy array of shape (n,)
132 |             Each output_i = t*log(exp(x1_i / t) + exp(x2_i / t))
133 |         """
134 |         tlog = lambda x: t * np.log(x)
135 |         expt = lambda x: np.exp(x/t)
136 | 
137 |         max_x = np.amax((x1,x2),axis=0)
138 |         min_x = np.amin((x1,x2),axis=0)
139 |         return max_x + tlog(1+expt((min_x - max_x)))
140 | 
141 |     sm = softmax_2_arg(x[:,0],x[:,1], t)
142 |     # Use the following property of softmax_2_arg (written here for t=1):
143 |     # softmax_2_arg(softmax_2_arg(x1,x2),x3) = log(exp(x1) + exp(x2) + exp(x3))
144 |     # which is true since
145 |     # log(exp(log(exp(x1) + exp(x2))) + exp(x3)) = log(exp(x1) + exp(x2) + exp(x3))
146 |     for (i, x_i) in enumerate(x.T):
147 |         if i>1: sm = softmax_2_arg(sm, x_i, t)
148 |     return sm
149 | 
--------------------------------------------------------------------------------
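
The usage sketch below is not part of the repository; it is included only to illustrate the interface that `value_iter` and `evaluate_policy` expect. The toy MDP (its sizes, transition matrix, and reward vector) is invented for illustration: as implied by the reshape in `value_iter`, the `mdp` object only needs to expose `nS`, `nA`, and a transition matrix `T_matrix` of shape `(nS * nA, nS)` whose row `s * nA + a` holds `p(s' | s, a)`. The real environments in `src/envs/` provide these attributes (plus the feature matrix `f_matrix` used by `rlsp.py` and `sampling.py`). The sketch assumes `src/` is on the Python path, matching the flat imports used throughout the code.

import numpy as np
from types import SimpleNamespace

from value_iter import value_iter, evaluate_policy

# Toy 3-state, 2-action MDP: action 0 stays put, action 1 moves to the
# next state (cyclically). Row s * nA + a of T_matrix is p(. | s, a).
nS, nA = 3, 2
T = np.zeros((nS * nA, nS))
for s in range(nS):
    T[s * nA + 0, s] = 1.0
    T[s * nA + 1, (s + 1) % nS] = 1.0

mdp = SimpleNamespace(nS=nS, nA=nA, T_matrix=T)
r = np.array([0.0, 0.0, 1.0])  # state 2 is the only rewarding state

# Soft value iteration returns a time-dependent Boltzmann rational policy:
# a list of horizon-1 arrays of shape (nS, nA) whose rows sum to 1.
policies = value_iter(mdp, gamma=1, r=r, horizon=5, temperature=1)
print(len(policies), policies[0].shape)   # 4 (3, 2)

# Expected return of that policy when starting in state 0.
print(evaluate_policy(mdp, policies, start=0, gamma=1, r=r, horizon=5))

Passing temperature=0 instead yields deterministic argmax policies (ties broken toward the smaller action index), per the temperature==0 branch of value_iter above.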