├── .gitignore ├── LICENSE ├── README.md ├── experiments-for-plots.sh ├── experiments.sh ├── poster-preferences-implicit-in-the-state-of-the-world.pdf ├── setup.py └── src ├── __init__.py ├── envs ├── __init__.py ├── apples.py ├── apples_spec.py ├── batteries.py ├── batteries_spec.py ├── env.py ├── room.py ├── room_spec.py ├── tests │ ├── apples_test.py │ ├── batteries_test.py │ ├── env_test.py │ ├── room_test.py │ └── train_test.py ├── train.py └── train_spec.py ├── plotting.py ├── relative_reachability.py ├── rlsp.py ├── run.py ├── sampling.py ├── utils.py └── value_iter.py /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.swp 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Center for Human-Compatible AI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reward Learning by Simulating the Past 2 | 3 | This is the code accompanying the paper "Preferences Implicit in the State of the World". [Paper](https://arxiv.org/abs/1902.04198), [blog post](https://bair.berkeley.edu/blog/2019/02/11/learning_preferences/), [poster](https://github.com/HumanCompatibleAI/rlsp/blob/master/poster-preferences-implicit-in-the-state-of-the-world.pdf). 4 | 5 | Tests can be run with `python setup.py test`. 6 | 7 | Instructions for running the experiments can be found in `experiments.sh`. The script `experiments-for-plots.sh` generates the plots from the paper. 8 | -------------------------------------------------------------------------------- /experiments-for-plots.sh: -------------------------------------------------------------------------------- 1 | # Script to generate the plots in the paper. This script creates a "results" folder and writes the experiment 2 | # outputs into it. The plots are then generated in the "results" folder using the "src/plotting.py" script. 3 | # Running this script takes several (3-6) hours. 4 | 5 | ############### 6 | # Section 5.4 # 7 | ############### 8 | 9 | # Robustness to the choice of Alice's planning horizon T. 10 | mkdir -p results/horizon 11 | 12 | # room env 13 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 1 -o results/horizon -x 20 -d true_reward,final_reward 14 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 2 -o results/horizon -x 20 -d true_reward,final_reward 15 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 3 -o results/horizon -x 20 -d true_reward,final_reward 16 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 5 -o results/horizon -x 20 -d true_reward,final_reward 17 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 10 -o results/horizon -x 20 -d true_reward,final_reward 18 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 20 -o results/horizon -x 20 -d true_reward,final_reward 19 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 30 -o results/horizon -x 20 -d true_reward,final_reward 20 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 50 -o results/horizon -x 20 -d true_reward,final_reward 21 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 100 -o results/horizon -x 20 -d true_reward,final_reward 22 | 23 | # train env 24 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 1 -o results/horizon -x 20 -d true_reward,final_reward 25 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 2 -o results/horizon -x 20 -d true_reward,final_reward 26 | python src/run.py -e train -p default -c additive -i rlsp 
-f True -s 0 -l 0.001 -u True -m 1000 -T 3 -o results/horizon -x 20 -d true_reward,final_reward 27 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 5 -o results/horizon -x 20 -d true_reward,final_reward 28 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 10 -o results/horizon -x 20 -d true_reward,final_reward 29 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 20 -o results/horizon -x 20 -d true_reward,final_reward 30 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 30 -o results/horizon -x 20 -d true_reward,final_reward 31 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 50 -o results/horizon -x 20 -d true_reward,final_reward 32 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 100 -o results/horizon -x 20 -d true_reward,final_reward 33 | 34 | # apples env 35 | python src/run.py -e apples -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 1 -o results/horizon -x 20 -d true_reward,final_reward 36 | python src/run.py -e apples -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 2 -o results/horizon -x 20 -d true_reward,final_reward 37 | python src/run.py -e apples -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 3 -o results/horizon -x 20 -d true_reward,final_reward 38 | python src/run.py -e apples -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 5 -o results/horizon -x 20 -d true_reward,final_reward 39 | python src/run.py -e apples -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 10 -o results/horizon -x 20 -d true_reward,final_reward 40 | python src/run.py -e apples -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 20 -o results/horizon -x 20 -d true_reward,final_reward 41 | python src/run.py -e apples -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 30 -o results/horizon -x 20 -d true_reward,final_reward 42 | python src/run.py -e apples -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 50 -o results/horizon -x 20 -d true_reward,final_reward 43 | python src/run.py -e apples -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 100 -o results/horizon -x 20 -d true_reward,final_reward 44 | 45 | # batteries env 46 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 1 -o results/horizon -x 20 -d true_reward,final_reward 47 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 2 -o results/horizon -x 20 -d true_reward,final_reward 48 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 3 -o results/horizon -x 20 -d true_reward,final_reward 49 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 5 -o results/horizon -x 20 -d true_reward,final_reward 50 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 10 -o results/horizon -x 20 -d true_reward,final_reward 51 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 20 -o results/horizon -x 20 -d true_reward,final_reward 52 | python src/run.py 
-e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 30 -o results/horizon -x 20 -d true_reward,final_reward 53 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 50 -o results/horizon -x 20 -d true_reward,final_reward 54 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T 100 -o results/horizon -x 20 -d true_reward,final_reward 55 | 56 | 57 | ############## 58 | # Appendix D # 59 | ############## 60 | 61 | # Option -c additive stands for the Additive method, and -c bayesian for the Bayesian method 62 | # The -k parameter controls the standard deviation (set to 0.5 by default) 63 | mkdir -p results/additive-vs-bayesian 64 | 65 | # room env additive 66 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 0.05 -o results/additive-vs-bayesian -d true_reward,final_reward 67 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 0.1 -o results/additive-vs-bayesian -d true_reward,final_reward 68 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 0.2 -o results/additive-vs-bayesian -d true_reward,final_reward 69 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 0.3 -o results/additive-vs-bayesian -d true_reward,final_reward 70 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 0.5 -o results/additive-vs-bayesian -d true_reward,final_reward 71 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 1 -o results/additive-vs-bayesian -d true_reward,final_reward 72 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 2 -o results/additive-vs-bayesian -d true_reward,final_reward 73 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 3 -o results/additive-vs-bayesian -d true_reward,final_reward 74 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 5 -o results/additive-vs-bayesian -d true_reward,final_reward 75 | python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 10 -o results/additive-vs-bayesian -d true_reward,final_reward 76 | 77 | # train env additive 78 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 0.05 -o results/additive-vs-bayesian -d true_reward,final_reward 79 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 0.1 -o results/additive-vs-bayesian -d true_reward,final_reward 80 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 0.2 -o results/additive-vs-bayesian -d true_reward,final_reward 81 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 0.3 -o results/additive-vs-bayesian -d true_reward,final_reward 82 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 0.5 -o results/additive-vs-bayesian -d true_reward,final_reward 83 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 1 -o results/additive-vs-bayesian -d true_reward,final_reward 84 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 
8 -k 2 -o results/additive-vs-bayesian -d true_reward,final_reward 85 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 3 -o results/additive-vs-bayesian -d true_reward,final_reward 86 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 5 -o results/additive-vs-bayesian -d true_reward,final_reward 87 | python src/run.py -e train -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 10 -o results/additive-vs-bayesian -d true_reward,final_reward 88 | 89 | # batteries env additive 90 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 0.05 -o results/additive-vs-bayesian -d true_reward,final_reward 91 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 0.1 -o results/additive-vs-bayesian -d true_reward,final_reward 92 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 0.2 -o results/additive-vs-bayesian -d true_reward,final_reward 93 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 0.3 -o results/additive-vs-bayesian -d true_reward,final_reward 94 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 0.5 -o results/additive-vs-bayesian -d true_reward,final_reward 95 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 1 -o results/additive-vs-bayesian -d true_reward,final_reward 96 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 2 -o results/additive-vs-bayesian -d true_reward,final_reward 97 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 3 -o results/additive-vs-bayesian -d true_reward,final_reward 98 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 5 -o results/additive-vs-bayesian -d true_reward,final_reward 99 | python src/run.py -e batteries -p default -c additive -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 10 -o results/additive-vs-bayesian -d true_reward,final_reward 100 | 101 | # room env bayesian 102 | python src/run.py -e room -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 0.05 -o results/additive-vs-bayesian -d true_reward,final_reward 103 | python src/run.py -e room -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 0.1 -o results/additive-vs-bayesian -d true_reward,final_reward 104 | python src/run.py -e room -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 0.2 -o results/additive-vs-bayesian -d true_reward,final_reward 105 | python src/run.py -e room -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 0.3 -o results/additive-vs-bayesian -d true_reward,final_reward 106 | python src/run.py -e room -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 0.5 -o results/additive-vs-bayesian -d true_reward,final_reward 107 | python src/run.py -e room -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 1 -o results/additive-vs-bayesian -d true_reward,final_reward 108 | python src/run.py -e room -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 2 -o results/additive-vs-bayesian -d true_reward,final_reward 109 | python src/run.py -e room -p default -c bayesian -i rlsp -f True -s 0 -l 
0.001 -m 1000 -T 10 -k 3 -o results/additive-vs-bayesian -d true_reward,final_reward 110 | python src/run.py -e room -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 5 -o results/additive-vs-bayesian -d true_reward,final_reward 111 | python src/run.py -e room -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 10 -k 10 -o results/additive-vs-bayesian -d true_reward,final_reward 112 | 113 | # train env bayesian 114 | python src/run.py -e train -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 0.05 -o results/additive-vs-bayesian -d true_reward,final_reward 115 | python src/run.py -e train -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 0.1 -o results/additive-vs-bayesian -d true_reward,final_reward 116 | python src/run.py -e train -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 0.2 -o results/additive-vs-bayesian -d true_reward,final_reward 117 | python src/run.py -e train -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 0.3 -o results/additive-vs-bayesian -d true_reward,final_reward 118 | python src/run.py -e train -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 0.5 -o results/additive-vs-bayesian -d true_reward,final_reward 119 | python src/run.py -e train -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 1 -o results/additive-vs-bayesian -d true_reward,final_reward 120 | python src/run.py -e train -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 2 -o results/additive-vs-bayesian -d true_reward,final_reward 121 | python src/run.py -e train -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 3 -o results/additive-vs-bayesian -d true_reward,final_reward 122 | python src/run.py -e train -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 5 -o results/additive-vs-bayesian -d true_reward,final_reward 123 | python src/run.py -e train -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 8 -k 10 -o results/additive-vs-bayesian -d true_reward,final_reward 124 | 125 | # batteries env bayesian 126 | python src/run.py -e batteries -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 0.05 -o results/additive-vs-bayesian -d true_reward,final_reward 127 | python src/run.py -e batteries -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 0.1 -o results/additive-vs-bayesian -d true_reward,final_reward 128 | python src/run.py -e batteries -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 0.2 -o results/additive-vs-bayesian -d true_reward,final_reward 129 | python src/run.py -e batteries -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 0.3 -o results/additive-vs-bayesian -d true_reward,final_reward 130 | python src/run.py -e batteries -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 0.5 -o results/additive-vs-bayesian -d true_reward,final_reward 131 | python src/run.py -e batteries -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 1 -o results/additive-vs-bayesian -d true_reward,final_reward 132 | python src/run.py -e batteries -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 2 -o results/additive-vs-bayesian -d true_reward,final_reward 133 | python src/run.py -e batteries -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 3 -o results/additive-vs-bayesian -d true_reward,final_reward 134 | python src/run.py -e batteries -p default -c 
bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 5 -o results/additive-vs-bayesian -d true_reward,final_reward 135 | python src/run.py -e batteries -p default -c bayesian -i rlsp -f True -s 0 -l 0.001 -m 1000 -T 11 -k 10 -o results/additive-vs-bayesian -d true_reward,final_reward 136 | 137 | 138 | ###################### 139 | # Generate the plots # 140 | ###################### 141 | python src/plotting.py 142 | -------------------------------------------------------------------------------- /experiments.sh: -------------------------------------------------------------------------------- 1 | # Commands for the experiments in the paper. These will write to stdout, and are meant to be run individually. 2 | # Most experiments should run in seconds, though some can take minutes (especially with the sampling algorithm). 3 | 4 | ############### 5 | # Section 5.2 # 6 | ############### 7 | 8 | # Comparison to baselines (Table 1 and Figure 2) 9 | 10 | # Room: Specified reward, deviation, reachability, RLSP 11 | python src/run.py -e room -p default -c additive -i spec -d true_reward,final_reward -T 7 -x 20 12 | python src/run.py -e room -p default -c additive -i deviation -d true_reward,final_reward -T 7 -x 20 -w 0.5 13 | python src/run.py -e room -p default -c additive -i reachability -d true_reward,final_reward -T 7 -x 20 14 | python src/run.py -e room -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 7 -x 20 15 | 16 | # Train: 17 | python src/run.py -e train -p default -c additive -i spec -d true_reward,final_reward -T 8 -x 20 18 | python src/run.py -e train -p default -c additive -i deviation -d true_reward,final_reward -T 8 -x 20 -w 0.5 19 | python src/run.py -e train -p default -c additive -i reachability -d true_reward,final_reward -T 8 -x 20 20 | python src/run.py -e train -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 8 -x 20 21 | 22 | # Apples: 23 | python src/run.py -e apples -p default -c additive -i spec -d true_reward,final_reward -T 11 -x 20 24 | python src/run.py -e apples -p default -c additive -i deviation -d true_reward,final_reward -T 11 -x 20 -w 0.5 25 | python src/run.py -e apples -p default -c additive -i reachability -d true_reward,final_reward -T 11 -x 20 26 | python src/run.py -e apples -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 11 -x 20 27 | 28 | # Batteries, easy: 29 | python src/run.py -e batteries -p easy -c additive -i spec -d true_reward,final_reward -T 11 -x 20 30 | python src/run.py -e batteries -p easy -c additive -i deviation -d true_reward,final_reward -T 11 -x 20 -w 0.5 31 | python src/run.py -e batteries -p easy -c additive -i reachability -d true_reward,final_reward -T 11 -x 20 32 | python src/run.py -e batteries -p easy -c additive -i rlsp -d true_reward,final_reward -s 0 -T 11 -x 20 33 | 34 | # Batteries, hard: 35 | python src/run.py -e batteries -p default -c additive -i spec -d true_reward,final_reward -T 11 -x 20 36 | python src/run.py -e batteries -p default -c additive -i deviation -d true_reward,final_reward -T 11 -x 20 -w 0.5 37 | python src/run.py -e batteries -p default -c additive -i reachability -d true_reward,final_reward -T 11 -x 20 38 | python src/run.py -e batteries -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 11 -x 20 39 | 40 | # Far away vase: 41 | python src/run.py -e room -p bad -c additive -i spec -d true_reward,final_reward -T 5 -x 20 42 | python src/run.py -e room -p bad -c additive -i deviation -d true_reward,final_reward -T 5 -x 20 -w 0.5 43 | 
python src/run.py -e room -p bad -c additive -i reachability -d true_reward,final_reward -T 5 -x 20 44 | python src/run.py -e room -p bad -c additive -i rlsp -d true_reward,final_reward -s 0 -T 5 -x 20 45 | 46 | ############### 47 | # Section 5.3 # 48 | ############### 49 | 50 | # Comparison between knowing s_{-T} and using a uniform distribution over s_{-T} 51 | # The commands for the case where s_{-T} is known are the same as in Section 5.2; for the uniform distribution over s_{-T} we simply add -u True 52 | 53 | python src/run.py -e room -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 7 -x 20 54 | python src/run.py -e room -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 7 -x 20 -u True 55 | python src/run.py -e train -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 8 -x 20 56 | python src/run.py -e train -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 8 -x 20 -u True 57 | python src/run.py -e apples -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 11 -x 20 58 | python src/run.py -e apples -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 11 -x 20 -u True 59 | python src/run.py -e batteries -p easy -c additive -i rlsp -d true_reward,final_reward -s 0 -T 11 -x 20 60 | python src/run.py -e batteries -p easy -c additive -i rlsp -d true_reward,final_reward -s 0 -T 11 -x 20 -u True 61 | python src/run.py -e batteries -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 11 -x 20 62 | python src/run.py -e batteries -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 11 -x 20 -u True 63 | python src/run.py -e room -p bad -c additive -i rlsp -d true_reward,final_reward -s 0 -T 5 -x 20 64 | python src/run.py -e room -p bad -c additive -i rlsp -d true_reward,final_reward -s 0 -T 5 -x 20 -u True 65 | 66 | ############### 67 | # Section 5.4 # 68 | ############### 69 | 70 | # Robustness to the choice of Alice's planning horizon T. 71 | # Simply take the RLSP commands from before and try different values of T, for example: 72 | python src/run.py -e room -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 20 -x 20 73 | python src/run.py -e apples -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 20 -x 20 74 | 75 | # It is also possible to run with multiple values of T and collect the results in an output file; see src/run.py for details. For example:
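# The loop below is an illustrative sketch only (not part of the original script): it reuses the Section 5.4 horizon commands from experiments-for-plots.sh to sweep over T and collect the outputs under results/horizon. The exact flags and the output behaviour of src/run.py with -o are assumed to match that script.
mkdir -p results/horizon; for T in 1 2 3 5 10 20 30 50 100; do python src/run.py -e room -p default -c additive -i rlsp -f True -s 0 -l 0.001 -u True -m 1000 -T $T -o results/horizon -x 20 -d true_reward,final_reward; done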
76 | 77 | ############## 78 | # Appendix C # 79 | ############## 80 | 81 | # MCMC sampling 82 | # Simply replace -i rlsp with -i sampling: 83 | python src/run.py -e room -p default -c additive -i sampling -d true_reward,final_reward -s 0,1,2,3,4 -T 7 -x 20 84 | python src/run.py -e train -p default -c additive -i sampling -d true_reward,final_reward -s 0,1,2,3,4 -T 8 -x 20 85 | python src/run.py -e apples -p default -c additive -i sampling -d true_reward,final_reward -s 0,1,2,3,4 -T 11 -x 20 86 | python src/run.py -e batteries -p easy -c additive -i sampling -d true_reward,final_reward -s 0,1,2,3,4 -T 11 -x 20 87 | python src/run.py -e batteries -p default -c additive -i sampling -d true_reward,final_reward -s 0,1,2,3,4 -T 11 -x 20 88 | python src/run.py -e room -p bad -c additive -i sampling -d true_reward,final_reward -s 0,1,2,3,4 -T 5 -x 20 89 | 90 | ############## 91 | # Appendix D # 92 | ############## 93 | 94 | # Use -c additive for the Additive method, and -c bayesian for the Bayesian method 95 | # Use the -k parameter to control the standard deviation (set to 0.5 by default) 96 | # Note that since the Apples environment has no specified reward, the -c option has no effect on it. 97 | python src/run.py -e room -p default -c additive -i rlsp -d true_reward,final_reward -s 0 -T 7 -x 20 -k 1 98 | python src/run.py -e room -p default -c bayesian -i rlsp -d true_reward,final_reward -s 0 -T 7 -x 20 -k 1 99 | -------------------------------------------------------------------------------- /poster-preferences-implicit-in-the-state-of-the-world.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCompatibleAI/rlsp/cacae643752a02b2be092870df2ce3de8d674144/poster-preferences-implicit-in-the-state-of-the-world.pdf -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name='rlsp', 5 | version=1.0, 6 | description='Reward Learning by Simulating the Past', 7 | author='Rohin Shah, Dmitrii Krasheninnikov, Jordan Alexander, et al', 8 | author_email='rohinmshah@berkeley.edu', 9 | python_requires='>=3.6.0', 10 | url='https://github.com/HumanCompatibleAI/rlsp', 11 | packages=find_packages('src'), 12 | package_dir={'': 'src'}, 13 | install_requires=[ 14 | 'numpy>=1.13', 15 | 'scipy>=0.19', 16 | ], 17 | test_suite='nose.collector', 18 | tests_require=['nose', 'nose-cover3'], 19 | include_package_data=True, 20 | license='MIT', 21 | classifiers=[ 22 | # Trove classifiers 23 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 24 | 'License :: OSI Approved :: MIT License', 25 | 'Programming Language :: Python', 26 | 'Programming Language :: Python :: 3', 27 | 'Programming Language :: Python :: 3.6', 28 | ], 29 | ) 30 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCompatibleAI/rlsp/cacae643752a02b2be092870df2ce3de8d674144/src/__init__.py -------------------------------------------------------------------------------- /src/envs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCompatibleAI/rlsp/cacae643752a02b2be092870df2ce3de8d674144/src/envs/__init__.py 
-------------------------------------------------------------------------------- /src/envs/apples.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from copy import copy, deepcopy 3 | from itertools import product 4 | 5 | from envs.env import Env, Direction 6 | 7 | 8 | class ApplesState(object): 9 | ''' 10 | state of the environment; describes positions of all objects in the env. 11 | ''' 12 | def __init__(self, agent_pos, tree_states, bucket_states, carrying_apple): 13 | """ 14 | agent_pos: (orientation, x, y) tuple for the agent's location 15 | tree_states: Dictionary mapping (x, y) tuples to booleans, True if the tree currently has an apple. 16 | bucket_states: Dictionary mapping (x, y) tuples to integers, the number of apples in the bucket. 17 | carrying_apple: Boolean, True if carrying an apple, False otherwise. 18 | """ 19 | self.agent_pos = agent_pos 20 | self.tree_states = tree_states 21 | self.bucket_states = bucket_states 22 | self.carrying_apple = carrying_apple 23 | 24 | def __eq__(self, other): 25 | return isinstance(other, ApplesState) and \ 26 | self.agent_pos == other.agent_pos and \ 27 | self.tree_states == other.tree_states and \ 28 | self.bucket_states == other.bucket_states and \ 29 | self.carrying_apple == other.carrying_apple 30 | 31 | def __hash__(self): 32 | def get_vals(dictionary): 33 | return tuple([dictionary[loc] for loc in sorted(dictionary.keys())]) 34 | return hash(self.agent_pos + get_vals(self.tree_states) + get_vals(self.bucket_states) + (self.carrying_apple,)) 35 | 36 | 37 | class ApplesEnv(Env): 38 | def __init__(self, spec, compute_transitions=True): 39 | """ 40 | height: Integer, height of the grid. Y coordinates are in [0, height). 41 | width: Integer, width of the grid. X coordinates are in [0, width). 42 | init_state: ApplesState, initial state of the environment 43 | apple_regen_probability: Float, probability that a picked tree regrows its apple on a given timestep 44 | bucket_capacity: Integer, maximum number of apples a bucket can hold 45 | tree_locations: List of (x, y) tuples, locations of trees 46 | bucket_locations: List of (x, y) tuples, locations of buckets 47 | s: ApplesState, Current state 48 | nA: Integer, number of actions 49 | """ 50 | self.height = spec.height 51 | self.width = spec.width 52 | self.apple_regen_probability = spec.apple_regen_probability 53 | self.bucket_capacity = spec.bucket_capacity 54 | self.init_state = deepcopy(spec.init_state) 55 | self.include_location_features = spec.include_location_features 56 | 57 | self.tree_locations = list(self.init_state.tree_states.keys()) 58 | self.bucket_locations = list(self.init_state.bucket_states.keys()) 59 | used_locations = set(self.tree_locations + self.bucket_locations) 60 | self.possible_agent_locations = list(filter( 61 | lambda pos: pos not in used_locations, 62 | product(range(self.width), range(self.height)))) 63 | 64 | self.num_trees = len(self.tree_locations) 65 | self.num_buckets = len(self.bucket_locations) 66 | 67 | self.default_action = Direction.get_number_from_direction(Direction.STAY) 68 | self.nA = 6 69 | self.num_features = len(self.s_to_f(self.init_state)) 70 | 71 | self.reset() 72 | 73 | if compute_transitions: 74 | states = self.enumerate_states() 75 | self.make_transition_matrices( 76 | states, range(self.nA), self.nS, self.nA) 77 | self.make_f_matrix(self.nS, self.num_features) 78 | 79 | 80 | def enumerate_states(self): 81 | all_agent_positions = filter( 82 | lambda pos: (pos[1], pos[2]) in self.possible_agent_locations, 83 | product(range(4), range(self.width), range(self.height))) 84 | all_tree_states = map( 85 | lambda tree_vals: 
dict(zip(self.tree_locations, tree_vals)), 86 | product([True, False], repeat=self.num_trees)) 87 | all_bucket_states = map( 88 | lambda bucket_vals: dict(zip(self.bucket_locations, bucket_vals)), 89 | product(range(self.bucket_capacity + 1), repeat=self.num_buckets)) 90 | all_states = map( 91 | lambda x: ApplesState(*x), 92 | product(all_agent_positions, all_tree_states, all_bucket_states, [True, False])) 93 | 94 | state_num = {} 95 | for state in all_states: 96 | if state not in state_num: 97 | state_num[state] = len(state_num) 98 | 99 | self.state_num = state_num 100 | self.num_state = {v: k for k, v in self.state_num.items()} 101 | self.nS = len(state_num) 102 | 103 | return state_num.keys() 104 | 105 | def get_num_from_state(self, state): 106 | return self.state_num[state] 107 | 108 | def get_state_from_num(self, num): 109 | return self.num_state[num] 110 | 111 | 112 | def s_to_f(self, s): 113 | ''' 114 | Returns features of the state: 115 | - Number of apples in buckets 116 | - Number of apples on trees 117 | - Whether the agent is carrying an apple 118 | - For each other location, whether the agent is on that location 119 | ''' 120 | num_bucket_apples = sum(s.bucket_states.values()) 121 | num_tree_apples = sum(map(int, s.tree_states.values())) 122 | carrying_apple = int(s.carrying_apple) 123 | agent_pos = s.agent_pos[1], s.agent_pos[2] # Drop orientation 124 | features = [num_bucket_apples, num_tree_apples, carrying_apple] 125 | if self.include_location_features: 126 | features = features + [int(agent_pos == pos) for pos in self.possible_agent_locations] 127 | return np.array(features) 128 | 129 | 130 | def get_next_states(self, state, action): 131 | '''returns the next state given a state and an action''' 132 | action = int(action) 133 | orientation, x, y = state.agent_pos 134 | new_orientation, new_x, new_y = state.agent_pos 135 | new_tree_states = deepcopy(state.tree_states) 136 | new_bucket_states = deepcopy(state.bucket_states) 137 | new_carrying_apple = state.carrying_apple 138 | 139 | if action == Direction.get_number_from_direction(Direction.STAY): 140 | pass 141 | elif action < len(Direction.ALL_DIRECTIONS): 142 | new_orientation = action 143 | move_x, move_y = Direction.move_in_direction_number((x, y), action) 144 | # New position is legal 145 | if (0 <= move_x < self.width and \ 146 | 0 <= move_y < self.height and \ 147 | (move_x, move_y) in self.possible_agent_locations): 148 | new_x, new_y = move_x, move_y 149 | else: 150 | # Move only changes orientation, which we already handled 151 | pass 152 | elif action == 5: 153 | obj_pos = Direction.move_in_direction_number((x, y), orientation) 154 | if state.carrying_apple: 155 | # We always drop the apple 156 | new_carrying_apple = False 157 | # If we're facing a bucket, it goes there 158 | if obj_pos in new_bucket_states: 159 | prev_apples = new_bucket_states[obj_pos] 160 | new_bucket_states[obj_pos] = min(prev_apples + 1, self.bucket_capacity) 161 | elif obj_pos in new_tree_states and new_tree_states[obj_pos]: 162 | new_carrying_apple = True 163 | new_tree_states[obj_pos] = False 164 | else: 165 | # Interact while holding nothing and not facing a tree. 
166 | pass 167 | else: 168 | raise ValueError('Invalid action {}'.format(action)) 169 | 170 | new_pos = new_orientation, new_x, new_y 171 | 172 | def make_state(prob_apples_tuple): 173 | prob, tree_apples = prob_apples_tuple 174 | trees = dict(zip(self.tree_locations, tree_apples)) 175 | s = ApplesState(new_pos, trees, new_bucket_states, new_carrying_apple) 176 | return (prob, s, 0) 177 | 178 | # For apple regeneration, don't regenerate apples that were just picked, 179 | # so use the apple booleans from the original state 180 | old_tree_apples = [state.tree_states[loc] for loc in self.tree_locations] 181 | new_tree_apples = [new_tree_states[loc] for loc in self.tree_locations] 182 | return list(map(make_state, self.regen_apples(old_tree_apples, new_tree_apples))) 183 | 184 | def regen_apples(self, old_tree_apples, new_tree_apples): 185 | if len(old_tree_apples) == 0: 186 | yield (1, []) 187 | return 188 | for prob, apples in self.regen_apples(old_tree_apples[1:], new_tree_apples[1:]): 189 | if old_tree_apples[0]: 190 | yield prob, [new_tree_apples[0]] + apples 191 | else: 192 | yield prob * self.apple_regen_probability, [True] + apples 193 | yield prob * (1 - self.apple_regen_probability), [False] + apples 194 | 195 | 196 | def print_state(self, state): 197 | '''Renders the state.''' 198 | h, w = self.height, self.width 199 | canvas = np.zeros(tuple([2*h-1, 2*w+1]), dtype='int8') 200 | 201 | # cell borders 202 | for y in range(1, canvas.shape[0], 2): 203 | canvas[y, :] = 1 204 | for x in range(0, canvas.shape[1], 2): 205 | canvas[:, x] = 2 206 | 207 | # trees 208 | for (x, y), has_apple in state.tree_states.items(): 209 | canvas[2*y, 2*x+1] = 3 if has_apple else 4 210 | 211 | for x, y in self.bucket_locations: 212 | canvas[2*y, 2*x+1] = 5 213 | 214 | # agent 215 | orientation, x, y = state.agent_pos 216 | canvas[2*y, 2*x+1] = 6 217 | 218 | black_color = '\x1b[0m' 219 | purple_background_color = '\x1b[0;35;85m' 220 | 221 | for line in canvas: 222 | for char_num in line: 223 | if char_num==0: 224 | print('\u2003', end='') 225 | elif char_num==1: 226 | print('─', end='') 227 | elif char_num==2: 228 | print('│', end='') 229 | elif char_num==3: 230 | print('\x1b[0;32;85m█'+black_color , end='') 231 | elif char_num==4: 232 | print('\033[91m█'+black_color, end='') 233 | elif char_num==5: 234 | print('\033[93m█'+black_color, end='') 235 | elif char_num==6: 236 | orientation_char = self.get_orientation_char(orientation) 237 | agent_color = '\x1b[1;42;42m' if state.carrying_apple else '\x1b[0m' 238 | print(agent_color+orientation_char+black_color, end='') 239 | print('') 240 | 241 | def get_orientation_char(self, orientation): 242 | DIRECTION_TO_CHAR = { 243 | Direction.NORTH: '↑', 244 | Direction.SOUTH: '↓', 245 | Direction.WEST: '←', 246 | Direction.EAST: '→', 247 | Direction.STAY: '*' 248 | } 249 | direction = Direction.get_direction_from_number(orientation) 250 | return DIRECTION_TO_CHAR[direction] 251 | -------------------------------------------------------------------------------- /src/envs/apples_spec.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from envs.apples import ApplesState 3 | from envs.env import Direction 4 | 5 | class ApplesSpec(object): 6 | def __init__(self, height, width, init_state, apple_regen_probability, 7 | bucket_capacity, include_location_features): 8 | """See ApplesEnv.__init__ in apples.py for details.""" 9 | self.height = height 10 | self.width = width 11 | self.init_state = init_state 12 | 
self.apple_regen_probability = apple_regen_probability 13 | self.bucket_capacity = bucket_capacity 14 | self.include_location_features = include_location_features 15 | 16 | 17 | # In the diagrams below, T is a tree, B is a bucket, A is the 18 | # agent. Each tuple is of the form (spec, current state, task R, true R). 19 | 20 | APPLES_PROBLEMS = { 21 | # ----- 22 | # |T T| 23 | # | | 24 | # | B | 25 | # | | 26 | # |A T| 27 | # ----- 28 | # After 11 actions (riuiruuildi), it looks like this: 29 | # ----- 30 | # |T T| 31 | # | A | 32 | # | B | 33 | # | | 34 | # | T| 35 | # ----- 36 | # Where the agent has picked the right trees once and put the fruit in the 37 | # basket. 38 | 'default': ( 39 | ApplesSpec(5, 3, 40 | ApplesState(agent_pos=(0, 0, 2), 41 | tree_states={(0, 0): True, (2, 0): True, (2, 4): True}, 42 | bucket_states={(1, 2): 0}, 43 | carrying_apple=False), 44 | apple_regen_probability = 0.1, 45 | bucket_capacity=10, 46 | include_location_features=True), 47 | ApplesState(agent_pos=(Direction.get_number_from_direction(Direction.SOUTH), 48 | 1, 1), 49 | tree_states={(0, 0): True, (2, 0): False, (2, 4): True}, 50 | bucket_states={(1, 2): 2}, 51 | carrying_apple=False), 52 | np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 53 | np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) 54 | ) 55 | } 56 | -------------------------------------------------------------------------------- /src/envs/batteries.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from copy import copy, deepcopy 3 | from itertools import product 4 | 5 | from envs.env import DeterministicEnv, Direction 6 | 7 | 8 | class BatteriesState(object): 9 | ''' 10 | state of the environment; describes positions of all objects in the env. 11 | ''' 12 | def __init__(self, agent_pos, train_pos, train_life, battery_present, carrying_battery): 13 | """ 14 | agent_pos: (x, y) tuple for the agent's location 15 | train_pos: (x, y) tuple for the train's location; train_life: Integer, timesteps left before the train stops unless it is recharged with a battery 16 | battery_present: Dictionary mapping (x, y) tuples to booleans, True if a battery is at that location; carrying_battery: Boolean, True if the agent is carrying a battery 17 | """ 18 | self.agent_pos = agent_pos 19 | self.train_pos = train_pos 20 | self.train_life = train_life 21 | self.battery_present = battery_present 22 | self.carrying_battery = carrying_battery 23 | 24 | def is_valid(self): 25 | pos = self.agent_pos 26 | # Can't be standing on a battery and not carrying a battery 27 | if pos in self.battery_present and self.battery_present[pos] and not self.carrying_battery: 28 | return False 29 | return True 30 | 31 | def __eq__(self, other): 32 | return isinstance(other, BatteriesState) and \ 33 | self.agent_pos == other.agent_pos and \ 34 | self.train_pos == other.train_pos and \ 35 | self.train_life == other.train_life and \ 36 | self.battery_present == other.battery_present and \ 37 | self.carrying_battery == other.carrying_battery 38 | 39 | def __hash__(self): 40 | def get_vals(dictionary): 41 | return tuple([dictionary[loc] for loc in sorted(dictionary.keys())]) 42 | return hash(self.agent_pos + self.train_pos + (self.train_life,) + get_vals(self.battery_present) + (self.carrying_battery,)) 43 | 44 | 45 | class BatteriesEnv(DeterministicEnv): 46 | def __init__(self, spec, compute_transitions=True): 47 | """ 48 | height: Integer, height of the grid. Y coordinates are in [0, height). 49 | width: Integer, width of the grid. X coordinates are in [0, width). 
50 | init_state: BatteriesState, initial state of the environment 51 | battery_locations: List of (x, y) tuples, locations of batteries 52 | num_batteries: Integer, number of batteries 53 | train_transition: Dictionary mapping (x, y) tuples to (x, y) tuples, giving the train's cyclic route 54 | feature_locations: List of (x, y) tuples, locations of features 55 | s: BatteriesState, Current state 56 | nA: Integer, number of actions 57 | """ 58 | self.height = spec.height 59 | self.width = spec.width 60 | self.init_state = deepcopy(spec.init_state) 61 | self.battery_locations = sorted(list(self.init_state.battery_present.keys())) 62 | self.num_batteries = len(self.battery_locations) 63 | self.feature_locations = list(spec.feature_locations) 64 | self.train_transition = spec.train_transition 65 | self.train_locations = list(self.train_transition.keys()) 66 | assert set(self.train_locations) == set(self.train_transition.values()) 67 | 68 | self.default_action = Direction.get_number_from_direction(Direction.STAY) 69 | self.nA = 5 70 | self.num_features = len(self.s_to_f(self.init_state)) 71 | 72 | self.reset() 73 | 74 | if compute_transitions: 75 | states = self.enumerate_states() 76 | self.make_transition_matrices( 77 | states, range(self.nA), self.nS, self.nA) 78 | self.make_f_matrix(self.nS, self.num_features) 79 | 80 | 81 | def enumerate_states(self): 82 | state_num = {} 83 | all_agent_positions = product(range(self.width), range(self.height)) 84 | all_battery_states = map( 85 | lambda battery_vals: dict(zip(self.battery_locations, battery_vals)), 86 | product([True, False], repeat=self.num_batteries)) 87 | all_states = map( 88 | lambda x: BatteriesState(*x), 89 | product(all_agent_positions, self.train_locations, range(10), all_battery_states, [True, False])) 90 | all_states = filter(lambda state: state.is_valid(), all_states) 91 | 92 | state_num = {} 93 | for state in all_states: 94 | if state not in state_num: 95 | state_num[state] = len(state_num) 96 | 97 | self.state_num = state_num 98 | self.num_state = {v: k for k, v in self.state_num.items()} 99 | self.nS = len(state_num) 100 | 101 | return state_num.keys() 102 | 103 | def get_num_from_state(self, state): 104 | return self.state_num[state] 105 | 106 | def get_state_from_num(self, num): 107 | return self.num_state[num] 108 | 109 | 110 | def s_to_f(self, s): 111 | ''' 112 | Returns features of the state: 113 | - Number of batteries still present in the environment 114 | - Whether the train has run out of power (train_life == 0) 115 | - For each train location, whether the train is at that location 116 | - For each feature location, whether the agent is on that location 117 | ''' 118 | num_batteries = list(s.battery_present.values()).count(True) 119 | train_dead_feature = int(s.train_life == 0) 120 | train_pos_features = [int(s.train_pos == pos) for pos in self.train_locations] 121 | loc_features = [int(s.agent_pos == fpos) for fpos in self.feature_locations] 122 | features = train_pos_features + loc_features 123 | features = [num_batteries, train_dead_feature] + features 124 | return np.array(features) 125 | 126 | 127 | def get_next_state(self, state, action): 128 | '''returns the next state given a state and an action''' 129 | action = int(action) 130 | new_x, new_y = Direction.move_in_direction_number(state.agent_pos, action) 131 | # New position is still in bounds: 132 | if not (0 <= new_x < self.width and 0 <= new_y < self.height): 133 | new_x, new_y = state.agent_pos 134 | new_agent_pos = new_x, new_y 135 | 136 | new_train_pos, new_train_life = state.train_pos, state.train_life 137 | new_battery_present = deepcopy(state.battery_present) 138 
| new_carrying_battery = state.carrying_battery 139 | if new_agent_pos == state.train_pos and state.carrying_battery: 140 | new_train_life = 10 141 | new_carrying_battery = False 142 | 143 | if new_train_life > 0: 144 | new_train_pos = self.train_transition[state.train_pos] 145 | new_train_life -= 1 146 | 147 | if new_agent_pos in state.battery_present and state.battery_present[new_agent_pos] and not state.carrying_battery: 148 | new_carrying_battery = True 149 | new_battery_present[new_agent_pos] = False 150 | 151 | result = BatteriesState(new_agent_pos, new_train_pos, new_train_life, new_battery_present, new_carrying_battery) 152 | return result 153 | 154 | 155 | def print_state(self, state): 156 | '''Renders the state.''' 157 | h, w = self.height, self.width 158 | grid = [[' '] * w for _ in range(h)] 159 | x, y = state.agent_pos 160 | grid[y][x] = 'A' 161 | x, y = state.train_pos 162 | grid[y][x] = 'T' 163 | for (x, y), val in state.battery_present.items(): 164 | if val: 165 | grid[y][x] = 'B' 166 | print('\n'.join(['|'.join(row) for row in grid])) 167 | 168 | print('carrying_battery: ', state.carrying_battery) 169 | -------------------------------------------------------------------------------- /src/envs/batteries_spec.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from envs.batteries import BatteriesState 3 | 4 | class BatteriesSpec(object): 5 | def __init__(self, height, width, init_state, feature_locations, train_transition): 6 | """See BatteriesEnv.__init__ in batteries.py for details.""" 7 | self.height = height 8 | self.width = width 9 | self.init_state = init_state 10 | self.feature_locations = feature_locations 11 | self.train_transition = train_transition 12 | 13 | 14 | def get_problem(version): 15 | # In the diagram below, G is a goal location, B is a battery, A is the 16 | # agent, and T is the train. 17 | # Each tuple is of the form (spec, current state, task R, true R). 
18 | # ------- 19 | # |B G | 20 | # | TT | 21 | # | TTG| 22 | # | | 23 | # |A B| 24 | # ------- 25 | spec = BatteriesSpec( 26 | 5, 5, 27 | BatteriesState((0, 4), (2, 1), 8, 28 | {(0, 0): True, (4, 4): True}, 29 | False), 30 | [(2, 0), (4, 2)], 31 | { 32 | (2, 1): (3, 1), 33 | (3, 1): (3, 2), 34 | (3, 2): (2, 2), 35 | (2, 2): (2, 1) 36 | }) 37 | final_state = BatteriesState((2, 0), (3, 2), 8, 38 | {(0, 0): False, (4, 4): True}, 39 | False) 40 | train_weight = -1 if version == 'easy' else 0 41 | task_reward = np.array([0, train_weight, 0, 0, 0, 0, 0, 1]) 42 | true_reward = np.array([0, -1, 0, 0, 0, 0, 0, 1]) 43 | return (spec, final_state, task_reward, true_reward) 44 | 45 | 46 | BATTERIES_PROBLEMS = { 47 | 'default': get_problem('default'), 48 | 'easy': get_problem('easy') 49 | } 50 | -------------------------------------------------------------------------------- /src/envs/env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import defaultdict 3 | from copy import deepcopy 4 | from scipy.sparse import lil_matrix 5 | 6 | 7 | class Env(object): 8 | def __init__(self): 9 | raise ValueError('Cannot instantiate abstract class Env') 10 | 11 | def is_deterministic(self): 12 | return False 13 | 14 | def get_initial_state_distribution(self, known_initial_state=True): 15 | if known_initial_state: 16 | p_0 = np.zeros(self.nS) 17 | p_0[self.get_num_from_state(self.init_state)] = 1 18 | else: 19 | p_0 = np.ones(self.nS) / self.nS 20 | return p_0 21 | 22 | def make_transition_matrices(self, states_iter, actions_iter, nS, nA): 23 | """ 24 | states_iter: ITERATOR of states (i.e. can only be used once) 25 | actions_iter: ITERATOR of actions (i.e. can only be used once) 26 | """ 27 | P = {} 28 | T_matrix = lil_matrix((nS * nA, nS)) 29 | baseline_matrix = lil_matrix((nS, nS)) 30 | actions = list(actions_iter) 31 | for state in states_iter: 32 | state_id = self.get_num_from_state(state) 33 | P[state_id] = {} 34 | for action in actions: 35 | next_s = self.get_next_states(state, action) 36 | next_s = [(p, self.get_num_from_state(s), r) for p, s, r in next_s] 37 | P[state_id][action] = next_s 38 | state_action_index = state_id * nA + action 39 | for prob, next_state_id, _ in next_s: 40 | T_matrix[state_action_index, next_state_id] = prob 41 | if action == self.default_action: 42 | baseline_matrix[state_id, next_state_id] = prob 43 | self.P = P 44 | self.T_matrix = T_matrix.tocsr() 45 | self.T_matrix_transpose = T_matrix.transpose().tocsr() 46 | self.baseline_matrix_transpose = baseline_matrix.transpose().tocsr() 47 | 48 | 49 | def make_f_matrix(self, nS, num_features): 50 | self.f_matrix = np.zeros((nS, num_features)) 51 | for state_id in self.P.keys(): 52 | state = self.get_state_from_num(state_id) 53 | self.f_matrix[state_id, :] = self.s_to_f(state) 54 | 55 | 56 | def reset(self, state=None): 57 | if state is None: state = self.init_state 58 | self.timestep = 0 59 | self.s = deepcopy(state) 60 | 61 | def state_step(self, action, state=None): 62 | if state == None: state = self.s 63 | next_states = self.get_next_states(state, action) 64 | probabilities = [p for p, _, _ in next_states] 65 | idx = np.random.choice(np.arange(len(next_states)), p=probabilities) 66 | return next_states[idx][1] 67 | 68 | def step(self, action, r_vec=None): 69 | """ 70 | given an action, takes a step from self.s, updates self.s and returns: 71 | - the observation (features of the next state) 72 | - the associated reward 73 | - done, the indicator of 
completed episode 74 | - info 75 | """ 76 | self.s = self.state_step(action) 77 | self.timestep+=1 78 | 79 | obs = self.s_to_f(self.s) 80 | reward = 0 if r_vec is None else np.array(obs.T @ r_vec) 81 | done = False 82 | info = defaultdict(lambda : '') 83 | return np.array(obs, dtype='float32'), reward, np.array(done, dtype='bool'), info 84 | 85 | 86 | class DeterministicEnv(Env): 87 | def __init__(self): 88 | raise ValueError('Cannot instantiate abstract class DeterministicEnv') 89 | 90 | def is_deterministic(self): 91 | return True 92 | 93 | def make_transition_matrices(self, states_iter, actions_iter, nS, nA): 94 | """ 95 | states_iter: ITERATOR of states (i.e. can only be used once) 96 | actions_iter: ITERATOR of actions (i.e. can only be used once) 97 | nS: Number of states 98 | nA: Number of actions 99 | """ 100 | Env.make_transition_matrices(self, states_iter, actions_iter, nS, nA) 101 | self._make_deterministic_transition_matrix(nS, nA) 102 | self._make_deterministic_transition_transpose_matrix(nS, nA) 103 | 104 | 105 | def get_next_states(self, state, action): 106 | return [(1, self.get_next_state(state, action), 0)] 107 | 108 | def state_step(self, action, state=None): 109 | if state == None: state = self.s 110 | return self.get_next_state(state, action) 111 | 112 | def _make_deterministic_transition_matrix(self, nS, nA): 113 | """Create self.deterministic_T, a matrix with index S,A -> S' """ 114 | self.deterministic_T = np.zeros((nS, nA), dtype='int32') 115 | for s in range(nS): 116 | for a in range(nA): 117 | self.deterministic_T[s,a]=self.P[s][a][0][1] 118 | 119 | def _make_deterministic_transition_transpose_matrix(self, nS, nA): 120 | """Create self.deterministic_transpose, a matrix with index S,A -> S', with the inverse dynamics """ 121 | self.deterministic_transpose = np.zeros((nS, nA), dtype='int32') 122 | for s in range(nS): 123 | for a in range(nA): 124 | self.deterministic_transpose[self.P[s][a][0][1],a]=s 125 | 126 | 127 | class Direction(object): 128 | """A class that contains the five actions available in Gridworlds. 129 | 130 | Includes definitions of the actions as well as utility functions for 131 | manipulating them or applying them. 132 | """ 133 | NORTH = (0, -1) 134 | SOUTH = (0, 1) 135 | EAST = (1, 0) 136 | WEST = (-1, 0) 137 | STAY = (0, 0) 138 | INDEX_TO_DIRECTION = [NORTH, SOUTH, EAST, WEST, STAY] 139 | DIRECTION_TO_INDEX = { a:i for i, a in enumerate(INDEX_TO_DIRECTION) } 140 | ALL_DIRECTIONS = INDEX_TO_DIRECTION 141 | 142 | @staticmethod 143 | def move_in_direction(point, direction): 144 | """Takes a step in the given direction and returns the new point. 145 | 146 | point: Tuple (x, y) representing a point in the x-y plane. 147 | direction: One of the Directions. 
148 | """ 149 | x, y = point 150 | dx, dy = direction 151 | return (x + dx, y + dy) 152 | 153 | @staticmethod 154 | def move_in_direction_number(point, num): 155 | direction = Direction.get_direction_from_number(num) 156 | return Direction.move_in_direction(point, direction) 157 | 158 | @staticmethod 159 | def get_number_from_direction(direction): 160 | return Direction.DIRECTION_TO_INDEX[direction] 161 | 162 | @staticmethod 163 | def get_direction_from_number(number): 164 | return Direction.INDEX_TO_DIRECTION[number] 165 | 166 | -------------------------------------------------------------------------------- /src/envs/room.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from copy import deepcopy 3 | from itertools import product 4 | 5 | from envs.env import DeterministicEnv, Direction 6 | 7 | 8 | class RoomState(object): 9 | ''' 10 | state of the environment; describes positions of all objects in the env. 11 | ''' 12 | def __init__(self, agent_pos, vase_states): 13 | """ 14 | agent_pos: (x, y) tuple for the agent's location 15 | vase_states: Dictionary mapping (x, y) tuples to booleans, where True 16 | means that the vase is intact 17 | """ 18 | self.agent_pos = agent_pos 19 | self.vase_states = vase_states 20 | 21 | def __eq__(self, other): 22 | return isinstance(other, RoomState) and \ 23 | self.agent_pos == other.agent_pos and \ 24 | self.vase_states == other.vase_states 25 | 26 | def __hash__(self): 27 | def get_vals(dictionary): 28 | return tuple([dictionary[loc] for loc in sorted(dictionary.keys())]) 29 | return hash(self.agent_pos + get_vals(self.vase_states)) 30 | 31 | 32 | class RoomEnv(DeterministicEnv): 33 | def __init__(self, spec, compute_transitions=True): 34 | """ 35 | height: Integer, height of the grid. Y coordinates are in [0, height). 36 | width: Integer, width of the grid. X coordinates are in [0, width). 
37 | init_state: RoomState, initial state of the environment 38 | vase_locations: List of (x, y) tuples, locations of vases 39 | num_vases: Integer, number of vases 40 | carpet_locations: Set of (x, y) tuples, locations of carpets 41 | feature_locations: List of (x, y) tuples, locations of features 42 | s: RoomState, Current state 43 | nA: Integer, number of actions 44 | """ 45 | self.height = spec.height 46 | self.width = spec.width 47 | self.init_state = deepcopy(spec.init_state) 48 | self.vase_locations = list(self.init_state.vase_states.keys()) 49 | self.num_vases = len(self.vase_locations) 50 | self.carpet_locations = set(spec.carpet_locations) 51 | self.feature_locations = list(spec.feature_locations) 52 | 53 | self.default_action = Direction.get_number_from_direction(Direction.STAY) 54 | self.nA = 5 55 | self.num_features = len(self.s_to_f(self.init_state)) 56 | 57 | self.reset() 58 | 59 | if compute_transitions: 60 | states = self.enumerate_states() 61 | self.make_transition_matrices( 62 | states, range(self.nA), self.nS, self.nA) 63 | self.make_f_matrix(self.nS, self.num_features) 64 | 65 | 66 | def enumerate_states(self): 67 | state_num = {} 68 | 69 | # Possible vase states 70 | for vase_intact_bools in product([True, False], repeat=self.num_vases): 71 | vase_states = dict(zip(self.vase_locations, vase_intact_bools)) 72 | # Possible agent positions 73 | for y in range(self.height): 74 | for x in range(self.width): 75 | pos = (x, y) 76 | if pos in vase_states and vase_states[pos]: 77 | # Can't have the agent on an intact vase 78 | continue 79 | state = RoomState(pos, vase_states) 80 | if state not in state_num: 81 | state_num[state] = len(state_num) 82 | 83 | self.state_num = state_num 84 | self.num_state = {v: k for k, v in self.state_num.items()} 85 | self.nS = len(state_num) 86 | 87 | return state_num.keys() 88 | 89 | def get_num_from_state(self, state): 90 | return self.state_num[state] 91 | 92 | def get_state_from_num(self, num): 93 | return self.num_state[num] 94 | 95 | 96 | def s_to_f(self, s): 97 | ''' 98 | Returns features of the state: 99 | - Number of broken vases 100 | - Whether the agent is on a carpet 101 | - For each feature location, whether the agent is on that location 102 | ''' 103 | num_broken_vases = list(s.vase_states.values()).count(False) 104 | carpet_feature = int(s.agent_pos in self.carpet_locations) 105 | features = [int(s.agent_pos == fpos) for fpos in self.feature_locations] 106 | features = [num_broken_vases, carpet_feature] + features 107 | return np.array(features) 108 | 109 | 110 | def get_next_state(self, state, action): 111 | '''returns the next state given a state and an action''' 112 | action = int(action) 113 | new_x, new_y = Direction.move_in_direction_number(state.agent_pos, action) 114 | # New position is still in bounds: 115 | if not (0 <= new_x < self.width and 0 <= new_y < self.height): 116 | new_x, new_y = state.agent_pos 117 | new_agent_pos = new_x, new_y 118 | new_vase_states = deepcopy(state.vase_states) 119 | if new_agent_pos in new_vase_states: 120 | new_vase_states[new_agent_pos] = False # Break the vase 121 | return RoomState(new_agent_pos, new_vase_states) 122 | 123 | 124 | def print_state(self, state): 125 | '''Renders the state.''' 126 | h, w = self.height, self.width 127 | canvas = np.zeros(tuple([2*h-1, 3*w+1]), dtype='int8') 128 | 129 | # cell borders 130 | for y in range(1, canvas.shape[0], 2): 131 | canvas[y, :] = 1 132 | for x in range(0, canvas.shape[1], 3): 133 | canvas[:, x] = 2 134 | 135 | # vases 136 | for x, y in 
self.vase_locations: 137 | if state.vase_states[(x, y)]: 138 | canvas[2*y, 3*x+1] = 4 139 | else: 140 | canvas[2*y, 3*x+1] = 6 141 | 142 | # agent 143 | x, y = state.agent_pos 144 | canvas[2*y, 3*x + 2] = 3 145 | 146 | black_color = '\x1b[0m' 147 | purple_background_color = '\x1b[0;35;85m' 148 | 149 | for line in canvas: 150 | for char_num in line: 151 | if char_num==0: 152 | print('\u2003', end='') 153 | elif char_num==1: 154 | print('─', end='') 155 | elif char_num==2: 156 | print('│', end='') 157 | elif char_num==3: 158 | print('\x1b[0;33;85m█'+black_color, end='') 159 | elif char_num==4: 160 | print('\x1b[0;32;85m█'+black_color , end='') 161 | elif char_num==5: 162 | print(purple_background_color+'█'+black_color, end='') 163 | elif char_num==6: 164 | print('\033[91m█'+black_color, end='') 165 | print('') 166 | -------------------------------------------------------------------------------- /src/envs/room_spec.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from envs.room import RoomState 3 | 4 | class RoomSpec(object): 5 | def __init__(self, height, width, init_state, carpet_locations, feature_locations): 6 | """See RoomEnv.__init__ in room.py for details.""" 7 | self.height = height 8 | self.width = width 9 | self.init_state = init_state 10 | self.carpet_locations = carpet_locations 11 | self.feature_locations = feature_locations 12 | 13 | 14 | 15 | # In the diagrams below, G is a goal location, V is a vase, C is a carpet, A is 16 | # the agent. Each tuple is of the form (spec, current state, task R, true R). 17 | 18 | ROOM_PROBLEMS = { 19 | # ------- 20 | # | G | 21 | # |GCVC | 22 | # | A | 23 | # ------- 24 | 'default': ( 25 | RoomSpec(3, 5, 26 | RoomState((2, 2), {(2, 1): True}), 27 | [(1, 1), (3, 1)], 28 | [(0, 1), (2, 0)]), 29 | RoomState((2, 0), {(2, 1): True}), 30 | np.array([0, 0, 1, 0]), 31 | np.array([-1, 0, 1, 0]) 32 | ), 33 | # ------- 34 | # |G VG| 35 | # | | 36 | # |A C | 37 | # ------- 38 | 'bad': ( 39 | RoomSpec(3, 5, 40 | RoomState((0, 2), {(3, 0): True}), 41 | [(3, 2)], 42 | [(0, 0), (4, 0)]), 43 | RoomState((0, 0), {(3, 0): True}), 44 | np.array([0, 0, 0, 1]), 45 | np.array([-1, 0, 0, 1]) 46 | ) 47 | } 48 | -------------------------------------------------------------------------------- /src/envs/tests/apples_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from envs.apples import ApplesState, ApplesEnv 4 | from envs.env import Direction 5 | 6 | 7 | class TestApplesSpec(object): 8 | def __init__(self): 9 | """Test spec for the Apples environment. 10 | 11 | T is a tree, B is a bucket, C is a carpet, A is the agent. 
12 | ----- 13 | |T T| 14 | | | 15 | |AB | 16 | ----- 17 | """ 18 | self.height = 3 19 | self.width = 5 20 | self.init_state = ApplesState( 21 | agent_pos=(0, 0, 2), 22 | tree_states={(0, 0): True, (2, 0): True}, 23 | bucket_states={(1, 2): 0}, 24 | carrying_apple=False) 25 | # Use a power of 2, to avoid rounding issues 26 | self.apple_regen_probability = 1.0 / 4 27 | self.bucket_capacity = 10 28 | self.include_location_features = True 29 | 30 | 31 | class TestApplesEnv(unittest.TestCase): 32 | def check_trajectory(self, env, trajectory): 33 | state = env.s 34 | for action, prob, next_state in trajectory: 35 | actual_next_states = env.get_next_states(state, action) 36 | self.assertEqual(sum([p for p, _, _ in actual_next_states]), 1.0) 37 | self.assertIn((prob, next_state, 0), actual_next_states) 38 | state = next_state 39 | 40 | def test_trajectories(self): 41 | u, d, l, r, s = map( 42 | Direction.get_number_from_direction, 43 | [Direction.NORTH, Direction.SOUTH, Direction.WEST, Direction.EAST, Direction.STAY]) 44 | i = 5 # interact action 45 | 46 | def make_state(agent_pos, tree1, tree2, bucket, carrying_apple): 47 | tree_states = { (0, 0): tree1, (2, 0): tree2 } 48 | bucket_state = { (1, 2): bucket } 49 | return ApplesState(agent_pos, tree_states, bucket_state, carrying_apple) 50 | 51 | apples_env = ApplesEnv(TestApplesSpec(), compute_transitions=False) 52 | self.check_trajectory(apples_env, [ 53 | (u, 1.0, make_state((u, 0, 1), True, True, 0, False)), 54 | (i, 1.0, make_state((u, 0, 1), False, True, 0, True)), 55 | (r, 3.0/4, make_state((r, 1, 1), False, True, 0, True)), 56 | (d, 3.0/4, make_state((d, 1, 1), False, True, 0, True)), 57 | (i, 3.0/4, make_state((d, 1, 1), False, True, 1, False)), 58 | (u, 3.0/4, make_state((u, 1, 0), False, True, 1, False)), 59 | (r, 3.0/4, make_state((r, 1, 0), False, True, 1, False)), 60 | (i, 3.0/4, make_state((r, 1, 0), False, False, 1, True)), 61 | (d, 9.0/16, make_state((d, 1, 1), False, False, 1, True)), 62 | (i, 3.0/16, make_state((d, 1, 1), True, False, 2, False)), 63 | (s, 1.0/4, make_state((d, 1, 1), True, True, 2, False)), 64 | ]) 65 | 66 | if __name__ == '__main__': 67 | unittest.main() 68 | -------------------------------------------------------------------------------- /src/envs/tests/batteries_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from envs.batteries import BatteriesState, BatteriesEnv 4 | from envs.env import Direction 5 | 6 | 7 | class TestBatteriesSpec(object): 8 | def __init__(self): 9 | """Test spec for the Batteries environment. 10 | 11 | G is a goal location, B is a battery, A is the agent, and T is the train. 
12 | ------- 13 | |B G | 14 | | TT | 15 | | TTG| 16 | | | 17 | |A B| 18 | ------- 19 | """ 20 | self.height = 5 21 | self.width = 5 22 | self.init_state = BatteriesState((0, 4), (2, 1), 8, 23 | {(0, 0): True, (4, 4): True}, 24 | False) 25 | self.feature_locations = [(2, 0), (4, 2)] 26 | self.train_transition = { 27 | (2, 1): (3, 1), 28 | (3, 1): (3, 2), 29 | (3, 2): (2, 2), 30 | (2, 2): (2, 1) 31 | } 32 | 33 | 34 | class TestBatteriesEnv(unittest.TestCase): 35 | def check_trajectory(self, env, trajectory): 36 | state = env.s 37 | for action, next_state in trajectory: 38 | self.assertEqual(env.state_step(action, state), next_state) 39 | self.assertEqual(env.state_step(action), next_state) 40 | features, reward, done, info = env.step(action) 41 | self.assertEqual(env.s, next_state) 42 | state = next_state 43 | 44 | def test_trajectories(self): 45 | batteries_env = BatteriesEnv(TestBatteriesSpec(), compute_transitions=False) 46 | u, d, l, r, s = map( 47 | Direction.get_number_from_direction, 48 | [Direction.NORTH, Direction.SOUTH, Direction.WEST, Direction.EAST, Direction.STAY]) 49 | 50 | def make_state(agent, train, life, battery_vals, carrying_battery): 51 | battery_present = dict(zip([(0, 0), (4, 4)], battery_vals)) 52 | return BatteriesState(agent, train, life, battery_present, carrying_battery) 53 | 54 | self.check_trajectory(batteries_env, [ 55 | (u, make_state((0, 3), (3, 1), 7, [True, True], False)), 56 | (u, make_state((0, 2), (3, 2), 6, [True, True], False)), 57 | (u, make_state((0, 1), (2, 2), 5, [True, True], False)), 58 | (u, make_state((0, 0), (2, 1), 4, [False, True], True)), 59 | (r, make_state((1, 0), (3, 1), 3, [False, True], True)), 60 | (r, make_state((2, 0), (3, 2), 2, [False, True], True)), 61 | (s, make_state((2, 0), (2, 2), 1, [False, True], True)), 62 | (s, make_state((2, 0), (2, 1), 0, [False, True], True)), 63 | (d, make_state((2, 1), (3, 1), 9, [False, True], False)), 64 | (u, make_state((2, 0), (3, 2), 8, [False, True], False)), 65 | ]) 66 | 67 | 68 | if __name__ == '__main__': 69 | unittest.main() 70 | -------------------------------------------------------------------------------- /src/envs/tests/env_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from envs.env import Direction 4 | 5 | class TestDirection(unittest.TestCase): 6 | def test_direction_number_conversion(self): 7 | all_directions = Direction.ALL_DIRECTIONS 8 | all_numbers = [] 9 | 10 | for direction in Direction.ALL_DIRECTIONS: 11 | number = Direction.get_number_from_direction(direction) 12 | direction_again = Direction.get_direction_from_number(number) 13 | self.assertEqual(direction, direction_again) 14 | all_numbers.append(number) 15 | 16 | # Check that all directions are distinct 17 | num_directions = len(all_directions) 18 | self.assertEqual(len(set(all_directions)), num_directions) 19 | # Check that the numbers are 0, 1, ... num_directions - 1 20 | self.assertEqual(set(all_numbers), set(range(num_directions))) 21 | 22 | 23 | if __name__ == '__main__': 24 | unittest.main() 25 | -------------------------------------------------------------------------------- /src/envs/tests/room_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from envs.room import RoomState, RoomEnv 4 | from envs.env import Direction 5 | 6 | 7 | class TestRoomSpec(object): 8 | def __init__(self): 9 | """Test spec for the Room environment. 
10 | 11 | G is a goal location, V is a vase, C is a carpet, A is the agent. 12 | ------- 13 | |G G G| 14 | | CVC | 15 | | A | 16 | ------- 17 | """ 18 | self.height = 3 19 | self.width = 5 20 | self.init_state = RoomState((2, 2), {(2, 1): True}) 21 | self.carpet_locations = [(1, 1), (3, 1)] 22 | self.feature_locations = [(0, 0), (2, 0), (4, 0)] 23 | 24 | 25 | class TestRoomEnv(unittest.TestCase): 26 | def setUp(self): 27 | self.room = RoomEnv(TestRoomSpec(), compute_transitions=False) 28 | u, d, l, r = map( 29 | Direction.get_number_from_direction, 30 | [Direction.NORTH, Direction.SOUTH, Direction.WEST, Direction.EAST]) 31 | 32 | self.trajectory1 = [ 33 | (l, RoomState((1, 2), {(2, 1): True})), 34 | (u, RoomState((1, 1), {(2, 1): True})), 35 | (u, RoomState((1, 0), {(2, 1): True})), 36 | (r, RoomState((2, 0), {(2, 1): True})) 37 | ] 38 | self.trajectory2 = [ 39 | (u, RoomState((2, 1), {(2, 1): False})), 40 | (u, RoomState((2, 0), {(2, 1): False})) 41 | ] 42 | self.trajectory3 = [ 43 | (r, RoomState((3, 2), {(2, 1): True})), 44 | (u, RoomState((3, 1), {(2, 1): True})), 45 | (l, RoomState((2, 1), {(2, 1): False})), 46 | (d, RoomState((2, 2), {(2, 1): False})) 47 | ] 48 | 49 | def check_trajectory(self, env, trajectory, reset=True): 50 | if reset: 51 | env.reset() 52 | 53 | state = env.s 54 | for action, next_state in trajectory: 55 | self.assertEqual(env.state_step(action, state), next_state) 56 | self.assertEqual(env.state_step(action), next_state) 57 | features, reward, done, info = env.step(action) 58 | self.assertEqual(env.s, next_state) 59 | state = next_state 60 | 61 | def test_trajectories(self): 62 | self.check_trajectory(self.room, self.trajectory1, reset=False) 63 | self.check_trajectory(self.room, self.trajectory2) 64 | self.check_trajectory(self.room, self.trajectory3) 65 | 66 | if __name__ == '__main__': 67 | unittest.main() 68 | -------------------------------------------------------------------------------- /src/envs/tests/train_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from envs.train import TrainState, TrainEnv 4 | from envs.env import Direction 5 | 6 | 7 | class TestTrainSpec(object): 8 | def __init__(self): 9 | """Test spec for the Train environment. 10 | 11 | G is a goal location, V is a vase, C is a carpet, A is the agent. 
12 | ------- 13 | | G C| 14 | | TT | 15 | | VTTG| 16 | | | 17 | |A | 18 | ------- 19 | """ 20 | self.height = 5 21 | self.width = 5 22 | self.init_state = TrainState((0, 4), {(1, 2): True}, (2, 1), True) 23 | self.carpet_locations = [(4, 0)] 24 | self.feature_locations = [(2, 0), (4, 2)], 25 | self.train_transition = { 26 | (2, 1): (3, 1), 27 | (3, 1): (3, 2), 28 | (3, 2): (2, 2), 29 | (2, 2): (2, 1) 30 | } 31 | 32 | 33 | class TestTrainEnv(unittest.TestCase): 34 | def check_trajectory(self, env, trajectory): 35 | state = env.s 36 | for action, next_state in trajectory: 37 | self.assertEqual(env.state_step(action, state), next_state) 38 | self.assertEqual(env.state_step(action), next_state) 39 | features, reward, done, info = env.step(action) 40 | self.assertEqual(env.s, next_state) 41 | state = next_state 42 | 43 | def test_trajectories(self): 44 | train_env = TrainEnv(TestTrainSpec(), compute_transitions=False) 45 | u, d, l, r, s = map( 46 | Direction.get_number_from_direction, 47 | [Direction.NORTH, Direction.SOUTH, Direction.WEST, Direction.EAST, Direction.STAY]) 48 | 49 | self.check_trajectory(train_env, [ 50 | (u, TrainState((0, 3), {(1, 2): True}, (3, 1), True)), 51 | (u, TrainState((0, 2), {(1, 2): True}, (3, 2), True)), 52 | (u, TrainState((0, 1), {(1, 2): True}, (2, 2), True)), 53 | (r, TrainState((1, 1), {(1, 2): True}, (2, 1), True)), 54 | (u, TrainState((1, 0), {(1, 2): True}, (3, 1), True)), 55 | (r, TrainState((2, 0), {(1, 2): True}, (3, 2), True)), 56 | (s, TrainState((2, 0), {(1, 2): True}, (2, 2), True)), 57 | (s, TrainState((2, 0), {(1, 2): True}, (2, 1), True)), 58 | ]) 59 | 60 | train_env.reset() 61 | self.check_trajectory(train_env, [ 62 | (u, TrainState((0, 3), {(1, 2): True}, (3, 1), True)), 63 | (r, TrainState((1, 3), {(1, 2): True}, (3, 2), True)), 64 | (r, TrainState((2, 3), {(1, 2): True}, (2, 2), True)), 65 | ]) 66 | 67 | train_env.reset() 68 | self.check_trajectory(train_env, [ 69 | (r, TrainState((1, 4), {(1, 2): True}, (3, 1), True)), 70 | (r, TrainState((2, 4), {(1, 2): True}, (3, 2), True)), 71 | (r, TrainState((3, 4), {(1, 2): True}, (2, 2), True)), 72 | (u, TrainState((3, 3), {(1, 2): True}, (2, 1), True)), 73 | (u, TrainState((3, 2), {(1, 2): True}, (3, 1), True)), 74 | (s, TrainState((3, 2), {(1, 2): True}, (3, 2), False)), 75 | (s, TrainState((3, 2), {(1, 2): True}, (3, 2), False)), 76 | (u, TrainState((3, 1), {(1, 2): True}, (3, 2), False)), 77 | (l, TrainState((2, 1), {(1, 2): True}, (3, 2), False)), 78 | ]) 79 | 80 | if __name__ == '__main__': 81 | unittest.main() 82 | -------------------------------------------------------------------------------- /src/envs/train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from copy import copy, deepcopy 3 | from itertools import product 4 | 5 | from envs.env import DeterministicEnv, Direction 6 | 7 | 8 | class TrainState(object): 9 | ''' 10 | state of the environment; describes positions of all objects in the env. 
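    A minimal construction sketch, mirroring the 'default' problem in
    train_spec.py:

        state = TrainState(agent_pos=(0, 4),
                           vase_states={(1, 2): True},  # vase at (1, 2) is intact
                           train_pos=(2, 1),
                           train_intact=True)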
11 | ''' 12 | def __init__(self, agent_pos, vase_states, train_pos, train_intact): 13 | """ 14 | agent_pos: (x, y) tuple for the agent's location 15 | vase_states: Dictionary mapping (x, y) tuples to booleans, where True 16 | means that the vase is intact 17 | """ 18 | self.agent_pos = agent_pos 19 | self.vase_states = vase_states 20 | self.train_pos = train_pos 21 | self.train_intact = train_intact 22 | 23 | def is_valid(self): 24 | pos = self.agent_pos 25 | # Can't be standing on the vase and have the vase intact 26 | if pos in self.vase_states and self.vase_states[pos]: 27 | return False 28 | # Can't be standing on the train and have the train intact 29 | if pos == self.train_pos and self.train_intact: 30 | return False 31 | return True 32 | 33 | def __eq__(self, other): 34 | return isinstance(other, TrainState) and \ 35 | self.agent_pos == other.agent_pos and \ 36 | self.vase_states == other.vase_states and \ 37 | self.train_pos == other.train_pos and \ 38 | self.train_intact == other.train_intact 39 | 40 | def __hash__(self): 41 | def get_vals(dictionary): 42 | return tuple([dictionary[loc] for loc in sorted(dictionary.keys())]) 43 | return hash(self.agent_pos + get_vals(self.vase_states) + self.train_pos + (self.train_intact,)) 44 | 45 | 46 | class TrainEnv(DeterministicEnv): 47 | def __init__(self, spec, compute_transitions=True): 48 | """ 49 | height: Integer, height of the grid. Y coordinates are in [0, height). 50 | width: Integer, width of the grid. X coordinates are in [0, width). 51 | init_state: TrainState, initial state of the environment 52 | vase_locations: List of (x, y) tuples, locations of vases 53 | num_vases: Integer, number of vases 54 | carpet_locations: Set of (x, y) tuples, locations of carpets 55 | feature_locations: List of (x, y) tuples, locations of features 56 | s: TrainState, Current state 57 | nA: Integer, number of actions 58 | """ 59 | self.height = spec.height 60 | self.width = spec.width 61 | self.init_state = deepcopy(spec.init_state) 62 | self.vase_locations = list(self.init_state.vase_states.keys()) 63 | self.num_vases = len(self.vase_locations) 64 | self.carpet_locations = set(spec.carpet_locations) 65 | self.feature_locations = list(spec.feature_locations) 66 | self.train_transition = spec.train_transition 67 | self.train_locations = list(self.train_transition.keys()) 68 | assert set(self.train_locations) == set(self.train_transition.values()) 69 | 70 | self.default_action = Direction.get_number_from_direction(Direction.STAY) 71 | self.nA = 5 72 | self.num_features = len(self.s_to_f(self.init_state)) 73 | 74 | self.reset() 75 | 76 | if compute_transitions: 77 | states = self.enumerate_states() 78 | self.make_transition_matrices( 79 | states, range(self.nA), self.nS, self.nA) 80 | self.make_f_matrix(self.nS, self.num_features) 81 | 82 | 83 | def enumerate_states(self): 84 | state_num = {} 85 | all_agent_positions = product(range(self.width), range(self.height)) 86 | all_vase_states = map( 87 | lambda vase_vals: dict(zip(self.vase_locations, vase_vals)), 88 | product([True, False], repeat=self.num_vases)) 89 | all_states = map( 90 | lambda x: TrainState(*x), 91 | product(all_agent_positions, all_vase_states, self.train_locations, [True, False])) 92 | all_states = filter(lambda state: state.is_valid(), all_states) 93 | 94 | state_num = {} 95 | for state in all_states: 96 | if state not in state_num: 97 | state_num[state] = len(state_num) 98 | 99 | self.state_num = state_num 100 | self.num_state = {v: k for k, v in self.state_num.items()} 101 | self.nS 
= len(state_num) 102 | 103 | return state_num.keys() 104 | 105 | def get_num_from_state(self, state): 106 | return self.state_num[state] 107 | 108 | def get_state_from_num(self, num): 109 | return self.num_state[num] 110 | 111 | 112 | def s_to_f(self, s): 113 | ''' 114 | Returns features of the state: 115 | - Number of broken vases 116 | - Whether the agent is on a carpet 117 | - For each feature location, whether the agent is on that location 118 | ''' 119 | num_broken_vases = list(s.vase_states.values()).count(False) 120 | carpet_feature = int(s.agent_pos in self.carpet_locations) 121 | train_intact_feature = int(not s.train_intact) 122 | train_pos_features = [int(s.train_pos == pos) for pos in self.train_locations] 123 | loc_features = [int(s.agent_pos == fpos) for fpos in self.feature_locations] 124 | features = train_pos_features + loc_features 125 | features = [num_broken_vases, carpet_feature, train_intact_feature] + features 126 | return np.array(features) 127 | 128 | 129 | def get_next_state(self, state, action): 130 | '''returns the next state given a state and an action''' 131 | action = int(action) 132 | new_x, new_y = Direction.move_in_direction_number(state.agent_pos, action) 133 | # New position is still in bounds: 134 | if not (0 <= new_x < self.width and 0 <= new_y < self.height): 135 | new_x, new_y = state.agent_pos 136 | new_agent_pos = new_x, new_y 137 | new_vase_states = deepcopy(state.vase_states) 138 | new_train_pos, new_train_intact = state.train_pos, state.train_intact 139 | if state.train_intact: 140 | new_train_pos = self.train_transition[state.train_pos] 141 | 142 | # Break the vase and train if appropriate 143 | if new_agent_pos in new_vase_states: 144 | new_vase_states[new_agent_pos] = False 145 | if new_agent_pos == new_train_pos: 146 | new_train_intact = False 147 | return TrainState(new_agent_pos, new_vase_states, new_train_pos, new_train_intact) 148 | 149 | 150 | def print_state(self, state): 151 | '''Renders the state.''' 152 | h, w = self.height, self.width 153 | canvas = np.zeros(tuple([2*h-1, 3*w+1]), dtype='int8') 154 | 155 | # cell borders 156 | for y in range(1, canvas.shape[0], 2): 157 | canvas[y, :] = 1 158 | for x in range(0, canvas.shape[1], 3): 159 | canvas[:, x] = 2 160 | 161 | # vases 162 | for x, y in self.vase_locations: 163 | if state.vase_states[(x, y)]: 164 | canvas[2*y, 3*x+1] = 4 165 | else: 166 | canvas[2*y, 3*x+1] = 6 167 | 168 | # agent 169 | x, y = state.agent_pos 170 | canvas[2*y, 3*x + 2] = 3 171 | 172 | # train 173 | x, y = state.train_pos 174 | if state.train_intact: 175 | canvas[2*y, 3*x + 1] = 5 176 | else: 177 | canvas[2*y, 3*x + 1] = 6 178 | 179 | 180 | 181 | black_color = '\x1b[0m' 182 | purple_background_color = '\x1b[0;35;85m' 183 | 184 | for line in canvas: 185 | for char_num in line: 186 | if char_num==0: 187 | print('\u2003', end='') 188 | elif char_num==1: 189 | print('─', end='') 190 | elif char_num==2: 191 | print('│', end='') 192 | elif char_num==3: 193 | print('\x1b[0;33;85m█'+black_color, end='') 194 | elif char_num==4: 195 | print('\x1b[0;32;85m█'+black_color , end='') 196 | elif char_num==5: 197 | print(purple_background_color+'█'+black_color, end='') 198 | elif char_num==6: 199 | print('\033[91m█'+black_color, end='') 200 | print('') 201 | -------------------------------------------------------------------------------- /src/envs/train_spec.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from envs.train import TrainState 3 | 4 | class 
TrainSpec(object): 5 | def __init__(self, height, width, init_state, carpet_locations, feature_locations, train_transition): 6 | """See TrainEnv.__init__ in train.py for details.""" 7 | self.height = height 8 | self.width = width 9 | self.init_state = init_state 10 | self.carpet_locations = carpet_locations 11 | self.feature_locations = feature_locations 12 | self.train_transition = train_transition 13 | 14 | 15 | 16 | # In the diagrams below, G is a goal location, V is a vase, C is a carpet, A is 17 | # the agent, and T is the train. 18 | # Each tuple is of the form (spec, current state, task R, true R). 19 | 20 | TRAIN_PROBLEMS = { 21 | # ------- 22 | # | G C| 23 | # | TT | 24 | # | VTTG| 25 | # | | 26 | # |A | 27 | # ------- 28 | 'default': ( 29 | TrainSpec(5, 5, 30 | TrainState((0, 4), {(1, 2): True}, (2, 1), True), 31 | [(4, 0)], 32 | [(2, 0), (4, 2)], 33 | { 34 | (2, 1): (3, 1), 35 | (3, 1): (3, 2), 36 | (3, 2): (2, 2), 37 | (2, 2): (2, 1) 38 | }), 39 | TrainState((2, 0), {(1, 2): True}, (2, 2), True), 40 | np.array([0, 0, 0, 0, 0, 0, 0, 0, 1]), 41 | np.array([-1, 0, -1, 0, 0, 0, 0, 0, 1]) 42 | ) 43 | } 44 | -------------------------------------------------------------------------------- /src/plotting.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from matplotlib.backends.backend_pdf import PdfPages 6 | 7 | def get_stats(algorithm, env, spec, comb, param_tuned, path, temp_index=0): 8 | results_list=[] 9 | for file in os.listdir(path): 10 | if algorithm in file and env in file and spec in file and comb in file and "-"+param_tuned in file: 11 | with open(os.path.join(path, file), 'rt') as f: 12 | reader = csv.reader(f) 13 | # the first line is names of returned items, e g [seed, true_r, final_r] 14 | list_results = list(reader)[1::] 15 | list_rewards = [] 16 | for res in list_results: 17 | s = res[1] 18 | s = s.replace(']', '').replace('[', '').replace(' ', '').split(',') 19 | list_rewards.append(float(s[temp_index])) 20 | list_rewards = np.asarray(list_rewards) 21 | 22 | param_val = file.split('-'+param_tuned+'=', 1)[-1] 23 | param_val = param_val.split('-')[0] 24 | 25 | results_list.append([float(param_val), np.mean(list_rewards), np.std(list_rewards)]) 26 | results_list = np.asarray(results_list) 27 | # return a list sorted by the value of param_tuned 28 | return results_list[results_list[:,0].argsort()] 29 | 30 | 31 | def plot_params_one_subplot(stats_list_per_env, ax, color_list, env_names, 32 | y_min, y_max, comb, title=None, current_subplot=0): 33 | ticks_string=[] 34 | for i in stats_list_per_env[0][0][:,0]: 35 | tick = str(i) 36 | if tick[len(tick)-2::]=='.0': 37 | tick = tick[0:len(tick)-2] 38 | ticks_string.append(tick) 39 | 40 | for j, stats_list in enumerate(stats_list_per_env): 41 | stats_stack = np.vstack(stats_list) 42 | 43 | for i in range(len(stats_list)): 44 | c = color_list[i] 45 | stats = stats_list[i] 46 | 47 | ax.set_ylim(y_min, y_max) 48 | ax.scatter(np.log2(stats[:,0]), stats[:,1], color=c, edgecolor=c, s=40, label=comb[i]+env_names[i]) 49 | ax.plot(np.log2(stats[:,0]), stats[:,1], color=c) 50 | 51 | plt.tick_params(axis='both', labelsize=12) 52 | ax.tick_params(axis='both', labelsize='large') 53 | plt.xticks(np.log2(stats[::2,0]), ticks_string[0::2]) 54 | 55 | if current_subplot==0: 56 | plt.ylabel("Fraction of max R", fontsize=17) 57 | handles, labels = ax.get_legend_handles_labels() 58 | # sort both labels and handles by labels 59 
| labels, handles = zip(*sorted(zip(labels, handles), key=lambda t: t[0])) 60 | plt.legend(handles, labels, loc="best", fontsize=12, handletextpad=-0.4) 61 | 62 | # xlabel only for the middle subplot when plotting additive vs bayesian 63 | if current_subplot==1: 64 | plt.xlabel("Standard deviation", fontsize=21) 65 | 66 | if title is not None: plt.title(title, fontsize=24) 67 | 68 | 69 | def plot_params_multiple_subplots(env_lists_per_t, titles_list, y_min=0.45, y_max=1.05): 70 | fig = plt.figure(figsize=(5*len(env_lists_per_t), 3.4)) 71 | for j, stats_list in enumerate(env_lists_per_t): 72 | ax = plt.subplot(1, len(env_lists_per_t), j+1) 73 | plot_params_one_subplot(stats_list, ax, 74 | color_list=['blue', 'orange', '#5177d6', '#ffe500', 'deepskyblue', 'coral'], 75 | env_names=[' room', ' room', ' train', ' train', ' batteries', ' batteries'], 76 | comb=['Bayesian,', 'Additive,','Bayesian,', 'Additive,', 'Bayesian,', 'Additive,'], 77 | title=titles_list[j], current_subplot=j, y_min=y_min, y_max=y_max) 78 | fig.subplots_adjust(top=1.1) 79 | plt.tight_layout() 80 | 81 | pp = PdfPages('./results/additive-vs-bayesian.pdf') 82 | pp.savefig() 83 | pp.close() 84 | 85 | 86 | if __name__ == "__main__": 87 | ############### 88 | # Appendix D # 89 | ############### 90 | # plot Additive vs Bayesian 91 | 92 | # temperature=0 (rational agent) 93 | avb_stats_list_per_env_t0 = [[get_stats("rlsp", "room", "default", "bayesian", "k", "./results/additive-vs-bayesian", temp_index=0), 94 | get_stats("rlsp", "room", "default", "additive", "k", "./results/additive-vs-bayesian", temp_index=0), 95 | get_stats("rlsp", "train", "default", "bayesian", "k", "./results/additive-vs-bayesian", temp_index=0), 96 | get_stats("rlsp", "train", "default", "additive", "k", "./results/additive-vs-bayesian", temp_index=0), 97 | get_stats("rlsp", "batteries", "default", "bayesian", "k", "./results/additive-vs-bayesian", temp_index=0), 98 | get_stats("rlsp", "batteries", "default", "additive", "k", "./results/additive-vs-bayesian", temp_index=0)]] 99 | # temperature=0.1 100 | avb_stats_list_per_env_t01 = [[get_stats("rlsp", "room", "default", "bayesian", "k", "./results/additive-vs-bayesian", temp_index=1), 101 | get_stats("rlsp", "room", "default", "additive", "k", "./results/additive-vs-bayesian", temp_index=1), 102 | get_stats("rlsp", "train", "default", "bayesian", "k", "./results/additive-vs-bayesian", temp_index=1), 103 | get_stats("rlsp", "train", "default", "additive", "k", "./results/additive-vs-bayesian", temp_index=1), 104 | get_stats("rlsp", "batteries", "default", "bayesian", "k", "./results/additive-vs-bayesian", temp_index=1), 105 | get_stats("rlsp", "batteries", "default", "additive", "k", "./results/additive-vs-bayesian", temp_index=1)]] 106 | # temperature=1 107 | avb_stats_list_per_env_t1 = [[get_stats("rlsp", "room", "default", "bayesian", "k", "./results/additive-vs-bayesian", temp_index=2), 108 | get_stats("rlsp", "room", "default", "additive", "k", "./results/additive-vs-bayesian", temp_index=2), 109 | get_stats("rlsp", "train", "default", "bayesian", "k", "./results/additive-vs-bayesian", temp_index=2), 110 | get_stats("rlsp", "train", "default", "additive", "k", "./results/additive-vs-bayesian", temp_index=2), 111 | get_stats("rlsp", "batteries", "default", "bayesian", "k", "./results/additive-vs-bayesian", temp_index=2), 112 | get_stats("rlsp", "batteries", "default", "additive", "k", "./results/additive-vs-bayesian", temp_index=2)]] 113 | 114 | env_lists_per_t = [avb_stats_list_per_env_t0, 
avb_stats_list_per_env_t01, avb_stats_list_per_env_t1] 115 | titles_list = ['temperature = 0','temperature = 0.1','temperature = 1'] 116 | 117 | plt.rcParams["font.family"] = "Times New Roman" 118 | plot_params_multiple_subplots(env_lists_per_t, titles_list=titles_list) 119 | 120 | ############### 121 | # Section 5.4 # 122 | ############### 123 | # plot robustness to the choice of Alice's planning horizon 124 | 125 | # temperature=0 (rational agent). This is the stat we're plotting. To plot boltzmann-rational agents, replace 126 | # "stats_list_per_env_t0" by the stat corresponding to the temperature you want to plot in the cell below. 127 | h_stats_list_per_env_t0 = [[get_stats("rlsp", "train", "default", "additive", "T", "./results/horizon"), 128 | get_stats("rlsp", "room", "default", "additive", "T", "./results/horizon"), 129 | get_stats("rlsp", "batteries", "default", "additive", "T", "./results/horizon"), 130 | get_stats("rlsp", "apples", "default", "additive", "T", "./results/horizon")]] 131 | # temperature=0.1 132 | h_stats_list_per_env_t01 = [[get_stats("rlsp", "room", "default", "additive", "T", "./results/horizon", temp_index=1), 133 | get_stats("rlsp", "train", "default", "additive", "T", "./results/horizon", temp_index=1), 134 | get_stats("rlsp", "batteries", "default", "additive", "T", "./results/horizon", temp_index=1), 135 | get_stats("rlsp", "apples", "default", "additive", "T", "./results/horizon", temp_index=1)]] 136 | # temperature=1 137 | h_stats_list_per_env_t1 = [[get_stats("rlsp", "room", "default", "additive", "T", "./results/horizon", temp_index=2), 138 | get_stats("rlsp", "train", "default", "additive", "T", "./results/horizon", temp_index=2), 139 | get_stats("rlsp", "batteries", "default", "additive", "T", "./results/horizon", temp_index=2), 140 | get_stats("rlsp", "apples", "default", "additive", "T", "./results/horizon", temp_index=2)]] 141 | 142 | fig = plt.figure(figsize=(4.0, 2.6)) 143 | ax = plt.subplot(1, 1, 1) 144 | plot_params_one_subplot(h_stats_list_per_env_t0, ax, y_min=0.45, y_max=1.05, 145 | env_names=['train', 'room', 'batteries', 'apples'], 146 | comb=['','','',''], 147 | color_list=['green', 'orange', '#5177d6', 'firebrick']) 148 | plt.xlabel("Horizon", fontsize=17) 149 | ax.legend(bbox_to_anchor=(1, 1.051), fontsize=12, handletextpad=-0.4, borderpad=0.1) 150 | plt.tight_layout() 151 | 152 | pp = PdfPages('./results/horizon_t0.pdf') 153 | pp.savefig() 154 | pp.close() 155 | -------------------------------------------------------------------------------- /src/relative_reachability.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def relative_reachability_penalty(mdp, horizon, start): 4 | """ 5 | Calculates the undiscounted relative reachability penalty for each state in an mdp, compared to the starting state baseline. 
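    Concretely, as implemented below: for a baseline state b and a candidate
    state s, the unnormalized penalty is

        d(s; b) = sum_{s'} max(0, coverage(b, s') - coverage(s, s')),

    which is averaged over the baseline state distribution at each timestep
    and then normalized by the overall maximum, yielding one penalty vector
    over states per timestep.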
6 | 7 | Based on the algorithm described in: https://arxiv.org/pdf/1806.01186.pdf 8 | """ 9 | coverage = get_coverage(mdp, horizon) 10 | distributions = baseline_state_distributions(mdp, horizon, start) 11 | 12 | def penalty(state): 13 | return np.sum(np.maximum(coverage[state, :] - coverage, 0), axis=1) 14 | 15 | def penalty_for_baseline_distribution(dist): 16 | return sum((dist[state] * penalty(state) for state in range(mdp.nS) if dist[state] != 0)) 17 | 18 | r_r = np.array(list(map(penalty_for_baseline_distribution, distributions))) 19 | if np.amax(r_r) == 0: 20 | return np.zeros_like(r_r) 21 | return r_r / np.amax(r_r) 22 | 23 | def get_coverage(mdp, horizon): 24 | coverage = np.identity(mdp.nS) 25 | for i in range(horizon): 26 | # coverage(s0, sk) = \max_{a0} \sum_{s1} P(s1 | s0, a) * coverage(s1, sk) 27 | action_coverage = mdp.T_matrix.dot(coverage) 28 | action_coverage = action_coverage.reshape((mdp.nS, mdp.nA, mdp.nS)) 29 | coverage = np.amax(action_coverage, axis=1) 30 | return coverage 31 | 32 | def baseline_state_distributions(mdp, horizon, start): 33 | distribution = np.zeros(mdp.nS) 34 | distribution[start] = 1 35 | distributions = [ distribution ] 36 | for _ in range(horizon - 1): 37 | distribution = mdp.baseline_matrix_transpose.dot(distribution) 38 | distributions.append(distribution) 39 | return distributions 40 | -------------------------------------------------------------------------------- /src/rlsp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.optimize import check_grad 3 | 4 | from value_iter import value_iter 5 | from utils import norm_distr, laplace_distr, printoptions 6 | 7 | 8 | def compute_g(mdp, policy, p_0, T, d_last_step_list, expected_features_list): 9 | nS, nA, nF = mdp.nS, mdp.nA, mdp.num_features 10 | 11 | # base case 12 | G = np.zeros((nS, nF)) 13 | # recursive case 14 | for t in range(T-1): 15 | # G(s') = \sum_{s, a} p(a | s) p(s' | s, a) [ p(s) g(s, a) + G_prev[s] ] 16 | # p(s) is given by d_last_step_list[t] 17 | # g(s, a) = f(s) - F(s) + \sum_{s'} p(s' | s, a) F(s') 18 | # Distribute the addition to get three different terms: 19 | # First term: p(s) [f(s') - F(s')] 20 | # Second term: p(s) \sum_{s2} p(s2 | s, a) F(s2) 21 | # Third term: G_prev[s] 22 | g_first = mdp.f_matrix - expected_features_list[t] 23 | g_second = mdp.T_matrix.dot(expected_features_list[t+1]) 24 | g_second = g_second.reshape((nS, nA, nF)) 25 | g_total = np.expand_dims(g_first, axis=1) + g_second 26 | 27 | prob_s_a = np.expand_dims(d_last_step_list[t].reshape(nS), axis=1) * policy[t] 28 | 29 | G_value = np.expand_dims(prob_s_a, axis=2) * g_total 30 | G_value = mdp.T_matrix_transpose.dot(G_value.reshape((nS * nA, nF))) 31 | 32 | G_recurse = np.expand_dims(policy[t], axis=-1) * np.expand_dims(G, axis=1) 33 | G_recurse = mdp.T_matrix_transpose.dot(G_recurse.reshape((nS * nA, nF))) 34 | 35 | G = G_value + G_recurse 36 | 37 | return G 38 | 39 | 40 | def compute_d_last_step(mdp, policy, p_0, T, gamma=1, verbose=False, return_all=False): 41 | """Computes the last-step occupancy measure""" 42 | D, d_last_step_list = p_0, [p_0] 43 | for t in range(T-1): 44 | # D(s') = \sum_{s, a} D_prev(s) * p(a | s) * p(s' | s, a) 45 | state_action_prob = np.expand_dims(D, axis=1) * policy[t] 46 | D = mdp.T_matrix_transpose.dot(state_action_prob.flatten()) 47 | 48 | if verbose is True: print(D) 49 | if return_all: d_last_step_list.append(D) 50 | 51 | return (D, d_last_step_list) if return_all else D 52 | 53 | def 
compute_feature_expectations(mdp, policy, p_0, T): 54 | nS, nA, nF = mdp.nS, mdp.nA, mdp.num_features 55 | expected_features = mdp.f_matrix 56 | expected_feature_list = [expected_features] 57 | for t in range(T-2, -1, -1): 58 | # F(s) = f(s) + \sum_{a, s'} p(a | s) * p(s' | s, a) * F(s') 59 | future_features = mdp.T_matrix.dot(expected_features).reshape((nS, nA, nF)) 60 | future_features = future_features * np.expand_dims(policy[t], axis=2) 61 | expected_features = mdp.f_matrix + np.sum(future_features, axis=1) 62 | expected_feature_list.append(expected_features) 63 | return expected_features, expected_feature_list[::-1] 64 | 65 | 66 | def rlsp(mdp, s_current, p_0, horizon, temp=1, epochs=1, learning_rate=0.2, 67 | r_prior=None, r_vec=None, threshold=1e-3, check_grad_flag=False): 68 | """The RLSP algorithm""" 69 | def compute_grad(r_vec): 70 | # Compute the Boltzmann rational policy \pi_{s,a} = \exp(Q_{s,a} - V_s) 71 | policy = value_iter(mdp, 1, mdp.f_matrix @ r_vec, horizon, temp) 72 | d_last_step, d_last_step_list = compute_d_last_step( 73 | mdp, policy, p_0, horizon, return_all=True) 74 | if d_last_step[s_current] == 0: 75 | print('Error in om_method: No feasible trajectories!') 76 | return r_vec 77 | 78 | expected_features, expected_features_list = compute_feature_expectations( 79 | mdp, policy, p_0, horizon) 80 | 81 | G = compute_g(mdp, policy, p_0, horizon, d_last_step_list, expected_features_list) 82 | # Compute the gradient 83 | dL_dr_vec = G[s_current] / d_last_step[s_current] 84 | # Gradient of the prior 85 | if r_prior!= None: dL_dr_vec += r_prior.logdistr_grad(r_vec) 86 | return dL_dr_vec 87 | 88 | def compute_log_likelihood(r_vec): 89 | policy = value_iter(mdp, 1, mdp.f_matrix @ r_vec, horizon, temp) 90 | d_last_step = compute_d_last_step(mdp, policy, p_0, horizon) 91 | log_likelihood = np.log(d_last_step[s_current]) 92 | if r_prior!= None: log_likelihood += np.sum(r_prior.logpdf(r_vec)) 93 | return log_likelihood 94 | 95 | def get_grad(_): 96 | """dummy function for use with check_grad()""" 97 | return dL_dr_vec 98 | 99 | if r_vec is None: 100 | r_vec = 0.01*np.random.randn(mdp.f_matrix.shape[1]) 101 | print('Initial reward vector: {}'.format(r_vec)) 102 | 103 | if check_grad_flag: grad_error_list=[] 104 | 105 | for i in range(epochs): 106 | dL_dr_vec = compute_grad(r_vec) 107 | if check_grad_flag: 108 | grad_error_list.append(check_grad(compute_log_likelihood, get_grad, r_vec)) 109 | 110 | # Gradient ascent 111 | r_vec = r_vec + learning_rate * dL_dr_vec 112 | 113 | # with printoptions(precision=4, suppress=True): 114 | # print('Epoch {}; Reward vector: {}'.format(i, r_vec)) 115 | # if check_grad_flag: print('grad error: {}'.format(grad_error_list[-1])) 116 | 117 | if np.linalg.norm(dL_dr_vec) < threshold: 118 | if check_grad_flag: 119 | print() 120 | print('Max grad error: {}'.format(np.amax(np.asarray(grad_error_list)))) 121 | print('Median grad error: {}'.format(np.median(np.asarray(grad_error_list)))) 122 | break 123 | 124 | return r_vec 125 | -------------------------------------------------------------------------------- /src/run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import csv 3 | import datetime 4 | import numpy as np 5 | import os 6 | import sys 7 | 8 | from scipy.stats import uniform as uniform_distr 9 | 10 | from envs.apples import ApplesEnv, ApplesState 11 | from envs.apples_spec import APPLES_PROBLEMS 12 | from envs.batteries import BatteriesEnv, BatteriesState 13 | from envs.batteries_spec 
import BATTERIES_PROBLEMS 14 | from envs.room import RoomEnv, RoomState 15 | from envs.room_spec import ROOM_PROBLEMS 16 | from envs.train import TrainEnv, TrainState 17 | from envs.train_spec import TRAIN_PROBLEMS 18 | 19 | from relative_reachability import relative_reachability_penalty 20 | from rlsp import rlsp 21 | from sampling import sample_from_posterior 22 | from utils import norm_distr, laplace_distr, printoptions 23 | from value_iter import value_iter, evaluate_policy 24 | 25 | 26 | def print_rollout(env, start_state, policies, last_steps_printed, horizon): 27 | if last_steps_printed == 0: 28 | last_steps_printed = horizon 29 | 30 | env.reset(start_state) 31 | print("Executing the policy from state:") 32 | env.print_state(env.s); print() 33 | print('Last {} of the {} rolled out steps:'.format( 34 | last_steps_printed, horizon)) 35 | 36 | for i in range(horizon-1): 37 | s_num = env.get_num_from_state(env.s) 38 | a = np.random.choice(env.nA, p=policies[i][s_num,:]) 39 | env.step(a) 40 | 41 | if i>=(horizon-last_steps_printed-1): 42 | env.print_state(env.s); print() 43 | 44 | 45 | def forward_rl(env, r_planning, r_true, h=40, temp=0, last_steps_printed=0, 46 | current_s_num=None, weight=1, penalize_deviation=False, 47 | relative_reachability=False, print_level=1): 48 | '''Given an env and R, runs soft VI for h steps and rolls out the resulting policy''' 49 | current_state = env.get_state_from_num(current_s_num) 50 | r_s = env.f_matrix @ r_planning 51 | time_dependent_reward = False 52 | 53 | if penalize_deviation: 54 | diff = env.f_matrix - env.s_to_f(current_state).T 55 | r_s -= weight * np.linalg.norm(diff, axis=1) 56 | if relative_reachability: 57 | time_dependent_reward = True 58 | r_r = relative_reachability_penalty(env, h, current_s_num) 59 | r_s = np.expand_dims(r_s, 0) - weight * r_r 60 | 61 | # For evaluation, plan optimally instead of Boltzmann-rationally 62 | policies = value_iter(env, 1, r_s, h, temperature=temp, time_dependent_reward=time_dependent_reward) 63 | 64 | # For print level >= 1, print a rollout 65 | if print_level >= 1: 66 | print_rollout(env, current_state, policies, last_steps_printed, h) 67 | 68 | return evaluate_policy(env, policies, current_s_num, 1, env.f_matrix @ r_true, h) 69 | 70 | 71 | PROBLEMS = { 72 | 'room': ROOM_PROBLEMS, 73 | 'apples': APPLES_PROBLEMS, 74 | 'train': TRAIN_PROBLEMS, 75 | 'batteries': BATTERIES_PROBLEMS 76 | } 77 | 78 | ENV_CLASSES = { 79 | 'room': RoomEnv, 80 | 'apples': ApplesEnv, 81 | 'train': TrainEnv, 82 | 'batteries': BatteriesEnv 83 | } 84 | 85 | 86 | def get_problem_parameters(env_name, problem_name): 87 | if env_name not in ENV_CLASSES: 88 | raise ValueError('Environment {} is not one of {}'.format( 89 | env_name, list(ENV_CLASSES.keys()))) 90 | if problem_name not in PROBLEMS[env_name]: 91 | raise ValueError('Problem spec {} is not one of {}'.format( 92 | problem_name, list(PROBLEMS[env_name].keys()))) 93 | 94 | spec, cur_state, r_task, r_true = PROBLEMS[env_name][problem_name] 95 | env = ENV_CLASSES[env_name](spec) 96 | return env, env.get_num_from_state(cur_state), r_task, r_true 97 | 98 | 99 | def get_r_prior(prior, reward_center, std): 100 | if prior == "gaussian": 101 | return norm_distr(reward_center, std) 102 | elif prior == "laplace": 103 | return laplace_distr(reward_center, std) 104 | elif prior == "uniform": 105 | return None 106 | else: 107 | raise ValueError('Unknown prior {}'.format(prior)) 108 | 109 | 110 | def experiment_wrapper(env_name='vases', 111 | problem_spec='default', 112 | 
inference_algorithm='rlsp', 113 | combination_algorithm='additive', 114 | prior='gaussian', 115 | horizon=20, 116 | evaluation_horizon=0, 117 | temperature=1, 118 | learning_rate=.1, 119 | inferred_weight=1, 120 | epochs=200, 121 | uniform_prior=False, 122 | measures=['final_reward'], 123 | n_samples=10000, 124 | mcmc_burn_in=1000, 125 | step_size=.01, 126 | seed=0, 127 | std=0.5, 128 | print_level=1, 129 | soft_forward_rl=False, 130 | reward_constant=1.0): 131 | # Check the parameters so that we fail fast 132 | assert inference_algorithm in ['rlsp', 'sampling', 'deviation', 'reachability', 'spec'] 133 | assert combination_algorithm in ['additive', 'bayesian'] 134 | assert prior in ['gaussian', 'laplace', 'uniform'] 135 | assert all((measure in ['true_reward', 'final_reward'] for measure in measures)) 136 | 137 | if evaluation_horizon==0: 138 | evaluation_horizon = horizon 139 | 140 | if combination_algorithm == 'bayesian': 141 | assert inference_algorithm in ['rlsp', 'sampling'] 142 | 143 | np.random.seed(seed) 144 | env, s_current, r_task, r_true = get_problem_parameters(env_name, problem_spec) 145 | 146 | if print_level >= 1: 147 | print('Initial state:') 148 | env.print_state(env.init_state) 149 | print() 150 | 151 | p_0 = env.get_initial_state_distribution(known_initial_state=not uniform_prior) 152 | 153 | deviation = inference_algorithm == "deviation" 154 | reachability = inference_algorithm == "reachability" 155 | reward_center = r_task if combination_algorithm == "bayesian" else np.zeros(env.num_features) 156 | r_prior = get_r_prior(prior, reward_center, std) 157 | 158 | # Infer reward by observing the world state 159 | if inference_algorithm == "rlsp": 160 | r_inferred = rlsp(env, s_current, p_0, horizon, temperature, epochs, learning_rate, r_prior) 161 | elif inference_algorithm == "sampling": 162 | r_samples = sample_from_posterior( 163 | env, s_current, p_0, horizon, temperature, n_samples, step_size, 164 | r_prior, gamma=1, print_level=print_level) 165 | r_inferred = np.mean(r_samples[mcmc_burn_in::], axis=0) 166 | elif inference_algorithm in ["deviation", "reachability", "spec"]: 167 | r_inferred = None 168 | else: 169 | raise ValueError('Unknown inference algorithm: {}'.format(inference_algorithm)) 170 | 171 | if print_level >= 1 and r_inferred is not None: 172 | with printoptions(precision=4, suppress=True): 173 | print(); print('Inferred reward vector: ', r_inferred) 174 | 175 | # Run forward RL to evaluate 176 | def evaluate(forward_rl_temp): 177 | if combination_algorithm == "additive": 178 | r_final = r_task 179 | if r_inferred is not None: 180 | r_final = r_task + inferred_weight * r_inferred 181 | true_reward_obtained = forward_rl(env, r_final, r_true, temp=forward_rl_temp, h=evaluation_horizon, current_s_num=s_current, weight=inferred_weight, penalize_deviation=deviation, relative_reachability=reachability, print_level=print_level) 182 | elif combination_algorithm == "bayesian": 183 | assert r_inferred is not None 184 | assert (not deviation) and (not reachability) 185 | r_final = r_inferred 186 | true_reward_obtained = forward_rl(env, r_final, r_true, temp=forward_rl_temp, h=evaluation_horizon, current_s_num=s_current, penalize_deviation=False, relative_reachability=False, print_level=print_level) 187 | else: 188 | raise ValueError('Unknown combination algorithm: {}'.format(combination_algorithm)) 189 | 190 | best_possible_reward = forward_rl(env, r_true, r_true, temp=forward_rl_temp, h=evaluation_horizon, current_s_num=s_current, print_level=0) 191 | 192 | # Add 
the reward constant in 193 | true_reward_obtained += reward_constant * evaluation_horizon 194 | best_possible_reward += reward_constant * evaluation_horizon 195 | 196 | def get_measure(measure): 197 | if measure == 'final_reward': 198 | return r_final 199 | elif measure == 'true_reward': 200 | return true_reward_obtained * 1.0 / best_possible_reward 201 | else: 202 | raise ValueError('Unknown measure {}'.format(measure)) 203 | 204 | return [get_measure(measure) for measure in measures] 205 | 206 | if soft_forward_rl: 207 | return [evaluate(temp) for temp in [0, 0.1, 0.5, 1, 5, 10]] 208 | else: 209 | return [evaluate(0.0)] 210 | 211 | 212 | 213 | # The command line parameters that should be included in the filename of the 214 | # file summarizing the results. 215 | PARAMETERS = [ 216 | ('-e', '--env_name', 'room', None, 217 | 'Environment to run: one of [vases, boxes, room, apples, train, batteries]'), 218 | ('-p', '--problem_spec', 'default', None, 219 | 'The name of the problem specification to solve.'), 220 | ('-i', '--inference_algorithm', 'spec', None, 221 | 'Frame condition inference algorithm: one of [rlsp, sampling, deviation, reachability, spec].'), 222 | ('-c', '--combination_algorithm', 'additive', None, 223 | 'How to combine the task reward and inferred reward for forward RL: one of [additive, bayesian]. bayesian only has an effect if algorithm is rlsp or sampling.'), 224 | ('-r', '--prior', 'gaussian', None, 225 | 'Prior on the inferred reward function: one of [gaussian, laplace, uniform]. Centered at zero if combination_algorithm is additive, and at the task reward if combination_algorithm is bayesian. Only has an effect if inference_algorithm is rlsp or sampling.'), 226 | ('-T', '--horizon', '20', int, 227 | 'Number of timesteps we assume the human has been acting.'), 228 | ('-x', '--evaluation_horizon', '0', int, 229 | 'Number of timesteps we act after inferring the reward.'), 230 | ('-t', '--temperature', '1.0', float, 231 | 'Boltzmann rationality constant for the human. Note this is temperature, which is the inverse of beta.'), 232 | ('-l', '--learning_rate', '0.1', float, 233 | 'Learning rate for gradient descent. Applies when inference_algorithm is rlsp.'), 234 | ('-w', '--inferred_weight', '1', float, 235 | 'Weight for the inferred reward when adding task and inferred rewards. Applies if combination_algorithm is additive.'), 236 | ('-m', '--epochs', '50', int, 237 | 'Number of gradient descent steps to take.'), 238 | ('-u', '--uniform_prior', 'False', lambda x: x != 'False', 239 | 'Whether to use a uniform prior over initial states, or to know the initial state. Either true or false.'), 240 | ('-d', '--dependent_vars', 'final_reward', None, 241 | 'Dependent variables to measure and report'), 242 | ('-n', '--n_samples', '10000', int, 243 | 'Number of samples to generate with MCMC'), 244 | ('-b', '--mcmc_burn_in', '1000', int, 245 | 'Number of samples to ignore at the start'), 246 | ('-z', '--step_size', '0.01', float, 247 | 'Step size for computing neighbor reward functions. 
Only has an effect if inference_algorithm is sampling.'), 248 | ('-s', '--seed', '0', int, 249 | 'Random seed.'), 250 | ('-k', '--std', '0.5', float, 251 | 'Standard deviation for the prior'), 252 | ('-v', '--print_level', '1', int, 253 | 'Level of verbosity.'), 254 | ('-f', '--soft_forward_rl', 'False', lambda x: x != 'False', 255 | 'Evaluate with a range of temperatures for soft VI for forward RL if true, else evaluate with hard VI for forward RL'), 256 | ('-q', '--reward_constant', '1.0', float, 257 | 'Living reward provided when evaluating performance.'), 258 | ] 259 | 260 | # Writing output for experiments 261 | def get_filename(args): 262 | # Drop the '--' in front of the names 263 | param_short_names = [name[1:] for name, _, _, _, _ in PARAMETERS] 264 | param_names = [name[2:] for _, name, _, _, _ in PARAMETERS] 265 | param_values = [args.__dict__[name] for name in param_names] 266 | 267 | filename = '{}-' + '={}-'.join(param_short_names) + '={}.csv' 268 | #time_str = str(datetime.datetime.now()).replace(':', '-').replace('.', '-').replace(' ', '-') 269 | time_str = 'res' 270 | filename = filename.format(time_str, *param_values) 271 | return args.output_folder + '/' + filename 272 | 273 | def write_output(results, indep_var, indep_vals, dependent_vars, args): 274 | with open(get_filename(args), 'w', newline='') as csvfile: 275 | writer = csv.DictWriter(csvfile, fieldnames=[indep_var] + dependent_vars) 276 | writer.writeheader() 277 | for indep_val, result in zip(indep_vals, results): 278 | row = {} 279 | row[indep_var] = indep_val 280 | for dependent_var, dependent_val in zip(dependent_vars, result): 281 | row[dependent_var] = dependent_val 282 | writer.writerow(row) 283 | 284 | 285 | # Command-line arguments 286 | def parse_args(args=None): 287 | parser = argparse.ArgumentParser() 288 | for name, long_name, default, _, help_str in PARAMETERS: 289 | parser.add_argument(name, long_name, type=str, default=default, help=help_str) 290 | 291 | # Parameters that shouldn't be included in the filename. 
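    # Illustrative sketch (the values below are just the defaults listed above):
    # a call along the lines of
    #     python src/run.py -e room -i rlsp -T 20 -o results
    # writes its summary CSV to results/res-e=room-p=default-i=rlsp-...-q=1.0.csv,
    # i.e. one "<short flag>=<value>" segment per entry in PARAMETERS, which is
    # what plotting.get_stats parses to recover the swept parameter value.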
292 | parser.add_argument('-o', '--output_folder', type=str, default='', 293 | help='Output folder') 294 | return parser.parse_args(args) 295 | 296 | 297 | def setup_experiment(args): 298 | indep_vars_dict, control_vars_dict = {}, {} 299 | 300 | for _, var, _, fn, _ in PARAMETERS: 301 | var = var[2:] 302 | if var == 'dependent_vars': continue 303 | if fn is None: fn = lambda x: x 304 | 305 | vals = [fn(x) for x in args.__dict__[var].split(',')] 306 | if len(vals) > 1: 307 | indep_vars_dict[var] = vals 308 | else: 309 | control_vars_dict[var] = vals[0] 310 | 311 | return indep_vars_dict, control_vars_dict, args.dependent_vars.split(',') 312 | 313 | 314 | def main(): 315 | if sys.platform == "win32": 316 | import colorama; colorama.init() 317 | 318 | args = parse_args() 319 | print(args) 320 | indep_vars_dict, control_vars_dict, dependent_vars = setup_experiment(args) 321 | # print(indep_vars_dict, control_vars_dict, dependent_vars) 322 | # For now, restrict to zero or one independent variables, but it 323 | # could be generalized to two variables 324 | if len(indep_vars_dict) == 0: 325 | indep_var = 'N/A' 326 | indep_vals = ['N/A'] 327 | results = [[] for _ in range(len(dependent_vars))] 328 | for condition_result in experiment_wrapper(measures=dependent_vars, **control_vars_dict): 329 | for i, result in enumerate(condition_result): 330 | results[i].append(result) 331 | results = [results] 332 | elif len(indep_vars_dict) == 1: 333 | indep_var = next(iter(indep_vars_dict.keys())) 334 | indep_vals = indep_vars_dict[indep_var] 335 | results = [] 336 | for indep_val in indep_vals: 337 | curr_results = [[] for _ in range(len(dependent_vars))] 338 | experiment_args = control_vars_dict.copy() 339 | experiment_args[indep_var] = indep_val 340 | experiment_args['measures'] = dependent_vars 341 | for condition_result in experiment_wrapper(**experiment_args): 342 | for i, result in enumerate(condition_result): 343 | curr_results[i].append(result) 344 | results.append(curr_results) 345 | else: 346 | raise ValueError('Can only support up to one independent variable (that is, a flag with multiple comma-separated values)') 347 | 348 | if args.output_folder == '' or os.path.isfile(get_filename(args)): 349 | print(results) 350 | else: 351 | write_output(results, indep_var, indep_vals, dependent_vars, args) 352 | 353 | 354 | if __name__ == '__main__': 355 | main() 356 | -------------------------------------------------------------------------------- /src/sampling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from math import exp 3 | 4 | from value_iter import value_iter 5 | from rlsp import compute_d_last_step 6 | 7 | 8 | def sample_from_posterior( 9 | env, s_current, p_0, h, temp, n_samples, step_size, r_prior, gamma=1, 10 | print_level=1): 11 | """ 12 | Algorithm similar to BIRL that uses the last-step OM of a Boltzmann rational 13 | policy instead of the BIRL likelihood. Samples the reward from the posterior 14 | p(r | s_T, r_spec) \propto p(s_T | \theta) * p(r | r_spec). 15 | 16 | This is Algorithm 1 in Appendix C of the paper. 
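    Each step proposes r' ~ Normal(r, step_size) and accepts it with
    probability min(1, exp(log_p(r') - log_p(r))), where log_p is the
    unnormalized log posterior (log of the last-step occupancy at s_current
    plus the log prior).

    A minimal usage sketch with the run.py defaults, assuming env, s_current,
    p_0 and r_prior have been set up as in run.py:

        samples = sample_from_posterior(env, s_current, p_0, h=20, temp=1.0,
                                        n_samples=10000, step_size=0.01,
                                        r_prior=r_prior)
        r_inferred = np.mean(samples[1000:], axis=0)  # drop the MCMC burn-in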
17 |     """
18 | 
19 |     def log_last_step_om(policy):
20 |         d_last_step = compute_d_last_step(env, policy, p_0, h)
21 |         return np.log(d_last_step[s_current])
22 | 
23 |     def log_probability(r_vec, verbose=False):
24 |         pi = value_iter(env, gamma, env.f_matrix @ r_vec, h, temp)
25 |         log_p = log_last_step_om(pi)
26 | 
27 |         log_prior = 0
28 |         if r_prior is not None:
29 |             log_prior = np.sum(r_prior.logpdf(r_vec))
30 | 
31 |         if verbose:
32 |             print('Log prior: {}\nLog prob: {}\nTotal: {}'.format(
33 |                 log_prior, log_p, log_p + log_prior))
34 |         return log_p + log_prior
35 | 
36 |     times_accepted = 0
37 |     samples = []
38 | 
39 |     if r_prior is None:
40 |         r = .01*np.random.randn(env.num_features)
41 |     else:
42 |         r = 0.1 * r_prior.rvs()
43 | 
44 |     if print_level >= 1:
45 |         print('Initial reward: {}'.format(r))
46 | 
47 |     # probability of the initial reward
48 |     log_p = log_probability(r, verbose=(print_level >= 1))
49 | 
50 |     while len(samples) < n_samples:
51 |         verbose = (print_level >= 1) and (len(samples) % 200 == 199)
52 |         if verbose:
53 |             print('\nGenerating sample {}'.format(len(samples) + 1))
54 | 
55 |         r_prime = np.random.normal(r, step_size)
56 |         log_p_1 = log_probability(r_prime, verbose=verbose)
57 | 
58 |         # Metropolis-Hastings step: accept or reject the proposal.
59 |         # If we reject, the new sample is the previous sample.
60 |         acceptance_probability = exp(min(0, log_p_1 - log_p))  # min(0, .) caps the ratio at 1 and avoids overflow in exp
61 |         if np.random.uniform() < acceptance_probability:
62 |             times_accepted += 1
63 |             r, log_p = r_prime, log_p_1
64 |         samples.append(r)
65 | 
66 | 
67 |         if verbose:
68 |             # Acceptance probability should not be very high or very low
69 |             print('Acceptance probability is {}'.format(acceptance_probability))
70 | 
71 |     if print_level >= 1:
72 |         print('Done! Acceptance rate: {}'.format(times_accepted/n_samples))
73 |     return samples
74 | 
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 | import contextlib
2 | import numpy as np
3 | from scipy.stats import norm, laplace
4 | 
5 | 
6 | class norm_distr(object):
7 |     def __init__(self, mu, sigma=1):
8 |         self.mu = mu
9 |         self.sigma = sigma
10 |         self.distribution = norm(loc=mu, scale=sigma)
11 | 
12 |     def rvs(self):
13 |         '''sample'''
14 |         return self.distribution.rvs()
15 | 
16 |     def pdf(self, x):
17 |         return self.distribution.pdf(x)
18 | 
19 |     def logpdf(self, x):
20 |         return self.distribution.logpdf(x)
21 | 
22 |     def logdistr_grad(self, x):
23 |         return (self.mu-x)/(self.sigma**2)  # gradient of the Gaussian log pdf with respect to x
24 | 
25 | 
26 | class laplace_distr(object):
27 |     def __init__(self, mu, b=1):
28 |         self.mu = mu
29 |         self.b = b
30 |         self.distribution = laplace(loc=mu, scale=b)
31 | 
32 |     def rvs(self):
33 |         '''sample'''
34 |         return self.distribution.rvs()
35 | 
36 |     def pdf(self, x):
37 |         return self.distribution.pdf(x)
38 | 
39 |     def logpdf(self, x):
40 |         return self.distribution.logpdf(x)
41 | 
42 |     def logdistr_grad(self, x):
43 |         return (self.mu-x)/(np.fabs(x-self.mu)*self.b)  # -sign(x-mu)/b, the gradient of the Laplace log pdf
44 | 
45 | 
46 | @contextlib.contextmanager
47 | def printoptions(*args, **kwargs):
48 |     original = np.get_printoptions()
49 |     np.set_printoptions(*args, **kwargs)
50 |     try:
51 |         yield
52 |     finally:
53 |         np.set_printoptions(**original)
54 | 
--------------------------------------------------------------------------------
/src/value_iter.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | def value_iter(mdp, gamma, r, horizon, temperature=1, threshold=1e-10, time_dependent_reward=False):
5 |     """
6 |     Finds the optimal state and state-action value functions via value
7 |     iteration with the "soft" max-ent Bellman backup:
8 | 
9 |     Q_{sa} = r_s + gamma * \sum_{s'} p(s'|s,a)V_{s'}
10 |     V'_s = temperature * log(\sum_a exp(Q_{sa}/temperature))
11 | 
12 |     Computes the Boltzmann rational policy
13 |     \pi_{s,a} = exp((Q_{s,a} - V_s)/temperature).
14 | 
15 |     Parameters
16 |     ----------
17 |     mdp : object
18 |         Instance of the Env class (see envs/env.py).
19 | 
20 |     gamma : float
21 |         Discount factor; 0<=gamma<=1.
22 |     r : 1D numpy array
23 |         Reward vector with length equal to the number of states in the MDP,
24 |         or a list of horizon such vectors if time_dependent_reward is True.
25 |     horizon : int
26 |         Horizon for the finite horizon version of value iteration.
27 |     temperature : float
28 |         Rationality constant; temperature 0 recovers standard (hard-max) value iteration.
29 |     threshold : float
30 |         Convergence threshold (unused in this finite-horizon implementation).
31 | 
32 |     Returns
33 |     -------
34 |     List of 2D numpy arrays
35 |         The time-dependent Boltzmann rational policy for the reward r:
36 |         a list of length horizon-1 in which each policies[t] has shape
37 |         (mdp.nS, mdp.nA) and policies[t][s,a] is the probability of
38 |         taking action a in state s at time t.
39 |         (The state values V and state-action values Q are computed
40 |         during the backward pass but are not returned; only the
41 |         resulting policy is.)
42 | 
43 |     """
44 |     nS, nA = mdp.nS, mdp.nA
45 |     # Functions for computing the policy
46 |     expt = lambda x: np.exp(x/temperature)
47 |     tlog = lambda x: temperature * np.log(x)
48 | 
49 |     if not time_dependent_reward:
50 |         r = [r] * horizon # Fast, since we aren't making copies
51 | 
52 |     policies = []
53 |     V = np.copy(r[horizon-1])
54 |     for t in range(horizon-2, -1, -1):
55 |         future_values = mdp.T_matrix.dot(V).reshape((nS, nA))
56 |         Q = np.expand_dims(r[t], axis=1) + gamma * future_values
57 | 
58 |         if temperature==0:
59 |             V = Q.max(axis=1)
60 |             # Argmax to find the action number, then index into np.eye to
61 |             # one hot encode. Note this will deterministically break ties
62 |             # towards the smaller action.
63 |             policy = np.eye(nA)[np.argmax(Q, axis=1)]
64 |         else:
65 |             # ∀ s: V_s = temperature * log(\sum_a exp(Q_sa/temperature))
66 |             # ∀ s,a: policy_{s,a} = exp((Q_{s,a} - V_s)/temperature)
67 |             V = softmax(Q, temperature)
68 |             policy = expt(Q - np.expand_dims(V, axis=1))
69 | 
70 | 
71 |         policies.append(policy)
72 | 
73 |         if gamma==1:
74 |             # When \gamma=1, the backup operator is equivariant under adding
75 |             # a constant to all entries of V, so we can translate min(V)
76 |             # to be 0 at each step of the softmax value iteration without
77 |             # changing the policy it converges to, and this fixes the problem
78 |             # where log(nA) keeps getting added at each iteration.
79 |             V = V - np.amin(V)
80 | 
81 |     return policies[::-1]
82 | 
83 | 
84 | def evaluate_policy(mdp, policy, start, gamma, r, horizon):
85 |     """Expected cumulative discounted reward of the time-dependent policy from the start state."""
86 |     V = r
87 |     for t in range(horizon-2, -1, -1):
88 |         future_values = mdp.T_matrix.dot(V).reshape((mdp.nS, mdp.nA))
89 |         Q = np.expand_dims(r, axis=1) + gamma * future_values
90 |         V = np.sum(policy[t] * Q, axis=1)
91 |     return V[start]
92 | 
93 | 
94 | def softmax(x, t=1):
95 |     """
96 |     Numerically stable computation of t*log(\sum_j^n exp(x_j / t))
97 | 
98 |     If the input is a 1D numpy array, computes its soft maximum:
99 |         output = t*log(\sum_j^n exp(x_j / t)).
100 |     If the input is a 2D numpy array, computes the soft maximum of each of the rows:
101 |         output_i = t*log(\sum_j^n exp(x_{ij} / t))
102 | 
103 |     Parameters
104 |     ----------
105 |     x : 1D or 2D numpy array
106 |     t : non-negative float, the temperature; t=0 gives the ordinary (hard) max.
107 | 
108 |     Returns
109 |     -------
110 |     1D numpy array
111 |         shape = (n,), where:
112 |             n = 1 if x was 1D, or
113 |             n is the number of rows (=x.shape[0]) if x was 2D.
114 |     """
115 |     assert t>=0
116 |     if len(x.shape) == 1: x = x.reshape((1,-1))
117 |     if t == 0: return np.amax(x, axis=1)
118 |     if x.shape[1] == 1: return x
119 | 
120 |     def softmax_2_arg(x1,x2, t):
121 |         """
122 |         Numerically stable computation of t*log(exp(x1/t) + exp(x2/t))
123 | 
124 |         Parameters
125 |         ----------
126 |         x1 : numpy array of shape (n,)
127 |         x2 : numpy array of shape (n,)
128 | 
129 |         Returns
130 |         -------
131 |         numpy array of shape (n,)
132 |             Each output_i = t*log(exp(x1_i / t) + exp(x2_i / t))
133 |         """
134 |         tlog = lambda x: t * np.log(x)
135 |         expt = lambda x: np.exp(x/t)
136 | 
137 |         max_x = np.amax((x1,x2),axis=0)
138 |         min_x = np.amin((x1,x2),axis=0)
139 |         return max_x + tlog(1+expt((min_x - max_x)))
140 | 
141 |     sm = softmax_2_arg(x[:,0],x[:,1], t)
142 |     # Use the following property of softmax_2_arg (written here for t=1):
143 |     # softmax_2_arg(softmax_2_arg(x1,x2),x3) = log(exp(x1) + exp(x2) + exp(x3))
144 |     # which is true since
145 |     # log(exp(log(exp(x1) + exp(x2))) + exp(x3)) = log(exp(x1) + exp(x2) + exp(x3))
146 |     for (i, x_i) in enumerate(x.T):
147 |         if i>1: sm = softmax_2_arg(sm, x_i, t)
148 |     return sm
149 | 
--------------------------------------------------------------------------------
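
The usage sketch below is not part of the repository; it is included only to illustrate the interface that `value_iter` and `evaluate_policy` expect. The toy MDP (its sizes, transition matrix, and reward vector) is invented for illustration: as implied by the reshape in `value_iter`, the `mdp` object only needs to expose `nS`, `nA`, and a transition matrix `T_matrix` of shape `(nS * nA, nS)` whose row `s * nA + a` holds `p(s' | s, a)`. The real environments in `src/envs/` provide these attributes (plus the feature matrix `f_matrix` used by `rlsp.py` and `sampling.py`). The sketch assumes `src/` is on the Python path, matching the flat imports used throughout the code.

import numpy as np
from types import SimpleNamespace

from value_iter import value_iter, evaluate_policy

# Toy 3-state, 2-action MDP: action 0 stays put, action 1 moves to the
# next state (cyclically). Row s * nA + a of T_matrix is p(. | s, a).
nS, nA = 3, 2
T = np.zeros((nS * nA, nS))
for s in range(nS):
    T[s * nA + 0, s] = 1.0
    T[s * nA + 1, (s + 1) % nS] = 1.0

mdp = SimpleNamespace(nS=nS, nA=nA, T_matrix=T)
r = np.array([0.0, 0.0, 1.0])  # state 2 is the only rewarding state

# Soft value iteration returns a time-dependent Boltzmann rational policy:
# a list of horizon-1 arrays of shape (nS, nA) whose rows sum to 1.
policies = value_iter(mdp, gamma=1, r=r, horizon=5, temperature=1)
print(len(policies), policies[0].shape)   # 4 (3, 2)

# Expected return of that policy when starting in state 0.
print(evaluate_policy(mdp, policies, start=0, gamma=1, r=r, horizon=5))

Passing temperature=0 instead yields deterministic argmax policies (ties broken toward the smaller action index), per the temperature==0 branch of value_iter above.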