├── .gitignore ├── LICENSE.txt ├── README.md ├── params ├── acrobot │ └── sarsa.json ├── cartpole │ ├── sarsa.json │ ├── sarsa_alphabound.json │ └── sarsa_twopoles.json ├── chain │ └── delayed_qlearning.json ├── mountaincar │ ├── example_randtrial.json │ ├── ilstd.json │ ├── lspi.json │ ├── lstd.json │ ├── lstdq.json │ ├── mdba.json │ ├── mdq.json │ ├── mdsarsa.json │ ├── modelbased.json │ ├── nac_lstdq.json │ ├── nacs.json │ ├── olstd.json │ ├── qlearning.json │ ├── rlstd.json │ ├── sarsa.json │ ├── sarsa_ann.json │ ├── sarsa_lecun.json │ ├── ttac1.json │ └── ttnac3.json └── puddleworld │ └── sarsa.json ├── pyrl ├── .gitignore ├── Makefile ├── README.md ├── TODO ├── __init__.py ├── agents │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── delayed_qlearning.py │ ├── lstd.py │ ├── mirror_descent.py │ ├── modelbased.py │ ├── models │ │ ├── .gitignore │ │ ├── README.md │ │ ├── __init__.py │ │ ├── batch_model.py │ │ └── model.py │ ├── planners │ │ ├── .gitignore │ │ ├── README.md │ │ ├── __init__.py │ │ ├── fitted_qiteration.py │ │ └── planner.py │ ├── policy_gradient.py │ ├── qlearning.py │ ├── sarsa_lambda.py │ ├── sarsa_lambda_ann.py │ ├── skeleton_agent.py │ └── stepsizes.py ├── basis │ ├── .gitignore │ ├── CTiles │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── __init__.py │ │ ├── doc │ │ │ └── tiles.html │ │ ├── fancytiles.py │ │ ├── src │ │ │ ├── tiles.cpp │ │ │ ├── tiles.h │ │ │ └── tilesInt.C │ │ └── tiletimes.py │ ├── README.md │ ├── Tiles │ │ ├── .gitignore │ │ ├── README.md │ │ ├── __init__.py │ │ ├── fancytiles.py │ │ ├── tiles.py │ │ └── tiletimes.py │ ├── __init__.py │ ├── fourier.py │ ├── rbf.py │ ├── tilecode.py │ └── trivial.py ├── environments │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── acrobot.py │ ├── batch_replenish.py │ ├── bicycle.py │ ├── cartpole.py │ ├── chain.py │ ├── configs │ │ ├── neurostim │ │ │ ├── params.dat │ │ │ ├── test_features.dat │ │ │ ├── test_labels.dat │ │ │ └── test_stimulation.dat │ │ 
├── pinball │ │ │ ├── pinball_hard_single.cfg │ │ │ └── pinball_simple_single.cfg │ │ ├── pomdps │ │ │ └── tiger.POMDP │ │ └── tetris │ │ │ ├── 3brick.dat │ │ │ ├── melax.dat │ │ │ ├── standard.dat │ │ │ └── sztetris.dat │ ├── fuelworld.py │ ├── gridworld.py │ ├── libPOMDP │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── COPYING │ │ ├── README.md │ │ ├── __init__.py │ │ └── src │ │ │ ├── imm-reward.c │ │ │ ├── imm-reward.h │ │ │ ├── libpomdp.c │ │ │ ├── libpomdp.h │ │ │ ├── mdp-common.h │ │ │ ├── mdp.c │ │ │ ├── mdp.h │ │ │ ├── parse_constant.h │ │ │ ├── parse_err.c │ │ │ ├── parse_err.h │ │ │ ├── parse_hash.c │ │ │ ├── parse_hash.h │ │ │ ├── parser.y │ │ │ ├── scanner.l │ │ │ ├── sparse-matrix.c │ │ │ └── sparse-matrix.h │ ├── marble_maze.py │ ├── mdptetris │ │ ├── CMakeLists.txt │ │ ├── COPYING │ │ ├── README.md │ │ ├── __init__.py │ │ ├── data │ │ │ ├── features │ │ │ │ ├── bertsekas_initial.dat │ │ │ │ ├── ce_bdu.dat │ │ │ │ ├── ce_bertsekas.dat │ │ │ │ ├── ce_bertsekas_dellacherie.dat │ │ │ │ ├── ce_dellacherie.dat │ │ │ │ ├── ce_du.dat │ │ │ │ ├── dellacherie_initial.dat │ │ │ │ ├── dellacherie_ourwellsums.dat │ │ │ │ ├── record_bdu.dat │ │ │ │ ├── record_du.dat │ │ │ │ └── value_estimator_bertsekas.dat │ │ │ ├── pieces3.dat │ │ │ ├── pieces4.dat │ │ │ └── pieces_melax.dat │ │ └── src │ │ │ ├── board.c │ │ │ ├── board.h │ │ │ ├── brick_masks.c │ │ │ ├── brick_masks.h │ │ │ ├── common_parameters.c │ │ │ ├── common_parameters.h │ │ │ ├── config.h │ │ │ ├── feature_functions.c │ │ │ ├── feature_functions.h │ │ │ ├── feature_policy.c │ │ │ ├── feature_policy.h │ │ │ ├── file_tools.c │ │ │ ├── file_tools.h │ │ │ ├── game.c │ │ │ ├── game.h │ │ │ ├── games_statistics.c │ │ │ ├── games_statistics.h │ │ │ ├── hashtable.c │ │ │ ├── hashtable.h │ │ │ ├── interruptions.c │ │ │ ├── interruptions.h │ │ │ ├── last_move_info.c │ │ │ ├── last_move_info.h │ │ │ ├── macros.h │ │ │ ├── mdptetris.c │ │ │ ├── piece.c │ │ │ ├── piece.h │ │ │ ├── random.c │ │ │ ├── random.h │ │ │ 
├── rewards.c │ │ │ ├── rewards.h │ │ │ ├── simple_tetris.c │ │ │ ├── simple_tetris.h │ │ │ ├── tetris.c │ │ │ └── types.h │ ├── mountaincar.py │ ├── multiroom.py │ ├── neurostim.py │ ├── pinball.py │ ├── pomdp.py │ ├── puddleworld.py │ ├── skeleton_environment.py │ ├── taxi.py │ ├── tetris.py │ ├── twip.py │ └── windyworld.py ├── experiments │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── episodic.py │ └── randomized.py ├── misc │ ├── .gitignore │ ├── __init__.py │ ├── json.py │ ├── matrix.py │ ├── parameter.py │ └── timer.py ├── rlglue │ ├── .gitignore │ ├── RLGlueLocal.py │ ├── TaskSpecRLGlue.py │ ├── __init__.py │ ├── registry.py │ └── run.py └── visualizers │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── compareParameters.py │ ├── plotExperiment.py │ └── plotParameters.py └── scripts ├── generate_spearmint.sh ├── spearmint_config.py └── spearmint_template.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[co] 2 | *~ 3 | # Packages 4 | *.egg 5 | *.egg-info 6 | dist 7 | build 8 | eggs 9 | parts 10 | bin 11 | var 12 | sdist 13 | develop-eggs 14 | .installed.cfg 15 | 16 | # Installer logs 17 | pip-log.txt 18 | 19 | # Unit test / coverage reports 20 | .coverage 21 | .tox 22 | 23 | #Translations 24 | *.mo 25 | 26 | #Mr Developer 27 | .mr.developer.cfg 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | python-rl 2 | ========= 3 | 4 | Some Reinforcement Learning in Python 5 | 6 | 7 | Run with: 8 | 9 | python -m pyrl.rlglue.run 10 | 11 | Many other run options exist. A good starting point is with the command line help: 12 | 13 | python -m pyrl.rlglue.run --help 14 | 15 | The params/ directory contains examples of experiments that demonstrate many of the different agent algorithms. 
16 | As an example, a randomized trial experiment using mountain car, and a randomly generated 'fixed policy' can be 17 | run with: 18 | 19 | python -m pyrl.rlglue.run --load params/mountaincar/example_randtrial.json 20 | 21 | The out put of this particular experiment is of the form: 22 | #evaluation points, list of evaluation index and evaluation value pairs, list of parameter values 23 | 24 | For example: 25 | 1,0,-4999.0,0.0,0.219169344211,0.1,1.0,0.7,1,13709650200845 26 | 27 | For this, there is oly 1 evaluation point (which is because this experiment only runs one episode). 28 | Then the evaluation index is zero, for the zero-th episode, followed by the return for that episode. 29 | Then we see a learning rate of 0.0 (because this is a fixed policy), followed by other parameters of 30 | Sarsa which in this case are not important. The final value of the line is the random seed used to 31 | generate the fixed policy. 32 | 33 | 34 | Contributors 35 | ============ 36 | Will Dabney 37 | 38 | Pierre-Luc Bacon 39 | -------------------------------------------------------------------------------- /params/acrobot/sarsa.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": {"params": {}, "name": "Acrobot"}, 3 | "experiment": {"params": {"num_episodes": 50, "num_runs": 30}, "name": "Episodic"}, 4 | "agent": {"params": { 5 | "alpha": 0.003, 6 | "epsilon": 0.01, 7 | "gamma": 0.99, 8 | "lmbda": 0.7, 9 | "basis": "fourier", 10 | "fourier_order": 3 11 | }, "name": "Sarsa"} 12 | } 13 | -------------------------------------------------------------------------------- /params/cartpole/sarsa.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": {"params": {}, "name": "Cart Pole"}, 3 | "experiment": {"params": {"num_episodes": 50, "num_runs": 30}, "name": "Episodic"}, 4 | "agent": { 5 | "params": { 6 | "basis": "fourier", 7 | "fourier_order": 3, 8 | "alpha": 0.005, 9 | 
"gamma": 0.99, 10 | "lmbda": 0.7, 11 | "epsilon": 0.01 12 | }, 13 | "name": "Sarsa" 14 | } 15 | } -------------------------------------------------------------------------------- /params/cartpole/sarsa_alphabound.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": {"params": {}, "name": "Cart Pole"}, 3 | "experiment": {"params": {"num_episodes": 30}, "name": "Episodic"}, 4 | "agent": {"params": { 5 | "alpha": 1.0, 6 | "epsilon": 0.01, 7 | "gamma": 0.99, 8 | "lmbda": 0.7, 9 | "basis": "fourier", 10 | "fourier_order": 3 11 | }, "name": "Adaptive (AlphaBound) Sarsa"} 12 | } 13 | -------------------------------------------------------------------------------- /params/cartpole/sarsa_twopoles.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": {"params": {"pole_scales": [1.0, 0.1]}, "name": "Cart Pole"}, 3 | "experiment": {"params": {"num_episodes": 10000, "num_runs": 10}, "name": "Episodic"}, 4 | "agent": { 5 | "params": { 6 | "basis": "fourier", 7 | "fourier_order": 5, 8 | "alpha": 0.00001, 9 | "gamma": 0.99, 10 | "lmbda": 0.7, 11 | "epsilon": 0.01 12 | }, 13 | "name": "Sarsa" 14 | } 15 | } -------------------------------------------------------------------------------- /params/chain/delayed_qlearning.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": {"params": {}, "name": "Chain"}, 3 | "experiment": {"params": {"num_episodes": 10, "maxsteps":1000, "num_runs": 10}, "name": "Episodic"}, 4 | "agent": {"params": { 5 | "m": 10, 6 | "gamma": 0.99, 7 | "epsilon": 0.1 8 | }, "name": "Delayed Q-Learning"} 9 | } 10 | -------------------------------------------------------------------------------- /params/mountaincar/example_randtrial.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": {"params": {}, "name": "Mountain Car"}, 3 | 
"experiment": {"params": {"num_trials": 30, "num_runs": 10, "num_episodes": 1}, "name": "Randomized Trial"}, 4 | "agent": { 5 | "params": 6 | { 7 | "alpha": 0.1, 8 | "lmbda": 0.7, 9 | "gamma": 1.0, 10 | "softmax": true 11 | }, "name": "Fixed Policy"} 12 | } -------------------------------------------------------------------------------- /params/mountaincar/ilstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": { 3 | "params": { 4 | }, "name": "Mountain Car"}, 5 | "experiment": { 6 | "params": { 7 | "num_episodes": 50 8 | }, "name": "Episodic"}, 9 | "agent": { 10 | "params": { 11 | "alpha": 0.000001, 12 | "epsilon": 0.01, 13 | "ilstd_sweeps": 1, 14 | "lmbda": 0.0, 15 | "gamma": 0.99, 16 | "basis": "fourier", 17 | "fourier_order": 3 18 | }, "name": "Incremental Least Squares TD"} 19 | } -------------------------------------------------------------------------------- /params/mountaincar/lspi.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": {"params": {}, "name": "Mountain Car"}, 3 | "experiment": {"params": {"num_episodes": 50}, "name": "Episodic"}, 4 | "agent": {"params": {"gamma": 1.0, "lspi_threshold": 0.001, "lstd_num_samples": 700, "lstd_precond": 0.001}, "name": "LSPI"} 5 | } 6 | -------------------------------------------------------------------------------- /params/mountaincar/lstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": { 3 | "params": { 4 | }, "name": "Mountain Car"}, 5 | "experiment": { 6 | "params": { 7 | "num_episodes": 50 8 | }, "name": "Episodic"}, 9 | "agent": { 10 | "params": { 11 | "lstd_update_freq": 100, 12 | "epsilon": 0.01, 13 | "basis": "fourier", 14 | "fourier_order": 3 15 | }, "name": "Least Squares Temporal Difference Learning"} 16 | } -------------------------------------------------------------------------------- /params/mountaincar/lstdq.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "environment": {"params": {}, "name": "Mountain Car"}, 3 | "experiment": {"params": {"num_episodes": 50}, "name": "Episodic"}, 4 | "agent": {"params": {"lstd_update_freq": 500, "lstd_num_samples": 500, "lstd_precond": 0.1, "basis": "fourier"}, "name": "LSTD-Q"} 5 | } 6 | -------------------------------------------------------------------------------- /params/mountaincar/mdba.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": { 3 | "params": { 4 | }, "name": "Mountain Car"}, 5 | "experiment": { 6 | "params": { 7 | "num_episodes": 50 8 | }, "name": "Episodic"}, 9 | "agent": { 10 | "params": { 11 | "nonlinear_lr": 0.000001, 12 | "sparsity": 0.0001, 13 | "epsilon": 0.01, 14 | "gamma": 0.99, 15 | "lmbda": 0.7, 16 | "alpha": 0.007, 17 | "basis": "fourier", 18 | "fourier_order": 3 19 | }, "name": "Sparse Mirror Descent Q-Learning with Non-Linear Basis Adaptation"} 20 | } -------------------------------------------------------------------------------- /params/mountaincar/mdq.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": { 3 | "params": { 4 | }, "name": "Mountain Car"}, 5 | "experiment": { 6 | "params": { 7 | "num_episodes": 50 8 | }, "name": "Episodic"}, 9 | "agent": { 10 | "params": { 11 | "sparsity": 0.0001, 12 | "epsilon": 0.01, 13 | "gamma": 0.99, 14 | "lmbda": 0.7, 15 | "alpha": 0.007, 16 | "basis": "fourier", 17 | "fourier_order": 3 18 | }, "name": "Sparse Mirror Descent Q-Learning"} 19 | } -------------------------------------------------------------------------------- /params/mountaincar/mdsarsa.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": { 3 | "params": { 4 | }, "name": "Mountain Car"}, 5 | "experiment": { 6 | "params": { 7 | "num_episodes": 50 8 | }, "name": "Episodic"}, 9 | 
"agent": { 10 | "params": { 11 | "sparsity": 0.0001, 12 | "epsilon": 0.01, 13 | "gamma": 0.99, 14 | "lmbda": 0.7, 15 | "alpha": 0.007, 16 | "basis": "fourier", 17 | "fourier_order": 3 18 | }, "name": "Sparse Mirror Descent Sarsa"} 19 | } -------------------------------------------------------------------------------- /params/mountaincar/modelbased.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": { 3 | "params": { 4 | }, "name": "Mountain Car"}, 5 | "experiment": { 6 | "params": { 7 | "num_episodes": 50 8 | }, "name": "Episodic"}, 9 | "agent": { 10 | "params": { 11 | "planner_params": {"basis": "fourier", "regressor": "ridge", "iterations": 1000, "support_size": 50, "resample": 15}, 12 | "model_params": {"update_freq": 20, "known_threshold": 0.95, "max_experiences": 700}, 13 | "gamma": 0.99 14 | }, "name": "Model Based Agent"} 15 | } -------------------------------------------------------------------------------- /params/mountaincar/nac_lstdq.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": { 3 | "params": { 4 | }, "name": "Mountain Car"}, 5 | "experiment": { 6 | "params": { 7 | "num_episodes": 50 8 | }, "name": "Episodic"}, 9 | "agent": { 10 | "params": { 11 | "epsilon": 0.01, 12 | "nac_freq": 100, 13 | "gamma": 0.99, 14 | "lmbda": 0.7, 15 | "alpha": 0.004, 16 | "basis": "fourier", 17 | "fourier_order": 3 18 | }, "name": "Natural Actor-Critic with LSTD-Q"} 19 | } -------------------------------------------------------------------------------- /params/mountaincar/nacs.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": { 3 | "params": { 4 | }, "name": "Mountain Car"}, 5 | "experiment": { 6 | "params": { 7 | "num_episodes": 50 8 | }, "name": "Episodic"}, 9 | "agent": { 10 | "params": { 11 | "epsilon": 0.01, 12 | "beta": 0.004, 13 | "nac_freq": 200, 14 | "gamma": 0.99, 15 | "lmbda": 
0.7, 16 | "alpha": 0.004, 17 | "basis": "fourier", 18 | "fourier_order": 3 19 | }, "name": "Natural Actor-Critic with Sarsa"} 20 | } -------------------------------------------------------------------------------- /params/mountaincar/olstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": { 3 | "params": { 4 | }, "name": "Mountain Car"}, 5 | "experiment": { 6 | "params": { 7 | "num_episodes": 50 8 | }, "name": "Episodic"}, 9 | "agent": { 10 | "params": { 11 | "epsilon": 0.01, 12 | "basis": "fourier", 13 | "fourier_order": 3 14 | }, "name": "Online Least Squares TD"} 15 | } -------------------------------------------------------------------------------- /params/mountaincar/qlearning.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": { 3 | "params": { 4 | }, "name": "Mountain Car"}, 5 | "experiment": { 6 | "params": { 7 | "num_episodes": 50 8 | }, "name": "Episodic"}, 9 | "agent": { 10 | "params": { 11 | "epsilon": 0.01, 12 | "gamma": 0.99, 13 | "lmbda": 0.7, 14 | "alpha": 0.004, 15 | "basis": "fourier", 16 | "fourier_order": 3 17 | }, "name": "Q-Learning"} 18 | } -------------------------------------------------------------------------------- /params/mountaincar/rlstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": { 3 | "params": { 4 | }, "name": "Mountain Car"}, 5 | "experiment": { 6 | "params": { 7 | "num_episodes": 50 8 | }, "name": "Episodic"}, 9 | "agent": { 10 | "params": { 11 | "rlstd_delta": 1.0, 12 | "epsilon": 0.01, 13 | "basis": "fourier", 14 | "fourier_order": 3 15 | }, "name": "Recursive Least Squares TD"} 16 | } -------------------------------------------------------------------------------- /params/mountaincar/sarsa.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": {"params": {}, "name": "Mountain Car"}, 3 
| "experiment": {"params": {"num_episodes": 50, "num_runs": 5}, "name": "Episodic"}, 4 | "agent": { 5 | "params": { 6 | "basis": "fourier", 7 | "fourier_order": 3, 8 | "alpha": 0.004, 9 | "gamma": 1.0, 10 | "lmbda": 0.7, 11 | "epsilon": 0.01 12 | }, 13 | "name": "Sarsa" 14 | } 15 | } -------------------------------------------------------------------------------- /params/mountaincar/sarsa_ann.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": {"params": {}, "name": "Mountain Car"}, 3 | "experiment": {"params": {"num_episodes": 100}, "name": "Episodic"}, 4 | "agent": {"params": { 5 | "num_hidden": 5, 6 | "epsilon": 0.01, 7 | "gamma": 1.0, 8 | "lmbda": 0.9, 9 | "alpha": 0.00001 10 | }, "name": "Sarsa ANN"} 11 | } -------------------------------------------------------------------------------- /params/mountaincar/sarsa_lecun.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": {"params": {}, "name": "Mountain Car"}, 3 | "experiment": {"params": {"num_episodes": 30}, "name": "Episodic"}, 4 | "agent": {"params": {"basis": "fourier"}, "name": "Adaptive (InvMaxEigen) Sarsa"} 5 | } 6 | -------------------------------------------------------------------------------- /params/mountaincar/ttac1.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": { 3 | "params": { 4 | }, "name": "Mountain Car"}, 5 | "experiment": { 6 | "params": { 7 | "num_episodes": 50 8 | }, "name": "Episodic"}, 9 | "agent": { 10 | "params": { 11 | "beta": 0.001, 12 | "epsilon": 0.01, 13 | "gamma": 0.99, 14 | "lmbda": 0.7, 15 | "alpha": 0.0001, 16 | "basis": "fourier", 17 | "fourier_order": 3 18 | }, "name": "Two-Timescale Actor-Critic"} 19 | } -------------------------------------------------------------------------------- /params/mountaincar/ttnac3.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "environment": { 3 | "params": { 4 | }, "name": "Mountain Car"}, 5 | "experiment": { 6 | "params": { 7 | "num_episodes": 50 8 | }, "name": "Episodic"}, 9 | "agent": { 10 | "params": { 11 | "beta": 0.001, 12 | "epsilon": 0.01, 13 | "gamma": 0.99, 14 | "lmbda": 0.7, 15 | "alpha": 0.0001, 16 | "basis": "fourier", 17 | "fourier_order": 3 18 | }, "name": "Two-Timescale Natural Actor-Critic"} 19 | } -------------------------------------------------------------------------------- /params/puddleworld/sarsa.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": {"params": {}, "name": "Puddle World"}, 3 | "experiment": {"params": {"num_episodes": 50, "num_runs": 30}, "name": "Episodic"}, 4 | "agent": { 5 | "params": { 6 | "basis": "fourier", 7 | "fourier_order": 3, 8 | "alpha": 0.1, 9 | "gamma": 1.0, 10 | "lmbda": 0.7, 11 | "epsilon": 0.01 12 | }, 13 | "name": "Sarsa" 14 | } 15 | } -------------------------------------------------------------------------------- /pyrl/.gitignore: -------------------------------------------------------------------------------- 1 | */*.py[co] 2 | */*.o 3 | */*.so 4 | 5 | # Packages 6 | *.egg 7 | *.egg-info 8 | dist 9 | build 10 | eggs 11 | parts 12 | bin 13 | var 14 | sdist 15 | develop-eggs 16 | .installed.cfg 17 | */*~ 18 | # Installer logs 19 | pip-log.txt 20 | 21 | # Unit test / coverage reports 22 | .coverage 23 | .tox 24 | 25 | #Translations 26 | *.mo 27 | 28 | #Mr Developer 29 | .mr.developer.cfg 30 | -------------------------------------------------------------------------------- /pyrl/Makefile: -------------------------------------------------------------------------------- 1 | 2 | DIRS = basis/CTiles/build environments/libPOMDP/build environments/mdptetris/build 3 | 4 | all: 5 | -for d in $(DIRS); do (mkdir $$d; cd $$d; cmake ..; $(MAKE) ); done 6 | 7 | clean: 8 | -for d in $(DIRS); do 
(mkdir $$d; cd $$d; cmake ..; $(MAKE) clean; cd ..; rm -rf build); done 9 | -find . -type f -name "*.pyc" -exec rm -f {} \; -------------------------------------------------------------------------------- /pyrl/README.md: -------------------------------------------------------------------------------- 1 | pyRL 2 | ========= 3 | 4 | I could rant all day long about the fact that most of the time Reinforcement Learning code 5 | available online tends to be completely broken, out of date, or very minimally useful. By 6 | far the biggest exception to this, in my opinion, has been the RL-Glue project. However, 7 | the project has either matured or been left on the shelf with few real updates in the last 8 | couple of years. 9 | 10 | pyRL is a project meant to provide an up to date collection of Reinforcement Learning 11 | agents, environments, and supporting methods written in Python, built on and extending the 12 | RL-Glue framework. Whenever possible it will make use of optimized python libraries such as 13 | numpy, scipy, scikits-learn, and neurolab. Some modules requiring additional speed will be 14 | written in C and will be compilable to Python modules. 15 | 16 | All agents and environments will be able to act as standalone RL-Glue network interfaces run 17 | from the commandline. However, pyRL also includes a module allowing agent, environment and experiment 18 | to be run together without the use of sockets. RL-Glue version 3.0 does not currently support that 19 | functionality for Python. 20 | 21 | This project is very much under development, but in the near-term I hope to have the most 22 | common RL environments, model-free and model-based agents implemented and working. From there 23 | I hope to add interesting new algorithms that I come across in the field (whenever I'm able to 24 | implement them successfully). 
25 | 26 | --Will Dabney 27 | -------------------------------------------------------------------------------- /pyrl/TODO: -------------------------------------------------------------------------------- 1 | 2 | TODO for pyRL project 3 | ==================== 4 | 5 | Implement Environments (in Python) or Convert C/C++ into Module: 6 | ------------------------------ 7 | Dart Throwing (from Bruno Castro da Silva's paper on parameterized skills) 8 | Partially Observable Taxi 9 | Simulated-Simplified Red Room (GDK/S.K.'s work, similar to a more functional continuous playroom) 10 | N-DOF Reaching and Reaching through viapoint 11 | Ball in Cup simulation 12 | 13 | 14 | Implement Agents (in Python) or Convert C/C++ into Module: 15 | ------------------------------ 16 | 17 | Implement "save_trajectory filename numsteps" message support into base class of existing agents 18 | 19 | TD-delta Pi 20 | TDC / GTD 21 | Skill Chaining 22 | PoWER 23 | PI^2 24 | 25 | 26 | Extensions to Existing Implementations: 27 | ------------------------------ 28 | 29 | Add other exploration bonus methods to the modelbased agents 30 | Add pygame based viewer for Tetris environment 31 | Provide reasonable 'working' parameters for every agent algorithm. Currently missing: 32 | Sarsa ANN 33 | REINFORCE 34 | Composite Mirror Descent 35 | By having working parameters for every algorithm on at least one domain, and 36 | every domain for at least one algorithm, I plan to build a script which uses 37 | them as tests. This should ensure that I don't break things accidentally. 38 | Actual unit tests on RL agents and environments would be great, but until 39 | inspiration hits, this is probably the best approach. 
-------------------------------------------------------------------------------- /pyrl/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /pyrl/agents/.gitignore: -------------------------------------------------------------------------------- 1 | */*.py[co] 2 | 3 | # Packages 4 | *.egg 5 | *.egg-info 6 | dist 7 | build 8 | eggs 9 | parts 10 | bin 11 | var 12 | sdist 13 | develop-eggs 14 | .installed.cfg 15 | */*~ 16 | # Installer logs 17 | pip-log.txt 18 | 19 | # Unit test / coverage reports 20 | .coverage 21 | .tox 22 | 23 | #Translations 24 | *.mo 25 | 26 | #Mr Developer 27 | .mr.developer.cfg 28 | -------------------------------------------------------------------------------- /pyrl/agents/README.md: -------------------------------------------------------------------------------- 1 | pyrl.agents 2 | ========= 3 | 4 | Reinforcement Learning agents that have been implemented in python using the RLGlue framework. 
5 | -------------------------------------------------------------------------------- /pyrl/agents/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | # Author: Pierre-Luc Bacon 3 | 4 | __all__ = ["skeleton_agent", "stepsizes"] 5 | 6 | try: 7 | import sklearn 8 | __all__.append("modelbased") 9 | except: 10 | pass 11 | 12 | try: 13 | import pyrl.basis.tilecode 14 | __all__.append("qlearning") 15 | __all__.append("delayed_qlearning") 16 | __all__.append("sarsa_lambda") 17 | __all__.append("lstd") 18 | __all__.append("policy_gradient") 19 | __all__.append("mirror_descent") 20 | except: 21 | pass 22 | 23 | try: 24 | import neurolab 25 | __all__.append("sarsa_lambda_ann") 26 | except: 27 | pass 28 | 29 | -------------------------------------------------------------------------------- /pyrl/agents/delayed_qlearning.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy 4 | import qlearning 5 | from pyrl.rlglue.registry import register_agent 6 | 7 | @register_agent 8 | class delayed_qlearning(qlearning.qlearning_agent): 9 | """Delayed Q-Learning algorithm. This algorithm is only directly applicable 10 | to discrete state, discrete action domains. Thus, it should throw an assertion 11 | failure if you attempt to use it not in such a domain. 12 | 13 | Unfortunately, I have no yet been able to get this to work consistently on 14 | the marble maze domain. It seems likely that it would work on something simpler 15 | like chain domain. Maybe there's a bug? 16 | 17 | From the paper: 18 | PAC Model-Free Reinforcement Learning. 2006. 19 | Alexander Strehl, Lihong Li, Eric Wiewiora, John Langford, and Michael Littman. 
20 | """ 21 | 22 | name = "Delayed Q-Learning" 23 | 24 | def init_parameters(self): 25 | self.gamma = self.params.setdefault('gamma', 0.99) 26 | self.epsilon = self.params.setdefault('epsilon', 0.1) 27 | super(delayed_qlearning, self).init_parameters() 28 | self.m = self.params.setdefault('m', 100) 29 | 30 | @classmethod 31 | def agent_parameters(cls): 32 | param_set = parameter_set(cls.name, description="Parameters required for running an RL agent algorithm.") 33 | add_parameter(param_set, "epsilon", default=0.1) 34 | add_parameter(param_set, "gamma", default=0.99) 35 | add_parameter(param_set, "m", default=100, type=int, min=1, max=1000) 36 | return param_set 37 | 38 | def agent_supported(self, parsedSpec): 39 | if parsedSpec.valid: 40 | # Check observation form, and then set up number of features/states 41 | assert len(parsedSpec.getIntObservations()) > 0, "Expecting at least one discrete observation" 42 | assert len(parsedSpec.getDoubleObservations()) == 0, "Expecting no continuous observations." 43 | 44 | # Check action form, and then set number of actions 45 | assert len(parsedSpec.getIntActions())==1, "Expecting 1-dimensional discrete actions" 46 | assert len(parsedSpec.getDoubleActions())==0, "Expecting no continuous actions" 47 | assert not parsedSpec.isSpecial(parsedSpec.getIntActions()[0][0]), "Expecting min action to be a number not a special value" 48 | assert not parsedSpec.isSpecial(parsedSpec.getIntActions()[0][1]), "Expecting max action to be a number not a special value" 49 | self.reward_range = numpy.array(parsedSpec.getRewardRange()[0]) 50 | return True 51 | else: 52 | return False 53 | 54 | def agent_init(self,taskSpec): 55 | super(delayed_qlearning, self).agent_init(taskSpec) 56 | self.weights.fill(1./(1. 
- self.gamma)) 57 | self.updates = numpy.zeros(self.weights.shape) 58 | self.visit_count = numpy.zeros(self.weights.shape) 59 | self.update_time = numpy.zeros(self.weights.shape) 60 | self.LEARN = numpy.ones(self.weights.shape, dtype=bool) 61 | self.last_update = 0 62 | self.step_count = 0 63 | # Compute the 'correct' m to use (from the paper) 64 | # But tends to be so large as to be impractical 65 | #k = 1./((1. - self.gamma)*self.epsilon) 66 | #delta = 0.1 67 | #self.m = numpy.log(3. * self.numDiscStates * self.numActions * (1. + self.numDiscStates * self.numActions * k) / delta) 68 | #self.m /= 2. * self.epsilon**2 * (1. - self.gamma)**2 69 | #self.m = int(self.m) 70 | #print self.m 71 | 72 | def getAction(self, state, discState): 73 | """Get the action under the current policy for the given state. 74 | 75 | Args: 76 | state: The array of continuous state features 77 | discState: The integer representing the current discrete state value 78 | 79 | Returns: 80 | The current policy action, or a random action with some probability. 81 | """ 82 | return numpy.dot(self.weights[discState,:,:].T, self.basis.computeFeatures(state)).argmax() 83 | 84 | def update(self, phi_t, state, discState, reward): 85 | reward = (reward - self.reward_range[0]) / (self.reward_range[1] - self.reward_range[0]) 86 | self.step_count += 1 87 | state_action = numpy.where(phi_t != 0) 88 | if self.LEARN[state_action]: # If Learn[s,a] 89 | qvalues = self.getActionValues(state, discState) 90 | self.updates[state_action] += reward + self.gamma * qvalues.max() 91 | self.visit_count[state_action] += 1 92 | if self.visit_count[state_action] == self.m: 93 | if self.weights[state_action] - self.updates[state_action]/self.m >= 2. 
* self.epsilon: 94 | self.weights[state_action] = self.updates[state_action]/self.m + self.epsilon 95 | self.last_update = self.step_count 96 | #print (self.weights.ravel() < self.weights.max()).sum(), self.weights.size 97 | elif self.update_time[state_action] >= self.last_update: 98 | self.LEARN[state_action] = False 99 | self.update_time[state_action] = self.step_count 100 | self.updates[state_action] = 0 101 | self.visit_count[state_action] = 0 102 | elif self.update_time[state_action] < self.last_update: 103 | self.LEARN[state_action] = True 104 | 105 | if __name__=="__main__": 106 | from pyrl.agents.skeleton_agent import runAgent 107 | runAgent(delayed_qlearning) 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /pyrl/agents/models/.gitignore: -------------------------------------------------------------------------------- 1 | */*.py[co] 2 | 3 | # Packages 4 | *.egg 5 | *.egg-info 6 | dist 7 | build 8 | eggs 9 | parts 10 | bin 11 | var 12 | sdist 13 | develop-eggs 14 | .installed.cfg 15 | */*~ 16 | # Installer logs 17 | pip-log.txt 18 | 19 | # Unit test / coverage reports 20 | .coverage 21 | .tox 22 | 23 | #Translations 24 | *.mo 25 | 26 | #Mr Developer 27 | .mr.developer.cfg 28 | -------------------------------------------------------------------------------- /pyrl/agents/models/README.md: -------------------------------------------------------------------------------- 1 | pyrl.agents.models 2 | ========= 3 | 4 | Model learners and models for use with the RL agents. 
class ModelLearner(object):
    """Base class for model learners used by model-based RL agents.

    Holds common bookkeeping for the joint (discrete, continuous) state
    space and for randomized parameter generation; subclasses override the
    experience/prediction hooks.
    """

    def __init__(self, **kwargs):
        # Arbitrary named parameters, kept for later inspection/reporting.
        self.params = kwargs

    def model_init(self, numDiscStates, contFeatureRanges, numActions, rewardRange):
        """Record problem dimensions and derive per-feature ranges and spans.

        Args:
            numDiscStates: Number of discrete state values.
            contFeatureRanges: Sequence of (min, max) pairs, one per continuous feature.
            numActions: Number of discrete actions.
            rewardRange: (min, max) of possible rewards.
        """
        self.numDiscStates = numDiscStates
        self.numContStates = len(contFeatureRanges)
        self.numActions = numActions
        self.reward_range = rewardRange
        # Row 0 covers the discrete state index; remaining rows are the
        # continuous feature ranges.
        self.feature_ranges = numpy.array([[0, numDiscStates - 1]] + list(contFeatureRanges))
        lows = self.feature_ranges[:, 0]
        highs = self.feature_ranges[:, 1]
        spans = numpy.ones(len(self.feature_ranges))
        varying = lows != highs
        # Constant dimensions keep span 1 so later divisions by the span are safe.
        spans[varying] = highs[varying] - lows[varying]
        self.feature_span = spans

    def randParameter(self, param_key, args, sample=None):
        """Utility for randomize_parameters: fix or sample one parameter.

        If param_key is absent from args it is set to sample (drawn uniformly
        from [0, 1) when sample is None); whichever value args ends up holding
        is mirrored into self.params.
        """
        if sample is None:
            sample = numpy.random.random()
        self.params[param_key] = args.setdefault(param_key, sample)

    def randomize_parameters(self, **args):
        """Generate parameters randomly, constrained by given named parameters.

        If used, this must be called before agent_init in order to have the
        desired effect. Parameters that fundamentally change the algorithm
        (e.g. basis or softmax choice) are not randomized over; basis
        parameters, which have many possible values, are.

        Args:
            **args: Named parameters to fix (never randomly generated).

        Returns:
            The resulting parameter mapping. Empty if parameter free.
        """
        return args

    def updateExperience(self, lastState, action, newState, reward):
        """Incorporate one transition; True means the model changed enough to replan."""
        return False

    def getStateSpace(self):
        """Return (feature_ranges, numActions) describing the state-action space."""
        return self.feature_ranges, self.numActions

    def sampleStateActions(self, num_requested):
        """Sample state-action pairs; does not guarantee num_requested are
        returned, but never provides more than num_requested."""
        pass

    def predict(self, state, action):
        """Predict the outcome of taking action in state."""
        pass

    def predictSet(self, states):
        """Predict outcomes for a batch of state-action pairs."""
        pass

    def isKnown(self, state, action):
        """True when the model considers (state, action) sufficiently explored."""
        return False
5 | -------------------------------------------------------------------------------- /pyrl/agents/planners/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | __all__ = ["planner", "fitted_qiteration"] 3 | -------------------------------------------------------------------------------- /pyrl/agents/planners/planner.py: -------------------------------------------------------------------------------- 1 | 2 | from random import Random 3 | import numpy 4 | 5 | class Planner(object): 6 | 7 | def __init__(self, model, **kwargs): 8 | self.model = model 9 | self.gamma = kwargs.setdefault('gamma', 1.0) 10 | self.params = kwargs 11 | self.randGenerator = Random() 12 | 13 | 14 | def planner_init(self, numDiscStates, contFeatureRanges, numActions, rewardRange): 15 | pass 16 | 17 | def randParameter(self, param_key, args, sample=None): 18 | """A utility function for use inside randomize_parameters. Takes a parameter 19 | key (name), the named arguments passed to randomize_parameters, and optionally 20 | the sampled random value to set in case the key does not exist in the arguments. 21 | 22 | This will then set it (if not already present) in args and assign which ever value 23 | args ends up with into params. 24 | """ 25 | if sample is None: 26 | sample = numpy.random.random() 27 | self.params[param_key] = args.setdefault(param_key, sample) 28 | 29 | def randomize_parameters(self, **args): 30 | """Generate parameters randomly, constrained by given named parameters. 31 | 32 | Parameters that fundamentally change the algorithm are not randomized over. For 33 | example, basis and softmax fundamentally change the domain and have very few values 34 | to be considered. They are not randomized over. 35 | 36 | Basis parameters, on the other hand, have many possible values and ARE randomized. 
@register_agent
class qlearning_agent(sarsa_lambda.sarsa_lambda):
    """Q-Learning with linear function approximation and eligibility traces.

    Reuses the sarsa_lambda machinery but is off-policy: the TD target is the
    greedy successor action's value rather than the action actually taken.
    """

    name = "Q-Learning"

    def agent_step(self, reward, observation):
        """Take one step in an episode, as the result of the last action.

        Args:
            reward: Reward received for taking the last action from the previous state.
            observation: The next observation, a consequence of the previous action.

        Returns:
            The next action, as an RLGlue Action object.
        """
        next_state = numpy.array(list(observation.doubleArray))
        prev_state = numpy.array(list(self.lastObservation.doubleArray))
        prev_action = self.lastAction.intArray[0]

        next_disc = self.getDiscState(observation.intArray)
        prev_disc = self.getDiscState(self.lastObservation.intArray)

        # Feature vector for the (previous state, previous action) pair.
        phi_t = numpy.zeros(self.traces.shape)
        phi_t[prev_disc, :, prev_action] = self.basis.computeFeatures(prev_state)

        self.update_traces(phi_t, None)
        self.update(phi_t, next_state, next_disc, reward)

        # Off-policy: Q-learning can choose the next action after the update.
        chosen = self.getAction(next_state, next_disc)
        act = Action()
        act.intArray = [chosen]

        self.lastAction = copy.deepcopy(act)
        self.lastObservation = copy.deepcopy(observation)
        return act

    def getActionValues(self, state, discState):
        """Q-values for every action in the given state; zeros for a terminal
        (None) state."""
        if state is None:
            return numpy.zeros((self.numActions,))
        return numpy.dot(self.weights[discState, :, :].T, self.basis.computeFeatures(state))

    def update(self, phi_t, state, discState, reward):
        """One Q-learning TD update toward the greedy successor action."""
        qvalues = self.getActionValues(state, discState)
        greedy = qvalues.argmax()
        phi_tp = numpy.zeros(self.traces.shape)
        if state is not None:
            phi_tp[discState, :, greedy] = self.basis.computeFeatures(state)

        # TD error against the greedy successor value.
        delta = self.gamma * qvalues[greedy] + reward - numpy.dot(self.weights.flatten(), phi_t.flatten())

        # rescale_update reconciles scalar and per-weight step-sizes
        # (maybe they should work together more naturally).
        self.weights += self.rescale_update(phi_t, phi_tp, delta, reward, delta * self.traces)

    def agent_end(self, reward):
        """Receive the final reward of an episode, signaling its end.

        Args:
            reward: Reward received for taking the last action from the previous state.
        """
        prev_state = numpy.array(list(self.lastObservation.doubleArray))
        prev_action = self.lastAction.intArray[0]
        prev_disc = self.getDiscState(self.lastObservation.intArray)

        phi_t = numpy.zeros(self.traces.shape)
        phi_t[prev_disc, :, prev_action] = self.basis.computeFeatures(prev_state)

        self.update_traces(phi_t, None)
        # Terminal update: no successor state.
        self.update(phi_t, None, 0, reward)
-------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | PROJECT(CTiles) 4 | 5 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}) 6 | 7 | file(GLOB SRC 8 | "src/*.h" 9 | "src/*.cpp" 10 | "src/*.C" 11 | ) 12 | 13 | find_package(PythonInterp REQUIRED) 14 | find_package(PythonLibs REQUIRED) 15 | 16 | include_directories(${PYTHON_INCLUDE_DIRS}) 17 | 18 | 19 | add_library(tiles MODULE ${SRC}) 20 | set_target_properties(tiles PROPERTIES PREFIX "") 21 | target_link_libraries(tiles ${PYTHON_LIBRARIES}) 22 | -------------------------------------------------------------------------------- /pyrl/basis/CTiles/README.md: -------------------------------------------------------------------------------- 1 | pyrl.basis.CTiles 2 | ==================== 3 | 4 | This folder contains the C version of tile coding, as well as the python routines which call the c version of tiles. 5 | This was written by Rich Sutton and only the makefile has changed from the original. 6 | 7 | The following files are here: 8 | 9 | Makefile - compiles both the C version and the Python->C version (for Mac or Linux) 10 | tiles.h - header for C version of tiles 11 | tiles.cpp - c++ version of tiles 12 | tiletimes.cpp - timing code for c calling c version of tiles 13 | tilesInt.C - interface so that Python can call the c version 14 | tiletimes.py - timing code for the python calling c version of tiles 15 | fancytiles.py - code to get different shapes and sizes of tiles 16 | 17 | To use these: 18 | In a terminal window: 19 | cmake . 20 | make 21 | ... this creates the tiles.so and tiles.o files 22 | 23 | 24 | Note About CMake and Python on Mac: 25 | For some reason things can sometimes get messed up with this combination. Some people claim 26 | this is a bug in cmake or a bug from mac. It comes up when you have multiple python distributions 27 | installed. 
So, most people should be fine, but if you get a fatal error when trying to use this 28 | module in python you should look into uninstalling the unused distributions or pass to cmake 29 | the following arguments with the correct values filled in: 30 | 31 | -DPYTHON_LIBRARY=... -DPYTHON_INCLUDE=... 32 | 33 | -------------------------------------------------------------------------------- /pyrl/basis/CTiles/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /pyrl/basis/CTiles/src/tiles.h: -------------------------------------------------------------------------------- 1 | #ifndef _TILES_H_ #define _TILES_H_ #include #include #include #include #define MAX_NUM_VARS 20 // Maximum number of variables in a grid-tiling #define MAX_NUM_COORDS 100 // Maximum number of hashing coordinates #define MaxLONGINT 2147483647 void tiles( int the_tiles[], // provided array contains returned tiles (tile indices) int num_tilings, // number of tile indices to be returned in tiles int memory_size, // total number of possible tiles float floats[], // array of floating point variables int num_floats, // number of floating point variables int ints[], // array of integer variables int num_ints); // number of integer variables class collision_table { public: collision_table(int,int); ~collision_table(); long m; long *data; int safe; long calls; long clearhits; long collisions; void reset(); int usage(); void print(); void save(int); void restore(int); }; void tiles( int the_tiles[], // provided array contains returned tiles (tile indices) int num_tilings, // number of tile indices to be returned in tiles collision_table *ctable, // total number of possible tiles float floats[], // array of floating point variables int num_floats, // number of floating point variables int ints[], // array of integer variables int num_ints); // number of integer variables int 
hash_UNH(int *ints, int num_ints, long m, int increment); int hash(int *ints, int num_ints, collision_table *ctable); // no ints void tiles(int the_tiles[],int nt,int memory,float floats[],int nf); void tiles(int the_tiles[],int nt,collision_table *ct,float floats[],int nf); // one int void tiles(int the_tiles[],int nt,int memory,float floats[],int nf,int h1); void tiles(int the_tiles[],int nt,collision_table *ct,float floats[],int nf,int h1); // two ints void tiles(int the_tiles[],int nt,int memory,float floats[],int nf,int h1,int h2); void tiles(int the_tiles[],int nt,collision_table *ct,float floats[],int nf,int h1,int h2); // three ints void tiles(int the_tiles[],int nt,int memory,float floats[],int nf,int h1,int h2,int h3); void tiles(int the_tiles[],int nt,collision_table *ct,float floats[],int nf,int h1,int h2,int h3); // one float, no ints void tiles1(int the_tiles[],int nt,int memory,float f1); void tiles1(int the_tiles[],int nt,collision_table *ct,float f1); // one float, one int void tiles1(int the_tiles[],int nt,int memory,float f1,int h1); void tiles1(int the_tiles[],int nt,collision_table *ct,float f1,int h1); // one float, two ints void tiles1(int the_tiles[],int nt,int memory,float f1,int h1,int h2); void tiles1(int the_tiles[],int nt,collision_table *ct,float f1,int h1,int h2); // one float, three ints void tiles1(int the_tiles[],int nt,int memory,float f1,int h1,int h2,int h3); void tiles1(int the_tiles[],int nt,collision_table *ct,float f1,int h1,int h2,int h3); // two floats, no ints void tiles2(int the_tiles[],int nt,int memory,float f1,float f2); void tiles2(int the_tiles[],int nt,collision_table *ct,float f1,float f2); // two floats, one int void tiles2(int the_tiles[],int nt,int memory,float f1,float f2,int h1); void tiles2(int the_tiles[],int nt,collision_table *ct,float f1,float f2,int h1); // two floats, two ints void tiles2(int the_tiles[],int nt,int memory,float f1,float f2,int h1,int h2); void tiles2(int the_tiles[],int 
nt,collision_table *ct,float f1,float f2,int h1,int h2); // two floats, three ints void tiles2(int the_tiles[],int nt,int memory,float f1,float f2,int h1,int h2,int h3); void tiles2(int the_tiles[],int nt,collision_table *ct,float f1,float f2,int h1,int h2,int h3); void tileswrap( int the_tiles[], // provided array contains returned tiles (tile indices) int num_tilings, // number of tile indices to be returned in tiles int memory_size, // total number of possible tiles float floats[], // array of floating point variables int num_floats, // number of floating point variables int wrap_widths[], // array of widths (length and units as in floats) int ints[], // array of integer variables int num_ints); // number of integer variables void tileswrap( int the_tiles[], // provided array contains returned tiles (tile indices) int num_tilings, // number of tile indices to be returned in tiles collision_table *ctable, // total number of possible tiles float floats[], // array of floating point variables int num_floats, // number of floating point variables int wrap_widths[], // array of widths (length and units as in floats) int ints[], // array of integer variables int num_ints); // number of integer variables #endif -------------------------------------------------------------------------------- /pyrl/basis/CTiles/tiletimes.py: -------------------------------------------------------------------------------- 1 | # tile timing tests 2 | import random 3 | random.seed(65597) 4 | 5 | import tiles 6 | import timeit 7 | 8 | def runit (num=10, ct=2048, numt=1): 9 | for i in xrange(num): 10 | for j in xrange(num): 11 | t = tiles.tiles(numt, ct, [i*0.5, j*0.5]) 12 | def runit2 (num=10, ct=2048, numt=1): 13 | for i in xrange(num): 14 | for j in xrange(num): 15 | t = tiles.tiles(numt, ct, [i*0.5, j*0.5, float(i+j)/2, float(i-j)/2], [i, j]) 16 | def runitw (num=10, ct=2048, numt=1): 17 | for i in xrange(num): 18 | for j in xrange(num): 19 | t = tiles.tileswrap(numt, ct, [i*0.5, j*0.5], 
[10, 1]) 20 | def runitl (num=10, ct=2048, numt=1): 21 | tlist = [None for i in range(num*num*numt)] 22 | for i in xrange(num): 23 | for j in xrange(num): 24 | t = tiles.loadtiles(tlist, i*num*numt+j, numt, ct, [i*0.5, j*0.5]) 25 | def runitlw (num=10, ct=2048, numt=1): 26 | tlist = [None for i in range(num*num*numt)] 27 | for i in xrange(num): 28 | for j in xrange(num): 29 | tiles.loadtileswrap(tlist, i*num*numt+j, numt, ct, [i*0.5, j*0.5], [10, 1]) 30 | return tlist 31 | 32 | def initct(mem=16384): 33 | global ctu, cts, ctss 34 | ctu=tiles.CollisionTable(mem, safetyval='unsafe') 35 | cts=tiles.CollisionTable(mem, safetyval='safe') 36 | ctss=tiles.CollisionTable(mem, safetyval='super safe') 37 | 38 | def timetest(command, info, info2='2 floats', num=100, numt=1, mem=16384): 39 | initct(mem) 40 | print " " 41 | print info 42 | print "Timing over", num*num, "calls to tiles,", numt, "tiling each for", info2 43 | t= timeit.Timer(command + '('+str(num)+','+str(mem)+','+str(numt)+')', 'from __main__ import ' + command) 44 | print "With no collision table", t.timeit(1), "seconds" 45 | t= timeit.Timer(command + '('+str(num)+', ctu'+','+str(numt)+')', 'from __main__ import ctu, ' + command) 46 | print "With unsafe collision table", t.timeit(1), "seconds" 47 | print ctu 48 | t= timeit.Timer(command + '('+str(num)+', cts'+','+str(numt)+')', 'from __main__ import cts, ' + command) 49 | print "With safe collision table", t.timeit(1), "seconds" 50 | print cts 51 | t= timeit.Timer(command + '('+str(num)+', ctss'+','+str(numt)+')', 'from __main__ import ctss, ' + command) 52 | print "With super safe collision table", t.timeit(1), "seconds" 53 | print ctss 54 | print " " 55 | #print "Timing over", num*num, "calls to tiles, 16 tilings each for", info2 56 | #t= timeit.Timer(command + '('+str(num)+', 16384, 16)', 'from __main__ import ' + command) 57 | #print "With no collision table", t.timeit(1), "seconds" 58 | 59 | timetest('runit', "Standard test", numt=4) 60 | 
#timetest('runit2', 'Testing with more input variables','4 floats, 2 ints', 100, 3, 32768) 61 | timetest('runitw', 'WRAP version', numt=4) 62 | timetest('runitl', 'Load version', '2 floats', 100, 4) # only do 10 x 10 calls, but with 4 tilings each 63 | timetest('runitlw', 'Load WRAP version', '2 floats', 100, 4) 64 | 65 | -------------------------------------------------------------------------------- /pyrl/basis/README.md: -------------------------------------------------------------------------------- 1 | python-rl.basis 2 | ========= 3 | 4 | Basis functions used in function (usually linear) approximation by the agents. 5 | -------------------------------------------------------------------------------- /pyrl/basis/Tiles/.gitignore: -------------------------------------------------------------------------------- 1 | */*.py[co] 2 | */*.o 3 | */*.so 4 | 5 | # Packages 6 | *.egg 7 | *.egg-info 8 | dist 9 | build 10 | eggs 11 | parts 12 | bin 13 | var 14 | sdist 15 | develop-eggs 16 | .installed.cfg 17 | */*~ 18 | # Installer logs 19 | pip-log.txt 20 | 21 | # Unit test / coverage reports 22 | .coverage 23 | .tox 24 | 25 | #Translations 26 | *.mo 27 | 28 | #Mr Developer 29 | .mr.developer.cfg 30 | -------------------------------------------------------------------------------- /pyrl/basis/Tiles/README.md: -------------------------------------------------------------------------------- 1 | pyrl.basis.Tiles 2 | ================= 3 | 4 | This is the tile coding python implementation provided by Rich Sutton. The only reason to prefer this 5 | python only implementation over the CTiles package is that CTiles is very slow when doing loadtiles calls. 
6 | 7 | Contents: 8 | tiles.py - python tiles code 9 | fancytiles.py - code for making different shapes and sizes of tiles 10 | tiletimes.py - timing code for tiles 11 | -------------------------------------------------------------------------------- /pyrl/basis/Tiles/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pyrl/basis/Tiles/tiletimes.py: -------------------------------------------------------------------------------- 1 | # tile timing tests 2 | import random 3 | random.seed(65597) 4 | 5 | import tiles 6 | import tilesn 7 | import timeit 8 | 9 | def runit (num=10, ct=2048, numt=1): 10 | for i in xrange(num): 11 | for j in xrange(num): 12 | t = tiles.tiles(numt, ct, [i*0.5, j*0.5]) 13 | def runitn (num=10, ct=2048, numt=4): 14 | for i in xrange(num): 15 | for j in xrange(num): 16 | t = tilesn.tiles(numt, ct, [i*0.5, j*0.5]) 17 | def runit2 (num=10, ct=2048, numt=1): 18 | for i in xrange(num): 19 | for j in xrange(num): 20 | t = tiles.tiles(numt, ct, [i*0.5, j*0.5, float(i+j)/2, float(i-j)/2], [i, j]) 21 | def runitw (num=10, ct=2048, numt=1): 22 | for i in xrange(num): 23 | for j in xrange(num): 24 | t = tiles.tileswrap(numt, ct, [i*0.5, j*0.5], [10, 1]) 25 | def runitl (num=10, ct=2048, numt=1): 26 | tlist = [None for i in range(num*num*numt)] 27 | for i in xrange(num): 28 | for j in xrange(num): 29 | t = tiles.loadtiles(tlist, i*num*numt+j, numt, ct, [i*0.5, j*0.5]) 30 | def runitlw (num=10, ct=2048, numt=1): 31 | tlist = [None for i in range(num*num*numt)] 32 | for i in xrange(num): 33 | for j in xrange(num): 34 | tiles.loadtileswrap(tlist, i*num*numt+j, numt, ct, [i*0.5, j*0.5], [10, 1]) 35 | return tlist 36 | 37 | def initct(mem=16384): 38 | global ctu, cts, ctss 39 | ctu=tiles.CollisionTable(mem, safetyval='unsafe') 40 | cts=tiles.CollisionTable(mem, safetyval='safe') 41 | ctss=tiles.CollisionTable(mem, safetyval='super 
safe') 42 | 43 | def timetest(command, info, info2='2 floats', num=100, numt=1, mem=16384): 44 | initct(mem) 45 | print " " 46 | print info 47 | print "Timing over", num*num, "calls to tiles,", numt, "tiling each for", info2 48 | t= timeit.Timer(command + '('+str(num)+','+str(mem)+','+str(numt)+')', 'from __main__ import ' + command) 49 | print "With no collision table", t.timeit(1), "seconds" 50 | t= timeit.Timer(command + '('+str(num)+', ctu'+','+str(numt)+')', 'from __main__ import ctu, ' + command) 51 | print "With unsafe collision table", t.timeit(1), "seconds" 52 | print ctu 53 | t= timeit.Timer(command + '('+str(num)+', cts'+','+str(numt)+')', 'from __main__ import cts, ' + command) 54 | print "With safe collision table", t.timeit(1), "seconds" 55 | print cts 56 | t= timeit.Timer(command + '('+str(num)+', ctss'+','+str(numt)+')', 'from __main__ import ctss, ' + command) 57 | print "With super safe collision table", t.timeit(1), "seconds" 58 | print ctss 59 | print " " 60 | print "Timing over", num*num, "calls to tiles, 16 tilings each for", info2 61 | t= timeit.Timer(command + '('+str(num)+', 16384, 16)', 'from __main__ import ' + command) 62 | print "With no collision table", t.timeit(1), "seconds" 63 | 64 | timetest('runit', "Standard test", numt=4) 65 | timetest('runit2', 'Testing with more input variables','4 floats, 2 ints', 100, 3, 32768) 66 | timetest('runitw', 'WRAP version', numt=4) 67 | timetest('runitl', 'Load version', '2 floats', 100, 4) # only do 10 x 10 calls, but with 4 tilings each 68 | timetest('runitlw', 'Load WRAP version', '2 floats', 100, 4) 69 | 70 | """ 71 | print " " 72 | print "Tiles with num array" 73 | ctu=tilesn.CollisionTable(16384, safetyval='unsafe') 74 | cts=tilesn.CollisionTable(16384, safetyval='safe') 75 | ctss=tilesn.CollisionTable(16384, safetyval='super safe') 76 | print "Timing over 10000 calls to tiles with numarray, 4 tiling each for 2 floats" 77 | t= timeit.Timer('runitn(100, 16384)', 'from __main__ import runitn') 
class FourierBasis(trivial.TrivialBasis):
    """Fourier Basis linear function approximation.

    Requires the ranges for each dimension, and is thus able to use only sine
    or cosine (and uses cosine), so it has half the coefficients a full
    Fourier approximation would use.

    From the paper:
    G.D. Konidaris, S. Osentoski and P.S. Thomas.
    Value Function Approximation in Reinforcement Learning using the Fourier Basis.
    In Proceedings of the Twenty-Fifth Conference on Artificial Intelligence,
    pages 380-385, August 2011.
    """

    def __init__(self, nvars, ranges, order=3):
        """Build the (order+1)^nvars coefficient vectors.

        Args:
            nvars: Number of state variables.
            ranges: Sequence of (min, max) pairs, one per variable.
            order: Fourier basis order (highest coefficient per dimension).
        """
        # Integer arithmetic: pow(order + 1.0, nvars) made numTerms a float,
        # which is invalid wherever it is used as a count or array size.
        self.numTerms = (order + 1) ** nvars
        self.order = order
        self.ranges = numpy.array(ranges)
        # Every coefficient vector c in {0, ..., order}^nvars.
        # (Renamed from `iter`, which shadowed the builtin.)
        coefficients = itertools.product(range(order + 1), repeat=nvars)
        self.multipliers = numpy.array([list(map(int, c)) for c in coefficients])

    def computeFeatures(self, features):
        """Return phi_i(x) = cos(pi * c_i . scale(x)), x scaled to [0,1] per dim."""
        if len(features) == 0:
            return numpy.ones((1,))
        scaled = numpy.array([self.scale(features[i], i) for i in range(len(features))])
        return numpy.cos(numpy.pi * numpy.dot(self.multipliers, scaled))
class TrivialBasis(object):
    """Identity basis: uses the state features themselves as the basis.

    Does a little basic manipulation to keep values reasonable: each feature
    is linearly rescaled from its known range into [-1, +1]; constant
    (zero-width range) dimensions map to a fixed value.
    """

    def __init__(self, nvars, ranges):
        self.numTerms = nvars
        self.ranges = numpy.array(ranges)

    def scale(self, value, pos):
        """Rescale value of dimension pos into [0, 1] (0.0 for constant dims)."""
        low, high = self.ranges[pos, 0], self.ranges[pos, 1]
        if low == high:
            # Degenerate (constant) dimension: avoid division by zero.
            return 0.0
        return (value - low) / (high - low)

    def getNumBasisFunctions(self):
        return self.numTerms

    def computeFeatures(self, features):
        """Map raw features into [-1, 1]; returns [1.0] for empty input."""
        if len(features) == 0:
            return numpy.ones((1,))
        scaled = numpy.array([self.scale(f, i) for i, f in enumerate(features)])
        return (scaled - 0.5) * 2.
# Author: Pierre-Luc Bacon

# Environments that are always importable.
__all__ = ["fuelworld", "gridworld", "mountaincar", "acrobot", "cartpole",
           "multiroom", "skeleton_environment", "taxi", "windyworld",
           "batch_replenish", "puddleworld", "neurostim", "marble_maze",
           "bicycle", "chain", "twip"]

# Optional environments: exported only when their compiled extensions or
# third-party dependencies import successfully. The previous bare
# ``except:`` clauses swallowed *all* exceptions (including SystemExit and
# KeyboardInterrupt); ``except Exception`` keeps the best-effort behaviour
# (a broken extension still just disables the environment) without
# masking interpreter-level signals.
try:
    from libPOMDP import libpomdp
    __all__.append("pomdp")
except Exception:
    pass

try:
    from mdptetris import mdptetris
    __all__.append("tetris")
except Exception:
    pass


try:
    import pygame
    __all__.append("pinball")
except Exception:
    pass
15 | # 16 | 17 | import numpy 18 | from rlglue.environment.Environment import Environment 19 | from rlglue.environment import EnvironmentLoader as EnvironmentLoader 20 | from rlglue.types import Observation 21 | from rlglue.types import Action 22 | from rlglue.types import Reward_observation_terminal 23 | from pyrl.rlglue import TaskSpecRLGlue 24 | from pyrl.rlglue.registry import register_environment 25 | 26 | @register_environment 27 | class BatchReplenishment(Environment): 28 | """Batch replenishment inventory control task. 29 | 30 | The domain was given by George and Powell 2006. It is an example of a simple 31 | domain in which no fixed step-size performs well, but adaptive step-sizes 32 | do well. 33 | """ 34 | 35 | name = "Batch Replenishment" 36 | 37 | def __init__(self, demand_mean = 10.0, demand_std = 1.0, payoff = 5., 38 | cost = 2., gamma = 0.99, time_period = 20, noise=0.0): 39 | 40 | self.T = time_period 41 | self.noise = noise 42 | self.demand = numpy.array([demand_mean, demand_std]) 43 | self.payoff = payoff 44 | self.cost = cost 45 | self.discount = gamma 46 | self.max_quantity = 200. 
47 | self.domain_name = "Noisy Batch Replenishment Problem" 48 | 49 | def makeTaskSpec(self): 50 | ts = TaskSpecRLGlue.TaskSpec(discount_factor=self.discount, 51 | reward_range=(-self.max_quantity * self.cost, 52 | self.max_quantity * self.payoff)) 53 | ts.addDiscreteAction((0, 3)) # Representing purchase of 0, 1, 10, and 100 units 54 | ts.addContinuousObservation((0.0, self.max_quantity)) 55 | ts.addContinuousObservation((0.0, self.max_quantity)) 56 | ts.setEpisodic() 57 | ts.setExtra(self.domain_name) 58 | return ts.toTaskSpec() 59 | 60 | def reset(self): 61 | # Start with no resources in stock, and no unsatisfied demand 62 | self.state = numpy.zeros((2,)) 63 | self.counter = 0 64 | 65 | def env_init(self): 66 | return self.makeTaskSpec() 67 | 68 | def env_start(self): 69 | self.reset() 70 | returnObs = Observation() 71 | returnObs.doubleArray = self.state.tolist() 72 | return returnObs 73 | 74 | def takeAction(self, intAction): 75 | x = 0. if intAction == 0 else 10.**(intAction-1) 76 | self.counter += 1 77 | # If noisy, create noise on cost/payoff 78 | paynoise = numpy.random.normal(scale=self.noise) if self.noise > 0 else 0.0 79 | costnoise = numpy.random.normal(scale=self.noise) if self.noise > 0 else 0.0 80 | 81 | # Update random demand 82 | self.state[1] = min(self.max_quantity, 83 | max(0., numpy.random.normal(self.demand[0], scale=self.demand[1]))) 84 | reward = (self.payoff + paynoise) * self.state.min() - (self.cost + costnoise) * x 85 | self.state[0] = min(self.max_quantity, max(0., self.state[0] - self.state[1]) + x) 86 | 87 | 88 | return reward/600. 
    def env_step(self,thisAction):
        """RL-Glue step: apply the agent's action, return reward/observation.

        thisAction -- rlglue Action; intArray[0] selects the purchase action.
        Returns a Reward_observation_terminal. The episode is fixed-horizon:
        it terminates once ``counter`` reaches the time period ``T``.
        """
        intAction = thisAction.intArray[0]
        theReward = self.takeAction(intAction)

        # The observation is the current (stock, demand) state pair.
        theObs = Observation()
        theObs.doubleArray = self.state.tolist()

        returnRO = Reward_observation_terminal()
        returnRO.r = theReward
        returnRO.o = theObs
        # Terminal after exactly T steps (counter is incremented in takeAction).
        returnRO.terminal = int(self.counter >= self.T)

        return returnRO

    def env_cleanup(self):
        """RL-Glue cleanup hook; nothing to release for this environment."""
        pass

    def env_message(self,inMessage):
        """RL-Glue message hook; no custom messages are supported."""
        return "I don't know how to respond to your message";
(Number of steps to run)") 121 | parser.add_argument("--noise", type=float, default=0, help="Standard deviation of additive noise to generate") 122 | args = parser.parse_args() 123 | EnvironmentLoader.loadEnvironment(BatchReplenishment(demand_mean=args.demand_mean, 124 | demand_std=args.demand_std, 125 | payoff=args.payoff, 126 | cost=args.cost, 127 | gamma=args.discount_factor, 128 | time_period = args.time_period, 129 | noise=args.noise)) 130 | -------------------------------------------------------------------------------- /pyrl/environments/chain.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2013, Will Dabney 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import numpy 18 | from rlglue.environment.Environment import Environment 19 | from rlglue.environment import EnvironmentLoader as EnvironmentLoader 20 | from rlglue.types import Observation 21 | from rlglue.types import Action 22 | from rlglue.types import Reward_observation_terminal 23 | from pyrl.rlglue import TaskSpecRLGlue 24 | from pyrl.rlglue.registry import register_environment 25 | 26 | @register_environment 27 | class Chain(Environment): 28 | """The simple 5-state chain domain often used in the literature for more 29 | theoretical methods that don't scale as well to large problems. Its also 30 | a good demonstration of the need for sufficient exploration. 
31 | 32 | From paper: 33 | Bayesian Q-learning. 1998. 34 | Richard Dearden, Nir Friedman, and Stuart Russell. 35 | """ 36 | name = "Chain" 37 | 38 | def __init__(self, **kwargs): 39 | self.state = 0 40 | self.chain_size = kwargs.setdefault("chain_size", 5) 41 | self.slip_prob = kwargs.setdefault("slip_prob", 0.2) 42 | self.goal_reward = 10.0 43 | self.left_reward = 2.0 44 | self.right_reward = 0.0 45 | 46 | def makeTaskSpec(self): 47 | ts = TaskSpecRLGlue.TaskSpec(discount_factor=0.99, reward_range=(0.0, 10.0)) 48 | ts.addDiscreteAction((0, 1)) 49 | ts.addDiscreteObservation((0, self.chain_size-1)) 50 | ts.setContinuing() 51 | ts.setExtra(self.name) 52 | return ts.toTaskSpec() 53 | 54 | def getState(self): 55 | return [self.state] 56 | 57 | def reset(self): 58 | self.state = 0 59 | 60 | def env_init(self): 61 | return self.makeTaskSpec() 62 | 63 | def env_start(self): 64 | self.reset() 65 | returnObs = Observation() 66 | returnObs.intArray = self.getState() 67 | return returnObs 68 | 69 | def isAtGoal(self): 70 | return self.state == self.chain_size-1 71 | 72 | def takeAction(self, intAction): 73 | if numpy.random.random() < self.slip_prob: 74 | intAction = 0 if intAction == 1 else 1 75 | 76 | if intAction == 0: 77 | self.state = 0 78 | return self.left_reward 79 | else: 80 | self.state = min(self.chain_size-1, self.state+1) 81 | if self.isAtGoal(): 82 | return self.goal_reward 83 | else: 84 | return self.right_reward 85 | 86 | def env_step(self,thisAction): 87 | intAction = int(thisAction.intArray[0]) 88 | theReward = self.takeAction(intAction) 89 | theObs = Observation() 90 | theObs.intArray = self.getState() 91 | 92 | returnRO = Reward_observation_terminal() 93 | returnRO.r = theReward 94 | returnRO.o = theObs 95 | returnRO.terminal = 0 96 | 97 | return returnRO 98 | 99 | def env_cleanup(self): 100 | pass 101 | 102 | def env_message(self,inMessage): 103 | return "I don't know how to respond to your message"; 104 | 105 | 106 | 107 | 108 | 
-------------------------------------------------------------------------------- /pyrl/environments/configs/neurostim/params.dat: -------------------------------------------------------------------------------- 1 | test_features.dat 2 | test_labels.dat 3 | test_stimulation.dat 4 | 23198 5 | 5 6 | 3 7 | 3 8 | 0.02 9 | 5 -------------------------------------------------------------------------------- /pyrl/environments/configs/neurostim/test_stimulation.dat: -------------------------------------------------------------------------------- 1 | -0.3708554 -0.4063264 0.5785546 0.4593724 0.3791109 2 | -------------------------------------------------------------------------------- /pyrl/environments/configs/pinball/pinball_hard_single.cfg: -------------------------------------------------------------------------------- 1 | ball 0.015 2 | target 0.5 0.06 0.04 3 | start 0.055 0.95 4 | 5 | polygon 0.0 0.0 0.0 0.01 1.0 0.01 1.0 0.0 6 | polygon 0.0 0.0 0.01 0.0 0.01 1.0 0.0 1.0 7 | polygon 0.0 1.0 0.0 0.99 1.0 0.99 1.0 1.0 8 | polygon 1.0 1.0 0.99 1.0 0.99 0.0 1.0 0.0 9 | polygon 0.034 0.852 0.106 0.708 0.33199999999999996 0.674 0.17599999999999996 0.618 0.028 0.718 10 | polygon 0.15 0.7559999999999999 0.142 0.93 0.232 0.894 0.238 0.99 0.498 0.722 11 | polygon 0.8079999999999999 0.91 0.904 0.784 0.7799999999999999 0.572 0.942 0.562 0.952 0.82 0.874 0.934 12 | polygon 0.768 0.814 0.692 0.548 0.594 0.47 0.606 0.804 0.648 0.626 13 | polygon 0.22799999999999998 0.5760000000000001 0.39 0.322 0.3400000000000001 0.31400000000000006 0.184 0.456 14 | polygon 0.09 0.228 0.242 0.076 0.106 0.03 0.022 0.178 15 | polygon 0.11 0.278 0.24600000000000002 0.262 0.108 0.454 0.16 0.566 0.064 0.626 0.016 0.438 16 | polygon 0.772 0.1 0.71 0.20599999999999996 0.77 0.322 0.894 0.09600000000000002 0.8039999999999999 0.17600000000000002 17 | polygon 0.698 0.476 0.984 0.27199999999999996 0.908 0.512 18 | polygon 0.45 0.39199999999999996 0.614 0.25799999999999995 0.7340000000000001 0.438 19 | polygon 
0.476 0.868 0.552 0.8119999999999999 0.62 0.902 0.626 0.972 0.49 0.958 20 | polygon 0.61 0.014000000000000002 0.58 0.094 0.774 0.05000000000000001 0.63 0.054000000000000006 21 | polygon 0.33399999999999996 0.014 0.27799999999999997 0.03799999999999998 0.368 0.254 0.7 0.20000000000000004 0.764 0.108 0.526 0.158 22 | polygon 0.294 0.584 0.478 0.626 0.482 0.574 0.324 0.434 0.35 0.39 0.572 0.52 0.588 0.722 0.456 0.668 23 | -------------------------------------------------------------------------------- /pyrl/environments/configs/pinball/pinball_simple_single.cfg: -------------------------------------------------------------------------------- 1 | ball 0.02 2 | target 0.9 0.2 0.04 3 | start 0.2 0.9 4 | 5 | polygon 0.0 0.0 0.0 0.01 1.0 0.01 1.0 0.0 6 | polygon 0.0 0.0 0.01 0.0 0.01 1.0 0.0 1.0 7 | polygon 0.0 1.0 0.0 0.99 1.0 0.99 1.0 1.0 8 | polygon 1.0 1.0 0.99 1.0 0.99 0.0 1.0 0.0 9 | 10 | polygon 0.35 0.4 0.45 0.55 0.43 0.65 0.3 0.7 0.45 0.7 0.5 0.6 0.45 0.35 11 | polygon 0.2 0.6 0.25 0.55 0.15 0.5 0.15 0.45 0.2 0.3 0.12 0.27 0.075 0.35 0.09 0.55 12 | polygon 0.3 0.8 0.6 0.75 0.8 0.8 0.8 0.9 0.6 0.85 0.3 0.9 13 | polygon 0.8 0.7 0.975 0.65 0.75 0.5 0.9 0.3 0.7 0.35 0.63 0.65 14 | polygon 0.6 0.25 0.3 0.07 0.15 0.175 0.15 0.2 0.3 0.175 0.6 0.3 15 | polygon 0.75 0.025 0.8 0.24 0.725 0.27 0.7 0.025 16 | -------------------------------------------------------------------------------- /pyrl/environments/configs/pomdps/tiger.POMDP: -------------------------------------------------------------------------------- 1 | # This is the tiger problem of AAAI paper fame in the new pomdp 2 | # format. 
This format is still experimental and subject to change 3 | 4 | discount: 0.75 5 | values: reward 6 | states: tiger-left tiger-right 7 | actions: listen open-left open-right 8 | observations: tiger-left tiger-right 9 | 10 | T:listen 11 | identity 12 | 13 | T:open-left 14 | uniform 15 | 16 | T:open-right 17 | uniform 18 | 19 | O:listen 20 | 0.85 0.15 21 | 0.15 0.85 22 | 23 | O:open-left 24 | uniform 25 | 26 | O:open-right 27 | uniform 28 | 29 | R:listen : * : * : * -1 30 | 31 | R:open-left : tiger-left : * : * -100 32 | 33 | R:open-left : tiger-right : * : * 10 34 | 35 | R:open-right : tiger-left : * : * 10 36 | 37 | R:open-right : tiger-right : * : * -100 38 | 39 | -------------------------------------------------------------------------------- /pyrl/environments/configs/tetris/3brick.dat: -------------------------------------------------------------------------------- 1 | ########################################################################## 2 | # File format describing the pieces 3 | # number of pieces 4 | # for each piece: 5 | # number of orientations, height, width (for the first orientation) 6 | # piece (with "X") 7 | ########################################################################## 8 | # 9 | # Pieces with 3 bricks 10 | # There are 2 pieces: 11 | 2 12 | # 13 | 4 2 2 14 | XX 15 | X 16 | 2 1 3 17 | XXX 18 | # End of File -------------------------------------------------------------------------------- /pyrl/environments/configs/tetris/melax.dat: -------------------------------------------------------------------------------- 1 | ########################################################################## 2 | # File format describing the pieces 3 | # number of pieces 4 | # for each piece: 5 | # number of orientations, height, width (for the first orientation) 6 | # piece (with "X") 7 | ########################################################################## 8 | # 9 | # Melax's Reduced set of pieces 10 | # There are 5 pieces: 11 | 5 12 | # 13 | 1 1 1 
14 | X 15 | 2 1 2 16 | XX 17 | 2 2 2 18 | X 19 | X 20 | 4 2 2 21 | X 22 | XX 23 | 1 2 2 24 | XX 25 | XX 26 | # End of File -------------------------------------------------------------------------------- /pyrl/environments/configs/tetris/standard.dat: -------------------------------------------------------------------------------- 1 | ########################################################################## 2 | # File format describing the pieces 3 | # number of pieces 4 | # for each piece: 5 | # number of orientations, height, width (for the first orientation) 6 | # piece (with "X") 7 | ########################################################################## 8 | # 9 | # Pieces with 4 bricks (standard pieces) 10 | # There are 7 pieces: 11 | 7 12 | # 13 | 2 4 1 14 | X 15 | X 16 | X 17 | X 18 | 1 2 2 19 | XX 20 | XX 21 | 4 3 2 22 | X 23 | XX 24 | X 25 | 2 3 2 26 | X 27 | XX 28 | X 29 | 2 3 2 30 | X 31 | XX 32 | X 33 | 4 2 3 34 | X 35 | XXX 36 | 4 2 3 37 | XXX 38 | X 39 | # End of File -------------------------------------------------------------------------------- /pyrl/environments/configs/tetris/sztetris.dat: -------------------------------------------------------------------------------- 1 | ########################################################################## 2 | # File format describing the pieces 3 | # number of pieces 4 | # for each piece: 5 | # number of orientations, height, width (for the first orientation) 6 | # piece (with "X") 7 | ########################################################################## 8 | # 9 | # Pieces for SZ Tetris (only the N/Mirrored N (i.e. 
S/Z) pieces 10 | # There are 2 pieces: 11 | 2 12 | # 13 | 2 3 2 14 | X 15 | XX 16 | X 17 | 2 3 2 18 | X 19 | XX 20 | X 21 | # End of File -------------------------------------------------------------------------------- /pyrl/environments/fuelworld.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2013, Will Dabney 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import sys 18 | import numpy 19 | 20 | from rlglue.environment.Environment import Environment 21 | from rlglue.environment import EnvironmentLoader as EnvironmentLoader 22 | from rlglue.types import Observation 23 | from rlglue.types import Action 24 | from rlglue.types import Reward_observation_terminal 25 | 26 | from pyrl.rlglue import TaskSpecRLGlue 27 | from pyrl.rlglue.registry import register_environment 28 | 29 | from . import gridworld 30 | from scipy.stats import norm 31 | 32 | @register_environment 33 | class FuelWorld(gridworld.Gridworld): 34 | name = "Fuel World" 35 | 36 | # This is a continuous version of Todd Hester's Fuel World domain. 37 | # As such, we will make the size, starting locations, and goal fixed to 38 | # match the original's specifications. We will keep the additive gaussian noise, 39 | # and as mentioned this will be continuous instead of discrete state spaces. 
40 | def __init__(self, noise=0.0, fudge=1.4143, variation=(-10.0, -13.0, 5.0), fuel_noise=0.0): 41 | gridworld.Gridworld.__init__(self, size_x=31.0, size_y=21.0, goal_x=24.0, goal_y=11.0, 42 | noise=noise, random_start=True, fudge=fudge) 43 | self.fuel = 0.0 44 | self.fuel_noise = fuel_noise 45 | self.var = variation 46 | self.domain_name = "Continuous Fuel World" 47 | 48 | 49 | def makeTaskSpec(self): 50 | ts = TaskSpecRLGlue.TaskSpec(discount_factor=1.0, reward_range=(-400.0, 0.0)) 51 | ts.addDiscreteAction((0, 7)) 52 | ts.addContinuousObservation((0.0, self.size[0]-1)) 53 | ts.addContinuousObservation((0.0, self.size[1]-1)) 54 | ts.addContinuousObservation((-1.0, 60.0)) # Fuel range as per FuelRooms.cc 55 | ts.setEpisodic() 56 | ts.setExtra(self.domain_name) 57 | return ts.toTaskSpec() 58 | 59 | def env_start(self): 60 | self.reset() 61 | returnObs = Observation() 62 | returnObs.doubleArray = self.pos.tolist() + [self.fuel] 63 | return returnObs 64 | 65 | def reset(self): 66 | # Randomly start in the rectangle around (0,7),(4,12) 67 | self.pos = numpy.random.random((2,)) 68 | self.pos[0] *= 4.0 69 | self.pos[1] *= 5.0 70 | self.pos[1] += 7.0 71 | 72 | self.fuel = numpy.random.random()*4.0 + 14.0 # Between 14 and 18 73 | 74 | def inFuelCell(self, position): 75 | return self.pos[1] <= 1.0 or self.pos[1] >= self.size[1]-1.0 76 | 77 | def isAtGoal(self): 78 | return gridworld.Gridworld.isAtGoal(self) or self.fuel < 0 79 | 80 | def getState(self): 81 | return gridworld.Gridworld.getState(self) + [self.fuel] 82 | 83 | def takeAction(self, intAction): 84 | if intAction == 0: 85 | self.pos[0] += 1.0 86 | elif intAction == 1: 87 | self.pos[0] -= 1.0 88 | elif intAction == 2: 89 | self.pos[1] += 1.0 90 | elif intAction == 3: 91 | self.pos[1] -= 1.0 92 | elif intAction == 4: 93 | self.pos += numpy.array([-1.0, 1.0]) 94 | elif intAction == 5: 95 | self.pos += numpy.array([1.0, 1.0]) 96 | elif intAction == 6: 97 | self.pos += numpy.array([-1.0, -1.0]) 98 | elif intAction == 
7: 99 | self.pos += numpy.array([1.0, -1.0]) 100 | 101 | if self.noise > 0: 102 | self.pos += numpy.random.normal(scale=self.noise, size=(2,)) 103 | 104 | self.pos = self.pos.clip([0, 0], self.size) 105 | 106 | self.fuel -= 1.0 107 | if self.fuel_noise > 0: 108 | self.fuel += numpy.random.normal(scale=self.fuel_noise) 109 | 110 | if self.inFuelCell(self.pos): 111 | self.fuel += 20.0 112 | if self.fuel > 60.0: 113 | self.fuel = 60.0 114 | 115 | if gridworld.Gridworld.isAtGoal(self): 116 | return 0.0 117 | elif self.fuel < 0: 118 | return -400.0 119 | elif self.inFuelCell(self.pos): # Fuel costs 120 | base = self.var[0] if self.pos[1] <= 1.0 else self.var[1] 121 | a = self.var[2] 122 | return base - (int(self.pos[0]) % 5)*a 123 | elif intAction < 4: 124 | return -1.0 125 | elif intAction >= 4: 126 | return -1.4 127 | else: 128 | print "ERROR in FuelWorld.takeAction" 129 | 130 | 131 | if __name__=="__main__": 132 | import argparse 133 | parser = argparse.ArgumentParser(description='Run 2D MultiRoom Noisy Continuous Gridworld environment in network mode.') 134 | gridworld.addGridworldArgs(parser) 135 | parser.add_argument("--fuel_noise", type=float, default=0.0, 136 | help="If non-zero then gives the standard deviation of the additive Gaussian noise to add to the fuel expenditure.") 137 | args = parser.parse_args() 138 | EnvironmentLoader.loadEnvironment(FuelWorld(noise=args.noise, fudge=args.fudge, fuel_noise=args.fuel_noise)) 139 | -------------------------------------------------------------------------------- /pyrl/environments/gridworld.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2013, Will Dabney 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import numpy 18 | 19 | from rlglue.environment.Environment import Environment 20 | from rlglue.environment import EnvironmentLoader as EnvironmentLoader 21 | from rlglue.types import Observation 22 | from rlglue.types import Action 23 | from rlglue.types import Reward_observation_terminal 24 | from pyrl.rlglue import TaskSpecRLGlue 25 | from pyrl.rlglue.registry import register_environment 26 | 27 | @register_environment 28 | class Gridworld(Environment): 29 | name = "Gridworld" 30 | 31 | # All parameters are in units of 1, where 1 is how far on average 32 | # the agent can move with a single action. 
33 | def __init__(self, size_x=10, size_y=10, goal_x=10, goal_y=10, noise=0.0, reward_noise=0.0, random_start=False, fudge=1.4143): 34 | self.size = numpy.array([size_x, size_y]) 35 | self.goal = numpy.array([goal_x, goal_y]) 36 | self.noise = noise 37 | self.reward_noise = reward_noise 38 | self.random_start = random_start 39 | self.pos = numpy.zeros((2,)) 40 | self.fudge = fudge 41 | self.domain_name = "Continuous Gridworld by Will Dabney" 42 | 43 | def makeTaskSpec(self): 44 | ts = TaskSpecRLGlue.TaskSpec(discount_factor=1.0, reward_range=(-1.0, 0.0)) 45 | ts.addDiscreteAction((0, 3)) 46 | ts.addContinuousObservation((0.0, self.size[0])) 47 | ts.addContinuousObservation((0.0, self.size[1])) 48 | ts.setEpisodic() 49 | ts.setExtra(self.domain_name) 50 | return ts.toTaskSpec() 51 | 52 | def getState(self): 53 | return self.pos.tolist() 54 | 55 | def reset(self): 56 | if self.random_start: 57 | self.pos = numpy.random.random((2,)) * self.size 58 | else: 59 | self.pos[:] = 0.0 60 | 61 | def env_init(self): 62 | return self.makeTaskSpec() 63 | 64 | def env_start(self): 65 | self.reset() 66 | returnObs = Observation() 67 | returnObs.doubleArray = self.getState() 68 | return returnObs 69 | 70 | def isAtGoal(self): 71 | return numpy.linalg.norm(self.pos - self.goal) < self.fudge 72 | 73 | def takeAction(self, intAction): 74 | if intAction == 0: 75 | self.pos[0] += 1.0 76 | elif intAction == 1: 77 | self.pos[0] -= 1.0 78 | elif intAction == 2: 79 | self.pos[1] += 1.0 80 | elif intAction == 3: 81 | self.pos[1] -= 1.0 82 | 83 | if self.noise > 0: 84 | self.pos += numpy.random.normal(scale=self.noise, size=(2,)) 85 | self.pos = self.pos.clip([0, 0], self.size) 86 | return 0.0 if self.isAtGoal() else -1.0 87 | 88 | def env_step(self,thisAction): 89 | episodeOver = 0 90 | intAction = thisAction.intArray[0] 91 | 92 | theReward = self.takeAction(intAction) 93 | 94 | if self.isAtGoal(): 95 | episodeOver = 1 96 | 97 | if self.reward_noise > 0: 98 | theReward += 
def _parse_bool(value):
    """Convert a command-line string into a bool.

    ``type=bool`` is broken with argparse: any non-empty string — including
    the literal "False" — is truthy, so ``--random_restarts False`` used to
    enable random restarts. Accepts common spellings; raising ValueError
    makes argparse report an invalid-value error for anything else.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0"):
        return False
    raise ValueError("expected a boolean value, got %r" % (value,))

def addGridworldArgs(parser):
    """Register the standard Gridworld options on an argparse parser.

    Shared by Gridworld's own entry point and by subclasses' command-line
    entry points (e.g. FuelWorld).
    """
    parser.add_argument("--size_x", type=float, default=10, help="Size of the gridworld in the x (horizontal) dimension, where 1.0 is the unit of movement.")
    parser.add_argument("--size_y", type=float, default=10, help="Size of the gridworld in the y (vertical) dimension, where 1.0 is the unit of movement.")
    parser.add_argument("--goal_x", type=float, default=10, help="Goal x coordinate")
    parser.add_argument("--goal_y", type=float, default=10, help="Goal y coordinate")
    parser.add_argument("--noise", type=float, default=0, help="Standard deviation of additive noise to generate")
    parser.add_argument("--fudge", type=float, default=1.4143, help="Distance from goal allowed before episode is counted as finished")
    # Fix: type=bool parsed the string "False" as True; use a real converter.
    parser.add_argument("--random_restarts", type=_parse_bool, default=False, help="Randomly assign x,y initial locations.")
-------------------------------------------------------------------------------- 1 | */*.py[co] 2 | */*.o 3 | */*.so 4 | 5 | # Packages 6 | *.egg 7 | *.egg-info 8 | dist 9 | build 10 | eggs 11 | parts 12 | bin 13 | var 14 | sdist 15 | develop-eggs 16 | .installed.cfg 17 | */*~ 18 | # Installer logs 19 | pip-log.txt 20 | 21 | # Unit test / coverage reports 22 | .coverage 23 | .tox 24 | 25 | #Translations 26 | *.mo 27 | 28 | #Mr Developer 29 | .mr.developer.cfg 30 | -------------------------------------------------------------------------------- /pyrl/environments/libPOMDP/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | PROJECT(libPOMDP) 4 | 5 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}) 6 | 7 | file(GLOB SRC 8 | "src/*.h" 9 | "src/*.c" 10 | ) 11 | 12 | find_package(PythonInterp REQUIRED) 13 | find_package(PythonLibs REQUIRED) 14 | find_package(BISON REQUIRED) 15 | find_package(FLEX REQUIRED) 16 | 17 | # Semi-hacky way of getting numpy included 18 | # Needs testing on linux 19 | STRING(REPLACE bin/python lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/numpy/core/include/numpy/ NUMPY_INC ${PYTHON_EXECUTABLE}) 20 | 21 | include_directories(${PYTHON_INCLUDE_DIRS}) 22 | include_directories(${NUMPY_INC}) 23 | 24 | BISON_TARGET(parser ${CMAKE_CURRENT_SOURCE_DIR}/src/parser.y ${CMAKE_CURRENT_SOURCE_DIR}/src/parser.c) 25 | FLEX_TARGET(scanner ${CMAKE_CURRENT_SOURCE_DIR}/src/scanner.l ${CMAKE_CURRENT_SOURCE_DIR}/src/scanner.c) 26 | ADD_FLEX_BISON_DEPENDENCY(scanner parser) 27 | 28 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}) 29 | 30 | add_library(libpomdp MODULE ${SRC} ${BISON_parser_OUTPUTS} ${FLEX_scanner_OUTPUTS}) 31 | set_target_properties(libpomdp PROPERTIES PREFIX "") 32 | target_link_libraries(libpomdp ${PYTHON_LIBRARIES} ${FLEX_LIBRARIES} ${BISON_LIBRARIES}) 33 | 
-------------------------------------------------------------------------------- /pyrl/environments/libPOMDP/README.md: -------------------------------------------------------------------------------- 1 | pyrl.environments.libPOMDP 2 | ============================ 3 | 4 | This is primarily code from pomdp-solve written by Anthony R. Cassandra. I've only added a 5 | new makefile, which is not as sophisticated as the original, and some code to allow the whole thing 6 | to be compiled into a Python module. 7 | 8 | At present this only contains the code relevant for reading and writing the MDP/POMDP specification 9 | files, and interacting with the information contained within them. However, pomdp-solve itself 10 | has many useful implementations in pure C that may later be brought into this module for use in python. 11 | 12 | This has been tested on Mac OS X, but 'should' also work in Linux. -------------------------------------------------------------------------------- /pyrl/environments/libPOMDP/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amarack/python-rl/a1c1f5bc42cb20f5d9630818d1908f2100916ef4/pyrl/environments/libPOMDP/__init__.py -------------------------------------------------------------------------------- /pyrl/environments/libPOMDP/src/imm-reward.h: -------------------------------------------------------------------------------- 1 | /* imm-reward.h 2 | 3 | ***** 4 | Copyright 1994-1997, Brown University 5 | Copyright 1998, 1999, Anthony R. Cassandra 6 | 7 | All Rights Reserved 8 | 9 | Permission to use, copy, modify, and distribute this software and its 10 | documentation for any purpose other than its incorporation into a 11 | commercial product is hereby granted without fee, provided that the 12 | above copyright notice appear in all copies and that both that 13 | copyright notice and this permission notice appear in supporting 14 | documentation. 
15 | 16 | ANTHONY CASSANDRA DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 17 | INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ANY 18 | PARTICULAR PURPOSE. IN NO EVENT SHALL ANTHONY CASSANDRA BE LIABLE FOR 19 | ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 20 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 21 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 22 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 23 | ***** 24 | 25 | Header file for imm-reward.c 26 | */ 27 | #ifndef MDP_IMM_REWARD_H 28 | #define MDP_IMM_REWARD_H 1 29 | 30 | #include "sparse-matrix.h" 31 | 32 | /* 33 | We will represent the general immediate reward structure as a 34 | linked list, where each node of the list will correspond to a single 35 | R: * : ... entry. The entry from the file could specify a single 36 | value, a row of values, or an entire matrix. Thus we need three 37 | different representations depending on the situation. Additionally, 38 | all of the components could have a wildcard character indicating 39 | that it is a specification for a family of values. This is indicated 40 | with special characters. 41 | 42 | */ 43 | 44 | /* Each of the action, states and obs could have a state index number, 45 | or one of these two values. Since states cannot be negative we use 46 | negative values for the special characters. The observation cannot 47 | be present when the next_state is present, but this should be 48 | enforced by the parser. When both the next state and obs are not 49 | present, we will use a sparse matrix representation. When only the 50 | obs is not present we will use a single dimensional, non-sparse 51 | matrix. When both are specified we use a single value. Note that 52 | it does not matter if the indivdual elements are specific indices or 53 | a wildcard, either way we will store a single value. 
/* One node of the linked list of immediate-reward entries. Each node
   corresponds to a single "R: ..." entry from the MDP/POMDP file; 'type'
   records which representation the entry uses. */
typedef struct Imm_Reward_Node_Struct *Imm_Reward_List;
struct Imm_Reward_Node_Struct {

  IR_Type type;          /* Selects which member of 'rep' below is valid. */

  int action;            /* Each of these may be a concrete index, */
  int cur_state;         /* WILDCARD_SPEC, or NOT_PRESENT as described */
  int next_state;        /* in the comment preceding those #defines. */
  int obs;

  union rep_tag {
    double value;        /* ir_value: both next_state and obs specified. */
    double *vector;      /* ir_vector: only obs absent (1-D, non-sparse). */
    Matrix matrix;       /* ir_matrix: both next_state and obs absent. */
  } rep;

  Imm_Reward_List next;  /* Next R-entry in the list, or NULL. */
};
4 | #include "arrayobject.h" 5 | //#include "numpy/arrayobject.h" 6 | 7 | #include 8 | #include "mdp.h" 9 | #include "imm-reward.h" 10 | 11 | static PyObject *C_readMDP(PyObject *self, PyObject *args); 12 | /* list by action of sparse matrix of shape: gNumStates x gNumStates */ 13 | static PyObject* getSparseTransitionMatrix(PyObject *self, PyObject *args); 14 | /* list by action of sparse matrix of shape: gNumStates x gNumObservations */ 15 | static PyObject* getSparseObsMatrix(PyObject *self, PyObject *args); 16 | 17 | static PyObject *C_getRewardRange(PyObject *self, PyObject *args); 18 | static PyObject *C_getReward(PyObject *self, PyObject *args); 19 | static PyObject *C_transformBelief(PyObject *self, PyObject *args); 20 | static PyObject *C_getInitialBelief(PyObject *self, PyObject *args); 21 | static PyObject *C_isRewardType(PyObject *self, PyObject *args); 22 | static PyObject *C_getNumObservations(PyObject *self, PyObject *args); 23 | static PyObject *C_getNumActions(PyObject *self, PyObject *args); 24 | static PyObject *C_getNumStates(PyObject *self, PyObject *args); 25 | static PyObject *C_getDiscount(PyObject *self, PyObject *args); 26 | 27 | static PyObject *loadFile(PyObject *self, PyObject *args); 28 | PyObject* fillPyMatrix(Matrix *target); 29 | 30 | #define validState(S) (S >= 0 && S < gNumStates) 31 | #define validAction(A) (A >= 0 && A < gNumActions) 32 | #define validObservation(O) (O >= 0 && O < gNumObservations) 33 | 34 | -------------------------------------------------------------------------------- /pyrl/environments/libPOMDP/src/mdp-common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * 4 | * 5 | * global.h 6 | * 7 | * 8 | * Anthony R. 
Cassandra 9 | * 10 | * 11 | * July, 1998 12 | * 13 | * 14 | * 15 | * $RCSfile: mdp-common.h,v $ 16 | * $Source: /u/cvs/proj/pomdp-solve/src/mdp/mdp-common.h,v $ 17 | * $Revision: 1.1 $ 18 | * $Date: 2004/10/10 03:44:59 $ 19 | * 20 | * 21 | * 22 | * 23 | * 1994-1997, Brown University 24 | * 1998-2003, Anthony R. Cassandra 25 | * 26 | * All Rights Reserved 27 | * 28 | * Permission to use, copy, modify, and distribute this software and its 29 | * documentation for any purpose other than its incorporation into a 30 | * commercial product is hereby granted without fee, provided that the 31 | * above copyright notice appear in all copies and that both that 32 | * copyright notice and this permission notice appear in supporting 33 | * documentation. 34 | * 35 | * ANTHONY CASSANDRA DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 36 | * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ANY 37 | * PARTICULAR PURPOSE. IN NO EVENT SHALL ANTHONY CASSANDRA BE LIABLE FOR 38 | * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 39 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 40 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 41 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 42 | * 43 | * 44 | * 45 | * 46 | */ 47 | 48 | /* 49 | * Header file for all globally defined items in the mdp library. 
50 | */ 51 | 52 | #ifndef MDP_COMMON_H 53 | #define MDP_COMMON_H 54 | 55 | #ifdef DMALLOC 56 | 57 | #include "dmalloc.h" 58 | 59 | #define XCALLOC(num, size) calloc( (num), (size) ) 60 | #define XMALLOC(size) malloc( size ) 61 | #define XREALLOC(p, size) realloc( (p), (size) ) 62 | #define XFREE(stale) free(stale) 63 | 64 | #else 65 | 66 | #define XCALLOC(num, size) calloc( (num), (size) ) 67 | #define XMALLOC(size) malloc( size ) 68 | #define XREALLOC(p, size) realloc( (p), (size) ) 69 | #define XFREE(stale) free(stale) 70 | 71 | #endif 72 | 73 | #endif 74 | 75 | -------------------------------------------------------------------------------- /pyrl/environments/libPOMDP/src/mdp.h: -------------------------------------------------------------------------------- 1 | /* mdp.h 2 | 3 | ***** 4 | Copyright 1994-1997, Brown University 5 | Copyright 1998, 1999, Anthony R. Cassandra 6 | 7 | All Rights Reserved 8 | 9 | Permission to use, copy, modify, and distribute this software and its 10 | documentation for any purpose other than its incorporation into a 11 | commercial product is hereby granted without fee, provided that the 12 | above copyright notice appear in all copies and that both that 13 | copyright notice and this permission notice appear in supporting 14 | documentation. 15 | 16 | ANTHONY CASSANDRA DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 17 | INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ANY 18 | PARTICULAR PURPOSE. IN NO EVENT SHALL ANTHONY CASSANDRA BE LIABLE FOR 19 | ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 20 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 21 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 22 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
23 | ***** 24 | 25 | header file for mdp.c 26 | */ 27 | #ifndef MDP_MDP_H 28 | #define MDP_MDP_H 29 | 30 | #include "sparse-matrix.h" 31 | 32 | /* Use this type for a variable that indicated whether we have a 33 | POMDP or an MDP. 34 | */ 35 | typedef enum { UNKNOWN_problem_type, 36 | MDP_problem_type, 37 | POMDP_problem_type 38 | } Problem_Type; 39 | 40 | /* Use this to determine if the problems values are rewards or costs. 41 | */ 42 | #define NUM_VALUE_TYPES 2 43 | typedef enum {REWARD_value_type, COST_value_type } Value_Type; 44 | #define VALUE_TYPE_STRINGS { \ 45 | "cost", \ 46 | "reward" \ 47 | } 48 | 49 | #define DEFAULT_DISCOUNT_FACTOR 1.0 50 | 51 | #define DEFAULT_VALUE_TYPE REWARD_value_type 52 | 53 | #define INVALID_STATE -1 54 | #define INVALID_OBS -1 55 | #define INVALID_ACTION -1 56 | 57 | #ifndef MDP_C 58 | 59 | /* Exported variables */ 60 | extern char *value_type_str[]; 61 | extern double gDiscount; 62 | extern Problem_Type gProblemType; 63 | extern Value_Type gValueType; 64 | 65 | /* We will use this flag to indicate whether the problem has negative 66 | rewards or not. It starts off FALSE and becomes TRUE if any 67 | negative reward is found. */ 68 | extern double gMinimumImmediateReward; 69 | 70 | extern int gNumStates; 71 | extern int gNumActions; 72 | extern int gNumObservations; 73 | 74 | /* Intermediate variables */ 75 | 76 | extern I_Matrix *IP; /* Transition Probabilities */ 77 | extern I_Matrix *IR; /* Observation Probabilities */ 78 | extern I_Matrix IQ; /* Immediate values for MDP only */ 79 | 80 | /* Sparse variables */ 81 | 82 | extern Matrix *P; /* Transition Probabilities */ 83 | extern Matrix *R; /* Observation Probabilities */ 84 | extern Matrix *QI; /* The immediate values, for MDPs only */ 85 | extern Matrix Q; /* Immediate values for state action pairs. 
These 86 | are expectations computed from immediate values: 87 | either the QI for MDPs or the special 88 | representation for the POMDPs */ 89 | 90 | extern double *gInitialBelief; /* For POMDPs */ 91 | extern int gInitialState; /* For MDPs */ 92 | 93 | /* Exported functions */ 94 | extern double *newBeliefState(); 95 | extern int transformBeliefState( double *pi, 96 | double *pi_hat, 97 | int a, 98 | int obs ); 99 | extern void copyBeliefState( double *copy, double *pi ); 100 | extern void displayBeliefState( FILE *file, double *pi ); 101 | extern int readMDP( char *filename ); 102 | extern void convertMatrices(); 103 | extern void deallocateMDP(); 104 | extern void convertMatrices(); 105 | extern int verifyIntermediateMDP(); 106 | extern void deallocateIntermediateMDP(); 107 | extern void allocateIntermediateMDP(); 108 | extern int writeMDP( char *filename ); 109 | extern void displayMDPSlice( int state ); 110 | 111 | #endif 112 | #endif 113 | -------------------------------------------------------------------------------- /pyrl/environments/libPOMDP/src/parse_constant.h: -------------------------------------------------------------------------------- 1 | /* parse_constant.h 2 | 3 | ***** 4 | Copyright 1994-1997, Brown University 5 | Copyright 1998, 1999, Anthony R. Cassandra 6 | 7 | All Rights Reserved 8 | 9 | Permission to use, copy, modify, and distribute this software and its 10 | documentation for any purpose other than its incorporation into a 11 | commercial product is hereby granted without fee, provided that the 12 | above copyright notice appear in all copies and that both that 13 | copyright notice and this permission notice appear in supporting 14 | documentation. 15 | 16 | ANTHONY CASSANDRA DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 17 | INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ANY 18 | PARTICULAR PURPOSE. 
IN NO EVENT SHALL ANTHONY CASSANDRA BE LIABLE FOR 19 | ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 20 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 21 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 22 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 23 | ***** 24 | */ 25 | 26 | #ifndef MDP_PARSE_CONSTANT_H 27 | #define MDP_PARSE_CONSTANT_H 1 28 | 29 | typedef enum { CONST_INT, CONST_STRING, CONST_FLOAT } Const_Type; 30 | 31 | typedef struct cNode 32 | { 33 | Const_Type theTag; /* Type of constant it is */ 34 | union { 35 | int theInt; 36 | char *theString; 37 | double theFloat; 38 | } theValue; 39 | } Constant_Block; 40 | 41 | #endif 42 | 43 | -------------------------------------------------------------------------------- /pyrl/environments/libPOMDP/src/parse_hash.h: -------------------------------------------------------------------------------- 1 | /* parse_hash.h 2 | 3 | ***** 4 | Copyright 1994-1997, Brown University 5 | Copyright 1998, 1999, Anthony R. Cassandra 6 | 7 | All Rights Reserved 8 | 9 | Permission to use, copy, modify, and distribute this software and its 10 | documentation for any purpose other than its incorporation into a 11 | commercial product is hereby granted without fee, provided that the 12 | above copyright notice appear in all copies and that both that 13 | copyright notice and this permission notice appear in supporting 14 | documentation. 15 | 16 | ANTHONY CASSANDRA DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 17 | INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ANY 18 | PARTICULAR PURPOSE. IN NO EVENT SHALL ANTHONY CASSANDRA BE LIABLE FOR 19 | ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 20 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 21 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 22 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
23 | ***** 24 | */ 25 | 26 | #ifndef MDP_PARSE_HASH_H 27 | #define MDP_PARSE_HASH_H 1 28 | 29 | #define HASH_TABLE_SIZE 255 30 | 31 | typedef enum { nt_state, nt_action, 32 | nt_observation, nt_unknown } Mnemonic_Type; 33 | 34 | typedef struct Node_Struct *Node; 35 | struct Node_Struct { 36 | Mnemonic_Type type; 37 | int number; 38 | char *str; 39 | Node next; 40 | }; 41 | 42 | extern void H_create(); 43 | extern void H_destroy(); 44 | extern int H_enter( char *str, Mnemonic_Type type ); 45 | extern int H_lookup( char *str, Mnemonic_Type type ); 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /pyrl/environments/libPOMDP/src/sparse-matrix.h: -------------------------------------------------------------------------------- 1 | /* sparse-matrix.h 2 | 3 | Header file for sparse-matrix.c. 4 | 5 | ***** 6 | Copyright 1994-1997, Brown University 7 | Copyright 1998, 1999, Anthony R. Cassandra 8 | 9 | All Rights Reserved 10 | 11 | Permission to use, copy, modify, and distribute this software and its 12 | documentation for any purpose other than its incorporation into a 13 | commercial product is hereby granted without fee, provided that the 14 | above copyright notice appear in all copies and that both that 15 | copyright notice and this permission notice appear in supporting 16 | documentation. 17 | 18 | ANTHONY CASSANDRA DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 19 | INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ANY 20 | PARTICULAR PURPOSE. IN NO EVENT SHALL ANTHONY CASSANDRA BE LIABLE FOR 21 | ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 22 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 23 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 24 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
/* A matrix will be sparsely represented by a bunch of arrays
   (a compressed-row-style layout: values stored row by row with
   per-row offsets and lengths plus a parallel column-index array).
*/
struct Matrix_Struct {
  int num_rows;      /* Number of rows in the matrix. */
  int num_non_zero;  /* Total number of non-zero entries stored. */
  double *mat_val;   /* The actual non-zero entries stored row by row. */
  int *row_start;    /* The position for the start of each row in mat_val. */
  int *row_length;   /* The length of each row in mat_val. */
  int *col;          /* The column number for each entry in mat_val. */
};
typedef struct Matrix_Struct *Matrix;
@register_environment
class MarbleMaze(Environment):
    """A simple gridworld like domain, with many wall segments to create
    a maze. This domain, unlike the other gridworld domains will be made
    entirely discrete.

    From paper:
    A Bayesian Sampling Approach to Exploration in Reinforcement Learning. 2009.
    John Asmuth, Lihong Li, Michael Littman, Ali Nouri, and David Wingate.
    """
    name = "Marble Maze"

    def __init__(self, **kwargs):
        """Build the maze layout, pit/goal locations, and movement noise.

        Keyword Args:
            noise (float): probability in [0, 1] that a move slips
                perpendicular to the chosen direction. Default 0.2.
        """
        # Building walls: each int in the maze matrix is a 4-bit mask of
        # which walls surround that cell.
        # 0000 no walls
        # 0001 wall to the north
        N = 1
        # 0010 wall to the east
        E = 2
        # 0100 wall to the south
        S = 4
        # 1000 wall to the west
        W = 8
        self.directions = numpy.array([N, E, S, W], dtype=int)
        self.maze = numpy.array([[N+W, N, N, N, N, N+E],
                                 [W+S, N+E+S, W, E, W, E],
                                 [N+W, N+S, S, S+E, W, E],
                                 [S+W, N+S, N, N, E, E],
                                 [N+W, N, 0, S, 0, E],
                                 [S+W, S, S+E, S+W+N, S+E, S+E+W]], dtype=int)
        # Entering any of these cells ends the episode with reward -1.
        self.pits = numpy.array([[1, 1], [4, 1], [4, 2], [3, 3]], dtype=int)
        self.noise = kwargs.setdefault('noise', 0.2)
        self.start_loc = numpy.zeros((2,), dtype=int)
        self.pos = numpy.zeros((2,), dtype=int)
        self.step_reward = -0.001
        self.goal_loc = numpy.array([5, 5], dtype=int)
        # NOTE(review): "Marbel" looks like a typo, but the string is kept
        # byte-identical in case anything matches on the exact extra text.
        self.domain_name = "Marbel Maze (Discrete)"

    def makeTaskSpec(self):
        """Return the RL-Glue task-spec string describing this domain."""
        ts = TaskSpecRLGlue.TaskSpec(discount_factor=0.95, reward_range=(-1.0, 1.0))
        ts.addDiscreteAction((0, 3))
        ts.addDiscreteObservation((0, self.maze.shape[0]-1))
        ts.addDiscreteObservation((0, self.maze.shape[1]-1))
        ts.setEpisodic()
        ts.setExtra(self.domain_name)
        return ts.toTaskSpec()

    def getState(self):
        """Return the current position as a two-element [row, col] list."""
        return self.pos.tolist()

    def reset(self):
        """Move the agent back to the start location."""
        self.pos = self.start_loc.copy()

    def env_init(self):
        return self.makeTaskSpec()

    def env_start(self):
        self.reset()
        returnObs = Observation()
        returnObs.intArray = self.getState()
        return returnObs

    def isAtGoal(self):
        """True when the current position equals the goal location."""
        return (self.pos == self.goal_loc).all()

    def takeAction(self, intAction):
        """Apply discrete action 0-3; return (reward, episode_over).

        Actions 0/1 move along axis 0 (+1/-1), actions 2/3 along axis 1.
        Walls of the *current* cell clamp the resulting motion.
        """
        direction = numpy.zeros((2,), dtype=int)
        # Floor division keeps the index an int under Python 3; the old
        # '/' produced a float index there and raised TypeError.
        direction[int(intAction) // 2] = 1 + (intAction % 2)*-2

        # Noisy movement causes agent to move perpendicular to
        # the desired action, with equal likelihood for either option
        if numpy.random.random() < self.noise:
            direction.fill(0)
            direction[int(intAction < 2)] = numpy.random.randint(2)*-2 + 1
        if self.maze[tuple(self.pos)] % 2 != 0:  # North wall
            direction[0] = max(0, direction[0])
        if (self.maze[tuple(self.pos)] % 8 >= 4):  # South wall
            direction[0] = min(0, direction[0])
        if (self.maze[tuple(self.pos)] % 4 >= 2):  # East wall
            direction[1] = min(0, direction[1])
        if self.maze[tuple(self.pos)] >= 8:  # West wall
            direction[1] = max(0, direction[1])

        self.pos += direction
        if self.isAtGoal():
            return 1.0, True
        elif self.pos.tolist() in self.pits.tolist():
            return -1.0, True
        else:
            return self.step_reward, False

    def env_step(self, thisAction):
        intAction = int(thisAction.intArray[0])
        theReward, episodeOver = self.takeAction(intAction)

        theObs = Observation()
        theObs.intArray = self.getState()

        returnRO = Reward_observation_terminal()
        returnRO.r = theReward
        returnRO.o = theObs
        returnRO.terminal = int(episodeOver)

        return returnRO

    def env_cleanup(self):
        pass

    def env_message(self, inMessage):
        return "I don't know how to respond to your message"
"") 20 | target_link_libraries(mdptetris ${PYTHON_LIBRARIES}) 21 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/README.md: -------------------------------------------------------------------------------- 1 | mdptetris 2 | ========= 3 | 4 | This is a modified version of mdptetris from: 5 | 6 | https://gforge.inria.fr/projects/mdptetris/ 7 | http://mdptetris.gforge.inria.fr/doc/ 8 | 9 | It extends their Tetris implementation to be compiled as a python module, and adds 10 | a few utility functions for use with general reinforcement learning agents. We have 11 | also removed most of the agent logic from the original code, and are continuing to 12 | pair down this fork of their codebase to just the essentials for use as a Tetris 13 | reinforcement learning environment, where agents do not have specific domain knowledge. 14 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amarack/python-rl/a1c1f5bc42cb20f5d9630818d1908f2100916ef4/pyrl/environments/mdptetris/__init__.py -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/data/features/bertsekas_initial.dat: -------------------------------------------------------------------------------- 1 | 1 2 | 1 3 | 22 4 | 0 0 5 | 8 0 6 | 8 0 7 | 8 0 8 | 8 0 9 | 8 0 10 | 8 0 11 | 8 0 12 | 8 0 13 | 8 0 14 | 8 0 15 | 9 0 16 | 9 0 17 | 9 0 18 | 9 0 19 | 9 0 20 | 9 0 21 | 9 0 22 | 9 0 23 | 9 0 24 | 7 -10 25 | 5 -1 26 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/data/features/ce_bdu.dat: -------------------------------------------------------------------------------- 1 | 0 2 | -1 3 | 28 4 | 8 0 100 5 | 8 0 100 6 | 8 0 100 7 | 8 0 100 8 | 8 0 100 9 | 8 0 100 10 | 8 0 
100 11 | 8 0 100 12 | 8 0 100 13 | 8 0 100 14 | 9 0 100 15 | 9 0 100 16 | 9 0 100 17 | 9 0 100 18 | 9 0 100 19 | 9 0 100 20 | 9 0 100 21 | 9 0 100 22 | 9 0 100 23 | 7 0 100 24 | 5 0 100 25 | 1 0 100 26 | 2 0 100 27 | 3 0 100 28 | 4 0 100 29 | 6 0 100 30 | -1 0 100 31 | -2 0 100 32 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/data/features/ce_bertsekas.dat: -------------------------------------------------------------------------------- 1 | 0 2 | -1 3 | 21 4 | 8 0 100 5 | 8 0 100 6 | 8 0 100 7 | 8 0 100 8 | 8 0 100 9 | 8 0 100 10 | 8 0 100 11 | 8 0 100 12 | 8 0 100 13 | 8 0 100 14 | 9 0 100 15 | 9 0 100 16 | 9 0 100 17 | 9 0 100 18 | 9 0 100 19 | 9 0 100 20 | 9 0 100 21 | 9 0 100 22 | 9 0 100 23 | 7 0 100 24 | 5 0 100 25 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/data/features/ce_bertsekas_dellacherie.dat: -------------------------------------------------------------------------------- 1 | 0 2 | -1 3 | 26 4 | 8 0 100 5 | 8 0 100 6 | 8 0 100 7 | 8 0 100 8 | 8 0 100 9 | 8 0 100 10 | 8 0 100 11 | 8 0 100 12 | 8 0 100 13 | 8 0 100 14 | 9 0 100 15 | 9 0 100 16 | 9 0 100 17 | 9 0 100 18 | 9 0 100 19 | 9 0 100 20 | 9 0 100 21 | 9 0 100 22 | 9 0 100 23 | 7 0 100 24 | 5 0 100 25 | 1 0 100 26 | 2 0 100 27 | 3 0 100 28 | 4 0 100 29 | 6 0 100 30 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/data/features/ce_dellacherie.dat: -------------------------------------------------------------------------------- 1 | 0 2 | -1 3 | 6 4 | 1 0 100 5 | 2 0 100 6 | 3 0 100 7 | 4 0 100 8 | 5 0 100 9 | 6 0 100 10 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/data/features/ce_du.dat: -------------------------------------------------------------------------------- 1 | 0 2 | -1 3 | 8 4 | 1 0 100 5 | 2 0 100 6 | 3 0 100 7 | 4 0 100 8 | 
5 0 100 9 | 6 0 100 10 | -1 0 100 11 | -2 0 100 12 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/data/features/dellacherie_initial.dat: -------------------------------------------------------------------------------- 1 | 0 2 | -1 3 | 6 4 | 1 -1 5 | 2 1 6 | 3 -1 7 | 4 -1 8 | 5 -4 9 | 6 -1 10 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/data/features/dellacherie_ourwellsums.dat: -------------------------------------------------------------------------------- 1 | 0 2 | -1 3 | 6 4 | 1 -1 5 | 2 1 6 | 3 -1 7 | 4 -1 8 | 5 -4 9 | -6 -1 10 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/data/features/record_bdu.dat: -------------------------------------------------------------------------------- 1 | 0 2 | -1 3 | 28 4 | 8 -1.318725e+01 5 | 8 -7.652075e+00 6 | 8 7.622007e+00 7 | 8 -9.044294e+00 8 | 8 -1.747279e+00 9 | 8 -7.569865e+00 10 | 8 -5.221819e+00 11 | 8 -6.874555e-01 12 | 8 -6.483100e+00 13 | 8 -1.601383e+01 14 | 9 2.356909e+00 15 | 9 2.143912e+00 16 | 9 -7.351929e+00 17 | 9 1.558139e+00 18 | 9 4.336980e-01 19 | 9 3.655922e+00 20 | 9 -4.673708e+00 21 | 9 -1.997187e+00 22 | 9 9.122675e+00 23 | 7 1.236382e+01 24 | 5 -4.064000e+01 25 | 1 -4.327153e+01 26 | 2 -1.036039e+00 27 | 3 -3.693346e+01 28 | 4 -8.100232e+01 29 | 6 -4.117906e+01 30 | -1 -1.327585e+01 31 | -2 -7.547524e+01 32 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/data/features/record_du.dat: -------------------------------------------------------------------------------- 1 | 0 2 | -1 3 | 8 4 | 1 -1.262900e+01 5 | 2 6.601974e+00 6 | 3 -9.215815e+00 7 | 4 -1.977356e+01 8 | 5 -1.308335e+01 9 | 6 -1.048747e+01 10 | -1 -1.611863e+00 11 | -2 -2.404087e+01 12 | -------------------------------------------------------------------------------- 
/pyrl/environments/mdptetris/data/features/value_estimator_bertsekas.dat: -------------------------------------------------------------------------------- 1 | 1 2 | 0 3 | 22 4 | 0 1.882163 5 | 8 0.157276 6 | 8 0.027709 7 | 8 0.073735 8 | 8 0.068846 9 | 8 0.076944 10 | 8 0.077298 11 | 8 0.071355 12 | 8 0.074865 13 | 8 0.026533 14 | 8 0.155844 15 | 9 -0.109454 16 | 9 -0.084454 17 | 9 -0.084207 18 | 9 -0.080510 19 | 9 -0.079441 20 | 9 -0.079068 21 | 9 -0.082915 22 | 9 -0.083773 23 | 9 -0.111844 24 | 7 -0.064443 25 | 5 -0.630502 26 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/data/pieces3.dat: -------------------------------------------------------------------------------- 1 | ########################################################################## 2 | # File format describing the pieces 3 | # number of pieces 4 | # for each piece: 5 | # number of orientations, height, width (for the first orientation) 6 | # piece (with "X") 7 | ########################################################################## 8 | # 9 | # Pieces with 3 bricks 10 | # There are 2 pieces: 11 | 2 12 | # 13 | 4 2 2 14 | XX 15 | X 16 | 2 1 3 17 | XXX 18 | # End of File -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/data/pieces4.dat: -------------------------------------------------------------------------------- 1 | ########################################################################## 2 | # File format describing the pieces 3 | # number of pieces 4 | # for each piece: 5 | # number of orientations, height, width (for the first orientation) 6 | # piece (with "X") 7 | ########################################################################## 8 | # 9 | # Pieces with 4 bricks (standard pieces) 10 | # There are 7 pieces: 11 | 7 12 | # 13 | 2 4 1 14 | X 15 | X 16 | X 17 | X 18 | 1 2 2 19 | XX 20 | XX 21 | 4 3 2 22 | X 23 | XX 24 | X 25 | 2 3 2 26 | X 27 | XX 28 | X 29 | 2 
3 2 30 | X 31 | XX 32 | X 33 | 4 2 3 34 | X 35 | XXX 36 | 4 2 3 37 | XXX 38 | X 39 | # End of File -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/data/pieces_melax.dat: -------------------------------------------------------------------------------- 1 | ########################################################################## 2 | # File format describing the pieces 3 | # number of pieces 4 | # for each piece: 5 | # number of orientations, height, width (for the first orientation) 6 | # piece (with "X") 7 | ########################################################################## 8 | # 9 | # Melax's Reduced set of pieces 10 | # There are 5 pieces: 11 | 5 12 | # 13 | 1 1 1 14 | X 15 | 2 1 2 16 | XX 17 | 2 2 2 18 | X 19 | X 20 | 4 2 2 21 | X 22 | XX 23 | 1 2 2 24 | XX 25 | XX 26 | # End of File -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/src/board.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @defgroup board Board 3 | * @ingroup api 4 | * @brief The Tetris board 5 | * 6 | * This module handles the game board. The board is composed by an array of rows. 7 | * Each row is represented by a 16-bit integer. 8 | * 9 | * @{ 10 | */ 11 | #ifndef BOARD_H 12 | #define BOARD_H 13 | 14 | #include 15 | #include "types.h" 16 | #include "piece.h" 17 | #include "last_move_info.h" 18 | 19 | /** 20 | * @brief The game board. 21 | * 22 | * The rows are numeroted from bottom to top, starting with 0. 23 | * The columns are numeroted from left to right, starting with 1 (actually column 0 is the left border). 24 | */ 25 | struct Board { 26 | /** 27 | * @name Board structure and content 28 | */ 29 | int width; /**< Number of columns in the board, not including the side borders (10 in standard Tetris). */ 30 | int height; /**< Number of rows in the board (20 in standard Tetris). 
*/ 31 | int extended_height; /**< Number of rows in the internal representation of board (24 in standard Tetris). */ 32 | int allow_lines_after_overflow; /**< enable the lines completion when the piece overflows? */ 33 | uint16_t *rows; /**< Board state: array of rows where each row is represented with an integer. */ 34 | 35 | /** 36 | * @name Information stored to improve the speed 37 | */ 38 | int wall_height; /**< Current height of the wall (index of the lowest empty row). */ 39 | int max_piece_height; /**< Maximum height of a piece (4 for standard Tetris), used to know how many 40 | lines we have to check when a piece is dropped. */ 41 | int *column_heights; /**< Height of each column. */ 42 | 43 | /** 44 | * @name Bit masks depending on the board size 45 | */ 46 | uint16_t empty_row; /**< 16-bit integer representing an empty row (for the standard board size: 1000000000011111). */ 47 | uint16_t full_row; /**< 16-bit integer representing a full row (for the standard board size: 1111111111111111). */ 48 | 49 | /** 50 | * @name Information needed to cancel the last move. 51 | */ 52 | uint16_t *previous_rows; /**< The board state before the last move. */ 53 | int previous_wall_height; /**< The wall height (index of the first empty row) before the last move. */ 54 | }; 55 | 56 | /** 57 | * @name Board creation and destruction 58 | * 59 | * These functions allow to create or destroy a board. 60 | * 61 | * @{ 62 | */ 63 | Board *new_board(int width, int height, int allow_lines_after_overflow, int nb_pieces, Piece *pieces); 64 | Board *new_board_copy(const Board *board); 65 | void free_board(Board *board); 66 | /** 67 | * @} 68 | */ 69 | 70 | 71 | /** 72 | * @name Actions on the board 73 | * 74 | * These functions change the board state. 75 | * You should not change the content of a board's structure directly. 
76 | * 77 | * @{ 78 | */ 79 | int board_drop_piece(Board *board, PieceOrientation *oriented_piece, int orientation, int column, LastMoveInfo *last_move_info, int cancellable); 80 | int board_drop_piece_fancy(Board *board, PieceOrientation *oriented_piece, int orientation, int column,LastMoveInfo *last_move_info, int cancellable, int **fancy_board); 81 | int board_drop_piece_rlc(Board *board, Piece *pieces, int piece_index, int desired_orientation, int desired_column,LastMoveInfo *last_move_info, int cancellable); 82 | void board_cancel_last_move(Board *board); 83 | void board_reset(Board *board); 84 | /** 85 | * @} 86 | */ 87 | 88 | /** 89 | * @name Additional information about the board 90 | * 91 | * The following functions provide some other information about the board. 92 | * For performance reasons, these information are computed only 93 | * if you ask it explicitly. 94 | * 95 | * @{ 96 | */ 97 | void board_update_column_heights(Board *board); 98 | int board_get_column_height(Board *board, int column); 99 | /** 100 | * @} 101 | */ 102 | 103 | /** 104 | * @name Displaying 105 | * @{ 106 | */ 107 | void board_print(FILE *out, Board *board); 108 | /** 109 | * @} 110 | */ 111 | 112 | #endif 113 | 114 | /** 115 | * @} 116 | */ 117 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/src/brick_masks.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Definition of the bit masks declared in brick_masks.h 3 | */ 4 | 5 | #include "config.h" 6 | #include "brick_masks.h" 7 | 8 | const uint16_t brick_masks[] = { 9 | 0x8000, /* X............... */ 10 | 0x4000, /* .X.............. */ 11 | 0x2000, /* ..X............. */ 12 | 0x1000, /* etc */ 13 | 0x0800, 14 | 0x0400, 15 | 0x0200, 16 | 0x0100, 17 | 0x0080, 18 | 0x0040, 19 | 0x0020, 20 | 0x0010, 21 | 0x0008, 22 | 0x0004, 23 | 0x0002, /* ..............X. 
*/ 24 | 0x0001 /* ...............X */ 25 | }; 26 | 27 | const uint16_t brick_masks_inv[] = { 28 | 0x7FFF, /* .XXXXXXXXXXXXXXX */ 29 | ~0x4000, /* X.XXXXXXXXXXXXXX */ 30 | ~0x2000, /* XX.XXXXXXXXXXXXX */ 31 | ~0x1000, /* etc */ 32 | ~0x0800, 33 | ~0x0400, 34 | ~0x0200, 35 | ~0x0100, 36 | ~0x0080, 37 | ~0x0040, 38 | ~0x0020, 39 | ~0x0010, 40 | ~0x0008, 41 | ~0x0004, 42 | ~0x0002, /* XXXXXXXXXXXXXX.X */ 43 | ~0x0001 /* XXXXXXXXXXXXXXX. */ 44 | }; 45 | 46 | /** 47 | * Prints the 16 bits of a row into a file. 48 | * @param out the file to write 49 | * @param row the row 50 | */ 51 | void print_row(FILE *out, uint16_t row) { 52 | int i; 53 | for (i = 0; i < 16; i++) { 54 | if (row & brick_masks[i]) { 55 | fprintf(out, "X"); 56 | } 57 | else { 58 | fprintf(out, "."); 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/src/brick_masks.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Bit masks to store the board state and the shape of the pieces. 3 | */ 4 | 5 | #ifndef BRICK_MASKS_H 6 | #define BRICK_MASKS_H 7 | 8 | #include 9 | #include 10 | 11 | /** 12 | * Bit masks to represent the bricks on a row and the shape of each piece. 13 | * With this representation, a row state is stored on a single 16-bit integer. 14 | * A row size must not exceed 16 cells. 15 | * There is 12 cells in the row of a standard Tetris game (including the 2 side borders). 16 | * These bit fields are also used to represent the shape of the pieces. 
17 | */ 18 | extern const uint16_t brick_masks[]; 19 | extern const uint16_t brick_masks_inv[]; 20 | 21 | void print_row(FILE *out, uint16_t row); 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/src/common_parameters.h: -------------------------------------------------------------------------------- 1 | #ifndef COMMON_PARAMETERS_H 2 | #define COMMON_PARAMETERS_H 3 | 4 | #include "types.h" 5 | #include "rewards.h" 6 | #include "macros.h" 7 | #include "file_tools.h" 8 | 9 | #define MAX_LENGTH 256 10 | 11 | /** 12 | * This structure defines the parameters used in the LPI and CE algorithms. 13 | */ 14 | struct CommonParameters { 15 | 16 | /* Parameters common to algorithms and tetris */ 17 | 18 | int board_width; /* board width */ 19 | int board_height; /* board height */ 20 | 21 | int tetris_implementation; /* 0: Simplified Tetris, 1: RLC2008 Tetris, 2(to be done): Original Tetris */ 22 | 23 | int allow_lines_after_overflow; /* enable the lines completion when the piece overflows? 
*/ 24 | char piece_file_name[MAX_FILE_NAME]; /* file defining the pieces */ 25 | unsigned int random_generator_seed; /* seed of the random number generator */ 26 | 27 | /* Parameters which are common to LPI and CE algorithms */ 28 | 29 | RewardDescription reward_description; /* reward function */ 30 | }; 31 | 32 | 33 | void set_default_reward_function(RewardFunctionID reward_function_id); 34 | 35 | void load_default_parameters(CommonParameters *parameters); 36 | 37 | void ask_common_parameters(CommonParameters *parameters); 38 | 39 | int parse_common_parameter(CommonParameters *parameters, int nb_args, char **args, void (*print_usage)(void)); 40 | 41 | void parameters_assert(int assertion, const char *error_message, void (*print_usage)(void)); 42 | 43 | void common_parameters_print_usage(void); 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/src/config.h: -------------------------------------------------------------------------------- 1 | /* config.h. Generated from config.h.in by configure. */ 2 | /* config.h.in. Generated from configure.ac by autoheader. */ 3 | 4 | /* Define to 1 if you have the header file. */ 5 | #define HAVE_INTTYPES_H 1 6 | 7 | /* Define to 1 if you have the `m' library (-lm). */ 8 | #define HAVE_LIBM 1 9 | 10 | /* Define to 1 if you have the `z' library (-lz). */ 11 | #define HAVE_LIBZ 1 12 | 13 | /* Define to 1 if you have the header file. */ 14 | #define HAVE_MEMORY_H 1 15 | 16 | /* Define to 1 if you have the `pow' function. */ 17 | #define HAVE_POW 1 18 | 19 | /* Define to 1 if you have the header file. */ 20 | #define HAVE_SIGNAL_H 1 21 | 22 | /* Define to 1 if you have the `sqrt' function. */ 23 | #define HAVE_SQRT 1 24 | 25 | /* Define to 1 if you have the header file. */ 26 | #define HAVE_STDINT_H 1 27 | 28 | /* Define to 1 if you have the header file. */ 29 | #define HAVE_STDLIB_H 1 30 | 31 | /* Define to 1 if you have the header file. 
*/ 32 | #define HAVE_STRINGS_H 1 33 | 34 | /* Define to 1 if you have the header file. */ 35 | #define HAVE_STRING_H 1 36 | 37 | /* Define to 1 if you have the header file. */ 38 | #define HAVE_SYS_STAT_H 1 39 | 40 | /* Define to 1 if you have the header file. */ 41 | #define HAVE_SYS_TYPES_H 1 42 | 43 | /* Define to 1 if you have the header file. */ 44 | #define HAVE_UNISTD_H 1 45 | 46 | /* Name of package */ 47 | #define PACKAGE "mdptetris" 48 | 49 | /* Define to the address where bug reports for this package should be sent. */ 50 | #define PACKAGE_BUGREPORT "christophe.thiery@loria.fr" 51 | 52 | /* Define to the full name of this package. */ 53 | #define PACKAGE_NAME "mdptetris" 54 | 55 | /* Define to the full name and version of this package. */ 56 | #define PACKAGE_STRING "mdptetris 1.4" 57 | 58 | /* Define to the one symbol short name of this package. */ 59 | #define PACKAGE_TARNAME "mdptetris" 60 | 61 | /* Define to the home page for this package. */ 62 | #define PACKAGE_URL "" 63 | 64 | /* Define to the version of this package. */ 65 | #define PACKAGE_VERSION "1.4" 66 | 67 | /* Define to 1 if you have the ANSI C header files. */ 68 | #define STDC_HEADERS 1 69 | 70 | /* Version number of package */ 71 | #define VERSION "1.4" 72 | 73 | /* Define for Solaris 2.5.1 so the uint32_t typedef from , 74 | , or is not used. If the typedef were allowed, the 75 | #define below would cause a syntax error. */ 76 | /* #undef _UINT32_T */ 77 | 78 | /* Define to empty if `const' does not conform to ANSI C. */ 79 | /* #undef const */ 80 | 81 | /* Define to `unsigned int' if does not define. */ 82 | /* #undef size_t */ 83 | 84 | /* Define to the type of an unsigned integer type of width exactly 16 bits if 85 | such a type exists and the standard includes do not define it. */ 86 | /* #undef uint16_t */ 87 | 88 | /* Define to the type of an unsigned integer type of width exactly 32 bits if 89 | such a type exists and the standard includes do not define it. 
*/ 90 | /* #undef uint32_t */ 91 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/src/feature_functions.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @defgroup feature_functions Feature functions 3 | * @ingroup api 4 | * @brief Definition of the features functions. 5 | * 6 | * This module contains the definition of all feature functions. 7 | * 8 | * @see feature_policy 9 | * @{ 10 | */ 11 | #ifndef FEATURE_FUNCTIONS_H 12 | #define FEATURE_FUNCTIONS_H 13 | 14 | #include "types.h" 15 | #include "feature_policy.h" 16 | 17 | /** 18 | * @name General feature function handling 19 | * @{ 20 | */ 21 | FeatureFunction *feature_function(FeatureID feature_id); 22 | void features_initialize(const FeaturePolicy *feature_policy); 23 | void features_exit(void); 24 | /** 25 | * @} 26 | */ 27 | 28 | /** 29 | * @name Special feature functions 30 | * @{ 31 | */ 32 | double get_constant(Game *game); 33 | /** 34 | * @} 35 | */ 36 | 37 | /** 38 | * @name Original feature functions 39 | */ 40 | double get_hole_depths(Game *game); 41 | double get_rows_with_holes(Game *game); 42 | double get_next_wall_height(Game *game); 43 | double get_surrounded_holes(Game *game); 44 | double get_next_local_value_function(Game *game); 45 | double get_well_sums_fast(Game *game); 46 | 47 | double get_wall_distance_to_top(Game *game); 48 | double get_next_column_distance_to_top(Game *game); 49 | double get_next_column_height_difference2(Game *game); 50 | 51 | double get_wall_distance_to_top_square(Game *game); 52 | double get_hole_depths_square(Game *game); 53 | double get_height_square(Game *game); 54 | double get_next_column_height2(Game *game); 55 | 56 | double get_diversity(Game *game); 57 | 58 | /** 59 | * @} 60 | */ 61 | 62 | /** 63 | * @name Feature functions from Dellacherie (2003) 64 | * @{ 65 | */ 66 | double get_landing_height(Game *game); 67 | double get_eroded_piece_cells(Game 
*game); 68 | double get_row_transitions(Game *game); 69 | double get_column_transitions(Game *game); 70 | double get_holes(Game *game); 71 | double get_well_sums_dellacherie(Game *game); 72 | /** 73 | * @} 74 | */ 75 | 76 | /** 77 | * @name Feature functions from Bertsekas and Ioffe (1996) 78 | */ 79 | double get_wall_height(Game *game); 80 | double get_next_column_height(Game *game); 81 | double get_next_column_height_difference(Game *game); 82 | /** 83 | * @} 84 | */ 85 | 86 | /** 87 | * @name Feature functions from Fahey (2006) 88 | */ 89 | double get_occupied_cells(Game *game); 90 | double get_weighted_cells(Game *game); 91 | double get_wells(Game *game); 92 | double get_rows_eliminated(Game *game); 93 | /** 94 | * @} 95 | */ 96 | 97 | /** 98 | * @name Feature functions from Bohm et al. (2005) 99 | */ 100 | double get_max_height_difference(Game *game); 101 | /** 102 | * @} 103 | */ 104 | 105 | #endif 106 | 107 | /** 108 | * @} 109 | */ 110 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/src/file_tools.c: -------------------------------------------------------------------------------- 1 | #include "config.h" 2 | #include "file_tools.h" 3 | #include "macros.h" 4 | 5 | /** 6 | * @brief Opens a file in the current directory or in the MdpTetris data directory. 7 | * 8 | * The current directory is used for the user-defined data files. The data directory 9 | * contains data files installed with the program. 10 | * 11 | * First, the function tries to open the file in the current directory. If this file 12 | * doesn't exist, then the function tries to open it in the data directory of MdpTetris 13 | * (e.g. \c /usr/local/share/mdptetris). 14 | * 15 | * You should use this function instead of \c fopen to open any data file (provided 16 | * with the application of user-defined). You don't have to care about the directories. 
17 | * 18 | * Note that the data directory contains read-only files installed with MdpTetris, 19 | * so this function will not try to open a file in the data directory if the fopen mode 20 | * is not \c "r". 21 | * 22 | * @param file_name name of the file to open 23 | * @param fopen_mode mode to give to the \c fopen call 24 | * @return the file, or NULL if it couldn't be open. 25 | */ 26 | FILE *open_data_file(const char *file_name, const char *fopen_mode) { 27 | 28 | FILE *f; 29 | char file_name_in_datadir[MAX_FILE_NAME]; 30 | 31 | /* first, open the file in the current directory */ 32 | f = fopen(file_name, fopen_mode); 33 | 34 | if (f == NULL && fopen_mode[0] == 'r') { 35 | /* open the file in the data directory */ 36 | sprintf(file_name_in_datadir, "%s/%s", DATADIR, file_name); 37 | f = fopen(file_name_in_datadir, fopen_mode); 38 | } 39 | 40 | return f; 41 | } 42 | 43 | /** 44 | * @brief Reads the next non-comment line of a file. 45 | * 46 | * A line is considered as a comment if the first character is '#'. 47 | * 48 | * @param f the file to read 49 | * @param line pointer to store the characters that will be read 50 | * @param line_size maximum number of characters to read on the line 51 | * @return 1 if the a line was successfuly read, 0 if the end of the file 52 | * was reached. 53 | */ 54 | int readline_skipcomments(FILE *f, char *line, int line_size) { 55 | 56 | do { 57 | if (fgets(line, line_size, f) == NULL) { 58 | return 0; 59 | } 60 | 61 | } while (line[0] == '#'); /* skip comments */ 62 | 63 | return 1; 64 | } 65 | 66 | /** 67 | * @brief Displays a message explaining an error occured when parsing a file 68 | * and exits the program. 
69 | * @param file_name name of the file 70 | * @param expected a string describing what was expected 71 | * @param readed a string describing what was read instead of what was expected 72 | */ 73 | void problem_reading_file(const char *file_name, const char *expected, const char* readed) { 74 | DIE3("Problem reading file '%s': expected '%s' and readed '%s'\n", file_name, expected, readed); 75 | } 76 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/src/file_tools.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @defgroup file_tools File tools 3 | * @ingroup api 4 | * @brief Basic file parsing functions 5 | * 6 | * This module provides some utility functions to analyse a file. 7 | * 8 | * @{ 9 | */ 10 | #ifndef FILE_TOOLS_H 11 | #define FILE_TOOLS_H 12 | 13 | #include 14 | 15 | /** 16 | * @brief Maximum number or characters allowed in a file name. 17 | */ 18 | #define MAX_FILE_NAME 256 19 | 20 | /** 21 | * @brief Constant string containing the name of the MdpTetris data directory. 22 | */ 23 | #define DATADIR STRING(DATADIR_) 24 | 25 | 26 | FILE *open_data_file(const char *file_name, const char *fopen_mode); 27 | int readline_skipcomments(FILE *f, char *line, int line_size); 28 | void problem_reading_file(const char *file_name, const char *expected, const char* readed); 29 | 30 | #endif 31 | 32 | /** 33 | * @} 34 | */ 35 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/src/game.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @defgroup game Game 3 | * @ingroup api 4 | * @brief A tetris game 5 | * 6 | * This module handles a game of Tetris. A game is composed by the board state, 7 | * the current piece and the score. 
8 | * 9 | * @{ 10 | */ 11 | #ifndef GAME_H 12 | #define GAME_H 13 | 14 | #include 15 | #include "types.h" 16 | #include "board.h" 17 | #include "piece.h" 18 | #include "last_move_info.h" 19 | 20 | /** 21 | * @brief Action decided by the player. 22 | * 23 | * An action is a decision made by the player in a given game state. 24 | * It is where the player drops the piece and in which orientation. 25 | */ 26 | struct Action { 27 | int orientation; /**< Rotation of the piece, from \c 0 to \c 3. */ 28 | int column; /**< Column where the piece is dropped, from \c 1 to \c board->width. */ 29 | }; 30 | 31 | /** 32 | * @brief Configuration of the game pieces. 33 | * 34 | * This structure describes the set of pieces to use in the game. 35 | * In standard Tetris, there are 7 pieces. 36 | */ 37 | typedef struct PieceConfiguration { 38 | int nb_pieces; /**< Number of existing pieces (7 in standard Tetris). */ 39 | Piece *pieces; /**< Shapes of the existing pieces. */ 40 | int *piece_sequence; /**< The sequence of pieces falling 41 | * (a NULL-terminated array of piece indexes), 42 | * or NULL to choose the pieces randomly. */ 43 | int nb_games; /**< Number of games currently allocated that use this piece configuration. */ 44 | } PieceConfiguration; 45 | 46 | /** 47 | * @brief A game. 48 | * 49 | * The game structure. The structure contains the game 50 | * configuration, the current state and some information 51 | * about the previous state of the game. 52 | */ 53 | struct Game { 54 | 55 | /** 56 | * @name Configuration of the pieces 57 | */ 58 | PieceConfiguration *piece_configuration; /**< The pieces of Tetris. */ 59 | 60 | int tetris_implementation; /**< 0: Simplified Tetris, 1: RLC2008 Tetris, 2(to be done): Original Tetris */ 61 | /** 62 | * @name Game state 63 | */ 64 | Board *board; /**< The wall state. */ 65 | int game_over; /**< 1 if the game is over, 0 otherwise. */ 66 | int score; /**< Number of lines removed since the beginning of the game. 
*/ 67 | Piece *current_piece; /**< The current piece falling. */ 68 | int current_piece_index; /**< Index of the current piece. */ 69 | int current_piece_sequence_index; /**< Current index in the sequence of pieces. */ 70 | 71 | /** 72 | * @name Information about the previous state 73 | */ 74 | int previous_piece_index; /**< The last piece placed. */ 75 | LastMoveInfo last_move_info; /**< Information about the last move. */ 76 | }; 77 | 78 | /** 79 | * @name Game creation and destruction 80 | * 81 | * These functions allow to create or destroy a game. 82 | */ 83 | Game *new_game(int tetris_implementation, int width, int height, int allow_lines_after_overflow, 84 | const char *pieces_file_name, int *piece_sequence); 85 | Game *new_standard_game(); 86 | Game *new_game_from_parameters(CommonParameters *parameters); 87 | Game *new_game_copy(const Game *other); 88 | void free_game(Game *game); 89 | /** 90 | * @} 91 | */ 92 | 93 | /** 94 | * @name Observation functions 95 | * 96 | * These functions provide some information about the current game state. 97 | * You can read directly the content of a game's structure. 98 | * 99 | * @{ 100 | */ 101 | int game_get_nb_possible_orientations(Game *game); 102 | int game_get_nb_possible_columns(Game *game, int orientation); 103 | int game_get_current_piece(Game *game); 104 | int game_get_nb_pieces(Game *game); 105 | /** 106 | * @} 107 | */ 108 | 109 | /** 110 | * @name Modification functions 111 | * 112 | * These function change the current game state. 113 | * You should not change the content of a game's structure directly. 
114 | * 115 | * @{ 116 | */ 117 | int game_drop_piece(Game *game, const Action *action, int cancellable); 118 | void game_cancel_last_move(Game *game); 119 | void game_set_current_piece_index(Game *game, int piece_index); 120 | void game_reset(Game *game); 121 | void generate_next_piece(Game *game); 122 | /** 123 | * @} 124 | */ 125 | 126 | /** 127 | * @name Displaying 128 | * @{ 129 | */ 130 | void game_print(FILE *out, Game *game); 131 | /** 132 | * @} 133 | */ 134 | 135 | #endif 136 | 137 | /** 138 | * @} 139 | */ 140 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/src/games_statistics.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @defgroup game_statistics Game statistics 3 | * @ingroup api 4 | * @brief Statistics about some sequences (episodes) of games played 5 | * 6 | * This module handles the statistics about one or several sequences of games played. 7 | * A seqeuence of games is called an episode. Each episode is saved as a line 8 | * in the statistics file. The line contains the episode number, the best score, 9 | * the worst score, the mean score, the standard deviation and the scores of the 10 | * games played during the episode. Optionnaly, it can also contain the feature weights 11 | * if the games were played with a feature-based policy. 12 | * 13 | * As the first element of each line is the episode number and the second element is 14 | * the episode's mean score, the file can be plotted directly in Gnuplot. 15 | * 16 | * @{ 17 | */ 18 | #ifndef GAMES_STATISTICS_H 19 | #define GAMES_STATISTICS_H 20 | 21 | #include 22 | #include "game.h" 23 | #include "macros.h" 24 | 25 | /** 26 | * @brief Statistics about the episodes. 27 | * 28 | * This structure contains all data about the current episode 29 | * and some global information about all episodes. 
30 | */ 31 | struct GamesStatistics { 32 | /** 33 | * @name Statistics about an episode (reinitialized when games_statistics_reset() is called) 34 | */ 35 | int *scores; /**< Score of each game. */ 36 | int nb_games_played; /**< Number of games to play in the current episode. */ 37 | int min_score; /**< Worst score of a game in this episode. */ 38 | int max_score; /**< Best score of a game in this episode. */ 39 | double mean; /**< Mean score of the games in this episode. */ 40 | double standard_deviation; /**< Standard deviation of the games in this episode. */ 41 | 42 | /** 43 | * @name Information about all episodes played 44 | */ 45 | int nb_episodes; /**< Number of episodes done until now. */ 46 | double best_mean; /**< Best mean score of an episode. */ 47 | FILE *stats_file; /**< The file where the statistics are saved. */ 48 | }; 49 | 50 | /** 51 | * @name Statistics creation and destruction 52 | * 53 | * These functions allow to create or destroy a GameStatistics object. 54 | */ 55 | GamesStatistics *games_statistics_new(const char *stats_file_name, int nb_games, const char *comments); 56 | void games_statistics_free(GamesStatistics *games_statistics); 57 | /** 58 | * @} 59 | */ 60 | 61 | /** 62 | * @name Statistics update 63 | * 64 | * These function update the statistics, taking new information into account. 65 | */ 66 | void games_statistics_add_game(GamesStatistics *stats, int score); 67 | void games_statistics_end_episode(GamesStatistics *games_statistics, const FeaturePolicy *feature_policy); 68 | /** 69 | * @} 70 | */ 71 | 72 | #endif 73 | 74 | /** 75 | * @} 76 | */ 77 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/src/hashtable.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @defgroup hashtable Hashtables 3 | * @ingroup api 4 | * @brief A hashtable implementation 5 | * 6 | * This module a hashtable implementation. 
A hashtable associates 7 | * a void* object to a string. 8 | * 9 | * @{ 10 | */ 11 | #ifndef HASHTABLE_H 12 | #define HASHTABLE_H 13 | 14 | typedef struct Hashtable Hashtable; 15 | 16 | /** 17 | * @name Hashtable creation and destruction 18 | * @{ 19 | */ 20 | Hashtable* hashtable_new(int table_size, void (*free_function)(void *element)); 21 | void hashtable_free(Hashtable *hashtable); 22 | /** 23 | * @} 24 | */ 25 | 26 | /** 27 | * @name Accessing the elements 28 | * @{ 29 | */ 30 | void* hashtable_get(Hashtable *hashtable, const char *key); 31 | void hashtable_add(Hashtable *hashtable, const char *key, void *data); 32 | int hashtable_get_length(Hashtable *hashtable); 33 | int hashtable_contains(Hashtable *hashtable, const char *key); 34 | void hashtable_foreach(Hashtable *hashtable, void (*function)(const char *key, void *data)); 35 | /** 36 | * @} 37 | */ 38 | 39 | /** 40 | * @name Removing elements 41 | * @{ 42 | */ 43 | void hashtable_remove(Hashtable *hashtable, const char *key); 44 | void hashtable_clear(Hashtable *hashtable); 45 | void hashtable_prune(Hashtable *hashtable, int (*should_remove)(const char *key, void *data)); 46 | /** 47 | * @} 48 | */ 49 | 50 | /** 51 | * @name Displaying (for debug) 52 | * @{ 53 | */ 54 | void hashtable_print(Hashtable *hashtable); 55 | /** 56 | * @} 57 | */ 58 | 59 | #endif 60 | /** 61 | * @} 62 | */ 63 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/src/interruptions.c: -------------------------------------------------------------------------------- 1 | #include "config.h" 2 | #include "interruptions.h" 3 | 4 | #ifdef HAVE_SIGNAL_H 5 | 6 | #include 7 | #include 8 | #define __USE_POSIX 9 | #include 10 | 11 | /** 12 | * Number of times the user has pressed Ctrl-C. 13 | */ 14 | static int nb_interruptions = 0; 15 | 16 | /** 17 | * Default interruption handler, saved here so that 18 | * we can restore it. 
19 | */ 20 | static struct sigaction old_sigaction; 21 | 22 | /** 23 | * Function called when the user pressed Ctrl-C. 24 | */ 25 | static void interrupt_handler(int sig); 26 | 27 | /** 28 | * Initializes the interruption handler. 29 | */ 30 | void initialize_interruptions(void) { 31 | struct sigaction new_sigaction; 32 | sigset_t sigset; 33 | 34 | /* set the interruption handler */ 35 | new_sigaction.sa_handler = interrupt_handler; 36 | sigemptyset(&sigset); 37 | new_sigaction.sa_mask = sigset; 38 | new_sigaction.sa_flags = 0; 39 | sigaction(SIGINT, &new_sigaction, &old_sigaction); 40 | } 41 | 42 | /** 43 | * Restores the default interruption handler. 44 | */ 45 | void exit_interruptions(void) { 46 | sigaction(SIGINT, &old_sigaction, NULL); 47 | } 48 | 49 | /** 50 | * Returns whether the user pressed Ctrl-C a first time. 51 | */ 52 | int is_interrupted(void) { 53 | return nb_interruptions; 54 | } 55 | 56 | /** 57 | * Function called when the user pressed Ctrl-C. 58 | */ 59 | static void interrupt_handler(int sig) { 60 | switch (nb_interruptions) { 61 | 62 | case 0: 63 | printf("\nInterruption detected - Finishing the current iteration\nPress Ctrl-C again to exit now\n"); 64 | break; 65 | 66 | case 1: 67 | printf("\n"); 68 | exit(0); 69 | break; 70 | } 71 | 72 | nb_interruptions++; 73 | } 74 | 75 | #else 76 | 77 | /* signal.h is not present: we disable the Ctrl-C system */ 78 | 79 | /** 80 | * @cond 81 | */ 82 | 83 | void initialize_interruptions(void) { 84 | 85 | } 86 | 87 | void exit_interruptions(void) { 88 | 89 | } 90 | 91 | int is_interrupted(void) { 92 | return 0; 93 | } 94 | 95 | /** 96 | * @endcond 97 | */ 98 | 99 | #endif 100 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/src/interruptions.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @defgroup interruptions Interruptions 3 | * @ingroup api 4 | * @brief Ctrl-C signal management 5 | * 6 | * 
This module handles the Ctrl-C signal. 7 | * When the user presses Ctrl-C, a message is displayed. 8 | * Then your algorithm should finish the current iteration and stop. 9 | * If the user presses Ctrl-C again, the program exits. 10 | * 11 | * Note that this module uses the POSIX signals mechanism, and requires 12 | * the \c signal.h header. If \c signal.h is not detected by the \c configure 13 | * script, then the mechanism is disabled: the functions of this module 14 | * do nothing, and when the user presses Ctrl-C, the program is stopped. 15 | * 16 | * @{ 17 | */ 18 | #ifndef INTERRUPTIONS_H 19 | #define INTERRUPTIONS_H 20 | 21 | /** 22 | * @brief Initializes the interruption handler. 23 | * 24 | * Call this function if you want to use the Ctrl-C system. 25 | * This function changes the handler of the SIGINT signal. 26 | * After this function is called, if SIGINT is received (i.e. 27 | * the user has pressed Ctrl-C), a message "Finishing the 28 | * current iteration" is displayed. Then the function 29 | * is_interrupted() returns \c 1 and your algorithm should 30 | * finish its current iteration and stop. 31 | * If the SIGINT signal is received a second time (i.e. after the message 32 | * was displayed but before the current iteration of your algorithm 33 | * is finished), then the program stops. 34 | * 35 | * @see exit_interruptions() 36 | * 37 | */ 38 | void initialize_interruptions(void); 39 | 40 | /** 41 | * @brief Restores the default interruption handler. 42 | * 43 | * Call this function to cancel the behavior created by 44 | * initialize_interruptions(). 45 | * 46 | * @see initialize_interruptions() 47 | */ 48 | void exit_interruptions(void); 49 | 50 | /** 51 | * @brief Returns whether the user pressed Ctrl-C a first time. 52 | * 53 | * initialize_interruptions() should have been called before. 54 | * Your algorithm has to call this function to know when the user wants to 55 | * stop. 
As soon as this function returns 1, your algorithm should finish 56 | * its current iteration, save some data if necessary and then stop. 57 | * 58 | * @return 1 if the user pressed Ctrl-C once, and 0 otherwise. 59 | */ 60 | int is_interrupted(void); 61 | 62 | #endif 63 | /** 64 | * @} 65 | */ 66 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/src/last_move_info.c: -------------------------------------------------------------------------------- 1 | #include "config.h" 2 | #include "last_move_info.h" 3 | 4 | /** 5 | * @brief Prints information about the last move. 6 | * @param out file to write 7 | * @param last_move_info some last move information to print 8 | */ 9 | void print_last_move_info(FILE *out, LastMoveInfo *last_move_info) { 10 | printf("Last move:\n"); 11 | printf(" Removed lines: %d\n", last_move_info->removed_lines); 12 | printf(" Landing height: %d\n", last_move_info->landing_height_bottom); 13 | printf(" Eliminated cells from the last piece : %d\n", last_move_info->eliminated_bricks_in_last_piece); 14 | } 15 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/src/last_move_info.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @defgroup last_move_info Last move info 3 | * @ingroup api 4 | * @brief Information about the last move 5 | * 6 | * This module stores some information about the last move made. 7 | * This information is actually used by some features: indeed, several features 8 | * evaluate the last action instead of the current state itself. 9 | * 10 | * @{ 11 | */ 12 | #ifndef LAST_MOVE_INFO_H 13 | #define LAST_MOVE_INFO_H 14 | 15 | #include 16 | #include "types.h" 17 | 18 | /** 19 | * @brief Information about the last action. 
20 | */ 21 | typedef struct LastMoveInfo { 22 | 23 | /** 24 | * @name Effect of the action on the board 25 | */ 26 | int removed_lines; /**< Number of rows completed during the move 27 | * (also used to cancel a move). */ 28 | int landing_height_bottom; /**< Index of the row where the bottom part 29 | * of the piece is put. */ 30 | int eliminated_bricks_in_last_piece; /**< Number of cells of the last piece put 31 | * that were part of rows completed. */ 32 | 33 | /** 34 | * @name The action made 35 | */ 36 | int column; /**< Column where the last piece was put 37 | * (\c 1 to \c w where \c w is the board size). */ 38 | int orientation; /**< Orientation of the last piece (\c 0 to 39 | * n - 1 where \c n is the number of 40 | * possible orientations of the piece. */ 41 | PieceOrientation *oriented_piece; /* The last piece put. */ 42 | 43 | int nb_steps; /* The number of steps (RLC mode) */ 44 | 45 | } LastMoveInfo; 46 | 47 | /** 48 | * @name Displaying 49 | * @{ 50 | */ 51 | void print_last_move_info(FILE *out, LastMoveInfo *last_move_info); 52 | /** 53 | * @} 54 | */ 55 | 56 | #endif 57 | /** 58 | * @} 59 | */ 60 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/src/piece.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @defgroup piece Pieces 3 | * @ingroup api 4 | * @brief Tetris pieces 5 | * 6 | * This module handles the game pieces. A piece is composed by an array of orientations. 7 | * Each orientation is an array of rows, where each row is represented by a 16-bit integer. 8 | * 9 | * @{ 10 | */ 11 | #ifndef PIECE_H 12 | #define PIECE_H 13 | 14 | #include 15 | #include 16 | #include "types.h" 17 | 18 | /** 19 | * @brief A Tetris piece oriented in a specific direction. 20 | */ 21 | struct PieceOrientation { 22 | int width; /**< Width of the piece in this orientation. */ 23 | int height; /**< Height of the piece in this orientation. 
*/ 24 | uint16_t *bricks; /**< Shape of the piece in this orientation 25 | * (array of size \c height where each element is a 26 | * 16-bit integer representing a row). */ 27 | int *nb_full_cells_on_rows; /**< Number of full cells on each row (array of 28 | * size \c height where each element is the 29 | * number of full cells on a row. */ 30 | }; 31 | 32 | /** 33 | * @brief A Tetris piece with its possible orientations. 34 | */ 35 | struct Piece { 36 | int nb_orientations; /**< Number of possible orientations of the shape. */ 37 | PieceOrientation *orientations; /**< Array of size \c nb_orientations, containing 38 | * the possible orientations of the piece 39 | * (not an array of pointers) */ 40 | }; 41 | 42 | /** 43 | * @name Piece creation and destruction 44 | * 45 | * These functions allow to create or destroy the pieces. 46 | * 47 | * @{ 48 | */ 49 | void load_pieces(const char *file_name, int *nb_pieces, Piece **pieces); 50 | void free_piece(Piece *piece); 51 | /** 52 | * @} 53 | */ 54 | 55 | 56 | /** 57 | * @name Displaying 58 | * 59 | * These functions display human-readable views of a Tetris piece. 60 | */ 61 | void piece_print(FILE *out, Piece *piece); 62 | void piece_print_orientation(FILE *out, PieceOrientation *orientation); 63 | void piece_print_orientations(FILE *out, Piece *piece); 64 | /** 65 | * @} 66 | */ 67 | 68 | #endif 69 | 70 | /** 71 | * @} 72 | */ 73 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/src/random.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "config.h" 4 | #include "random.h" 5 | 6 | /** 7 | * Initializes the GSL random number generator with a specified seed. 8 | */ 9 | void initialize_random_generator(unsigned int seed) { 10 | srand(seed); 11 | } 12 | 13 | /** 14 | * Returns an integer number in [a,b[. 
/**
 * Returns an integer number in [a,b[.
 *
 * Bug fix: the original returned (rand() % (b - a)) - a, which for a > 0
 * yields values in [-a, b - 2*a[ rather than [a, b[. The lower bound must
 * be added, not subtracted.
 */
int random_uniform(int a, int b) {
  return (rand() % (b - a)) + a;
}
(useful for the RL competition) 47 | */ 48 | int get_reward_tetris_are_better(Game *game) { 49 | switch (game->last_move_info.removed_lines) { 50 | case 0: 51 | return 0; 52 | case 1: 53 | return 1; 54 | case 2: 55 | return 3; 56 | case 3: 57 | return 7; 58 | case 4: 59 | return 13; 60 | } 61 | return(0); 62 | } 63 | 64 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/src/rewards.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This module provides some usual reward functions. 3 | */ 4 | 5 | #ifndef REWARD_H 6 | #define REWARD_H 7 | 8 | #include "types.h" 9 | #include "game.h" 10 | 11 | /** 12 | * Function type for a reward function. 13 | */ 14 | typedef int (RewardFunction)(Game *game); 15 | 16 | /** 17 | * Constants to identify the reward functions. 18 | */ 19 | typedef enum { 20 | NO_REWARD, /* always zero */ 21 | REWARD_REMOVED_LINES, /* number of lines removed in the last move */ 22 | REWARD_ONE, /* always 1 (rewards the number of moves before the game is over) */ 23 | REWARD_AT_LEAST_ONE_LINE, /* 1 if there was at least one line removed in the last move */ 24 | REWARD_TETRIS_ARE_BETTER /* 0,1,4,9,15 (for RL competition) */ 25 | } RewardFunctionID; 26 | 27 | /** 28 | * This structure defines a reward function with its ID. 29 | */ 30 | struct RewardDescription { 31 | RewardFunction *reward_function; 32 | RewardFunctionID reward_function_id; 33 | }; 34 | 35 | /** 36 | * Associates a reward function to each index. 37 | */ 38 | extern RewardFunction *all_reward_functions[]; 39 | 40 | 41 | /************************************ 42 | * REWARD FUNCTIONS * 43 | ************************************/ 44 | 45 | /** 46 | * Returns 0. 47 | */ 48 | int get_no_reward(Game *game); 49 | 50 | /** 51 | * Returns the number of lines removed in the last move. 52 | */ 53 | int get_reward_removed_lines(Game *game); 54 | 55 | /** 56 | * Returns 1 if the game is not over. 
/**
 * Returns the state code corresponding to a game
 * without the current piece.
 *
 * The code packs the occupancy of every wall row into one 32-bit integer:
 * iterating from the top of the wall down to row 0, the accumulator is
 * shifted left by WIDTH bits and the payload bits of the next row are
 * ORed in. Row 0 therefore ends up in the lowest WIDTH bits.
 *
 * NOTE(review): brick_masks_inv[0] is presumably the mask selecting the
 * playable cells of a row (defined in brick_masks.c) and WEAK_BITS_SHIFT
 * drops them to the low bits -- confirm against the board layout.
 */
uint32_t get_game_code(Game *game) {
  uint32_t state_code;
  uint16_t *board_rows;
  int i;

  state_code = 0;

  board_rows = game->board->rows;
  /* Top row first, so that lower rows end up in the low-order bits. */
  for (i = game->board->wall_height - 1; i >= 0; i--) {
    state_code = state_code << WIDTH;
    /* Keep only the playable cells of the row, shifted to the low bits. */
    state_code |= (board_rows[i] & brick_masks_inv[0]) >> WEAK_BITS_SHIFT;
  }

  return state_code;
}
/**
 * Sets the game board (not including the current piece)
 * corresponding to a integer code.
 *
 * Inverse of get_game_code(): the low LAST_BITS_MASK bits of the code
 * describe row 0, the next WIDTH bits describe row 1, and so on for
 * HEIGHT rows.
 */
void set_game_state(Game *game, uint32_t state_code) {
  uint16_t *board_rows;
  uint16_t row, empty_row;
  int i;
  int wall_height;

  /* each bit represents the state of a cell */
  board_rows = game->board->rows;
  empty_row = game->board->empty_row;
  for (i = 0; i < HEIGHT; i++) {
    /* NOTE(review): this local is written every iteration but never read;
     * the assignment after the loop uses i, which always equals HEIGHT
     * here. If wall_height was meant to track the highest non-empty row,
     * this looks like a latent bug -- confirm the intended semantics. */
    wall_height = i;
    row = (uint16_t) (state_code & LAST_BITS_MASK);
    if (!row) { /* the row is empty */
      board_rows[i] = empty_row;
    }
    else {
      /* Put the payload bits back into board position, keeping border bits. */
      board_rows[i] = (row << WEAK_BITS_SHIFT) | empty_row;
    }
    state_code = state_code >> WIDTH;
  }
  game->board->wall_height = i;
}
40 | */ 41 | typedef struct ValueIterationParameters { 42 | int nb_pieces; /* number of pieces */ 43 | double gamma; /* discount factor */ 44 | double delta_limit; /* limit to stop the algorithm */ 45 | int iterations; /* current number of iterations */ 46 | int use_buffer; /* 1 to use a buffer */ 47 | 48 | /* files */ 49 | char piece_file_name[MAX_FILE_NAME]; 50 | char delta_file_name[MAX_FILE_NAME]; 51 | } ValueIterationParameters; 52 | 53 | typedef struct OldValueIterationParameters { 54 | int nb_pieces; /* number of pieces */ 55 | double gamma; /* discount factor */ 56 | double delta_limit; /* limit to stop the algorithm */ 57 | int iterations; /* current number of iterations */ 58 | 59 | /* files */ 60 | char piece_file_name[MAX_FILE_NAME]; 61 | char delta_file_name[MAX_FILE_NAME]; 62 | } OldValueIterationParameters; 63 | 64 | uint32_t get_game_code(Game *game); 65 | void set_game_state(Game *game, uint32_t state_code); 66 | 67 | #endif 68 | -------------------------------------------------------------------------------- /pyrl/environments/mdptetris/src/types.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This header declares the main types defined in the modules. 3 | * Each header should include this one first to avoid cycling 4 | * dependencies other headers. 5 | */ 6 | 7 | #ifndef TYPES_H 8 | #define TYPES_H 9 | 10 | typedef struct Game Game; 11 | typedef struct Action Action; 12 | typedef struct Board Board; 13 | typedef struct Piece Piece; 14 | typedef struct PieceOrientation PieceOrientation; 15 | typedef struct RewardDescription RewardDescription; 16 | typedef struct Feature Feature; 17 | typedef struct FeaturePolicy FeaturePolicy; 18 | typedef struct GamesStatistics GamesStatistics; 19 | typedef struct Strategy Strategy; 20 | typedef struct CommonParameters CommonParameters; 21 | 22 | /** 23 | * @brief Function type for a feature. 
@register_environment
class MultiRoomGridworld(gridworld.Gridworld):
    """Continuous gridworld divided into three rooms by unit-thickness walls.

    A horizontal wall at 0.5*size_y (door at 0.9*size_x) separates a bottom
    room from the top half; a vertical wall at 0.3*size_x (door at
    0.75*size_y) splits the top half into left and right rooms.
    """
    name = "Multi-Room Gridworld"

    # All parameters are in units of 1, where 1 is how far on average
    # the agent can move with a single action.
    # The walls will always be of unit thickness and be placed
    # at 0.5*size_y with a door at 0.9*size_x, and
    # above that wall a vertical wall will be placed at 0.3*size_x with a door at 0.75*size_y
    # If the goal falls inside a wall it will be pushed to the nearest non-wall location
    def __init__(self, size_x=10, size_y=10, goal_x=10, goal_y=10, noise=0.0, random_start=False, fudge=1.4143):
        gridworld.Gridworld.__init__(self, size_x=size_x, size_y=size_y, goal_x=goal_x, goal_y=goal_y,
                                     noise=noise, random_start=random_start, fudge=fudge)
        # Build walls and doors (actually might only need to specify the doors)
        #self.wall1 = numpy.array([[0.0, size_y*0.5], [size_x, size_y*0.5]])
        self.door1 = numpy.array([size_x*0.9, size_y*0.5])  # door in the horizontal wall
        #self.wall2 = numpy.array([[size_x*0.3, size_y*0.5], [size_x*0.3, size_y]])
        self.door2 = numpy.array([size_x*0.3, size_y*0.75])  # door in the vertical wall
        self.goal = self.fixPoint(self.goal)
        self.domain_name = "Continuous MultiRoom Gridworld by Will Dabney"

    def fixPoint(self, point):
        """Return point moved to the nearest legal (non-wall) location.

        Points within 0.5 (Chebyshev distance) of either door are left
        unchanged; otherwise the point is clipped into whichever room it
        belongs to, keeping it at least 0.51 away from the walls.
        """
        if numpy.abs(self.door1 - point).max() <= 0.5 or numpy.abs(self.door2 - point).max() <= 0.5:
            return point

        cond1 = point[1] <= self.door1[1]  # below the horizontal wall?
        cond2 = point[0] <= self.door2[0]  # left of the vertical wall?

        if cond1: # Bottom room
            return point.clip([0.0, 0.0], [self.size[0], self.door1[1]-0.51])
        else:
            if cond2: # Top left room
                return point.clip([0.0, self.door1[1]+0.51], [self.door2[0]-0.51, self.size[1]])
            else: # Top right room
                return point.clip([self.door2[0]+0.51, self.door1[1]+0.51], self.size)

    def isPointInWall(self, point):
        """Return True if point lies inside a wall (i.e. fixPoint would move it)."""
        if (self.fixPoint(point) == point).all():
            return False
        else:
            return True

    def reset(self):
        """Reset the agent position: random legal point, or the origin."""
        if self.random_start:
            self.pos = self.fixPoint(numpy.random.random((2,)) * self.size)
        else:
            self.pos[:] = 0.0

    def takeAction(self, action):
        """Step via the parent gridworld, then push the agent out of any wall."""
        reward = gridworld.Gridworld.takeAction(self, action)
        self.pos = self.fixPoint(self.pos)
        return reward
@register_environment
class PuddleWorld(gridworld.Gridworld):
    """Continuous gridworld with Gaussian-shaped puddles that penalize the agent.

    Each puddle is given by a mean (its center) and a flattened 2x2
    covariance matrix; every step adds puddle_penalty scaled by each
    puddle's density at the agent's position.
    """
    name = "Puddle World"
    # NOTE(review): puddle_means/puddle_var are mutable default arguments.
    # They are only read (mapped over), never mutated, so this is currently
    # harmless, but a None-sentinel would be safer.
    def __init__(self, size_x=10, size_y=10, goal_x=10, goal_y=10, puddle_penalty=-100.0,
                 puddle_means=[(0.35, 0.5), (0.5, 0.35)], puddle_var=[(1.2, 1.e-5, 1.e-5, 0.5), (0.5, 1.e-5, 1.e-5, 1.2)],
                 noise=0.0, reward_noise=0.0, random_start=False, fudge=1.4143):

        gridworld.Gridworld.__init__(self, size_x=size_x, size_y=size_y, goal_x=goal_x,
                                     goal_y=goal_y, noise=noise, reward_noise=reward_noise, random_start=random_start, fudge=fudge)
        self.puddle_penalty = puddle_penalty
        self.puddle_means = map(numpy.array, puddle_means)
        # Pre-invert the covariance matrices, since mvnpdf expects inverses.
        self.puddle_var = map(lambda cov: numpy.linalg.inv(numpy.array(cov).reshape((2,2))), puddle_var)
        self.domain_name = "Continuous PuddleWorld"

    def reset(self):
        """Reset the agent position: random point, or the origin."""
        if self.random_start:
            self.pos = numpy.random.random((2,)) * self.size
        else:
            self.pos = numpy.array([0., 0.])

    def takeAction(self, action):
        """Step via the parent gridworld and add the puddle penalties."""
        base_reward = gridworld.Gridworld.takeAction(self, action)
        for mu, inv_cov in zip(self.puddle_means, self.puddle_var):
            base_reward += mvnpdf(self.pos, mu, inv_cov) * self.puddle_penalty
        return base_reward
" + \ 57 | "Where mean specifies the center of the puddle and cov specifies the " + \ 58 | "covariance matrix of the multivariate normal distribution that describes " + \ 59 | "the puddle's depth.") 60 | parser.add_argument("--puddle_penalty", type=float, default=-100, 61 | help="The reward penalty scale for walking through puddles.") 62 | args = parser.parse_args() 63 | kwargs = {} 64 | if args.puddle is not None: 65 | means = [] 66 | covs = [] 67 | for puddle in args.puddle: 68 | means.append(tuple(puddle[:2])) 69 | covs.append(tuple(puddle[2:])) 70 | kwargs['puddle_means'] = means 71 | kwargs['puddle_var'] = covs 72 | 73 | if args.size_x: 74 | kwargs['size_x'] = args.size_x 75 | if args.size_y: 76 | kwargs['size_y'] = args.size_y 77 | if args.goal_x: 78 | kwargs['goal_x'] = args.goal_x 79 | if args.goal_y: 80 | kwargs['goal_y'] = args.goal_y 81 | if args.noise: 82 | kwargs['noise'] = args.noise 83 | if args.fudge: 84 | kwargs['fudge'] = args.fudge 85 | if args.random_restarts: 86 | kwargs['random_start'] = args.random_restarts 87 | 88 | EnvironmentLoader.loadEnvironment(PuddleWorld(**kwargs)) 89 | -------------------------------------------------------------------------------- /pyrl/environments/skeleton_environment.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2007, Mark Lee 3 | # 4 | #http://rl-glue-ext.googlecode.com/ 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@register_environment
class skeleton_environment(Environment):
    """A 21-state chain environment with states {0, 1, ..., 20}.

    The agent starts in state 10. Action 0 decrements the state, action 1
    increments it. The episode ends in state 0 (reward -1) or state 20
    (reward +1); every other step gives reward 0.
    """
    name = "Skeleton environment"

    # Current chain state; re-initialized to 10 at the start of each episode.
    currentState=10

    def env_init(self):
        """Return the RL-Glue task specification string."""
        return "VERSION RL-Glue-3.0 PROBLEMTYPE episodic DISCOUNTFACTOR 1.0 OBSERVATIONS INTS (0 20) ACTIONS INTS (0 1) REWARDS (-1.0 1.0) EXTRA skeleton_environment(Python) by Brian Tanner."

    def env_start(self):
        """Begin an episode in the middle of the chain; return the first observation."""
        self.currentState = 10
        return self._observe()

    def env_step(self, thisAction):
        """Apply the given action and return reward, observation and terminal flag."""
        move = thisAction.intArray[0]
        if move == 0:
            self.currentState = self.currentState - 1
        if move == 1:
            self.currentState = self.currentState + 1

        theReward = 0
        episodeOver = 0
        if self.currentState <= 0:
            self.currentState = 0
            theReward = -1
            episodeOver = 1
        if self.currentState >= 20:
            self.currentState = 20
            theReward = 1
            episodeOver = 1

        result = Reward_observation_terminal()
        result.r = theReward
        result.o = self._observe()
        result.terminal = episodeOver
        return result

    def env_cleanup(self):
        """Nothing to release."""
        pass

    def env_message(self, inMessage):
        """Answer simple text queries from the experiment program."""
        if inMessage == "what is your name?":
            return "my name is skeleton_environment, Python edition!"
        return "I don't know how to respond to your message"

    def _observe(self):
        """Build an Observation carrying the current state."""
        obs = Observation()
        obs.intArray = [self.currentState]
        return obs
@register_environment
class WindyGridworld(gridworld.Gridworld):
    """Continuous gridworld where a wind pushes the agent in the +y direction.

    The push at each step is wind_power times the Gaussian pdf of the
    current x-coordinate (mean wind_center, std wind_stdev), so it is
    strongest near x == wind_center.
    """
    name = "Windy Gridworld"
    # The effect of the wind is always positive in the y dimension, and
    # is equal to the wind_power multiplied with the pdf of the current x-coordinate on a Gaussian distribution
    # with mean wind_center and standard deviation wind_stdev.
    def __init__(self, size_x=10, size_y=10, goal_x=10, goal_y=10, wind_center=7., wind_stdev=1.0, wind_power=2.0, noise=0.0, random_start=False, fudge=1.4143):
        gridworld.Gridworld.__init__(self, size_x=size_x, size_y=size_y, goal_x=goal_x, goal_y=goal_y, noise=noise, random_start=random_start, fudge=fudge)
        self.wind_center = wind_center
        self.wind_stdev = wind_stdev
        self.wind_power = wind_power
        self.domain_name = "Continuous Windy Gridworld by Will Dabney"

    def reset(self):
        """Start at the left edge halfway up, or at a random position."""
        if self.random_start:
            self.pos = numpy.random.random((2,)) * self.size
        else:
            self.pos = numpy.array([0.0, self.size[1]*0.5])

    def takeAction(self, action):
        """Apply the wind displacement, then take a normal gridworld step."""
        self.pos[1] += norm.pdf(self.pos[0], self.wind_center, self.wind_stdev) * self.wind_power
        return gridworld.Gridworld.takeAction(self, action)
mode.') 58 | gridworld.addGridworldArgs(parser) 59 | parser.add_argument("--wind_center", type=float, default=7, help="Center, or strongest point, in the x-direction of the wind") 60 | parser.add_argument("--wind_scale", type=float, default=1.0, help="Scale, or width, of the wind effects around the center.") 61 | parser.add_argument("--wind_power", type=float, default=2.0, help="The power, or strength, of the wind.") 62 | args = parser.parse_args() 63 | EnvironmentLoader.loadEnvironment( 64 | WindyGridworld(args.size_x, args.size_y, args.goal_x, args.goal_y, wind_center=args.wind_center, 65 | wind_stdev=args.wind_scale, wind_power=args.wind_power, noise=args.noise, 66 | random_start=args.random_restarts, fudge=args.fudge)) 67 | -------------------------------------------------------------------------------- /pyrl/experiments/.gitignore: -------------------------------------------------------------------------------- 1 | */*.py[co] 2 | 3 | # Packages 4 | *.egg 5 | *.egg-info 6 | dist 7 | build 8 | eggs 9 | parts 10 | bin 11 | var 12 | sdist 13 | develop-eggs 14 | .installed.cfg 15 | */*~ 16 | # Installer logs 17 | pip-log.txt 18 | 19 | # Unit test / coverage reports 20 | .coverage 21 | .tox 22 | 23 | #Translations 24 | *.mo 25 | 26 | #Mr Developer 27 | .mr.developer.cfg 28 | -------------------------------------------------------------------------------- /pyrl/experiments/README.md: -------------------------------------------------------------------------------- 1 | python-rl.experiments 2 | ========= 3 | 4 | Experiment scripts in python are used to run an agent on an environment in RLGlue. 
@register_experiment
class Episodic(object):
    """Episodic RL-Glue experiment.

    Runs num_runs trials of num_episodes episodes each, recording
    (episode index, steps, runtime, return, termination flag) per episode,
    either printed to stdout or appended to a CSV file.
    """
    name = "Episodic"

    def __init__(self, config, **kwargs):
        # maxsteps: per-episode step cutoff, forwarded to RL_episode.
        self.maxsteps = kwargs.setdefault('maxsteps', 5000)
        self.num_episodes = kwargs.setdefault('num_episodes', 10)
        self.num_runs = kwargs.setdefault('num_runs', 1)
        self.timed = kwargs.setdefault('timed', True)
        self.configuration = config

        # If explicit agent and environment objects are supplied, run
        # in-process via RLGlueLocal; otherwise use the networked RL-Glue.
        if kwargs.has_key('agent') and kwargs.has_key('environment'):
            self.agent = kwargs['agent']
            self.environment = kwargs['environment']
            self.rlglue = RLGlueLocal.LocalGlue(self.environment, self.agent)
        else:
            self.rlglue = rl_glue

    def run_episode(self):
        """Run one episode; return (terminal, total steps, total reward, runtime).

        Returns steps == -1 (with zero reward/runtime) if the agent reports
        that it has diverged, so callers can detect divergence in the log.
        """
        terminal = 0
        runtime = 0
        # Query the agent whether or not it has diverged
        if self.hasAgentDiverged():
            return 0, -1, 0.0, 0.0 # -1 number of steps, signals that divergence.
        if self.timed:
            timer = Timer()
            with timer:
                terminal = self.rlglue.RL_episode(self.maxsteps)
            runtime = timer.duration_in_seconds()
        else:
            terminal = self.rlglue.RL_episode(self.maxsteps)
        totalSteps = self.rlglue.RL_num_steps()
        totalReward = self.rlglue.RL_return()

        return terminal, totalSteps, totalReward, runtime

    def run_trial(self, filename=None):
        """Run num_episodes episodes, printing results or appending them to a CSV file."""
        self.rlglue.RL_init()
        for i in range(self.num_episodes):
            term, steps, reward, runtime = self.run_episode()
            if filename is None:
                print i, steps, runtime, reward, term
            else:
                # Append so that multiple trials accumulate in one file.
                with open(filename, "a") as f:
                    csvwrite = csv.writer(f)
                    csvwrite.writerow([i, steps, runtime, reward, term])
        self.rlglue.RL_cleanup()

    def run_experiment(self, filename=None):
        """Run num_runs independent trials."""
        if filename is None:
            print 'trial, number of steps, runtime, accumulated reward, termination'
        for run in range(self.num_runs):
            self.run_trial(filename=filename)

    def hasAgentDiverged(self):
        """Sends an rl-glue message to the agent asking if it has diverged or not.
        The message is exactly: agent_diverged?
        The expected response is: True (if it has), False (if it has not)
        The responses are not case sensitive, and anything other than true or false
        will be treated as a false (to support agents which do not have this implemented).
        """
        return self.rlglue.RL_agent_message("agent_diverged?").lower() == "true"
43 | tmp_file = "rndtrial" + str(numpy.random.randint(1.e10)) + ".dat" 44 | Episodic.run_experiment(self, filename = tmp_file) 45 | 46 | # Collect results 47 | locs, means, std = plotExperiment.processFile(tmp_file, self.evaluate, verbose=False, method=self.eval_reduce, kmeans_k=self.k) 48 | json_out = copy.deepcopy(self.configuration) 49 | json_out['agent']['params'] = parameters 50 | json_out['experiment']['episodes'] = locs.tolist() 51 | json_out['experiment']['returns'] = means.tolist() 52 | json_out['experiment']['deviations'] = std.tolist() 53 | 54 | if filename is None: 55 | print json.dumps(json_out) 56 | else: 57 | with open(filename, "a") as f: 58 | f.write(json.dumps(json_out) + "\n") 59 | os.remove(tmp_file) 60 | 61 | -------------------------------------------------------------------------------- /pyrl/misc/.gitignore: -------------------------------------------------------------------------------- 1 | */*.py[co] 2 | 3 | # Packages 4 | *.egg 5 | *.egg-info 6 | dist 7 | build 8 | eggs 9 | parts 10 | bin 11 | var 12 | sdist 13 | develop-eggs 14 | .installed.cfg 15 | */*~ 16 | # Installer logs 17 | pip-log.txt 18 | 19 | # Unit test / coverage reports 20 | .coverage 21 | .tox 22 | 23 | #Translations 24 | *.mo 25 | 26 | #Mr Developer 27 | .mr.developer.cfg 28 | -------------------------------------------------------------------------------- /pyrl/misc/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | __all__ = ["matrix", "timer", "json", "parameter"] 3 | 4 | 5 | -------------------------------------------------------------------------------- /pyrl/misc/json.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Cast all the unicode strings recieved from a json load into strings 4 | # http://stackoverflow.com/questions/956867/how-to-get-string-objects-instead-unicode-ones-from-json-in-python 5 | def convert(input): 6 | if isinstance(input, dict): 7 | return 
{convert(key): convert(value) for key, value in input.iteritems()} 8 | elif isinstance(input, list): 9 | return [convert(element) for element in input] 10 | elif isinstance(input, unicode): 11 | return input.encode('utf-8') 12 | else: 13 | return input 14 | -------------------------------------------------------------------------------- /pyrl/misc/matrix.py: -------------------------------------------------------------------------------- 1 | import numpy, itertools, math 2 | 3 | # Compute the value of (A + uv^T)^-1 given A^-1, u, and v. 4 | # Uses the Sherman-Morrison formula 5 | def SMInv(Ainv, u, v, e): 6 | u = u.reshape((len(u),1)) 7 | v = v.reshape((len(v),1)) 8 | if e is not None: 9 | g = numpy.dot(Ainv, u) / (e + numpy.dot(v.T, numpy.dot(Ainv, u))) 10 | return (Ainv / e) - numpy.dot(g, numpy.dot(v.T, Ainv/e)) 11 | else: 12 | return Ainv - numpy.dot(Ainv, numpy.dot(numpy.dot(u,v.T), Ainv)) / ( 1 + numpy.dot(v.T, numpy.dot(Ainv, u))) 13 | 14 | def vector_angle(u, v): 15 | return numpy.arccos(numpy.dot(u,v)/(numpy.linalg.norm(u)*numpy.linalg.norm(v)))*180.0/numpy.pi 16 | 17 | # Modified version of this solution: 18 | # http://stackoverflow.com/questions/11615664/multivariate-normal-density-in-python 19 | # Takes the inverse of the covariance matrix instead of the covariance matrix 20 | def mvnpdf(x, mu, sigma_inv): 21 | size = len(x) 22 | if size == len(mu) and sigma_inv.shape == (size, size): 23 | det = 1./numpy.linalg.det(sigma_inv) 24 | norm_const = 1.0/ ( math.pow((2*numpy.pi),float(size)/2) * math.pow(det,0.5) ) 25 | x_mu = x - mu 26 | result = math.pow(math.e, -0.5 * numpy.dot(x_mu, numpy.dot(sigma_inv, x_mu))) 27 | return norm_const * result 28 | else: 29 | raise NameError("The dimensions of the input don't match") 30 | 31 | -------------------------------------------------------------------------------- /pyrl/misc/timer.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | class Timer(object): 4 | 
"""Context manager to time a block of code. 5 | 6 | From http://stackoverflow.com/a/1685337/1306923 7 | Thanks to Corey Porter! 8 | 9 | """ 10 | def __enter__(self): 11 | self.__start = time.time() 12 | 13 | def __exit__(self, type, value, traceback): 14 | # Error handling here 15 | self.__finish = time.time() 16 | 17 | def duration_in_seconds(self): 18 | return self.__finish - self.__start 19 | -------------------------------------------------------------------------------- /pyrl/rlglue/.gitignore: -------------------------------------------------------------------------------- 1 | */*.py[co] 2 | 3 | # Packages 4 | *.egg 5 | *.egg-info 6 | dist 7 | build 8 | eggs 9 | parts 10 | bin 11 | var 12 | sdist 13 | develop-eggs 14 | .installed.cfg 15 | */*~ 16 | # Installer logs 17 | pip-log.txt 18 | 19 | # Unit test / coverage reports 20 | .coverage 21 | .tox 22 | 23 | #Translations 24 | *.mo 25 | 26 | #Mr Developer 27 | .mr.developer.cfg 28 | -------------------------------------------------------------------------------- /pyrl/rlglue/RLGlueLocal.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2013, Will Dabney 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
#
#
# $Revision: 1 $
# $Date: 2013-01-24 $
# $Author: Will Dabney (amarack) $

from rlglue.agent.Agent import Agent
from rlglue.environment.Environment import Environment

from rlglue.types import Action
from rlglue.types import Observation

from rlglue.types import Observation_action
from rlglue.types import Reward_observation_action_terminal
from rlglue.types import Reward_observation_terminal

# This class provides a seemless way of running python RLGlue experiments locally without
# the use of sockets/network. I have no idea why this was not included in the python codec,
# but I really need this functionality. Maybe it will help you as well.
class LocalGlue:
    """In-process replacement for the RL-Glue network glue.

    Wires one environment and one agent together directly, mirroring the
    RL_* API of rlglue.RLGlue so experiments can run without sockets.
    """
    def __init__(self,theEnvironment,theAgent):
        self.env = theEnvironment
        self.agent = theAgent
        # Action returned by the agent on the previous step; None marks
        # "no episode in progress".
        self.prevact = None
        self.reward_return = 0.0
        self.step_count = 0
        self.episode_count = 0
        # Terminal flag of the last step (1 once the episode has ended).
        self.exitStatus = 0

    def RL_init(self):
        # Initialize environment first: its task spec is handed to the agent.
        taskSpecResponse = self.env.env_init()
        self.agent.agent_init(taskSpecResponse)
        self.prevact = None
        self.reward_return = 0.0
        self.step_count = 0
        self.episode_count = 0
        return taskSpecResponse

    def RL_start(self):
        # Begin a new episode: reset per-episode counters, then get the
        # first observation and the agent's first action.
        self.reward_return = 0.0
        self.step_count = 1
        self.episode_count += 1
        self.prevact = None
        self.exitStatus = 0
        obs = self.env.env_start()
        action = self.agent.agent_start(obs)
        obsact = Observation_action()
        obsact.o = obs
        obsact.a = action
        self.prevact = action
        return obsact

    def RL_step(self):
        # Lazily start an episode if none is in progress.
        if self.prevact is None:
            self.RL_start()
        self.step_count += 1
        rot = self.env.env_step(self.prevact)
        roat = Reward_observation_action_terminal()
        roat.terminal = rot.terminal
        self.exitStatus = rot.terminal

        if rot.terminal == 1:
            # Terminal transition: notify the agent and clear prevact so the
            # next RL_step starts a fresh episode.
            self.agent.agent_end(rot.r)
            roat.a = self.prevact
            self.prevact = None
        else:
            self.prevact = self.agent.agent_step(rot.r, rot.o)
            roat.a = self.prevact

        self.reward_return += rot.r
        roat.r = rot.r
        roat.o = rot.o
        return roat

    def RL_cleanup(self):
        self.env.env_cleanup()
        self.agent.agent_cleanup()

    def RL_agent_message(self, message):
        # NOTE(review): `is None` would be the idiomatic test here.
        if message == None:
            message=""
        return self.agent.agent_message(message)

    def RL_env_message(self, message):
        if message == None:
            message=""
        return self.env.env_message(message)

    def RL_return(self):
        # Accumulated reward of the current (or just-finished) episode.
        return self.reward_return

    def RL_num_steps(self):
        return self.step_count

    def RL_num_episodes(self):
        return self.episode_count

    def RL_episode(self, num_steps):
        # Run one episode to termination or until num_steps steps.
        self.RL_start()
        while self.exitStatus != 1:
            # If num_steps is zero (or less) then treat as unlimited
            if (num_steps > 0) and self.step_count >= num_steps:
                break
            roat = self.RL_step()
            self.exitStatus = roat.terminal
        # 1 when the episode terminated naturally, 0 when the step limit hit.
        return self.exitStatus

-------------------------------------------------------------------------------- /pyrl/rlglue/TaskSpecRLGlue.py: --------------------------------------------------------------------------------

# There didn't appear to be any python class in place in RLGlue to allow you to
# easily create the Task Spec string like you can in java and c++. So this is
# my substitute so that we can be as cool as those languages.
5 | 6 | # VERSION PROBLEMTYPE DISCOUNTFACTOR 7 | # OBSERVATIONS INTS ([times-to-repeat-this-tuple=1] )* DOUBLES 8 | # ([times-to-repeat-this-tuple=1] )* CHARCOUNT ACTIONS INTS 9 | # ([times-to-repeat-this-tuple=1] )* DOUBLES ([times-to-repeat-this-tuple=1] 10 | # )* CHARCOUNT REWARDS ( ) EXTRA 11 | # [extra text of your choice goes here]"; 12 | 13 | class TaskSpec: 14 | def __init__(self, discount_factor=1.0, reward_range=(-1,1)): 15 | self.version = "RL-Glue-3.0" 16 | self.actions = {} 17 | self.observations = {} 18 | self.prob_type = "episodic" 19 | self.disc_factor = discount_factor 20 | self.extras = "" 21 | self.act_charcount = 0 22 | self.obs_charcount = 0 23 | self.reward_range = reward_range 24 | 25 | def toTaskSpec(self): 26 | ts_list = ["VERSION " + self.version, 27 | "PROBLEMTYPE " + self.prob_type, 28 | "DISCOUNTFACTOR " + str(self.disc_factor)] 29 | 30 | # Observations 31 | if len(self.observations.keys()) > 0: 32 | ts_list += ["OBSERVATIONS"] 33 | if self.observations.has_key("INTS"): 34 | ts_list += ["INTS"] + self.observations["INTS"] 35 | if self.observations.has_key("DOUBLES"): 36 | ts_list += ["DOUBLES"] + self.observations["DOUBLES"] 37 | if self.observations.has_key("CHARCOUNT"): 38 | ts_list += ["CHARCOUNT"] + self.observations["CHARCOUNT"] 39 | 40 | # Actions 41 | if len(self.actions.keys()) > 0: 42 | ts_list += ["ACTIONS"] 43 | if self.actions.has_key("INTS"): 44 | ts_list += ["INTS"] + self.actions["INTS"] 45 | if self.actions.has_key("DOUBLES"): 46 | ts_list += ["DOUBLES"] + self.actions["DOUBLES"] 47 | if self.actions.has_key("CHARCOUNT"): 48 | ts_list += ["CHARCOUNT"] + self.actions["CHARCOUNT"] 49 | 50 | ts_list += ["REWARDS", "(" + str(self.reward_range[0]) + " " + str(self.reward_range[1]) + ")"] 51 | if self.extras != "": 52 | ts_list += ["EXTRAS", self.extras] 53 | return ' '.join(ts_list) 54 | 55 | 56 | def addAction(self, dRange, repeat=1, type="INTS"): 57 | rept = "" if repeat<= 1 else str(repeat) + " " 58 | 
self.actions.setdefault(type, []).append("(" + rept + str(dRange[0]) + " " + str(dRange[1]) + ")") 59 | 60 | def addContinuousAction(self, dRange, repeat=1): 61 | self.addAction(dRange, repeat, "DOUBLES") 62 | 63 | def addDiscreteAction(self, dRange, repeat=1): 64 | self.addAction(map(int, dRange), repeat, "INTS") 65 | 66 | def addObservation(self, dRange, repeat=1, type="INTS"): 67 | rept = "" if repeat<= 1 else str(repeat) + " " 68 | self.observations.setdefault(type, []).append("(" + rept + str(dRange[0]) + " " + str(dRange[1]) + ")") 69 | 70 | def addContinuousObservation(self, dRange, repeat=1): 71 | self.addObservation(dRange, repeat, "DOUBLES") 72 | 73 | def addDiscreteObservation(self, dRange, repeat=1): 74 | self.addObservation(map(int, dRange), repeat, "INTS") 75 | 76 | def setActionCharLimit(self, charLimit): 77 | self.actions["CHARCOUNT"] = [str(charLimit)] 78 | 79 | def setObservationCharLimit(self, charLimit): 80 | self.observations["CHARCOUNT"] = [str(charLimit)] 81 | 82 | def setContinuing(self): 83 | self.prob_type = "continuing" 84 | 85 | def setEpisodic(self): 86 | self.prob_type = "episodic" 87 | 88 | def setDiscountFactor(self, factor): 89 | self.disc_factor = factor 90 | 91 | def setExtra(self, strExtra): 92 | self.extras = strExtra 93 | 94 | def setProblemTypeCustom(self, strProbType): 95 | self.prob_type = strProbType 96 | 97 | def setRewardRange(self, low, high): 98 | self.reward_range = (low, high) 99 | 100 | -------------------------------------------------------------------------------- /pyrl/rlglue/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /pyrl/rlglue/registry.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. 
module:: RLGlueRegistry
    :platform: Unix, Windows
    :synopsis: Registry for rl-glue agents, environments and experiments

.. moduleauthor:: Pierre-Luc Bacon

"""

class RLGlueRegistry(object):
    # Maps each registered class's `name` attribute to the class itself,
    # one dict per rl-glue role.
    def __init__(self):
        self.agents = {}
        self.environments = {}
        self.experiments = {}

    def register_agent(self, cls):
        # Returns cls unchanged so this can be used as a class decorator.
        self.agents[cls.name] = cls
        return cls

    def register_environment(self, cls):
        # Returns cls unchanged so this can be used as a class decorator.
        self.environments[cls.name] = cls
        return cls

    def register_experiment(self, cls):
        # Returns cls unchanged so this can be used as a class decorator.
        self.experiments[cls.name] = cls
        return cls

# Module-level singleton registry and decorator aliases used across pyrl.
rlglue_registry = RLGlueRegistry()
register_agent = rlglue_registry.register_agent
register_environment = rlglue_registry.register_environment
register_experiment = rlglue_registry.register_experiment
-------------------------------------------------------------------------------- /pyrl/rlglue/run.py: --------------------------------------------------------------------------------

# Author: Will Dabney
# Author: Pierre-Luc Bacon

# Runs an experiment by starting up rl_glue,
# and letting the user choose from a set of
# agents, environments, and experiments.
8 | 9 | import json 10 | from multiprocessing import Process 11 | from subprocess import Popen 12 | 13 | from pyrl.agents import * 14 | from pyrl.environments import * 15 | from pyrl.experiments import * 16 | from pyrl.rlglue.registry import rlglue_registry 17 | from pyrl.misc.json import convert 18 | 19 | from rlglue.agent import AgentLoader as AgentLoader 20 | from rlglue.environment import EnvironmentLoader as EnvironmentLoader 21 | 22 | 23 | def fromjson(filename): 24 | with open(filename, 'r') as f: 25 | config = json.load(f, object_hook=convert) 26 | 27 | # Process the environment 28 | environment = rlglue_registry.environments[config['environment']['name']] 29 | environment_params = config['environment']['params'] 30 | # Process the agent 31 | agent = rlglue_registry.agents[config['agent']['name']] 32 | agent_params = config['agent']['params'] 33 | # Process the experiment 34 | experiment = rlglue_registry.experiments[config['experiment']['name']] 35 | experiment_params = config['experiment']['params'] 36 | 37 | return agent, agent_params, environment, environment_params, experiment, experiment_params 38 | 39 | def tojson(agent, a_args, env, env_args, exp, exp_args, local=None): 40 | config = {'agent': {'name': agent.name, 'params': a_args}, 41 | 'environment': {'name': env.name, 'params': env_args}, 42 | 'experiment': {'name': exp.name, 'params': exp_args}} 43 | return json.dumps(config) 44 | 45 | def fromuser(): 46 | environment = interactive_choose(rlglue_registry.environments, 47 | "Choose an environment.") 48 | agent = interactive_choose(rlglue_registry.agents, "Choose an agent.") 49 | experiment = interactive_choose(rlglue_registry.experiments, 50 | "Choose an experiment.") 51 | return agent, {}, environment, {}, experiment, {} 52 | 53 | 54 | def interactive_choose(choices, prompt): 55 | print(prompt) 56 | sortkeys = sorted(choices.keys()) 57 | 58 | for ix, a_key in enumerate(sortkeys): 59 | print(" ({:d}): {}".format(ix + 1, a_key)) 60 | 61 | choice = 
None 62 | while choice not in range(1, len(sortkeys) + 1): 63 | choice = raw_input("Enter number (1 - {:d}): ".format( 64 | len(sortkeys))) 65 | try: 66 | choice = int(choice) 67 | except: 68 | pass 69 | 70 | return choices[sortkeys[choice - 1]] 71 | 72 | 73 | def run(agent, a_args, env, env_args, exp, exp_args, local=None, result_file=None): 74 | if local is None: 75 | ans = raw_input("Run locally? [y/n]: ") 76 | if ans.lower() == 'y' or ans.lower() == 'yes': 77 | local = True 78 | else: 79 | local = False 80 | 81 | config = {'agent': {'name': agent.name, 'params': a_args}, 82 | 'environment': {'name': env.name, 'params': env_args}, 83 | 'experiment': {'name': exp.name, 'params': exp_args}} 84 | if local: 85 | experiment = exp(config, agent=agent(**a_args), 86 | environment=env(**env_args), **exp_args) 87 | experiment.run_experiment(filename=result_file) 88 | else: 89 | experiment = exp(config, **exp_args) 90 | # TODO: Figure out if rl_glue is running, don't start it in that case 91 | rlglue_p = Popen('rl_glue') 92 | agent_p = Process(target=AgentLoader.loadAgent, 93 | args=(agent(**a_args),)) 94 | agent_p.start() 95 | env_p = Process(target=EnvironmentLoader.loadEnvironment, 96 | args=(env(**env_args),)) 97 | env_p.start() 98 | experiment.run_experiment(filename=result_file, **a_args) 99 | env_p.terminate() 100 | agent_p.terminate() 101 | rlglue_p.terminate() 102 | 103 | 104 | def addRunExpArgs(parser): 105 | json_group = parser.add_mutually_exclusive_group() 106 | json_group.add_argument("--load", type=str, help="Load an experimental configuration from a JSON file.") 107 | json_group.add_argument("--genjson", action='store_true', help="Generate an experimental configuration JSON file from " + \ 108 | "interactive selections. 
Only generates, does not run.") 109 | group = parser.add_mutually_exclusive_group() 110 | group.add_argument("--local", action='store_true', default="True", help="Run experiment locally") 111 | group.add_argument("--network", action='store_true', help="Run experiment through network sockets") 112 | parser.add_argument("--output", type=str, help="Save the results to a file.") 113 | return parser 114 | 115 | if __name__ == '__main__': 116 | import argparse 117 | parser = argparse.ArgumentParser(description='Run a reinforcement learning experiment. Defaults to interactive experiment.') 118 | addRunExpArgs(parser) 119 | args = parser.parse_args() 120 | 121 | if args.load is None: 122 | config = fromuser() 123 | if args.genjson: 124 | print tojson(*config) 125 | else: 126 | run(*config,local=args.local, result_file=args.output) 127 | else: 128 | config = fromjson(args.load) 129 | run(*config, local=args.local, result_file=args.output) 130 | -------------------------------------------------------------------------------- /pyrl/visualizers/.gitignore: -------------------------------------------------------------------------------- 1 | */*.py[co] 2 | 3 | # Packages 4 | *.egg 5 | *.egg-info 6 | dist 7 | build 8 | eggs 9 | parts 10 | bin 11 | var 12 | sdist 13 | develop-eggs 14 | .installed.cfg 15 | */*~ 16 | # Installer logs 17 | pip-log.txt 18 | 19 | # Unit test / coverage reports 20 | .coverage 21 | .tox 22 | 23 | #Translations 24 | *.mo 25 | 26 | #Mr Developer 27 | .mr.developer.cfg 28 | -------------------------------------------------------------------------------- /pyrl/visualizers/README.md: -------------------------------------------------------------------------------- 1 | pyrl.visualizers 2 | ========= 3 | 4 | Scripts and modules related to visualizing the output from the agents, environments and experiments for the pyrl project. 
5 | -------------------------------------------------------------------------------- /pyrl/visualizers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /pyrl/visualizers/compareParameters.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # plotParameters.py 3 | # Author: Will Dabney 4 | # 5 | # A script to plot the output of the collected results of a randomized 6 | # parameter search experiment. 7 | # 8 | # Using the --parameter index name, argument you can plot a single parameter or 9 | # pair of parameters against their collected performance values. Specifying no 10 | # parameter will result in a low dimensionality embedding of all the parameters 11 | # to be plotted against their values. 12 | # Thus, with one parameter specified we get the usual results graphs used to 13 | # illustrate how an algorithm performs as a parameter varries. With two parameters 14 | # specified we get this same conceptual view but for the interaction of the two 15 | # parameters. And finally with three or more, we get something more interesting 16 | # which shows the overal behavior pattern with respect to parameter change that 17 | # the algorithm exibits (at least on the given domain). 
18 | # 19 | # Example: python -m pyrl.visualizers.plotParameters --file exp.dat --parameter 1 alpha 20 | ################################################################################ 21 | 22 | import numpy 23 | import matplotlib.pyplot as plt 24 | import sys 25 | import argparse 26 | from scipy.interpolate import griddata 27 | 28 | def loadParameterData(filename, param_index): 29 | data = numpy.genfromtxt(filename, delimiter=',')[:,(0,1,param_index)] # Grab the mean, std, and parameter 30 | data = data[numpy.lexsort((data[:,2],)),:] 31 | 32 | if data[:,2].std() <= 1.e-10: 33 | xs = numpy.linspace(0, 1.0) 34 | ys = xs.copy() 35 | ys.fill(data[:,0].mean()) 36 | stdv = xs.copy() 37 | xs.fill(0.0) 38 | return numpy.array([ys, stdv, xs]).T 39 | else: 40 | return data 41 | 42 | 43 | if __name__=="__main__": 44 | import argparse 45 | parser = argparse.ArgumentParser(description='Plot a comparison of algorithms parameter ' + \ 46 | 'exploration for a singe parameter.') 47 | parser.add_argument("--file", type=str, action='append', nargs=3, required=True, 48 | help="Parameter exploration algorithm name, results file, and " + \ 49 | "the index of the parameter to display. Ex: Alg algfile.dat 2") 50 | parser.add_argument("--title", type=str, help="Title for the figure.", 51 | default="Parameter Exploration") 52 | parser.add_argument("--xlabel", type=str, help="Name of parameter being compared, label for x-axis", 53 | default="Parameter") 54 | parser.add_argument("--ylabel", type=str, help="Name of evaluation metric for algorithms. 
" + \ 55 | "This is the label for the y-axis", default="Total Return") 56 | parser.add_argument("--output", type=str, help="Filename to save the resulting figure.") 57 | parser.add_argument("--nobars", action='store_true', default=False, help="Disable plotting of error bars for standard deviations.") 58 | args = parser.parse_args() 59 | 60 | for (name, file, index) in args.file: 61 | data = loadParameterData(file, index) 62 | if not args.nobars: 63 | plt.errorbar(data[:,2], data[:,0], yerr=data[:,1]) 64 | else: 65 | plt.plot(data[:,2], data[:,0]) 66 | plt.hold(True) 67 | 68 | plt.legend(map(lambda k: k[0], args.file), loc='best') 69 | plt.title(args.title) 70 | plt.xlabel(args.xlabel) 71 | plt.ylabel(args.ylabel) 72 | if args.output: 73 | plt.savefig(args.output) 74 | else: 75 | plt.show() 76 | 77 | -------------------------------------------------------------------------------- /scripts/generate_spearmint.sh: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # 3 | # Generate Spearmint configuration/experiment 4 | # to optimize parameters for some pyRL experiment file. 5 | # 6 | # Run with: 7 | # sh scripts/generate_spearmint.sh /path/to/output experimentfile.json 8 | # 9 | # This will create all the necessary files in /path/to/output 10 | # spearmint needs to run the optimization. It will use experimentfile.json 11 | # as the template for the experiment. 12 | # 13 | # Note: Make sure *at least* one parameter for the algorithm 14 | # is *NOT* specified in the experiment json file. This code 15 | # will use whatever values are specified in the json file and 16 | # will *ONLY* optimize the parameters that are not given in the json. 
17 | # 18 | # Then, to run the experiment go to spearmint directory and run: 19 | # python spearmint_sync.py --method=GPEIOptChooser --method-args=noiseless=1 /path/to/output/ 20 | ##################################################################### 21 | 22 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../ && pwd )" 23 | OUTDIR=$1 24 | EXPFILE=$2 25 | 26 | AGENT_NAME=`cat $EXPFILE | tr "\n" " " | sed -e 's/{.*"agent"[ ]*:.*"name"[ ]*:[ ]*"\([^"]*\)".*$/\1/g'` 27 | mkdir $OUTDIR 28 | 29 | cat ${DIR}/scripts/spearmint_template.py | sed -e s:'pyrl_path = "###"':'pyrl_path = "'${DIR}'"':g > ${OUTDIR}/"${AGENT_NAME}.py" 30 | cp $EXPFILE ${OUTDIR}/experiment.json 31 | 32 | python ${DIR}/scripts/spearmint_config.py "${AGENT_NAME}" $EXPFILE > ${OUTDIR}/config.pb 33 | 34 | 35 | -------------------------------------------------------------------------------- /scripts/spearmint_config.py: -------------------------------------------------------------------------------- 1 | 2 | # Author: Will Dabney 3 | 4 | #import csv, os, json, numpy, copy 5 | import numpy, sys 6 | sys.path.append("..") 7 | from pyrl.misc.parameter import * 8 | 9 | def gen_numeric_pbvar(name, size, type, min, max): 10 | config = ["variable {", ' name: "' + name + '"'] 11 | if type is int: 12 | config.append(" type: INT") 13 | else: 14 | config.append(" type: FLOAT") 15 | config.append(" size: " + str(size)) 16 | config.append(" min: " + str(min)) 17 | config.append(" max: " + str(max)) 18 | config += ["}", ""] 19 | return config 20 | 21 | def gen_enum_pbvar(name, size, options): 22 | config = ["variable {", ' name: "' + name + '"'] 23 | config.append(" type: ENUM") 24 | config.append(" size: " + str(size)) 25 | for entry in options: 26 | config.append(' options: "' + str(entry) + '"') 27 | config += ["}", ""] 28 | return config 29 | 30 | def gen_config(agent_name, param_parser, fixed_params): 31 | config_contents = ["language: PYTHON", 'name: "' + agent_name + '"', ""] 32 | 33 | opt_grp = 
get_optimize_group(param_parser) 34 | opt_pnames = set(fixed_params.keys()) 35 | for param in opt_grp._group_actions: 36 | if (param.dest in opt_pnames): 37 | continue 38 | opt_pnames.add(param.dest) 39 | 40 | var_size = param.nargs if param.nargs is not None else 1 41 | if param.type is bool: 42 | config_contents += gen_enum_pbvar(param.dest, var_size, ["true", "false"]) 43 | elif param.choices.__class__ is ValueRange: 44 | config_contents += gen_numeric_pbvar(param.dest, var_size, param.type, 45 | param.choices.min(), param.choices.max()) 46 | else: 47 | config_contents += gen_enum_pbvar(param.dest, var_size, param.choices) 48 | 49 | return config_contents 50 | 51 | if __name__ == "__main__": 52 | from pyrl.agents import * 53 | from pyrl.rlglue.registry import rlglue_registry 54 | from pyrl.rlglue import run 55 | 56 | agent_name = sys.argv[1] 57 | agent = rlglue_registry.agents[agent_name] 58 | param_parser = agent.agent_parameters() 59 | fixed_params = run.fromjson(sys.argv[2])[1] # Grabs agent parameters from experiment file 60 | 61 | # Produce a config.pb file based upon parameter parser 62 | config_contents = gen_config(agent_name, param_parser, fixed_params) 63 | for line in config_contents: 64 | print line 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /scripts/spearmint_template.py: -------------------------------------------------------------------------------- 1 | 2 | # Author: Will Dabney 3 | 4 | import sys 5 | pyrl_path = "###" 6 | sys.path.append(pyrl_path) 7 | 8 | import os, numpy 9 | from pyrl.misc.timer import Timer 10 | from pyrl.rlglue import RLGlueLocal as RLGlueLocal 11 | from pyrl.rlglue.registry import register_experiment 12 | import rlglue.RLGlue as rl_glue 13 | from pyrl.experiments.episodic import Episodic 14 | import pyrl.visualizers.plotExperiment as plotExperiment 15 | from pyrl.misc.parameter import * 16 | from pyrl.rlglue.run import fromjson 17 | 18 | def 
main(job_id, params): 19 | print 'Anything printed here will end up in the output directory for job #:', str(job_id) 20 | parameters = {} 21 | for key in params: 22 | if isinstance(key, unicode): 23 | parameters[key.encode('utf-8')] = params[key] 24 | else: 25 | parameters[key] = params[key] 26 | 27 | for key in parameters: 28 | parameters[key] = map(lambda k: k.encode('utf-8') if isinstance(k, unicode) else k, parameters[key]) 29 | if len(parameters[key]) == 1: 30 | value = parameters[key][0] 31 | try: 32 | value = float(value) 33 | except: 34 | if value.lower() == "false": 35 | value = False 36 | elif value.lower() == "true": 37 | value = True 38 | 39 | parameters[key] = value 40 | print parameters 41 | 42 | my_path = os.path.dirname(os.path.abspath(__file__)) 43 | tmp_file = os.path.join(my_path, "rndtrial" + str(numpy.random.randint(1.e10)) + ".dat") 44 | my_path = os.path.abspath(os.path.join(my_path, "experiment.json")) 45 | agent, a_args, env, env_args, exp, exp_args = fromjson(my_path) 46 | 47 | for key in parameters: 48 | a_args.setdefault(key, parameters[key]) 49 | 50 | config = {'agent': {'name': agent.name, 'params': a_args}, 51 | 'environment': {'name': env.name, 'params': env_args}, 52 | 'experiment': {'name': exp.name, 'params': exp_args}} 53 | 54 | experiment = Episodic(config, agent=agent(**a_args), 55 | environment=env(**env_args), **exp_args) 56 | 57 | # Using this try/except makes debugging in spearmint 1mil times easier 58 | try: 59 | experiment.run_experiment(filename=tmp_file) 60 | except Exception as ex: 61 | import traceback 62 | traceback.print_exc() 63 | 64 | locs, means, std = plotExperiment.processFile(tmp_file, "reward", verbose=False, method="sum") 65 | os.remove(tmp_file) 66 | print "Result:", -means[0] 67 | return -means[0] 68 | 69 | 70 | --------------------------------------------------------------------------------