├── docs ├── .gitkeep ├── reports │ └── .gitkeep └── manuscript │ └── .gitkeep ├── src ├── .gitkeep ├── __pycache__ │ ├── mcts.cpython-35.pyc │ └── network.cpython-35.pyc ├── common │ ├── __pycache__ │ │ ├── putils.cpython-35.pyc │ │ ├── submit.cpython-35.pyc │ │ ├── __init__.cpython-35.pyc │ │ ├── hps_setup.cpython-35.pyc │ │ └── visualize.cpython-35.pyc │ ├── __init__.py │ ├── rl │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-35.pyc │ │ │ ├── atari_copy.cpython-35.pyc │ │ │ └── make_game.cpython-35.pyc │ │ ├── envs │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── chain.cpython-35.pyc │ │ │ │ └── __init__.cpython-35.pyc │ │ │ ├── chicken.py │ │ │ ├── taxi.py │ │ │ ├── chain.py │ │ │ └── grid.py │ │ ├── wrappers │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── atari.cpython-35.pyc │ │ │ │ ├── __init__.cpython-35.pyc │ │ │ │ └── control.cpython-35.pyc │ │ │ ├── doom │ │ │ │ ├── __init__.py │ │ │ │ ├── control.py │ │ │ │ ├── observation_space.py │ │ │ │ ├── custom_game.py │ │ │ │ ├── action_space.py │ │ │ │ └── multi_discrete.py │ │ │ ├── control.py │ │ │ └── atari.py │ │ ├── atari_copy.py │ │ ├── doom_helpers.py │ │ └── make_game.py │ ├── examples │ │ ├── submit_wrapper.py │ │ └── visualize_wrapper.py │ ├── putils.py │ ├── hps_setup.py │ └── submit.py ├── rl │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ ├── doom_setup.cpython-35.pyc │ │ ├── make_game.cpython-35.pyc │ │ ├── policies.cpython-35.pyc │ │ ├── atariwrapper.cpython-35.pyc │ │ └── rewardwrapper.cpython-35.pyc │ ├── envs │ │ ├── __pycache__ │ │ │ ├── chain.cpython-35.pyc │ │ │ ├── taxi.cpython-35.pyc │ │ │ └── toy.cpython-35.pyc │ │ ├── chicken.py │ │ ├── taxi.py │ │ ├── grid.py │ │ └── chain.py │ ├── wrappers │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-35.pyc │ │ │ ├── control.cpython-35.pyc │ │ │ ├── custom_game.cpython-35.pyc │ │ │ ├── action_space.cpython-35.pyc │ │ │ ├── multi_discrete.cpython-35.pyc │ │ │ └── observation_space.cpython-35.pyc │ │ ├── __init__.py │ │ ├── control.py │ │ ├── observation_space.py │ │ ├── custom_game.py │ │ ├── action_space.py │ │ └── multi_discrete.py │ ├── doom_setup.py │ ├── make_game.py │ ├── atariwrapper.py │ ├── rewardwrapper.py │ └── policies.py └── network.py ├── config ├── .gitkeep ├── __pycache__ │ └── hps.cpython-35.pyc └── hps.py ├── requirements.txt ├── CITATION.md ├── .gitignore ├── jobs ├── job_pendulum_final.sh ├── expand_job_call4.sh ├── expand_job_call.sh ├── expand_job_call2.sh ├── expand_job_call3.sh ├── job_Ant.sh ├── job_m.sh ├── expand_jobs_over_games.py ├── job_PM.sh ├── job_PM2.sh ├── job_p.sh ├── job_backup.sh └── job.sh ├── submit.py ├── LICENSE.md ├── README.md ├── visualize.py └── agent.py /docs/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /config/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/reports/.gitkeep: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/manuscript/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/__pycache__/mcts.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/__pycache__/mcts.cpython-35.pyc -------------------------------------------------------------------------------- /config/__pycache__/hps.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/config/__pycache__/hps.cpython-35.pyc -------------------------------------------------------------------------------- /src/__pycache__/network.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/__pycache__/network.cpython-35.pyc -------------------------------------------------------------------------------- /CITATION.md: -------------------------------------------------------------------------------- 1 | Please cite this project as follows: 2 | 3 | Thomas Moerland (2020), A0C cursus. url: github.com/tmoer/cursus 4 | -------------------------------------------------------------------------------- /src/common/__pycache__/putils.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/__pycache__/putils.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/__pycache__/submit.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/__pycache__/submit.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 2 14:48:24 2017 4 | 5 | @author: thomas 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /src/rl/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/__pycache__/doom_setup.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/__pycache__/doom_setup.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/__pycache__/make_game.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/__pycache__/make_game.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/__pycache__/policies.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/__pycache__/policies.cpython-35.pyc 
-------------------------------------------------------------------------------- /src/rl/envs/__pycache__/chain.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/envs/__pycache__/chain.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/envs/__pycache__/taxi.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/envs/__pycache__/taxi.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/envs/__pycache__/toy.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/envs/__pycache__/toy.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 2 14:48:24 2017 4 | 5 | @author: thomas 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /src/common/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/__pycache__/atariwrapper.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/__pycache__/atariwrapper.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/__pycache__/hps_setup.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/__pycache__/hps_setup.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/__pycache__/visualize.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/__pycache__/visualize.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/rl/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 2 14:48:24 2017 4 | 5 | @author: thomas 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /src/common/rl/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/rl/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/__pycache__/rewardwrapper.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/__pycache__/rewardwrapper.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/rl/__pycache__/atari_copy.cpython-35.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/rl/__pycache__/atari_copy.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/rl/__pycache__/make_game.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/rl/__pycache__/make_game.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/rl/envs/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 2 14:48:24 2017 4 | 5 | @author: thomas 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /src/common/rl/envs/__pycache__/chain.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/rl/envs/__pycache__/chain.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/rl/wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 2 14:48:24 2017 4 | 5 | @author: thomas 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /src/rl/wrappers/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/wrappers/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/wrappers/__pycache__/control.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/wrappers/__pycache__/control.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/rl/envs/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/rl/envs/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | from .action_space import * 2 | from .control import * 3 | from .custom_game import * 4 | from .observation_space import * -------------------------------------------------------------------------------- /src/rl/wrappers/__pycache__/custom_game.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/wrappers/__pycache__/custom_game.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/rl/wrappers/__pycache__/atari.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/rl/wrappers/__pycache__/atari.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/wrappers/__pycache__/action_space.cpython-35.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/wrappers/__pycache__/action_space.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/rl/wrappers/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/rl/wrappers/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/rl/wrappers/__pycache__/control.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/rl/wrappers/__pycache__/control.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/rl/wrappers/doom/__init__.py: -------------------------------------------------------------------------------- 1 | from .action_space import * 2 | from .control import * 3 | from .custom_game import * 4 | from .observation_space import * -------------------------------------------------------------------------------- /src/rl/wrappers/__pycache__/multi_discrete.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/wrappers/__pycache__/multi_discrete.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/wrappers/__pycache__/observation_space.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/wrappers/__pycache__/observation_space.cpython-35.pyc -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # exclude compiled binaries and external models 2 | /bin 3 | 4 | # exclude data from source control by default 5 | /data/temp 6 | /data/processed 7 | /data/raw 8 | -------------------------------------------------------------------------------- /jobs/job_pendulum_final.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=1+5+10+25+50+100,item2=c,seq2=0.005+0.05,item3=lr,seq3=0.001+0.0001+0.00001,n_rep=10,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000000,n_eps=50000000,V_decision=max,clip_gradient_norm=1.0,temp=0.1,entropy_l=0.1 3 | -------------------------------------------------------------------------------- /jobs/expand_job_call4.sh: -------------------------------------------------------------------------------- 1 | python3 expand_jobs_over_games.py --job job_pendulum_final.sh --games Pendulum-v0s --hpsetup item1=n_mcts,seq1=1+5+10+25+50+100,item2=c,seq2=0.005+0.05,item3=lr,seq3=0.001+0.0001+0.00001,n_rep=10 --hp bound=beta,n_t=20000000000,n_eps=50000000,V_decision=max,clip_gradient_norm=1.0,temp=0.1,entropy_l=0.1 --slurm_mode short 2 | -------------------------------------------------------------------------------- /jobs/expand_job_call.sh: -------------------------------------------------------------------------------- 1 | python3 expand_jobs_over_games.py --games Pendulum-v0s --hpsetup 
item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max --item1 lr --seq1 0.1 0.01 0.001 0.0001 --item2 loss_type --seq2 Q count --item3 c --seq3 0.05 0.25 1.0 --slurm_mode short 2 | -------------------------------------------------------------------------------- /jobs/expand_job_call2.sh: -------------------------------------------------------------------------------- 1 | python3 expand_jobs_over_games.py --job job_PM2.sh --games Pendulum-v0s MountainCarContinuous-v0 --hpsetup item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max --item1 lr --seq1 0.01 0.001 0.0001 --item2 loss_type --seq2 count --item3 c --seq3 0.005 0.05 --slurm_mode short 2 | -------------------------------------------------------------------------------- /src/rl/wrappers/control.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | __all__ = ['SetPlayingMode'] 4 | 5 | def SetPlayingMode(target_mode): 6 | """ target mode can be 'algo' or 'human' """ 7 | 8 | class SetPlayingModeWrapper(gym.Wrapper): 9 | """ 10 | Doom wrapper to change playing mode 'human' or 'algo' 11 | """ 12 | def __init__(self, env): 13 | super(SetPlayingModeWrapper, self).__init__(env) 14 | if target_mode not in ['algo', 'human']: 15 | raise gym.error.Error('Error - The mode "{}" is not supported. Supported options are "algo" or "human"'.format(target_mode)) 16 | self.unwrapped._mode = target_mode 17 | 18 | return SetPlayingModeWrapper -------------------------------------------------------------------------------- /src/common/rl/wrappers/doom/control.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | __all__ = ['SetPlayingMode'] 4 | 5 | def SetPlayingMode(target_mode): 6 | """ target mode can be 'algo' or 'human' """ 7 | 8 | class SetPlayingModeWrapper(gym.Wrapper): 9 | """ 10 | Doom wrapper to change playing mode 'human' or 'algo' 11 | """ 12 | def __init__(self, env): 13 | super(SetPlayingModeWrapper, self).__init__(env) 14 | if target_mode not in ['algo', 'human']: 15 | raise gym.error.Error('Error - The mode "{}" is not supported. 
Supported options are "algo" or "human"'.format(target_mode)) 16 | self.unwrapped._mode = target_mode 17 | 18 | return SetPlayingModeWrapper -------------------------------------------------------------------------------- /src/common/rl/atari_copy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Atari helper functions 4 | @author: thomas 5 | """ 6 | 7 | def get_base_env(env): 8 | ''' removes all wrappers ''' 9 | while hasattr(env,'env'): 10 | env = env.env 11 | return env 12 | 13 | def copy_atari_state(env): 14 | env = get_base_env(env) 15 | return env.clone_full_state() 16 | # return env.ale.cloneSystemState() 17 | 18 | def restore_atari_state(env,snapshot): 19 | env = get_base_env(env) 20 | env.restore_full_state(snapshot) 21 | # env.ale.restoreSystemState(snapshot) 22 | 23 | def is_atari_game(env): 24 | ''' Verify whether game uses the Arcade Learning Environment ''' 25 | env = get_base_env(env) 26 | return hasattr(env,'ale') -------------------------------------------------------------------------------- /jobs/expand_job_call3.sh: -------------------------------------------------------------------------------- 1 | python3 expand_jobs_over_games.py --job job_m.sh --games MountainCarContinuous-v0 --hpsetup item1=n_mcts,seq1=5+10+25+40,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0 --item1 lr --seq1 0.001 0.0001 0.00001 --item2 c --seq2 0.02 0.06 --slurm_mode short 2 | 3 | python3 expand_jobs_over_games.py --job job_p.sh --games Pendulum-v0s --hpsetup item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05 --item1 lr --seq1 0.001 0.0001 0.00001 --item2 random_action_frac --seq2 0.0 1.0 --item3 use_prior --seq3 True False --slurm_mode short 4 | -------------------------------------------------------------------------------- /submit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Wrapper for submit function 5 | 6 | @author: thomas 7 | """ 8 | 9 | if __name__ == '__main__' and __package__ is None: 10 | from os import sys, path 11 | sys.path.append(path.dirname(path.dirname(path.abspath(__file__)))) 12 | 13 | import argparse 14 | from src.common.submit import submit 15 | 16 | from config.hps import get_hps,override_hps_settings 17 | from agent import agent 18 | 19 | if __name__ == "__main__": 20 | '''Set-up training''' 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--hp', help='Hyperparameter configuration',default='') 23 | parser.add_argument('--hpsetup', help='Hyperparameter configuration of slurm and hyperparameters and distribution',default='') 24 | parser.add_argument('--no_plot', action='store_true',default=False) 25 | args = parser.parse_args() 26 | submit(args.hp,args.hpsetup,args.no_plot,agent,get_hps,override_hps_settings) -------------------------------------------------------------------------------- /src/common/examples/submit_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Wrapper for submit function 5 | 6 | @author: thomas 7 | """ 8 | 9 | if __name__ == '__main__' and __package__ is None: 10 | from os import sys, path 11 | 
sys.path.append(path.dirname(path.dirname(path.abspath(__file__)))) 12 | 13 | import argparse 14 | from common.submit import submit 15 | from hps import get_hps,override_hps_settings 16 | from agent import agent 17 | 18 | if __name__ == "__main__": 19 | '''Set-up training''' 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--hp', help='Hyperparameter configuration',default='') 22 | parser.add_argument('--hpsetup', help='Hyperparameter configuration of slurm and hyperparameters and distribution',default='') 23 | parser.add_argument('--no_plot', action='store_true',default=False) 24 | args = parser.parse_args() 25 | submit(args.hp,args.hpsetup,args.no_plot,agent,get_hps,override_hps_settings) -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020, Thomas Moerland 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /jobs/job_Ant.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python3 submit.py --hpsetup game=Ant-v2,item1=n_mcts,seq1=50,item2=entropy_l,seq2=0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.01,loss_type=Q,name=lr:0.01-loss_type:Q 3 | python3 submit.py --hpsetup game=Ant-v2,item1=n_mcts,seq1=50,item2=entropy_l,seq2=0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.01,loss_type=count,name=lr:0.01-loss_type:count 4 | python3 submit.py --hpsetup game=Ant-v2,item1=n_mcts,seq1=50,item2=entropy_l,seq2=0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.001,loss_type=Q,name=lr:0.001-loss_type:Q 5 | python3 submit.py --hpsetup game=Ant-v2,item1=n_mcts,seq1=50,item2=entropy_l,seq2=0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.001,loss_type=count,name=lr:0.001-loss_type:count 6 | python3 submit.py --hpsetup game=Ant-v2,item1=n_mcts,seq1=50,item2=entropy_l,seq2=0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.0001,loss_type=Q,name=lr:0.0001-loss_type:Q 7 | python3 submit.py --hpsetup game=Ant-v2,item1=n_mcts,seq1=50,item2=entropy_l,seq2=0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.0001,loss_type=count,name=lr:0.0001-loss_type:count 8 | -------------------------------------------------------------------------------- /src/common/putils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Python utils 5 | @author: thomas 6 | """ 7 | import time 8 | import os 9 | import numpy as np 10 | import random 11 | from shutil import copyfile 12 | 13 | def timing(f): 14 | ''' function decorator ''' 15 | def wrap(*args,**kwargs): 16 | time1 = time.time() 17 | ret = f(*args,**kwargs) 18 | time2 = time.time() 19 | print('{} function took {} sec'.format(f.__name__,time2-time1)) 20 | return ret 21 | return wrap 22 | 23 | def store_safely(folder,name,to_store): 24 | ''' to prevent losing information due to interruption of process''' 25 | new_name = folder+name+'.npy' 26 | old_name = folder+name+'_old.npy' 27 | if os.path.exists(new_name): 28 | copyfile(new_name,old_name) 29 | np.save(new_name,to_store) 30 | if os.path.exists(old_name): 31 | os.remove(old_name) 32 | 33 | def my_argmax(x): 34 | ''' assumes a 1D vector x ''' 35 | x = x.flatten() 36 | if np.any(np.isnan(x)): 37 | print('Warning: Cannot argmax when vector contains nans, results will be wrong') 38 | try: 39 | winners = np.argwhere(x == np.max(x)).flatten() 40 | winner = random.choice(winners) 41 | except: 42 | winner = np.argmax(x) # numerical instability ? 
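# the try/except above guards the random tie-break: if x contains NaN, x == np.max(x)
# selects nothing, random.choice then raises, and we fall back to a plain np.argmax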
43 | return winner -------------------------------------------------------------------------------- /src/rl/doom_setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 11 14:16:19 2018 4 | 5 | @author: thomas 6 | """ 7 | import gym 8 | import ppaquette_gym_doom 9 | from rl.wrappers.action_space import ToDiscrete 10 | from rl.wrappers.observation_space import SetResolution 11 | 12 | def make_doom_env(game): 13 | ''' Makes doom environments based on simpler function ''' 14 | if game == 'Doom-1': 15 | Env = gym.make('ppaquette/DoomBasic-v0') 16 | if game == 'Doom-2': 17 | Env = gym.make('ppaquette/DoomCorridor-v0') 18 | if game == 'Doom-3': 19 | Env = gym.make('ppaquette/DoomDefendCenter-v0') 20 | if game == 'Doom-4': 21 | Env = gym.make('ppaquette/DoomDefendLine-v0') 22 | if game == 'Doom-5': 23 | Env = gym.make('ppaquette/DoomHealthGathering-v0') 24 | if game == 'Doom-6': 25 | Env = gym.make('ppaquette/DoomMyWayHome-v0') 26 | if game == 'Doom-7': 27 | Env = gym.make('ppaquette/PredictPosition-v0') 28 | if game == 'Doom-8': 29 | Env = gym.make('ppaquette/TakeCover-v0') 30 | if game == 'Doom-9': 31 | Env = gym.make('ppaquette/Deathmatch-v0') 32 | if game == 'Doom-10': 33 | Env = gym.make('ppaquette/meta-Doom-v0') 34 | return Env 35 | 36 | def make_doom_env_with_wrappers(game,action_config='minimal',screen_res='160x120'): 37 | ''' 38 | action_config can be 'minimal', 'constant-7', 'constant-17', 'full' 39 | ''' 40 | Env = make_doom_env(game) 41 | if action_config is not None: 42 | action_wrapper = ToDiscrete(config=action_config) 43 | Env = action_wrapper(Env) 44 | if screen_res is not None: 45 | obs_wrapper = SetResolution(screen_res) 46 | Env = obs_wrapper(Env) 47 | return Env -------------------------------------------------------------------------------- /src/common/rl/doom_helpers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 11 14:16:19 2018 4 | 5 | @author: thomas 6 | """ 7 | import gym 8 | import ppaquette_gym_doom 9 | from .wrappers.doom.action_space import ToDiscrete 10 | from .wrappers.doom.observation_space import SetResolution 11 | 12 | def make_doom_env(game): 13 | ''' Makes doom environments based on simpler function ''' 14 | if game == 'Doom-1': 15 | Env = gym.make('ppaquette/DoomBasic-v0') 16 | if game == 'Doom-2': 17 | Env = gym.make('ppaquette/DoomCorridor-v0') 18 | if game == 'Doom-3': 19 | Env = gym.make('ppaquette/DoomDefendCenter-v0') 20 | if game == 'Doom-4': 21 | Env = gym.make('ppaquette/DoomDefendLine-v0') 22 | if game == 'Doom-5': 23 | Env = gym.make('ppaquette/DoomHealthGathering-v0') 24 | if game == 'Doom-6': 25 | Env = gym.make('ppaquette/DoomMyWayHome-v0') 26 | if game == 'Doom-7': 27 | Env = gym.make('ppaquette/PredictPosition-v0') 28 | if game == 'Doom-8': 29 | Env = gym.make('ppaquette/TakeCover-v0') 30 | if game == 'Doom-9': 31 | Env = gym.make('ppaquette/Deathmatch-v0') 32 | if game == 'Doom-10': 33 | Env = gym.make('ppaquette/meta-Doom-v0') 34 | return Env 35 | 36 | def make_doom_env_with_wrappers(game,action_config='minimal',screen_res='160x120'): 37 | ''' 38 | action_config can be 'minimal', 'constant-7', 'constant-17', 'full' 39 | ''' 40 | Env = make_doom_env(game) 41 | if action_config is not None: 42 | action_wrapper = ToDiscrete(config=action_config) 43 | Env = action_wrapper(Env) 44 | if screen_res is not None: 45 | obs_wrapper = SetResolution(screen_res) 46 | 
Env = obs_wrapper(Env) 47 | return Env -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A0C 2 | 3 | This project implements the Alpha Zero algorithm for continuous action spaces (A0C). 4 | The associated paper is available at https://arxiv.org/pdf/1805.09613.pdf 5 | 6 | ## Installation 7 | You can clone this project by running: 8 | `git clone https://github.com/tmoer/cursus.git` 9 | 10 | Requirements are listed in `./requirements.txt` 11 | 12 | 13 | ## Run the project 14 | Run the project with `python3 agent.py` 15 | 16 | Hyperparameter settings can be modified in `./config/hps.py`; an example sweep submission via `submit.py` is given at the end of this README. 17 | 18 | 19 | ## Project organization 20 | 21 | ``` 22 | . 23 | ├── .gitignore 24 | ├── CITATION.md 25 | ├── LICENSE.md 26 | ├── README.md 27 | ├── requirements.txt 28 | ├── bin <- Compiled and external code, ignored by git (PG) 29 | │ └── external <- Any external source code, ignored by git (RO) 30 | ├── config <- Configuration files (HW) 31 | ├── data <- All project data, ignored by git 32 | │ ├── processed <- The final, canonical data sets for modeling. (PG) 33 | │ ├── raw <- The original, immutable data dump. (RO) 34 | │ └── temp <- Intermediate data that has been transformed. (PG) 35 | ├── docs <- Documentation notebook for users (HW) 36 | │ ├── manuscript <- Manuscript source, e.g., LaTeX, Markdown, etc. (HW) 37 | │ └── reports <- Other project reports and notebooks (e.g. Jupyter, .Rmd) (HW) 38 | ├── results 39 | │ ├── figures <- Figures for the manuscript or reports (PG) 40 | │ └── output <- Other output for the manuscript or reports (PG) 41 | └── src <- Source code for this project (HW) 42 | 43 | ``` 44 | 45 | 46 | ## License 47 | 48 | This project is licensed under the terms of the [MIT License](/LICENSE.md). 49 | 50 | ## Citation 51 | 52 | Please [cite this project as described here](/CITATION.md).
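## Example

For illustration, a small sweep over the number of MCTS simulations on Pendulum can be submitted as below (adapted from `jobs/job_pendulum_final.sh`; the keys accepted by `--hpsetup` and `--hp` are defined in `src/common/submit.py`, `src/common/hps_setup.py` and `config/hps.py`, and the values shown are illustrative rather than tuned):

```
python3 submit.py \
    --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,n_rep=3 \
    --hp bound=beta,n_eps=500,lr=0.001,temp=0.1
```

Adding `slurm=True,slurm_qos=short,slurm_time=3:59:59` to `--hpsetup` submits the same sweep to a SLURM cluster, as done in the scripts under `./jobs`. Learning curves can then be plotted with `python3 visualize.py --home`, which reads results from `./results/`.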
53 | -------------------------------------------------------------------------------- /jobs/job_m.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=5+10+25+40,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,lr=0.001,c=0.02,name=lr:0.001-c:0.02 3 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=5+10+25+40,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,lr=0.001,c=0.06,name=lr:0.001-c:0.06 4 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=5+10+25+40,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,lr=0.0001,c=0.02,name=lr:0.0001-c:0.02 5 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=5+10+25+40,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,lr=0.0001,c=0.06,name=lr:0.0001-c:0.06 6 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=5+10+25+40,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,lr=0.00001,c=0.02,name=lr:0.00001-c:0.02 7 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=5+10+25+40,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,lr=0.00001,c=0.06,name=lr:0.00001-c:0.06 8 | -------------------------------------------------------------------------------- /src/rl/wrappers/observation_space.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | try: 4 | from doom_py import ScreenResolution 5 | except ImportError as e: 6 | raise gym.error.DependencyNotInstalled("{}. 
(HINT: you can install Doom dependencies " + 7 | "with 'pip install doom_py.)'".format(e)) 8 | 9 | resolutions = ['160x120', '200x125', '200x150', '256x144', '256x160', '256x192', '320x180', '320x200', 10 | '320x240', '320x256', '400x225', '400x250', '400x300', '512x288', '512x320', '512x384', 11 | '640x360', '640x400', '640x480', '800x450', '800x500', '800x600', '1024x576', '1024x640', 12 | '1024x768', '1280x720', '1280x800', '1280x960', '1280x1024', '1400x787', '1400x875', 13 | '1400x1050', '1600x900', '1600x1000', '1600x1200', '1920x1080'] 14 | 15 | __all__ = [ 'SetResolution' ] 16 | 17 | def SetResolution(target_resolution): 18 | 19 | class SetResolutionWrapper(gym.Wrapper): 20 | """ 21 | Doom wrapper to change screen resolution 22 | """ 23 | def __init__(self, env): 24 | super(SetResolutionWrapper, self).__init__(env) 25 | if target_resolution not in resolutions: 26 | raise gym.error.Error('Error - The specified resolution "{}" is not supported by Vizdoom.'.format(target_resolution)) 27 | parts = target_resolution.lower().split('x') 28 | width = int(parts[0]) 29 | height = int(parts[1]) 30 | screen_res = __import__('doom_py') 31 | screen_res = getattr(screen_res, 'ScreenResolution') 32 | screen_res = getattr(screen_res, 'RES_{}X{}'.format(width, height)) 33 | self.screen_width, self.screen_height, self.unwrapped.screen_resolution = width, height, screen_res 34 | self.unwrapped.observation_space = gym.spaces.Box(low=0, high=255, shape=(self.screen_height, self.screen_width, 3)) 35 | self.observation_space = self.unwrapped.observation_space 36 | 37 | return SetResolutionWrapper 38 | -------------------------------------------------------------------------------- /src/common/rl/wrappers/doom/observation_space.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | try: 4 | from doom_py import ScreenResolution 5 | except ImportError as e: 6 | raise gym.error.DependencyNotInstalled("{}. 
(HINT: you can install Doom dependencies " + 7 | "with 'pip install doom_py.)'".format(e)) 8 | 9 | resolutions = ['160x120', '200x125', '200x150', '256x144', '256x160', '256x192', '320x180', '320x200', 10 | '320x240', '320x256', '400x225', '400x250', '400x300', '512x288', '512x320', '512x384', 11 | '640x360', '640x400', '640x480', '800x450', '800x500', '800x600', '1024x576', '1024x640', 12 | '1024x768', '1280x720', '1280x800', '1280x960', '1280x1024', '1400x787', '1400x875', 13 | '1400x1050', '1600x900', '1600x1000', '1600x1200', '1920x1080'] 14 | 15 | __all__ = [ 'SetResolution' ] 16 | 17 | def SetResolution(target_resolution): 18 | 19 | class SetResolutionWrapper(gym.Wrapper): 20 | """ 21 | Doom wrapper to change screen resolution 22 | """ 23 | def __init__(self, env): 24 | super(SetResolutionWrapper, self).__init__(env) 25 | if target_resolution not in resolutions: 26 | raise gym.error.Error('Error - The specified resolution "{}" is not supported by Vizdoom.'.format(target_resolution)) 27 | parts = target_resolution.lower().split('x') 28 | width = int(parts[0]) 29 | height = int(parts[1]) 30 | screen_res = __import__('doom_py') 31 | screen_res = getattr(screen_res, 'ScreenResolution') 32 | screen_res = getattr(screen_res, 'RES_{}X{}'.format(width, height)) 33 | self.screen_width, self.screen_height, self.unwrapped.screen_resolution = width, height, screen_res 34 | self.unwrapped.observation_space = gym.spaces.Box(low=0, high=255, shape=(self.screen_height, self.screen_width, 3)) 35 | self.observation_space = self.unwrapped.observation_space 36 | 37 | return SetResolutionWrapper 38 | -------------------------------------------------------------------------------- /config/hps.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Default hyperparameter settings 4 | @author: thomas 5 | """ 6 | from common.hps_setup import HParams 7 | 8 | def override_hps_settings(hps): 9 | ''' some more global modifications to multiple settings based on 1 indicator ''' 10 | if hps.mode == 'off': 11 | pass 12 | return hps 13 | 14 | def get_hps(): 15 | ''' Hyperparameter settings ''' 16 | return HParams( 17 | # General 18 | game = 'MountainCarContinuous-v0', # Environment name 19 | name = 'unnamed', # Name of experiment 20 | result_dir = '', 21 | 22 | # Steps & limits 23 | n_t = 2000, # max timesteps 24 | n_eps = 100, # max episodes 25 | steps_per_ep = 300, 26 | 27 | mode = 'off', # overall indicator to jointly change a group of settings. Use with override_hps_settings() 28 | 29 | # MCTS 30 | n_mcts = 10, 31 | c = 1.0, 32 | alpha = 0.5, 33 | C_widening = 1.0, 34 | decision_type = 'count', 35 | backup_Q = 'on-policy', # 'on-policy', 'max' or 'thompson': Type of policy used for value back-up. 
Thopmpson requires additional sampling 36 | sigma_tree = False, # whether to use tree uncertainty 37 | backup_sigma_tree = 'on-policy', # 'uniform', 'on-policy', 'max', 'thompson': policy used for sigma_tree back-up 38 | block_loop = False, # Whether to block loops 39 | 40 | # MCTS + DL 41 | loss_type = 'count', # 'count' or 'Q' 42 | bound = 'beta', # 'tanh' or 'beta' 43 | entropy_l = 0.0, 44 | random_action_frac = 0.0, 45 | temp = 1.0, 46 | n_mix = 1, 47 | use_prior = False, 48 | bootstrap_V = True, 49 | V_decision = 'on_policy', 50 | 51 | # Train 52 | lr = 0.005, 53 | n_epochs = 1, 54 | batch_size = 32, 55 | data_size = 5000, # total database, if distributed summed over the agents 56 | clip_gradient_norm = 0.0, 57 | tfdb = False, 58 | 59 | # Other 60 | timeit = False, 61 | verbose = False, 62 | verbose_mcts = False 63 | ) -------------------------------------------------------------------------------- /src/common/examples/visualize_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Wrapper for visualize.py 5 | 6 | @author: thomas 7 | """ 8 | 9 | if __name__ == '__main__' and __package__ is None: 10 | from os import sys, path 11 | sys.path.append(path.dirname(path.dirname(path.abspath(__file__)))) 12 | 13 | import os 14 | import argparse 15 | from common.visualize import loop_directories 16 | from hps import get_hps 17 | 18 | if __name__ == "__main__": 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--folder', default='/home/thomas/mcts_results') 21 | parser.add_argument('--home', action='store_true',default=False) 22 | parser.add_argument('--game', default=None) 23 | parser.add_argument('--name', default=None) 24 | parser.add_argument('--subindex', default=None) 25 | parser.add_argument('--plot_type', default='lc') 26 | parser.add_argument('--window', type=int,default=25) 27 | parser.add_argument('--sd', action='store_true',default=False) 28 | parser.add_argument('--on_mean', action='store_true',default=False) 29 | parser.add_argument('--item', default='return',help='This item in result will be plotted') 30 | parser.add_argument('--remove', action='store_true',default=False) 31 | parser.add_argument('--plotlen', type=int,default=25) 32 | parser.add_argument('--xlim', nargs='+',type=float,default=None) 33 | parser.add_argument('--ylim', nargs='+',type=float,default=None) 34 | parser.add_argument('--errlim', nargs='+',type=float,default=None,help='Limits on the errorbars') 35 | parser.add_argument('--item1_label', nargs='+', default=None) 36 | parser.add_argument('--item2_label', nargs='+', default=None) 37 | parser.add_argument('--item3_label', nargs='+', default=None) 38 | 39 | args = parser.parse_args() 40 | 41 | if args.home: 42 | result_folder = os.getcwd() + '/results/' 43 | else: 44 | result_folder = args.folder + '/' 45 | print('Start processing folder {}'.format(result_folder)) 46 | overview_dir= result_folder+'learning_curves/' 47 | if not os.path.exists(overview_dir): 48 | os.makedirs(overview_dir) 49 | 50 | loop_directories(result_folder=result_folder,overview_dir=overview_dir,game=args.game,name=args.name, 51 | subindex=args.subindex,plot_type=args.plot_type,window=args.window,sd=args.sd,on_mean=args.on_mean, 52 | item=args.item,remove=args.remove,plotlen=args.plotlen,xlim=args.xlim,ylim=args.ylim,errlim=args.errlim, 53 | get_hps=get_hps) 54 | 55 | -------------------------------------------------------------------------------- /src/rl/envs/chicken.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Wet-Chicken benchmark 4 | @author: thomas 5 | """ 6 | 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | class chicken_env(object): 11 | ''' Wet Chicken Benchmark ''' 12 | 13 | def __init__(self,to_plot = False): 14 | self.state = np.array([0,0]) 15 | self.observation_shape = np.shape(self.get_state())[0] 16 | 17 | if to_plot: 18 | plt.ion() 19 | fig = plt.figure() 20 | ax1 = fig.add_subplot(111,aspect='equal') 21 | #ax1.axis('off') 22 | plt.xlim([-0.5,5.5]) 23 | plt.ylim([-0.5,5.5]) 24 | 25 | self.g1 = ax1.add_artist(plt.Circle((self.state[0],self.state[1]),0.1,color='red')) 26 | self.fig = fig 27 | self.ax1 = ax1 28 | self.fig.canvas.draw() 29 | self.fig.canvas.flush_events() 30 | 31 | def reset(self): 32 | self.state = np.array([0,0]) 33 | return self.get_state() 34 | 35 | def get_state(self): 36 | return self.state/5 37 | 38 | def set_state(self,state): 39 | self.state = state 40 | 41 | def step(self,a): 42 | x = self.state[0] 43 | y = self.state[1] 44 | ax = a[0] 45 | ay = a[1] 46 | tau = np.random.uniform(-1,1) 47 | w=5.0 48 | l=5.0 49 | 50 | v = 3 * x * (1/w) 51 | s = 3.5 - v 52 | yhat = y + ay - 1 + v + s*tau 53 | 54 | # change x 55 | if x + ax < 0: 56 | x = 0 57 | elif yhat > l: 58 | x = 0 59 | elif x + ax > w: 60 | x = w 61 | else: 62 | x = x + ax 63 | 64 | # change y 65 | if yhat < 0: 66 | y = 0 67 | elif yhat > l: 68 | y = 0 69 | else: 70 | y = yhat 71 | 72 | self.state = np.array([x,y]).flatten() 73 | 74 | r = - (l - y) 75 | 76 | return self.state,r,yhat>l 77 | 78 | def plot(self): 79 | self.g1.remove() 80 | self.g1 = self.ax1.add_artist(plt.Circle((self.state[0],self.state[1]),0.1,color='red')) 81 | self.fig.canvas.draw() 82 | 83 | # Test 84 | if __name__ == '__main__': 85 | Env = chicken_env(True) 86 | s = Env.get_state() 87 | for i in range(500): 88 | a = np.random.uniform(-1,1,2) 89 | s,r,dead = Env.step(a) 90 | if not dead: 91 | Env.plot() 92 | else: 93 | print('Died in step',i,', restarting') 94 | s = Env.reset() 95 | print(Env.get_state()) 96 | print('Finished') 97 | -------------------------------------------------------------------------------- /src/common/rl/envs/chicken.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Wet-Chicken benchmark 4 | @author: thomas 5 | """ 6 | 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | class chicken_env(object): 11 | ''' Wet Chicken Benchmark ''' 12 | 13 | def __init__(self,to_plot = False): 14 | self.state = np.array([0,0]) 15 | self.observation_shape = np.shape(self.get_state())[0] 16 | 17 | if to_plot: 18 | plt.ion() 19 | fig = plt.figure() 20 | ax1 = fig.add_subplot(111,aspect='equal') 21 | #ax1.axis('off') 22 | plt.xlim([-0.5,5.5]) 23 | plt.ylim([-0.5,5.5]) 24 | 25 | self.g1 = ax1.add_artist(plt.Circle((self.state[0],self.state[1]),0.1,color='red')) 26 | self.fig = fig 27 | self.ax1 = ax1 28 | self.fig.canvas.draw() 29 | self.fig.canvas.flush_events() 30 | 31 | def reset(self): 32 | self.state = np.array([0,0]) 33 | return self.get_state() 34 | 35 | def get_state(self): 36 | return self.state/5 37 | 38 | def set_state(self,state): 39 | self.state = state 40 | 41 | def step(self,a): 42 | x = self.state[0] 43 | y = self.state[1] 44 | ax = a[0] 45 | ay = a[1] 46 | tau = np.random.uniform(-1,1) 47 | w=5.0 48 | l=5.0 49 | 50 | v = 3 * x * (1/w) 51 | s = 3.5 - v 52 | yhat = y + ay - 1 + v + s*tau 53 | 54 
| # change x 55 | if x + ax < 0: 56 | x = 0 57 | elif yhat > l: 58 | x = 0 59 | elif x + ax > w: 60 | x = w 61 | else: 62 | x = x + ax 63 | 64 | # change y 65 | if yhat < 0: 66 | y = 0 67 | elif yhat > l: 68 | y = 0 69 | else: 70 | y = yhat 71 | 72 | self.state = np.array([x,y]).flatten() 73 | 74 | r = - (l - y) 75 | 76 | return self.state,r,yhat>l 77 | 78 | def plot(self): 79 | self.g1.remove() 80 | self.g1 = self.ax1.add_artist(plt.Circle((self.state[0],self.state[1]),0.1,color='red')) 81 | self.fig.canvas.draw() 82 | 83 | # Test 84 | if __name__ == '__main__': 85 | Env = chicken_env(True) 86 | s = Env.get_state() 87 | for i in range(500): 88 | a = np.random.uniform(-1,1,2) 89 | s,r,dead = Env.step(a) 90 | if not dead: 91 | Env.plot() 92 | else: 93 | print('Died in step',i,', restarting') 94 | s = Env.reset() 95 | print(Env.get_state()) 96 | print('Finished') 97 | -------------------------------------------------------------------------------- /src/rl/make_game.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Wrapper to generate the game environments 4 | @author: thomas 5 | """ 6 | import gym 7 | import numpy as np 8 | from rl.rewardwrapper import RewardWrapper,PILCOWrapper,NormalizeWrapper 9 | from rl.atariwrapper import AtariWrapper,ClipRewardWrapper 10 | from rl.envs.chain import Chain, ChainOrdered 11 | #from rl.doom_setup import make_doom_env_with_wrappers 12 | from gym import spaces 13 | import os 14 | #import gym_ple 15 | 16 | from gym.envs.registration import register 17 | register( 18 | id='FrozenLakeNotSlippery-v0', 19 | entry_point='gym.envs.toy_text:FrozenLakeEnv', 20 | kwargs={'map_name' : '4x4', 'is_slippery': False}, 21 | max_episode_steps=100, 22 | reward_threshold=0.78, # optimum = .8196 23 | ) 24 | register( 25 | id='FrozenLakeNotSlippery-v1', 26 | entry_point='gym.envs.toy_text:FrozenLakeEnv', 27 | kwargs={'map_name' : '8x8', 'is_slippery': False}, 28 | max_episode_steps=100, 29 | reward_threshold=0.78, # optimum = .8196 30 | ) 31 | 32 | def make_game(game): 33 | os.system('export LD_LIBRARY_PATH=`$LD_LIBRARY_PATH:$HOME/.mujoco/mjpro150/bin`') 34 | 35 | if 'Chain' in game: 36 | game,n = game.split('-') 37 | if game == 'Chain': 38 | Env = Chain(int(n)) 39 | elif game == 'ChainOrdered': 40 | Env = ChainOrdered(int(n)) 41 | elif game == 'CartPole-vr' or game == 'MountainCar-vr' or game == 'Acrobot-vr' or game == 'LunarLander-vr': 42 | Env = RewardWrapper(game) 43 | elif game == 'CartPole-vp' or game == 'MountainCar-vp' or game == 'Acrobot-vp': 44 | Env = PILCOWrapper(game) 45 | elif game == 'CartPole-vn' or game == 'MountainCar-vn': 46 | Env = NormalizeWrapper(game) 47 | else: 48 | Env = gym.make(game) 49 | if type(Env) == gym.wrappers.time_limit.TimeLimit: 50 | Env = Env.env 51 | if game in ['Breakout-v0','Pong-v0','MontezumaRevenge-v0']: 52 | Env = AtariWrapper(Env,skip=3,k=3,ram=False) 53 | Env = ClipRewardWrapper(Env) 54 | elif 'ram' in game: 55 | Env = AtariWrapper(Env,skip=3,k=2,ram=True) 56 | Env = ClipRewardWrapper(Env) 57 | if 'CartPole' in game: 58 | Env.observation_space = gym.spaces.Box(np.array([-4.8,-10,-4.8,-10]),np.array([4.8,10,4.8,10])) 59 | return Env 60 | 61 | def check_space(space): 62 | '''check the properties of the env ''' 63 | if isinstance(space,spaces.Box): 64 | dim = space.shape # should the zero be here? 
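# note: for a gym Box space, space.shape is a tuple such as (3,); indexing with [0] would return
# only the first dimension, so which form is correct depends on what the caller expects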
65 | discrete = False 66 | elif isinstance(space,spaces.Discrete): 67 | dim = space.n 68 | discrete = True 69 | else: 70 | raise NotImplementedError 71 | return dim, discrete -------------------------------------------------------------------------------- /src/rl/atariwrapper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Atari wrapper, based on https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py 4 | @author: thomas 5 | """ 6 | import gym 7 | from gym import spaces 8 | from collections import deque 9 | import numpy as np 10 | from PIL import Image 11 | 12 | class ClipRewardWrapper(gym.RewardWrapper): 13 | def reward(self, reward): 14 | """Bin reward to {+1, 0, -1} by its sign.""" 15 | return 0.5 * np.sign(reward) 16 | 17 | class AtariWrapper(gym.Wrapper): 18 | ''' Chain domain ''' 19 | 20 | def __init__(self, env, skip=4, k=4,ram=False): 21 | """Return only every `skip`-th frame""" 22 | gym.Wrapper.__init__(self, env) 23 | # Frame skip and pooling 24 | self._obs_buffer = deque(maxlen=skip) 25 | self._skip = skip 26 | self._ram = ram 27 | 28 | # Frame stacking 29 | self.k = k 30 | self.frames = deque([], maxlen=k) 31 | 32 | # Frame wrapping 33 | if not self._ram: 34 | self.res = 84 35 | self.observation_space = spaces.Box(low=0, high=1, shape=(self.res,self.res, k)) 36 | else: 37 | self.res = env.observation_space.shape[0] 38 | self.observation_space = spaces.Box(low=0, high=1, shape=(self.res, k)) 39 | 40 | def _observation(self): 41 | assert len(self.frames) == self.k 42 | return np.concatenate(self.frames, axis=-1) 43 | 44 | def _resize(self, obs): 45 | if not self._ram: 46 | frame = np.dot(obs.astype('float32'), np.array([0.299, 0.587, 0.114], 'float32')) 47 | frame = np.array(Image.fromarray(frame).resize((self.res, self.res), 48 | resample=Image.BILINEAR), dtype=np.float32)/255.0 49 | return frame.reshape((self.res, self.res, 1)) 50 | else: 51 | obs = obs/255 52 | return obs.astype('float32').reshape((self.res,1)) 53 | 54 | def _reset(self): 55 | """Clear buffers and re-fill by duplicating the first observation.""" 56 | ob = self.env.reset() 57 | ob = self._resize(ob) 58 | for _ in range(self.k): self.frames.append(ob) 59 | self._obs_buffer.clear() 60 | for _ in range(self._skip): self._obs_buffer.append(ob) 61 | return self._observation() 62 | 63 | def _step(self, action): 64 | """Repeat action, sum reward, and max over last observations.""" 65 | total_reward = 0.0 66 | done = None 67 | for _ in range(self._skip): 68 | obs, reward, done, info = self.env.step(action) 69 | obs = self._resize(obs) 70 | self._obs_buffer.append(obs) 71 | total_reward += reward 72 | if done: 73 | break 74 | if not self._ram: 75 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) # max over skips 76 | else: 77 | max_frame = obs # just take the last, max has no interpretation 78 | self.frames.append(max_frame) # append to buffer 79 | return self._observation(), total_reward, done, info 80 | -------------------------------------------------------------------------------- /visualize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Wrapper for visualize.py 5 | 6 | @author: thomas 7 | """ 8 | 9 | if __name__ == '__main__' and __package__ is None: 10 | from os import sys, path 11 | sys.path.append(path.dirname(path.dirname(path.abspath(__file__)))) 12 | 13 | import os 14 | import argparse 15 | from 
common.visualize import loop_directories 16 | from hps import get_hps 17 | 18 | 19 | if __name__ == "__main__": 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--folder', default='/home/thomas/a0c_results') 22 | parser.add_argument('--home', action='store_true',default=False) 23 | parser.add_argument('--game', default=None) 24 | parser.add_argument('--name', default=None) 25 | parser.add_argument('--subindex', default=None) 26 | parser.add_argument('--plot_type', default='lc') 27 | parser.add_argument('--window', type=int,default=25) 28 | parser.add_argument('--sd', action='store_true',default=False) 29 | parser.add_argument('--on_mean', action='store_true',default=False) 30 | parser.add_argument('--item', default='return',help='This item in result will be plotted') 31 | parser.add_argument('--remove', action='store_true',default=False) 32 | parser.add_argument('--plotlen', type=int,default=25) 33 | parser.add_argument('--xlim', nargs='+',type=float,default=None) 34 | parser.add_argument('--ylim', nargs='+',type=float,default=None) 35 | parser.add_argument('--errlim', nargs='+',type=float,default=None,help='Limits on the errorbars') 36 | parser.add_argument('--item1_label', nargs='+', default=None) 37 | parser.add_argument('--item2_label', nargs='+', default=None) 38 | parser.add_argument('--item3_label', nargs='+', default=None) 39 | parser.add_argument('--no_suptitle', action='store_true',default=False) 40 | parser.add_argument('--x_item', default='steps') # steps or eps 41 | 42 | parser.add_argument('--line_item', default='item1') # 43 | parser.add_argument('--col_item', default='item2') # 44 | parser.add_argument('--row_item', default='item3') # 45 | 46 | args = parser.parse_args() 47 | 48 | if args.item1_label is not None: args.item1_label = ' '.join(args.item1_label) 49 | if args.item2_label is not None: args.item2_label = ' '.join(args.item2_label) 50 | if args.item3_label is not None: args.item3_label = ' '.join(args.item3_label) 51 | 52 | if args.home: 53 | result_folder = os.getcwd() + '/results/' 54 | else: 55 | result_folder = args.folder + '/' 56 | print('Start processing folder {}'.format(result_folder)) 57 | overview_dir= result_folder+'learning_curves/' 58 | if not os.path.exists(overview_dir): 59 | os.makedirs(overview_dir) 60 | 61 | loop_directories(result_folder=result_folder,overview_dir=overview_dir,game=args.game,name=args.name, 62 | subindex=args.subindex,plot_type=args.plot_type,window=args.window,sd=args.sd,on_mean=args.on_mean, 63 | item=args.item,remove=args.remove,plotlen=args.plotlen,xlim=args.xlim,ylim=args.ylim,errlim=args.errlim, 64 | get_hps=get_hps,no_suptitle=args.no_suptitle,x_item=args.x_item,line_item=args.line_item,col_item=args.col_item, 65 | row_item=args.row_item,item1_label=args.item1_label,item2_label=args.item2_label,item3_label=args.item3_label) 66 | 67 | -------------------------------------------------------------------------------- /jobs/expand_jobs_over_games.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Expand a submission over games 4 | @author: thomas 5 | """ 6 | 7 | if __name__ == '__main__' and __package__ is None: 8 | from os import sys, path 9 | sys.path.append(path.dirname(path.dirname(path.dirname(path.abspath(__file__))))) 10 | 11 | import os 12 | import argparse 13 | from common.visualize import make_name 14 | 15 | def expand_job(games,job,hp,hp_setup,item1=None,seq1=[None],item2=None,seq2=[None],item3=None,seq3=[None]): 16 | # hacky 
way to bring in games 17 | #games = ['CartPole-vr','MountainCar-vr','Acrobot-vr','FrozenLake-v0','FrozenLakeNotSlippery-v0','FrozenLakeNotSlippery-v1'] 18 | #games = ['Breakout-ramDeterministic-v0','Pong-ramDeterministic-v0','AirRaid-ramDeterministic-v0','Amidar-ramDeterministic-v0', 19 | # 'Enduro-ramDeterministic-v0','MontezumaRevenge-ramDeterministic-v0','Venture-ramDeterministic-v0'] 20 | # Regarding Atari: 21 | # Assault, Freeway, Seaquest have different initial states 22 | 23 | file = os.getcwd() + '/' + job 24 | with open(file,'w') as fp: 25 | fp.write('#!/bin/sh\n') 26 | for i,game in enumerate(games): 27 | for j,it1 in enumerate(seq1): 28 | for k,it2 in enumerate(seq2): 29 | for l,it3 in enumerate(seq3): 30 | fp.write('python3 submit.py --hpsetup game={},{} --hp {}'.format(game,hp_setup,hp)) 31 | if item1 is not None: 32 | fp.write(',{}={}'.format(item1,it1)) 33 | if item2 is not None: 34 | fp.write(',{}={}'.format(item2,it2)) 35 | if item3 is not None: 36 | fp.write(',{}={}'.format(item3,it3)) 37 | hyperloop_name = make_name('',item1,it1,item2,it2,item3,it3) 38 | if hyperloop_name != '': 39 | fp.write(',name={}'.format(hyperloop_name)) 40 | fp.write('\n') 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument('--games', nargs='+',type=str,default=[]) 45 | parser.add_argument('--job', default='job.sh') 46 | parser.add_argument('--slurm_mode', default='off') 47 | parser.add_argument('--hp', help='Hyperparameter configuration',default='') 48 | parser.add_argument('--hpsetup', help='Hyperparameter configuration of slurm and hyperparameters and distribution',default='') 49 | # extra items 50 | parser.add_argument('--item1',type=str,default=None) 51 | parser.add_argument('--seq1', nargs='+',type=str,default=[None]) 52 | parser.add_argument('--item2',type=str,default=None) 53 | parser.add_argument('--seq2', nargs='+',type=str,default=[None]) 54 | parser.add_argument('--item3',type=str,default=None) 55 | parser.add_argument('--seq3', nargs='+',type=str,default=[None]) 56 | 57 | args = parser.parse_args() 58 | 59 | if args.slurm_mode == 'short': 60 | args.hpsetup += ',slurm=True,slurm_qos=short,slurm_time=3:59:59' 61 | elif args.slurm_mode == 'long': 62 | args.hpsetup += ',slurm=True,slurm_qos=long,slurm_time=5-0:00:00' 63 | 64 | expand_job(games=args.games,job=args.job,hp=args.hp,hp_setup=args.hpsetup, 65 | item1=args.item1,seq1=args.seq1,item2=args.item2,seq2=args.seq2, 66 | item3=args.item3,seq3=args.seq3) -------------------------------------------------------------------------------- /src/rl/envs/taxi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Taxi Env 4 | @author: thomas 5 | """ 6 | 7 | import numpy 8 | import random 9 | import gym 10 | 11 | class Taxi(): 12 | ''' ''' 13 | 14 | def __init__(self): 15 | self.size = numpy.array([4,4]) 16 | self.landmarks = numpy.array([[0.0, 0.0], [0.0, 4.0], [3.0, 0.0], [4.0, 4.0]]) 17 | self.walls = numpy.array([[1.0, 2.0], [2.0, -2.0], [3.0, 2.0]]) 18 | self.fuel = 0 19 | self.fuel_loc = numpy.array([2.0, 1.0]) 20 | self.pass_loc = 0 # Passenger location: -1 for in taxi, >= 0 for a landmark 21 | self.pass_dest = 0 # Passenger destination: >=0 for a landmark 22 | self.pos = numpy.zeros((2,)) 23 | self.observation_space = gym.spaces.Box(0,12,(5,)) 24 | self.action_space = gym.spaces.Discrete(6) 25 | 26 | def reset(self): 27 | self.pos = numpy.random.randint(0,5,(2,)) 28 | self.fuel = numpy.random.random()*7 + 5.0 29 | 
self.lm_list = [i for i in range(len(self.landmarks))] 30 | random.shuffle(self.lm_list) 31 | self.pass_loc = self.lm_list.pop() 32 | self.pass_dest = random.choice(self.lm_list) 33 | return self.get_state() 34 | 35 | def get_state(self): 36 | return numpy.hstack([self.pos,self.fuel,self.pass_loc,self.pass_dest]) 37 | 38 | def step(self,action): 39 | # move taxi 40 | reward = self.takeAction(action) 41 | terminal = 1 if self.isAtGoal() or (self.fuel_loc is not None and self.fuel) < 0 else 0 42 | return self.get_state(),reward,terminal,{} 43 | 44 | def takeAction(self, intAction): 45 | reward = -1.0 46 | self.fuel -= 1 47 | prev_pos = self.pos.copy() 48 | sign = 0 49 | if intAction == 0: 50 | self.pos[0] += 1.0 51 | sign = 1 52 | elif intAction == 1: 53 | self.pos[0] -= 1.0 54 | sign = -1 55 | elif intAction == 2: 56 | self.pos[1] += 1.0 57 | elif intAction == 3: 58 | self.pos[1] -= 1.0 59 | elif intAction == 4: # Pickup 60 | if self.pass_loc >= 0 and self.atPoint(self.landmarks[self.pass_loc]): 61 | self.pass_loc = -1 62 | else: 63 | reward = -10.0 64 | elif intAction == 5: # Drop off 65 | if self.pass_loc == -1 and self.atPoint(self.landmarks[self.pass_dest]): 66 | self.pass_loc = self.pass_dest 67 | reward = 20.0 68 | else: 69 | reward = -10.0 70 | elif self.fuel_loc is not None and intAction == 4: # Refuel 71 | if self.atPoint(self.fuel_loc): 72 | self.fuel = 12.0 73 | 74 | self.pos = self.pos.clip([0, 0], self.size) 75 | 76 | if sign != 0 and self.hitsWall(prev_pos, self.pos, sign): 77 | self.pos[0] = prev_pos[0] # Only revert the x-coord, to allow noise and such in y 78 | 79 | return reward 80 | 81 | # helpers 82 | def atPoint(self, point): 83 | return numpy.linalg.norm(self.pos - point) < 0.1 84 | 85 | def isAtGoal(self): 86 | return self.pass_loc == self.pass_dest 87 | 88 | def hitsWall(self, old_pos, new_pos, sign): 89 | return (((self.walls[:,0]*sign >= old_pos[0]*sign) & (self.walls[:,0]*sign < new_pos[0]*sign)) \ 90 | & ((self.walls[:,1] > old_pos[1]) | ((self.size[1]-1)+self.walls[:,1] < old_pos[1]))).any() 91 | 92 | -------------------------------------------------------------------------------- /src/common/rl/envs/taxi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Taxi Env 4 | @author: thomas 5 | """ 6 | 7 | import numpy 8 | import random 9 | import gym 10 | 11 | class Taxi(): 12 | ''' ''' 13 | 14 | def __init__(self): 15 | self.size = numpy.array([4,4]) 16 | self.landmarks = numpy.array([[0.0, 0.0], [0.0, 4.0], [3.0, 0.0], [4.0, 4.0]]) 17 | self.walls = numpy.array([[1.0, 2.0], [2.0, -2.0], [3.0, 2.0]]) 18 | self.fuel = 0 19 | self.fuel_loc = numpy.array([2.0, 1.0]) 20 | self.pass_loc = 0 # Passenger location: -1 for in taxi, >= 0 for a landmark 21 | self.pass_dest = 0 # Passenger destination: >=0 for a landmark 22 | self.pos = numpy.zeros((2,)) 23 | self.observation_space = gym.spaces.Box(0,12,(5,)) 24 | self.action_space = gym.spaces.Discrete(6) 25 | 26 | def reset(self): 27 | self.pos = numpy.random.randint(0,5,(2,)) 28 | self.fuel = numpy.random.random()*7 + 5.0 29 | self.lm_list = [i for i in range(len(self.landmarks))] 30 | random.shuffle(self.lm_list) 31 | self.pass_loc = self.lm_list.pop() 32 | self.pass_dest = random.choice(self.lm_list) 33 | return self.get_state() 34 | 35 | def get_state(self): 36 | return numpy.hstack([self.pos,self.fuel,self.pass_loc,self.pass_dest]) 37 | 38 | def step(self,action): 39 | # move taxi 40 | reward = self.takeAction(action) 41 | terminal = 1 if 
self.isAtGoal() or (self.fuel_loc is not None and self.fuel) < 0 else 0 42 | return self.get_state(),reward,terminal,{} 43 | 44 | def takeAction(self, intAction): 45 | reward = -1.0 46 | self.fuel -= 1 47 | prev_pos = self.pos.copy() 48 | sign = 0 49 | if intAction == 0: 50 | self.pos[0] += 1.0 51 | sign = 1 52 | elif intAction == 1: 53 | self.pos[0] -= 1.0 54 | sign = -1 55 | elif intAction == 2: 56 | self.pos[1] += 1.0 57 | elif intAction == 3: 58 | self.pos[1] -= 1.0 59 | elif intAction == 4: # Pickup 60 | if self.pass_loc >= 0 and self.atPoint(self.landmarks[self.pass_loc]): 61 | self.pass_loc = -1 62 | else: 63 | reward = -10.0 64 | elif intAction == 5: # Drop off 65 | if self.pass_loc == -1 and self.atPoint(self.landmarks[self.pass_dest]): 66 | self.pass_loc = self.pass_dest 67 | reward = 20.0 68 | else: 69 | reward = -10.0 70 | elif self.fuel_loc is not None and intAction == 4: # Refuel 71 | if self.atPoint(self.fuel_loc): 72 | self.fuel = 12.0 73 | 74 | self.pos = self.pos.clip([0, 0], self.size) 75 | 76 | if sign != 0 and self.hitsWall(prev_pos, self.pos, sign): 77 | self.pos[0] = prev_pos[0] # Only revert the x-coord, to allow noise and such in y 78 | 79 | return reward 80 | 81 | # helpers 82 | def atPoint(self, point): 83 | return numpy.linalg.norm(self.pos - point) < 0.1 84 | 85 | def isAtGoal(self): 86 | return self.pass_loc == self.pass_dest 87 | 88 | def hitsWall(self, old_pos, new_pos, sign): 89 | return (((self.walls[:,0]*sign >= old_pos[0]*sign) & (self.walls[:,0]*sign < new_pos[0]*sign)) \ 90 | & ((self.walls[:,1] > old_pos[1]) | ((self.size[1]-1)+self.walls[:,1] < old_pos[1]))).any() 91 | 92 | -------------------------------------------------------------------------------- /jobs/job_PM.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.01,loss_type=Q,name=lr:0.01-loss_type:Q 3 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.01,loss_type=count,name=lr:0.01-loss_type:count 4 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.001,loss_type=Q,name=lr:0.001-loss_type:Q 5 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.001,loss_type=count,name=lr:0.001-loss_type:count 6 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.0001,loss_type=Q,name=lr:0.0001-loss_type:Q 7 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp 
c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.0001,loss_type=count,name=lr:0.0001-loss_type:count 8 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.01,loss_type=Q,name=lr:0.01-loss_type:Q 9 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.01,loss_type=count,name=lr:0.01-loss_type:count 10 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.001,loss_type=Q,name=lr:0.001-loss_type:Q 11 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.001,loss_type=count,name=lr:0.001-loss_type:count 12 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.0001,loss_type=Q,name=lr:0.0001-loss_type:Q 13 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.0001,loss_type=count,name=lr:0.0001-loss_type:count 14 | -------------------------------------------------------------------------------- /src/rl/wrappers/custom_game.py: -------------------------------------------------------------------------------- 1 | import os 2 | import types 3 | import gym 4 | 5 | try: 6 | import doom_py 7 | from doom_py import DoomGame, Mode, Button, GameVariable, ScreenFormat, ScreenResolution, Loader 8 | except ImportError as e: 9 | raise gym.error.DependencyNotInstalled("{}. 
(HINT: you can install Doom dependencies " + 10 | "with 'pip install doom_py.)'".format(e)) 11 | 12 | __all__ = [ 'CustomGame' ] 13 | 14 | def CustomGame(): 15 | 16 | def _customize_game(self): 17 | vizdoom_path = self.loader.get_vizdoom_path() # paths are based on installation path of doom_py 18 | freedoom_path = self.loader.get_freedoom_path() 19 | doom_dir = self.doom_dir # dirname of directory containing 'doom_env' 20 | 21 | # Settings 22 | config = os.path.join(doom_dir, 'assets', 'deadly_corridor.cfg') 23 | scenario = self.loader.get_scenario_path('deadly_corridor.wad') 24 | map = '' 25 | difficulty = 1 26 | 27 | # Customizing - self.game refers to a new DoomGame() 28 | self.game.set_vizdoom_path(vizdoom_path) 29 | self.game.set_doom_game_path(freedoom_path) 30 | self.game.load_config(config) 31 | self.game.set_doom_scenario_path(scenario) 32 | if map != '': 33 | self.game.set_doom_map(map) 34 | self.game.set_doom_skill(difficulty) 35 | self.game.set_screen_resolution(self.screen_resolution) 36 | 37 | class CustomGameWrapper(gym.Wrapper): 38 | """ 39 | Doom wrapper to load a custom map 40 | This wrapper modifies directly the unwrapped env, and is not expected to be stacked 41 | """ 42 | def __init__(self, env): 43 | super(CustomGameWrapper, self).__init__(env) 44 | self.unwrapped.action_space = gym.spaces.MultiDiscrete([[0, 1]] * 38 + [[-10, 10]] * 2 + [[-100, 100]] * 3) # Default 43 button action space 45 | self.unwrapped.screen_height = 480 46 | self.unwrapped.screen_width = 640 47 | self.unwrapped.screen_resolution = ScreenResolution.RES_640X480 48 | self.unwrapped.observation_space = gym.spaces.Box(low=0, high=255, shape=(self.unwrapped.screen_height, self.unwrapped.screen_width, 3)) 49 | self.observation_space = self.unwrapped.observation_space 50 | self.unwrapped.allowed_actions = [0, 10, 11, 13, 14, 15] # Must match exactly and in order the actions in the config file 51 | # (The number is the action number based on controls.md) 52 | # This will only enable these actions out of the 43 available buttons 53 | 54 | # Converting to Discrete action space 55 | discrete_actions = self.unwrapped.allowed_actions 56 | self.action_space = gym.spaces.DiscreteToMultiDiscrete(self.unwrapped.action_space, discrete_actions) 57 | 58 | # Alternative to convert to continuous action space 59 | # box_actions = self.unwrapped.allowed_actions 60 | # self.action_space = gym.spaces.BoxToMultiDiscrete(self.unwrapped.action_space, box_actions) 61 | 62 | # Bouding method to env 63 | self.unwrapped._customize_game = types.MethodType(_customize_game, self.unwrapped) 64 | 65 | def _step(self, action): 66 | return self.unwrapped._step(self.action_space(action)) 67 | 68 | return CustomGameWrapper 69 | -------------------------------------------------------------------------------- /src/common/rl/wrappers/doom/custom_game.py: -------------------------------------------------------------------------------- 1 | import os 2 | import types 3 | import gym 4 | 5 | try: 6 | import doom_py 7 | from doom_py import DoomGame, Mode, Button, GameVariable, ScreenFormat, ScreenResolution, Loader 8 | except ImportError as e: 9 | raise gym.error.DependencyNotInstalled("{}. 
(HINT: you can install Doom dependencies " + 10 | "with 'pip install doom_py.)'".format(e)) 11 | 12 | __all__ = [ 'CustomGame' ] 13 | 14 | def CustomGame(): 15 | 16 | def _customize_game(self): 17 | vizdoom_path = self.loader.get_vizdoom_path() # paths are based on installation path of doom_py 18 | freedoom_path = self.loader.get_freedoom_path() 19 | doom_dir = self.doom_dir # dirname of directory containing 'doom_env' 20 | 21 | # Settings 22 | config = os.path.join(doom_dir, 'assets', 'deadly_corridor.cfg') 23 | scenario = self.loader.get_scenario_path('deadly_corridor.wad') 24 | map = '' 25 | difficulty = 1 26 | 27 | # Customizing - self.game refers to a new DoomGame() 28 | self.game.set_vizdoom_path(vizdoom_path) 29 | self.game.set_doom_game_path(freedoom_path) 30 | self.game.load_config(config) 31 | self.game.set_doom_scenario_path(scenario) 32 | if map != '': 33 | self.game.set_doom_map(map) 34 | self.game.set_doom_skill(difficulty) 35 | self.game.set_screen_resolution(self.screen_resolution) 36 | 37 | class CustomGameWrapper(gym.Wrapper): 38 | """ 39 | Doom wrapper to load a custom map 40 | This wrapper modifies directly the unwrapped env, and is not expected to be stacked 41 | """ 42 | def __init__(self, env): 43 | super(CustomGameWrapper, self).__init__(env) 44 | self.unwrapped.action_space = gym.spaces.MultiDiscrete([[0, 1]] * 38 + [[-10, 10]] * 2 + [[-100, 100]] * 3) # Default 43 button action space 45 | self.unwrapped.screen_height = 480 46 | self.unwrapped.screen_width = 640 47 | self.unwrapped.screen_resolution = ScreenResolution.RES_640X480 48 | self.unwrapped.observation_space = gym.spaces.Box(low=0, high=255, shape=(self.unwrapped.screen_height, self.unwrapped.screen_width, 3)) 49 | self.observation_space = self.unwrapped.observation_space 50 | self.unwrapped.allowed_actions = [0, 10, 11, 13, 14, 15] # Must match exactly and in order the actions in the config file 51 | # (The number is the action number based on controls.md) 52 | # This will only enable these actions out of the 43 available buttons 53 | 54 | # Converting to Discrete action space 55 | discrete_actions = self.unwrapped.allowed_actions 56 | self.action_space = gym.spaces.DiscreteToMultiDiscrete(self.unwrapped.action_space, discrete_actions) 57 | 58 | # Alternative to convert to continuous action space 59 | # box_actions = self.unwrapped.allowed_actions 60 | # self.action_space = gym.spaces.BoxToMultiDiscrete(self.unwrapped.action_space, box_actions) 61 | 62 | # Bouding method to env 63 | self.unwrapped._customize_game = types.MethodType(_customize_game, self.unwrapped) 64 | 65 | def _step(self, action): 66 | return self.unwrapped._step(self.action_space(action)) 67 | 68 | return CustomGameWrapper 69 | -------------------------------------------------------------------------------- /jobs/job_PM2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.01,loss_type=count,c=0.005,name=lr:0.01-loss_type:count-c:0.005 3 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp 
bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.01,loss_type=count,c=0.05,name=lr:0.01-loss_type:count-c:0.05 4 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.001,loss_type=count,c=0.005,name=lr:0.001-loss_type:count-c:0.005 5 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.001,loss_type=count,c=0.05,name=lr:0.001-loss_type:count-c:0.05 6 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.0001,loss_type=count,c=0.005,name=lr:0.0001-loss_type:count-c:0.005 7 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.0001,loss_type=count,c=0.05,name=lr:0.0001-loss_type:count-c:0.05 8 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.01,loss_type=count,c=0.005,name=lr:0.01-loss_type:count-c:0.005 9 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.01,loss_type=count,c=0.05,name=lr:0.01-loss_type:count-c:0.05 10 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.001,loss_type=count,c=0.005,name=lr:0.001-loss_type:count-c:0.005 11 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.001,loss_type=count,c=0.05,name=lr:0.001-loss_type:count-c:0.05 12 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.0001,loss_type=count,c=0.005,name=lr:0.0001-loss_type:count-c:0.005 13 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.0001,loss_type=count,c=0.05,name=lr:0.0001-loss_type:count-c:0.05 14 | 
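Sweep scripts like job_PM.sh and job_PM2.sh above are plain lists of `python3 submit.py` calls; jobs/expand_jobs_over_games.py writes files of exactly this shape. Below is a minimal usage sketch of `expand_job` (the output file name, the chosen hyperparameter values, and the path setup are illustrative assumptions, not taken from the repository):

# Sketch only: assumes this is run from the jobs/ directory with the
# repository's src/ directory on sys.path, so that the module-level
# `from common.visualize import make_name` inside expand_jobs_over_games resolves.
from expand_jobs_over_games import expand_job

expand_job(
    games=['Pendulum-v0s', 'MountainCarContinuous-v0'],
    job='job_example.sh',   # written to the current working directory
    hp_setup='n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59',
    hp='bound=beta,n_t=20000000,n_eps=500000,V_decision=max,loss_type=count,c=0.05',
    item1='lr', seq1=['0.01', '0.001', '0.0001'],
)
# The resulting job_example.sh starts with '#!/bin/sh' and contains one line per
# (game, lr) pair, e.g. (abbreviated):
# python3 submit.py --hpsetup game=Pendulum-v0s,n_rep=5,... --hp bound=beta,...,lr=0.01,name=lr:0.01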
-------------------------------------------------------------------------------- /src/common/rl/make_game.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Wrapper to generate the game environments 4 | @author: thomas 5 | """ 6 | import gym 7 | import numpy as np 8 | from gym import spaces 9 | import os 10 | 11 | from .envs.chain import Chain,ChainOrdered,ChainLoop 12 | from .wrappers.control import NormalizeWrapper,ReparametrizeWrapper,PILCOWrapper,ScaleRewardWrapper 13 | from .wrappers.atari import ClipRewardWrapper 14 | 15 | # Register deterministic FrozenLakes 16 | from gym.envs.registration import register 17 | #register( 18 | # id='FrozenLakeNotSlippery-v0', 19 | # entry_point='gym.envs.toy_text:FrozenLakeEnv', 20 | # kwargs={'map_name' : '4x4', 'is_slippery': False}, 21 | # max_episode_steps=100, 22 | # reward_threshold=0.78, # optimum = .8196 23 | #) 24 | #register( 25 | # id='FrozenLakeNotSlippery-v1', 26 | # entry_point='gym.envs.toy_text:FrozenLakeEnv', 27 | # kwargs={'map_name' : '8x8', 'is_slippery': False}, 28 | # max_episode_steps=100, 29 | # reward_threshold=0.78, # optimum = .8196 30 | #) 31 | 32 | def get_base_env(env): 33 | ''' removes all wrappers ''' 34 | while hasattr(env,'env'): 35 | env = env.env 36 | return env 37 | 38 | def is_atari_game(env): 39 | ''' Verify whether game uses the Arcade Learning Environment ''' 40 | env = get_base_env(env) 41 | return hasattr(env,'ale') 42 | 43 | def prepare_gym_env(game): 44 | ''' Modifications to Env ''' 45 | print(game) 46 | name,version = game.rsplit('-',1) 47 | if len(version) > 2: 48 | modify = version[2:] 49 | game = name + '-' + version[:2] 50 | else: 51 | modify = '' 52 | 53 | env = gym.make(game) 54 | # remove timelimit wrapper 55 | if type(env) == gym.wrappers.time_limit.TimeLimit: 56 | env = env.env 57 | 58 | print(modify) 59 | # prepare control 60 | if 'n' in modify and type(env.observation_space) == gym.spaces.Box: 61 | env = NormalizeWrapper(env) 62 | if 'r' in modify: 63 | env = ReparametrizeWrapper(env) 64 | if 'p' in modify: 65 | env = PILCOWrapper(env) 66 | if 's' in modify: 67 | env = ScaleRewardWrapper(env) 68 | 69 | if 'CartPole' in game: 70 | env.observation_space = gym.spaces.Box(np.array([-4.8,-10,-4.8,-10]),np.array([4.8,10,4.8,10])) 71 | 72 | # prepare atari 73 | if is_atari_game(env): 74 | env = prepare_atari_env(env) 75 | return env 76 | 77 | def prepare_atari_env(Env,frame_skip=3,repeat_action_prob=0.0,reward_clip=True): 78 | ''' Initialize an Atari environment ''' 79 | env = get_base_env(Env) 80 | env.ale.setFloat('repeat_action_probability'.encode('utf-8'), repeat_action_prob) 81 | env.frame_skip = frame_skip 82 | if reward_clip: 83 | Env = ClipRewardWrapper(Env) 84 | return Env 85 | 86 | def prepare_chain_env(game): 87 | game,n = game.split('-') 88 | if game == 'Chain': 89 | Env = Chain(int(n)) 90 | elif game == 'ChainOrdered': 91 | Env = ChainOrdered(int(n)) 92 | elif game == 'ChainLoop': 93 | Env = ChainLoop(int(n)) 94 | return Env 95 | 96 | def make_game(game): 97 | ''' Overall wrapper for gym.make_game ''' 98 | os.system('export LD_LIBRARY_PATH=`$HOME/.mujoco/mjpro150/bin`') # necessary for mujoco 99 | if 'Chain' in game: 100 | Env = prepare_chain_env(game) 101 | else: 102 | Env = prepare_gym_env(game) 103 | return Env 104 | 105 | def check_space(space): 106 | '''check the properties of the env ''' 107 | if isinstance(space,spaces.Box): 108 | dim = space.shape # should the zero be here? 
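        # Note: for a gym Box space, `space.shape` is a tuple (e.g. (3,) for
        # Pendulum observations), while the Discrete branch below returns the
        # plain integer `space.n`; the question above presumably asks whether
        # `space.shape[0]` was intended so that both branches return a scalar.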
109 | discrete = False 110 | elif isinstance(space,spaces.Discrete): 111 | dim = space.n 112 | discrete = True 113 | else: 114 | raise NotImplementedError 115 | return dim, discrete -------------------------------------------------------------------------------- /src/common/rl/envs/chain.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Chain environment 4 | @author: thomas 5 | """ 6 | 7 | import gym.spaces 8 | import numpy as np 9 | 10 | class ChainOrdered(object): 11 | ''' Chain domain ''' 12 | 13 | def __init__(self,n=10): 14 | # n = length of chain 15 | self.action_space = gym.spaces.Discrete(2) 16 | self.observation_space = gym.spaces.Discrete(n+1) 17 | self.n = n 18 | self.state = 0 19 | self.correct = np.repeat(1,n) 20 | 21 | def reset(self): 22 | self.state = 0 23 | return self.state 24 | 25 | def step(self,a): 26 | if a == 0: 27 | # move back 28 | self.state = 0 29 | r = 0 30 | terminal = True 31 | elif a == 1: 32 | # move forward 33 | self.state += 1 34 | if self.state == self.n: 35 | r = 1 36 | terminal = True 37 | else: 38 | r = 0 39 | terminal = False 40 | else: 41 | raise ValueError('Action not possible') 42 | 43 | return self.state,r,terminal, {} 44 | 45 | def seed(self,seed): 46 | pass # deterministic anyway 47 | 48 | class Chain(object): 49 | ''' Chain domain ''' 50 | 51 | def __init__(self,n=10): 52 | # n = length of chain 53 | self.action_space = gym.spaces.Discrete(2) 54 | self.observation_space = gym.spaces.Discrete(n+1) 55 | self.n = n 56 | self.state = 0 57 | self.correct = np.random.randint(0,2,n) # correct action in each state 58 | self.counts = np.zeros((self.n,2)) 59 | 60 | def reset(self): 61 | self.state = 0 62 | return self.state 63 | 64 | def step(self,a): 65 | self.counts[self.state,a] += 1 66 | if a != self.correct[self.state]: 67 | # move back 68 | self.state = 0 69 | r = 0 70 | terminal = True 71 | elif a == self.correct[self.state]: 72 | # move forward 73 | self.state += 1 74 | if self.state == self.n: 75 | r = 1 76 | terminal = True 77 | else: 78 | r = 0 79 | terminal = False 80 | else: 81 | raise ValueError('Action not possible') 82 | 83 | return self.state,r,terminal, {} 84 | 85 | def seed(self,seed): 86 | pass # deterministic anyway 87 | 88 | 89 | class ChainLoop(object): 90 | ''' Chain domain ''' 91 | 92 | def __init__(self,n=10): 93 | # n = length of chain 94 | self.action_space = gym.spaces.Discrete(2) 95 | self.observation_space = gym.spaces.Discrete(n+1) 96 | self.n = n 97 | self.state = 0 98 | self.correct = np.random.randint(0,2,n) # correct action in each state 99 | self.counts = np.zeros((self.n,2)) 100 | 101 | def reset(self): 102 | self.state = 0 103 | return self.state 104 | 105 | def step(self,a): 106 | self.counts[self.state,a] += 1 107 | if a != self.correct[self.state]: 108 | # move back 109 | self.state = 0 110 | r = 0 111 | terminal = False 112 | elif a == self.correct[self.state]: 113 | # move forward 114 | self.state += 1 115 | if self.state == self.n: 116 | r = 1 117 | terminal = True 118 | else: 119 | r = 0 120 | terminal = False 121 | else: 122 | raise ValueError('Action not possible') 123 | 124 | return self.state,r,terminal, {} 125 | 126 | def seed(self,seed): 127 | pass # deterministic anyway 128 | 129 | # Test 130 | if __name__ == '__main__': 131 | Env = ChainOrdered() 132 | s = Env.reset() 133 | for i in range(500): 134 | a = Env.action_space.sample() 135 | s,r,terminal,_ = Env.step(a) 136 | if terminal: 137 | print('Died in step',i,'with 
reward',r,' restarting') 138 | s = Env.reset() 139 | print('Finished') -------------------------------------------------------------------------------- /jobs/job_p.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.001,random_action_frac=0.0,use_prior=True,name=lr:0.001-random_action_frac:0.0-use_prior:True 3 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.001,random_action_frac=0.0,use_prior=False,name=lr:0.001-random_action_frac:0.0-use_prior:False 4 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.001,random_action_frac=1.0,use_prior=True,name=lr:0.001-random_action_frac:1.0-use_prior:True 5 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.001,random_action_frac=1.0,use_prior=False,name=lr:0.001-random_action_frac:1.0-use_prior:False 6 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.0001,random_action_frac=0.0,use_prior=True,name=lr:0.0001-random_action_frac:0.0-use_prior:True 7 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.0001,random_action_frac=0.0,use_prior=False,name=lr:0.0001-random_action_frac:0.0-use_prior:False 8 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.0001,random_action_frac=1.0,use_prior=True,name=lr:0.0001-random_action_frac:1.0-use_prior:True 9 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.0001,random_action_frac=1.0,use_prior=False,name=lr:0.0001-random_action_frac:1.0-use_prior:False 10 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp 
bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.00001,random_action_frac=0.0,use_prior=True,name=lr:0.00001-random_action_frac:0.0-use_prior:True 11 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.00001,random_action_frac=0.0,use_prior=False,name=lr:0.00001-random_action_frac:0.0-use_prior:False 12 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.00001,random_action_frac=1.0,use_prior=True,name=lr:0.00001-random_action_frac:1.0-use_prior:True 13 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.00001,random_action_frac=1.0,use_prior=False,name=lr:0.00001-random_action_frac:1.0-use_prior:False 14 | -------------------------------------------------------------------------------- /src/common/rl/wrappers/control.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed May 9 14:57:40 2018 5 | 6 | @author: thomas 7 | """ 8 | import gym 9 | import numpy as np 10 | from scipy.stats import multivariate_normal 11 | import sklearn.preprocessing 12 | 13 | class ObservationRewardWrapper(gym.Wrapper): 14 | ''' My own base class - allows for both observation and reward modification ''' 15 | def step(self, action): 16 | observation, reward, done, info = self.env.step(action) 17 | return self.observation(observation), self.reward(reward), done, info 18 | 19 | def reset(self): 20 | observation = self.env.reset() 21 | return self.observation(observation) 22 | 23 | def observation(self, observation): 24 | return observation 25 | 26 | def reward(self, reward): 27 | return reward 28 | 29 | def get_name(env): 30 | while True: 31 | if hasattr(env,'_spec'): 32 | name = env._spec.id 33 | break 34 | elif hasattr(env,'spec'): 35 | name = env.spec.id 36 | break 37 | else: 38 | env = env.env 39 | return name 40 | 41 | class NormalizeWrapper(ObservationRewardWrapper): 42 | ''' normalizes the input data range ''' 43 | def __init__(self, env): 44 | ObservationRewardWrapper.__init__(self, env) 45 | self.name = get_name(env) 46 | observation_examples = np.array([env.observation_space.sample() for x in range(10000)]) 47 | self.scaler = sklearn.preprocessing.StandardScaler() 48 | self.scaler.fit(observation_examples) 49 | 50 | def observation(self, observation): 51 | return self.scaler.transform([observation])[0] 52 | 53 | class ScaleRewardWrapper(ObservationRewardWrapper): 54 | 55 | def __init__(self, env): 56 | gym.RewardWrapper.__init__(self, env) 57 | self.name = get_name(env) 58 | 59 | def reward(self, reward): 60 | """ Rescale reward """ 61 | if 'Pendulum' in self.name: 62 | return np.float32(reward/1000.0) 63 | #elif 'MountainCarContinuous' in self.name: 64 | # return np.float32(reward/500.0) 65 | elif 'Lunarlander' in self.name: 66 | return np.float32(reward/250.0) 67 | else: 68 | return reward 69 | 
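# Usage sketch: these wrappers are applied by prepare_gym_env in make_game.py
# based on the modifier suffix after the version tag, e.g. 'Pendulum-v0s' is
# parsed as game 'Pendulum-v0' with modifier 's':
#
#   env = gym.make('Pendulum-v0')
#   env = ScaleRewardWrapper(env)    # 's': Pendulum rewards divided by 1000
#
# With modifier 'ns', NormalizeWrapper is applied first (standardizing
# observations) and ScaleRewardWrapper is stacked on top, following the
# wrapper order in prepare_gym_env.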
70 | class ReparametrizeWrapper(ObservationRewardWrapper): 71 | 72 | def __init__(self, env): 73 | gym.RewardWrapper.__init__(self, env) 74 | self.name = get_name(env) 75 | 76 | def step(self, action): 77 | observation, reward, terminal, info = self.env.step(action) 78 | return self.observation(observation), self.reward(reward,terminal), terminal, info 79 | 80 | def reward(self,r,terminal): 81 | if 'CartPole' in self.name: 82 | if terminal: 83 | r = -1 84 | else: 85 | r = 0.005 86 | elif 'MountainCar' in self.name: 87 | if terminal: 88 | r = 1 89 | else: 90 | r = -0.005 91 | elif 'Acrobot' in self.name: 92 | if terminal: 93 | r = 1 94 | else: 95 | r = -0.005 96 | return r 97 | 98 | class PILCOWrapper(ObservationRewardWrapper): 99 | 100 | def __init__(self, env): 101 | gym.RewardWrapper.__init__(self, env) 102 | while True: 103 | if hasattr(env,'_spec'): 104 | self.name = env._spec.id 105 | break 106 | else: 107 | env = env.env 108 | 109 | def step(self, action): 110 | observation, reward, terminal, info = self.env.step(action) 111 | return self.observation(observation), self.reward(observation), terminal, info 112 | 113 | def reward(self,s): 114 | if 'CartPole' in self.name: 115 | target = np.array([0.0,0.0,0.0,0.0]) 116 | elif 'Acrobot' in self.name: 117 | target = np.array([1.0]) 118 | s = -np.cos(s[0]) - np.cos(s[1] + s[0]) 119 | elif 'MountainCar' in self.name: 120 | target = np.array([0.5]) 121 | s = s[0] 122 | elif 'Pendulum' in self.name: 123 | target = np.array([0.0,0.0]) 124 | else: 125 | raise ValueError('no PILCO reward mofication for this game') 126 | return 1 - multivariate_normal.pdf(s,mean=target) -------------------------------------------------------------------------------- /jobs/job_backup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | python3 submit.py --hpsetup game=Pendulum-v0s,name=mcts20lr01,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=20,lr=0.01 4 | python3 submit.py --hpsetup game=Pendulum-v0s,name=mcts50lr01,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=50,lr=0.01 5 | python3 submit.py --hpsetup game=Pendulum-v0s,name=mcts150lr01,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=150,lr=0.01 6 | 7 | python3 submit.py --hpsetup game=Pendulum-v0s,name=mcts20lr001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=20,lr=0.001 8 | python3 submit.py --hpsetup game=Pendulum-v0s,name=mcts50lr001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=50,lr=0.001 9 | python3 submit.py --hpsetup game=Pendulum-v0s,name=mcts150lr001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=150,lr=0.001 10 | 11 | python3 submit.py --hpsetup 
game=Pendulum-v0s,name=mcts20lr0001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=20,lr=0.0001 12 | python3 submit.py --hpsetup game=Pendulum-v0s,name=mcts50lr0001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=50,lr=0.0001 13 | python3 submit.py --hpsetup game=Pendulum-v0s,name=mcts150lr0001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=150,lr=0.0001 14 | 15 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,name=mcts20lr01,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=20,lr=0.01 16 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,name=mcts50lr01,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=50,lr=0.01 17 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,name=mcts150lr01,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=150,lr=0.01 18 | 19 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,name=mcts20lr001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=20,lr=0.001 20 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,name=mcts50lr001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=50,lr=0.001 21 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,name=mcts150lr001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=150,lr=0.001 22 | 23 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,name=mcts20lr0001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=20,lr=0.0001 24 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,name=mcts50lr0001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=50,lr=0.0001 25 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,name=mcts150lr0001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=150,lr=0.0001 26 | -------------------------------------------------------------------------------- /src/common/hps_setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | General hyperparameters for hyperlooping and slurm setup 4 | @author: thomas 5 | """ 6 | 7 | import copy 8 | 9 | def get_hps_setup(): 10 | ''' Hyperparameter settings 
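    for slurm submission and the hyperparameter loop: item1..item4 name the
    hyperparameters to sweep, seq1..seq4 hold the matching value lists (given
    on the command line as '+'-separated strings, e.g. seq1=50+200, and split
    in HParams.parse below), and n_rep sets the number of repetitions.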
''' 11 | return HParams( 12 | # General 13 | game = 'None', # Environment name 14 | name = 'None', # Name of experiment 15 | 16 | # Slurm parameters 17 | slurm = False, 18 | slurm_qos = 'short', 19 | slurm_time = '3:59:59', 20 | cpu_per_task = 2, 21 | mem_per_cpu = 2048, 22 | distributed = False, 23 | n_jobs = 16, # distribute over n_jobs processes 24 | 25 | # Hyperparameter looping 26 | n_rep = 1, 27 | rep = 0, # repetition index 28 | loop_hyper = False, 29 | item1 = None, 30 | seq1 = [None], 31 | item2 = None, 32 | seq2 = [None], 33 | item3 = None, 34 | seq3 = [None], 35 | item4 = None, 36 | seq4 = [None], 37 | ) 38 | 39 | def hps_to_list(hps): 40 | out=[] 41 | hps_dict = copy.deepcopy(hps.__dict__) 42 | try: 43 | del hps_dict['_items'] 44 | except: 45 | pass 46 | for k,v in hps_dict.items(): 47 | if type(v) == list: 48 | v='+'.join(str(x) for x in v) 49 | if not (v is None or v == 'None'): # should not write the default hyperloop settings 50 | out.append('{}={}'.format(k,v)) 51 | out.sort() 52 | return ','.join(out) 53 | 54 | def hps_to_dict(hps): 55 | hps_dict = copy.deepcopy(hps.__dict__) 56 | try: 57 | del hps_dict['_items'] 58 | except: 59 | pass 60 | return hps_dict 61 | 62 | class HParams(object): 63 | 64 | def __init__(self, **kwargs): 65 | self._items = {} 66 | for k, v in kwargs.items(): 67 | self._set(k, v) 68 | 69 | def _set(self, k, v): 70 | self._items[k] = v 71 | setattr(self, k, v) 72 | 73 | def _get(self,k): 74 | return self._items[k] 75 | 76 | def __eq__(self, other) : 77 | return self.__dict__ == other.__dict__ 78 | 79 | def parse(self, str_value,hps_extra=None): 80 | hps = HParams(**self._items) 81 | for entry in str_value.strip().split(","): 82 | entry = entry.strip() 83 | if not entry: 84 | continue 85 | key, sep, value = entry.partition("=") 86 | if not sep: 87 | raise ValueError("Unable to parse: %s" % entry) 88 | try: 89 | default_value = hps._items[key] 90 | except: 91 | print('Cant parse key {}, skipping'.format(key)) 92 | continue 93 | if isinstance(default_value, bool): 94 | hps._set(key, value.lower() == "true") 95 | elif isinstance(default_value, int): 96 | hps._set(key, int(value)) 97 | elif default_value is None and value == 'None': 98 | hps._set(key, None) 99 | elif isinstance(default_value, float): 100 | hps._set(key, float(value)) 101 | elif isinstance(default_value, list): 102 | value = value.split('+') 103 | default_inlist = hps._items[key][0] 104 | if key == 'seq1': 105 | if hps_extra is not None: 106 | default_inlist = hps_extra._items[hps._items['item1']] 107 | else: 108 | default_inlist = hps._items[hps._items['item1']] 109 | if key == 'seq2': 110 | if hps_extra is not None: 111 | default_inlist = hps_extra._items[hps._items['item2']] 112 | else: 113 | default_inlist = hps._items[hps._items['item2']] 114 | if key == 'seq3': 115 | if hps_extra is not None: 116 | default_inlist = hps_extra._items[hps._items['item3']] 117 | else: 118 | default_inlist = hps._items[hps._items['item3']] 119 | if isinstance(default_inlist, bool): 120 | hps._set(key, [i.lower() == "true" for i in value]) 121 | elif isinstance(default_inlist, int): 122 | hps._set(key, [int(i) for i in value]) 123 | elif isinstance(default_inlist, float): 124 | hps._set(key, [float(i) for i in value]) 125 | else: 126 | hps._set(key,value) # string 127 | else: 128 | hps._set(key, value) 129 | return hps 130 | -------------------------------------------------------------------------------- /src/rl/wrappers/action_space.py: 
-------------------------------------------------------------------------------- 1 | import gym 2 | from .multi_discrete import BoxToMultiDiscrete, DiscreteToMultiDiscrete 3 | 4 | # Constants 5 | NUM_ACTIONS = 43 6 | ALLOWED_ACTIONS = [ 7 | [0, 10, 11], # 0 - Basic 8 | [0, 10, 11, 13, 14, 15], # 1 - Corridor 9 | [0, 14, 15], # 2 - DefendCenter 10 | [0, 14, 15], # 3 - DefendLine 11 | [13, 14, 15], # 4 - HealthGathering 12 | [13, 14, 15], # 5 - MyWayHome 13 | [0, 14, 15], # 6 - PredictPosition 14 | [10, 11], # 7 - TakeCover 15 | [x for x in range(NUM_ACTIONS) if x != 33], # 8 - Deathmatch 16 | ] 17 | 18 | __all__ = [ 'ToDiscrete', 'ToBox' ] 19 | 20 | def ToDiscrete(config): 21 | # Config can be 'minimal', 'constant-7', 'constant-17', 'full' 22 | 23 | class ToDiscreteWrapper(gym.Wrapper): 24 | """ 25 | Doom wrapper to convert MultiDiscrete action space to Discrete 26 | 27 | config: 28 | - minimal - Will only use the levels' allowed actions (+ NOOP) 29 | - constant-7 - Will use the 7 minimum actions (+NOOP) to complete all levels 30 | - constant-17 - Will use the 17 most common actions (+NOOP) to complete all levels 31 | - full - Will use all available actions (+ NOOP) 32 | 33 | list of commands: 34 | - minimal: 35 | Basic: NOOP, ATTACK, MOVE_RIGHT, MOVE_LEFT 36 | Corridor: NOOP, ATTACK, MOVE_RIGHT, MOVE_LEFT, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 37 | DefendCenter NOOP, ATTACK, TURN_RIGHT, TURN_LEFT 38 | DefendLine: NOOP, ATTACK, TURN_RIGHT, TURN_LEFT 39 | HealthGathering: NOOP, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 40 | MyWayHome: NOOP, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 41 | PredictPosition: NOOP, ATTACK, TURN_RIGHT, TURN_LEFT 42 | TakeCover: NOOP, MOVE_RIGHT, MOVE_LEFT 43 | Deathmatch: NOOP, ALL COMMANDS (Deltas are limited to [0,1] range and will not work properly) 44 | 45 | - constant-7: NOOP, ATTACK, MOVE_RIGHT, MOVE_LEFT, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT, SELECT_NEXT_WEAPON 46 | 47 | - constant-17: NOOP, ATTACK, JUMP, CROUCH, TURN180, RELOAD, SPEED, STRAFE, MOVE_RIGHT, MOVE_LEFT, MOVE_BACKWARD 48 | MOVE_FORWARD, TURN_RIGHT, TURN_LEFT, LOOK_UP, LOOK_DOWN, SELECT_NEXT_WEAPON, SELECT_PREV_WEAPON 49 | """ 50 | def __init__(self, env): 51 | super(ToDiscreteWrapper, self).__init__(env) 52 | if config == 'minimal': 53 | allowed_actions = ALLOWED_ACTIONS[self.unwrapped.level] 54 | elif config == 'constant-7': 55 | allowed_actions = [0, 10, 11, 13, 14, 15, 31] 56 | elif config == 'constant-17': 57 | allowed_actions = [0, 2, 3, 4, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 31, 32] 58 | elif config == 'full': 59 | allowed_actions = None 60 | else: 61 | raise gym.error.Error('Invalid configuration. 
Valid options are "minimal", "constant-7", "constant-17", "full"') 62 | self.action_space = DiscreteToMultiDiscrete(self.action_space, allowed_actions) 63 | def _step(self, action): 64 | return self.env._step(self.action_space(action)) 65 | 66 | return ToDiscreteWrapper 67 | 68 | def ToBox(config): 69 | # Config can be 'minimal', 'constant-7', 'constant-17', 'full' 70 | 71 | class ToBoxWrapper(gym.Wrapper): 72 | """ 73 | Doom wrapper to convert MultiDiscrete action space to Box 74 | 75 | config: 76 | - minimal - Will only use the levels' allowed actions 77 | - constant-7 - Will use the 7 minimum actions to complete all levels 78 | - constant-17 - Will use the 17 most common actions to complete all levels 79 | - full - Will use all available actions 80 | 81 | list of commands: 82 | - minimal: 83 | Basic: ATTACK, MOVE_RIGHT, MOVE_LEFT 84 | Corridor: ATTACK, MOVE_RIGHT, MOVE_LEFT, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 85 | DefendCenter ATTACK, TURN_RIGHT, TURN_LEFT 86 | DefendLine: ATTACK, TURN_RIGHT, TURN_LEFT 87 | HealthGathering: MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 88 | MyWayHome: MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 89 | PredictPosition: ATTACK, TURN_RIGHT, TURN_LEFT 90 | TakeCover: MOVE_RIGHT, MOVE_LEFT 91 | Deathmatch: ALL COMMANDS 92 | 93 | - constant-7: ATTACK, MOVE_RIGHT, MOVE_LEFT, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT, SELECT_NEXT_WEAPON 94 | 95 | - constant-17: ATTACK, JUMP, CROUCH, TURN180, RELOAD, SPEED, STRAFE, MOVE_RIGHT, MOVE_LEFT, MOVE_BACKWARD 96 | MOVE_FORWARD, TURN_RIGHT, TURN_LEFT, LOOK_UP, LOOK_DOWN, SELECT_NEXT_WEAPON, SELECT_PREV_WEAPON 97 | """ 98 | def __init__(self, env): 99 | super(ToBoxWrapper, self).__init__(env) 100 | if config == 'minimal': 101 | allowed_actions = ALLOWED_ACTIONS[self.unwrapped.level] 102 | elif config == 'constant-7': 103 | allowed_actions = [0, 10, 11, 13, 14, 15, 31] 104 | elif config == 'constant-17': 105 | allowed_actions = [0, 2, 3, 4, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 31, 32] 106 | elif config == 'full': 107 | allowed_actions = None 108 | else: 109 | raise gym.error.Error('Invalid configuration. 
Valid options are "minimal", "constant-7", "constant-17", "full"') 110 | self.action_space = BoxToMultiDiscrete(self.action_space, allowed_actions) 111 | def _step(self, action): 112 | return self.env._step(self.action_space(action)) 113 | 114 | return ToBoxWrapper 115 | -------------------------------------------------------------------------------- /src/common/rl/wrappers/doom/action_space.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from .multi_discrete import BoxToMultiDiscrete, DiscreteToMultiDiscrete 3 | 4 | # Constants 5 | NUM_ACTIONS = 43 6 | ALLOWED_ACTIONS = [ 7 | [0, 10, 11], # 0 - Basic 8 | [0, 10, 11, 13, 14, 15], # 1 - Corridor 9 | [0, 14, 15], # 2 - DefendCenter 10 | [0, 14, 15], # 3 - DefendLine 11 | [13, 14, 15], # 4 - HealthGathering 12 | [13, 14, 15], # 5 - MyWayHome 13 | [0, 14, 15], # 6 - PredictPosition 14 | [10, 11], # 7 - TakeCover 15 | [x for x in range(NUM_ACTIONS) if x != 33], # 8 - Deathmatch 16 | ] 17 | 18 | __all__ = [ 'ToDiscrete', 'ToBox' ] 19 | 20 | def ToDiscrete(config): 21 | # Config can be 'minimal', 'constant-7', 'constant-17', 'full' 22 | 23 | class ToDiscreteWrapper(gym.Wrapper): 24 | """ 25 | Doom wrapper to convert MultiDiscrete action space to Discrete 26 | 27 | config: 28 | - minimal - Will only use the levels' allowed actions (+ NOOP) 29 | - constant-7 - Will use the 7 minimum actions (+NOOP) to complete all levels 30 | - constant-17 - Will use the 17 most common actions (+NOOP) to complete all levels 31 | - full - Will use all available actions (+ NOOP) 32 | 33 | list of commands: 34 | - minimal: 35 | Basic: NOOP, ATTACK, MOVE_RIGHT, MOVE_LEFT 36 | Corridor: NOOP, ATTACK, MOVE_RIGHT, MOVE_LEFT, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 37 | DefendCenter NOOP, ATTACK, TURN_RIGHT, TURN_LEFT 38 | DefendLine: NOOP, ATTACK, TURN_RIGHT, TURN_LEFT 39 | HealthGathering: NOOP, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 40 | MyWayHome: NOOP, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 41 | PredictPosition: NOOP, ATTACK, TURN_RIGHT, TURN_LEFT 42 | TakeCover: NOOP, MOVE_RIGHT, MOVE_LEFT 43 | Deathmatch: NOOP, ALL COMMANDS (Deltas are limited to [0,1] range and will not work properly) 44 | 45 | - constant-7: NOOP, ATTACK, MOVE_RIGHT, MOVE_LEFT, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT, SELECT_NEXT_WEAPON 46 | 47 | - constant-17: NOOP, ATTACK, JUMP, CROUCH, TURN180, RELOAD, SPEED, STRAFE, MOVE_RIGHT, MOVE_LEFT, MOVE_BACKWARD 48 | MOVE_FORWARD, TURN_RIGHT, TURN_LEFT, LOOK_UP, LOOK_DOWN, SELECT_NEXT_WEAPON, SELECT_PREV_WEAPON 49 | """ 50 | def __init__(self, env): 51 | super(ToDiscreteWrapper, self).__init__(env) 52 | if config == 'minimal': 53 | allowed_actions = ALLOWED_ACTIONS[self.unwrapped.level] 54 | elif config == 'constant-7': 55 | allowed_actions = [0, 10, 11, 13, 14, 15, 31] 56 | elif config == 'constant-17': 57 | allowed_actions = [0, 2, 3, 4, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 31, 32] 58 | elif config == 'full': 59 | allowed_actions = None 60 | else: 61 | raise gym.error.Error('Invalid configuration. 
Valid options are "minimal", "constant-7", "constant-17", "full"') 62 | self.action_space = DiscreteToMultiDiscrete(self.action_space, allowed_actions) 63 | def _step(self, action): 64 | return self.env._step(self.action_space(action)) 65 | 66 | return ToDiscreteWrapper 67 | 68 | def ToBox(config): 69 | # Config can be 'minimal', 'constant-7', 'constant-17', 'full' 70 | 71 | class ToBoxWrapper(gym.Wrapper): 72 | """ 73 | Doom wrapper to convert MultiDiscrete action space to Box 74 | 75 | config: 76 | - minimal - Will only use the levels' allowed actions 77 | - constant-7 - Will use the 7 minimum actions to complete all levels 78 | - constant-17 - Will use the 17 most common actions to complete all levels 79 | - full - Will use all available actions 80 | 81 | list of commands: 82 | - minimal: 83 | Basic: ATTACK, MOVE_RIGHT, MOVE_LEFT 84 | Corridor: ATTACK, MOVE_RIGHT, MOVE_LEFT, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 85 | DefendCenter ATTACK, TURN_RIGHT, TURN_LEFT 86 | DefendLine: ATTACK, TURN_RIGHT, TURN_LEFT 87 | HealthGathering: MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 88 | MyWayHome: MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 89 | PredictPosition: ATTACK, TURN_RIGHT, TURN_LEFT 90 | TakeCover: MOVE_RIGHT, MOVE_LEFT 91 | Deathmatch: ALL COMMANDS 92 | 93 | - constant-7: ATTACK, MOVE_RIGHT, MOVE_LEFT, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT, SELECT_NEXT_WEAPON 94 | 95 | - constant-17: ATTACK, JUMP, CROUCH, TURN180, RELOAD, SPEED, STRAFE, MOVE_RIGHT, MOVE_LEFT, MOVE_BACKWARD 96 | MOVE_FORWARD, TURN_RIGHT, TURN_LEFT, LOOK_UP, LOOK_DOWN, SELECT_NEXT_WEAPON, SELECT_PREV_WEAPON 97 | """ 98 | def __init__(self, env): 99 | super(ToBoxWrapper, self).__init__(env) 100 | if config == 'minimal': 101 | allowed_actions = ALLOWED_ACTIONS[self.unwrapped.level] 102 | elif config == 'constant-7': 103 | allowed_actions = [0, 10, 11, 13, 14, 15, 31] 104 | elif config == 'constant-17': 105 | allowed_actions = [0, 2, 3, 4, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 31, 32] 106 | elif config == 'full': 107 | allowed_actions = None 108 | else: 109 | raise gym.error.Error('Invalid configuration. 
Valid options are "minimal", "constant-7", "constant-17", "full"') 110 | self.action_space = BoxToMultiDiscrete(self.action_space, allowed_actions) 111 | def _step(self, action): 112 | return self.env._step(self.action_space(action)) 113 | 114 | return ToBoxWrapper 115 | -------------------------------------------------------------------------------- /src/rl/rewardwrapper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Chain environment 4 | @author: thomas 5 | """ 6 | 7 | import gym.spaces 8 | import gym 9 | import numpy as np 10 | 11 | from gym import Wrapper 12 | 13 | class NormalizeWrapper(object): 14 | ''' Heuristically normalizes the reward scale for CartPole and MountainCar ''' 15 | 16 | def __init__(self,name): 17 | # n = length of chain 18 | if 'CartPole' in name: 19 | self.env = gym.make('CartPole-v0') 20 | elif 'MountainCar' in name: 21 | self.env = gym.make('MountainCar-v0') 22 | self.name = name 23 | self.action_space = self.env.action_space 24 | self.observation_space = self.env.observation_space 25 | 26 | def reset(self): 27 | return self.env.reset() 28 | 29 | def step(self,a): 30 | s,r,terminal,_ = self.env.step(a) 31 | r = r/50 32 | return s,r,terminal, _ 33 | 34 | class PILCOWrapper(object): 35 | ''' Wraps according to PILCO ''' 36 | 37 | def __init__(self,name): 38 | # n = length of chain 39 | if 'CartPole' in name: 40 | self.env = gym.make('CartPole-v0') 41 | elif 'MountainCar' in name: 42 | self.env = gym.make('MountainCar-v0') 43 | self.name = name 44 | self.action_space = self.env.action_space 45 | self.observation_space = self.env.observation_space 46 | 47 | def reset(self): 48 | return self.env.reset() 49 | 50 | def step(self,a): 51 | s,r,terminal,_ = self.env.step(a) 52 | r = pilco_reward(s,self.name) 53 | return s,r,terminal, _ 54 | 55 | def pilco_reward(s,game='Cartpole-v0'): 56 | ''' use modified reward function as in Pilco ''' 57 | from scipy.stats import multivariate_normal 58 | if game == 'CartPole-vp': 59 | target = np.array([0.0,0.0,0.0,0.0]) 60 | elif game == 'Acrobot-vp': 61 | target = np.array([1.0]) 62 | s = -np.cos(s[0]) - np.cos(s[1] + s[0]) 63 | elif game == 'MountainCar-vp': 64 | target = np.array([0.5]) 65 | s = s[0] 66 | elif game == 'Pendulum-vp': 67 | target = np.array([0.0,0.0]) 68 | else: 69 | raise ValueError('no PILCO reward mofication for this game') 70 | r = 1 - multivariate_normal.pdf(s,mean=target) 71 | return r 72 | 73 | class RewardWrapper2(Wrapper): 74 | env = None 75 | 76 | def __init__(self, env): 77 | self.env = env 78 | self.action_space = self.env.action_space 79 | self.observation_space = self.env.observation_space 80 | self.reward_range = self.env.reward_range 81 | self.metadata = self.env.metadata 82 | self._warn_double_wrap() 83 | while True: 84 | if hasattr(env,'_spec'): 85 | self.name = env._spec.id 86 | break 87 | else: 88 | env = env.env 89 | 90 | def reset(self): 91 | return self.env.reset() 92 | 93 | def step(self, action): 94 | observation, reward, terminal, info = self.env.step(action) 95 | return observation, self.reward(reward,terminal), terminal, info 96 | 97 | def reward(self,r,terminal): 98 | if 'CartPole' in self.name: 99 | if terminal: 100 | r = -1 101 | else: 102 | r = 0.005 103 | elif 'MountainCar' in self.name: 104 | if terminal: 105 | r = 1 106 | else: 107 | r = -0.005 108 | elif 'Acrobot' in self.name: 109 | if terminal: 110 | r = 1 111 | else: 112 | r = -0.005 113 | elif 'LunarLander' in self.name: 114 | r = r/250.0 115 | return 
r 116 | 117 | 118 | class RewardWrapper(object): 119 | ''' Chain domain ''' 120 | 121 | def __init__(self,name): 122 | # n = length of chain 123 | if name == 'CartPole-vr': 124 | self.env = gym.make('CartPole-v1') 125 | if type(self.env) == gym.wrappers.time_limit.TimeLimit: 126 | self.env = self.env.env 127 | elif name == 'MountainCar-vr': 128 | self.env = gym.make('MountainCar-v0') 129 | if type(self.env) == gym.wrappers.time_limit.TimeLimit: 130 | self.env = self.env.env 131 | elif name == 'Acrobot-vr': 132 | self.env = gym.make('Acrobot-v1') 133 | if type(self.env) == gym.wrappers.time_limit.TimeLimit: 134 | self.env = self.env.env 135 | elif name == 'LunarLander-vr': 136 | self.env = gym.make('LunarLander-v2') 137 | # self.env.metadata = {} 138 | if type(self.env) == gym.wrappers.time_limit.TimeLimit: 139 | self.env = self.env.env 140 | self.name = name 141 | self.action_space = self.env.action_space 142 | self.observation_space = self.env.observation_space 143 | 144 | def reset(self): 145 | return self.env.reset() 146 | 147 | def step(self,a): 148 | s,r,terminal,_ = self.env.step(a) 149 | if self.name == 'CartPole-vr': 150 | if terminal: 151 | r = -1 152 | else: 153 | r = 0 154 | elif self.name == 'MountainCar-vr': 155 | if terminal: 156 | r = 1 157 | else: 158 | r = 0 159 | elif self.name == 'Acrobot-vr': 160 | if terminal: 161 | r = 1 162 | else: 163 | r = 0 164 | elif self.name == 'LunarLander-vr': 165 | r = r/250.0 166 | return s,r,terminal, _ 167 | 168 | def seed(self,seed): 169 | self.env.seed(seed) 170 | 171 | def render(self): 172 | return self.env.render() 173 | 174 | def close(self): 175 | return self.env.close() 176 | 177 | # Test 178 | if __name__ == '__main__': 179 | for game in ['MountainCar-vr','CartPole-vr']: 180 | Env = RewardWrapper(game) 181 | s = Env.reset() 182 | for i in range(500): 183 | a = Env.action_space.sample() 184 | s,r,terminal,_ = Env.step(a) 185 | if terminal: 186 | print('Died in step',i,'with reward',r,' restarting') 187 | s = Env.reset() 188 | print('Finished') -------------------------------------------------------------------------------- /src/common/submit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Script to submit jobs 4 | Handles slurm settings, hyperparameter looping, and potential plotting (if not on slurm) 5 | @author: thomas 6 | """ 7 | import os 8 | import argparse 9 | from pprint import pformat 10 | from .hps_setup import get_hps_setup,hps_to_dict,hps_to_list 11 | from .visualize import nested_list,make_name,plot_hyperloop_results 12 | 13 | def make_unique_subfolder(folder,hyperloop=False): 14 | ''' adds a unique four digit subfolder to folder ''' 15 | i = 0 16 | while os.path.exists(folder + candidate(i,hyperloop)): 17 | i += 1 18 | subfolder = folder + candidate(i,hyperloop) 19 | if not os.path.exists(subfolder): 20 | os.makedirs(subfolder) 21 | return subfolder 22 | 23 | def candidate(i,hyperloop): 24 | return '{0:04}h/'.format(i) if hyperloop else '{0:04}/'.format(i) 25 | 26 | def submit_slurm(hps,hps_setup,hyperloopname,job_dir,slurmout_dir,ntasks,nodes,n_cpu,mem_per_cpu): 27 | # make sh file 28 | run_file = job_dir + hps.game + hyperloopname + '0.sh' 29 | 30 | if hps_setup.distributed: 31 | base = ' '.join(['mpirun -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH' 32 | '-mca pml ob1 -mca btl ^openib python3 agent.py --hp {} --no_plot']).format(hps_to_list(hps)) # this should become mpirun 33 | else: 34 | base = 'srun python3 
agent.py --hp {} --no_plot '.format(hps_to_list(hps)) # this should become mpirun 35 | 36 | with open(run_file,'w') as fp: 37 | fp.write('#!/bin/sh\n') 38 | fp.write(base) 39 | 40 | # prepare sbatch command 41 | my_sbatch = ' '.join(['sbatch --partition=general --qos={} --time={} --ntasks={}', 42 | '--nodes={} --cpus-per-task={} --mem-per-cpu={} --mail-type=NONE', 43 | '--output={}slurm-%j.out', 44 | '--exclude=ess-2', 45 | '--workdir={}', 46 | '--job-name={} {}']).format(hps_setup.slurm_qos,hps_setup.slurm_time,ntasks, 47 | nodes,n_cpu,mem_per_cpu,slurmout_dir, 48 | os.getcwd(),hps.game,run_file) 49 | # run 50 | os.system('chmod +x {}'.format(run_file)) 51 | return_val = os.system(my_sbatch) 52 | if return_val != 0: 53 | raise ValueError('submission went wrong') 54 | 55 | def submit(hp_string,hpsetup_string,no_plot,agent,get_hps,override_hps_settings): 56 | hps = get_hps().parse(hp_string) 57 | hps_setup = get_hps_setup().parse(hpsetup_string,hps) 58 | # override game and name from hyperlooping 59 | if hps_setup.game != 'None': 60 | hps.game = hps_setup.game 61 | if hps_setup.name != 'None': 62 | hps.name = hps_setup.name 63 | 64 | # set-up base result folder 65 | result_folder = os.getcwd() + '/results/{}/{}/'.format(hps.name,hps.game) 66 | 67 | # check whether we should be hyperlooping 68 | loop_hyper = True if (hps_setup.item1 is not None or hps_setup.item2 is not None or hps_setup.item3 is not None) else False 69 | 70 | # make the unique subfolder 71 | subfolder = make_unique_subfolder(result_folder,loop_hyper) 72 | 73 | # Write hyperparameters in nice format 74 | with open(subfolder + 'hps_setup.txt','w') as file: 75 | file.write(pformat(hps_to_dict(hps_setup))) 76 | with open(subfolder + 'hps.txt','w') as file: 77 | file.write(pformat(hps_to_dict(hps))) 78 | with open(subfolder + 'hps_setup_raw.txt','w') as file: 79 | file.write(hps_to_list(hps_setup)) 80 | 81 | if not hps_setup.slurm: 82 | # for automatically plotting results if not on slurm 83 | n1,n2,n3 = len(hps_setup.seq1),len(hps_setup.seq2),len(hps_setup.seq3) 84 | results = nested_list(n1,n2,n3,hps_setup.n_rep) # handle plotting within this call, so agregate results 85 | else: 86 | no_plot = True 87 | # prepare slurm submission folders 88 | job_dir = os.getcwd() + '/results/jobs/' 89 | if not os.path.exists(job_dir): 90 | os.makedirs(job_dir) 91 | slurmout_dir = os.getcwd() + '/results/slurmout/' 92 | if not os.path.exists(slurmout_dir): 93 | os.makedirs(slurmout_dir) 94 | # some slurm settings initialization due to the specific Delft slurm cluster 95 | if hps_setup.distributed: 96 | ntasks = hps_setup.n_tasks 97 | nodes = '1-3' 98 | n_cpu = hps_setup.cpu_per_task 99 | mem_per_cpu = int((16384/(ntasks*n_cpu)) - 5) 100 | else: 101 | ntasks = 1 102 | nodes = 1 103 | n_cpu = hps_setup.cpu_per_task 104 | mem_per_cpu = hps_setup.mem_per_cpu 105 | 106 | for rep in range(hps_setup.n_rep): 107 | hps.rep = rep 108 | for it1,item1 in enumerate(hps_setup.seq1): 109 | if hps_setup.item1 is not None: 110 | hps._set(hps_setup.item1,item1) 111 | for it2,item2 in enumerate(hps_setup.seq2): 112 | if hps_setup.item2 is not None: 113 | hps._set(hps_setup.item2,item2) 114 | for it3,item3 in enumerate(hps_setup.seq3): 115 | if hps_setup.item3 is not None: 116 | hps._set(hps_setup.item3,item3) 117 | hyperloop_name = make_name('',hps_setup.item1,item1,hps_setup.item2,item2, 118 | hps_setup.item3,item3) 119 | # if loop_hyper: 120 | result_folder = subfolder + hyperloop_name 121 | hps.result_dir = result_folder + 'rep:{}'.format(rep) 122 | 123 | hps 
= override_hps_settings(hps) # maybe some hps_setup parameter overrides a number of hps parameters 124 | 125 | # Submit slurm job or launch agent in this process 126 | if hps_setup.slurm: 127 | submit_slurm(hps,hps_setup,hyperloop_name,job_dir,slurmout_dir,ntasks,nodes,n_cpu,mem_per_cpu) 128 | else: 129 | print(' ________________________________________ ') 130 | print('Start learning on game {} with hyperparams {}'.format(hps.game,hyperloop_name)) 131 | curve = agent(hps) 132 | results[it1][it2][it3][rep] = curve 133 | 134 | if not no_plot: 135 | plot_hyperloop_results(results,hps_setup,subfolder,plot_type='mean',sd=True) 136 | 137 | if __name__ == "__main__": 138 | '''Set-up training''' 139 | parser = argparse.ArgumentParser() 140 | parser.add_argument('--hp', help='Hyperparameter configuration',default='') 141 | parser.add_argument('--hpsetup', help='Hyperparameter configuration of slurm and hyperparameters and distribution',default='') 142 | parser.add_argument('--no_plot', action='store_true',default=False) 143 | args = parser.parse_args() -------------------------------------------------------------------------------- /src/rl/envs/grid.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Grid-world environment 4 | @author: thomas 5 | """ 6 | 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | import matplotlib.patches as patches 10 | import random 11 | 12 | class Grid(object): 13 | ''' Grid world with stochastic ghosts ''' 14 | 15 | def __init__(self,to_plot=False,grid=False): 16 | world = np.zeros([7,7],dtype='int32') 17 | world[1:6,1] = 1 18 | world[1:3,4] = 1 19 | world[4:6,4] = 1 20 | self.world = world 21 | self.grid = grid 22 | self.reset() 23 | self.observation_shape = np.shape(self.get_state())[0] 24 | 25 | if to_plot: 26 | plt.ion() 27 | fig = plt.figure() 28 | ax1 = fig.add_subplot(111,aspect='equal') 29 | ax1.axis('off') 30 | plt.xlim([-1,8]) 31 | plt.ylim([-1,8]) 32 | 33 | #colors = matplotlib.colors.ListerColormap() 34 | for i in range(7): 35 | for j in range(7): 36 | if world[i,j]==1: 37 | col = "black" 38 | else: 39 | col = "white" 40 | ax1.add_patch( 41 | patches.Rectangle( 42 | (i,j),1,1, 43 | #fill=False, 44 | edgecolor='black', 45 | linewidth = 2, 46 | facecolor = col,), 47 | ) 48 | if np.all([i,j] == self.ghost1): 49 | self.g1 = ax1.add_artist(plt.Circle((i+0.5,j+0.5),0.3,color='red')) 50 | if np.all([i,j] == self.ghost2): 51 | self.g2 = ax1.add_artist(plt.Circle((i+0.5,j+0.5),0.3,color='blue')) 52 | if np.all([i,j] == self.pacman): 53 | self.p = ax1.add_artist(plt.Circle((i+0.5,j+0.5),0.3,color='yellow')) 54 | self.fig = fig 55 | self.ax1 = ax1 56 | self.fig.canvas.draw() 57 | 58 | def reset(self): 59 | self.pacman = np.array([0,0]) 60 | self.ghost1 = np.array([1,3]) 61 | self.ghost2 = np.array([5,3]) 62 | return self.get_state() 63 | 64 | def set_state(self,state): 65 | self.pacman = np.array(state[0:2]) 66 | self.ghost1 = np.array(state[2:4]) 67 | self.ghost2 = np.array(state[4:6]) 68 | 69 | def step(self,a): 70 | # move pacman 71 | self._move(self.pacman,a) 72 | 73 | # check collision 74 | dead = self._check_dead() 75 | if dead: 76 | r = -1 77 | return self.get_state(),r,dead 78 | 79 | # move ghosts 80 | wall = True 81 | while wall: 82 | a1 = random.sample(range(4),1) # random ghost 83 | wall = self._move(self.ghost1,a1) 84 | 85 | # move ghosts 86 | wall = True 87 | while wall: 88 | a2 = np.where(np.random.multinomial(1,[0.1,0.1,0.4,0.4]))[0] # probabilistic ghost 89 | wall = 
self._move(self.ghost2,a2) 90 | 91 | # check collision again 92 | dead = self._check_dead() 93 | if dead: 94 | r = -1 95 | else: 96 | if np.all(self.pacman == np.array([6,6])): 97 | r = 10 98 | dead = True 99 | #print('Reached the goal') 100 | else: 101 | r = 0 102 | return self.get_state(),r,dead 103 | 104 | def get_state(self): 105 | if not self.grid: 106 | state = np.concatenate((self.pacman,self.ghost1,self.ghost2)) 107 | else: 108 | state = np.copy(self.world) 109 | state = np.stack(state,np.zeros(7,7),np.zeros(7,7),np.zeros(7,7),axis=2) 110 | state[self.pacman[0],self.pacman[1],1] = 1 111 | state[self.ghost1[0],self.ghost1[1],2] = 1 112 | state[self.ghost2[0],self.ghost2[1],3] = 1 113 | return state 114 | 115 | def plot(self): 116 | self.g1.remove() 117 | self.g2.remove() 118 | self.p.remove() 119 | 120 | # replot 121 | self.g1 = self.ax1.add_artist(plt.Circle(self.ghost1+0.5,0.3,color='red')) 122 | self.g2 = self.ax1.add_artist(plt.Circle(self.ghost2+0.5,0.3,color='blue')) 123 | self.p = self.ax1.add_artist(plt.Circle(self.pacman +0.5,0.3,color='yellow')) 124 | self.fig.canvas.draw() 125 | 126 | def plot_predictions(self,world): 127 | for i in range(7): 128 | for j in range(7): 129 | for k in range(3): 130 | if k==1: 131 | col = "yellow" 132 | elif k == 2: 133 | col = "red" 134 | elif k == 3: 135 | col = 'blue' 136 | if world[i,j,k]>0.0: 137 | self.ax1.add_patch(patches.Rectangle( 138 | (i,j),1,1, 139 | #fill=False, 140 | edgecolor='black', 141 | linewidth = 2, 142 | facecolor = col, 143 | alpha=world[i,j,k]), 144 | ) 145 | 146 | def _move(self,s,a): 147 | s_old = np.copy(s) 148 | 149 | # move 150 | if int(a[0]) == 0: #up 151 | s[1] +=1 152 | elif int(a[0]) == 1: #down 153 | s[1] -=1 154 | elif int(a[0])== 2: #right 155 | s[0] +=1 156 | elif int(a[0])==3: #left 157 | s[0] -=1 158 | else: 159 | raise ValueError('move not possible') 160 | 161 | # check if move is possible 162 | if s[0]<0 or s[0]>6 or s[1]<0 or s[1]>6: # out of grid 163 | wall = True 164 | elif np.all(self.world[s[0],s[1]] == 1): # wall 165 | wall = True 166 | else: 167 | wall = False 168 | 169 | if wall: 170 | # Need to repeat, put back old values 171 | s[0] = s_old[0] 172 | s[1] = s_old[1] 173 | return wall 174 | else: 175 | # Move to new state 176 | return wall 177 | 178 | def _check_dead(self): 179 | if np.all(self.pacman == self.ghost1) or np.all(self.pacman == self.ghost2): 180 | return True 181 | else: 182 | return False 183 | 184 | 185 | # Test 186 | if __name__ == '__main__': 187 | grid = grid_env(True) 188 | s = grid.get_state() 189 | for i in range(200): 190 | a = random.sample(range(4),1) 191 | s,r,dead = grid.step(a) 192 | if not dead: 193 | grid.plot() 194 | else: 195 | print('Died in step',i,', restarting') 196 | s = grid.reset() 197 | print(grid.get_state()) 198 | print('Finished') 199 | plt.show(block=True) 200 | -------------------------------------------------------------------------------- /src/common/rl/envs/grid.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Grid-world environment 4 | @author: thomas 5 | """ 6 | 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | import matplotlib.patches as patches 10 | import random 11 | 12 | class Grid(object): 13 | ''' Grid world with stochastic ghosts ''' 14 | 15 | def __init__(self,to_plot=False,grid=False): 16 | world = np.zeros([7,7],dtype='int32') 17 | world[1:6,1] = 1 18 | world[1:3,4] = 1 19 | world[4:6,4] = 1 20 | self.world = world 21 | self.grid = grid 22 | 
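# Descriptive note (added commentary, not part of the original source): this Grid
# world is a 7x7 maze with two interior wall segments. Pacman starts at (0,0) and
# receives +10 for reaching (6,6), -1 for colliding with either ghost (which also
# ends the episode), and 0 otherwise. ghost1 moves uniformly at random, while
# ghost2 samples its move from a multinomial with probabilities [0.1, 0.1, 0.4, 0.4]
# over up/down/right/left, i.e. it is biased toward horizontal movement. With
# grid=False the observation is the length-6 vector of (pacman, ghost1, ghost2)
# coordinates; the grid=True branch is meant to return stacked 7x7 planes, but
# np.stack is called with separate array arguments and np.zeros(7,7) instead of a
# sequence and np.zeros((7,7)), so that branch would need e.g.
# np.stack([state, np.zeros((7,7)), np.zeros((7,7)), np.zeros((7,7))], axis=2) to run.
# The __main__ test at the bottom also instantiates grid_env(True), which appears
# to be a stale name for this Grid class.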
self.reset() 23 | self.observation_shape = np.shape(self.get_state())[0] 24 | 25 | if to_plot: 26 | plt.ion() 27 | fig = plt.figure() 28 | ax1 = fig.add_subplot(111,aspect='equal') 29 | ax1.axis('off') 30 | plt.xlim([-1,8]) 31 | plt.ylim([-1,8]) 32 | 33 | #colors = matplotlib.colors.ListerColormap() 34 | for i in range(7): 35 | for j in range(7): 36 | if world[i,j]==1: 37 | col = "black" 38 | else: 39 | col = "white" 40 | ax1.add_patch( 41 | patches.Rectangle( 42 | (i,j),1,1, 43 | #fill=False, 44 | edgecolor='black', 45 | linewidth = 2, 46 | facecolor = col,), 47 | ) 48 | if np.all([i,j] == self.ghost1): 49 | self.g1 = ax1.add_artist(plt.Circle((i+0.5,j+0.5),0.3,color='red')) 50 | if np.all([i,j] == self.ghost2): 51 | self.g2 = ax1.add_artist(plt.Circle((i+0.5,j+0.5),0.3,color='blue')) 52 | if np.all([i,j] == self.pacman): 53 | self.p = ax1.add_artist(plt.Circle((i+0.5,j+0.5),0.3,color='yellow')) 54 | self.fig = fig 55 | self.ax1 = ax1 56 | self.fig.canvas.draw() 57 | 58 | def reset(self): 59 | self.pacman = np.array([0,0]) 60 | self.ghost1 = np.array([1,3]) 61 | self.ghost2 = np.array([5,3]) 62 | return self.get_state() 63 | 64 | def set_state(self,state): 65 | self.pacman = np.array(state[0:2]) 66 | self.ghost1 = np.array(state[2:4]) 67 | self.ghost2 = np.array(state[4:6]) 68 | 69 | def step(self,a): 70 | # move pacman 71 | self._move(self.pacman,a) 72 | 73 | # check collision 74 | dead = self._check_dead() 75 | if dead: 76 | r = -1 77 | return self.get_state(),r,dead 78 | 79 | # move ghosts 80 | wall = True 81 | while wall: 82 | a1 = random.sample(range(4),1) # random ghost 83 | wall = self._move(self.ghost1,a1) 84 | 85 | # move ghosts 86 | wall = True 87 | while wall: 88 | a2 = np.where(np.random.multinomial(1,[0.1,0.1,0.4,0.4]))[0] # probabilistic ghost 89 | wall = self._move(self.ghost2,a2) 90 | 91 | # check collision again 92 | dead = self._check_dead() 93 | if dead: 94 | r = -1 95 | else: 96 | if np.all(self.pacman == np.array([6,6])): 97 | r = 10 98 | dead = True 99 | #print('Reached the goal') 100 | else: 101 | r = 0 102 | return self.get_state(),r,dead 103 | 104 | def get_state(self): 105 | if not self.grid: 106 | state = np.concatenate((self.pacman,self.ghost1,self.ghost2)) 107 | else: 108 | state = np.copy(self.world) 109 | state = np.stack(state,np.zeros(7,7),np.zeros(7,7),np.zeros(7,7),axis=2) 110 | state[self.pacman[0],self.pacman[1],1] = 1 111 | state[self.ghost1[0],self.ghost1[1],2] = 1 112 | state[self.ghost2[0],self.ghost2[1],3] = 1 113 | return state 114 | 115 | def plot(self): 116 | self.g1.remove() 117 | self.g2.remove() 118 | self.p.remove() 119 | 120 | # replot 121 | self.g1 = self.ax1.add_artist(plt.Circle(self.ghost1+0.5,0.3,color='red')) 122 | self.g2 = self.ax1.add_artist(plt.Circle(self.ghost2+0.5,0.3,color='blue')) 123 | self.p = self.ax1.add_artist(plt.Circle(self.pacman +0.5,0.3,color='yellow')) 124 | self.fig.canvas.draw() 125 | 126 | def plot_predictions(self,world): 127 | for i in range(7): 128 | for j in range(7): 129 | for k in range(3): 130 | if k==1: 131 | col = "yellow" 132 | elif k == 2: 133 | col = "red" 134 | elif k == 3: 135 | col = 'blue' 136 | if world[i,j,k]>0.0: 137 | self.ax1.add_patch(patches.Rectangle( 138 | (i,j),1,1, 139 | #fill=False, 140 | edgecolor='black', 141 | linewidth = 2, 142 | facecolor = col, 143 | alpha=world[i,j,k]), 144 | ) 145 | 146 | def _move(self,s,a): 147 | s_old = np.copy(s) 148 | 149 | # move 150 | if int(a[0]) == 0: #up 151 | s[1] +=1 152 | elif int(a[0]) == 1: #down 153 | s[1] -=1 154 | elif int(a[0])== 2: #right 
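# Added commentary: actions are encoded as 0=up, 1=down, 2=right, 3=left and arrive
# as a one-element list/array (random.sample and np.where both return sequences),
# hence the int(a[0]) indexing. _move mutates s in place and returns True when the
# proposed move hits a wall or leaves the 7x7 grid, which is why the ghost-moving
# loops in step() resample until a legal move is found.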
155 | s[0] +=1 156 | elif int(a[0])==3: #left 157 | s[0] -=1 158 | else: 159 | raise ValueError('move not possible') 160 | 161 | # check if move is possible 162 | if s[0]<0 or s[0]>6 or s[1]<0 or s[1]>6: # out of grid 163 | wall = True 164 | elif np.all(self.world[s[0],s[1]] == 1): # wall 165 | wall = True 166 | else: 167 | wall = False 168 | 169 | if wall: 170 | # Need to repeat, put back old values 171 | s[0] = s_old[0] 172 | s[1] = s_old[1] 173 | return wall 174 | else: 175 | # Move to new state 176 | return wall 177 | 178 | def _check_dead(self): 179 | if np.all(self.pacman == self.ghost1) or np.all(self.pacman == self.ghost2): 180 | return True 181 | else: 182 | return False 183 | 184 | 185 | # Test 186 | if __name__ == '__main__': 187 | grid = grid_env(True) 188 | s = grid.get_state() 189 | for i in range(200): 190 | a = random.sample(range(4),1) 191 | s,r,dead = grid.step(a) 192 | if not dead: 193 | grid.plot() 194 | else: 195 | print('Died in step',i,', restarting') 196 | s = grid.reset() 197 | print(grid.get_state()) 198 | print('Finished') 199 | plt.show(block=True) 200 | -------------------------------------------------------------------------------- /agent.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Chain experiments 4 | @author: thomas 5 | """ 6 | 7 | if __name__ == '__main__' and __package__ is None: 8 | from os import sys, path 9 | sys.path.append(path.dirname(path.dirname(path.abspath(__file__)))) 10 | 11 | global mpl 12 | import matplotlib as mpl 13 | mpl.use('Agg') 14 | 15 | import numpy as np 16 | import os 17 | import time 18 | from tensorflow.python import debug as tf_debug 19 | import tensorflow as tf 20 | import argparse 21 | from pprint import pformat 22 | #from pdb import set_trace 23 | 24 | # common package import 25 | from src.common.rl.make_game import make_game 26 | from src.common.submit import make_unique_subfolder 27 | from src.common.hps_setup import hps_to_dict 28 | from src.common.visualize import plot_single_experiment 29 | from src.common.putils import store_safely 30 | 31 | # local imports 32 | from config.hps import get_hps,override_hps_settings 33 | from src.mcts import MCTS,display_info 34 | from src.network import Model,Database 35 | 36 | def agent(hps): 37 | ''' Agent function ''' 38 | tf.reset_default_graph() 39 | 40 | # storage 41 | result = {} 42 | env_steps,ep_return = [],[] # will indicate the timestep for the learning curve 43 | losses,gn = [],[] 44 | best_R = -np.Inf 45 | 46 | Env = make_game(hps.game) 47 | D = Database(max_size=max(hps.data_size,hps.n_mcts*hps.steps_per_ep),batch_size=hps.batch_size) 48 | model = Model(Env,lr=hps.lr,n_mix=hps.n_mix,clip_gradient_norm=hps.clip_gradient_norm,loss_type=hps.loss_type, 49 | bound=hps.bound,temp=hps.temp,entropy_l=hps.entropy_l) 50 | 51 | #with tf.Session() as sess,sess.as_default(): 52 | with tf.Session() as sess: 53 | if hps.tfdb: 54 | sess = tf_debug.LocalCLIDebugWrapperSession(sess) 55 | model.sess = sess 56 | sess.run(tf.global_variables_initializer()) 57 | global_t_mcts = 0 58 | global_t = 0 59 | 60 | for ep in range(hps.n_eps): 61 | start = time.time() 62 | root_index = Env.reset() 63 | root = None 64 | R = 0.0 # episode reward 65 | t = 0 # episode steps 66 | seed = np.random.randint(1e7) 67 | Env.seed(seed) 68 | a_store = [] 69 | 70 | while True: 71 | # run an episode 72 | if hps.timeit: now = time.time() 73 | root = MCTS(root_index,root,Env,N=hps.n_mcts,model=model,c=hps.c,bootstrap_V=hps.bootstrap_V, 74 | 
block_loop=hps.block_loop,sigma_tree=hps.sigma_tree,backup_Q=hps.backup_Q, 75 | backup_sigma_tree=hps.backup_sigma_tree,seed=seed,a_his=a_store, 76 | alpha=hps.alpha,C_widening=hps.C_widening,use_prior=hps.use_prior,timeit=hps.timeit, 77 | random_action_frac=hps.random_action_frac) 78 | if hps.timeit: print('One MCTS search takes {} seconds'.format(time.time()-now)) 79 | if hps.verbose_mcts: display_info(root,'{}'.format(t),hps.c) 80 | 81 | probs,a_list,V,a,a_argmax = root.return_results(decision_type=hps.decision_type,loss_type=hps.loss_type, 82 | temperature=hps.temp,V_decision=hps.V_decision) 83 | for k,prob in enumerate(probs): 84 | D.store((root.index,V,a_list[k],np.array([prob]))) 85 | #if count == 0: 86 | # print('Warning',[child_action.n for child_action in root.child_actions],display_info(root,'{}'.format(t),hps.c)) 87 | 88 | # Make the step 89 | a_store.append(a) 90 | s1,r,terminal,_ = Env.step(a) 91 | R += r 92 | t += 1 93 | global_t += 1 94 | global_t_mcts += hps.n_mcts 95 | 96 | #if hps.verbose: 97 | # if (t % 50) == 0: 98 | # print('Overall step {}, root currently returns V {}, and considers a {} with counts {}'.format(global_t,V,a_list,probs)) 99 | 100 | if terminal or (t > hps.steps_per_ep): 101 | if hps.verbose: 102 | print('Episode terminal, total reward {}, steps {}'.format(R,t)) 103 | ep_return.append(R) 104 | env_steps.append(global_t_mcts) 105 | break # break out, start new episode 106 | else: 107 | root = root.forward(a_argmax,s1,r,terminal,model) 108 | 109 | # saving 110 | result.update({'steps':env_steps,'return':ep_return}) 111 | if hps.verbose: 112 | result.update({'gn':gn,'loss':losses}) 113 | #if R > best_R: 114 | # result.update({'seed':seed,'actions':a_store,'R':best_R}) 115 | # best_R = R 116 | store_safely(hps.result_dir,'result',result) 117 | 118 | # Train 119 | if (global_t_mcts > hps.n_t) or (ep > hps.n_eps): 120 | break # end learning 121 | else: 122 | n_epochs = hps.n_epochs * (np.ceil(hps.n_mcts/20)).astype(int) 123 | #print(n_epochs) 124 | loss = model.train(D,n_epochs,hps.lr) 125 | losses.append(loss['total_loss']) 126 | gn.append(loss['gn']) 127 | 128 | if hps.verbose: 129 | print('Time {}, Episode {}, Return {}, V {}, gn {}, Vloss {}, piloss {}'.format( 130 | global_t_mcts,ep,R,loss['V'],loss['gn'],loss['V_loss'],loss['pi_loss'])) 131 | print('Actions {}, probs {}'.format(np.array(a_list),probs)) 132 | print('One full episode loop + training in {} seconds'.format(time.time()-start)) 133 | 134 | return result 135 | 136 | if __name__ == '__main__': 137 | '''Set-up training''' 138 | parser = argparse.ArgumentParser() 139 | parser.add_argument('--hp', help='Hyperparameter configuration',default='') 140 | parser.add_argument('--no_plot', action='store_true',default=False) 141 | args = parser.parse_args() 142 | hps = get_hps().parse(args.hp) 143 | hps = override_hps_settings(hps) 144 | 145 | # set-up result folder if not prespecified, then we are not hyperlooping 146 | if hps.result_dir == '': 147 | result_folder = os.getcwd() + '/results/{}/{}/'.format(hps.name,hps.game) 148 | hps.result_dir = make_unique_subfolder(result_folder,hyperloop=False) 149 | with open(hps.result_dir + 'hps.txt','w') as file: 150 | file.write(pformat(hps_to_dict(hps))) 151 | 152 | #with open(subfolder + 'hps_raw.txt','w') as file: 153 | # file.write(hps_to_list(hps)) 154 | print(' ________________________________________ ') 155 | print('Start learning on game {}'.format(hps.game)) 156 | result = agent(hps) 157 | 158 | if not args.no_plot: 159 | 
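# Added commentary on the control flow above: agent() runs one MCTS search per
# environment step, stores (state, value, action, visit-probability) tuples in the
# Database D, and steps the real environment with the action returned by
# root.return_results(). After each episode the network is trained for n_epochs
# (scaled with ceil(n_mcts/20)), and the episode return is logged against
# global_t_mcts, the cumulative number of MCTS simulations, which is presumably
# what the learning-curve plot below uses as its x-axis.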
plot_single_experiment(result,hps.game,hps.result_dir,plot_type='lc') -------------------------------------------------------------------------------- /jobs/job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.1,loss_type=Q,c=0.05,name=lr:0.1-loss_type:Q-c:0.05 3 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.1,loss_type=Q,c=0.25,name=lr:0.1-loss_type:Q-c:0.25 4 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.1,loss_type=Q,c=1.0,name=lr:0.1-loss_type:Q-c:1.0 5 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.1,loss_type=count,c=0.05,name=lr:0.1-loss_type:count-c:0.05 6 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.1,loss_type=count,c=0.25,name=lr:0.1-loss_type:count-c:0.25 7 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.1,loss_type=count,c=1.0,name=lr:0.1-loss_type:count-c:1.0 8 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.01,loss_type=Q,c=0.05,name=lr:0.01-loss_type:Q-c:0.05 9 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.01,loss_type=Q,c=0.25,name=lr:0.01-loss_type:Q-c:0.25 10 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.01,loss_type=Q,c=1.0,name=lr:0.01-loss_type:Q-c:1.0 11 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.01,loss_type=count,c=0.05,name=lr:0.01-loss_type:count-c:0.05 12 | python3 submit.py --hpsetup 
game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.01,loss_type=count,c=0.25,name=lr:0.01-loss_type:count-c:0.25 13 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.01,loss_type=count,c=1.0,name=lr:0.01-loss_type:count-c:1.0 14 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.001,loss_type=Q,c=0.05,name=lr:0.001-loss_type:Q-c:0.05 15 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.001,loss_type=Q,c=0.25,name=lr:0.001-loss_type:Q-c:0.25 16 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.001,loss_type=Q,c=1.0,name=lr:0.001-loss_type:Q-c:1.0 17 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.001,loss_type=count,c=0.05,name=lr:0.001-loss_type:count-c:0.05 18 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.001,loss_type=count,c=0.25,name=lr:0.001-loss_type:count-c:0.25 19 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.001,loss_type=count,c=1.0,name=lr:0.001-loss_type:count-c:1.0 20 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.0001,loss_type=Q,c=0.05,name=lr:0.0001-loss_type:Q-c:0.05 21 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.0001,loss_type=Q,c=0.25,name=lr:0.0001-loss_type:Q-c:0.25 22 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.0001,loss_type=Q,c=1.0,name=lr:0.0001-loss_type:Q-c:1.0 23 | python3 submit.py --hpsetup 
game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.0001,loss_type=count,c=0.05,name=lr:0.0001-loss_type:count-c:0.05 24 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.0001,loss_type=count,c=0.25,name=lr:0.0001-loss_type:count-c:0.25 25 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.0001,loss_type=count,c=1.0,name=lr:0.0001-loss_type:count-c:1.0 26 | -------------------------------------------------------------------------------- /src/rl/policies.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Various policies 4 | @author: thomas 5 | """ 6 | import numpy as np 7 | import logging 8 | logger = logging.getLogger('root') 9 | logger.propagate = False 10 | 11 | def policy(policy,model,hps,seed,eval_on_mean_output=False,eval_on_mean_params=False): 12 | ''' wrapper policy function ''' 13 | pass 14 | 15 | def thompson_policy(s,model,sess,hps,seed,eval_on_mean_output=False,eval_on_mean_params=False): 16 | ''' Thompson sample value function in discrete action space 17 | Input: s - state, Thompson sampling applied across first dimension. 18 | Output: a - picked action ''' 19 | 20 | rep = s.shape[0] 21 | state_seq = np.repeat(s,model.action_dim,axis=0) 22 | action_seq = np.repeat(np.arange(0,model.action_dim)[None,:],rep,axis=0).reshape(-1,1) 23 | rep_action_values = np.zeros([rep*model.action_dim,hps.n_thompson_sample]) 24 | 25 | # sample 26 | for i in range(hps.n_thompson_sample): 27 | action_values = sample_value(sess,model,hps,state_seq,action_seq,seed,eval_on_mean_output,eval_on_mean_params) 28 | rep_action_values[:,i] = np.squeeze(action_values) 29 | 30 | # max 31 | max_action_values = np.max(rep_action_values,axis=1) # max over the repetitions 32 | max_action_values = np.reshape(max_action_values,[rep,model.action_dim]) 33 | #a = np.argmax(max_action_values,axis=1)[:,None] 34 | a = argmax_tiebreaking(max_action_values) 35 | return a 36 | 37 | def egreedy_policy(s,model,sess,hps,e,seed): 38 | ''' e-greedy policy on discrete action-space''' 39 | # setup 40 | #hps.n_thompson_sample = 1 41 | #a_exploit = thompson_policy(s,model,sess,hps,seed,eval_on_mean_output=True,eval_on_mean_params=True) 42 | 43 | rep = s.shape[0] 44 | state_seq = np.repeat(s,model.action_dim,axis=0) 45 | action_seq = np.repeat(np.arange(0,model.action_dim)[None,:],rep,axis=0).reshape(-1,1) 46 | 47 | action_values = get_net_mean(sess,model,state_seq,action_seq,seed,hps.p_dropout,hps.output) 48 | action_values = np.reshape(action_values,[rep,model.action_dim]) 49 | a_exploit = argmax_tiebreaking(action_values) 50 | 51 | a_explore = get_discrete_random_action(model.action_dim,s.shape[0]) 52 | a = np.array([(a1 if np.random.rand()>0.05 else a2) for a1,a2 in zip(a_exploit,a_explore)]) 53 | return a 54 | 55 | def ucb_policy(s,model,sess,hps,seed,eval_on_mean_output=False,eval_on_mean_params=False): 56 | ''' upper confidence bound policy ''' 57 | #p_dropout = 1.0 if 
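# Added commentary (not part of the original file): the exploration policies in
# this module share the same layout -- each state is repeated once per discrete
# action, the network is queried for Q(s,a) over that batch, and the result is
# reshaped to [batch, action_dim] before an argmax with random tie-breaking.
# thompson_policy takes the max over n_thompson_sample sampled value functions,
# egreedy_policy mixes the greedy action with a random one 5% of the time, and
# ucb_policy adds a per-decision multiplier drawn from U(1.7, 2.3) times the
# predicted standard deviation to the mean before the argmax.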
eval_on_mean_params else hps.p_dropout # some unexplainable bug if uncommented 58 | p_dropout = hps.p_dropout 59 | 60 | rep = s.shape[0] 61 | state_seq = np.repeat(s,model.action_dim,axis=0) 62 | action_seq = np.repeat(np.arange(0,model.action_dim)[None,:],rep,axis=0).reshape(-1,1) 63 | 64 | mu = get_net_mean(sess,model,state_seq,action_seq,seed,p_dropout,hps.output) 65 | sds = analytic_sd(sess,model,state_seq,action_seq,seed,p_dropout,hps.output) 66 | #sds2 = sample_sd(40,sess,model,state_seq,action_seq,p_dropout,hps.output) 67 | 68 | ucb_multipliers = np.random.uniform(1.7,2.3,(rep*model.action_dim,1)) 69 | ucb = np.reshape(mu + ucb_multipliers * sds,[-1,model.action_dim]) 70 | #a = np.argmax(ucb,axis=1)[:,None] 71 | a = argmax_tiebreaking(ucb) 72 | return a 73 | 74 | def get_discrete_random_action(n_act,n_sample): 75 | return np.random.randint(0,n_act,n_sample)[:,None] 76 | 77 | def sample_value(sess,model,hps,sb,ab,seed,eval_on_mean_output=False,eval_on_mean_params=False): 78 | ''' Sample values for policy ''' 79 | if eval_on_mean_params: 80 | p_dropout = 1.0 81 | else: 82 | p_dropout = hps.p_dropout 83 | 84 | if eval_on_mean_output: 85 | Qsa = get_net_mean(sess,model,sb,ab,seed,p_dropout,hps.output) 86 | else: 87 | Qsa = sample_net(sess,model,sb,ab,seed,p_dropout,hps.output) 88 | return Qsa 89 | 90 | def sample_net(sess,model,sb,ab,seed,p_dropout,output): 91 | ''' Sample from network output distribution ''' 92 | sample = sess.run(model.sample,feed_dict = {model.x:sb, 93 | model.a:ab, 94 | model.p_dropout: p_dropout, 95 | model.seed:seed}) 96 | if output == 'categorical': 97 | sample = model.transformer.to_value(sample) 98 | return sample 99 | 100 | def get_net_mean(sess,model,sb,ab,seed,p_dropout,output): 101 | ''' Expectation of network output distribution ''' 102 | if not output == 'categorical': 103 | Qsa = sess.run(model.mean,feed_dict = {model.x:sb, 104 | model.a:ab, 105 | model.p_dropout: p_dropout, 106 | model.seed:seed}) 107 | else: 108 | density = sess.run(model.params,feed_dict = {model.x:sb, 109 | model.a:ab, 110 | model.p_dropout: p_dropout, 111 | model.seed:seed}) 112 | Qsa = np.matmul(density,model.transformer.means)[:,None] 113 | return Qsa 114 | 115 | def analytic_sd(sess,model,sb,ab,seed,p_dropout,output): 116 | ''' analytic sd calculation from network parameters ''' 117 | params = get_net_params(sess,model,sb,ab,seed,p_dropout) 118 | if output == 'gaussian': 119 | sd = params[:,1][:,None] 120 | elif output == 'categorical': 121 | # sd = sum_i (x_i-mu) 122 | bin_means = model.transformer.means 123 | mu = np.repeat(np.matmul(params,bin_means)[:,None],params.shape[1],axis=1) 124 | sd = np.sqrt(np.sum(params * np.square(bin_means - mu), axis=1))[:,None] # 125 | elif output == 'mog': 126 | # need to sample 127 | sd = sd_mog(params)[:,None] 128 | #sd = sample_sd(20,sess,model,sb,ab,p_dropout,output) 129 | elif output == 'deterministic': 130 | sd = sample_sd(15,sess,model,sb,ab,p_dropout,output) 131 | return sd 132 | 133 | def sd_mog(params): 134 | ''' Standard deviation of gaussian mixture ''' 135 | n_mix = int(params.shape[1]/3) 136 | p = params[:,:n_mix] 137 | mu = params[:,n_mix:(2*n_mix)] 138 | sd = params[:,(2*n_mix):(3*n_mix)] 139 | return np.sum(p * (np.square(mu) + np.square(sd)),axis=1) - np.square(np.sum(p*mu,axis=1)) 140 | 141 | def sample_sd(n,sess,model,sb,ab,p_dropout,output): 142 | ''' get standard deviation estimates 143 | Crude implementation, based on sampling. 
However, there is no better way 144 | to integrate over the parameter uncertainty ''' 145 | samples = np.zeros([sb.shape[0],n]) 146 | for i in range(n): 147 | seed = [np.random.randint(1e15),np.random.randint(1e15)] # new seed for parametric uncertainty 148 | sample = sample_net(sess,model,sb,ab,seed,p_dropout,output) 149 | samples[:,i] = np.squeeze(sample) 150 | sds = np.std(samples,axis=1)[:,None] 151 | return sds 152 | 153 | def get_net_params(sess,model,sb,ab,seed,p_dropout): 154 | ''' Network parameters ''' 155 | params = sess.run(model.params,feed_dict = {model.x:sb, 156 | model.a:ab, 157 | model.p_dropout: p_dropout, 158 | model.seed:seed}) 159 | return params 160 | 161 | def argmax_tiebreaking(x): 162 | ''' own argmax because numpy.argmax does not break ties ''' 163 | try: 164 | out = np.array([[np.random.choice(np.flatnonzero(a == a.max()))] for a in x]) # sparsely fails due to numerical errors between a and a.max()? 165 | except: 166 | out = np.array([[np.argmax(a)] for a in x]) 167 | return out -------------------------------------------------------------------------------- /src/rl/wrappers/multi_discrete.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import gym 4 | from gym.spaces import prng, Discrete, Box, MultiDiscrete 5 | from gym.error import Error 6 | 7 | # Adapters 8 | 9 | class DiscreteToMultiDiscrete(Discrete): 10 | """ 11 | Adapter that adapts the MultiDiscrete action space to a Discrete action space of any size 12 | The converted action can be retrieved by calling the adapter with the discrete action 13 | discrete_to_multi_discrete = DiscreteToMultiDiscrete(multi_discrete) 14 | discrete_action = discrete_to_multi_discrete.sample() 15 | multi_discrete_action = discrete_to_multi_discrete(discrete_action) 16 | It can be initialized using 3 configurations: 17 | Configuration 1) - DiscreteToMultiDiscrete(multi_discrete) [2nd param is empty] 18 | Would adapt to a Discrete action space of size (1 + nb of discrete in MultiDiscrete) 19 | where 20 | 0 returns NOOP [ 0, 0, 0, ...] 21 | 1 returns max for the first discrete space [max, 0, 0, ...] 22 | 2 returns max for the second discrete space [ 0, max, 0, ...] 23 | etc. 24 | Configuration 2) - DiscreteToMultiDiscrete(multi_discrete, list_of_discrete) [2nd param is a list] 25 | Would adapt to a Discrete action space of size (1 + nb of items in list_of_discrete) 26 | e.g. 27 | if list_of_discrete = [0, 2] 28 | 0 returns NOOP [ 0, 0, 0, ...] 29 | 1 returns max for first discrete in list [max, 0, 0, ...] 30 | 2 returns max for second discrete in list [ 0, 0, max, ...] 31 | etc. 32 | Configuration 3) - DiscreteToMultiDiscrete(multi_discrete, discrete_mapping) [2nd param is a dict] 33 | Would adapt to a Discrete action space of size (nb_keys in discrete_mapping) 34 | where discrete_mapping is a dictionnary in the format { discrete_key: multi_discrete_mapping } 35 | e.g. 
for the Nintendo Game Controller [ [0,4], [0,1], [0,1] ] a possible mapping might be; 36 | mapping = { 37 | 0: [0, 0, 0], # NOOP 38 | 1: [1, 0, 0], # Up 39 | 2: [3, 0, 0], # Down 40 | 3: [2, 0, 0], # Right 41 | 4: [2, 1, 0], # Right + A 42 | 5: [2, 0, 1], # Right + B 43 | 6: [2, 1, 1], # Right + A + B 44 | 7: [4, 0, 0], # Left 45 | 8: [4, 1, 0], # Left + A 46 | 9: [4, 0, 1], # Left + B 47 | 10: [4, 1, 1], # Left + A + B 48 | 11: [0, 1, 0], # A only 49 | 12: [0, 0, 1], # B only, 50 | 13: [0, 1, 1], # A + B 51 | } 52 | """ 53 | def __init__(self, multi_discrete, options=None): 54 | assert isinstance(multi_discrete, MultiDiscrete) 55 | self.multi_discrete = multi_discrete 56 | self.num_discrete_space = self.multi_discrete.num_discrete_space 57 | 58 | # Config 1 59 | if options is None: 60 | self.n = self.num_discrete_space + 1 # +1 for NOOP at beginning 61 | self.mapping = {i: [0] * self.num_discrete_space for i in range(self.n)} 62 | for i in range(self.num_discrete_space): 63 | self.mapping[i + 1][i] = self.multi_discrete.high[i] 64 | 65 | # Config 2 66 | elif isinstance(options, list): 67 | assert len(options) <= self.num_discrete_space 68 | self.n = len(options) + 1 # +1 for NOOP at beginning 69 | self.mapping = {i: [0] * self.num_discrete_space for i in range(self.n)} 70 | for i, disc_num in enumerate(options): 71 | assert disc_num < self.num_discrete_space 72 | self.mapping[i + 1][disc_num] = self.multi_discrete.high[disc_num] 73 | 74 | # Config 3 75 | elif isinstance(options, dict): 76 | self.n = len(options.keys()) 77 | self.mapping = options 78 | for i, key in enumerate(options.keys()): 79 | if i != key: 80 | raise Error('DiscreteToMultiDiscrete must contain ordered keys. ' \ 81 | 'Item {0} should have a key of "{0}", but key "{1}" found instead.'.format(i, key)) 82 | if not self.multi_discrete.contains(options[key]): 83 | raise Error('DiscreteToMultiDiscrete mapping for key {0} is ' \ 84 | 'not contained in the underlying MultiDiscrete action space. ' \ 85 | 'Invalid mapping: {1}'.format(key, options[key])) 86 | # Unknown parameter provided 87 | else: 88 | raise Error('DiscreteToMultiDiscrete - Invalid parameter provided.') 89 | 90 | def __call__(self, discrete_action): 91 | return self.mapping[discrete_action] 92 | 93 | 94 | class BoxToMultiDiscrete(Box): 95 | """ 96 | Adapter that adapts the MultiDiscrete action space to a Box action space 97 | The converted action can be retrieved by calling the adapter with the box action 98 | box_to_multi_discrete = BoxToMultiDiscrete(multi_discrete) 99 | box_action = box_to_multi_discrete.sample() 100 | multi_discrete_action = box_to_multi_discrete(box_action) 101 | It can be initialized using 2 configurations: 102 | Configuration 1) - BoxToMultiDiscrete(multi_discrete) [2nd param is empty] 103 | Would adapt to a Box action space of shape (nb of discrete space, ), with the min-max of 104 | each Discrete space sets as Box boundaries 105 | e.g. a MultiDiscrete with parameters [ [0,4], [0,1], [0,1] ], adapted through BoxToMultiDiscrete(multi_discrete) 106 | would adapt to a Box with parameters low=np.array([0.0, 0.0, 0.0]) high=np.array([4.0, 1.0, 1.0]) 107 | The box action would then be rounded to the nearest integer. 108 | e.g. 
[ 2.560453, 0.3523456, 0.674546 ] would be converted to the multi discrete action of [3, 0, 1] 109 | Configuration 2) - BoxToMultiDiscrete(multi_discrete, list_of_discrete) [2nd param is a list] 110 | Would adapt to a Box action space of shape (nb of items in list_of_discrete, ), where list_of_discrete 111 | is the index of the discrete space in the MultiDiscrete space 112 | e.g. a MultiDiscrete with parameters [ [0,4], [0,1], [0,1] ], adapted through BoxToMultiDiscrete(multi_discrete, [2, 0]) 113 | would adapt to a Box with parameters low=np.array([0.0, 0.0]) high=np.array([1.0, 4.0]) 114 | where 115 | 0.0 = min(discrete space #2), 1.0 = max(discrete space #2) 116 | 0.0 = min(discrete space #0), 4.0 = max(discrete space #0) 117 | The box action would then be rounded to the nearest integer and mapped to the correct discrete space in multi-discrete. 118 | e.g. [ 0.7412057, 3.0174142 ] would be converted to the multi discrete action of [3, 0, 1] 119 | This configuration is useful if you want to ignore certain discrete spaces in the MultiDiscrete space. 120 | """ 121 | def __init__(self, multi_discrete, options=None): 122 | assert isinstance(multi_discrete, MultiDiscrete) 123 | self.multi_discrete = multi_discrete 124 | self.num_discrete_space = self.multi_discrete.num_discrete_space 125 | 126 | if options is None: 127 | options = list(range(self.num_discrete_space)) 128 | 129 | if not isinstance(options, list): 130 | raise Error('BoxToMultiDiscrete - Invalid parameter provided.') 131 | 132 | assert len(options) <= self.num_discrete_space 133 | self.low = np.array([self.multi_discrete.low[x] for x in options]) 134 | self.high = np.array([self.multi_discrete.high[x] for x in options]) 135 | self.mapping = { i: disc_num for i, disc_num in enumerate(options)} 136 | 137 | def __call__(self, box_action): 138 | multi_discrete_action = [0] * self.num_discrete_space 139 | for i in self.mapping: 140 | multi_discrete_action[self.mapping[i]] = int(round(box_action[i], 0)) 141 | return multi_discrete_action 142 | -------------------------------------------------------------------------------- /src/common/rl/wrappers/doom/multi_discrete.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import gym 4 | from gym.spaces import prng, Discrete, Box, MultiDiscrete 5 | from gym.error import Error 6 | 7 | # Adapters 8 | 9 | class DiscreteToMultiDiscrete(Discrete): 10 | """ 11 | Adapter that adapts the MultiDiscrete action space to a Discrete action space of any size 12 | The converted action can be retrieved by calling the adapter with the discrete action 13 | discrete_to_multi_discrete = DiscreteToMultiDiscrete(multi_discrete) 14 | discrete_action = discrete_to_multi_discrete.sample() 15 | multi_discrete_action = discrete_to_multi_discrete(discrete_action) 16 | It can be initialized using 3 configurations: 17 | Configuration 1) - DiscreteToMultiDiscrete(multi_discrete) [2nd param is empty] 18 | Would adapt to a Discrete action space of size (1 + nb of discrete in MultiDiscrete) 19 | where 20 | 0 returns NOOP [ 0, 0, 0, ...] 21 | 1 returns max for the first discrete space [max, 0, 0, ...] 22 | 2 returns max for the second discrete space [ 0, max, 0, ...] 23 | etc. 24 | Configuration 2) - DiscreteToMultiDiscrete(multi_discrete, list_of_discrete) [2nd param is a list] 25 | Would adapt to a Discrete action space of size (1 + nb of items in list_of_discrete) 26 | e.g. 27 | if list_of_discrete = [0, 2] 28 | 0 returns NOOP [ 0, 0, 0, ...] 
29 | 1 returns max for first discrete in list [max, 0, 0, ...] 30 | 2 returns max for second discrete in list [ 0, 0, max, ...] 31 | etc. 32 | Configuration 3) - DiscreteToMultiDiscrete(multi_discrete, discrete_mapping) [2nd param is a dict] 33 | Would adapt to a Discrete action space of size (nb_keys in discrete_mapping) 34 | where discrete_mapping is a dictionnary in the format { discrete_key: multi_discrete_mapping } 35 | e.g. for the Nintendo Game Controller [ [0,4], [0,1], [0,1] ] a possible mapping might be; 36 | mapping = { 37 | 0: [0, 0, 0], # NOOP 38 | 1: [1, 0, 0], # Up 39 | 2: [3, 0, 0], # Down 40 | 3: [2, 0, 0], # Right 41 | 4: [2, 1, 0], # Right + A 42 | 5: [2, 0, 1], # Right + B 43 | 6: [2, 1, 1], # Right + A + B 44 | 7: [4, 0, 0], # Left 45 | 8: [4, 1, 0], # Left + A 46 | 9: [4, 0, 1], # Left + B 47 | 10: [4, 1, 1], # Left + A + B 48 | 11: [0, 1, 0], # A only 49 | 12: [0, 0, 1], # B only, 50 | 13: [0, 1, 1], # A + B 51 | } 52 | """ 53 | def __init__(self, multi_discrete, options=None): 54 | assert isinstance(multi_discrete, MultiDiscrete) 55 | self.multi_discrete = multi_discrete 56 | self.num_discrete_space = self.multi_discrete.num_discrete_space 57 | 58 | # Config 1 59 | if options is None: 60 | self.n = self.num_discrete_space + 1 # +1 for NOOP at beginning 61 | self.mapping = {i: [0] * self.num_discrete_space for i in range(self.n)} 62 | for i in range(self.num_discrete_space): 63 | self.mapping[i + 1][i] = self.multi_discrete.high[i] 64 | 65 | # Config 2 66 | elif isinstance(options, list): 67 | assert len(options) <= self.num_discrete_space 68 | self.n = len(options) + 1 # +1 for NOOP at beginning 69 | self.mapping = {i: [0] * self.num_discrete_space for i in range(self.n)} 70 | for i, disc_num in enumerate(options): 71 | assert disc_num < self.num_discrete_space 72 | self.mapping[i + 1][disc_num] = self.multi_discrete.high[disc_num] 73 | 74 | # Config 3 75 | elif isinstance(options, dict): 76 | self.n = len(options.keys()) 77 | self.mapping = options 78 | for i, key in enumerate(options.keys()): 79 | if i != key: 80 | raise Error('DiscreteToMultiDiscrete must contain ordered keys. ' \ 81 | 'Item {0} should have a key of "{0}", but key "{1}" found instead.'.format(i, key)) 82 | if not self.multi_discrete.contains(options[key]): 83 | raise Error('DiscreteToMultiDiscrete mapping for key {0} is ' \ 84 | 'not contained in the underlying MultiDiscrete action space. ' \ 85 | 'Invalid mapping: {1}'.format(key, options[key])) 86 | # Unknown parameter provided 87 | else: 88 | raise Error('DiscreteToMultiDiscrete - Invalid parameter provided.') 89 | 90 | def __call__(self, discrete_action): 91 | return self.mapping[discrete_action] 92 | 93 | 94 | class BoxToMultiDiscrete(Box): 95 | """ 96 | Adapter that adapts the MultiDiscrete action space to a Box action space 97 | The converted action can be retrieved by calling the adapter with the box action 98 | box_to_multi_discrete = BoxToMultiDiscrete(multi_discrete) 99 | box_action = box_to_multi_discrete.sample() 100 | multi_discrete_action = box_to_multi_discrete(box_action) 101 | It can be initialized using 2 configurations: 102 | Configuration 1) - BoxToMultiDiscrete(multi_discrete) [2nd param is empty] 103 | Would adapt to a Box action space of shape (nb of discrete space, ), with the min-max of 104 | each Discrete space sets as Box boundaries 105 | e.g. 
a MultiDiscrete with parameters [ [0,4], [0,1], [0,1] ], adapted through BoxToMultiDiscrete(multi_discrete) 106 | would adapt to a Box with parameters low=np.array([0.0, 0.0, 0.0]) high=np.array([4.0, 1.0, 1.0]) 107 | The box action would then be rounded to the nearest integer. 108 | e.g. [ 2.560453, 0.3523456, 0.674546 ] would be converted to the multi discrete action of [3, 0, 1] 109 | Configuration 2) - BoxToMultiDiscrete(multi_discrete, list_of_discrete) [2nd param is a list] 110 | Would adapt to a Box action space of shape (nb of items in list_of_discrete, ), where list_of_discrete 111 | is the index of the discrete space in the MultiDiscrete space 112 | e.g. a MultiDiscrete with parameters [ [0,4], [0,1], [0,1] ], adapted through BoxToMultiDiscrete(multi_discrete, [2, 0]) 113 | would adapt to a Box with parameters low=np.array([0.0, 0.0]) high=np.array([1.0, 4.0]) 114 | where 115 | 0.0 = min(discrete space #2), 1.0 = max(discrete space #2) 116 | 0.0 = min(discrete space #0), 4.0 = max(discrete space #0) 117 | The box action would then be rounded to the nearest integer and mapped to the correct discrete space in multi-discrete. 118 | e.g. [ 0.7412057, 3.0174142 ] would be converted to the multi discrete action of [3, 0, 1] 119 | This configuration is useful if you want to ignore certain discrete spaces in the MultiDiscrete space. 120 | """ 121 | def __init__(self, multi_discrete, options=None): 122 | assert isinstance(multi_discrete, MultiDiscrete) 123 | self.multi_discrete = multi_discrete 124 | self.num_discrete_space = self.multi_discrete.num_discrete_space 125 | 126 | if options is None: 127 | options = list(range(self.num_discrete_space)) 128 | 129 | if not isinstance(options, list): 130 | raise Error('BoxToMultiDiscrete - Invalid parameter provided.') 131 | 132 | assert len(options) <= self.num_discrete_space 133 | self.low = np.array([self.multi_discrete.low[x] for x in options]) 134 | self.high = np.array([self.multi_discrete.high[x] for x in options]) 135 | self.mapping = { i: disc_num for i, disc_num in enumerate(options)} 136 | 137 | def __call__(self, box_action): 138 | multi_discrete_action = [0] * self.num_discrete_space 139 | for i in self.mapping: 140 | multi_discrete_action[self.mapping[i]] = int(round(box_action[i], 0)) 141 | return multi_discrete_action 142 | -------------------------------------------------------------------------------- /src/common/rl/wrappers/atari.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Atari Wrappers from https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py 4 | @author: thomas 5 | """ 6 | 7 | import numpy as np 8 | from collections import deque 9 | import gym 10 | from gym import spaces 11 | #import cv2 12 | #cv2.ocl.setUseOpenCL(False) 13 | 14 | class NoopResetEnv(gym.Wrapper): 15 | def __init__(self, env, noop_max=30): 16 | """Sample initial states by taking random number of no-ops on reset. 17 | No-op is assumed to be action 0. 
18 | """ 19 | gym.Wrapper.__init__(self, env) 20 | self.noop_max = noop_max 21 | self.override_num_noops = None 22 | self.noop_action = 0 23 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 24 | 25 | def reset(self, **kwargs): 26 | """ Do no-op action for a number of steps in [1, noop_max].""" 27 | self.env.reset(**kwargs) 28 | if self.override_num_noops is not None: 29 | noops = self.override_num_noops 30 | else: 31 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 32 | assert noops > 0 33 | obs = None 34 | for _ in range(noops): 35 | obs, _, done, _ = self.env.step(self.noop_action) 36 | if done: 37 | obs = self.env.reset(**kwargs) 38 | return obs 39 | 40 | def step(self, ac): 41 | return self.env.step(ac) 42 | 43 | class FireResetEnv(gym.Wrapper): 44 | def __init__(self, env): 45 | """Take action on reset for environments that are fixed until firing.""" 46 | gym.Wrapper.__init__(self, env) 47 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 48 | assert len(env.unwrapped.get_action_meanings()) >= 3 49 | 50 | def reset(self, **kwargs): 51 | self.env.reset(**kwargs) 52 | obs, _, done, _ = self.env.step(1) 53 | if done: 54 | self.env.reset(**kwargs) 55 | obs, _, done, _ = self.env.step(2) 56 | if done: 57 | self.env.reset(**kwargs) 58 | return obs 59 | 60 | def step(self, ac): 61 | return self.env.step(ac) 62 | 63 | class EpisodicLifeEnv(gym.Wrapper): 64 | def __init__(self, env): 65 | """Make end-of-life == end-of-episode, but only reset on true game over. 66 | Done by DeepMind for the DQN and co. since it helps value estimation. 67 | """ 68 | gym.Wrapper.__init__(self, env) 69 | self.lives = 0 70 | self.was_real_done = True 71 | 72 | def step(self, action): 73 | obs, reward, done, info = self.env.step(action) 74 | self.was_real_done = done 75 | # check current lives, make loss of life terminal, 76 | # then update lives to handle bonus lives 77 | lives = self.env.unwrapped.ale.lives() 78 | if lives < self.lives and lives > 0: 79 | # for Qbert sometimes we stay in lives == 0 condtion for a few frames 80 | # so its important to keep lives > 0, so that we only reset once 81 | # the environment advertises done. 82 | done = True 83 | self.lives = lives 84 | return obs, reward, done, info 85 | 86 | def reset(self, **kwargs): 87 | """Reset only when lives are exhausted. 88 | This way all states are still reachable even though lives are episodic, 89 | and the learner need not know about any of this behind-the-scenes. 
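        A real game over triggers a full env.reset(); a mere life loss only advances one no-op step, so play resumes from the current screen.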
90 | """ 91 | if self.was_real_done: 92 | obs = self.env.reset(**kwargs) 93 | else: 94 | # no-op step to advance from terminal/lost life state 95 | obs, _, _, _ = self.env.step(0) 96 | self.lives = self.env.unwrapped.ale.lives() 97 | return obs 98 | 99 | class MaxAndSkipEnv(gym.Wrapper): 100 | def __init__(self, env, skip=4): 101 | """Return only every `skip`-th frame""" 102 | gym.Wrapper.__init__(self, env) 103 | # most recent raw observations (for max pooling across time steps) 104 | self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) 105 | self._skip = skip 106 | 107 | def step(self, action): 108 | """Repeat action, sum reward, and max over last observations.""" 109 | total_reward = 0.0 110 | done = None 111 | for i in range(self._skip): 112 | obs, reward, done, info = self.env.step(action) 113 | if i == self._skip - 2: self._obs_buffer[0] = obs 114 | if i == self._skip - 1: self._obs_buffer[1] = obs 115 | total_reward += reward 116 | if done: 117 | break 118 | # Note that the observation on the done=True frame 119 | # doesn't matter 120 | max_frame = self._obs_buffer.max(axis=0) 121 | 122 | return max_frame, total_reward, done, info 123 | 124 | def reset(self, **kwargs): 125 | return self.env.reset(**kwargs) 126 | 127 | class ClipRewardWrapper(gym.RewardWrapper): 128 | def __init__(self, env): 129 | gym.RewardWrapper.__init__(self, env) 130 | 131 | def reward(self, reward): 132 | """Bin reward to {+1, 0, -1} by its sign.""" 133 | return np.sign(reward) 134 | 135 | #class WarpFrame(gym.ObservationWrapper): 136 | # def __init__(self, env): 137 | # """Warp frames to 84x84 as done in the Nature paper and later work.""" 138 | # gym.ObservationWrapper.__init__(self, env) 139 | # self.width = 84 140 | # self.height = 84 141 | # self.observation_space = spaces.Box(low=0, high=255, 142 | # shape=(self.height, self.width, 1), dtype=np.uint8) 143 | # 144 | # def observation(self, frame): 145 | # frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 146 | # frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA) 147 | # return frame[:, :, None] 148 | 149 | class FrameStack(gym.Wrapper): 150 | def __init__(self, env, k): 151 | """Stack k last frames. 152 | Returns lazy array, which is much more memory efficient. 153 | See Also 154 | -------- 155 | baselines.common.atari_wrappers.LazyFrames 156 | """ 157 | gym.Wrapper.__init__(self, env) 158 | self.k = k 159 | self.frames = deque([], maxlen=k) 160 | shp = env.observation_space.shape 161 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8) 162 | 163 | def reset(self): 164 | ob = self.env.reset() 165 | for _ in range(self.k): 166 | self.frames.append(ob) 167 | return self._get_ob() 168 | 169 | def step(self, action): 170 | ob, reward, done, info = self.env.step(action) 171 | self.frames.append(ob) 172 | return self._get_ob(), reward, done, info 173 | 174 | def _get_ob(self): 175 | assert len(self.frames) == self.k 176 | return LazyFrames(list(self.frames)) 177 | 178 | class ScaledFloatFrame(gym.ObservationWrapper): 179 | def __init__(self, env): 180 | gym.ObservationWrapper.__init__(self, env) 181 | 182 | def observation(self, observation): 183 | # careful! This undoes the memory optimization, use 184 | # with smaller replay buffers only. 
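        # i.e. uint8 frames in [0, 255] become float32 arrays in [0.0, 1.0]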
185 |         return np.array(observation).astype(np.float32) / 255.0
186 | 
187 | class LazyFrames(object):
188 |     def __init__(self, frames):
189 |         """This object ensures that common frames between the observations are only stored once.
190 |         It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
191 |         buffers.
192 |         This object should only be converted to numpy array before being passed to the model.
193 |         You'd not believe how complex the previous solution was."""
194 |         self._frames = frames
195 |         self._out = None
196 | 
197 |     def _force(self):
198 |         if self._out is None:
199 |             self._out = np.concatenate(self._frames, axis=2)
200 |             self._frames = None
201 |         return self._out
202 | 
203 |     def __array__(self, dtype=None):
204 |         out = self._force()
205 |         if dtype is not None:
206 |             out = out.astype(dtype)
207 |         return out
208 | 
209 |     def __len__(self):
210 |         return len(self._force())
211 | 
212 |     def __getitem__(self, i):
213 |         return self._force()[i]
214 | 
215 | def make_atari(env_id):
216 |     env = gym.make(env_id)
217 |     assert 'NoFrameskip' in env.spec.id
218 |     env = NoopResetEnv(env, noop_max=30)
219 |     env = MaxAndSkipEnv(env, skip=4)
220 |     return env
221 | 
222 | def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False):
223 |     """Configure environment for DeepMind-style Atari.
224 |     """
225 |     if episode_life:
226 |         env = EpisodicLifeEnv(env)
227 |     if 'FIRE' in env.unwrapped.get_action_meanings():
228 |         env = FireResetEnv(env)
229 |     # env = WarpFrame(env) removed for now, needs cv2
230 |     if scale:
231 |         env = ScaledFloatFrame(env)
232 |     if clip_rewards:
233 |         env = ClipRewardWrapper(env)
234 |     if frame_stack:
235 |         env = FrameStack(env, 4)
236 |     return env
--------------------------------------------------------------------------------
/src/network.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Neural network specification
5 | @author: thomas
6 | """
7 | 
8 | import tensorflow as tf
9 | import tensorflow.contrib.slim as slim
10 | import numpy as np
11 | import random
12 | from common.rl.make_game import check_space
13 | 
14 | from pdb import set_trace
15 | 
16 | 
17 | class Model():
18 | 
19 |     def __init__(self,Env,lr,n_mix,clip_gradient_norm,loss_type='count',bound='tanh',temp=1.0,entropy_l=0.0):
20 | 
21 |         self.action_dim, self.action_discrete = check_space(Env.action_space)
22 |         self.state_dim, self.state_discrete = check_space(Env.observation_space)
23 | 
24 |         if self.action_discrete:
25 |             raise ValueError('Discrete action space not implemented')
26 |         if len(self.action_dim) > 1:
27 |             raise ValueError('Cant handle multidimensional action spaces')
28 |         else:
29 |             self.action_dim = self.action_dim[0]
30 |             self.scale = Env.action_space.high[0] # assumes a symmetric action space [-scale,scale] for all action_dim
31 | 
32 |         # placeholders
33 |         if not self.state_discrete:
34 |             self.x = x = tf.placeholder("float32", shape=np.append(None,self.state_dim),name='x') # s
35 |         else:
36 |             self.x = x = tf.placeholder("int32", shape=np.append(None,1)) # s
37 |             x = tf.squeeze(tf.one_hot(x,self.state_dim,axis=1),axis=2)
38 | 
39 |         # feedforward
40 |         for i in range(2):
41 |             x = slim.fully_connected(x,128,activation_fn=tf.nn.elu)
42 | 
43 |         # Mixture of Gaussians
44 |         if self.action_discrete:
45 |             raise ValueError('Only works for continuous outputs')
46 |         #print(self.action_dim)
47 |         n_params = n_mix *(2 * self.action_dim)
48 |         z =
slim.fully_connected(x,n_params,activation_fn=None) 49 | if n_mix > 1: 50 | logits = slim.fully_connected(x,n_mix,activation_fn=None) 51 | 52 | # params 53 | #self.sigma_p = sigma_p = tf.Print(sigma_p,[sigma_p],summarize=16) 54 | 55 | # Make distribution 56 | if bound == 'tanh': 57 | self.mu_p = mu_p = z[:,:(self.action_dim*n_mix)] 58 | log_sigma = z[:,(self.action_dim*n_mix):(2*self.action_dim*n_mix)] 59 | self.sigma_p = sigma_p = tf.clip_by_value(tf.nn.softplus(log_sigma),0.001,10000) 60 | if n_mix == 1: 61 | if self.action_dim == 1: 62 | outdist = tf.distributions.Normal(mu_p,sigma_p) 63 | else: 64 | outdist = tf.contrib.distributions.MultivariateNormalDiag(mu_p,sigma_p) 65 | else: 66 | p_dist = tf.distributions.Categorical(logits=logits,validate_args=True,allow_nan_stats=False) 67 | n_dist = [] 68 | for i in range(n_mix): 69 | if self.action_dim == 1: 70 | n_dist.append(tf.distributions.Normal(mu_p[:,i],sigma_p[:,i])) 71 | else: 72 | n_dist.append(tf.contrib.distributions.MultivariateNormalDiag(loc=mu_p[:,(i*self.action_dim):((i+1)*self.action_dim)],scale_diag=sigma_p[:,(i*self.action_dim):((i+1)*self.action_dim)])) 73 | outdist = tf.contrib.distributions.Mixture(cat=p_dist,components=n_dist) 74 | # Wrap distribution 75 | outdist = BoundedDistribution(outdist,scale=self.scale) 76 | elif bound == 'beta': 77 | self.alpha = alpha = z[:,:(self.action_dim*n_mix)] 78 | self.beta = beta = z[:,(self.action_dim*n_mix):(2*self.action_dim*n_mix)] 79 | if n_mix == 1: 80 | outdist = tf.contrib.distributions.BetaWithSoftplusConcentration(alpha,beta) 81 | outdist = BoundedDistributionBeta(outdist,scale=self.scale) 82 | self.entropy = outdist.entropy() 83 | else: 84 | raise ValueError('Beta bounding not implemented for n_mix >1') 85 | else: 86 | raise ValueError('Unknown bounding type: {}'.format(bound)) 87 | 88 | # V loss 89 | self.V_hat = slim.fully_connected(x,1,activation_fn=None) 90 | self.V = tf.placeholder("float32", shape=[None,1],name='V') 91 | self.V_loss = tf.losses.mean_squared_error(labels=self.V,predictions=self.V_hat) 92 | 93 | # pi loss (needs a) 94 | self.a = a = tf.placeholder("float32", shape=np.append(None,self.action_dim),name='a') 95 | self.log_pi_a_s = outdist.log_prob(a) # shape (batch,) 96 | self.pi_hat = outdist.prob(a) # shape (batch,) 97 | if loss_type == 'count': 98 | self.n_a = n_a = tf.placeholder("float32", shape=np.append(None,1),name='n_a') 99 | pi_loss = tf.stop_gradient(self.log_pi_a_s - tf.log(tf.squeeze(n_a,axis=1))) * self.log_pi_a_s 100 | elif loss_type == 'Q': 101 | self.n_a = n_a = tf.placeholder("float32", shape=np.append(None,1),name='Q') 102 | pi_loss = tf.stop_gradient(self.log_pi_a_s - tf.squeeze((n_a*temp) - self.V_hat,axis=1)) * self.log_pi_a_s 103 | self.pi_loss = tf.reduce_mean(pi_loss) 104 | self.sample = outdist.sample() 105 | self.pi_sample = outdist.prob(self.sample) 106 | 107 | # training 108 | self.loss = self.V_loss + self.pi_loss 109 | if bound == 'beta': 110 | self.loss -= tf.reduce_mean(entropy_l * self.entropy) 111 | self.lr = tf.Variable(lr,name="learning_rate",trainable=False) 112 | optimizer = tf.train.RMSPropOptimizer(learning_rate=lr) 113 | var_list = tf.trainable_variables() 114 | grads = tf.gradients(self.loss, var_list) 115 | if clip_gradient_norm > 0.0: 116 | clip_global = tf.Variable(clip_gradient_norm,trainable=False) 117 | grads,self.gradient_norm = tf.clip_by_global_norm(grads, clip_global) 118 | else: 119 | self.gradient_norm = tf.global_norm(grads) 120 | gvs = list(zip(grads, var_list)) 121 | self.train_op = 
optimizer.apply_gradients(gvs)
122 | 
123 |     def train(self,D,n_epochs,lr):
124 |         sess = self.sess
125 |         D.reshuffle()
126 |         gn,VL,piL,V = [],[],[],[]
127 |         for epoch in range(n_epochs):
128 |             for sb,Vb,ab,a_nb in D:
129 |                 _,VL_,piL_,gn_,V_ = sess.run([self.train_op,self.V_loss,self.pi_loss,self.gradient_norm,self.V],
130 |                                              feed_dict={self.x:sb,
131 |                                                         self.V:Vb,
132 |                                                         self.a:ab,
133 |                                                         self.n_a:a_nb,
134 |                                                         self.lr:lr
135 |                                                         })
136 |                 gn.append(gn_)
137 |                 VL.append(VL_)
138 |                 piL.append(piL_)
139 |                 V.append(np.mean(V_))
140 |         if np.isnan(np.mean(gn)) or np.isnan(np.mean(VL)) or np.isnan(np.mean(piL)) or np.isnan(np.mean(V)):
141 |             set_trace()
142 |         t_loss = np.mean(VL)+np.mean(piL)
143 |         return {'V_loss':np.mean(VL),'pi_loss':np.mean(piL),'gn':np.mean(gn),'total_loss':t_loss,'V':np.mean(V)}
144 | 
145 |     def predict_V(self,s):
146 |         sess = self.sess
147 |         return sess.run(self.V_hat,feed_dict={self.x:s})
148 | 
149 |     def predict_pi(self,s,a):
150 |         sess = self.sess
151 |         return sess.run(self.pi_hat,feed_dict={self.x:s,
152 |                                                self.a:a})
153 | 
154 |     def log_prob(self,s,a):
155 |         return self.sess.run([self.log_pi_a_s],feed_dict={self.x:s,
156 |                                                           self.a:a})
157 | 
158 |     def sample_action(self,s):
159 |         sess = self.sess
160 |         mix_list = sess.run(self.p_dist.sample(),feed_dict={self.x:s})
161 |         samples = np.array([sess.run(self.n_dist[mix].sample(),feed_dict={self.x:s}) for mix in mix_list])
162 |         return samples
163 | 
164 |     def sample_action_and_pi(self,s):
165 |         sess = self.sess
166 |         return sess.run([self.sample,self.pi_sample],feed_dict={self.x:s})
167 | 
168 | class BoundedDistribution(object):
169 |     ''' Bounded transformation of arbitrary continuous density with support on real line '''
170 | 
171 |     def __init__(self,dist,scale):
172 |         self.dist = dist
173 |         self.scale = scale
174 | 
175 |     def to_u(self,a):
176 |         return tf.atanh(tf.clip_by_value(a/self.scale,-0.999999,0.999999)) # clip what goes into atanh
177 | 
178 |     def to_a(self,u):
179 |         return self.scale*tf.tanh(u)
180 | 
181 |     def sample(self):
182 |         return self.to_a(self.dist.sample())
183 | 
184 |     def log_prob(self,a):
185 |         u = self.to_u(a)
186 |         return self.dist.log_prob(u) - tf.reduce_sum(tf.log(self.scale*(1-tf.square(
187 |                 tf.clip_by_value(tf.tanh(u),-0.999999,0.999999)))),axis=1) # clip what comes out of tanh and goes into log
188 | 
189 |     def prob(self,a):
190 |         return tf.exp(self.log_prob(a))
191 | 
192 | class BoundedDistributionBeta(object):
193 |     ''' Bounded transformation of Beta distribution '''
194 | 
195 |     def __init__(self,dist,scale):
196 |         self.dist = dist
197 |         self.scale = scale
198 | 
199 |     def to_u(self,a):
200 |         return tf.clip_by_value(((a/self.scale) + 1.0)/2.0,0.00001,0.999999)
201 | 
202 |     def to_a(self,u):
203 |         return self.scale * ((2.0 * u) - 1.0)
204 | 
205 |     def sample(self):
206 |         return self.to_a(self.dist.sample())
207 | 
208 |     def log_prob(self,a):
209 |         u = self.to_u(a)
210 |         shape = a.get_shape().as_list()
211 |         constants = shape[-1]*tf.log(tf.constant(np.array(2.0)*np.squeeze(self.scale),dtype='float32'))
212 |         return tf.reduce_sum(self.dist.log_prob(u),axis=1) - constants
213 | 
214 |     def prob(self,a):
215 |         return tf.exp(self.log_prob(a))
216 | 
217 |     def entropy(self):
218 |         return self.dist.entropy()
219 | 
220 | 
221 | class Database():
222 |     ''' Database '''
223 | 
224 |     def __init__(self,max_size,batch_size):
225 |         self.max_size = max_size
226 |         self.batch_size = batch_size
227 |         self.size = 0
228 |         self.insert_index = 0
229 |         self.experience = []
230 |         self.sample_array = None
231 |         self.sample_index = 0
232 | 
233 | def clear(self): 234 | self.experience = [] 235 | self.insert_index = 0 236 | self.size = 0 237 | 238 | def store(self,experience): 239 | if self.size < self.max_size: 240 | self.experience.append(experience) 241 | self.size +=1 242 | else: 243 | self.experience[self.insert_index] = experience 244 | self.insert_index += 1 245 | if self.insert_index >= self.size: 246 | self.insert_index = 0 247 | 248 | def store_from_array(self,*args): 249 | for i in range(args[0].shape[0]): 250 | entry = [] 251 | for arg in args: 252 | entry.append(arg[i]) 253 | self.store(entry) 254 | 255 | def reshuffle(self): 256 | self.sample_array = np.arange(self.size) 257 | random.shuffle(self.sample_array) 258 | self.sample_index = 0 259 | 260 | def __iter__(self): 261 | return self 262 | 263 | def __next__(self): 264 | if (self.sample_index + self.batch_size > self.size) and (not self.sample_index == 0): 265 | self.reshuffle() # Reset for the next epoch 266 | raise(StopIteration) 267 | 268 | if (self.sample_index + 2*self.batch_size > self.size): 269 | indices = self.sample_array[self.sample_index:] 270 | batch = [self.experience[i] for i in indices] 271 | else: 272 | indices = self.sample_array[self.sample_index:self.sample_index+self.batch_size] 273 | batch = [self.experience[i] for i in indices] 274 | self.sample_index += self.batch_size 275 | 276 | arrays = [] 277 | for i in range(len(batch[0])): 278 | to_add = np.array([entry[i] for entry in batch]) 279 | arrays.append(to_add) 280 | return tuple(arrays) 281 | 282 | next = __next__ -------------------------------------------------------------------------------- /src/rl/envs/chain.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Chain environment 4 | @author: thomas 5 | """ 6 | 7 | import gym.spaces 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | from scipy.stats import norm 11 | from rl.policies import get_net_mean, get_net_params, sample_sd, analytic_sd, thompson_policy, ucb_policy 12 | import matplotlib.patches as patches 13 | 14 | #plt.style.use('ggplot') 15 | plt.rcParams['lines.linewidth'] = 4 16 | plt.rcParams.update({'font.size': 11}) 17 | plt.rcParams['axes.facecolor']='white' 18 | plt.rcParams['savefig.facecolor']='white' 19 | plt.rcParams['font.family'] = 'sans-serif' 20 | plt.rcParams['font.sans-serif'] = ['Latin Modern Math'] 21 | plt.rcParams['xtick.labelsize'] = 15 22 | plt.rcParams['font.weight'] = 'bold' 23 | plt.rcParams['ytick.labelsize'] = 15 24 | plt.locator_params(axis='x', nticks=3) 25 | plt.ion() 26 | 27 | class ChainOrdered(object): 28 | ''' Chain domain ''' 29 | 30 | def __init__(self,n=10): 31 | # n = length of chain 32 | self.action_space = gym.spaces.Discrete(2) 33 | self.observation_space = gym.spaces.Discrete(n+1) 34 | self.n = n 35 | self.state = 0 36 | self.correct = np.repeat(1,n) 37 | 38 | def reset(self): 39 | self.state = 0 40 | return self.state 41 | 42 | def step(self,a): 43 | if a == 0: 44 | # move back 45 | self.state = 0 46 | r = 0 47 | terminal = True 48 | elif a == 1: 49 | # move forward 50 | self.state += 1 51 | if self.state == self.n: 52 | r = 1 53 | terminal = True 54 | else: 55 | r = 0 56 | terminal = False 57 | else: 58 | raise ValueError('Action not possible') 59 | 60 | return self.state,r,terminal, {} 61 | 62 | def seed(self,seed): 63 | pass # deterministic anyway 64 | 65 | class Chain(object): 66 | ''' Chain domain ''' 67 | 68 | def __init__(self,n=10): 69 | # n = length of chain 70 | self.action_space = 
gym.spaces.Discrete(2) 71 | self.observation_space = gym.spaces.Discrete(n+1) 72 | self.n = n 73 | self.state = 0 74 | self.correct = np.random.randint(0,2,n) # correct action in each state 75 | self.counts = np.zeros((self.n,2)) 76 | 77 | def reset(self): 78 | self.state = 0 79 | return self.state 80 | 81 | def step(self,a): 82 | self.counts[self.state,a] += 1 83 | if a != self.correct[self.state]: 84 | # move back 85 | self.state = 0 86 | r = 0 87 | terminal = True 88 | elif a == self.correct[self.state]: 89 | # move forward 90 | self.state += 1 91 | if self.state == self.n: 92 | r = 1 93 | terminal = True 94 | else: 95 | r = 0 96 | terminal = False 97 | else: 98 | raise ValueError('Action not possible') 99 | 100 | return self.state,r,terminal, {} 101 | 102 | def seed(self,seed): 103 | pass # deterministic anyway 104 | 105 | 106 | class ChainLoop(object): 107 | ''' Chain domain ''' 108 | 109 | def __init__(self,n=10): 110 | # n = length of chain 111 | self.action_space = gym.spaces.Discrete(2) 112 | self.observation_space = gym.spaces.Discrete(n+1) 113 | self.n = n 114 | self.state = 0 115 | self.correct = np.random.randint(0,2,n) # correct action in each state 116 | self.counts = np.zeros((self.n,2)) 117 | 118 | def reset(self): 119 | self.state = 0 120 | return self.state 121 | 122 | def step(self,a): 123 | self.counts[self.state,a] += 1 124 | if a != self.correct[self.state]: 125 | # move back 126 | self.state = 0 127 | r = 0 128 | terminal = False 129 | elif a == self.correct[self.state]: 130 | # move forward 131 | self.state += 1 132 | if self.state == self.n: 133 | r = 1 134 | terminal = True 135 | else: 136 | r = 0 137 | terminal = False 138 | else: 139 | raise ValueError('Action not possible') 140 | 141 | return self.state,r,terminal, {} 142 | 143 | def seed(self,seed): 144 | pass # deterministic anyway 145 | 146 | class ChainDomainPlotter(object): 147 | 148 | def __init__(self,Env): 149 | self.fig,self.ax = plt.subplots(1,figsize=(Env.n*2,4)) 150 | self.n = Env.n 151 | self.truth = Env.correct 152 | 153 | for i in range(self.n): 154 | for j in range(2): 155 | if self.truth[i]==j: 156 | col = 'g' 157 | else: 158 | col = 'r' 159 | self.ax.add_patch(patches.Circle((i,j), radius=0.05,color=col)) 160 | 161 | self.ax.set_xlim([-1,self.n+1]) 162 | self.ax.set_ylim([-1,2]) 163 | self.fig.canvas.draw() 164 | 165 | def update(self,counts): 166 | self.ax.clear() 167 | for i in range(self.n): 168 | for j in range(2): 169 | if self.truth[i]==j: 170 | col = 'g' 171 | else: 172 | col = 'r' 173 | self.ax.add_patch(patches.Circle((i,j), radius=0.05,color=col)) 174 | self.ax.text(i-0.2,j-0.2,'s = {}, a={}\n N = {}'.format(i,j,int(counts[i,j]))) 175 | 176 | self.fig.canvas.draw() 177 | 178 | class ChainPlotter(object): 179 | 180 | def __init__(self,truth,n_plot): 181 | self.fig,self.ax = plt.subplots(2,n_plot,figsize=(n_plot*10,4),sharex=True,sharey=True) 182 | self.pl = self.ax.flatten('F') 183 | self.n = 2*n_plot 184 | 185 | # setup for predictions 186 | self.sb = np.repeat(np.arange(0,n_plot,1),2)[:,None] 187 | self.ab = np.array([0,1]*n_plot)[:,None] 188 | self.truth = truth 189 | self.fig.canvas.draw() 190 | 191 | def update(self,sess,model,hps,ep): 192 | # clear plots 193 | for ax in self.pl: 194 | ax.clear() 195 | overall_means = np.zeros([hps.n_rep_visualize,self.n]) 196 | overall_max_dens = np.ones([self.n])*-np.inf 197 | for k in range(hps.n_rep_visualize): 198 | # get prediction parameters 199 | seed = [np.random.randint(1e15),np.random.randint(1e15)] # new seed 200 | params = 
get_net_params(sess,model,self.sb,self.ab,seed,hps.p_dropout) 201 | means = get_net_mean(sess,model,self.sb,self.ab,seed,hps.p_dropout,output=hps.output) 202 | overall_means[k,:] = means[:,0] 203 | #print(np.concatenate([np.array([0,0,1,1,2,2])[:,None],np.array([0,1,0,1,0,1])[:,None],params],axis=1)) 204 | 205 | # need to determine range 206 | if hps.output != 'categorical': 207 | if hps.output == 'gaussian': 208 | mu = params[:,0] 209 | sigma = params[:,1] 210 | elif hps.output == 'mog': 211 | mu = params[:,hps.n_mix:(hps.n_mix*2)] 212 | sigma = params[:,(2*hps.n_mix):(3*hps.n_mix)] 213 | elif hps.output == 'deterministic': 214 | mu = params[:,0] 215 | sigma = 1.0 216 | 217 | max_sd = np.max(sigma) 218 | lower,upper = np.min(mu)-3*max_sd,np.max(mu)+3*max_sd 219 | else: 220 | lower,upper = model.transformer.plot_edges[0],model.transformer.plot_edges[-1] 221 | 222 | # update all plots 223 | x = np.linspace(lower,upper,100) 224 | for i in range(self.n): 225 | #self.pl[i].set_xlim([lower,upper]) 226 | param = params[i,:] 227 | if hps.output == 'deterministic': 228 | max_dens = 1.0 229 | overall_max_dens[i] = 1.0 230 | mean = means[i] 231 | self.pl[i].plot([mean,mean],[0,max_dens],':') 232 | else: 233 | if hps.output == 'gaussian' or hps.output == 'mog': 234 | if hps.output == 'gaussian': 235 | dens = norm.pdf(x,param[0],param[1]) 236 | elif hps.output == 'mog': 237 | dens = [param[j]*norm.pdf(x,param[hps.n_mix+j],param[2*hps.n_mix+j]) for j in range(hps.n_mix)] 238 | dens = np.sum(np.array(dens),axis=0) 239 | #print(x,param,dens) 240 | self.pl[i].plot(x,dens,color='cornflowerblue') 241 | elif hps.output == 'categorical': 242 | dens = param 243 | edges = model.transformer.plot_edges 244 | self.pl[i].hist(model.transformer.means,bins=edges,weights=dens,color='cornflowerblue') 245 | overall_max_dens[i] = np.max([overall_max_dens[i],np.max(dens)]) 246 | # add the mean 247 | grand_means = np.mean(np.array(overall_means),axis=0) 248 | seed = [np.random.randint(1e15),np.random.randint(1e15)] # new seed for parametric uncertainty 249 | grand_sds = analytic_sd(sess,model,self.sb,self.ab,seed,hps.p_dropout,hps.output) 250 | #grand_sds = np.ones([len(grand_means),1]) 251 | 252 | # get policy estimates 253 | s = np.arange(0,int(self.n/2),1)[:,None] 254 | a_thompson = np.array([thompson_policy(s,model,sess,hps,seed,eval_on_mean_output=False,eval_on_mean_params=False) for i in range(100)]) 255 | a_ucb = np.array([ucb_policy(s,model,sess,hps,seed,eval_on_mean_output=False,eval_on_mean_params=False) for i in range(100)]) 256 | 257 | thompson_probs = np.zeros(self.n) 258 | ucb_probs = np.zeros(self.n) 259 | 260 | for j,(state,action) in enumerate(zip(self.sb,self.ab)): 261 | thompson_probs[j] = np.mean(a_thompson[:,state,:] == action) 262 | ucb_probs[j] = np.mean(a_ucb[:,state,:] == action) 263 | 264 | for i in range(self.n): 265 | grand_mean = grand_means[i] 266 | grand_sd = grand_sds[i] 267 | max_dens = overall_max_dens[i] #np.max(dens) if 'dens' in locals() else 1 268 | self.pl[i].plot([grand_mean,grand_mean],[0,max_dens],'--',color='orange') 269 | #self.pl[i].plot([grand_mean-2*grand_sd,grand_mean+2*grand_sd],[max_dens/2,max_dens/2],'--',color='orange') 270 | self.pl[i].text(0.1,0.75,'$\mu$={:0.2f}'.format(grand_mean),transform=self.pl[i].transAxes) 271 | self.pl[i].text(0.55,0.75,'$\sigma$={:0.2f}'.format(grand_sds[i][0]),transform=self.pl[i].transAxes) 272 | 273 | #self.pl[i].text(0.1,0.75,'$\mu$={:0.2f}\n$\sigma$={:0.2f}'.format(grand_mean,grand_sds[i][0]),transform=self.pl[i].transAxes) 274 | 
#self.pl[i].text(0.55,0.75,'tho={:0.2f}\nucb={:0.2f}'.format(thompson_probs[i],ucb_probs[i]),transform=self.pl[i].transAxes) 275 | 276 | 277 | for j in range(int(self.n/2)): 278 | for l in range(2): 279 | if self.truth[j]==l: 280 | val = 1. 281 | col = 'g' 282 | else: 283 | val = 0. 284 | col = 'r' 285 | self.ax[l,j].add_patch(patches.Rectangle((0.01,0.01),0.98,0.98,linewidth=10,edgecolor=col,facecolor='none',transform=self.ax[l,j].transAxes)) 286 | if j>0: 287 | plt.setp(self.ax[l,j].get_yticklabels(), visible=False) 288 | if l==0: 289 | plt.setp(self.ax[l,j].get_xticklabels(), visible=False) 290 | #self.ax[l,j].set_title('V={:0.2f}'.format(val)) 291 | self.ax[l,j].set_ylim([0,1.0]) 292 | self.ax[l,j].set_xlim([-2.5,2.5]) 293 | 294 | 295 | self.fig.canvas.draw() 296 | self.fig.savefig(hps.result_dir + 'episode_{}'.format(ep),dpi=300) 297 | self.fig.canvas.flush_events() 298 | 299 | # Test 300 | if __name__ == '__main__': 301 | Env = ChainOrdered() 302 | s = Env.reset() 303 | for i in range(500): 304 | a = Env.action_space.sample() 305 | s,r,terminal,_ = Env.step(a) 306 | if terminal: 307 | print('Died in step',i,'with reward',r,' restarting') 308 | s = Env.reset() 309 | print('Finished') --------------------------------------------------------------------------------
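The Database class defined in src/network.py is consumed by Model.train() as an iterator over (state, value target, action, visit count or Q target) mini-batches. Below is a minimal sketch of that usage pattern with dummy NumPy data, assuming src/ is on the Python path; the array shapes and the max_size/batch_size values are illustrative assumptions, not values taken from the repository's configuration.

import numpy as np
from network import Database  # Database as defined in src/network.py; assumes src/ is on the path

# Illustrative capacity and batch size only.
buffer = Database(max_size=1000, batch_size=32)

# Store 100 dummy transitions: state s, value target V, action a, visit count n_a.
s   = np.random.randn(100, 3).astype(np.float32)                      # e.g. a 3-dimensional observation
V   = np.random.randn(100, 1).astype(np.float32)                      # value targets
a   = np.random.uniform(-1.0, 1.0, size=(100, 1)).astype(np.float32)  # continuous actions
n_a = np.random.randint(1, 10, size=(100, 1)).astype(np.float32)      # MCTS visit counts
buffer.store_from_array(s, V, a, n_a)

# Iterate over shuffled mini-batches, as Model.train() does internally.
buffer.reshuffle()
for sb, Vb, ab, n_ab in buffer:
    print(sb.shape, Vb.shape, ab.shape, n_ab.shape)  # e.g. (32, 3) (32, 1) (32, 1) (32, 1)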