├── docs ├── .gitkeep ├── reports │ └── .gitkeep └── manuscript │ └── .gitkeep ├── src ├── .gitkeep ├── __pycache__ │ ├── mcts.cpython-35.pyc │ └── network.cpython-35.pyc ├── common │ ├── __pycache__ │ │ ├── putils.cpython-35.pyc │ │ ├── submit.cpython-35.pyc │ │ ├── __init__.cpython-35.pyc │ │ ├── hps_setup.cpython-35.pyc │ │ └── visualize.cpython-35.pyc │ ├── __init__.py │ ├── rl │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-35.pyc │ │ │ ├── atari_copy.cpython-35.pyc │ │ │ └── make_game.cpython-35.pyc │ │ ├── envs │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── chain.cpython-35.pyc │ │ │ │ └── __init__.cpython-35.pyc │ │ │ ├── chicken.py │ │ │ ├── taxi.py │ │ │ ├── chain.py │ │ │ └── grid.py │ │ ├── wrappers │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── atari.cpython-35.pyc │ │ │ │ ├── __init__.cpython-35.pyc │ │ │ │ └── control.cpython-35.pyc │ │ │ ├── doom │ │ │ │ ├── __init__.py │ │ │ │ ├── control.py │ │ │ │ ├── observation_space.py │ │ │ │ ├── custom_game.py │ │ │ │ ├── action_space.py │ │ │ │ └── multi_discrete.py │ │ │ ├── control.py │ │ │ └── atari.py │ │ ├── atari_copy.py │ │ ├── doom_helpers.py │ │ └── make_game.py │ ├── examples │ │ ├── submit_wrapper.py │ │ └── visualize_wrapper.py │ ├── putils.py │ ├── hps_setup.py │ └── submit.py ├── rl │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ ├── doom_setup.cpython-35.pyc │ │ ├── make_game.cpython-35.pyc │ │ ├── policies.cpython-35.pyc │ │ ├── atariwrapper.cpython-35.pyc │ │ └── rewardwrapper.cpython-35.pyc │ ├── envs │ │ ├── __pycache__ │ │ │ ├── chain.cpython-35.pyc │ │ │ ├── taxi.cpython-35.pyc │ │ │ └── toy.cpython-35.pyc │ │ ├── chicken.py │ │ ├── taxi.py │ │ ├── grid.py │ │ └── chain.py │ ├── wrappers │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-35.pyc │ │ │ ├── control.cpython-35.pyc │ │ │ ├── custom_game.cpython-35.pyc │ │ │ ├── action_space.cpython-35.pyc │ │ │ ├── multi_discrete.cpython-35.pyc │ │ │ └── observation_space.cpython-35.pyc │ │ ├── __init__.py │ │ ├── control.py │ │ ├── observation_space.py │ │ ├── custom_game.py │ │ ├── action_space.py │ │ └── multi_discrete.py │ ├── doom_setup.py │ ├── make_game.py │ ├── atariwrapper.py │ ├── rewardwrapper.py │ └── policies.py └── network.py ├── config ├── .gitkeep ├── __pycache__ │ └── hps.cpython-35.pyc └── hps.py ├── requirements.txt ├── CITATION.md ├── .gitignore ├── jobs ├── job_pendulum_final.sh ├── expand_job_call4.sh ├── expand_job_call.sh ├── expand_job_call2.sh ├── expand_job_call3.sh ├── job_Ant.sh ├── job_m.sh ├── expand_jobs_over_games.py ├── job_PM.sh ├── job_PM2.sh ├── job_p.sh ├── job_backup.sh └── job.sh ├── submit.py ├── LICENSE.md ├── README.md ├── visualize.py └── agent.py /docs/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /config/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/reports/.gitkeep: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/manuscript/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/__pycache__/mcts.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/__pycache__/mcts.cpython-35.pyc -------------------------------------------------------------------------------- /config/__pycache__/hps.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/config/__pycache__/hps.cpython-35.pyc -------------------------------------------------------------------------------- /src/__pycache__/network.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/__pycache__/network.cpython-35.pyc -------------------------------------------------------------------------------- /CITATION.md: -------------------------------------------------------------------------------- 1 | Please cite this project as follows: 2 | 3 | Thomas Moerland (2020), A0C cursus. url: github.com/tmoer/cursus 4 | -------------------------------------------------------------------------------- /src/common/__pycache__/putils.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/__pycache__/putils.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/__pycache__/submit.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/__pycache__/submit.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 2 14:48:24 2017 4 | 5 | @author: thomas 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /src/rl/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/__pycache__/doom_setup.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/__pycache__/doom_setup.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/__pycache__/make_game.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/__pycache__/make_game.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/__pycache__/policies.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/__pycache__/policies.cpython-35.pyc 
-------------------------------------------------------------------------------- /src/rl/envs/__pycache__/chain.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/envs/__pycache__/chain.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/envs/__pycache__/taxi.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/envs/__pycache__/taxi.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/envs/__pycache__/toy.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/envs/__pycache__/toy.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 2 14:48:24 2017 4 | 5 | @author: thomas 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /src/common/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/__pycache__/atariwrapper.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/__pycache__/atariwrapper.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/__pycache__/hps_setup.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/__pycache__/hps_setup.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/__pycache__/visualize.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/__pycache__/visualize.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/rl/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 2 14:48:24 2017 4 | 5 | @author: thomas 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /src/common/rl/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/rl/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/__pycache__/rewardwrapper.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/__pycache__/rewardwrapper.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/rl/__pycache__/atari_copy.cpython-35.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/rl/__pycache__/atari_copy.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/rl/__pycache__/make_game.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/rl/__pycache__/make_game.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/rl/envs/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 2 14:48:24 2017 4 | 5 | @author: thomas 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /src/common/rl/envs/__pycache__/chain.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/rl/envs/__pycache__/chain.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/rl/wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 2 14:48:24 2017 4 | 5 | @author: thomas 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /src/rl/wrappers/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/wrappers/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/wrappers/__pycache__/control.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/wrappers/__pycache__/control.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/rl/envs/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/rl/envs/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | from .action_space import * 2 | from .control import * 3 | from .custom_game import * 4 | from .observation_space import * -------------------------------------------------------------------------------- /src/rl/wrappers/__pycache__/custom_game.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/wrappers/__pycache__/custom_game.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/rl/wrappers/__pycache__/atari.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/rl/wrappers/__pycache__/atari.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/wrappers/__pycache__/action_space.cpython-35.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/wrappers/__pycache__/action_space.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/rl/wrappers/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/rl/wrappers/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/rl/wrappers/__pycache__/control.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/common/rl/wrappers/__pycache__/control.cpython-35.pyc -------------------------------------------------------------------------------- /src/common/rl/wrappers/doom/__init__.py: -------------------------------------------------------------------------------- 1 | from .action_space import * 2 | from .control import * 3 | from .custom_game import * 4 | from .observation_space import * -------------------------------------------------------------------------------- /src/rl/wrappers/__pycache__/multi_discrete.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/wrappers/__pycache__/multi_discrete.cpython-35.pyc -------------------------------------------------------------------------------- /src/rl/wrappers/__pycache__/observation_space.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmoer/a0c/HEAD/src/rl/wrappers/__pycache__/observation_space.cpython-35.pyc -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # exclude compiled binaries and external models 2 | /bin 3 | 4 | # exclude data from source control by default 5 | /data/temp 6 | /data/processed 7 | /data/raw 8 | -------------------------------------------------------------------------------- /jobs/job_pendulum_final.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=1+5+10+25+50+100,item2=c,seq2=0.005+0.05,item3=lr,seq3=0.001+0.0001+0.00001,n_rep=10,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000000,n_eps=50000000,V_decision=max,clip_gradient_norm=1.0,temp=0.1,entropy_l=0.1 3 | -------------------------------------------------------------------------------- /jobs/expand_job_call4.sh: -------------------------------------------------------------------------------- 1 | python3 expand_jobs_over_games.py --job job_pendulum_final.sh --games Pendulum-v0s --hpsetup item1=n_mcts,seq1=1+5+10+25+50+100,item2=c,seq2=0.005+0.05,item3=lr,seq3=0.001+0.0001+0.00001,n_rep=10 --hp bound=beta,n_t=20000000000,n_eps=50000000,V_decision=max,clip_gradient_norm=1.0,temp=0.1,entropy_l=0.1 --slurm_mode short 2 | -------------------------------------------------------------------------------- /jobs/expand_job_call.sh: -------------------------------------------------------------------------------- 1 | python3 expand_jobs_over_games.py --games Pendulum-v0s --hpsetup 
item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max --item1 lr --seq1 0.1 0.01 0.001 0.0001 --item2 loss_type --seq2 Q count --item3 c --seq3 0.05 0.25 1.0 --slurm_mode short 2 | -------------------------------------------------------------------------------- /jobs/expand_job_call2.sh: -------------------------------------------------------------------------------- 1 | python3 expand_jobs_over_games.py --job job_PM2.sh --games Pendulum-v0s MountainCarContinuous-v0 --hpsetup item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max --item1 lr --seq1 0.01 0.001 0.0001 --item2 loss_type --seq2 count --item3 c --seq3 0.005 0.05 --slurm_mode short 2 | -------------------------------------------------------------------------------- /src/rl/wrappers/control.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | __all__ = ['SetPlayingMode'] 4 | 5 | def SetPlayingMode(target_mode): 6 | """ target mode can be 'algo' or 'human' """ 7 | 8 | class SetPlayingModeWrapper(gym.Wrapper): 9 | """ 10 | Doom wrapper to change playing mode 'human' or 'algo' 11 | """ 12 | def __init__(self, env): 13 | super(SetPlayingModeWrapper, self).__init__(env) 14 | if target_mode not in ['algo', 'human']: 15 | raise gym.error.Error('Error - The mode "{}" is not supported. Supported options are "algo" or "human"'.format(target_mode)) 16 | self.unwrapped._mode = target_mode 17 | 18 | return SetPlayingModeWrapper -------------------------------------------------------------------------------- /src/common/rl/wrappers/doom/control.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | __all__ = ['SetPlayingMode'] 4 | 5 | def SetPlayingMode(target_mode): 6 | """ target mode can be 'algo' or 'human' """ 7 | 8 | class SetPlayingModeWrapper(gym.Wrapper): 9 | """ 10 | Doom wrapper to change playing mode 'human' or 'algo' 11 | """ 12 | def __init__(self, env): 13 | super(SetPlayingModeWrapper, self).__init__(env) 14 | if target_mode not in ['algo', 'human']: 15 | raise gym.error.Error('Error - The mode "{}" is not supported. 
Supported options are "algo" or "human"'.format(target_mode)) 16 | self.unwrapped._mode = target_mode 17 | 18 | return SetPlayingModeWrapper -------------------------------------------------------------------------------- /src/common/rl/atari_copy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Atari helper functions 4 | @author: thomas 5 | """ 6 | 7 | def get_base_env(env): 8 | ''' removes all wrappers ''' 9 | while hasattr(env,'env'): 10 | env = env.env 11 | return env 12 | 13 | def copy_atari_state(env): 14 | env = get_base_env(env) 15 | return env.clone_full_state() 16 | # return env.ale.cloneSystemState() 17 | 18 | def restore_atari_state(env,snapshot): 19 | env = get_base_env(env) 20 | env.restore_full_state(snapshot) 21 | # env.ale.restoreSystemState(snapshot) 22 | 23 | def is_atari_game(env): 24 | ''' Verify whether game uses the Arcade Learning Environment ''' 25 | env = get_base_env(env) 26 | return hasattr(env,'ale') -------------------------------------------------------------------------------- /jobs/expand_job_call3.sh: -------------------------------------------------------------------------------- 1 | python3 expand_jobs_over_games.py --job job_m.sh --games MountainCarContinuous-v0 --hpsetup item1=n_mcts,seq1=5+10+25+40,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0 --item1 lr --seq1 0.001 0.0001 0.00001 --item2 c --seq2 0.02 0.06 --slurm_mode short 2 | 3 | python3 expand_jobs_over_games.py --job job_p.sh --games Pendulum-v0s --hpsetup item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05 --item1 lr --seq1 0.001 0.0001 0.00001 --item2 random_action_frac --seq2 0.0 1.0 --item3 use_prior --seq3 True False --slurm_mode short 4 | -------------------------------------------------------------------------------- /submit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Wrapper for submit function 5 | 6 | @author: thomas 7 | """ 8 | 9 | if __name__ == '__main__' and __package__ is None: 10 | from os import sys, path 11 | sys.path.append(path.dirname(path.dirname(path.abspath(__file__)))) 12 | 13 | import argparse 14 | from src.common.submit import submit 15 | 16 | from config.hps import get_hps,override_hps_settings 17 | from agent import agent 18 | 19 | if __name__ == "__main__": 20 | '''Set-up training''' 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--hp', help='Hyperparameter configuration',default='') 23 | parser.add_argument('--hpsetup', help='Hyperparameter configuration of slurm and hyperparameters and distribution',default='') 24 | parser.add_argument('--no_plot', action='store_true',default=False) 25 | args = parser.parse_args() 26 | submit(args.hp,args.hpsetup,args.no_plot,agent,get_hps,override_hps_settings) -------------------------------------------------------------------------------- /src/common/examples/submit_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Wrapper for submit function 5 | 6 | @author: thomas 7 | """ 8 | 9 | if __name__ == '__main__' and __package__ is None: 10 | from os import sys, path 11 | 
sys.path.append(path.dirname(path.dirname(path.abspath(__file__)))) 12 | 13 | import argparse 14 | from common.submit import submit 15 | from hps import get_hps,override_hps_settings 16 | from agent import agent 17 | 18 | if __name__ == "__main__": 19 | '''Set-up training''' 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--hp', help='Hyperparameter configuration',default='') 22 | parser.add_argument('--hpsetup', help='Hyperparameter configuration of slurm and hyperparameters and distribution',default='') 23 | parser.add_argument('--no_plot', action='store_true',default=False) 24 | args = parser.parse_args() 25 | submit(args.hp,args.hpsetup,args.no_plot,agent,get_hps,override_hps_settings) -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020, Thomas Moerland 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /jobs/job_Ant.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python3 submit.py --hpsetup game=Ant-v2,item1=n_mcts,seq1=50,item2=entropy_l,seq2=0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.01,loss_type=Q,name=lr:0.01-loss_type:Q 3 | python3 submit.py --hpsetup game=Ant-v2,item1=n_mcts,seq1=50,item2=entropy_l,seq2=0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.01,loss_type=count,name=lr:0.01-loss_type:count 4 | python3 submit.py --hpsetup game=Ant-v2,item1=n_mcts,seq1=50,item2=entropy_l,seq2=0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.001,loss_type=Q,name=lr:0.001-loss_type:Q 5 | python3 submit.py --hpsetup game=Ant-v2,item1=n_mcts,seq1=50,item2=entropy_l,seq2=0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.001,loss_type=count,name=lr:0.001-loss_type:count 6 | python3 submit.py --hpsetup game=Ant-v2,item1=n_mcts,seq1=50,item2=entropy_l,seq2=0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.0001,loss_type=Q,name=lr:0.0001-loss_type:Q 7 | python3 submit.py --hpsetup game=Ant-v2,item1=n_mcts,seq1=50,item2=entropy_l,seq2=0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.0001,loss_type=count,name=lr:0.0001-loss_type:count 8 | -------------------------------------------------------------------------------- /src/common/putils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Python utils 5 | @author: thomas 6 | """ 7 | import time 8 | import os 9 | import numpy as np 10 | import random 11 | from shutil import copyfile 12 | 13 | def timing(f): 14 | ''' function decorator ''' 15 | def wrap(*args,**kwargs): 16 | time1 = time.time() 17 | ret = f(*args,**kwargs) 18 | time2 = time.time() 19 | print('{} function took {} sec'.format(f.__name__,time2-time1)) 20 | return ret 21 | return wrap 22 | 23 | def store_safely(folder,name,to_store): 24 | ''' to prevent losing information due to interruption of process''' 25 | new_name = folder+name+'.npy' 26 | old_name = folder+name+'_old.npy' 27 | if os.path.exists(new_name): 28 | copyfile(new_name,old_name) 29 | np.save(new_name,to_store) 30 | if os.path.exists(old_name): 31 | os.remove(old_name) 32 | 33 | def my_argmax(x): 34 | ''' assumes a 1D vector x ''' 35 | x = x.flatten() 36 | if np.any(np.isnan(x)): 37 | print('Warning: Cannot argmax when vector contains nans, results will be wrong') 38 | try: 39 | winners = np.argwhere(x == np.max(x)).flatten() 40 | winner = random.choice(winners) 41 | except: 42 | winner = np.argmax(x) # numerical instability ? 
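# the try/except above guards the random tie-break: if x contains NaN, x == np.max(x)
# selects nothing, random.choice then raises, and we fall back to a plain np.argmax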
43 | return winner -------------------------------------------------------------------------------- /src/rl/doom_setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 11 14:16:19 2018 4 | 5 | @author: thomas 6 | """ 7 | import gym 8 | import ppaquette_gym_doom 9 | from rl.wrappers.action_space import ToDiscrete 10 | from rl.wrappers.observation_space import SetResolution 11 | 12 | def make_doom_env(game): 13 | ''' Makes doom environments based on simpler function ''' 14 | if game == 'Doom-1': 15 | Env = gym.make('ppaquette/DoomBasic-v0') 16 | if game == 'Doom-2': 17 | Env = gym.make('ppaquette/DoomCorridor-v0') 18 | if game == 'Doom-3': 19 | Env = gym.make('ppaquette/DoomDefendCenter-v0') 20 | if game == 'Doom-4': 21 | Env = gym.make('ppaquette/DoomDefendLine-v0') 22 | if game == 'Doom-5': 23 | Env = gym.make('ppaquette/DoomHealthGathering-v0') 24 | if game == 'Doom-6': 25 | Env = gym.make('ppaquette/DoomMyWayHome-v0') 26 | if game == 'Doom-7': 27 | Env = gym.make('ppaquette/PredictPosition-v0') 28 | if game == 'Doom-8': 29 | Env = gym.make('ppaquette/TakeCover-v0') 30 | if game == 'Doom-9': 31 | Env = gym.make('ppaquette/Deathmatch-v0') 32 | if game == 'Doom-10': 33 | Env = gym.make('ppaquette/meta-Doom-v0') 34 | return Env 35 | 36 | def make_doom_env_with_wrappers(game,action_config='minimal',screen_res='160x120'): 37 | ''' 38 | action_config can be 'minimal', 'constant-7', 'constant-17', 'full' 39 | ''' 40 | Env = make_doom_env(game) 41 | if action_config is not None: 42 | action_wrapper = ToDiscrete(config=action_config) 43 | Env = action_wrapper(Env) 44 | if screen_res is not None: 45 | obs_wrapper = SetResolution(screen_res) 46 | Env = obs_wrapper(Env) 47 | return Env -------------------------------------------------------------------------------- /src/common/rl/doom_helpers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 11 14:16:19 2018 4 | 5 | @author: thomas 6 | """ 7 | import gym 8 | import ppaquette_gym_doom 9 | from .wrappers.doom.action_space import ToDiscrete 10 | from .wrappers.doom.observation_space import SetResolution 11 | 12 | def make_doom_env(game): 13 | ''' Makes doom environments based on simpler function ''' 14 | if game == 'Doom-1': 15 | Env = gym.make('ppaquette/DoomBasic-v0') 16 | if game == 'Doom-2': 17 | Env = gym.make('ppaquette/DoomCorridor-v0') 18 | if game == 'Doom-3': 19 | Env = gym.make('ppaquette/DoomDefendCenter-v0') 20 | if game == 'Doom-4': 21 | Env = gym.make('ppaquette/DoomDefendLine-v0') 22 | if game == 'Doom-5': 23 | Env = gym.make('ppaquette/DoomHealthGathering-v0') 24 | if game == 'Doom-6': 25 | Env = gym.make('ppaquette/DoomMyWayHome-v0') 26 | if game == 'Doom-7': 27 | Env = gym.make('ppaquette/PredictPosition-v0') 28 | if game == 'Doom-8': 29 | Env = gym.make('ppaquette/TakeCover-v0') 30 | if game == 'Doom-9': 31 | Env = gym.make('ppaquette/Deathmatch-v0') 32 | if game == 'Doom-10': 33 | Env = gym.make('ppaquette/meta-Doom-v0') 34 | return Env 35 | 36 | def make_doom_env_with_wrappers(game,action_config='minimal',screen_res='160x120'): 37 | ''' 38 | action_config can be 'minimal', 'constant-7', 'constant-17', 'full' 39 | ''' 40 | Env = make_doom_env(game) 41 | if action_config is not None: 42 | action_wrapper = ToDiscrete(config=action_config) 43 | Env = action_wrapper(Env) 44 | if screen_res is not None: 45 | obs_wrapper = SetResolution(screen_res) 46 | 
Env = obs_wrapper(Env) 47 | return Env -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A0C 2 | 3 | This project implements the Alpha Zero algorithm for continuous action spaces (A0C). 4 | The associated paper is available at https://arxiv.org/pdf/1805.09613.pdf 5 | 6 | ## Installation 7 | You can clone this project by running: 8 | `git clone https://github.com/tmoer/cursus.git` 9 | 10 | Requirements are listed in `./requirements.txt` 11 | 12 | 13 | ## Run the project 14 | Run the project with `python3 agent.py` 15 | 16 | Hyperparameter settings can be modified in `./config/hps.py`; an example sweep submission via `submit.py` is given at the end of this README. 17 | 18 | 19 | ## Project organization 20 | 21 | ``` 22 | . 23 | ├── .gitignore 24 | ├── CITATION.md 25 | ├── LICENSE.md 26 | ├── README.md 27 | ├── requirements.txt 28 | ├── bin <- Compiled and external code, ignored by git (PG) 29 | │ └── external <- Any external source code, ignored by git (RO) 30 | ├── config <- Configuration files (HW) 31 | ├── data <- All project data, ignored by git 32 | │ ├── processed <- The final, canonical data sets for modeling. (PG) 33 | │ ├── raw <- The original, immutable data dump. (RO) 34 | │ └── temp <- Intermediate data that has been transformed. (PG) 35 | ├── docs <- Documentation notebook for users (HW) 36 | │ ├── manuscript <- Manuscript source, e.g., LaTeX, Markdown, etc. (HW) 37 | │ └── reports <- Other project reports and notebooks (e.g. Jupyter, .Rmd) (HW) 38 | ├── results 39 | │ ├── figures <- Figures for the manuscript or reports (PG) 40 | │ └── output <- Other output for the manuscript or reports (PG) 41 | └── src <- Source code for this project (HW) 42 | 43 | ``` 44 | 45 | 46 | ## License 47 | 48 | This project is licensed under the terms of the [MIT License](/LICENSE.md). 49 | 50 | ## Citation 51 | 52 | Please [cite this project as described here](/CITATION.md).
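## Example

For illustration, a small sweep over the number of MCTS simulations on Pendulum can be submitted as below (adapted from `jobs/job_pendulum_final.sh`; the keys accepted by `--hpsetup` and `--hp` are defined in `src/common/submit.py`, `src/common/hps_setup.py` and `config/hps.py`, and the values shown are illustrative rather than tuned):

```
python3 submit.py \
    --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,n_rep=3 \
    --hp bound=beta,n_eps=500,lr=0.001,temp=0.1
```

Adding `slurm=True,slurm_qos=short,slurm_time=3:59:59` to `--hpsetup` submits the same sweep to a SLURM cluster, as done in the scripts under `./jobs`. Learning curves can then be plotted with `python3 visualize.py --home`, which reads results from `./results/`.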
53 | -------------------------------------------------------------------------------- /jobs/job_m.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=5+10+25+40,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,lr=0.001,c=0.02,name=lr:0.001-c:0.02 3 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=5+10+25+40,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,lr=0.001,c=0.06,name=lr:0.001-c:0.06 4 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=5+10+25+40,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,lr=0.0001,c=0.02,name=lr:0.0001-c:0.02 5 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=5+10+25+40,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,lr=0.0001,c=0.06,name=lr:0.0001-c:0.06 6 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=5+10+25+40,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,lr=0.00001,c=0.02,name=lr:0.00001-c:0.02 7 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=5+10+25+40,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,lr=0.00001,c=0.06,name=lr:0.00001-c:0.06 8 | -------------------------------------------------------------------------------- /src/rl/wrappers/observation_space.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | try: 4 | from doom_py import ScreenResolution 5 | except ImportError as e: 6 | raise gym.error.DependencyNotInstalled("{}. 
(HINT: you can install Doom dependencies " + 7 | "with 'pip install doom_py.)'".format(e)) 8 | 9 | resolutions = ['160x120', '200x125', '200x150', '256x144', '256x160', '256x192', '320x180', '320x200', 10 | '320x240', '320x256', '400x225', '400x250', '400x300', '512x288', '512x320', '512x384', 11 | '640x360', '640x400', '640x480', '800x450', '800x500', '800x600', '1024x576', '1024x640', 12 | '1024x768', '1280x720', '1280x800', '1280x960', '1280x1024', '1400x787', '1400x875', 13 | '1400x1050', '1600x900', '1600x1000', '1600x1200', '1920x1080'] 14 | 15 | __all__ = [ 'SetResolution' ] 16 | 17 | def SetResolution(target_resolution): 18 | 19 | class SetResolutionWrapper(gym.Wrapper): 20 | """ 21 | Doom wrapper to change screen resolution 22 | """ 23 | def __init__(self, env): 24 | super(SetResolutionWrapper, self).__init__(env) 25 | if target_resolution not in resolutions: 26 | raise gym.error.Error('Error - The specified resolution "{}" is not supported by Vizdoom.'.format(target_resolution)) 27 | parts = target_resolution.lower().split('x') 28 | width = int(parts[0]) 29 | height = int(parts[1]) 30 | screen_res = __import__('doom_py') 31 | screen_res = getattr(screen_res, 'ScreenResolution') 32 | screen_res = getattr(screen_res, 'RES_{}X{}'.format(width, height)) 33 | self.screen_width, self.screen_height, self.unwrapped.screen_resolution = width, height, screen_res 34 | self.unwrapped.observation_space = gym.spaces.Box(low=0, high=255, shape=(self.screen_height, self.screen_width, 3)) 35 | self.observation_space = self.unwrapped.observation_space 36 | 37 | return SetResolutionWrapper 38 | -------------------------------------------------------------------------------- /src/common/rl/wrappers/doom/observation_space.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | try: 4 | from doom_py import ScreenResolution 5 | except ImportError as e: 6 | raise gym.error.DependencyNotInstalled("{}. 
(HINT: you can install Doom dependencies " + 7 | "with 'pip install doom_py.)'".format(e)) 8 | 9 | resolutions = ['160x120', '200x125', '200x150', '256x144', '256x160', '256x192', '320x180', '320x200', 10 | '320x240', '320x256', '400x225', '400x250', '400x300', '512x288', '512x320', '512x384', 11 | '640x360', '640x400', '640x480', '800x450', '800x500', '800x600', '1024x576', '1024x640', 12 | '1024x768', '1280x720', '1280x800', '1280x960', '1280x1024', '1400x787', '1400x875', 13 | '1400x1050', '1600x900', '1600x1000', '1600x1200', '1920x1080'] 14 | 15 | __all__ = [ 'SetResolution' ] 16 | 17 | def SetResolution(target_resolution): 18 | 19 | class SetResolutionWrapper(gym.Wrapper): 20 | """ 21 | Doom wrapper to change screen resolution 22 | """ 23 | def __init__(self, env): 24 | super(SetResolutionWrapper, self).__init__(env) 25 | if target_resolution not in resolutions: 26 | raise gym.error.Error('Error - The specified resolution "{}" is not supported by Vizdoom.'.format(target_resolution)) 27 | parts = target_resolution.lower().split('x') 28 | width = int(parts[0]) 29 | height = int(parts[1]) 30 | screen_res = __import__('doom_py') 31 | screen_res = getattr(screen_res, 'ScreenResolution') 32 | screen_res = getattr(screen_res, 'RES_{}X{}'.format(width, height)) 33 | self.screen_width, self.screen_height, self.unwrapped.screen_resolution = width, height, screen_res 34 | self.unwrapped.observation_space = gym.spaces.Box(low=0, high=255, shape=(self.screen_height, self.screen_width, 3)) 35 | self.observation_space = self.unwrapped.observation_space 36 | 37 | return SetResolutionWrapper 38 | -------------------------------------------------------------------------------- /config/hps.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Default hyperparameter settings 4 | @author: thomas 5 | """ 6 | from common.hps_setup import HParams 7 | 8 | def override_hps_settings(hps): 9 | ''' some more global modifications to multiple settings based on 1 indicator ''' 10 | if hps.mode == 'off': 11 | pass 12 | return hps 13 | 14 | def get_hps(): 15 | ''' Hyperparameter settings ''' 16 | return HParams( 17 | # General 18 | game = 'MountainCarContinuous-v0', # Environment name 19 | name = 'unnamed', # Name of experiment 20 | result_dir = '', 21 | 22 | # Steps & limits 23 | n_t = 2000, # max timesteps 24 | n_eps = 100, # max episodes 25 | steps_per_ep = 300, 26 | 27 | mode = 'off', # overall indicator to jointly change a group of settings. Use with override_hps_settings() 28 | 29 | # MCTS 30 | n_mcts = 10, 31 | c = 1.0, 32 | alpha = 0.5, 33 | C_widening = 1.0, 34 | decision_type = 'count', 35 | backup_Q = 'on-policy', # 'on-policy', 'max' or 'thompson': Type of policy used for value back-up. 
Thopmpson requires additional sampling 36 | sigma_tree = False, # whether to use tree uncertainty 37 | backup_sigma_tree = 'on-policy', # 'uniform', 'on-policy', 'max', 'thompson': policy used for sigma_tree back-up 38 | block_loop = False, # Whether to block loops 39 | 40 | # MCTS + DL 41 | loss_type = 'count', # 'count' or 'Q' 42 | bound = 'beta', # 'tanh' or 'beta' 43 | entropy_l = 0.0, 44 | random_action_frac = 0.0, 45 | temp = 1.0, 46 | n_mix = 1, 47 | use_prior = False, 48 | bootstrap_V = True, 49 | V_decision = 'on_policy', 50 | 51 | # Train 52 | lr = 0.005, 53 | n_epochs = 1, 54 | batch_size = 32, 55 | data_size = 5000, # total database, if distributed summed over the agents 56 | clip_gradient_norm = 0.0, 57 | tfdb = False, 58 | 59 | # Other 60 | timeit = False, 61 | verbose = False, 62 | verbose_mcts = False 63 | ) -------------------------------------------------------------------------------- /src/common/examples/visualize_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Wrapper for visualize.py 5 | 6 | @author: thomas 7 | """ 8 | 9 | if __name__ == '__main__' and __package__ is None: 10 | from os import sys, path 11 | sys.path.append(path.dirname(path.dirname(path.abspath(__file__)))) 12 | 13 | import os 14 | import argparse 15 | from common.visualize import loop_directories 16 | from hps import get_hps 17 | 18 | if __name__ == "__main__": 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--folder', default='/home/thomas/mcts_results') 21 | parser.add_argument('--home', action='store_true',default=False) 22 | parser.add_argument('--game', default=None) 23 | parser.add_argument('--name', default=None) 24 | parser.add_argument('--subindex', default=None) 25 | parser.add_argument('--plot_type', default='lc') 26 | parser.add_argument('--window', type=int,default=25) 27 | parser.add_argument('--sd', action='store_true',default=False) 28 | parser.add_argument('--on_mean', action='store_true',default=False) 29 | parser.add_argument('--item', default='return',help='This item in result will be plotted') 30 | parser.add_argument('--remove', action='store_true',default=False) 31 | parser.add_argument('--plotlen', type=int,default=25) 32 | parser.add_argument('--xlim', nargs='+',type=float,default=None) 33 | parser.add_argument('--ylim', nargs='+',type=float,default=None) 34 | parser.add_argument('--errlim', nargs='+',type=float,default=None,help='Limits on the errorbars') 35 | parser.add_argument('--item1_label', nargs='+', default=None) 36 | parser.add_argument('--item2_label', nargs='+', default=None) 37 | parser.add_argument('--item3_label', nargs='+', default=None) 38 | 39 | args = parser.parse_args() 40 | 41 | if args.home: 42 | result_folder = os.getcwd() + '/results/' 43 | else: 44 | result_folder = args.folder + '/' 45 | print('Start processing folder {}'.format(result_folder)) 46 | overview_dir= result_folder+'learning_curves/' 47 | if not os.path.exists(overview_dir): 48 | os.makedirs(overview_dir) 49 | 50 | loop_directories(result_folder=result_folder,overview_dir=overview_dir,game=args.game,name=args.name, 51 | subindex=args.subindex,plot_type=args.plot_type,window=args.window,sd=args.sd,on_mean=args.on_mean, 52 | item=args.item,remove=args.remove,plotlen=args.plotlen,xlim=args.xlim,ylim=args.ylim,errlim=args.errlim, 53 | get_hps=get_hps) 54 | 55 | -------------------------------------------------------------------------------- /src/rl/envs/chicken.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Wet-Chicken benchmark 4 | @author: thomas 5 | """ 6 | 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | class chicken_env(object): 11 | ''' Wet Chicken Benchmark ''' 12 | 13 | def __init__(self,to_plot = False): 14 | self.state = np.array([0,0]) 15 | self.observation_shape = np.shape(self.get_state())[0] 16 | 17 | if to_plot: 18 | plt.ion() 19 | fig = plt.figure() 20 | ax1 = fig.add_subplot(111,aspect='equal') 21 | #ax1.axis('off') 22 | plt.xlim([-0.5,5.5]) 23 | plt.ylim([-0.5,5.5]) 24 | 25 | self.g1 = ax1.add_artist(plt.Circle((self.state[0],self.state[1]),0.1,color='red')) 26 | self.fig = fig 27 | self.ax1 = ax1 28 | self.fig.canvas.draw() 29 | self.fig.canvas.flush_events() 30 | 31 | def reset(self): 32 | self.state = np.array([0,0]) 33 | return self.get_state() 34 | 35 | def get_state(self): 36 | return self.state/5 37 | 38 | def set_state(self,state): 39 | self.state = state 40 | 41 | def step(self,a): 42 | x = self.state[0] 43 | y = self.state[1] 44 | ax = a[0] 45 | ay = a[1] 46 | tau = np.random.uniform(-1,1) 47 | w=5.0 48 | l=5.0 49 | 50 | v = 3 * x * (1/w) 51 | s = 3.5 - v 52 | yhat = y + ay - 1 + v + s*tau 53 | 54 | # change x 55 | if x + ax < 0: 56 | x = 0 57 | elif yhat > l: 58 | x = 0 59 | elif x + ax > w: 60 | x = w 61 | else: 62 | x = x + ax 63 | 64 | # change y 65 | if yhat < 0: 66 | y = 0 67 | elif yhat > l: 68 | y = 0 69 | else: 70 | y = yhat 71 | 72 | self.state = np.array([x,y]).flatten() 73 | 74 | r = - (l - y) 75 | 76 | return self.state,r,yhat>l 77 | 78 | def plot(self): 79 | self.g1.remove() 80 | self.g1 = self.ax1.add_artist(plt.Circle((self.state[0],self.state[1]),0.1,color='red')) 81 | self.fig.canvas.draw() 82 | 83 | # Test 84 | if __name__ == '__main__': 85 | Env = chicken_env(True) 86 | s = Env.get_state() 87 | for i in range(500): 88 | a = np.random.uniform(-1,1,2) 89 | s,r,dead = Env.step(a) 90 | if not dead: 91 | Env.plot() 92 | else: 93 | print('Died in step',i,', restarting') 94 | s = Env.reset() 95 | print(Env.get_state()) 96 | print('Finished') 97 | -------------------------------------------------------------------------------- /src/common/rl/envs/chicken.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Wet-Chicken benchmark 4 | @author: thomas 5 | """ 6 | 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | class chicken_env(object): 11 | ''' Wet Chicken Benchmark ''' 12 | 13 | def __init__(self,to_plot = False): 14 | self.state = np.array([0,0]) 15 | self.observation_shape = np.shape(self.get_state())[0] 16 | 17 | if to_plot: 18 | plt.ion() 19 | fig = plt.figure() 20 | ax1 = fig.add_subplot(111,aspect='equal') 21 | #ax1.axis('off') 22 | plt.xlim([-0.5,5.5]) 23 | plt.ylim([-0.5,5.5]) 24 | 25 | self.g1 = ax1.add_artist(plt.Circle((self.state[0],self.state[1]),0.1,color='red')) 26 | self.fig = fig 27 | self.ax1 = ax1 28 | self.fig.canvas.draw() 29 | self.fig.canvas.flush_events() 30 | 31 | def reset(self): 32 | self.state = np.array([0,0]) 33 | return self.get_state() 34 | 35 | def get_state(self): 36 | return self.state/5 37 | 38 | def set_state(self,state): 39 | self.state = state 40 | 41 | def step(self,a): 42 | x = self.state[0] 43 | y = self.state[1] 44 | ax = a[0] 45 | ay = a[1] 46 | tau = np.random.uniform(-1,1) 47 | w=5.0 48 | l=5.0 49 | 50 | v = 3 * x * (1/w) 51 | s = 3.5 - v 52 | yhat = y + ay - 1 + v + s*tau 53 | 54 
| # change x 55 | if x + ax < 0: 56 | x = 0 57 | elif yhat > l: 58 | x = 0 59 | elif x + ax > w: 60 | x = w 61 | else: 62 | x = x + ax 63 | 64 | # change y 65 | if yhat < 0: 66 | y = 0 67 | elif yhat > l: 68 | y = 0 69 | else: 70 | y = yhat 71 | 72 | self.state = np.array([x,y]).flatten() 73 | 74 | r = - (l - y) 75 | 76 | return self.state,r,yhat>l 77 | 78 | def plot(self): 79 | self.g1.remove() 80 | self.g1 = self.ax1.add_artist(plt.Circle((self.state[0],self.state[1]),0.1,color='red')) 81 | self.fig.canvas.draw() 82 | 83 | # Test 84 | if __name__ == '__main__': 85 | Env = chicken_env(True) 86 | s = Env.get_state() 87 | for i in range(500): 88 | a = np.random.uniform(-1,1,2) 89 | s,r,dead = Env.step(a) 90 | if not dead: 91 | Env.plot() 92 | else: 93 | print('Died in step',i,', restarting') 94 | s = Env.reset() 95 | print(Env.get_state()) 96 | print('Finished') 97 | -------------------------------------------------------------------------------- /src/rl/make_game.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Wrapper to generate the game environments 4 | @author: thomas 5 | """ 6 | import gym 7 | import numpy as np 8 | from rl.rewardwrapper import RewardWrapper,PILCOWrapper,NormalizeWrapper 9 | from rl.atariwrapper import AtariWrapper,ClipRewardWrapper 10 | from rl.envs.chain import Chain, ChainOrdered 11 | #from rl.doom_setup import make_doom_env_with_wrappers 12 | from gym import spaces 13 | import os 14 | #import gym_ple 15 | 16 | from gym.envs.registration import register 17 | register( 18 | id='FrozenLakeNotSlippery-v0', 19 | entry_point='gym.envs.toy_text:FrozenLakeEnv', 20 | kwargs={'map_name' : '4x4', 'is_slippery': False}, 21 | max_episode_steps=100, 22 | reward_threshold=0.78, # optimum = .8196 23 | ) 24 | register( 25 | id='FrozenLakeNotSlippery-v1', 26 | entry_point='gym.envs.toy_text:FrozenLakeEnv', 27 | kwargs={'map_name' : '8x8', 'is_slippery': False}, 28 | max_episode_steps=100, 29 | reward_threshold=0.78, # optimum = .8196 30 | ) 31 | 32 | def make_game(game): 33 | os.system('export LD_LIBRARY_PATH=`$LD_LIBRARY_PATH:$HOME/.mujoco/mjpro150/bin`') 34 | 35 | if 'Chain' in game: 36 | game,n = game.split('-') 37 | if game == 'Chain': 38 | Env = Chain(int(n)) 39 | elif game == 'ChainOrdered': 40 | Env = ChainOrdered(int(n)) 41 | elif game == 'CartPole-vr' or game == 'MountainCar-vr' or game == 'Acrobot-vr' or game == 'LunarLander-vr': 42 | Env = RewardWrapper(game) 43 | elif game == 'CartPole-vp' or game == 'MountainCar-vp' or game == 'Acrobot-vp': 44 | Env = PILCOWrapper(game) 45 | elif game == 'CartPole-vn' or game == 'MountainCar-vn': 46 | Env = NormalizeWrapper(game) 47 | else: 48 | Env = gym.make(game) 49 | if type(Env) == gym.wrappers.time_limit.TimeLimit: 50 | Env = Env.env 51 | if game in ['Breakout-v0','Pong-v0','MontezumaRevenge-v0']: 52 | Env = AtariWrapper(Env,skip=3,k=3,ram=False) 53 | Env = ClipRewardWrapper(Env) 54 | elif 'ram' in game: 55 | Env = AtariWrapper(Env,skip=3,k=2,ram=True) 56 | Env = ClipRewardWrapper(Env) 57 | if 'CartPole' in game: 58 | Env.observation_space = gym.spaces.Box(np.array([-4.8,-10,-4.8,-10]),np.array([4.8,10,4.8,10])) 59 | return Env 60 | 61 | def check_space(space): 62 | '''check the properties of the env ''' 63 | if isinstance(space,spaces.Box): 64 | dim = space.shape # should the zero be here? 
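# note: for a gym Box space, space.shape is a tuple such as (3,); indexing with [0] would return
# only the first dimension, so which form is correct depends on what the caller expects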
65 | discrete = False 66 | elif isinstance(space,spaces.Discrete): 67 | dim = space.n 68 | discrete = True 69 | else: 70 | raise NotImplementedError 71 | return dim, discrete -------------------------------------------------------------------------------- /src/rl/atariwrapper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Atari wrapper, based on https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py 4 | @author: thomas 5 | """ 6 | import gym 7 | from gym import spaces 8 | from collections import deque 9 | import numpy as np 10 | from PIL import Image 11 | 12 | class ClipRewardWrapper(gym.RewardWrapper): 13 | def reward(self, reward): 14 | """Bin reward to {+1, 0, -1} by its sign.""" 15 | return 0.5 * np.sign(reward) 16 | 17 | class AtariWrapper(gym.Wrapper): 18 | ''' Chain domain ''' 19 | 20 | def __init__(self, env, skip=4, k=4,ram=False): 21 | """Return only every `skip`-th frame""" 22 | gym.Wrapper.__init__(self, env) 23 | # Frame skip and pooling 24 | self._obs_buffer = deque(maxlen=skip) 25 | self._skip = skip 26 | self._ram = ram 27 | 28 | # Frame stacking 29 | self.k = k 30 | self.frames = deque([], maxlen=k) 31 | 32 | # Frame wrapping 33 | if not self._ram: 34 | self.res = 84 35 | self.observation_space = spaces.Box(low=0, high=1, shape=(self.res,self.res, k)) 36 | else: 37 | self.res = env.observation_space.shape[0] 38 | self.observation_space = spaces.Box(low=0, high=1, shape=(self.res, k)) 39 | 40 | def _observation(self): 41 | assert len(self.frames) == self.k 42 | return np.concatenate(self.frames, axis=-1) 43 | 44 | def _resize(self, obs): 45 | if not self._ram: 46 | frame = np.dot(obs.astype('float32'), np.array([0.299, 0.587, 0.114], 'float32')) 47 | frame = np.array(Image.fromarray(frame).resize((self.res, self.res), 48 | resample=Image.BILINEAR), dtype=np.float32)/255.0 49 | return frame.reshape((self.res, self.res, 1)) 50 | else: 51 | obs = obs/255 52 | return obs.astype('float32').reshape((self.res,1)) 53 | 54 | def _reset(self): 55 | """Clear buffers and re-fill by duplicating the first observation.""" 56 | ob = self.env.reset() 57 | ob = self._resize(ob) 58 | for _ in range(self.k): self.frames.append(ob) 59 | self._obs_buffer.clear() 60 | for _ in range(self._skip): self._obs_buffer.append(ob) 61 | return self._observation() 62 | 63 | def _step(self, action): 64 | """Repeat action, sum reward, and max over last observations.""" 65 | total_reward = 0.0 66 | done = None 67 | for _ in range(self._skip): 68 | obs, reward, done, info = self.env.step(action) 69 | obs = self._resize(obs) 70 | self._obs_buffer.append(obs) 71 | total_reward += reward 72 | if done: 73 | break 74 | if not self._ram: 75 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) # max over skips 76 | else: 77 | max_frame = obs # just take the last, max has no interpretation 78 | self.frames.append(max_frame) # append to buffer 79 | return self._observation(), total_reward, done, info 80 | -------------------------------------------------------------------------------- /visualize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Wrapper for visualize.py 5 | 6 | @author: thomas 7 | """ 8 | 9 | if __name__ == '__main__' and __package__ is None: 10 | from os import sys, path 11 | sys.path.append(path.dirname(path.dirname(path.abspath(__file__)))) 12 | 13 | import os 14 | import argparse 15 | from 
common.visualize import loop_directories 16 | from hps import get_hps 17 | 18 | 19 | if __name__ == "__main__": 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--folder', default='/home/thomas/a0c_results') 22 | parser.add_argument('--home', action='store_true',default=False) 23 | parser.add_argument('--game', default=None) 24 | parser.add_argument('--name', default=None) 25 | parser.add_argument('--subindex', default=None) 26 | parser.add_argument('--plot_type', default='lc') 27 | parser.add_argument('--window', type=int,default=25) 28 | parser.add_argument('--sd', action='store_true',default=False) 29 | parser.add_argument('--on_mean', action='store_true',default=False) 30 | parser.add_argument('--item', default='return',help='This item in result will be plotted') 31 | parser.add_argument('--remove', action='store_true',default=False) 32 | parser.add_argument('--plotlen', type=int,default=25) 33 | parser.add_argument('--xlim', nargs='+',type=float,default=None) 34 | parser.add_argument('--ylim', nargs='+',type=float,default=None) 35 | parser.add_argument('--errlim', nargs='+',type=float,default=None,help='Limits on the errorbars') 36 | parser.add_argument('--item1_label', nargs='+', default=None) 37 | parser.add_argument('--item2_label', nargs='+', default=None) 38 | parser.add_argument('--item3_label', nargs='+', default=None) 39 | parser.add_argument('--no_suptitle', action='store_true',default=False) 40 | parser.add_argument('--x_item', default='steps') # steps or eps 41 | 42 | parser.add_argument('--line_item', default='item1') # 43 | parser.add_argument('--col_item', default='item2') # 44 | parser.add_argument('--row_item', default='item3') # 45 | 46 | args = parser.parse_args() 47 | 48 | if args.item1_label is not None: args.item1_label = ' '.join(args.item1_label) 49 | if args.item2_label is not None: args.item2_label = ' '.join(args.item2_label) 50 | if args.item3_label is not None: args.item3_label = ' '.join(args.item3_label) 51 | 52 | if args.home: 53 | result_folder = os.getcwd() + '/results/' 54 | else: 55 | result_folder = args.folder + '/' 56 | print('Start processing folder {}'.format(result_folder)) 57 | overview_dir= result_folder+'learning_curves/' 58 | if not os.path.exists(overview_dir): 59 | os.makedirs(overview_dir) 60 | 61 | loop_directories(result_folder=result_folder,overview_dir=overview_dir,game=args.game,name=args.name, 62 | subindex=args.subindex,plot_type=args.plot_type,window=args.window,sd=args.sd,on_mean=args.on_mean, 63 | item=args.item,remove=args.remove,plotlen=args.plotlen,xlim=args.xlim,ylim=args.ylim,errlim=args.errlim, 64 | get_hps=get_hps,no_suptitle=args.no_suptitle,x_item=args.x_item,line_item=args.line_item,col_item=args.col_item, 65 | row_item=args.row_item,item1_label=args.item1_label,item2_label=args.item2_label,item3_label=args.item3_label) 66 | 67 | -------------------------------------------------------------------------------- /jobs/expand_jobs_over_games.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Expand a submission over games 4 | @author: thomas 5 | """ 6 | 7 | if __name__ == '__main__' and __package__ is None: 8 | from os import sys, path 9 | sys.path.append(path.dirname(path.dirname(path.dirname(path.abspath(__file__))))) 10 | 11 | import os 12 | import argparse 13 | from common.visualize import make_name 14 | 15 | def expand_job(games,job,hp,hp_setup,item1=None,seq1=[None],item2=None,seq2=[None],item3=None,seq3=[None]): 16 | # hacky 
way to bring in games 17 | #games = ['CartPole-vr','MountainCar-vr','Acrobot-vr','FrozenLake-v0','FrozenLakeNotSlippery-v0','FrozenLakeNotSlippery-v1'] 18 | #games = ['Breakout-ramDeterministic-v0','Pong-ramDeterministic-v0','AirRaid-ramDeterministic-v0','Amidar-ramDeterministic-v0', 19 | # 'Enduro-ramDeterministic-v0','MontezumaRevenge-ramDeterministic-v0','Venture-ramDeterministic-v0'] 20 | # Regarding Atari: 21 | # Assault, Freeway, Seaquest have different initial states 22 | 23 | file = os.getcwd() + '/' + job 24 | with open(file,'w') as fp: 25 | fp.write('#!/bin/sh\n') 26 | for i,game in enumerate(games): 27 | for j,it1 in enumerate(seq1): 28 | for k,it2 in enumerate(seq2): 29 | for l,it3 in enumerate(seq3): 30 | fp.write('python3 submit.py --hpsetup game={},{} --hp {}'.format(game,hp_setup,hp)) 31 | if item1 is not None: 32 | fp.write(',{}={}'.format(item1,it1)) 33 | if item2 is not None: 34 | fp.write(',{}={}'.format(item2,it2)) 35 | if item3 is not None: 36 | fp.write(',{}={}'.format(item3,it3)) 37 | hyperloop_name = make_name('',item1,it1,item2,it2,item3,it3) 38 | if hyperloop_name != '': 39 | fp.write(',name={}'.format(hyperloop_name)) 40 | fp.write('\n') 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument('--games', nargs='+',type=str,default=[]) 45 | parser.add_argument('--job', default='job.sh') 46 | parser.add_argument('--slurm_mode', default='off') 47 | parser.add_argument('--hp', help='Hyperparameter configuration',default='') 48 | parser.add_argument('--hpsetup', help='Hyperparameter configuration of slurm and hyperparameters and distribution',default='') 49 | # extra items 50 | parser.add_argument('--item1',type=str,default=None) 51 | parser.add_argument('--seq1', nargs='+',type=str,default=[None]) 52 | parser.add_argument('--item2',type=str,default=None) 53 | parser.add_argument('--seq2', nargs='+',type=str,default=[None]) 54 | parser.add_argument('--item3',type=str,default=None) 55 | parser.add_argument('--seq3', nargs='+',type=str,default=[None]) 56 | 57 | args = parser.parse_args() 58 | 59 | if args.slurm_mode == 'short': 60 | args.hpsetup += ',slurm=True,slurm_qos=short,slurm_time=3:59:59' 61 | elif args.slurm_mode == 'long': 62 | args.hpsetup += ',slurm=True,slurm_qos=long,slurm_time=5-0:00:00' 63 | 64 | expand_job(games=args.games,job=args.job,hp=args.hp,hp_setup=args.hpsetup, 65 | item1=args.item1,seq1=args.seq1,item2=args.item2,seq2=args.seq2, 66 | item3=args.item3,seq3=args.seq3) -------------------------------------------------------------------------------- /src/rl/envs/taxi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Taxi Env 4 | @author: thomas 5 | """ 6 | 7 | import numpy 8 | import random 9 | import gym 10 | 11 | class Taxi(): 12 | ''' ''' 13 | 14 | def __init__(self): 15 | self.size = numpy.array([4,4]) 16 | self.landmarks = numpy.array([[0.0, 0.0], [0.0, 4.0], [3.0, 0.0], [4.0, 4.0]]) 17 | self.walls = numpy.array([[1.0, 2.0], [2.0, -2.0], [3.0, 2.0]]) 18 | self.fuel = 0 19 | self.fuel_loc = numpy.array([2.0, 1.0]) 20 | self.pass_loc = 0 # Passenger location: -1 for in taxi, >= 0 for a landmark 21 | self.pass_dest = 0 # Passenger destination: >=0 for a landmark 22 | self.pos = numpy.zeros((2,)) 23 | self.observation_space = gym.spaces.Box(0,12,(5,)) 24 | self.action_space = gym.spaces.Discrete(6) 25 | 26 | def reset(self): 27 | self.pos = numpy.random.randint(0,5,(2,)) 28 | self.fuel = numpy.random.random()*7 + 5.0 29 | 
self.lm_list = [i for i in range(len(self.landmarks))] 30 | random.shuffle(self.lm_list) 31 | self.pass_loc = self.lm_list.pop() 32 | self.pass_dest = random.choice(self.lm_list) 33 | return self.get_state() 34 | 35 | def get_state(self): 36 | return numpy.hstack([self.pos,self.fuel,self.pass_loc,self.pass_dest]) 37 | 38 | def step(self,action): 39 | # move taxi 40 | reward = self.takeAction(action) 41 | terminal = 1 if self.isAtGoal() or (self.fuel_loc is not None and self.fuel) < 0 else 0 42 | return self.get_state(),reward,terminal,{} 43 | 44 | def takeAction(self, intAction): 45 | reward = -1.0 46 | self.fuel -= 1 47 | prev_pos = self.pos.copy() 48 | sign = 0 49 | if intAction == 0: 50 | self.pos[0] += 1.0 51 | sign = 1 52 | elif intAction == 1: 53 | self.pos[0] -= 1.0 54 | sign = -1 55 | elif intAction == 2: 56 | self.pos[1] += 1.0 57 | elif intAction == 3: 58 | self.pos[1] -= 1.0 59 | elif intAction == 4: # Pickup 60 | if self.pass_loc >= 0 and self.atPoint(self.landmarks[self.pass_loc]): 61 | self.pass_loc = -1 62 | else: 63 | reward = -10.0 64 | elif intAction == 5: # Drop off 65 | if self.pass_loc == -1 and self.atPoint(self.landmarks[self.pass_dest]): 66 | self.pass_loc = self.pass_dest 67 | reward = 20.0 68 | else: 69 | reward = -10.0 70 | elif self.fuel_loc is not None and intAction == 4: # Refuel 71 | if self.atPoint(self.fuel_loc): 72 | self.fuel = 12.0 73 | 74 | self.pos = self.pos.clip([0, 0], self.size) 75 | 76 | if sign != 0 and self.hitsWall(prev_pos, self.pos, sign): 77 | self.pos[0] = prev_pos[0] # Only revert the x-coord, to allow noise and such in y 78 | 79 | return reward 80 | 81 | # helpers 82 | def atPoint(self, point): 83 | return numpy.linalg.norm(self.pos - point) < 0.1 84 | 85 | def isAtGoal(self): 86 | return self.pass_loc == self.pass_dest 87 | 88 | def hitsWall(self, old_pos, new_pos, sign): 89 | return (((self.walls[:,0]*sign >= old_pos[0]*sign) & (self.walls[:,0]*sign < new_pos[0]*sign)) \ 90 | & ((self.walls[:,1] > old_pos[1]) | ((self.size[1]-1)+self.walls[:,1] < old_pos[1]))).any() 91 | 92 | -------------------------------------------------------------------------------- /src/common/rl/envs/taxi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Taxi Env 4 | @author: thomas 5 | """ 6 | 7 | import numpy 8 | import random 9 | import gym 10 | 11 | class Taxi(): 12 | ''' ''' 13 | 14 | def __init__(self): 15 | self.size = numpy.array([4,4]) 16 | self.landmarks = numpy.array([[0.0, 0.0], [0.0, 4.0], [3.0, 0.0], [4.0, 4.0]]) 17 | self.walls = numpy.array([[1.0, 2.0], [2.0, -2.0], [3.0, 2.0]]) 18 | self.fuel = 0 19 | self.fuel_loc = numpy.array([2.0, 1.0]) 20 | self.pass_loc = 0 # Passenger location: -1 for in taxi, >= 0 for a landmark 21 | self.pass_dest = 0 # Passenger destination: >=0 for a landmark 22 | self.pos = numpy.zeros((2,)) 23 | self.observation_space = gym.spaces.Box(0,12,(5,)) 24 | self.action_space = gym.spaces.Discrete(6) 25 | 26 | def reset(self): 27 | self.pos = numpy.random.randint(0,5,(2,)) 28 | self.fuel = numpy.random.random()*7 + 5.0 29 | self.lm_list = [i for i in range(len(self.landmarks))] 30 | random.shuffle(self.lm_list) 31 | self.pass_loc = self.lm_list.pop() 32 | self.pass_dest = random.choice(self.lm_list) 33 | return self.get_state() 34 | 35 | def get_state(self): 36 | return numpy.hstack([self.pos,self.fuel,self.pass_loc,self.pass_dest]) 37 | 38 | def step(self,action): 39 | # move taxi 40 | reward = self.takeAction(action) 41 | terminal = 1 if 
self.isAtGoal() or (self.fuel_loc is not None and self.fuel) < 0 else 0 42 | return self.get_state(),reward,terminal,{} 43 | 44 | def takeAction(self, intAction): 45 | reward = -1.0 46 | self.fuel -= 1 47 | prev_pos = self.pos.copy() 48 | sign = 0 49 | if intAction == 0: 50 | self.pos[0] += 1.0 51 | sign = 1 52 | elif intAction == 1: 53 | self.pos[0] -= 1.0 54 | sign = -1 55 | elif intAction == 2: 56 | self.pos[1] += 1.0 57 | elif intAction == 3: 58 | self.pos[1] -= 1.0 59 | elif intAction == 4: # Pickup 60 | if self.pass_loc >= 0 and self.atPoint(self.landmarks[self.pass_loc]): 61 | self.pass_loc = -1 62 | else: 63 | reward = -10.0 64 | elif intAction == 5: # Drop off 65 | if self.pass_loc == -1 and self.atPoint(self.landmarks[self.pass_dest]): 66 | self.pass_loc = self.pass_dest 67 | reward = 20.0 68 | else: 69 | reward = -10.0 70 | elif self.fuel_loc is not None and intAction == 4: # Refuel 71 | if self.atPoint(self.fuel_loc): 72 | self.fuel = 12.0 73 | 74 | self.pos = self.pos.clip([0, 0], self.size) 75 | 76 | if sign != 0 and self.hitsWall(prev_pos, self.pos, sign): 77 | self.pos[0] = prev_pos[0] # Only revert the x-coord, to allow noise and such in y 78 | 79 | return reward 80 | 81 | # helpers 82 | def atPoint(self, point): 83 | return numpy.linalg.norm(self.pos - point) < 0.1 84 | 85 | def isAtGoal(self): 86 | return self.pass_loc == self.pass_dest 87 | 88 | def hitsWall(self, old_pos, new_pos, sign): 89 | return (((self.walls[:,0]*sign >= old_pos[0]*sign) & (self.walls[:,0]*sign < new_pos[0]*sign)) \ 90 | & ((self.walls[:,1] > old_pos[1]) | ((self.size[1]-1)+self.walls[:,1] < old_pos[1]))).any() 91 | 92 | -------------------------------------------------------------------------------- /jobs/job_PM.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.01,loss_type=Q,name=lr:0.01-loss_type:Q 3 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.01,loss_type=count,name=lr:0.01-loss_type:count 4 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.001,loss_type=Q,name=lr:0.001-loss_type:Q 5 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.001,loss_type=count,name=lr:0.001-loss_type:count 6 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.0001,loss_type=Q,name=lr:0.0001-loss_type:Q 7 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp 
c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.0001,loss_type=count,name=lr:0.0001-loss_type:count 8 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.01,loss_type=Q,name=lr:0.01-loss_type:Q 9 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.01,loss_type=count,name=lr:0.01-loss_type:count 10 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.001,loss_type=Q,name=lr:0.001-loss_type:Q 11 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.001,loss_type=count,name=lr:0.001-loss_type:count 12 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.0001,loss_type=Q,name=lr:0.0001-loss_type:Q 13 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=50+200,item2=entropy_l,seq2=0.0+0.01+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp c=0.1,bound=beta,n_t=10000000,n_ep=20000,lr=0.0001,loss_type=count,name=lr:0.0001-loss_type:count 14 | -------------------------------------------------------------------------------- /src/rl/wrappers/custom_game.py: -------------------------------------------------------------------------------- 1 | import os 2 | import types 3 | import gym 4 | 5 | try: 6 | import doom_py 7 | from doom_py import DoomGame, Mode, Button, GameVariable, ScreenFormat, ScreenResolution, Loader 8 | except ImportError as e: 9 | raise gym.error.DependencyNotInstalled("{}. 
(HINT: you can install Doom dependencies " + 10 | "with 'pip install doom_py.)'".format(e)) 11 | 12 | __all__ = [ 'CustomGame' ] 13 | 14 | def CustomGame(): 15 | 16 | def _customize_game(self): 17 | vizdoom_path = self.loader.get_vizdoom_path() # paths are based on installation path of doom_py 18 | freedoom_path = self.loader.get_freedoom_path() 19 | doom_dir = self.doom_dir # dirname of directory containing 'doom_env' 20 | 21 | # Settings 22 | config = os.path.join(doom_dir, 'assets', 'deadly_corridor.cfg') 23 | scenario = self.loader.get_scenario_path('deadly_corridor.wad') 24 | map = '' 25 | difficulty = 1 26 | 27 | # Customizing - self.game refers to a new DoomGame() 28 | self.game.set_vizdoom_path(vizdoom_path) 29 | self.game.set_doom_game_path(freedoom_path) 30 | self.game.load_config(config) 31 | self.game.set_doom_scenario_path(scenario) 32 | if map != '': 33 | self.game.set_doom_map(map) 34 | self.game.set_doom_skill(difficulty) 35 | self.game.set_screen_resolution(self.screen_resolution) 36 | 37 | class CustomGameWrapper(gym.Wrapper): 38 | """ 39 | Doom wrapper to load a custom map 40 | This wrapper modifies directly the unwrapped env, and is not expected to be stacked 41 | """ 42 | def __init__(self, env): 43 | super(CustomGameWrapper, self).__init__(env) 44 | self.unwrapped.action_space = gym.spaces.MultiDiscrete([[0, 1]] * 38 + [[-10, 10]] * 2 + [[-100, 100]] * 3) # Default 43 button action space 45 | self.unwrapped.screen_height = 480 46 | self.unwrapped.screen_width = 640 47 | self.unwrapped.screen_resolution = ScreenResolution.RES_640X480 48 | self.unwrapped.observation_space = gym.spaces.Box(low=0, high=255, shape=(self.unwrapped.screen_height, self.unwrapped.screen_width, 3)) 49 | self.observation_space = self.unwrapped.observation_space 50 | self.unwrapped.allowed_actions = [0, 10, 11, 13, 14, 15] # Must match exactly and in order the actions in the config file 51 | # (The number is the action number based on controls.md) 52 | # This will only enable these actions out of the 43 available buttons 53 | 54 | # Converting to Discrete action space 55 | discrete_actions = self.unwrapped.allowed_actions 56 | self.action_space = gym.spaces.DiscreteToMultiDiscrete(self.unwrapped.action_space, discrete_actions) 57 | 58 | # Alternative to convert to continuous action space 59 | # box_actions = self.unwrapped.allowed_actions 60 | # self.action_space = gym.spaces.BoxToMultiDiscrete(self.unwrapped.action_space, box_actions) 61 | 62 | # Bouding method to env 63 | self.unwrapped._customize_game = types.MethodType(_customize_game, self.unwrapped) 64 | 65 | def _step(self, action): 66 | return self.unwrapped._step(self.action_space(action)) 67 | 68 | return CustomGameWrapper 69 | -------------------------------------------------------------------------------- /src/common/rl/wrappers/doom/custom_game.py: -------------------------------------------------------------------------------- 1 | import os 2 | import types 3 | import gym 4 | 5 | try: 6 | import doom_py 7 | from doom_py import DoomGame, Mode, Button, GameVariable, ScreenFormat, ScreenResolution, Loader 8 | except ImportError as e: 9 | raise gym.error.DependencyNotInstalled("{}. 
(HINT: you can install Doom dependencies " + 10 | "with 'pip install doom_py.)'".format(e)) 11 | 12 | __all__ = [ 'CustomGame' ] 13 | 14 | def CustomGame(): 15 | 16 | def _customize_game(self): 17 | vizdoom_path = self.loader.get_vizdoom_path() # paths are based on installation path of doom_py 18 | freedoom_path = self.loader.get_freedoom_path() 19 | doom_dir = self.doom_dir # dirname of directory containing 'doom_env' 20 | 21 | # Settings 22 | config = os.path.join(doom_dir, 'assets', 'deadly_corridor.cfg') 23 | scenario = self.loader.get_scenario_path('deadly_corridor.wad') 24 | map = '' 25 | difficulty = 1 26 | 27 | # Customizing - self.game refers to a new DoomGame() 28 | self.game.set_vizdoom_path(vizdoom_path) 29 | self.game.set_doom_game_path(freedoom_path) 30 | self.game.load_config(config) 31 | self.game.set_doom_scenario_path(scenario) 32 | if map != '': 33 | self.game.set_doom_map(map) 34 | self.game.set_doom_skill(difficulty) 35 | self.game.set_screen_resolution(self.screen_resolution) 36 | 37 | class CustomGameWrapper(gym.Wrapper): 38 | """ 39 | Doom wrapper to load a custom map 40 | This wrapper modifies directly the unwrapped env, and is not expected to be stacked 41 | """ 42 | def __init__(self, env): 43 | super(CustomGameWrapper, self).__init__(env) 44 | self.unwrapped.action_space = gym.spaces.MultiDiscrete([[0, 1]] * 38 + [[-10, 10]] * 2 + [[-100, 100]] * 3) # Default 43 button action space 45 | self.unwrapped.screen_height = 480 46 | self.unwrapped.screen_width = 640 47 | self.unwrapped.screen_resolution = ScreenResolution.RES_640X480 48 | self.unwrapped.observation_space = gym.spaces.Box(low=0, high=255, shape=(self.unwrapped.screen_height, self.unwrapped.screen_width, 3)) 49 | self.observation_space = self.unwrapped.observation_space 50 | self.unwrapped.allowed_actions = [0, 10, 11, 13, 14, 15] # Must match exactly and in order the actions in the config file 51 | # (The number is the action number based on controls.md) 52 | # This will only enable these actions out of the 43 available buttons 53 | 54 | # Converting to Discrete action space 55 | discrete_actions = self.unwrapped.allowed_actions 56 | self.action_space = gym.spaces.DiscreteToMultiDiscrete(self.unwrapped.action_space, discrete_actions) 57 | 58 | # Alternative to convert to continuous action space 59 | # box_actions = self.unwrapped.allowed_actions 60 | # self.action_space = gym.spaces.BoxToMultiDiscrete(self.unwrapped.action_space, box_actions) 61 | 62 | # Bouding method to env 63 | self.unwrapped._customize_game = types.MethodType(_customize_game, self.unwrapped) 64 | 65 | def _step(self, action): 66 | return self.unwrapped._step(self.action_space(action)) 67 | 68 | return CustomGameWrapper 69 | -------------------------------------------------------------------------------- /jobs/job_PM2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.01,loss_type=count,c=0.005,name=lr:0.01-loss_type:count-c:0.005 3 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp 
bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.01,loss_type=count,c=0.05,name=lr:0.01-loss_type:count-c:0.05 4 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.001,loss_type=count,c=0.005,name=lr:0.001-loss_type:count-c:0.005 5 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.001,loss_type=count,c=0.05,name=lr:0.001-loss_type:count-c:0.05 6 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.0001,loss_type=count,c=0.005,name=lr:0.0001-loss_type:count-c:0.005 7 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.0001,loss_type=count,c=0.05,name=lr:0.0001-loss_type:count-c:0.05 8 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.01,loss_type=count,c=0.005,name=lr:0.01-loss_type:count-c:0.005 9 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.01,loss_type=count,c=0.05,name=lr:0.01-loss_type:count-c:0.05 10 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.001,loss_type=count,c=0.005,name=lr:0.001-loss_type:count-c:0.005 11 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.001,loss_type=count,c=0.05,name=lr:0.001-loss_type:count-c:0.05 12 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.0001,loss_type=count,c=0.005,name=lr:0.0001-loss_type:count-c:0.005 13 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,item1=n_mcts,seq1=1+5+10+25+50+100,item2=entropy_l,seq2=0.0+0.1+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=500000,V_decision=max,lr=0.0001,loss_type=count,c=0.05,name=lr:0.0001-loss_type:count-c:0.05 14 | 
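Sweep scripts like job_PM.sh and job_PM2.sh above are plain lists of `python3 submit.py` calls; jobs/expand_jobs_over_games.py writes files of exactly this shape. Below is a minimal usage sketch of `expand_job` (the output file name, the chosen hyperparameter values, and the path setup are illustrative assumptions, not taken from the repository):

# Sketch only: assumes this is run from the jobs/ directory with the
# repository's src/ directory on sys.path, so that the module-level
# `from common.visualize import make_name` inside expand_jobs_over_games resolves.
from expand_jobs_over_games import expand_job

expand_job(
    games=['Pendulum-v0s', 'MountainCarContinuous-v0'],
    job='job_example.sh',   # written to the current working directory
    hp_setup='n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59',
    hp='bound=beta,n_t=20000000,n_eps=500000,V_decision=max,loss_type=count,c=0.05',
    item1='lr', seq1=['0.01', '0.001', '0.0001'],
)
# The resulting job_example.sh starts with '#!/bin/sh' and contains one line per
# (game, lr) pair, e.g. (abbreviated):
# python3 submit.py --hpsetup game=Pendulum-v0s,n_rep=5,... --hp bound=beta,...,lr=0.01,name=lr:0.01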
-------------------------------------------------------------------------------- /src/common/rl/make_game.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Wrapper to generate the game environments 4 | @author: thomas 5 | """ 6 | import gym 7 | import numpy as np 8 | from gym import spaces 9 | import os 10 | 11 | from .envs.chain import Chain,ChainOrdered,ChainLoop 12 | from .wrappers.control import NormalizeWrapper,ReparametrizeWrapper,PILCOWrapper,ScaleRewardWrapper 13 | from .wrappers.atari import ClipRewardWrapper 14 | 15 | # Register deterministic FrozenLakes 16 | from gym.envs.registration import register 17 | #register( 18 | # id='FrozenLakeNotSlippery-v0', 19 | # entry_point='gym.envs.toy_text:FrozenLakeEnv', 20 | # kwargs={'map_name' : '4x4', 'is_slippery': False}, 21 | # max_episode_steps=100, 22 | # reward_threshold=0.78, # optimum = .8196 23 | #) 24 | #register( 25 | # id='FrozenLakeNotSlippery-v1', 26 | # entry_point='gym.envs.toy_text:FrozenLakeEnv', 27 | # kwargs={'map_name' : '8x8', 'is_slippery': False}, 28 | # max_episode_steps=100, 29 | # reward_threshold=0.78, # optimum = .8196 30 | #) 31 | 32 | def get_base_env(env): 33 | ''' removes all wrappers ''' 34 | while hasattr(env,'env'): 35 | env = env.env 36 | return env 37 | 38 | def is_atari_game(env): 39 | ''' Verify whether game uses the Arcade Learning Environment ''' 40 | env = get_base_env(env) 41 | return hasattr(env,'ale') 42 | 43 | def prepare_gym_env(game): 44 | ''' Modifications to Env ''' 45 | print(game) 46 | name,version = game.rsplit('-',1) 47 | if len(version) > 2: 48 | modify = version[2:] 49 | game = name + '-' + version[:2] 50 | else: 51 | modify = '' 52 | 53 | env = gym.make(game) 54 | # remove timelimit wrapper 55 | if type(env) == gym.wrappers.time_limit.TimeLimit: 56 | env = env.env 57 | 58 | print(modify) 59 | # prepare control 60 | if 'n' in modify and type(env.observation_space) == gym.spaces.Box: 61 | env = NormalizeWrapper(env) 62 | if 'r' in modify: 63 | env = ReparametrizeWrapper(env) 64 | if 'p' in modify: 65 | env = PILCOWrapper(env) 66 | if 's' in modify: 67 | env = ScaleRewardWrapper(env) 68 | 69 | if 'CartPole' in game: 70 | env.observation_space = gym.spaces.Box(np.array([-4.8,-10,-4.8,-10]),np.array([4.8,10,4.8,10])) 71 | 72 | # prepare atari 73 | if is_atari_game(env): 74 | env = prepare_atari_env(env) 75 | return env 76 | 77 | def prepare_atari_env(Env,frame_skip=3,repeat_action_prob=0.0,reward_clip=True): 78 | ''' Initialize an Atari environment ''' 79 | env = get_base_env(Env) 80 | env.ale.setFloat('repeat_action_probability'.encode('utf-8'), repeat_action_prob) 81 | env.frame_skip = frame_skip 82 | if reward_clip: 83 | Env = ClipRewardWrapper(Env) 84 | return Env 85 | 86 | def prepare_chain_env(game): 87 | game,n = game.split('-') 88 | if game == 'Chain': 89 | Env = Chain(int(n)) 90 | elif game == 'ChainOrdered': 91 | Env = ChainOrdered(int(n)) 92 | elif game == 'ChainLoop': 93 | Env = ChainLoop(int(n)) 94 | return Env 95 | 96 | def make_game(game): 97 | ''' Overall wrapper for gym.make_game ''' 98 | os.system('export LD_LIBRARY_PATH=`$HOME/.mujoco/mjpro150/bin`') # necessary for mujoco 99 | if 'Chain' in game: 100 | Env = prepare_chain_env(game) 101 | else: 102 | Env = prepare_gym_env(game) 103 | return Env 104 | 105 | def check_space(space): 106 | '''check the properties of the env ''' 107 | if isinstance(space,spaces.Box): 108 | dim = space.shape # should the zero be here? 
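        # Note: for a gym Box space, `space.shape` is a tuple (e.g. (3,) for
        # Pendulum observations), while the Discrete branch below returns the
        # plain integer `space.n`; the question above presumably asks whether
        # `space.shape[0]` was intended so that both branches return a scalar.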
109 | discrete = False 110 | elif isinstance(space,spaces.Discrete): 111 | dim = space.n 112 | discrete = True 113 | else: 114 | raise NotImplementedError 115 | return dim, discrete -------------------------------------------------------------------------------- /src/common/rl/envs/chain.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Chain environment 4 | @author: thomas 5 | """ 6 | 7 | import gym.spaces 8 | import numpy as np 9 | 10 | class ChainOrdered(object): 11 | ''' Chain domain ''' 12 | 13 | def __init__(self,n=10): 14 | # n = length of chain 15 | self.action_space = gym.spaces.Discrete(2) 16 | self.observation_space = gym.spaces.Discrete(n+1) 17 | self.n = n 18 | self.state = 0 19 | self.correct = np.repeat(1,n) 20 | 21 | def reset(self): 22 | self.state = 0 23 | return self.state 24 | 25 | def step(self,a): 26 | if a == 0: 27 | # move back 28 | self.state = 0 29 | r = 0 30 | terminal = True 31 | elif a == 1: 32 | # move forward 33 | self.state += 1 34 | if self.state == self.n: 35 | r = 1 36 | terminal = True 37 | else: 38 | r = 0 39 | terminal = False 40 | else: 41 | raise ValueError('Action not possible') 42 | 43 | return self.state,r,terminal, {} 44 | 45 | def seed(self,seed): 46 | pass # deterministic anyway 47 | 48 | class Chain(object): 49 | ''' Chain domain ''' 50 | 51 | def __init__(self,n=10): 52 | # n = length of chain 53 | self.action_space = gym.spaces.Discrete(2) 54 | self.observation_space = gym.spaces.Discrete(n+1) 55 | self.n = n 56 | self.state = 0 57 | self.correct = np.random.randint(0,2,n) # correct action in each state 58 | self.counts = np.zeros((self.n,2)) 59 | 60 | def reset(self): 61 | self.state = 0 62 | return self.state 63 | 64 | def step(self,a): 65 | self.counts[self.state,a] += 1 66 | if a != self.correct[self.state]: 67 | # move back 68 | self.state = 0 69 | r = 0 70 | terminal = True 71 | elif a == self.correct[self.state]: 72 | # move forward 73 | self.state += 1 74 | if self.state == self.n: 75 | r = 1 76 | terminal = True 77 | else: 78 | r = 0 79 | terminal = False 80 | else: 81 | raise ValueError('Action not possible') 82 | 83 | return self.state,r,terminal, {} 84 | 85 | def seed(self,seed): 86 | pass # deterministic anyway 87 | 88 | 89 | class ChainLoop(object): 90 | ''' Chain domain ''' 91 | 92 | def __init__(self,n=10): 93 | # n = length of chain 94 | self.action_space = gym.spaces.Discrete(2) 95 | self.observation_space = gym.spaces.Discrete(n+1) 96 | self.n = n 97 | self.state = 0 98 | self.correct = np.random.randint(0,2,n) # correct action in each state 99 | self.counts = np.zeros((self.n,2)) 100 | 101 | def reset(self): 102 | self.state = 0 103 | return self.state 104 | 105 | def step(self,a): 106 | self.counts[self.state,a] += 1 107 | if a != self.correct[self.state]: 108 | # move back 109 | self.state = 0 110 | r = 0 111 | terminal = False 112 | elif a == self.correct[self.state]: 113 | # move forward 114 | self.state += 1 115 | if self.state == self.n: 116 | r = 1 117 | terminal = True 118 | else: 119 | r = 0 120 | terminal = False 121 | else: 122 | raise ValueError('Action not possible') 123 | 124 | return self.state,r,terminal, {} 125 | 126 | def seed(self,seed): 127 | pass # deterministic anyway 128 | 129 | # Test 130 | if __name__ == '__main__': 131 | Env = ChainOrdered() 132 | s = Env.reset() 133 | for i in range(500): 134 | a = Env.action_space.sample() 135 | s,r,terminal,_ = Env.step(a) 136 | if terminal: 137 | print('Died in step',i,'with 
reward',r,' restarting') 138 | s = Env.reset() 139 | print('Finished') -------------------------------------------------------------------------------- /jobs/job_p.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.001,random_action_frac=0.0,use_prior=True,name=lr:0.001-random_action_frac:0.0-use_prior:True 3 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.001,random_action_frac=0.0,use_prior=False,name=lr:0.001-random_action_frac:0.0-use_prior:False 4 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.001,random_action_frac=1.0,use_prior=True,name=lr:0.001-random_action_frac:1.0-use_prior:True 5 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.001,random_action_frac=1.0,use_prior=False,name=lr:0.001-random_action_frac:1.0-use_prior:False 6 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.0001,random_action_frac=0.0,use_prior=True,name=lr:0.0001-random_action_frac:0.0-use_prior:True 7 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.0001,random_action_frac=0.0,use_prior=False,name=lr:0.0001-random_action_frac:0.0-use_prior:False 8 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.0001,random_action_frac=1.0,use_prior=True,name=lr:0.0001-random_action_frac:1.0-use_prior:True 9 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.0001,random_action_frac=1.0,use_prior=False,name=lr:0.0001-random_action_frac:1.0-use_prior:False 10 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp 
bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.00001,random_action_frac=0.0,use_prior=True,name=lr:0.00001-random_action_frac:0.0-use_prior:True 11 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.00001,random_action_frac=0.0,use_prior=False,name=lr:0.00001-random_action_frac:0.0-use_prior:False 12 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.00001,random_action_frac=1.0,use_prior=True,name=lr:0.00001-random_action_frac:1.0-use_prior:True 13 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=5+10+25,item2=entropy_l,seq2=0.0+0.01,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=20000000,n_eps=50000,V_decision=max,clip_gradient_norm=1.0,c=0.05,lr=0.00001,random_action_frac=1.0,use_prior=False,name=lr:0.00001-random_action_frac:1.0-use_prior:False 14 | -------------------------------------------------------------------------------- /src/common/rl/wrappers/control.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed May 9 14:57:40 2018 5 | 6 | @author: thomas 7 | """ 8 | import gym 9 | import numpy as np 10 | from scipy.stats import multivariate_normal 11 | import sklearn.preprocessing 12 | 13 | class ObservationRewardWrapper(gym.Wrapper): 14 | ''' My own base class - allows for both observation and reward modification ''' 15 | def step(self, action): 16 | observation, reward, done, info = self.env.step(action) 17 | return self.observation(observation), self.reward(reward), done, info 18 | 19 | def reset(self): 20 | observation = self.env.reset() 21 | return self.observation(observation) 22 | 23 | def observation(self, observation): 24 | return observation 25 | 26 | def reward(self, reward): 27 | return reward 28 | 29 | def get_name(env): 30 | while True: 31 | if hasattr(env,'_spec'): 32 | name = env._spec.id 33 | break 34 | elif hasattr(env,'spec'): 35 | name = env.spec.id 36 | break 37 | else: 38 | env = env.env 39 | return name 40 | 41 | class NormalizeWrapper(ObservationRewardWrapper): 42 | ''' normalizes the input data range ''' 43 | def __init__(self, env): 44 | ObservationRewardWrapper.__init__(self, env) 45 | self.name = get_name(env) 46 | observation_examples = np.array([env.observation_space.sample() for x in range(10000)]) 47 | self.scaler = sklearn.preprocessing.StandardScaler() 48 | self.scaler.fit(observation_examples) 49 | 50 | def observation(self, observation): 51 | return self.scaler.transform([observation])[0] 52 | 53 | class ScaleRewardWrapper(ObservationRewardWrapper): 54 | 55 | def __init__(self, env): 56 | gym.RewardWrapper.__init__(self, env) 57 | self.name = get_name(env) 58 | 59 | def reward(self, reward): 60 | """ Rescale reward """ 61 | if 'Pendulum' in self.name: 62 | return np.float32(reward/1000.0) 63 | #elif 'MountainCarContinuous' in self.name: 64 | # return np.float32(reward/500.0) 65 | elif 'Lunarlander' in self.name: 66 | return np.float32(reward/250.0) 67 | else: 68 | return reward 69 | 
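# Usage sketch: these wrappers are applied by prepare_gym_env in make_game.py
# based on the modifier suffix after the version tag, e.g. 'Pendulum-v0s' is
# parsed as game 'Pendulum-v0' with modifier 's':
#
#   env = gym.make('Pendulum-v0')
#   env = ScaleRewardWrapper(env)    # 's': Pendulum rewards divided by 1000
#
# With modifier 'ns', NormalizeWrapper is applied first (standardizing
# observations) and ScaleRewardWrapper is stacked on top, following the
# wrapper order in prepare_gym_env.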
70 | class ReparametrizeWrapper(ObservationRewardWrapper): 71 | 72 | def __init__(self, env): 73 | gym.RewardWrapper.__init__(self, env) 74 | self.name = get_name(env) 75 | 76 | def step(self, action): 77 | observation, reward, terminal, info = self.env.step(action) 78 | return self.observation(observation), self.reward(reward,terminal), terminal, info 79 | 80 | def reward(self,r,terminal): 81 | if 'CartPole' in self.name: 82 | if terminal: 83 | r = -1 84 | else: 85 | r = 0.005 86 | elif 'MountainCar' in self.name: 87 | if terminal: 88 | r = 1 89 | else: 90 | r = -0.005 91 | elif 'Acrobot' in self.name: 92 | if terminal: 93 | r = 1 94 | else: 95 | r = -0.005 96 | return r 97 | 98 | class PILCOWrapper(ObservationRewardWrapper): 99 | 100 | def __init__(self, env): 101 | gym.RewardWrapper.__init__(self, env) 102 | while True: 103 | if hasattr(env,'_spec'): 104 | self.name = env._spec.id 105 | break 106 | else: 107 | env = env.env 108 | 109 | def step(self, action): 110 | observation, reward, terminal, info = self.env.step(action) 111 | return self.observation(observation), self.reward(observation), terminal, info 112 | 113 | def reward(self,s): 114 | if 'CartPole' in self.name: 115 | target = np.array([0.0,0.0,0.0,0.0]) 116 | elif 'Acrobot' in self.name: 117 | target = np.array([1.0]) 118 | s = -np.cos(s[0]) - np.cos(s[1] + s[0]) 119 | elif 'MountainCar' in self.name: 120 | target = np.array([0.5]) 121 | s = s[0] 122 | elif 'Pendulum' in self.name: 123 | target = np.array([0.0,0.0]) 124 | else: 125 | raise ValueError('no PILCO reward mofication for this game') 126 | return 1 - multivariate_normal.pdf(s,mean=target) -------------------------------------------------------------------------------- /jobs/job_backup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | python3 submit.py --hpsetup game=Pendulum-v0s,name=mcts20lr01,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=20,lr=0.01 4 | python3 submit.py --hpsetup game=Pendulum-v0s,name=mcts50lr01,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=50,lr=0.01 5 | python3 submit.py --hpsetup game=Pendulum-v0s,name=mcts150lr01,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=150,lr=0.01 6 | 7 | python3 submit.py --hpsetup game=Pendulum-v0s,name=mcts20lr001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=20,lr=0.001 8 | python3 submit.py --hpsetup game=Pendulum-v0s,name=mcts50lr001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=50,lr=0.001 9 | python3 submit.py --hpsetup game=Pendulum-v0s,name=mcts150lr001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=150,lr=0.001 10 | 11 | python3 submit.py --hpsetup 
game=Pendulum-v0s,name=mcts20lr0001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=20,lr=0.0001 12 | python3 submit.py --hpsetup game=Pendulum-v0s,name=mcts50lr0001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=50,lr=0.0001 13 | python3 submit.py --hpsetup game=Pendulum-v0s,name=mcts150lr0001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=150,lr=0.0001 14 | 15 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,name=mcts20lr01,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=20,lr=0.01 16 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,name=mcts50lr01,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=50,lr=0.01 17 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,name=mcts150lr01,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=150,lr=0.01 18 | 19 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,name=mcts20lr001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=20,lr=0.001 20 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,name=mcts50lr001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=50,lr=0.001 21 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,name=mcts150lr001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=150,lr=0.001 22 | 23 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,name=mcts20lr0001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=20,lr=0.0001 24 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,name=mcts50lr0001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=50,lr=0.0001 25 | python3 submit.py --hpsetup game=MountainCarContinuous-v0,name=mcts150lr0001,item1=c,seq1=1.0+2.5+5.0,item2=loss_type,seq2=count+Q,item3=temp,seq3=0.1+1.0+10.0,n_rep=3,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp n_eps=10000,n_t=10000000,n_mcts=150,lr=0.0001 26 | -------------------------------------------------------------------------------- /src/common/hps_setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | General hyperparameters for hyperlooping and slurm setup 4 | @author: thomas 5 | """ 6 | 7 | import copy 8 | 9 | def get_hps_setup(): 10 | ''' Hyperparameter settings 
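    for slurm submission and the hyperparameter loop: item1..item4 name the
    hyperparameters to sweep, seq1..seq4 hold the matching value lists (given
    on the command line as '+'-separated strings, e.g. seq1=50+200, and split
    in HParams.parse below), and n_rep sets the number of repetitions.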
''' 11 | return HParams( 12 | # General 13 | game = 'None', # Environment name 14 | name = 'None', # Name of experiment 15 | 16 | # Slurm parameters 17 | slurm = False, 18 | slurm_qos = 'short', 19 | slurm_time = '3:59:59', 20 | cpu_per_task = 2, 21 | mem_per_cpu = 2048, 22 | distributed = False, 23 | n_jobs = 16, # distribute over n_jobs processes 24 | 25 | # Hyperparameter looping 26 | n_rep = 1, 27 | rep = 0, # repetition index 28 | loop_hyper = False, 29 | item1 = None, 30 | seq1 = [None], 31 | item2 = None, 32 | seq2 = [None], 33 | item3 = None, 34 | seq3 = [None], 35 | item4 = None, 36 | seq4 = [None], 37 | ) 38 | 39 | def hps_to_list(hps): 40 | out=[] 41 | hps_dict = copy.deepcopy(hps.__dict__) 42 | try: 43 | del hps_dict['_items'] 44 | except: 45 | pass 46 | for k,v in hps_dict.items(): 47 | if type(v) == list: 48 | v='+'.join(str(x) for x in v) 49 | if not (v is None or v == 'None'): # should not write the default hyperloop settings 50 | out.append('{}={}'.format(k,v)) 51 | out.sort() 52 | return ','.join(out) 53 | 54 | def hps_to_dict(hps): 55 | hps_dict = copy.deepcopy(hps.__dict__) 56 | try: 57 | del hps_dict['_items'] 58 | except: 59 | pass 60 | return hps_dict 61 | 62 | class HParams(object): 63 | 64 | def __init__(self, **kwargs): 65 | self._items = {} 66 | for k, v in kwargs.items(): 67 | self._set(k, v) 68 | 69 | def _set(self, k, v): 70 | self._items[k] = v 71 | setattr(self, k, v) 72 | 73 | def _get(self,k): 74 | return self._items[k] 75 | 76 | def __eq__(self, other) : 77 | return self.__dict__ == other.__dict__ 78 | 79 | def parse(self, str_value,hps_extra=None): 80 | hps = HParams(**self._items) 81 | for entry in str_value.strip().split(","): 82 | entry = entry.strip() 83 | if not entry: 84 | continue 85 | key, sep, value = entry.partition("=") 86 | if not sep: 87 | raise ValueError("Unable to parse: %s" % entry) 88 | try: 89 | default_value = hps._items[key] 90 | except: 91 | print('Cant parse key {}, skipping'.format(key)) 92 | continue 93 | if isinstance(default_value, bool): 94 | hps._set(key, value.lower() == "true") 95 | elif isinstance(default_value, int): 96 | hps._set(key, int(value)) 97 | elif default_value is None and value == 'None': 98 | hps._set(key, None) 99 | elif isinstance(default_value, float): 100 | hps._set(key, float(value)) 101 | elif isinstance(default_value, list): 102 | value = value.split('+') 103 | default_inlist = hps._items[key][0] 104 | if key == 'seq1': 105 | if hps_extra is not None: 106 | default_inlist = hps_extra._items[hps._items['item1']] 107 | else: 108 | default_inlist = hps._items[hps._items['item1']] 109 | if key == 'seq2': 110 | if hps_extra is not None: 111 | default_inlist = hps_extra._items[hps._items['item2']] 112 | else: 113 | default_inlist = hps._items[hps._items['item2']] 114 | if key == 'seq3': 115 | if hps_extra is not None: 116 | default_inlist = hps_extra._items[hps._items['item3']] 117 | else: 118 | default_inlist = hps._items[hps._items['item3']] 119 | if isinstance(default_inlist, bool): 120 | hps._set(key, [i.lower() == "true" for i in value]) 121 | elif isinstance(default_inlist, int): 122 | hps._set(key, [int(i) for i in value]) 123 | elif isinstance(default_inlist, float): 124 | hps._set(key, [float(i) for i in value]) 125 | else: 126 | hps._set(key,value) # string 127 | else: 128 | hps._set(key, value) 129 | return hps 130 | -------------------------------------------------------------------------------- /src/rl/wrappers/action_space.py: 
-------------------------------------------------------------------------------- 1 | import gym 2 | from .multi_discrete import BoxToMultiDiscrete, DiscreteToMultiDiscrete 3 | 4 | # Constants 5 | NUM_ACTIONS = 43 6 | ALLOWED_ACTIONS = [ 7 | [0, 10, 11], # 0 - Basic 8 | [0, 10, 11, 13, 14, 15], # 1 - Corridor 9 | [0, 14, 15], # 2 - DefendCenter 10 | [0, 14, 15], # 3 - DefendLine 11 | [13, 14, 15], # 4 - HealthGathering 12 | [13, 14, 15], # 5 - MyWayHome 13 | [0, 14, 15], # 6 - PredictPosition 14 | [10, 11], # 7 - TakeCover 15 | [x for x in range(NUM_ACTIONS) if x != 33], # 8 - Deathmatch 16 | ] 17 | 18 | __all__ = [ 'ToDiscrete', 'ToBox' ] 19 | 20 | def ToDiscrete(config): 21 | # Config can be 'minimal', 'constant-7', 'constant-17', 'full' 22 | 23 | class ToDiscreteWrapper(gym.Wrapper): 24 | """ 25 | Doom wrapper to convert MultiDiscrete action space to Discrete 26 | 27 | config: 28 | - minimal - Will only use the levels' allowed actions (+ NOOP) 29 | - constant-7 - Will use the 7 minimum actions (+NOOP) to complete all levels 30 | - constant-17 - Will use the 17 most common actions (+NOOP) to complete all levels 31 | - full - Will use all available actions (+ NOOP) 32 | 33 | list of commands: 34 | - minimal: 35 | Basic: NOOP, ATTACK, MOVE_RIGHT, MOVE_LEFT 36 | Corridor: NOOP, ATTACK, MOVE_RIGHT, MOVE_LEFT, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 37 | DefendCenter NOOP, ATTACK, TURN_RIGHT, TURN_LEFT 38 | DefendLine: NOOP, ATTACK, TURN_RIGHT, TURN_LEFT 39 | HealthGathering: NOOP, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 40 | MyWayHome: NOOP, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 41 | PredictPosition: NOOP, ATTACK, TURN_RIGHT, TURN_LEFT 42 | TakeCover: NOOP, MOVE_RIGHT, MOVE_LEFT 43 | Deathmatch: NOOP, ALL COMMANDS (Deltas are limited to [0,1] range and will not work properly) 44 | 45 | - constant-7: NOOP, ATTACK, MOVE_RIGHT, MOVE_LEFT, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT, SELECT_NEXT_WEAPON 46 | 47 | - constant-17: NOOP, ATTACK, JUMP, CROUCH, TURN180, RELOAD, SPEED, STRAFE, MOVE_RIGHT, MOVE_LEFT, MOVE_BACKWARD 48 | MOVE_FORWARD, TURN_RIGHT, TURN_LEFT, LOOK_UP, LOOK_DOWN, SELECT_NEXT_WEAPON, SELECT_PREV_WEAPON 49 | """ 50 | def __init__(self, env): 51 | super(ToDiscreteWrapper, self).__init__(env) 52 | if config == 'minimal': 53 | allowed_actions = ALLOWED_ACTIONS[self.unwrapped.level] 54 | elif config == 'constant-7': 55 | allowed_actions = [0, 10, 11, 13, 14, 15, 31] 56 | elif config == 'constant-17': 57 | allowed_actions = [0, 2, 3, 4, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 31, 32] 58 | elif config == 'full': 59 | allowed_actions = None 60 | else: 61 | raise gym.error.Error('Invalid configuration. 
Valid options are "minimal", "constant-7", "constant-17", "full"') 62 | self.action_space = DiscreteToMultiDiscrete(self.action_space, allowed_actions) 63 | def _step(self, action): 64 | return self.env._step(self.action_space(action)) 65 | 66 | return ToDiscreteWrapper 67 | 68 | def ToBox(config): 69 | # Config can be 'minimal', 'constant-7', 'constant-17', 'full' 70 | 71 | class ToBoxWrapper(gym.Wrapper): 72 | """ 73 | Doom wrapper to convert MultiDiscrete action space to Box 74 | 75 | config: 76 | - minimal - Will only use the levels' allowed actions 77 | - constant-7 - Will use the 7 minimum actions to complete all levels 78 | - constant-17 - Will use the 17 most common actions to complete all levels 79 | - full - Will use all available actions 80 | 81 | list of commands: 82 | - minimal: 83 | Basic: ATTACK, MOVE_RIGHT, MOVE_LEFT 84 | Corridor: ATTACK, MOVE_RIGHT, MOVE_LEFT, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 85 | DefendCenter ATTACK, TURN_RIGHT, TURN_LEFT 86 | DefendLine: ATTACK, TURN_RIGHT, TURN_LEFT 87 | HealthGathering: MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 88 | MyWayHome: MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 89 | PredictPosition: ATTACK, TURN_RIGHT, TURN_LEFT 90 | TakeCover: MOVE_RIGHT, MOVE_LEFT 91 | Deathmatch: ALL COMMANDS 92 | 93 | - constant-7: ATTACK, MOVE_RIGHT, MOVE_LEFT, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT, SELECT_NEXT_WEAPON 94 | 95 | - constant-17: ATTACK, JUMP, CROUCH, TURN180, RELOAD, SPEED, STRAFE, MOVE_RIGHT, MOVE_LEFT, MOVE_BACKWARD 96 | MOVE_FORWARD, TURN_RIGHT, TURN_LEFT, LOOK_UP, LOOK_DOWN, SELECT_NEXT_WEAPON, SELECT_PREV_WEAPON 97 | """ 98 | def __init__(self, env): 99 | super(ToBoxWrapper, self).__init__(env) 100 | if config == 'minimal': 101 | allowed_actions = ALLOWED_ACTIONS[self.unwrapped.level] 102 | elif config == 'constant-7': 103 | allowed_actions = [0, 10, 11, 13, 14, 15, 31] 104 | elif config == 'constant-17': 105 | allowed_actions = [0, 2, 3, 4, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 31, 32] 106 | elif config == 'full': 107 | allowed_actions = None 108 | else: 109 | raise gym.error.Error('Invalid configuration. 
Valid options are "minimal", "constant-7", "constant-17", "full"') 110 | self.action_space = BoxToMultiDiscrete(self.action_space, allowed_actions) 111 | def _step(self, action): 112 | return self.env._step(self.action_space(action)) 113 | 114 | return ToBoxWrapper 115 | -------------------------------------------------------------------------------- /src/common/rl/wrappers/doom/action_space.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from .multi_discrete import BoxToMultiDiscrete, DiscreteToMultiDiscrete 3 | 4 | # Constants 5 | NUM_ACTIONS = 43 6 | ALLOWED_ACTIONS = [ 7 | [0, 10, 11], # 0 - Basic 8 | [0, 10, 11, 13, 14, 15], # 1 - Corridor 9 | [0, 14, 15], # 2 - DefendCenter 10 | [0, 14, 15], # 3 - DefendLine 11 | [13, 14, 15], # 4 - HealthGathering 12 | [13, 14, 15], # 5 - MyWayHome 13 | [0, 14, 15], # 6 - PredictPosition 14 | [10, 11], # 7 - TakeCover 15 | [x for x in range(NUM_ACTIONS) if x != 33], # 8 - Deathmatch 16 | ] 17 | 18 | __all__ = [ 'ToDiscrete', 'ToBox' ] 19 | 20 | def ToDiscrete(config): 21 | # Config can be 'minimal', 'constant-7', 'constant-17', 'full' 22 | 23 | class ToDiscreteWrapper(gym.Wrapper): 24 | """ 25 | Doom wrapper to convert MultiDiscrete action space to Discrete 26 | 27 | config: 28 | - minimal - Will only use the levels' allowed actions (+ NOOP) 29 | - constant-7 - Will use the 7 minimum actions (+NOOP) to complete all levels 30 | - constant-17 - Will use the 17 most common actions (+NOOP) to complete all levels 31 | - full - Will use all available actions (+ NOOP) 32 | 33 | list of commands: 34 | - minimal: 35 | Basic: NOOP, ATTACK, MOVE_RIGHT, MOVE_LEFT 36 | Corridor: NOOP, ATTACK, MOVE_RIGHT, MOVE_LEFT, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 37 | DefendCenter NOOP, ATTACK, TURN_RIGHT, TURN_LEFT 38 | DefendLine: NOOP, ATTACK, TURN_RIGHT, TURN_LEFT 39 | HealthGathering: NOOP, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 40 | MyWayHome: NOOP, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 41 | PredictPosition: NOOP, ATTACK, TURN_RIGHT, TURN_LEFT 42 | TakeCover: NOOP, MOVE_RIGHT, MOVE_LEFT 43 | Deathmatch: NOOP, ALL COMMANDS (Deltas are limited to [0,1] range and will not work properly) 44 | 45 | - constant-7: NOOP, ATTACK, MOVE_RIGHT, MOVE_LEFT, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT, SELECT_NEXT_WEAPON 46 | 47 | - constant-17: NOOP, ATTACK, JUMP, CROUCH, TURN180, RELOAD, SPEED, STRAFE, MOVE_RIGHT, MOVE_LEFT, MOVE_BACKWARD 48 | MOVE_FORWARD, TURN_RIGHT, TURN_LEFT, LOOK_UP, LOOK_DOWN, SELECT_NEXT_WEAPON, SELECT_PREV_WEAPON 49 | """ 50 | def __init__(self, env): 51 | super(ToDiscreteWrapper, self).__init__(env) 52 | if config == 'minimal': 53 | allowed_actions = ALLOWED_ACTIONS[self.unwrapped.level] 54 | elif config == 'constant-7': 55 | allowed_actions = [0, 10, 11, 13, 14, 15, 31] 56 | elif config == 'constant-17': 57 | allowed_actions = [0, 2, 3, 4, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 31, 32] 58 | elif config == 'full': 59 | allowed_actions = None 60 | else: 61 | raise gym.error.Error('Invalid configuration. 
Valid options are "minimal", "constant-7", "constant-17", "full"') 62 | self.action_space = DiscreteToMultiDiscrete(self.action_space, allowed_actions) 63 | def _step(self, action): 64 | return self.env._step(self.action_space(action)) 65 | 66 | return ToDiscreteWrapper 67 | 68 | def ToBox(config): 69 | # Config can be 'minimal', 'constant-7', 'constant-17', 'full' 70 | 71 | class ToBoxWrapper(gym.Wrapper): 72 | """ 73 | Doom wrapper to convert MultiDiscrete action space to Box 74 | 75 | config: 76 | - minimal - Will only use the levels' allowed actions 77 | - constant-7 - Will use the 7 minimum actions to complete all levels 78 | - constant-17 - Will use the 17 most common actions to complete all levels 79 | - full - Will use all available actions 80 | 81 | list of commands: 82 | - minimal: 83 | Basic: ATTACK, MOVE_RIGHT, MOVE_LEFT 84 | Corridor: ATTACK, MOVE_RIGHT, MOVE_LEFT, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 85 | DefendCenter ATTACK, TURN_RIGHT, TURN_LEFT 86 | DefendLine: ATTACK, TURN_RIGHT, TURN_LEFT 87 | HealthGathering: MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 88 | MyWayHome: MOVE_FORWARD, TURN_RIGHT, TURN_LEFT 89 | PredictPosition: ATTACK, TURN_RIGHT, TURN_LEFT 90 | TakeCover: MOVE_RIGHT, MOVE_LEFT 91 | Deathmatch: ALL COMMANDS 92 | 93 | - constant-7: ATTACK, MOVE_RIGHT, MOVE_LEFT, MOVE_FORWARD, TURN_RIGHT, TURN_LEFT, SELECT_NEXT_WEAPON 94 | 95 | - constant-17: ATTACK, JUMP, CROUCH, TURN180, RELOAD, SPEED, STRAFE, MOVE_RIGHT, MOVE_LEFT, MOVE_BACKWARD 96 | MOVE_FORWARD, TURN_RIGHT, TURN_LEFT, LOOK_UP, LOOK_DOWN, SELECT_NEXT_WEAPON, SELECT_PREV_WEAPON 97 | """ 98 | def __init__(self, env): 99 | super(ToBoxWrapper, self).__init__(env) 100 | if config == 'minimal': 101 | allowed_actions = ALLOWED_ACTIONS[self.unwrapped.level] 102 | elif config == 'constant-7': 103 | allowed_actions = [0, 10, 11, 13, 14, 15, 31] 104 | elif config == 'constant-17': 105 | allowed_actions = [0, 2, 3, 4, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 31, 32] 106 | elif config == 'full': 107 | allowed_actions = None 108 | else: 109 | raise gym.error.Error('Invalid configuration. 
Valid options are "minimal", "constant-7", "constant-17", "full"') 110 | self.action_space = BoxToMultiDiscrete(self.action_space, allowed_actions) 111 | def _step(self, action): 112 | return self.env._step(self.action_space(action)) 113 | 114 | return ToBoxWrapper 115 | -------------------------------------------------------------------------------- /src/rl/rewardwrapper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Chain environment 4 | @author: thomas 5 | """ 6 | 7 | import gym.spaces 8 | import gym 9 | import numpy as np 10 | 11 | from gym import Wrapper 12 | 13 | class NormalizeWrapper(object): 14 | ''' Heuristically normalizes the reward scale for CartPole and MountainCar ''' 15 | 16 | def __init__(self,name): 17 | # n = length of chain 18 | if 'CartPole' in name: 19 | self.env = gym.make('CartPole-v0') 20 | elif 'MountainCar' in name: 21 | self.env = gym.make('MountainCar-v0') 22 | self.name = name 23 | self.action_space = self.env.action_space 24 | self.observation_space = self.env.observation_space 25 | 26 | def reset(self): 27 | return self.env.reset() 28 | 29 | def step(self,a): 30 | s,r,terminal,_ = self.env.step(a) 31 | r = r/50 32 | return s,r,terminal, _ 33 | 34 | class PILCOWrapper(object): 35 | ''' Wraps according to PILCO ''' 36 | 37 | def __init__(self,name): 38 | # n = length of chain 39 | if 'CartPole' in name: 40 | self.env = gym.make('CartPole-v0') 41 | elif 'MountainCar' in name: 42 | self.env = gym.make('MountainCar-v0') 43 | self.name = name 44 | self.action_space = self.env.action_space 45 | self.observation_space = self.env.observation_space 46 | 47 | def reset(self): 48 | return self.env.reset() 49 | 50 | def step(self,a): 51 | s,r,terminal,_ = self.env.step(a) 52 | r = pilco_reward(s,self.name) 53 | return s,r,terminal, _ 54 | 55 | def pilco_reward(s,game='Cartpole-v0'): 56 | ''' use modified reward function as in Pilco ''' 57 | from scipy.stats import multivariate_normal 58 | if game == 'CartPole-vp': 59 | target = np.array([0.0,0.0,0.0,0.0]) 60 | elif game == 'Acrobot-vp': 61 | target = np.array([1.0]) 62 | s = -np.cos(s[0]) - np.cos(s[1] + s[0]) 63 | elif game == 'MountainCar-vp': 64 | target = np.array([0.5]) 65 | s = s[0] 66 | elif game == 'Pendulum-vp': 67 | target = np.array([0.0,0.0]) 68 | else: 69 | raise ValueError('no PILCO reward mofication for this game') 70 | r = 1 - multivariate_normal.pdf(s,mean=target) 71 | return r 72 | 73 | class RewardWrapper2(Wrapper): 74 | env = None 75 | 76 | def __init__(self, env): 77 | self.env = env 78 | self.action_space = self.env.action_space 79 | self.observation_space = self.env.observation_space 80 | self.reward_range = self.env.reward_range 81 | self.metadata = self.env.metadata 82 | self._warn_double_wrap() 83 | while True: 84 | if hasattr(env,'_spec'): 85 | self.name = env._spec.id 86 | break 87 | else: 88 | env = env.env 89 | 90 | def reset(self): 91 | return self.env.reset() 92 | 93 | def step(self, action): 94 | observation, reward, terminal, info = self.env.step(action) 95 | return observation, self.reward(reward,terminal), terminal, info 96 | 97 | def reward(self,r,terminal): 98 | if 'CartPole' in self.name: 99 | if terminal: 100 | r = -1 101 | else: 102 | r = 0.005 103 | elif 'MountainCar' in self.name: 104 | if terminal: 105 | r = 1 106 | else: 107 | r = -0.005 108 | elif 'Acrobot' in self.name: 109 | if terminal: 110 | r = 1 111 | else: 112 | r = -0.005 113 | elif 'LunarLander' in self.name: 114 | r = r/250.0 115 | return 
r 116 | 117 | 118 | class RewardWrapper(object): 119 | ''' Chain domain ''' 120 | 121 | def __init__(self,name): 122 | # n = length of chain 123 | if name == 'CartPole-vr': 124 | self.env = gym.make('CartPole-v1') 125 | if type(self.env) == gym.wrappers.time_limit.TimeLimit: 126 | self.env = self.env.env 127 | elif name == 'MountainCar-vr': 128 | self.env = gym.make('MountainCar-v0') 129 | if type(self.env) == gym.wrappers.time_limit.TimeLimit: 130 | self.env = self.env.env 131 | elif name == 'Acrobot-vr': 132 | self.env = gym.make('Acrobot-v1') 133 | if type(self.env) == gym.wrappers.time_limit.TimeLimit: 134 | self.env = self.env.env 135 | elif name == 'LunarLander-vr': 136 | self.env = gym.make('LunarLander-v2') 137 | # self.env.metadata = {} 138 | if type(self.env) == gym.wrappers.time_limit.TimeLimit: 139 | self.env = self.env.env 140 | self.name = name 141 | self.action_space = self.env.action_space 142 | self.observation_space = self.env.observation_space 143 | 144 | def reset(self): 145 | return self.env.reset() 146 | 147 | def step(self,a): 148 | s,r,terminal,_ = self.env.step(a) 149 | if self.name == 'CartPole-vr': 150 | if terminal: 151 | r = -1 152 | else: 153 | r = 0 154 | elif self.name == 'MountainCar-vr': 155 | if terminal: 156 | r = 1 157 | else: 158 | r = 0 159 | elif self.name == 'Acrobot-vr': 160 | if terminal: 161 | r = 1 162 | else: 163 | r = 0 164 | elif self.name == 'LunarLander-vr': 165 | r = r/250.0 166 | return s,r,terminal, _ 167 | 168 | def seed(self,seed): 169 | self.env.seed(seed) 170 | 171 | def render(self): 172 | return self.env.render() 173 | 174 | def close(self): 175 | return self.env.close() 176 | 177 | # Test 178 | if __name__ == '__main__': 179 | for game in ['MountainCar-vr','CartPole-vr']: 180 | Env = RewardWrapper(game) 181 | s = Env.reset() 182 | for i in range(500): 183 | a = Env.action_space.sample() 184 | s,r,terminal,_ = Env.step(a) 185 | if terminal: 186 | print('Died in step',i,'with reward',r,' restarting') 187 | s = Env.reset() 188 | print('Finished') -------------------------------------------------------------------------------- /src/common/submit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Script to submit jobs 4 | Handles slurm settings, hyperparameter looping, and potential plotting (if not on slurm) 5 | @author: thomas 6 | """ 7 | import os 8 | import argparse 9 | from pprint import pformat 10 | from .hps_setup import get_hps_setup,hps_to_dict,hps_to_list 11 | from .visualize import nested_list,make_name,plot_hyperloop_results 12 | 13 | def make_unique_subfolder(folder,hyperloop=False): 14 | ''' adds a unique four digit subfolder to folder ''' 15 | i = 0 16 | while os.path.exists(folder + candidate(i,hyperloop)): 17 | i += 1 18 | subfolder = folder + candidate(i,hyperloop) 19 | if not os.path.exists(subfolder): 20 | os.makedirs(subfolder) 21 | return subfolder 22 | 23 | def candidate(i,hyperloop): 24 | return '{0:04}h/'.format(i) if hyperloop else '{0:04}/'.format(i) 25 | 26 | def submit_slurm(hps,hps_setup,hyperloopname,job_dir,slurmout_dir,ntasks,nodes,n_cpu,mem_per_cpu): 27 | # make sh file 28 | run_file = job_dir + hps.game + hyperloopname + '0.sh' 29 | 30 | if hps_setup.distributed: 31 | base = ' '.join(['mpirun -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH' 32 | '-mca pml ob1 -mca btl ^openib python3 agent.py --hp {} --no_plot']).format(hps_to_list(hps)) # this should become mpirun 33 | else: 34 | base = 'srun python3 
agent.py --hp {} --no_plot '.format(hps_to_list(hps)) # this should become mpirun 35 | 36 | with open(run_file,'w') as fp: 37 | fp.write('#!/bin/sh\n') 38 | fp.write(base) 39 | 40 | # prepare sbatch command 41 | my_sbatch = ' '.join(['sbatch --partition=general --qos={} --time={} --ntasks={}', 42 | '--nodes={} --cpus-per-task={} --mem-per-cpu={} --mail-type=NONE', 43 | '--output={}slurm-%j.out', 44 | '--exclude=ess-2', 45 | '--workdir={}', 46 | '--job-name={} {}']).format(hps_setup.slurm_qos,hps_setup.slurm_time,ntasks, 47 | nodes,n_cpu,mem_per_cpu,slurmout_dir, 48 | os.getcwd(),hps.game,run_file) 49 | # run 50 | os.system('chmod +x {}'.format(run_file)) 51 | return_val = os.system(my_sbatch) 52 | if return_val != 0: 53 | raise ValueError('submission went wrong') 54 | 55 | def submit(hp_string,hpsetup_string,no_plot,agent,get_hps,override_hps_settings): 56 | hps = get_hps().parse(hp_string) 57 | hps_setup = get_hps_setup().parse(hpsetup_string,hps) 58 | # override game and name from hyperlooping 59 | if hps_setup.game != 'None': 60 | hps.game = hps_setup.game 61 | if hps_setup.name != 'None': 62 | hps.name = hps_setup.name 63 | 64 | # set-up base result folder 65 | result_folder = os.getcwd() + '/results/{}/{}/'.format(hps.name,hps.game) 66 | 67 | # check whether we should be hyperlooping 68 | loop_hyper = True if (hps_setup.item1 is not None or hps_setup.item2 is not None or hps_setup.item3 is not None) else False 69 | 70 | # make the unique subfolder 71 | subfolder = make_unique_subfolder(result_folder,loop_hyper) 72 | 73 | # Write hyperparameters in nice format 74 | with open(subfolder + 'hps_setup.txt','w') as file: 75 | file.write(pformat(hps_to_dict(hps_setup))) 76 | with open(subfolder + 'hps.txt','w') as file: 77 | file.write(pformat(hps_to_dict(hps))) 78 | with open(subfolder + 'hps_setup_raw.txt','w') as file: 79 | file.write(hps_to_list(hps_setup)) 80 | 81 | if not hps_setup.slurm: 82 | # for automatically plotting results if not on slurm 83 | n1,n2,n3 = len(hps_setup.seq1),len(hps_setup.seq2),len(hps_setup.seq3) 84 | results = nested_list(n1,n2,n3,hps_setup.n_rep) # handle plotting within this call, so agregate results 85 | else: 86 | no_plot = True 87 | # prepare slurm submission folders 88 | job_dir = os.getcwd() + '/results/jobs/' 89 | if not os.path.exists(job_dir): 90 | os.makedirs(job_dir) 91 | slurmout_dir = os.getcwd() + '/results/slurmout/' 92 | if not os.path.exists(slurmout_dir): 93 | os.makedirs(slurmout_dir) 94 | # some slurm settings initialization due to the specific Delft slurm cluster 95 | if hps_setup.distributed: 96 | ntasks = hps_setup.n_tasks 97 | nodes = '1-3' 98 | n_cpu = hps_setup.cpu_per_task 99 | mem_per_cpu = int((16384/(ntasks*n_cpu)) - 5) 100 | else: 101 | ntasks = 1 102 | nodes = 1 103 | n_cpu = hps_setup.cpu_per_task 104 | mem_per_cpu = hps_setup.mem_per_cpu 105 | 106 | for rep in range(hps_setup.n_rep): 107 | hps.rep = rep 108 | for it1,item1 in enumerate(hps_setup.seq1): 109 | if hps_setup.item1 is not None: 110 | hps._set(hps_setup.item1,item1) 111 | for it2,item2 in enumerate(hps_setup.seq2): 112 | if hps_setup.item2 is not None: 113 | hps._set(hps_setup.item2,item2) 114 | for it3,item3 in enumerate(hps_setup.seq3): 115 | if hps_setup.item3 is not None: 116 | hps._set(hps_setup.item3,item3) 117 | hyperloop_name = make_name('',hps_setup.item1,item1,hps_setup.item2,item2, 118 | hps_setup.item3,item3) 119 | # if loop_hyper: 120 | result_folder = subfolder + hyperloop_name 121 | hps.result_dir = result_folder + 'rep:{}'.format(rep) 122 | 123 | hps 
= override_hps_settings(hps) # maybe some hps_setup parameter overrides a number of hps parameters 124 | 125 | # Submit slurm job or launch agent in this process 126 | if hps_setup.slurm: 127 | submit_slurm(hps,hps_setup,hyperloop_name,job_dir,slurmout_dir,ntasks,nodes,n_cpu,mem_per_cpu) 128 | else: 129 | print(' ________________________________________ ') 130 | print('Start learning on game {} with hyperparams {}'.format(hps.game,hyperloop_name)) 131 | curve = agent(hps) 132 | results[it1][it2][it3][rep] = curve 133 | 134 | if not no_plot: 135 | plot_hyperloop_results(results,hps_setup,subfolder,plot_type='mean',sd=True) 136 | 137 | if __name__ == "__main__": 138 | '''Set-up training''' 139 | parser = argparse.ArgumentParser() 140 | parser.add_argument('--hp', help='Hyperparameter configuration',default='') 141 | parser.add_argument('--hpsetup', help='Hyperparameter configuration of slurm and hyperparameters and distribution',default='') 142 | parser.add_argument('--no_plot', action='store_true',default=False) 143 | args = parser.parse_args() -------------------------------------------------------------------------------- /src/rl/envs/grid.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Grid-world environment 4 | @author: thomas 5 | """ 6 | 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | import matplotlib.patches as patches 10 | import random 11 | 12 | class Grid(object): 13 | ''' Grid world with stochastic ghosts ''' 14 | 15 | def __init__(self,to_plot=False,grid=False): 16 | world = np.zeros([7,7],dtype='int32') 17 | world[1:6,1] = 1 18 | world[1:3,4] = 1 19 | world[4:6,4] = 1 20 | self.world = world 21 | self.grid = grid 22 | self.reset() 23 | self.observation_shape = np.shape(self.get_state())[0] 24 | 25 | if to_plot: 26 | plt.ion() 27 | fig = plt.figure() 28 | ax1 = fig.add_subplot(111,aspect='equal') 29 | ax1.axis('off') 30 | plt.xlim([-1,8]) 31 | plt.ylim([-1,8]) 32 | 33 | #colors = matplotlib.colors.ListerColormap() 34 | for i in range(7): 35 | for j in range(7): 36 | if world[i,j]==1: 37 | col = "black" 38 | else: 39 | col = "white" 40 | ax1.add_patch( 41 | patches.Rectangle( 42 | (i,j),1,1, 43 | #fill=False, 44 | edgecolor='black', 45 | linewidth = 2, 46 | facecolor = col,), 47 | ) 48 | if np.all([i,j] == self.ghost1): 49 | self.g1 = ax1.add_artist(plt.Circle((i+0.5,j+0.5),0.3,color='red')) 50 | if np.all([i,j] == self.ghost2): 51 | self.g2 = ax1.add_artist(plt.Circle((i+0.5,j+0.5),0.3,color='blue')) 52 | if np.all([i,j] == self.pacman): 53 | self.p = ax1.add_artist(plt.Circle((i+0.5,j+0.5),0.3,color='yellow')) 54 | self.fig = fig 55 | self.ax1 = ax1 56 | self.fig.canvas.draw() 57 | 58 | def reset(self): 59 | self.pacman = np.array([0,0]) 60 | self.ghost1 = np.array([1,3]) 61 | self.ghost2 = np.array([5,3]) 62 | return self.get_state() 63 | 64 | def set_state(self,state): 65 | self.pacman = np.array(state[0:2]) 66 | self.ghost1 = np.array(state[2:4]) 67 | self.ghost2 = np.array(state[4:6]) 68 | 69 | def step(self,a): 70 | # move pacman 71 | self._move(self.pacman,a) 72 | 73 | # check collision 74 | dead = self._check_dead() 75 | if dead: 76 | r = -1 77 | return self.get_state(),r,dead 78 | 79 | # move ghosts 80 | wall = True 81 | while wall: 82 | a1 = random.sample(range(4),1) # random ghost 83 | wall = self._move(self.ghost1,a1) 84 | 85 | # move ghosts 86 | wall = True 87 | while wall: 88 | a2 = np.where(np.random.multinomial(1,[0.1,0.1,0.4,0.4]))[0] # probabilistic ghost 89 | wall = 
self._move(self.ghost2,a2) 90 | 91 | # check collision again 92 | dead = self._check_dead() 93 | if dead: 94 | r = -1 95 | else: 96 | if np.all(self.pacman == np.array([6,6])): 97 | r = 10 98 | dead = True 99 | #print('Reached the goal') 100 | else: 101 | r = 0 102 | return self.get_state(),r,dead 103 | 104 | def get_state(self): 105 | if not self.grid: 106 | state = np.concatenate((self.pacman,self.ghost1,self.ghost2)) 107 | else: 108 | state = np.copy(self.world) 109 | state = np.stack(state,np.zeros(7,7),np.zeros(7,7),np.zeros(7,7),axis=2) 110 | state[self.pacman[0],self.pacman[1],1] = 1 111 | state[self.ghost1[0],self.ghost1[1],2] = 1 112 | state[self.ghost2[0],self.ghost2[1],3] = 1 113 | return state 114 | 115 | def plot(self): 116 | self.g1.remove() 117 | self.g2.remove() 118 | self.p.remove() 119 | 120 | # replot 121 | self.g1 = self.ax1.add_artist(plt.Circle(self.ghost1+0.5,0.3,color='red')) 122 | self.g2 = self.ax1.add_artist(plt.Circle(self.ghost2+0.5,0.3,color='blue')) 123 | self.p = self.ax1.add_artist(plt.Circle(self.pacman +0.5,0.3,color='yellow')) 124 | self.fig.canvas.draw() 125 | 126 | def plot_predictions(self,world): 127 | for i in range(7): 128 | for j in range(7): 129 | for k in range(3): 130 | if k==1: 131 | col = "yellow" 132 | elif k == 2: 133 | col = "red" 134 | elif k == 3: 135 | col = 'blue' 136 | if world[i,j,k]>0.0: 137 | self.ax1.add_patch(patches.Rectangle( 138 | (i,j),1,1, 139 | #fill=False, 140 | edgecolor='black', 141 | linewidth = 2, 142 | facecolor = col, 143 | alpha=world[i,j,k]), 144 | ) 145 | 146 | def _move(self,s,a): 147 | s_old = np.copy(s) 148 | 149 | # move 150 | if int(a[0]) == 0: #up 151 | s[1] +=1 152 | elif int(a[0]) == 1: #down 153 | s[1] -=1 154 | elif int(a[0])== 2: #right 155 | s[0] +=1 156 | elif int(a[0])==3: #left 157 | s[0] -=1 158 | else: 159 | raise ValueError('move not possible') 160 | 161 | # check if move is possible 162 | if s[0]<0 or s[0]>6 or s[1]<0 or s[1]>6: # out of grid 163 | wall = True 164 | elif np.all(self.world[s[0],s[1]] == 1): # wall 165 | wall = True 166 | else: 167 | wall = False 168 | 169 | if wall: 170 | # Need to repeat, put back old values 171 | s[0] = s_old[0] 172 | s[1] = s_old[1] 173 | return wall 174 | else: 175 | # Move to new state 176 | return wall 177 | 178 | def _check_dead(self): 179 | if np.all(self.pacman == self.ghost1) or np.all(self.pacman == self.ghost2): 180 | return True 181 | else: 182 | return False 183 | 184 | 185 | # Test 186 | if __name__ == '__main__': 187 | grid = grid_env(True) 188 | s = grid.get_state() 189 | for i in range(200): 190 | a = random.sample(range(4),1) 191 | s,r,dead = grid.step(a) 192 | if not dead: 193 | grid.plot() 194 | else: 195 | print('Died in step',i,', restarting') 196 | s = grid.reset() 197 | print(grid.get_state()) 198 | print('Finished') 199 | plt.show(block=True) 200 | -------------------------------------------------------------------------------- /src/common/rl/envs/grid.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Grid-world environment 4 | @author: thomas 5 | """ 6 | 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | import matplotlib.patches as patches 10 | import random 11 | 12 | class Grid(object): 13 | ''' Grid world with stochastic ghosts ''' 14 | 15 | def __init__(self,to_plot=False,grid=False): 16 | world = np.zeros([7,7],dtype='int32') 17 | world[1:6,1] = 1 18 | world[1:3,4] = 1 19 | world[4:6,4] = 1 20 | self.world = world 21 | self.grid = grid 22 | 
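# Descriptive note (added commentary, not part of the original source): this Grid
# world is a 7x7 maze with two interior wall segments. Pacman starts at (0,0) and
# receives +10 for reaching (6,6), -1 for colliding with either ghost (which also
# ends the episode), and 0 otherwise. ghost1 moves uniformly at random, while
# ghost2 samples its move from a multinomial with probabilities [0.1, 0.1, 0.4, 0.4]
# over up/down/right/left, i.e. it is biased toward horizontal movement. With
# grid=False the observation is the length-6 vector of (pacman, ghost1, ghost2)
# coordinates; the grid=True branch is meant to return stacked 7x7 planes, but
# np.stack is called with separate array arguments and np.zeros(7,7) instead of a
# sequence and np.zeros((7,7)), so that branch would need e.g.
# np.stack([state, np.zeros((7,7)), np.zeros((7,7)), np.zeros((7,7))], axis=2) to run.
# The __main__ test at the bottom also instantiates grid_env(True), which appears
# to be a stale name for this Grid class.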
self.reset() 23 | self.observation_shape = np.shape(self.get_state())[0] 24 | 25 | if to_plot: 26 | plt.ion() 27 | fig = plt.figure() 28 | ax1 = fig.add_subplot(111,aspect='equal') 29 | ax1.axis('off') 30 | plt.xlim([-1,8]) 31 | plt.ylim([-1,8]) 32 | 33 | #colors = matplotlib.colors.ListerColormap() 34 | for i in range(7): 35 | for j in range(7): 36 | if world[i,j]==1: 37 | col = "black" 38 | else: 39 | col = "white" 40 | ax1.add_patch( 41 | patches.Rectangle( 42 | (i,j),1,1, 43 | #fill=False, 44 | edgecolor='black', 45 | linewidth = 2, 46 | facecolor = col,), 47 | ) 48 | if np.all([i,j] == self.ghost1): 49 | self.g1 = ax1.add_artist(plt.Circle((i+0.5,j+0.5),0.3,color='red')) 50 | if np.all([i,j] == self.ghost2): 51 | self.g2 = ax1.add_artist(plt.Circle((i+0.5,j+0.5),0.3,color='blue')) 52 | if np.all([i,j] == self.pacman): 53 | self.p = ax1.add_artist(plt.Circle((i+0.5,j+0.5),0.3,color='yellow')) 54 | self.fig = fig 55 | self.ax1 = ax1 56 | self.fig.canvas.draw() 57 | 58 | def reset(self): 59 | self.pacman = np.array([0,0]) 60 | self.ghost1 = np.array([1,3]) 61 | self.ghost2 = np.array([5,3]) 62 | return self.get_state() 63 | 64 | def set_state(self,state): 65 | self.pacman = np.array(state[0:2]) 66 | self.ghost1 = np.array(state[2:4]) 67 | self.ghost2 = np.array(state[4:6]) 68 | 69 | def step(self,a): 70 | # move pacman 71 | self._move(self.pacman,a) 72 | 73 | # check collision 74 | dead = self._check_dead() 75 | if dead: 76 | r = -1 77 | return self.get_state(),r,dead 78 | 79 | # move ghosts 80 | wall = True 81 | while wall: 82 | a1 = random.sample(range(4),1) # random ghost 83 | wall = self._move(self.ghost1,a1) 84 | 85 | # move ghosts 86 | wall = True 87 | while wall: 88 | a2 = np.where(np.random.multinomial(1,[0.1,0.1,0.4,0.4]))[0] # probabilistic ghost 89 | wall = self._move(self.ghost2,a2) 90 | 91 | # check collision again 92 | dead = self._check_dead() 93 | if dead: 94 | r = -1 95 | else: 96 | if np.all(self.pacman == np.array([6,6])): 97 | r = 10 98 | dead = True 99 | #print('Reached the goal') 100 | else: 101 | r = 0 102 | return self.get_state(),r,dead 103 | 104 | def get_state(self): 105 | if not self.grid: 106 | state = np.concatenate((self.pacman,self.ghost1,self.ghost2)) 107 | else: 108 | state = np.copy(self.world) 109 | state = np.stack(state,np.zeros(7,7),np.zeros(7,7),np.zeros(7,7),axis=2) 110 | state[self.pacman[0],self.pacman[1],1] = 1 111 | state[self.ghost1[0],self.ghost1[1],2] = 1 112 | state[self.ghost2[0],self.ghost2[1],3] = 1 113 | return state 114 | 115 | def plot(self): 116 | self.g1.remove() 117 | self.g2.remove() 118 | self.p.remove() 119 | 120 | # replot 121 | self.g1 = self.ax1.add_artist(plt.Circle(self.ghost1+0.5,0.3,color='red')) 122 | self.g2 = self.ax1.add_artist(plt.Circle(self.ghost2+0.5,0.3,color='blue')) 123 | self.p = self.ax1.add_artist(plt.Circle(self.pacman +0.5,0.3,color='yellow')) 124 | self.fig.canvas.draw() 125 | 126 | def plot_predictions(self,world): 127 | for i in range(7): 128 | for j in range(7): 129 | for k in range(3): 130 | if k==1: 131 | col = "yellow" 132 | elif k == 2: 133 | col = "red" 134 | elif k == 3: 135 | col = 'blue' 136 | if world[i,j,k]>0.0: 137 | self.ax1.add_patch(patches.Rectangle( 138 | (i,j),1,1, 139 | #fill=False, 140 | edgecolor='black', 141 | linewidth = 2, 142 | facecolor = col, 143 | alpha=world[i,j,k]), 144 | ) 145 | 146 | def _move(self,s,a): 147 | s_old = np.copy(s) 148 | 149 | # move 150 | if int(a[0]) == 0: #up 151 | s[1] +=1 152 | elif int(a[0]) == 1: #down 153 | s[1] -=1 154 | elif int(a[0])== 2: #right 
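# Added commentary: actions are encoded as 0=up, 1=down, 2=right, 3=left and arrive
# as a one-element list/array (random.sample and np.where both return sequences),
# hence the int(a[0]) indexing. _move mutates s in place and returns True when the
# proposed move hits a wall or leaves the 7x7 grid, which is why the ghost-moving
# loops in step() resample until a legal move is found.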
155 | s[0] +=1 156 | elif int(a[0])==3: #left 157 | s[0] -=1 158 | else: 159 | raise ValueError('move not possible') 160 | 161 | # check if move is possible 162 | if s[0]<0 or s[0]>6 or s[1]<0 or s[1]>6: # out of grid 163 | wall = True 164 | elif np.all(self.world[s[0],s[1]] == 1): # wall 165 | wall = True 166 | else: 167 | wall = False 168 | 169 | if wall: 170 | # Need to repeat, put back old values 171 | s[0] = s_old[0] 172 | s[1] = s_old[1] 173 | return wall 174 | else: 175 | # Move to new state 176 | return wall 177 | 178 | def _check_dead(self): 179 | if np.all(self.pacman == self.ghost1) or np.all(self.pacman == self.ghost2): 180 | return True 181 | else: 182 | return False 183 | 184 | 185 | # Test 186 | if __name__ == '__main__': 187 | grid = grid_env(True) 188 | s = grid.get_state() 189 | for i in range(200): 190 | a = random.sample(range(4),1) 191 | s,r,dead = grid.step(a) 192 | if not dead: 193 | grid.plot() 194 | else: 195 | print('Died in step',i,', restarting') 196 | s = grid.reset() 197 | print(grid.get_state()) 198 | print('Finished') 199 | plt.show(block=True) 200 | -------------------------------------------------------------------------------- /agent.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Chain experiments 4 | @author: thomas 5 | """ 6 | 7 | if __name__ == '__main__' and __package__ is None: 8 | from os import sys, path 9 | sys.path.append(path.dirname(path.dirname(path.abspath(__file__)))) 10 | 11 | global mpl 12 | import matplotlib as mpl 13 | mpl.use('Agg') 14 | 15 | import numpy as np 16 | import os 17 | import time 18 | from tensorflow.python import debug as tf_debug 19 | import tensorflow as tf 20 | import argparse 21 | from pprint import pformat 22 | #from pdb import set_trace 23 | 24 | # common package import 25 | from src.common.rl.make_game import make_game 26 | from src.common.submit import make_unique_subfolder 27 | from src.common.hps_setup import hps_to_dict 28 | from src.common.visualize import plot_single_experiment 29 | from src.common.putils import store_safely 30 | 31 | # local imports 32 | from config.hps import get_hps,override_hps_settings 33 | from src.mcts import MCTS,display_info 34 | from src.network import Model,Database 35 | 36 | def agent(hps): 37 | ''' Agent function ''' 38 | tf.reset_default_graph() 39 | 40 | # storage 41 | result = {} 42 | env_steps,ep_return = [],[] # will indicate the timestep for the learning curve 43 | losses,gn = [],[] 44 | best_R = -np.Inf 45 | 46 | Env = make_game(hps.game) 47 | D = Database(max_size=max(hps.data_size,hps.n_mcts*hps.steps_per_ep),batch_size=hps.batch_size) 48 | model = Model(Env,lr=hps.lr,n_mix=hps.n_mix,clip_gradient_norm=hps.clip_gradient_norm,loss_type=hps.loss_type, 49 | bound=hps.bound,temp=hps.temp,entropy_l=hps.entropy_l) 50 | 51 | #with tf.Session() as sess,sess.as_default(): 52 | with tf.Session() as sess: 53 | if hps.tfdb: 54 | sess = tf_debug.LocalCLIDebugWrapperSession(sess) 55 | model.sess = sess 56 | sess.run(tf.global_variables_initializer()) 57 | global_t_mcts = 0 58 | global_t = 0 59 | 60 | for ep in range(hps.n_eps): 61 | start = time.time() 62 | root_index = Env.reset() 63 | root = None 64 | R = 0.0 # episode reward 65 | t = 0 # episode steps 66 | seed = np.random.randint(1e7) 67 | Env.seed(seed) 68 | a_store = [] 69 | 70 | while True: 71 | # run an episode 72 | if hps.timeit: now = time.time() 73 | root = MCTS(root_index,root,Env,N=hps.n_mcts,model=model,c=hps.c,bootstrap_V=hps.bootstrap_V, 74 | 
block_loop=hps.block_loop,sigma_tree=hps.sigma_tree,backup_Q=hps.backup_Q, 75 | backup_sigma_tree=hps.backup_sigma_tree,seed=seed,a_his=a_store, 76 | alpha=hps.alpha,C_widening=hps.C_widening,use_prior=hps.use_prior,timeit=hps.timeit, 77 | random_action_frac=hps.random_action_frac) 78 | if hps.timeit: print('One MCTS search takes {} seconds'.format(time.time()-now)) 79 | if hps.verbose_mcts: display_info(root,'{}'.format(t),hps.c) 80 | 81 | probs,a_list,V,a,a_argmax = root.return_results(decision_type=hps.decision_type,loss_type=hps.loss_type, 82 | temperature=hps.temp,V_decision=hps.V_decision) 83 | for k,prob in enumerate(probs): 84 | D.store((root.index,V,a_list[k],np.array([prob]))) 85 | #if count == 0: 86 | # print('Warning',[child_action.n for child_action in root.child_actions],display_info(root,'{}'.format(t),hps.c)) 87 | 88 | # Make the step 89 | a_store.append(a) 90 | s1,r,terminal,_ = Env.step(a) 91 | R += r 92 | t += 1 93 | global_t += 1 94 | global_t_mcts += hps.n_mcts 95 | 96 | #if hps.verbose: 97 | # if (t % 50) == 0: 98 | # print('Overall step {}, root currently returns V {}, and considers a {} with counts {}'.format(global_t,V,a_list,probs)) 99 | 100 | if terminal or (t > hps.steps_per_ep): 101 | if hps.verbose: 102 | print('Episode terminal, total reward {}, steps {}'.format(R,t)) 103 | ep_return.append(R) 104 | env_steps.append(global_t_mcts) 105 | break # break out, start new episode 106 | else: 107 | root = root.forward(a_argmax,s1,r,terminal,model) 108 | 109 | # saving 110 | result.update({'steps':env_steps,'return':ep_return}) 111 | if hps.verbose: 112 | result.update({'gn':gn,'loss':losses}) 113 | #if R > best_R: 114 | # result.update({'seed':seed,'actions':a_store,'R':best_R}) 115 | # best_R = R 116 | store_safely(hps.result_dir,'result',result) 117 | 118 | # Train 119 | if (global_t_mcts > hps.n_t) or (ep > hps.n_eps): 120 | break # end learning 121 | else: 122 | n_epochs = hps.n_epochs * (np.ceil(hps.n_mcts/20)).astype(int) 123 | #print(n_epochs) 124 | loss = model.train(D,n_epochs,hps.lr) 125 | losses.append(loss['total_loss']) 126 | gn.append(loss['gn']) 127 | 128 | if hps.verbose: 129 | print('Time {}, Episode {}, Return {}, V {}, gn {}, Vloss {}, piloss {}'.format( 130 | global_t_mcts,ep,R,loss['V'],loss['gn'],loss['V_loss'],loss['pi_loss'])) 131 | print('Actions {}, probs {}'.format(np.array(a_list),probs)) 132 | print('One full episode loop + training in {} seconds'.format(time.time()-start)) 133 | 134 | return result 135 | 136 | if __name__ == '__main__': 137 | '''Set-up training''' 138 | parser = argparse.ArgumentParser() 139 | parser.add_argument('--hp', help='Hyperparameter configuration',default='') 140 | parser.add_argument('--no_plot', action='store_true',default=False) 141 | args = parser.parse_args() 142 | hps = get_hps().parse(args.hp) 143 | hps = override_hps_settings(hps) 144 | 145 | # set-up result folder if not prespecified, then we are not hyperlooping 146 | if hps.result_dir == '': 147 | result_folder = os.getcwd() + '/results/{}/{}/'.format(hps.name,hps.game) 148 | hps.result_dir = make_unique_subfolder(result_folder,hyperloop=False) 149 | with open(hps.result_dir + 'hps.txt','w') as file: 150 | file.write(pformat(hps_to_dict(hps))) 151 | 152 | #with open(subfolder + 'hps_raw.txt','w') as file: 153 | # file.write(hps_to_list(hps)) 154 | print(' ________________________________________ ') 155 | print('Start learning on game {}'.format(hps.game)) 156 | result = agent(hps) 157 | 158 | if not args.no_plot: 159 | 
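# Added commentary on the control flow above: agent() runs one MCTS search per
# environment step, stores (state, value, action, visit-probability) tuples in the
# Database D, and steps the real environment with the action returned by
# root.return_results(). After each episode the network is trained for n_epochs
# (scaled with ceil(n_mcts/20)), and the episode return is logged against
# global_t_mcts, the cumulative number of MCTS simulations, which is presumably
# what the learning-curve plot below uses as its x-axis.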
plot_single_experiment(result,hps.game,hps.result_dir,plot_type='lc') -------------------------------------------------------------------------------- /jobs/job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.1,loss_type=Q,c=0.05,name=lr:0.1-loss_type:Q-c:0.05 3 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.1,loss_type=Q,c=0.25,name=lr:0.1-loss_type:Q-c:0.25 4 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.1,loss_type=Q,c=1.0,name=lr:0.1-loss_type:Q-c:1.0 5 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.1,loss_type=count,c=0.05,name=lr:0.1-loss_type:count-c:0.05 6 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.1,loss_type=count,c=0.25,name=lr:0.1-loss_type:count-c:0.25 7 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.1,loss_type=count,c=1.0,name=lr:0.1-loss_type:count-c:1.0 8 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.01,loss_type=Q,c=0.05,name=lr:0.01-loss_type:Q-c:0.05 9 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.01,loss_type=Q,c=0.25,name=lr:0.01-loss_type:Q-c:0.25 10 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.01,loss_type=Q,c=1.0,name=lr:0.01-loss_type:Q-c:1.0 11 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.01,loss_type=count,c=0.05,name=lr:0.01-loss_type:count-c:0.05 12 | python3 submit.py --hpsetup 
game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.01,loss_type=count,c=0.25,name=lr:0.01-loss_type:count-c:0.25 13 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.01,loss_type=count,c=1.0,name=lr:0.01-loss_type:count-c:1.0 14 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.001,loss_type=Q,c=0.05,name=lr:0.001-loss_type:Q-c:0.05 15 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.001,loss_type=Q,c=0.25,name=lr:0.001-loss_type:Q-c:0.25 16 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.001,loss_type=Q,c=1.0,name=lr:0.001-loss_type:Q-c:1.0 17 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.001,loss_type=count,c=0.05,name=lr:0.001-loss_type:count-c:0.05 18 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.001,loss_type=count,c=0.25,name=lr:0.001-loss_type:count-c:0.25 19 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.001,loss_type=count,c=1.0,name=lr:0.001-loss_type:count-c:1.0 20 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.0001,loss_type=Q,c=0.05,name=lr:0.0001-loss_type:Q-c:0.05 21 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.0001,loss_type=Q,c=0.25,name=lr:0.0001-loss_type:Q-c:0.25 22 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.0001,loss_type=Q,c=1.0,name=lr:0.0001-loss_type:Q-c:1.0 23 | python3 submit.py --hpsetup 
game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.0001,loss_type=count,c=0.05,name=lr:0.0001-loss_type:count-c:0.05 24 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.0001,loss_type=count,c=0.25,name=lr:0.0001-loss_type:count-c:0.25 25 | python3 submit.py --hpsetup game=Pendulum-v0s,item1=n_mcts,seq1=20+70+250,item2=entropy_l,seq2=0.0+0.05+1.0,item3=temp,seq3=0.1+1.0+10.0,n_rep=5,slurm=True,slurm_qos=short,slurm_time=3:59:59 --hp bound=beta,n_t=10000000,n_eps=500000,V_decision=max,lr=0.0001,loss_type=count,c=1.0,name=lr:0.0001-loss_type:count-c:1.0 26 | -------------------------------------------------------------------------------- /src/rl/policies.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Various policies 4 | @author: thomas 5 | """ 6 | import numpy as np 7 | import logging 8 | logger = logging.getLogger('root') 9 | logger.propagate = False 10 | 11 | def policy(policy,model,hps,seed,eval_on_mean_output=False,eval_on_mean_params=False): 12 | ''' wrapper policy function ''' 13 | pass 14 | 15 | def thompson_policy(s,model,sess,hps,seed,eval_on_mean_output=False,eval_on_mean_params=False): 16 | ''' Thompson sample value function in discrete action space 17 | Input: s - state, Thompson sampling applied across first dimension. 18 | Output: a - picked action ''' 19 | 20 | rep = s.shape[0] 21 | state_seq = np.repeat(s,model.action_dim,axis=0) 22 | action_seq = np.repeat(np.arange(0,model.action_dim)[None,:],rep,axis=0).reshape(-1,1) 23 | rep_action_values = np.zeros([rep*model.action_dim,hps.n_thompson_sample]) 24 | 25 | # sample 26 | for i in range(hps.n_thompson_sample): 27 | action_values = sample_value(sess,model,hps,state_seq,action_seq,seed,eval_on_mean_output,eval_on_mean_params) 28 | rep_action_values[:,i] = np.squeeze(action_values) 29 | 30 | # max 31 | max_action_values = np.max(rep_action_values,axis=1) # max over the repetitions 32 | max_action_values = np.reshape(max_action_values,[rep,model.action_dim]) 33 | #a = np.argmax(max_action_values,axis=1)[:,None] 34 | a = argmax_tiebreaking(max_action_values) 35 | return a 36 | 37 | def egreedy_policy(s,model,sess,hps,e,seed): 38 | ''' e-greedy policy on discrete action-space''' 39 | # setup 40 | #hps.n_thompson_sample = 1 41 | #a_exploit = thompson_policy(s,model,sess,hps,seed,eval_on_mean_output=True,eval_on_mean_params=True) 42 | 43 | rep = s.shape[0] 44 | state_seq = np.repeat(s,model.action_dim,axis=0) 45 | action_seq = np.repeat(np.arange(0,model.action_dim)[None,:],rep,axis=0).reshape(-1,1) 46 | 47 | action_values = get_net_mean(sess,model,state_seq,action_seq,seed,hps.p_dropout,hps.output) 48 | action_values = np.reshape(action_values,[rep,model.action_dim]) 49 | a_exploit = argmax_tiebreaking(action_values) 50 | 51 | a_explore = get_discrete_random_action(model.action_dim,s.shape[0]) 52 | a = np.array([(a1 if np.random.rand()>0.05 else a2) for a1,a2 in zip(a_exploit,a_explore)]) 53 | return a 54 | 55 | def ucb_policy(s,model,sess,hps,seed,eval_on_mean_output=False,eval_on_mean_params=False): 56 | ''' upper confidence bound policy ''' 57 | #p_dropout = 1.0 if 
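# Added commentary (not part of the original file): the exploration policies in
# this module share the same layout -- each state is repeated once per discrete
# action, the network is queried for Q(s,a) over that batch, and the result is
# reshaped to [batch, action_dim] before an argmax with random tie-breaking.
# thompson_policy takes the max over n_thompson_sample sampled value functions,
# egreedy_policy mixes the greedy action with a random one 5% of the time, and
# ucb_policy adds a per-decision multiplier drawn from U(1.7, 2.3) times the
# predicted standard deviation to the mean before the argmax.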
eval_on_mean_params else hps.p_dropout # some unexplainable bug if uncommented 58 | p_dropout = hps.p_dropout 59 | 60 | rep = s.shape[0] 61 | state_seq = np.repeat(s,model.action_dim,axis=0) 62 | action_seq = np.repeat(np.arange(0,model.action_dim)[None,:],rep,axis=0).reshape(-1,1) 63 | 64 | mu = get_net_mean(sess,model,state_seq,action_seq,seed,p_dropout,hps.output) 65 | sds = analytic_sd(sess,model,state_seq,action_seq,seed,p_dropout,hps.output) 66 | #sds2 = sample_sd(40,sess,model,state_seq,action_seq,p_dropout,hps.output) 67 | 68 | ucb_multipliers = np.random.uniform(1.7,2.3,(rep*model.action_dim,1)) 69 | ucb = np.reshape(mu + ucb_multipliers * sds,[-1,model.action_dim]) 70 | #a = np.argmax(ucb,axis=1)[:,None] 71 | a = argmax_tiebreaking(ucb) 72 | return a 73 | 74 | def get_discrete_random_action(n_act,n_sample): 75 | return np.random.randint(0,n_act,n_sample)[:,None] 76 | 77 | def sample_value(sess,model,hps,sb,ab,seed,eval_on_mean_output=False,eval_on_mean_params=False): 78 | ''' Sample values for policy ''' 79 | if eval_on_mean_params: 80 | p_dropout = 1.0 81 | else: 82 | p_dropout = hps.p_dropout 83 | 84 | if eval_on_mean_output: 85 | Qsa = get_net_mean(sess,model,sb,ab,seed,p_dropout,hps.output) 86 | else: 87 | Qsa = sample_net(sess,model,sb,ab,seed,p_dropout,hps.output) 88 | return Qsa 89 | 90 | def sample_net(sess,model,sb,ab,seed,p_dropout,output): 91 | ''' Sample from network output distribution ''' 92 | sample = sess.run(model.sample,feed_dict = {model.x:sb, 93 | model.a:ab, 94 | model.p_dropout: p_dropout, 95 | model.seed:seed}) 96 | if output == 'categorical': 97 | sample = model.transformer.to_value(sample) 98 | return sample 99 | 100 | def get_net_mean(sess,model,sb,ab,seed,p_dropout,output): 101 | ''' Expectation of network output distribution ''' 102 | if not output == 'categorical': 103 | Qsa = sess.run(model.mean,feed_dict = {model.x:sb, 104 | model.a:ab, 105 | model.p_dropout: p_dropout, 106 | model.seed:seed}) 107 | else: 108 | density = sess.run(model.params,feed_dict = {model.x:sb, 109 | model.a:ab, 110 | model.p_dropout: p_dropout, 111 | model.seed:seed}) 112 | Qsa = np.matmul(density,model.transformer.means)[:,None] 113 | return Qsa 114 | 115 | def analytic_sd(sess,model,sb,ab,seed,p_dropout,output): 116 | ''' analytic sd calculation from network parameters ''' 117 | params = get_net_params(sess,model,sb,ab,seed,p_dropout) 118 | if output == 'gaussian': 119 | sd = params[:,1][:,None] 120 | elif output == 'categorical': 121 | # sd = sum_i (x_i-mu) 122 | bin_means = model.transformer.means 123 | mu = np.repeat(np.matmul(params,bin_means)[:,None],params.shape[1],axis=1) 124 | sd = np.sqrt(np.sum(params * np.square(bin_means - mu), axis=1))[:,None] # 125 | elif output == 'mog': 126 | # need to sample 127 | sd = sd_mog(params)[:,None] 128 | #sd = sample_sd(20,sess,model,sb,ab,p_dropout,output) 129 | elif output == 'deterministic': 130 | sd = sample_sd(15,sess,model,sb,ab,p_dropout,output) 131 | return sd 132 | 133 | def sd_mog(params): 134 | ''' Standard deviation of gaussian mixture ''' 135 | n_mix = int(params.shape[1]/3) 136 | p = params[:,:n_mix] 137 | mu = params[:,n_mix:(2*n_mix)] 138 | sd = params[:,(2*n_mix):(3*n_mix)] 139 | return np.sum(p * (np.square(mu) + np.square(sd)),axis=1) - np.square(np.sum(p*mu,axis=1)) 140 | 141 | def sample_sd(n,sess,model,sb,ab,p_dropout,output): 142 | ''' get standard deviation estimates 143 | Crude implementation, based on sampling. 
However, there is no better way 144 | to integrate over the parameter uncertainty ''' 145 | samples = np.zeros([sb.shape[0],n]) 146 | for i in range(n): 147 | seed = [np.random.randint(1e15),np.random.randint(1e15)] # new seed for parametric uncertainty 148 | sample = sample_net(sess,model,sb,ab,seed,p_dropout,output) 149 | samples[:,i] = np.squeeze(sample) 150 | sds = np.std(samples,axis=1)[:,None] 151 | return sds 152 | 153 | def get_net_params(sess,model,sb,ab,seed,p_dropout): 154 | ''' Network parameters ''' 155 | params = sess.run(model.params,feed_dict = {model.x:sb, 156 | model.a:ab, 157 | model.p_dropout: p_dropout, 158 | model.seed:seed}) 159 | return params 160 | 161 | def argmax_tiebreaking(x): 162 | ''' own argmax because numpy.argmax does not break ties ''' 163 | try: 164 | out = np.array([[np.random.choice(np.flatnonzero(a == a.max()))] for a in x]) # sparsely fails due to numerical errors between a and a.max()? 165 | except: 166 | out = np.array([[np.argmax(a)] for a in x]) 167 | return out -------------------------------------------------------------------------------- /src/rl/wrappers/multi_discrete.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import gym 4 | from gym.spaces import prng, Discrete, Box, MultiDiscrete 5 | from gym.error import Error 6 | 7 | # Adapters 8 | 9 | class DiscreteToMultiDiscrete(Discrete): 10 | """ 11 | Adapter that adapts the MultiDiscrete action space to a Discrete action space of any size 12 | The converted action can be retrieved by calling the adapter with the discrete action 13 | discrete_to_multi_discrete = DiscreteToMultiDiscrete(multi_discrete) 14 | discrete_action = discrete_to_multi_discrete.sample() 15 | multi_discrete_action = discrete_to_multi_discrete(discrete_action) 16 | It can be initialized using 3 configurations: 17 | Configuration 1) - DiscreteToMultiDiscrete(multi_discrete) [2nd param is empty] 18 | Would adapt to a Discrete action space of size (1 + nb of discrete in MultiDiscrete) 19 | where 20 | 0 returns NOOP [ 0, 0, 0, ...] 21 | 1 returns max for the first discrete space [max, 0, 0, ...] 22 | 2 returns max for the second discrete space [ 0, max, 0, ...] 23 | etc. 24 | Configuration 2) - DiscreteToMultiDiscrete(multi_discrete, list_of_discrete) [2nd param is a list] 25 | Would adapt to a Discrete action space of size (1 + nb of items in list_of_discrete) 26 | e.g. 27 | if list_of_discrete = [0, 2] 28 | 0 returns NOOP [ 0, 0, 0, ...] 29 | 1 returns max for first discrete in list [max, 0, 0, ...] 30 | 2 returns max for second discrete in list [ 0, 0, max, ...] 31 | etc. 32 | Configuration 3) - DiscreteToMultiDiscrete(multi_discrete, discrete_mapping) [2nd param is a dict] 33 | Would adapt to a Discrete action space of size (nb_keys in discrete_mapping) 34 | where discrete_mapping is a dictionnary in the format { discrete_key: multi_discrete_mapping } 35 | e.g. 
for the Nintendo Game Controller [ [0,4], [0,1], [0,1] ] a possible mapping might be; 36 | mapping = { 37 | 0: [0, 0, 0], # NOOP 38 | 1: [1, 0, 0], # Up 39 | 2: [3, 0, 0], # Down 40 | 3: [2, 0, 0], # Right 41 | 4: [2, 1, 0], # Right + A 42 | 5: [2, 0, 1], # Right + B 43 | 6: [2, 1, 1], # Right + A + B 44 | 7: [4, 0, 0], # Left 45 | 8: [4, 1, 0], # Left + A 46 | 9: [4, 0, 1], # Left + B 47 | 10: [4, 1, 1], # Left + A + B 48 | 11: [0, 1, 0], # A only 49 | 12: [0, 0, 1], # B only, 50 | 13: [0, 1, 1], # A + B 51 | } 52 | """ 53 | def __init__(self, multi_discrete, options=None): 54 | assert isinstance(multi_discrete, MultiDiscrete) 55 | self.multi_discrete = multi_discrete 56 | self.num_discrete_space = self.multi_discrete.num_discrete_space 57 | 58 | # Config 1 59 | if options is None: 60 | self.n = self.num_discrete_space + 1 # +1 for NOOP at beginning 61 | self.mapping = {i: [0] * self.num_discrete_space for i in range(self.n)} 62 | for i in range(self.num_discrete_space): 63 | self.mapping[i + 1][i] = self.multi_discrete.high[i] 64 | 65 | # Config 2 66 | elif isinstance(options, list): 67 | assert len(options) <= self.num_discrete_space 68 | self.n = len(options) + 1 # +1 for NOOP at beginning 69 | self.mapping = {i: [0] * self.num_discrete_space for i in range(self.n)} 70 | for i, disc_num in enumerate(options): 71 | assert disc_num < self.num_discrete_space 72 | self.mapping[i + 1][disc_num] = self.multi_discrete.high[disc_num] 73 | 74 | # Config 3 75 | elif isinstance(options, dict): 76 | self.n = len(options.keys()) 77 | self.mapping = options 78 | for i, key in enumerate(options.keys()): 79 | if i != key: 80 | raise Error('DiscreteToMultiDiscrete must contain ordered keys. ' \ 81 | 'Item {0} should have a key of "{0}", but key "{1}" found instead.'.format(i, key)) 82 | if not self.multi_discrete.contains(options[key]): 83 | raise Error('DiscreteToMultiDiscrete mapping for key {0} is ' \ 84 | 'not contained in the underlying MultiDiscrete action space. ' \ 85 | 'Invalid mapping: {1}'.format(key, options[key])) 86 | # Unknown parameter provided 87 | else: 88 | raise Error('DiscreteToMultiDiscrete - Invalid parameter provided.') 89 | 90 | def __call__(self, discrete_action): 91 | return self.mapping[discrete_action] 92 | 93 | 94 | class BoxToMultiDiscrete(Box): 95 | """ 96 | Adapter that adapts the MultiDiscrete action space to a Box action space 97 | The converted action can be retrieved by calling the adapter with the box action 98 | box_to_multi_discrete = BoxToMultiDiscrete(multi_discrete) 99 | box_action = box_to_multi_discrete.sample() 100 | multi_discrete_action = box_to_multi_discrete(box_action) 101 | It can be initialized using 2 configurations: 102 | Configuration 1) - BoxToMultiDiscrete(multi_discrete) [2nd param is empty] 103 | Would adapt to a Box action space of shape (nb of discrete space, ), with the min-max of 104 | each Discrete space sets as Box boundaries 105 | e.g. a MultiDiscrete with parameters [ [0,4], [0,1], [0,1] ], adapted through BoxToMultiDiscrete(multi_discrete) 106 | would adapt to a Box with parameters low=np.array([0.0, 0.0, 0.0]) high=np.array([4.0, 1.0, 1.0]) 107 | The box action would then be rounded to the nearest integer. 108 | e.g. 
[ 2.560453, 0.3523456, 0.674546 ] would be converted to the multi discrete action of [3, 0, 1] 109 | Configuration 2) - BoxToMultiDiscrete(multi_discrete, list_of_discrete) [2nd param is a list] 110 | Would adapt to a Box action space of shape (nb of items in list_of_discrete, ), where list_of_discrete 111 | is the index of the discrete space in the MultiDiscrete space 112 | e.g. a MultiDiscrete with parameters [ [0,4], [0,1], [0,1] ], adapted through BoxToMultiDiscrete(multi_discrete, [2, 0]) 113 | would adapt to a Box with parameters low=np.array([0.0, 0.0]) high=np.array([1.0, 4.0]) 114 | where 115 | 0.0 = min(discrete space #2), 1.0 = max(discrete space #2) 116 | 0.0 = min(discrete space #0), 4.0 = max(discrete space #0) 117 | The box action would then be rounded to the nearest integer and mapped to the correct discrete space in multi-discrete. 118 | e.g. [ 0.7412057, 3.0174142 ] would be converted to the multi discrete action of [3, 0, 1] 119 | This configuration is useful if you want to ignore certain discrete spaces in the MultiDiscrete space. 120 | """ 121 | def __init__(self, multi_discrete, options=None): 122 | assert isinstance(multi_discrete, MultiDiscrete) 123 | self.multi_discrete = multi_discrete 124 | self.num_discrete_space = self.multi_discrete.num_discrete_space 125 | 126 | if options is None: 127 | options = list(range(self.num_discrete_space)) 128 | 129 | if not isinstance(options, list): 130 | raise Error('BoxToMultiDiscrete - Invalid parameter provided.') 131 | 132 | assert len(options) <= self.num_discrete_space 133 | self.low = np.array([self.multi_discrete.low[x] for x in options]) 134 | self.high = np.array([self.multi_discrete.high[x] for x in options]) 135 | self.mapping = { i: disc_num for i, disc_num in enumerate(options)} 136 | 137 | def __call__(self, box_action): 138 | multi_discrete_action = [0] * self.num_discrete_space 139 | for i in self.mapping: 140 | multi_discrete_action[self.mapping[i]] = int(round(box_action[i], 0)) 141 | return multi_discrete_action 142 | -------------------------------------------------------------------------------- /src/common/rl/wrappers/doom/multi_discrete.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import gym 4 | from gym.spaces import prng, Discrete, Box, MultiDiscrete 5 | from gym.error import Error 6 | 7 | # Adapters 8 | 9 | class DiscreteToMultiDiscrete(Discrete): 10 | """ 11 | Adapter that adapts the MultiDiscrete action space to a Discrete action space of any size 12 | The converted action can be retrieved by calling the adapter with the discrete action 13 | discrete_to_multi_discrete = DiscreteToMultiDiscrete(multi_discrete) 14 | discrete_action = discrete_to_multi_discrete.sample() 15 | multi_discrete_action = discrete_to_multi_discrete(discrete_action) 16 | It can be initialized using 3 configurations: 17 | Configuration 1) - DiscreteToMultiDiscrete(multi_discrete) [2nd param is empty] 18 | Would adapt to a Discrete action space of size (1 + nb of discrete in MultiDiscrete) 19 | where 20 | 0 returns NOOP [ 0, 0, 0, ...] 21 | 1 returns max for the first discrete space [max, 0, 0, ...] 22 | 2 returns max for the second discrete space [ 0, max, 0, ...] 23 | etc. 24 | Configuration 2) - DiscreteToMultiDiscrete(multi_discrete, list_of_discrete) [2nd param is a list] 25 | Would adapt to a Discrete action space of size (1 + nb of items in list_of_discrete) 26 | e.g. 27 | if list_of_discrete = [0, 2] 28 | 0 returns NOOP [ 0, 0, 0, ...] 
29 | 1 returns max for first discrete in list [max, 0, 0, ...] 30 | 2 returns max for second discrete in list [ 0, 0, max, ...] 31 | etc. 32 | Configuration 3) - DiscreteToMultiDiscrete(multi_discrete, discrete_mapping) [2nd param is a dict] 33 | Would adapt to a Discrete action space of size (nb_keys in discrete_mapping) 34 | where discrete_mapping is a dictionnary in the format { discrete_key: multi_discrete_mapping } 35 | e.g. for the Nintendo Game Controller [ [0,4], [0,1], [0,1] ] a possible mapping might be; 36 | mapping = { 37 | 0: [0, 0, 0], # NOOP 38 | 1: [1, 0, 0], # Up 39 | 2: [3, 0, 0], # Down 40 | 3: [2, 0, 0], # Right 41 | 4: [2, 1, 0], # Right + A 42 | 5: [2, 0, 1], # Right + B 43 | 6: [2, 1, 1], # Right + A + B 44 | 7: [4, 0, 0], # Left 45 | 8: [4, 1, 0], # Left + A 46 | 9: [4, 0, 1], # Left + B 47 | 10: [4, 1, 1], # Left + A + B 48 | 11: [0, 1, 0], # A only 49 | 12: [0, 0, 1], # B only, 50 | 13: [0, 1, 1], # A + B 51 | } 52 | """ 53 | def __init__(self, multi_discrete, options=None): 54 | assert isinstance(multi_discrete, MultiDiscrete) 55 | self.multi_discrete = multi_discrete 56 | self.num_discrete_space = self.multi_discrete.num_discrete_space 57 | 58 | # Config 1 59 | if options is None: 60 | self.n = self.num_discrete_space + 1 # +1 for NOOP at beginning 61 | self.mapping = {i: [0] * self.num_discrete_space for i in range(self.n)} 62 | for i in range(self.num_discrete_space): 63 | self.mapping[i + 1][i] = self.multi_discrete.high[i] 64 | 65 | # Config 2 66 | elif isinstance(options, list): 67 | assert len(options) <= self.num_discrete_space 68 | self.n = len(options) + 1 # +1 for NOOP at beginning 69 | self.mapping = {i: [0] * self.num_discrete_space for i in range(self.n)} 70 | for i, disc_num in enumerate(options): 71 | assert disc_num < self.num_discrete_space 72 | self.mapping[i + 1][disc_num] = self.multi_discrete.high[disc_num] 73 | 74 | # Config 3 75 | elif isinstance(options, dict): 76 | self.n = len(options.keys()) 77 | self.mapping = options 78 | for i, key in enumerate(options.keys()): 79 | if i != key: 80 | raise Error('DiscreteToMultiDiscrete must contain ordered keys. ' \ 81 | 'Item {0} should have a key of "{0}", but key "{1}" found instead.'.format(i, key)) 82 | if not self.multi_discrete.contains(options[key]): 83 | raise Error('DiscreteToMultiDiscrete mapping for key {0} is ' \ 84 | 'not contained in the underlying MultiDiscrete action space. ' \ 85 | 'Invalid mapping: {1}'.format(key, options[key])) 86 | # Unknown parameter provided 87 | else: 88 | raise Error('DiscreteToMultiDiscrete - Invalid parameter provided.') 89 | 90 | def __call__(self, discrete_action): 91 | return self.mapping[discrete_action] 92 | 93 | 94 | class BoxToMultiDiscrete(Box): 95 | """ 96 | Adapter that adapts the MultiDiscrete action space to a Box action space 97 | The converted action can be retrieved by calling the adapter with the box action 98 | box_to_multi_discrete = BoxToMultiDiscrete(multi_discrete) 99 | box_action = box_to_multi_discrete.sample() 100 | multi_discrete_action = box_to_multi_discrete(box_action) 101 | It can be initialized using 2 configurations: 102 | Configuration 1) - BoxToMultiDiscrete(multi_discrete) [2nd param is empty] 103 | Would adapt to a Box action space of shape (nb of discrete space, ), with the min-max of 104 | each Discrete space sets as Box boundaries 105 | e.g. 
a MultiDiscrete with parameters [ [0,4], [0,1], [0,1] ], adapted through BoxToMultiDiscrete(multi_discrete) 106 | would adapt to a Box with parameters low=np.array([0.0, 0.0, 0.0]) high=np.array([4.0, 1.0, 1.0]) 107 | The box action would then be rounded to the nearest integer. 108 | e.g. [ 2.560453, 0.3523456, 0.674546 ] would be converted to the multi discrete action of [3, 0, 1] 109 | Configuration 2) - BoxToMultiDiscrete(multi_discrete, list_of_discrete) [2nd param is a list] 110 | Would adapt to a Box action space of shape (nb of items in list_of_discrete, ), where list_of_discrete 111 | is the index of the discrete space in the MultiDiscrete space 112 | e.g. a MultiDiscrete with parameters [ [0,4], [0,1], [0,1] ], adapted through BoxToMultiDiscrete(multi_discrete, [2, 0]) 113 | would adapt to a Box with parameters low=np.array([0.0, 0.0]) high=np.array([1.0, 4.0]) 114 | where 115 | 0.0 = min(discrete space #2), 1.0 = max(discrete space #2) 116 | 0.0 = min(discrete space #0), 4.0 = max(discrete space #0) 117 | The box action would then be rounded to the nearest integer and mapped to the correct discrete space in multi-discrete. 118 | e.g. [ 0.7412057, 3.0174142 ] would be converted to the multi discrete action of [3, 0, 1] 119 | This configuration is useful if you want to ignore certain discrete spaces in the MultiDiscrete space. 120 | """ 121 | def __init__(self, multi_discrete, options=None): 122 | assert isinstance(multi_discrete, MultiDiscrete) 123 | self.multi_discrete = multi_discrete 124 | self.num_discrete_space = self.multi_discrete.num_discrete_space 125 | 126 | if options is None: 127 | options = list(range(self.num_discrete_space)) 128 | 129 | if not isinstance(options, list): 130 | raise Error('BoxToMultiDiscrete - Invalid parameter provided.') 131 | 132 | assert len(options) <= self.num_discrete_space 133 | self.low = np.array([self.multi_discrete.low[x] for x in options]) 134 | self.high = np.array([self.multi_discrete.high[x] for x in options]) 135 | self.mapping = { i: disc_num for i, disc_num in enumerate(options)} 136 | 137 | def __call__(self, box_action): 138 | multi_discrete_action = [0] * self.num_discrete_space 139 | for i in self.mapping: 140 | multi_discrete_action[self.mapping[i]] = int(round(box_action[i], 0)) 141 | return multi_discrete_action 142 | -------------------------------------------------------------------------------- /src/common/rl/wrappers/atari.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Atari Wrappers from https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py 4 | @author: thomas 5 | """ 6 | 7 | import numpy as np 8 | from collections import deque 9 | import gym 10 | from gym import spaces 11 | #import cv2 12 | #cv2.ocl.setUseOpenCL(False) 13 | 14 | class NoopResetEnv(gym.Wrapper): 15 | def __init__(self, env, noop_max=30): 16 | """Sample initial states by taking random number of no-ops on reset. 17 | No-op is assumed to be action 0. 
18 | """ 19 | gym.Wrapper.__init__(self, env) 20 | self.noop_max = noop_max 21 | self.override_num_noops = None 22 | self.noop_action = 0 23 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 24 | 25 | def reset(self, **kwargs): 26 | """ Do no-op action for a number of steps in [1, noop_max].""" 27 | self.env.reset(**kwargs) 28 | if self.override_num_noops is not None: 29 | noops = self.override_num_noops 30 | else: 31 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 32 | assert noops > 0 33 | obs = None 34 | for _ in range(noops): 35 | obs, _, done, _ = self.env.step(self.noop_action) 36 | if done: 37 | obs = self.env.reset(**kwargs) 38 | return obs 39 | 40 | def step(self, ac): 41 | return self.env.step(ac) 42 | 43 | class FireResetEnv(gym.Wrapper): 44 | def __init__(self, env): 45 | """Take action on reset for environments that are fixed until firing.""" 46 | gym.Wrapper.__init__(self, env) 47 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 48 | assert len(env.unwrapped.get_action_meanings()) >= 3 49 | 50 | def reset(self, **kwargs): 51 | self.env.reset(**kwargs) 52 | obs, _, done, _ = self.env.step(1) 53 | if done: 54 | self.env.reset(**kwargs) 55 | obs, _, done, _ = self.env.step(2) 56 | if done: 57 | self.env.reset(**kwargs) 58 | return obs 59 | 60 | def step(self, ac): 61 | return self.env.step(ac) 62 | 63 | class EpisodicLifeEnv(gym.Wrapper): 64 | def __init__(self, env): 65 | """Make end-of-life == end-of-episode, but only reset on true game over. 66 | Done by DeepMind for the DQN and co. since it helps value estimation. 67 | """ 68 | gym.Wrapper.__init__(self, env) 69 | self.lives = 0 70 | self.was_real_done = True 71 | 72 | def step(self, action): 73 | obs, reward, done, info = self.env.step(action) 74 | self.was_real_done = done 75 | # check current lives, make loss of life terminal, 76 | # then update lives to handle bonus lives 77 | lives = self.env.unwrapped.ale.lives() 78 | if lives < self.lives and lives > 0: 79 | # for Qbert sometimes we stay in lives == 0 condtion for a few frames 80 | # so its important to keep lives > 0, so that we only reset once 81 | # the environment advertises done. 82 | done = True 83 | self.lives = lives 84 | return obs, reward, done, info 85 | 86 | def reset(self, **kwargs): 87 | """Reset only when lives are exhausted. 88 | This way all states are still reachable even though lives are episodic, 89 | and the learner need not know about any of this behind-the-scenes. 
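        A real game over triggers a full env.reset(); a mere life loss only advances one no-op step, so play resumes from the current screen.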
90 | """ 91 | if self.was_real_done: 92 | obs = self.env.reset(**kwargs) 93 | else: 94 | # no-op step to advance from terminal/lost life state 95 | obs, _, _, _ = self.env.step(0) 96 | self.lives = self.env.unwrapped.ale.lives() 97 | return obs 98 | 99 | class MaxAndSkipEnv(gym.Wrapper): 100 | def __init__(self, env, skip=4): 101 | """Return only every `skip`-th frame""" 102 | gym.Wrapper.__init__(self, env) 103 | # most recent raw observations (for max pooling across time steps) 104 | self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) 105 | self._skip = skip 106 | 107 | def step(self, action): 108 | """Repeat action, sum reward, and max over last observations.""" 109 | total_reward = 0.0 110 | done = None 111 | for i in range(self._skip): 112 | obs, reward, done, info = self.env.step(action) 113 | if i == self._skip - 2: self._obs_buffer[0] = obs 114 | if i == self._skip - 1: self._obs_buffer[1] = obs 115 | total_reward += reward 116 | if done: 117 | break 118 | # Note that the observation on the done=True frame 119 | # doesn't matter 120 | max_frame = self._obs_buffer.max(axis=0) 121 | 122 | return max_frame, total_reward, done, info 123 | 124 | def reset(self, **kwargs): 125 | return self.env.reset(**kwargs) 126 | 127 | class ClipRewardWrapper(gym.RewardWrapper): 128 | def __init__(self, env): 129 | gym.RewardWrapper.__init__(self, env) 130 | 131 | def reward(self, reward): 132 | """Bin reward to {+1, 0, -1} by its sign.""" 133 | return np.sign(reward) 134 | 135 | #class WarpFrame(gym.ObservationWrapper): 136 | # def __init__(self, env): 137 | # """Warp frames to 84x84 as done in the Nature paper and later work.""" 138 | # gym.ObservationWrapper.__init__(self, env) 139 | # self.width = 84 140 | # self.height = 84 141 | # self.observation_space = spaces.Box(low=0, high=255, 142 | # shape=(self.height, self.width, 1), dtype=np.uint8) 143 | # 144 | # def observation(self, frame): 145 | # frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 146 | # frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA) 147 | # return frame[:, :, None] 148 | 149 | class FrameStack(gym.Wrapper): 150 | def __init__(self, env, k): 151 | """Stack k last frames. 152 | Returns lazy array, which is much more memory efficient. 153 | See Also 154 | -------- 155 | baselines.common.atari_wrappers.LazyFrames 156 | """ 157 | gym.Wrapper.__init__(self, env) 158 | self.k = k 159 | self.frames = deque([], maxlen=k) 160 | shp = env.observation_space.shape 161 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8) 162 | 163 | def reset(self): 164 | ob = self.env.reset() 165 | for _ in range(self.k): 166 | self.frames.append(ob) 167 | return self._get_ob() 168 | 169 | def step(self, action): 170 | ob, reward, done, info = self.env.step(action) 171 | self.frames.append(ob) 172 | return self._get_ob(), reward, done, info 173 | 174 | def _get_ob(self): 175 | assert len(self.frames) == self.k 176 | return LazyFrames(list(self.frames)) 177 | 178 | class ScaledFloatFrame(gym.ObservationWrapper): 179 | def __init__(self, env): 180 | gym.ObservationWrapper.__init__(self, env) 181 | 182 | def observation(self, observation): 183 | # careful! This undoes the memory optimization, use 184 | # with smaller replay buffers only. 
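        # i.e. uint8 frames in [0, 255] become float32 arrays in [0.0, 1.0]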
185 |         return np.array(observation).astype(np.float32) / 255.0
186 | 
187 | class LazyFrames(object):
188 |     def __init__(self, frames):
189 |         """This object ensures that common frames between the observations are only stored once.
190 |         It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
191 |         buffers.
192 |         This object should only be converted to numpy array before being passed to the model.
193 |         You'd not believe how complex the previous solution was."""
194 |         self._frames = frames
195 |         self._out = None
196 | 
197 |     def _force(self):
198 |         if self._out is None:
199 |             self._out = np.concatenate(self._frames, axis=2)
200 |             self._frames = None
201 |         return self._out
202 | 
203 |     def __array__(self, dtype=None):
204 |         out = self._force()
205 |         if dtype is not None:
206 |             out = out.astype(dtype)
207 |         return out
208 | 
209 |     def __len__(self):
210 |         return len(self._force())
211 | 
212 |     def __getitem__(self, i):
213 |         return self._force()[i]
214 | 
215 | def make_atari(env_id):
216 |     env = gym.make(env_id)
217 |     assert 'NoFrameskip' in env.spec.id
218 |     env = NoopResetEnv(env, noop_max=30)
219 |     env = MaxAndSkipEnv(env, skip=4)
220 |     return env
221 | 
222 | def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False):
223 |     """Configure environment for DeepMind-style Atari.
224 |     """
225 |     if episode_life:
226 |         env = EpisodicLifeEnv(env)
227 |     if 'FIRE' in env.unwrapped.get_action_meanings():
228 |         env = FireResetEnv(env)
229 |     # env = WarpFrame(env) removed for now, needs cv2
230 |     if scale:
231 |         env = ScaledFloatFrame(env)
232 |     if clip_rewards:
233 |         env = ClipRewardWrapper(env)
234 |     if frame_stack:
235 |         env = FrameStack(env, 4)
236 |     return env
--------------------------------------------------------------------------------
/src/network.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Neural network specification
5 | @author: thomas
6 | """
7 | 
8 | import tensorflow as tf
9 | import tensorflow.contrib.slim as slim
10 | import numpy as np
11 | import random
12 | from common.rl.make_game import check_space
13 | 
14 | from pdb import set_trace
15 | 
16 | 
17 | class Model():
18 | 
19 |     def __init__(self,Env,lr,n_mix,clip_gradient_norm,loss_type='count',bound='tanh',temp=1.0,entropy_l=0.0):
20 | 
21 |         self.action_dim, self.action_discrete = check_space(Env.action_space)
22 |         self.state_dim, self.state_discrete = check_space(Env.observation_space)
23 | 
24 |         if self.action_discrete:
25 |             raise ValueError('Discrete action space not implemented')
26 |         if len(self.action_dim) > 1:
27 |             raise ValueError('Cant handle multidimensional action spaces')
28 |         else:
29 |             self.action_dim = self.action_dim[0]
30 |             self.scale = Env.action_space.high[0] # assumes a symmetric action space [-scale,scale] for all action_dim
31 | 
32 |         # placeholders
33 |         if not self.state_discrete:
34 |             self.x = x = tf.placeholder("float32", shape=np.append(None,self.state_dim),name='x') # s
35 |         else:
36 |             self.x = x = tf.placeholder("int32", shape=np.append(None,1)) # s
37 |             x = tf.squeeze(tf.one_hot(x,self.state_dim,axis=1),axis=2)
38 | 
39 |         # feedforward
40 |         for i in range(2):
41 |             x = slim.fully_connected(x,128,activation_fn=tf.nn.elu)
42 | 
43 |         # Mixture of Gaussians
44 |         if self.action_discrete:
45 |             raise ValueError('Only works for continuous outputs')
46 |         #print(self.action_dim)
47 |         n_params = n_mix *(2 * self.action_dim)
48 |         z =
slim.fully_connected(x,n_params,activation_fn=None) 49 | if n_mix > 1: 50 | logits = slim.fully_connected(x,n_mix,activation_fn=None) 51 | 52 | # params 53 | #self.sigma_p = sigma_p = tf.Print(sigma_p,[sigma_p],summarize=16) 54 | 55 | # Make distribution 56 | if bound == 'tanh': 57 | self.mu_p = mu_p = z[:,:(self.action_dim*n_mix)] 58 | log_sigma = z[:,(self.action_dim*n_mix):(2*self.action_dim*n_mix)] 59 | self.sigma_p = sigma_p = tf.clip_by_value(tf.nn.softplus(log_sigma),0.001,10000) 60 | if n_mix == 1: 61 | if self.action_dim == 1: 62 | outdist = tf.distributions.Normal(mu_p,sigma_p) 63 | else: 64 | outdist = tf.contrib.distributions.MultivariateNormalDiag(mu_p,sigma_p) 65 | else: 66 | p_dist = tf.distributions.Categorical(logits=logits,validate_args=True,allow_nan_stats=False) 67 | n_dist = [] 68 | for i in range(n_mix): 69 | if self.action_dim == 1: 70 | n_dist.append(tf.distributions.Normal(mu_p[:,i],sigma_p[:,i])) 71 | else: 72 | n_dist.append(tf.contrib.distributions.MultivariateNormalDiag(loc=mu_p[:,(i*self.action_dim):((i+1)*self.action_dim)],scale_diag=sigma_p[:,(i*self.action_dim):((i+1)*self.action_dim)])) 73 | outdist = tf.contrib.distributions.Mixture(cat=p_dist,components=n_dist) 74 | # Wrap distribution 75 | outdist = BoundedDistribution(outdist,scale=self.scale) 76 | elif bound == 'beta': 77 | self.alpha = alpha = z[:,:(self.action_dim*n_mix)] 78 | self.beta = beta = z[:,(self.action_dim*n_mix):(2*self.action_dim*n_mix)] 79 | if n_mix == 1: 80 | outdist = tf.contrib.distributions.BetaWithSoftplusConcentration(alpha,beta) 81 | outdist = BoundedDistributionBeta(outdist,scale=self.scale) 82 | self.entropy = outdist.entropy() 83 | else: 84 | raise ValueError('Beta bounding not implemented for n_mix >1') 85 | else: 86 | raise ValueError('Unknown bounding type: {}'.format(bound)) 87 | 88 | # V loss 89 | self.V_hat = slim.fully_connected(x,1,activation_fn=None) 90 | self.V = tf.placeholder("float32", shape=[None,1],name='V') 91 | self.V_loss = tf.losses.mean_squared_error(labels=self.V,predictions=self.V_hat) 92 | 93 | # pi loss (needs a) 94 | self.a = a = tf.placeholder("float32", shape=np.append(None,self.action_dim),name='a') 95 | self.log_pi_a_s = outdist.log_prob(a) # shape (batch,) 96 | self.pi_hat = outdist.prob(a) # shape (batch,) 97 | if loss_type == 'count': 98 | self.n_a = n_a = tf.placeholder("float32", shape=np.append(None,1),name='n_a') 99 | pi_loss = tf.stop_gradient(self.log_pi_a_s - tf.log(tf.squeeze(n_a,axis=1))) * self.log_pi_a_s 100 | elif loss_type == 'Q': 101 | self.n_a = n_a = tf.placeholder("float32", shape=np.append(None,1),name='Q') 102 | pi_loss = tf.stop_gradient(self.log_pi_a_s - tf.squeeze((n_a*temp) - self.V_hat,axis=1)) * self.log_pi_a_s 103 | self.pi_loss = tf.reduce_mean(pi_loss) 104 | self.sample = outdist.sample() 105 | self.pi_sample = outdist.prob(self.sample) 106 | 107 | # training 108 | self.loss = self.V_loss + self.pi_loss 109 | if bound == 'beta': 110 | self.loss -= tf.reduce_mean(entropy_l * self.entropy) 111 | self.lr = tf.Variable(lr,name="learning_rate",trainable=False) 112 | optimizer = tf.train.RMSPropOptimizer(learning_rate=lr) 113 | var_list = tf.trainable_variables() 114 | grads = tf.gradients(self.loss, var_list) 115 | if clip_gradient_norm > 0.0: 116 | clip_global = tf.Variable(clip_gradient_norm,trainable=False) 117 | grads,self.gradient_norm = tf.clip_by_global_norm(grads, clip_global) 118 | else: 119 | self.gradient_norm = tf.global_norm(grads) 120 | gvs = list(zip(grads, var_list)) 121 | self.train_op = 
optimizer.apply_gradients(gvs)
122 | 
123 |     def train(self,D,n_epochs,lr):
124 |         sess = self.sess
125 |         D.reshuffle()
126 |         gn,VL,piL,V = [],[],[],[]
127 |         for epoch in range(n_epochs):
128 |             for sb,Vb,ab,a_nb in D:
129 |                 _,VL_,piL_,gn_,V_ = sess.run([self.train_op,self.V_loss,self.pi_loss,self.gradient_norm,self.V],
130 |                                              feed_dict={self.x:sb,
131 |                                                         self.V:Vb,
132 |                                                         self.a:ab,
133 |                                                         self.n_a:a_nb,
134 |                                                         self.lr:lr
135 |                                                         })
136 |                 gn.append(gn_)
137 |                 VL.append(VL_)
138 |                 piL.append(piL_)
139 |                 V.append(np.mean(V_))
140 |         if np.isnan(np.mean(gn)) or np.isnan(np.mean(VL)) or np.isnan(np.mean(piL)) or np.isnan(np.mean(V)):
141 |             set_trace()
142 |         t_loss = np.mean(VL)+np.mean(piL)
143 |         return {'V_loss':np.mean(VL),'pi_loss':np.mean(piL),'gn':np.mean(gn),'total_loss':t_loss,'V':np.mean(V)}
144 | 
145 |     def predict_V(self,s):
146 |         sess = self.sess
147 |         return sess.run(self.V_hat,feed_dict={self.x:s})
148 | 
149 |     def predict_pi(self,s,a):
150 |         sess = self.sess
151 |         return sess.run(self.pi_hat,feed_dict={self.x:s,
152 |                                                self.a:a})
153 | 
154 |     def log_prob(self,s,a):
155 |         return self.sess.run([self.log_pi_a_s],feed_dict={self.x:s,
156 |                                                           self.a:a})
157 | 
158 |     def sample_action(self,s):
159 |         sess = self.sess
160 |         mix_list = sess.run(self.p_dist.sample(),feed_dict={self.x:s})
161 |         samples = np.array([sess.run(self.n_dist[mix].sample(),feed_dict={self.x:s}) for mix in mix_list])
162 |         return samples
163 | 
164 |     def sample_action_and_pi(self,s):
165 |         sess = self.sess
166 |         return sess.run([self.sample,self.pi_sample],feed_dict={self.x:s})
167 | 
168 | class BoundedDistribution(object):
169 |     ''' Bounded transformation of arbitrary continuous density with support on real line '''
170 | 
171 |     def __init__(self,dist,scale):
172 |         self.dist = dist
173 |         self.scale = scale
174 | 
175 |     def to_u(self,a):
176 |         return tf.atanh(tf.clip_by_value(a/self.scale,-0.999999,0.999999)) # clip what goes into atanh
177 | 
178 |     def to_a(self,u):
179 |         return self.scale*tf.tanh(u)
180 | 
181 |     def sample(self):
182 |         return self.to_a(self.dist.sample())
183 | 
184 |     def log_prob(self,a):
185 |         u = self.to_u(a)
186 |         return self.dist.log_prob(u) - tf.reduce_sum(tf.log(self.scale*(1-tf.square(
187 |                 tf.clip_by_value(tf.tanh(u),-0.999999,0.999999)))),axis=1) # clip what comes out of tanh and goes into log
188 | 
189 |     def prob(self,a):
190 |         return tf.exp(self.log_prob(a))
191 | 
192 | class BoundedDistributionBeta(object):
193 |     ''' Bounded transformation of Beta distribution '''
194 | 
195 |     def __init__(self,dist,scale):
196 |         self.dist = dist
197 |         self.scale = scale
198 | 
199 |     def to_u(self,a):
200 |         return tf.clip_by_value(((a/self.scale) + 1.0)/2.0,0.00001,0.999999)
201 | 
202 |     def to_a(self,u):
203 |         return self.scale * ((2.0 * u) - 1.0)
204 | 
205 |     def sample(self):
206 |         return self.to_a(self.dist.sample())
207 | 
208 |     def log_prob(self,a):
209 |         u = self.to_u(a)
210 |         shape = a.get_shape().as_list()
211 |         constants = shape[-1]*tf.log(tf.constant(np.array(2.0)*np.squeeze(self.scale),dtype='float32'))
212 |         return tf.reduce_sum(self.dist.log_prob(u),axis=1) - constants
213 | 
214 |     def prob(self,a):
215 |         return tf.exp(self.log_prob(a))
216 | 
217 |     def entropy(self):
218 |         return self.dist.entropy()
219 | 
220 | 
221 | class Database():
222 |     ''' Database '''
223 | 
224 |     def __init__(self,max_size,batch_size):
225 |         self.max_size = max_size
226 |         self.batch_size = batch_size
227 |         self.size = 0
228 |         self.insert_index = 0
229 |         self.experience = []
230 |         self.sample_array = None
231 |         self.sample_index = 0
232 | 
233 | def clear(self): 234 | self.experience = [] 235 | self.insert_index = 0 236 | self.size = 0 237 | 238 | def store(self,experience): 239 | if self.size < self.max_size: 240 | self.experience.append(experience) 241 | self.size +=1 242 | else: 243 | self.experience[self.insert_index] = experience 244 | self.insert_index += 1 245 | if self.insert_index >= self.size: 246 | self.insert_index = 0 247 | 248 | def store_from_array(self,*args): 249 | for i in range(args[0].shape[0]): 250 | entry = [] 251 | for arg in args: 252 | entry.append(arg[i]) 253 | self.store(entry) 254 | 255 | def reshuffle(self): 256 | self.sample_array = np.arange(self.size) 257 | random.shuffle(self.sample_array) 258 | self.sample_index = 0 259 | 260 | def __iter__(self): 261 | return self 262 | 263 | def __next__(self): 264 | if (self.sample_index + self.batch_size > self.size) and (not self.sample_index == 0): 265 | self.reshuffle() # Reset for the next epoch 266 | raise(StopIteration) 267 | 268 | if (self.sample_index + 2*self.batch_size > self.size): 269 | indices = self.sample_array[self.sample_index:] 270 | batch = [self.experience[i] for i in indices] 271 | else: 272 | indices = self.sample_array[self.sample_index:self.sample_index+self.batch_size] 273 | batch = [self.experience[i] for i in indices] 274 | self.sample_index += self.batch_size 275 | 276 | arrays = [] 277 | for i in range(len(batch[0])): 278 | to_add = np.array([entry[i] for entry in batch]) 279 | arrays.append(to_add) 280 | return tuple(arrays) 281 | 282 | next = __next__ -------------------------------------------------------------------------------- /src/rl/envs/chain.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Chain environment 4 | @author: thomas 5 | """ 6 | 7 | import gym.spaces 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | from scipy.stats import norm 11 | from rl.policies import get_net_mean, get_net_params, sample_sd, analytic_sd, thompson_policy, ucb_policy 12 | import matplotlib.patches as patches 13 | 14 | #plt.style.use('ggplot') 15 | plt.rcParams['lines.linewidth'] = 4 16 | plt.rcParams.update({'font.size': 11}) 17 | plt.rcParams['axes.facecolor']='white' 18 | plt.rcParams['savefig.facecolor']='white' 19 | plt.rcParams['font.family'] = 'sans-serif' 20 | plt.rcParams['font.sans-serif'] = ['Latin Modern Math'] 21 | plt.rcParams['xtick.labelsize'] = 15 22 | plt.rcParams['font.weight'] = 'bold' 23 | plt.rcParams['ytick.labelsize'] = 15 24 | plt.locator_params(axis='x', nticks=3) 25 | plt.ion() 26 | 27 | class ChainOrdered(object): 28 | ''' Chain domain ''' 29 | 30 | def __init__(self,n=10): 31 | # n = length of chain 32 | self.action_space = gym.spaces.Discrete(2) 33 | self.observation_space = gym.spaces.Discrete(n+1) 34 | self.n = n 35 | self.state = 0 36 | self.correct = np.repeat(1,n) 37 | 38 | def reset(self): 39 | self.state = 0 40 | return self.state 41 | 42 | def step(self,a): 43 | if a == 0: 44 | # move back 45 | self.state = 0 46 | r = 0 47 | terminal = True 48 | elif a == 1: 49 | # move forward 50 | self.state += 1 51 | if self.state == self.n: 52 | r = 1 53 | terminal = True 54 | else: 55 | r = 0 56 | terminal = False 57 | else: 58 | raise ValueError('Action not possible') 59 | 60 | return self.state,r,terminal, {} 61 | 62 | def seed(self,seed): 63 | pass # deterministic anyway 64 | 65 | class Chain(object): 66 | ''' Chain domain ''' 67 | 68 | def __init__(self,n=10): 69 | # n = length of chain 70 | self.action_space = 
gym.spaces.Discrete(2) 71 | self.observation_space = gym.spaces.Discrete(n+1) 72 | self.n = n 73 | self.state = 0 74 | self.correct = np.random.randint(0,2,n) # correct action in each state 75 | self.counts = np.zeros((self.n,2)) 76 | 77 | def reset(self): 78 | self.state = 0 79 | return self.state 80 | 81 | def step(self,a): 82 | self.counts[self.state,a] += 1 83 | if a != self.correct[self.state]: 84 | # move back 85 | self.state = 0 86 | r = 0 87 | terminal = True 88 | elif a == self.correct[self.state]: 89 | # move forward 90 | self.state += 1 91 | if self.state == self.n: 92 | r = 1 93 | terminal = True 94 | else: 95 | r = 0 96 | terminal = False 97 | else: 98 | raise ValueError('Action not possible') 99 | 100 | return self.state,r,terminal, {} 101 | 102 | def seed(self,seed): 103 | pass # deterministic anyway 104 | 105 | 106 | class ChainLoop(object): 107 | ''' Chain domain ''' 108 | 109 | def __init__(self,n=10): 110 | # n = length of chain 111 | self.action_space = gym.spaces.Discrete(2) 112 | self.observation_space = gym.spaces.Discrete(n+1) 113 | self.n = n 114 | self.state = 0 115 | self.correct = np.random.randint(0,2,n) # correct action in each state 116 | self.counts = np.zeros((self.n,2)) 117 | 118 | def reset(self): 119 | self.state = 0 120 | return self.state 121 | 122 | def step(self,a): 123 | self.counts[self.state,a] += 1 124 | if a != self.correct[self.state]: 125 | # move back 126 | self.state = 0 127 | r = 0 128 | terminal = False 129 | elif a == self.correct[self.state]: 130 | # move forward 131 | self.state += 1 132 | if self.state == self.n: 133 | r = 1 134 | terminal = True 135 | else: 136 | r = 0 137 | terminal = False 138 | else: 139 | raise ValueError('Action not possible') 140 | 141 | return self.state,r,terminal, {} 142 | 143 | def seed(self,seed): 144 | pass # deterministic anyway 145 | 146 | class ChainDomainPlotter(object): 147 | 148 | def __init__(self,Env): 149 | self.fig,self.ax = plt.subplots(1,figsize=(Env.n*2,4)) 150 | self.n = Env.n 151 | self.truth = Env.correct 152 | 153 | for i in range(self.n): 154 | for j in range(2): 155 | if self.truth[i]==j: 156 | col = 'g' 157 | else: 158 | col = 'r' 159 | self.ax.add_patch(patches.Circle((i,j), radius=0.05,color=col)) 160 | 161 | self.ax.set_xlim([-1,self.n+1]) 162 | self.ax.set_ylim([-1,2]) 163 | self.fig.canvas.draw() 164 | 165 | def update(self,counts): 166 | self.ax.clear() 167 | for i in range(self.n): 168 | for j in range(2): 169 | if self.truth[i]==j: 170 | col = 'g' 171 | else: 172 | col = 'r' 173 | self.ax.add_patch(patches.Circle((i,j), radius=0.05,color=col)) 174 | self.ax.text(i-0.2,j-0.2,'s = {}, a={}\n N = {}'.format(i,j,int(counts[i,j]))) 175 | 176 | self.fig.canvas.draw() 177 | 178 | class ChainPlotter(object): 179 | 180 | def __init__(self,truth,n_plot): 181 | self.fig,self.ax = plt.subplots(2,n_plot,figsize=(n_plot*10,4),sharex=True,sharey=True) 182 | self.pl = self.ax.flatten('F') 183 | self.n = 2*n_plot 184 | 185 | # setup for predictions 186 | self.sb = np.repeat(np.arange(0,n_plot,1),2)[:,None] 187 | self.ab = np.array([0,1]*n_plot)[:,None] 188 | self.truth = truth 189 | self.fig.canvas.draw() 190 | 191 | def update(self,sess,model,hps,ep): 192 | # clear plots 193 | for ax in self.pl: 194 | ax.clear() 195 | overall_means = np.zeros([hps.n_rep_visualize,self.n]) 196 | overall_max_dens = np.ones([self.n])*-np.inf 197 | for k in range(hps.n_rep_visualize): 198 | # get prediction parameters 199 | seed = [np.random.randint(1e15),np.random.randint(1e15)] # new seed 200 | params = 
get_net_params(sess,model,self.sb,self.ab,seed,hps.p_dropout) 201 | means = get_net_mean(sess,model,self.sb,self.ab,seed,hps.p_dropout,output=hps.output) 202 | overall_means[k,:] = means[:,0] 203 | #print(np.concatenate([np.array([0,0,1,1,2,2])[:,None],np.array([0,1,0,1,0,1])[:,None],params],axis=1)) 204 | 205 | # need to determine range 206 | if hps.output != 'categorical': 207 | if hps.output == 'gaussian': 208 | mu = params[:,0] 209 | sigma = params[:,1] 210 | elif hps.output == 'mog': 211 | mu = params[:,hps.n_mix:(hps.n_mix*2)] 212 | sigma = params[:,(2*hps.n_mix):(3*hps.n_mix)] 213 | elif hps.output == 'deterministic': 214 | mu = params[:,0] 215 | sigma = 1.0 216 | 217 | max_sd = np.max(sigma) 218 | lower,upper = np.min(mu)-3*max_sd,np.max(mu)+3*max_sd 219 | else: 220 | lower,upper = model.transformer.plot_edges[0],model.transformer.plot_edges[-1] 221 | 222 | # update all plots 223 | x = np.linspace(lower,upper,100) 224 | for i in range(self.n): 225 | #self.pl[i].set_xlim([lower,upper]) 226 | param = params[i,:] 227 | if hps.output == 'deterministic': 228 | max_dens = 1.0 229 | overall_max_dens[i] = 1.0 230 | mean = means[i] 231 | self.pl[i].plot([mean,mean],[0,max_dens],':') 232 | else: 233 | if hps.output == 'gaussian' or hps.output == 'mog': 234 | if hps.output == 'gaussian': 235 | dens = norm.pdf(x,param[0],param[1]) 236 | elif hps.output == 'mog': 237 | dens = [param[j]*norm.pdf(x,param[hps.n_mix+j],param[2*hps.n_mix+j]) for j in range(hps.n_mix)] 238 | dens = np.sum(np.array(dens),axis=0) 239 | #print(x,param,dens) 240 | self.pl[i].plot(x,dens,color='cornflowerblue') 241 | elif hps.output == 'categorical': 242 | dens = param 243 | edges = model.transformer.plot_edges 244 | self.pl[i].hist(model.transformer.means,bins=edges,weights=dens,color='cornflowerblue') 245 | overall_max_dens[i] = np.max([overall_max_dens[i],np.max(dens)]) 246 | # add the mean 247 | grand_means = np.mean(np.array(overall_means),axis=0) 248 | seed = [np.random.randint(1e15),np.random.randint(1e15)] # new seed for parametric uncertainty 249 | grand_sds = analytic_sd(sess,model,self.sb,self.ab,seed,hps.p_dropout,hps.output) 250 | #grand_sds = np.ones([len(grand_means),1]) 251 | 252 | # get policy estimates 253 | s = np.arange(0,int(self.n/2),1)[:,None] 254 | a_thompson = np.array([thompson_policy(s,model,sess,hps,seed,eval_on_mean_output=False,eval_on_mean_params=False) for i in range(100)]) 255 | a_ucb = np.array([ucb_policy(s,model,sess,hps,seed,eval_on_mean_output=False,eval_on_mean_params=False) for i in range(100)]) 256 | 257 | thompson_probs = np.zeros(self.n) 258 | ucb_probs = np.zeros(self.n) 259 | 260 | for j,(state,action) in enumerate(zip(self.sb,self.ab)): 261 | thompson_probs[j] = np.mean(a_thompson[:,state,:] == action) 262 | ucb_probs[j] = np.mean(a_ucb[:,state,:] == action) 263 | 264 | for i in range(self.n): 265 | grand_mean = grand_means[i] 266 | grand_sd = grand_sds[i] 267 | max_dens = overall_max_dens[i] #np.max(dens) if 'dens' in locals() else 1 268 | self.pl[i].plot([grand_mean,grand_mean],[0,max_dens],'--',color='orange') 269 | #self.pl[i].plot([grand_mean-2*grand_sd,grand_mean+2*grand_sd],[max_dens/2,max_dens/2],'--',color='orange') 270 | self.pl[i].text(0.1,0.75,'$\mu$={:0.2f}'.format(grand_mean),transform=self.pl[i].transAxes) 271 | self.pl[i].text(0.55,0.75,'$\sigma$={:0.2f}'.format(grand_sds[i][0]),transform=self.pl[i].transAxes) 272 | 273 | #self.pl[i].text(0.1,0.75,'$\mu$={:0.2f}\n$\sigma$={:0.2f}'.format(grand_mean,grand_sds[i][0]),transform=self.pl[i].transAxes) 274 | 
#self.pl[i].text(0.55,0.75,'tho={:0.2f}\nucb={:0.2f}'.format(thompson_probs[i],ucb_probs[i]),transform=self.pl[i].transAxes) 275 | 276 | 277 | for j in range(int(self.n/2)): 278 | for l in range(2): 279 | if self.truth[j]==l: 280 | val = 1. 281 | col = 'g' 282 | else: 283 | val = 0. 284 | col = 'r' 285 | self.ax[l,j].add_patch(patches.Rectangle((0.01,0.01),0.98,0.98,linewidth=10,edgecolor=col,facecolor='none',transform=self.ax[l,j].transAxes)) 286 | if j>0: 287 | plt.setp(self.ax[l,j].get_yticklabels(), visible=False) 288 | if l==0: 289 | plt.setp(self.ax[l,j].get_xticklabels(), visible=False) 290 | #self.ax[l,j].set_title('V={:0.2f}'.format(val)) 291 | self.ax[l,j].set_ylim([0,1.0]) 292 | self.ax[l,j].set_xlim([-2.5,2.5]) 293 | 294 | 295 | self.fig.canvas.draw() 296 | self.fig.savefig(hps.result_dir + 'episode_{}'.format(ep),dpi=300) 297 | self.fig.canvas.flush_events() 298 | 299 | # Test 300 | if __name__ == '__main__': 301 | Env = ChainOrdered() 302 | s = Env.reset() 303 | for i in range(500): 304 | a = Env.action_space.sample() 305 | s,r,terminal,_ = Env.step(a) 306 | if terminal: 307 | print('Died in step',i,'with reward',r,' restarting') 308 | s = Env.reset() 309 | print('Finished') --------------------------------------------------------------------------------
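The Database class defined in src/network.py is consumed by Model.train() as an iterator over (state, value target, action, visit count or Q target) mini-batches. Below is a minimal sketch of that usage pattern with dummy NumPy data, assuming src/ is on the Python path; the array shapes and the max_size/batch_size values are illustrative assumptions, not values taken from the repository's configuration.

import numpy as np
from network import Database  # Database as defined in src/network.py; assumes src/ is on the path

# Illustrative capacity and batch size only.
buffer = Database(max_size=1000, batch_size=32)

# Store 100 dummy transitions: state s, value target V, action a, visit count n_a.
s   = np.random.randn(100, 3).astype(np.float32)                      # e.g. a 3-dimensional observation
V   = np.random.randn(100, 1).astype(np.float32)                      # value targets
a   = np.random.uniform(-1.0, 1.0, size=(100, 1)).astype(np.float32)  # continuous actions
n_a = np.random.randint(1, 10, size=(100, 1)).astype(np.float32)      # MCTS visit counts
buffer.store_from_array(s, V, a, n_a)

# Iterate over shuffled mini-batches, as Model.train() does internally.
buffer.reshuffle()
for sb, Vb, ab, n_ab in buffer:
    print(sb.shape, Vb.shape, ab.shape, n_ab.shape)  # e.g. (32, 3) (32, 1) (32, 1) (32, 1)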