├── game ├── particle │ ├── __init__.py │ ├── bin │ │ ├── __init__.py │ │ └── interactive.py │ ├── multiagent │ │ ├── scenarios │ │ │ ├── __init__.py │ │ │ ├── simple_spread.py │ │ │ └── simple_tag.py │ │ ├── scenario.py │ │ ├── __init__.py │ │ ├── policy.py │ │ ├── multi_discrete.py │ │ ├── core.py │ │ └── rendering.py │ ├── make_env.py │ └── README.md ├── pacman │ ├── pacmanDQN_Agents.py │ ├── layouts │ │ ├── openClassic.lay │ │ └── originalClassic.lay │ ├── textDisplay.py │ ├── keyboardAgents.py │ ├── ghostAgents.py │ ├── make_env.py │ ├── layout.py │ └── graphicsUtils.py └── __init__.py ├── source ├── pacman │ ├── original │ │ └── 0 │ │ │ ├── checkpoint │ │ │ ├── model_0.ckpt.meta │ │ │ ├── model_0.ckpt.index │ │ │ ├── model_0.ckpt.data-00000-of-00001 │ │ │ ├── command.txt │ │ │ └── args.json │ └── medium │ │ └── 0 │ │ ├── checkpoint │ │ ├── model_40000_0.ckpt.index │ │ ├── model_40000_0.ckpt.meta │ │ ├── model_40000_0.ckpt.data-00000-of-00001 │ │ ├── command.txt │ │ └── args.json └── simple_tag │ └── tag4 │ ├── checkpoint │ ├── model_30000_3.ckpt.index │ ├── model_30000_3.ckpt.meta │ ├── model_30000_3.ckpt.data-00000-of-00001 │ └── args.json ├── requirements.txt ├── alg ├── __init__.py ├── maddpg │ ├── __init__.py │ ├── trainer │ │ └── replay_buffer.py │ ├── train.py │ └── common │ │ ├── tf_util.py │ │ └── distributions.py ├── optimizer.py ├── common │ └── common.py ├── sharing_multi_ppo │ ├── ppo.py │ └── ppo_add_entropy.py └── muti_ptf_ppo │ ├── ppo.py │ └── ppo_add_entropy.py ├── config ├── particle_conf.yaml ├── pacman_conf.yaml ├── maddpg_conf.yaml └── ppo_conf.yaml ├── run └── __init__.py ├── util ├── fource_exit.py ├── get_out_files.py ├── output_json.py ├── logger.py └── ReplayBuffer.py ├── README.md └── main.py /game/particle/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /game/particle/bin/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/pacman/original/0/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model_3.ckpt" 2 | all_model_checkpoint_paths: "model_3.ckpt" 3 | -------------------------------------------------------------------------------- /source/pacman/medium/0/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model_40000_2.ckpt" 2 | all_model_checkpoint_paths: "model_40000_2.ckpt" 3 | -------------------------------------------------------------------------------- /source/simple_tag/tag4/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model_30000_3.ckpt" 2 | all_model_checkpoint_paths: "model_30000_3.ckpt" 3 | -------------------------------------------------------------------------------- /source/pacman/original/0/model_0.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/pacman/original/0/model_0.ckpt.meta -------------------------------------------------------------------------------- /source/pacman/original/0/model_0.ckpt.index: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/pacman/original/0/model_0.ckpt.index -------------------------------------------------------------------------------- /source/pacman/medium/0/model_40000_0.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/pacman/medium/0/model_40000_0.ckpt.index -------------------------------------------------------------------------------- /source/pacman/medium/0/model_40000_0.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/pacman/medium/0/model_40000_0.ckpt.meta -------------------------------------------------------------------------------- /source/simple_tag/tag4/model_30000_3.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/simple_tag/tag4/model_30000_3.ckpt.index -------------------------------------------------------------------------------- /source/simple_tag/tag4/model_30000_3.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/simple_tag/tag4/model_30000_3.ckpt.meta -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.10.9 2 | numpy==1.19.5 3 | tensorboard==1.14.0 4 | tensorboard-logger==0.1.0 5 | tensorflow==1.14.0 6 | PyYAML==5.4.1 7 | 8 | 9 | -------------------------------------------------------------------------------- /game/pacman/pacmanDQN_Agents.py: -------------------------------------------------------------------------------- 1 | import game.pacman.game as game 2 | 3 | 4 | class PacmanDQN(game.Agent): 5 | def __init__(self, args): 6 | pass 7 | -------------------------------------------------------------------------------- /source/pacman/original/0/model_0.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/pacman/original/0/model_0.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /source/pacman/medium/0/model_40000_0.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/pacman/medium/0/model_40000_0.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /source/simple_tag/tag4/model_30000_3.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/simple_tag/tag4/model_30000_3.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /alg/__init__.py: -------------------------------------------------------------------------------- 1 | from alg.maddpg.trainer.maddpg import MADDPGAgentTrainer as maddpg 2 | 3 | REGISTRY = {} 4 | 5 | 6 | REGISTRY['maddpg'] = maddpg 7 | REGISTRY['maddpg_sr'] = maddpg 8 | -------------------------------------------------------------------------------- /game/particle/multiagent/scenarios/__init__.py: 
-------------------------------------------------------------------------------- 1 | import imp 2 | import os.path as osp 3 | 4 | 5 | def load(name): 6 | pathname = osp.join(osp.dirname(__file__), name) 7 | return imp.load_source('', pathname) 8 | -------------------------------------------------------------------------------- /game/__init__.py: -------------------------------------------------------------------------------- 1 | from .particle.make_env import make_env as Particle 2 | from .pacman.make_env import make_env as PacmanEnv 3 | 4 | 5 | REGISTRY = {} 6 | 7 | REGISTRY['particle'] = Particle 8 | REGISTRY['pacman'] = PacmanEnv 9 | 10 | 11 | -------------------------------------------------------------------------------- /config/particle_conf.yaml: -------------------------------------------------------------------------------- 1 | game_name: "simple_spread" 2 | continuous_action: False 3 | reward_normalize: False 4 | benchmark: False 5 | action_clip: 1 6 | num_adversaries: 0 7 | num_good: 0 8 | obs_sort: False 9 | reward_func: "reward" 10 | restrict_move: False -------------------------------------------------------------------------------- /game/pacman/layouts/openClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%%%%% 2 | %.. .... .... G % 3 | %.. ... ... ... ... % 4 | %.. ... ... ... ... % 5 | %.. P .... .... % 6 | %.. ... ... ... ... % 7 | %.. ... ... ... ... % 8 | %.. .... .... G % 9 | %%%%%%%%%%%%%%%%%%%%%%%%% 10 | -------------------------------------------------------------------------------- /config/pacman_conf.yaml: -------------------------------------------------------------------------------- 1 | num_adversaries: 1 #pacman 2 | timeout: 30 3 | game_name: "trickyClassic" 4 | textGraphics: False 5 | quietGraphics: False 6 | zoom: 1.0 7 | fixRandomSeed: False 8 | recordActions: False 9 | replay: None 10 | frameTime: 0.1 11 | catchExceptions: False 12 | continuous_action: False 13 | obs_sort: False 14 | -------------------------------------------------------------------------------- /game/particle/multiagent/scenario.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | # defines scenario upon which the world is built 5 | class BaseScenario(object): 6 | # create elements of the world 7 | def make_world(self): 8 | raise NotImplementedError() 9 | # create initial conditions of the world 10 | 11 | def reset_world(self, world): 12 | raise NotImplementedError() 13 | -------------------------------------------------------------------------------- /run/__init__.py: -------------------------------------------------------------------------------- 1 | from .run_multi_ptf_ppo_sro import run as multi_ppo_sr_run 2 | from .run_maddpg_sr import run as run_maddpg_sr 3 | from .run_multi_ptf_shppo_sro import run as shppo_sr_run 4 | 5 | REGISTRY = {} 6 | 7 | REGISTRY['multi_ppo'] = multi_ppo_sr_run 8 | REGISTRY['multi_ppo_sro'] = multi_ppo_sr_run 9 | REGISTRY['maddpg'] = run_maddpg_sr 10 | REGISTRY['maddpg_sr'] = run_maddpg_sr 11 | REGISTRY['shppo'] = shppo_sr_run 12 | REGISTRY['shppo_sro'] = shppo_sr_run 13 | 14 | -------------------------------------------------------------------------------- /alg/maddpg/__init__.py: -------------------------------------------------------------------------------- 1 | class AgentTrainer(object): 2 | def __init__(self, name, model, obs_shape, act_space, args): 3 | raise NotImplemented() 4 | 5 | def action(self, obs): 6 | raise 
NotImplemented() 7 | 8 | def process_experience(self, obs, act, rew, new_obs, done, terminal): 9 | raise NotImplemented() 10 | 11 | def preupdate(self): 12 | raise NotImplemented() 13 | 14 | def update(self, agents): 15 | raise NotImplemented() 16 | -------------------------------------------------------------------------------- /util/fource_exit.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import msvcrt 3 | 4 | 5 | class Exit: 6 | def __init__(self): 7 | self.isExit = False 8 | self.thread = threading.Thread(target=self.work) 9 | 10 | def work(self): 11 | while True: 12 | newChar = msvcrt.getch() 13 | if newChar in b'\r': # 如果是换行,则输入结束 14 | self.isExit = True 15 | break 16 | 17 | def run(self): 18 | self.thread.start() 19 | 20 | def get_status(self): 21 | return self.isExit 22 | 23 | -------------------------------------------------------------------------------- /game/particle/multiagent/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | # Multiagent envs 4 | # ---------------------------------------- 5 | 6 | register( 7 | id='MultiagentSimple-v0', 8 | entry_point='multiagent.envs:SimpleEnv', 9 | # FIXME(cathywu) currently has to be exactly max_path_length parameters in 10 | # rllab run script 11 | max_episode_steps=100, 12 | ) 13 | 14 | register( 15 | id='MultiagentSimpleSpeakerListener-v0', 16 | entry_point='multiagent.envs:SimpleSpeakerListenerEnv', 17 | max_episode_steps=100, 18 | ) 19 | -------------------------------------------------------------------------------- /source/pacman/medium/0/command.txt: -------------------------------------------------------------------------------- 1 | -a multi_ppo -c multi_ptf_ppo_conf -g pacman -d pacman_conf -n 50000 -e 99 -s 7 -o adam n_layer_a_1=128 n_layer_c_1=128 option_layer_1=64 n_layer_a_2=128 n_layer_c_2=128 option_layer_2=64 c2=0.01 learning_rate_a=5e-4 learning_rate_c=5e-4 learning_rate_o=1e-3 learning_rate_t=1e-3 continuous_action=False reward_decay=0.99 clip_value=0.2 e_greedy=0.95 e_greedy_increment=1e-3 replace_target_iter=1000 learning_step=1000 option_batch_size=64 batch_size=64 save_per_episodes=10000 save_model=True c3=0.001 num_adversaries=1 adv_use_option=False good_use_option=False adv_load_model=True adv_load_model_path=source/pacman/3/model_20000_0 game_name=mediumClassic obs_sort=False xi=0 use_gpu=True use_gpu_id=1 memory_size=100000 -------------------------------------------------------------------------------- /source/pacman/original/0/command.txt: -------------------------------------------------------------------------------- 1 | -a multi_ppo -c multi_ptf_ppo_conf -g pacman -d pacman_conf -n 50000 -e 99 -s 7 -o adam n_layer_a_1=128 n_layer_c_1=128 option_layer_1=64 n_layer_a_2=128 n_layer_c_2=128 option_layer_2=64 c2=0.01 learning_rate_a=5e-4 learning_rate_c=5e-4 learning_rate_o=1e-3 learning_rate_t=1e-3 continuous_action=False reward_decay=0.99 clip_value=0.2 e_greedy=0.95 e_greedy_increment=1e-3 replace_target_iter=1000 learning_step=1000 option_batch_size=64 batch_size=64 save_per_episodes=5000 save_model=True c3=0.001 num_adversaries=1 adv_use_option=False good_use_option=False load_model=True load_model_path=source/pacman/original/2020-11-18_22-49-59/model game_name=originalClassic obs_sort=False xi=0 use_gpu=False use_gpu_id=1 memory_size=100000 -------------------------------------------------------------------------------- /alg/optimizer.py: 
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class Optimizer: 5 | def __init__( 6 | self, 7 | optimizer, 8 | learning_rate, 9 | momentum=None 10 | ): 11 | self.opt = None 12 | if str(optimizer).lower() == "grad": 13 | self.opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate) 14 | elif str(optimizer).lower() == "momentum": 15 | self.opt = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=momentum) 16 | elif str(optimizer).lower() == 'rmsprop': 17 | self.opt = tf.train.RMSPropOptimizer(learning_rate=learning_rate) 18 | elif str(optimizer).lower() == 'adam': 19 | self.opt = tf.train.AdamOptimizer(learning_rate=learning_rate) 20 | 21 | def get_optimizer(self): 22 | return self.opt 23 | -------------------------------------------------------------------------------- /game/pacman/layouts/originalClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | %............%%............% 3 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 4 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 5 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 6 | %..........................% 7 | %.%%%%.%%.%%%%%%%%.%%.%%%%.% 8 | %.%%%%.%%.%%%%%%%%.%%.%%%%.% 9 | %......%%....%%....%%......% 10 | %%%%%%.%%%%% %% %%%%%.%%%%%% 11 | %%%%%%.%%%%% %% %%%%%.%%%%%% 12 | %%%%%%.% %.%%%%%% 13 | %%%%%%.% %%%% %%%% %.%%%%%% 14 | % . %G G G% . % 15 | %%%%%%.% %%%%%%%%%% %.%%%%%% 16 | %%%%%%.% %.%%%%%% 17 | %%%%%%.% %%%%%%%%%% %.%%%%%% 18 | %............%%............% 19 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 20 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 21 | %...%%....... .......%%...% 22 | %%%.%%.%%.%%%%%%%%.%%.%%.%%% 23 | %%%.%%.%%.%%%%%%%%.%%.%%.%%% 24 | %......%%....%%....%%......% 25 | %.%%%%%%%%%%.%%.%%%%%%%%%%.% 26 | %.............P............% 27 | %%%%%%%%%%%%%%%%%%%%%%%%%%%% 28 | -------------------------------------------------------------------------------- /util/get_out_files.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | 5 | def paths(file_path): 6 | path_collection = [] 7 | path_target_collection = [] 8 | path_target_dir = [] 9 | for dirpath, dirnames, filenames in os.walk(file_path): 10 | for file_name in filenames: 11 | if file_name == 'out.json' or file_name == 'args.json' or file_name == 'command.txt': 12 | fullpath = os.path.join(dirpath, file_name) 13 | path_collection.append(fullpath) 14 | path_target_collection.append(fullpath.replace('results', 'results_out')) 15 | path_target_dir.append(dirpath.replace('results', 'results_out')) 16 | return path_collection, path_target_collection, path_target_dir 17 | 18 | 19 | source_path = '' 20 | 21 | 22 | if __name__ == "__main__": 23 | path_collection, path_target_collection, path_target_dir = paths(source_path) 24 | 25 | for (source, target, dir_path) in zip(path_collection, path_target_collection, path_target_dir): 26 | if not os.path.exists(dir_path): 27 | os.makedirs(dir_path) 28 | print(source, target) 29 | shutil.copyfile(source, target) 30 | -------------------------------------------------------------------------------- /config/maddpg_conf.yaml: -------------------------------------------------------------------------------- 1 | reward_decay: 0.99 2 | output_graph: True 3 | save_model: True 4 | summary_output_times: 10 5 | regular: 0.005 6 | learning_rate_a: 0.01 7 | learning_rate_c: 0.01 8 | ENTROPY_BETA: 0.0005 9 | USE_CPU_COUNT: True 10 | load_model: False 11 | load_model_path: '' 
12 | batch_size: 1024 13 | display: False 14 | 15 | #run 16 | reward_memory: 100 17 | save_per_episodes: 2000 18 | num_adversaries: 0 19 | good_policy: 'maddpg' 20 | adv_policy: 'maddpg' 21 | adv_use_option: False 22 | good_use_option: False 23 | adv_load_model: False 24 | adv_load_model_path: '' 25 | good_load_model: False 26 | good_load_model_path: '' 27 | use_gpu_id: '0' 28 | use_gpu: False 29 | other_option_update: True 30 | 31 | #option 32 | learning_rate_o: 0.0003 33 | learning_rate_t: 0.0003 34 | option_layer_1: 128 35 | option_layer_2: 128 36 | e_greedy: 0.95 37 | e_greedy_increment: 0.005 38 | start_greedy: 0.0 39 | memory_size: 1000000 40 | option_batch_size: 16 41 | xi: 0.005 42 | option_clip_value: 10.0 43 | is_soft_max_action: True 44 | replace_target_iter: 1000 45 | learning_step: 1000 46 | c3: 0.0005 47 | c1: 1.0 48 | 49 | # SF 50 | embedding_dim: 32 51 | option_embedding_layer: 64 52 | recon_loss_coef: 0.1 53 | learning_rate_r: 0.0003 54 | clip_value: 0.2 55 | 56 | #DVM 57 | distillation_frequent: 1000 58 | distillation_interation: 2048 59 | 60 | # network 61 | n_layer_a_1: 128 62 | 63 | # output 64 | SAVE_PATH: "model" 65 | graph_path: "graph" 66 | reward_output: "output" 67 | output_filename: "out" 68 | log: "log" 69 | benchmark_dir: "benchmark" -------------------------------------------------------------------------------- /config/ppo_conf.yaml: -------------------------------------------------------------------------------- 1 | # ppo 2 | learning_rate_a: 0.0003 3 | learning_rate_c: 0.0003 4 | batch_size: 32 5 | clip_value: 0.2 6 | reward_decay: 0.99 7 | c2: 0.001 8 | stochastic: True 9 | load_model: False 10 | load_model_path: '' 11 | adv_policy: "ppo" 12 | good_policy: "ppo" 13 | reward_normalize: False 14 | done_reward: 1.0 15 | 16 | # option 17 | option_batch_size: 32 18 | option_clip_value: 10.0 19 | other_option_update: True 20 | c1: 0.005 21 | c3: 0.0005 22 | epi_train_times: 1 23 | memory_size: 100000 24 | e_greedy: 0.95 25 | replace_target_iter: 1000 26 | e_greedy_increment: 0.001 27 | start_greedy: 0.0 28 | learning_step: 1000 29 | learning_rate_o: 0.00001 30 | learning_rate_t: 0.00001 31 | xi: 0.005 32 | adv_use_option: False 33 | good_use_option: False 34 | adv_load_model: False 35 | adv_load_model_path: '' 36 | good_load_model: False 37 | good_load_model_path: '' 38 | grad_clip: 10 39 | 40 | #sro 41 | learning_rate_r: 0.0003 42 | embedding_dim: 32 43 | option_embedding_layer: 64 44 | recon_loss_coef: 0.1 45 | 46 | # transfer_agent 47 | trans_agent_start_epi: 0 48 | 49 | #run 50 | reward_memory: 100 51 | save_per_episodes: 2000 52 | use_gpu_id: '0' 53 | use_gpu: False 54 | output_graph: True 55 | save_model: True 56 | summary_output_times: 10 57 | reload_model: False 58 | reload_model_path: '' 59 | 60 | # network 61 | policy: 'policy' 62 | old_policy: 'old_policy' 63 | n_layer_a_1: 64 64 | n_layer_a_2: 64 65 | n_layer_c_1: 64 66 | n_layer_c_2: 64 67 | option_layer_1: 128 68 | option_layer_2: 128 69 | 70 | # output 71 | SAVE_PATH: "model" 72 | graph_path: "graph" 73 | reward_output: "output" 74 | output_filename: "out" 75 | log: "log" 76 | 77 | -------------------------------------------------------------------------------- /source/pacman/medium/0/args.json: -------------------------------------------------------------------------------- 1 | {"numGames": 50000, "game": "pacman", "algorithm": "multi_ppo", "epi_step": 99, "seed": 7, "optimizer": "adam", "run_test": false, "results_path": "../results/multi_ppo_pacman/2019-12-12_13-12-59/", "learning_rate_a": 
0.0005, "learning_rate_c": 0.0005, "batch_size": 64, "option_batch_size": 64, "output_graph": true, "save_model": true, "summary_output_times": 10, "clip_value": 0.2, "option_clip_value": 10.0, "reward_decay": 0.99, "c1": 1.0, "c2": 0.01, "epi_train_times": 1, "stochastic": true, "load_model": false, "load_model_path": "", "memory_size": 100000, "e_greedy": 0.95, "replace_target_iter": 1000, "e_greedy_increment": 0.001, "start_greedy": 0.0, "learning_step": 1000, "regular": 0.005, "learning_rate_o": 0.001, "learning_rate_t": 0.001, "ENTROPY_BETA": 0.0005, "c3": 0.001, "xi": 0.0, "adv_policy": "ppo", "good_policy": "ppo", "adv_use_option": false, "good_use_option": false, "adv_load_model": true, "adv_load_model_path": "source/pacman/3/model_20000_0", "reward_normalize": false, "done_reward": 1.0, "reward_memory": 100, "save_per_episodes": 10000, "use_gpu_id": "1", "use_gpu": true, "policy": "policy", "old_policy": "old_policy", "n_layer_a_1": 128, "n_layer_a_2": 128, "n_layer_c_1": 128, "n_layer_c_2": 128, "temperature": 0.1, "option_layer_1": 64, "option_layer_2": 64, "SAVE_PATH": "model", "graph_path": "graph", "reward_output": "output", "output_filename": "out", "log": "log", "num_adversaries": 1, "timeout": 30, "game_name": "mediumClassic", "textGraphics": false, "quietGraphics": false, "zoom": 1.0, "fixRandomSeed": false, "recordActions": false, "replay": "None", "frameTime": 0.1, "catchExceptions": false, "continuous_action": false, "obs_sort": false} -------------------------------------------------------------------------------- /game/particle/bin/interactive.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os,sys 3 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 4 | import argparse 5 | 6 | from game.particle.multiagent.environment import MultiAgentEnv 7 | from game.particle.multiagent.policy import InteractivePolicy 8 | import game.particle.multiagent.scenarios as scenarios 9 | 10 | if __name__ == '__main__': 11 | # parse arguments 12 | parser = argparse.ArgumentParser(description=None) 13 | parser.add_argument('-s', '--scenario', default='simple_adversary.py', help='Path of the scenario Python script.') 14 | args = parser.parse_args() 15 | 16 | # load scenario from script 17 | scenario = scenarios.load(args.scenario).Scenario() 18 | # create world 19 | world = scenario.make_world() 20 | # create multiagent environment 21 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, info_callback=None, shared_viewer = False) 22 | # render call to create viewer window (necessary only for interactive policies) 23 | env.render() 24 | # create interactive policies for each agent 25 | policies = [InteractivePolicy(env, i) for i in range(env.n)] 26 | # execution loop 27 | obs_n = env.reset() 28 | while True: 29 | # query for action from each agent's policy 30 | act_n = [] 31 | for i, policy in enumerate(policies): 32 | act_n.append(policy.action(obs_n[i])) 33 | # step environment 34 | obs_n, reward_n, done_n, _ = env.step(act_n) 35 | # render all agent views 36 | env.render() 37 | # display rewards 38 | #for agent in env.world.agents: 39 | # print(agent.name + " reward: %0.3f" % env._get_reward(agent)) 40 | -------------------------------------------------------------------------------- /source/simple_tag/tag4/args.json: -------------------------------------------------------------------------------- 1 | {"numGames": 20000, "game": "particle", "algorithm": "multi_ppo_sr2", 
"epi_step": 99, "seed": 12345, "optimizer": "adam", "run_test": false, "obs_sort": false, "learning_rate_a": 0.0003, "learning_rate_c": 0.0003, "learning_rate_r": 0.0003, "batch_size": 32, "option_batch_size": 32, "output_graph": true, "save_model": true, "summary_output_times": 10, "clip_value": 0.2, "option_clip_value": 10.0, "reward_decay": 0.99, "c1": 1.0, "c2": 0.001, "epi_train_times": 1, "stochastic": true, "load_model": false, "load_model_path": "", "memory_size": 100000, "e_greedy": 0.95, "replace_target_iter": 1000, "e_greedy_increment": 0.005, "start_greedy": 0.0, "learning_step": 1000, "regular": 0.005, "learning_rate_o": 1e-05, "learning_rate_t": 1e-05, "ENTROPY_BETA": 0.0005, "c3": 0.0005, "xi": 0.0, "adv_policy": "ppo", "good_policy": "ppo", "adv_use_option": false, "good_use_option": false, "reward_normalize": false, "adv_load_model": false, "adv_load_model_path": "", "good_load_model": true, "good_load_model_path": "source\\simple_tag\\ppo_tag\\model_30000_3", "done_reward": 1.0, "trans_agent_start_epi": 0, "reward_memory": 100, "save_per_episodes": 2000, "use_gpu_id": "0", "use_gpu": false, "other_option_update": true, "policy": "policy", "old_policy": "old_policy", "n_layer_a_1": 64, "n_layer_a_2": 64, "n_layer_c_1": 64, "n_layer_c_2": 64, "temperature": 0.1, "option_layer_1": 32, "option_layer_2": 32, "embedding_dim": 32, "option_embedding_layer": 64, "recon_loss_coef": 0.1, "SAVE_PATH": "model", "graph_path": "graph", "reward_output": "output", "output_filename": "out", "log": "log", "game_name": "simple_tag", "continuous_action": false, "benchmark": false, "action_clip": 1, "num_adversaries": 3, "reward_func": "reward", "restrict_move": true, "results_path": "../results/multi_ppo_sr2/particle/simple_tag/2020-09-22_17-44-26/"} -------------------------------------------------------------------------------- /source/pacman/original/0/args.json: -------------------------------------------------------------------------------- 1 | {"numGames": 50000, "game": "pacman", "algorithm": "multi_ppo", "epi_step": 99, "seed": 7, "optimizer": "adam", "run_test": false, "learning_rate_a": 0.0005, "learning_rate_c": 0.0005, "batch_size": 64, "option_batch_size": 64, "output_graph": true, "save_model": true, "summary_output_times": 10, "clip_value": 0.2, "option_clip_value": 10.0, "reward_decay": 0.99, "c1": 1.0, "c2": 0.01, "epi_train_times": 1, "stochastic": true, "load_model": true, "load_model_path": "source/pacman/original/2020-11-18_22-49-59/model", "memory_size": 100000, "e_greedy": 0.95, "replace_target_iter": 1000, "e_greedy_increment": 0.001, "start_greedy": 0.0, "learning_step": 1000, "regular": 0.005, "learning_rate_o": 0.001, "learning_rate_t": 0.001, "ENTROPY_BETA": 0.0005, "c3": 0.001, "xi": 0.0, "adv_policy": "ppo", "good_policy": "ppo", "adv_use_option": false, "good_use_option": false, "adv_load_model": false, "adv_load_model_path": "", "good_load_model": false, "good_load_model_path": "", "reward_normalize": false, "done_reward": 1.0, "grad_clip": 10, "trans_agent_start_epi": 0, "reward_memory": 100, "save_per_episodes": 5000, "use_gpu_id": "1", "use_gpu": false, "other_option_update": true, "policy": "policy", "old_policy": "old_policy", "n_layer_a_1": 128, "n_layer_a_2": 128, "n_layer_c_1": 128, "n_layer_c_2": 128, "temperature": 0.1, "option_layer_1": 64, "option_layer_2": 64, "SAVE_PATH": "model", "graph_path": "graph", "reward_output": "output", "output_filename": "out", "log": "log", "num_adversaries": 1, "timeout": 30, "game_name": "originalClassic", 
"textGraphics": false, "quietGraphics": false, "zoom": 1.0, "fixRandomSeed": false, "recordActions": false, "replay": "None", "frameTime": 0.1, "catchExceptions": false, "continuous_action": false, "obs_sort": false, "results_path": "../results/multi_ppo/pacman/originalClassic/2020-11-21_11-40-33/"} -------------------------------------------------------------------------------- /game/particle/multiagent/policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pyglet.window import key 3 | 4 | 5 | # individual agent policy 6 | class Policy(object): 7 | def __init__(self): 8 | pass 9 | 10 | def action(self, obs): 11 | raise NotImplementedError() 12 | 13 | 14 | # interactive policy based on keyboard input 15 | # hard-coded to deal only with movement, not communication 16 | class InteractivePolicy(Policy): 17 | def __init__(self, env, agent_index): 18 | super(InteractivePolicy, self).__init__() 19 | self.env = env 20 | # hard-coded keyboard events 21 | self.move = [False for i in range(4)] 22 | self.comm = [False for i in range(env.world.dim_c)] 23 | # register keyboard events with this environment's window 24 | env.viewers[agent_index].window.on_key_press = self.key_press 25 | env.viewers[agent_index].window.on_key_release = self.key_release 26 | 27 | def action(self, obs): 28 | # ignore observation and just act based on keyboard events 29 | if self.env.discrete_action_input: 30 | u = 0 31 | if self.move[0]: u = 1 32 | if self.move[1]: u = 2 33 | if self.move[2]: u = 4 34 | if self.move[3]: u = 3 35 | else: 36 | u = np.zeros(5) # 5-d because of no-move action 37 | if self.move[0]: u[1] += 1.0 38 | if self.move[1]: u[2] += 1.0 39 | if self.move[3]: u[3] += 1.0 40 | if self.move[2]: u[4] += 1.0 41 | if True not in self.move: 42 | u[0] += 1.0 43 | return np.concatenate([u, np.zeros(self.env.world.dim_c)]) 44 | 45 | # keyboard event callbacks 46 | def key_press(self, k, mod): 47 | if k == key.LEFT: self.move[0] = True 48 | if k == key.RIGHT: self.move[1] = True 49 | if k == key.UP: self.move[2] = True 50 | if k == key.DOWN: self.move[3] = True 51 | 52 | def key_release(self, k, mod): 53 | if k == key.LEFT: self.move[0] = False 54 | if k == key.RIGHT: self.move[1] = False 55 | if k == key.UP: self.move[2] = False 56 | if k == key.DOWN: self.move[3] = False 57 | -------------------------------------------------------------------------------- /util/output_json.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | 5 | 6 | class OutputJson: 7 | def __init__(self, data_field=[]): 8 | self.data_field = data_field 9 | self.data = {} 10 | for i in range(len(data_field)): 11 | if not isinstance(data_field[i], str): 12 | raise Exception('the data field must be type of string: ' + str(data_field[i])) 13 | 14 | self.data[data_field[i]] = [] 15 | 16 | def update(self, value, key=None): 17 | if key is not None: 18 | if isinstance(value, bool): 19 | value = str(value) 20 | self.data[key].append(value) 21 | return 22 | if isinstance(value, tuple) or isinstance(value, list): 23 | if len(value) != len(self.data_field): 24 | raise Exception('Error in parameters size: ' + str(value)) 25 | for i in range(len(value)): 26 | if type(value[i]) is np.bool_ or type(value[i]) is np.bool or type(value[i]) is bool: 27 | self.data[self.data_field[i]].append(str(value[i])) 28 | else: 29 | self.data[self.data_field[i]].append(value[i]) 30 | 31 | def print_first(self): 32 | if 
self.data == {}: 33 | return 34 | for i, key in enumerate(self.data_field): 35 | print(key, ": %s, " % self.data[key][len(self.data[key]) - 1], end=' ') 36 | print() 37 | 38 | def print_by_key(self, key, index=None): 39 | if index is None: 40 | print(key, ": ", self.data[key]) 41 | else: 42 | print(key, " ", index, ": ", self.data[key][index]) 43 | 44 | def save(self, path, filename, field=None): 45 | if not os.path.exists(path): 46 | os.makedirs(path) 47 | if field is None: 48 | field = self.data_field 49 | out = {} 50 | for key in field: 51 | if len(self.data[key]) > 0 and type(self.data[key][0]) is np.ndarray: 52 | out[key] = [a.tolist() for a in self.data[key]] 53 | else: 54 | out[key] = self.data[key] 55 | with open(path + "/" + filename + ".json", "w") as f: 56 | json.dump(out, f) 57 | -------------------------------------------------------------------------------- /game/particle/multiagent/multi_discrete.py: -------------------------------------------------------------------------------- 1 | # An old version of OpenAI Gym's multi_discrete.py. (Was getting affected by Gym updates) 2 | # (https://github.com/openai/gym/blob/1fb81d4e3fb780ccf77fec731287ba07da35eb84/gym/spaces/multi_discrete.py) 3 | 4 | import numpy as np 5 | 6 | import gym 7 | from gym.spaces import prng 8 | 9 | 10 | class MultiDiscrete(gym.Space): 11 | """ 12 | - The multi-discrete action space consists of a series of discrete action spaces with different parameters 13 | - It can be adapted to both a Discrete action space or a continuous (Box) action space 14 | - It is useful to represent game controllers or keyboards where each key can be represented as a discrete action space 15 | - It is parametrized by passing an array of arrays containing [min, max] for each discrete action space 16 | where the discrete action space can take any integers from `min` to `max` (both inclusive) 17 | Note: A value of 0 always need to represent the NOOP action. 18 | e.g. 
Nintendo Game Controller
19 |     - Can be conceptualized as 3 discrete action spaces:
20 |         1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4
21 |         2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
22 |         3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
23 |     - Can be initialized as
24 |         MultiDiscrete([ [0,4], [0,1], [0,1] ])
25 |     """
26 |     def __init__(self, array_of_param_array):
27 |         self.low = np.array([x[0] for x in array_of_param_array])
28 |         self.high = np.array([x[1] for x in array_of_param_array])
29 |         self.num_discrete_space = self.low.shape[0]
30 | 
31 |     def sample(self):
32 |         """ Returns an array with one sample from each discrete action space """
33 |         # For each row: round(random .* (max - min) + min, 0)
34 |         random_array = prng.np_random.rand(self.num_discrete_space)
35 |         return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)]
36 | 
37 |     def contains(self, x):
38 |         return len(x) == self.num_discrete_space and (np.array(x) >= self.low).all() and (np.array(x) <= self.high).all()
39 | 
40 |     @property
41 |     def shape(self):
42 |         return self.num_discrete_space
43 | 
44 |     def __repr__(self):
45 |         return "MultiDiscrete" + str(self.num_discrete_space)
46 | 
47 |     def __eq__(self, other):
48 |         return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high)
49 | 
--------------------------------------------------------------------------------
/alg/common/common.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | import json
4 | from alg.muti_ptf_ppo.ppo import PPO
5 | 
6 | def action_equal(action1, action2, continuous_action=None):
7 |     if not continuous_action or continuous_action is None:
8 |         if (isinstance(action1, list) or isinstance(action1, np.ndarray)) and (isinstance(action2, list) or isinstance(action2, np.ndarray)):
9 |             return (np.array(action1) == np.array(action2)).all()
10 |         else:
11 |             return False
12 |     elif continuous_action:
13 |         mean = action1[0]
14 |         std = action1[1]
15 |         for i in range(len(action2)):
16 |             if action2[i] < mean[i] - std[i] or action2[i] > mean[i] + std[i]:
17 |                 return False
18 |         return True
19 | 
20 | 
21 | def build_source_actor(args, sess, policy_path, i=0):
22 |     par_path = os.path.dirname(policy_path)
23 |     file_name = ''
24 |     for dirPath, dirNames, fileNames in os.walk(par_path):
25 |         # print(fileNames)
26 |         for fileName in fileNames:
27 |             if fileName == 'args.json':
28 |                 file_name = fileName
29 |                 break
30 |         if file_name != '':
31 |             break
32 |     file_path = par_path + "/" + file_name
33 |     with open(file_path, 'r') as f:
34 |         source_args = json.load(f)
35 |     source_policy = 'ppo'  # args['policy']
36 |     if source_policy == 'ppo':
37 |         return PPO(args['action_dim'], args['features'], source_args, sess, logger=None, i=i)
38 |     else:
39 |         raise Exception('no such source_policy named: ' + str(source_policy))
40 | 
41 | 
42 | class OptionToList:
43 |     def __init__(self, num_agent):
44 |         self.num_agent = num_agent
45 |         self.option_list = []
46 |         self.reset()
47 | 
48 |     def reset(self):
49 |         self.option_list = []
50 |         length = np.power(self.num_agent - 1, self.num_agent)
51 |         for i in range(length):
52 |             self.option_list.append(self.number_converter(i))
53 | 
54 |     # FIXME: the option network outputs an option index; converting it into a union option is a base conversion, e.g. with option_dim=3 and option_index=26 the union option is [2, 2, 2]
55 |     def number_converter(self, number):
56 |         hex = self.num_agent
57 |         res = np.zeros(hex)
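        # (the loop below writes `number` in base (num_agent - 1): each remainder is one digit of the
        #  union option, collected least-significant first and reversed at the end, while the quotient
        #  is carried forward into the next iteration)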
58 |         index = 0
59 |         while True:
60 |             s = number // (hex - 1)  # quotient
61 |             y = number % (hex - 1)  # remainder
62 |             res[index] = y
63 |             if s == 0:
64 |                 break
65 |             number = s
66 |             index += 1
67 |         res = list(res)
68 |         res.reverse()
69 |         return res
70 | 
71 |     def get_option_list(self, i):
72 |         if i >= len(self.option_list):
73 |             raise IndexError('out of option_list memory!')
74 |         return self.option_list[i]
75 | 
--------------------------------------------------------------------------------
/util/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import numpy
3 | import time
4 | import threading
5 | 
6 | R = threading.Lock()
7 | 
8 | class Logger:
9 |     def __init__(self, log_name, graph_path, args):
10 |         # step 1: create a logger
11 |         self.logger = self.build_log(log_name)
12 |         self.build_tb_log(graph_path)
13 |         self.args = args
14 |         self.keys = dict()
15 |         if 'summary_output_times' in self.args.keys():
16 |             self.summary_times = self.args['summary_output_times']
17 |         else:
18 |             self.summary_times = 1
19 | 
20 | 
21 |     def build_log(self, name):
22 |         logger = logging.getLogger()
23 |         logger.setLevel(logging.INFO)  # master switch for the log level
24 |         # step 2: create a handler that writes to the log file
25 |         rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
26 |         log_name = name + '/out.log'
27 |         logfile = log_name
28 |         fh = logging.FileHandler(logfile, mode='w')
29 |         fh.setLevel(logging.DEBUG)  # log level for messages written to the file
30 |         # step 3: define the handler's output format
31 |         formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
32 |         fh.setFormatter(formatter)
33 |         # step 4: attach the handler to the logger
34 |         logger.addHandler(fh)
35 |         return logger
36 | 
37 |     def build_tb_log(self, path):
38 |         from tensorboard_logger import configure, log_value, log_histogram
39 |         configure(path)
40 |         self.tb_logger = log_value
41 |         self.tb_h_logger = log_histogram
42 | 
43 |     def write_tb_log(self, key, value, t):
44 |         if self.args['output_graph']:
45 |             if t % self.summary_times != 0:
46 |                 return
47 |             #print(key, value)
48 |             if type(value) is numpy.ndarray or type(value) is list:
49 |                 R.acquire()
50 |                 if key not in self.keys.keys():
51 |                     self.keys[key] = 0
52 |                 else:
53 |                     self.keys[key] += 1
54 |                 #print(type(value), key, value, self.keys[key])
55 |                 self.tb_h_logger(key, value, self.keys[key])
56 |                 R.release()
57 |             else:
58 |                 R.acquire()
59 |                 if key not in self.keys.keys():
60 |                     self.keys[key] = 0
61 |                 else:
62 |                     self.keys[key] += 1
63 |                 #print(key, value, self.keys[key])
64 |                 self.tb_logger(key, value, self.keys[key])
65 |                 R.release()
66 |         else:
67 |             return
68 | 
69 | 
70 |     def write_log(self, msg, type='info'):
71 |         if type == 'debug':
72 |             self.logger.debug(msg)
73 |         elif type == 'info':
74 |             self.logger.info(msg)
75 |         elif type == 'warning':
76 |             self.logger.warning(msg)
77 |         elif type == 'error':
78 |             self.logger.error(msg)
79 |         elif type == 'critical':
80 |             self.logger.critical(msg)
81 | 
82 | 
--------------------------------------------------------------------------------
/game/pacman/textDisplay.py:
--------------------------------------------------------------------------------
1 | # textDisplay.py
2 | # --------------
3 | # Licensing Information: You are free to use or extend these projects for
4 | # educational purposes provided that (1) you do not distribute or publish
5 | # solutions, (2) you retain this notice, and (3) you provide clear
6 | # attribution to UC Berkeley, including a link to http://ai.berkeley.edu.
7 | # 
8 | # Attribution Information: The Pacman AI projects were developed at UC Berkeley.
9 | # The core projects and autograders were primarily created by John DeNero 10 | # (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 11 | # Student side autograding was added by Brad Miller, Nick Hay, and 12 | # Pieter Abbeel (pabbeel@cs.berkeley.edu). 13 | 14 | 15 | import time 16 | try: 17 | import pacman 18 | except: 19 | pass 20 | 21 | DRAW_EVERY = 1 22 | SLEEP_TIME = 0 # This can be overwritten by __init__ 23 | DISPLAY_MOVES = False 24 | QUIET = False # Supresses output 25 | 26 | class NotGraphics: 27 | def initialize(self, state, isBlue = False): 28 | pass 29 | 30 | def update(self, state): 31 | pass 32 | 33 | def checkNullDisplay(self): 34 | return True 35 | 36 | def pause(self): 37 | time.sleep(SLEEP_TIME) 38 | 39 | def draw(self, state): 40 | pass 41 | 42 | def updateDistributions(self, dist): 43 | pass 44 | 45 | def finish(self): 46 | pass 47 | 48 | class NullGraphics: 49 | def initialize(self, state, isBlue = False): 50 | pass 51 | 52 | def update(self, state): 53 | pass 54 | 55 | def checkNullDisplay(self): 56 | return True 57 | 58 | def pause(self): 59 | time.sleep(SLEEP_TIME) 60 | 61 | def draw(self, state): 62 | print(state) 63 | 64 | def updateDistributions(self, dist): 65 | pass 66 | 67 | def finish(self): 68 | pass 69 | 70 | class PacmanGraphics: 71 | def __init__(self, speed=None): 72 | if speed != None: 73 | global SLEEP_TIME 74 | SLEEP_TIME = speed 75 | 76 | def initialize(self, state, isBlue = False): 77 | self.draw(state) 78 | self.pause() 79 | self.turn = 0 80 | self.agentCounter = 0 81 | 82 | def update(self, state): 83 | numAgents = len(state.agentStates) 84 | self.agentCounter = (self.agentCounter + 1) % numAgents 85 | if self.agentCounter == 0: 86 | self.turn += 1 87 | if DISPLAY_MOVES: 88 | ghosts = [pacman.nearestPoint(state.getGhostPosition(i)) for i in range(1, numAgents)] 89 | print(("%4d) P: %-8s" % (self.turn, str(pacman.nearestPoint(state.getPacmanPosition()))), '| Score: %-5d' % state.score, '| Ghosts:', ghosts)) 90 | if self.turn % DRAW_EVERY == 0: 91 | self.draw(state) 92 | self.pause() 93 | if state._win or state._lose: 94 | self.draw(state) 95 | 96 | def pause(self): 97 | time.sleep(SLEEP_TIME) 98 | 99 | def draw(self, state): 100 | print(state) 101 | 102 | def finish(self): 103 | pass 104 | -------------------------------------------------------------------------------- /alg/maddpg/trainer/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | class ReplayBuffer(object): 5 | def __init__(self, size): 6 | """Create Prioritized Replay buffer. 7 | 8 | Parameters 9 | ---------- 10 | size: int 11 | Max number of transitions to store in the buffer. When the buffer 12 | overflows the old memories are dropped. 
13 | """ 14 | self._storage = [] 15 | self._maxsize = int(size) 16 | self._next_idx = 0 17 | 18 | def __len__(self): 19 | return len(self._storage) 20 | 21 | def clear(self): 22 | self._storage = [] 23 | self._next_idx = 0 24 | 25 | def add(self, obs_t, action, reward, obs_tp1, option, term, done): 26 | data = (obs_t, action, reward, obs_tp1, option, term, done) 27 | 28 | if self._next_idx >= len(self._storage): 29 | self._storage.append(data) 30 | else: 31 | self._storage[self._next_idx] = data 32 | self._next_idx = (self._next_idx + 1) % self._maxsize 33 | 34 | def _encode_sample(self, idxes): 35 | obses_t, actions, rewards, obses_tp1, options, terms, dones = [], [], [], [], [], [], [] 36 | for i in idxes: 37 | data = self._storage[i] 38 | obs_t, action, reward, obs_tp1, option, term, done = data 39 | obses_t.append(np.array(obs_t, copy=False)) 40 | actions.append(np.array(action, copy=False)) 41 | rewards.append(reward) 42 | obses_tp1.append(np.array(obs_tp1, copy=False)) 43 | options.append(option) 44 | terms.append(term) 45 | dones.append(done) 46 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(options), np.array(terms), np.array(dones) 47 | 48 | def make_index(self, batch_size): 49 | return [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] 50 | 51 | def make_latest_index(self, batch_size): 52 | idx = [(self._next_idx - 1 - i) % self._maxsize for i in range(batch_size)] 53 | np.random.shuffle(idx) 54 | return idx 55 | 56 | def sample_index(self, idxes): 57 | return self._encode_sample(idxes) 58 | 59 | def sample(self, batch_size): 60 | """Sample a batch of experiences. 61 | 62 | Parameters 63 | ---------- 64 | batch_size: int 65 | How many transitions to sample. 66 | 67 | Returns 68 | ------- 69 | obs_batch: np.array 70 | batch of observations 71 | act_batch: np.array 72 | batch of actions executed given obs_batch 73 | rew_batch: np.array 74 | rewards received as results of executing act_batch 75 | next_obs_batch: np.array 76 | next set of observations seen after executing act_batch 77 | done_mask: np.array 78 | done_mask[i] = 1 if executing act_batch[i] resulted in 79 | the end of an episode and 0 otherwise. 
80 | """ 81 | if batch_size > 0: 82 | idxes = self.make_index(batch_size) 83 | else: 84 | idxes = range(0, len(self._storage)) 85 | return self._encode_sample(idxes) 86 | 87 | def collect(self): 88 | return self.sample(-1) 89 | -------------------------------------------------------------------------------- /util/ReplayBuffer.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import random 3 | 4 | class ReplayBuffer(object): 5 | 6 | def __init__(self, buffer_size): 7 | self.buffer_size = buffer_size 8 | self.num_experiences = 0 9 | self.buffer = deque() 10 | 11 | def get_batch(self, batch_size): 12 | # Randomly sample batch_size examples 13 | return random.sample(self.buffer, batch_size) 14 | 15 | def size(self): 16 | return self.buffer_size 17 | 18 | def add(self, state, action, reward, done, new_state, opa): 19 | experience = (state, action, reward, done, new_state, opa) 20 | if self.num_experiences < self.buffer_size: 21 | self.buffer.append(experience) 22 | self.num_experiences += 1 23 | else: 24 | self.buffer.popleft() 25 | self.buffer.append(experience) 26 | 27 | def count(self): 28 | # if buffer is full, return buffer size 29 | # otherwise, return experience counter 30 | return self.num_experiences 31 | 32 | def erase(self): 33 | self.buffer = deque() 34 | self.num_experiences = 0 35 | 36 | class ReplayBufferSR(object): 37 | 38 | def __init__(self, buffer_size): 39 | self.buffer_size = buffer_size 40 | self.num_experiences = 0 41 | self.buffer = deque() 42 | 43 | def get_batch(self, batch_size): 44 | # Randomly sample batch_size examples 45 | return random.sample(self.buffer, batch_size) 46 | 47 | def size(self): 48 | return self.buffer_size 49 | 50 | def add(self, state, action, reward, done, new_state, opa): 51 | experience = (state, action, reward, done, new_state, opa) 52 | if self.num_experiences < self.buffer_size: 53 | self.buffer.append(experience) 54 | self.num_experiences += 1 55 | else: 56 | self.buffer.popleft() 57 | self.buffer.append(experience) 58 | 59 | def count(self): 60 | # if buffer is full, return buffer size 61 | # otherwise, return experience counter 62 | return self.num_experiences 63 | 64 | def erase(self): 65 | self.buffer = deque() 66 | self.num_experiences = 0 67 | 68 | 69 | class ShareReplayBuffer(object): 70 | 71 | def __init__(self, buffer_size): 72 | self.buffer_size = buffer_size 73 | self.num_experiences = 0 74 | self.buffer = deque() 75 | 76 | def get_batch(self, batch_size): 77 | # Randomly sample batch_size examples 78 | return random.sample(self.buffer, batch_size) 79 | 80 | def size(self): 81 | return self.buffer_size 82 | 83 | def add(self, state, action, reward, done, new_state, opa, agentId): 84 | experience = (state, action, reward, done, new_state, opa, agentId) 85 | if self.num_experiences < self.buffer_size: 86 | self.buffer.append(experience) 87 | self.num_experiences += 1 88 | else: 89 | self.buffer.popleft() 90 | self.buffer.append(experience) 91 | 92 | def count(self): 93 | # if buffer is full, return buffer size 94 | # otherwise, return experience counter 95 | return self.num_experiences 96 | 97 | def erase(self): 98 | self.buffer = deque() 99 | self.num_experiences = 0 -------------------------------------------------------------------------------- /game/pacman/keyboardAgents.py: -------------------------------------------------------------------------------- 1 | # keyboardAgents.py 2 | # ----------------- 3 | # Licensing Information: You are free to use or 
extend these projects for 4 | # educational purposes provided that (1) you do not distribute or publish 5 | # solutions, (2) you retain this notice, and (3) you provide clear 6 | # attribution to UC Berkeley, including a link to http://ai.berkeley.edu. 7 | # 8 | # Attribution Information: The Pacman AI projects were developed at UC Berkeley. 9 | # The core projects and autograders were primarily created by John DeNero 10 | # (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 11 | # Student side autograding was added by Brad Miller, Nick Hay, and 12 | # Pieter Abbeel (pabbeel@cs.berkeley.edu). 13 | 14 | 15 | from game.pacman.game import Agent 16 | from game.pacman.game import Directions 17 | import random 18 | 19 | 20 | class KeyboardAgent(Agent): 21 | """ 22 | An agent controlled by the keyboard. 23 | """ 24 | # NOTE: Arrow keys also work. 25 | WEST_KEY = 'a' 26 | EAST_KEY = 'd' 27 | NORTH_KEY = 'w' 28 | SOUTH_KEY = 's' 29 | STOP_KEY = 'q' 30 | 31 | def __init__(self, index=0): 32 | 33 | self.lastMove = Directions.STOP 34 | self.index = index 35 | self.keys = [] 36 | 37 | def getAction(self, state): 38 | from game.pacman.graphicsUtils import keys_waiting 39 | from game.pacman.graphicsUtils import keys_pressed 40 | keys = keys_waiting() + keys_pressed() 41 | if keys != []: 42 | self.keys = keys 43 | 44 | legal = state.getLegalActions(self.index) 45 | move = self.getMove(legal) 46 | 47 | if move == Directions.STOP: 48 | # Try to move in the same direction as before 49 | if self.lastMove in legal: 50 | move = self.lastMove 51 | 52 | if (self.STOP_KEY in self.keys) and Directions.STOP in legal: 53 | move = Directions.STOP 54 | 55 | if move not in legal: 56 | move = random.choice(legal) 57 | 58 | self.lastMove = move 59 | return move 60 | 61 | def getMove(self, legal): 62 | move = Directions.STOP 63 | if (self.WEST_KEY in self.keys or 'Left' in self.keys) and Directions.WEST in legal: 64 | move = Directions.WEST 65 | if (self.EAST_KEY in self.keys or 'Right' in self.keys) and Directions.EAST in legal: 66 | move = Directions.EAST 67 | if (self.NORTH_KEY in self.keys or 'Up' in self.keys) and Directions.NORTH in legal: 68 | move = Directions.NORTH 69 | if (self.SOUTH_KEY in self.keys or 'Down' in self.keys) and Directions.SOUTH in legal: 70 | move = Directions.SOUTH 71 | return move 72 | 73 | 74 | class KeyboardAgent2(KeyboardAgent): 75 | """ 76 | A second agent controlled by the keyboard. 77 | """ 78 | # NOTE: Arrow keys also work. 
79 | WEST_KEY = 'j' 80 | EAST_KEY = "l" 81 | NORTH_KEY = 'i' 82 | SOUTH_KEY = 'k' 83 | STOP_KEY = 'u' 84 | 85 | def getMove(self, legal): 86 | move = Directions.STOP 87 | if (self.WEST_KEY in self.keys) and Directions.WEST in legal: 88 | move = Directions.WEST 89 | if (self.EAST_KEY in self.keys) and Directions.EAST in legal: 90 | move = Directions.EAST 91 | if (self.NORTH_KEY in self.keys) and Directions.NORTH in legal: 92 | move = Directions.NORTH 93 | if (self.SOUTH_KEY in self.keys) and Directions.SOUTH in legal: 94 | move = Directions.SOUTH 95 | return move 96 | -------------------------------------------------------------------------------- /game/pacman/ghostAgents.py: -------------------------------------------------------------------------------- 1 | # ghostAgents.py 2 | # -------------- 3 | # Licensing Information: You are free to use or extend these projects for 4 | # educational purposes provided that (1) you do not distribute or publish 5 | # solutions, (2) you retain this notice, and (3) you provide clear 6 | # attribution to UC Berkeley, including a link to http://ai.berkeley.edu. 7 | # 8 | # Attribution Information: The Pacman AI projects were developed at UC Berkeley. 9 | # The core projects and autograders were primarily created by John DeNero 10 | # (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 11 | # Student side autograding was added by Brad Miller, Nick Hay, and 12 | # Pieter Abbeel (pabbeel@cs.berkeley.edu). 13 | 14 | 15 | from game.pacman.game import Agent 16 | from game.pacman.game import Actions 17 | from game.pacman.game import Directions 18 | import random 19 | from game.pacman.util import manhattanDistance 20 | import game.pacman.util as util 21 | 22 | 23 | class GhostAgent(Agent): 24 | 25 | def __init__(self, index): 26 | self.index = index 27 | 28 | def getAction(self, state): 29 | dist = self.getDistribution(state) 30 | if len(dist) == 0: 31 | return Directions.STOP 32 | else: 33 | return util.chooseFromDistribution(dist) 34 | 35 | def getDistribution(self, state): 36 | "Returns a Counter encoding a distribution over actions from the provided state." 37 | util.raiseNotDefined() 38 | 39 | 40 | class RandomGhost(GhostAgent): 41 | "A ghost that chooses a legal action uniformly at random." 42 | 43 | def getDistribution(self, state): 44 | dist = util.Counter() 45 | for a in state.getLegalActions(self.index): 46 | dist[a] = 1.0 47 | dist.normalize() 48 | return dist 49 | 50 | 51 | class DirectionalGhost(GhostAgent): 52 | "A ghost that prefers to rush Pacman, or flee when scared." 
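    # getDistribution (below) puts prob_attack (or prob_scaredFlee when scared) on the legal action(s)
    # that minimise (or, when scared, maximise) the Manhattan distance to Pacman, and spreads the
    # remaining probability uniformly over all legal actions before normalising.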
53 | 54 | def __init__(self, index, prob_attack=0.8, prob_scaredFlee=0.8): 55 | self.index = index 56 | self.prob_attack = prob_attack 57 | self.prob_scaredFlee = prob_scaredFlee 58 | 59 | def getDistribution(self, state): 60 | # Read variables from state 61 | ghostState = state.getGhostState(self.index) 62 | legalActions = state.getLegalActions(self.index) 63 | pos = state.getGhostPosition(self.index) 64 | isScared = ghostState.scaredTimer > 0 65 | 66 | speed = 1 67 | if isScared: 68 | speed = 0.5 69 | 70 | actionVectors = [Actions.directionToVector( 71 | a, speed) for a in legalActions] 72 | newPositions = [(pos[0] + a[0], pos[1] + a[1]) for a in actionVectors] 73 | pacmanPosition = state.getPacmanPosition() 74 | 75 | # Select best actions given the state 76 | distancesToPacman = [manhattanDistance( 77 | pos, pacmanPosition) for pos in newPositions] 78 | if isScared: 79 | bestScore = max(distancesToPacman) 80 | bestProb = self.prob_scaredFlee 81 | else: 82 | bestScore = min(distancesToPacman) 83 | bestProb = self.prob_attack 84 | bestActions = [action for action, distance in zip( 85 | legalActions, distancesToPacman) if distance == bestScore] 86 | 87 | # Construct distribution 88 | dist = util.Counter() 89 | for a in bestActions: 90 | dist[a] = bestProb / len(bestActions) 91 | for a in legalActions: 92 | dist[a] += (1 - bestProb) / len(legalActions) 93 | dist.normalize() 94 | return dist 95 | -------------------------------------------------------------------------------- /game/pacman/make_env.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym 3 | import numpy as np 4 | from gym import spaces 5 | 6 | import game.pacman.layout as layout 7 | from game.pacman.pacman import readCommand 8 | from game.pacman.pacman import ClassicGameRules 9 | import game.pacman.textDisplay as textDisplay 10 | from game.pacman.ghostAgents import RandomGhost as Ghost 11 | 12 | 13 | class Agent: 14 | def __init__(self): 15 | self.name = '' 16 | 17 | def get(self): 18 | raise NotImplemented() 19 | 20 | 21 | class Wrap_pacman(): 22 | def __init__(self, args): 23 | # , layout, pacman, ghosts, display, numGames, record, numTraining = 0, catchExceptions = False, timeout = 30 24 | self.args = args 25 | self.layout = layout.getLayout(args['game_name']) 26 | self.rules = ClassicGameRules(self.args['timeout']) 27 | self.pacman = Agent() 28 | self.ghosts = [Agent() for i in range(self.layout.getNumGhosts())]#[Ghost(i+1) for i in range(self.layout.getNumGhosts())] # [Agent() for i in range(self.layout.getNumGhosts())] 29 | if self.args['quietGraphics']: 30 | display = textDisplay.NullGraphics() 31 | elif self.args['textGraphics']: 32 | textDisplay.SLEEP_TIME = self.args['frameTime'] 33 | display = textDisplay.PacmanGraphics() 34 | else: 35 | import game.pacman.graphicsDisplay as graphicsDisplay 36 | display = graphicsDisplay.PacmanGraphics(self.args['zoom'], frameTime=self.args['frameTime']) 37 | self.beQuiet = False 38 | self.textDisplay = textDisplay.NotGraphics() 39 | self.videoDisplay = display 40 | self.rules.quiet = True 41 | self.catchExceptions = self.args['catchExceptions'] 42 | self.done = True 43 | 44 | self.action2str = ['North', 'South', 'East', 'West', 'Stop'] 45 | self.game = self.rules.newGame(self.layout, self.pacman, self.ghosts, display, self.beQuiet, 46 | self.catchExceptions) 47 | 48 | # gym-like info 49 | self.n = len(self.game.agents) 50 | self.action_space = [spaces.Discrete(len(self.action2str)) for i in range(self.n)] 51 | 
self.observation_space = [spaces.Box(low=0, high=1, shape=((self.layout.width + self.layout.height) * self.n + 18,), dtype=np.float32) if i == 0 else 52 | spaces.Box(low=0, high=1, shape=((self.layout.width + self.layout.height) * 2,), dtype=np.float32) 53 | for i in range(self.n)] 54 | 55 | def step(self, actions, done=None): 56 | assert not self.done, 'done! step after reset' 57 | actions = [np.argmax(a) for a in actions] 58 | actions = [self.action2str[action] for action in actions] 59 | # ghost_action = [] 60 | # ghost_action.append(actions[0]) 61 | # for ghost in self.ghosts: 62 | # action = ghost.getAction(self.game.state) 63 | # ghost_action.append(action) 64 | # print(ghost_action) 65 | state, reward, done, info = self.game.step(actions) 66 | self.done = done 67 | done = [done for i in range(self.n)] 68 | return state, reward, done, info 69 | 70 | def reset(self, render=False): 71 | del self.game 72 | del self.rules 73 | del self.pacman 74 | del self.ghosts 75 | 76 | self.pacman = Agent() 77 | self.ghosts = [Agent() for i in range(self.layout.getNumGhosts())]#[Ghost(i+1) for i in range(self.layout.getNumGhosts())] 78 | 79 | self.rules = ClassicGameRules(self.args['timeout']) 80 | self.rules.quiet = True 81 | 82 | if render: 83 | display = self.videoDisplay 84 | self.rules.quiet = False 85 | else: 86 | display = self.textDisplay 87 | self.rules.quiet = True 88 | 89 | self.game = self.rules.newGame(self.layout, self.pacman, self.ghosts, display, self.beQuiet, 90 | self.catchExceptions) 91 | 92 | self.done = False 93 | 94 | return self.game.reset(render=render) 95 | 96 | def render(self): 97 | pass 98 | 99 | 100 | def runGames(args): 101 | env = Wrap_pacman(args) 102 | return env 103 | 104 | 105 | def runGames_2(layout, pacman, ghosts, display, numGames, record, numTraining=0, catchExceptions=False, timeout=30): 106 | 107 | rules = ClassicGameRules(timeout) 108 | games = [] 109 | 110 | for i in range(numGames): 111 | beQuiet = i < numTraining 112 | 113 | gameDisplay = textDisplay.NullGraphics() 114 | rules.quiet = True 115 | 116 | # render 117 | # gameDisplay = display 118 | # rules.quiet = False 119 | 120 | game = rules.newGame(layout, pacman, ghosts, 121 | gameDisplay, beQuiet, catchExceptions) 122 | game.run() 123 | 124 | return games 125 | 126 | 127 | def make_env(args): 128 | #args = readCommand(sys.argv[1:]) # Get game components based on input 129 | #print(args) 130 | return runGames(args) 131 | # runGames_2(**args) 132 | # return env -------------------------------------------------------------------------------- /game/particle/make_env.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code for creating a multiagent environment with one of the scenarios listed 3 | in ./scenarios/. 4 | Can be called by using, for example: 5 | env = make_env('simple_speaker_listener') 6 | After producing the env object, can be used similarly to an OpenAI gym 7 | environment. 8 | 9 | A policy using this environment must output actions in the form of a list 10 | for all agents. Each element of the list should be a numpy array, 11 | of size (env.world.dim_p + env.world.dim_c, 1). Physical actions precede 12 | communication actions in this array. See environment.py for more details. 13 | """ 14 | import numpy as np 15 | import time 16 | 17 | 18 | def make_env(args): 19 | scenario_name = args['game_name'] 20 | ''' 21 | Creates a MultiAgentEnv object as env. This can be used similar to a gym 22 | environment by calling env.reset() and env.step(). 
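    In this repository the options are passed in through a single ``args`` dict
    (e.g. args['game_name'], args['benchmark'], args['obs_sort'], args['reward_func'],
    args['num_good'], args['num_adversaries']); see the __main__ block at the
    bottom of this file for a runnable example of building such a dict.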
23 | Use env.render() to view the environment on the screen. 24 | 25 | Input: 26 | scenario_name : name of the scenario from ./scenarios/ to be Returns 27 | (without the .py extension) 28 | benchmark : whether you want to produce benchmarking data 29 | (usually only done during evaluation) 30 | 31 | Some useful env properties (see environment.py): 32 | .observation_space : Returns the observation space for each agent 33 | .action_space : Returns the action space for each agent 34 | .n : Returns the number of Agents 35 | ''' 36 | from game.particle.multiagent.environment import MultiAgentEnv 37 | import game.particle.multiagent.scenarios as scenarios 38 | 39 | # load scenario from script 40 | scenario = scenarios.load(scenario_name + ".py").Scenario() 41 | # create world 42 | world = scenario.make_world(args) 43 | # create multiagent environment 44 | if args['benchmark'] and not args['obs_sort']: 45 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, scenario.benchmark_data, args=args) 46 | elif not args['benchmark'] and args['obs_sort']: 47 | if args["reward_func"] == "reward2": 48 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward2, scenario.observation, scenario.observation_sort, scenario.is_done2, args=args) 49 | elif args["reward_func"] == "reward3": 50 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward3, scenario.observation3, scenario.observation_sort3, scenario.is_done3, args=args) 51 | else: 52 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, 53 | scenario.observation_sort, scenario.is_done, args=args) 54 | elif not args['benchmark'] and not args['obs_sort']: 55 | if args["reward_func"] == "reward2": 56 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward2, scenario.observation, args=args) 57 | elif args["reward_func"] == "reward3": 58 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward3, scenario.observation3, args=args) 59 | else: 60 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, args=args) 61 | else: 62 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward2, scenario.observation, args=args) 63 | return env 64 | 65 | 66 | # test 67 | def action(obs): 68 | if env.discrete_action_space: 69 | i = np.random.randint(0, 5) 70 | u = np.zeros(5) 71 | u[i] = 1 72 | else: 73 | u = np.array([(np.random.random() - 0.5) * 2, (np.random.random() - 0.5) * 2]) 74 | return u 75 | 76 | 77 | if __name__ == '__main__': 78 | args = dict() 79 | args['game_name'] = "simple_spread_old" 80 | args['benchmark'] = False 81 | args['obs_sort'] = False 82 | args['reward_func'] = 'reward' 83 | args['restrict_move'] = True 84 | args['num_adversaries'] = 0 85 | args['num_good'] = 6 86 | env = make_env(args) 87 | print(env.action_space) 88 | env.render() 89 | # create interactive policies for each agent 90 | # execution loop 91 | obs_n = env.reset() 92 | print(env.observation_space) 93 | print(env.action_space) 94 | 95 | for ep in range(100): 96 | obs_n = env.reset() 97 | step = 0 98 | reward = np.zeros(env.n) 99 | done = [False for i in range(env.n)] 100 | while True: 101 | # query for action from each agent's policy 102 | act_n = [] 103 | for i in range(env.n): 104 | act_n.append(action(obs_n[i])) 105 | #print(act_n) 106 | #print(act_n) 107 | # step environment 108 | obs_n, reward_n, done_n, _ = env.step(act_n) 109 | for i in range(env.n): 110 | if not done[i]: 111 | done[i] = done_n[i] 112 | reward += reward_n 113 | 
#print(obs_n) 114 | # render all agent views 115 | #time.sleep(0.1) 116 | env.render() 117 | step += 1 118 | if step > 100 or all((done_n[i] is True for i in range(env.n))): 119 | print(step, reward, done_n) 120 | break 121 | 122 | 123 | -------------------------------------------------------------------------------- /game/particle/multiagent/scenarios/simple_spread.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from game.particle.multiagent.core import World, Agent, Landmark 3 | from game.particle.multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self, args=None): 8 | world = World() 9 | # set any world properties first 10 | world.dim_c = 2 11 | num_agents = 10 12 | num_landmarks = 10 13 | if args is not None and args['num_good'] != 0: 14 | num_landmarks = args['num_good'] 15 | num_agents = num_landmarks 16 | world.cam_range = 4 17 | world.collaborative = False 18 | # add agents 19 | world.agents = [Agent() for i in range(num_agents)] 20 | for i, agent in enumerate(world.agents): 21 | agent.name = 'agent %d' % i 22 | agent.collide = True 23 | agent.silent = True 24 | #agent.size = 0.15 25 | # add landmarks 26 | world.landmarks = [Landmark() for i in range(num_landmarks)] 27 | for i, landmark in enumerate(world.landmarks): 28 | landmark.name = 'landmark %d' % i 29 | landmark.collide = False 30 | landmark.movable = False 31 | # make initial conditions 32 | self.reset_world(world) 33 | return world 34 | 35 | def reset_world(self, world): 36 | # random properties for agents 37 | for i, agent in enumerate(world.agents): 38 | agent.color = np.array([0.35, 0.35, 0.85]) 39 | # random properties for landmarks 40 | for i, landmark in enumerate(world.landmarks): 41 | landmark.color = np.array([0.25, 0.25, 0.25]) 42 | # set random initial states 43 | for agent in world.agents: 44 | agent.state.p_pos = np.random.uniform(-world.cam_range, +world.cam_range, world.dim_p) 45 | agent.state.p_vel = np.zeros(world.dim_p) 46 | agent.state.c = np.zeros(world.dim_c) 47 | for i, landmark in enumerate(world.landmarks): 48 | landmark.state.p_pos = np.random.uniform(-world.cam_range, +world.cam_range, world.dim_p) 49 | landmark.state.p_vel = np.zeros(world.dim_p) 50 | 51 | def benchmark_data(self, agent, world): 52 | rew = 0 53 | collisions = 0 54 | occupied_landmarks = 0 55 | min_dists = 0 56 | for l in world.landmarks: 57 | dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents] 58 | min_dists += min(dists) 59 | rew -= min(dists) 60 | if min(dists) < 0.1: 61 | occupied_landmarks += 1 62 | if agent.collide: 63 | for a in world.agents: 64 | if self.is_collision(a, agent): 65 | rew -= 1 66 | collisions += 1 67 | return (rew, collisions, min_dists, occupied_landmarks) 68 | 69 | 70 | def is_collision(self, agent1, agent2): 71 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 72 | dist = np.sqrt(np.sum(np.square(delta_pos))) 73 | dist_min = agent1.size + agent2.size 74 | return True if dist < dist_min else False 75 | 76 | # def reward(self, agent, world): 77 | # # Agents are rewarded based on minimum agent distance to each landmark, penalized for collisions 78 | # rew = 0 79 | # 80 | # # for l in world.landmarks: 81 | # # dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents] 82 | # # rew -= min(dists) 83 | # dists = [np.sqrt(np.sum(np.square(agent.state.p_pos - l.state.p_pos))) for l in world.landmarks] 84 | # rew -= min(dists) 85 | 
# if agent.collide: 86 | # for a in world.agents: 87 | # if self.is_collision(a, agent): 88 | # rew -= 1 89 | # return rew 90 | def reward(self, agent, world): 91 | # Agents are rewarded based on minimum agent distance to each landmark, penalized for collisions 92 | rew = 0 93 | agentIndex = 0 94 | for i, a in enumerate(world.agents): 95 | if a.name == agent.name: 96 | agentIndex = i 97 | break 98 | dists = np.sqrt(np.sum(np.square(agent.state.p_pos - world.landmarks[agentIndex].state.p_pos))) 99 | # if self.is_collision(agent, world.landmarks[agentIndex]): 100 | # rew = 1 101 | rew -= dists 102 | return rew 103 | 104 | def observation(self, agent, world): 105 | # get positions of all entities in this agent's reference frame 106 | entity_pos = [] 107 | for entity in world.landmarks: # world.entities: 108 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 109 | # entity colors 110 | entity_color = [] 111 | for entity in world.landmarks: # world.entities: 112 | entity_color.append(entity.color) 113 | # communication of all other agents 114 | comm = [] 115 | other_pos = [] 116 | for other in world.agents: 117 | if other is agent: continue 118 | comm.append(other.state.c) 119 | other_pos.append(other.state.p_pos - agent.state.p_pos) 120 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + comm) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MAPTF 2 | 3 | Source code for paper: An Efficient Transfer Learning Framework for Multiagent Reinforcement Learning 4 | 5 | * [MAPTF code](#MAPTF code) 6 | * [Installation](#Installation) 7 | * [Run an experiment](#Run an experiment) 8 | * [Example](#Example) 9 | * [Results](#results) 10 | * [Configuration](#Configuration) 11 | * [Operating parameters](#Operating parameters) 12 | * [Core parameters](#Core parameters) 13 | * [Some experiences setting in paper](#Some experiences setting in paper) 14 | * [In BibTeX format](#In BibTeX format) 15 | 16 | ## MAPTF code 17 | * MAPTF 18 | * alg (multiagent polices) 19 | * maddpg 20 | * muti_ptf_ppo 21 | * sharing_multi_ppo 22 | * option 23 | * config (Configuration parameters of each polices) 24 | * maddpg_conf (including maddpg and maddpg_sr) 25 | * ppo_config (including ppo sro shppo and shsro) 26 | * particle_conf (Configuration of particle game ) 27 | * pacman_conf (Configuration of pacman game) 28 | * run (execute the tasks) 29 | * run_maddpg_sr (including maddpg and maddpg_sr) 30 | * run_multi_ptf_ppo_sro (including ppo sro) 31 | * run_multi_ptf_shppo_sro (including shppo and shsro) 32 | * source (opponent policies) 33 | * util 34 | * main (entry function) 35 | 36 | ## Installation 37 | python==3.6.5 38 | 39 | pip install -r requirements.txt 40 | 41 | ## Running Example 42 | 43 | #### Example 44 | 45 | ``` 46 | #MAPTF-PPO Pacman 47 | python main.py -a multi_ppo -c ppo_conf -g pacman -d pacman_conf game_name=originalClassic num_adversaries=1 adv_load_model=True adv_load_model_path=source/pacman/original/0/model 48 | ``` 49 | ``` 50 | #MAPTF-PPO Predator-prey 4 51 | python main.py -a multi_ppo -c ppo_conf -g particle -d particle_conf game_name=simple_tag num_adversaries=3 good_use_option=False good_load_model=True good_load_model_path=source/simple_tag/tag4/model_30000 c1=0.001 52 | ``` 53 | some logs will be shown below: 54 | ``` 55 | INFO:tensorflow:Restoring parameters from source/pacman/original/0/model_0.ckpt 56 | win : [False, 
False, False, False], step : 100, discounted_reward : [ 0.61213843 -0.63762798 -0.63762798 -0.63762798], discount_reward_mean : [ 0.61213843 -0.63762798 -0.63762798 -0.63762798], undiscounted_reward : [ 0.31 -1.01 -1.01 -1.01], reward_mean : [ 0.31 -1.01 -1.01 -1.01], episode : 0, 57 | win : [False, False, False, False], step : 100, discounted_reward : [ 0.58945708 -0.63762798 -0.63762798 -0.63762798], discount_reward_mean : [ 0.60079775 -0.63762798 -0.63762798 -0.63762798], undiscounted_reward : [ 0.31 -1.01 -1.01 -1.01], reward_mean : [ 0.31 -1.01 -1.01 -1.01], episode : 1, 58 | ``` 59 | 60 | #### Results 61 | 62 | All results are stored in the `results/alg_name/game_type/game_name/time` folder; each run folder contains `graph`, `log`, `model`, `output`, `args.json` and `command.txt`. 63 | 64 | If you do not want to save `graph` and `model`, set the parameter `save_model=False`. 65 | * `graph`: run `tensorboard --logdir=path` to inspect the TensorFlow graph and the losses. 66 | * `log`: the output printed to the terminal. 67 | * `model`: models saved every `save_per_episodes` episodes. 68 | * `output.json`: reward results. 69 | * `args.json`: stores all parameters. 70 | * `command.txt`: the shell command used to launch the run. 71 | 72 | ## Source Policy 73 | 74 | Source policies are pre-trained opponent policies. For example, in Pac-Man the Pac-Man agent is the opponent and its policy is a pre-trained PPO; in predator-prey, the blue circle agents are pre-trained with PPO. Test mode (`-t` together with `load_model`) reloads a model so that it can be rendered. 75 | 76 | ## Configuration 77 | 78 | The config files act as defaults for an algorithm or environment. 79 | 80 | They are all located in `config`. 81 | 82 | #### Operating parameters 83 | 84 | Taking the example above: 85 | * `-a multi_ppo`: choose an algorithm. 86 | * `-c ppo_conf`: choose the corresponding algorithm configuration. 87 | * `-g pacman`: game type. 88 | * `-d pacman_conf`: game configuration. 89 | * `-t`: evaluate the results by setting `-t True`; the default is `-t False` (see the sketch after this list). 90 | * `game_name=originalClassic`: choose a game environment. 91 | * `num_adversaries=1`: set as needed. 92 | * `adv_load_model=True adv_load_model_path=source/pacman/original/0/model`: load a source policy. 93 | * `adv_use_option, good_use_option`: enable the option module by setting them to `True`; `False` is the default. When learning ppo, shppo and maddpg keep them `False`, otherwise set them to `True` as needed.
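
For evaluation, a hypothetical command is sketched below. It simply re-uses the Pac-Man training command from the example above and adds `-t True`; whether a matching trained model is available to reload depends on your own setup.

```
# evaluation sketch: the Pac-Man training example plus -t True
python main.py -a multi_ppo -c ppo_conf -g pacman -d pacman_conf -t True game_name=originalClassic num_adversaries=1 adv_load_model=True adv_load_model_path=source/pacman/original/0/model
```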
94 | 95 | #### Core parameters 96 | 97 | Default: 98 | * `option_layer_1=128, option_layer_2=128` 99 | * `learning_rate_r=0.0003` 100 | * `embedding_dim=32` 101 | * `option_embedding_layer=64` 102 | * `recon_loss_coef=0.1` 103 | * `option_batch_size=32` 104 | * `c1=0.005` 105 | * `e_greedy_increment=0.001` 106 | * `learning_rate_o=0.00001, learning_rate_t=0.00001` 107 | * `xi=0.005` 108 | 109 | #### Some experiences setting in paper 110 | ``` 111 | #ppo+sro, game type=pacman, game environment=mediumClassic 112 | c1=0.005 113 | ``` 114 | ``` 115 | #ppo+sro, game type=pacman, game environment=originalClassic 116 | option_batch_size=128 117 | c1=0.0005 118 | ``` 119 | ``` 120 | #maddpg+sro, game type=particle, game environment=simple_tag 121 | option_layer_1=128 option_layer_2=128 122 | learning_rate_o=0.00001 learning_rate_t=0.00001 123 | c1=0.005 124 | xi=0 125 | ``` 126 | ``` 127 | #ppo+sro, game type=particle, game environment=simple_tag 128 | option_layer_1=32 option_layer_2=32 129 | c1=0.1 130 | option_batch_size=128 131 | ``` 132 | ``` 133 | #shsro, game type=particle, game environment=simple_tag 134 | option_layer_1=32 option_layer_2=32 135 | c1=0.1 136 | ``` 137 | 138 | MADDPG code follows: https://github.com/openai/maddpg 139 | 140 | ## In BibTeX format: 141 | 142 | ```tex 143 | @article{yang2021efficient, 144 | title={An Efficient Transfer Learning Framework for Multiagent Reinforcement Learning}, 145 | author={Yang, Tianpei and Wang, Weixun and Tang, Hongyao and Hao, Jianye and Meng, Zhaopeng and Mao, Hangyu and Li, Dong and Liu, Wulong and Chen, Yingfeng and Hu, Yujing and others}, 146 | journal={Advances in Neural Information Processing Systems}, 147 | volume={34}, 148 | year={2021} 149 | } 150 | ``` 151 | -------------------------------------------------------------------------------- /game/particle/README.md: -------------------------------------------------------------------------------- 1 | **Status:** Archive (code is provided as-is, no updates expected) 2 | 3 | # Multi-Agent Particle Environment 4 | 5 | A simple multi-agent particle world with a continuous observation and discrete action space, along with some basic simulated physics. 6 | Used in the paper [Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments](https://arxiv.org/pdf/1706.02275.pdf). 7 | 8 | ## Getting started: 9 | 10 | - To install, `cd` into the root directory and type `pip install -e .` 11 | 12 | - To interactively view moving to landmark scenario (see others in ./scenarios/): 13 | `bin/interactive.py --scenario simple.py` 14 | 15 | - Known dependencies: Python (3.5.4), OpenAI gym (0.10.5), numpy (1.14.5) 16 | 17 | - To use the environments, look at the code for importing them in `make_env.py`. 18 | 19 | ## Code structure 20 | 21 | - `make_env.py`: contains code for importing a multiagent environment as an OpenAI Gym-like object. 22 | 23 | - `./multiagent/environment.py`: contains code for environment simulation (interaction physics, `_step()` function, etc.) 24 | 25 | - `./multiagent/core.py`: contains classes for various objects (Entities, Landmarks, Agents, etc.) that are used throughout the code. 26 | 27 | - `./multiagent/rendering.py`: used for displaying agent behaviors on the screen. 28 | 29 | - `./multiagent/policy.py`: contains code for interactive policy based on keyboard input. 30 | 31 | - `./multiagent/scenario.py`: contains base scenario object that is extended for all scenarios. 32 | 33 | - `./multiagent/scenarios/`: folder where various scenarios/ environments are stored. 
scenario code consists of several functions: 34 | 1) `make_world()`: creates all of the entities that inhabit the world (landmarks, agents, etc.), assigns their capabilities (whether they can communicate, or move, or both). 35 | called once at the beginning of each training session 36 | 2) `reset_world()`: resets the world by assigning properties (position, color, etc.) to all entities in the world 37 | called before every episode (including after make_world() before the first episode) 38 | 3) `reward()`: defines the reward function for a given agent 39 | 4) `observation()`: defines the observation space of a given agent 40 | 5) (optional) `benchmark_data()`: provides diagnostic data for policies trained on the environment (e.g. evaluation metrics) 41 | 42 | ### Creating new environments 43 | 44 | You can create new scenarios by implementing the first 4 functions above (`make_world()`, `reset_world()`, `reward()`, and `observation()`). 45 | 46 | ## List of environments 47 | 48 | 49 | | Env name in code (name in paper) | Communication? | Competitive? | Notes | 50 | | --- | --- | --- | --- | 51 | | `simple.py` | N | N | Single agent sees landmark position, rewarded based on how close it gets to landmark. Not a multiagent environment -- used for debugging policies. | 52 | | `simple_adversary.py` (Physical deception) | N | Y | 1 adversary (red), N good agents (green), N landmarks (usually N=2). All agents observe position of landmarks and other agents. One landmark is the ‘target landmark’ (colored green). Good agents rewarded based on how close one of them is to the target landmark, but negatively rewarded if the adversary is close to target landmark. Adversary is rewarded based on how close it is to the target, but it doesn’t know which landmark is the target landmark. So good agents have to learn to ‘split up’ and cover all landmarks to deceive the adversary. | 53 | | `simple_crypto.py` (Covert communication) | Y | Y | Two good agents (alice and bob), one adversary (eve). Alice must sent a private message to bob over a public channel. Alice and bob are rewarded based on how well bob reconstructs the message, but negatively rewarded if eve can reconstruct the message. Alice and bob have a private key (randomly generated at beginning of each episode), which they must learn to use to encrypt the message. | 54 | | `simple_push.py` (Keep-away) | N |Y | 1 agent, 1 adversary, 1 landmark. Agent is rewarded based on distance to landmark. Adversary is rewarded if it is close to the landmark, and if the agent is far from the landmark. So the adversary learns to push agent away from the landmark. | 55 | | `simple_reference.py` | Y | N | 2 agents, 3 landmarks of different colors. Each agent wants to get to their target landmark, which is known only by other agent. Reward is collective. So agents have to learn to communicate the goal of the other agent, and navigate to their landmark. This is the same as the simple_speaker_listener scenario where both agents are simultaneous speakers and listeners. | 56 | | `simple_speaker_listener.py` (Cooperative communication) | Y | N | Same as simple_reference, except one agent is the ‘speaker’ (gray) that does not move (observes goal of other agent), and other agent is the listener (cannot speak, but must navigate to correct landmark).| 57 | | `simple_spread.py` (Cooperative navigation) | N | N | N agents, N landmarks. Agents are rewarded based on how far any agent is from each landmark. Agents are penalized if they collide with other agents. 
So, agents have to learn to cover all the landmarks while avoiding collisions. | 58 | | `simple_tag.py` (Predator-prey) | N | Y | Predator-prey environment. Good agents (green) are faster and want to avoid being hit by adversaries (red). Adversaries are slower and want to hit good agents. Obstacles (large black circles) block the way. | 59 | | `simple_world_comm.py` | Y | Y | Environment seen in the video accompanying the paper. Same as simple_tag, except (1) there is food (small blue balls) that the good agents are rewarded for being near, (2) we now have ‘forests’ that hide agents inside from being seen from outside; (3) there is a ‘leader adversary” that can see the agents at all times, and can communicate with the other adversaries to help coordinate the chase. | 60 | 61 | ## Paper citation 62 | 63 | If you used this environment for your experiments or found it helpful, consider citing the following papers: 64 | 65 | Environments in this repo: 66 |
67 | @article{lowe2017multi,
68 |   title={Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments},
69 |   author={Lowe, Ryan and Wu, Yi and Tamar, Aviv and Harb, Jean and Abbeel, Pieter and Mordatch, Igor},
70 |   journal={Neural Information Processing Systems (NIPS)},
71 |   year={2017}
72 | }
73 | 
74 | 75 | Original particle world environment: 76 |
77 | @article{mordatch2017emergence,
78 |   title={Emergence of Grounded Compositional Language in Multi-Agent Populations},
79 |   author={Mordatch, Igor and Abbeel, Pieter},
80 |   journal={arXiv preprint arXiv:1703.04908},
81 |   year={2017}
82 | }
83 | 
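
As a complement to the "Creating new environments" section above, here is a minimal scenario sketch. It follows the shape of the existing scenarios in this repository (e.g. `simple_spread.py`); the single agent, single landmark and distance-based reward are purely illustrative, and the module (saved as, say, `my_scenario.py` under `./multiagent/scenarios/`) must expose a class named `Scenario`, since `make_env.py` loads scenarios via `scenarios.load(name + ".py").Scenario()`.

```
# my_scenario.py -- hypothetical example, not part of the repository
import numpy as np
from game.particle.multiagent.core import World, Agent, Landmark
from game.particle.multiagent.scenario import BaseScenario


class Scenario(BaseScenario):
    def make_world(self, args=None):
        world = World()
        world.cam_range = 1
        # one movable agent and one fixed landmark, just for illustration
        world.agents = [Agent() for _ in range(1)]
        world.landmarks = [Landmark() for _ in range(1)]
        for i, agent in enumerate(world.agents):
            agent.name = 'agent %d' % i
            agent.collide = False
            agent.silent = True
        for i, landmark in enumerate(world.landmarks):
            landmark.name = 'landmark %d' % i
            landmark.collide = False
            landmark.movable = False
        self.reset_world(world)
        return world

    def reset_world(self, world):
        for agent in world.agents:
            agent.color = np.array([0.35, 0.35, 0.85])
            agent.state.p_pos = np.random.uniform(-world.cam_range, +world.cam_range, world.dim_p)
            agent.state.p_vel = np.zeros(world.dim_p)
            agent.state.c = np.zeros(world.dim_c)
        for landmark in world.landmarks:
            landmark.color = np.array([0.25, 0.25, 0.25])
            landmark.state.p_pos = np.random.uniform(-world.cam_range, +world.cam_range, world.dim_p)
            landmark.state.p_vel = np.zeros(world.dim_p)

    def reward(self, agent, world):
        # illustrative reward: negative distance to the single landmark
        return -np.sqrt(np.sum(np.square(agent.state.p_pos - world.landmarks[0].state.p_pos)))

    def observation(self, agent, world):
        # own velocity and position plus relative landmark positions
        entity_pos = [l.state.p_pos - agent.state.p_pos for l in world.landmarks]
        return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos)
```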
84 | -------------------------------------------------------------------------------- /game/pacman/layout.py: -------------------------------------------------------------------------------- 1 | # layout.py 2 | # --------- 3 | # Licensing Information: You are free to use or extend these projects for 4 | # educational purposes provided that (1) you do not distribute or publish 5 | # solutions, (2) you retain this notice, and (3) you provide clear 6 | # attribution to UC Berkeley, including a link to http://ai.berkeley.edu. 7 | # 8 | # Attribution Information: The Pacman AI projects were developed at UC Berkeley. 9 | # The core projects and autograders were primarily created by John DeNero 10 | # (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 11 | # Student side autograding was added by Brad Miller, Nick Hay, and 12 | # Pieter Abbeel (pabbeel@cs.berkeley.edu). 13 | 14 | 15 | from game.pacman.util import manhattanDistance 16 | from game.pacman.game import Grid 17 | import os 18 | import random 19 | from functools import reduce 20 | 21 | VISIBILITY_MATRIX_CACHE = {} 22 | 23 | 24 | class Layout: 25 | """ 26 | A Layout manages the static information about the game board. 27 | """ 28 | 29 | def __init__(self, layoutText): 30 | self.width = len(layoutText[0]) 31 | self.height = len(layoutText) 32 | self.walls = Grid(self.width, self.height, False) 33 | self.food = Grid(self.width, self.height, False) 34 | self.capsules = [] 35 | self.agentPositions = [] 36 | self.numGhosts = 0 37 | self.processLayoutText(layoutText) 38 | self.layoutText = layoutText 39 | self.totalFood = len(self.food.asList()) 40 | # self.initializeVisibilityMatrix() 41 | 42 | def getNumGhosts(self): 43 | return self.numGhosts 44 | 45 | def initializeVisibilityMatrix(self): 46 | global VISIBILITY_MATRIX_CACHE 47 | if reduce(str.__add__, self.layoutText) not in VISIBILITY_MATRIX_CACHE: 48 | from game import Directions 49 | vecs = [(-0.5, 0), (0.5, 0), (0, -0.5), (0, 0.5)] 50 | dirs = [Directions.NORTH, Directions.SOUTH, 51 | Directions.WEST, Directions.EAST] 52 | vis = Grid(self.width, self.height, {Directions.NORTH: set(), Directions.SOUTH: set( 53 | ), Directions.EAST: set(), Directions.WEST: set(), Directions.STOP: set()}) 54 | for x in range(self.width): 55 | for y in range(self.height): 56 | if self.walls[x][y] == False: 57 | for vec, direction in zip(vecs, dirs): 58 | dx, dy = vec 59 | nextx, nexty = x + dx, y + dy 60 | while (nextx + nexty) != int(nextx) + int(nexty) or not self.walls[int(nextx)][int(nexty)]: 61 | vis[x][y][direction].add((nextx, nexty)) 62 | nextx, nexty = x + dx, y + dy 63 | self.visibility = vis 64 | VISIBILITY_MATRIX_CACHE[reduce(str.__add__, self.layoutText)] = vis 65 | else: 66 | self.visibility = VISIBILITY_MATRIX_CACHE[ 67 | reduce(str.__add__, self.layoutText)] 68 | 69 | def isWall(self, pos): 70 | x, col = pos 71 | return self.walls[x][col] 72 | 73 | def getRandomLegalPosition(self): 74 | x = random.choice(list(range(self.width))) 75 | y = random.choice(list(range(self.height))) 76 | while self.isWall((x, y)): 77 | x = random.choice(list(range(self.width))) 78 | y = random.choice(list(range(self.height))) 79 | return (x, y) 80 | 81 | def getRandomCorner(self): 82 | poses = [(1, 1), (1, self.height - 2), (self.width - 2, 1), 83 | (self.width - 2, self.height - 2)] 84 | return random.choice(poses) 85 | 86 | def getFurthestCorner(self, pacPos): 87 | poses = [(1, 1), (1, self.height - 2), (self.width - 2, 1), 88 | (self.width - 2, self.height - 2)] 89 | dist, pos = 
max([(manhattanDistance(p, pacPos), p) for p in poses]) 90 | return pos 91 | 92 | def isVisibleFrom(self, ghostPos, pacPos, pacDirection): 93 | row, col = [int(x) for x in pacPos] 94 | return ghostPos in self.visibility[row][col][pacDirection] 95 | 96 | def __str__(self): 97 | return "\n".join(self.layoutText) 98 | 99 | def deepCopy(self): 100 | return Layout(self.layoutText[:]) 101 | 102 | def processLayoutText(self, layoutText): 103 | """ 104 | Coordinates are flipped from the input format to the (x,y) convention here 105 | 106 | The shape of the maze. Each character 107 | represents a different type of object. 108 | % - Wall 109 | . - Food 110 | o - Capsule 111 | G - Ghost 112 | P - Pacman 113 | Other characters are ignored. 114 | """ 115 | maxY = self.height - 1 116 | for y in range(self.height): 117 | for x in range(self.width): 118 | layoutChar = layoutText[maxY - y][x] 119 | self.processLayoutChar(x, y, layoutChar) 120 | # (x1, y1) = self.getRandomLegalPosition() 121 | # self.agentPositions.append((0, (x1, y1))) 122 | # for i in range(self.numGhosts): 123 | # (x1, y1) = self.getRandomLegalPosition() 124 | # self.agentPositions.append((1, (x1, y1))) 125 | self.agentPositions.sort() 126 | self.agentPositions = [(i == 0, pos) for i, pos in self.agentPositions] 127 | 128 | def processLayoutChar(self, x, y, layoutChar): 129 | if layoutChar == '%': 130 | self.walls[x][y] = True 131 | elif layoutChar == '.': 132 | self.food[x][y] = True 133 | elif layoutChar == 'o': 134 | self.capsules.append((x, y)) 135 | elif layoutChar == 'P': 136 | self.agentPositions.append((0, (x, y))) 137 | #(x1, y1) = self.getRandomLegalPosition() 138 | #self.agentPositions.append((0, (x1, y1))) 139 | elif layoutChar in ['G']: 140 | self.agentPositions.append((1, (x, y))) 141 | #(x1, y1) = self.getRandomLegalPosition() 142 | #self.agentPositions.append((1, (x1, y1))) 143 | self.numGhosts += 1 144 | elif layoutChar in ['1', '2', '3', '4']: 145 | self.agentPositions.append((int(layoutChar), (x, y))) 146 | self.numGhosts += 1 147 | 148 | 149 | def getLayout(name, back=2): 150 | # print('1:', os.getcwd()) 151 | # print(os.path.abspath(__file__)) 152 | if name.endswith('.lay'): 153 | layout = tryToLoad(os.getcwd() + '/game/pacman/layouts/' + name) 154 | print(os.getcwd() + '/game/pacman/layouts/' + name) 155 | if layout == None: 156 | layout = tryToLoad(name) 157 | else: 158 | layout = tryToLoad(os.getcwd() + '/game/pacman/' + 'layouts/' + name + '.lay') 159 | print(os.getcwd() + '/game/pacman/' + 'layouts/' + name + '.lay') 160 | if layout == None: 161 | layout = tryToLoad(name + '.lay') 162 | if layout == None and back >= 0: 163 | curdir = os.path.abspath('.') 164 | os.chdir('..') 165 | layout = getLayout(name, back - 1) 166 | os.chdir(curdir) 167 | return layout 168 | 169 | 170 | def tryToLoad(fullname): 171 | if(not os.path.exists(fullname)): 172 | return None 173 | f = open(fullname) 174 | try: 175 | return Layout([line.strip() for line in f]) 176 | finally: 177 | f.close() 178 | -------------------------------------------------------------------------------- /game/particle/multiagent/scenarios/simple_tag.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from game.particle.multiagent.core import World, Agent, Landmark 3 | from game.particle.multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self, args=None): 8 | world = World() 9 | # set any world properties first 10 | world.dim_c = 2 11 | num_good_agents 
= 1 12 | num_adversaries = 6 13 | if args is not None and args['num_good'] != 0: 14 | num_good_agents = args['num_good'] 15 | if args is not None and args['num_adversaries'] != 0: 16 | num_adversaries = args['num_adversaries'] 17 | world.cam_range = 1 18 | num_agents = num_adversaries + num_good_agents 19 | num_landmarks = 2 20 | # add agents 21 | world.agents = [Agent() for i in range(num_agents)] 22 | for i, agent in enumerate(world.agents): 23 | agent.name = 'agent %d' % i 24 | agent.collide = True 25 | agent.silent = True 26 | agent.adversary = True if i < num_adversaries else False 27 | agent.size = 0.075 if agent.adversary else 0.05 28 | agent.accel = 3.0 if agent.adversary else 4.0 29 | #agent.accel = 20.0 if agent.adversary else 25.0 30 | agent.max_speed = 1.0 if agent.adversary else 1.3 31 | # add landmarks 32 | world.landmarks = [Landmark() for i in range(num_landmarks)] 33 | for i, landmark in enumerate(world.landmarks): 34 | landmark.name = 'landmark %d' % i 35 | landmark.collide = True 36 | landmark.movable = False 37 | landmark.size = 0.2 38 | landmark.boundary = False 39 | # make initial conditions 40 | self.reset_world(world) 41 | return world 42 | 43 | def reset_world(self, world): 44 | # random properties for agents 45 | for i, agent in enumerate(world.agents): 46 | agent.color = np.array([0.35, 0.85, 0.35]) if not agent.adversary else np.array([0.85, 0.35, 0.35]) 47 | # random properties for landmarks 48 | for i, landmark in enumerate(world.landmarks): 49 | landmark.color = np.array([0.25, 0.25, 0.25]) 50 | # set random initial states 51 | for agent in world.agents: 52 | agent.state.p_pos = np.random.uniform(-world.cam_range, +world.cam_range, world.dim_p) 53 | agent.state.p_vel = np.zeros(world.dim_p) 54 | agent.state.c = np.zeros(world.dim_c) 55 | for i, landmark in enumerate(world.landmarks): 56 | if not landmark.boundary: 57 | landmark.state.p_pos = np.random.uniform(-world.cam_range, +world.cam_range, world.dim_p) 58 | landmark.state.p_vel = np.zeros(world.dim_p) 59 | 60 | def benchmark_data(self, agent, world): 61 | # returns data for benchmarking purposes 62 | if agent.adversary: 63 | collisions = 0 64 | for a in self.good_agents(world): 65 | if self.is_collision(a, agent): 66 | collisions += 1 67 | return collisions 68 | else: 69 | return 0 70 | 71 | def is_collision(self, agent1, agent2): 72 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 73 | dist = np.sqrt(np.sum(np.square(delta_pos))) 74 | dist_min = agent1.size + agent2.size 75 | return True if dist < dist_min else False 76 | 77 | # return all agents that are not adversaries 78 | def good_agents(self, world): 79 | return [agent for agent in world.agents if not agent.adversary] 80 | 81 | # return all adversarial agents 82 | def adversaries(self, world): 83 | return [agent for agent in world.agents if agent.adversary] 84 | 85 | def reward(self, agent, world): 86 | # Agents are rewarded based on minimum agent distance to each landmark 87 | main_reward = self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 88 | return main_reward 89 | 90 | def agent_reward(self, agent, world): 91 | # Agents are negatively rewarded if caught by adversaries 92 | rew = 0 93 | shape = False 94 | adversaries = self.adversaries(world) 95 | if shape: # reward can optionally be shaped (increased reward for increased distance from adversary) 96 | for adv in adversaries: 97 | rew += 0.1 * np.sqrt(np.sum(np.square(agent.state.p_pos - adv.state.p_pos))) 98 | if agent.collide: 99 | for a in 
adversaries: 100 | if self.is_collision(a, agent): 101 | rew -= 10 102 | 103 | # agents are penalized for exiting the screen, so that they can be caught by the adversaries 104 | def bound(x): 105 | if x < world.cam_range * 0.9: 106 | return 0 107 | if x < world.cam_range: 108 | return (x - 0.9 * world.cam_range) * 10 / world.cam_range 109 | return min(np.exp((2 * x - 2 * world.cam_range) / world.cam_range), 10) 110 | for p in range(world.dim_p): 111 | x = abs(agent.state.p_pos[p]) 112 | rew -= bound(x) 113 | return rew 114 | 115 | def adversary_reward(self, agent, world): 116 | # Adversaries are rewarded for collisions with agents 117 | rew = 0 118 | shape = False 119 | agents = self.good_agents(world) 120 | adversaries = self.adversaries(world) 121 | if shape: # reward can optionally be shaped (decreased reward for increased distance from agents) 122 | for adv in adversaries: 123 | rew -= 0.1 * min([np.sqrt(np.sum(np.square(a.state.p_pos - adv.state.p_pos))) for a in agents]) 124 | # if shape: # reward can optionally be shaped (decreased reward for increased distance from agents) 125 | # rew -= 0.1 * min([np.sqrt(np.sum(np.square(a.state.p_pos - agent.state.p_pos))) for a in agents]) 126 | 127 | if agent.collide: 128 | for ag in agents: 129 | agent_reward = 0 130 | for adv in adversaries: 131 | if self.is_collision(ag, adv): 132 | rew += 10 133 | # if self.is_collision(ag, agent): 134 | # rew += 10 135 | # 同一个agent被3个adversaries同时抓住才有分数,每一组最高分数30分 136 | # if agent_reward < 30: 137 | # rew += 0 138 | # else: 139 | # rew += 30 140 | return rew 141 | 142 | def observation(self, agent, world): 143 | # get positions of all entities in this agent's reference frame 144 | entity_pos = [] 145 | for entity in world.landmarks: 146 | if not entity.boundary: 147 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 148 | # communication of all other agents 149 | comm = [] 150 | other_pos = [] 151 | other_vel = [] 152 | for other in world.agents: 153 | if other is agent: continue 154 | comm.append(other.state.c) 155 | other_pos.append(other.state.p_pos - agent.state.p_pos) 156 | if not other.adversary: 157 | other_vel.append(other.state.p_vel) 158 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel) 159 | -------------------------------------------------------------------------------- /game/particle/multiagent/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | # physical/external base state of all entites 5 | class EntityState(object): 6 | def __init__(self): 7 | # physical position 8 | self.p_pos = None 9 | # physical velocity 10 | self.p_vel = None 11 | 12 | 13 | # state of agents (including communication and internal/mental state) 14 | class AgentState(EntityState): 15 | def __init__(self): 16 | super(AgentState, self).__init__() 17 | # communication utterance 18 | self.c = None 19 | 20 | 21 | # action of the agent 22 | class Action(object): 23 | def __init__(self): 24 | # physical action 25 | self.u = None 26 | # communication action 27 | self.c = None 28 | 29 | 30 | # properties and state of physical world entity 31 | class Entity(object): 32 | def __init__(self): 33 | # name 34 | self.name = '' 35 | # properties: 36 | self.size = 0.050 37 | # entity can move / be pushed 38 | self.movable = False 39 | # entity collides with others 40 | self.collide = True 41 | # material density (affects mass) 42 | self.density = 25.0 43 | # color 44 | self.color = None 45 | # max 
speed and accel 46 | self.max_speed = None 47 | self.accel = None 48 | # state 49 | self.state = EntityState() 50 | # mass 51 | self.initial_mass = 1.0 52 | 53 | @property 54 | def mass(self): 55 | return self.initial_mass 56 | 57 | 58 | # properties of landmark entities 59 | class Landmark(Entity): 60 | def __init__(self): 61 | super(Landmark, self).__init__() 62 | 63 | 64 | # properties of agent entities 65 | class Agent(Entity): 66 | def __init__(self): 67 | super(Agent, self).__init__() 68 | # agents are movable by default 69 | self.movable = True 70 | # cannot send communication signals 71 | self.silent = False 72 | # cannot observe the world 73 | self.blind = False 74 | # physical motor noise amount 75 | self.u_noise = None 76 | # communication noise amount 77 | self.c_noise = None 78 | # control range 79 | self.u_range = 1.0 80 | # state 81 | self.state = AgentState() 82 | # action 83 | self.action = Action() 84 | # script behavior to execute 85 | self.action_callback = None 86 | 87 | 88 | # multi-agent world 89 | class World(object): 90 | def __init__(self): 91 | # list of agents and entities (can change at execution-time!) 92 | self.agents = [] 93 | self.landmarks = [] 94 | # communication channel dimensionality 95 | self.dim_c = 0 96 | # position dimensionality 97 | self.dim_p = 2 98 | # color dimensionality 99 | self.dim_color = 3 100 | # simulation timestep 101 | self.dt = 0.1 102 | # physical damping 103 | self.damping = 0.25 104 | # contact response parameters 105 | self.contact_force = 1e+2 106 | self.contact_margin = 1e-3 107 | 108 | # return all entities in the world 109 | @property 110 | def entities(self): 111 | return self.agents + self.landmarks 112 | 113 | # return all agents controllable by external policies 114 | @property 115 | def policy_agents(self): 116 | return [agent for agent in self.agents if agent.action_callback is None] 117 | 118 | # return all agents controlled by world scripts 119 | @property 120 | def scripted_agents(self): 121 | return [agent for agent in self.agents if agent.action_callback is not None] 122 | 123 | # update state of the world 124 | def step(self, done=None): 125 | # set actions for scripted agents 126 | for agent in self.scripted_agents: 127 | agent.action = agent.action_callback(agent, self) 128 | # gather forces applied to entities 129 | p_force = [None] * len(self.entities) 130 | # apply agent physical controls 131 | p_force = self.apply_action_force(p_force, done) 132 | # apply environment forces 133 | p_force = self.apply_environment_force(p_force) 134 | # integrate physical state 135 | self.integrate_state(p_force, done) 136 | # update agent state 137 | for agent in self.agents: 138 | self.update_agent_state(agent) 139 | 140 | # gather agent action forces 141 | def apply_action_force(self, p_force, done=None): 142 | # set applied forces 143 | for i, agent in enumerate(self.agents): 144 | if agent.movable: 145 | noise = np.random.randn(*agent.action.u.shape) * agent.u_noise if agent.u_noise else 0.0 146 | p_force[i] = agent.action.u + noise 147 | return p_force 148 | 149 | # gather physical forces acting on entities 150 | def apply_environment_force(self, p_force): 151 | # simple (but inefficient) collision response 152 | for a, entity_a in enumerate(self.entities): 153 | for b,entity_b in enumerate(self.entities): 154 | if b <= a: continue 155 | [f_a, f_b] = self.get_collision_force(entity_a, entity_b) 156 | if f_a is not None: 157 | if p_force[a] is None: p_force[a] = 0.0 158 | p_force[a] = f_a + p_force[a] 159 | if f_b is not 
None: 160 | if p_force[b] is None: p_force[b] = 0.0 161 | p_force[b] = f_b + p_force[b] 162 | return p_force 163 | 164 | # integrate physical state 165 | def integrate_state(self, p_force, done=None): 166 | for i,entity in enumerate(self.entities): 167 | if not entity.movable: continue 168 | if entity.movable and done is not None and done[i]: continue 169 | entity.state.p_vel = entity.state.p_vel * (1 - self.damping) 170 | if p_force[i] is not None: 171 | entity.state.p_vel += (p_force[i] / entity.mass) * self.dt 172 | if entity.max_speed is not None: 173 | speed = np.sqrt(np.square(entity.state.p_vel[0]) + np.square(entity.state.p_vel[1])) 174 | if speed > entity.max_speed: 175 | entity.state.p_vel = entity.state.p_vel / np.sqrt(np.square(entity.state.p_vel[0]) + 176 | np.square(entity.state.p_vel[1])) * entity.max_speed 177 | entity.state.p_pos += entity.state.p_vel * self.dt 178 | 179 | def update_agent_state(self, agent): 180 | # set communication state (directly for now) 181 | if agent.silent: 182 | agent.state.c = np.zeros(self.dim_c) 183 | else: 184 | noise = np.random.randn(*agent.action.c.shape) * agent.c_noise if agent.c_noise else 0.0 185 | agent.state.c = agent.action.c + noise 186 | 187 | # get collision forces for any contact between two entities 188 | def get_collision_force(self, entity_a, entity_b): 189 | if (not entity_a.collide) or (not entity_b.collide): 190 | return [None, None] # not a collider 191 | if entity_a is entity_b: 192 | return [None, None] # don't collide against itself 193 | # compute actual distance between entities 194 | delta_pos = entity_a.state.p_pos - entity_b.state.p_pos 195 | dist = np.sqrt(np.sum(np.square(delta_pos))) 196 | # minimum allowable distance 197 | dist_min = entity_a.size + entity_b.size 198 | # softmax penetration 199 | k = self.contact_margin 200 | penetration = np.logaddexp(0, -(dist - dist_min)/k)*k 201 | if dist == 0: 202 | force = 0 203 | else: 204 | force = self.contact_force * delta_pos / dist * penetration 205 | force_a = +force if entity_a.movable else None 206 | force_b = -force if entity_b.movable else None 207 | return [force_a, force_b] 208 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import yaml 4 | import gym.spaces 5 | import sys 6 | import tensorflow as tf 7 | from gym.utils import seeding 8 | import random 9 | 10 | from alg import REGISTRY as alg_REGISTRY 11 | from game import REGISTRY as env_REGISTRY 12 | from run import REGISTRY as run_REGISTRY 13 | from util.logger import Logger 14 | import json 15 | import time 16 | 17 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 18 | 19 | 20 | def default(str): 21 | return str + ' [Default: %default]' 22 | 23 | 24 | def config_args(config_name): 25 | if config_name is not None: 26 | with open(os.path.join(os.path.dirname(__file__), "config", "{}.yaml".format(config_name)), "r") as f: 27 | try: 28 | #config_dict = yaml.load(f, Loader=yaml.FullLoader) 29 | config_dict = yaml.load(f) 30 | return config_dict 31 | except yaml.YAMLError as exc: 32 | assert False, "{}.yaml error: {}".format(config_name, exc) 33 | 34 | 35 | def readCommand(argv): 36 | """ 37 | Processes the command used to run main from the command line. 
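    Any extra `key=value` tokens on the command line override entries that were
    loaded from the algorithm / environment YAML configs; each value is cast to
    the type of the existing config entry (int, float, str, bool or list).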
38 | """ 39 | from optparse import OptionParser 40 | usageStr = """ 41 | USAGE: python main.py 42 | """ 43 | parser = OptionParser(usageStr) 44 | 45 | parser.add_option('-n', '--numGames', dest='numGames', type='int', 46 | help=default('the number of GAMES to play'), metavar='GAMES', default=20000) 47 | parser.add_option('-e', '--epi_step', dest='epi_step', type='int', 48 | help=default('the steps of each episode'), default=99) 49 | parser.add_option('-g', '--game', dest='game', 50 | help=default('use which GAME to play'), default='pacman') 51 | parser.add_option('-a', '--alg', dest='algorithm', 52 | help=default('use which algorithm to play'), default='multi_ppo') 53 | parser.add_option('-c', '--alg_conf', dest='algorithm_config', 54 | help=default('algorithm config'), default='ppo_conf.yaml') 55 | parser.add_option('-d', '--env_conf', dest='environment_config', 56 | help=default('Environment config'), default='pacman_conf') 57 | parser.add_option('-s', '--seed', dest='seed', type='int', 58 | help=default('the seed of tf'), default=1234) 59 | parser.add_option('-o', '--optimizer', dest='optimizer', 60 | help=default('the optimizer of tensorflow'), default='adam') 61 | parser.add_option('-t', '--run_test', dest='run_test', 62 | help=default('run test'), default=False) 63 | 64 | """ 65 | parser.add_option('-f', '--fileName', dest='fileName', 66 | help=default('the file name'), default='dqn_pinball') 67 | parser.add_option('-m', '--modelName', dest='modelName', 68 | help=default('the model name'), default='dqn_pinball') 69 | """ 70 | 71 | options, otherjunk = parser.parse_args(argv) 72 | # print(type(options)) 73 | 74 | alg_conf = options.algorithm_config 75 | env_conf = options.environment_config 76 | alg_config_dict = config_args(alg_conf) 77 | env_config_dict = config_args(env_conf) 78 | 79 | args = dict() 80 | args['numGames'] = options.numGames 81 | args['game'] = options.game 82 | args['algorithm'] = options.algorithm 83 | args['epi_step'] = options.epi_step 84 | args['seed'] = options.seed 85 | args['optimizer'] = options.optimizer 86 | args['run_test'] = options.run_test 87 | t = str(time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())) 88 | 89 | if alg_config_dict is not None: 90 | args = dict(args, **alg_config_dict) 91 | if env_config_dict is not None: 92 | args = dict(args, **env_config_dict) 93 | 94 | #args['fileName'] = options.fileName 95 | #args['optimizer'] = options.optimizer 96 | 97 | for item in otherjunk: 98 | key = item.split('=')[0] 99 | value = item.split('=')[1] 100 | #print(key, value) 101 | if key not in args: 102 | raise Exception('Command line input not understood: ' + str(item)) 103 | if type(args[key]) is int: 104 | args[key] = int(value) 105 | elif type(args[key]) is float: 106 | args[key] = float(value) 107 | elif type(args[key]) is str: 108 | args[key] = str(value) 109 | elif type(args[key]) is bool: 110 | if str(value).lower() == 'true': 111 | args[key] = True 112 | elif str(value).lower() == 'false': 113 | args[key] = False 114 | else: 115 | raise Exception('Command line input is not boolean type: ' + str(value)) 116 | elif type(args[key]) is list: 117 | try: 118 | args[key] = eval(value) 119 | except (SyntaxError, NameError): 120 | value_l = str(value).replace(' ', '').replace('[', '').replace(']', '').split(',') 121 | args[key] = value_l 122 | else: 123 | raise Exception('Command line input is not valid type: ' + str(value)) 124 | 125 | args['results_path'] = "../results/" + args['algorithm'] + "/" + args['game'] + "/" + args[ 126 | 'game_name'] + "/" + 
t + "/" 127 | 128 | if not args['run_test']: 129 | if not os.path.exists(args['results_path']): 130 | os.makedirs(args['results_path']) 131 | if not os.path.exists(args['results_path'] + args['SAVE_PATH']): 132 | os.makedirs(args['results_path'] + args['SAVE_PATH']) 133 | if not os.path.exists(args['results_path'] + args['graph_path']): 134 | os.makedirs(args['results_path'] + args['graph_path']) 135 | if not os.path.exists(args['results_path'] + args['reward_output']): 136 | os.makedirs(args['results_path'] + args['reward_output']) 137 | if not os.path.exists(args['results_path'] + args['log']): 138 | os.makedirs(args['results_path'] + args['log']) 139 | 140 | with open( 141 | args['results_path'] + "command.txt", 142 | 'w') as f: 143 | out = ' '.join(argv) 144 | f.writelines(out) 145 | 146 | with open(args['results_path'] + "args.json", "w") as f: 147 | json.dump(args, f) 148 | 149 | # print('args', args) 150 | 151 | return args 152 | 153 | 154 | def get_space(env): 155 | if type(env.action_space) is gym.spaces.discrete.Discrete: 156 | action_dim = env.action_space.n 157 | elif type(env.action_space) is gym.spaces.box.Box: 158 | action_dim = env.action_space.shape[0] 159 | elif type(env.action_space) is int: 160 | action_dim = env.action_space 161 | elif type(env.action_space) is list: 162 | if type(env.action_space[0]) is gym.spaces.box.Box: 163 | action_dim = env.action_space[0].shape[0] 164 | else: 165 | action_dim = env.action_space[0].n 166 | else: 167 | raise Exception('action space is not a valid ' 168 | '.type') 169 | if type(env.observation_space) is gym.spaces.discrete.Discrete: 170 | features = env.observation_space.n 171 | elif type(env.observation_space) is gym.spaces.box.Box: 172 | features = env.observation_space.shape[0] 173 | elif type(env.observation_space) is int: 174 | features = env.observation_space 175 | elif type(env.observation_space) is list: 176 | features = env.observation_space[0].shape[0] 177 | else: 178 | raise Exception('observation space is not a valid type') 179 | return action_dim, features 180 | 181 | def NoneAlg(alg): 182 | algs = ['maddpg', 'multi_ppo', 'multi_ppo_sro', 'maddpg_sr' , 'shppo', 'shppo_sro'] 183 | if alg in algs: 184 | return True 185 | return False 186 | 187 | 188 | def runGames(args): 189 | print(args) 190 | if args['run_test']: 191 | logger = None 192 | else: 193 | logger = Logger(args['results_path'] + args['log'], args['results_path'] + args['graph_path'], args) 194 | np.random.seed(args['seed']) 195 | tf.set_random_seed(args['seed']) 196 | random.seed(args['seed']) 197 | seeding.np_random(args['seed']) 198 | env = env_REGISTRY[args['game']](args) 199 | args['action_dim'], args['features'] = get_space(env) 200 | if NoneAlg(args['algorithm']): 201 | alg = None 202 | else: 203 | alg = alg_REGISTRY[args['algorithm']](args['action_dim'], args['features'], args, logger) 204 | if args['run_test'] and args['game'] != 'particle': 205 | run_REGISTRY['test'](args, env, alg, logger) 206 | elif args['run_test'] and args['game'] == 'particle': 207 | run_REGISTRY['particle'](args, env, alg, logger) 208 | else: 209 | run_REGISTRY[args['algorithm']](args, env, alg, logger) 210 | 211 | 212 | if __name__ == '__main__': 213 | args = readCommand(sys.argv[1:]) # Get game components based on input 214 | runGames(args) 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | -------------------------------------------------------------------------------- /alg/maddpg/train.py: -------------------------------------------------------------------------------- 1 | 
import argparse 2 | import numpy as np 3 | import tensorflow as tf 4 | import time 5 | import pickle 6 | 7 | from alg.maddpg.common import tf_util as U 8 | from alg.maddpg.trainer.maddpg import MADDPGAgentTrainer 9 | import tensorflow.contrib.layers as layers 10 | 11 | 12 | def parse_args(): 13 | parser = argparse.ArgumentParser("Reinforcement Learning experiments for multiagent environments") 14 | # Environment 15 | parser.add_argument("--scenario_name", type=str, default="simple_spread", help="name of the scenario script") 16 | parser.add_argument("--max-episode-len", type=int, default=25, help="maximum episode length") 17 | parser.add_argument("--num-episodes", type=int, default=60000, help="number of episodes") 18 | parser.add_argument("--num-adversaries", type=int, default=0, help="number of adversaries") 19 | parser.add_argument("--good-policy", type=str, default="maddpg", help="policy for good agents") 20 | parser.add_argument("--adv-policy", type=str, default="maddpg", help="policy of adversaries") 21 | # Core training parameters 22 | parser.add_argument("--lr", type=float, default=1e-2, help="learning rate for Adam optimizer") 23 | parser.add_argument("--learning_rate_c", type=float, default=1e-2, help="learning rate for Adam optimizer") 24 | parser.add_argument("--learning_rate_a", type=float, default=1e-2, help="learning rate for Adam optimizer") 25 | parser.add_argument("--gamma", type=float, default=0.95, help="discount factor") 26 | parser.add_argument("--batch-size", type=int, default=1024, help="number of episodes to optimize at the same time") 27 | parser.add_argument("--num-units", type=int, default=64, help="number of units in the mlp") 28 | # Checkpointing 29 | parser.add_argument("--exp-name", type=str, default=None, help="name of the experiment") 30 | parser.add_argument("--save-dir", type=str, default="/tmp/policy/", help="directory in which training state and model should be saved") 31 | parser.add_argument("--save-rate", type=int, default=1000, help="save model once every time this many episodes are completed") 32 | parser.add_argument("--load-dir", type=str, default="", help="directory in which training state and model are loaded") 33 | # Evaluation 34 | parser.add_argument("--restore", action="store_true", default=False) 35 | parser.add_argument("--display", action="store_true", default=False) 36 | parser.add_argument("--benchmark", action="store_true", default=False) 37 | parser.add_argument("--benchmark-iters", type=int, default=100000, help="number of iterations run for benchmarking") 38 | parser.add_argument("--benchmark-dir", type=str, default="./benchmark_files/", help="directory where benchmark data is saved") 39 | parser.add_argument("--plots-dir", type=str, default="./learning_curves/", help="directory where plot data is saved") 40 | return parser.parse_args() 41 | 42 | 43 | def mlp_model(input, num_outputs, scope, reuse=False, num_units=64, rnn_cell=None): 44 | # This model takes as input an observation and returns values of all actions 45 | with tf.variable_scope(scope, reuse=reuse): 46 | out = input 47 | out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu) 48 | out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu) 49 | out = layers.fully_connected(out, num_outputs=num_outputs, activation_fn=None) 50 | return out 51 | 52 | 53 | def get_trainers(env, num_adversaries, obs_shape_n, arglist): 54 | trainers = [] 55 | model = mlp_model 56 | trainer = MADDPGAgentTrainer 57 | for i in 
range(num_adversaries):
58 | trainers.append(trainer(
59 | "agent_%d" % i, model, obs_shape_n, env.action_space, i, arglist,
60 | local_q_func=(arglist.adv_policy=='ddpg')))
61 | for i in range(num_adversaries, env.n):
62 | trainers.append(trainer(
63 | "agent_%d" % i, model, obs_shape_n, env.action_space, i, arglist,
64 | local_q_func=(arglist.good_policy=='ddpg')))
65 | return trainers
66 |
67 |
68 | def make_env(scenario_name, arglist, benchmark=False):
69 | from game.particle.multiagent.environment import MultiAgentEnv
70 | import game.particle.multiagent.scenarios as scenarios
71 |
72 | # load scenario from script
73 | scenario = scenarios.load(scenario_name + ".py").Scenario()
74 | # create world
75 | world = scenario.make_world()
76 | # create multiagent environment
77 | if benchmark:
78 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, scenario.benchmark_data)
79 | else:
80 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)
81 | return env
82 |
83 |
84 | def train(arglist):
85 | with U.single_threaded_session():
86 | # Create environment
87 | env = make_env(arglist.scenario_name, arglist, arglist.benchmark)
88 | # Create agent trainers
89 | obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
90 | num_adversaries = min(env.n, arglist.num_adversaries)
91 | trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
92 | print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))
93 |
94 | # Initialize
95 | U.initialize()
96 |
97 | # Load previous results, if necessary
98 | if arglist.load_dir == "":
99 | arglist.load_dir = arglist.save_dir
100 | if arglist.display or arglist.restore or arglist.benchmark:
101 | print('Loading previous state...')
102 | U.load_state(arglist.load_dir)
103 |
104 | episode_rewards = [0.0] # sum of rewards for all agents
105 | agent_rewards = [[0.0] for _ in range(env.n)] # individual agent reward
106 | final_ep_rewards = [] # sum of rewards for training curve
107 | final_ep_ag_rewards = [] # agent rewards for training curve
108 | agent_info = [[[]]] # placeholder for benchmarking info
109 | saver = tf.train.Saver()
110 | obs_n = env.reset()
111 | episode_step = 0
112 | train_step = 0
113 | t_start = time.time()
114 |
115 | print('Starting iterations...')
116 | while True:
117 | # get action
118 | action_n = [agent.action(obs) for agent, obs in zip(trainers,obs_n)]
119 | # environment step
120 | new_obs_n, rew_n, done_n, info_n = env.step(action_n)
121 | # print(new_obs_n) # debug print moved below env.step(); new_obs_n was not defined before this point
122 | episode_step += 1
123 | done = all(done_n)
124 | terminal = (episode_step >= arglist.max_episode_len)
125 | # collect experience
126 | for i, agent in enumerate(trainers):
127 | agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
128 | obs_n = new_obs_n
129 |
130 | for i, rew in enumerate(rew_n):
131 | episode_rewards[-1] += rew
132 | agent_rewards[i][-1] += rew
133 |
134 | if done or terminal:
135 | obs_n = env.reset()
136 | episode_step = 0
137 | episode_rewards.append(0)
138 | for a in agent_rewards:
139 | a.append(0)
140 | agent_info.append([[]])
141 |
142 | # increment global step counter
143 | train_step += 1
144 |
145 | # for benchmarking learned policies
146 | if arglist.benchmark:
147 | for i, info in enumerate(info_n):
148 | agent_info[-1][i].append(info_n['n'])
149 | if train_step > arglist.benchmark_iters and (done or terminal):
150 | file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
151 |
print('Finished benchmarking, now saving...') 152 | with open(file_name, 'wb') as fp: 153 | pickle.dump(agent_info[:-1], fp) 154 | break 155 | continue 156 | 157 | # for displaying learned policies 158 | if arglist.display: 159 | time.sleep(0.1) 160 | env.render() 161 | continue 162 | 163 | # update all trainers, if not in display or benchmark mode 164 | loss = None 165 | for agent in trainers: 166 | agent.preupdate() 167 | for agent in trainers: 168 | loss = agent.update(trainers, train_step) 169 | 170 | # save model, display training output 171 | if terminal and (len(episode_rewards) % arglist.save_rate == 0): 172 | U.save_state(arglist.save_dir, saver=saver) 173 | # print statement depends on whether or not there are adversaries 174 | if num_adversaries == 0: 175 | print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format( 176 | train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time()-t_start, 3))) 177 | else: 178 | print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format( 179 | train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), 180 | [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time()-t_start, 3))) 181 | t_start = time.time() 182 | # Keep track of final episode reward 183 | final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:])) 184 | for rew in agent_rewards: 185 | final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:])) 186 | 187 | # saves final episode reward for plotting training curve later 188 | if len(episode_rewards) > arglist.num_episodes: 189 | rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl' 190 | with open(rew_file_name, 'wb') as fp: 191 | pickle.dump(final_ep_rewards, fp) 192 | agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl' 193 | with open(agrew_file_name, 'wb') as fp: 194 | pickle.dump(final_ep_ag_rewards, fp) 195 | print('...Finished total of {} episodes.'.format(len(episode_rewards))) 196 | break 197 | 198 | 199 | # if __name__ == '__main__': 200 | # arglist = parse_args() 201 | # train(arglist) 202 | -------------------------------------------------------------------------------- /alg/sharing_multi_ppo/ppo.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import tensorflow as tf 4 | from alg.optimizer import Optimizer 5 | 6 | 7 | class PPO: 8 | def __init__(self, n_actions, n_features, n_agents, args, SESS, logger): 9 | self.n_actions = n_actions 10 | self.n_features = n_features + n_agents 11 | self.n_agents = n_agents 12 | self.args = args 13 | self.logger = logger 14 | self.learning_step = 0 15 | self.obs = tf.placeholder(tf.float32, [None, self.n_features], 's') 16 | 17 | self.act_probs, self.policy_param = self.build_actor_net(self.args['policy']) 18 | self.o_act_probs, self.o_policy_param = self.build_actor_net(self.args['old_policy'], trainable=False) 19 | self.v_preds, self.v_param = self.build_critic_net('critic') 20 | 21 | if self.args['continuous_action']: 22 | self.sample_action = tf.squeeze(self.act_probs.sample(1), axis=0) 23 | else: 24 | self.act_stochastic = tf.multinomial(tf.log(self.act_probs), num_samples=1) 25 | self.act_stochastic = tf.reshape(self.act_stochastic, shape=[-1]) 26 | self.act_deterministic = tf.argmax(self.act_probs, axis=1) 27 | 28 | self.replace_op = [tf.assign(t, e) for t, e in zip(self.o_policy_param, self.policy_param)] 29 | 30 | 
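# Two separate optimizers are built below through the Optimizer factory in alg/optimizer.py:
# learning_rate_a drives the actor (policy) update and learning_rate_c the critic update.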
opt = Optimizer(args['optimizer'], args['learning_rate_a']) 31 | self.optimizer = opt.get_optimizer() 32 | opt_c = Optimizer(args['optimizer'], args['learning_rate_c']) 33 | self.optimizer_c = opt_c.get_optimizer() 34 | 35 | with tf.variable_scope('train_inp'): 36 | if self.args['continuous_action']: 37 | self.actions = tf.placeholder(tf.float32, [None, n_actions], 'action') 38 | else: 39 | self.actions = tf.placeholder(dtype=tf.float32, shape=[None, n_actions], name='actions') 40 | self.rewards = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='rewards') 41 | self.v_preds_next = tf.placeholder(dtype=tf.float32, shape=[None], name='v_preds_next') 42 | self.gaes = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='gaes') 43 | 44 | self.build_loss() 45 | 46 | self.sess = SESS 47 | #self.sess.run(tf.global_variables_initializer()) 48 | 49 | def build_actor_net(self, scope, trainable=True): 50 | with tf.variable_scope(scope): 51 | layer_1 = tf.layers.dense(inputs=self.obs, units=self.args['n_layer_a_1'], activation=tf.nn.relu, trainable=trainable) 52 | layer_2 = tf.layers.dense(inputs=layer_1, units=self.args['n_layer_a_2'], activation=tf.nn.relu, 53 | trainable=trainable) 54 | if self.args['continuous_action']: 55 | mu = self.args['action_clip'] * tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.tanh, trainable=trainable) 56 | sigma = tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.softplus, trainable=trainable) 57 | act_probs = tf.distributions.Normal(loc=mu, scale=sigma + 1e-9) 58 | else: 59 | act_probs = tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.softmax) 60 | params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope) 61 | return act_probs, params 62 | 63 | def build_critic_net(self, scope): 64 | with tf.variable_scope(scope): 65 | layer_1 = tf.layers.dense(inputs=self.obs, units=self.args['n_layer_c_1'], activation=tf.nn.relu) 66 | layer_2 = tf.layers.dense(inputs=layer_1, units=self.args['n_layer_c_2'], activation=tf.nn.relu) 67 | v_preds = tf.layers.dense(inputs=layer_2, units=1, activation=None) 68 | params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope) 69 | return v_preds, params 70 | 71 | def build_loss(self): 72 | with tf.variable_scope('update_critic'): 73 | self.advantage = self.rewards - self.v_preds 74 | self.c_loss = tf.reduce_mean(tf.square(self.advantage)) 75 | ''' 76 | gradients = self.optimizer_c.compute_gradients(self.c_loss, var_list=self.v_param) 77 | for i, (grad, var) in enumerate(gradients): 78 | if grad is not None: 79 | gradients[i] = (tf.clip_by_norm(grad, self.args['grad_clip']), var) 80 | self.train_c_op = self.optimizer_c.apply_gradients(gradients) 81 | ''' 82 | self.train_c_op = self.optimizer_c.minimize(self.c_loss) 83 | 84 | with tf.variable_scope('update_actor'): 85 | with tf.variable_scope('loss/clip'): 86 | # ratios = tf.divide(act_probs, act_probs_old) 87 | if self.args['continuous_action']: 88 | act_probs = self.act_probs.prob(self.actions) 89 | act_probs_old = self.o_act_probs.prob(self.actions) 90 | entropy = self.act_probs.entropy() 91 | ratios = act_probs / act_probs_old 92 | #ratios = self.act_probs.prob(self.actions) / self.o_act_probs.prob(self.actions) 93 | else: 94 | act_probs = self.act_probs * self.actions# * tf.one_hot(indices=self.actions, depth=self.act_probs.shape[1]) 95 | act_probs = tf.reduce_sum(act_probs, axis=1) 96 | # probabilities of actions which agent took with old policy 97 | act_probs_old = self.o_act_probs * 
self.actions#* tf.one_hot(indices=self.actions, depth=self.o_act_probs.shape[1]) 98 | act_probs_old = tf.reduce_sum(act_probs_old, axis=1) 99 | entropy = -tf.reduce_sum(self.act_probs * 100 | tf.log(tf.clip_by_value(self.act_probs, 1e-9, 1.0)), axis=1) 101 | ratios = tf.exp(tf.log(act_probs) - tf.log(act_probs_old)) 102 | self.entropy = self.args['c2'] * tf.reduce_mean(entropy) # mean of entropy of pi(obs) 103 | clipped_ratios = tf.clip_by_value(ratios, clip_value_min=1 - self.args['clip_value'], clip_value_max=1 + self.args['clip_value']) 104 | loss_clip = tf.minimum(tf.multiply(self.gaes, ratios), tf.multiply(self.gaes, clipped_ratios)) 105 | self.loss_clip = tf.reduce_mean(loss_clip) 106 | self.a_loss = -(self.loss_clip + self.entropy) 107 | ''' 108 | gradients_t = self.optimizer.compute_gradients(self.a_loss, var_list=self.policy_param) 109 | for i, (grad, var) in enumerate(gradients_t): 110 | if grad is not None: 111 | gradients_t[i] = (tf.clip_by_norm(grad, self.args['grad_clip']), var) 112 | self.train_a_op = self.optimizer.apply_gradients(gradients_t) 113 | ''' 114 | self.train_a_op = self.optimizer.minimize(self.a_loss) 115 | 116 | def choose_action(self, obs, agent_id=0): 117 | obs = obs[np.newaxis, :] 118 | obs = self.get_agent_obs(obs, agent_id) 119 | if self.args['continuous_action']: 120 | actions, v_preds = self.sess.run([self.sample_action, self.v_preds], {self.obs: obs}) 121 | #print('clip', actions[0], self.args['action_clip'], np.clip(actions[0], -self.args['action_clip'], self.args['action_clip'])) 122 | return np.clip(actions[0], -self.args['action_clip'], self.args['action_clip']) 123 | else: 124 | if self.args['stochastic']: 125 | actions, v_preds, p = self.sess.run([self.act_stochastic, self.v_preds, self.act_probs], feed_dict={self.obs: obs}) 126 | action = actions[0] 127 | action_one_hot = np.zeros(self.n_actions) 128 | action_one_hot[action] = 1 129 | #print(p) 130 | return action_one_hot 131 | else: 132 | actions, v_preds = self.sess.run([self.act_deterministic, self.v_preds], feed_dict={self.obs: obs}) 133 | action = actions[0] 134 | action_one_hot = np.zeros(self.n_actions) 135 | action_one_hot[action] = 1 136 | return action_one_hot 137 | 138 | def choose_hold_action(self, obs): 139 | return np.zeros(self.n_actions) 140 | 141 | def choose_deterministic_action(self, obs, agent_id=0): 142 | obs = self.get_agent_obs(obs, agent_id) 143 | actions = self.sess.run([self.act_deterministic], feed_dict={self.obs: obs})[0] 144 | action_one_hots = [] 145 | for i in range(len(actions)): 146 | action = actions[i] 147 | action_one_hot = np.zeros(self.n_actions) 148 | action_one_hot[action] = 1 149 | action_one_hots.append(action_one_hot) 150 | return action_one_hots 151 | 152 | def get_agent_obs(self, obs, agent_id=0): 153 | if type(agent_id) is int: 154 | agent_id_arr = [agent_id] * len(obs) 155 | elif type(agent_id) is list: 156 | agent_id_arr = agent_id 157 | else: 158 | raise Exception('the agent_id field must be type of int or list') 159 | agent_one_hot = np.eye(self.n_agents)[agent_id_arr] 160 | obs = np.hstack((agent_one_hot, obs)) 161 | return obs 162 | 163 | def get_v(self, s, agent_id=0): 164 | obs = np.array(s) 165 | obs = obs[np.newaxis, :] 166 | obs = self.get_agent_obs(obs, agent_id) 167 | v_preds = self.sess.run(self.v_preds, {self.obs: obs}) 168 | return v_preds[0, 0] 169 | 170 | def update(self, actor, s, a, r, options, terms, epi, agent_id=0): 171 | self.sess.run(self.replace_op) 172 | s = self.get_agent_obs(s, agent_id) 173 | adv = 
self.sess.run(self.advantage, {self.obs: s, self.rewards: r}) 174 | for i in range(self.args['epi_train_times']): 175 | _, a_loss, clip, entropy = self.sess.run([self.train_a_op, self.a_loss, self.loss_clip, self.entropy], {self.obs: s, self.actions: a, self.gaes: adv}) 176 | __, c_loss = self.sess.run([self.train_c_op, self.c_loss], {self.obs: s, self.rewards: r}) 177 | self.logger.write_tb_log('a_loss', a_loss, self.learning_step) 178 | self.logger.write_tb_log('c_loss', c_loss, self.learning_step) 179 | self.logger.write_tb_log('clip', clip, self.learning_step) 180 | self.logger.write_tb_log('entropy', entropy, self.learning_step) 181 | self.learning_step += 1 182 | #print(a_loss, clip, entropy, c_loss) 183 | 184 | def load_model(self, path): 185 | saver = tf.train.Saver(self.policy_param) 186 | print(path + '.ckpt') 187 | saver.restore(self.sess, path + ".ckpt") 188 | 189 | def save_model(self, path): 190 | saver = tf.train.Saver(self.policy_param) 191 | saver.save(self.sess, path + ".ckpt") 192 | -------------------------------------------------------------------------------- /alg/muti_ptf_ppo/ppo.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import tensorflow as tf 4 | from alg.optimizer import Optimizer 5 | 6 | 7 | class PPO: 8 | def __init__(self, n_actions, n_features, args, SESS, logger, i): 9 | self.n_actions = n_actions 10 | self.n_features = n_features 11 | self.index = i 12 | self.args = args 13 | self.logger = logger 14 | self.learning_step = 0 15 | self.obs = tf.placeholder(tf.float32, [None, n_features], 's') 16 | 17 | self.act_probs, self.policy_param = self.build_actor_net(self.args['policy'] + "_" + str(self.index)) 18 | self.o_act_probs, self.o_policy_param = self.build_actor_net(self.args['old_policy'] + "_" + str(self.index), trainable=False) 19 | self.v_preds, self.v_param = self.build_critic_net('critic' + "_" + str(self.index)) 20 | 21 | if self.args['continuous_action']: 22 | self.sample_action = tf.squeeze(self.act_probs.sample(1), axis=0) 23 | else: 24 | self.act_stochastic = tf.multinomial(tf.log(self.act_probs), num_samples=1) 25 | self.act_stochastic = tf.reshape(self.act_stochastic, shape=[-1]) 26 | self.act_deterministic = tf.argmax(self.act_probs, axis=1) 27 | 28 | self.replace_op = [tf.assign(t, e) for t, e in zip(self.o_policy_param, self.policy_param)] 29 | 30 | opt = Optimizer(args['optimizer'], args['learning_rate_a']) 31 | self.optimizer = opt.get_optimizer() 32 | opt_c = Optimizer(args['optimizer'], args['learning_rate_c']) 33 | self.optimizer_c = opt_c.get_optimizer() 34 | 35 | with tf.variable_scope('train_inp' + "_" + str(self.index)): 36 | if self.args['continuous_action']: 37 | self.actions = tf.placeholder(tf.float32, [None, n_actions], 'action') 38 | else: 39 | self.actions = tf.placeholder(dtype=tf.float32, shape=[None, n_actions], name='actions') 40 | self.rewards = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='rewards') 41 | self.v_preds_next = tf.placeholder(dtype=tf.float32, shape=[None], name='v_preds_next') 42 | self.gaes = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='gaes') 43 | 44 | self.build_loss() 45 | 46 | self.sess = SESS 47 | #self.sess.run(tf.global_variables_initializer()) 48 | 49 | def build_actor_net(self, scope, trainable=True): 50 | with tf.variable_scope(scope): 51 | layer_1 = tf.layers.dense(inputs=self.obs, units=self.args['n_layer_a_1'], activation=tf.nn.relu, trainable=trainable) 52 | layer_2 = 
tf.layers.dense(inputs=layer_1, units=self.args['n_layer_a_2'], activation=tf.nn.relu, 53 | trainable=trainable) 54 | if self.args['continuous_action']: 55 | mu = self.args['action_clip'] * tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.tanh, trainable=trainable) 56 | sigma = tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.softplus, trainable=trainable) 57 | act_probs = tf.distributions.Normal(loc=mu, scale=sigma + 1e-9) 58 | else: 59 | act_probs = tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.softmax) 60 | params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope) 61 | return act_probs, params 62 | 63 | def build_critic_net(self, scope): 64 | with tf.variable_scope(scope): 65 | layer_1 = tf.layers.dense(inputs=self.obs, units=self.args['n_layer_c_1'], activation=tf.nn.relu) 66 | layer_2 = tf.layers.dense(inputs=layer_1, units=self.args['n_layer_c_2'], activation=tf.nn.relu) 67 | v_preds = tf.layers.dense(inputs=layer_2, units=1, activation=None) 68 | params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope) 69 | return v_preds, params 70 | 71 | def build_loss(self): 72 | with tf.variable_scope('update_critic' + "_" + str(self.index)): 73 | self.advantage = self.rewards - self.v_preds 74 | self.c_loss = tf.reduce_mean(tf.square(self.advantage)) 75 | ''' 76 | gradients = self.optimizer_c.compute_gradients(self.c_loss, var_list=self.v_param) 77 | for i, (grad, var) in enumerate(gradients): 78 | if grad is not None: 79 | gradients[i] = (tf.clip_by_norm(grad, self.args['grad_clip']), var) 80 | self.train_c_op = self.optimizer_c.apply_gradients(gradients) 81 | ''' 82 | self.train_c_op = self.optimizer_c.minimize(self.c_loss) 83 | 84 | with tf.variable_scope('update_actor' + "_" + str(self.index)): 85 | with tf.variable_scope('loss/clip'): 86 | # ratios = tf.divide(act_probs, act_probs_old) 87 | if self.args['continuous_action']: 88 | act_probs = self.act_probs.prob(self.actions) 89 | act_probs_old = self.o_act_probs.prob(self.actions) 90 | entropy = self.act_probs.entropy() 91 | ratios = act_probs / act_probs_old 92 | #ratios = self.act_probs.prob(self.actions) / self.o_act_probs.prob(self.actions) 93 | else: 94 | act_probs = self.act_probs * self.actions# * tf.one_hot(indices=self.actions, depth=self.act_probs.shape[1]) 95 | act_probs = tf.reduce_sum(act_probs, axis=1) 96 | # probabilities of actions which agent took with old policy 97 | act_probs_old = self.o_act_probs * self.actions#* tf.one_hot(indices=self.actions, depth=self.o_act_probs.shape[1]) 98 | act_probs_old = tf.reduce_sum(act_probs_old, axis=1) 99 | entropy = -tf.reduce_sum(self.act_probs * 100 | tf.log(tf.clip_by_value(self.act_probs, 1e-9, 1.0)), axis=1) 101 | ratios = tf.exp(tf.log(act_probs) - tf.log(act_probs_old)) 102 | self.entropy = self.args['c2'] * tf.reduce_mean(entropy) # mean of entropy of pi(obs) 103 | clipped_ratios = tf.clip_by_value(ratios, clip_value_min=1 - self.args['clip_value'], clip_value_max=1 + self.args['clip_value']) 104 | loss_clip = tf.minimum(tf.multiply(self.gaes, ratios), tf.multiply(self.gaes, clipped_ratios)) 105 | self.loss_clip = tf.reduce_mean(loss_clip) 106 | self.a_loss = -(self.loss_clip + self.entropy) 107 | ''' 108 | gradients_t = self.optimizer.compute_gradients(self.a_loss, var_list=self.policy_param) 109 | for i, (grad, var) in enumerate(gradients_t): 110 | if grad is not None: 111 | gradients_t[i] = (tf.clip_by_norm(grad, self.args['grad_clip']), var) 112 | self.train_a_op = 
self.optimizer.apply_gradients(gradients_t) 113 | ''' 114 | self.train_a_op = self.optimizer.minimize(self.a_loss) 115 | 116 | def choose_action(self, obs, agent_id=0): 117 | obs = np.array(obs) 118 | obs = obs[np.newaxis, :] 119 | if self.args['continuous_action']: 120 | actions, v_preds = self.sess.run([self.sample_action, self.v_preds], {self.obs: obs}) 121 | #print('clip', actions[0], self.args['action_clip'], np.clip(actions[0], -self.args['action_clip'], self.args['action_clip'])) 122 | return np.clip(actions[0], -self.args['action_clip'], self.args['action_clip']) 123 | else: 124 | if self.args['stochastic']: 125 | actions, v_preds, p = self.sess.run([self.act_stochastic, self.v_preds, self.act_probs], feed_dict={self.obs: obs}) 126 | action = actions[0] 127 | action_one_hot = np.zeros(self.n_actions) 128 | action_one_hot[action] = 1 129 | #print(p) 130 | return action_one_hot 131 | else: 132 | actions, v_preds = self.sess.run([self.act_deterministic, self.v_preds], feed_dict={self.obs: obs}) 133 | action = actions[0] 134 | action_one_hot = np.zeros(self.n_actions) 135 | action_one_hot[action] = 1 136 | return action_one_hot 137 | 138 | def get_action_and_v(self, s): 139 | obs = np.array(s) 140 | if self.args['continuous_action']: 141 | actions, v_preds = self.sess.run([self.sample_action, self.v_preds], {self.obs: obs}) 142 | return actions, v_preds 143 | else: 144 | if self.args['stochastic']: 145 | actions, v_preds, p = self.sess.run([self.act_probs, self.v_preds, self.act_probs], feed_dict={self.obs: obs}) 146 | return actions, v_preds 147 | 148 | def choose_hold_action(self, obs): 149 | return np.zeros(self.n_actions) 150 | 151 | def choose_deterministic_action(self, obs): 152 | actions = self.sess.run([self.act_deterministic], feed_dict={self.obs: obs})[0] 153 | action_one_hots = [] 154 | for i in range(len(actions)): 155 | action = actions[i] 156 | action_one_hot = np.zeros(self.n_actions) 157 | action_one_hot[action] = 1 158 | action_one_hots.append(action_one_hot) 159 | return action_one_hots 160 | 161 | def get_v(self, s, agent_id=0): 162 | obs = np.array(s) 163 | obs = obs[np.newaxis, :] 164 | v_preds = self.sess.run(self.v_preds, {self.obs: obs}) 165 | return v_preds[0, 0] 166 | 167 | def update(self, actor, s, a, r, options, terms, epi, agentid=0): 168 | self.sess.run(self.replace_op) 169 | adv = self.sess.run(self.advantage, {self.obs: s, self.rewards: r}) 170 | for i in range(self.args['epi_train_times']): 171 | _, a_loss, clip, entropy = self.sess.run([self.train_a_op, self.a_loss, self.loss_clip, self.entropy], {self.obs: s, self.actions: a, self.gaes: adv}) 172 | __, c_loss = self.sess.run([self.train_c_op, self.c_loss], {self.obs: s, self.rewards: r}) 173 | self.logger.write_tb_log('a_loss_' + str(self.index), a_loss, self.learning_step) 174 | self.logger.write_tb_log('c_loss_' + str(self.index), c_loss, self.learning_step) 175 | self.logger.write_tb_log('clip_' + str(self.index), clip, self.learning_step) 176 | self.logger.write_tb_log('entropy_' + str(self.index), entropy, self.learning_step) 177 | self.learning_step += 1 178 | #print(a_loss, clip, entropy, c_loss) 179 | 180 | def load_model(self, path): 181 | saver = tf.train.Saver(self.policy_param) 182 | print(path + '.ckpt') 183 | saver.restore(self.sess, path + ".ckpt") 184 | 185 | def save_model(self, path): 186 | saver = tf.train.Saver(self.policy_param) 187 | saver.save(self.sess, path + ".ckpt") 188 | -------------------------------------------------------------------------------- 
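A minimal usage sketch for the PPO class defined in alg/muti_ptf_ppo/ppo.py above. This sketch is not part of the repository: the hyperparameter values, the 'adam' optimizer key, and the DummyLogger stand-in for util/logger.py are illustrative assumptions (the real values come from config/ppo_conf.yaml and are wired up in main.py).

import numpy as np
import tensorflow as tf
from alg.muti_ptf_ppo.ppo import PPO

args = {  # hypothetical settings; the repository reads these from config/ppo_conf.yaml
    'policy': 'pi', 'old_policy': 'old_pi',
    'continuous_action': False, 'stochastic': True, 'action_clip': 1.0,
    'n_layer_a_1': 64, 'n_layer_a_2': 64, 'n_layer_c_1': 64, 'n_layer_c_2': 64,
    'optimizer': 'adam',  # assumed to be a key accepted by alg/optimizer.py
    'learning_rate_a': 1e-4, 'learning_rate_c': 1e-3,
    'clip_value': 0.2, 'c2': 0.01, 'epi_train_times': 10,
}

class DummyLogger:  # stand-in for the Logger in util/logger.py
    def write_tb_log(self, tag, value, step):
        pass

sess = tf.Session()
agent = PPO(n_actions=5, n_features=10, args=args, SESS=sess, logger=DummyLogger(), i=0)
sess.run(tf.global_variables_initializer())  # the class leaves variable initialization to the caller

obs = np.zeros(10, dtype=np.float32)
action = agent.choose_action(obs)  # one-hot action sampled from the softmax policy
value = agent.get_v(obs)           # critic estimate of V(s)
# After collecting a batch (s: [N, 10], one-hot a: [N, 5], returns r: [N, 1]):
# agent.update(None, s, a, r, options=None, terms=None, epi=0)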
/game/particle/multiagent/rendering.py: -------------------------------------------------------------------------------- 1 | """ 2 | 2D rendering framework 3 | """ 4 | from __future__ import division 5 | import os 6 | import six 7 | import sys 8 | 9 | if "Apple" in sys.version: 10 | if 'DYLD_FALLBACK_LIBRARY_PATH' in os.environ: 11 | os.environ['DYLD_FALLBACK_LIBRARY_PATH'] += ':/usr/lib' 12 | # (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite 13 | 14 | from gym.utils import reraise 15 | from gym import error 16 | 17 | try: 18 | import pyglet 19 | except ImportError as e: 20 | reraise(suffix="HINT: you can install pyglet directly via 'pip install pyglet'. But if you really just want to install all Gym dependencies and not have to think about it, 'pip install -e .[all]' or 'pip install gym[all]' will do it.") 21 | 22 | try: 23 | from pyglet.gl import * 24 | except ImportError as e: 25 | reraise(prefix="Error occured while running `from pyglet.gl import *`",suffix="HINT: make sure you have OpenGL install. On Ubuntu, you can run 'apt-get install python-opengl'. If you're running on a server, you may need a virtual frame buffer; something like this should work: 'xvfb-run -s \"-screen 0 1400x900x24\" python '") 26 | 27 | import math 28 | import numpy as np 29 | 30 | RAD2DEG = 57.29577951308232 31 | 32 | 33 | def get_display(spec): 34 | """Convert a display specification (such as :0) into an actual Display 35 | object. 36 | 37 | Pyglet only supports multiple Displays on Linux. 38 | """ 39 | if spec is None: 40 | return None 41 | elif isinstance(spec, six.string_types): 42 | return pyglet.canvas.Display(spec) 43 | else: 44 | raise error.Error('Invalid display specification: {}. (Must be a string like :0 or None.)'.format(spec)) 45 | 46 | 47 | class Viewer(object): 48 | def __init__(self, width, height, display=None): 49 | display = get_display(display) 50 | 51 | self.width = width 52 | self.height = height 53 | 54 | self.window = pyglet.window.Window(width=width, height=height, display=display) 55 | self.window.on_close = self.window_closed_by_user 56 | self.geoms = [] 57 | self.onetime_geoms = [] 58 | self.transform = Transform() 59 | 60 | glEnable(GL_BLEND) 61 | # glEnable(GL_MULTISAMPLE) 62 | glEnable(GL_LINE_SMOOTH) 63 | # glHint(GL_LINE_SMOOTH_HINT, GL_DONT_CARE) 64 | glHint(GL_LINE_SMOOTH_HINT, GL_NICEST) 65 | glLineWidth(2.0) 66 | glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA) 67 | 68 | def close(self): 69 | self.window.close() 70 | 71 | def window_closed_by_user(self): 72 | self.close() 73 | 74 | def set_bounds(self, left, right, bottom, top): 75 | assert right > left and top > bottom 76 | scalex = self.width/(right-left) 77 | scaley = self.height/(top-bottom) 78 | self.transform = Transform( 79 | translation=(-left*scalex, -bottom*scaley), 80 | scale=(scalex, scaley)) 81 | 82 | def add_geom(self, geom): 83 | self.geoms.append(geom) 84 | 85 | def add_onetime(self, geom): 86 | self.onetime_geoms.append(geom) 87 | 88 | def render(self, return_rgb_array=False): 89 | glClearColor(1,1,1,1) 90 | self.window.clear() 91 | self.window.switch_to() 92 | self.window.dispatch_events() 93 | self.transform.enable() 94 | for geom in self.geoms: 95 | geom.render() 96 | for geom in self.onetime_geoms: 97 | geom.render() 98 | self.transform.disable() 99 | arr = None 100 | if return_rgb_array: 101 | buffer = pyglet.image.get_buffer_manager().get_color_buffer() 102 | image_data = buffer.get_image_data() 103 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') 104 | # In 
https://github.com/openai/gym-http-api/issues/2, we 105 | # discovered that someone using Xmonad on Arch was having 106 | # a window of size 598 x 398, though a 600 x 400 window 107 | # was requested. (Guess Xmonad was preserving a pixel for 108 | # the boundary.) So we use the buffer height/width rather 109 | # than the requested one. 110 | arr = arr.reshape(buffer.height, buffer.width, 4) 111 | arr = arr[::-1,:,0:3] 112 | self.window.flip() 113 | self.onetime_geoms = [] 114 | return arr 115 | 116 | # Convenience 117 | def draw_circle(self, radius=10, res=30, filled=True, **attrs): 118 | geom = make_circle(radius=radius, res=res, filled=filled) 119 | _add_attrs(geom, attrs) 120 | self.add_onetime(geom) 121 | return geom 122 | 123 | def draw_polygon(self, v, filled=True, **attrs): 124 | geom = make_polygon(v=v, filled=filled) 125 | _add_attrs(geom, attrs) 126 | self.add_onetime(geom) 127 | return geom 128 | 129 | def draw_polyline(self, v, **attrs): 130 | geom = make_polyline(v=v) 131 | _add_attrs(geom, attrs) 132 | self.add_onetime(geom) 133 | return geom 134 | 135 | def draw_line(self, start, end, **attrs): 136 | geom = Line(start, end) 137 | _add_attrs(geom, attrs) 138 | self.add_onetime(geom) 139 | return geom 140 | 141 | def get_array(self): 142 | self.window.flip() 143 | image_data = pyglet.image.get_buffer_manager().get_color_buffer().get_image_data() 144 | self.window.flip() 145 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') 146 | arr = arr.reshape(self.height, self.width, 4) 147 | return arr[::-1,:,0:3] 148 | 149 | 150 | def _add_attrs(geom, attrs): 151 | if "color" in attrs: 152 | geom.set_color(*attrs["color"]) 153 | if "linewidth" in attrs: 154 | geom.set_linewidth(attrs["linewidth"]) 155 | 156 | 157 | class Geom(object): 158 | def __init__(self): 159 | self._color=Color((0, 0, 0, 1.0)) 160 | self.attrs = [self._color] 161 | 162 | def render(self): 163 | for attr in reversed(self.attrs): 164 | attr.enable() 165 | self.render1() 166 | for attr in self.attrs: 167 | attr.disable() 168 | 169 | def render1(self): 170 | raise NotImplementedError 171 | 172 | def add_attr(self, attr): 173 | self.attrs.append(attr) 174 | 175 | def set_color(self, r, g, b, alpha=1): 176 | self._color.vec4 = (r, g, b, alpha) 177 | 178 | 179 | class Attr(object): 180 | def enable(self): 181 | raise NotImplementedError 182 | 183 | def disable(self): 184 | pass 185 | 186 | 187 | class Transform(Attr): 188 | def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1,1)): 189 | self.set_translation(*translation) 190 | self.set_rotation(rotation) 191 | self.set_scale(*scale) 192 | 193 | def enable(self): 194 | glPushMatrix() 195 | glTranslatef(self.translation[0], self.translation[1], 0) # translate to GL loc ppint 196 | glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0) 197 | glScalef(self.scale[0], self.scale[1], 1) 198 | 199 | def disable(self): 200 | glPopMatrix() 201 | 202 | def set_translation(self, newx, newy): 203 | self.translation = (float(newx), float(newy)) 204 | 205 | def set_rotation(self, new): 206 | self.rotation = float(new) 207 | 208 | def set_scale(self, newx, newy): 209 | self.scale = (float(newx), float(newy)) 210 | 211 | 212 | class Color(Attr): 213 | def __init__(self, vec4): 214 | self.vec4 = vec4 215 | 216 | def enable(self): 217 | glColor4f(*self.vec4) 218 | 219 | 220 | class LineStyle(Attr): 221 | def __init__(self, style): 222 | self.style = style 223 | 224 | def enable(self): 225 | glEnable(GL_LINE_STIPPLE) 226 | glLineStipple(1, self.style) 227 | 228 | def 
disable(self): 229 | glDisable(GL_LINE_STIPPLE) 230 | 231 | 232 | class LineWidth(Attr): 233 | def __init__(self, stroke): 234 | self.stroke = stroke 235 | 236 | def enable(self): 237 | glLineWidth(self.stroke) 238 | 239 | 240 | class Point(Geom): 241 | def __init__(self): 242 | Geom.__init__(self) 243 | 244 | def render1(self): 245 | glBegin(GL_POINTS) # draw point 246 | glVertex3f(0.0, 0.0, 0.0) 247 | glEnd() 248 | 249 | 250 | class FilledPolygon(Geom): 251 | def __init__(self, v): 252 | Geom.__init__(self) 253 | self.v = v 254 | 255 | def render1(self): 256 | if len(self.v) == 4: glBegin(GL_QUADS) 257 | elif len(self.v) > 4: glBegin(GL_POLYGON) 258 | else: glBegin(GL_TRIANGLES) 259 | for p in self.v: 260 | glVertex3f(p[0], p[1], 0) # draw each vertex 261 | glEnd() 262 | 263 | color = (self._color.vec4[0] * 0.5, self._color.vec4[1] * 0.5, self._color.vec4[2] * 0.5, self._color.vec4[3] * 0.5) 264 | glColor4f(*color) 265 | glBegin(GL_LINE_LOOP) 266 | for p in self.v: 267 | glVertex3f(p[0], p[1], 0) # draw each vertex 268 | glEnd() 269 | 270 | 271 | def make_circle(radius=10, res=30, filled=True): 272 | points = [] 273 | for i in range(res): 274 | ang = 2*math.pi*i / res 275 | points.append((math.cos(ang)*radius, math.sin(ang)*radius)) 276 | if filled: 277 | return FilledPolygon(points) 278 | else: 279 | return PolyLine(points, True) 280 | 281 | 282 | def make_polygon(v, filled=True): 283 | if filled: return FilledPolygon(v) 284 | else: return PolyLine(v, True) 285 | 286 | 287 | def make_polyline(v): 288 | return PolyLine(v, False) 289 | 290 | 291 | def make_capsule(length, width): 292 | l, r, t, b = 0, length, width/2, -width/2 293 | box = make_polygon([(l,b), (l,t), (r,t), (r,b)]) 294 | circ0 = make_circle(width/2) 295 | circ1 = make_circle(width/2) 296 | circ1.add_attr(Transform(translation=(length, 0))) 297 | geom = Compound([box, circ0, circ1]) 298 | return geom 299 | 300 | 301 | class Compound(Geom): 302 | def __init__(self, gs): 303 | Geom.__init__(self) 304 | self.gs = gs 305 | for g in self.gs: 306 | g.attrs = [a for a in g.attrs if not isinstance(a, Color)] 307 | 308 | def render1(self): 309 | for g in self.gs: 310 | g.render() 311 | 312 | 313 | class PolyLine(Geom): 314 | def __init__(self, v, close): 315 | Geom.__init__(self) 316 | self.v = v 317 | self.close = close 318 | self.linewidth = LineWidth(1) 319 | self.add_attr(self.linewidth) 320 | 321 | def render1(self): 322 | glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP) 323 | for p in self.v: 324 | glVertex3f(p[0], p[1],0) # draw each vertex 325 | glEnd() 326 | 327 | def set_linewidth(self, x): 328 | self.linewidth.stroke = x 329 | 330 | 331 | class Line(Geom): 332 | def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)): 333 | Geom.__init__(self) 334 | self.start = start 335 | self.end = end 336 | self.linewidth = LineWidth(1) 337 | self.add_attr(self.linewidth) 338 | 339 | def render1(self): 340 | glBegin(GL_LINES) 341 | glVertex2f(*self.start) 342 | glVertex2f(*self.end) 343 | glEnd() 344 | 345 | 346 | class Image(Geom): 347 | def __init__(self, fname, width, height): 348 | Geom.__init__(self) 349 | self.width = width 350 | self.height = height 351 | img = pyglet.image.load(fname) 352 | self.img = img 353 | self.flip = False 354 | 355 | def render1(self): 356 | self.img.blit(-self.width/2, -self.height/2, width=self.width, height=self.height) 357 | 358 | # ================================================================ 359 | 360 | 361 | class SimpleImageViewer(object): 362 | def __init__(self, display=None): 363 | 
self.window = None 364 | self.isopen = False 365 | self.display = display 366 | 367 | def imshow(self, arr): 368 | if self.window is None: 369 | height, width, channels = arr.shape 370 | self.window = pyglet.window.Window(width=width, height=height, display=self.display) 371 | self.width = width 372 | self.height = height 373 | self.isopen = True 374 | assert arr.shape == (self.height, self.width, 3), "You passed in an image with the wrong number shape" 375 | image = pyglet.image.ImageData(self.width, self.height, 'RGB', arr.tobytes(), pitch=self.width * -3) 376 | self.window.clear() 377 | self.window.switch_to() 378 | self.window.dispatch_events() 379 | image.blit(0,0) 380 | self.window.flip() 381 | 382 | def close(self): 383 | if self.isopen: 384 | self.window.close() 385 | self.isopen = False 386 | 387 | def __del__(self): 388 | self.close() 389 | -------------------------------------------------------------------------------- /alg/muti_ptf_ppo/ppo_add_entropy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from alg.optimizer import Optimizer 4 | 5 | 6 | class PPO: 7 | def __init__(self, n_actions, n_features, args, SESS, logger, index): 8 | self.index = index 9 | self.args = args 10 | self.n_actions = n_actions 11 | self.n_features = n_features 12 | self.logger = logger 13 | self.learning_step = 0 14 | 15 | self.obs = tf.placeholder(tf.float32, [None, self.n_features], 's') 16 | 17 | self.act_probs, self.policy_param = self.build_actor_net(self.args['policy'] + "_" + str(self.index)) 18 | self.o_act_probs, self.o_policy_param = self.build_actor_net(self.args['old_policy'] + "_" + str(self.index)) 19 | self.v_preds, self.v_param = self.build_critic_net('critic' + "_" + str(self.index)) 20 | 21 | if self.args['continuous_action']: 22 | self.sample_action = tf.squeeze(self.act_probs.sample(1), axis=0) 23 | else: 24 | self.act_stochastic = tf.multinomial(tf.log(self.act_probs), num_samples=1) 25 | self.act_stochastic = tf.reshape(self.act_stochastic, shape=[-1]) 26 | self.act_deterministic = tf.argmax(self.act_probs, axis=1) 27 | 28 | self.replace_op = [tf.assign(t, e) for t, e in zip(self.o_policy_param, self.policy_param)] 29 | 30 | opt = Optimizer(args['optimizer'], args['learning_rate_a']) 31 | self.optimizer = opt.get_optimizer() 32 | opt_c = Optimizer(args['optimizer'], args['learning_rate_c']) 33 | self.optimizer_c = opt_c.get_optimizer() 34 | 35 | with tf.variable_scope('train_inp' + "_" + str(self.index)): 36 | if self.args['continuous_action']: 37 | self.actions = tf.placeholder(tf.float32, [None, self.n_actions], 'action') 38 | self.mu = tf.placeholder(tf.float32, [None, self.n_actions], 'input_mu') 39 | self.sigma = tf.placeholder(tf.float32, [None, self.n_actions], 'input_sigma') 40 | else: 41 | self.actions = tf.placeholder(dtype=tf.float32, shape=[None, self.n_actions], name='actions') 42 | self.s_a_prob = tf.placeholder(dtype=tf.float32, shape=[None, self.n_actions], name='s_a_prob') 43 | self.rewards = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='rewards') 44 | self.v_preds_next = tf.placeholder(dtype=tf.float32, shape=[None], name='v_preds_next') 45 | self.gaes = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='gaes') 46 | self.term = tf.placeholder(dtype=tf.float32, shape=[None], name='term') 47 | self.e = tf.placeholder(tf.float32, (), 'e') 48 | 49 | self.build_loss() 50 | 51 | self.sess = SESS 52 | 53 | def build_actor_net(self, scope, trainable=True): 54 | 
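# Actor network: two ReLU hidden layers. With continuous actions it outputs a Normal(mu, sigma)
# distribution (mu squashed by tanh and scaled by action_clip); otherwise it outputs softmax
# probabilities over the n_actions discrete actions.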
with tf.variable_scope(scope): 55 | layer_1 = tf.layers.dense(inputs=self.obs, units=self.args['n_layer_a_1'], activation=tf.nn.relu, trainable=trainable) 56 | layer_2 = tf.layers.dense(inputs=layer_1, units=self.args['n_layer_a_2'], activation=tf.nn.relu, 57 | trainable=trainable) 58 | if self.args['continuous_action']: 59 | mu = self.args['action_clip'] * tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.tanh, trainable=trainable) 60 | sigma = tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.softplus, trainable=trainable) 61 | act_probs = tf.distributions.Normal(loc=mu, scale=sigma + 1e-9) 62 | else: 63 | act_probs = tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.softmax) 64 | params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope) 65 | return act_probs, params 66 | 67 | def build_critic_net(self, scope): 68 | with tf.variable_scope(scope): 69 | layer_1 = tf.layers.dense(inputs=self.obs, units=self.args['n_layer_c_1'], activation=tf.nn.relu) 70 | layer_2 = tf.layers.dense(inputs=layer_1, units=self.args['n_layer_c_2'], activation=tf.nn.relu) 71 | v_preds = tf.layers.dense(inputs=layer_2, units=1, activation=None) 72 | params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope) 73 | return v_preds, params 74 | 75 | def build_loss(self): 76 | with tf.variable_scope('update_critic' + "_" + str(self.index)): 77 | self.advantage = self.rewards - self.v_preds 78 | self.c_loss = tf.reduce_mean(tf.square(self.advantage)) 79 | self.train_c_op = self.optimizer_c.minimize(self.c_loss, var_list=self.v_param) 80 | 81 | with tf.variable_scope('update_actor' + "_" + str(self.index)): 82 | if self.args['continuous_action']: 83 | act_probs = self.act_probs.prob(self.actions) 84 | act_probs_old = self.o_act_probs.prob(self.actions) 85 | entropy = self.act_probs.entropy() 86 | otherNormal = tf.distributions.Normal(self.mu, self.sigma) 87 | otherEntroy = otherNormal.cross_entropy(self.act_probs) 88 | else: 89 | act_probs = self.act_probs * self.actions #tf.one_hot(indices=self.actions, depth=self.act_probs.shape[1]) 90 | act_probs = tf.reduce_sum(act_probs, axis=1) 91 | # probabilities of actions which agent took with old policy 92 | act_probs_old = self.o_act_probs * self.actions #tf.one_hot(indices=self.actions, depth=self.o_act_probs.shape[1]) 93 | act_probs_old = tf.reduce_sum(act_probs_old, axis=1) 94 | entropy = -tf.reduce_sum(self.act_probs * 95 | tf.log(tf.clip_by_value(self.act_probs, 1e-9, 1.0)), axis=1) 96 | #otherEntroy = -self.s_a_prob * tf.log(self.act_probs + 1e-9) 97 | otherEntroy = -self.s_a_prob * tf.log(tf.clip_by_value(self.act_probs, 1e-9, 1.0)) 98 | 99 | with tf.variable_scope('loss/clip' + "_" + str(self.index)): 100 | # ratios = tf.divide(act_probs, act_probs_old) 101 | ratios = tf.exp(tf.log(act_probs) - tf.log(act_probs_old)) 102 | clipped_ratios = tf.clip_by_value(ratios, clip_value_min=1 - self.args['clip_value'], 103 | clip_value_max=1 + self.args['clip_value']) 104 | loss_clip = tf.minimum(tf.multiply(self.gaes, ratios), tf.multiply(self.gaes, clipped_ratios)) 105 | self.loss_clip = tf.reduce_mean(loss_clip) 106 | 107 | self.entropy = self.args['c2'] * tf.reduce_mean(entropy) # mean of entropy of pi(obs) 108 | 109 | t = tf.reshape(self.term, shape=[-1, 1]) 110 | entropyTS = tf.reduce_sum(otherEntroy, axis=1, 111 | keepdims=True) 112 | weight = 0.5 + tf.tanh(3 - self.args['c3'] * self.e) / 2 113 | entropyTS = entropyTS * weight * self.args['c1'] 114 | self.entropyTS = 
tf.reduce_mean(entropyTS) 115 | 116 | self.a_loss = -(self.loss_clip + self.entropy) + self.entropyTS 117 | self.train_a_op = self.optimizer.minimize(self.a_loss, var_list=self.policy_param) 118 | 119 | def choose_action(self, obs): 120 | obs = np.array(obs) 121 | obs = obs[np.newaxis, :] 122 | if self.args['continuous_action']: 123 | actions, v_preds = self.sess.run([self.sample_action, self.v_preds], {self.obs: obs}) 124 | return np.clip(actions[0], -self.args['action_clip'], self.args['action_clip']) 125 | else: 126 | if self.args['stochastic']: 127 | actions, v_preds, p = self.sess.run([self.act_stochastic, self.v_preds, self.act_probs], feed_dict={self.obs: obs}) 128 | action = actions[0] 129 | action_one_hot = np.zeros(self.n_actions) 130 | action_one_hot[action] = 1 131 | return action_one_hot 132 | else: 133 | actions, v_preds = self.sess.run([self.act_deterministic, self.v_preds], feed_dict={self.obs: obs}) 134 | action = actions[0] 135 | action_one_hot = np.zeros(self.n_actions) 136 | action_one_hot[action] = 1 137 | return action_one_hot 138 | 139 | def choose_hold_action(self, obs): 140 | return np.zeros(self.n_actions) 141 | 142 | def choose_deterministic_action(self, obs, agent_id=0): 143 | actions = self.sess.run([self.act_deterministic], feed_dict={self.obs: obs})[0] 144 | action_one_hots = [] 145 | for i in range(len(actions)): 146 | action = actions[i] 147 | action_one_hot = np.zeros(self.n_actions) 148 | action_one_hot[action] = 1 149 | action_one_hots.append(action_one_hot) 150 | return action_one_hots 151 | 152 | def choose_acton_prob(self, observation, action): 153 | observation = np.array(observation) 154 | observation = observation[np.newaxis, :] 155 | if self.args['continuous_action']: 156 | actions_value = self.sess.run(self.act_probs, feed_dict={self.obs: observation}) 157 | actions_value = [actions_value[0][0], actions_value[1][0]] 158 | else: 159 | actions_value = self.sess.run(self.act_probs, feed_dict={self.obs: observation})[0] 160 | return actions_value 161 | 162 | def get_v(self, s): 163 | s = np.array(s) 164 | return self.sess.run(self.v_preds, {self.obs: s[np.newaxis, :]})[0, 0] 165 | 166 | def update(self, actor, s, a, r, options, terms, epi, agentid): 167 | self.sess.run(self.replace_op) 168 | 169 | source_actor_prob = [] 170 | mu = [] 171 | sigma = [] 172 | for i, o in enumerate(options): 173 | o = int(o) 174 | if o == agentid: 175 | terms[i] = 0 176 | if self.args['continuous_action']: 177 | a_prob = actor[o].choose_acton_prob(s[i], a[i]) 178 | mu.append(a_prob[0]) 179 | sigma.append(a_prob[1]) 180 | else: 181 | if o == agentid: 182 | a_prob = actor[o].choose_hold_action(s[i]) 183 | else: 184 | a_prob = actor[o].choose_acton_prob(s[i], a[i]) 185 | source_actor_prob.append(a_prob) 186 | adv = self.sess.run(self.advantage, {self.obs: s, self.rewards: r}) 187 | if self.args['continuous_action']: 188 | for i in range(self.args['epi_train_times']): 189 | _, a_loss, clip, entropy, entropyTS = self.sess.run( 190 | [self.train_a_op, self.a_loss, self.loss_clip, self.entropy, self.entropyTS], 191 | {self.obs: s, self.actions: a, self.gaes: adv, self.term: terms, 192 | self.mu: mu, self.sigma: sigma, self.e: epi}) 193 | __, c_loss = self.sess.run([self.train_c_op, self.c_loss], {self.obs: s, self.rewards: r}) 194 | self.logger.write_tb_log('a_loss_' + str(self.index), a_loss, self.learning_step) 195 | self.logger.write_tb_log('c_loss_' + str(self.index), c_loss, self.learning_step) 196 | self.logger.write_tb_log('clip_' + str(self.index), clip, 
self.learning_step) 197 | self.logger.write_tb_log('entropy_' + str(self.index), entropy, self.learning_step) 198 | self.logger.write_tb_log('entropyTS_' + str(self.index), entropyTS, self.learning_step) 199 | self.learning_step += 1 200 | else: 201 | for i in range(self.args['epi_train_times']): 202 | _, a_loss, clip, entropy, entropyTS = self.sess.run( 203 | [self.train_a_op, self.a_loss, self.loss_clip, self.entropy, self.entropyTS], 204 | {self.obs: s, self.actions: a, self.gaes: adv, self.term: terms, 205 | self.s_a_prob: source_actor_prob, self.e: epi}) 206 | __, c_loss = self.sess.run([self.train_c_op, self.c_loss], {self.obs: s, self.rewards: r}) 207 | self.logger.write_tb_log('a_loss_' + str(self.index), a_loss, self.learning_step) 208 | self.logger.write_tb_log('c_loss_' + str(self.index), c_loss, self.learning_step) 209 | self.logger.write_tb_log('clip_' + str(self.index), clip, self.learning_step) 210 | self.logger.write_tb_log('entropy_' + str(self.index), entropy, self.learning_step) 211 | self.logger.write_tb_log('entropyTS_' + str(self.index), entropyTS, self.learning_step) 212 | self.learning_step += 1 213 | 214 | def load_model(self, path): 215 | saver = tf.train.Saver(self.policy_param) 216 | saver.restore(self.sess, path + ".ckpt") 217 | 218 | def save_model(self, path): 219 | saver = tf.train.Saver(self.policy_param) 220 | saver.save(self.sess, path + ".ckpt") 221 | -------------------------------------------------------------------------------- /alg/maddpg/common/tf_util.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import numpy as np 3 | import os 4 | import tensorflow as tf 5 | 6 | def sum(x, axis=None, keepdims=False): 7 | return tf.reduce_sum(x, axis=None if axis is None else [axis], keep_dims = keepdims) 8 | def mean(x, axis=None, keepdims=False): 9 | return tf.reduce_mean(x, axis=None if axis is None else [axis], keep_dims = keepdims) 10 | def var(x, axis=None, keepdims=False): 11 | meanx = mean(x, axis=axis, keepdims=keepdims) 12 | return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims) 13 | def std(x, axis=None, keepdims=False): 14 | return tf.sqrt(var(x, axis=axis, keepdims=keepdims)) 15 | def max(x, axis=None, keepdims=False): 16 | return tf.reduce_max(x, axis=None if axis is None else [axis], keep_dims = keepdims) 17 | def min(x, axis=None, keepdims=False): 18 | return tf.reduce_min(x, axis=None if axis is None else [axis], keep_dims = keepdims) 19 | def concatenate(arrs, axis=0): 20 | return tf.concat(axis=axis, values=arrs) 21 | def argmax(x, axis=None): 22 | return tf.argmax(x, axis=axis) 23 | def softmax(x, axis=None): 24 | return tf.nn.softmax(x, axis=axis) 25 | 26 | # ================================================================ 27 | # Misc 28 | # ================================================================ 29 | 30 | 31 | def is_placeholder(x): 32 | return type(x) is tf.Tensor and len(x.op.inputs) == 0 33 | 34 | # ================================================================ 35 | # Inputs 36 | # ================================================================ 37 | 38 | 39 | class TfInput(object): 40 | def __init__(self, name="(unnamed)"): 41 | """Generalized Tensorflow placeholder. The main differences are: 42 | - possibly uses multiple placeholders internally and returns multiple values 43 | - can apply light postprocessing to the value feed to placeholder. 
44 | """ 45 | self.name = name 46 | 47 | def get(self): 48 | """Return the tf variable(s) representing the possibly postprocessed value 49 | of placeholder(s). 50 | """ 51 | raise NotImplemented() 52 | 53 | def make_feed_dict(data): 54 | """Given data input it to the placeholder(s).""" 55 | raise NotImplemented() 56 | 57 | 58 | class PlacholderTfInput(TfInput): 59 | def __init__(self, placeholder): 60 | """Wrapper for regular tensorflow placeholder.""" 61 | super().__init__(placeholder.name) 62 | self._placeholder = placeholder 63 | 64 | def get(self): 65 | return self._placeholder 66 | 67 | def make_feed_dict(self, data): 68 | return {self._placeholder: data} 69 | 70 | 71 | class BatchInput(PlacholderTfInput): 72 | def __init__(self, shape, dtype=tf.float32, name=None): 73 | """Creates a placeholder for a batch of tensors of a given shape and dtype 74 | 75 | Parameters 76 | ---------- 77 | shape: [int] 78 | shape of a single elemenet of the batch 79 | dtype: tf.dtype 80 | number representation used for tensor contents 81 | name: str 82 | name of the underlying placeholder 83 | """ 84 | super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name)) 85 | 86 | 87 | class Uint8Input(PlacholderTfInput): 88 | def __init__(self, shape, name=None): 89 | """Takes input in uint8 format which is cast to float32 and divided by 255 90 | before passing it to the model. 91 | 92 | On GPU this ensures lower data transfer times. 93 | 94 | Parameters 95 | ---------- 96 | shape: [int] 97 | shape of the tensor. 98 | name: str 99 | name of the underlying placeholder 100 | """ 101 | 102 | super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name)) 103 | self._shape = shape 104 | self._output = tf.cast(super().get(), tf.float32) / 255.0 105 | 106 | def get(self): 107 | return self._output 108 | 109 | 110 | def ensure_tf_input(thing): 111 | """Takes either tf.placeholder of TfInput and outputs equivalent TfInput""" 112 | if isinstance(thing, TfInput): 113 | return thing 114 | elif is_placeholder(thing): 115 | return PlacholderTfInput(thing) 116 | else: 117 | raise ValueError("Must be a placeholder or TfInput") 118 | 119 | # ================================================================ 120 | # Mathematical utils 121 | # ================================================================ 122 | 123 | 124 | def huber_loss(x, delta=1.0): 125 | """Reference: https://en.wikipedia.org/wiki/Huber_loss""" 126 | return tf.where( 127 | tf.abs(x) < delta, 128 | tf.square(x) * 0.5, 129 | delta * (tf.abs(x) - 0.5 * delta) 130 | ) 131 | 132 | # ================================================================ 133 | # Optimizer utils 134 | # ================================================================ 135 | 136 | 137 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 138 | """Minimized `objective` using `optimizer` w.r.t. 
variables in 139 | `var_list` while ensure the norm of the gradients for each 140 | variable is clipped to `clip_val` 141 | """ 142 | if clip_val is None: 143 | return optimizer.minimize(objective, var_list=var_list) 144 | else: 145 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 146 | for i, (grad, var) in enumerate(gradients): 147 | if grad is not None: 148 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 149 | return optimizer.apply_gradients(gradients) 150 | 151 | 152 | # ================================================================ 153 | # Global session 154 | # ================================================================ 155 | 156 | def get_session(): 157 | """Returns recently made Tensorflow session""" 158 | return tf.get_default_session() 159 | 160 | 161 | def make_session(num_cpu): 162 | """Returns a session that will use CPU's only""" 163 | tf_config = tf.ConfigProto( 164 | inter_op_parallelism_threads=num_cpu, 165 | intra_op_parallelism_threads=num_cpu) 166 | return tf.Session(config=tf_config) 167 | 168 | 169 | def single_threaded_session(): 170 | """Returns a session which will only use a single CPU""" 171 | return make_session(1) 172 | 173 | 174 | ALREADY_INITIALIZED = set() 175 | 176 | 177 | def initialize(): 178 | """Initialize all the uninitialized variables in the global scope.""" 179 | new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED 180 | get_session().run(tf.variables_initializer(new_variables)) 181 | ALREADY_INITIALIZED.update(new_variables) 182 | 183 | 184 | # ================================================================ 185 | # Scopes 186 | # ================================================================ 187 | 188 | 189 | def scope_vars(scope, trainable_only=False): 190 | """ 191 | Get variables inside a scope 192 | The scope can be specified as a string 193 | 194 | Parameters 195 | ---------- 196 | scope: str or VariableScope 197 | scope in which the variables reside. 198 | trainable_only: bool 199 | whether or not to return only the variables that were marked as trainable. 200 | 201 | Returns 202 | ------- 203 | vars: [tf.Variable] 204 | list of variables in `scope`. 205 | """ 206 | return tf.get_collection( 207 | tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES, 208 | scope=scope if isinstance(scope, str) else scope.name 209 | ) 210 | 211 | 212 | def scope_name(): 213 | """Returns the name of current scope as a string, e.g. 
deepq/q_func""" 214 | return tf.get_variable_scope().name 215 | 216 | 217 | def absolute_scope_name(relative_scope_name): 218 | """Appends parent scope name to `relative_scope_name`""" 219 | return scope_name() + "/" + relative_scope_name 220 | 221 | # ================================================================ 222 | # Saving variables 223 | # ================================================================ 224 | 225 | 226 | def load_state(fname, saver=None): 227 | """Load all the variables to the current session from the location """ 228 | if saver is None: 229 | saver = tf.train.Saver() 230 | saver.restore(get_session(), fname) 231 | return saver 232 | 233 | 234 | def save_state(fname, saver=None): 235 | """Save all the variables in the current session to the location """ 236 | os.makedirs(os.path.dirname(fname), exist_ok=True) 237 | if saver is None: 238 | saver = tf.train.Saver() 239 | saver.save(get_session(), fname + ".ckpt") 240 | return saver 241 | 242 | # ================================================================ 243 | # Theano-like Function 244 | # ================================================================ 245 | 246 | 247 | def function(inputs, outputs, updates=None, givens=None): 248 | """Just like Theano function. Take a bunch of tensorflow placeholders and expersions 249 | computed based on those placeholders and produces f(inputs) -> outputs. Function f takes 250 | values to be feed to the inputs placeholders and produces the values of the experessions 251 | in outputs. 252 | 253 | Input values can be passed in the same order as inputs or can be provided as kwargs based 254 | on placeholder name (passed to constructor or accessible via placeholder.op.name). 255 | 256 | Example: 257 | x = tf.placeholder(tf.int32, (), name="x") 258 | y = tf.placeholder(tf.int32, (), name="y") 259 | z = 3 * x + 2 * y 260 | lin = function([x, y], z, givens={y: 0}) 261 | 262 | with single_threaded_session(): 263 | initialize() 264 | 265 | assert lin(2) == 6 266 | assert lin(x=3) == 9 267 | assert lin(2, 2) == 10 268 | assert lin(x=2, y=3) == 12 269 | 270 | Parameters 271 | ---------- 272 | inputs: [tf.placeholder or TfInput] 273 | list of input arguments 274 | outputs: [tf.Variable] or tf.Variable 275 | list of outputs or a single output to be returned from function. Returned 276 | value will also have the same shape. 
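updates: [tf.Operation]
    extra ops grouped together and run on every call; their outputs are discarded.
givens: dict
    default values fed for any input placeholder that is not supplied at call time.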
277 | """ 278 | if isinstance(outputs, list): 279 | return _Function(inputs, outputs, updates, givens=givens) 280 | elif isinstance(outputs, (dict, collections.OrderedDict)): 281 | f = _Function(inputs, outputs.values(), updates, givens=givens) 282 | return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), f(*args, **kwargs))) 283 | else: 284 | f = _Function(inputs, [outputs], updates, givens=givens) 285 | return lambda *args, **kwargs: f(*args, **kwargs)[0] 286 | 287 | 288 | class _Function(object): 289 | def __init__(self, inputs, outputs, updates, givens, check_nan=False): 290 | for inpt in inputs: 291 | if not issubclass(type(inpt), TfInput): 292 | assert len(inpt.op.inputs) == 0, "inputs should all be placeholders of rl_algs.common.TfInput" 293 | self.inputs = inputs 294 | updates = updates or [] 295 | self.update_group = tf.group(*updates) 296 | self.outputs_update = list(outputs) + [self.update_group] 297 | self.givens = {} if givens is None else givens 298 | self.check_nan = check_nan 299 | 300 | def _feed_input(self, feed_dict, inpt, value): 301 | if issubclass(type(inpt), TfInput): 302 | feed_dict.update(inpt.make_feed_dict(value)) 303 | elif is_placeholder(inpt): 304 | feed_dict[inpt] = value 305 | 306 | def __call__(self, *args, **kwargs): 307 | assert len(args) <= len(self.inputs), "Too many arguments provided" 308 | feed_dict = {} 309 | # Update the args 310 | for inpt, value in zip(self.inputs, args): 311 | self._feed_input(feed_dict, inpt, value) 312 | # Update the kwargs 313 | kwargs_passed_inpt_names = set() 314 | for inpt in self.inputs[len(args):]: 315 | inpt_name = inpt.name.split(':')[0] 316 | inpt_name = inpt_name.split('/')[-1] 317 | assert inpt_name not in kwargs_passed_inpt_names, \ 318 | "this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(inpt_name) 319 | if inpt_name in kwargs: 320 | kwargs_passed_inpt_names.add(inpt_name) 321 | self._feed_input(feed_dict, inpt, kwargs.pop(inpt_name)) 322 | else: 323 | assert inpt in self.givens, "Missing argument " + inpt_name 324 | assert len(kwargs) == 0, "Function got extra arguments " + str(list(kwargs.keys())) 325 | # Update feed dict with givens. 
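# Entries in `givens` act as per-placeholder defaults: the feed_dict.get()
# fallback below is used only when the caller did not already feed that
# placeholder.  This is why, in the docstring example above,
# lin = function([x, y], z, givens={y: 0}) gives lin(2) == 6 (y falls back to 0)
# but lin(2, 2) == 10 (the explicit y overrides the given value).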
326 | for inpt in self.givens: 327 | feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt]) 328 | results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] 329 | if self.check_nan: 330 | if any(np.isnan(r).any() for r in results): 331 | raise RuntimeError("Nan detected") 332 | return results 333 | -------------------------------------------------------------------------------- /alg/sharing_multi_ppo/ppo_add_entropy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from alg.optimizer import Optimizer 4 | 5 | 6 | class PPO: 7 | def __init__(self, n_actions, n_features, n_agents, args, SESS, logger): 8 | self.args = args 9 | self.n_actions = n_actions 10 | self.n_features = n_features + n_agents 11 | self.n_agents = n_agents 12 | self.logger = logger 13 | self.learning_step = 0 14 | 15 | self.obs = tf.placeholder(tf.float32, [None, self.n_features], 's') 16 | 17 | self.act_probs, self.policy_param = self.build_actor_net(self.args['policy']) 18 | self.o_act_probs, self.o_policy_param = self.build_actor_net(self.args['old_policy']) 19 | self.v_preds, self.v_param = self.build_critic_net('critic') 20 | 21 | if self.args['continuous_action']: 22 | self.sample_action = tf.squeeze(self.act_probs.sample(1), axis=0) 23 | else: 24 | self.act_stochastic = tf.multinomial(tf.log(self.act_probs), num_samples=1) 25 | self.act_stochastic = tf.reshape(self.act_stochastic, shape=[-1]) 26 | self.act_deterministic = tf.argmax(self.act_probs, axis=1) 27 | 28 | self.replace_op = [tf.assign(t, e) for t, e in zip(self.o_policy_param, self.policy_param)] 29 | 30 | opt = Optimizer(args['optimizer'], args['learning_rate_a']) 31 | self.optimizer = opt.get_optimizer() 32 | opt_c = Optimizer(args['optimizer'], args['learning_rate_c']) 33 | self.optimizer_c = opt_c.get_optimizer() 34 | 35 | with tf.variable_scope('train_inp'): 36 | if self.args['continuous_action']: 37 | self.actions = tf.placeholder(tf.float32, [None, self.n_actions], 'action') 38 | self.mu = tf.placeholder(tf.float32, [None, self.n_actions], 'input_mu') 39 | self.sigma = tf.placeholder(tf.float32, [None, self.n_actions], 'input_sigma') 40 | else: 41 | self.actions = tf.placeholder(dtype=tf.float32, shape=[None, self.n_actions], name='actions') 42 | self.s_a_prob = tf.placeholder(dtype=tf.float32, shape=[None, self.n_actions], name='s_a_prob') 43 | self.rewards = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='rewards') 44 | self.v_preds_next = tf.placeholder(dtype=tf.float32, shape=[None], name='v_preds_next') 45 | self.gaes = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='gaes') 46 | self.term = tf.placeholder(dtype=tf.float32, shape=[None], name='term') 47 | self.e = tf.placeholder(tf.float32, (), 'e') 48 | 49 | self.build_loss() 50 | 51 | self.sess = SESS 52 | 53 | def build_actor_net(self, scope, trainable=True): 54 | with tf.variable_scope(scope): 55 | layer_1 = tf.layers.dense(inputs=self.obs, units=self.args['n_layer_a_1'], activation=tf.nn.relu, trainable=trainable) 56 | layer_2 = tf.layers.dense(inputs=layer_1, units=self.args['n_layer_a_2'], activation=tf.nn.relu, 57 | trainable=trainable) 58 | if self.args['continuous_action']: 59 | mu = self.args['action_clip'] * tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.tanh, trainable=trainable) 60 | sigma = tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.softplus, trainable=trainable) 61 | act_probs = 
tf.distributions.Normal(loc=mu, scale=sigma + 1e-9) 62 | else: 63 | act_probs = tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.softmax) 64 | params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope) 65 | return act_probs, params 66 | 67 | def build_critic_net(self, scope): 68 | with tf.variable_scope(scope): 69 | layer_1 = tf.layers.dense(inputs=self.obs, units=self.args['n_layer_c_1'], activation=tf.nn.relu) 70 | layer_2 = tf.layers.dense(inputs=layer_1, units=self.args['n_layer_c_2'], activation=tf.nn.relu) 71 | v_preds = tf.layers.dense(inputs=layer_2, units=1, activation=None) 72 | params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope) 73 | return v_preds, params 74 | 75 | def build_loss(self): 76 | with tf.variable_scope('update_critic'): 77 | self.advantage = self.rewards - self.v_preds 78 | self.c_loss = tf.reduce_mean(tf.square(self.advantage)) 79 | self.train_c_op = self.optimizer_c.minimize(self.c_loss, var_list=self.v_param) 80 | 81 | with tf.variable_scope('update_actor'): 82 | if self.args['continuous_action']: 83 | act_probs = self.act_probs.prob(self.actions) 84 | act_probs_old = self.o_act_probs.prob(self.actions) 85 | entropy = self.act_probs.entropy() 86 | otherNormal = tf.distributions.Normal(self.mu, self.sigma) 87 | otherEntroy = otherNormal.cross_entropy(self.act_probs) 88 | else: 89 | act_probs = self.act_probs * self.actions #tf.one_hot(indices=self.actions, depth=self.act_probs.shape[1]) 90 | act_probs = tf.reduce_sum(act_probs, axis=1) 91 | # probabilities of actions which agent took with old policy 92 | act_probs_old = self.o_act_probs * self.actions #tf.one_hot(indices=self.actions, depth=self.o_act_probs.shape[1]) 93 | act_probs_old = tf.reduce_sum(act_probs_old, axis=1) 94 | entropy = -tf.reduce_sum(self.act_probs * 95 | tf.log(tf.clip_by_value(self.act_probs, 1e-9, 1.0)), axis=1) 96 | #otherEntroy = -self.s_a_prob * tf.log(self.act_probs + 1e-9) 97 | otherEntroy = -self.s_a_prob * tf.log(tf.clip_by_value(self.act_probs, 1e-9, 1.0)) 98 | 99 | with tf.variable_scope('loss/clip'): 100 | # ratios = tf.divide(act_probs, act_probs_old) 101 | ratios = tf.exp(tf.log(act_probs) - tf.log(act_probs_old)) 102 | clipped_ratios = tf.clip_by_value(ratios, clip_value_min=1 - self.args['clip_value'], 103 | clip_value_max=1 + self.args['clip_value']) 104 | loss_clip = tf.minimum(tf.multiply(self.gaes, ratios), tf.multiply(self.gaes, clipped_ratios)) 105 | self.loss_clip = tf.reduce_mean(loss_clip) 106 | 107 | self.entropy = self.args['c2'] * tf.reduce_mean(entropy) # mean of entropy of pi(obs) 108 | 109 | t = tf.reshape(self.term, shape=[-1, 1]) 110 | entropyTS = tf.reduce_sum(otherEntroy, axis=1, 111 | keepdims=True) 112 | weight = 0.5 + tf.tanh(3 - self.args['c3'] * self.e) / 2 113 | entropyTS = entropyTS * weight * self.args['c1'] 114 | self.entropyTS = tf.reduce_mean(entropyTS) 115 | 116 | self.a_loss = -(self.loss_clip + self.entropy) + self.entropyTS 117 | self.train_a_op = self.optimizer.minimize(self.a_loss, var_list=self.policy_param) 118 | 119 | def get_agent_obs(self, obs, agent_id=0): 120 | if type(agent_id) is int: 121 | agent_id_arr = [agent_id] * len(obs) 122 | elif type(agent_id) is list: 123 | agent_id_arr = agent_id 124 | else: 125 | raise Exception('the agent_id field must be type of int or list') 126 | agent_one_hot = np.eye(self.n_agents)[agent_id_arr] 127 | obs = np.hstack((agent_one_hot, obs)) 128 | return obs 129 | 130 | def choose_action(self, obs, agent_id=0): 131 | obs = np.array(obs) 132 | obs = 
obs[np.newaxis, :] 133 | obs = self.get_agent_obs(obs, agent_id) 134 | if self.args['continuous_action']: 135 | actions, v_preds = self.sess.run([self.sample_action, self.v_preds], {self.obs: obs}) 136 | return np.clip(actions[0], -self.args['action_clip'], self.args['action_clip']) 137 | else: 138 | if self.args['stochastic']: 139 | actions, v_preds, p = self.sess.run([self.act_stochastic, self.v_preds, self.act_probs], feed_dict={self.obs: obs}) 140 | action = actions[0] 141 | action_one_hot = np.zeros(self.n_actions) 142 | action_one_hot[action] = 1 143 | return action_one_hot 144 | else: 145 | actions, v_preds = self.sess.run([self.act_deterministic, self.v_preds], feed_dict={self.obs: obs}) 146 | action = actions[0] 147 | action_one_hot = np.zeros(self.n_actions) 148 | action_one_hot[action] = 1 149 | return action_one_hot 150 | 151 | def choose_hold_action(self, obs): 152 | return np.zeros(self.n_actions) 153 | 154 | def choose_deterministic_action(self, obs, agent_id=0): 155 | obs = self.get_agent_obs(obs, agent_id) 156 | actions = self.sess.run([self.act_deterministic], feed_dict={self.obs: obs})[0] 157 | action_one_hots = [] 158 | for i in range(len(actions)): 159 | action = actions[i] 160 | action_one_hot = np.zeros(self.n_actions) 161 | action_one_hot[action] = 1 162 | action_one_hots.append(action_one_hot) 163 | return action_one_hots 164 | 165 | def choose_acton_prob(self, observation, agent_id=0): 166 | observation = np.array(observation) 167 | observation = observation[np.newaxis, :] 168 | observation = self.get_agent_obs(observation, agent_id) 169 | if self.args['continuous_action']: 170 | actions_value = self.sess.run(self.act_probs, feed_dict={self.obs: observation}) 171 | actions_value = [actions_value[0][0], actions_value[1][0]] 172 | else: 173 | actions_value = self.sess.run(self.act_probs, feed_dict={self.obs: observation})[0] 174 | return actions_value 175 | 176 | def get_v(self, s, agent_id=0): 177 | obs = np.array(s) 178 | obs = obs[np.newaxis, :] 179 | obs = self.get_agent_obs(obs, agent_id) 180 | return self.sess.run(self.v_preds, {self.obs: obs})[0, 0] 181 | 182 | def update(self, actor, s, a, r, options, terms, epi, agentid): 183 | self.sess.run(self.replace_op) 184 | 185 | source_actor_prob = [] 186 | mu = [] 187 | sigma = [] 188 | for i, o in enumerate(options): 189 | o = actor[o] 190 | if o == agentid[i]: 191 | terms[i] = 0 192 | if self.args['continuous_action']: 193 | a_prob = self.choose_acton_prob(s[i], o) 194 | mu.append(a_prob[0]) 195 | sigma.append(a_prob[1]) 196 | else: 197 | if o == agentid[i]: 198 | a_prob = self.choose_hold_action(s[i]) 199 | else: 200 | a_prob = self.choose_acton_prob(s[i], o) 201 | source_actor_prob.append(a_prob) 202 | s = self.get_agent_obs(s, agentid) 203 | adv = self.sess.run(self.advantage, {self.obs: s, self.rewards: r}) 204 | if self.args['continuous_action']: 205 | for i in range(self.args['epi_train_times']): 206 | _, a_loss, clip, entropy, entropyTS = self.sess.run( 207 | [self.train_a_op, self.a_loss, self.loss_clip, self.entropy, self.entropyTS], 208 | {self.obs: s, self.actions: a, self.gaes: adv, self.term: terms, 209 | self.mu: mu, self.sigma: sigma, self.e: epi}) 210 | __, c_loss = self.sess.run([self.train_c_op, self.c_loss], {self.obs: s, self.rewards: r}) 211 | self.logger.write_tb_log('a_loss', a_loss, self.learning_step) 212 | self.logger.write_tb_log('c_loss', c_loss, self.learning_step) 213 | self.logger.write_tb_log('clip', clip, self.learning_step) 214 | self.logger.write_tb_log('entropy', entropy, 
self.learning_step) 215 | self.logger.write_tb_log('entropyTS', entropyTS, self.learning_step) 216 | self.learning_step += 1 217 | else: 218 | for i in range(self.args['epi_train_times']): 219 | _, a_loss, clip, entropy, entropyTS = self.sess.run( 220 | [self.train_a_op, self.a_loss, self.loss_clip, self.entropy, self.entropyTS], 221 | {self.obs: s, self.actions: a, self.gaes: adv, self.term: terms, 222 | self.s_a_prob: source_actor_prob, self.e: epi}) 223 | __, c_loss = self.sess.run([self.train_c_op, self.c_loss], {self.obs: s, self.rewards: r}) 224 | self.logger.write_tb_log('a_loss', a_loss, self.learning_step) 225 | self.logger.write_tb_log('c_loss', c_loss, self.learning_step) 226 | self.logger.write_tb_log('clip', clip, self.learning_step) 227 | self.logger.write_tb_log('entropy', entropy, self.learning_step) 228 | self.logger.write_tb_log('entropyTS', entropyTS, self.learning_step) 229 | self.learning_step += 1 230 | 231 | def load_model(self, path): 232 | saver = tf.train.Saver(self.policy_param) 233 | saver.restore(self.sess, path + ".ckpt") 234 | 235 | def save_model(self, path): 236 | saver = tf.train.Saver(self.policy_param) 237 | saver.save(self.sess, path + ".ckpt") 238 | -------------------------------------------------------------------------------- /game/pacman/graphicsUtils.py: -------------------------------------------------------------------------------- 1 | # graphicsUtils.py 2 | # ---------------- 3 | # Licensing Information: You are free to use or extend these projects for 4 | # educational purposes provided that (1) you do not distribute or publish 5 | # solutions, (2) you retain this notice, and (3) you provide clear 6 | # attribution to UC Berkeley, including a link to http://ai.berkeley.edu. 7 | # 8 | # Attribution Information: The Pacman AI projects were developed at UC Berkeley. 9 | # The core projects and autograders were primarily created by John DeNero 10 | # (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 11 | # Student side autograding was added by Brad Miller, Nick Hay, and 12 | # Pieter Abbeel (pabbeel@cs.berkeley.edu). 
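# Tkinter drawing helpers for the Pacman game's graphical display.  Colors are
# plain Tk hex strings, e.g. formatColor(1, 0, 0) == '#ff0000'; the default
# background passed to begin_graphics() below is formatColor(0, 0, 0) == '#000000'.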
13 | 14 | 15 | import sys 16 | import math 17 | import random 18 | import string 19 | import time 20 | import types 21 | import tkinter 22 | 23 | _Windows = sys.platform == 'win32' # True if on Win95/98/NT 24 | 25 | _root_window = None # The root window for graphics output 26 | _canvas = None # The canvas which holds graphics 27 | _canvas_xs = None # Size of canvas object 28 | _canvas_ys = None 29 | _canvas_x = None # Current position on canvas 30 | _canvas_y = None 31 | _canvas_col = None # Current colour (set to black below) 32 | _canvas_tsize = 12 33 | _canvas_tserifs = 0 34 | 35 | 36 | def formatColor(r, g, b): 37 | return '#%02x%02x%02x' % (int(r * 255), int(g * 255), int(b * 255)) 38 | 39 | 40 | def colorToVector(color): 41 | return [int(x, 16) / 256.0 for x in [color[1:3], color[3:5], color[5:7]]] 42 | 43 | if _Windows: 44 | _canvas_tfonts = ['times new roman', 'lucida console'] 45 | else: 46 | _canvas_tfonts = ['times', 'lucidasans-24'] 47 | pass # XXX need defaults here 48 | 49 | 50 | def sleep(secs): 51 | global _root_window 52 | if _root_window == None: 53 | time.sleep(secs) 54 | else: 55 | _root_window.update_idletasks() 56 | _root_window.after(int(1000 * secs), _root_window.quit) 57 | _root_window.mainloop() 58 | 59 | 60 | def begin_graphics(width=640, height=480, color=formatColor(0, 0, 0), title=None): 61 | 62 | global _root_window, _canvas, _canvas_x, _canvas_y, _canvas_xs, _canvas_ys, _bg_color 63 | 64 | # Check for duplicate call 65 | if _root_window is not None: 66 | # Lose the window. 67 | _root_window.destroy() 68 | 69 | # Save the canvas size parameters 70 | _canvas_xs, _canvas_ys = width - 1, height - 1 71 | _canvas_x, _canvas_y = 0, _canvas_ys 72 | _bg_color = color 73 | 74 | # Create the root window 75 | _root_window = tkinter.Tk() 76 | _root_window.protocol('WM_DELETE_WINDOW', _destroy_window) 77 | _root_window.title(title or 'Graphics Window') 78 | _root_window.resizable(0, 0) 79 | 80 | # Create the canvas object 81 | try: 82 | _canvas = tkinter.Canvas(_root_window, width=width, height=height) 83 | _canvas.pack() 84 | draw_background() 85 | _canvas.update() 86 | except: 87 | _root_window = None 88 | raise 89 | 90 | # Bind to key-down and key-up events 91 | _root_window.bind("<KeyPress>", _keypress) 92 | _root_window.bind("<KeyRelease>", _keyrelease) 93 | _root_window.bind("<FocusIn>", _clear_keys) 94 | _root_window.bind("<FocusOut>", _clear_keys) 95 | _root_window.bind("<Button-1>", _leftclick) 96 | _root_window.bind("<Button-2>", _rightclick) 97 | _root_window.bind("<Button-3>", _rightclick) 98 | _root_window.bind("<Control-Button-1>", _ctrl_leftclick) 99 | _clear_keys() 100 | 101 | _leftclick_loc = None 102 | _rightclick_loc = None 103 | _ctrl_leftclick_loc = None 104 | 105 | 106 | def _leftclick(event): 107 | global _leftclick_loc 108 | _leftclick_loc = (event.x, event.y) 109 | 110 | 111 | def _rightclick(event): 112 | global _rightclick_loc 113 | _rightclick_loc = (event.x, event.y) 114 | 115 | 116 | def _ctrl_leftclick(event): 117 | global _ctrl_leftclick_loc 118 | _ctrl_leftclick_loc = (event.x, event.y) 119 | 120 | 121 | def wait_for_click(): 122 | while True: 123 | global _leftclick_loc 124 | global _rightclick_loc 125 | global _ctrl_leftclick_loc 126 | if _leftclick_loc != None: 127 | val = _leftclick_loc 128 | _leftclick_loc = None 129 | return val, 'left' 130 | if _rightclick_loc != None: 131 | val = _rightclick_loc 132 | _rightclick_loc = None 133 | return val, 'right' 134 | if _ctrl_leftclick_loc != None: 135 | val = _ctrl_leftclick_loc 136 | _ctrl_leftclick_loc = None 137 | return val, 'ctrl_left' 138 | sleep(0.05) 139 | 140 | 141 | def
draw_background(): 142 | corners = [(0, 0), (0, _canvas_ys), 143 | (_canvas_xs, _canvas_ys), (_canvas_xs, 0)] 144 | polygon(corners, _bg_color, fillColor=_bg_color, 145 | filled=True, smoothed=False) 146 | 147 | 148 | def _destroy_window(event=None): 149 | sys.exit(0) 150 | # global _root_window 151 | # _root_window.destroy() 152 | # _root_window = None 153 | # print "DESTROY" 154 | 155 | 156 | def end_graphics(): 157 | global _root_window, _canvas, _mouse_enabled 158 | try: 159 | try: 160 | sleep(1) 161 | if _root_window != None: 162 | _root_window.destroy() 163 | except SystemExit as e: 164 | print(('Ending graphics raised an exception:', e)) 165 | finally: 166 | _root_window = None 167 | _canvas = None 168 | _mouse_enabled = 0 169 | _clear_keys() 170 | 171 | 172 | def clear_screen(background=None): 173 | global _canvas_x, _canvas_y 174 | _canvas.delete('all') 175 | draw_background() 176 | _canvas_x, _canvas_y = 0, _canvas_ys 177 | 178 | 179 | def polygon(coords, outlineColor, fillColor=None, filled=1, smoothed=1, behind=0, width=1): 180 | c = [] 181 | for coord in coords: 182 | c.append(coord[0]) 183 | c.append(coord[1]) 184 | if fillColor == None: 185 | fillColor = outlineColor 186 | if filled == 0: 187 | fillColor = "" 188 | poly = _canvas.create_polygon( 189 | c, outline=outlineColor, fill=fillColor, smooth=smoothed, width=width) 190 | if behind > 0: 191 | _canvas.tag_lower(poly, behind) # Higher should be more visible 192 | return poly 193 | 194 | 195 | def square(pos, r, color, filled=1, behind=0): 196 | x, y = pos 197 | coords = [(x - r, y - r), (x + r, y - r), (x + r, y + r), (x - r, y + r)] 198 | return polygon(coords, color, color, filled, 0, behind=behind) 199 | 200 | 201 | def circle(pos, r, outlineColor, fillColor, endpoints=None, style='pieslice', width=2): 202 | x, y = pos 203 | x0, x1 = x - r - 1, x + r 204 | y0, y1 = y - r - 1, y + r 205 | if endpoints == None: 206 | e = [0, 359] 207 | else: 208 | e = list(endpoints) 209 | while e[0] > e[1]: 210 | e[1] = e[1] + 360 211 | 212 | return _canvas.create_arc(x0, y0, x1, y1, outline=outlineColor, fill=fillColor, 213 | extent=e[1] - e[0], start=e[0], style=style, width=width) 214 | 215 | 216 | def image(pos, file="../../blueghost.gif"): 217 | x, y = pos 218 | # img = PhotoImage(file=file) 219 | return _canvas.create_image(x, y, image=tkinter.PhotoImage(file=file), anchor=tkinter.NW) 220 | 221 | 222 | def refresh(): 223 | _canvas.update_idletasks() 224 | 225 | 226 | def moveCircle(id, pos, r, endpoints=None): 227 | global _canvas_x, _canvas_y 228 | 229 | x, y = pos 230 | # x0, x1 = x - r, x + r + 1 231 | # y0, y1 = y - r, y + r + 1 232 | x0, x1 = x - r - 1, x + r 233 | y0, y1 = y - r - 1, y + r 234 | if endpoints == None: 235 | e = [0, 359] 236 | else: 237 | e = list(endpoints) 238 | while e[0] > e[1]: 239 | e[1] = e[1] + 360 240 | 241 | edit(id, ('start', e[0]), ('extent', e[1] - e[0])) 242 | move_to(id, x0, y0) 243 | 244 | 245 | def edit(id, *args): 246 | _canvas.itemconfigure(id, **dict(args)) 247 | 248 | 249 | def text(pos, color, contents, font='Helvetica', size=12, style='normal', anchor="nw"): 250 | global _canvas_x, _canvas_y 251 | x, y = pos 252 | font = (font, str(size), style) 253 | return _canvas.create_text(x, y, fill=color, text=contents, font=font, anchor=anchor) 254 | 255 | 256 | def changeText(id, newText, font=None, size=12, style='normal'): 257 | _canvas.itemconfigure(id, text=newText) 258 | if font != None: 259 | _canvas.itemconfigure(id, font=(font, '-%d' % size, style)) 260 | 261 | 262 | def changeColor(id, 
newColor): 263 | _canvas.itemconfigure(id, fill=newColor) 264 | 265 | 266 | def line(here, there, color=formatColor(0, 0, 0), width=2): 267 | x0, y0 = here[0], here[1] 268 | x1, y1 = there[0], there[1] 269 | return _canvas.create_line(x0, y0, x1, y1, fill=color, width=width) 270 | 271 | ############################################################################## 272 | ### Keypress handling ######################################################## 273 | ############################################################################## 274 | 275 | # We bind to key-down and key-up events. 276 | 277 | _keysdown = {} 278 | _keyswaiting = {} 279 | # This holds an unprocessed key release. We delay key releases by up to 280 | # one call to keys_pressed() to get round a problem with auto repeat. 281 | _got_release = None 282 | 283 | 284 | def _keypress(event): 285 | global _got_release 286 | # remap_arrows(event) 287 | _keysdown[event.keysym] = 1 288 | _keyswaiting[event.keysym] = 1 289 | # print event.char, event.keycode 290 | _got_release = None 291 | 292 | 293 | def _keyrelease(event): 294 | global _got_release 295 | # remap_arrows(event) 296 | try: 297 | del _keysdown[event.keysym] 298 | except: 299 | pass 300 | _got_release = 1 301 | 302 | 303 | def remap_arrows(event): 304 | # TURN ARROW PRESSES INTO LETTERS (SHOULD BE IN KEYBOARD AGENT) 305 | if event.char in ['a', 's', 'd', 'w']: 306 | return 307 | if event.keycode in [37, 101]: # LEFT ARROW (win / x) 308 | event.char = 'a' 309 | if event.keycode in [38, 99]: # UP ARROW 310 | event.char = 'w' 311 | if event.keycode in [39, 102]: # RIGHT ARROW 312 | event.char = 'd' 313 | if event.keycode in [40, 104]: # DOWN ARROW 314 | event.char = 's' 315 | 316 | 317 | def _clear_keys(event=None): 318 | global _keysdown, _got_release, _keyswaiting 319 | _keysdown = {} 320 | _keyswaiting = {} 321 | _got_release = None 322 | 323 | 324 | def keys_pressed(d_o_e=None, 325 | d_w=tkinter._tkinter.DONT_WAIT): 326 | 327 | if(d_o_e is None): 328 | d_o_e = _root_window.dooneevent 329 | d_o_e(d_w) 330 | if _got_release: 331 | d_o_e(d_w) 332 | return list(_keysdown.keys()) 333 | 334 | 335 | def keys_waiting(): 336 | global _keyswaiting 337 | keys = list(_keyswaiting.keys()) 338 | _keyswaiting = {} 339 | return keys 340 | 341 | # Block for a list of keys... 
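# keys_pressed() returns the keys currently held down (key releases are delayed
# by up to one call via _got_release above, to work around Tk auto-repeat),
# whereas keys_waiting() drains and returns every key pressed since it was last
# called.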
342 | 343 | 344 | def wait_for_keys(): 345 | keys = [] 346 | while keys == []: 347 | keys = keys_pressed() 348 | sleep(0.05) 349 | return keys 350 | 351 | 352 | def remove_from_screen(x, 353 | d_o_e=None, 354 | d_w=tkinter._tkinter.DONT_WAIT): 355 | if (d_o_e is None): 356 | d_o_e = _root_window.dooneevent 357 | _canvas.delete(x) 358 | d_o_e(d_w) 359 | 360 | 361 | def _adjust_coords(coord_list, x, y): 362 | for i in range(0, len(coord_list), 2): 363 | coord_list[i] = coord_list[i] + x 364 | coord_list[i + 1] = coord_list[i + 1] + y 365 | return coord_list 366 | 367 | 368 | def move_to(object, x, y=None, 369 | d_o_e=None, 370 | d_w=tkinter._tkinter.DONT_WAIT): 371 | if (d_o_e is None): 372 | d_o_e = _root_window.dooneevent 373 | if y is None: 374 | try: 375 | x, y = x 376 | except: 377 | raise Exception('incomprehensible coordinates') 378 | 379 | horiz = True 380 | newCoords = [] 381 | current_x, current_y = _canvas.coords(object)[0:2] # first point 382 | for coord in _canvas.coords(object): 383 | if horiz: 384 | inc = x - current_x 385 | else: 386 | inc = y - current_y 387 | horiz = not horiz 388 | 389 | newCoords.append(coord + inc) 390 | 391 | _canvas.coords(object, *newCoords) 392 | d_o_e(d_w) 393 | 394 | 395 | def move_by(object, x, y=None, 396 | d_o_e=None, 397 | d_w=tkinter._tkinter.DONT_WAIT, lift=False): 398 | if (d_o_e is None): 399 | d_o_e = _root_window.dooneevent 400 | if y is None: 401 | try: 402 | x, y = x 403 | except: 404 | raise Exception('incomprehensible coordinates') 405 | 406 | horiz = True 407 | newCoords = [] 408 | for coord in _canvas.coords(object): 409 | if horiz: 410 | inc = x 411 | else: 412 | inc = y 413 | horiz = not horiz 414 | 415 | newCoords.append(coord + inc) 416 | 417 | _canvas.coords(object, *newCoords) 418 | d_o_e(d_w) 419 | if lift: 420 | _canvas.tag_raise(object) 421 | 422 | 423 | def writePostscript(filename): 424 | "Writes the current canvas to a postscript file." 
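# Tk's canvas.postscript() returns the PostScript source as a string when no
# file option is given; note that the file() builtin used below is Python 2
# only -- under Python 3 this call would need to be open(filename, 'w').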
425 | psfile = file(filename, 'w') 426 | psfile.write(_canvas.postscript(pageanchor='sw', 427 | y='0.c', 428 | x='0.c')) 429 | psfile.close() 430 | 431 | ghost_shape = [ 432 | (0, - 0.5), 433 | (0.25, - 0.75), 434 | (0.5, - 0.5), 435 | (0.75, - 0.75), 436 | (0.75, 0.5), 437 | (0.5, 0.75), 438 | (- 0.5, 0.75), 439 | (- 0.75, 0.5), 440 | (- 0.75, - 0.75), 441 | (- 0.5, - 0.5), 442 | (- 0.25, - 0.75) 443 | ] 444 | 445 | if __name__ == '__main__': 446 | begin_graphics() 447 | clear_screen() 448 | ghost_shape = [(x * 10 + 20, y * 10 + 20) for x, y in ghost_shape] 449 | g = polygon(ghost_shape, formatColor(1, 1, 1)) 450 | move_to(g, (50, 50)) 451 | circle((150, 150), 20, formatColor(0.7, 0.3, 0.0), endpoints=[15, - 15]) 452 | sleep(2) 453 | -------------------------------------------------------------------------------- /alg/maddpg/common/distributions.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import alg.maddpg.common.tf_util as U 4 | from tensorflow.python.ops import math_ops 5 | from game.particle.multiagent.multi_discrete import MultiDiscrete 6 | from tensorflow.python.ops import nn 7 | 8 | class Pd(object): 9 | """ 10 | A particular probability distribution 11 | """ 12 | def flatparam(self): 13 | raise NotImplementedError 14 | def mode(self): 15 | raise NotImplementedError 16 | def logp(self, x): 17 | raise NotImplementedError 18 | def kl(self, other): 19 | raise NotImplementedError 20 | def entropy(self): 21 | raise NotImplementedError 22 | def sample(self): 23 | raise NotImplementedError 24 | 25 | class PdType(object): 26 | """ 27 | Parametrized family of probability distributions 28 | """ 29 | def pdclass(self): 30 | raise NotImplementedError 31 | def pdfromflat(self, flat): 32 | return self.pdclass()(flat) 33 | def param_shape(self): 34 | raise NotImplementedError 35 | def sample_shape(self): 36 | raise NotImplementedError 37 | def sample_dtype(self): 38 | raise NotImplementedError 39 | 40 | def param_placeholder(self, prepend_shape, name=None): 41 | return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name) 42 | def sample_placeholder(self, prepend_shape, name=None): 43 | return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name) 44 | 45 | class CategoricalPdType(PdType): 46 | def __init__(self, ncat): 47 | self.ncat = ncat 48 | def pdclass(self): 49 | return CategoricalPd 50 | def param_shape(self): 51 | return [self.ncat] 52 | def sample_shape(self): 53 | return [] 54 | def sample_dtype(self): 55 | return tf.int32 56 | 57 | class SoftCategoricalPdType(PdType): 58 | def __init__(self, ncat): 59 | self.ncat = ncat 60 | def pdclass(self): 61 | return SoftCategoricalPd 62 | def param_shape(self): 63 | return [self.ncat] 64 | def sample_shape(self): 65 | return [self.ncat] 66 | def sample_dtype(self): 67 | return tf.float32 68 | 69 | class MultiCategoricalPdType(PdType): 70 | def __init__(self, low, high): 71 | self.low = low 72 | self.high = high 73 | self.ncats = high - low + 1 74 | def pdclass(self): 75 | return MultiCategoricalPd 76 | def pdfromflat(self, flat): 77 | return MultiCategoricalPd(self.low, self.high, flat) 78 | def param_shape(self): 79 | return [sum(self.ncats)] 80 | def sample_shape(self): 81 | return [len(self.ncats)] 82 | def sample_dtype(self): 83 | return tf.int32 84 | 85 | class SoftMultiCategoricalPdType(PdType): 86 | def __init__(self, low, high): 87 | self.low = low 88 | self.high = high 89 | 
self.ncats = high - low + 1 90 | def pdclass(self): 91 | return SoftMultiCategoricalPd 92 | def pdfromflat(self, flat): 93 | return SoftMultiCategoricalPd(self.low, self.high, flat) 94 | def param_shape(self): 95 | return [sum(self.ncats)] 96 | def sample_shape(self): 97 | return [sum(self.ncats)] 98 | def sample_dtype(self): 99 | return tf.float32 100 | 101 | class DiagGaussianPdType(PdType): 102 | def __init__(self, size): 103 | self.size = size 104 | def pdclass(self): 105 | return DiagGaussianPd 106 | def param_shape(self): 107 | return [2*self.size] 108 | def sample_shape(self): 109 | return [self.size] 110 | def sample_dtype(self): 111 | return tf.float32 112 | 113 | class BernoulliPdType(PdType): 114 | def __init__(self, size): 115 | self.size = size 116 | def pdclass(self): 117 | return BernoulliPd 118 | def param_shape(self): 119 | return [self.size] 120 | def sample_shape(self): 121 | return [self.size] 122 | def sample_dtype(self): 123 | return tf.int32 124 | 125 | # WRONG SECOND DERIVATIVES 126 | # class CategoricalPd(Pd): 127 | # def __init__(self, logits): 128 | # self.logits = logits 129 | # self.ps = tf.nn.softmax(logits) 130 | # @classmethod 131 | # def fromflat(cls, flat): 132 | # return cls(flat) 133 | # def flatparam(self): 134 | # return self.logits 135 | # def mode(self): 136 | # return U.argmax(self.logits, axis=1) 137 | # def logp(self, x): 138 | # return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x) 139 | # def kl(self, other): 140 | # return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \ 141 | # - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 142 | # def entropy(self): 143 | # return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 144 | # def sample(self): 145 | # u = tf.random_uniform(tf.shape(self.logits)) 146 | # return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1) 147 | 148 | class CategoricalPd(Pd): 149 | def __init__(self, logits): 150 | self.logits = logits 151 | def flatparam(self): 152 | return self.logits 153 | def mode(self): 154 | return U.argmax(self.logits, axis=1) 155 | def logp(self, x): 156 | return -tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 157 | def kl(self, other): 158 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 159 | a1 = other.logits - U.max(other.logits, axis=1, keepdims=True) 160 | ea0 = tf.exp(a0) 161 | ea1 = tf.exp(a1) 162 | z0 = U.sum(ea0, axis=1, keepdims=True) 163 | z1 = U.sum(ea1, axis=1, keepdims=True) 164 | p0 = ea0 / z0 165 | return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1) 166 | def entropy(self): 167 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 168 | ea0 = tf.exp(a0) 169 | z0 = U.sum(ea0, axis=1, keepdims=True) 170 | p0 = ea0 / z0 171 | return U.sum(p0 * (tf.log(z0) - a0), axis=1) 172 | def sample(self): 173 | u = tf.random_uniform(tf.shape(self.logits)) 174 | return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1) 175 | @classmethod 176 | def fromflat(cls, flat): 177 | return cls(flat) 178 | 179 | class SoftCategoricalPd(Pd): 180 | def __init__(self, logits): 181 | self.logits = logits 182 | def flatparam(self): 183 | return self.logits 184 | def mode(self): 185 | return U.softmax(self.logits, axis=-1) 186 | def logp(self, x): 187 | return -tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 188 | def kl(self, other): 189 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 190 | a1 = other.logits - U.max(other.logits, axis=1, keepdims=True) 191 
| ea0 = tf.exp(a0) 192 | ea1 = tf.exp(a1) 193 | z0 = U.sum(ea0, axis=1, keepdims=True) 194 | z1 = U.sum(ea1, axis=1, keepdims=True) 195 | p0 = ea0 / z0 196 | return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1) 197 | def entropy(self): 198 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 199 | ea0 = tf.exp(a0) 200 | z0 = U.sum(ea0, axis=1, keepdims=True) 201 | p0 = ea0 / z0 202 | return U.sum(p0 * (tf.log(z0) - a0), axis=1) 203 | def cross_entropy(self, other): 204 | return self.kl(other) + self.entropy() 205 | def soft_max_sample(self, other, term): 206 | u1 = tf.random_uniform(tf.shape(self.logits)) 207 | y1 = U.softmax(self.logits - tf.log(-tf.log(u1)), axis=-1) 208 | u2 = tf.random_uniform(tf.shape(other.logits)) 209 | y2 = U.softmax(other.logits - tf.log(-tf.log(u2)), axis=-1) 210 | logits = y1 + term * y2 211 | logits_out = logits / U.sum(logits) 212 | return self.gumbel_sample(logits=logits_out) 213 | def params(self): 214 | return self.logits 215 | def sample(self): 216 | #u = tf.random_uniform(tf.shape(self.logits)) 217 | #return U.softmax(self.logits - tf.log(-tf.log(u)), axis=-1) 218 | return self.gumbel_sample() 219 | def gumbel_sample(self, logits=None, temperature=0.1, hard=True): 220 | if logits is None: 221 | logits = self.logits 222 | u = tf.random_uniform(tf.shape(logits)) 223 | y = U.softmax((logits - tf.log(-tf.log(u))) / temperature, axis=-1) 224 | if hard: 225 | k = tf.shape(logits)[-1] 226 | y_hard = tf.cast(tf.equal(y, tf.reduce_max(y, 1, keepdims=True)), y.dtype) 227 | y = tf.stop_gradient(y_hard - y) + y 228 | return y 229 | 230 | @classmethod 231 | def fromflat(cls, flat): 232 | return cls(flat) 233 | 234 | class MultiCategoricalPd(Pd): 235 | def __init__(self, low, high, flat): 236 | self.flat = flat 237 | self.low = tf.constant(low, dtype=tf.int32) 238 | self.categoricals = list(map(CategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1))) 239 | def flatparam(self): 240 | return self.flat 241 | def mode(self): 242 | return self.low + tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) 243 | def logp(self, x): 244 | return tf.add_n([p.logp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) 245 | def kl(self, other): 246 | return tf.add_n([ 247 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals) 248 | ]) 249 | def entropy(self): 250 | return tf.add_n([p.entropy() for p in self.categoricals]) 251 | def sample(self): 252 | return self.low + tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) 253 | @classmethod 254 | def fromflat(cls, flat): 255 | return cls(flat) 256 | 257 | class SoftMultiCategoricalPd(Pd): # doesn't work yet 258 | def __init__(self, low, high, flat): 259 | self.flat = flat 260 | self.low = tf.constant(low, dtype=tf.float32) 261 | self.categoricals = list(map(SoftCategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1))) 262 | def flatparam(self): 263 | return self.flat 264 | def mode(self): 265 | x = [] 266 | for i in range(len(self.categoricals)): 267 | x.append(self.low[i] + self.categoricals[i].mode()) 268 | return tf.concat(x, axis=-1) 269 | def logp(self, x): 270 | return tf.add_n([p.logp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) 271 | def kl(self, other): 272 | return tf.add_n([ 273 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals) 274 | ]) 275 | def entropy(self): 276 | return tf.add_n([p.entropy() 
for p in self.categoricals]) 277 | def sample(self): 278 | x = [] 279 | for i in range(len(self.categoricals)): 280 | x.append(self.low[i] + self.categoricals[i].sample()) 281 | return tf.concat(x, axis=-1) 282 | @classmethod 283 | def fromflat(cls, flat): 284 | return cls(flat) 285 | 286 | class DiagGaussianPd(Pd): 287 | def __init__(self, flat): 288 | self.flat = flat 289 | mean, logstd = tf.split(axis=1, num_or_size_splits=2, value=flat) 290 | self.mean = mean 291 | self.logstd = logstd 292 | self.std = tf.exp(logstd) 293 | def flatparam(self): 294 | return self.flat 295 | def mode(self): 296 | return self.mean 297 | def logp(self, x): 298 | return - 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=1) \ 299 | - 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) \ 300 | - U.sum(self.logstd, axis=1) 301 | def kl(self, other): 302 | assert isinstance(other, DiagGaussianPd) 303 | return U.sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=1) 304 | def entropy(self): 305 | return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), 1) 306 | def cross_entropy(self, other): 307 | assert isinstance(other, DiagGaussianPd) 308 | return self.kl(other) + self.entropy() 309 | 310 | def params(self): 311 | return self.mean, self.std 312 | 313 | def sample(self): 314 | return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) 315 | @classmethod 316 | def fromflat(cls, flat): 317 | return cls(flat) 318 | 319 | class BernoulliPd(Pd): 320 | def __init__(self, logits): 321 | self.logits = logits 322 | self.ps = tf.sigmoid(logits) 323 | def flatparam(self): 324 | return self.logits 325 | def mode(self): 326 | return tf.round(self.ps) 327 | def logp(self, x): 328 | return - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=1) 329 | def kl(self, other): 330 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1) 331 | def entropy(self): 332 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1) 333 | def sample(self): 334 | p = tf.sigmoid(self.logits) 335 | u = tf.random_uniform(tf.shape(p)) 336 | return tf.to_float(math_ops.less(u, p)) 337 | @classmethod 338 | def fromflat(cls, flat): 339 | return cls(flat) 340 | 341 | def make_pdtype(ac_space): 342 | from gym import spaces 343 | if isinstance(ac_space, spaces.Box): 344 | assert len(ac_space.shape) == 1 345 | return DiagGaussianPdType(ac_space.shape[0]) 346 | elif isinstance(ac_space, spaces.Discrete): 347 | # return CategoricalPdType(ac_space.n) 348 | return SoftCategoricalPdType(ac_space.n) 349 | elif isinstance(ac_space, MultiDiscrete): 350 | #return MultiCategoricalPdType(ac_space.low, ac_space.high) 351 | return SoftMultiCategoricalPdType(ac_space.low, ac_space.high) 352 | elif isinstance(ac_space, spaces.MultiBinary): 353 | return BernoulliPdType(ac_space.n) 354 | else: 355 | raise NotImplementedError 356 | 357 | def shape_el(v, i): 358 | maybe = v.get_shape()[i] 359 | if maybe is not None: 360 | return maybe 361 | else: 362 | return tf.shape(v)[i] 363 | --------------------------------------------------------------------------------
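Usage note (an illustrative sketch, not part of the repository): make_pdtype at the bottom of distributions.py is the bridge between a Gym action space and the distribution classes above -- Box maps to DiagGaussianPdType, Discrete to SoftCategoricalPdType (so discrete actions come out as differentiable, straight-through Gumbel-softmax one-hot samples rather than integer indices), MultiDiscrete to SoftMultiCategoricalPdType, and MultiBinary to BernoulliPdType. A minimal sketch of how a policy head would use it, assuming the repository root is on PYTHONPATH:

import numpy as np
import tensorflow as tf
from gym import spaces

from alg.maddpg.common.distributions import make_pdtype

# Hypothetical 5-action discrete space, similar to the particle environments.
act_space = spaces.Discrete(5)
pdtype = make_pdtype(act_space)                # -> SoftCategoricalPdType(5)

logits = pdtype.param_placeholder([None], name="pi_logits")   # float32 placeholder of shape [None, 5]
pd = pdtype.pdfromflat(logits)                 # SoftCategoricalPd over the logits

sample_op = pd.sample()                        # one-hot straight-through Gumbel-softmax sample
mode_op = pd.mode()                            # softmax(logits), i.e. the action probabilities
entropy_op = pd.entropy()

with tf.Session() as sess:
    acts = sess.run(sample_op, {logits: np.zeros((2, 5), dtype=np.float32)})
    print(acts.shape)                          # (2, 5), each row one-hot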