├── game ├── particle │ ├── __init__.py │ ├── bin │ │ ├── __init__.py │ │ └── interactive.py │ ├── multiagent │ │ ├── scenarios │ │ │ ├── __init__.py │ │ │ ├── simple_spread.py │ │ │ └── simple_tag.py │ │ ├── scenario.py │ │ ├── __init__.py │ │ ├── policy.py │ │ ├── multi_discrete.py │ │ ├── core.py │ │ └── rendering.py │ ├── make_env.py │ └── README.md ├── pacman │ ├── pacmanDQN_Agents.py │ ├── layouts │ │ ├── openClassic.lay │ │ └── originalClassic.lay │ ├── textDisplay.py │ ├── keyboardAgents.py │ ├── ghostAgents.py │ ├── make_env.py │ ├── layout.py │ └── graphicsUtils.py └── __init__.py ├── source ├── pacman │ ├── original │ │ └── 0 │ │ │ ├── checkpoint │ │ │ ├── model_0.ckpt.meta │ │ │ ├── model_0.ckpt.index │ │ │ ├── model_0.ckpt.data-00000-of-00001 │ │ │ ├── command.txt │ │ │ └── args.json │ └── medium │ │ └── 0 │ │ ├── checkpoint │ │ ├── model_40000_0.ckpt.index │ │ ├── model_40000_0.ckpt.meta │ │ ├── model_40000_0.ckpt.data-00000-of-00001 │ │ ├── command.txt │ │ └── args.json └── simple_tag │ └── tag4 │ ├── checkpoint │ ├── model_30000_3.ckpt.index │ ├── model_30000_3.ckpt.meta │ ├── model_30000_3.ckpt.data-00000-of-00001 │ └── args.json ├── requirements.txt ├── alg ├── __init__.py ├── maddpg │ ├── __init__.py │ ├── trainer │ │ └── replay_buffer.py │ ├── train.py │ └── common │ │ ├── tf_util.py │ │ └── distributions.py ├── optimizer.py ├── common │ └── common.py ├── sharing_multi_ppo │ ├── ppo.py │ └── ppo_add_entropy.py └── muti_ptf_ppo │ ├── ppo.py │ └── ppo_add_entropy.py ├── config ├── particle_conf.yaml ├── pacman_conf.yaml ├── maddpg_conf.yaml └── ppo_conf.yaml ├── run └── __init__.py ├── util ├── fource_exit.py ├── get_out_files.py ├── output_json.py ├── logger.py └── ReplayBuffer.py ├── README.md └── main.py /game/particle/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /game/particle/bin/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/pacman/original/0/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model_3.ckpt" 2 | all_model_checkpoint_paths: "model_3.ckpt" 3 | -------------------------------------------------------------------------------- /source/pacman/medium/0/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model_40000_2.ckpt" 2 | all_model_checkpoint_paths: "model_40000_2.ckpt" 3 | -------------------------------------------------------------------------------- /source/simple_tag/tag4/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model_30000_3.ckpt" 2 | all_model_checkpoint_paths: "model_30000_3.ckpt" 3 | -------------------------------------------------------------------------------- /source/pacman/original/0/model_0.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/pacman/original/0/model_0.ckpt.meta -------------------------------------------------------------------------------- /source/pacman/original/0/model_0.ckpt.index: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/pacman/original/0/model_0.ckpt.index -------------------------------------------------------------------------------- /source/pacman/medium/0/model_40000_0.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/pacman/medium/0/model_40000_0.ckpt.index -------------------------------------------------------------------------------- /source/pacman/medium/0/model_40000_0.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/pacman/medium/0/model_40000_0.ckpt.meta -------------------------------------------------------------------------------- /source/simple_tag/tag4/model_30000_3.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/simple_tag/tag4/model_30000_3.ckpt.index -------------------------------------------------------------------------------- /source/simple_tag/tag4/model_30000_3.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/simple_tag/tag4/model_30000_3.ckpt.meta -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.10.9 2 | numpy==1.19.5 3 | tensorboard==1.14.0 4 | tensorboard-logger==0.1.0 5 | tensorflow==1.14.0 6 | PyYAML==5.4.1 7 | 8 | 9 | -------------------------------------------------------------------------------- /game/pacman/pacmanDQN_Agents.py: -------------------------------------------------------------------------------- 1 | import game.pacman.game as game 2 | 3 | 4 | class PacmanDQN(game.Agent): 5 | def __init__(self, args): 6 | pass 7 | -------------------------------------------------------------------------------- /source/pacman/original/0/model_0.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/pacman/original/0/model_0.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /source/pacman/medium/0/model_40000_0.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/pacman/medium/0/model_40000_0.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /source/simple_tag/tag4/model_30000_3.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/simple_tag/tag4/model_30000_3.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /alg/__init__.py: -------------------------------------------------------------------------------- 1 | from alg.maddpg.trainer.maddpg import MADDPGAgentTrainer as maddpg 2 | 3 | REGISTRY = {} 4 | 5 | 6 | REGISTRY['maddpg'] = maddpg 7 | REGISTRY['maddpg_sr'] = maddpg 8 | -------------------------------------------------------------------------------- /game/particle/multiagent/scenarios/__init__.py: 
-------------------------------------------------------------------------------- 1 | import imp 2 | import os.path as osp 3 | 4 | 5 | def load(name): 6 | pathname = osp.join(osp.dirname(__file__), name) 7 | return imp.load_source('', pathname) 8 | -------------------------------------------------------------------------------- /game/__init__.py: -------------------------------------------------------------------------------- 1 | from .particle.make_env import make_env as Particle 2 | from .pacman.make_env import make_env as PacmanEnv 3 | 4 | 5 | REGISTRY = {} 6 | 7 | REGISTRY['particle'] = Particle 8 | REGISTRY['pacman'] = PacmanEnv 9 | 10 | 11 | -------------------------------------------------------------------------------- /config/particle_conf.yaml: -------------------------------------------------------------------------------- 1 | game_name: "simple_spread" 2 | continuous_action: False 3 | reward_normalize: False 4 | benchmark: False 5 | action_clip: 1 6 | num_adversaries: 0 7 | num_good: 0 8 | obs_sort: False 9 | reward_func: "reward" 10 | restrict_move: False -------------------------------------------------------------------------------- /game/pacman/layouts/openClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%%%%% 2 | %.. .... .... G % 3 | %.. ... ... ... ... % 4 | %.. ... ... ... ... % 5 | %.. P .... .... % 6 | %.. ... ... ... ... % 7 | %.. ... ... ... ... % 8 | %.. .... .... G % 9 | %%%%%%%%%%%%%%%%%%%%%%%%% 10 | -------------------------------------------------------------------------------- /config/pacman_conf.yaml: -------------------------------------------------------------------------------- 1 | num_adversaries: 1 #pacman 2 | timeout: 30 3 | game_name: "trickyClassic" 4 | textGraphics: False 5 | quietGraphics: False 6 | zoom: 1.0 7 | fixRandomSeed: False 8 | recordActions: False 9 | replay: None 10 | frameTime: 0.1 11 | catchExceptions: False 12 | continuous_action: False 13 | obs_sort: False 14 | -------------------------------------------------------------------------------- /game/particle/multiagent/scenario.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | # defines scenario upon which the world is built 5 | class BaseScenario(object): 6 | # create elements of the world 7 | def make_world(self): 8 | raise NotImplementedError() 9 | # create initial conditions of the world 10 | 11 | def reset_world(self, world): 12 | raise NotImplementedError() 13 | -------------------------------------------------------------------------------- /run/__init__.py: -------------------------------------------------------------------------------- 1 | from .run_multi_ptf_ppo_sro import run as multi_ppo_sr_run 2 | from .run_maddpg_sr import run as run_maddpg_sr 3 | from .run_multi_ptf_shppo_sro import run as shppo_sr_run 4 | 5 | REGISTRY = {} 6 | 7 | REGISTRY['multi_ppo'] = multi_ppo_sr_run 8 | REGISTRY['multi_ppo_sro'] = multi_ppo_sr_run 9 | REGISTRY['maddpg'] = run_maddpg_sr 10 | REGISTRY['maddpg_sr'] = run_maddpg_sr 11 | REGISTRY['shppo'] = shppo_sr_run 12 | REGISTRY['shppo_sro'] = shppo_sr_run 13 | 14 | -------------------------------------------------------------------------------- /alg/maddpg/__init__.py: -------------------------------------------------------------------------------- 1 | class AgentTrainer(object): 2 | def __init__(self, name, model, obs_shape, act_space, args): 3 | raise NotImplemented() 4 | 5 | def action(self, obs): 6 | raise 
NotImplemented() 7 | 8 | def process_experience(self, obs, act, rew, new_obs, done, terminal): 9 | raise NotImplemented() 10 | 11 | def preupdate(self): 12 | raise NotImplemented() 13 | 14 | def update(self, agents): 15 | raise NotImplemented() 16 | -------------------------------------------------------------------------------- /util/fource_exit.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import msvcrt 3 | 4 | 5 | class Exit: 6 | def __init__(self): 7 | self.isExit = False 8 | self.thread = threading.Thread(target=self.work) 9 | 10 | def work(self): 11 | while True: 12 | newChar = msvcrt.getch() 13 | if newChar in b'\r': # 如果是换行,则输入结束 14 | self.isExit = True 15 | break 16 | 17 | def run(self): 18 | self.thread.start() 19 | 20 | def get_status(self): 21 | return self.isExit 22 | 23 | -------------------------------------------------------------------------------- /game/particle/multiagent/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | # Multiagent envs 4 | # ---------------------------------------- 5 | 6 | register( 7 | id='MultiagentSimple-v0', 8 | entry_point='multiagent.envs:SimpleEnv', 9 | # FIXME(cathywu) currently has to be exactly max_path_length parameters in 10 | # rllab run script 11 | max_episode_steps=100, 12 | ) 13 | 14 | register( 15 | id='MultiagentSimpleSpeakerListener-v0', 16 | entry_point='multiagent.envs:SimpleSpeakerListenerEnv', 17 | max_episode_steps=100, 18 | ) 19 | -------------------------------------------------------------------------------- /source/pacman/medium/0/command.txt: -------------------------------------------------------------------------------- 1 | -a multi_ppo -c multi_ptf_ppo_conf -g pacman -d pacman_conf -n 50000 -e 99 -s 7 -o adam n_layer_a_1=128 n_layer_c_1=128 option_layer_1=64 n_layer_a_2=128 n_layer_c_2=128 option_layer_2=64 c2=0.01 learning_rate_a=5e-4 learning_rate_c=5e-4 learning_rate_o=1e-3 learning_rate_t=1e-3 continuous_action=False reward_decay=0.99 clip_value=0.2 e_greedy=0.95 e_greedy_increment=1e-3 replace_target_iter=1000 learning_step=1000 option_batch_size=64 batch_size=64 save_per_episodes=10000 save_model=True c3=0.001 num_adversaries=1 adv_use_option=False good_use_option=False adv_load_model=True adv_load_model_path=source/pacman/3/model_20000_0 game_name=mediumClassic obs_sort=False xi=0 use_gpu=True use_gpu_id=1 memory_size=100000 -------------------------------------------------------------------------------- /source/pacman/original/0/command.txt: -------------------------------------------------------------------------------- 1 | -a multi_ppo -c multi_ptf_ppo_conf -g pacman -d pacman_conf -n 50000 -e 99 -s 7 -o adam n_layer_a_1=128 n_layer_c_1=128 option_layer_1=64 n_layer_a_2=128 n_layer_c_2=128 option_layer_2=64 c2=0.01 learning_rate_a=5e-4 learning_rate_c=5e-4 learning_rate_o=1e-3 learning_rate_t=1e-3 continuous_action=False reward_decay=0.99 clip_value=0.2 e_greedy=0.95 e_greedy_increment=1e-3 replace_target_iter=1000 learning_step=1000 option_batch_size=64 batch_size=64 save_per_episodes=5000 save_model=True c3=0.001 num_adversaries=1 adv_use_option=False good_use_option=False load_model=True load_model_path=source/pacman/original/2020-11-18_22-49-59/model game_name=originalClassic obs_sort=False xi=0 use_gpu=False use_gpu_id=1 memory_size=100000 -------------------------------------------------------------------------------- /alg/optimizer.py: 
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class Optimizer: 5 | def __init__( 6 | self, 7 | optimizer, 8 | learning_rate, 9 | momentum=None 10 | ): 11 | self.opt = None 12 | if str(optimizer).lower() == "grad": 13 | self.opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate) 14 | elif str(optimizer).lower() == "momentum": 15 | self.opt = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=momentum) 16 | elif str(optimizer).lower() == 'rmsprop': 17 | self.opt = tf.train.RMSPropOptimizer(learning_rate=learning_rate) 18 | elif str(optimizer).lower() == 'adam': 19 | self.opt = tf.train.AdamOptimizer(learning_rate=learning_rate) 20 | 21 | def get_optimizer(self): 22 | return self.opt 23 | -------------------------------------------------------------------------------- /game/pacman/layouts/originalClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | %............%%............% 3 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 4 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 5 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 6 | %..........................% 7 | %.%%%%.%%.%%%%%%%%.%%.%%%%.% 8 | %.%%%%.%%.%%%%%%%%.%%.%%%%.% 9 | %......%%....%%....%%......% 10 | %%%%%%.%%%%% %% %%%%%.%%%%%% 11 | %%%%%%.%%%%% %% %%%%%.%%%%%% 12 | %%%%%%.% %.%%%%%% 13 | %%%%%%.% %%%% %%%% %.%%%%%% 14 | % . %G G G% . % 15 | %%%%%%.% %%%%%%%%%% %.%%%%%% 16 | %%%%%%.% %.%%%%%% 17 | %%%%%%.% %%%%%%%%%% %.%%%%%% 18 | %............%%............% 19 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 20 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 21 | %...%%....... .......%%...% 22 | %%%.%%.%%.%%%%%%%%.%%.%%.%%% 23 | %%%.%%.%%.%%%%%%%%.%%.%%.%%% 24 | %......%%....%%....%%......% 25 | %.%%%%%%%%%%.%%.%%%%%%%%%%.% 26 | %.............P............% 27 | %%%%%%%%%%%%%%%%%%%%%%%%%%%% 28 | -------------------------------------------------------------------------------- /util/get_out_files.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | 5 | def paths(file_path): 6 | path_collection = [] 7 | path_target_collection = [] 8 | path_target_dir = [] 9 | for dirpath, dirnames, filenames in os.walk(file_path): 10 | for file_name in filenames: 11 | if file_name == 'out.json' or file_name == 'args.json' or file_name == 'command.txt': 12 | fullpath = os.path.join(dirpath, file_name) 13 | path_collection.append(fullpath) 14 | path_target_collection.append(fullpath.replace('results', 'results_out')) 15 | path_target_dir.append(dirpath.replace('results', 'results_out')) 16 | return path_collection, path_target_collection, path_target_dir 17 | 18 | 19 | source_path = '' 20 | 21 | 22 | if __name__ == "__main__": 23 | path_collection, path_target_collection, path_target_dir = paths(source_path) 24 | 25 | for (source, target, dir_path) in zip(path_collection, path_target_collection, path_target_dir): 26 | if not os.path.exists(dir_path): 27 | os.makedirs(dir_path) 28 | print(source, target) 29 | shutil.copyfile(source, target) 30 | -------------------------------------------------------------------------------- /config/maddpg_conf.yaml: -------------------------------------------------------------------------------- 1 | reward_decay: 0.99 2 | output_graph: True 3 | save_model: True 4 | summary_output_times: 10 5 | regular: 0.005 6 | learning_rate_a: 0.01 7 | learning_rate_c: 0.01 8 | ENTROPY_BETA: 0.0005 9 | USE_CPU_COUNT: True 10 | load_model: False 11 | load_model_path: '' 
12 | batch_size: 1024 13 | display: False 14 | 15 | #run 16 | reward_memory: 100 17 | save_per_episodes: 2000 18 | num_adversaries: 0 19 | good_policy: 'maddpg' 20 | adv_policy: 'maddpg' 21 | adv_use_option: False 22 | good_use_option: False 23 | adv_load_model: False 24 | adv_load_model_path: '' 25 | good_load_model: False 26 | good_load_model_path: '' 27 | use_gpu_id: '0' 28 | use_gpu: False 29 | other_option_update: True 30 | 31 | #option 32 | learning_rate_o: 0.0003 33 | learning_rate_t: 0.0003 34 | option_layer_1: 128 35 | option_layer_2: 128 36 | e_greedy: 0.95 37 | e_greedy_increment: 0.005 38 | start_greedy: 0.0 39 | memory_size: 1000000 40 | option_batch_size: 16 41 | xi: 0.005 42 | option_clip_value: 10.0 43 | is_soft_max_action: True 44 | replace_target_iter: 1000 45 | learning_step: 1000 46 | c3: 0.0005 47 | c1: 1.0 48 | 49 | # SF 50 | embedding_dim: 32 51 | option_embedding_layer: 64 52 | recon_loss_coef: 0.1 53 | learning_rate_r: 0.0003 54 | clip_value: 0.2 55 | 56 | #DVM 57 | distillation_frequent: 1000 58 | distillation_interation: 2048 59 | 60 | # network 61 | n_layer_a_1: 128 62 | 63 | # output 64 | SAVE_PATH: "model" 65 | graph_path: "graph" 66 | reward_output: "output" 67 | output_filename: "out" 68 | log: "log" 69 | benchmark_dir: "benchmark" -------------------------------------------------------------------------------- /config/ppo_conf.yaml: -------------------------------------------------------------------------------- 1 | # ppo 2 | learning_rate_a: 0.0003 3 | learning_rate_c: 0.0003 4 | batch_size: 32 5 | clip_value: 0.2 6 | reward_decay: 0.99 7 | c2: 0.001 8 | stochastic: True 9 | load_model: False 10 | load_model_path: '' 11 | adv_policy: "ppo" 12 | good_policy: "ppo" 13 | reward_normalize: False 14 | done_reward: 1.0 15 | 16 | # option 17 | option_batch_size: 32 18 | option_clip_value: 10.0 19 | other_option_update: True 20 | c1: 0.005 21 | c3: 0.0005 22 | epi_train_times: 1 23 | memory_size: 100000 24 | e_greedy: 0.95 25 | replace_target_iter: 1000 26 | e_greedy_increment: 0.001 27 | start_greedy: 0.0 28 | learning_step: 1000 29 | learning_rate_o: 0.00001 30 | learning_rate_t: 0.00001 31 | xi: 0.005 32 | adv_use_option: False 33 | good_use_option: False 34 | adv_load_model: False 35 | adv_load_model_path: '' 36 | good_load_model: False 37 | good_load_model_path: '' 38 | grad_clip: 10 39 | 40 | #sro 41 | learning_rate_r: 0.0003 42 | embedding_dim: 32 43 | option_embedding_layer: 64 44 | recon_loss_coef: 0.1 45 | 46 | # transfer_agent 47 | trans_agent_start_epi: 0 48 | 49 | #run 50 | reward_memory: 100 51 | save_per_episodes: 2000 52 | use_gpu_id: '0' 53 | use_gpu: False 54 | output_graph: True 55 | save_model: True 56 | summary_output_times: 10 57 | reload_model: False 58 | reload_model_path: '' 59 | 60 | # network 61 | policy: 'policy' 62 | old_policy: 'old_policy' 63 | n_layer_a_1: 64 64 | n_layer_a_2: 64 65 | n_layer_c_1: 64 66 | n_layer_c_2: 64 67 | option_layer_1: 128 68 | option_layer_2: 128 69 | 70 | # output 71 | SAVE_PATH: "model" 72 | graph_path: "graph" 73 | reward_output: "output" 74 | output_filename: "out" 75 | log: "log" 76 | 77 | -------------------------------------------------------------------------------- /source/pacman/medium/0/args.json: -------------------------------------------------------------------------------- 1 | {"numGames": 50000, "game": "pacman", "algorithm": "multi_ppo", "epi_step": 99, "seed": 7, "optimizer": "adam", "run_test": false, "results_path": "../results/multi_ppo_pacman/2019-12-12_13-12-59/", "learning_rate_a": 
0.0005, "learning_rate_c": 0.0005, "batch_size": 64, "option_batch_size": 64, "output_graph": true, "save_model": true, "summary_output_times": 10, "clip_value": 0.2, "option_clip_value": 10.0, "reward_decay": 0.99, "c1": 1.0, "c2": 0.01, "epi_train_times": 1, "stochastic": true, "load_model": false, "load_model_path": "", "memory_size": 100000, "e_greedy": 0.95, "replace_target_iter": 1000, "e_greedy_increment": 0.001, "start_greedy": 0.0, "learning_step": 1000, "regular": 0.005, "learning_rate_o": 0.001, "learning_rate_t": 0.001, "ENTROPY_BETA": 0.0005, "c3": 0.001, "xi": 0.0, "adv_policy": "ppo", "good_policy": "ppo", "adv_use_option": false, "good_use_option": false, "adv_load_model": true, "adv_load_model_path": "source/pacman/3/model_20000_0", "reward_normalize": false, "done_reward": 1.0, "reward_memory": 100, "save_per_episodes": 10000, "use_gpu_id": "1", "use_gpu": true, "policy": "policy", "old_policy": "old_policy", "n_layer_a_1": 128, "n_layer_a_2": 128, "n_layer_c_1": 128, "n_layer_c_2": 128, "temperature": 0.1, "option_layer_1": 64, "option_layer_2": 64, "SAVE_PATH": "model", "graph_path": "graph", "reward_output": "output", "output_filename": "out", "log": "log", "num_adversaries": 1, "timeout": 30, "game_name": "mediumClassic", "textGraphics": false, "quietGraphics": false, "zoom": 1.0, "fixRandomSeed": false, "recordActions": false, "replay": "None", "frameTime": 0.1, "catchExceptions": false, "continuous_action": false, "obs_sort": false} -------------------------------------------------------------------------------- /game/particle/bin/interactive.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os,sys 3 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 4 | import argparse 5 | 6 | from game.particle.multiagent.environment import MultiAgentEnv 7 | from game.particle.multiagent.policy import InteractivePolicy 8 | import game.particle.multiagent.scenarios as scenarios 9 | 10 | if __name__ == '__main__': 11 | # parse arguments 12 | parser = argparse.ArgumentParser(description=None) 13 | parser.add_argument('-s', '--scenario', default='simple_adversary.py', help='Path of the scenario Python script.') 14 | args = parser.parse_args() 15 | 16 | # load scenario from script 17 | scenario = scenarios.load(args.scenario).Scenario() 18 | # create world 19 | world = scenario.make_world() 20 | # create multiagent environment 21 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, info_callback=None, shared_viewer = False) 22 | # render call to create viewer window (necessary only for interactive policies) 23 | env.render() 24 | # create interactive policies for each agent 25 | policies = [InteractivePolicy(env, i) for i in range(env.n)] 26 | # execution loop 27 | obs_n = env.reset() 28 | while True: 29 | # query for action from each agent's policy 30 | act_n = [] 31 | for i, policy in enumerate(policies): 32 | act_n.append(policy.action(obs_n[i])) 33 | # step environment 34 | obs_n, reward_n, done_n, _ = env.step(act_n) 35 | # render all agent views 36 | env.render() 37 | # display rewards 38 | #for agent in env.world.agents: 39 | # print(agent.name + " reward: %0.3f" % env._get_reward(agent)) 40 | -------------------------------------------------------------------------------- /source/simple_tag/tag4/args.json: -------------------------------------------------------------------------------- 1 | {"numGames": 20000, "game": "particle", "algorithm": "multi_ppo_sr2", 
"epi_step": 99, "seed": 12345, "optimizer": "adam", "run_test": false, "obs_sort": false, "learning_rate_a": 0.0003, "learning_rate_c": 0.0003, "learning_rate_r": 0.0003, "batch_size": 32, "option_batch_size": 32, "output_graph": true, "save_model": true, "summary_output_times": 10, "clip_value": 0.2, "option_clip_value": 10.0, "reward_decay": 0.99, "c1": 1.0, "c2": 0.001, "epi_train_times": 1, "stochastic": true, "load_model": false, "load_model_path": "", "memory_size": 100000, "e_greedy": 0.95, "replace_target_iter": 1000, "e_greedy_increment": 0.005, "start_greedy": 0.0, "learning_step": 1000, "regular": 0.005, "learning_rate_o": 1e-05, "learning_rate_t": 1e-05, "ENTROPY_BETA": 0.0005, "c3": 0.0005, "xi": 0.0, "adv_policy": "ppo", "good_policy": "ppo", "adv_use_option": false, "good_use_option": false, "reward_normalize": false, "adv_load_model": false, "adv_load_model_path": "", "good_load_model": true, "good_load_model_path": "source\\simple_tag\\ppo_tag\\model_30000_3", "done_reward": 1.0, "trans_agent_start_epi": 0, "reward_memory": 100, "save_per_episodes": 2000, "use_gpu_id": "0", "use_gpu": false, "other_option_update": true, "policy": "policy", "old_policy": "old_policy", "n_layer_a_1": 64, "n_layer_a_2": 64, "n_layer_c_1": 64, "n_layer_c_2": 64, "temperature": 0.1, "option_layer_1": 32, "option_layer_2": 32, "embedding_dim": 32, "option_embedding_layer": 64, "recon_loss_coef": 0.1, "SAVE_PATH": "model", "graph_path": "graph", "reward_output": "output", "output_filename": "out", "log": "log", "game_name": "simple_tag", "continuous_action": false, "benchmark": false, "action_clip": 1, "num_adversaries": 3, "reward_func": "reward", "restrict_move": true, "results_path": "../results/multi_ppo_sr2/particle/simple_tag/2020-09-22_17-44-26/"} -------------------------------------------------------------------------------- /source/pacman/original/0/args.json: -------------------------------------------------------------------------------- 1 | {"numGames": 50000, "game": "pacman", "algorithm": "multi_ppo", "epi_step": 99, "seed": 7, "optimizer": "adam", "run_test": false, "learning_rate_a": 0.0005, "learning_rate_c": 0.0005, "batch_size": 64, "option_batch_size": 64, "output_graph": true, "save_model": true, "summary_output_times": 10, "clip_value": 0.2, "option_clip_value": 10.0, "reward_decay": 0.99, "c1": 1.0, "c2": 0.01, "epi_train_times": 1, "stochastic": true, "load_model": true, "load_model_path": "source/pacman/original/2020-11-18_22-49-59/model", "memory_size": 100000, "e_greedy": 0.95, "replace_target_iter": 1000, "e_greedy_increment": 0.001, "start_greedy": 0.0, "learning_step": 1000, "regular": 0.005, "learning_rate_o": 0.001, "learning_rate_t": 0.001, "ENTROPY_BETA": 0.0005, "c3": 0.001, "xi": 0.0, "adv_policy": "ppo", "good_policy": "ppo", "adv_use_option": false, "good_use_option": false, "adv_load_model": false, "adv_load_model_path": "", "good_load_model": false, "good_load_model_path": "", "reward_normalize": false, "done_reward": 1.0, "grad_clip": 10, "trans_agent_start_epi": 0, "reward_memory": 100, "save_per_episodes": 5000, "use_gpu_id": "1", "use_gpu": false, "other_option_update": true, "policy": "policy", "old_policy": "old_policy", "n_layer_a_1": 128, "n_layer_a_2": 128, "n_layer_c_1": 128, "n_layer_c_2": 128, "temperature": 0.1, "option_layer_1": 64, "option_layer_2": 64, "SAVE_PATH": "model", "graph_path": "graph", "reward_output": "output", "output_filename": "out", "log": "log", "num_adversaries": 1, "timeout": 30, "game_name": "originalClassic", 
"textGraphics": false, "quietGraphics": false, "zoom": 1.0, "fixRandomSeed": false, "recordActions": false, "replay": "None", "frameTime": 0.1, "catchExceptions": false, "continuous_action": false, "obs_sort": false, "results_path": "../results/multi_ppo/pacman/originalClassic/2020-11-21_11-40-33/"} -------------------------------------------------------------------------------- /game/particle/multiagent/policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pyglet.window import key 3 | 4 | 5 | # individual agent policy 6 | class Policy(object): 7 | def __init__(self): 8 | pass 9 | 10 | def action(self, obs): 11 | raise NotImplementedError() 12 | 13 | 14 | # interactive policy based on keyboard input 15 | # hard-coded to deal only with movement, not communication 16 | class InteractivePolicy(Policy): 17 | def __init__(self, env, agent_index): 18 | super(InteractivePolicy, self).__init__() 19 | self.env = env 20 | # hard-coded keyboard events 21 | self.move = [False for i in range(4)] 22 | self.comm = [False for i in range(env.world.dim_c)] 23 | # register keyboard events with this environment's window 24 | env.viewers[agent_index].window.on_key_press = self.key_press 25 | env.viewers[agent_index].window.on_key_release = self.key_release 26 | 27 | def action(self, obs): 28 | # ignore observation and just act based on keyboard events 29 | if self.env.discrete_action_input: 30 | u = 0 31 | if self.move[0]: u = 1 32 | if self.move[1]: u = 2 33 | if self.move[2]: u = 4 34 | if self.move[3]: u = 3 35 | else: 36 | u = np.zeros(5) # 5-d because of no-move action 37 | if self.move[0]: u[1] += 1.0 38 | if self.move[1]: u[2] += 1.0 39 | if self.move[3]: u[3] += 1.0 40 | if self.move[2]: u[4] += 1.0 41 | if True not in self.move: 42 | u[0] += 1.0 43 | return np.concatenate([u, np.zeros(self.env.world.dim_c)]) 44 | 45 | # keyboard event callbacks 46 | def key_press(self, k, mod): 47 | if k == key.LEFT: self.move[0] = True 48 | if k == key.RIGHT: self.move[1] = True 49 | if k == key.UP: self.move[2] = True 50 | if k == key.DOWN: self.move[3] = True 51 | 52 | def key_release(self, k, mod): 53 | if k == key.LEFT: self.move[0] = False 54 | if k == key.RIGHT: self.move[1] = False 55 | if k == key.UP: self.move[2] = False 56 | if k == key.DOWN: self.move[3] = False 57 | -------------------------------------------------------------------------------- /util/output_json.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | 5 | 6 | class OutputJson: 7 | def __init__(self, data_field=[]): 8 | self.data_field = data_field 9 | self.data = {} 10 | for i in range(len(data_field)): 11 | if not isinstance(data_field[i], str): 12 | raise Exception('the data field must be type of string: ' + str(data_field[i])) 13 | 14 | self.data[data_field[i]] = [] 15 | 16 | def update(self, value, key=None): 17 | if key is not None: 18 | if isinstance(value, bool): 19 | value = str(value) 20 | self.data[key].append(value) 21 | return 22 | if isinstance(value, tuple) or isinstance(value, list): 23 | if len(value) != len(self.data_field): 24 | raise Exception('Error in parameters size: ' + str(value)) 25 | for i in range(len(value)): 26 | if type(value[i]) is np.bool_ or type(value[i]) is np.bool or type(value[i]) is bool: 27 | self.data[self.data_field[i]].append(str(value[i])) 28 | else: 29 | self.data[self.data_field[i]].append(value[i]) 30 | 31 | def print_first(self): 32 | if 
self.data == {}: 33 | return 34 | for i, key in enumerate(self.data_field): 35 | print(key, ": %s, " % self.data[key][len(self.data[key]) - 1], end=' ') 36 | print() 37 | 38 | def print_by_key(self, key, index=None): 39 | if index is None: 40 | print(key, ": ", self.data[key]) 41 | else: 42 | print(key, " ", index, ": ", self.data[key][index]) 43 | 44 | def save(self, path, filename, field=None): 45 | if not os.path.exists(path): 46 | os.makedirs(path) 47 | if field is None: 48 | field = self.data_field 49 | out = {} 50 | for key in field: 51 | if len(self.data[key]) > 0 and type(self.data[key][0]) is np.ndarray: 52 | out[key] = [a.tolist() for a in self.data[key]] 53 | else: 54 | out[key] = self.data[key] 55 | with open(path + "/" + filename + ".json", "w") as f: 56 | json.dump(out, f) 57 | -------------------------------------------------------------------------------- /game/particle/multiagent/multi_discrete.py: -------------------------------------------------------------------------------- 1 | # An old version of OpenAI Gym's multi_discrete.py. (Was getting affected by Gym updates) 2 | # (https://github.com/openai/gym/blob/1fb81d4e3fb780ccf77fec731287ba07da35eb84/gym/spaces/multi_discrete.py) 3 | 4 | import numpy as np 5 | 6 | import gym 7 | from gym.spaces import prng 8 | 9 | 10 | class MultiDiscrete(gym.Space): 11 | """ 12 | - The multi-discrete action space consists of a series of discrete action spaces with different parameters 13 | - It can be adapted to both a Discrete action space or a continuous (Box) action space 14 | - It is useful to represent game controllers or keyboards where each key can be represented as a discrete action space 15 | - It is parametrized by passing an array of arrays containing [min, max] for each discrete action space 16 | where the discrete action space can take any integers from `min` to `max` (both inclusive) 17 | Note: A value of 0 always need to represent the NOOP action. 18 | e.g. 
Nintendo Game Controller
19 |     - Can be conceptualized as 3 discrete action spaces:
20 |         1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4
21 |         2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
22 |         3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
23 |     - Can be initialized as
24 |         MultiDiscrete([ [0,4], [0,1], [0,1] ])
25 |     """
26 |     def __init__(self, array_of_param_array):
27 |         self.low = np.array([x[0] for x in array_of_param_array])
28 |         self.high = np.array([x[1] for x in array_of_param_array])
29 |         self.num_discrete_space = self.low.shape[0]
30 | 
31 |     def sample(self):
32 |         """ Returns an array with one sample from each discrete action space """
33 |         # For each row: round(random .* (max - min) + min, 0)
34 |         random_array = prng.np_random.rand(self.num_discrete_space)
35 |         return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)]
36 | 
37 |     def contains(self, x):
38 |         return len(x) == self.num_discrete_space and (np.array(x) >= self.low).all() and (np.array(x) <= self.high).all()
39 | 
40 |     @property
41 |     def shape(self):
42 |         return self.num_discrete_space
43 | 
44 |     def __repr__(self):
45 |         return "MultiDiscrete" + str(self.num_discrete_space)
46 | 
47 |     def __eq__(self, other):
48 |         return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high)
49 | 
--------------------------------------------------------------------------------
/alg/common/common.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | import json
4 | from alg.muti_ptf_ppo.ppo import PPO
5 | 
6 | def action_equal(action1, action2, continuous_action=None):
7 |     if not continuous_action or continuous_action is None:
8 |         if (isinstance(action1, list) or isinstance(action1, np.ndarray)) and (isinstance(action2, list) or isinstance(action2, np.ndarray)):
9 |             return (np.array(action1) == np.array(action2)).all()
10 |         else:
11 |             return False
12 |     elif continuous_action:
13 |         mean = action1[0]
14 |         std = action1[1]
15 |         for i in range(len(action2)):
16 |             if action2[i] < mean[i] - std[i] or action2[i] > mean[i] + std[i]:
17 |                 return False
18 |         return True
19 | 
20 | 
21 | def build_source_actor(args, sess, policy_path, i=0):
22 |     par_path = os.path.dirname(policy_path)
23 |     file_name = ''
24 |     for dirPath, dirNames, fileNames in os.walk(par_path):
25 |         # print(fileNames)
26 |         for fileName in fileNames:
27 |             if fileName == 'args.json':
28 |                 file_name = fileName
29 |                 break
30 |         if file_name != '':
31 |             break
32 |     file_path = par_path + "/" + file_name
33 |     with open(file_path, 'r') as f:
34 |         source_args = json.load(f)
35 |     source_policy = 'ppo'  # args['policy']
36 |     if source_policy == 'ppo':
37 |         return PPO(args['action_dim'], args['features'], source_args, sess, logger=None, i=i)
38 |     else:
39 |         raise Exception('no such source_policy named: ' + str(source_policy))
40 | 
41 | 
42 | class OptionToList:
43 |     def __init__(self, num_agent):
44 |         self.num_agent = num_agent
45 |         self.option_list = []
46 |         self.reset()
47 | 
48 |     def reset(self):
49 |         self.option_list = []
50 |         length = np.power(self.num_agent - 1, self.num_agent)
51 |         for i in range(length):
52 |             self.option_list.append(self.number_converter(i))
53 | 
54 |     # FIXME: the option network outputs an option index; converting it into a union option is a base conversion, e.g. with option_dim=3 and option_index=26 the union option is [2, 2, 2]
55 |     def number_converter(self, number):
56 |         hex = self.num_agent
57 |         res = np.zeros(hex)
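        # (the loop below writes `number` in base (num_agent - 1): each remainder is one digit of the
        #  union option, collected least-significant first and reversed at the end, while the quotient
        #  is carried forward into the next iteration)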
58 |         index = 0
59 |         while True:
60 |             s = number // (hex - 1)  # quotient
61 |             y = number % (hex - 1)  # remainder
62 |             res[index] = y
63 |             if s == 0:
64 |                 break
65 |             number = s
66 |             index += 1
67 |         res = list(res)
68 |         res.reverse()
69 |         return res
70 | 
71 |     def get_option_list(self, i):
72 |         if i >= len(self.option_list):
73 |             raise IndexError('out of option_list memory!')
74 |         return self.option_list[i]
75 | 
--------------------------------------------------------------------------------
/util/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import numpy
3 | import time
4 | import threading
5 | 
6 | R = threading.Lock()
7 | 
8 | class Logger:
9 |     def __init__(self, log_name, graph_path, args):
10 |         # step 1: create a logger
11 |         self.logger = self.build_log(log_name)
12 |         self.build_tb_log(graph_path)
13 |         self.args = args
14 |         self.keys = dict()
15 |         if 'summary_output_times' in self.args.keys():
16 |             self.summary_times = self.args['summary_output_times']
17 |         else:
18 |             self.summary_times = 1
19 | 
20 | 
21 |     def build_log(self, name):
22 |         logger = logging.getLogger()
23 |         logger.setLevel(logging.INFO)  # master switch for the log level
24 |         # step 2: create a handler that writes to the log file
25 |         rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
26 |         log_name = name + '/out.log'
27 |         logfile = log_name
28 |         fh = logging.FileHandler(logfile, mode='w')
29 |         fh.setLevel(logging.DEBUG)  # log level for messages written to the file
30 |         # step 3: define the handler's output format
31 |         formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
32 |         fh.setFormatter(formatter)
33 |         # step 4: attach the handler to the logger
34 |         logger.addHandler(fh)
35 |         return logger
36 | 
37 |     def build_tb_log(self, path):
38 |         from tensorboard_logger import configure, log_value, log_histogram
39 |         configure(path)
40 |         self.tb_logger = log_value
41 |         self.tb_h_logger = log_histogram
42 | 
43 |     def write_tb_log(self, key, value, t):
44 |         if self.args['output_graph']:
45 |             if t % self.summary_times != 0:
46 |                 return
47 |             #print(key, value)
48 |             if type(value) is numpy.ndarray or type(value) is list:
49 |                 R.acquire()
50 |                 if key not in self.keys.keys():
51 |                     self.keys[key] = 0
52 |                 else:
53 |                     self.keys[key] += 1
54 |                 #print(type(value), key, value, self.keys[key])
55 |                 self.tb_h_logger(key, value, self.keys[key])
56 |                 R.release()
57 |             else:
58 |                 R.acquire()
59 |                 if key not in self.keys.keys():
60 |                     self.keys[key] = 0
61 |                 else:
62 |                     self.keys[key] += 1
63 |                 #print(key, value, self.keys[key])
64 |                 self.tb_logger(key, value, self.keys[key])
65 |                 R.release()
66 |         else:
67 |             return
68 | 
69 | 
70 |     def write_log(self, msg, type='info'):
71 |         if type == 'debug':
72 |             self.logger.debug(msg)
73 |         elif type == 'info':
74 |             self.logger.info(msg)
75 |         elif type == 'warning':
76 |             self.logger.warning(msg)
77 |         elif type == 'error':
78 |             self.logger.error(msg)
79 |         elif type == 'critical':
80 |             self.logger.critical(msg)
81 | 
82 | 
--------------------------------------------------------------------------------
/game/pacman/textDisplay.py:
--------------------------------------------------------------------------------
1 | # textDisplay.py
2 | # --------------
3 | # Licensing Information: You are free to use or extend these projects for
4 | # educational purposes provided that (1) you do not distribute or publish
5 | # solutions, (2) you retain this notice, and (3) you provide clear
6 | # attribution to UC Berkeley, including a link to http://ai.berkeley.edu.
7 | # 
8 | # Attribution Information: The Pacman AI projects were developed at UC Berkeley.
9 | # The core projects and autograders were primarily created by John DeNero 10 | # (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 11 | # Student side autograding was added by Brad Miller, Nick Hay, and 12 | # Pieter Abbeel (pabbeel@cs.berkeley.edu). 13 | 14 | 15 | import time 16 | try: 17 | import pacman 18 | except: 19 | pass 20 | 21 | DRAW_EVERY = 1 22 | SLEEP_TIME = 0 # This can be overwritten by __init__ 23 | DISPLAY_MOVES = False 24 | QUIET = False # Supresses output 25 | 26 | class NotGraphics: 27 | def initialize(self, state, isBlue = False): 28 | pass 29 | 30 | def update(self, state): 31 | pass 32 | 33 | def checkNullDisplay(self): 34 | return True 35 | 36 | def pause(self): 37 | time.sleep(SLEEP_TIME) 38 | 39 | def draw(self, state): 40 | pass 41 | 42 | def updateDistributions(self, dist): 43 | pass 44 | 45 | def finish(self): 46 | pass 47 | 48 | class NullGraphics: 49 | def initialize(self, state, isBlue = False): 50 | pass 51 | 52 | def update(self, state): 53 | pass 54 | 55 | def checkNullDisplay(self): 56 | return True 57 | 58 | def pause(self): 59 | time.sleep(SLEEP_TIME) 60 | 61 | def draw(self, state): 62 | print(state) 63 | 64 | def updateDistributions(self, dist): 65 | pass 66 | 67 | def finish(self): 68 | pass 69 | 70 | class PacmanGraphics: 71 | def __init__(self, speed=None): 72 | if speed != None: 73 | global SLEEP_TIME 74 | SLEEP_TIME = speed 75 | 76 | def initialize(self, state, isBlue = False): 77 | self.draw(state) 78 | self.pause() 79 | self.turn = 0 80 | self.agentCounter = 0 81 | 82 | def update(self, state): 83 | numAgents = len(state.agentStates) 84 | self.agentCounter = (self.agentCounter + 1) % numAgents 85 | if self.agentCounter == 0: 86 | self.turn += 1 87 | if DISPLAY_MOVES: 88 | ghosts = [pacman.nearestPoint(state.getGhostPosition(i)) for i in range(1, numAgents)] 89 | print(("%4d) P: %-8s" % (self.turn, str(pacman.nearestPoint(state.getPacmanPosition()))), '| Score: %-5d' % state.score, '| Ghosts:', ghosts)) 90 | if self.turn % DRAW_EVERY == 0: 91 | self.draw(state) 92 | self.pause() 93 | if state._win or state._lose: 94 | self.draw(state) 95 | 96 | def pause(self): 97 | time.sleep(SLEEP_TIME) 98 | 99 | def draw(self, state): 100 | print(state) 101 | 102 | def finish(self): 103 | pass 104 | -------------------------------------------------------------------------------- /alg/maddpg/trainer/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | class ReplayBuffer(object): 5 | def __init__(self, size): 6 | """Create Prioritized Replay buffer. 7 | 8 | Parameters 9 | ---------- 10 | size: int 11 | Max number of transitions to store in the buffer. When the buffer 12 | overflows the old memories are dropped. 
13 | """ 14 | self._storage = [] 15 | self._maxsize = int(size) 16 | self._next_idx = 0 17 | 18 | def __len__(self): 19 | return len(self._storage) 20 | 21 | def clear(self): 22 | self._storage = [] 23 | self._next_idx = 0 24 | 25 | def add(self, obs_t, action, reward, obs_tp1, option, term, done): 26 | data = (obs_t, action, reward, obs_tp1, option, term, done) 27 | 28 | if self._next_idx >= len(self._storage): 29 | self._storage.append(data) 30 | else: 31 | self._storage[self._next_idx] = data 32 | self._next_idx = (self._next_idx + 1) % self._maxsize 33 | 34 | def _encode_sample(self, idxes): 35 | obses_t, actions, rewards, obses_tp1, options, terms, dones = [], [], [], [], [], [], [] 36 | for i in idxes: 37 | data = self._storage[i] 38 | obs_t, action, reward, obs_tp1, option, term, done = data 39 | obses_t.append(np.array(obs_t, copy=False)) 40 | actions.append(np.array(action, copy=False)) 41 | rewards.append(reward) 42 | obses_tp1.append(np.array(obs_tp1, copy=False)) 43 | options.append(option) 44 | terms.append(term) 45 | dones.append(done) 46 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(options), np.array(terms), np.array(dones) 47 | 48 | def make_index(self, batch_size): 49 | return [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] 50 | 51 | def make_latest_index(self, batch_size): 52 | idx = [(self._next_idx - 1 - i) % self._maxsize for i in range(batch_size)] 53 | np.random.shuffle(idx) 54 | return idx 55 | 56 | def sample_index(self, idxes): 57 | return self._encode_sample(idxes) 58 | 59 | def sample(self, batch_size): 60 | """Sample a batch of experiences. 61 | 62 | Parameters 63 | ---------- 64 | batch_size: int 65 | How many transitions to sample. 66 | 67 | Returns 68 | ------- 69 | obs_batch: np.array 70 | batch of observations 71 | act_batch: np.array 72 | batch of actions executed given obs_batch 73 | rew_batch: np.array 74 | rewards received as results of executing act_batch 75 | next_obs_batch: np.array 76 | next set of observations seen after executing act_batch 77 | done_mask: np.array 78 | done_mask[i] = 1 if executing act_batch[i] resulted in 79 | the end of an episode and 0 otherwise. 
80 | """ 81 | if batch_size > 0: 82 | idxes = self.make_index(batch_size) 83 | else: 84 | idxes = range(0, len(self._storage)) 85 | return self._encode_sample(idxes) 86 | 87 | def collect(self): 88 | return self.sample(-1) 89 | -------------------------------------------------------------------------------- /util/ReplayBuffer.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import random 3 | 4 | class ReplayBuffer(object): 5 | 6 | def __init__(self, buffer_size): 7 | self.buffer_size = buffer_size 8 | self.num_experiences = 0 9 | self.buffer = deque() 10 | 11 | def get_batch(self, batch_size): 12 | # Randomly sample batch_size examples 13 | return random.sample(self.buffer, batch_size) 14 | 15 | def size(self): 16 | return self.buffer_size 17 | 18 | def add(self, state, action, reward, done, new_state, opa): 19 | experience = (state, action, reward, done, new_state, opa) 20 | if self.num_experiences < self.buffer_size: 21 | self.buffer.append(experience) 22 | self.num_experiences += 1 23 | else: 24 | self.buffer.popleft() 25 | self.buffer.append(experience) 26 | 27 | def count(self): 28 | # if buffer is full, return buffer size 29 | # otherwise, return experience counter 30 | return self.num_experiences 31 | 32 | def erase(self): 33 | self.buffer = deque() 34 | self.num_experiences = 0 35 | 36 | class ReplayBufferSR(object): 37 | 38 | def __init__(self, buffer_size): 39 | self.buffer_size = buffer_size 40 | self.num_experiences = 0 41 | self.buffer = deque() 42 | 43 | def get_batch(self, batch_size): 44 | # Randomly sample batch_size examples 45 | return random.sample(self.buffer, batch_size) 46 | 47 | def size(self): 48 | return self.buffer_size 49 | 50 | def add(self, state, action, reward, done, new_state, opa): 51 | experience = (state, action, reward, done, new_state, opa) 52 | if self.num_experiences < self.buffer_size: 53 | self.buffer.append(experience) 54 | self.num_experiences += 1 55 | else: 56 | self.buffer.popleft() 57 | self.buffer.append(experience) 58 | 59 | def count(self): 60 | # if buffer is full, return buffer size 61 | # otherwise, return experience counter 62 | return self.num_experiences 63 | 64 | def erase(self): 65 | self.buffer = deque() 66 | self.num_experiences = 0 67 | 68 | 69 | class ShareReplayBuffer(object): 70 | 71 | def __init__(self, buffer_size): 72 | self.buffer_size = buffer_size 73 | self.num_experiences = 0 74 | self.buffer = deque() 75 | 76 | def get_batch(self, batch_size): 77 | # Randomly sample batch_size examples 78 | return random.sample(self.buffer, batch_size) 79 | 80 | def size(self): 81 | return self.buffer_size 82 | 83 | def add(self, state, action, reward, done, new_state, opa, agentId): 84 | experience = (state, action, reward, done, new_state, opa, agentId) 85 | if self.num_experiences < self.buffer_size: 86 | self.buffer.append(experience) 87 | self.num_experiences += 1 88 | else: 89 | self.buffer.popleft() 90 | self.buffer.append(experience) 91 | 92 | def count(self): 93 | # if buffer is full, return buffer size 94 | # otherwise, return experience counter 95 | return self.num_experiences 96 | 97 | def erase(self): 98 | self.buffer = deque() 99 | self.num_experiences = 0 -------------------------------------------------------------------------------- /game/pacman/keyboardAgents.py: -------------------------------------------------------------------------------- 1 | # keyboardAgents.py 2 | # ----------------- 3 | # Licensing Information: You are free to use or 
extend these projects for 4 | # educational purposes provided that (1) you do not distribute or publish 5 | # solutions, (2) you retain this notice, and (3) you provide clear 6 | # attribution to UC Berkeley, including a link to http://ai.berkeley.edu. 7 | # 8 | # Attribution Information: The Pacman AI projects were developed at UC Berkeley. 9 | # The core projects and autograders were primarily created by John DeNero 10 | # (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 11 | # Student side autograding was added by Brad Miller, Nick Hay, and 12 | # Pieter Abbeel (pabbeel@cs.berkeley.edu). 13 | 14 | 15 | from game.pacman.game import Agent 16 | from game.pacman.game import Directions 17 | import random 18 | 19 | 20 | class KeyboardAgent(Agent): 21 | """ 22 | An agent controlled by the keyboard. 23 | """ 24 | # NOTE: Arrow keys also work. 25 | WEST_KEY = 'a' 26 | EAST_KEY = 'd' 27 | NORTH_KEY = 'w' 28 | SOUTH_KEY = 's' 29 | STOP_KEY = 'q' 30 | 31 | def __init__(self, index=0): 32 | 33 | self.lastMove = Directions.STOP 34 | self.index = index 35 | self.keys = [] 36 | 37 | def getAction(self, state): 38 | from game.pacman.graphicsUtils import keys_waiting 39 | from game.pacman.graphicsUtils import keys_pressed 40 | keys = keys_waiting() + keys_pressed() 41 | if keys != []: 42 | self.keys = keys 43 | 44 | legal = state.getLegalActions(self.index) 45 | move = self.getMove(legal) 46 | 47 | if move == Directions.STOP: 48 | # Try to move in the same direction as before 49 | if self.lastMove in legal: 50 | move = self.lastMove 51 | 52 | if (self.STOP_KEY in self.keys) and Directions.STOP in legal: 53 | move = Directions.STOP 54 | 55 | if move not in legal: 56 | move = random.choice(legal) 57 | 58 | self.lastMove = move 59 | return move 60 | 61 | def getMove(self, legal): 62 | move = Directions.STOP 63 | if (self.WEST_KEY in self.keys or 'Left' in self.keys) and Directions.WEST in legal: 64 | move = Directions.WEST 65 | if (self.EAST_KEY in self.keys or 'Right' in self.keys) and Directions.EAST in legal: 66 | move = Directions.EAST 67 | if (self.NORTH_KEY in self.keys or 'Up' in self.keys) and Directions.NORTH in legal: 68 | move = Directions.NORTH 69 | if (self.SOUTH_KEY in self.keys or 'Down' in self.keys) and Directions.SOUTH in legal: 70 | move = Directions.SOUTH 71 | return move 72 | 73 | 74 | class KeyboardAgent2(KeyboardAgent): 75 | """ 76 | A second agent controlled by the keyboard. 77 | """ 78 | # NOTE: Arrow keys also work. 
79 | WEST_KEY = 'j' 80 | EAST_KEY = "l" 81 | NORTH_KEY = 'i' 82 | SOUTH_KEY = 'k' 83 | STOP_KEY = 'u' 84 | 85 | def getMove(self, legal): 86 | move = Directions.STOP 87 | if (self.WEST_KEY in self.keys) and Directions.WEST in legal: 88 | move = Directions.WEST 89 | if (self.EAST_KEY in self.keys) and Directions.EAST in legal: 90 | move = Directions.EAST 91 | if (self.NORTH_KEY in self.keys) and Directions.NORTH in legal: 92 | move = Directions.NORTH 93 | if (self.SOUTH_KEY in self.keys) and Directions.SOUTH in legal: 94 | move = Directions.SOUTH 95 | return move 96 | -------------------------------------------------------------------------------- /game/pacman/ghostAgents.py: -------------------------------------------------------------------------------- 1 | # ghostAgents.py 2 | # -------------- 3 | # Licensing Information: You are free to use or extend these projects for 4 | # educational purposes provided that (1) you do not distribute or publish 5 | # solutions, (2) you retain this notice, and (3) you provide clear 6 | # attribution to UC Berkeley, including a link to http://ai.berkeley.edu. 7 | # 8 | # Attribution Information: The Pacman AI projects were developed at UC Berkeley. 9 | # The core projects and autograders were primarily created by John DeNero 10 | # (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 11 | # Student side autograding was added by Brad Miller, Nick Hay, and 12 | # Pieter Abbeel (pabbeel@cs.berkeley.edu). 13 | 14 | 15 | from game.pacman.game import Agent 16 | from game.pacman.game import Actions 17 | from game.pacman.game import Directions 18 | import random 19 | from game.pacman.util import manhattanDistance 20 | import game.pacman.util as util 21 | 22 | 23 | class GhostAgent(Agent): 24 | 25 | def __init__(self, index): 26 | self.index = index 27 | 28 | def getAction(self, state): 29 | dist = self.getDistribution(state) 30 | if len(dist) == 0: 31 | return Directions.STOP 32 | else: 33 | return util.chooseFromDistribution(dist) 34 | 35 | def getDistribution(self, state): 36 | "Returns a Counter encoding a distribution over actions from the provided state." 37 | util.raiseNotDefined() 38 | 39 | 40 | class RandomGhost(GhostAgent): 41 | "A ghost that chooses a legal action uniformly at random." 42 | 43 | def getDistribution(self, state): 44 | dist = util.Counter() 45 | for a in state.getLegalActions(self.index): 46 | dist[a] = 1.0 47 | dist.normalize() 48 | return dist 49 | 50 | 51 | class DirectionalGhost(GhostAgent): 52 | "A ghost that prefers to rush Pacman, or flee when scared." 
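    # getDistribution (below) puts prob_attack (or prob_scaredFlee when scared) on the legal action(s)
    # that minimise (or, when scared, maximise) the Manhattan distance to Pacman, and spreads the
    # remaining probability uniformly over all legal actions before normalising.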
53 | 54 | def __init__(self, index, prob_attack=0.8, prob_scaredFlee=0.8): 55 | self.index = index 56 | self.prob_attack = prob_attack 57 | self.prob_scaredFlee = prob_scaredFlee 58 | 59 | def getDistribution(self, state): 60 | # Read variables from state 61 | ghostState = state.getGhostState(self.index) 62 | legalActions = state.getLegalActions(self.index) 63 | pos = state.getGhostPosition(self.index) 64 | isScared = ghostState.scaredTimer > 0 65 | 66 | speed = 1 67 | if isScared: 68 | speed = 0.5 69 | 70 | actionVectors = [Actions.directionToVector( 71 | a, speed) for a in legalActions] 72 | newPositions = [(pos[0] + a[0], pos[1] + a[1]) for a in actionVectors] 73 | pacmanPosition = state.getPacmanPosition() 74 | 75 | # Select best actions given the state 76 | distancesToPacman = [manhattanDistance( 77 | pos, pacmanPosition) for pos in newPositions] 78 | if isScared: 79 | bestScore = max(distancesToPacman) 80 | bestProb = self.prob_scaredFlee 81 | else: 82 | bestScore = min(distancesToPacman) 83 | bestProb = self.prob_attack 84 | bestActions = [action for action, distance in zip( 85 | legalActions, distancesToPacman) if distance == bestScore] 86 | 87 | # Construct distribution 88 | dist = util.Counter() 89 | for a in bestActions: 90 | dist[a] = bestProb / len(bestActions) 91 | for a in legalActions: 92 | dist[a] += (1 - bestProb) / len(legalActions) 93 | dist.normalize() 94 | return dist 95 | -------------------------------------------------------------------------------- /game/pacman/make_env.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym 3 | import numpy as np 4 | from gym import spaces 5 | 6 | import game.pacman.layout as layout 7 | from game.pacman.pacman import readCommand 8 | from game.pacman.pacman import ClassicGameRules 9 | import game.pacman.textDisplay as textDisplay 10 | from game.pacman.ghostAgents import RandomGhost as Ghost 11 | 12 | 13 | class Agent: 14 | def __init__(self): 15 | self.name = '' 16 | 17 | def get(self): 18 | raise NotImplemented() 19 | 20 | 21 | class Wrap_pacman(): 22 | def __init__(self, args): 23 | # , layout, pacman, ghosts, display, numGames, record, numTraining = 0, catchExceptions = False, timeout = 30 24 | self.args = args 25 | self.layout = layout.getLayout(args['game_name']) 26 | self.rules = ClassicGameRules(self.args['timeout']) 27 | self.pacman = Agent() 28 | self.ghosts = [Agent() for i in range(self.layout.getNumGhosts())]#[Ghost(i+1) for i in range(self.layout.getNumGhosts())] # [Agent() for i in range(self.layout.getNumGhosts())] 29 | if self.args['quietGraphics']: 30 | display = textDisplay.NullGraphics() 31 | elif self.args['textGraphics']: 32 | textDisplay.SLEEP_TIME = self.args['frameTime'] 33 | display = textDisplay.PacmanGraphics() 34 | else: 35 | import game.pacman.graphicsDisplay as graphicsDisplay 36 | display = graphicsDisplay.PacmanGraphics(self.args['zoom'], frameTime=self.args['frameTime']) 37 | self.beQuiet = False 38 | self.textDisplay = textDisplay.NotGraphics() 39 | self.videoDisplay = display 40 | self.rules.quiet = True 41 | self.catchExceptions = self.args['catchExceptions'] 42 | self.done = True 43 | 44 | self.action2str = ['North', 'South', 'East', 'West', 'Stop'] 45 | self.game = self.rules.newGame(self.layout, self.pacman, self.ghosts, display, self.beQuiet, 46 | self.catchExceptions) 47 | 48 | # gym-like info 49 | self.n = len(self.game.agents) 50 | self.action_space = [spaces.Discrete(len(self.action2str)) for i in range(self.n)] 51 | 
self.observation_space = [spaces.Box(low=0, high=1, shape=((self.layout.width + self.layout.height) * self.n + 18,), dtype=np.float32) if i == 0 else 52 | spaces.Box(low=0, high=1, shape=((self.layout.width + self.layout.height) * 2,), dtype=np.float32) 53 | for i in range(self.n)] 54 | 55 | def step(self, actions, done=None): 56 | assert not self.done, 'done! step after reset' 57 | actions = [np.argmax(a) for a in actions] 58 | actions = [self.action2str[action] for action in actions] 59 | # ghost_action = [] 60 | # ghost_action.append(actions[0]) 61 | # for ghost in self.ghosts: 62 | # action = ghost.getAction(self.game.state) 63 | # ghost_action.append(action) 64 | # print(ghost_action) 65 | state, reward, done, info = self.game.step(actions) 66 | self.done = done 67 | done = [done for i in range(self.n)] 68 | return state, reward, done, info 69 | 70 | def reset(self, render=False): 71 | del self.game 72 | del self.rules 73 | del self.pacman 74 | del self.ghosts 75 | 76 | self.pacman = Agent() 77 | self.ghosts = [Agent() for i in range(self.layout.getNumGhosts())]#[Ghost(i+1) for i in range(self.layout.getNumGhosts())] 78 | 79 | self.rules = ClassicGameRules(self.args['timeout']) 80 | self.rules.quiet = True 81 | 82 | if render: 83 | display = self.videoDisplay 84 | self.rules.quiet = False 85 | else: 86 | display = self.textDisplay 87 | self.rules.quiet = True 88 | 89 | self.game = self.rules.newGame(self.layout, self.pacman, self.ghosts, display, self.beQuiet, 90 | self.catchExceptions) 91 | 92 | self.done = False 93 | 94 | return self.game.reset(render=render) 95 | 96 | def render(self): 97 | pass 98 | 99 | 100 | def runGames(args): 101 | env = Wrap_pacman(args) 102 | return env 103 | 104 | 105 | def runGames_2(layout, pacman, ghosts, display, numGames, record, numTraining=0, catchExceptions=False, timeout=30): 106 | 107 | rules = ClassicGameRules(timeout) 108 | games = [] 109 | 110 | for i in range(numGames): 111 | beQuiet = i < numTraining 112 | 113 | gameDisplay = textDisplay.NullGraphics() 114 | rules.quiet = True 115 | 116 | # render 117 | # gameDisplay = display 118 | # rules.quiet = False 119 | 120 | game = rules.newGame(layout, pacman, ghosts, 121 | gameDisplay, beQuiet, catchExceptions) 122 | game.run() 123 | 124 | return games 125 | 126 | 127 | def make_env(args): 128 | #args = readCommand(sys.argv[1:]) # Get game components based on input 129 | #print(args) 130 | return runGames(args) 131 | # runGames_2(**args) 132 | # return env -------------------------------------------------------------------------------- /game/particle/make_env.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code for creating a multiagent environment with one of the scenarios listed 3 | in ./scenarios/. 4 | Can be called by using, for example: 5 | env = make_env('simple_speaker_listener') 6 | After producing the env object, can be used similarly to an OpenAI gym 7 | environment. 8 | 9 | A policy using this environment must output actions in the form of a list 10 | for all agents. Each element of the list should be a numpy array, 11 | of size (env.world.dim_p + env.world.dim_c, 1). Physical actions precede 12 | communication actions in this array. See environment.py for more details. 13 | """ 14 | import numpy as np 15 | import time 16 | 17 | 18 | def make_env(args): 19 | scenario_name = args['game_name'] 20 | ''' 21 | Creates a MultiAgentEnv object as env. This can be used similar to a gym 22 | environment by calling env.reset() and env.step(). 
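    In this repository the options are passed in through a single ``args`` dict
    (e.g. args['game_name'], args['benchmark'], args['obs_sort'], args['reward_func'],
    args['num_good'], args['num_adversaries']); see the __main__ block at the
    bottom of this file for a runnable example of building such a dict.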
23 | Use env.render() to view the environment on the screen. 24 | 25 | Input: 26 | scenario_name : name of the scenario from ./scenarios/ to be Returns 27 | (without the .py extension) 28 | benchmark : whether you want to produce benchmarking data 29 | (usually only done during evaluation) 30 | 31 | Some useful env properties (see environment.py): 32 | .observation_space : Returns the observation space for each agent 33 | .action_space : Returns the action space for each agent 34 | .n : Returns the number of Agents 35 | ''' 36 | from game.particle.multiagent.environment import MultiAgentEnv 37 | import game.particle.multiagent.scenarios as scenarios 38 | 39 | # load scenario from script 40 | scenario = scenarios.load(scenario_name + ".py").Scenario() 41 | # create world 42 | world = scenario.make_world(args) 43 | # create multiagent environment 44 | if args['benchmark'] and not args['obs_sort']: 45 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, scenario.benchmark_data, args=args) 46 | elif not args['benchmark'] and args['obs_sort']: 47 | if args["reward_func"] == "reward2": 48 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward2, scenario.observation, scenario.observation_sort, scenario.is_done2, args=args) 49 | elif args["reward_func"] == "reward3": 50 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward3, scenario.observation3, scenario.observation_sort3, scenario.is_done3, args=args) 51 | else: 52 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, 53 | scenario.observation_sort, scenario.is_done, args=args) 54 | elif not args['benchmark'] and not args['obs_sort']: 55 | if args["reward_func"] == "reward2": 56 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward2, scenario.observation, args=args) 57 | elif args["reward_func"] == "reward3": 58 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward3, scenario.observation3, args=args) 59 | else: 60 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, args=args) 61 | else: 62 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward2, scenario.observation, args=args) 63 | return env 64 | 65 | 66 | # test 67 | def action(obs): 68 | if env.discrete_action_space: 69 | i = np.random.randint(0, 5) 70 | u = np.zeros(5) 71 | u[i] = 1 72 | else: 73 | u = np.array([(np.random.random() - 0.5) * 2, (np.random.random() - 0.5) * 2]) 74 | return u 75 | 76 | 77 | if __name__ == '__main__': 78 | args = dict() 79 | args['game_name'] = "simple_spread_old" 80 | args['benchmark'] = False 81 | args['obs_sort'] = False 82 | args['reward_func'] = 'reward' 83 | args['restrict_move'] = True 84 | args['num_adversaries'] = 0 85 | args['num_good'] = 6 86 | env = make_env(args) 87 | print(env.action_space) 88 | env.render() 89 | # create interactive policies for each agent 90 | # execution loop 91 | obs_n = env.reset() 92 | print(env.observation_space) 93 | print(env.action_space) 94 | 95 | for ep in range(100): 96 | obs_n = env.reset() 97 | step = 0 98 | reward = np.zeros(env.n) 99 | done = [False for i in range(env.n)] 100 | while True: 101 | # query for action from each agent's policy 102 | act_n = [] 103 | for i in range(env.n): 104 | act_n.append(action(obs_n[i])) 105 | #print(act_n) 106 | #print(act_n) 107 | # step environment 108 | obs_n, reward_n, done_n, _ = env.step(act_n) 109 | for i in range(env.n): 110 | if not done[i]: 111 | done[i] = done_n[i] 112 | reward += reward_n 113 | 
#print(obs_n) 114 | # render all agent views 115 | #time.sleep(0.1) 116 | env.render() 117 | step += 1 118 | if step > 100 or all((done_n[i] is True for i in range(env.n))): 119 | print(step, reward, done_n) 120 | break 121 | 122 | 123 | -------------------------------------------------------------------------------- /game/particle/multiagent/scenarios/simple_spread.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from game.particle.multiagent.core import World, Agent, Landmark 3 | from game.particle.multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self, args=None): 8 | world = World() 9 | # set any world properties first 10 | world.dim_c = 2 11 | num_agents = 10 12 | num_landmarks = 10 13 | if args is not None and args['num_good'] != 0: 14 | num_landmarks = args['num_good'] 15 | num_agents = num_landmarks 16 | world.cam_range = 4 17 | world.collaborative = False 18 | # add agents 19 | world.agents = [Agent() for i in range(num_agents)] 20 | for i, agent in enumerate(world.agents): 21 | agent.name = 'agent %d' % i 22 | agent.collide = True 23 | agent.silent = True 24 | #agent.size = 0.15 25 | # add landmarks 26 | world.landmarks = [Landmark() for i in range(num_landmarks)] 27 | for i, landmark in enumerate(world.landmarks): 28 | landmark.name = 'landmark %d' % i 29 | landmark.collide = False 30 | landmark.movable = False 31 | # make initial conditions 32 | self.reset_world(world) 33 | return world 34 | 35 | def reset_world(self, world): 36 | # random properties for agents 37 | for i, agent in enumerate(world.agents): 38 | agent.color = np.array([0.35, 0.35, 0.85]) 39 | # random properties for landmarks 40 | for i, landmark in enumerate(world.landmarks): 41 | landmark.color = np.array([0.25, 0.25, 0.25]) 42 | # set random initial states 43 | for agent in world.agents: 44 | agent.state.p_pos = np.random.uniform(-world.cam_range, +world.cam_range, world.dim_p) 45 | agent.state.p_vel = np.zeros(world.dim_p) 46 | agent.state.c = np.zeros(world.dim_c) 47 | for i, landmark in enumerate(world.landmarks): 48 | landmark.state.p_pos = np.random.uniform(-world.cam_range, +world.cam_range, world.dim_p) 49 | landmark.state.p_vel = np.zeros(world.dim_p) 50 | 51 | def benchmark_data(self, agent, world): 52 | rew = 0 53 | collisions = 0 54 | occupied_landmarks = 0 55 | min_dists = 0 56 | for l in world.landmarks: 57 | dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents] 58 | min_dists += min(dists) 59 | rew -= min(dists) 60 | if min(dists) < 0.1: 61 | occupied_landmarks += 1 62 | if agent.collide: 63 | for a in world.agents: 64 | if self.is_collision(a, agent): 65 | rew -= 1 66 | collisions += 1 67 | return (rew, collisions, min_dists, occupied_landmarks) 68 | 69 | 70 | def is_collision(self, agent1, agent2): 71 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 72 | dist = np.sqrt(np.sum(np.square(delta_pos))) 73 | dist_min = agent1.size + agent2.size 74 | return True if dist < dist_min else False 75 | 76 | # def reward(self, agent, world): 77 | # # Agents are rewarded based on minimum agent distance to each landmark, penalized for collisions 78 | # rew = 0 79 | # 80 | # # for l in world.landmarks: 81 | # # dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents] 82 | # # rew -= min(dists) 83 | # dists = [np.sqrt(np.sum(np.square(agent.state.p_pos - l.state.p_pos))) for l in world.landmarks] 84 | # rew -= min(dists) 85 | 
# if agent.collide: 86 | # for a in world.agents: 87 | # if self.is_collision(a, agent): 88 | # rew -= 1 89 | # return rew 90 | def reward(self, agent, world): 91 | # Agents are rewarded based on minimum agent distance to each landmark, penalized for collisions 92 | rew = 0 93 | agentIndex = 0 94 | for i, a in enumerate(world.agents): 95 | if a.name == agent.name: 96 | agentIndex = i 97 | break 98 | dists = np.sqrt(np.sum(np.square(agent.state.p_pos - world.landmarks[agentIndex].state.p_pos))) 99 | # if self.is_collision(agent, world.landmarks[agentIndex]): 100 | # rew = 1 101 | rew -= dists 102 | return rew 103 | 104 | def observation(self, agent, world): 105 | # get positions of all entities in this agent's reference frame 106 | entity_pos = [] 107 | for entity in world.landmarks: # world.entities: 108 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 109 | # entity colors 110 | entity_color = [] 111 | for entity in world.landmarks: # world.entities: 112 | entity_color.append(entity.color) 113 | # communication of all other agents 114 | comm = [] 115 | other_pos = [] 116 | for other in world.agents: 117 | if other is agent: continue 118 | comm.append(other.state.c) 119 | other_pos.append(other.state.p_pos - agent.state.p_pos) 120 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + comm) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MAPTF 2 | 3 | Source code for paper: An Efficient Transfer Learning Framework for Multiagent Reinforcement Learning 4 | 5 | * [MAPTF code](#MAPTF code) 6 | * [Installation](#Installation) 7 | * [Run an experiment](#Run an experiment) 8 | * [Example](#Example) 9 | * [Results](#results) 10 | * [Configuration](#Configuration) 11 | * [Operating parameters](#Operating parameters) 12 | * [Core parameters](#Core parameters) 13 | * [Some experiences setting in paper](#Some experiences setting in paper) 14 | * [In BibTeX format](#In BibTeX format) 15 | 16 | ## MAPTF code 17 | * MAPTF 18 | * alg (multiagent polices) 19 | * maddpg 20 | * muti_ptf_ppo 21 | * sharing_multi_ppo 22 | * option 23 | * config (Configuration parameters of each polices) 24 | * maddpg_conf (including maddpg and maddpg_sr) 25 | * ppo_config (including ppo sro shppo and shsro) 26 | * particle_conf (Configuration of particle game ) 27 | * pacman_conf (Configuration of pacman game) 28 | * run (execute the tasks) 29 | * run_maddpg_sr (including maddpg and maddpg_sr) 30 | * run_multi_ptf_ppo_sro (including ppo sro) 31 | * run_multi_ptf_shppo_sro (including shppo and shsro) 32 | * source (opponent policies) 33 | * util 34 | * main (entry function) 35 | 36 | ## Installation 37 | python==3.6.5 38 | 39 | pip install -r requirements.txt 40 | 41 | ## Running Example 42 | 43 | #### Example 44 | 45 | ``` 46 | #MAPTF-PPO Pacman 47 | python main.py -a multi_ppo -c ppo_conf -g pacman -d pacman_conf game_name=originalClassic num_adversaries=1 adv_load_model=True adv_load_model_path=source/pacman/original/0/model 48 | ``` 49 | ``` 50 | #MAPTF-PPO Predator-prey 4 51 | python main.py -a multi_ppo -c ppo_conf -g particle -d particle_conf game_name=simple_tag num_adversaries=3 good_use_option=False good_load_model=True good_load_model_path=source/simple_tag/tag4/model_30000 c1=0.001 52 | ``` 53 | some logs will be shown below: 54 | ``` 55 | INFO:tensorflow:Restoring parameters from source/pacman/original/0/model_0.ckpt 56 | win : [False, 
False, False, False], step : 100, discounted_reward : [ 0.61213843 -0.63762798 -0.63762798 -0.63762798], discount_reward_mean : [ 0.61213843 -0.63762798 -0.63762798 -0.63762798], undiscounted_reward : [ 0.31 -1.01 -1.01 -1.01], reward_mean : [ 0.31 -1.01 -1.01 -1.01], episode : 0, 57 | win : [False, False, False, False], step : 100, discounted_reward : [ 0.58945708 -0.63762798 -0.63762798 -0.63762798], discount_reward_mean : [ 0.60079775 -0.63762798 -0.63762798 -0.63762798], undiscounted_reward : [ 0.31 -1.01 -1.01 -1.01], reward_mean : [ 0.31 -1.01 -1.01 -1.01], episode : 1, 58 | ``` 59 | 60 | #### Results 61 | 62 | All results are stored in the `results/alg_name/game_type/game_name/time` folder; each run folder contains `graph`, `log`, `model`, `output`, `args.json` and `command.txt`. 63 | 64 | If you do not want to save `graph` and `model`, set the parameter `save_model=False`. 65 | * `graph`: run `tensorboard --logdir=path` to inspect the TensorFlow graph and the losses. 66 | * `log`: the output printed to the terminal. 67 | * `model`: models saved every `save_per_episodes` episodes. 68 | * `output.json`: reward results. 69 | * `args.json`: stores all parameters. 70 | * `command.txt`: the shell command used to launch the run. 71 | 72 | ## Source Policy 73 | 74 | Source policies are pre-trained opponent policies. For example, in Pac-Man the Pac-Man agent is the opponent and its policy is a pre-trained PPO; in predator-prey, the blue circle agents are pre-trained with PPO. Test mode (`-t` together with `load_model`) reloads a model so that it can be rendered. 75 | 76 | ## Configuration 77 | 78 | The config files act as defaults for an algorithm or environment. 79 | 80 | They are all located in `config`. 81 | 82 | #### Operating parameters 83 | 84 | Taking the example above: 85 | * `-a multi_ppo`: choose an algorithm. 86 | * `-c ppo_conf`: choose the corresponding algorithm configuration. 87 | * `-g pacman`: game type. 88 | * `-d pacman_conf`: game configuration. 89 | * `-t`: evaluate the results by setting `-t True`; the default is `-t False` (see the sketch after this list). 90 | * `game_name=originalClassic`: choose a game environment. 91 | * `num_adversaries=1`: set as needed. 92 | * `adv_load_model=True adv_load_model_path=source/pacman/original/0/model`: load a source policy. 93 | * `adv_use_option, good_use_option`: enable the option module by setting them to `True`; `False` is the default. When learning ppo, shppo and maddpg keep them `False`, otherwise set them to `True` as needed.
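
For evaluation, a hypothetical command is sketched below. It simply re-uses the Pac-Man training command from the example above and adds `-t True`; whether a matching trained model is available to reload depends on your own setup.

```
# evaluation sketch: the Pac-Man training example plus -t True
python main.py -a multi_ppo -c ppo_conf -g pacman -d pacman_conf -t True game_name=originalClassic num_adversaries=1 adv_load_model=True adv_load_model_path=source/pacman/original/0/model
```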
94 | 95 | #### Core parameters 96 | 97 | Default: 98 | * `option_layer_1=128, option_layer_2=128` 99 | * `learning_rate_r=0.0003` 100 | * `embedding_dim=32` 101 | * `option_embedding_layer=64` 102 | * `recon_loss_coef=0.1` 103 | * `option_batch_size=32` 104 | * `c1=0.005` 105 | * `e_greedy_increment=0.001` 106 | * `learning_rate_o=0.00001, learning_rate_t=0.00001` 107 | * `xi=0.005` 108 | 109 | #### Some experiences setting in paper 110 | ``` 111 | #ppo+sro, game type=pacman, game environment=mediumClassic 112 | c1=0.005 113 | ``` 114 | ``` 115 | #ppo+sro, game type=pacman, game environment=originalClassic 116 | option_batch_size=128 117 | c1=0.0005 118 | ``` 119 | ``` 120 | #maddpg+sro, game type=particle, game environment=simple_tag 121 | option_layer_1=128 option_layer_2=128 122 | learning_rate_o=0.00001 learning_rate_t=0.00001 123 | c1=0.005 124 | xi=0 125 | ``` 126 | ``` 127 | #ppo+sro, game type=particle, game environment=simple_tag 128 | option_layer_1=32 option_layer_2=32 129 | c1=0.1 130 | option_batch_size=128 131 | ``` 132 | ``` 133 | #shsro, game type=particle, game environment=simple_tag 134 | option_layer_1=32 option_layer_2=32 135 | c1=0.1 136 | ``` 137 | 138 | MADDPG code follows: https://github.com/openai/maddpg 139 | 140 | ## In BibTeX format: 141 | 142 | ```tex 143 | @article{yang2021efficient, 144 | title={An Efficient Transfer Learning Framework for Multiagent Reinforcement Learning}, 145 | author={Yang, Tianpei and Wang, Weixun and Tang, Hongyao and Hao, Jianye and Meng, Zhaopeng and Mao, Hangyu and Li, Dong and Liu, Wulong and Chen, Yingfeng and Hu, Yujing and others}, 146 | journal={Advances in Neural Information Processing Systems}, 147 | volume={34}, 148 | year={2021} 149 | } 150 | ``` 151 | -------------------------------------------------------------------------------- /game/particle/README.md: -------------------------------------------------------------------------------- 1 | **Status:** Archive (code is provided as-is, no updates expected) 2 | 3 | # Multi-Agent Particle Environment 4 | 5 | A simple multi-agent particle world with a continuous observation and discrete action space, along with some basic simulated physics. 6 | Used in the paper [Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments](https://arxiv.org/pdf/1706.02275.pdf). 7 | 8 | ## Getting started: 9 | 10 | - To install, `cd` into the root directory and type `pip install -e .` 11 | 12 | - To interactively view moving to landmark scenario (see others in ./scenarios/): 13 | `bin/interactive.py --scenario simple.py` 14 | 15 | - Known dependencies: Python (3.5.4), OpenAI gym (0.10.5), numpy (1.14.5) 16 | 17 | - To use the environments, look at the code for importing them in `make_env.py`. 18 | 19 | ## Code structure 20 | 21 | - `make_env.py`: contains code for importing a multiagent environment as an OpenAI Gym-like object. 22 | 23 | - `./multiagent/environment.py`: contains code for environment simulation (interaction physics, `_step()` function, etc.) 24 | 25 | - `./multiagent/core.py`: contains classes for various objects (Entities, Landmarks, Agents, etc.) that are used throughout the code. 26 | 27 | - `./multiagent/rendering.py`: used for displaying agent behaviors on the screen. 28 | 29 | - `./multiagent/policy.py`: contains code for interactive policy based on keyboard input. 30 | 31 | - `./multiagent/scenario.py`: contains base scenario object that is extended for all scenarios. 32 | 33 | - `./multiagent/scenarios/`: folder where various scenarios/ environments are stored. 
scenario code consists of several functions: 34 | 1) `make_world()`: creates all of the entities that inhabit the world (landmarks, agents, etc.), assigns their capabilities (whether they can communicate, or move, or both). 35 | called once at the beginning of each training session 36 | 2) `reset_world()`: resets the world by assigning properties (position, color, etc.) to all entities in the world 37 | called before every episode (including after make_world() before the first episode) 38 | 3) `reward()`: defines the reward function for a given agent 39 | 4) `observation()`: defines the observation space of a given agent 40 | 5) (optional) `benchmark_data()`: provides diagnostic data for policies trained on the environment (e.g. evaluation metrics) 41 | 42 | ### Creating new environments 43 | 44 | You can create new scenarios by implementing the first 4 functions above (`make_world()`, `reset_world()`, `reward()`, and `observation()`). 45 | 46 | ## List of environments 47 | 48 | 49 | | Env name in code (name in paper) | Communication? | Competitive? | Notes | 50 | | --- | --- | --- | --- | 51 | | `simple.py` | N | N | Single agent sees landmark position, rewarded based on how close it gets to landmark. Not a multiagent environment -- used for debugging policies. | 52 | | `simple_adversary.py` (Physical deception) | N | Y | 1 adversary (red), N good agents (green), N landmarks (usually N=2). All agents observe position of landmarks and other agents. One landmark is the ‘target landmark’ (colored green). Good agents rewarded based on how close one of them is to the target landmark, but negatively rewarded if the adversary is close to target landmark. Adversary is rewarded based on how close it is to the target, but it doesn’t know which landmark is the target landmark. So good agents have to learn to ‘split up’ and cover all landmarks to deceive the adversary. | 53 | | `simple_crypto.py` (Covert communication) | Y | Y | Two good agents (alice and bob), one adversary (eve). Alice must sent a private message to bob over a public channel. Alice and bob are rewarded based on how well bob reconstructs the message, but negatively rewarded if eve can reconstruct the message. Alice and bob have a private key (randomly generated at beginning of each episode), which they must learn to use to encrypt the message. | 54 | | `simple_push.py` (Keep-away) | N |Y | 1 agent, 1 adversary, 1 landmark. Agent is rewarded based on distance to landmark. Adversary is rewarded if it is close to the landmark, and if the agent is far from the landmark. So the adversary learns to push agent away from the landmark. | 55 | | `simple_reference.py` | Y | N | 2 agents, 3 landmarks of different colors. Each agent wants to get to their target landmark, which is known only by other agent. Reward is collective. So agents have to learn to communicate the goal of the other agent, and navigate to their landmark. This is the same as the simple_speaker_listener scenario where both agents are simultaneous speakers and listeners. | 56 | | `simple_speaker_listener.py` (Cooperative communication) | Y | N | Same as simple_reference, except one agent is the ‘speaker’ (gray) that does not move (observes goal of other agent), and other agent is the listener (cannot speak, but must navigate to correct landmark).| 57 | | `simple_spread.py` (Cooperative navigation) | N | N | N agents, N landmarks. Agents are rewarded based on how far any agent is from each landmark. Agents are penalized if they collide with other agents. 
So, agents have to learn to cover all the landmarks while avoiding collisions. | 58 | | `simple_tag.py` (Predator-prey) | N | Y | Predator-prey environment. Good agents (green) are faster and want to avoid being hit by adversaries (red). Adversaries are slower and want to hit good agents. Obstacles (large black circles) block the way. | 59 | | `simple_world_comm.py` | Y | Y | Environment seen in the video accompanying the paper. Same as simple_tag, except (1) there is food (small blue balls) that the good agents are rewarded for being near, (2) we now have ‘forests’ that hide agents inside from being seen from outside; (3) there is a ‘leader adversary” that can see the agents at all times, and can communicate with the other adversaries to help coordinate the chase. | 60 | 61 | ## Paper citation 62 | 63 | If you used this environment for your experiments or found it helpful, consider citing the following papers: 64 | 65 | Environments in this repo: 66 |
67 | @article{lowe2017multi,
68 |   title={Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments},
69 |   author={Lowe, Ryan and Wu, Yi and Tamar, Aviv and Harb, Jean and Abbeel, Pieter and Mordatch, Igor},
70 |   journal={Neural Information Processing Systems (NIPS)},
71 |   year={2017}
72 | }
73 | 
74 | 75 | Original particle world environment: 76 |
77 | @article{mordatch2017emergence,
78 |   title={Emergence of Grounded Compositional Language in Multi-Agent Populations},
79 |   author={Mordatch, Igor and Abbeel, Pieter},
80 |   journal={arXiv preprint arXiv:1703.04908},
81 |   year={2017}
82 | }
83 | 
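
As a complement to the "Creating new environments" section above, here is a minimal scenario sketch. It follows the shape of the existing scenarios in this repository (e.g. `simple_spread.py`); the single agent, single landmark and distance-based reward are purely illustrative, and the module (saved as, say, `my_scenario.py` under `./multiagent/scenarios/`) must expose a class named `Scenario`, since `make_env.py` loads scenarios via `scenarios.load(name + ".py").Scenario()`.

```
# my_scenario.py -- hypothetical example, not part of the repository
import numpy as np
from game.particle.multiagent.core import World, Agent, Landmark
from game.particle.multiagent.scenario import BaseScenario


class Scenario(BaseScenario):
    def make_world(self, args=None):
        world = World()
        world.cam_range = 1
        # one movable agent and one fixed landmark, just for illustration
        world.agents = [Agent() for _ in range(1)]
        world.landmarks = [Landmark() for _ in range(1)]
        for i, agent in enumerate(world.agents):
            agent.name = 'agent %d' % i
            agent.collide = False
            agent.silent = True
        for i, landmark in enumerate(world.landmarks):
            landmark.name = 'landmark %d' % i
            landmark.collide = False
            landmark.movable = False
        self.reset_world(world)
        return world

    def reset_world(self, world):
        for agent in world.agents:
            agent.color = np.array([0.35, 0.35, 0.85])
            agent.state.p_pos = np.random.uniform(-world.cam_range, +world.cam_range, world.dim_p)
            agent.state.p_vel = np.zeros(world.dim_p)
            agent.state.c = np.zeros(world.dim_c)
        for landmark in world.landmarks:
            landmark.color = np.array([0.25, 0.25, 0.25])
            landmark.state.p_pos = np.random.uniform(-world.cam_range, +world.cam_range, world.dim_p)
            landmark.state.p_vel = np.zeros(world.dim_p)

    def reward(self, agent, world):
        # illustrative reward: negative distance to the single landmark
        return -np.sqrt(np.sum(np.square(agent.state.p_pos - world.landmarks[0].state.p_pos)))

    def observation(self, agent, world):
        # own velocity and position plus relative landmark positions
        entity_pos = [l.state.p_pos - agent.state.p_pos for l in world.landmarks]
        return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos)
```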
84 | -------------------------------------------------------------------------------- /game/pacman/layout.py: -------------------------------------------------------------------------------- 1 | # layout.py 2 | # --------- 3 | # Licensing Information: You are free to use or extend these projects for 4 | # educational purposes provided that (1) you do not distribute or publish 5 | # solutions, (2) you retain this notice, and (3) you provide clear 6 | # attribution to UC Berkeley, including a link to http://ai.berkeley.edu. 7 | # 8 | # Attribution Information: The Pacman AI projects were developed at UC Berkeley. 9 | # The core projects and autograders were primarily created by John DeNero 10 | # (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 11 | # Student side autograding was added by Brad Miller, Nick Hay, and 12 | # Pieter Abbeel (pabbeel@cs.berkeley.edu). 13 | 14 | 15 | from game.pacman.util import manhattanDistance 16 | from game.pacman.game import Grid 17 | import os 18 | import random 19 | from functools import reduce 20 | 21 | VISIBILITY_MATRIX_CACHE = {} 22 | 23 | 24 | class Layout: 25 | """ 26 | A Layout manages the static information about the game board. 27 | """ 28 | 29 | def __init__(self, layoutText): 30 | self.width = len(layoutText[0]) 31 | self.height = len(layoutText) 32 | self.walls = Grid(self.width, self.height, False) 33 | self.food = Grid(self.width, self.height, False) 34 | self.capsules = [] 35 | self.agentPositions = [] 36 | self.numGhosts = 0 37 | self.processLayoutText(layoutText) 38 | self.layoutText = layoutText 39 | self.totalFood = len(self.food.asList()) 40 | # self.initializeVisibilityMatrix() 41 | 42 | def getNumGhosts(self): 43 | return self.numGhosts 44 | 45 | def initializeVisibilityMatrix(self): 46 | global VISIBILITY_MATRIX_CACHE 47 | if reduce(str.__add__, self.layoutText) not in VISIBILITY_MATRIX_CACHE: 48 | from game import Directions 49 | vecs = [(-0.5, 0), (0.5, 0), (0, -0.5), (0, 0.5)] 50 | dirs = [Directions.NORTH, Directions.SOUTH, 51 | Directions.WEST, Directions.EAST] 52 | vis = Grid(self.width, self.height, {Directions.NORTH: set(), Directions.SOUTH: set( 53 | ), Directions.EAST: set(), Directions.WEST: set(), Directions.STOP: set()}) 54 | for x in range(self.width): 55 | for y in range(self.height): 56 | if self.walls[x][y] == False: 57 | for vec, direction in zip(vecs, dirs): 58 | dx, dy = vec 59 | nextx, nexty = x + dx, y + dy 60 | while (nextx + nexty) != int(nextx) + int(nexty) or not self.walls[int(nextx)][int(nexty)]: 61 | vis[x][y][direction].add((nextx, nexty)) 62 | nextx, nexty = x + dx, y + dy 63 | self.visibility = vis 64 | VISIBILITY_MATRIX_CACHE[reduce(str.__add__, self.layoutText)] = vis 65 | else: 66 | self.visibility = VISIBILITY_MATRIX_CACHE[ 67 | reduce(str.__add__, self.layoutText)] 68 | 69 | def isWall(self, pos): 70 | x, col = pos 71 | return self.walls[x][col] 72 | 73 | def getRandomLegalPosition(self): 74 | x = random.choice(list(range(self.width))) 75 | y = random.choice(list(range(self.height))) 76 | while self.isWall((x, y)): 77 | x = random.choice(list(range(self.width))) 78 | y = random.choice(list(range(self.height))) 79 | return (x, y) 80 | 81 | def getRandomCorner(self): 82 | poses = [(1, 1), (1, self.height - 2), (self.width - 2, 1), 83 | (self.width - 2, self.height - 2)] 84 | return random.choice(poses) 85 | 86 | def getFurthestCorner(self, pacPos): 87 | poses = [(1, 1), (1, self.height - 2), (self.width - 2, 1), 88 | (self.width - 2, self.height - 2)] 89 | dist, pos = 
max([(manhattanDistance(p, pacPos), p) for p in poses]) 90 | return pos 91 | 92 | def isVisibleFrom(self, ghostPos, pacPos, pacDirection): 93 | row, col = [int(x) for x in pacPos] 94 | return ghostPos in self.visibility[row][col][pacDirection] 95 | 96 | def __str__(self): 97 | return "\n".join(self.layoutText) 98 | 99 | def deepCopy(self): 100 | return Layout(self.layoutText[:]) 101 | 102 | def processLayoutText(self, layoutText): 103 | """ 104 | Coordinates are flipped from the input format to the (x,y) convention here 105 | 106 | The shape of the maze. Each character 107 | represents a different type of object. 108 | % - Wall 109 | . - Food 110 | o - Capsule 111 | G - Ghost 112 | P - Pacman 113 | Other characters are ignored. 114 | """ 115 | maxY = self.height - 1 116 | for y in range(self.height): 117 | for x in range(self.width): 118 | layoutChar = layoutText[maxY - y][x] 119 | self.processLayoutChar(x, y, layoutChar) 120 | # (x1, y1) = self.getRandomLegalPosition() 121 | # self.agentPositions.append((0, (x1, y1))) 122 | # for i in range(self.numGhosts): 123 | # (x1, y1) = self.getRandomLegalPosition() 124 | # self.agentPositions.append((1, (x1, y1))) 125 | self.agentPositions.sort() 126 | self.agentPositions = [(i == 0, pos) for i, pos in self.agentPositions] 127 | 128 | def processLayoutChar(self, x, y, layoutChar): 129 | if layoutChar == '%': 130 | self.walls[x][y] = True 131 | elif layoutChar == '.': 132 | self.food[x][y] = True 133 | elif layoutChar == 'o': 134 | self.capsules.append((x, y)) 135 | elif layoutChar == 'P': 136 | self.agentPositions.append((0, (x, y))) 137 | #(x1, y1) = self.getRandomLegalPosition() 138 | #self.agentPositions.append((0, (x1, y1))) 139 | elif layoutChar in ['G']: 140 | self.agentPositions.append((1, (x, y))) 141 | #(x1, y1) = self.getRandomLegalPosition() 142 | #self.agentPositions.append((1, (x1, y1))) 143 | self.numGhosts += 1 144 | elif layoutChar in ['1', '2', '3', '4']: 145 | self.agentPositions.append((int(layoutChar), (x, y))) 146 | self.numGhosts += 1 147 | 148 | 149 | def getLayout(name, back=2): 150 | # print('1:', os.getcwd()) 151 | # print(os.path.abspath(__file__)) 152 | if name.endswith('.lay'): 153 | layout = tryToLoad(os.getcwd() + '/game/pacman/layouts/' + name) 154 | print(os.getcwd() + '/game/pacman/layouts/' + name) 155 | if layout == None: 156 | layout = tryToLoad(name) 157 | else: 158 | layout = tryToLoad(os.getcwd() + '/game/pacman/' + 'layouts/' + name + '.lay') 159 | print(os.getcwd() + '/game/pacman/' + 'layouts/' + name + '.lay') 160 | if layout == None: 161 | layout = tryToLoad(name + '.lay') 162 | if layout == None and back >= 0: 163 | curdir = os.path.abspath('.') 164 | os.chdir('..') 165 | layout = getLayout(name, back - 1) 166 | os.chdir(curdir) 167 | return layout 168 | 169 | 170 | def tryToLoad(fullname): 171 | if(not os.path.exists(fullname)): 172 | return None 173 | f = open(fullname) 174 | try: 175 | return Layout([line.strip() for line in f]) 176 | finally: 177 | f.close() 178 | -------------------------------------------------------------------------------- /game/particle/multiagent/scenarios/simple_tag.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from game.particle.multiagent.core import World, Agent, Landmark 3 | from game.particle.multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self, args=None): 8 | world = World() 9 | # set any world properties first 10 | world.dim_c = 2 11 | num_good_agents 
= 1 12 | num_adversaries = 6 13 | if args is not None and args['num_good'] != 0: 14 | num_good_agents = args['num_good'] 15 | if args is not None and args['num_adversaries'] != 0: 16 | num_adversaries = args['num_adversaries'] 17 | world.cam_range = 1 18 | num_agents = num_adversaries + num_good_agents 19 | num_landmarks = 2 20 | # add agents 21 | world.agents = [Agent() for i in range(num_agents)] 22 | for i, agent in enumerate(world.agents): 23 | agent.name = 'agent %d' % i 24 | agent.collide = True 25 | agent.silent = True 26 | agent.adversary = True if i < num_adversaries else False 27 | agent.size = 0.075 if agent.adversary else 0.05 28 | agent.accel = 3.0 if agent.adversary else 4.0 29 | #agent.accel = 20.0 if agent.adversary else 25.0 30 | agent.max_speed = 1.0 if agent.adversary else 1.3 31 | # add landmarks 32 | world.landmarks = [Landmark() for i in range(num_landmarks)] 33 | for i, landmark in enumerate(world.landmarks): 34 | landmark.name = 'landmark %d' % i 35 | landmark.collide = True 36 | landmark.movable = False 37 | landmark.size = 0.2 38 | landmark.boundary = False 39 | # make initial conditions 40 | self.reset_world(world) 41 | return world 42 | 43 | def reset_world(self, world): 44 | # random properties for agents 45 | for i, agent in enumerate(world.agents): 46 | agent.color = np.array([0.35, 0.85, 0.35]) if not agent.adversary else np.array([0.85, 0.35, 0.35]) 47 | # random properties for landmarks 48 | for i, landmark in enumerate(world.landmarks): 49 | landmark.color = np.array([0.25, 0.25, 0.25]) 50 | # set random initial states 51 | for agent in world.agents: 52 | agent.state.p_pos = np.random.uniform(-world.cam_range, +world.cam_range, world.dim_p) 53 | agent.state.p_vel = np.zeros(world.dim_p) 54 | agent.state.c = np.zeros(world.dim_c) 55 | for i, landmark in enumerate(world.landmarks): 56 | if not landmark.boundary: 57 | landmark.state.p_pos = np.random.uniform(-world.cam_range, +world.cam_range, world.dim_p) 58 | landmark.state.p_vel = np.zeros(world.dim_p) 59 | 60 | def benchmark_data(self, agent, world): 61 | # returns data for benchmarking purposes 62 | if agent.adversary: 63 | collisions = 0 64 | for a in self.good_agents(world): 65 | if self.is_collision(a, agent): 66 | collisions += 1 67 | return collisions 68 | else: 69 | return 0 70 | 71 | def is_collision(self, agent1, agent2): 72 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 73 | dist = np.sqrt(np.sum(np.square(delta_pos))) 74 | dist_min = agent1.size + agent2.size 75 | return True if dist < dist_min else False 76 | 77 | # return all agents that are not adversaries 78 | def good_agents(self, world): 79 | return [agent for agent in world.agents if not agent.adversary] 80 | 81 | # return all adversarial agents 82 | def adversaries(self, world): 83 | return [agent for agent in world.agents if agent.adversary] 84 | 85 | def reward(self, agent, world): 86 | # Agents are rewarded based on minimum agent distance to each landmark 87 | main_reward = self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 88 | return main_reward 89 | 90 | def agent_reward(self, agent, world): 91 | # Agents are negatively rewarded if caught by adversaries 92 | rew = 0 93 | shape = False 94 | adversaries = self.adversaries(world) 95 | if shape: # reward can optionally be shaped (increased reward for increased distance from adversary) 96 | for adv in adversaries: 97 | rew += 0.1 * np.sqrt(np.sum(np.square(agent.state.p_pos - adv.state.p_pos))) 98 | if agent.collide: 99 | for a in 
adversaries: 100 | if self.is_collision(a, agent): 101 | rew -= 10 102 | 103 | # agents are penalized for exiting the screen, so that they can be caught by the adversaries 104 | def bound(x): 105 | if x < world.cam_range * 0.9: 106 | return 0 107 | if x < world.cam_range: 108 | return (x - 0.9 * world.cam_range) * 10 / world.cam_range 109 | return min(np.exp((2 * x - 2 * world.cam_range) / world.cam_range), 10) 110 | for p in range(world.dim_p): 111 | x = abs(agent.state.p_pos[p]) 112 | rew -= bound(x) 113 | return rew 114 | 115 | def adversary_reward(self, agent, world): 116 | # Adversaries are rewarded for collisions with agents 117 | rew = 0 118 | shape = False 119 | agents = self.good_agents(world) 120 | adversaries = self.adversaries(world) 121 | if shape: # reward can optionally be shaped (decreased reward for increased distance from agents) 122 | for adv in adversaries: 123 | rew -= 0.1 * min([np.sqrt(np.sum(np.square(a.state.p_pos - adv.state.p_pos))) for a in agents]) 124 | # if shape: # reward can optionally be shaped (decreased reward for increased distance from agents) 125 | # rew -= 0.1 * min([np.sqrt(np.sum(np.square(a.state.p_pos - agent.state.p_pos))) for a in agents]) 126 | 127 | if agent.collide: 128 | for ag in agents: 129 | agent_reward = 0 130 | for adv in adversaries: 131 | if self.is_collision(ag, adv): 132 | rew += 10 133 | # if self.is_collision(ag, agent): 134 | # rew += 10 135 | # 同一个agent被3个adversaries同时抓住才有分数,每一组最高分数30分 136 | # if agent_reward < 30: 137 | # rew += 0 138 | # else: 139 | # rew += 30 140 | return rew 141 | 142 | def observation(self, agent, world): 143 | # get positions of all entities in this agent's reference frame 144 | entity_pos = [] 145 | for entity in world.landmarks: 146 | if not entity.boundary: 147 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 148 | # communication of all other agents 149 | comm = [] 150 | other_pos = [] 151 | other_vel = [] 152 | for other in world.agents: 153 | if other is agent: continue 154 | comm.append(other.state.c) 155 | other_pos.append(other.state.p_pos - agent.state.p_pos) 156 | if not other.adversary: 157 | other_vel.append(other.state.p_vel) 158 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel) 159 | -------------------------------------------------------------------------------- /game/particle/multiagent/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | # physical/external base state of all entites 5 | class EntityState(object): 6 | def __init__(self): 7 | # physical position 8 | self.p_pos = None 9 | # physical velocity 10 | self.p_vel = None 11 | 12 | 13 | # state of agents (including communication and internal/mental state) 14 | class AgentState(EntityState): 15 | def __init__(self): 16 | super(AgentState, self).__init__() 17 | # communication utterance 18 | self.c = None 19 | 20 | 21 | # action of the agent 22 | class Action(object): 23 | def __init__(self): 24 | # physical action 25 | self.u = None 26 | # communication action 27 | self.c = None 28 | 29 | 30 | # properties and state of physical world entity 31 | class Entity(object): 32 | def __init__(self): 33 | # name 34 | self.name = '' 35 | # properties: 36 | self.size = 0.050 37 | # entity can move / be pushed 38 | self.movable = False 39 | # entity collides with others 40 | self.collide = True 41 | # material density (affects mass) 42 | self.density = 25.0 43 | # color 44 | self.color = None 45 | # max 
speed and accel 46 | self.max_speed = None 47 | self.accel = None 48 | # state 49 | self.state = EntityState() 50 | # mass 51 | self.initial_mass = 1.0 52 | 53 | @property 54 | def mass(self): 55 | return self.initial_mass 56 | 57 | 58 | # properties of landmark entities 59 | class Landmark(Entity): 60 | def __init__(self): 61 | super(Landmark, self).__init__() 62 | 63 | 64 | # properties of agent entities 65 | class Agent(Entity): 66 | def __init__(self): 67 | super(Agent, self).__init__() 68 | # agents are movable by default 69 | self.movable = True 70 | # cannot send communication signals 71 | self.silent = False 72 | # cannot observe the world 73 | self.blind = False 74 | # physical motor noise amount 75 | self.u_noise = None 76 | # communication noise amount 77 | self.c_noise = None 78 | # control range 79 | self.u_range = 1.0 80 | # state 81 | self.state = AgentState() 82 | # action 83 | self.action = Action() 84 | # script behavior to execute 85 | self.action_callback = None 86 | 87 | 88 | # multi-agent world 89 | class World(object): 90 | def __init__(self): 91 | # list of agents and entities (can change at execution-time!) 92 | self.agents = [] 93 | self.landmarks = [] 94 | # communication channel dimensionality 95 | self.dim_c = 0 96 | # position dimensionality 97 | self.dim_p = 2 98 | # color dimensionality 99 | self.dim_color = 3 100 | # simulation timestep 101 | self.dt = 0.1 102 | # physical damping 103 | self.damping = 0.25 104 | # contact response parameters 105 | self.contact_force = 1e+2 106 | self.contact_margin = 1e-3 107 | 108 | # return all entities in the world 109 | @property 110 | def entities(self): 111 | return self.agents + self.landmarks 112 | 113 | # return all agents controllable by external policies 114 | @property 115 | def policy_agents(self): 116 | return [agent for agent in self.agents if agent.action_callback is None] 117 | 118 | # return all agents controlled by world scripts 119 | @property 120 | def scripted_agents(self): 121 | return [agent for agent in self.agents if agent.action_callback is not None] 122 | 123 | # update state of the world 124 | def step(self, done=None): 125 | # set actions for scripted agents 126 | for agent in self.scripted_agents: 127 | agent.action = agent.action_callback(agent, self) 128 | # gather forces applied to entities 129 | p_force = [None] * len(self.entities) 130 | # apply agent physical controls 131 | p_force = self.apply_action_force(p_force, done) 132 | # apply environment forces 133 | p_force = self.apply_environment_force(p_force) 134 | # integrate physical state 135 | self.integrate_state(p_force, done) 136 | # update agent state 137 | for agent in self.agents: 138 | self.update_agent_state(agent) 139 | 140 | # gather agent action forces 141 | def apply_action_force(self, p_force, done=None): 142 | # set applied forces 143 | for i, agent in enumerate(self.agents): 144 | if agent.movable: 145 | noise = np.random.randn(*agent.action.u.shape) * agent.u_noise if agent.u_noise else 0.0 146 | p_force[i] = agent.action.u + noise 147 | return p_force 148 | 149 | # gather physical forces acting on entities 150 | def apply_environment_force(self, p_force): 151 | # simple (but inefficient) collision response 152 | for a, entity_a in enumerate(self.entities): 153 | for b,entity_b in enumerate(self.entities): 154 | if b <= a: continue 155 | [f_a, f_b] = self.get_collision_force(entity_a, entity_b) 156 | if f_a is not None: 157 | if p_force[a] is None: p_force[a] = 0.0 158 | p_force[a] = f_a + p_force[a] 159 | if f_b is not 
None: 160 | if p_force[b] is None: p_force[b] = 0.0 161 | p_force[b] = f_b + p_force[b] 162 | return p_force 163 | 164 | # integrate physical state 165 | def integrate_state(self, p_force, done=None): 166 | for i,entity in enumerate(self.entities): 167 | if not entity.movable: continue 168 | if entity.movable and done is not None and done[i]: continue 169 | entity.state.p_vel = entity.state.p_vel * (1 - self.damping) 170 | if p_force[i] is not None: 171 | entity.state.p_vel += (p_force[i] / entity.mass) * self.dt 172 | if entity.max_speed is not None: 173 | speed = np.sqrt(np.square(entity.state.p_vel[0]) + np.square(entity.state.p_vel[1])) 174 | if speed > entity.max_speed: 175 | entity.state.p_vel = entity.state.p_vel / np.sqrt(np.square(entity.state.p_vel[0]) + 176 | np.square(entity.state.p_vel[1])) * entity.max_speed 177 | entity.state.p_pos += entity.state.p_vel * self.dt 178 | 179 | def update_agent_state(self, agent): 180 | # set communication state (directly for now) 181 | if agent.silent: 182 | agent.state.c = np.zeros(self.dim_c) 183 | else: 184 | noise = np.random.randn(*agent.action.c.shape) * agent.c_noise if agent.c_noise else 0.0 185 | agent.state.c = agent.action.c + noise 186 | 187 | # get collision forces for any contact between two entities 188 | def get_collision_force(self, entity_a, entity_b): 189 | if (not entity_a.collide) or (not entity_b.collide): 190 | return [None, None] # not a collider 191 | if entity_a is entity_b: 192 | return [None, None] # don't collide against itself 193 | # compute actual distance between entities 194 | delta_pos = entity_a.state.p_pos - entity_b.state.p_pos 195 | dist = np.sqrt(np.sum(np.square(delta_pos))) 196 | # minimum allowable distance 197 | dist_min = entity_a.size + entity_b.size 198 | # softmax penetration 199 | k = self.contact_margin 200 | penetration = np.logaddexp(0, -(dist - dist_min)/k)*k 201 | if dist == 0: 202 | force = 0 203 | else: 204 | force = self.contact_force * delta_pos / dist * penetration 205 | force_a = +force if entity_a.movable else None 206 | force_b = -force if entity_b.movable else None 207 | return [force_a, force_b] 208 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import yaml 4 | import gym.spaces 5 | import sys 6 | import tensorflow as tf 7 | from gym.utils import seeding 8 | import random 9 | 10 | from alg import REGISTRY as alg_REGISTRY 11 | from game import REGISTRY as env_REGISTRY 12 | from run import REGISTRY as run_REGISTRY 13 | from util.logger import Logger 14 | import json 15 | import time 16 | 17 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 18 | 19 | 20 | def default(str): 21 | return str + ' [Default: %default]' 22 | 23 | 24 | def config_args(config_name): 25 | if config_name is not None: 26 | with open(os.path.join(os.path.dirname(__file__), "config", "{}.yaml".format(config_name)), "r") as f: 27 | try: 28 | #config_dict = yaml.load(f, Loader=yaml.FullLoader) 29 | config_dict = yaml.load(f) 30 | return config_dict 31 | except yaml.YAMLError as exc: 32 | assert False, "{}.yaml error: {}".format(config_name, exc) 33 | 34 | 35 | def readCommand(argv): 36 | """ 37 | Processes the command used to run main from the command line. 
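    Any extra `key=value` tokens on the command line override entries that were
    loaded from the algorithm / environment YAML configs; each value is cast to
    the type of the existing config entry (int, float, str, bool or list).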
38 | """ 39 | from optparse import OptionParser 40 | usageStr = """ 41 | USAGE: python main.py 42 | """ 43 | parser = OptionParser(usageStr) 44 | 45 | parser.add_option('-n', '--numGames', dest='numGames', type='int', 46 | help=default('the number of GAMES to play'), metavar='GAMES', default=20000) 47 | parser.add_option('-e', '--epi_step', dest='epi_step', type='int', 48 | help=default('the steps of each episode'), default=99) 49 | parser.add_option('-g', '--game', dest='game', 50 | help=default('use which GAME to play'), default='pacman') 51 | parser.add_option('-a', '--alg', dest='algorithm', 52 | help=default('use which algorithm to play'), default='multi_ppo') 53 | parser.add_option('-c', '--alg_conf', dest='algorithm_config', 54 | help=default('algorithm config'), default='ppo_conf.yaml') 55 | parser.add_option('-d', '--env_conf', dest='environment_config', 56 | help=default('Environment config'), default='pacman_conf') 57 | parser.add_option('-s', '--seed', dest='seed', type='int', 58 | help=default('the seed of tf'), default=1234) 59 | parser.add_option('-o', '--optimizer', dest='optimizer', 60 | help=default('the optimizer of tensorflow'), default='adam') 61 | parser.add_option('-t', '--run_test', dest='run_test', 62 | help=default('run test'), default=False) 63 | 64 | """ 65 | parser.add_option('-f', '--fileName', dest='fileName', 66 | help=default('the file name'), default='dqn_pinball') 67 | parser.add_option('-m', '--modelName', dest='modelName', 68 | help=default('the model name'), default='dqn_pinball') 69 | """ 70 | 71 | options, otherjunk = parser.parse_args(argv) 72 | # print(type(options)) 73 | 74 | alg_conf = options.algorithm_config 75 | env_conf = options.environment_config 76 | alg_config_dict = config_args(alg_conf) 77 | env_config_dict = config_args(env_conf) 78 | 79 | args = dict() 80 | args['numGames'] = options.numGames 81 | args['game'] = options.game 82 | args['algorithm'] = options.algorithm 83 | args['epi_step'] = options.epi_step 84 | args['seed'] = options.seed 85 | args['optimizer'] = options.optimizer 86 | args['run_test'] = options.run_test 87 | t = str(time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())) 88 | 89 | if alg_config_dict is not None: 90 | args = dict(args, **alg_config_dict) 91 | if env_config_dict is not None: 92 | args = dict(args, **env_config_dict) 93 | 94 | #args['fileName'] = options.fileName 95 | #args['optimizer'] = options.optimizer 96 | 97 | for item in otherjunk: 98 | key = item.split('=')[0] 99 | value = item.split('=')[1] 100 | #print(key, value) 101 | if key not in args: 102 | raise Exception('Command line input not understood: ' + str(item)) 103 | if type(args[key]) is int: 104 | args[key] = int(value) 105 | elif type(args[key]) is float: 106 | args[key] = float(value) 107 | elif type(args[key]) is str: 108 | args[key] = str(value) 109 | elif type(args[key]) is bool: 110 | if str(value).lower() == 'true': 111 | args[key] = True 112 | elif str(value).lower() == 'false': 113 | args[key] = False 114 | else: 115 | raise Exception('Command line input is not boolean type: ' + str(value)) 116 | elif type(args[key]) is list: 117 | try: 118 | args[key] = eval(value) 119 | except (SyntaxError, NameError): 120 | value_l = str(value).replace(' ', '').replace('[', '').replace(']', '').split(',') 121 | args[key] = value_l 122 | else: 123 | raise Exception('Command line input is not valid type: ' + str(value)) 124 | 125 | args['results_path'] = "../results/" + args['algorithm'] + "/" + args['game'] + "/" + args[ 126 | 'game_name'] + "/" + 
t + "/" 127 | 128 | if not args['run_test']: 129 | if not os.path.exists(args['results_path']): 130 | os.makedirs(args['results_path']) 131 | if not os.path.exists(args['results_path'] + args['SAVE_PATH']): 132 | os.makedirs(args['results_path'] + args['SAVE_PATH']) 133 | if not os.path.exists(args['results_path'] + args['graph_path']): 134 | os.makedirs(args['results_path'] + args['graph_path']) 135 | if not os.path.exists(args['results_path'] + args['reward_output']): 136 | os.makedirs(args['results_path'] + args['reward_output']) 137 | if not os.path.exists(args['results_path'] + args['log']): 138 | os.makedirs(args['results_path'] + args['log']) 139 | 140 | with open( 141 | args['results_path'] + "command.txt", 142 | 'w') as f: 143 | out = ' '.join(argv) 144 | f.writelines(out) 145 | 146 | with open(args['results_path'] + "args.json", "w") as f: 147 | json.dump(args, f) 148 | 149 | # print('args', args) 150 | 151 | return args 152 | 153 | 154 | def get_space(env): 155 | if type(env.action_space) is gym.spaces.discrete.Discrete: 156 | action_dim = env.action_space.n 157 | elif type(env.action_space) is gym.spaces.box.Box: 158 | action_dim = env.action_space.shape[0] 159 | elif type(env.action_space) is int: 160 | action_dim = env.action_space 161 | elif type(env.action_space) is list: 162 | if type(env.action_space[0]) is gym.spaces.box.Box: 163 | action_dim = env.action_space[0].shape[0] 164 | else: 165 | action_dim = env.action_space[0].n 166 | else: 167 | raise Exception('action space is not a valid ' 168 | '.type') 169 | if type(env.observation_space) is gym.spaces.discrete.Discrete: 170 | features = env.observation_space.n 171 | elif type(env.observation_space) is gym.spaces.box.Box: 172 | features = env.observation_space.shape[0] 173 | elif type(env.observation_space) is int: 174 | features = env.observation_space 175 | elif type(env.observation_space) is list: 176 | features = env.observation_space[0].shape[0] 177 | else: 178 | raise Exception('observation space is not a valid type') 179 | return action_dim, features 180 | 181 | def NoneAlg(alg): 182 | algs = ['maddpg', 'multi_ppo', 'multi_ppo_sro', 'maddpg_sr' , 'shppo', 'shppo_sro'] 183 | if alg in algs: 184 | return True 185 | return False 186 | 187 | 188 | def runGames(args): 189 | print(args) 190 | if args['run_test']: 191 | logger = None 192 | else: 193 | logger = Logger(args['results_path'] + args['log'], args['results_path'] + args['graph_path'], args) 194 | np.random.seed(args['seed']) 195 | tf.set_random_seed(args['seed']) 196 | random.seed(args['seed']) 197 | seeding.np_random(args['seed']) 198 | env = env_REGISTRY[args['game']](args) 199 | args['action_dim'], args['features'] = get_space(env) 200 | if NoneAlg(args['algorithm']): 201 | alg = None 202 | else: 203 | alg = alg_REGISTRY[args['algorithm']](args['action_dim'], args['features'], args, logger) 204 | if args['run_test'] and args['game'] != 'particle': 205 | run_REGISTRY['test'](args, env, alg, logger) 206 | elif args['run_test'] and args['game'] == 'particle': 207 | run_REGISTRY['particle'](args, env, alg, logger) 208 | else: 209 | run_REGISTRY[args['algorithm']](args, env, alg, logger) 210 | 211 | 212 | if __name__ == '__main__': 213 | args = readCommand(sys.argv[1:]) # Get game components based on input 214 | runGames(args) 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | -------------------------------------------------------------------------------- /alg/maddpg/train.py: -------------------------------------------------------------------------------- 1 | 
import argparse 2 | import numpy as np 3 | import tensorflow as tf 4 | import time 5 | import pickle 6 | 7 | from alg.maddpg.common import tf_util as U 8 | from alg.maddpg.trainer.maddpg import MADDPGAgentTrainer 9 | import tensorflow.contrib.layers as layers 10 | 11 | 12 | def parse_args(): 13 | parser = argparse.ArgumentParser("Reinforcement Learning experiments for multiagent environments") 14 | # Environment 15 | parser.add_argument("--scenario_name", type=str, default="simple_spread", help="name of the scenario script") 16 | parser.add_argument("--max-episode-len", type=int, default=25, help="maximum episode length") 17 | parser.add_argument("--num-episodes", type=int, default=60000, help="number of episodes") 18 | parser.add_argument("--num-adversaries", type=int, default=0, help="number of adversaries") 19 | parser.add_argument("--good-policy", type=str, default="maddpg", help="policy for good agents") 20 | parser.add_argument("--adv-policy", type=str, default="maddpg", help="policy of adversaries") 21 | # Core training parameters 22 | parser.add_argument("--lr", type=float, default=1e-2, help="learning rate for Adam optimizer") 23 | parser.add_argument("--learning_rate_c", type=float, default=1e-2, help="learning rate for Adam optimizer") 24 | parser.add_argument("--learning_rate_a", type=float, default=1e-2, help="learning rate for Adam optimizer") 25 | parser.add_argument("--gamma", type=float, default=0.95, help="discount factor") 26 | parser.add_argument("--batch-size", type=int, default=1024, help="number of episodes to optimize at the same time") 27 | parser.add_argument("--num-units", type=int, default=64, help="number of units in the mlp") 28 | # Checkpointing 29 | parser.add_argument("--exp-name", type=str, default=None, help="name of the experiment") 30 | parser.add_argument("--save-dir", type=str, default="/tmp/policy/", help="directory in which training state and model should be saved") 31 | parser.add_argument("--save-rate", type=int, default=1000, help="save model once every time this many episodes are completed") 32 | parser.add_argument("--load-dir", type=str, default="", help="directory in which training state and model are loaded") 33 | # Evaluation 34 | parser.add_argument("--restore", action="store_true", default=False) 35 | parser.add_argument("--display", action="store_true", default=False) 36 | parser.add_argument("--benchmark", action="store_true", default=False) 37 | parser.add_argument("--benchmark-iters", type=int, default=100000, help="number of iterations run for benchmarking") 38 | parser.add_argument("--benchmark-dir", type=str, default="./benchmark_files/", help="directory where benchmark data is saved") 39 | parser.add_argument("--plots-dir", type=str, default="./learning_curves/", help="directory where plot data is saved") 40 | return parser.parse_args() 41 | 42 | 43 | def mlp_model(input, num_outputs, scope, reuse=False, num_units=64, rnn_cell=None): 44 | # This model takes as input an observation and returns values of all actions 45 | with tf.variable_scope(scope, reuse=reuse): 46 | out = input 47 | out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu) 48 | out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu) 49 | out = layers.fully_connected(out, num_outputs=num_outputs, activation_fn=None) 50 | return out 51 | 52 | 53 | def get_trainers(env, num_adversaries, obs_shape_n, arglist): 54 | trainers = [] 55 | model = mlp_model 56 | trainer = MADDPGAgentTrainer 57 | for i in 
range(num_adversaries):
58 | trainers.append(trainer(
59 | "agent_%d" % i, model, obs_shape_n, env.action_space, i, arglist,
60 | local_q_func=(arglist.adv_policy=='ddpg')))
61 | for i in range(num_adversaries, env.n):
62 | trainers.append(trainer(
63 | "agent_%d" % i, model, obs_shape_n, env.action_space, i, arglist,
64 | local_q_func=(arglist.good_policy=='ddpg')))
65 | return trainers
66 |
67 |
68 | def make_env(scenario_name, arglist, benchmark=False):
69 | from game.particle.multiagent.environment import MultiAgentEnv
70 | import game.particle.multiagent.scenarios as scenarios
71 |
72 | # load scenario from script
73 | scenario = scenarios.load(scenario_name + ".py").Scenario()
74 | # create world
75 | world = scenario.make_world()
76 | # create multiagent environment
77 | if benchmark:
78 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, scenario.benchmark_data)
79 | else:
80 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)
81 | return env
82 |
83 |
84 | def train(arglist):
85 | with U.single_threaded_session():
86 | # Create environment
87 | env = make_env(arglist.scenario_name, arglist, arglist.benchmark)
88 | # Create agent trainers
89 | obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
90 | num_adversaries = min(env.n, arglist.num_adversaries)
91 | trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
92 | print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))
93 |
94 | # Initialize
95 | U.initialize()
96 |
97 | # Load previous results, if necessary
98 | if arglist.load_dir == "":
99 | arglist.load_dir = arglist.save_dir
100 | if arglist.display or arglist.restore or arglist.benchmark:
101 | print('Loading previous state...')
102 | U.load_state(arglist.load_dir)
103 |
104 | episode_rewards = [0.0] # sum of rewards for all agents
105 | agent_rewards = [[0.0] for _ in range(env.n)] # individual agent reward
106 | final_ep_rewards = [] # sum of rewards for training curve
107 | final_ep_ag_rewards = [] # agent rewards for training curve
108 | agent_info = [[[]]] # placeholder for benchmarking info
109 | saver = tf.train.Saver()
110 | obs_n = env.reset()
111 | episode_step = 0
112 | train_step = 0
113 | t_start = time.time()
114 |
115 | print('Starting iterations...')
116 | while True:
117 | # get action
118 | action_n = [agent.action(obs) for agent, obs in zip(trainers,obs_n)]
119 | # environment step
120 | new_obs_n, rew_n, done_n, info_n = env.step(action_n)
121 | # print(new_obs_n) # debug print moved below env.step(); new_obs_n was not defined before this point
122 | episode_step += 1
123 | done = all(done_n)
124 | terminal = (episode_step >= arglist.max_episode_len)
125 | # collect experience
126 | for i, agent in enumerate(trainers):
127 | agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
128 | obs_n = new_obs_n
129 |
130 | for i, rew in enumerate(rew_n):
131 | episode_rewards[-1] += rew
132 | agent_rewards[i][-1] += rew
133 |
134 | if done or terminal:
135 | obs_n = env.reset()
136 | episode_step = 0
137 | episode_rewards.append(0)
138 | for a in agent_rewards:
139 | a.append(0)
140 | agent_info.append([[]])
141 |
142 | # increment global step counter
143 | train_step += 1
144 |
145 | # for benchmarking learned policies
146 | if arglist.benchmark:
147 | for i, info in enumerate(info_n):
148 | agent_info[-1][i].append(info_n['n'])
149 | if train_step > arglist.benchmark_iters and (done or terminal):
150 | file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
151 |
print('Finished benchmarking, now saving...') 152 | with open(file_name, 'wb') as fp: 153 | pickle.dump(agent_info[:-1], fp) 154 | break 155 | continue 156 | 157 | # for displaying learned policies 158 | if arglist.display: 159 | time.sleep(0.1) 160 | env.render() 161 | continue 162 | 163 | # update all trainers, if not in display or benchmark mode 164 | loss = None 165 | for agent in trainers: 166 | agent.preupdate() 167 | for agent in trainers: 168 | loss = agent.update(trainers, train_step) 169 | 170 | # save model, display training output 171 | if terminal and (len(episode_rewards) % arglist.save_rate == 0): 172 | U.save_state(arglist.save_dir, saver=saver) 173 | # print statement depends on whether or not there are adversaries 174 | if num_adversaries == 0: 175 | print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format( 176 | train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time()-t_start, 3))) 177 | else: 178 | print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format( 179 | train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), 180 | [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time()-t_start, 3))) 181 | t_start = time.time() 182 | # Keep track of final episode reward 183 | final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:])) 184 | for rew in agent_rewards: 185 | final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:])) 186 | 187 | # saves final episode reward for plotting training curve later 188 | if len(episode_rewards) > arglist.num_episodes: 189 | rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl' 190 | with open(rew_file_name, 'wb') as fp: 191 | pickle.dump(final_ep_rewards, fp) 192 | agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl' 193 | with open(agrew_file_name, 'wb') as fp: 194 | pickle.dump(final_ep_ag_rewards, fp) 195 | print('...Finished total of {} episodes.'.format(len(episode_rewards))) 196 | break 197 | 198 | 199 | # if __name__ == '__main__': 200 | # arglist = parse_args() 201 | # train(arglist) 202 | -------------------------------------------------------------------------------- /alg/sharing_multi_ppo/ppo.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import tensorflow as tf 4 | from alg.optimizer import Optimizer 5 | 6 | 7 | class PPO: 8 | def __init__(self, n_actions, n_features, n_agents, args, SESS, logger): 9 | self.n_actions = n_actions 10 | self.n_features = n_features + n_agents 11 | self.n_agents = n_agents 12 | self.args = args 13 | self.logger = logger 14 | self.learning_step = 0 15 | self.obs = tf.placeholder(tf.float32, [None, self.n_features], 's') 16 | 17 | self.act_probs, self.policy_param = self.build_actor_net(self.args['policy']) 18 | self.o_act_probs, self.o_policy_param = self.build_actor_net(self.args['old_policy'], trainable=False) 19 | self.v_preds, self.v_param = self.build_critic_net('critic') 20 | 21 | if self.args['continuous_action']: 22 | self.sample_action = tf.squeeze(self.act_probs.sample(1), axis=0) 23 | else: 24 | self.act_stochastic = tf.multinomial(tf.log(self.act_probs), num_samples=1) 25 | self.act_stochastic = tf.reshape(self.act_stochastic, shape=[-1]) 26 | self.act_deterministic = tf.argmax(self.act_probs, axis=1) 27 | 28 | self.replace_op = [tf.assign(t, e) for t, e in zip(self.o_policy_param, self.policy_param)] 29 | 30 | 
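# Two separate optimizers are built below through the Optimizer factory in alg/optimizer.py:
# learning_rate_a drives the actor (policy) update and learning_rate_c the critic update.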
opt = Optimizer(args['optimizer'], args['learning_rate_a']) 31 | self.optimizer = opt.get_optimizer() 32 | opt_c = Optimizer(args['optimizer'], args['learning_rate_c']) 33 | self.optimizer_c = opt_c.get_optimizer() 34 | 35 | with tf.variable_scope('train_inp'): 36 | if self.args['continuous_action']: 37 | self.actions = tf.placeholder(tf.float32, [None, n_actions], 'action') 38 | else: 39 | self.actions = tf.placeholder(dtype=tf.float32, shape=[None, n_actions], name='actions') 40 | self.rewards = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='rewards') 41 | self.v_preds_next = tf.placeholder(dtype=tf.float32, shape=[None], name='v_preds_next') 42 | self.gaes = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='gaes') 43 | 44 | self.build_loss() 45 | 46 | self.sess = SESS 47 | #self.sess.run(tf.global_variables_initializer()) 48 | 49 | def build_actor_net(self, scope, trainable=True): 50 | with tf.variable_scope(scope): 51 | layer_1 = tf.layers.dense(inputs=self.obs, units=self.args['n_layer_a_1'], activation=tf.nn.relu, trainable=trainable) 52 | layer_2 = tf.layers.dense(inputs=layer_1, units=self.args['n_layer_a_2'], activation=tf.nn.relu, 53 | trainable=trainable) 54 | if self.args['continuous_action']: 55 | mu = self.args['action_clip'] * tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.tanh, trainable=trainable) 56 | sigma = tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.softplus, trainable=trainable) 57 | act_probs = tf.distributions.Normal(loc=mu, scale=sigma + 1e-9) 58 | else: 59 | act_probs = tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.softmax) 60 | params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope) 61 | return act_probs, params 62 | 63 | def build_critic_net(self, scope): 64 | with tf.variable_scope(scope): 65 | layer_1 = tf.layers.dense(inputs=self.obs, units=self.args['n_layer_c_1'], activation=tf.nn.relu) 66 | layer_2 = tf.layers.dense(inputs=layer_1, units=self.args['n_layer_c_2'], activation=tf.nn.relu) 67 | v_preds = tf.layers.dense(inputs=layer_2, units=1, activation=None) 68 | params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope) 69 | return v_preds, params 70 | 71 | def build_loss(self): 72 | with tf.variable_scope('update_critic'): 73 | self.advantage = self.rewards - self.v_preds 74 | self.c_loss = tf.reduce_mean(tf.square(self.advantage)) 75 | ''' 76 | gradients = self.optimizer_c.compute_gradients(self.c_loss, var_list=self.v_param) 77 | for i, (grad, var) in enumerate(gradients): 78 | if grad is not None: 79 | gradients[i] = (tf.clip_by_norm(grad, self.args['grad_clip']), var) 80 | self.train_c_op = self.optimizer_c.apply_gradients(gradients) 81 | ''' 82 | self.train_c_op = self.optimizer_c.minimize(self.c_loss) 83 | 84 | with tf.variable_scope('update_actor'): 85 | with tf.variable_scope('loss/clip'): 86 | # ratios = tf.divide(act_probs, act_probs_old) 87 | if self.args['continuous_action']: 88 | act_probs = self.act_probs.prob(self.actions) 89 | act_probs_old = self.o_act_probs.prob(self.actions) 90 | entropy = self.act_probs.entropy() 91 | ratios = act_probs / act_probs_old 92 | #ratios = self.act_probs.prob(self.actions) / self.o_act_probs.prob(self.actions) 93 | else: 94 | act_probs = self.act_probs * self.actions# * tf.one_hot(indices=self.actions, depth=self.act_probs.shape[1]) 95 | act_probs = tf.reduce_sum(act_probs, axis=1) 96 | # probabilities of actions which agent took with old policy 97 | act_probs_old = self.o_act_probs * 
self.actions#* tf.one_hot(indices=self.actions, depth=self.o_act_probs.shape[1]) 98 | act_probs_old = tf.reduce_sum(act_probs_old, axis=1) 99 | entropy = -tf.reduce_sum(self.act_probs * 100 | tf.log(tf.clip_by_value(self.act_probs, 1e-9, 1.0)), axis=1) 101 | ratios = tf.exp(tf.log(act_probs) - tf.log(act_probs_old)) 102 | self.entropy = self.args['c2'] * tf.reduce_mean(entropy) # mean of entropy of pi(obs) 103 | clipped_ratios = tf.clip_by_value(ratios, clip_value_min=1 - self.args['clip_value'], clip_value_max=1 + self.args['clip_value']) 104 | loss_clip = tf.minimum(tf.multiply(self.gaes, ratios), tf.multiply(self.gaes, clipped_ratios)) 105 | self.loss_clip = tf.reduce_mean(loss_clip) 106 | self.a_loss = -(self.loss_clip + self.entropy) 107 | ''' 108 | gradients_t = self.optimizer.compute_gradients(self.a_loss, var_list=self.policy_param) 109 | for i, (grad, var) in enumerate(gradients_t): 110 | if grad is not None: 111 | gradients_t[i] = (tf.clip_by_norm(grad, self.args['grad_clip']), var) 112 | self.train_a_op = self.optimizer.apply_gradients(gradients_t) 113 | ''' 114 | self.train_a_op = self.optimizer.minimize(self.a_loss) 115 | 116 | def choose_action(self, obs, agent_id=0): 117 | obs = obs[np.newaxis, :] 118 | obs = self.get_agent_obs(obs, agent_id) 119 | if self.args['continuous_action']: 120 | actions, v_preds = self.sess.run([self.sample_action, self.v_preds], {self.obs: obs}) 121 | #print('clip', actions[0], self.args['action_clip'], np.clip(actions[0], -self.args['action_clip'], self.args['action_clip'])) 122 | return np.clip(actions[0], -self.args['action_clip'], self.args['action_clip']) 123 | else: 124 | if self.args['stochastic']: 125 | actions, v_preds, p = self.sess.run([self.act_stochastic, self.v_preds, self.act_probs], feed_dict={self.obs: obs}) 126 | action = actions[0] 127 | action_one_hot = np.zeros(self.n_actions) 128 | action_one_hot[action] = 1 129 | #print(p) 130 | return action_one_hot 131 | else: 132 | actions, v_preds = self.sess.run([self.act_deterministic, self.v_preds], feed_dict={self.obs: obs}) 133 | action = actions[0] 134 | action_one_hot = np.zeros(self.n_actions) 135 | action_one_hot[action] = 1 136 | return action_one_hot 137 | 138 | def choose_hold_action(self, obs): 139 | return np.zeros(self.n_actions) 140 | 141 | def choose_deterministic_action(self, obs, agent_id=0): 142 | obs = self.get_agent_obs(obs, agent_id) 143 | actions = self.sess.run([self.act_deterministic], feed_dict={self.obs: obs})[0] 144 | action_one_hots = [] 145 | for i in range(len(actions)): 146 | action = actions[i] 147 | action_one_hot = np.zeros(self.n_actions) 148 | action_one_hot[action] = 1 149 | action_one_hots.append(action_one_hot) 150 | return action_one_hots 151 | 152 | def get_agent_obs(self, obs, agent_id=0): 153 | if type(agent_id) is int: 154 | agent_id_arr = [agent_id] * len(obs) 155 | elif type(agent_id) is list: 156 | agent_id_arr = agent_id 157 | else: 158 | raise Exception('the agent_id field must be type of int or list') 159 | agent_one_hot = np.eye(self.n_agents)[agent_id_arr] 160 | obs = np.hstack((agent_one_hot, obs)) 161 | return obs 162 | 163 | def get_v(self, s, agent_id=0): 164 | obs = np.array(s) 165 | obs = obs[np.newaxis, :] 166 | obs = self.get_agent_obs(obs, agent_id) 167 | v_preds = self.sess.run(self.v_preds, {self.obs: obs}) 168 | return v_preds[0, 0] 169 | 170 | def update(self, actor, s, a, r, options, terms, epi, agent_id=0): 171 | self.sess.run(self.replace_op) 172 | s = self.get_agent_obs(s, agent_id) 173 | adv = 
self.sess.run(self.advantage, {self.obs: s, self.rewards: r}) 174 | for i in range(self.args['epi_train_times']): 175 | _, a_loss, clip, entropy = self.sess.run([self.train_a_op, self.a_loss, self.loss_clip, self.entropy], {self.obs: s, self.actions: a, self.gaes: adv}) 176 | __, c_loss = self.sess.run([self.train_c_op, self.c_loss], {self.obs: s, self.rewards: r}) 177 | self.logger.write_tb_log('a_loss', a_loss, self.learning_step) 178 | self.logger.write_tb_log('c_loss', c_loss, self.learning_step) 179 | self.logger.write_tb_log('clip', clip, self.learning_step) 180 | self.logger.write_tb_log('entropy', entropy, self.learning_step) 181 | self.learning_step += 1 182 | #print(a_loss, clip, entropy, c_loss) 183 | 184 | def load_model(self, path): 185 | saver = tf.train.Saver(self.policy_param) 186 | print(path + '.ckpt') 187 | saver.restore(self.sess, path + ".ckpt") 188 | 189 | def save_model(self, path): 190 | saver = tf.train.Saver(self.policy_param) 191 | saver.save(self.sess, path + ".ckpt") 192 | -------------------------------------------------------------------------------- /alg/muti_ptf_ppo/ppo.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import tensorflow as tf 4 | from alg.optimizer import Optimizer 5 | 6 | 7 | class PPO: 8 | def __init__(self, n_actions, n_features, args, SESS, logger, i): 9 | self.n_actions = n_actions 10 | self.n_features = n_features 11 | self.index = i 12 | self.args = args 13 | self.logger = logger 14 | self.learning_step = 0 15 | self.obs = tf.placeholder(tf.float32, [None, n_features], 's') 16 | 17 | self.act_probs, self.policy_param = self.build_actor_net(self.args['policy'] + "_" + str(self.index)) 18 | self.o_act_probs, self.o_policy_param = self.build_actor_net(self.args['old_policy'] + "_" + str(self.index), trainable=False) 19 | self.v_preds, self.v_param = self.build_critic_net('critic' + "_" + str(self.index)) 20 | 21 | if self.args['continuous_action']: 22 | self.sample_action = tf.squeeze(self.act_probs.sample(1), axis=0) 23 | else: 24 | self.act_stochastic = tf.multinomial(tf.log(self.act_probs), num_samples=1) 25 | self.act_stochastic = tf.reshape(self.act_stochastic, shape=[-1]) 26 | self.act_deterministic = tf.argmax(self.act_probs, axis=1) 27 | 28 | self.replace_op = [tf.assign(t, e) for t, e in zip(self.o_policy_param, self.policy_param)] 29 | 30 | opt = Optimizer(args['optimizer'], args['learning_rate_a']) 31 | self.optimizer = opt.get_optimizer() 32 | opt_c = Optimizer(args['optimizer'], args['learning_rate_c']) 33 | self.optimizer_c = opt_c.get_optimizer() 34 | 35 | with tf.variable_scope('train_inp' + "_" + str(self.index)): 36 | if self.args['continuous_action']: 37 | self.actions = tf.placeholder(tf.float32, [None, n_actions], 'action') 38 | else: 39 | self.actions = tf.placeholder(dtype=tf.float32, shape=[None, n_actions], name='actions') 40 | self.rewards = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='rewards') 41 | self.v_preds_next = tf.placeholder(dtype=tf.float32, shape=[None], name='v_preds_next') 42 | self.gaes = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='gaes') 43 | 44 | self.build_loss() 45 | 46 | self.sess = SESS 47 | #self.sess.run(tf.global_variables_initializer()) 48 | 49 | def build_actor_net(self, scope, trainable=True): 50 | with tf.variable_scope(scope): 51 | layer_1 = tf.layers.dense(inputs=self.obs, units=self.args['n_layer_a_1'], activation=tf.nn.relu, trainable=trainable) 52 | layer_2 = 
tf.layers.dense(inputs=layer_1, units=self.args['n_layer_a_2'], activation=tf.nn.relu, 53 | trainable=trainable) 54 | if self.args['continuous_action']: 55 | mu = self.args['action_clip'] * tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.tanh, trainable=trainable) 56 | sigma = tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.softplus, trainable=trainable) 57 | act_probs = tf.distributions.Normal(loc=mu, scale=sigma + 1e-9) 58 | else: 59 | act_probs = tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.softmax) 60 | params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope) 61 | return act_probs, params 62 | 63 | def build_critic_net(self, scope): 64 | with tf.variable_scope(scope): 65 | layer_1 = tf.layers.dense(inputs=self.obs, units=self.args['n_layer_c_1'], activation=tf.nn.relu) 66 | layer_2 = tf.layers.dense(inputs=layer_1, units=self.args['n_layer_c_2'], activation=tf.nn.relu) 67 | v_preds = tf.layers.dense(inputs=layer_2, units=1, activation=None) 68 | params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope) 69 | return v_preds, params 70 | 71 | def build_loss(self): 72 | with tf.variable_scope('update_critic' + "_" + str(self.index)): 73 | self.advantage = self.rewards - self.v_preds 74 | self.c_loss = tf.reduce_mean(tf.square(self.advantage)) 75 | ''' 76 | gradients = self.optimizer_c.compute_gradients(self.c_loss, var_list=self.v_param) 77 | for i, (grad, var) in enumerate(gradients): 78 | if grad is not None: 79 | gradients[i] = (tf.clip_by_norm(grad, self.args['grad_clip']), var) 80 | self.train_c_op = self.optimizer_c.apply_gradients(gradients) 81 | ''' 82 | self.train_c_op = self.optimizer_c.minimize(self.c_loss) 83 | 84 | with tf.variable_scope('update_actor' + "_" + str(self.index)): 85 | with tf.variable_scope('loss/clip'): 86 | # ratios = tf.divide(act_probs, act_probs_old) 87 | if self.args['continuous_action']: 88 | act_probs = self.act_probs.prob(self.actions) 89 | act_probs_old = self.o_act_probs.prob(self.actions) 90 | entropy = self.act_probs.entropy() 91 | ratios = act_probs / act_probs_old 92 | #ratios = self.act_probs.prob(self.actions) / self.o_act_probs.prob(self.actions) 93 | else: 94 | act_probs = self.act_probs * self.actions# * tf.one_hot(indices=self.actions, depth=self.act_probs.shape[1]) 95 | act_probs = tf.reduce_sum(act_probs, axis=1) 96 | # probabilities of actions which agent took with old policy 97 | act_probs_old = self.o_act_probs * self.actions#* tf.one_hot(indices=self.actions, depth=self.o_act_probs.shape[1]) 98 | act_probs_old = tf.reduce_sum(act_probs_old, axis=1) 99 | entropy = -tf.reduce_sum(self.act_probs * 100 | tf.log(tf.clip_by_value(self.act_probs, 1e-9, 1.0)), axis=1) 101 | ratios = tf.exp(tf.log(act_probs) - tf.log(act_probs_old)) 102 | self.entropy = self.args['c2'] * tf.reduce_mean(entropy) # mean of entropy of pi(obs) 103 | clipped_ratios = tf.clip_by_value(ratios, clip_value_min=1 - self.args['clip_value'], clip_value_max=1 + self.args['clip_value']) 104 | loss_clip = tf.minimum(tf.multiply(self.gaes, ratios), tf.multiply(self.gaes, clipped_ratios)) 105 | self.loss_clip = tf.reduce_mean(loss_clip) 106 | self.a_loss = -(self.loss_clip + self.entropy) 107 | ''' 108 | gradients_t = self.optimizer.compute_gradients(self.a_loss, var_list=self.policy_param) 109 | for i, (grad, var) in enumerate(gradients_t): 110 | if grad is not None: 111 | gradients_t[i] = (tf.clip_by_norm(grad, self.args['grad_clip']), var) 112 | self.train_a_op = 
self.optimizer.apply_gradients(gradients_t) 113 | ''' 114 | self.train_a_op = self.optimizer.minimize(self.a_loss) 115 | 116 | def choose_action(self, obs, agent_id=0): 117 | obs = np.array(obs) 118 | obs = obs[np.newaxis, :] 119 | if self.args['continuous_action']: 120 | actions, v_preds = self.sess.run([self.sample_action, self.v_preds], {self.obs: obs}) 121 | #print('clip', actions[0], self.args['action_clip'], np.clip(actions[0], -self.args['action_clip'], self.args['action_clip'])) 122 | return np.clip(actions[0], -self.args['action_clip'], self.args['action_clip']) 123 | else: 124 | if self.args['stochastic']: 125 | actions, v_preds, p = self.sess.run([self.act_stochastic, self.v_preds, self.act_probs], feed_dict={self.obs: obs}) 126 | action = actions[0] 127 | action_one_hot = np.zeros(self.n_actions) 128 | action_one_hot[action] = 1 129 | #print(p) 130 | return action_one_hot 131 | else: 132 | actions, v_preds = self.sess.run([self.act_deterministic, self.v_preds], feed_dict={self.obs: obs}) 133 | action = actions[0] 134 | action_one_hot = np.zeros(self.n_actions) 135 | action_one_hot[action] = 1 136 | return action_one_hot 137 | 138 | def get_action_and_v(self, s): 139 | obs = np.array(s) 140 | if self.args['continuous_action']: 141 | actions, v_preds = self.sess.run([self.sample_action, self.v_preds], {self.obs: obs}) 142 | return actions, v_preds 143 | else: 144 | if self.args['stochastic']: 145 | actions, v_preds, p = self.sess.run([self.act_probs, self.v_preds, self.act_probs], feed_dict={self.obs: obs}) 146 | return actions, v_preds 147 | 148 | def choose_hold_action(self, obs): 149 | return np.zeros(self.n_actions) 150 | 151 | def choose_deterministic_action(self, obs): 152 | actions = self.sess.run([self.act_deterministic], feed_dict={self.obs: obs})[0] 153 | action_one_hots = [] 154 | for i in range(len(actions)): 155 | action = actions[i] 156 | action_one_hot = np.zeros(self.n_actions) 157 | action_one_hot[action] = 1 158 | action_one_hots.append(action_one_hot) 159 | return action_one_hots 160 | 161 | def get_v(self, s, agent_id=0): 162 | obs = np.array(s) 163 | obs = obs[np.newaxis, :] 164 | v_preds = self.sess.run(self.v_preds, {self.obs: obs}) 165 | return v_preds[0, 0] 166 | 167 | def update(self, actor, s, a, r, options, terms, epi, agentid=0): 168 | self.sess.run(self.replace_op) 169 | adv = self.sess.run(self.advantage, {self.obs: s, self.rewards: r}) 170 | for i in range(self.args['epi_train_times']): 171 | _, a_loss, clip, entropy = self.sess.run([self.train_a_op, self.a_loss, self.loss_clip, self.entropy], {self.obs: s, self.actions: a, self.gaes: adv}) 172 | __, c_loss = self.sess.run([self.train_c_op, self.c_loss], {self.obs: s, self.rewards: r}) 173 | self.logger.write_tb_log('a_loss_' + str(self.index), a_loss, self.learning_step) 174 | self.logger.write_tb_log('c_loss_' + str(self.index), c_loss, self.learning_step) 175 | self.logger.write_tb_log('clip_' + str(self.index), clip, self.learning_step) 176 | self.logger.write_tb_log('entropy_' + str(self.index), entropy, self.learning_step) 177 | self.learning_step += 1 178 | #print(a_loss, clip, entropy, c_loss) 179 | 180 | def load_model(self, path): 181 | saver = tf.train.Saver(self.policy_param) 182 | print(path + '.ckpt') 183 | saver.restore(self.sess, path + ".ckpt") 184 | 185 | def save_model(self, path): 186 | saver = tf.train.Saver(self.policy_param) 187 | saver.save(self.sess, path + ".ckpt") 188 | -------------------------------------------------------------------------------- 
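A minimal usage sketch for the PPO class defined in alg/muti_ptf_ppo/ppo.py above. This sketch is not part of the repository: the hyperparameter values, the 'adam' optimizer key, and the DummyLogger stand-in for util/logger.py are illustrative assumptions (the real values come from config/ppo_conf.yaml and are wired up in main.py).

import numpy as np
import tensorflow as tf
from alg.muti_ptf_ppo.ppo import PPO

args = {  # hypothetical settings; the repository reads these from config/ppo_conf.yaml
    'policy': 'pi', 'old_policy': 'old_pi',
    'continuous_action': False, 'stochastic': True, 'action_clip': 1.0,
    'n_layer_a_1': 64, 'n_layer_a_2': 64, 'n_layer_c_1': 64, 'n_layer_c_2': 64,
    'optimizer': 'adam',  # assumed to be a key accepted by alg/optimizer.py
    'learning_rate_a': 1e-4, 'learning_rate_c': 1e-3,
    'clip_value': 0.2, 'c2': 0.01, 'epi_train_times': 10,
}

class DummyLogger:  # stand-in for the Logger in util/logger.py
    def write_tb_log(self, tag, value, step):
        pass

sess = tf.Session()
agent = PPO(n_actions=5, n_features=10, args=args, SESS=sess, logger=DummyLogger(), i=0)
sess.run(tf.global_variables_initializer())  # the class leaves variable initialization to the caller

obs = np.zeros(10, dtype=np.float32)
action = agent.choose_action(obs)  # one-hot action sampled from the softmax policy
value = agent.get_v(obs)           # critic estimate of V(s)
# After collecting a batch (s: [N, 10], one-hot a: [N, 5], returns r: [N, 1]):
# agent.update(None, s, a, r, options=None, terms=None, epi=0)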
/game/particle/multiagent/rendering.py: -------------------------------------------------------------------------------- 1 | """ 2 | 2D rendering framework 3 | """ 4 | from __future__ import division 5 | import os 6 | import six 7 | import sys 8 | 9 | if "Apple" in sys.version: 10 | if 'DYLD_FALLBACK_LIBRARY_PATH' in os.environ: 11 | os.environ['DYLD_FALLBACK_LIBRARY_PATH'] += ':/usr/lib' 12 | # (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite 13 | 14 | from gym.utils import reraise 15 | from gym import error 16 | 17 | try: 18 | import pyglet 19 | except ImportError as e: 20 | reraise(suffix="HINT: you can install pyglet directly via 'pip install pyglet'. But if you really just want to install all Gym dependencies and not have to think about it, 'pip install -e .[all]' or 'pip install gym[all]' will do it.") 21 | 22 | try: 23 | from pyglet.gl import * 24 | except ImportError as e: 25 | reraise(prefix="Error occured while running `from pyglet.gl import *`",suffix="HINT: make sure you have OpenGL install. On Ubuntu, you can run 'apt-get install python-opengl'. If you're running on a server, you may need a virtual frame buffer; something like this should work: 'xvfb-run -s \"-screen 0 1400x900x24\" python '") 26 | 27 | import math 28 | import numpy as np 29 | 30 | RAD2DEG = 57.29577951308232 31 | 32 | 33 | def get_display(spec): 34 | """Convert a display specification (such as :0) into an actual Display 35 | object. 36 | 37 | Pyglet only supports multiple Displays on Linux. 38 | """ 39 | if spec is None: 40 | return None 41 | elif isinstance(spec, six.string_types): 42 | return pyglet.canvas.Display(spec) 43 | else: 44 | raise error.Error('Invalid display specification: {}. (Must be a string like :0 or None.)'.format(spec)) 45 | 46 | 47 | class Viewer(object): 48 | def __init__(self, width, height, display=None): 49 | display = get_display(display) 50 | 51 | self.width = width 52 | self.height = height 53 | 54 | self.window = pyglet.window.Window(width=width, height=height, display=display) 55 | self.window.on_close = self.window_closed_by_user 56 | self.geoms = [] 57 | self.onetime_geoms = [] 58 | self.transform = Transform() 59 | 60 | glEnable(GL_BLEND) 61 | # glEnable(GL_MULTISAMPLE) 62 | glEnable(GL_LINE_SMOOTH) 63 | # glHint(GL_LINE_SMOOTH_HINT, GL_DONT_CARE) 64 | glHint(GL_LINE_SMOOTH_HINT, GL_NICEST) 65 | glLineWidth(2.0) 66 | glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA) 67 | 68 | def close(self): 69 | self.window.close() 70 | 71 | def window_closed_by_user(self): 72 | self.close() 73 | 74 | def set_bounds(self, left, right, bottom, top): 75 | assert right > left and top > bottom 76 | scalex = self.width/(right-left) 77 | scaley = self.height/(top-bottom) 78 | self.transform = Transform( 79 | translation=(-left*scalex, -bottom*scaley), 80 | scale=(scalex, scaley)) 81 | 82 | def add_geom(self, geom): 83 | self.geoms.append(geom) 84 | 85 | def add_onetime(self, geom): 86 | self.onetime_geoms.append(geom) 87 | 88 | def render(self, return_rgb_array=False): 89 | glClearColor(1,1,1,1) 90 | self.window.clear() 91 | self.window.switch_to() 92 | self.window.dispatch_events() 93 | self.transform.enable() 94 | for geom in self.geoms: 95 | geom.render() 96 | for geom in self.onetime_geoms: 97 | geom.render() 98 | self.transform.disable() 99 | arr = None 100 | if return_rgb_array: 101 | buffer = pyglet.image.get_buffer_manager().get_color_buffer() 102 | image_data = buffer.get_image_data() 103 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') 104 | # In 
https://github.com/openai/gym-http-api/issues/2, we 105 | # discovered that someone using Xmonad on Arch was having 106 | # a window of size 598 x 398, though a 600 x 400 window 107 | # was requested. (Guess Xmonad was preserving a pixel for 108 | # the boundary.) So we use the buffer height/width rather 109 | # than the requested one. 110 | arr = arr.reshape(buffer.height, buffer.width, 4) 111 | arr = arr[::-1,:,0:3] 112 | self.window.flip() 113 | self.onetime_geoms = [] 114 | return arr 115 | 116 | # Convenience 117 | def draw_circle(self, radius=10, res=30, filled=True, **attrs): 118 | geom = make_circle(radius=radius, res=res, filled=filled) 119 | _add_attrs(geom, attrs) 120 | self.add_onetime(geom) 121 | return geom 122 | 123 | def draw_polygon(self, v, filled=True, **attrs): 124 | geom = make_polygon(v=v, filled=filled) 125 | _add_attrs(geom, attrs) 126 | self.add_onetime(geom) 127 | return geom 128 | 129 | def draw_polyline(self, v, **attrs): 130 | geom = make_polyline(v=v) 131 | _add_attrs(geom, attrs) 132 | self.add_onetime(geom) 133 | return geom 134 | 135 | def draw_line(self, start, end, **attrs): 136 | geom = Line(start, end) 137 | _add_attrs(geom, attrs) 138 | self.add_onetime(geom) 139 | return geom 140 | 141 | def get_array(self): 142 | self.window.flip() 143 | image_data = pyglet.image.get_buffer_manager().get_color_buffer().get_image_data() 144 | self.window.flip() 145 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') 146 | arr = arr.reshape(self.height, self.width, 4) 147 | return arr[::-1,:,0:3] 148 | 149 | 150 | def _add_attrs(geom, attrs): 151 | if "color" in attrs: 152 | geom.set_color(*attrs["color"]) 153 | if "linewidth" in attrs: 154 | geom.set_linewidth(attrs["linewidth"]) 155 | 156 | 157 | class Geom(object): 158 | def __init__(self): 159 | self._color=Color((0, 0, 0, 1.0)) 160 | self.attrs = [self._color] 161 | 162 | def render(self): 163 | for attr in reversed(self.attrs): 164 | attr.enable() 165 | self.render1() 166 | for attr in self.attrs: 167 | attr.disable() 168 | 169 | def render1(self): 170 | raise NotImplementedError 171 | 172 | def add_attr(self, attr): 173 | self.attrs.append(attr) 174 | 175 | def set_color(self, r, g, b, alpha=1): 176 | self._color.vec4 = (r, g, b, alpha) 177 | 178 | 179 | class Attr(object): 180 | def enable(self): 181 | raise NotImplementedError 182 | 183 | def disable(self): 184 | pass 185 | 186 | 187 | class Transform(Attr): 188 | def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1,1)): 189 | self.set_translation(*translation) 190 | self.set_rotation(rotation) 191 | self.set_scale(*scale) 192 | 193 | def enable(self): 194 | glPushMatrix() 195 | glTranslatef(self.translation[0], self.translation[1], 0) # translate to GL loc ppint 196 | glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0) 197 | glScalef(self.scale[0], self.scale[1], 1) 198 | 199 | def disable(self): 200 | glPopMatrix() 201 | 202 | def set_translation(self, newx, newy): 203 | self.translation = (float(newx), float(newy)) 204 | 205 | def set_rotation(self, new): 206 | self.rotation = float(new) 207 | 208 | def set_scale(self, newx, newy): 209 | self.scale = (float(newx), float(newy)) 210 | 211 | 212 | class Color(Attr): 213 | def __init__(self, vec4): 214 | self.vec4 = vec4 215 | 216 | def enable(self): 217 | glColor4f(*self.vec4) 218 | 219 | 220 | class LineStyle(Attr): 221 | def __init__(self, style): 222 | self.style = style 223 | 224 | def enable(self): 225 | glEnable(GL_LINE_STIPPLE) 226 | glLineStipple(1, self.style) 227 | 228 | def 
disable(self): 229 | glDisable(GL_LINE_STIPPLE) 230 | 231 | 232 | class LineWidth(Attr): 233 | def __init__(self, stroke): 234 | self.stroke = stroke 235 | 236 | def enable(self): 237 | glLineWidth(self.stroke) 238 | 239 | 240 | class Point(Geom): 241 | def __init__(self): 242 | Geom.__init__(self) 243 | 244 | def render1(self): 245 | glBegin(GL_POINTS) # draw point 246 | glVertex3f(0.0, 0.0, 0.0) 247 | glEnd() 248 | 249 | 250 | class FilledPolygon(Geom): 251 | def __init__(self, v): 252 | Geom.__init__(self) 253 | self.v = v 254 | 255 | def render1(self): 256 | if len(self.v) == 4: glBegin(GL_QUADS) 257 | elif len(self.v) > 4: glBegin(GL_POLYGON) 258 | else: glBegin(GL_TRIANGLES) 259 | for p in self.v: 260 | glVertex3f(p[0], p[1], 0) # draw each vertex 261 | glEnd() 262 | 263 | color = (self._color.vec4[0] * 0.5, self._color.vec4[1] * 0.5, self._color.vec4[2] * 0.5, self._color.vec4[3] * 0.5) 264 | glColor4f(*color) 265 | glBegin(GL_LINE_LOOP) 266 | for p in self.v: 267 | glVertex3f(p[0], p[1], 0) # draw each vertex 268 | glEnd() 269 | 270 | 271 | def make_circle(radius=10, res=30, filled=True): 272 | points = [] 273 | for i in range(res): 274 | ang = 2*math.pi*i / res 275 | points.append((math.cos(ang)*radius, math.sin(ang)*radius)) 276 | if filled: 277 | return FilledPolygon(points) 278 | else: 279 | return PolyLine(points, True) 280 | 281 | 282 | def make_polygon(v, filled=True): 283 | if filled: return FilledPolygon(v) 284 | else: return PolyLine(v, True) 285 | 286 | 287 | def make_polyline(v): 288 | return PolyLine(v, False) 289 | 290 | 291 | def make_capsule(length, width): 292 | l, r, t, b = 0, length, width/2, -width/2 293 | box = make_polygon([(l,b), (l,t), (r,t), (r,b)]) 294 | circ0 = make_circle(width/2) 295 | circ1 = make_circle(width/2) 296 | circ1.add_attr(Transform(translation=(length, 0))) 297 | geom = Compound([box, circ0, circ1]) 298 | return geom 299 | 300 | 301 | class Compound(Geom): 302 | def __init__(self, gs): 303 | Geom.__init__(self) 304 | self.gs = gs 305 | for g in self.gs: 306 | g.attrs = [a for a in g.attrs if not isinstance(a, Color)] 307 | 308 | def render1(self): 309 | for g in self.gs: 310 | g.render() 311 | 312 | 313 | class PolyLine(Geom): 314 | def __init__(self, v, close): 315 | Geom.__init__(self) 316 | self.v = v 317 | self.close = close 318 | self.linewidth = LineWidth(1) 319 | self.add_attr(self.linewidth) 320 | 321 | def render1(self): 322 | glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP) 323 | for p in self.v: 324 | glVertex3f(p[0], p[1],0) # draw each vertex 325 | glEnd() 326 | 327 | def set_linewidth(self, x): 328 | self.linewidth.stroke = x 329 | 330 | 331 | class Line(Geom): 332 | def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)): 333 | Geom.__init__(self) 334 | self.start = start 335 | self.end = end 336 | self.linewidth = LineWidth(1) 337 | self.add_attr(self.linewidth) 338 | 339 | def render1(self): 340 | glBegin(GL_LINES) 341 | glVertex2f(*self.start) 342 | glVertex2f(*self.end) 343 | glEnd() 344 | 345 | 346 | class Image(Geom): 347 | def __init__(self, fname, width, height): 348 | Geom.__init__(self) 349 | self.width = width 350 | self.height = height 351 | img = pyglet.image.load(fname) 352 | self.img = img 353 | self.flip = False 354 | 355 | def render1(self): 356 | self.img.blit(-self.width/2, -self.height/2, width=self.width, height=self.height) 357 | 358 | # ================================================================ 359 | 360 | 361 | class SimpleImageViewer(object): 362 | def __init__(self, display=None): 363 | 
self.window = None 364 | self.isopen = False 365 | self.display = display 366 | 367 | def imshow(self, arr): 368 | if self.window is None: 369 | height, width, channels = arr.shape 370 | self.window = pyglet.window.Window(width=width, height=height, display=self.display) 371 | self.width = width 372 | self.height = height 373 | self.isopen = True 374 | assert arr.shape == (self.height, self.width, 3), "You passed in an image with the wrong number shape" 375 | image = pyglet.image.ImageData(self.width, self.height, 'RGB', arr.tobytes(), pitch=self.width * -3) 376 | self.window.clear() 377 | self.window.switch_to() 378 | self.window.dispatch_events() 379 | image.blit(0,0) 380 | self.window.flip() 381 | 382 | def close(self): 383 | if self.isopen: 384 | self.window.close() 385 | self.isopen = False 386 | 387 | def __del__(self): 388 | self.close() 389 | -------------------------------------------------------------------------------- /alg/muti_ptf_ppo/ppo_add_entropy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from alg.optimizer import Optimizer 4 | 5 | 6 | class PPO: 7 | def __init__(self, n_actions, n_features, args, SESS, logger, index): 8 | self.index = index 9 | self.args = args 10 | self.n_actions = n_actions 11 | self.n_features = n_features 12 | self.logger = logger 13 | self.learning_step = 0 14 | 15 | self.obs = tf.placeholder(tf.float32, [None, self.n_features], 's') 16 | 17 | self.act_probs, self.policy_param = self.build_actor_net(self.args['policy'] + "_" + str(self.index)) 18 | self.o_act_probs, self.o_policy_param = self.build_actor_net(self.args['old_policy'] + "_" + str(self.index)) 19 | self.v_preds, self.v_param = self.build_critic_net('critic' + "_" + str(self.index)) 20 | 21 | if self.args['continuous_action']: 22 | self.sample_action = tf.squeeze(self.act_probs.sample(1), axis=0) 23 | else: 24 | self.act_stochastic = tf.multinomial(tf.log(self.act_probs), num_samples=1) 25 | self.act_stochastic = tf.reshape(self.act_stochastic, shape=[-1]) 26 | self.act_deterministic = tf.argmax(self.act_probs, axis=1) 27 | 28 | self.replace_op = [tf.assign(t, e) for t, e in zip(self.o_policy_param, self.policy_param)] 29 | 30 | opt = Optimizer(args['optimizer'], args['learning_rate_a']) 31 | self.optimizer = opt.get_optimizer() 32 | opt_c = Optimizer(args['optimizer'], args['learning_rate_c']) 33 | self.optimizer_c = opt_c.get_optimizer() 34 | 35 | with tf.variable_scope('train_inp' + "_" + str(self.index)): 36 | if self.args['continuous_action']: 37 | self.actions = tf.placeholder(tf.float32, [None, self.n_actions], 'action') 38 | self.mu = tf.placeholder(tf.float32, [None, self.n_actions], 'input_mu') 39 | self.sigma = tf.placeholder(tf.float32, [None, self.n_actions], 'input_sigma') 40 | else: 41 | self.actions = tf.placeholder(dtype=tf.float32, shape=[None, self.n_actions], name='actions') 42 | self.s_a_prob = tf.placeholder(dtype=tf.float32, shape=[None, self.n_actions], name='s_a_prob') 43 | self.rewards = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='rewards') 44 | self.v_preds_next = tf.placeholder(dtype=tf.float32, shape=[None], name='v_preds_next') 45 | self.gaes = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='gaes') 46 | self.term = tf.placeholder(dtype=tf.float32, shape=[None], name='term') 47 | self.e = tf.placeholder(tf.float32, (), 'e') 48 | 49 | self.build_loss() 50 | 51 | self.sess = SESS 52 | 53 | def build_actor_net(self, scope, trainable=True): 54 | 
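# Actor network: two ReLU hidden layers. With continuous actions it outputs a Normal(mu, sigma)
# distribution (mu squashed by tanh and scaled by action_clip); otherwise it outputs softmax
# probabilities over the n_actions discrete actions.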
with tf.variable_scope(scope): 55 | layer_1 = tf.layers.dense(inputs=self.obs, units=self.args['n_layer_a_1'], activation=tf.nn.relu, trainable=trainable) 56 | layer_2 = tf.layers.dense(inputs=layer_1, units=self.args['n_layer_a_2'], activation=tf.nn.relu, 57 | trainable=trainable) 58 | if self.args['continuous_action']: 59 | mu = self.args['action_clip'] * tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.tanh, trainable=trainable) 60 | sigma = tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.softplus, trainable=trainable) 61 | act_probs = tf.distributions.Normal(loc=mu, scale=sigma + 1e-9) 62 | else: 63 | act_probs = tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.softmax) 64 | params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope) 65 | return act_probs, params 66 | 67 | def build_critic_net(self, scope): 68 | with tf.variable_scope(scope): 69 | layer_1 = tf.layers.dense(inputs=self.obs, units=self.args['n_layer_c_1'], activation=tf.nn.relu) 70 | layer_2 = tf.layers.dense(inputs=layer_1, units=self.args['n_layer_c_2'], activation=tf.nn.relu) 71 | v_preds = tf.layers.dense(inputs=layer_2, units=1, activation=None) 72 | params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope) 73 | return v_preds, params 74 | 75 | def build_loss(self): 76 | with tf.variable_scope('update_critic' + "_" + str(self.index)): 77 | self.advantage = self.rewards - self.v_preds 78 | self.c_loss = tf.reduce_mean(tf.square(self.advantage)) 79 | self.train_c_op = self.optimizer_c.minimize(self.c_loss, var_list=self.v_param) 80 | 81 | with tf.variable_scope('update_actor' + "_" + str(self.index)): 82 | if self.args['continuous_action']: 83 | act_probs = self.act_probs.prob(self.actions) 84 | act_probs_old = self.o_act_probs.prob(self.actions) 85 | entropy = self.act_probs.entropy() 86 | otherNormal = tf.distributions.Normal(self.mu, self.sigma) 87 | otherEntroy = otherNormal.cross_entropy(self.act_probs) 88 | else: 89 | act_probs = self.act_probs * self.actions #tf.one_hot(indices=self.actions, depth=self.act_probs.shape[1]) 90 | act_probs = tf.reduce_sum(act_probs, axis=1) 91 | # probabilities of actions which agent took with old policy 92 | act_probs_old = self.o_act_probs * self.actions #tf.one_hot(indices=self.actions, depth=self.o_act_probs.shape[1]) 93 | act_probs_old = tf.reduce_sum(act_probs_old, axis=1) 94 | entropy = -tf.reduce_sum(self.act_probs * 95 | tf.log(tf.clip_by_value(self.act_probs, 1e-9, 1.0)), axis=1) 96 | #otherEntroy = -self.s_a_prob * tf.log(self.act_probs + 1e-9) 97 | otherEntroy = -self.s_a_prob * tf.log(tf.clip_by_value(self.act_probs, 1e-9, 1.0)) 98 | 99 | with tf.variable_scope('loss/clip' + "_" + str(self.index)): 100 | # ratios = tf.divide(act_probs, act_probs_old) 101 | ratios = tf.exp(tf.log(act_probs) - tf.log(act_probs_old)) 102 | clipped_ratios = tf.clip_by_value(ratios, clip_value_min=1 - self.args['clip_value'], 103 | clip_value_max=1 + self.args['clip_value']) 104 | loss_clip = tf.minimum(tf.multiply(self.gaes, ratios), tf.multiply(self.gaes, clipped_ratios)) 105 | self.loss_clip = tf.reduce_mean(loss_clip) 106 | 107 | self.entropy = self.args['c2'] * tf.reduce_mean(entropy) # mean of entropy of pi(obs) 108 | 109 | t = tf.reshape(self.term, shape=[-1, 1]) 110 | entropyTS = tf.reduce_sum(otherEntroy, axis=1, 111 | keepdims=True) 112 | weight = 0.5 + tf.tanh(3 - self.args['c3'] * self.e) / 2 113 | entropyTS = entropyTS * weight * self.args['c1'] 114 | self.entropyTS = 
tf.reduce_mean(entropyTS) 115 | 116 | self.a_loss = -(self.loss_clip + self.entropy) + self.entropyTS 117 | self.train_a_op = self.optimizer.minimize(self.a_loss, var_list=self.policy_param) 118 | 119 | def choose_action(self, obs): 120 | obs = np.array(obs) 121 | obs = obs[np.newaxis, :] 122 | if self.args['continuous_action']: 123 | actions, v_preds = self.sess.run([self.sample_action, self.v_preds], {self.obs: obs}) 124 | return np.clip(actions[0], -self.args['action_clip'], self.args['action_clip']) 125 | else: 126 | if self.args['stochastic']: 127 | actions, v_preds, p = self.sess.run([self.act_stochastic, self.v_preds, self.act_probs], feed_dict={self.obs: obs}) 128 | action = actions[0] 129 | action_one_hot = np.zeros(self.n_actions) 130 | action_one_hot[action] = 1 131 | return action_one_hot 132 | else: 133 | actions, v_preds = self.sess.run([self.act_deterministic, self.v_preds], feed_dict={self.obs: obs}) 134 | action = actions[0] 135 | action_one_hot = np.zeros(self.n_actions) 136 | action_one_hot[action] = 1 137 | return action_one_hot 138 | 139 | def choose_hold_action(self, obs): 140 | return np.zeros(self.n_actions) 141 | 142 | def choose_deterministic_action(self, obs, agent_id=0): 143 | actions = self.sess.run([self.act_deterministic], feed_dict={self.obs: obs})[0] 144 | action_one_hots = [] 145 | for i in range(len(actions)): 146 | action = actions[i] 147 | action_one_hot = np.zeros(self.n_actions) 148 | action_one_hot[action] = 1 149 | action_one_hots.append(action_one_hot) 150 | return action_one_hots 151 | 152 | def choose_acton_prob(self, observation, action): 153 | observation = np.array(observation) 154 | observation = observation[np.newaxis, :] 155 | if self.args['continuous_action']: 156 | actions_value = self.sess.run(self.act_probs, feed_dict={self.obs: observation}) 157 | actions_value = [actions_value[0][0], actions_value[1][0]] 158 | else: 159 | actions_value = self.sess.run(self.act_probs, feed_dict={self.obs: observation})[0] 160 | return actions_value 161 | 162 | def get_v(self, s): 163 | s = np.array(s) 164 | return self.sess.run(self.v_preds, {self.obs: s[np.newaxis, :]})[0, 0] 165 | 166 | def update(self, actor, s, a, r, options, terms, epi, agentid): 167 | self.sess.run(self.replace_op) 168 | 169 | source_actor_prob = [] 170 | mu = [] 171 | sigma = [] 172 | for i, o in enumerate(options): 173 | o = int(o) 174 | if o == agentid: 175 | terms[i] = 0 176 | if self.args['continuous_action']: 177 | a_prob = actor[o].choose_acton_prob(s[i], a[i]) 178 | mu.append(a_prob[0]) 179 | sigma.append(a_prob[1]) 180 | else: 181 | if o == agentid: 182 | a_prob = actor[o].choose_hold_action(s[i]) 183 | else: 184 | a_prob = actor[o].choose_acton_prob(s[i], a[i]) 185 | source_actor_prob.append(a_prob) 186 | adv = self.sess.run(self.advantage, {self.obs: s, self.rewards: r}) 187 | if self.args['continuous_action']: 188 | for i in range(self.args['epi_train_times']): 189 | _, a_loss, clip, entropy, entropyTS = self.sess.run( 190 | [self.train_a_op, self.a_loss, self.loss_clip, self.entropy, self.entropyTS], 191 | {self.obs: s, self.actions: a, self.gaes: adv, self.term: terms, 192 | self.mu: mu, self.sigma: sigma, self.e: epi}) 193 | __, c_loss = self.sess.run([self.train_c_op, self.c_loss], {self.obs: s, self.rewards: r}) 194 | self.logger.write_tb_log('a_loss_' + str(self.index), a_loss, self.learning_step) 195 | self.logger.write_tb_log('c_loss_' + str(self.index), c_loss, self.learning_step) 196 | self.logger.write_tb_log('clip_' + str(self.index), clip, 
self.learning_step) 197 | self.logger.write_tb_log('entropy_' + str(self.index), entropy, self.learning_step) 198 | self.logger.write_tb_log('entropyTS_' + str(self.index), entropyTS, self.learning_step) 199 | self.learning_step += 1 200 | else: 201 | for i in range(self.args['epi_train_times']): 202 | _, a_loss, clip, entropy, entropyTS = self.sess.run( 203 | [self.train_a_op, self.a_loss, self.loss_clip, self.entropy, self.entropyTS], 204 | {self.obs: s, self.actions: a, self.gaes: adv, self.term: terms, 205 | self.s_a_prob: source_actor_prob, self.e: epi}) 206 | __, c_loss = self.sess.run([self.train_c_op, self.c_loss], {self.obs: s, self.rewards: r}) 207 | self.logger.write_tb_log('a_loss_' + str(self.index), a_loss, self.learning_step) 208 | self.logger.write_tb_log('c_loss_' + str(self.index), c_loss, self.learning_step) 209 | self.logger.write_tb_log('clip_' + str(self.index), clip, self.learning_step) 210 | self.logger.write_tb_log('entropy_' + str(self.index), entropy, self.learning_step) 211 | self.logger.write_tb_log('entropyTS_' + str(self.index), entropyTS, self.learning_step) 212 | self.learning_step += 1 213 | 214 | def load_model(self, path): 215 | saver = tf.train.Saver(self.policy_param) 216 | saver.restore(self.sess, path + ".ckpt") 217 | 218 | def save_model(self, path): 219 | saver = tf.train.Saver(self.policy_param) 220 | saver.save(self.sess, path + ".ckpt") 221 | -------------------------------------------------------------------------------- /alg/maddpg/common/tf_util.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import numpy as np 3 | import os 4 | import tensorflow as tf 5 | 6 | def sum(x, axis=None, keepdims=False): 7 | return tf.reduce_sum(x, axis=None if axis is None else [axis], keep_dims = keepdims) 8 | def mean(x, axis=None, keepdims=False): 9 | return tf.reduce_mean(x, axis=None if axis is None else [axis], keep_dims = keepdims) 10 | def var(x, axis=None, keepdims=False): 11 | meanx = mean(x, axis=axis, keepdims=keepdims) 12 | return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims) 13 | def std(x, axis=None, keepdims=False): 14 | return tf.sqrt(var(x, axis=axis, keepdims=keepdims)) 15 | def max(x, axis=None, keepdims=False): 16 | return tf.reduce_max(x, axis=None if axis is None else [axis], keep_dims = keepdims) 17 | def min(x, axis=None, keepdims=False): 18 | return tf.reduce_min(x, axis=None if axis is None else [axis], keep_dims = keepdims) 19 | def concatenate(arrs, axis=0): 20 | return tf.concat(axis=axis, values=arrs) 21 | def argmax(x, axis=None): 22 | return tf.argmax(x, axis=axis) 23 | def softmax(x, axis=None): 24 | return tf.nn.softmax(x, axis=axis) 25 | 26 | # ================================================================ 27 | # Misc 28 | # ================================================================ 29 | 30 | 31 | def is_placeholder(x): 32 | return type(x) is tf.Tensor and len(x.op.inputs) == 0 33 | 34 | # ================================================================ 35 | # Inputs 36 | # ================================================================ 37 | 38 | 39 | class TfInput(object): 40 | def __init__(self, name="(unnamed)"): 41 | """Generalized Tensorflow placeholder. The main differences are: 42 | - possibly uses multiple placeholders internally and returns multiple values 43 | - can apply light postprocessing to the value feed to placeholder. 
44 | """ 45 | self.name = name 46 | 47 | def get(self): 48 | """Return the tf variable(s) representing the possibly postprocessed value 49 | of placeholder(s). 50 | """ 51 | raise NotImplemented() 52 | 53 | def make_feed_dict(data): 54 | """Given data input it to the placeholder(s).""" 55 | raise NotImplemented() 56 | 57 | 58 | class PlacholderTfInput(TfInput): 59 | def __init__(self, placeholder): 60 | """Wrapper for regular tensorflow placeholder.""" 61 | super().__init__(placeholder.name) 62 | self._placeholder = placeholder 63 | 64 | def get(self): 65 | return self._placeholder 66 | 67 | def make_feed_dict(self, data): 68 | return {self._placeholder: data} 69 | 70 | 71 | class BatchInput(PlacholderTfInput): 72 | def __init__(self, shape, dtype=tf.float32, name=None): 73 | """Creates a placeholder for a batch of tensors of a given shape and dtype 74 | 75 | Parameters 76 | ---------- 77 | shape: [int] 78 | shape of a single elemenet of the batch 79 | dtype: tf.dtype 80 | number representation used for tensor contents 81 | name: str 82 | name of the underlying placeholder 83 | """ 84 | super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name)) 85 | 86 | 87 | class Uint8Input(PlacholderTfInput): 88 | def __init__(self, shape, name=None): 89 | """Takes input in uint8 format which is cast to float32 and divided by 255 90 | before passing it to the model. 91 | 92 | On GPU this ensures lower data transfer times. 93 | 94 | Parameters 95 | ---------- 96 | shape: [int] 97 | shape of the tensor. 98 | name: str 99 | name of the underlying placeholder 100 | """ 101 | 102 | super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name)) 103 | self._shape = shape 104 | self._output = tf.cast(super().get(), tf.float32) / 255.0 105 | 106 | def get(self): 107 | return self._output 108 | 109 | 110 | def ensure_tf_input(thing): 111 | """Takes either tf.placeholder of TfInput and outputs equivalent TfInput""" 112 | if isinstance(thing, TfInput): 113 | return thing 114 | elif is_placeholder(thing): 115 | return PlacholderTfInput(thing) 116 | else: 117 | raise ValueError("Must be a placeholder or TfInput") 118 | 119 | # ================================================================ 120 | # Mathematical utils 121 | # ================================================================ 122 | 123 | 124 | def huber_loss(x, delta=1.0): 125 | """Reference: https://en.wikipedia.org/wiki/Huber_loss""" 126 | return tf.where( 127 | tf.abs(x) < delta, 128 | tf.square(x) * 0.5, 129 | delta * (tf.abs(x) - 0.5 * delta) 130 | ) 131 | 132 | # ================================================================ 133 | # Optimizer utils 134 | # ================================================================ 135 | 136 | 137 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 138 | """Minimized `objective` using `optimizer` w.r.t. 
variables in 139 | `var_list` while ensure the norm of the gradients for each 140 | variable is clipped to `clip_val` 141 | """ 142 | if clip_val is None: 143 | return optimizer.minimize(objective, var_list=var_list) 144 | else: 145 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 146 | for i, (grad, var) in enumerate(gradients): 147 | if grad is not None: 148 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 149 | return optimizer.apply_gradients(gradients) 150 | 151 | 152 | # ================================================================ 153 | # Global session 154 | # ================================================================ 155 | 156 | def get_session(): 157 | """Returns recently made Tensorflow session""" 158 | return tf.get_default_session() 159 | 160 | 161 | def make_session(num_cpu): 162 | """Returns a session that will use CPU's only""" 163 | tf_config = tf.ConfigProto( 164 | inter_op_parallelism_threads=num_cpu, 165 | intra_op_parallelism_threads=num_cpu) 166 | return tf.Session(config=tf_config) 167 | 168 | 169 | def single_threaded_session(): 170 | """Returns a session which will only use a single CPU""" 171 | return make_session(1) 172 | 173 | 174 | ALREADY_INITIALIZED = set() 175 | 176 | 177 | def initialize(): 178 | """Initialize all the uninitialized variables in the global scope.""" 179 | new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED 180 | get_session().run(tf.variables_initializer(new_variables)) 181 | ALREADY_INITIALIZED.update(new_variables) 182 | 183 | 184 | # ================================================================ 185 | # Scopes 186 | # ================================================================ 187 | 188 | 189 | def scope_vars(scope, trainable_only=False): 190 | """ 191 | Get variables inside a scope 192 | The scope can be specified as a string 193 | 194 | Parameters 195 | ---------- 196 | scope: str or VariableScope 197 | scope in which the variables reside. 198 | trainable_only: bool 199 | whether or not to return only the variables that were marked as trainable. 200 | 201 | Returns 202 | ------- 203 | vars: [tf.Variable] 204 | list of variables in `scope`. 205 | """ 206 | return tf.get_collection( 207 | tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES, 208 | scope=scope if isinstance(scope, str) else scope.name 209 | ) 210 | 211 | 212 | def scope_name(): 213 | """Returns the name of current scope as a string, e.g. 
deepq/q_func""" 214 | return tf.get_variable_scope().name 215 | 216 | 217 | def absolute_scope_name(relative_scope_name): 218 | """Appends parent scope name to `relative_scope_name`""" 219 | return scope_name() + "/" + relative_scope_name 220 | 221 | # ================================================================ 222 | # Saving variables 223 | # ================================================================ 224 | 225 | 226 | def load_state(fname, saver=None): 227 | """Load all the variables to the current session from the location """ 228 | if saver is None: 229 | saver = tf.train.Saver() 230 | saver.restore(get_session(), fname) 231 | return saver 232 | 233 | 234 | def save_state(fname, saver=None): 235 | """Save all the variables in the current session to the location """ 236 | os.makedirs(os.path.dirname(fname), exist_ok=True) 237 | if saver is None: 238 | saver = tf.train.Saver() 239 | saver.save(get_session(), fname + ".ckpt") 240 | return saver 241 | 242 | # ================================================================ 243 | # Theano-like Function 244 | # ================================================================ 245 | 246 | 247 | def function(inputs, outputs, updates=None, givens=None): 248 | """Just like Theano function. Take a bunch of tensorflow placeholders and expersions 249 | computed based on those placeholders and produces f(inputs) -> outputs. Function f takes 250 | values to be feed to the inputs placeholders and produces the values of the experessions 251 | in outputs. 252 | 253 | Input values can be passed in the same order as inputs or can be provided as kwargs based 254 | on placeholder name (passed to constructor or accessible via placeholder.op.name). 255 | 256 | Example: 257 | x = tf.placeholder(tf.int32, (), name="x") 258 | y = tf.placeholder(tf.int32, (), name="y") 259 | z = 3 * x + 2 * y 260 | lin = function([x, y], z, givens={y: 0}) 261 | 262 | with single_threaded_session(): 263 | initialize() 264 | 265 | assert lin(2) == 6 266 | assert lin(x=3) == 9 267 | assert lin(2, 2) == 10 268 | assert lin(x=2, y=3) == 12 269 | 270 | Parameters 271 | ---------- 272 | inputs: [tf.placeholder or TfInput] 273 | list of input arguments 274 | outputs: [tf.Variable] or tf.Variable 275 | list of outputs or a single output to be returned from function. Returned 276 | value will also have the same shape. 
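updates: [tf.Operation]
    extra ops grouped together and run on every call; their outputs are discarded.
givens: dict
    default values fed for any input placeholder that is not supplied at call time.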
277 | """ 278 | if isinstance(outputs, list): 279 | return _Function(inputs, outputs, updates, givens=givens) 280 | elif isinstance(outputs, (dict, collections.OrderedDict)): 281 | f = _Function(inputs, outputs.values(), updates, givens=givens) 282 | return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), f(*args, **kwargs))) 283 | else: 284 | f = _Function(inputs, [outputs], updates, givens=givens) 285 | return lambda *args, **kwargs: f(*args, **kwargs)[0] 286 | 287 | 288 | class _Function(object): 289 | def __init__(self, inputs, outputs, updates, givens, check_nan=False): 290 | for inpt in inputs: 291 | if not issubclass(type(inpt), TfInput): 292 | assert len(inpt.op.inputs) == 0, "inputs should all be placeholders of rl_algs.common.TfInput" 293 | self.inputs = inputs 294 | updates = updates or [] 295 | self.update_group = tf.group(*updates) 296 | self.outputs_update = list(outputs) + [self.update_group] 297 | self.givens = {} if givens is None else givens 298 | self.check_nan = check_nan 299 | 300 | def _feed_input(self, feed_dict, inpt, value): 301 | if issubclass(type(inpt), TfInput): 302 | feed_dict.update(inpt.make_feed_dict(value)) 303 | elif is_placeholder(inpt): 304 | feed_dict[inpt] = value 305 | 306 | def __call__(self, *args, **kwargs): 307 | assert len(args) <= len(self.inputs), "Too many arguments provided" 308 | feed_dict = {} 309 | # Update the args 310 | for inpt, value in zip(self.inputs, args): 311 | self._feed_input(feed_dict, inpt, value) 312 | # Update the kwargs 313 | kwargs_passed_inpt_names = set() 314 | for inpt in self.inputs[len(args):]: 315 | inpt_name = inpt.name.split(':')[0] 316 | inpt_name = inpt_name.split('/')[-1] 317 | assert inpt_name not in kwargs_passed_inpt_names, \ 318 | "this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(inpt_name) 319 | if inpt_name in kwargs: 320 | kwargs_passed_inpt_names.add(inpt_name) 321 | self._feed_input(feed_dict, inpt, kwargs.pop(inpt_name)) 322 | else: 323 | assert inpt in self.givens, "Missing argument " + inpt_name 324 | assert len(kwargs) == 0, "Function got extra arguments " + str(list(kwargs.keys())) 325 | # Update feed dict with givens. 
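# Entries in `givens` act as per-placeholder defaults: the feed_dict.get()
# fallback below is used only when the caller did not already feed that
# placeholder.  This is why, in the docstring example above,
# lin = function([x, y], z, givens={y: 0}) gives lin(2) == 6 (y falls back to 0)
# but lin(2, 2) == 10 (the explicit y overrides the given value).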
326 | for inpt in self.givens: 327 | feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt]) 328 | results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] 329 | if self.check_nan: 330 | if any(np.isnan(r).any() for r in results): 331 | raise RuntimeError("Nan detected") 332 | return results 333 | -------------------------------------------------------------------------------- /alg/sharing_multi_ppo/ppo_add_entropy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from alg.optimizer import Optimizer 4 | 5 | 6 | class PPO: 7 | def __init__(self, n_actions, n_features, n_agents, args, SESS, logger): 8 | self.args = args 9 | self.n_actions = n_actions 10 | self.n_features = n_features + n_agents 11 | self.n_agents = n_agents 12 | self.logger = logger 13 | self.learning_step = 0 14 | 15 | self.obs = tf.placeholder(tf.float32, [None, self.n_features], 's') 16 | 17 | self.act_probs, self.policy_param = self.build_actor_net(self.args['policy']) 18 | self.o_act_probs, self.o_policy_param = self.build_actor_net(self.args['old_policy']) 19 | self.v_preds, self.v_param = self.build_critic_net('critic') 20 | 21 | if self.args['continuous_action']: 22 | self.sample_action = tf.squeeze(self.act_probs.sample(1), axis=0) 23 | else: 24 | self.act_stochastic = tf.multinomial(tf.log(self.act_probs), num_samples=1) 25 | self.act_stochastic = tf.reshape(self.act_stochastic, shape=[-1]) 26 | self.act_deterministic = tf.argmax(self.act_probs, axis=1) 27 | 28 | self.replace_op = [tf.assign(t, e) for t, e in zip(self.o_policy_param, self.policy_param)] 29 | 30 | opt = Optimizer(args['optimizer'], args['learning_rate_a']) 31 | self.optimizer = opt.get_optimizer() 32 | opt_c = Optimizer(args['optimizer'], args['learning_rate_c']) 33 | self.optimizer_c = opt_c.get_optimizer() 34 | 35 | with tf.variable_scope('train_inp'): 36 | if self.args['continuous_action']: 37 | self.actions = tf.placeholder(tf.float32, [None, self.n_actions], 'action') 38 | self.mu = tf.placeholder(tf.float32, [None, self.n_actions], 'input_mu') 39 | self.sigma = tf.placeholder(tf.float32, [None, self.n_actions], 'input_sigma') 40 | else: 41 | self.actions = tf.placeholder(dtype=tf.float32, shape=[None, self.n_actions], name='actions') 42 | self.s_a_prob = tf.placeholder(dtype=tf.float32, shape=[None, self.n_actions], name='s_a_prob') 43 | self.rewards = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='rewards') 44 | self.v_preds_next = tf.placeholder(dtype=tf.float32, shape=[None], name='v_preds_next') 45 | self.gaes = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='gaes') 46 | self.term = tf.placeholder(dtype=tf.float32, shape=[None], name='term') 47 | self.e = tf.placeholder(tf.float32, (), 'e') 48 | 49 | self.build_loss() 50 | 51 | self.sess = SESS 52 | 53 | def build_actor_net(self, scope, trainable=True): 54 | with tf.variable_scope(scope): 55 | layer_1 = tf.layers.dense(inputs=self.obs, units=self.args['n_layer_a_1'], activation=tf.nn.relu, trainable=trainable) 56 | layer_2 = tf.layers.dense(inputs=layer_1, units=self.args['n_layer_a_2'], activation=tf.nn.relu, 57 | trainable=trainable) 58 | if self.args['continuous_action']: 59 | mu = self.args['action_clip'] * tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.tanh, trainable=trainable) 60 | sigma = tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.softplus, trainable=trainable) 61 | act_probs = 
tf.distributions.Normal(loc=mu, scale=sigma + 1e-9) 62 | else: 63 | act_probs = tf.layers.dense(inputs=layer_2, units=self.n_actions, activation=tf.nn.softmax) 64 | params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope) 65 | return act_probs, params 66 | 67 | def build_critic_net(self, scope): 68 | with tf.variable_scope(scope): 69 | layer_1 = tf.layers.dense(inputs=self.obs, units=self.args['n_layer_c_1'], activation=tf.nn.relu) 70 | layer_2 = tf.layers.dense(inputs=layer_1, units=self.args['n_layer_c_2'], activation=tf.nn.relu) 71 | v_preds = tf.layers.dense(inputs=layer_2, units=1, activation=None) 72 | params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope) 73 | return v_preds, params 74 | 75 | def build_loss(self): 76 | with tf.variable_scope('update_critic'): 77 | self.advantage = self.rewards - self.v_preds 78 | self.c_loss = tf.reduce_mean(tf.square(self.advantage)) 79 | self.train_c_op = self.optimizer_c.minimize(self.c_loss, var_list=self.v_param) 80 | 81 | with tf.variable_scope('update_actor'): 82 | if self.args['continuous_action']: 83 | act_probs = self.act_probs.prob(self.actions) 84 | act_probs_old = self.o_act_probs.prob(self.actions) 85 | entropy = self.act_probs.entropy() 86 | otherNormal = tf.distributions.Normal(self.mu, self.sigma) 87 | otherEntroy = otherNormal.cross_entropy(self.act_probs) 88 | else: 89 | act_probs = self.act_probs * self.actions #tf.one_hot(indices=self.actions, depth=self.act_probs.shape[1]) 90 | act_probs = tf.reduce_sum(act_probs, axis=1) 91 | # probabilities of actions which agent took with old policy 92 | act_probs_old = self.o_act_probs * self.actions #tf.one_hot(indices=self.actions, depth=self.o_act_probs.shape[1]) 93 | act_probs_old = tf.reduce_sum(act_probs_old, axis=1) 94 | entropy = -tf.reduce_sum(self.act_probs * 95 | tf.log(tf.clip_by_value(self.act_probs, 1e-9, 1.0)), axis=1) 96 | #otherEntroy = -self.s_a_prob * tf.log(self.act_probs + 1e-9) 97 | otherEntroy = -self.s_a_prob * tf.log(tf.clip_by_value(self.act_probs, 1e-9, 1.0)) 98 | 99 | with tf.variable_scope('loss/clip'): 100 | # ratios = tf.divide(act_probs, act_probs_old) 101 | ratios = tf.exp(tf.log(act_probs) - tf.log(act_probs_old)) 102 | clipped_ratios = tf.clip_by_value(ratios, clip_value_min=1 - self.args['clip_value'], 103 | clip_value_max=1 + self.args['clip_value']) 104 | loss_clip = tf.minimum(tf.multiply(self.gaes, ratios), tf.multiply(self.gaes, clipped_ratios)) 105 | self.loss_clip = tf.reduce_mean(loss_clip) 106 | 107 | self.entropy = self.args['c2'] * tf.reduce_mean(entropy) # mean of entropy of pi(obs) 108 | 109 | t = tf.reshape(self.term, shape=[-1, 1]) 110 | entropyTS = tf.reduce_sum(otherEntroy, axis=1, 111 | keepdims=True) 112 | weight = 0.5 + tf.tanh(3 - self.args['c3'] * self.e) / 2 113 | entropyTS = entropyTS * weight * self.args['c1'] 114 | self.entropyTS = tf.reduce_mean(entropyTS) 115 | 116 | self.a_loss = -(self.loss_clip + self.entropy) + self.entropyTS 117 | self.train_a_op = self.optimizer.minimize(self.a_loss, var_list=self.policy_param) 118 | 119 | def get_agent_obs(self, obs, agent_id=0): 120 | if type(agent_id) is int: 121 | agent_id_arr = [agent_id] * len(obs) 122 | elif type(agent_id) is list: 123 | agent_id_arr = agent_id 124 | else: 125 | raise Exception('the agent_id field must be type of int or list') 126 | agent_one_hot = np.eye(self.n_agents)[agent_id_arr] 127 | obs = np.hstack((agent_one_hot, obs)) 128 | return obs 129 | 130 | def choose_action(self, obs, agent_id=0): 131 | obs = np.array(obs) 132 | obs = 
obs[np.newaxis, :] 133 | obs = self.get_agent_obs(obs, agent_id) 134 | if self.args['continuous_action']: 135 | actions, v_preds = self.sess.run([self.sample_action, self.v_preds], {self.obs: obs}) 136 | return np.clip(actions[0], -self.args['action_clip'], self.args['action_clip']) 137 | else: 138 | if self.args['stochastic']: 139 | actions, v_preds, p = self.sess.run([self.act_stochastic, self.v_preds, self.act_probs], feed_dict={self.obs: obs}) 140 | action = actions[0] 141 | action_one_hot = np.zeros(self.n_actions) 142 | action_one_hot[action] = 1 143 | return action_one_hot 144 | else: 145 | actions, v_preds = self.sess.run([self.act_deterministic, self.v_preds], feed_dict={self.obs: obs}) 146 | action = actions[0] 147 | action_one_hot = np.zeros(self.n_actions) 148 | action_one_hot[action] = 1 149 | return action_one_hot 150 | 151 | def choose_hold_action(self, obs): 152 | return np.zeros(self.n_actions) 153 | 154 | def choose_deterministic_action(self, obs, agent_id=0): 155 | obs = self.get_agent_obs(obs, agent_id) 156 | actions = self.sess.run([self.act_deterministic], feed_dict={self.obs: obs})[0] 157 | action_one_hots = [] 158 | for i in range(len(actions)): 159 | action = actions[i] 160 | action_one_hot = np.zeros(self.n_actions) 161 | action_one_hot[action] = 1 162 | action_one_hots.append(action_one_hot) 163 | return action_one_hots 164 | 165 | def choose_acton_prob(self, observation, agent_id=0): 166 | observation = np.array(observation) 167 | observation = observation[np.newaxis, :] 168 | observation = self.get_agent_obs(observation, agent_id) 169 | if self.args['continuous_action']: 170 | actions_value = self.sess.run(self.act_probs, feed_dict={self.obs: observation}) 171 | actions_value = [actions_value[0][0], actions_value[1][0]] 172 | else: 173 | actions_value = self.sess.run(self.act_probs, feed_dict={self.obs: observation})[0] 174 | return actions_value 175 | 176 | def get_v(self, s, agent_id=0): 177 | obs = np.array(s) 178 | obs = obs[np.newaxis, :] 179 | obs = self.get_agent_obs(obs, agent_id) 180 | return self.sess.run(self.v_preds, {self.obs: obs})[0, 0] 181 | 182 | def update(self, actor, s, a, r, options, terms, epi, agentid): 183 | self.sess.run(self.replace_op) 184 | 185 | source_actor_prob = [] 186 | mu = [] 187 | sigma = [] 188 | for i, o in enumerate(options): 189 | o = actor[o] 190 | if o == agentid[i]: 191 | terms[i] = 0 192 | if self.args['continuous_action']: 193 | a_prob = self.choose_acton_prob(s[i], o) 194 | mu.append(a_prob[0]) 195 | sigma.append(a_prob[1]) 196 | else: 197 | if o == agentid[i]: 198 | a_prob = self.choose_hold_action(s[i]) 199 | else: 200 | a_prob = self.choose_acton_prob(s[i], o) 201 | source_actor_prob.append(a_prob) 202 | s = self.get_agent_obs(s, agentid) 203 | adv = self.sess.run(self.advantage, {self.obs: s, self.rewards: r}) 204 | if self.args['continuous_action']: 205 | for i in range(self.args['epi_train_times']): 206 | _, a_loss, clip, entropy, entropyTS = self.sess.run( 207 | [self.train_a_op, self.a_loss, self.loss_clip, self.entropy, self.entropyTS], 208 | {self.obs: s, self.actions: a, self.gaes: adv, self.term: terms, 209 | self.mu: mu, self.sigma: sigma, self.e: epi}) 210 | __, c_loss = self.sess.run([self.train_c_op, self.c_loss], {self.obs: s, self.rewards: r}) 211 | self.logger.write_tb_log('a_loss', a_loss, self.learning_step) 212 | self.logger.write_tb_log('c_loss', c_loss, self.learning_step) 213 | self.logger.write_tb_log('clip', clip, self.learning_step) 214 | self.logger.write_tb_log('entropy', entropy, 
self.learning_step) 215 | self.logger.write_tb_log('entropyTS', entropyTS, self.learning_step) 216 | self.learning_step += 1 217 | else: 218 | for i in range(self.args['epi_train_times']): 219 | _, a_loss, clip, entropy, entropyTS = self.sess.run( 220 | [self.train_a_op, self.a_loss, self.loss_clip, self.entropy, self.entropyTS], 221 | {self.obs: s, self.actions: a, self.gaes: adv, self.term: terms, 222 | self.s_a_prob: source_actor_prob, self.e: epi}) 223 | __, c_loss = self.sess.run([self.train_c_op, self.c_loss], {self.obs: s, self.rewards: r}) 224 | self.logger.write_tb_log('a_loss', a_loss, self.learning_step) 225 | self.logger.write_tb_log('c_loss', c_loss, self.learning_step) 226 | self.logger.write_tb_log('clip', clip, self.learning_step) 227 | self.logger.write_tb_log('entropy', entropy, self.learning_step) 228 | self.logger.write_tb_log('entropyTS', entropyTS, self.learning_step) 229 | self.learning_step += 1 230 | 231 | def load_model(self, path): 232 | saver = tf.train.Saver(self.policy_param) 233 | saver.restore(self.sess, path + ".ckpt") 234 | 235 | def save_model(self, path): 236 | saver = tf.train.Saver(self.policy_param) 237 | saver.save(self.sess, path + ".ckpt") 238 | -------------------------------------------------------------------------------- /game/pacman/graphicsUtils.py: -------------------------------------------------------------------------------- 1 | # graphicsUtils.py 2 | # ---------------- 3 | # Licensing Information: You are free to use or extend these projects for 4 | # educational purposes provided that (1) you do not distribute or publish 5 | # solutions, (2) you retain this notice, and (3) you provide clear 6 | # attribution to UC Berkeley, including a link to http://ai.berkeley.edu. 7 | # 8 | # Attribution Information: The Pacman AI projects were developed at UC Berkeley. 9 | # The core projects and autograders were primarily created by John DeNero 10 | # (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 11 | # Student side autograding was added by Brad Miller, Nick Hay, and 12 | # Pieter Abbeel (pabbeel@cs.berkeley.edu). 
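# Tkinter drawing helpers for the Pacman game's graphical display.  Colors are
# plain Tk hex strings, e.g. formatColor(1, 0, 0) == '#ff0000'; the default
# background passed to begin_graphics() below is formatColor(0, 0, 0) == '#000000'.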
13 | 14 | 15 | import sys 16 | import math 17 | import random 18 | import string 19 | import time 20 | import types 21 | import tkinter 22 | 23 | _Windows = sys.platform == 'win32' # True if on Win95/98/NT 24 | 25 | _root_window = None # The root window for graphics output 26 | _canvas = None # The canvas which holds graphics 27 | _canvas_xs = None # Size of canvas object 28 | _canvas_ys = None 29 | _canvas_x = None # Current position on canvas 30 | _canvas_y = None 31 | _canvas_col = None # Current colour (set to black below) 32 | _canvas_tsize = 12 33 | _canvas_tserifs = 0 34 | 35 | 36 | def formatColor(r, g, b): 37 | return '#%02x%02x%02x' % (int(r * 255), int(g * 255), int(b * 255)) 38 | 39 | 40 | def colorToVector(color): 41 | return [int(x, 16) / 256.0 for x in [color[1:3], color[3:5], color[5:7]]] 42 | 43 | if _Windows: 44 | _canvas_tfonts = ['times new roman', 'lucida console'] 45 | else: 46 | _canvas_tfonts = ['times', 'lucidasans-24'] 47 | pass # XXX need defaults here 48 | 49 | 50 | def sleep(secs): 51 | global _root_window 52 | if _root_window == None: 53 | time.sleep(secs) 54 | else: 55 | _root_window.update_idletasks() 56 | _root_window.after(int(1000 * secs), _root_window.quit) 57 | _root_window.mainloop() 58 | 59 | 60 | def begin_graphics(width=640, height=480, color=formatColor(0, 0, 0), title=None): 61 | 62 | global _root_window, _canvas, _canvas_x, _canvas_y, _canvas_xs, _canvas_ys, _bg_color 63 | 64 | # Check for duplicate call 65 | if _root_window is not None: 66 | # Lose the window. 67 | _root_window.destroy() 68 | 69 | # Save the canvas size parameters 70 | _canvas_xs, _canvas_ys = width - 1, height - 1 71 | _canvas_x, _canvas_y = 0, _canvas_ys 72 | _bg_color = color 73 | 74 | # Create the root window 75 | _root_window = tkinter.Tk() 76 | _root_window.protocol('WM_DELETE_WINDOW', _destroy_window) 77 | _root_window.title(title or 'Graphics Window') 78 | _root_window.resizable(0, 0) 79 | 80 | # Create the canvas object 81 | try: 82 | _canvas = tkinter.Canvas(_root_window, width=width, height=height) 83 | _canvas.pack() 84 | draw_background() 85 | _canvas.update() 86 | except: 87 | _root_window = None 88 | raise 89 | 90 | # Bind to key-down and key-up events 91 | _root_window.bind("<KeyPress>", _keypress) 92 | _root_window.bind("<KeyRelease>", _keyrelease) 93 | _root_window.bind("<FocusIn>", _clear_keys) 94 | _root_window.bind("<FocusOut>", _clear_keys) 95 | _root_window.bind("<Button-1>", _leftclick) 96 | _root_window.bind("<Button-2>", _rightclick) 97 | _root_window.bind("<Button-3>", _rightclick) 98 | _root_window.bind("<Control-Button-1>", _ctrl_leftclick) 99 | _clear_keys() 100 | 101 | _leftclick_loc = None 102 | _rightclick_loc = None 103 | _ctrl_leftclick_loc = None 104 | 105 | 106 | def _leftclick(event): 107 | global _leftclick_loc 108 | _leftclick_loc = (event.x, event.y) 109 | 110 | 111 | def _rightclick(event): 112 | global _rightclick_loc 113 | _rightclick_loc = (event.x, event.y) 114 | 115 | 116 | def _ctrl_leftclick(event): 117 | global _ctrl_leftclick_loc 118 | _ctrl_leftclick_loc = (event.x, event.y) 119 | 120 | 121 | def wait_for_click(): 122 | while True: 123 | global _leftclick_loc 124 | global _rightclick_loc 125 | global _ctrl_leftclick_loc 126 | if _leftclick_loc != None: 127 | val = _leftclick_loc 128 | _leftclick_loc = None 129 | return val, 'left' 130 | if _rightclick_loc != None: 131 | val = _rightclick_loc 132 | _rightclick_loc = None 133 | return val, 'right' 134 | if _ctrl_leftclick_loc != None: 135 | val = _ctrl_leftclick_loc 136 | _ctrl_leftclick_loc = None 137 | return val, 'ctrl_left' 138 | sleep(0.05) 139 | 140 | 141 | def
draw_background(): 142 | corners = [(0, 0), (0, _canvas_ys), 143 | (_canvas_xs, _canvas_ys), (_canvas_xs, 0)] 144 | polygon(corners, _bg_color, fillColor=_bg_color, 145 | filled=True, smoothed=False) 146 | 147 | 148 | def _destroy_window(event=None): 149 | sys.exit(0) 150 | # global _root_window 151 | # _root_window.destroy() 152 | # _root_window = None 153 | # print "DESTROY" 154 | 155 | 156 | def end_graphics(): 157 | global _root_window, _canvas, _mouse_enabled 158 | try: 159 | try: 160 | sleep(1) 161 | if _root_window != None: 162 | _root_window.destroy() 163 | except SystemExit as e: 164 | print(('Ending graphics raised an exception:', e)) 165 | finally: 166 | _root_window = None 167 | _canvas = None 168 | _mouse_enabled = 0 169 | _clear_keys() 170 | 171 | 172 | def clear_screen(background=None): 173 | global _canvas_x, _canvas_y 174 | _canvas.delete('all') 175 | draw_background() 176 | _canvas_x, _canvas_y = 0, _canvas_ys 177 | 178 | 179 | def polygon(coords, outlineColor, fillColor=None, filled=1, smoothed=1, behind=0, width=1): 180 | c = [] 181 | for coord in coords: 182 | c.append(coord[0]) 183 | c.append(coord[1]) 184 | if fillColor == None: 185 | fillColor = outlineColor 186 | if filled == 0: 187 | fillColor = "" 188 | poly = _canvas.create_polygon( 189 | c, outline=outlineColor, fill=fillColor, smooth=smoothed, width=width) 190 | if behind > 0: 191 | _canvas.tag_lower(poly, behind) # Higher should be more visible 192 | return poly 193 | 194 | 195 | def square(pos, r, color, filled=1, behind=0): 196 | x, y = pos 197 | coords = [(x - r, y - r), (x + r, y - r), (x + r, y + r), (x - r, y + r)] 198 | return polygon(coords, color, color, filled, 0, behind=behind) 199 | 200 | 201 | def circle(pos, r, outlineColor, fillColor, endpoints=None, style='pieslice', width=2): 202 | x, y = pos 203 | x0, x1 = x - r - 1, x + r 204 | y0, y1 = y - r - 1, y + r 205 | if endpoints == None: 206 | e = [0, 359] 207 | else: 208 | e = list(endpoints) 209 | while e[0] > e[1]: 210 | e[1] = e[1] + 360 211 | 212 | return _canvas.create_arc(x0, y0, x1, y1, outline=outlineColor, fill=fillColor, 213 | extent=e[1] - e[0], start=e[0], style=style, width=width) 214 | 215 | 216 | def image(pos, file="../../blueghost.gif"): 217 | x, y = pos 218 | # img = PhotoImage(file=file) 219 | return _canvas.create_image(x, y, image=tkinter.PhotoImage(file=file), anchor=tkinter.NW) 220 | 221 | 222 | def refresh(): 223 | _canvas.update_idletasks() 224 | 225 | 226 | def moveCircle(id, pos, r, endpoints=None): 227 | global _canvas_x, _canvas_y 228 | 229 | x, y = pos 230 | # x0, x1 = x - r, x + r + 1 231 | # y0, y1 = y - r, y + r + 1 232 | x0, x1 = x - r - 1, x + r 233 | y0, y1 = y - r - 1, y + r 234 | if endpoints == None: 235 | e = [0, 359] 236 | else: 237 | e = list(endpoints) 238 | while e[0] > e[1]: 239 | e[1] = e[1] + 360 240 | 241 | edit(id, ('start', e[0]), ('extent', e[1] - e[0])) 242 | move_to(id, x0, y0) 243 | 244 | 245 | def edit(id, *args): 246 | _canvas.itemconfigure(id, **dict(args)) 247 | 248 | 249 | def text(pos, color, contents, font='Helvetica', size=12, style='normal', anchor="nw"): 250 | global _canvas_x, _canvas_y 251 | x, y = pos 252 | font = (font, str(size), style) 253 | return _canvas.create_text(x, y, fill=color, text=contents, font=font, anchor=anchor) 254 | 255 | 256 | def changeText(id, newText, font=None, size=12, style='normal'): 257 | _canvas.itemconfigure(id, text=newText) 258 | if font != None: 259 | _canvas.itemconfigure(id, font=(font, '-%d' % size, style)) 260 | 261 | 262 | def changeColor(id, 
newColor): 263 | _canvas.itemconfigure(id, fill=newColor) 264 | 265 | 266 | def line(here, there, color=formatColor(0, 0, 0), width=2): 267 | x0, y0 = here[0], here[1] 268 | x1, y1 = there[0], there[1] 269 | return _canvas.create_line(x0, y0, x1, y1, fill=color, width=width) 270 | 271 | ############################################################################## 272 | ### Keypress handling ######################################################## 273 | ############################################################################## 274 | 275 | # We bind to key-down and key-up events. 276 | 277 | _keysdown = {} 278 | _keyswaiting = {} 279 | # This holds an unprocessed key release. We delay key releases by up to 280 | # one call to keys_pressed() to get round a problem with auto repeat. 281 | _got_release = None 282 | 283 | 284 | def _keypress(event): 285 | global _got_release 286 | # remap_arrows(event) 287 | _keysdown[event.keysym] = 1 288 | _keyswaiting[event.keysym] = 1 289 | # print event.char, event.keycode 290 | _got_release = None 291 | 292 | 293 | def _keyrelease(event): 294 | global _got_release 295 | # remap_arrows(event) 296 | try: 297 | del _keysdown[event.keysym] 298 | except: 299 | pass 300 | _got_release = 1 301 | 302 | 303 | def remap_arrows(event): 304 | # TURN ARROW PRESSES INTO LETTERS (SHOULD BE IN KEYBOARD AGENT) 305 | if event.char in ['a', 's', 'd', 'w']: 306 | return 307 | if event.keycode in [37, 101]: # LEFT ARROW (win / x) 308 | event.char = 'a' 309 | if event.keycode in [38, 99]: # UP ARROW 310 | event.char = 'w' 311 | if event.keycode in [39, 102]: # RIGHT ARROW 312 | event.char = 'd' 313 | if event.keycode in [40, 104]: # DOWN ARROW 314 | event.char = 's' 315 | 316 | 317 | def _clear_keys(event=None): 318 | global _keysdown, _got_release, _keyswaiting 319 | _keysdown = {} 320 | _keyswaiting = {} 321 | _got_release = None 322 | 323 | 324 | def keys_pressed(d_o_e=None, 325 | d_w=tkinter._tkinter.DONT_WAIT): 326 | 327 | if(d_o_e is None): 328 | d_o_e = _root_window.dooneevent 329 | d_o_e(d_w) 330 | if _got_release: 331 | d_o_e(d_w) 332 | return list(_keysdown.keys()) 333 | 334 | 335 | def keys_waiting(): 336 | global _keyswaiting 337 | keys = list(_keyswaiting.keys()) 338 | _keyswaiting = {} 339 | return keys 340 | 341 | # Block for a list of keys... 
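# keys_pressed() returns the keys currently held down (key releases are delayed
# by up to one call via _got_release above, to work around Tk auto-repeat),
# whereas keys_waiting() drains and returns every key pressed since it was last
# called.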
342 | 343 | 344 | def wait_for_keys(): 345 | keys = [] 346 | while keys == []: 347 | keys = keys_pressed() 348 | sleep(0.05) 349 | return keys 350 | 351 | 352 | def remove_from_screen(x, 353 | d_o_e=None, 354 | d_w=tkinter._tkinter.DONT_WAIT): 355 | if (d_o_e is None): 356 | d_o_e = _root_window.dooneevent 357 | _canvas.delete(x) 358 | d_o_e(d_w) 359 | 360 | 361 | def _adjust_coords(coord_list, x, y): 362 | for i in range(0, len(coord_list), 2): 363 | coord_list[i] = coord_list[i] + x 364 | coord_list[i + 1] = coord_list[i + 1] + y 365 | return coord_list 366 | 367 | 368 | def move_to(object, x, y=None, 369 | d_o_e=None, 370 | d_w=tkinter._tkinter.DONT_WAIT): 371 | if (d_o_e is None): 372 | d_o_e = _root_window.dooneevent 373 | if y is None: 374 | try: 375 | x, y = x 376 | except: 377 | raise Exception('incomprehensible coordinates') 378 | 379 | horiz = True 380 | newCoords = [] 381 | current_x, current_y = _canvas.coords(object)[0:2] # first point 382 | for coord in _canvas.coords(object): 383 | if horiz: 384 | inc = x - current_x 385 | else: 386 | inc = y - current_y 387 | horiz = not horiz 388 | 389 | newCoords.append(coord + inc) 390 | 391 | _canvas.coords(object, *newCoords) 392 | d_o_e(d_w) 393 | 394 | 395 | def move_by(object, x, y=None, 396 | d_o_e=None, 397 | d_w=tkinter._tkinter.DONT_WAIT, lift=False): 398 | if (d_o_e is None): 399 | d_o_e = _root_window.dooneevent 400 | if y is None: 401 | try: 402 | x, y = x 403 | except: 404 | raise Exception('incomprehensible coordinates') 405 | 406 | horiz = True 407 | newCoords = [] 408 | for coord in _canvas.coords(object): 409 | if horiz: 410 | inc = x 411 | else: 412 | inc = y 413 | horiz = not horiz 414 | 415 | newCoords.append(coord + inc) 416 | 417 | _canvas.coords(object, *newCoords) 418 | d_o_e(d_w) 419 | if lift: 420 | _canvas.tag_raise(object) 421 | 422 | 423 | def writePostscript(filename): 424 | "Writes the current canvas to a postscript file." 
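# Tk's canvas.postscript() returns the PostScript source as a string when no
# file option is given; note that the file() builtin used below is Python 2
# only -- under Python 3 this call would need to be open(filename, 'w').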
425 | psfile = file(filename, 'w') 426 | psfile.write(_canvas.postscript(pageanchor='sw', 427 | y='0.c', 428 | x='0.c')) 429 | psfile.close() 430 | 431 | ghost_shape = [ 432 | (0, - 0.5), 433 | (0.25, - 0.75), 434 | (0.5, - 0.5), 435 | (0.75, - 0.75), 436 | (0.75, 0.5), 437 | (0.5, 0.75), 438 | (- 0.5, 0.75), 439 | (- 0.75, 0.5), 440 | (- 0.75, - 0.75), 441 | (- 0.5, - 0.5), 442 | (- 0.25, - 0.75) 443 | ] 444 | 445 | if __name__ == '__main__': 446 | begin_graphics() 447 | clear_screen() 448 | ghost_shape = [(x * 10 + 20, y * 10 + 20) for x, y in ghost_shape] 449 | g = polygon(ghost_shape, formatColor(1, 1, 1)) 450 | move_to(g, (50, 50)) 451 | circle((150, 150), 20, formatColor(0.7, 0.3, 0.0), endpoints=[15, - 15]) 452 | sleep(2) 453 | -------------------------------------------------------------------------------- /alg/maddpg/common/distributions.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import alg.maddpg.common.tf_util as U 4 | from tensorflow.python.ops import math_ops 5 | from game.particle.multiagent.multi_discrete import MultiDiscrete 6 | from tensorflow.python.ops import nn 7 | 8 | class Pd(object): 9 | """ 10 | A particular probability distribution 11 | """ 12 | def flatparam(self): 13 | raise NotImplementedError 14 | def mode(self): 15 | raise NotImplementedError 16 | def logp(self, x): 17 | raise NotImplementedError 18 | def kl(self, other): 19 | raise NotImplementedError 20 | def entropy(self): 21 | raise NotImplementedError 22 | def sample(self): 23 | raise NotImplementedError 24 | 25 | class PdType(object): 26 | """ 27 | Parametrized family of probability distributions 28 | """ 29 | def pdclass(self): 30 | raise NotImplementedError 31 | def pdfromflat(self, flat): 32 | return self.pdclass()(flat) 33 | def param_shape(self): 34 | raise NotImplementedError 35 | def sample_shape(self): 36 | raise NotImplementedError 37 | def sample_dtype(self): 38 | raise NotImplementedError 39 | 40 | def param_placeholder(self, prepend_shape, name=None): 41 | return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name) 42 | def sample_placeholder(self, prepend_shape, name=None): 43 | return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name) 44 | 45 | class CategoricalPdType(PdType): 46 | def __init__(self, ncat): 47 | self.ncat = ncat 48 | def pdclass(self): 49 | return CategoricalPd 50 | def param_shape(self): 51 | return [self.ncat] 52 | def sample_shape(self): 53 | return [] 54 | def sample_dtype(self): 55 | return tf.int32 56 | 57 | class SoftCategoricalPdType(PdType): 58 | def __init__(self, ncat): 59 | self.ncat = ncat 60 | def pdclass(self): 61 | return SoftCategoricalPd 62 | def param_shape(self): 63 | return [self.ncat] 64 | def sample_shape(self): 65 | return [self.ncat] 66 | def sample_dtype(self): 67 | return tf.float32 68 | 69 | class MultiCategoricalPdType(PdType): 70 | def __init__(self, low, high): 71 | self.low = low 72 | self.high = high 73 | self.ncats = high - low + 1 74 | def pdclass(self): 75 | return MultiCategoricalPd 76 | def pdfromflat(self, flat): 77 | return MultiCategoricalPd(self.low, self.high, flat) 78 | def param_shape(self): 79 | return [sum(self.ncats)] 80 | def sample_shape(self): 81 | return [len(self.ncats)] 82 | def sample_dtype(self): 83 | return tf.int32 84 | 85 | class SoftMultiCategoricalPdType(PdType): 86 | def __init__(self, low, high): 87 | self.low = low 88 | self.high = high 89 | 
self.ncats = high - low + 1 90 | def pdclass(self): 91 | return SoftMultiCategoricalPd 92 | def pdfromflat(self, flat): 93 | return SoftMultiCategoricalPd(self.low, self.high, flat) 94 | def param_shape(self): 95 | return [sum(self.ncats)] 96 | def sample_shape(self): 97 | return [sum(self.ncats)] 98 | def sample_dtype(self): 99 | return tf.float32 100 | 101 | class DiagGaussianPdType(PdType): 102 | def __init__(self, size): 103 | self.size = size 104 | def pdclass(self): 105 | return DiagGaussianPd 106 | def param_shape(self): 107 | return [2*self.size] 108 | def sample_shape(self): 109 | return [self.size] 110 | def sample_dtype(self): 111 | return tf.float32 112 | 113 | class BernoulliPdType(PdType): 114 | def __init__(self, size): 115 | self.size = size 116 | def pdclass(self): 117 | return BernoulliPd 118 | def param_shape(self): 119 | return [self.size] 120 | def sample_shape(self): 121 | return [self.size] 122 | def sample_dtype(self): 123 | return tf.int32 124 | 125 | # WRONG SECOND DERIVATIVES 126 | # class CategoricalPd(Pd): 127 | # def __init__(self, logits): 128 | # self.logits = logits 129 | # self.ps = tf.nn.softmax(logits) 130 | # @classmethod 131 | # def fromflat(cls, flat): 132 | # return cls(flat) 133 | # def flatparam(self): 134 | # return self.logits 135 | # def mode(self): 136 | # return U.argmax(self.logits, axis=1) 137 | # def logp(self, x): 138 | # return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x) 139 | # def kl(self, other): 140 | # return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \ 141 | # - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 142 | # def entropy(self): 143 | # return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 144 | # def sample(self): 145 | # u = tf.random_uniform(tf.shape(self.logits)) 146 | # return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1) 147 | 148 | class CategoricalPd(Pd): 149 | def __init__(self, logits): 150 | self.logits = logits 151 | def flatparam(self): 152 | return self.logits 153 | def mode(self): 154 | return U.argmax(self.logits, axis=1) 155 | def logp(self, x): 156 | return -tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 157 | def kl(self, other): 158 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 159 | a1 = other.logits - U.max(other.logits, axis=1, keepdims=True) 160 | ea0 = tf.exp(a0) 161 | ea1 = tf.exp(a1) 162 | z0 = U.sum(ea0, axis=1, keepdims=True) 163 | z1 = U.sum(ea1, axis=1, keepdims=True) 164 | p0 = ea0 / z0 165 | return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1) 166 | def entropy(self): 167 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 168 | ea0 = tf.exp(a0) 169 | z0 = U.sum(ea0, axis=1, keepdims=True) 170 | p0 = ea0 / z0 171 | return U.sum(p0 * (tf.log(z0) - a0), axis=1) 172 | def sample(self): 173 | u = tf.random_uniform(tf.shape(self.logits)) 174 | return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1) 175 | @classmethod 176 | def fromflat(cls, flat): 177 | return cls(flat) 178 | 179 | class SoftCategoricalPd(Pd): 180 | def __init__(self, logits): 181 | self.logits = logits 182 | def flatparam(self): 183 | return self.logits 184 | def mode(self): 185 | return U.softmax(self.logits, axis=-1) 186 | def logp(self, x): 187 | return -tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 188 | def kl(self, other): 189 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 190 | a1 = other.logits - U.max(other.logits, axis=1, keepdims=True) 191 
| ea0 = tf.exp(a0) 192 | ea1 = tf.exp(a1) 193 | z0 = U.sum(ea0, axis=1, keepdims=True) 194 | z1 = U.sum(ea1, axis=1, keepdims=True) 195 | p0 = ea0 / z0 196 | return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1) 197 | def entropy(self): 198 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 199 | ea0 = tf.exp(a0) 200 | z0 = U.sum(ea0, axis=1, keepdims=True) 201 | p0 = ea0 / z0 202 | return U.sum(p0 * (tf.log(z0) - a0), axis=1) 203 | def cross_entropy(self, other): 204 | return self.kl(other) + self.entropy() 205 | def soft_max_sample(self, other, term): 206 | u1 = tf.random_uniform(tf.shape(self.logits)) 207 | y1 = U.softmax(self.logits - tf.log(-tf.log(u1)), axis=-1) 208 | u2 = tf.random_uniform(tf.shape(other.logits)) 209 | y2 = U.softmax(other.logits - tf.log(-tf.log(u2)), axis=-1) 210 | logits = y1 + term * y2 211 | logits_out = logits / U.sum(logits) 212 | return self.gumbel_sample(logits=logits_out) 213 | def params(self): 214 | return self.logits 215 | def sample(self): 216 | #u = tf.random_uniform(tf.shape(self.logits)) 217 | #return U.softmax(self.logits - tf.log(-tf.log(u)), axis=-1) 218 | return self.gumbel_sample() 219 | def gumbel_sample(self, logits=None, temperature=0.1, hard=True): 220 | if logits is None: 221 | logits = self.logits 222 | u = tf.random_uniform(tf.shape(logits)) 223 | y = U.softmax((logits - tf.log(-tf.log(u))) / temperature, axis=-1) 224 | if hard: 225 | k = tf.shape(logits)[-1] 226 | y_hard = tf.cast(tf.equal(y, tf.reduce_max(y, 1, keepdims=True)), y.dtype) 227 | y = tf.stop_gradient(y_hard - y) + y 228 | return y 229 | 230 | @classmethod 231 | def fromflat(cls, flat): 232 | return cls(flat) 233 | 234 | class MultiCategoricalPd(Pd): 235 | def __init__(self, low, high, flat): 236 | self.flat = flat 237 | self.low = tf.constant(low, dtype=tf.int32) 238 | self.categoricals = list(map(CategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1))) 239 | def flatparam(self): 240 | return self.flat 241 | def mode(self): 242 | return self.low + tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) 243 | def logp(self, x): 244 | return tf.add_n([p.logp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) 245 | def kl(self, other): 246 | return tf.add_n([ 247 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals) 248 | ]) 249 | def entropy(self): 250 | return tf.add_n([p.entropy() for p in self.categoricals]) 251 | def sample(self): 252 | return self.low + tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) 253 | @classmethod 254 | def fromflat(cls, flat): 255 | return cls(flat) 256 | 257 | class SoftMultiCategoricalPd(Pd): # doesn't work yet 258 | def __init__(self, low, high, flat): 259 | self.flat = flat 260 | self.low = tf.constant(low, dtype=tf.float32) 261 | self.categoricals = list(map(SoftCategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1))) 262 | def flatparam(self): 263 | return self.flat 264 | def mode(self): 265 | x = [] 266 | for i in range(len(self.categoricals)): 267 | x.append(self.low[i] + self.categoricals[i].mode()) 268 | return tf.concat(x, axis=-1) 269 | def logp(self, x): 270 | return tf.add_n([p.logp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) 271 | def kl(self, other): 272 | return tf.add_n([ 273 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals) 274 | ]) 275 | def entropy(self): 276 | return tf.add_n([p.entropy() 
for p in self.categoricals]) 277 | def sample(self): 278 | x = [] 279 | for i in range(len(self.categoricals)): 280 | x.append(self.low[i] + self.categoricals[i].sample()) 281 | return tf.concat(x, axis=-1) 282 | @classmethod 283 | def fromflat(cls, flat): 284 | return cls(flat) 285 | 286 | class DiagGaussianPd(Pd): 287 | def __init__(self, flat): 288 | self.flat = flat 289 | mean, logstd = tf.split(axis=1, num_or_size_splits=2, value=flat) 290 | self.mean = mean 291 | self.logstd = logstd 292 | self.std = tf.exp(logstd) 293 | def flatparam(self): 294 | return self.flat 295 | def mode(self): 296 | return self.mean 297 | def logp(self, x): 298 | return - 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=1) \ 299 | - 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) \ 300 | - U.sum(self.logstd, axis=1) 301 | def kl(self, other): 302 | assert isinstance(other, DiagGaussianPd) 303 | return U.sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=1) 304 | def entropy(self): 305 | return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), 1) 306 | def cross_entropy(self, other): 307 | assert isinstance(other, DiagGaussianPd) 308 | return self.kl(other) + self.entropy() 309 | 310 | def params(self): 311 | return self.mean, self.std 312 | 313 | def sample(self): 314 | return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) 315 | @classmethod 316 | def fromflat(cls, flat): 317 | return cls(flat) 318 | 319 | class BernoulliPd(Pd): 320 | def __init__(self, logits): 321 | self.logits = logits 322 | self.ps = tf.sigmoid(logits) 323 | def flatparam(self): 324 | return self.logits 325 | def mode(self): 326 | return tf.round(self.ps) 327 | def logp(self, x): 328 | return - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=1) 329 | def kl(self, other): 330 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1) 331 | def entropy(self): 332 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1) 333 | def sample(self): 334 | p = tf.sigmoid(self.logits) 335 | u = tf.random_uniform(tf.shape(p)) 336 | return tf.to_float(math_ops.less(u, p)) 337 | @classmethod 338 | def fromflat(cls, flat): 339 | return cls(flat) 340 | 341 | def make_pdtype(ac_space): 342 | from gym import spaces 343 | if isinstance(ac_space, spaces.Box): 344 | assert len(ac_space.shape) == 1 345 | return DiagGaussianPdType(ac_space.shape[0]) 346 | elif isinstance(ac_space, spaces.Discrete): 347 | # return CategoricalPdType(ac_space.n) 348 | return SoftCategoricalPdType(ac_space.n) 349 | elif isinstance(ac_space, MultiDiscrete): 350 | #return MultiCategoricalPdType(ac_space.low, ac_space.high) 351 | return SoftMultiCategoricalPdType(ac_space.low, ac_space.high) 352 | elif isinstance(ac_space, spaces.MultiBinary): 353 | return BernoulliPdType(ac_space.n) 354 | else: 355 | raise NotImplementedError 356 | 357 | def shape_el(v, i): 358 | maybe = v.get_shape()[i] 359 | if maybe is not None: 360 | return maybe 361 | else: 362 | return tf.shape(v)[i] 363 | --------------------------------------------------------------------------------
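Usage note (an illustrative sketch, not part of the repository): make_pdtype at the bottom of distributions.py is the bridge between a Gym action space and the distribution classes above -- Box maps to DiagGaussianPdType, Discrete to SoftCategoricalPdType (so discrete actions come out as differentiable, straight-through Gumbel-softmax one-hot samples rather than integer indices), MultiDiscrete to SoftMultiCategoricalPdType, and MultiBinary to BernoulliPdType. A minimal sketch of how a policy head would use it, assuming the repository root is on PYTHONPATH:

import numpy as np
import tensorflow as tf
from gym import spaces

from alg.maddpg.common.distributions import make_pdtype

# Hypothetical 5-action discrete space, similar to the particle environments.
act_space = spaces.Discrete(5)
pdtype = make_pdtype(act_space)                # -> SoftCategoricalPdType(5)

logits = pdtype.param_placeholder([None], name="pi_logits")   # float32 placeholder of shape [None, 5]
pd = pdtype.pdfromflat(logits)                 # SoftCategoricalPd over the logits

sample_op = pd.sample()                        # one-hot straight-through Gumbel-softmax sample
mode_op = pd.mode()                            # softmax(logits), i.e. the action probabilities
entropy_op = pd.entropy()

with tf.Session() as sess:
    acts = sess.run(sample_op, {logits: np.zeros((2, 5), dtype=np.float32)})
    print(acts.shape)                          # (2, 5), each row one-hot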