├── game ├── particle │ ├── __init__.py │ ├── bin │ │ ├── __init__.py │ │ └── interactive.py │ ├── multiagent │ │ ├── scenarios │ │ │ ├── __init__.py │ │ │ ├── simple_spread.py │ │ │ └── simple_tag.py │ │ ├── scenario.py │ │ ├── __init__.py │ │ ├── policy.py │ │ ├── multi_discrete.py │ │ ├── core.py │ │ └── rendering.py │ ├── make_env.py │ └── README.md ├── pacman │ ├── pacmanDQN_Agents.py │ ├── layouts │ │ ├── openClassic.lay │ │ └── originalClassic.lay │ ├── textDisplay.py │ ├── keyboardAgents.py │ ├── ghostAgents.py │ ├── make_env.py │ ├── layout.py │ └── graphicsUtils.py └── __init__.py ├── source ├── pacman │ ├── original │ │ └── 0 │ │ │ ├── checkpoint │ │ │ ├── model_0.ckpt.meta │ │ │ ├── model_0.ckpt.index │ │ │ ├── model_0.ckpt.data-00000-of-00001 │ │ │ ├── command.txt │ │ │ └── args.json │ └── medium │ │ └── 0 │ │ ├── checkpoint │ │ ├── model_40000_0.ckpt.index │ │ ├── model_40000_0.ckpt.meta │ │ ├── model_40000_0.ckpt.data-00000-of-00001 │ │ ├── command.txt │ │ └── args.json └── simple_tag │ └── tag4 │ ├── checkpoint │ ├── model_30000_3.ckpt.index │ ├── model_30000_3.ckpt.meta │ ├── model_30000_3.ckpt.data-00000-of-00001 │ └── args.json ├── requirements.txt ├── alg ├── __init__.py ├── maddpg │ ├── __init__.py │ ├── trainer │ │ └── replay_buffer.py │ ├── train.py │ └── common │ │ ├── tf_util.py │ │ └── distributions.py ├── optimizer.py ├── common │ └── common.py ├── sharing_multi_ppo │ ├── ppo.py │ └── ppo_add_entropy.py └── muti_ptf_ppo │ ├── ppo.py │ └── ppo_add_entropy.py ├── config ├── particle_conf.yaml ├── pacman_conf.yaml ├── maddpg_conf.yaml └── ppo_conf.yaml ├── run └── __init__.py ├── util ├── fource_exit.py ├── get_out_files.py ├── output_json.py ├── logger.py └── ReplayBuffer.py ├── README.md └── main.py /game/particle/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /game/particle/bin/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/pacman/original/0/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model_3.ckpt" 2 | all_model_checkpoint_paths: "model_3.ckpt" 3 | -------------------------------------------------------------------------------- /source/pacman/medium/0/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model_40000_2.ckpt" 2 | all_model_checkpoint_paths: "model_40000_2.ckpt" 3 | -------------------------------------------------------------------------------- /source/simple_tag/tag4/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model_30000_3.ckpt" 2 | all_model_checkpoint_paths: "model_30000_3.ckpt" 3 | -------------------------------------------------------------------------------- /source/pacman/original/0/model_0.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/pacman/original/0/model_0.ckpt.meta -------------------------------------------------------------------------------- /source/pacman/original/0/model_0.ckpt.index: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/pacman/original/0/model_0.ckpt.index -------------------------------------------------------------------------------- /source/pacman/medium/0/model_40000_0.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/pacman/medium/0/model_40000_0.ckpt.index -------------------------------------------------------------------------------- /source/pacman/medium/0/model_40000_0.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/pacman/medium/0/model_40000_0.ckpt.meta -------------------------------------------------------------------------------- /source/simple_tag/tag4/model_30000_3.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/simple_tag/tag4/model_30000_3.ckpt.index -------------------------------------------------------------------------------- /source/simple_tag/tag4/model_30000_3.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/simple_tag/tag4/model_30000_3.ckpt.meta -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.10.9 2 | numpy==1.19.5 3 | tensorboard==1.14.0 4 | tensorboard-logger==0.1.0 5 | tensorflow==1.14.0 6 | PyYAML==5.4.1 7 | 8 | 9 | -------------------------------------------------------------------------------- /game/pacman/pacmanDQN_Agents.py: -------------------------------------------------------------------------------- 1 | import game.pacman.game as game 2 | 3 | 4 | class PacmanDQN(game.Agent): 5 | def __init__(self, args): 6 | pass 7 | -------------------------------------------------------------------------------- /source/pacman/original/0/model_0.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/pacman/original/0/model_0.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /source/pacman/medium/0/model_40000_0.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/pacman/medium/0/model_40000_0.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /source/simple_tag/tag4/model_30000_3.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianpeiyang/MAPTF_code/HEAD/source/simple_tag/tag4/model_30000_3.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /alg/__init__.py: -------------------------------------------------------------------------------- 1 | from alg.maddpg.trainer.maddpg import MADDPGAgentTrainer as maddpg 2 | 3 | REGISTRY = {} 4 | 5 | 6 | REGISTRY['maddpg'] = maddpg 7 | REGISTRY['maddpg_sr'] = maddpg 8 | -------------------------------------------------------------------------------- /game/particle/multiagent/scenarios/__init__.py: 
-------------------------------------------------------------------------------- 1 | import imp 2 | import os.path as osp 3 | 4 | 5 | def load(name): 6 | pathname = osp.join(osp.dirname(__file__), name) 7 | return imp.load_source('', pathname) 8 | -------------------------------------------------------------------------------- /game/__init__.py: -------------------------------------------------------------------------------- 1 | from .particle.make_env import make_env as Particle 2 | from .pacman.make_env import make_env as PacmanEnv 3 | 4 | 5 | REGISTRY = {} 6 | 7 | REGISTRY['particle'] = Particle 8 | REGISTRY['pacman'] = PacmanEnv 9 | 10 | 11 | -------------------------------------------------------------------------------- /config/particle_conf.yaml: -------------------------------------------------------------------------------- 1 | game_name: "simple_spread" 2 | continuous_action: False 3 | reward_normalize: False 4 | benchmark: False 5 | action_clip: 1 6 | num_adversaries: 0 7 | num_good: 0 8 | obs_sort: False 9 | reward_func: "reward" 10 | restrict_move: False -------------------------------------------------------------------------------- /game/pacman/layouts/openClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%%%%% 2 | %.. .... .... G % 3 | %.. ... ... ... ... % 4 | %.. ... ... ... ... % 5 | %.. P .... .... % 6 | %.. ... ... ... ... % 7 | %.. ... ... ... ... % 8 | %.. .... .... G % 9 | %%%%%%%%%%%%%%%%%%%%%%%%% 10 | -------------------------------------------------------------------------------- /config/pacman_conf.yaml: -------------------------------------------------------------------------------- 1 | num_adversaries: 1 #pacman 2 | timeout: 30 3 | game_name: "trickyClassic" 4 | textGraphics: False 5 | quietGraphics: False 6 | zoom: 1.0 7 | fixRandomSeed: False 8 | recordActions: False 9 | replay: None 10 | frameTime: 0.1 11 | catchExceptions: False 12 | continuous_action: False 13 | obs_sort: False 14 | -------------------------------------------------------------------------------- /game/particle/multiagent/scenario.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | # defines scenario upon which the world is built 5 | class BaseScenario(object): 6 | # create elements of the world 7 | def make_world(self): 8 | raise NotImplementedError() 9 | # create initial conditions of the world 10 | 11 | def reset_world(self, world): 12 | raise NotImplementedError() 13 | -------------------------------------------------------------------------------- /run/__init__.py: -------------------------------------------------------------------------------- 1 | from .run_multi_ptf_ppo_sro import run as multi_ppo_sr_run 2 | from .run_maddpg_sr import run as run_maddpg_sr 3 | from .run_multi_ptf_shppo_sro import run as shppo_sr_run 4 | 5 | REGISTRY = {} 6 | 7 | REGISTRY['multi_ppo'] = multi_ppo_sr_run 8 | REGISTRY['multi_ppo_sro'] = multi_ppo_sr_run 9 | REGISTRY['maddpg'] = run_maddpg_sr 10 | REGISTRY['maddpg_sr'] = run_maddpg_sr 11 | REGISTRY['shppo'] = shppo_sr_run 12 | REGISTRY['shppo_sro'] = shppo_sr_run 13 | 14 | -------------------------------------------------------------------------------- /alg/maddpg/__init__.py: -------------------------------------------------------------------------------- 1 | class AgentTrainer(object): 2 | def __init__(self, name, model, obs_shape, act_space, args): 3 | raise NotImplemented() 4 | 5 | def action(self, obs): 6 | raise 
NotImplemented() 7 | 8 | def process_experience(self, obs, act, rew, new_obs, done, terminal): 9 | raise NotImplemented() 10 | 11 | def preupdate(self): 12 | raise NotImplemented() 13 | 14 | def update(self, agents): 15 | raise NotImplemented() 16 | -------------------------------------------------------------------------------- /util/fource_exit.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import msvcrt 3 | 4 | 5 | class Exit: 6 | def __init__(self): 7 | self.isExit = False 8 | self.thread = threading.Thread(target=self.work) 9 | 10 | def work(self): 11 | while True: 12 | newChar = msvcrt.getch() 13 | if newChar in b'\r': # 如果是换行,则输入结束 14 | self.isExit = True 15 | break 16 | 17 | def run(self): 18 | self.thread.start() 19 | 20 | def get_status(self): 21 | return self.isExit 22 | 23 | -------------------------------------------------------------------------------- /game/particle/multiagent/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | # Multiagent envs 4 | # ---------------------------------------- 5 | 6 | register( 7 | id='MultiagentSimple-v0', 8 | entry_point='multiagent.envs:SimpleEnv', 9 | # FIXME(cathywu) currently has to be exactly max_path_length parameters in 10 | # rllab run script 11 | max_episode_steps=100, 12 | ) 13 | 14 | register( 15 | id='MultiagentSimpleSpeakerListener-v0', 16 | entry_point='multiagent.envs:SimpleSpeakerListenerEnv', 17 | max_episode_steps=100, 18 | ) 19 | -------------------------------------------------------------------------------- /source/pacman/medium/0/command.txt: -------------------------------------------------------------------------------- 1 | -a multi_ppo -c multi_ptf_ppo_conf -g pacman -d pacman_conf -n 50000 -e 99 -s 7 -o adam n_layer_a_1=128 n_layer_c_1=128 option_layer_1=64 n_layer_a_2=128 n_layer_c_2=128 option_layer_2=64 c2=0.01 learning_rate_a=5e-4 learning_rate_c=5e-4 learning_rate_o=1e-3 learning_rate_t=1e-3 continuous_action=False reward_decay=0.99 clip_value=0.2 e_greedy=0.95 e_greedy_increment=1e-3 replace_target_iter=1000 learning_step=1000 option_batch_size=64 batch_size=64 save_per_episodes=10000 save_model=True c3=0.001 num_adversaries=1 adv_use_option=False good_use_option=False adv_load_model=True adv_load_model_path=source/pacman/3/model_20000_0 game_name=mediumClassic obs_sort=False xi=0 use_gpu=True use_gpu_id=1 memory_size=100000 -------------------------------------------------------------------------------- /source/pacman/original/0/command.txt: -------------------------------------------------------------------------------- 1 | -a multi_ppo -c multi_ptf_ppo_conf -g pacman -d pacman_conf -n 50000 -e 99 -s 7 -o adam n_layer_a_1=128 n_layer_c_1=128 option_layer_1=64 n_layer_a_2=128 n_layer_c_2=128 option_layer_2=64 c2=0.01 learning_rate_a=5e-4 learning_rate_c=5e-4 learning_rate_o=1e-3 learning_rate_t=1e-3 continuous_action=False reward_decay=0.99 clip_value=0.2 e_greedy=0.95 e_greedy_increment=1e-3 replace_target_iter=1000 learning_step=1000 option_batch_size=64 batch_size=64 save_per_episodes=5000 save_model=True c3=0.001 num_adversaries=1 adv_use_option=False good_use_option=False load_model=True load_model_path=source/pacman/original/2020-11-18_22-49-59/model game_name=originalClassic obs_sort=False xi=0 use_gpu=False use_gpu_id=1 memory_size=100000 -------------------------------------------------------------------------------- /alg/optimizer.py: 
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class Optimizer: 5 | def __init__( 6 | self, 7 | optimizer, 8 | learning_rate, 9 | momentum=None 10 | ): 11 | self.opt = None 12 | if str(optimizer).lower() == "grad": 13 | self.opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate) 14 | elif str(optimizer).lower() == "momentum": 15 | self.opt = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=momentum) 16 | elif str(optimizer).lower() == 'rmsprop': 17 | self.opt = tf.train.RMSPropOptimizer(learning_rate=learning_rate) 18 | elif str(optimizer).lower() == 'adam': 19 | self.opt = tf.train.AdamOptimizer(learning_rate=learning_rate) 20 | 21 | def get_optimizer(self): 22 | return self.opt 23 | -------------------------------------------------------------------------------- /game/pacman/layouts/originalClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | %............%%............% 3 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 4 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 5 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 6 | %..........................% 7 | %.%%%%.%%.%%%%%%%%.%%.%%%%.% 8 | %.%%%%.%%.%%%%%%%%.%%.%%%%.% 9 | %......%%....%%....%%......% 10 | %%%%%%.%%%%% %% %%%%%.%%%%%% 11 | %%%%%%.%%%%% %% %%%%%.%%%%%% 12 | %%%%%%.% %.%%%%%% 13 | %%%%%%.% %%%% %%%% %.%%%%%% 14 | % . %G G G% . % 15 | %%%%%%.% %%%%%%%%%% %.%%%%%% 16 | %%%%%%.% %.%%%%%% 17 | %%%%%%.% %%%%%%%%%% %.%%%%%% 18 | %............%%............% 19 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 20 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 21 | %...%%....... .......%%...% 22 | %%%.%%.%%.%%%%%%%%.%%.%%.%%% 23 | %%%.%%.%%.%%%%%%%%.%%.%%.%%% 24 | %......%%....%%....%%......% 25 | %.%%%%%%%%%%.%%.%%%%%%%%%%.% 26 | %.............P............% 27 | %%%%%%%%%%%%%%%%%%%%%%%%%%%% 28 | -------------------------------------------------------------------------------- /util/get_out_files.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | 5 | def paths(file_path): 6 | path_collection = [] 7 | path_target_collection = [] 8 | path_target_dir = [] 9 | for dirpath, dirnames, filenames in os.walk(file_path): 10 | for file_name in filenames: 11 | if file_name == 'out.json' or file_name == 'args.json' or file_name == 'command.txt': 12 | fullpath = os.path.join(dirpath, file_name) 13 | path_collection.append(fullpath) 14 | path_target_collection.append(fullpath.replace('results', 'results_out')) 15 | path_target_dir.append(dirpath.replace('results', 'results_out')) 16 | return path_collection, path_target_collection, path_target_dir 17 | 18 | 19 | source_path = '' 20 | 21 | 22 | if __name__ == "__main__": 23 | path_collection, path_target_collection, path_target_dir = paths(source_path) 24 | 25 | for (source, target, dir_path) in zip(path_collection, path_target_collection, path_target_dir): 26 | if not os.path.exists(dir_path): 27 | os.makedirs(dir_path) 28 | print(source, target) 29 | shutil.copyfile(source, target) 30 | -------------------------------------------------------------------------------- /config/maddpg_conf.yaml: -------------------------------------------------------------------------------- 1 | reward_decay: 0.99 2 | output_graph: True 3 | save_model: True 4 | summary_output_times: 10 5 | regular: 0.005 6 | learning_rate_a: 0.01 7 | learning_rate_c: 0.01 8 | ENTROPY_BETA: 0.0005 9 | USE_CPU_COUNT: True 10 | load_model: False 11 | load_model_path: '' 
12 | batch_size: 1024 13 | display: False 14 | 15 | #run 16 | reward_memory: 100 17 | save_per_episodes: 2000 18 | num_adversaries: 0 19 | good_policy: 'maddpg' 20 | adv_policy: 'maddpg' 21 | adv_use_option: False 22 | good_use_option: False 23 | adv_load_model: False 24 | adv_load_model_path: '' 25 | good_load_model: False 26 | good_load_model_path: '' 27 | use_gpu_id: '0' 28 | use_gpu: False 29 | other_option_update: True 30 | 31 | #option 32 | learning_rate_o: 0.0003 33 | learning_rate_t: 0.0003 34 | option_layer_1: 128 35 | option_layer_2: 128 36 | e_greedy: 0.95 37 | e_greedy_increment: 0.005 38 | start_greedy: 0.0 39 | memory_size: 1000000 40 | option_batch_size: 16 41 | xi: 0.005 42 | option_clip_value: 10.0 43 | is_soft_max_action: True 44 | replace_target_iter: 1000 45 | learning_step: 1000 46 | c3: 0.0005 47 | c1: 1.0 48 | 49 | # SF 50 | embedding_dim: 32 51 | option_embedding_layer: 64 52 | recon_loss_coef: 0.1 53 | learning_rate_r: 0.0003 54 | clip_value: 0.2 55 | 56 | #DVM 57 | distillation_frequent: 1000 58 | distillation_interation: 2048 59 | 60 | # network 61 | n_layer_a_1: 128 62 | 63 | # output 64 | SAVE_PATH: "model" 65 | graph_path: "graph" 66 | reward_output: "output" 67 | output_filename: "out" 68 | log: "log" 69 | benchmark_dir: "benchmark" -------------------------------------------------------------------------------- /config/ppo_conf.yaml: -------------------------------------------------------------------------------- 1 | # ppo 2 | learning_rate_a: 0.0003 3 | learning_rate_c: 0.0003 4 | batch_size: 32 5 | clip_value: 0.2 6 | reward_decay: 0.99 7 | c2: 0.001 8 | stochastic: True 9 | load_model: False 10 | load_model_path: '' 11 | adv_policy: "ppo" 12 | good_policy: "ppo" 13 | reward_normalize: False 14 | done_reward: 1.0 15 | 16 | # option 17 | option_batch_size: 32 18 | option_clip_value: 10.0 19 | other_option_update: True 20 | c1: 0.005 21 | c3: 0.0005 22 | epi_train_times: 1 23 | memory_size: 100000 24 | e_greedy: 0.95 25 | replace_target_iter: 1000 26 | e_greedy_increment: 0.001 27 | start_greedy: 0.0 28 | learning_step: 1000 29 | learning_rate_o: 0.00001 30 | learning_rate_t: 0.00001 31 | xi: 0.005 32 | adv_use_option: False 33 | good_use_option: False 34 | adv_load_model: False 35 | adv_load_model_path: '' 36 | good_load_model: False 37 | good_load_model_path: '' 38 | grad_clip: 10 39 | 40 | #sro 41 | learning_rate_r: 0.0003 42 | embedding_dim: 32 43 | option_embedding_layer: 64 44 | recon_loss_coef: 0.1 45 | 46 | # transfer_agent 47 | trans_agent_start_epi: 0 48 | 49 | #run 50 | reward_memory: 100 51 | save_per_episodes: 2000 52 | use_gpu_id: '0' 53 | use_gpu: False 54 | output_graph: True 55 | save_model: True 56 | summary_output_times: 10 57 | reload_model: False 58 | reload_model_path: '' 59 | 60 | # network 61 | policy: 'policy' 62 | old_policy: 'old_policy' 63 | n_layer_a_1: 64 64 | n_layer_a_2: 64 65 | n_layer_c_1: 64 66 | n_layer_c_2: 64 67 | option_layer_1: 128 68 | option_layer_2: 128 69 | 70 | # output 71 | SAVE_PATH: "model" 72 | graph_path: "graph" 73 | reward_output: "output" 74 | output_filename: "out" 75 | log: "log" 76 | 77 | -------------------------------------------------------------------------------- /source/pacman/medium/0/args.json: -------------------------------------------------------------------------------- 1 | {"numGames": 50000, "game": "pacman", "algorithm": "multi_ppo", "epi_step": 99, "seed": 7, "optimizer": "adam", "run_test": false, "results_path": "../results/multi_ppo_pacman/2019-12-12_13-12-59/", "learning_rate_a": 
0.0005, "learning_rate_c": 0.0005, "batch_size": 64, "option_batch_size": 64, "output_graph": true, "save_model": true, "summary_output_times": 10, "clip_value": 0.2, "option_clip_value": 10.0, "reward_decay": 0.99, "c1": 1.0, "c2": 0.01, "epi_train_times": 1, "stochastic": true, "load_model": false, "load_model_path": "", "memory_size": 100000, "e_greedy": 0.95, "replace_target_iter": 1000, "e_greedy_increment": 0.001, "start_greedy": 0.0, "learning_step": 1000, "regular": 0.005, "learning_rate_o": 0.001, "learning_rate_t": 0.001, "ENTROPY_BETA": 0.0005, "c3": 0.001, "xi": 0.0, "adv_policy": "ppo", "good_policy": "ppo", "adv_use_option": false, "good_use_option": false, "adv_load_model": true, "adv_load_model_path": "source/pacman/3/model_20000_0", "reward_normalize": false, "done_reward": 1.0, "reward_memory": 100, "save_per_episodes": 10000, "use_gpu_id": "1", "use_gpu": true, "policy": "policy", "old_policy": "old_policy", "n_layer_a_1": 128, "n_layer_a_2": 128, "n_layer_c_1": 128, "n_layer_c_2": 128, "temperature": 0.1, "option_layer_1": 64, "option_layer_2": 64, "SAVE_PATH": "model", "graph_path": "graph", "reward_output": "output", "output_filename": "out", "log": "log", "num_adversaries": 1, "timeout": 30, "game_name": "mediumClassic", "textGraphics": false, "quietGraphics": false, "zoom": 1.0, "fixRandomSeed": false, "recordActions": false, "replay": "None", "frameTime": 0.1, "catchExceptions": false, "continuous_action": false, "obs_sort": false} -------------------------------------------------------------------------------- /game/particle/bin/interactive.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os,sys 3 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 4 | import argparse 5 | 6 | from game.particle.multiagent.environment import MultiAgentEnv 7 | from game.particle.multiagent.policy import InteractivePolicy 8 | import game.particle.multiagent.scenarios as scenarios 9 | 10 | if __name__ == '__main__': 11 | # parse arguments 12 | parser = argparse.ArgumentParser(description=None) 13 | parser.add_argument('-s', '--scenario', default='simple_adversary.py', help='Path of the scenario Python script.') 14 | args = parser.parse_args() 15 | 16 | # load scenario from script 17 | scenario = scenarios.load(args.scenario).Scenario() 18 | # create world 19 | world = scenario.make_world() 20 | # create multiagent environment 21 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, info_callback=None, shared_viewer = False) 22 | # render call to create viewer window (necessary only for interactive policies) 23 | env.render() 24 | # create interactive policies for each agent 25 | policies = [InteractivePolicy(env, i) for i in range(env.n)] 26 | # execution loop 27 | obs_n = env.reset() 28 | while True: 29 | # query for action from each agent's policy 30 | act_n = [] 31 | for i, policy in enumerate(policies): 32 | act_n.append(policy.action(obs_n[i])) 33 | # step environment 34 | obs_n, reward_n, done_n, _ = env.step(act_n) 35 | # render all agent views 36 | env.render() 37 | # display rewards 38 | #for agent in env.world.agents: 39 | # print(agent.name + " reward: %0.3f" % env._get_reward(agent)) 40 | -------------------------------------------------------------------------------- /source/simple_tag/tag4/args.json: -------------------------------------------------------------------------------- 1 | {"numGames": 20000, "game": "particle", "algorithm": "multi_ppo_sr2", 
"epi_step": 99, "seed": 12345, "optimizer": "adam", "run_test": false, "obs_sort": false, "learning_rate_a": 0.0003, "learning_rate_c": 0.0003, "learning_rate_r": 0.0003, "batch_size": 32, "option_batch_size": 32, "output_graph": true, "save_model": true, "summary_output_times": 10, "clip_value": 0.2, "option_clip_value": 10.0, "reward_decay": 0.99, "c1": 1.0, "c2": 0.001, "epi_train_times": 1, "stochastic": true, "load_model": false, "load_model_path": "", "memory_size": 100000, "e_greedy": 0.95, "replace_target_iter": 1000, "e_greedy_increment": 0.005, "start_greedy": 0.0, "learning_step": 1000, "regular": 0.005, "learning_rate_o": 1e-05, "learning_rate_t": 1e-05, "ENTROPY_BETA": 0.0005, "c3": 0.0005, "xi": 0.0, "adv_policy": "ppo", "good_policy": "ppo", "adv_use_option": false, "good_use_option": false, "reward_normalize": false, "adv_load_model": false, "adv_load_model_path": "", "good_load_model": true, "good_load_model_path": "source\\simple_tag\\ppo_tag\\model_30000_3", "done_reward": 1.0, "trans_agent_start_epi": 0, "reward_memory": 100, "save_per_episodes": 2000, "use_gpu_id": "0", "use_gpu": false, "other_option_update": true, "policy": "policy", "old_policy": "old_policy", "n_layer_a_1": 64, "n_layer_a_2": 64, "n_layer_c_1": 64, "n_layer_c_2": 64, "temperature": 0.1, "option_layer_1": 32, "option_layer_2": 32, "embedding_dim": 32, "option_embedding_layer": 64, "recon_loss_coef": 0.1, "SAVE_PATH": "model", "graph_path": "graph", "reward_output": "output", "output_filename": "out", "log": "log", "game_name": "simple_tag", "continuous_action": false, "benchmark": false, "action_clip": 1, "num_adversaries": 3, "reward_func": "reward", "restrict_move": true, "results_path": "../results/multi_ppo_sr2/particle/simple_tag/2020-09-22_17-44-26/"} -------------------------------------------------------------------------------- /source/pacman/original/0/args.json: -------------------------------------------------------------------------------- 1 | {"numGames": 50000, "game": "pacman", "algorithm": "multi_ppo", "epi_step": 99, "seed": 7, "optimizer": "adam", "run_test": false, "learning_rate_a": 0.0005, "learning_rate_c": 0.0005, "batch_size": 64, "option_batch_size": 64, "output_graph": true, "save_model": true, "summary_output_times": 10, "clip_value": 0.2, "option_clip_value": 10.0, "reward_decay": 0.99, "c1": 1.0, "c2": 0.01, "epi_train_times": 1, "stochastic": true, "load_model": true, "load_model_path": "source/pacman/original/2020-11-18_22-49-59/model", "memory_size": 100000, "e_greedy": 0.95, "replace_target_iter": 1000, "e_greedy_increment": 0.001, "start_greedy": 0.0, "learning_step": 1000, "regular": 0.005, "learning_rate_o": 0.001, "learning_rate_t": 0.001, "ENTROPY_BETA": 0.0005, "c3": 0.001, "xi": 0.0, "adv_policy": "ppo", "good_policy": "ppo", "adv_use_option": false, "good_use_option": false, "adv_load_model": false, "adv_load_model_path": "", "good_load_model": false, "good_load_model_path": "", "reward_normalize": false, "done_reward": 1.0, "grad_clip": 10, "trans_agent_start_epi": 0, "reward_memory": 100, "save_per_episodes": 5000, "use_gpu_id": "1", "use_gpu": false, "other_option_update": true, "policy": "policy", "old_policy": "old_policy", "n_layer_a_1": 128, "n_layer_a_2": 128, "n_layer_c_1": 128, "n_layer_c_2": 128, "temperature": 0.1, "option_layer_1": 64, "option_layer_2": 64, "SAVE_PATH": "model", "graph_path": "graph", "reward_output": "output", "output_filename": "out", "log": "log", "num_adversaries": 1, "timeout": 30, "game_name": "originalClassic", 
"textGraphics": false, "quietGraphics": false, "zoom": 1.0, "fixRandomSeed": false, "recordActions": false, "replay": "None", "frameTime": 0.1, "catchExceptions": false, "continuous_action": false, "obs_sort": false, "results_path": "../results/multi_ppo/pacman/originalClassic/2020-11-21_11-40-33/"} -------------------------------------------------------------------------------- /game/particle/multiagent/policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pyglet.window import key 3 | 4 | 5 | # individual agent policy 6 | class Policy(object): 7 | def __init__(self): 8 | pass 9 | 10 | def action(self, obs): 11 | raise NotImplementedError() 12 | 13 | 14 | # interactive policy based on keyboard input 15 | # hard-coded to deal only with movement, not communication 16 | class InteractivePolicy(Policy): 17 | def __init__(self, env, agent_index): 18 | super(InteractivePolicy, self).__init__() 19 | self.env = env 20 | # hard-coded keyboard events 21 | self.move = [False for i in range(4)] 22 | self.comm = [False for i in range(env.world.dim_c)] 23 | # register keyboard events with this environment's window 24 | env.viewers[agent_index].window.on_key_press = self.key_press 25 | env.viewers[agent_index].window.on_key_release = self.key_release 26 | 27 | def action(self, obs): 28 | # ignore observation and just act based on keyboard events 29 | if self.env.discrete_action_input: 30 | u = 0 31 | if self.move[0]: u = 1 32 | if self.move[1]: u = 2 33 | if self.move[2]: u = 4 34 | if self.move[3]: u = 3 35 | else: 36 | u = np.zeros(5) # 5-d because of no-move action 37 | if self.move[0]: u[1] += 1.0 38 | if self.move[1]: u[2] += 1.0 39 | if self.move[3]: u[3] += 1.0 40 | if self.move[2]: u[4] += 1.0 41 | if True not in self.move: 42 | u[0] += 1.0 43 | return np.concatenate([u, np.zeros(self.env.world.dim_c)]) 44 | 45 | # keyboard event callbacks 46 | def key_press(self, k, mod): 47 | if k == key.LEFT: self.move[0] = True 48 | if k == key.RIGHT: self.move[1] = True 49 | if k == key.UP: self.move[2] = True 50 | if k == key.DOWN: self.move[3] = True 51 | 52 | def key_release(self, k, mod): 53 | if k == key.LEFT: self.move[0] = False 54 | if k == key.RIGHT: self.move[1] = False 55 | if k == key.UP: self.move[2] = False 56 | if k == key.DOWN: self.move[3] = False 57 | -------------------------------------------------------------------------------- /util/output_json.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | 5 | 6 | class OutputJson: 7 | def __init__(self, data_field=[]): 8 | self.data_field = data_field 9 | self.data = {} 10 | for i in range(len(data_field)): 11 | if not isinstance(data_field[i], str): 12 | raise Exception('the data field must be type of string: ' + str(data_field[i])) 13 | 14 | self.data[data_field[i]] = [] 15 | 16 | def update(self, value, key=None): 17 | if key is not None: 18 | if isinstance(value, bool): 19 | value = str(value) 20 | self.data[key].append(value) 21 | return 22 | if isinstance(value, tuple) or isinstance(value, list): 23 | if len(value) != len(self.data_field): 24 | raise Exception('Error in parameters size: ' + str(value)) 25 | for i in range(len(value)): 26 | if type(value[i]) is np.bool_ or type(value[i]) is np.bool or type(value[i]) is bool: 27 | self.data[self.data_field[i]].append(str(value[i])) 28 | else: 29 | self.data[self.data_field[i]].append(value[i]) 30 | 31 | def print_first(self): 32 | if 
self.data == {}: 33 | return 34 | for i, key in enumerate(self.data_field): 35 | print(key, ": %s, " % self.data[key][len(self.data[key]) - 1], end=' ') 36 | print() 37 | 38 | def print_by_key(self, key, index=None): 39 | if index is None: 40 | print(key, ": ", self.data[key]) 41 | else: 42 | print(key, " ", index, ": ", self.data[key][index]) 43 | 44 | def save(self, path, filename, field=None): 45 | if not os.path.exists(path): 46 | os.makedirs(path) 47 | if field is None: 48 | field = self.data_field 49 | out = {} 50 | for key in field: 51 | if len(self.data[key]) > 0 and type(self.data[key][0]) is np.ndarray: 52 | out[key] = [a.tolist() for a in self.data[key]] 53 | else: 54 | out[key] = self.data[key] 55 | with open(path + "/" + filename + ".json", "w") as f: 56 | json.dump(out, f) 57 | -------------------------------------------------------------------------------- /game/particle/multiagent/multi_discrete.py: -------------------------------------------------------------------------------- 1 | # An old version of OpenAI Gym's multi_discrete.py. (Was getting affected by Gym updates) 2 | # (https://github.com/openai/gym/blob/1fb81d4e3fb780ccf77fec731287ba07da35eb84/gym/spaces/multi_discrete.py) 3 | 4 | import numpy as np 5 | 6 | import gym 7 | from gym.spaces import prng 8 | 9 | 10 | class MultiDiscrete(gym.Space): 11 | """ 12 | - The multi-discrete action space consists of a series of discrete action spaces with different parameters 13 | - It can be adapted to both a Discrete action space or a continuous (Box) action space 14 | - It is useful to represent game controllers or keyboards where each key can be represented as a discrete action space 15 | - It is parametrized by passing an array of arrays containing [min, max] for each discrete action space 16 | where the discrete action space can take any integers from `min` to `max` (both inclusive) 17 | Note: A value of 0 always need to represent the NOOP action. 18 | e.g. 
Nintendo Game Controller 19 | - Can be conceptualized as 3 discrete action spaces: 20 | 1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4 21 | 2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1 22 | 3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1 23 | - Can be initialized as 24 | MultiDiscrete([ [0,4], [0,1], [0,1] ]) 25 | """ 26 | def __init__(self, array_of_param_array): 27 | self.low = np.array([x[0] for x in array_of_param_array]) 28 | self.high = np.array([x[1] for x in array_of_param_array]) 29 | self.num_discrete_space = self.low.shape[0] 30 | 31 | def sample(self): 32 | """ Returns an array with one sample from each discrete action space """ 33 | # For each row: round(random .* (max - min) + min, 0) 34 | random_array = prng.np_random.rand(self.num_discrete_space) 35 | return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)] 36 | 37 | def contains(self, x): 38 | return len(x) == self.num_discrete_space and (np.array(x) >= self.low).all() and (np.array(x) <= self.high).all() 39 | 40 | @property 41 | def shape(self): 42 | return self.num_discrete_space 43 | 44 | def __repr__(self): 45 | return "MultiDiscrete" + str(self.num_discrete_space) 46 | 47 | def __eq__(self, other): 48 | return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high) 49 | -------------------------------------------------------------------------------- /alg/common/common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import json 4 | from alg.muti_ptf_ppo.ppo import PPO 5 | 6 | def action_equal(action1, action2, continuous_action=None): 7 | if not continuous_action or continuous_action is None: 8 | if (isinstance(action1, list) or isinstance(action1, np.ndarray)) and (isinstance(action2, list) or isinstance(action2, np.ndarray)): 9 | return (np.array(action1) == np.array(action2)).all() 10 | else: 11 | return False 12 | elif continuous_action: 13 | mean = action1[0] 14 | std = action1[1] 15 | for i in range(len(action2)): 16 | if action2[i] < mean[i] - std[i] or action2[i] > mean[i] + std[i]: 17 | return False 18 | return True 19 | 20 | 21 | def build_source_actor(args, sess, policy_path, i=0): 22 | par_path = os.path.dirname(policy_path) 23 | file_name = '' 24 | for dirPath, dirNames, fileNames in os.walk(par_path): 25 | # print(fileNames) 26 | for fileName in fileNames: 27 | if fileName == 'args.json': 28 | file_name = fileName 29 | break 30 | if file_name != '': 31 | break 32 | file_path = par_path + "/" + file_name 33 | with open(file_path, 'r') as f: 34 | source_args = json.load(f) 35 | source_policy = 'ppo'  # args['policy'] 36 | if source_policy == 'ppo': 37 | return PPO(args['action_dim'], args['features'], source_args, sess, logger=None, i=i) 38 | else: 39 | raise Exception('no such source_policy named: ' + str(source_policy)) 40 | 41 | 42 | class OptionToList: 43 | def __init__(self, num_agent): 44 | self.num_agent = num_agent 45 | self.option_list = [] 46 | self.reset() 47 | 48 | def reset(self): 49 | self.option_list = [] 50 | length = np.power(self.num_agent - 1, self.num_agent) 51 | for i in range(length): 52 | self.option_list.append(self.number_converter(i)) 53 | 54 | # FIXME: the option network outputs an option index; converting it to a union option is a base conversion, e.g. option_dim=3, option_index=26 -> union_option=[2, 2, 2] 55 | def number_converter(self, number): 56 | hex = self.num_agent 57 | res = np.zeros(hex)
58 | index = 0 59 | while True: 60 | s = number // (hex - 1) # quotient 61 | y = number % (hex - 1) # remainder 62 | res[index] = y 63 | if s == 0: 64 | break 65 | number = s 66 | index += 1 67 | res = list(res) 68 | res.reverse() 69 | return res 70 | 71 | def get_option_list(self, i): 72 | if i >= len(self.option_list): 73 | raise IndexError('out of option_list memory!') 74 | return self.option_list[i] 75 | -------------------------------------------------------------------------------- /util/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy 3 | import time 4 | import threading 5 | 6 | R = threading.Lock() 7 | 8 | class Logger: 9 | def __init__(self, log_name, graph_path, args): 10 | # step 1: create the logger 11 | self.logger = self.build_log(log_name) 12 | self.build_tb_log(graph_path) 13 | self.args = args 14 | self.keys = dict() 15 | if 'summary_output_times' in self.args.keys(): 16 | self.summary_times = self.args['summary_output_times'] 17 | else: 18 | self.summary_times = 1 19 | 20 | 21 | def build_log(self, name): 22 | logger = logging.getLogger() 23 | logger.setLevel(logging.INFO) # global switch for the log level 24 | # step 2: create a handler that writes the log file 25 | rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) 26 | log_name = name + '/out.log' 27 | logfile = log_name 28 | fh = logging.FileHandler(logfile, mode='w') 29 | fh.setLevel(logging.DEBUG) # log level for output written to the file 30 | # step 3: define the handler's output format 31 | formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s") 32 | fh.setFormatter(formatter) 33 | # step 4: add the handler to the logger 34 | logger.addHandler(fh) 35 | return logger 36 | 37 | def build_tb_log(self, path): 38 | from tensorboard_logger import configure, log_value, log_histogram 39 | configure(path) 40 | self.tb_logger = log_value 41 | self.tb_h_logger = log_histogram 42 | 43 | def write_tb_log(self, key, value, t): 44 | if self.args['output_graph']: 45 | if t % self.summary_times != 0: 46 | return 47 | #print(key, value) 48 | if type(value) is numpy.ndarray or type(value) is list: 49 | R.acquire() 50 | if key not in self.keys.keys(): 51 | self.keys[key] = 0 52 | else: 53 | self.keys[key] += 1 54 | #print(type(value), key, value, self.keys[key]) 55 | self.tb_h_logger(key, value, self.keys[key]) 56 | R.release() 57 | else: 58 | R.acquire() 59 | if key not in self.keys.keys(): 60 | self.keys[key] = 0 61 | else: 62 | self.keys[key] += 1 63 | #print(key, value, self.keys[key]) 64 | self.tb_logger(key, value, self.keys[key]) 65 | R.release() 66 | else: 67 | return 68 | 69 | 70 | def write_log(self, msg, type='info'): 71 | if type == 'debug': 72 | self.logger.debug(msg) 73 | elif type == 'info': 74 | self.logger.info(msg) 75 | elif type == 'warning': 76 | self.logger.warning(msg) 77 | elif type == 'error': 78 | self.logger.error(msg) 79 | elif type == 'critical': 80 | self.logger.critical(msg) 81 | 82 | -------------------------------------------------------------------------------- /game/pacman/textDisplay.py: -------------------------------------------------------------------------------- 1 | # textDisplay.py 2 | # -------------- 3 | # Licensing Information: You are free to use or extend these projects for 4 | # educational purposes provided that (1) you do not distribute or publish 5 | # solutions, (2) you retain this notice, and (3) you provide clear 6 | # attribution to UC Berkeley, including a link to http://ai.berkeley.edu. 7 | # 8 | # Attribution Information: The Pacman AI projects were developed at UC Berkeley.
9 | # The core projects and autograders were primarily created by John DeNero 10 | # (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 11 | # Student side autograding was added by Brad Miller, Nick Hay, and 12 | # Pieter Abbeel (pabbeel@cs.berkeley.edu). 13 | 14 | 15 | import time 16 | try: 17 | import pacman 18 | except: 19 | pass 20 | 21 | DRAW_EVERY = 1 22 | SLEEP_TIME = 0 # This can be overwritten by __init__ 23 | DISPLAY_MOVES = False 24 | QUIET = False # Supresses output 25 | 26 | class NotGraphics: 27 | def initialize(self, state, isBlue = False): 28 | pass 29 | 30 | def update(self, state): 31 | pass 32 | 33 | def checkNullDisplay(self): 34 | return True 35 | 36 | def pause(self): 37 | time.sleep(SLEEP_TIME) 38 | 39 | def draw(self, state): 40 | pass 41 | 42 | def updateDistributions(self, dist): 43 | pass 44 | 45 | def finish(self): 46 | pass 47 | 48 | class NullGraphics: 49 | def initialize(self, state, isBlue = False): 50 | pass 51 | 52 | def update(self, state): 53 | pass 54 | 55 | def checkNullDisplay(self): 56 | return True 57 | 58 | def pause(self): 59 | time.sleep(SLEEP_TIME) 60 | 61 | def draw(self, state): 62 | print(state) 63 | 64 | def updateDistributions(self, dist): 65 | pass 66 | 67 | def finish(self): 68 | pass 69 | 70 | class PacmanGraphics: 71 | def __init__(self, speed=None): 72 | if speed != None: 73 | global SLEEP_TIME 74 | SLEEP_TIME = speed 75 | 76 | def initialize(self, state, isBlue = False): 77 | self.draw(state) 78 | self.pause() 79 | self.turn = 0 80 | self.agentCounter = 0 81 | 82 | def update(self, state): 83 | numAgents = len(state.agentStates) 84 | self.agentCounter = (self.agentCounter + 1) % numAgents 85 | if self.agentCounter == 0: 86 | self.turn += 1 87 | if DISPLAY_MOVES: 88 | ghosts = [pacman.nearestPoint(state.getGhostPosition(i)) for i in range(1, numAgents)] 89 | print(("%4d) P: %-8s" % (self.turn, str(pacman.nearestPoint(state.getPacmanPosition()))), '| Score: %-5d' % state.score, '| Ghosts:', ghosts)) 90 | if self.turn % DRAW_EVERY == 0: 91 | self.draw(state) 92 | self.pause() 93 | if state._win or state._lose: 94 | self.draw(state) 95 | 96 | def pause(self): 97 | time.sleep(SLEEP_TIME) 98 | 99 | def draw(self, state): 100 | print(state) 101 | 102 | def finish(self): 103 | pass 104 | -------------------------------------------------------------------------------- /alg/maddpg/trainer/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | class ReplayBuffer(object): 5 | def __init__(self, size): 6 | """Create Prioritized Replay buffer. 7 | 8 | Parameters 9 | ---------- 10 | size: int 11 | Max number of transitions to store in the buffer. When the buffer 12 | overflows the old memories are dropped. 
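Storage is circular: once the buffer is full, _next_idx wraps around and new transitions overwrite the oldest entries.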
13 | """ 14 | self._storage = [] 15 | self._maxsize = int(size) 16 | self._next_idx = 0 17 | 18 | def __len__(self): 19 | return len(self._storage) 20 | 21 | def clear(self): 22 | self._storage = [] 23 | self._next_idx = 0 24 | 25 | def add(self, obs_t, action, reward, obs_tp1, option, term, done): 26 | data = (obs_t, action, reward, obs_tp1, option, term, done) 27 | 28 | if self._next_idx >= len(self._storage): 29 | self._storage.append(data) 30 | else: 31 | self._storage[self._next_idx] = data 32 | self._next_idx = (self._next_idx + 1) % self._maxsize 33 | 34 | def _encode_sample(self, idxes): 35 | obses_t, actions, rewards, obses_tp1, options, terms, dones = [], [], [], [], [], [], [] 36 | for i in idxes: 37 | data = self._storage[i] 38 | obs_t, action, reward, obs_tp1, option, term, done = data 39 | obses_t.append(np.array(obs_t, copy=False)) 40 | actions.append(np.array(action, copy=False)) 41 | rewards.append(reward) 42 | obses_tp1.append(np.array(obs_tp1, copy=False)) 43 | options.append(option) 44 | terms.append(term) 45 | dones.append(done) 46 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(options), np.array(terms), np.array(dones) 47 | 48 | def make_index(self, batch_size): 49 | return [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] 50 | 51 | def make_latest_index(self, batch_size): 52 | idx = [(self._next_idx - 1 - i) % self._maxsize for i in range(batch_size)] 53 | np.random.shuffle(idx) 54 | return idx 55 | 56 | def sample_index(self, idxes): 57 | return self._encode_sample(idxes) 58 | 59 | def sample(self, batch_size): 60 | """Sample a batch of experiences. 61 | 62 | Parameters 63 | ---------- 64 | batch_size: int 65 | How many transitions to sample. 66 | 67 | Returns 68 | ------- 69 | obs_batch: np.array 70 | batch of observations 71 | act_batch: np.array 72 | batch of actions executed given obs_batch 73 | rew_batch: np.array 74 | rewards received as results of executing act_batch 75 | next_obs_batch: np.array 76 | next set of observations seen after executing act_batch 77 | done_mask: np.array 78 | done_mask[i] = 1 if executing act_batch[i] resulted in 79 | the end of an episode and 0 otherwise. 
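Passing batch_size <= 0 returns every stored transition; collect() relies on this by calling sample(-1).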
80 | """ 81 | if batch_size > 0: 82 | idxes = self.make_index(batch_size) 83 | else: 84 | idxes = range(0, len(self._storage)) 85 | return self._encode_sample(idxes) 86 | 87 | def collect(self): 88 | return self.sample(-1) 89 | -------------------------------------------------------------------------------- /util/ReplayBuffer.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import random 3 | 4 | class ReplayBuffer(object): 5 | 6 | def __init__(self, buffer_size): 7 | self.buffer_size = buffer_size 8 | self.num_experiences = 0 9 | self.buffer = deque() 10 | 11 | def get_batch(self, batch_size): 12 | # Randomly sample batch_size examples 13 | return random.sample(self.buffer, batch_size) 14 | 15 | def size(self): 16 | return self.buffer_size 17 | 18 | def add(self, state, action, reward, done, new_state, opa): 19 | experience = (state, action, reward, done, new_state, opa) 20 | if self.num_experiences < self.buffer_size: 21 | self.buffer.append(experience) 22 | self.num_experiences += 1 23 | else: 24 | self.buffer.popleft() 25 | self.buffer.append(experience) 26 | 27 | def count(self): 28 | # if buffer is full, return buffer size 29 | # otherwise, return experience counter 30 | return self.num_experiences 31 | 32 | def erase(self): 33 | self.buffer = deque() 34 | self.num_experiences = 0 35 | 36 | class ReplayBufferSR(object): 37 | 38 | def __init__(self, buffer_size): 39 | self.buffer_size = buffer_size 40 | self.num_experiences = 0 41 | self.buffer = deque() 42 | 43 | def get_batch(self, batch_size): 44 | # Randomly sample batch_size examples 45 | return random.sample(self.buffer, batch_size) 46 | 47 | def size(self): 48 | return self.buffer_size 49 | 50 | def add(self, state, action, reward, done, new_state, opa): 51 | experience = (state, action, reward, done, new_state, opa) 52 | if self.num_experiences < self.buffer_size: 53 | self.buffer.append(experience) 54 | self.num_experiences += 1 55 | else: 56 | self.buffer.popleft() 57 | self.buffer.append(experience) 58 | 59 | def count(self): 60 | # if buffer is full, return buffer size 61 | # otherwise, return experience counter 62 | return self.num_experiences 63 | 64 | def erase(self): 65 | self.buffer = deque() 66 | self.num_experiences = 0 67 | 68 | 69 | class ShareReplayBuffer(object): 70 | 71 | def __init__(self, buffer_size): 72 | self.buffer_size = buffer_size 73 | self.num_experiences = 0 74 | self.buffer = deque() 75 | 76 | def get_batch(self, batch_size): 77 | # Randomly sample batch_size examples 78 | return random.sample(self.buffer, batch_size) 79 | 80 | def size(self): 81 | return self.buffer_size 82 | 83 | def add(self, state, action, reward, done, new_state, opa, agentId): 84 | experience = (state, action, reward, done, new_state, opa, agentId) 85 | if self.num_experiences < self.buffer_size: 86 | self.buffer.append(experience) 87 | self.num_experiences += 1 88 | else: 89 | self.buffer.popleft() 90 | self.buffer.append(experience) 91 | 92 | def count(self): 93 | # if buffer is full, return buffer size 94 | # otherwise, return experience counter 95 | return self.num_experiences 96 | 97 | def erase(self): 98 | self.buffer = deque() 99 | self.num_experiences = 0 -------------------------------------------------------------------------------- /game/pacman/keyboardAgents.py: -------------------------------------------------------------------------------- 1 | # keyboardAgents.py 2 | # ----------------- 3 | # Licensing Information: You are free to use or 
extend these projects for 4 | # educational purposes provided that (1) you do not distribute or publish 5 | # solutions, (2) you retain this notice, and (3) you provide clear 6 | # attribution to UC Berkeley, including a link to http://ai.berkeley.edu. 7 | # 8 | # Attribution Information: The Pacman AI projects were developed at UC Berkeley. 9 | # The core projects and autograders were primarily created by John DeNero 10 | # (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 11 | # Student side autograding was added by Brad Miller, Nick Hay, and 12 | # Pieter Abbeel (pabbeel@cs.berkeley.edu). 13 | 14 | 15 | from game.pacman.game import Agent 16 | from game.pacman.game import Directions 17 | import random 18 | 19 | 20 | class KeyboardAgent(Agent): 21 | """ 22 | An agent controlled by the keyboard. 23 | """ 24 | # NOTE: Arrow keys also work. 25 | WEST_KEY = 'a' 26 | EAST_KEY = 'd' 27 | NORTH_KEY = 'w' 28 | SOUTH_KEY = 's' 29 | STOP_KEY = 'q' 30 | 31 | def __init__(self, index=0): 32 | 33 | self.lastMove = Directions.STOP 34 | self.index = index 35 | self.keys = [] 36 | 37 | def getAction(self, state): 38 | from game.pacman.graphicsUtils import keys_waiting 39 | from game.pacman.graphicsUtils import keys_pressed 40 | keys = keys_waiting() + keys_pressed() 41 | if keys != []: 42 | self.keys = keys 43 | 44 | legal = state.getLegalActions(self.index) 45 | move = self.getMove(legal) 46 | 47 | if move == Directions.STOP: 48 | # Try to move in the same direction as before 49 | if self.lastMove in legal: 50 | move = self.lastMove 51 | 52 | if (self.STOP_KEY in self.keys) and Directions.STOP in legal: 53 | move = Directions.STOP 54 | 55 | if move not in legal: 56 | move = random.choice(legal) 57 | 58 | self.lastMove = move 59 | return move 60 | 61 | def getMove(self, legal): 62 | move = Directions.STOP 63 | if (self.WEST_KEY in self.keys or 'Left' in self.keys) and Directions.WEST in legal: 64 | move = Directions.WEST 65 | if (self.EAST_KEY in self.keys or 'Right' in self.keys) and Directions.EAST in legal: 66 | move = Directions.EAST 67 | if (self.NORTH_KEY in self.keys or 'Up' in self.keys) and Directions.NORTH in legal: 68 | move = Directions.NORTH 69 | if (self.SOUTH_KEY in self.keys or 'Down' in self.keys) and Directions.SOUTH in legal: 70 | move = Directions.SOUTH 71 | return move 72 | 73 | 74 | class KeyboardAgent2(KeyboardAgent): 75 | """ 76 | A second agent controlled by the keyboard. 77 | """ 78 | # NOTE: Arrow keys also work. 
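# For this second player, getMove below only checks the letter keys (I/J/K/L to move, U to stop); arrow keys are read only by the first KeyboardAgent.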
79 | WEST_KEY = 'j' 80 | EAST_KEY = "l" 81 | NORTH_KEY = 'i' 82 | SOUTH_KEY = 'k' 83 | STOP_KEY = 'u' 84 | 85 | def getMove(self, legal): 86 | move = Directions.STOP 87 | if (self.WEST_KEY in self.keys) and Directions.WEST in legal: 88 | move = Directions.WEST 89 | if (self.EAST_KEY in self.keys) and Directions.EAST in legal: 90 | move = Directions.EAST 91 | if (self.NORTH_KEY in self.keys) and Directions.NORTH in legal: 92 | move = Directions.NORTH 93 | if (self.SOUTH_KEY in self.keys) and Directions.SOUTH in legal: 94 | move = Directions.SOUTH 95 | return move 96 | -------------------------------------------------------------------------------- /game/pacman/ghostAgents.py: -------------------------------------------------------------------------------- 1 | # ghostAgents.py 2 | # -------------- 3 | # Licensing Information: You are free to use or extend these projects for 4 | # educational purposes provided that (1) you do not distribute or publish 5 | # solutions, (2) you retain this notice, and (3) you provide clear 6 | # attribution to UC Berkeley, including a link to http://ai.berkeley.edu. 7 | # 8 | # Attribution Information: The Pacman AI projects were developed at UC Berkeley. 9 | # The core projects and autograders were primarily created by John DeNero 10 | # (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 11 | # Student side autograding was added by Brad Miller, Nick Hay, and 12 | # Pieter Abbeel (pabbeel@cs.berkeley.edu). 13 | 14 | 15 | from game.pacman.game import Agent 16 | from game.pacman.game import Actions 17 | from game.pacman.game import Directions 18 | import random 19 | from game.pacman.util import manhattanDistance 20 | import game.pacman.util as util 21 | 22 | 23 | class GhostAgent(Agent): 24 | 25 | def __init__(self, index): 26 | self.index = index 27 | 28 | def getAction(self, state): 29 | dist = self.getDistribution(state) 30 | if len(dist) == 0: 31 | return Directions.STOP 32 | else: 33 | return util.chooseFromDistribution(dist) 34 | 35 | def getDistribution(self, state): 36 | "Returns a Counter encoding a distribution over actions from the provided state." 37 | util.raiseNotDefined() 38 | 39 | 40 | class RandomGhost(GhostAgent): 41 | "A ghost that chooses a legal action uniformly at random." 42 | 43 | def getDistribution(self, state): 44 | dist = util.Counter() 45 | for a in state.getLegalActions(self.index): 46 | dist[a] = 1.0 47 | dist.normalize() 48 | return dist 49 | 50 | 51 | class DirectionalGhost(GhostAgent): 52 | "A ghost that prefers to rush Pacman, or flee when scared." 
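# prob_attack is the probability mass given to distance-minimising moves toward Pacman, prob_scaredFlee the mass given to distance-maximising moves while scared; the remainder is spread uniformly over the legal actions in getDistribution.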
53 | 54 | def __init__(self, index, prob_attack=0.8, prob_scaredFlee=0.8): 55 | self.index = index 56 | self.prob_attack = prob_attack 57 | self.prob_scaredFlee = prob_scaredFlee 58 | 59 | def getDistribution(self, state): 60 | # Read variables from state 61 | ghostState = state.getGhostState(self.index) 62 | legalActions = state.getLegalActions(self.index) 63 | pos = state.getGhostPosition(self.index) 64 | isScared = ghostState.scaredTimer > 0 65 | 66 | speed = 1 67 | if isScared: 68 | speed = 0.5 69 | 70 | actionVectors = [Actions.directionToVector( 71 | a, speed) for a in legalActions] 72 | newPositions = [(pos[0] + a[0], pos[1] + a[1]) for a in actionVectors] 73 | pacmanPosition = state.getPacmanPosition() 74 | 75 | # Select best actions given the state 76 | distancesToPacman = [manhattanDistance( 77 | pos, pacmanPosition) for pos in newPositions] 78 | if isScared: 79 | bestScore = max(distancesToPacman) 80 | bestProb = self.prob_scaredFlee 81 | else: 82 | bestScore = min(distancesToPacman) 83 | bestProb = self.prob_attack 84 | bestActions = [action for action, distance in zip( 85 | legalActions, distancesToPacman) if distance == bestScore] 86 | 87 | # Construct distribution 88 | dist = util.Counter() 89 | for a in bestActions: 90 | dist[a] = bestProb / len(bestActions) 91 | for a in legalActions: 92 | dist[a] += (1 - bestProb) / len(legalActions) 93 | dist.normalize() 94 | return dist 95 | -------------------------------------------------------------------------------- /game/pacman/make_env.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym 3 | import numpy as np 4 | from gym import spaces 5 | 6 | import game.pacman.layout as layout 7 | from game.pacman.pacman import readCommand 8 | from game.pacman.pacman import ClassicGameRules 9 | import game.pacman.textDisplay as textDisplay 10 | from game.pacman.ghostAgents import RandomGhost as Ghost 11 | 12 | 13 | class Agent: 14 | def __init__(self): 15 | self.name = '' 16 | 17 | def get(self): 18 | raise NotImplemented() 19 | 20 | 21 | class Wrap_pacman(): 22 | def __init__(self, args): 23 | # , layout, pacman, ghosts, display, numGames, record, numTraining = 0, catchExceptions = False, timeout = 30 24 | self.args = args 25 | self.layout = layout.getLayout(args['game_name']) 26 | self.rules = ClassicGameRules(self.args['timeout']) 27 | self.pacman = Agent() 28 | self.ghosts = [Agent() for i in range(self.layout.getNumGhosts())]#[Ghost(i+1) for i in range(self.layout.getNumGhosts())] # [Agent() for i in range(self.layout.getNumGhosts())] 29 | if self.args['quietGraphics']: 30 | display = textDisplay.NullGraphics() 31 | elif self.args['textGraphics']: 32 | textDisplay.SLEEP_TIME = self.args['frameTime'] 33 | display = textDisplay.PacmanGraphics() 34 | else: 35 | import game.pacman.graphicsDisplay as graphicsDisplay 36 | display = graphicsDisplay.PacmanGraphics(self.args['zoom'], frameTime=self.args['frameTime']) 37 | self.beQuiet = False 38 | self.textDisplay = textDisplay.NotGraphics() 39 | self.videoDisplay = display 40 | self.rules.quiet = True 41 | self.catchExceptions = self.args['catchExceptions'] 42 | self.done = True 43 | 44 | self.action2str = ['North', 'South', 'East', 'West', 'Stop'] 45 | self.game = self.rules.newGame(self.layout, self.pacman, self.ghosts, display, self.beQuiet, 46 | self.catchExceptions) 47 | 48 | # gym-like info 49 | self.n = len(self.game.agents) 50 | self.action_space = [spaces.Discrete(len(self.action2str)) for i in range(self.n)] 51 | 
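# gym-like spaces: every agent gets a Discrete(5) action space above, while the Boxes below give Pacman (agent 0) an observation of length (width + height) * n + 18 and each ghost one of length (width + height) * 2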
self.observation_space = [spaces.Box(low=0, high=1, shape=((self.layout.width + self.layout.height) * self.n + 18,), dtype=np.float32) if i == 0 else 52 | spaces.Box(low=0, high=1, shape=((self.layout.width + self.layout.height) * 2,), dtype=np.float32) 53 | for i in range(self.n)] 54 | 55 | def step(self, actions, done=None): 56 | assert not self.done, 'done! step after reset' 57 | actions = [np.argmax(a) for a in actions] 58 | actions = [self.action2str[action] for action in actions] 59 | # ghost_action = [] 60 | # ghost_action.append(actions[0]) 61 | # for ghost in self.ghosts: 62 | # action = ghost.getAction(self.game.state) 63 | # ghost_action.append(action) 64 | # print(ghost_action) 65 | state, reward, done, info = self.game.step(actions) 66 | self.done = done 67 | done = [done for i in range(self.n)] 68 | return state, reward, done, info 69 | 70 | def reset(self, render=False): 71 | del self.game 72 | del self.rules 73 | del self.pacman 74 | del self.ghosts 75 | 76 | self.pacman = Agent() 77 | self.ghosts = [Agent() for i in range(self.layout.getNumGhosts())]#[Ghost(i+1) for i in range(self.layout.getNumGhosts())] 78 | 79 | self.rules = ClassicGameRules(self.args['timeout']) 80 | self.rules.quiet = True 81 | 82 | if render: 83 | display = self.videoDisplay 84 | self.rules.quiet = False 85 | else: 86 | display = self.textDisplay 87 | self.rules.quiet = True 88 | 89 | self.game = self.rules.newGame(self.layout, self.pacman, self.ghosts, display, self.beQuiet, 90 | self.catchExceptions) 91 | 92 | self.done = False 93 | 94 | return self.game.reset(render=render) 95 | 96 | def render(self): 97 | pass 98 | 99 | 100 | def runGames(args): 101 | env = Wrap_pacman(args) 102 | return env 103 | 104 | 105 | def runGames_2(layout, pacman, ghosts, display, numGames, record, numTraining=0, catchExceptions=False, timeout=30): 106 | 107 | rules = ClassicGameRules(timeout) 108 | games = [] 109 | 110 | for i in range(numGames): 111 | beQuiet = i < numTraining 112 | 113 | gameDisplay = textDisplay.NullGraphics() 114 | rules.quiet = True 115 | 116 | # render 117 | # gameDisplay = display 118 | # rules.quiet = False 119 | 120 | game = rules.newGame(layout, pacman, ghosts, 121 | gameDisplay, beQuiet, catchExceptions) 122 | game.run() 123 | 124 | return games 125 | 126 | 127 | def make_env(args): 128 | #args = readCommand(sys.argv[1:]) # Get game components based on input 129 | #print(args) 130 | return runGames(args) 131 | # runGames_2(**args) 132 | # return env -------------------------------------------------------------------------------- /game/particle/make_env.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code for creating a multiagent environment with one of the scenarios listed 3 | in ./scenarios/. 4 | Can be called by using, for example: 5 | env = make_env('simple_speaker_listener') 6 | After producing the env object, can be used similarly to an OpenAI gym 7 | environment. 8 | 9 | A policy using this environment must output actions in the form of a list 10 | for all agents. Each element of the list should be a numpy array, 11 | of size (env.world.dim_p + env.world.dim_c, 1). Physical actions precede 12 | communication actions in this array. See environment.py for more details. 13 | """ 14 | import numpy as np 15 | import time 16 | 17 | 18 | def make_env(args): 19 | scenario_name = args['game_name'] 20 | ''' 21 | Creates a MultiAgentEnv object as env. This can be used similar to a gym 22 | environment by calling env.reset() and env.step(). 
23 | Use env.render() to view the environment on the screen. 24 | 25 | Input: 26 | scenario_name : name of the scenario from ./scenarios/ to be Returns 27 | (without the .py extension) 28 | benchmark : whether you want to produce benchmarking data 29 | (usually only done during evaluation) 30 | 31 | Some useful env properties (see environment.py): 32 | .observation_space : Returns the observation space for each agent 33 | .action_space : Returns the action space for each agent 34 | .n : Returns the number of Agents 35 | ''' 36 | from game.particle.multiagent.environment import MultiAgentEnv 37 | import game.particle.multiagent.scenarios as scenarios 38 | 39 | # load scenario from script 40 | scenario = scenarios.load(scenario_name + ".py").Scenario() 41 | # create world 42 | world = scenario.make_world(args) 43 | # create multiagent environment 44 | if args['benchmark'] and not args['obs_sort']: 45 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, scenario.benchmark_data, args=args) 46 | elif not args['benchmark'] and args['obs_sort']: 47 | if args["reward_func"] == "reward2": 48 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward2, scenario.observation, scenario.observation_sort, scenario.is_done2, args=args) 49 | elif args["reward_func"] == "reward3": 50 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward3, scenario.observation3, scenario.observation_sort3, scenario.is_done3, args=args) 51 | else: 52 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, 53 | scenario.observation_sort, scenario.is_done, args=args) 54 | elif not args['benchmark'] and not args['obs_sort']: 55 | if args["reward_func"] == "reward2": 56 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward2, scenario.observation, args=args) 57 | elif args["reward_func"] == "reward3": 58 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward3, scenario.observation3, args=args) 59 | else: 60 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, args=args) 61 | else: 62 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward2, scenario.observation, args=args) 63 | return env 64 | 65 | 66 | # test 67 | def action(obs): 68 | if env.discrete_action_space: 69 | i = np.random.randint(0, 5) 70 | u = np.zeros(5) 71 | u[i] = 1 72 | else: 73 | u = np.array([(np.random.random() - 0.5) * 2, (np.random.random() - 0.5) * 2]) 74 | return u 75 | 76 | 77 | if __name__ == '__main__': 78 | args = dict() 79 | args['game_name'] = "simple_spread_old" 80 | args['benchmark'] = False 81 | args['obs_sort'] = False 82 | args['reward_func'] = 'reward' 83 | args['restrict_move'] = True 84 | args['num_adversaries'] = 0 85 | args['num_good'] = 6 86 | env = make_env(args) 87 | print(env.action_space) 88 | env.render() 89 | # create interactive policies for each agent 90 | # execution loop 91 | obs_n = env.reset() 92 | print(env.observation_space) 93 | print(env.action_space) 94 | 95 | for ep in range(100): 96 | obs_n = env.reset() 97 | step = 0 98 | reward = np.zeros(env.n) 99 | done = [False for i in range(env.n)] 100 | while True: 101 | # query for action from each agent's policy 102 | act_n = [] 103 | for i in range(env.n): 104 | act_n.append(action(obs_n[i])) 105 | #print(act_n) 106 | #print(act_n) 107 | # step environment 108 | obs_n, reward_n, done_n, _ = env.step(act_n) 109 | for i in range(env.n): 110 | if not done[i]: 111 | done[i] = done_n[i] 112 | reward += reward_n 113 | 
#print(obs_n) 114 | # render all agent views 115 | #time.sleep(0.1) 116 | env.render() 117 | step += 1 118 | if step > 100 or all((done_n[i] is True for i in range(env.n))): 119 | print(step, reward, done_n) 120 | break 121 | 122 | 123 | -------------------------------------------------------------------------------- /game/particle/multiagent/scenarios/simple_spread.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from game.particle.multiagent.core import World, Agent, Landmark 3 | from game.particle.multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self, args=None): 8 | world = World() 9 | # set any world properties first 10 | world.dim_c = 2 11 | num_agents = 10 12 | num_landmarks = 10 13 | if args is not None and args['num_good'] != 0: 14 | num_landmarks = args['num_good'] 15 | num_agents = num_landmarks 16 | world.cam_range = 4 17 | world.collaborative = False 18 | # add agents 19 | world.agents = [Agent() for i in range(num_agents)] 20 | for i, agent in enumerate(world.agents): 21 | agent.name = 'agent %d' % i 22 | agent.collide = True 23 | agent.silent = True 24 | #agent.size = 0.15 25 | # add landmarks 26 | world.landmarks = [Landmark() for i in range(num_landmarks)] 27 | for i, landmark in enumerate(world.landmarks): 28 | landmark.name = 'landmark %d' % i 29 | landmark.collide = False 30 | landmark.movable = False 31 | # make initial conditions 32 | self.reset_world(world) 33 | return world 34 | 35 | def reset_world(self, world): 36 | # random properties for agents 37 | for i, agent in enumerate(world.agents): 38 | agent.color = np.array([0.35, 0.35, 0.85]) 39 | # random properties for landmarks 40 | for i, landmark in enumerate(world.landmarks): 41 | landmark.color = np.array([0.25, 0.25, 0.25]) 42 | # set random initial states 43 | for agent in world.agents: 44 | agent.state.p_pos = np.random.uniform(-world.cam_range, +world.cam_range, world.dim_p) 45 | agent.state.p_vel = np.zeros(world.dim_p) 46 | agent.state.c = np.zeros(world.dim_c) 47 | for i, landmark in enumerate(world.landmarks): 48 | landmark.state.p_pos = np.random.uniform(-world.cam_range, +world.cam_range, world.dim_p) 49 | landmark.state.p_vel = np.zeros(world.dim_p) 50 | 51 | def benchmark_data(self, agent, world): 52 | rew = 0 53 | collisions = 0 54 | occupied_landmarks = 0 55 | min_dists = 0 56 | for l in world.landmarks: 57 | dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents] 58 | min_dists += min(dists) 59 | rew -= min(dists) 60 | if min(dists) < 0.1: 61 | occupied_landmarks += 1 62 | if agent.collide: 63 | for a in world.agents: 64 | if self.is_collision(a, agent): 65 | rew -= 1 66 | collisions += 1 67 | return (rew, collisions, min_dists, occupied_landmarks) 68 | 69 | 70 | def is_collision(self, agent1, agent2): 71 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 72 | dist = np.sqrt(np.sum(np.square(delta_pos))) 73 | dist_min = agent1.size + agent2.size 74 | return True if dist < dist_min else False 75 | 76 | # def reward(self, agent, world): 77 | # # Agents are rewarded based on minimum agent distance to each landmark, penalized for collisions 78 | # rew = 0 79 | # 80 | # # for l in world.landmarks: 81 | # # dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents] 82 | # # rew -= min(dists) 83 | # dists = [np.sqrt(np.sum(np.square(agent.state.p_pos - l.state.p_pos))) for l in world.landmarks] 84 | # rew -= min(dists) 85 | 
# if agent.collide: 86 | # for a in world.agents: 87 | # if self.is_collision(a, agent): 88 | # rew -= 1 89 | # return rew 90 | def reward(self, agent, world): 91 | # Agents are rewarded based on minimum agent distance to each landmark, penalized for collisions 92 | rew = 0 93 | agentIndex = 0 94 | for i, a in enumerate(world.agents): 95 | if a.name == agent.name: 96 | agentIndex = i 97 | break 98 | dists = np.sqrt(np.sum(np.square(agent.state.p_pos - world.landmarks[agentIndex].state.p_pos))) 99 | # if self.is_collision(agent, world.landmarks[agentIndex]): 100 | # rew = 1 101 | rew -= dists 102 | return rew 103 | 104 | def observation(self, agent, world): 105 | # get positions of all entities in this agent's reference frame 106 | entity_pos = [] 107 | for entity in world.landmarks: # world.entities: 108 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 109 | # entity colors 110 | entity_color = [] 111 | for entity in world.landmarks: # world.entities: 112 | entity_color.append(entity.color) 113 | # communication of all other agents 114 | comm = [] 115 | other_pos = [] 116 | for other in world.agents: 117 | if other is agent: continue 118 | comm.append(other.state.c) 119 | other_pos.append(other.state.p_pos - agent.state.p_pos) 120 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + comm) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MAPTF 2 | 3 | Source code for paper: An Efficient Transfer Learning Framework for Multiagent Reinforcement Learning 4 | 5 | * [MAPTF code](#MAPTF code) 6 | * [Installation](#Installation) 7 | * [Run an experiment](#Run an experiment) 8 | * [Example](#Example) 9 | * [Results](#results) 10 | * [Configuration](#Configuration) 11 | * [Operating parameters](#Operating parameters) 12 | * [Core parameters](#Core parameters) 13 | * [Some experiences setting in paper](#Some experiences setting in paper) 14 | * [In BibTeX format](#In BibTeX format) 15 | 16 | ## MAPTF code 17 | * MAPTF 18 | * alg (multiagent polices) 19 | * maddpg 20 | * muti_ptf_ppo 21 | * sharing_multi_ppo 22 | * option 23 | * config (Configuration parameters of each polices) 24 | * maddpg_conf (including maddpg and maddpg_sr) 25 | * ppo_config (including ppo sro shppo and shsro) 26 | * particle_conf (Configuration of particle game ) 27 | * pacman_conf (Configuration of pacman game) 28 | * run (execute the tasks) 29 | * run_maddpg_sr (including maddpg and maddpg_sr) 30 | * run_multi_ptf_ppo_sro (including ppo sro) 31 | * run_multi_ptf_shppo_sro (including shppo and shsro) 32 | * source (opponent policies) 33 | * util 34 | * main (entry function) 35 | 36 | ## Installation 37 | python==3.6.5 38 | 39 | pip install -r requirements.txt 40 | 41 | ## Running Example 42 | 43 | #### Example 44 | 45 | ``` 46 | #MAPTF-PPO Pacman 47 | python main.py -a multi_ppo -c ppo_conf -g pacman -d pacman_conf game_name=originalClassic num_adversaries=1 adv_load_model=True adv_load_model_path=source/pacman/original/0/model 48 | ``` 49 | ``` 50 | #MAPTF-PPO Predator-prey 4 51 | python main.py -a multi_ppo -c ppo_conf -g particle -d particle_conf game_name=simple_tag num_adversaries=3 good_use_option=False good_load_model=True good_load_model_path=source/simple_tag/tag4/model_30000 c1=0.001 52 | ``` 53 | some logs will be shown below: 54 | ``` 55 | INFO:tensorflow:Restoring parameters from source/pacman/original/0/model_0.ckpt 56 | win : [False, 
False, False, False], step : 100, discounted_reward : [ 0.61213843 -0.63762798 -0.63762798 -0.63762798], discount_reward_mean : [ 0.61213843 -0.63762798 -0.63762798 -0.63762798], undiscounted_reward : [ 0.31 -1.01 -1.01 -1.01], reward_mean : [ 0.31 -1.01 -1.01 -1.01], episode : 0, 57 | win : [False, False, False, False], step : 100, discounted_reward : [ 0.58945708 -0.63762798 -0.63762798 -0.63762798], discount_reward_mean : [ 0.60079775 -0.63762798 -0.63762798 -0.63762798], undiscounted_reward : [ 0.31 -1.01 -1.01 -1.01], reward_mean : [ 0.31 -1.01 -1.01 -1.01], episode : 1, 58 | ``` 59 | 60 | #### Results 61 | 62 | All results will be stored in the `results/alg_name/game_type/game_name/time` folder, every folder contains `graph`, `log`, `model`, `output`, `args.json`, `command.txt` 63 | 64 | If you do not want to save `graph` and `model`, setting param `save_model=False`. 65 | * `graph`: can use `tensorboard --logdir=path` to check the tensorflow graph and loss in terminal. 66 | * `log`: the print results in terminal. 67 | * `model`: models saved every `save_per_episodes` episodes. 68 | * `output.json`: reward results. 69 | * `args.json`: store all params. 70 | * `command.txt`: shell command. 71 | 72 | ## Source Policy 73 | 74 | Source policies contain pre-trained opponent policies. For example, in pac-man, the pac-man agent is the opponent, the policy is a pre-trained PPO; in predator-prey, the blue circle agents are pre-trained using PPO. Using test mode via `-t` `load_model`can reload the model to render 75 | 76 | ## Configuration 77 | 78 | The config files act as defaults for an algorithm or environment. 79 | 80 | They are all located in `config`. 81 | 82 | #### Operating parameters 83 | 84 | Take the above example: 85 | * `-a multi_ppo`: choose an algorithm. 86 | * `-c ppo_conf`: choose corresponding algorithm configuration. 87 | * `-g pacman`: game type. 88 | * `-d pacman_conf`: game configuration. 89 | * `-t`: evaluation the results, by setting `-t True`, and `-t False` as default. 90 | * `game_name=originalClassic`: choose a game environment. 91 | * `num_adversaries=1`: as needed. 92 | * `adv_load_model=True adv_load_model_path=source/pacman/original/0/model`: load source policy. 93 | * `adv_use_option, good_use_option`: use option, by setting `True`, `False` as default. Learning ppo, shppo and maddpg, setting `False`, otherwise setting `True` as needed. 
94 | 95 | #### Core parameters 96 | 97 | Default: 98 | * `option_layer_1=128, option_layer_2=128` 99 | * `learning_rate_r=0.0003` 100 | * `embedding_dim=32` 101 | * `option_embedding_layer=64` 102 | * `recon_loss_coef=0.1` 103 | * `option_batch_size=32` 104 | * `c1=0.005` 105 | * `e_greedy_increment=0.001` 106 | * `learning_rate_o=0.00001, learning_rate_t=0.00001` 107 | * `xi=0.005` 108 | 109 | #### Some experiences setting in paper 110 | ``` 111 | #ppo+sro, game type=pacman, game environment=mediumClassic 112 | c1=0.005 113 | ``` 114 | ``` 115 | #ppo+sro, game type=pacman, game environment=originalClassic 116 | option_batch_size=128 117 | c1=0.0005 118 | ``` 119 | ``` 120 | #maddpg+sro, game type=particle, game environment=simple_tag 121 | option_layer_1=128 option_layer_2=128 122 | learning_rate_o=0.00001 learning_rate_t=0.00001 123 | c1=0.005 124 | xi=0 125 | ``` 126 | ``` 127 | #ppo+sro, game type=particle, game environment=simple_tag 128 | option_layer_1=32 option_layer_2=32 129 | c1=0.1 130 | option_batch_size=128 131 | ``` 132 | ``` 133 | #shsro, game type=particle, game environment=simple_tag 134 | option_layer_1=32 option_layer_2=32 135 | c1=0.1 136 | ``` 137 | 138 | MADDPG code follows: https://github.com/openai/maddpg 139 | 140 | ## In BibTeX format: 141 | 142 | ```tex 143 | @article{yang2021efficient, 144 | title={An Efficient Transfer Learning Framework for Multiagent Reinforcement Learning}, 145 | author={Yang, Tianpei and Wang, Weixun and Tang, Hongyao and Hao, Jianye and Meng, Zhaopeng and Mao, Hangyu and Li, Dong and Liu, Wulong and Chen, Yingfeng and Hu, Yujing and others}, 146 | journal={Advances in Neural Information Processing Systems}, 147 | volume={34}, 148 | year={2021} 149 | } 150 | ``` 151 | -------------------------------------------------------------------------------- /game/particle/README.md: -------------------------------------------------------------------------------- 1 | **Status:** Archive (code is provided as-is, no updates expected) 2 | 3 | # Multi-Agent Particle Environment 4 | 5 | A simple multi-agent particle world with a continuous observation and discrete action space, along with some basic simulated physics. 6 | Used in the paper [Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments](https://arxiv.org/pdf/1706.02275.pdf). 7 | 8 | ## Getting started: 9 | 10 | - To install, `cd` into the root directory and type `pip install -e .` 11 | 12 | - To interactively view moving to landmark scenario (see others in ./scenarios/): 13 | `bin/interactive.py --scenario simple.py` 14 | 15 | - Known dependencies: Python (3.5.4), OpenAI gym (0.10.5), numpy (1.14.5) 16 | 17 | - To use the environments, look at the code for importing them in `make_env.py`. 18 | 19 | ## Code structure 20 | 21 | - `make_env.py`: contains code for importing a multiagent environment as an OpenAI Gym-like object. 22 | 23 | - `./multiagent/environment.py`: contains code for environment simulation (interaction physics, `_step()` function, etc.) 24 | 25 | - `./multiagent/core.py`: contains classes for various objects (Entities, Landmarks, Agents, etc.) that are used throughout the code. 26 | 27 | - `./multiagent/rendering.py`: used for displaying agent behaviors on the screen. 28 | 29 | - `./multiagent/policy.py`: contains code for interactive policy based on keyboard input. 30 | 31 | - `./multiagent/scenario.py`: contains base scenario object that is extended for all scenarios. 32 | 33 | - `./multiagent/scenarios/`: folder where various scenarios/ environments are stored. 
scenario code consists of several functions: 34 | 1) `make_world()`: creates all of the entities that inhabit the world (landmarks, agents, etc.), assigns their capabilities (whether they can communicate, or move, or both). 35 | called once at the beginning of each training session 36 | 2) `reset_world()`: resets the world by assigning properties (position, color, etc.) to all entities in the world 37 | called before every episode (including after make_world() before the first episode) 38 | 3) `reward()`: defines the reward function for a given agent 39 | 4) `observation()`: defines the observation space of a given agent 40 | 5) (optional) `benchmark_data()`: provides diagnostic data for policies trained on the environment (e.g. evaluation metrics) 41 | 42 | ### Creating new environments 43 | 44 | You can create new scenarios by implementing the first 4 functions above (`make_world()`, `reset_world()`, `reward()`, and `observation()`). 45 | 46 | ## List of environments 47 | 48 | 49 | | Env name in code (name in paper) | Communication? | Competitive? | Notes | 50 | | --- | --- | --- | --- | 51 | | `simple.py` | N | N | Single agent sees landmark position, rewarded based on how close it gets to landmark. Not a multiagent environment -- used for debugging policies. | 52 | | `simple_adversary.py` (Physical deception) | N | Y | 1 adversary (red), N good agents (green), N landmarks (usually N=2). All agents observe position of landmarks and other agents. One landmark is the ‘target landmark’ (colored green). Good agents rewarded based on how close one of them is to the target landmark, but negatively rewarded if the adversary is close to target landmark. Adversary is rewarded based on how close it is to the target, but it doesn’t know which landmark is the target landmark. So good agents have to learn to ‘split up’ and cover all landmarks to deceive the adversary. | 53 | | `simple_crypto.py` (Covert communication) | Y | Y | Two good agents (alice and bob), one adversary (eve). Alice must sent a private message to bob over a public channel. Alice and bob are rewarded based on how well bob reconstructs the message, but negatively rewarded if eve can reconstruct the message. Alice and bob have a private key (randomly generated at beginning of each episode), which they must learn to use to encrypt the message. | 54 | | `simple_push.py` (Keep-away) | N |Y | 1 agent, 1 adversary, 1 landmark. Agent is rewarded based on distance to landmark. Adversary is rewarded if it is close to the landmark, and if the agent is far from the landmark. So the adversary learns to push agent away from the landmark. | 55 | | `simple_reference.py` | Y | N | 2 agents, 3 landmarks of different colors. Each agent wants to get to their target landmark, which is known only by other agent. Reward is collective. So agents have to learn to communicate the goal of the other agent, and navigate to their landmark. This is the same as the simple_speaker_listener scenario where both agents are simultaneous speakers and listeners. | 56 | | `simple_speaker_listener.py` (Cooperative communication) | Y | N | Same as simple_reference, except one agent is the ‘speaker’ (gray) that does not move (observes goal of other agent), and other agent is the listener (cannot speak, but must navigate to correct landmark).| 57 | | `simple_spread.py` (Cooperative navigation) | N | N | N agents, N landmarks. Agents are rewarded based on how far any agent is from each landmark. Agents are penalized if they collide with other agents. 
So, agents have to learn to cover all the landmarks while avoiding collisions. | 58 | | `simple_tag.py` (Predator-prey) | N | Y | Predator-prey environment. Good agents (green) are faster and want to avoid being hit by adversaries (red). Adversaries are slower and want to hit good agents. Obstacles (large black circles) block the way. | 59 | | `simple_world_comm.py` | Y | Y | Environment seen in the video accompanying the paper. Same as simple_tag, except (1) there is food (small blue balls) that the good agents are rewarded for being near, (2) we now have ‘forests’ that hide agents inside from being seen from outside; (3) there is a ‘leader adversary” that can see the agents at all times, and can communicate with the other adversaries to help coordinate the chase. | 60 | 61 | ## Paper citation 62 | 63 | If you used this environment for your experiments or found it helpful, consider citing the following papers: 64 | 65 | Environments in this repo: 66 |
67 | @article{lowe2017multi,
68 | title={Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments},
69 | author={Lowe, Ryan and Wu, Yi and Tamar, Aviv and Harb, Jean and Abbeel, Pieter and Mordatch, Igor},
70 | journal={Neural Information Processing Systems (NIPS)},
71 | year={2017}
72 | }
73 |
74 |
75 | Original particle world environment:
76 |
77 | @article{mordatch2017emergence,
78 | title={Emergence of Grounded Compositional Language in Multi-Agent Populations},
79 | author={Mordatch, Igor and Abbeel, Pieter},
80 | journal={arXiv preprint arXiv:1703.04908},
81 | year={2017}
82 | }
83 |
84 |
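As a rough illustration of the "Creating new environments" section above, a new scenario only has to implement the four required callbacks. The sketch below is illustrative rather than part of the repository: it mirrors `simple_spread.py` with a single agent and a single landmark, and assumes the stock `BaseScenario`/`MultiAgentEnv` construction path used in `make_env.py`.

```python
import numpy as np
from game.particle.multiagent.core import World, Agent, Landmark
from game.particle.multiagent.scenario import BaseScenario


class Scenario(BaseScenario):
    def make_world(self, args=None):
        world = World()
        world.agents = [Agent()]
        world.agents[0].name = 'agent 0'
        world.agents[0].silent = True
        world.landmarks = [Landmark()]
        world.landmarks[0].name = 'landmark 0'
        world.landmarks[0].collide = False
        self.reset_world(world)
        return world

    def reset_world(self, world):
        # place the agent and the landmark uniformly at random
        for entity in world.entities:
            entity.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
            entity.state.p_vel = np.zeros(world.dim_p)
        world.agents[0].state.c = np.zeros(world.dim_c)

    def reward(self, agent, world):
        # negative distance to the single landmark
        delta = agent.state.p_pos - world.landmarks[0].state.p_pos
        return -np.sqrt(np.sum(np.square(delta)))

    def observation(self, agent, world):
        return np.concatenate([agent.state.p_vel, agent.state.p_pos,
                               world.landmarks[0].state.p_pos - agent.state.p_pos])
```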
--------------------------------------------------------------------------------
/game/pacman/layout.py:
--------------------------------------------------------------------------------
1 | # layout.py
2 | # ---------
3 | # Licensing Information: You are free to use or extend these projects for
4 | # educational purposes provided that (1) you do not distribute or publish
5 | # solutions, (2) you retain this notice, and (3) you provide clear
6 | # attribution to UC Berkeley, including a link to http://ai.berkeley.edu.
7 | #
8 | # Attribution Information: The Pacman AI projects were developed at UC Berkeley.
9 | # The core projects and autograders were primarily created by John DeNero
10 | # (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu).
11 | # Student side autograding was added by Brad Miller, Nick Hay, and
12 | # Pieter Abbeel (pabbeel@cs.berkeley.edu).
13 |
14 |
15 | from game.pacman.util import manhattanDistance
16 | from game.pacman.game import Grid
17 | import os
18 | import random
19 | from functools import reduce
20 |
21 | VISIBILITY_MATRIX_CACHE = {}
22 |
23 |
24 | class Layout:
25 | """
26 | A Layout manages the static information about the game board.
27 | """
28 |
29 | def __init__(self, layoutText):
30 | self.width = len(layoutText[0])
31 | self.height = len(layoutText)
32 | self.walls = Grid(self.width, self.height, False)
33 | self.food = Grid(self.width, self.height, False)
34 | self.capsules = []
35 | self.agentPositions = []
36 | self.numGhosts = 0
37 | self.processLayoutText(layoutText)
38 | self.layoutText = layoutText
39 | self.totalFood = len(self.food.asList())
40 | # self.initializeVisibilityMatrix()
41 |
42 | def getNumGhosts(self):
43 | return self.numGhosts
44 |
45 | def initializeVisibilityMatrix(self):
46 | global VISIBILITY_MATRIX_CACHE
47 | if reduce(str.__add__, self.layoutText) not in VISIBILITY_MATRIX_CACHE:
48 | from game.pacman.game import Directions
49 | vecs = [(-0.5, 0), (0.5, 0), (0, -0.5), (0, 0.5)]
50 | dirs = [Directions.NORTH, Directions.SOUTH,
51 | Directions.WEST, Directions.EAST]
52 | vis = Grid(self.width, self.height, {Directions.NORTH: set(), Directions.SOUTH: set(
53 | ), Directions.EAST: set(), Directions.WEST: set(), Directions.STOP: set()})
54 | for x in range(self.width):
55 | for y in range(self.height):
56 | if not self.walls[x][y]:
57 | for vec, direction in zip(vecs, dirs):
58 | dx, dy = vec
59 | nextx, nexty = x + dx, y + dy
60 | while (nextx + nexty) != int(nextx) + int(nexty) or not self.walls[int(nextx)][int(nexty)]:
61 | vis[x][y][direction].add((nextx, nexty))
62 | nextx, nexty = nextx + dx, nexty + dy  # step further along the ray
63 | self.visibility = vis
64 | VISIBILITY_MATRIX_CACHE[reduce(str.__add__, self.layoutText)] = vis
65 | else:
66 | self.visibility = VISIBILITY_MATRIX_CACHE[
67 | reduce(str.__add__, self.layoutText)]
68 |
69 | def isWall(self, pos):
70 | x, col = pos
71 | return self.walls[x][col]
72 |
73 | def getRandomLegalPosition(self):
74 | x = random.choice(list(range(self.width)))
75 | y = random.choice(list(range(self.height)))
76 | while self.isWall((x, y)):
77 | x = random.choice(list(range(self.width)))
78 | y = random.choice(list(range(self.height)))
79 | return (x, y)
80 |
81 | def getRandomCorner(self):
82 | poses = [(1, 1), (1, self.height - 2), (self.width - 2, 1),
83 | (self.width - 2, self.height - 2)]
84 | return random.choice(poses)
85 |
86 | def getFurthestCorner(self, pacPos):
87 | poses = [(1, 1), (1, self.height - 2), (self.width - 2, 1),
88 | (self.width - 2, self.height - 2)]
89 | dist, pos = max([(manhattanDistance(p, pacPos), p) for p in poses])
90 | return pos
91 |
92 | def isVisibleFrom(self, ghostPos, pacPos, pacDirection):
93 | row, col = [int(x) for x in pacPos]
94 | return ghostPos in self.visibility[row][col][pacDirection]
95 |
96 | def __str__(self):
97 | return "\n".join(self.layoutText)
98 |
99 | def deepCopy(self):
100 | return Layout(self.layoutText[:])
101 |
102 | def processLayoutText(self, layoutText):
103 | """
104 | Coordinates are flipped from the input format to the (x,y) convention here
105 |
106 | The shape of the maze. Each character
107 | represents a different type of object.
108 | % - Wall
109 | . - Food
110 | o - Capsule
111 | G - Ghost
112 | P - Pacman
113 | Other characters are ignored.
114 | """
115 | maxY = self.height - 1
116 | for y in range(self.height):
117 | for x in range(self.width):
118 | layoutChar = layoutText[maxY - y][x]
119 | self.processLayoutChar(x, y, layoutChar)
120 | # (x1, y1) = self.getRandomLegalPosition()
121 | # self.agentPositions.append((0, (x1, y1)))
122 | # for i in range(self.numGhosts):
123 | # (x1, y1) = self.getRandomLegalPosition()
124 | # self.agentPositions.append((1, (x1, y1)))
125 | self.agentPositions.sort()
126 | self.agentPositions = [(i == 0, pos) for i, pos in self.agentPositions]
127 |
128 | def processLayoutChar(self, x, y, layoutChar):
129 | if layoutChar == '%':
130 | self.walls[x][y] = True
131 | elif layoutChar == '.':
132 | self.food[x][y] = True
133 | elif layoutChar == 'o':
134 | self.capsules.append((x, y))
135 | elif layoutChar == 'P':
136 | self.agentPositions.append((0, (x, y)))
137 | #(x1, y1) = self.getRandomLegalPosition()
138 | #self.agentPositions.append((0, (x1, y1)))
139 | elif layoutChar in ['G']:
140 | self.agentPositions.append((1, (x, y)))
141 | #(x1, y1) = self.getRandomLegalPosition()
142 | #self.agentPositions.append((1, (x1, y1)))
143 | self.numGhosts += 1
144 | elif layoutChar in ['1', '2', '3', '4']:
145 | self.agentPositions.append((int(layoutChar), (x, y)))
146 | self.numGhosts += 1
147 |
148 |
149 | def getLayout(name, back=2):
150 | # print('1:', os.getcwd())
151 | # print(os.path.abspath(__file__))
152 | if name.endswith('.lay'):
153 | layout = tryToLoad(os.getcwd() + '/game/pacman/layouts/' + name)
154 | print(os.getcwd() + '/game/pacman/layouts/' + name)
155 | if layout is None:
156 | layout = tryToLoad(name)
157 | else:
158 | layout = tryToLoad(os.getcwd() + '/game/pacman/' + 'layouts/' + name + '.lay')
159 | print(os.getcwd() + '/game/pacman/' + 'layouts/' + name + '.lay')
160 | if layout is None:
161 | layout = tryToLoad(name + '.lay')
162 | if layout is None and back >= 0:
163 | curdir = os.path.abspath('.')
164 | os.chdir('..')
165 | layout = getLayout(name, back - 1)
166 | os.chdir(curdir)
167 | return layout
168 |
169 |
170 | def tryToLoad(fullname):
171 | if not os.path.exists(fullname):
172 | return None
173 | f = open(fullname)
174 | try:
175 | return Layout([line.strip() for line in f])
176 | finally:
177 | f.close()
178 |
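For reference, a layout in the character format documented by `processLayoutText` above (`%` wall, `.` food, `o` capsule, `G` ghost, `P` Pacman) can be parsed directly. The snippet below is a hypothetical example, assuming the `Grid` dependency imported from `game.pacman.game` is available:

```python
from game.pacman.layout import Layout

# A made-up 5x5 maze; rows are listed top-to-bottom and flipped internally,
# so (0, 0) is the bottom-left cell.
layout_text = [
    "%%%%%",
    "%P..%",
    "%.%o%",
    "%..G%",
    "%%%%%",
]
layout = Layout(layout_text)
print(layout.width, layout.height)   # 5 5
print(layout.getNumGhosts())         # 1
print(layout.isWall((0, 0)))         # True: the outer border is a wall
```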
--------------------------------------------------------------------------------
/game/particle/multiagent/scenarios/simple_tag.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from game.particle.multiagent.core import World, Agent, Landmark
3 | from game.particle.multiagent.scenario import BaseScenario
4 |
5 |
6 | class Scenario(BaseScenario):
7 | def make_world(self, args=None):
8 | world = World()
9 | # set any world properties first
10 | world.dim_c = 2
11 | num_good_agents = 1
12 | num_adversaries = 6
13 | if args is not None and args['num_good'] != 0:
14 | num_good_agents = args['num_good']
15 | if args is not None and args['num_adversaries'] != 0:
16 | num_adversaries = args['num_adversaries']
17 | world.cam_range = 1
18 | num_agents = num_adversaries + num_good_agents
19 | num_landmarks = 2
20 | # add agents
21 | world.agents = [Agent() for i in range(num_agents)]
22 | for i, agent in enumerate(world.agents):
23 | agent.name = 'agent %d' % i
24 | agent.collide = True
25 | agent.silent = True
26 | agent.adversary = True if i < num_adversaries else False
27 | agent.size = 0.075 if agent.adversary else 0.05
28 | agent.accel = 3.0 if agent.adversary else 4.0
29 | #agent.accel = 20.0 if agent.adversary else 25.0
30 | agent.max_speed = 1.0 if agent.adversary else 1.3
31 | # add landmarks
32 | world.landmarks = [Landmark() for i in range(num_landmarks)]
33 | for i, landmark in enumerate(world.landmarks):
34 | landmark.name = 'landmark %d' % i
35 | landmark.collide = True
36 | landmark.movable = False
37 | landmark.size = 0.2
38 | landmark.boundary = False
39 | # make initial conditions
40 | self.reset_world(world)
41 | return world
42 |
43 | def reset_world(self, world):
44 | # random properties for agents
45 | for i, agent in enumerate(world.agents):
46 | agent.color = np.array([0.35, 0.85, 0.35]) if not agent.adversary else np.array([0.85, 0.35, 0.35])
47 | # random properties for landmarks
48 | for i, landmark in enumerate(world.landmarks):
49 | landmark.color = np.array([0.25, 0.25, 0.25])
50 | # set random initial states
51 | for agent in world.agents:
52 | agent.state.p_pos = np.random.uniform(-world.cam_range, +world.cam_range, world.dim_p)
53 | agent.state.p_vel = np.zeros(world.dim_p)
54 | agent.state.c = np.zeros(world.dim_c)
55 | for i, landmark in enumerate(world.landmarks):
56 | if not landmark.boundary:
57 | landmark.state.p_pos = np.random.uniform(-world.cam_range, +world.cam_range, world.dim_p)
58 | landmark.state.p_vel = np.zeros(world.dim_p)
59 |
60 | def benchmark_data(self, agent, world):
61 | # returns data for benchmarking purposes
62 | if agent.adversary:
63 | collisions = 0
64 | for a in self.good_agents(world):
65 | if self.is_collision(a, agent):
66 | collisions += 1
67 | return collisions
68 | else:
69 | return 0
70 |
71 | def is_collision(self, agent1, agent2):
72 | delta_pos = agent1.state.p_pos - agent2.state.p_pos
73 | dist = np.sqrt(np.sum(np.square(delta_pos)))
74 | dist_min = agent1.size + agent2.size
75 | return True if dist < dist_min else False
76 |
77 | # return all agents that are not adversaries
78 | def good_agents(self, world):
79 | return [agent for agent in world.agents if not agent.adversary]
80 |
81 | # return all adversarial agents
82 | def adversaries(self, world):
83 | return [agent for agent in world.agents if agent.adversary]
84 |
85 | def reward(self, agent, world):
86 | # Agents are rewarded based on minimum agent distance to each landmark
87 | main_reward = self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world)
88 | return main_reward
89 |
90 | def agent_reward(self, agent, world):
91 | # Agents are negatively rewarded if caught by adversaries
92 | rew = 0
93 | shape = False
94 | adversaries = self.adversaries(world)
95 | if shape: # reward can optionally be shaped (increased reward for increased distance from adversary)
96 | for adv in adversaries:
97 | rew += 0.1 * np.sqrt(np.sum(np.square(agent.state.p_pos - adv.state.p_pos)))
98 | if agent.collide:
99 | for a in adversaries:
100 | if self.is_collision(a, agent):
101 | rew -= 10
102 |
103 | # agents are penalized for exiting the screen, so that they can be caught by the adversaries
104 | def bound(x):
105 | if x < world.cam_range * 0.9:
106 | return 0
107 | if x < world.cam_range:
108 | return (x - 0.9 * world.cam_range) * 10 / world.cam_range
109 | return min(np.exp((2 * x - 2 * world.cam_range) / world.cam_range), 10)
110 | for p in range(world.dim_p):
111 | x = abs(agent.state.p_pos[p])
112 | rew -= bound(x)
113 | return rew
114 |
115 | def adversary_reward(self, agent, world):
116 | # Adversaries are rewarded for collisions with agents
117 | rew = 0
118 | shape = False
119 | agents = self.good_agents(world)
120 | adversaries = self.adversaries(world)
121 | if shape: # reward can optionally be shaped (decreased reward for increased distance from agents)
122 | for adv in adversaries:
123 | rew -= 0.1 * min([np.sqrt(np.sum(np.square(a.state.p_pos - adv.state.p_pos))) for a in agents])
124 | # if shape: # reward can optionally be shaped (decreased reward for increased distance from agents)
125 | # rew -= 0.1 * min([np.sqrt(np.sum(np.square(a.state.p_pos - agent.state.p_pos))) for a in agents])
126 |
127 | if agent.collide:
128 | for ag in agents:
129 | agent_reward = 0  # accumulator for the commented-out group-capture variant below; unused by the active reward
130 | for adv in adversaries:
131 | if self.is_collision(ag, adv):
132 | rew += 10
133 | # if self.is_collision(ag, agent):
134 | # rew += 10
135 | # a score is given only when the same agent is caught by 3 adversaries at the same time; each group is worth at most 30 points
136 | # if agent_reward < 30:
137 | # rew += 0
138 | # else:
139 | # rew += 30
140 | return rew
141 |
142 | def observation(self, agent, world):
143 | # get positions of all entities in this agent's reference frame
144 | entity_pos = []
145 | for entity in world.landmarks:
146 | if not entity.boundary:
147 | entity_pos.append(entity.state.p_pos - agent.state.p_pos)
148 | # communication of all other agents
149 | comm = []
150 | other_pos = []
151 | other_vel = []
152 | for other in world.agents:
153 | if other is agent: continue
154 | comm.append(other.state.c)
155 | other_pos.append(other.state.p_pos - agent.state.p_pos)
156 | if not other.adversary:
157 | other_vel.append(other.state.p_vel)
158 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel)
159 |
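A note on `observation()` above: the `comm` list is collected but never concatenated, and other agents' velocities are appended only for non-adversaries. The sketch below (illustrative, not part of the repository) spells out the resulting vector length, assuming `world.dim_p == 2` and the two non-boundary landmarks created in `make_world`:

```python
def obs_dim(num_adversaries, num_good, is_adversary):
    """Observation length implied by Scenario.observation() above."""
    n = num_adversaries + num_good
    dim = 2 + 2                # own velocity + own position
    dim += 2 * 2               # relative positions of the 2 landmarks
    dim += 2 * (n - 1)         # relative positions of every other agent
    # velocities are appended only for "good" (non-adversary) others
    dim += 2 * (num_good if is_adversary else num_good - 1)
    return dim


print(obs_dim(3, 1, True))    # 16 per predator with 3 predators and 1 prey
print(obs_dim(3, 1, False))   # 14 for the prey
```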
--------------------------------------------------------------------------------
/game/particle/multiagent/core.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | # physical/external base state of all entites
5 | class EntityState(object):
6 | def __init__(self):
7 | # physical position
8 | self.p_pos = None
9 | # physical velocity
10 | self.p_vel = None
11 |
12 |
13 | # state of agents (including communication and internal/mental state)
14 | class AgentState(EntityState):
15 | def __init__(self):
16 | super(AgentState, self).__init__()
17 | # communication utterance
18 | self.c = None
19 |
20 |
21 | # action of the agent
22 | class Action(object):
23 | def __init__(self):
24 | # physical action
25 | self.u = None
26 | # communication action
27 | self.c = None
28 |
29 |
30 | # properties and state of physical world entity
31 | class Entity(object):
32 | def __init__(self):
33 | # name
34 | self.name = ''
35 | # properties:
36 | self.size = 0.050
37 | # entity can move / be pushed
38 | self.movable = False
39 | # entity collides with others
40 | self.collide = True
41 | # material density (affects mass)
42 | self.density = 25.0
43 | # color
44 | self.color = None
45 | # max speed and accel
46 | self.max_speed = None
47 | self.accel = None
48 | # state
49 | self.state = EntityState()
50 | # mass
51 | self.initial_mass = 1.0
52 |
53 | @property
54 | def mass(self):
55 | return self.initial_mass
56 |
57 |
58 | # properties of landmark entities
59 | class Landmark(Entity):
60 | def __init__(self):
61 | super(Landmark, self).__init__()
62 |
63 |
64 | # properties of agent entities
65 | class Agent(Entity):
66 | def __init__(self):
67 | super(Agent, self).__init__()
68 | # agents are movable by default
69 | self.movable = True
70 | # cannot send communication signals
71 | self.silent = False
72 | # cannot observe the world
73 | self.blind = False
74 | # physical motor noise amount
75 | self.u_noise = None
76 | # communication noise amount
77 | self.c_noise = None
78 | # control range
79 | self.u_range = 1.0
80 | # state
81 | self.state = AgentState()
82 | # action
83 | self.action = Action()
84 | # script behavior to execute
85 | self.action_callback = None
86 |
87 |
88 | # multi-agent world
89 | class World(object):
90 | def __init__(self):
91 | # list of agents and entities (can change at execution-time!)
92 | self.agents = []
93 | self.landmarks = []
94 | # communication channel dimensionality
95 | self.dim_c = 0
96 | # position dimensionality
97 | self.dim_p = 2
98 | # color dimensionality
99 | self.dim_color = 3
100 | # simulation timestep
101 | self.dt = 0.1
102 | # physical damping
103 | self.damping = 0.25
104 | # contact response parameters
105 | self.contact_force = 1e+2
106 | self.contact_margin = 1e-3
107 |
108 | # return all entities in the world
109 | @property
110 | def entities(self):
111 | return self.agents + self.landmarks
112 |
113 | # return all agents controllable by external policies
114 | @property
115 | def policy_agents(self):
116 | return [agent for agent in self.agents if agent.action_callback is None]
117 |
118 | # return all agents controlled by world scripts
119 | @property
120 | def scripted_agents(self):
121 | return [agent for agent in self.agents if agent.action_callback is not None]
122 |
123 | # update state of the world
124 | def step(self, done=None):
125 | # set actions for scripted agents
126 | for agent in self.scripted_agents:
127 | agent.action = agent.action_callback(agent, self)
128 | # gather forces applied to entities
129 | p_force = [None] * len(self.entities)
130 | # apply agent physical controls
131 | p_force = self.apply_action_force(p_force, done)
132 | # apply environment forces
133 | p_force = self.apply_environment_force(p_force)
134 | # integrate physical state
135 | self.integrate_state(p_force, done)
136 | # update agent state
137 | for agent in self.agents:
138 | self.update_agent_state(agent)
139 |
140 | # gather agent action forces
141 | def apply_action_force(self, p_force, done=None):
142 | # set applied forces
143 | for i, agent in enumerate(self.agents):
144 | if agent.movable:
145 | noise = np.random.randn(*agent.action.u.shape) * agent.u_noise if agent.u_noise else 0.0
146 | p_force[i] = agent.action.u + noise
147 | return p_force
148 |
149 | # gather physical forces acting on entities
150 | def apply_environment_force(self, p_force):
151 | # simple (but inefficient) collision response
152 | for a, entity_a in enumerate(self.entities):
153 | for b,entity_b in enumerate(self.entities):
154 | if b <= a: continue
155 | [f_a, f_b] = self.get_collision_force(entity_a, entity_b)
156 | if f_a is not None:
157 | if p_force[a] is None: p_force[a] = 0.0
158 | p_force[a] = f_a + p_force[a]
159 | if f_b is not None:
160 | if p_force[b] is None: p_force[b] = 0.0
161 | p_force[b] = f_b + p_force[b]
162 | return p_force
163 |
164 | # integrate physical state
165 | def integrate_state(self, p_force, done=None):
166 | for i,entity in enumerate(self.entities):
167 | if not entity.movable: continue
168 | if entity.movable and done is not None and done[i]: continue
169 | entity.state.p_vel = entity.state.p_vel * (1 - self.damping)
170 | if p_force[i] is not None:
171 | entity.state.p_vel += (p_force[i] / entity.mass) * self.dt
172 | if entity.max_speed is not None:
173 | speed = np.sqrt(np.square(entity.state.p_vel[0]) + np.square(entity.state.p_vel[1]))
174 | if speed > entity.max_speed:
175 | entity.state.p_vel = entity.state.p_vel / np.sqrt(np.square(entity.state.p_vel[0]) +
176 | np.square(entity.state.p_vel[1])) * entity.max_speed
177 | entity.state.p_pos += entity.state.p_vel * self.dt
178 |
179 | def update_agent_state(self, agent):
180 | # set communication state (directly for now)
181 | if agent.silent:
182 | agent.state.c = np.zeros(self.dim_c)
183 | else:
184 | noise = np.random.randn(*agent.action.c.shape) * agent.c_noise if agent.c_noise else 0.0
185 | agent.state.c = agent.action.c + noise
186 |
187 | # get collision forces for any contact between two entities
188 | def get_collision_force(self, entity_a, entity_b):
189 | if (not entity_a.collide) or (not entity_b.collide):
190 | return [None, None] # not a collider
191 | if entity_a is entity_b:
192 | return [None, None] # don't collide against itself
193 | # compute actual distance between entities
194 | delta_pos = entity_a.state.p_pos - entity_b.state.p_pos
195 | dist = np.sqrt(np.sum(np.square(delta_pos)))
196 | # minimum allowable distance
197 | dist_min = entity_a.size + entity_b.size
198 | # softmax penetration
199 | k = self.contact_margin
200 | penetration = np.logaddexp(0, -(dist - dist_min)/k)*k
201 | if dist == 0:
202 | force = 0
203 | else:
204 | force = self.contact_force * delta_pos / dist * penetration
205 | force_a = +force if entity_a.movable else None
206 | force_b = -force if entity_b.movable else None
207 | return [force_a, force_b]
208 |
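The physics above (action forces, damping, semi-implicit Euler integration, soft collision response) can be exercised in isolation. The following smoke test is a sketch under the defaults shown in this file, not part of the repository:

```python
import numpy as np
from game.particle.multiagent.core import World, Agent

world = World()
world.agents = [Agent() for _ in range(2)]
for i, agent in enumerate(world.agents):
    agent.name = 'agent %d' % i
    agent.silent = True                                # skip the communication-state update
    agent.state.p_pos = np.array([-0.1 + 0.2 * i, 0.0])
    agent.state.p_vel = np.zeros(world.dim_p)
    agent.state.c = np.zeros(world.dim_c)
    agent.action.u = np.array([0.5 - 1.0 * i, 0.0])    # push the two agents toward each other

for _ in range(5):
    world.step()
    # once their separation drops below size_a + size_b, get_collision_force
    # applies the soft (logaddexp) contact force that pushes them apart again
    print([a.state.p_pos for a in world.agents])
```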
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | import yaml
4 | import gym.spaces
5 | import sys
6 | import tensorflow as tf
7 | from gym.utils import seeding
8 | import random
9 |
10 | from alg import REGISTRY as alg_REGISTRY
11 | from game import REGISTRY as env_REGISTRY
12 | from run import REGISTRY as run_REGISTRY
13 | from util.logger import Logger
14 | import json
15 | import time
16 |
17 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
18 |
19 |
20 | def default(str):
21 | return str + ' [Default: %default]'
22 |
23 |
24 | def config_args(config_name):
25 | if config_name is not None:
26 | with open(os.path.join(os.path.dirname(__file__), "config", "{}.yaml".format(config_name)), "r") as f:
27 | try:
28 | # yaml.load without an explicit Loader is deprecated in PyYAML >= 5.1
29 | config_dict = yaml.load(f, Loader=yaml.FullLoader)
30 | return config_dict
31 | except yaml.YAMLError as exc:
32 | assert False, "{}.yaml error: {}".format(config_name, exc)
33 |
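For illustration, `config_args` simply returns the parsed YAML dict. A hypothetical way to combine an algorithm config with a game config is shown below; how `main.py` actually merges them and applies command-line `key=value` overrides is not visible here and is an assumption:

```python
alg_conf = config_args('ppo_conf')       # parses config/ppo_conf.yaml
game_conf = config_args('pacman_conf')   # parses config/pacman_conf.yaml
args = {**alg_conf, **game_conf}         # assumed: later key=value overrides update this dict
```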
34 |
35 | def readCommand(argv):
36 | """
37 | Processes the command used to run main from the command line.
38 | """
39 | from optparse import OptionParser
40 | usageStr = """
41 | USAGE: python main.py