├── hacktrick_rl ├── hacktrick_rl │ ├── ppo │ │ ├── __init__.py │ │ ├── .gitignore │ │ ├── ppo_rllib.py │ │ └── ppo_rllib_client.py │ ├── rllib │ │ ├── __init__.py │ │ ├── utils.py │ │ ├── tests.py │ │ └── rllib.py │ └── utils.py └── setup.py ├── .gitmodules ├── install.sh ├── .gitignore ├── hacktrick_agent.py ├── client.py ├── hackathon_tutorial.ipynb └── README.md /hacktrick_rl/hacktrick_rl/ppo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hacktrick_rl/hacktrick_rl/rllib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hacktrick_rl/hacktrick_rl/ppo/.gitignore: -------------------------------------------------------------------------------- 1 | hpsearch.py -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "hacktrick_ai"] 2 | path = hacktrick_ai 3 | url = https://github.com/hacktrick-hackathon/hacktrick_ai 4 | branch = master 5 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | cd hacktrick_ai 3 | pip install -e . 4 | cd ../hacktrick_rl 5 | pip install -e . 6 | 7 | cd ./hacktrick_rl 8 | [ ! -f data_dir.py ] && echo "import os; DATA_DIR = os.path.abspath('.')" >> data_dir.py 9 | 10 | pip install protobuf 11 | pip install python-socketio[asyncio_client]==4.6.0 12 | pip install python-engineio==3.13.0 -------------------------------------------------------------------------------- /hacktrick_rl/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup(name='hacktrick_rl', 6 | version='0.0.1', 7 | description='This package has shared components.', 8 | packages=find_packages(), 9 | install_requires=[ 10 | 'GitPython', 11 | 'memory_profiler', 12 | 'sacred', 13 | 'pymongo', 14 | 'dill', 15 | 'matplotlib', 16 | 'requests', 17 | 'pygame', 18 | 'numpy', 19 | 'seaborn==0.9.0', 20 | 'ray[rllib]==0.8.5' 21 | ], 22 | ) 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.swp 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv*/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | # Other 110 | .DS_Store 111 | *.key 112 | *.png 113 | 114 | # Models and run data 115 | .temp_best_model 116 | checkpoint/ 117 | data/ppo_runs/ 118 | data/ftw_runs/ 119 | data/pbt_runs/ 120 | data/agent_runs/ 121 | data/bc_runs/ 122 | data/chosen_layouts/ 123 | data/expert_agent/ 124 | data/ftw_exp/ 125 | data/pbt_exp/ 126 | data/gail_runs/ 127 | data/joint_ppo_runs/ 128 | data/ppo_exp/ 129 | 130 | # Other files 131 | transfer_agent.sh 132 | 133 | # sacred config files 134 | **/slack.json 135 | 136 | # VSCode metadata 137 | **/.vscode 138 | 139 | # Data directories 140 | **/data_dir.py 141 | 142 | # PyCharm 143 | .idea/ 144 | -------------------------------------------------------------------------------- /hacktrick_rl/hacktrick_rl/ppo/ppo_rllib.py: -------------------------------------------------------------------------------- 1 | from ray.rllib.models.tf.tf_modelv2 import TFModelV2 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | 6 | 7 | 8 | class RllibPPOModel(TFModelV2): 9 | """ 10 | Model that will map environment states to action probabilities. 
Will be shared across agents 11 | """ 12 | def __init__(self, obs_space, action_space, num_outputs, model_config, name, **kwargs): 13 | 14 | super(RllibPPOModel, self).__init__(obs_space, action_space, num_outputs, model_config, name) 15 | 16 | # params we got to pass in from the call to "run" 17 | custom_params = model_config["custom_options"] 18 | 19 | 20 | ## Parse custom network params 21 | num_hidden_layers = custom_params["NUM_HIDDEN_LAYERS"] 22 | size_hidden_layers = custom_params["SIZE_HIDDEN_LAYERS"] 23 | num_filters = custom_params["NUM_FILTERS"] 24 | num_convs = custom_params["NUM_CONV_LAYERS"] 25 | d2rl = custom_params["D2RL"] 26 | assert type(d2rl) == bool 27 | 28 | ## Model inputs 29 | # Your input is a tensor the size of the grid with each channel representing a diffetent item as stated in the documentation 30 | # For example, in the channel representing a solar cell you will have an array (h x w) with 1 if a solar cell exists in this location and 0 otherwise 31 | self.inputs = tf.keras.Input(shape=obs_space.shape, name="observations") 32 | out = self.inputs 33 | 34 | # Implement your model architicture here using the given parameters if needed 35 | 36 | # This is just a dummpy layer so that the model works out of the box 37 | # It uses normal tf functional API and you can do the same 38 | out = tf.keras.layers.Flatten()(out) 39 | 40 | ## Model ouptus 41 | # Linear last layer for action distribution logits 42 | layer_out = tf.keras.layers.Dense(self.num_outputs)(out) 43 | # Linear last layer for value function branch of model 44 | value_out = tf.keras.layers.Dense(1)(out) 45 | 46 | self.base_model = tf.keras.Model(self.inputs, [layer_out, value_out]) 47 | self.register_variables(self.base_model.variables) 48 | 49 | 50 | def forward(self, input_dict, state=None, seq_lens=None): 51 | model_out, self._value_out = self.base_model(input_dict["obs"]) 52 | return model_out, state 53 | 54 | def value_function(self): 55 | return tf.reshape(self._value_out, [-1]) -------------------------------------------------------------------------------- /hacktrick_rl/hacktrick_rl/rllib/utils.py: -------------------------------------------------------------------------------- 1 | from hacktrick_ai_py.agents.benchmarking import AgentEvaluator 2 | import numpy as np 3 | import inspect 4 | 5 | def softmax(logits): 6 | e_x = np.exp(logits.T - np.max(logits)) 7 | return (e_x / np.sum(e_x, axis=0)).T 8 | 9 | def get_base_env(mdp_params, env_params, outer_shape=None, mdp_params_schedule_fn=None): 10 | ae = get_base_ae(mdp_params, env_params, outer_shape, mdp_params_schedule_fn) 11 | return ae.env 12 | 13 | def get_base_mlam(mdp_params, env_params, outer_shape=None, mdp_params_schedule_fn=None): 14 | ae = get_base_ae(mdp_params, env_params, outer_shape, mdp_params_schedule_fn) 15 | return ae.mlam 16 | 17 | def get_base_ae(mdp_params, env_params, outer_shape=None, mdp_params_schedule_fn=None): 18 | """ 19 | mdp_params: one set of fixed mdp parameter used by the enviroment 20 | env_params: env parameters (horizon, etc) 21 | outer_shape: outer shape of the environment 22 | mdp_params_schedule_fn: the schedule for varying mdp params 23 | 24 | return: the base agent evaluator 25 | """ 26 | assert mdp_params == None or mdp_params_schedule_fn == None, "either of the two has to be null" 27 | if type(mdp_params) == dict and "layout_name" in mdp_params: 28 | ae = AgentEvaluator.from_layout_name(mdp_params=mdp_params, env_params=env_params) 29 | elif 'num_mdp' in env_params: 30 | if np.isinf(env_params['num_mdp']): 
31 | ae = AgentEvaluator.from_mdp_params_infinite(mdp_params=mdp_params, env_params=env_params, 32 | outer_shape=outer_shape, mdp_params_schedule_fn=mdp_params_schedule_fn) 33 | else: 34 | ae = AgentEvaluator.from_mdp_params_finite(mdp_params=mdp_params, env_params=env_params, 35 | outer_shape=outer_shape, mdp_params_schedule_fn=mdp_params_schedule_fn) 36 | else: 37 | # should not reach this case 38 | raise NotImplementedError() 39 | return ae 40 | 41 | # Returns the required arguments as inspect.Parameter objects in a list 42 | def get_required_arguments(fn): 43 | required = [] 44 | params = inspect.signature(fn).parameters.values() 45 | for param in params: 46 | if param.default == inspect.Parameter.empty and param.kind == param.POSITIONAL_OR_KEYWORD: 47 | required.append(param) 48 | return required 49 | 50 | def iterable_equal(a, b): 51 | if hasattr(a, '__iter__') != hasattr(b, '__iter__'): 52 | return False 53 | if not hasattr(a, '__iter__'): 54 | return a == b 55 | 56 | if len(a) != len(b): 57 | return False 58 | 59 | for elem_a, elem_b in zip(a, b): 60 | if not iterable_equal(elem_a, elem_b): 61 | return False 62 | 63 | return True -------------------------------------------------------------------------------- /hacktrick_agent.py: -------------------------------------------------------------------------------- 1 | from hacktrick_ai_py.agents.agent import Agent, AgentPair 2 | from hacktrick_ai_py.mdp.hacktrick_mdp import HacktrickState, Recipe 3 | from hacktrick_ai_py.mdp.actions import Action 4 | from hacktrick_rl.rllib.rllib import RlLibAgent, load_agent_pair 5 | 6 | 7 | class MainAgent(Agent): 8 | 9 | def __init__(self): 10 | super().__init__() 11 | 12 | def action(self, state): 13 | # Implement your logic here 14 | # You should change your action value to a compatible Action value from the Action class in Hacktric_ai 15 | # You do not need to implement the action_probs but it is basically the probability distribution of actions 16 | action, action_probs = Action.STAY, {} 17 | return action, action_probs 18 | 19 | 20 | class OptionalAgent(Agent): 21 | 22 | def __init__(self): 23 | super().__init__() 24 | 25 | def action(self, state): 26 | # Implement your logic here 27 | action, action_probs = Action.STAY, {} 28 | return action, action_probs 29 | 30 | 31 | class HacktrickAgent(object): 32 | # Enable this flag if you are using reinforcement learning from the included ppo ray support library 33 | RL = False 34 | # Rplace with the directory for the trained agent 35 | # Note that `agent_dir` is the full path to the checkpoint FILE, not the checkpoint directory 36 | agent_dir = '' 37 | # If you do not plan to use the same agent logic for both agents and use the OptionalAgent set it to False 38 | # Does not matter if you are using RL as this is controlled by the RL agent 39 | share_agent_logic = True 40 | 41 | def __init__(self): 42 | Recipe.configure({}) 43 | 44 | if self.RL: 45 | agent_pair = load_agent_pair(self.agent_dir) 46 | self.agent0 = agent_pair.a0 47 | self.agent1 = agent_pair.a1 48 | else: 49 | self.agent0 = MainAgent() 50 | self.agent1 = OptionalAgent() 51 | 52 | def set_mode(self, mode): 53 | self.mode = mode 54 | 55 | if "collaborative" in self.mode: 56 | if self.share_agent_logic and not self.RL: 57 | self.agent1 = MainAgent() 58 | self.agent_pair = AgentPair(self.agent0, self.agent1) 59 | else: 60 | self.agent1 =None 61 | self.agent_pair =None 62 | 63 | def map_action(self, action): 64 | action_map = {(0, 0): 'STAY', (0, -1): 'UP', (0, 1): 'DOWN', (1, 0): 'RIGHT', (-1, 
0): 'LEFT', 'interact': 'SPACE'} 65 | action_str = action_map[action[0]] 66 | return action_str 67 | 68 | def action(self, state_dict): 69 | state = HacktrickState.from_dict(state_dict['state']['state']) 70 | 71 | if "collaborative" in self.mode: 72 | (action0, action1) = self.agent_pair.joint_action(state) 73 | action0 = self.map_action(action0) 74 | action1 = self.map_action(action1) 75 | action = [action0, action1] 76 | else: 77 | action0 = self.agent0.action(state) 78 | action0 = self.map_action(action0) 79 | action = action0 80 | 81 | return action -------------------------------------------------------------------------------- /client.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import socketio 4 | import random 5 | import signal 6 | import sys 7 | from hacktrick_agent import HacktrickAgent 8 | 9 | 10 | sio = socketio.AsyncClient() 11 | 12 | settings = {} 13 | agent = HacktrickAgent() 14 | 15 | @sio.event 16 | async def connect(): 17 | print('connection established') 18 | 19 | 20 | @sio.event 21 | async def start_game(data): 22 | print('start_game received with ', data) 23 | # await sio.emit('my response', {'response': 'my response'}) 24 | 25 | @sio.event 26 | async def end_game(data): 27 | print('end_game received with ', data) 28 | # await sio.emit('my response', {'response': 'my response'}) 29 | 30 | @sio.event 31 | async def state_pong(data): 32 | action = agent.action(data) 33 | 34 | if "collaborative" in settings['mode']: 35 | print("actions", action) 36 | await sio.emit('action_collaborative', {'actions':action,'team_name': settings['team_name']}) 37 | 38 | else: 39 | print("action", action) 40 | await sio.emit('action', {'action': action}) 41 | 42 | score = data['state']['score'] 43 | state = data['state']['state'] 44 | print("score:", score) 45 | 46 | 47 | @sio.event 48 | async def end_game(data): 49 | print('end_game received with ', data) 50 | # await sio.emit('my response', {'response': 'my response'}) 51 | await sio.disconnect() 52 | 53 | @sio.event 54 | async def waiting(data): 55 | print('waiting received with ', data) 56 | 57 | @sio.event 58 | async def creation_failed(data): 59 | print('Failed to create game') 60 | print('Received the following error', data['error']) 61 | 62 | @sio.event 63 | async def reset_game(data): 64 | print('creation_failed received with ', data) 65 | 66 | @sio.event 67 | async def disconnect(): 68 | print('disconnected from server') 69 | 70 | @sio.event 71 | async def authentication_error(data): 72 | print('authentication_error received') 73 | 74 | 75 | async def main(): 76 | await sio.connect('http://ec2-3-14-245-107.us-east-2.compute.amazonaws.com/') ## Change here to aws url 77 | await sio.emit('create', {'mode': settings['mode'],'team_name': settings['team_name'], 'password':settings['password'], 'layout':settings['layout']}) 78 | await sio.wait() 79 | 80 | 81 | async def signal_handler(signal, frame): 82 | print ('You pressed Ctrl+C - or killed me with -2') 83 | #.... Put your logic here ..... 
84 | await sio.disconnect() 85 | sys.exit(0) 86 | 87 | if __name__ == '__main__': 88 | modes = ["single" ,"collaborative"] 89 | layouts = [ 90 | "leaderboard_single", 91 | "leaderboard_collaborative", 92 | "round_of_16_single", 93 | "round_of_16_collaborative", 94 | "quarter_final_single", 95 | "quarter_final_collaborative", 96 | "semi_final_single", 97 | "semi_final_collaborative", 98 | "final_single", 99 | "final_collaborative" 100 | ] 101 | parser = argparse.ArgumentParser() 102 | parser.add_argument('--team_name', type=str, required=True) 103 | parser.add_argument('--password', type=str, required=True) 104 | parser.add_argument('--mode', type=str, required=True) 105 | parser.add_argument('--layout', type=str, required=True) 106 | args = parser.parse_args() 107 | 108 | if args.mode not in modes or \ 109 | args.layout not in layouts: 110 | print("invalid parameters have been entered. Please ensure mode and layout are correct") 111 | sys.exit(0) 112 | settings['team_name'] = args.team_name 113 | settings['password'] = args.password 114 | settings['mode'] = args.mode 115 | settings['layout'] = args.layout 116 | 117 | print(settings) 118 | # signal.signal(signal.SIGINT, signal_handler) 119 | agent.set_mode(settings['mode']) 120 | asyncio.run(main()) 121 | # asyncio.get_event_loop().run_until_complete(main(args.host, args.team_name, args.password)) -------------------------------------------------------------------------------- /hackathon_tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "DCLyD1xhb9X2" 7 | }, 8 | "source": [ 9 | "# How to test and visualise your agents.\n", 10 | "\n", 11 | "---\n", 12 | "\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": { 18 | "id": "XCClqutxV1Xq" 19 | }, 20 | "source": [ 21 | "## Imports\n", 22 | "\n", 23 | "\n", 24 | "\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": { 31 | "id": "jKsFs6UfDWJG" 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "from hacktrick_ai.src.hacktrick_ai_py.agents.benchmarking import AgentEvaluator, LayoutGenerator\n", 36 | "from hacktrick_ai.src.hacktrick_ai_py.visualization.state_visualizer import StateVisualizer\n", 37 | "from hacktrick_ai_py.agents.agent import AgentPair, StayAgent\n", 38 | "from hacktrick_agent import HacktrickAgent" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "id": "IM5MAxKLWEDa" 45 | }, 46 | "source": [ 47 | "## How to Run\n", 48 | "\n", 49 | "1. Set game mode to either single or collaborative.\n", 50 | "2. Set timesteps (We will be evaluating on 1200 timesteps).\n", 51 | "3. Set layout name.\n", 52 | "4. Create a HacktrickAgent instance, it will automatically include your algorith or RL agent if used from the `hacktrick_agent.py` file.\n", 53 | "5. Call run_agent() and pass the required parameters.\n", 54 | "6. run_agent() will return the trajectories of the played game.\n", 55 | "7. 
Call visualize() and pass the trajectories returned from running the agent to graphically view the game.\n", 56 | "\n", 57 | "(basically just run the notebook ;))" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 2, 63 | "metadata": { 64 | "id": "6bJJmpl_EsZU" 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "def run_agent(mode, timesteps, layout_name, hacktrick_agent):\n", 69 | " hacktrick_agent.set_mode(mode)\n", 70 | " if mode == 'collaborative':\n", 71 | " agent0 = hacktrick_agent.agent0\n", 72 | " agent1 = hacktrick_agent.agent1\n", 73 | " agent = AgentPair(agent0, agent1)\n", 74 | " elif mode == 'single':\n", 75 | " agent0 = hacktrick_agent.agent0\n", 76 | " agent1 = StayAgent()\n", 77 | " agent = AgentPair(agent0, agent1)\n", 78 | " mdp_gen_params = {\"layout_name\": layout_name}\n", 79 | " mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(mdp_gen_params)\n", 80 | " env_params = {\"horizon\": timesteps}\n", 81 | " agent_eval = AgentEvaluator(env_params=env_params, mdp_fn=mdp_fn)\n", 82 | " trajectories = agent_eval.evaluate_agent_pair(agent, num_games=1)\n", 83 | " return trajectories" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "id": "WGYCS4fsQgk4" 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "# Parameters to be changed\n", 95 | "\n", 96 | "mode = 'single'\n", 97 | "timesteps = 200\n", 98 | "layout_name = 'leaderboard_single'\n", 99 | "agent = HacktrickAgent()\n", 100 | "trajectories = run_agent(mode, timesteps, layout_name, agent)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 7, 106 | "metadata": { 107 | "id": "CfmIXTEYJc_M" 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "def visualize(trajectories):\n", 112 | " img_dir_path = StateVisualizer().display_rendered_trajectory(trajectories, trajectory_idx=0, ipython_display=True)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "id": "UFaj9yXPVfuN" 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "visualize(trajectories)" 124 | ] 125 | } 126 | ], 127 | "metadata": { 128 | "colab": { 129 | "collapsed_sections": [], 130 | "name": "hackathon-tutorial.ipynb", 131 | "provenance": [] 132 | }, 133 | "kernelspec": { 134 | "display_name": "Python 3", 135 | "name": "python3" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.8.10" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 0 152 | } 153 | -------------------------------------------------------------------------------- /hacktrick_rl/hacktrick_rl/rllib/tests.py: -------------------------------------------------------------------------------- 1 | from hacktrick_rl.rllib.rllib import HacktrickMultiAgent 2 | from hacktrick_rl.rllib.utils import softmax, get_required_arguments, iterable_equal 3 | from math import isclose 4 | import unittest, copy 5 | import numpy as np 6 | 7 | class RllibEnvTest(unittest.TestCase): 8 | 9 | def setUp(self): 10 | self.params = copy.deepcopy(HacktrickMultiAgent.DEFAULT_CONFIG) 11 | self.timesteps = [0, 10, 100, 500, 1000, 1500, 2000, 2500] 12 | 13 | def tearDown(self): 14 | pass 15 | 16 | def _assert_lists_almost_equal(self, first, second, places=7): 17 | for a, b in zip(first, second): 18 | self.assertAlmostEqual(a, b, 
places=places) 19 | 20 | def _test_bc_schedule(self, bc_schedule, expected_bc_factors): 21 | self.params['multi_agent_params']['bc_schedule'] = bc_schedule 22 | env = HacktrickMultiAgent.from_config(self.params) 23 | actual_bc_factors = [] 24 | 25 | for t in self.timesteps: 26 | env.anneal_bc_factor(t) 27 | actual_bc_factors.append(env.bc_factor) 28 | 29 | self._assert_lists_almost_equal(expected_bc_factors, actual_bc_factors) 30 | 31 | def _test_bc_creation_proportion(self, env, factor, trials=10000): 32 | env.bc_factor = factor 33 | tot_bc = 0 34 | for _ in range(trials): 35 | env.reset(regen_mdp=False) 36 | num_bc = sum(map(lambda agent : int(agent.startswith('bc')), env.curr_agents)) 37 | self.assertLessEqual(num_bc, 1) 38 | tot_bc += num_bc 39 | actual_factor = tot_bc / trials 40 | self.assertAlmostEqual(actual_factor, factor, places=1) 41 | 42 | 43 | def test_env_creation(self): 44 | # Valid creation 45 | env = HacktrickMultiAgent.from_config(self.params) 46 | for param, expected in self.params['multi_agent_params'].items(): 47 | self.assertEqual(expected, getattr(env, param)) 48 | 49 | # Invalid bc_schedules 50 | invalid_schedules = [[(-1, 0.0), (1.0, 1e5)], [(0.0, 0.0), (10, 1), (5, 0.5)], [(0, 0), (5, 1), (10, 1.5)]] 51 | for sched in invalid_schedules: 52 | self.params['multi_agent_params']['bc_schedule'] = sched 53 | self.assertRaises(AssertionError, HacktrickMultiAgent.from_config, self.params) 54 | 55 | def test_reward_shaping_annealing(self): 56 | self.params['multi_agent_params']['reward_shaping_factor'] = 1 57 | self.params['multi_agent_params']['reward_shaping_horizon'] = 1e3 58 | 59 | expected_rew_factors = [1, 990/1e3, 900/1e3, 500/1e3, 0.0, 0.0, 0.0, 0.0] 60 | actual_rew_factors = [] 61 | 62 | env = HacktrickMultiAgent.from_config(self.params) 63 | 64 | for t in self.timesteps: 65 | env.anneal_reward_shaping_factor(t) 66 | actual_rew_factors.append(env.reward_shaping_factor) 67 | 68 | self._assert_lists_almost_equal(expected_rew_factors, actual_rew_factors) 69 | 70 | def test_bc_annealing(self): 71 | # Test no annealing 72 | self._test_bc_schedule(HacktrickMultiAgent.self_play_bc_schedule, [0.0]*len(self.timesteps)) 73 | 74 | # Test annealing 75 | anneal_bc_schedule = [(0, 0.0), (1e3, 1.0), (2e3, 0.0)] 76 | expected_bc_factors = [0.0, 10/1e3, 100/1e3, 500/1e3, 1.0, 500/1e3, 0.0, 0.0] 77 | self._test_bc_schedule(anneal_bc_schedule, expected_bc_factors) 78 | 79 | def test_agent_creation(self): 80 | env = HacktrickMultiAgent.from_config(self.params) 81 | obs = env.reset() 82 | 83 | # Check that we have the right number of agents with valid names 84 | self.assertEqual(len(env.curr_agents), 2) 85 | self.assertListEqual(list(obs.keys()), env.curr_agents) 86 | 87 | # Ensure that bc agents are created 'factor' percentage of the time 88 | bc_factors = [0.0, 0.1, 0.5, 0.9, 1.0] 89 | for factor in bc_factors: 90 | self._test_bc_creation_proportion(env, factor) 91 | 92 | 93 | class RllibUtilsTest(unittest.TestCase): 94 | 95 | def setUp(self): 96 | pass 97 | 98 | def tearDown(self): 99 | pass 100 | 101 | def test_softmax(self): 102 | logits = np.array([[0.1, 0.1, 0.1], 103 | [-0.1, 0.0, 0.1], 104 | [0.5, -1.2, 3.2], 105 | [-1.6, -2.0, -1.5]]) 106 | expected = np.array([[0.33333333, 0.33333333, 0.33333333], 107 | [0.30060961, 0.33222499, 0.3671654 ], 108 | [0.06225714, 0.01137335, 0.92636951], 109 | [0.36029662, 0.24151404, 0.39818934]]) 110 | 111 | actual = softmax(logits) 112 | 113 | self.assertTrue(np.allclose(expected, actual)) 114 | 115 | def test_iterable_equal(self): 116 | a = 
[(1,), (1, 2)] 117 | b = ([1], [1, 2]) 118 | 119 | self.assertTrue(iterable_equal(a, b)) 120 | 121 | a = [(1, 2), (1)] 122 | b = [(1,), (1, 2)] 123 | 124 | self.assertFalse(iterable_equal(a, b)) 125 | 126 | def test_get_required_arguments(self): 127 | 128 | def foo1(a): 129 | pass 130 | def foo2(a, b): 131 | pass 132 | def foo3(a, b, c): 133 | pass 134 | def foo4(a, b, c='bar'): 135 | pass 136 | def foo5(a, b='bar', d='baz', **kwargs): 137 | pass 138 | 139 | fns = [foo1, foo2, foo3, foo4, foo5] 140 | expected = [1, 2, 3, 2, 1] 141 | 142 | for fn, expected in zip(fns, expected): 143 | self.assertEqual(expected, len(get_required_arguments(fn))) 144 | 145 | 146 | 147 | if __name__ == '__main__': 148 | unittest.main() -------------------------------------------------------------------------------- /hacktrick_rl/hacktrick_rl/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import git 4 | import shutil 5 | import random 6 | import itertools 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | 11 | def delete_dir_if_exists(dir_path, verbose=False): 12 | if os.path.exists(dir_path): 13 | if verbose: 14 | print("Deleting old dir", dir_path) 15 | shutil.rmtree(dir_path) 16 | 17 | def create_dir_if_not_exists(dir_path): 18 | if not os.path.exists(dir_path): 19 | os.makedirs(dir_path) 20 | 21 | def reset_tf(): 22 | """Clean up tensorflow graph and session. 23 | NOTE: this also resets the tensorflow seed""" 24 | tf.reset_default_graph() 25 | if tf.get_default_session() is not None: 26 | tf.get_default_session().close() 27 | 28 | def num_tf_params(): 29 | """Prints number of trainable parameters defined""" 30 | total_parameters = 0 31 | for variable in tf.trainable_variables(): 32 | # shape is an array of tf.Dimension 33 | shape = variable.get_shape() 34 | variable_parameters = 1 35 | for dim in shape: 36 | variable_parameters *= dim.value 37 | total_parameters += variable_parameters 38 | print(total_parameters) 39 | 40 | def get_current_commit_hash(): 41 | repo = git.Repo(search_parent_directories=True) 42 | return repo.head.object.hexsha 43 | 44 | def get_trailing_number(s): 45 | """ 46 | Get the trailing number from a string, 47 | i.e. 'file123' -> '123' 48 | """ 49 | m = re.search(r'\d+$', s) 50 | return int(m.group()) if m else None 51 | 52 | def get_max_iter(agent_folder): 53 | """Return biggest PBT iteration that has been run""" 54 | saved_iters = [] 55 | for folder_s in os.listdir(agent_folder): 56 | folder_iter = get_trailing_number(folder_s) 57 | if folder_iter is not None: 58 | saved_iters.append(folder_iter) 59 | if len(saved_iters) == 0: 60 | raise ValueError("Agent folder {} seemed to not have any pbt_iter subfolders".format(agent_folder)) 61 | return max(saved_iters) 62 | 63 | def cross_entropy(action_probs, y, eps=1e-4): 64 | """ 65 | X is the output from fully connected layer (num_examples x num_classes) 66 | y is labels (num_examples x 1) 67 | Note that y is not one-hot encoded vector. 68 | It can be computed as y.argmax(axis=1) from one-hot encoded vectors of labels if required. 69 | """ 70 | m = y.shape[0] 71 | # We use multidimensional array indexing to extract 72 | # softmax probability of the correct label for each sample. 73 | probs_for_correct = action_probs[range(m), y] 74 | 75 | # NOTE: eps was added to correct for some actions being deterministically removed from 76 | # the human model when it would get stuck. 
It was chosen empirically as to be about an order of 77 | # magnitude less than the smallest probability assigned to any event by the model 78 | probs_for_correct = np.array([p if p > eps else eps for p in probs_for_correct]).astype(float) 79 | 80 | log_likelihood = -np.log(probs_for_correct) 81 | cross_entropy_loss = np.sum(log_likelihood) / m 82 | return cross_entropy_loss 83 | 84 | def accuracy(action_probs, y): 85 | return np.sum(np.argmax(action_probs, axis=1) == y) / len(y) 86 | 87 | def set_global_seed(seed): 88 | random.seed(seed) 89 | np.random.seed(seed) 90 | tf.random.set_seed(seed) 91 | 92 | def prepare_nested_default_dict_for_pickle(nested_defaultdict): 93 | """Need to make all nested defaultdicts into normal dicts to pickle""" 94 | for k,v in nested_defaultdict.items(): 95 | nested_defaultdict[k] = dict(v) 96 | pickleable_dict = dict(nested_defaultdict) 97 | return pickleable_dict 98 | 99 | def set_style(font_scale=1.6): 100 | import seaborn, matplotlib 101 | seaborn.set(font='serif', font_scale=font_scale) 102 | # Make the background white, and specify the specific font family 103 | seaborn.set_style("white", { 104 | "font.family": "serif", 105 | "font.weight": "normal", 106 | "font.serif": ["Times", "Palatino", "serif"], 107 | 'axes.facecolor': 'white', 108 | 'lines.markeredgewidth': 1}) 109 | matplotlib.rcParams['text.usetex'] = True 110 | matplotlib.rc('font',family='serif', serif=['Palatino']) 111 | 112 | def common_keys_equal(dict_a, dict_b): 113 | common_keys = set(dict_a.keys()).intersection(set(dict_b.keys())) 114 | for k in common_keys: 115 | if dict_a[k] != dict_b[k]: return False 116 | return True 117 | 118 | class Node(object): 119 | def __init__(self, agent_name, params, parent=None): 120 | self.agent_name = agent_name 121 | self.params = params 122 | self.parent = parent 123 | 124 | def get_flattened_keys(dictionary): 125 | if type(dictionary) != dict: 126 | return [] 127 | return list(dictionary.keys()) + list(itertools.chain(*[get_flattened_keys(dictionary[key]) for key in dictionary])) 128 | 129 | def recursive_dict_update(map, key, value): 130 | if type(map) != dict: 131 | return False 132 | if key in map: 133 | map[key] = value 134 | return True 135 | return any([recursive_dict_update(child, key, value) for child in map.values()]) 136 | 137 | def equal_dicts(d1, d2, ignore_keys): 138 | ignored = set(ignore_keys) 139 | for k1, v1 in d1.items(): 140 | if k1 not in ignored and (k1 not in d2 or d2[k1] != v1): 141 | if k1 not in d2: 142 | print("d2 missing", k1) 143 | else: 144 | if k1 == "objects": 145 | print("object difference") 146 | for o1 in d1[k1]: 147 | print(o1) 148 | print("----") 149 | for o2 in d2[k1]: 150 | print(o2) 151 | else: 152 | print("different at ", k1, "one is ", d2[k1], "one is ", v1) 153 | return False 154 | for k2, v2 in d2.items(): 155 | if k2 not in ignored and k2 not in d1: 156 | print("d1 missing", k2) 157 | return False 158 | return True 159 | 160 | def get_dict_stats(d): 161 | new_d = d.copy() 162 | for k, v in d.items(): 163 | new_d[k] = { 164 | 'mean': np.mean(v), 165 | 'standard_error': np.std(v) / np.sqrt(len(v)), 166 | 'max': np.max(v), 167 | 'n': len(v) 168 | } 169 | return new_d -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hacktrick 2022 2 | Welcome to Hacktrick! 
3 | In this hackathon, you will be required to implement agents that navigate through different layouts with 4 | lab components scattered around the layout. 5 | Your agents should be able to build four different types of labs, with each lab having different 6 | requirements and specifications. We will be evaluating your agents based on the number of labs they 7 | build in the allotted time. More in-depth technical details are provided in the following sections. 8 | There will be two different types of agents and gameplay: 9 | 1. Single Mode: Only one agent collecting the components and building the labs. 10 | 2. Collaborative Mode: Two agents working together in the same layout to build the required labs. 11 | 12 | Finally, it is worth noting that there are no constraints on how you implement these agents. We will be 13 | providing you with tips on how to implement a reinforcement learning agent in this environment, but by 14 | no means do we require you to submit an RL-based solution. You are free to implement your solutions 15 | using any method you see fitting (Ex: rule-based agent). 16 | 17 | We will be evaluating on **1200 timesteps**. 18 | 19 | # Contents 20 | - [Hacktrick 2022](#hacktrick-2022) 21 | - [Contents](#contents) 22 | - [Installation](#installation) 23 | - [Python Environment Setup](#python-environment-setup) 24 | - [Reinforcement Learning Setup](#reinforcement-learning-setup) 25 | - [PPO Tests](#ppo-tests) 26 | - [Rllib Tests](#rllib-tests) 27 | - [Repo Structure Overview](#repo-structure-overview) 28 | - [Implementation](#implementation) 29 | - [Agents](#agents) 30 | - [Visualizing Locally](#visualizing-locally) 31 | - [Submission](#submission) 32 | - [Reinforcement Learning Modules Usage](#reinforcement-learning-modules-usage) 33 | 34 | 35 | # Installation 36 | When cloning the repository, make sure you also clone the submodules 37 | ``` 38 | $ git clone --recursive https://github.com/hacktrick-hackathon/hacktrick-hackathon-2022.git 39 | ``` 40 | 41 | ## Python Environment Setup 42 | Create a new python environment (this is optional) using any environment manager you want (we will use venv) and run the install script as before 43 | ```bash 44 | $ python -m venv venv 45 | $ source venv/bin/activate 46 | (venv) $ ./install.sh 47 | ``` 48 | 49 | ## Reinforcement Learning Setup 50 | Install the latest stable version of tensorflow (if you don't have it) compatible with rllib. 51 | Make sure to train using a gpu or use google colab. If you are not planning to use reinforcement learning or other machine learning methods, you do not need this. 52 | ```bash 53 | (venv) $ pip install tensorflow 54 | ``` 55 | 56 | Your virtual environment should now be configured to run the rllib training code. Verify it by running the following command 57 | ```bash 58 | (venv) $ python -c "from ray import rllib" 59 | ``` 60 | Note: if you ever get an import error, please first check if you activated the venv 61 | 62 | ### PPO Tests 63 | ```bash 64 | (venv) $ cd hacktrick_rl/ppo 65 | (venv) hacktrick_rl/ppo $ python ppo_rllib_test.py 66 | ``` 67 | 68 | ### Rllib Tests 69 | Tests rllib environments and models, as well as various utility functions. Does not actually test rllib training 70 | ```bash 71 | (venv) $ cd rllib 72 | (venv) rllib $ python tests.py 73 | ``` 74 | You should see all tests passing. 75 | 76 | 77 | # Repo Structure Overview 78 | `hacktrick_rl` 79 | - `ppo/`: 80 | - `ppo_rllib.py`: Primary module where code for training a PPO agent resides. 
This is where you will implement your model architecture for a PPO agent 81 | - `ppo_rllib_client.py` Driver code for configuring and launching the training of an agent. More details about usage below 82 | - `ppo_rllib_test.py` Reproducibility tests for local sanity checks 83 | - `rllib/`: 84 | - `rllib.py`: rllib agent and training utils that utilize Hacktrick APIs 85 | - `utils.py`: utils for the above 86 | - `tests.py`: preliminary tests for the above 87 | - `utils.py`: utils for the repo 88 | 89 | `hacktrick_ai` 90 | - `mdp/`: 91 | - `hacktrick_mdp.py`: main Hacktrick game logic 92 | - `hacktrick_env.py`: environment classes built on top of the Hacktrick MDP 93 | - `layout_generator.py`: functions to generate random layouts programmatically 94 | 95 | - `agents/`: 96 | - `agent.py`: location of agent classes 97 | - `benchmarking.py`: sample trajectories of agents (both trained and planners) and load various models 98 | 99 | - `planning/`: 100 | - This directory contains some logic that might help you in implementing a rule-based agent. 101 | - You are free to disregard this directory and implement your own functions. 102 | - If you find any functions that make your implementation easier, or even as a guide/starter, feel free to use them. 103 | 104 | 105 | # Implementation 106 | ## Agents 107 | You should not need to modify anything in the `hacktrick_ai` directory, as it contains the environment you will use; your implementation and submission are discussed below. The overview above is only included for completeness. 108 | In `hacktrick_agent.py` you will find two base classes, `MainAgent()` and `OptionalAgent()`. Implement them according to the following cases. 109 | - In single mode, implement only the `MainAgent()` class and make sure your logic is correct for the `action()` method. 110 | - In collaborative mode, implement both classes if you want different agent logic, and set `share_agent_logic` to `False`. 111 | - In collaborative mode, implement `MainAgent()` only if you want to apply the same logic to both agents, and set `share_agent_logic` to `True`. 112 | 113 | 114 | ## Visualizing Locally 115 | Follow the steps in the `hackathon_tutorial.ipynb` notebook. 116 | 117 | Note: 118 | - The `horizon` variable corresponds to the number of timesteps. 119 | - Setting `num_games` to more than one will output the average score of these games. Feel free to adjust this parameter when testing, but we will be evaluating on one game only. 120 | 121 | 122 | ## Submission 123 | - In `hacktrick_agent.py` you will find two base classes, `MainAgent()` and `OptionalAgent()`. Implement your logic in these classes. 124 | - Run this command: `python3 client.py --team_name=TEAM_NAME --password=PASSWORD --mode=MODE --layout=LAYOUT_NAME`. Note that `mode` is either `single` or `collaborative`. 125 | 126 | 127 | # Reinforcement Learning Modules Usage 128 | Before proceeding, it is important to note that there are two primary groups of hyperparameter defaults, `local` and `production`. Which one is selected is controlled by the `RUN_ENV` environment variable, which defaults to `production`. In order to use local hyperparameters, run 129 | ```bash 130 | $ export RUN_ENV=local 131 | ``` 132 | 133 | Your model architecture should go in the `ppo_rllib.py` file. You need to develop a PPO model using the boilerplate code provided there, which gives you an idea of the model's inputs and outputs. You do not need to worry about the training loop, as it is handled by the Ray library in the background; a minimal sketch of one possible architecture is shown below.
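To make the boilerplate concrete, here is a minimal sketch of one way the dummy `Flatten` layer in `RllibPPOModel.__init__` could be replaced with a small convolutional torso driven by the custom params (`NUM_CONV_LAYERS`, `NUM_FILTERS`, `NUM_HIDDEN_LAYERS`, `SIZE_HIDDEN_LAYERS`). This is not part of the starter code; the helper name `build_torso`, the 3x3 kernel size, and the ReLU activations are illustrative assumptions.

```python
import tensorflow as tf

def build_torso(inputs, num_convs, num_filters, num_hidden_layers, size_hidden_layers):
    """Convolutional torso over the (h x w x channels) grid observation, followed by dense layers."""
    out = inputs
    for i in range(num_convs):
        # Same-padded 3x3 convolutions keep the grid shape between layers
        out = tf.keras.layers.Conv2D(filters=num_filters, kernel_size=3, padding="same",
                                     activation="relu", name="conv_%d" % i)(out)
    out = tf.keras.layers.Flatten()(out)
    for i in range(num_hidden_layers):
        out = tf.keras.layers.Dense(size_hidden_layers, activation="relu", name="fc_%d" % i)(out)
    return out
```

Inside `RllibPPOModel.__init__` you could then call `out = build_torso(out, num_convs, num_filters, num_hidden_layers, size_hidden_layers)` in place of the dummy `Flatten` layer and keep the existing `layer_out`/`value_out` heads unchanged.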
Your only concern should be the model architecture; if you need to change the reward function, check the `get_dense_reward()` method in `rllib/rllib.py`. 134 | Training of agents is done through the `ppo_rllib_client.py` script. It has the following usage: 135 | ```bash 136 | ppo_rllib_client.py [with [<param>=<value>] ... ] 137 | ``` 138 | 139 | For example, the following snippet trains a self-play PPO agent on seeds 1, 2, and 3, with learning rate `1e-3`, on the `"cramped_room"` layout for `5` iterations without using any GPUs. The rest of the parameters are left at their defaults. 140 | ``` 141 | (venv) ppo $ python ppo_rllib_client.py with seeds="[1, 2, 3]" lr=1e-3 layout_name=cramped_room num_training_iters=5 num_gpus=0 experiment_name="my_agent" 142 | ``` 143 | For a complete list of all hyperparameters as well as their local and production defaults, refer to the `my_config` section of `ppo_rllib_client.py`. 144 | 145 | 146 | Training results and checkpoints are stored in a directory called `~/ray_results/my_agent__`. You can visualize the results using TensorBoard: 147 | ```bash 148 | (venv) $ cd ~/ray_results 149 | (venv) ray_results $ tensorboard --logdir . 150 | ``` 151 | The last command assumes you have TensorBoard installed in a GUI-enabled Linux environment. If you are using WSL or Colab, adapt how you launch TensorBoard accordingly. 152 | -------------------------------------------------------------------------------- /hacktrick_rl/hacktrick_rl/ppo/ppo_rllib_client.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore", category=DeprecationWarning) 3 | #!/usr/bin/env python -W ignore::DeprecationWarning 4 | 5 | # All imports except rllib 6 | import argparse, os, sys 7 | from hacktrick_ai_py.agents.benchmarking import AgentEvaluator 8 | import numpy as np 9 | 10 | # environment variable that tells us whether this code is running on the server or not 11 | LOCAL_TESTING = os.getenv('RUN_ENV', 'production') == 'local' 12 | 13 | # Sacred setup (must be before rllib imports) 14 | from sacred import Experiment 15 | ex = Experiment("PPO RLLib") 16 | 17 | # Necessary work-around to make sacred pickling compatible with rllib 18 | from sacred import SETTINGS 19 | SETTINGS.CONFIG.READ_ONLY_CONFIG = False 20 | 21 | # Slack notification configuration 22 | from sacred.observers import SlackObserver 23 | if os.path.exists('slack.json') and not LOCAL_TESTING: 24 | slack_obs = SlackObserver.from_config('slack.json') 25 | ex.observers.append(slack_obs) 26 | 27 | # Necessary for capturing stdout in multiprocessing setting 28 | SETTINGS.CAPTURE_MODE = 'sys' 29 | 30 | # rllib and rllib-dependent imports 31 | # Note: tensorflow and tensorflow dependent imports must also come after rllib imports 32 | # This is because rllib disables eager execution.
Otherwise, it must be manually disabled 33 | import ray 34 | from ray.tune.result import DEFAULT_RESULTS_DIR 35 | from ray.tune.registry import register_env 36 | from ray.rllib.models import ModelCatalog 37 | from ray.rllib.agents.ppo.ppo import PPOTrainer 38 | from hacktrick_rl.ppo.ppo_rllib import RllibPPOModel 39 | from hacktrick_rl.rllib.rllib import HacktrickMultiAgent, save_trainer, gen_trainer_from_params 40 | 41 | 42 | ###################### Temp Documentation ####################### 43 | # run the following command in order to train a PPO self-play # 44 | # agent with the static parameters listed in my_config # 45 | # # 46 | # python ppo_rllib_client.py # 47 | # # 48 | # In order to view the results of training, run the following # 49 | # command # 50 | # # 51 | # tensorboard --log-dir ~/ray_results/ # 52 | # # 53 | ################################################################# 54 | 55 | # Dummy wrapper to pass rllib type checks 56 | def _env_creator(env_config): 57 | # Re-import required here to work with serialization 58 | from hacktrick_rl.rllib.rllib import HacktrickMultiAgent 59 | return HacktrickMultiAgent.from_config(env_config) 60 | 61 | @ex.config 62 | def my_config(): 63 | ### Model params ### 64 | 65 | # Whether the model is for single or collaborative training 66 | mode = 'single' 67 | 68 | # Whether dense reward should come from potential function or not 69 | use_phi = True 70 | 71 | # Base model params 72 | NUM_HIDDEN_LAYERS = 3 73 | SIZE_HIDDEN_LAYERS = 64 74 | NUM_FILTERS = 25 75 | NUM_CONV_LAYERS = 3 76 | 77 | # whether to use D2RL https://arxiv.org/pdf/2010.09163.pdf (concatenation the result of last conv layer to each hidden layer); 78 | D2RL = False 79 | ### Training Params ### 80 | 81 | num_workers = 1 if not LOCAL_TESTING else 1 82 | 83 | # list of all random seeds to use for experiments, used to reproduce results 84 | seeds = [0] 85 | 86 | # Placeholder for random for current trial 87 | seed = 2229 88 | 89 | # Number of gpus the central driver should use 90 | num_gpus = 0 if LOCAL_TESTING else 1 91 | 92 | # How many environment timesteps will be simulated (across all environments) 93 | # for one set of gradient updates. Is divided equally across environments 94 | train_batch_size = 12000 if not LOCAL_TESTING else 800 95 | 96 | # size of minibatches we divide up each batch into before 97 | # performing gradient steps 98 | sgd_minibatch_size = 2000 if not LOCAL_TESTING else 800 99 | 100 | # Rollout length 101 | rollout_fragment_length = 400 102 | 103 | # Whether all PPO agents should share the same policy network 104 | shared_policy = True 105 | 106 | # Number of training iterations to run 107 | num_training_iters = 500 if not LOCAL_TESTING else 2 108 | 109 | # Stepsize of SGD. 110 | lr = 5e-4 111 | 112 | # Learning rate schedule. 
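# (Illustrative note, not from the original config: RLlib's lr_schedule is documented as a list of
# [timestep, lr] pairs that are linearly interpolated, e.g. lr_schedule = [[0, 5e-4], [2000000, 1e-4]]
# to decay from 5e-4 to 1e-4 over 2M timesteps -- verify against your installed Ray version.)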
113 | lr_schedule = None 114 | 115 | # If specified, clip the global norm of gradients by this amount 116 | grad_clip = 0.1 117 | 118 | # Discount factor 119 | gamma = 0.99 120 | 121 | # Exponential decay factor for GAE (how much weight to put on monte carlo samples) 122 | # Reference: https://arxiv.org/pdf/1506.02438.pdf 123 | lmbda = 0.98 124 | 125 | # Whether the value function shares layers with the policy model 126 | vf_share_layers = True 127 | 128 | # How much the loss of the value network is weighted in overall loss 129 | vf_loss_coeff = 1e-4 130 | 131 | # Entropy bonus coefficient, will anneal linearly from _start to _end over _horizon steps 132 | entropy_coeff_start = 0.2 133 | entropy_coeff_end = 1e-2 134 | entropy_coeff_horizon = 3e6 135 | 136 | # Initial coefficient for KL divergence. 137 | kl_coeff = 0.2 138 | 139 | # PPO clipping factor 140 | clip_param = 0.05 141 | 142 | # Number of SGD iterations in each outer loop (i.e., number of epochs to 143 | # execute per train batch). 144 | num_sgd_iter = 8 if not LOCAL_TESTING else 1 145 | 146 | # How many trainind iterations (calls to trainer.train()) to run before saving model checkpoint 147 | save_freq = 25 148 | 149 | # How many training iterations to run between each evaluation 150 | evaluation_interval = 50 if not LOCAL_TESTING else 1 151 | 152 | # How many timesteps should be in an evaluation episode 153 | evaluation_ep_length = 400 154 | 155 | # Number of games to simulation each evaluation 156 | evaluation_num_games = 1 157 | 158 | # Whether to display rollouts in evaluation 159 | evaluation_display = False 160 | 161 | # Where to log the ray dashboard stats 162 | temp_dir = os.path.join(os.path.abspath(os.sep), "tmp", "ray_tmp") 163 | 164 | # Where to store model checkpoints and training stats 165 | results_dir = DEFAULT_RESULTS_DIR 166 | 167 | # Whether tensorflow should execute eagerly or not 168 | eager = False 169 | 170 | # Whether to log training progress and debugging info 171 | verbose = True 172 | 173 | 174 | ### BC Params ### Kept only for backward compatability 175 | # path to pickled policy model for behavior cloning 176 | bc_model_dir = None 177 | 178 | # Whether bc agents should return action logit argmax or sample 179 | bc_stochastic = True 180 | 181 | 182 | 183 | ### Environment Params ### 184 | # Which hacktrick level to use 185 | layout_name = "cramped_room" 186 | 187 | # all_layout_names = '_'.join(layout_names) 188 | 189 | # Name of directory to store training results in (stored in ~/ray_results/) 190 | 191 | params_str = str(use_phi) + "_nw=%d_vf=%f_es=%f_en=%f_kl=%f" % ( 192 | num_workers, 193 | vf_loss_coeff, 194 | entropy_coeff_start, 195 | entropy_coeff_end, 196 | kl_coeff 197 | ) 198 | 199 | experiment_name = "{0}_{1}_{2}".format("PPO", layout_name, params_str) 200 | 201 | # Rewards the agent will receive for intermediate actions 202 | rew_shaping_params = { 203 | "PLACEMENT_IN_CONSTRUCTION_SITE_REW": 3, 204 | "CONTAINER_PICKUP_REWARD": 3, 205 | "SOLARLAB_PICKUP_REWARD": 5, 206 | "CONTAINER_DISP_DISTANCE_REW": 0, 207 | "CONSTRUCTION_SITE_DISTANCE_REW": 0, 208 | "SOLARLAB_DISTANCE_REW": 0 209 | } 210 | 211 | # Max episode length 212 | horizon = 400 213 | 214 | # Constant by which shaped rewards are multiplied by when calculating total reward 215 | reward_shaping_factor = 1.0 216 | 217 | # Linearly anneal the reward shaping factor such that it reaches zero after this number of timesteps 218 | reward_shaping_horizon = 2.5e6 219 | 220 | # Kept only for backward compatability 221 | bc_schedule = 
HacktrickMultiAgent.self_play_bc_schedule 222 | 223 | 224 | # To be passed into rl-lib model/custom_options config 225 | model_params = { 226 | "NUM_HIDDEN_LAYERS" : NUM_HIDDEN_LAYERS, 227 | "SIZE_HIDDEN_LAYERS" : SIZE_HIDDEN_LAYERS, 228 | "NUM_FILTERS" : NUM_FILTERS, 229 | "NUM_CONV_LAYERS" : NUM_CONV_LAYERS, 230 | "D2RL": D2RL 231 | } 232 | 233 | # to be passed into the rllib.PPOTrainer class 234 | training_params = { 235 | "num_workers" : num_workers, 236 | "train_batch_size" : train_batch_size, 237 | "sgd_minibatch_size" : sgd_minibatch_size, 238 | "rollout_fragment_length" : rollout_fragment_length, 239 | "num_sgd_iter" : num_sgd_iter, 240 | "lr" : lr, 241 | "lr_schedule" : lr_schedule, 242 | "grad_clip" : grad_clip, 243 | "gamma" : gamma, 244 | "lambda" : lmbda, 245 | "vf_share_layers" : vf_share_layers, 246 | "vf_loss_coeff" : vf_loss_coeff, 247 | "kl_coeff" : kl_coeff, 248 | "clip_param" : clip_param, 249 | "num_gpus" : num_gpus, 250 | "seed" : seed, 251 | "evaluation_interval" : evaluation_interval, 252 | "entropy_coeff_schedule" : [(0, entropy_coeff_start), (entropy_coeff_horizon, entropy_coeff_end)], 253 | "eager" : eager, 254 | "log_level" : "WARN" if verbose else "ERROR" 255 | } 256 | 257 | # To be passed into AgentEvaluator constructor and _evaluate function 258 | evaluation_params = { 259 | "ep_length" : evaluation_ep_length, 260 | "num_games" : evaluation_num_games, 261 | "display" : evaluation_display, 262 | "mode" : mode 263 | } 264 | 265 | 266 | environment_params = { 267 | # To be passed into HacktrickGridWorld constructor 268 | 269 | "mdp_params" : { 270 | "layout_name": layout_name, 271 | "rew_shaping_params": rew_shaping_params 272 | }, 273 | # To be passed into HacktrickEnv constructor 274 | "env_params" : { 275 | "horizon" : horizon 276 | }, 277 | 278 | # To be passed into HacktrickMultiAgent constructor 279 | "multi_agent_params" : { 280 | "reward_shaping_factor" : reward_shaping_factor, 281 | "reward_shaping_horizon" : reward_shaping_horizon, 282 | "use_phi" : use_phi, 283 | "bc_schedule" : bc_schedule, 284 | "mode" : mode 285 | } 286 | } 287 | 288 | bc_params = { 289 | "bc_policy_cls" : None, #// BehaviorCloningPolicy, 290 | "bc_config" : { 291 | "model_dir" : bc_model_dir, 292 | "stochastic" : bc_stochastic, 293 | "eager" : eager 294 | } 295 | } 296 | 297 | ray_params = { 298 | "custom_model_id" : "MyPPOModel", 299 | "custom_model_cls" : RllibPPOModel, 300 | "temp_dir" : temp_dir, 301 | "env_creator" : _env_creator 302 | } 303 | 304 | params = { 305 | "model_params" : model_params, 306 | "training_params" : training_params, 307 | "environment_params" : environment_params, 308 | "bc_params" : bc_params, 309 | "shared_policy" : shared_policy, 310 | "num_training_iters" : num_training_iters, 311 | "evaluation_params" : evaluation_params, 312 | "experiment_name" : experiment_name, 313 | "save_every" : save_freq, 314 | "seeds" : seeds, 315 | "results_dir" : results_dir, 316 | "ray_params" : ray_params, 317 | "verbose" : verbose 318 | } 319 | 320 | 321 | def run(params): 322 | # Retrieve the tune.Trainable object that is used for the experiment 323 | trainer = gen_trainer_from_params(params) 324 | 325 | # Object to store training results in 326 | result = {} 327 | 328 | # Training loop 329 | for i in range(params['num_training_iters']): 330 | if params['verbose']: 331 | print("Starting training iteration", i) 332 | result = trainer.train() 333 | 334 | if i % params['save_every'] == 0: 335 | save_path = save_trainer(trainer, params) 336 | if params['verbose']: 337 | 
print("saved trainer at", save_path) 338 | 339 | # Save the state of the experiment at end 340 | save_path = save_trainer(trainer, params) 341 | if params['verbose']: 342 | print("saved trainer at", save_path) 343 | 344 | return result 345 | 346 | 347 | @ex.automain 348 | def main(params): 349 | # List of each random seed to run 350 | seeds = params['seeds'] 351 | del params['seeds'] 352 | 353 | # List to store results dicts (to be passed to sacred slack observer) 354 | results = [] 355 | 356 | # Train an agent to completion for each random seed specified 357 | for seed in seeds: 358 | # Override the seed 359 | params['training_params']['seed'] = seed 360 | 361 | # Do the thing 362 | result = run(params) 363 | results.append(result) 364 | 365 | # Return value gets sent to our slack observer for notification 366 | average_sparse_reward = np.mean([res['custom_metrics']['sparse_reward_mean'] for res in results]) 367 | average_episode_reward = np.mean([res['episode_reward_mean'] for res in results]) 368 | return { "average_sparse_reward" : average_sparse_reward, "average_total_reward" : average_episode_reward } -------------------------------------------------------------------------------- /hacktrick_rl/hacktrick_rl/rllib/rllib.py: -------------------------------------------------------------------------------- 1 | from hacktrick_ai_py.mdp.actions import Action 2 | from hacktrick_ai_py.mdp.hacktrick_env import HacktrickEnv 3 | from hacktrick_ai_py.mdp.hacktrick_mdp import HacktrickGridworld, EVENT_TYPES 4 | from hacktrick_ai_py.agents.benchmarking import AgentEvaluator 5 | from hacktrick_ai_py.agents.agent import Agent, AgentPair, StayAgent 6 | from ray.tune.registry import register_env 7 | from ray.tune.logger import UnifiedLogger 8 | from ray.tune.result import DEFAULT_RESULTS_DIR 9 | from ray.rllib.env.multi_agent_env import MultiAgentEnv 10 | from ray.rllib.agents.callbacks import DefaultCallbacks 11 | from ray.rllib.agents.ppo.ppo import PPOTrainer 12 | from ray.rllib.models import ModelCatalog 13 | from hacktrick_rl.rllib.utils import softmax, get_base_ae, get_required_arguments, iterable_equal 14 | from datetime import datetime 15 | import tempfile 16 | import gym 17 | import numpy as np 18 | import os, copy, dill 19 | import ray 20 | import logging 21 | 22 | action_space = gym.spaces.Discrete(len(Action.ALL_ACTIONS)) 23 | obs_space = gym.spaces.Discrete(len(Action.ALL_ACTIONS)) 24 | timestr = datetime.today().strftime("%Y-%m-%d_%H-%M-%S") 25 | 26 | 27 | class RlLibAgent(Agent): 28 | """ 29 | Class for wrapping a trained RLLib Policy object into an Hacktrick compatible Agent 30 | """ 31 | def __init__(self, policy, agent_index, featurize_fn): 32 | self.policy = policy 33 | self.agent_index = agent_index 34 | self.featurize = featurize_fn 35 | 36 | def reset(self): 37 | # Get initial rnn states and add batch dimension to each 38 | if hasattr(self.policy.model, 'get_initial_state'): 39 | self.rnn_state = [np.expand_dims(state, axis=0) for state in self.policy.model.get_initial_state()] 40 | elif hasattr(self.policy, "get_initial_state"): 41 | self.rnn_state = [np.expand_dims(state, axis=0) for state in self.policy.get_initial_state()] 42 | else: 43 | self.rnn_state = [] 44 | 45 | def action_probabilities(self, state): 46 | """ 47 | Arguments: 48 | - state (Hacktrick_mdp.HacktrickState) object encoding the global view of the environment 49 | returns: 50 | - Normalized action probabilities determined by self.policy 51 | """ 52 | # Preprocess the environment state 53 | obs = 
self.featurize(state, debug=False) 54 | my_obs = obs[self.agent_index] 55 | 56 | # Compute non-normalized log probabilities from the underlying model 57 | logits = self.policy.compute_actions(np.array([my_obs]), self.rnn_state)[2]['action_dist_inputs'] 58 | 59 | # Softmax in numpy to convert logits to normalized probabilities 60 | return softmax(logits) 61 | 62 | def action(self, state): 63 | """ 64 | Arguments: 65 | - state (Hacktrick_mdp.HacktrickState) object encoding the global view of the environment 66 | returns: 67 | - the argmax action for a single observation state 68 | - action_info (dict) that stores action probabilities under 'action_probs' key 69 | """ 70 | # Preprocess the environment state 71 | obs = self.featurize(state) 72 | my_obs = obs[self.agent_index] 73 | 74 | # Use Rllib.Policy class to compute action argmax and action probabilities 75 | [action_idx], rnn_state, info = self.policy.compute_actions(np.array([my_obs]), self.rnn_state) 76 | agent_action = Action.INDEX_TO_ACTION[action_idx] 77 | 78 | # Softmax in numpy to convert logits to normalized probabilities 79 | logits = info['action_dist_inputs'] 80 | action_probabilities = softmax(logits) 81 | 82 | agent_action_info = {'action_probs' : action_probabilities} 83 | self.rnn_state = rnn_state 84 | 85 | return agent_action, agent_action_info 86 | 87 | 88 | class HacktrickMultiAgent(MultiAgentEnv): 89 | """ 90 | Class used to wrap HacktrickEnv in an Rllib compatible multi-agent environment 91 | """ 92 | 93 | # List of all agent types currently supported 94 | supported_agents = ['ppo', 'bc'] 95 | 96 | # Default bc_schedule, includes no bc agent at any time 97 | bc_schedule = self_play_bc_schedule = [(0, 0), (float('inf'), 0)] 98 | 99 | # Default environment params used for creation 100 | DEFAULT_CONFIG = { 101 | # To be passed into HacktrickGridWorld constructor 102 | "mdp_params" : { 103 | "layout_name" : "cramped_room", 104 | "rew_shaping_params" : {} 105 | }, 106 | # To be passed into HacktrickEnv constructor 107 | "env_params" : { 108 | "horizon" : 400 109 | }, 110 | # To be passed into HacktrickMultiAgent constructor 111 | "multi_agent_params" : { 112 | "reward_shaping_factor" : 0.0, 113 | "reward_shaping_horizon" : 0, 114 | "bc_schedule" : self_play_bc_schedule, 115 | "use_phi" : True, 116 | "mode" : 'single' 117 | } 118 | } 119 | 120 | def __init__(self, base_env, reward_shaping_factor=0.0, reward_shaping_horizon=0, 121 | bc_schedule=None, use_phi=True, mode='single'): 122 | """ 123 | base_env: HacktrickEnv 124 | reward_shaping_factor (float): Coefficient multiplied by dense reward before adding to sparse reward to determine shaped reward 125 | reward_shaping_horizon (int): Timestep by which the reward_shaping_factor reaches zero through linear annealing 126 | bc_schedule (list[tuple]): List of (t_i, v_i) pairs where v_i represents the value of bc_factor at timestep t_i 127 | with linear interpolation in between the t_i 128 | use_phi (bool): Whether to use 'shaped_r_by_agent' or 'phi_s_prime' - 'phi_s' to determine dense reward 129 | """ 130 | if bc_schedule: 131 | self.bc_schedule = bc_schedule 132 | self._validate_schedule(self.bc_schedule) 133 | self.base_env = base_env 134 | # since we are not passing featurize_fn in as an argument, we create it here and check its validity 135 | self.featurize_fn_map = { 136 | "ppo": lambda state: self.base_env.lossless_state_encoding_mdp(state), 137 | "bc": lambda state: self.base_env.featurize_state_mdp(state) 138 | } 139 | 
self._validate_featurize_fns(self.featurize_fn_map) 140 | self._initial_reward_shaping_factor = reward_shaping_factor 141 | self.reward_shaping_factor = reward_shaping_factor 142 | self.reward_shaping_horizon = reward_shaping_horizon 143 | self.use_phi = use_phi 144 | self.mode = mode 145 | self._setup_observation_space() 146 | self.action_space = gym.spaces.Discrete(len(Action.ALL_ACTIONS)) 147 | self.anneal_bc_factor(0) 148 | self.reset() 149 | 150 | def _validate_featurize_fns(self, mapping): 151 | assert 'ppo' in mapping, "At least one ppo agent must be specified" 152 | for k, v in mapping.items(): 153 | assert k in self.supported_agents, "Unsupported agent type in featurize mapping {0}".format(k) 154 | assert callable(v), "Featurize_fn values must be functions" 155 | assert len(get_required_arguments(v)) == 1, "Featurize_fn value must accept exactly one argument" 156 | 157 | def _validate_schedule(self, schedule): 158 | timesteps = [p[0] for p in schedule] 159 | values = [p[1] for p in schedule] 160 | 161 | assert len(schedule) >= 2, "Need at least 2 points to linearly interpolate schedule" 162 | assert schedule[0][0] == 0, "Schedule must start at timestep 0" 163 | assert all([t >=0 for t in timesteps]), "All timesteps in schedule must be non-negative" 164 | assert all([v >=0 and v <= 1 for v in values]), "All values in schedule must be between 0 and 1" 165 | assert sorted(timesteps) == timesteps, "Timesteps must be in increasing order in schedule" 166 | 167 | # To ensure we flatline after passing last timestep 168 | if (schedule[-1][0] < float('inf')): 169 | schedule.append((float('inf'), schedule[-1][1])) 170 | 171 | def _setup_observation_space(self): 172 | dummy_state = self.base_env.mdp.get_standard_start_state() 173 | 174 | # ppo observation 175 | featurize_fn_ppo = lambda state: self.base_env.lossless_state_encoding_mdp(state) 176 | obs_shape = featurize_fn_ppo(dummy_state)[0].shape 177 | high = np.ones(obs_shape) * float("inf") 178 | low = np.ones(obs_shape) * 0 179 | self.ppo_observation_space = gym.spaces.Box(np.float32(low), np.float32(high), dtype=np.float32) 180 | 181 | # bc observation 182 | # featurize_fn_bc = lambda state: self.base_env.featurize_state_mdp(state) 183 | # obs_shape = featurize_fn_bc(dummy_state)[0].shape 184 | # high = np.ones(obs_shape) * 100 185 | # low = np.ones(obs_shape) * -100 186 | # self.bc_observation_space = gym.spaces.Box(np.float32(low), np.float32(high), dtype=np.float32) 187 | 188 | def _get_featurize_fn(self, agent_id): 189 | if agent_id.startswith('ppo'): 190 | return lambda state: self.base_env.lossless_state_encoding_mdp(state) 191 | if agent_id.startswith('bc'): 192 | return lambda state: self.base_env.featurize_state_mdp(state) 193 | raise ValueError("Unsupported agent type {0}".format(agent_id)) 194 | 195 | def _get_obs(self, state): 196 | ob_p0 = self._get_featurize_fn(self.curr_agents[0])(state)[0] 197 | ob_p1 = self._get_featurize_fn(self.curr_agents[1])(state)[1] 198 | return ob_p0.astype(np.float32), ob_p1.astype(np.float32) 199 | 200 | def _populate_agents(self): 201 | # Always include at least one ppo agent (i.e. 
bc_sp not supported for simplicity) 202 | agents = ['ppo'] 203 | 204 | # Coin flip to determine whether other agent should be ppo or bc 205 | other_agent = 'bc' if np.random.uniform() < self.bc_factor else 'ppo' 206 | agents.append(other_agent) 207 | 208 | # Randomize starting indices 209 | np.random.shuffle(agents) 210 | 211 | # Ensure agent names are unique 212 | agents[0] = agents[0] + '_0' 213 | agents[1] = agents[1] + '_1' 214 | 215 | return agents 216 | 217 | def _anneal(self, start_v, curr_t, end_t, end_v=0, start_t=0): 218 | if end_t == 0: 219 | # No annealing if horizon is zero 220 | return start_v 221 | else: 222 | off_t = curr_t - start_t 223 | # Calculate the new value based on linear annealing formula 224 | fraction = max(1 - float(off_t) / (end_t - start_t), 0) 225 | return fraction * start_v + (1 - fraction) * end_v 226 | 227 | 228 | # Hacktrick Participant Critical Message: ################################ 229 | def get_dense_reward(self, info, next_state): 230 | # To activate the calculation of the custom reward, change this to True 231 | use_custom_reward = False 232 | 233 | if use_custom_reward: 234 | # Implement your reward logic here 235 | # next_state has the next state object of type HacktrickState 236 | # to get current state use self.base_env.state 237 | pass 238 | 239 | elif self.use_phi: 240 | potential = info['phi_s_prime'] - info['phi_s'] 241 | dense_reward = (potential, potential) 242 | else: 243 | dense_reward = info["shaped_r_by_agent"] 244 | 245 | return dense_reward 246 | 247 | # Hacktrick Participant Critical Message: ################################ 248 | def step(self, action_dict): 249 | """ 250 | action: 251 | (agent with index self.agent_idx action, other agent action) 252 | is a tuple with the joint action of the primary and secondary agents in index format 253 | 254 | returns: 255 | observation: formatted to be standard input for self.agent_idx's policy 256 | """ 257 | if self.mode == 'single': 258 | action = [action_dict[self.curr_agents[0]]] 259 | assert all(self.action_space.contains(a) for a in action), "%r (%s) invalid"%(action, type(action)) 260 | joint_action = [Action.INDEX_TO_ACTION[action[0]], Action.STAY] 261 | # take a step in the current base environment 262 | elif self.mode == 'collaborative': 263 | action = [action_dict[self.curr_agents[0]], action_dict[self.curr_agents[1]]] 264 | assert all(self.action_space.contains(a) for a in action), "%r (%s) invalid"%(action, type(action)) 265 | joint_action = [Action.INDEX_TO_ACTION[a] for a in action] 266 | # take a step in the current base environment 267 | else: 268 | raise ValueError('mode has to be either single or collaborative') 269 | 270 | next_state, sparse_reward, done, info = self.base_env.step(joint_action, display_phi=self.use_phi) 271 | dense_reward = self.get_dense_reward(info, next_state) 272 | 273 | ob_p0, ob_p1 = self._get_obs(next_state) 274 | 275 | """ 276 | shaped_reward is the total reward calculated from your score (sparse_reward) plus the dense_reward. 277 | dense_reward is weighted by an annealing factor that decreases over time to let your agent learn 278 | from its raw score only as time progresses. 279 | You can use the provided reward function or implement your own in the get_dense_reward function above. 
280 | If you choose to do so, you can remove or change the annealing factor by changing self.reward_shaping_factor 281 | """ 282 | shaped_reward_p0 = sparse_reward + self.reward_shaping_factor * dense_reward[0] 283 | shaped_reward_p1 = sparse_reward + self.reward_shaping_factor * dense_reward[1] 284 | 285 | obs = { self.curr_agents[0]: ob_p0, self.curr_agents[1]: ob_p1 } 286 | rewards = { self.curr_agents[0]: shaped_reward_p0, self.curr_agents[1]: shaped_reward_p1 } 287 | dones = { self.curr_agents[0]: done, self.curr_agents[1]: done, "__all__": done } 288 | infos = { self.curr_agents[0]: info, self.curr_agents[1]: info } 289 | return obs, rewards, dones, infos 290 | 291 | def reset(self, regen_mdp=True): 292 | """ 293 | When training on individual maps, we want to randomize which agent is assigned to which 294 | starting location, in order to make sure that the agents are trained to be able to 295 | complete the task starting at either of the hardcoded positions. 296 | 297 | NOTE: a nicer way to do this would be to just randomize starting positions, and not 298 | have to deal with randomizing indices. 299 | """ 300 | self.base_env.reset(regen_mdp) 301 | self.curr_agents = self._populate_agents() 302 | ob_p0, ob_p1 = self._get_obs(self.base_env.state) 303 | return {self.curr_agents[0]: ob_p0, self.curr_agents[1]: ob_p1} 304 | 305 | def anneal_reward_shaping_factor(self, timesteps): 306 | """ 307 | Set the current reward shaping factor such that we anneal linearly until self.reward_shaping_horizon 308 | timesteps, given that we are currently at timestep "timesteps" 309 | """ 310 | new_factor = self._anneal(self._initial_reward_shaping_factor, timesteps, self.reward_shaping_horizon) 311 | self.set_reward_shaping_factor(new_factor) 312 | 313 | def anneal_bc_factor(self, timesteps): 314 | """ 315 | Set the current bc factor such that we anneal linearly until self.bc_factor_horizon 316 | timesteps, given that we are currently at timestep "timesteps" 317 | """ 318 | p_0 = self.bc_schedule[0] 319 | p_1 = self.bc_schedule[1] 320 | i = 2 321 | while timesteps > p_1[0] and i < len(self.bc_schedule): 322 | p_0 = p_1 323 | p_1 = self.bc_schedule[i] 324 | i += 1 325 | start_t, start_v = p_0 326 | end_t, end_v = p_1 327 | new_factor = self._anneal(start_v, timesteps, end_t, end_v, start_t) 328 | self.set_bc_factor(new_factor) 329 | 330 | def set_reward_shaping_factor(self, factor): 331 | self.reward_shaping_factor = factor 332 | 333 | def set_bc_factor(self, factor): 334 | self.bc_factor = factor 335 | 336 | def seed(self, seed): 337 | """ 338 | set global random seed to make environment deterministic 339 | """ 340 | # Our environment is already deterministic 341 | pass 342 | 343 | @classmethod 344 | def from_config(cls, env_config): 345 | """ 346 | Factory method for generating environments in line with rllib guidelines 347 | 348 | env_config (dict): Must contain keys 'mdp_params', 'env_params' and 'multi_agent_params', the last of which 349 | gets fed into the HacktrickMultiAgent constructor 350 | 351 | Returns: 352 | HacktrickMultiAgent instance specified by env_config params 353 | """ 354 | assert env_config and "env_params" in env_config and "multi_agent_params" in env_config 355 | assert "mdp_params" in env_config or "mdp_params_schedule_fn" in env_config, \ 356 | "either a fixed set of mdp params or a schedule function needs to be given" 357 | # "layout_name" and "rew_shaping_params" 358 | if "mdp_params" in env_config: 359 | mdp_params = env_config["mdp_params"] 360 | outer_shape = None 361 | 
mdp_params_schedule_fn = None 362 | elif "mdp_params_schedule_fn" in env_config: 363 | mdp_params = None 364 | outer_shape = env_config["outer_shape"] 365 | mdp_params_schedule_fn = env_config["mdp_params_schedule_fn"] 366 | 367 | # "start_state_fn" and "horizon" 368 | env_params = env_config["env_params"] 369 | # "reward_shaping_factor" 370 | multi_agent_params = env_config["multi_agent_params"] 371 | base_ae = get_base_ae(mdp_params, env_params, outer_shape, mdp_params_schedule_fn) 372 | base_env = base_ae.env 373 | 374 | return cls(base_env, **multi_agent_params) 375 | 376 | 377 | 378 | ################## 379 | # Training Utils # 380 | ################## 381 | 382 | class TrainingCallbacks(DefaultCallbacks): 383 | def on_episode_start(self, worker, base_env, policies, episode, **kwargs): 384 | pass 385 | 386 | def on_episode_step(self, worker, base_env, episode, **kwargs): 387 | pass 388 | 389 | def on_episode_end(self, worker, base_env, policies, episode, **kwargs): 390 | """ 391 | Used in order to add custom metrics to our tensorboard data 392 | 393 | sparse_reward (int) - total reward from deliveries the agent earned this episode 394 | shaped_reward (int) - total reward-shaping reward the agent earned this episode 395 | """ 396 | # Get rllib.HacktrickMultiAgentEnv reference from rllib wrapper 397 | env = base_env.get_unwrapped()[0] 398 | # Both agents share the same info so it doesn't matter whose we use, just use 0th agent's 399 | info_dict = episode.last_info_for(env.curr_agents[0]) 400 | 401 | ep_info = info_dict["episode"] 402 | game_stats = ep_info["ep_game_stats"] 403 | 404 | # List of episode stats we'd like to collect by agent 405 | stats_to_collect = EVENT_TYPES 406 | 407 | # Parse info dicts generated by HacktrickEnv 408 | tot_sparse_reward = ep_info["ep_sparse_r"] 409 | tot_shaped_reward = ep_info["ep_shaped_r"] 410 | 411 | 412 | # Store metrics where they will be visible to rllib for tensorboard logging 413 | episode.custom_metrics["sparse_reward"] = tot_sparse_reward 414 | episode.custom_metrics["shaped_reward"] = tot_shaped_reward 415 | 416 | # Store per-agent game stats to rllib info dicts 417 | for stat in stats_to_collect: 418 | stats = game_stats[stat] 419 | episode.custom_metrics[stat + "_agent_0"] = len(stats[0]) 420 | episode.custom_metrics[stat + "_agent_1"] = len(stats[1]) 421 | 422 | def on_sample_end(self, worker, samples, **kwargs): 423 | pass 424 | 425 | # Executes at the end of a call to Trainer.train; we use this to update environment params (like annealing shaped rewards) 426 | def on_train_result(self, trainer, result, **kwargs): 427 | # Anneal the reward shaping coefficient based on environment parameters and current timestep 428 | timestep = result['timesteps_total'] 429 | trainer.workers.foreach_worker( 430 | lambda ev: ev.foreach_env( 431 | lambda env: env.anneal_reward_shaping_factor(timestep))) 432 | 433 | # Anneal the bc factor based on environment parameters and current timestep 434 | trainer.workers.foreach_worker( 435 | lambda ev: ev.foreach_env( 436 | lambda env: env.anneal_bc_factor(timestep))) 437 | 438 | def on_postprocess_trajectory(self, worker, episode, agent_id, policy_id, policies, postprocessed_batch, original_batches, **kwargs): 439 | pass 440 | 441 | 442 | def get_rllib_eval_function(eval_params, eval_mdp_params, env_params, outer_shape, agent_0_policy_str='ppo', agent_1_policy_str='ppo', verbose=False): 443 | """ 444 | Used to "curry" rllib evaluation function by wrapping additional parameters needed in a local scope, and returning a 445 | 
function with rllib custom_evaluation_function compatible signature 446 | 447 | eval_params (dict): Contains 'num_games' (int), 'display' (bool), and 'ep_length' (int) 448 | mdp_params (dict): Used to create underlying HacktrickMDP (see that class for configuration) 449 | env_params (dict): Used to create underlying HacktrickEnv (see that class for configuration) 450 | outer_shape (list): a list of 2 items specifying the outer shape of the evaluation layout 451 | agent_0_policy_str (str): Key associated with the rllib policy object used to select actions (must be either 'ppo' or 'bc') 452 | agent_1_policy_str (str): Key associated with the rllib policy object used to select actions (must be either 'ppo' or 'bc') 453 | Note: Agent policies are shuffled each time, so agent_0_policy_str and agent_1_policy_str are symmetric 454 | Returns: 455 | _evaluate (func): Runs an evaluation specified by the curried params, ignores the rllib parameter 'evaluation_workers' 456 | """ 457 | 458 | def _evaluate(trainer, evaluation_workers): 459 | if verbose: 460 | print("Computing rollout of current trained policy") 461 | 462 | # Randomize starting indices 463 | policies = [agent_0_policy_str, agent_1_policy_str] 464 | np.random.shuffle(policies) 465 | agent_0_policy, agent_1_policy = policies 466 | 467 | # Get the corresponding rllib policy objects for each policy string name 468 | agent_0_policy = trainer.get_policy(agent_0_policy) 469 | agent_1_policy = trainer.get_policy(agent_1_policy) 470 | 471 | agent_0_feat_fn = agent_1_feat_fn = None 472 | if 'bc' in policies: 473 | base_ae = get_base_ae(eval_mdp_params, env_params) 474 | base_env = base_ae.env 475 | bc_featurize_fn = lambda state : base_env.featurize_state_mdp(state) 476 | if policies[0] == 'bc': 477 | agent_0_feat_fn = bc_featurize_fn 478 | if policies[1] == 'bc': 479 | agent_1_feat_fn = bc_featurize_fn 480 | 481 | # Compute the evaluation rollout. Note this doesn't use the evaluation_workers passed in by rllib, so this 482 | # computation all happens on the CPU. 
Could change this if evaluation becomes a bottleneck 483 | results = evaluate(eval_params, eval_mdp_params, outer_shape, agent_0_policy, agent_1_policy, agent_0_feat_fn, agent_1_feat_fn, verbose=verbose) 484 | 485 | # Log any metrics we care about for rllib tensorboard visualization 486 | metrics = {} 487 | metrics['average_sparse_reward'] = np.mean(results['ep_returns']) 488 | return metrics 489 | 490 | return _evaluate 491 | 492 | 493 | def evaluate(eval_params, mdp_params, outer_shape, agent_0_policy, agent_1_policy, agent_0_featurize_fn=None, agent_1_featurize_fn=None, verbose=False): 494 | """ 495 | Used to visualize rollouts of trained policies 496 | 497 | eval_params (dict): Contains configurations such as the rollout length, number of games, and whether to display rollouts 498 | mdp_params (dict): HacktrickMDP compatible configuration used to create the environment used for evaluation 499 | outer_shape (list): a list of 2 items specifying the outer shape of the evaluation layout 500 | agent_0_policy (rllib.Policy): Policy instance used to map states to action logits for agent 0 501 | agent_1_policy (rllib.Policy): Policy instance used to map states to action logits for agent 1 502 | agent_0_featurize_fn (func): Used to preprocess states for agent 0, defaults to lossless_state_encoding if 'None' 503 | agent_1_featurize_fn (func): Used to preprocess states for agent 1, defaults to lossless_state_encoding if 'None' 504 | """ 505 | if verbose: 506 | print("eval mdp params", mdp_params) 507 | evaluator = get_base_ae(mdp_params, {"horizon" : eval_params['ep_length'], "num_mdp":1}, outer_shape) 508 | 509 | # Override pre-processing functions with defaults if necessary 510 | agent_0_featurize_fn = agent_0_featurize_fn if agent_0_featurize_fn else evaluator.env.lossless_state_encoding_mdp 511 | agent_1_featurize_fn = agent_1_featurize_fn if agent_1_featurize_fn else evaluator.env.lossless_state_encoding_mdp 512 | 513 | # Wrap rllib policies in hacktrick agents to be compatible with Evaluator code 514 | agent0 = RlLibAgent(agent_0_policy, agent_index=0, featurize_fn=agent_0_featurize_fn) 515 | if eval_params['mode'] == 'single': 516 | agent1 = StayAgent() 517 | elif eval_params['mode'] == 'collaborative': 518 | agent1 = RlLibAgent(agent_1_policy, agent_index=1, featurize_fn=agent_1_featurize_fn) 519 | else: 520 | raise ValueError('mode has to be either single or collaborative') 521 | 522 | # Compute rollouts 523 | if 'store_dir' not in eval_params: 524 | eval_params['store_dir'] = None 525 | if 'display_phi' not in eval_params: 526 | eval_params['display_phi'] = False 527 | results = evaluator.evaluate_agent_pair(AgentPair(agent0, agent1), 528 | num_games=eval_params['num_games'], 529 | display=eval_params['display'], 530 | dir=eval_params['store_dir'], 531 | display_phi=eval_params['display_phi'], 532 | info=verbose) 533 | 534 | return results 535 | 536 | 537 | ########################### 538 | # rllib.Trainer functions # 539 | ########################### 540 | 541 | 542 | def gen_trainer_from_params(params): 543 | # All ray environment set-up 544 | if not ray.is_initialized(): 545 | init_params = { 546 | "ignore_reinit_error" : True, 547 | "include_webui" : False, 548 | "temp_dir" : params['ray_params']['temp_dir'], 549 | "log_to_driver" : params['verbose'], 550 | "logging_level" : logging.INFO if params['verbose'] else logging.CRITICAL 551 | } 552 | ray.init(**init_params) 553 | register_env("hacktrick_multi_agent", params['ray_params']['env_creator']) 554 | 
ModelCatalog.register_custom_model(params['ray_params']['custom_model_id'], params['ray_params']['custom_model_cls']) 555 | 556 | # Parse params 557 | model_params = params['model_params'] 558 | training_params = params['training_params'] 559 | environment_params = params['environment_params'] 560 | evaluation_params = params['evaluation_params'] 561 | bc_params = params['bc_params'] 562 | multi_agent_params = params['environment_params']['multi_agent_params'] 563 | 564 | env = HacktrickMultiAgent.from_config(environment_params) 565 | 566 | # Returns a properly formatted policy tuple to be passed into the PPOTrainer config 567 | def gen_policy(policy_type="ppo"): 568 | # supported policy types thus far 569 | #// assert policy_type in ["ppo", "bc"] 570 | assert policy_type == "ppo", 'ppo is the main policy supported, remove this assertion only if you know what you are doing' 571 | 572 | if policy_type == "ppo": 573 | config = { 574 | "model" : { 575 | "custom_options" : model_params, 576 | 577 | "custom_model" : "MyPPOModel" 578 | } 579 | } 580 | return (None, env.ppo_observation_space, env.action_space, config) 581 | elif policy_type == "bc": 582 | bc_cls = bc_params['bc_policy_cls'] 583 | bc_config = bc_params['bc_config'] 584 | return (bc_cls, env.bc_observation_space, env.action_space, bc_config) 585 | 586 | # Rllib compatible way of setting the directory we store agent checkpoints in 587 | logdir_prefix = "{0}_{1}_{2}".format(params["experiment_name"], params['training_params']['seed'], timestr) 588 | def custom_logger_creator(config): 589 | """Creates a Unified logger that stores results in <results_dir>/<experiment_name>_<seed>_<timestamp> 590 | """ 591 | results_dir = params['results_dir'] 592 | if not os.path.exists(results_dir): 593 | try: 594 | os.makedirs(results_dir) 595 | except Exception as e: 596 | print("error creating custom logging dir. 
Falling back to default logdir {}".format(DEFAULT_RESULTS_DIR)) 597 | results_dir = DEFAULT_RESULTS_DIR 598 | logdir = tempfile.mkdtemp( 599 | prefix=logdir_prefix, dir=results_dir) 600 | logger = UnifiedLogger(config, logdir, loggers=None) 601 | return logger 602 | 603 | # Create rllib compatible multi-agent config based on params 604 | multi_agent_config = {} 605 | all_policies = ['ppo'] 606 | 607 | # Whether both agents should be learned 608 | self_play = iterable_equal(multi_agent_params['bc_schedule'], HacktrickMultiAgent.self_play_bc_schedule) 609 | if not self_play: 610 | all_policies.append('bc') 611 | 612 | multi_agent_config['policies'] = { policy : gen_policy(policy) for policy in all_policies } 613 | 614 | def select_policy(agent_id): 615 | if agent_id.startswith('ppo'): 616 | return 'ppo' 617 | if agent_id.startswith('bc'): 618 | return 'bc' 619 | multi_agent_config['policy_mapping_fn'] = select_policy 620 | multi_agent_config['policies_to_train'] = 'ppo' 621 | 622 | if "outer_shape" not in environment_params: 623 | environment_params["outer_shape"] = None 624 | 625 | if "mdp_params" in environment_params: 626 | environment_params["eval_mdp_params"] = environment_params["mdp_params"] 627 | trainer = PPOTrainer(env="hacktrick_multi_agent", config={ 628 | "multiagent": multi_agent_config, 629 | "callbacks" : TrainingCallbacks, 630 | "custom_eval_function" : get_rllib_eval_function(evaluation_params, environment_params['eval_mdp_params'], environment_params['env_params'], 631 | environment_params["outer_shape"], 'ppo', 'ppo' if self_play else 'bc', 632 | verbose=params['verbose']), 633 | "env_config" : environment_params, 634 | "eager" : False, 635 | **training_params 636 | }, logger_creator=custom_logger_creator) 637 | return trainer 638 | 639 | 640 | 641 | ### Serialization ### 642 | 643 | 644 | def save_trainer(trainer, params, path=None): 645 | """ 646 | Saves a serialized trainer checkpoint at `path`. 
If none provided, the default path is 647 | ~/ray_results/<experiment_logdir>/checkpoint_<i>/checkpoint-<i> 648 | 649 | Note that `params` should follow the same schema as the dict passed into `gen_trainer_from_params` 650 | """ 651 | # Save trainer 652 | save_path = trainer.save(path) 653 | 654 | # Save params used to create trainer in /path/to/checkpoint_dir/config.pkl 655 | config = copy.deepcopy(params) 656 | config_path = os.path.join(os.path.dirname(save_path), "config.pkl") 657 | 658 | # Note that we use dill (not pickle) here because it supports function serialization 659 | with open(config_path, "wb") as f: 660 | dill.dump(config, f) 661 | return save_path 662 | 663 | def load_trainer(save_path): 664 | """ 665 | Returns a ray compatible trainer object that was previously saved at `save_path` by a call to `save_trainer` 666 | Note that `save_path` is the full path to the checkpoint FILE, not the checkpoint directory 667 | """ 668 | # Read in params used to create trainer 669 | config_path = os.path.join(os.path.dirname(save_path), "config.pkl") 670 | with open(config_path, "rb") as f: 671 | # We use dill (instead of pickle) here because we must deserialize functions 672 | config = dill.load(f) 673 | 674 | # Override this param to lower overhead in trainer creation 675 | config['training_params']['num_workers'] = 0 676 | 677 | # Get un-trained trainer object with proper config 678 | trainer = gen_trainer_from_params(config) 679 | 680 | # Load weights into dummy object 681 | trainer.restore(save_path) 682 | return trainer 683 | 684 | def get_agent_from_trainer(trainer, policy_id="ppo", agent_index=0): 685 | policy = trainer.get_policy(policy_id) 686 | dummy_env = trainer.env_creator(trainer.config['env_config']) 687 | featurize_fn = dummy_env.featurize_fn_map[policy_id] 688 | agent = RlLibAgent(policy, agent_index, featurize_fn=featurize_fn) 689 | return agent 690 | 691 | def get_agent_pair_from_trainer(trainer, policy_id_0='ppo', policy_id_1='ppo'): 692 | agent0 = get_agent_from_trainer(trainer, policy_id=policy_id_0) 693 | agent1 = get_agent_from_trainer(trainer, policy_id=policy_id_1) 694 | return AgentPair(agent0, agent1) 695 | 696 | 697 | def load_agent_pair(save_path, policy_id_0='ppo', policy_id_1='ppo'): 698 | """ 699 | Returns a Hacktrick AgentPair object whose player 0 and player 1 use the policies with 700 | IDs policy_id_0 and policy_id_1, respectively 701 | """ 702 | trainer = load_trainer(save_path) 703 | return get_agent_pair_from_trainer(trainer, policy_id_0, policy_id_1) 704 | 705 | def load_agent(save_path, policy_id='ppo', agent_index=0): 706 | """ 707 | Returns an RllibAgent (compatible with the Hacktrick Agent API) from the `save_path` to a previously 708 | serialized trainer object created with `save_trainer` 709 | 710 | The trainer can have multiple independent policies, so extract the one with ID `policy_id` to wrap in 711 | an RllibAgent 712 | 713 | Agent index indicates whether the agent is player zero or player one (or player n in the general case) 714 | as the featurization is not symmetric for both players 715 | """ 716 | trainer = load_trainer(save_path) 717 | return get_agent_from_trainer(trainer, policy_id=policy_id, agent_index=agent_index) 718 | 719 | 720 | --------------------------------------------------------------------------------
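Usage sketch: a minimal example of how the serialization helpers in rllib.py above chain together to recover playable agents from a training run. The checkpoint path is a hypothetical placeholder for a file written by save_trainer(); only functions defined above are used.

import os
from hacktrick_rl.rllib.rllib import load_agent, load_agent_pair

# Hypothetical checkpoint FILE produced by save_trainer(); point this at your own run.
checkpoint = os.path.expanduser("~/ray_results/my_experiment/checkpoint_100/checkpoint-100")

# Rebuilds the trainer from the config.pkl stored next to the checkpoint, then wraps
# its 'ppo' policy as a Hacktrick-compatible agent playing at index 0.
agent = load_agent(checkpoint, policy_id='ppo', agent_index=0)

# Or wrap both players at once for AgentPair-based evaluation.
agent_pair = load_agent_pair(checkpoint, policy_id_0='ppo', policy_id_1='ppo')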