├── hacktrick_rl ├── hacktrick_rl │ ├── ppo │ │ ├── __init__.py │ │ ├── .gitignore │ │ ├── ppo_rllib.py │ │ └── ppo_rllib_client.py │ ├── rllib │ │ ├── __init__.py │ │ ├── utils.py │ │ ├── tests.py │ │ └── rllib.py │ └── utils.py └── setup.py ├── .gitmodules ├── install.sh ├── .gitignore ├── hacktrick_agent.py ├── client.py ├── hackathon_tutorial.ipynb └── README.md /hacktrick_rl/hacktrick_rl/ppo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hacktrick_rl/hacktrick_rl/rllib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hacktrick_rl/hacktrick_rl/ppo/.gitignore: -------------------------------------------------------------------------------- 1 | hpsearch.py -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "hacktrick_ai"] 2 | path = hacktrick_ai 3 | url = https://github.com/hacktrick-hackathon/hacktrick_ai 4 | branch = master 5 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | cd hacktrick_ai 3 | pip install -e . 4 | cd ../hacktrick_rl 5 | pip install -e . 6 | 7 | cd ./hacktrick_rl 8 | [ ! -f data_dir.py ] && echo "import os; DATA_DIR = os.path.abspath('.')" >> data_dir.py 9 | 10 | pip install protobuf 11 | pip install python-socketio[asyncio_client]==4.6.0 12 | pip install python-engineio==3.13.0 -------------------------------------------------------------------------------- /hacktrick_rl/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup(name='hacktrick_rl', 6 | version='0.0.1', 7 | description='This package has shared components.', 8 | packages=find_packages(), 9 | install_requires=[ 10 | 'GitPython', 11 | 'memory_profiler', 12 | 'sacred', 13 | 'pymongo', 14 | 'dill', 15 | 'matplotlib', 16 | 'requests', 17 | 'pygame', 18 | 'numpy', 19 | 'seaborn==0.9.0', 20 | 'ray[rllib]==0.8.5' 21 | ], 22 | ) 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.swp 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv*/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | # Other 110 | .DS_Store 111 | *.key 112 | *.png 113 | 114 | # Models and run data 115 | .temp_best_model 116 | checkpoint/ 117 | data/ppo_runs/ 118 | data/ftw_runs/ 119 | data/pbt_runs/ 120 | data/agent_runs/ 121 | data/bc_runs/ 122 | data/chosen_layouts/ 123 | data/expert_agent/ 124 | data/ftw_exp/ 125 | data/pbt_exp/ 126 | data/gail_runs/ 127 | data/joint_ppo_runs/ 128 | data/ppo_exp/ 129 | 130 | # Other files 131 | transfer_agent.sh 132 | 133 | # sacred config files 134 | **/slack.json 135 | 136 | # VSCode metadata 137 | **/.vscode 138 | 139 | # Data directories 140 | **/data_dir.py 141 | 142 | # PyCharm 143 | .idea/ 144 | -------------------------------------------------------------------------------- /hacktrick_rl/hacktrick_rl/ppo/ppo_rllib.py: -------------------------------------------------------------------------------- 1 | from ray.rllib.models.tf.tf_modelv2 import TFModelV2 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | 6 | 7 | 8 | class RllibPPOModel(TFModelV2): 9 | """ 10 | Model that will map environment states to action probabilities. 
Will be shared across agents 11 | """ 12 | def __init__(self, obs_space, action_space, num_outputs, model_config, name, **kwargs): 13 | 14 | super(RllibPPOModel, self).__init__(obs_space, action_space, num_outputs, model_config, name) 15 | 16 | # params we got to pass in from the call to "run" 17 | custom_params = model_config["custom_options"] 18 | 19 | 20 | ## Parse custom network params 21 | num_hidden_layers = custom_params["NUM_HIDDEN_LAYERS"] 22 | size_hidden_layers = custom_params["SIZE_HIDDEN_LAYERS"] 23 | num_filters = custom_params["NUM_FILTERS"] 24 | num_convs = custom_params["NUM_CONV_LAYERS"] 25 | d2rl = custom_params["D2RL"] 26 | assert type(d2rl) == bool 27 | 28 | ## Model inputs 29 | # Your input is a tensor the size of the grid with each channel representing a diffetent item as stated in the documentation 30 | # For example, in the channel representing a solar cell you will have an array (h x w) with 1 if a solar cell exists in this location and 0 otherwise 31 | self.inputs = tf.keras.Input(shape=obs_space.shape, name="observations") 32 | out = self.inputs 33 | 34 | # Implement your model architicture here using the given parameters if needed 35 | 36 | # This is just a dummpy layer so that the model works out of the box 37 | # It uses normal tf functional API and you can do the same 38 | out = tf.keras.layers.Flatten()(out) 39 | 40 | ## Model ouptus 41 | # Linear last layer for action distribution logits 42 | layer_out = tf.keras.layers.Dense(self.num_outputs)(out) 43 | # Linear last layer for value function branch of model 44 | value_out = tf.keras.layers.Dense(1)(out) 45 | 46 | self.base_model = tf.keras.Model(self.inputs, [layer_out, value_out]) 47 | self.register_variables(self.base_model.variables) 48 | 49 | 50 | def forward(self, input_dict, state=None, seq_lens=None): 51 | model_out, self._value_out = self.base_model(input_dict["obs"]) 52 | return model_out, state 53 | 54 | def value_function(self): 55 | return tf.reshape(self._value_out, [-1]) -------------------------------------------------------------------------------- /hacktrick_rl/hacktrick_rl/rllib/utils.py: -------------------------------------------------------------------------------- 1 | from hacktrick_ai_py.agents.benchmarking import AgentEvaluator 2 | import numpy as np 3 | import inspect 4 | 5 | def softmax(logits): 6 | e_x = np.exp(logits.T - np.max(logits)) 7 | return (e_x / np.sum(e_x, axis=0)).T 8 | 9 | def get_base_env(mdp_params, env_params, outer_shape=None, mdp_params_schedule_fn=None): 10 | ae = get_base_ae(mdp_params, env_params, outer_shape, mdp_params_schedule_fn) 11 | return ae.env 12 | 13 | def get_base_mlam(mdp_params, env_params, outer_shape=None, mdp_params_schedule_fn=None): 14 | ae = get_base_ae(mdp_params, env_params, outer_shape, mdp_params_schedule_fn) 15 | return ae.mlam 16 | 17 | def get_base_ae(mdp_params, env_params, outer_shape=None, mdp_params_schedule_fn=None): 18 | """ 19 | mdp_params: one set of fixed mdp parameter used by the enviroment 20 | env_params: env parameters (horizon, etc) 21 | outer_shape: outer shape of the environment 22 | mdp_params_schedule_fn: the schedule for varying mdp params 23 | 24 | return: the base agent evaluator 25 | """ 26 | assert mdp_params == None or mdp_params_schedule_fn == None, "either of the two has to be null" 27 | if type(mdp_params) == dict and "layout_name" in mdp_params: 28 | ae = AgentEvaluator.from_layout_name(mdp_params=mdp_params, env_params=env_params) 29 | elif 'num_mdp' in env_params: 30 | if np.isinf(env_params['num_mdp']): 
31 | ae = AgentEvaluator.from_mdp_params_infinite(mdp_params=mdp_params, env_params=env_params, 32 | outer_shape=outer_shape, mdp_params_schedule_fn=mdp_params_schedule_fn) 33 | else: 34 | ae = AgentEvaluator.from_mdp_params_finite(mdp_params=mdp_params, env_params=env_params, 35 | outer_shape=outer_shape, mdp_params_schedule_fn=mdp_params_schedule_fn) 36 | else: 37 | # should not reach this case 38 | raise NotImplementedError() 39 | return ae 40 | 41 | # Returns the required arguments as inspect.Parameter objects in a list 42 | def get_required_arguments(fn): 43 | required = [] 44 | params = inspect.signature(fn).parameters.values() 45 | for param in params: 46 | if param.default == inspect.Parameter.empty and param.kind == param.POSITIONAL_OR_KEYWORD: 47 | required.append(param) 48 | return required 49 | 50 | def iterable_equal(a, b): 51 | if hasattr(a, '__iter__') != hasattr(b, '__iter__'): 52 | return False 53 | if not hasattr(a, '__iter__'): 54 | return a == b 55 | 56 | if len(a) != len(b): 57 | return False 58 | 59 | for elem_a, elem_b in zip(a, b): 60 | if not iterable_equal(elem_a, elem_b): 61 | return False 62 | 63 | return True -------------------------------------------------------------------------------- /hacktrick_agent.py: -------------------------------------------------------------------------------- 1 | from hacktrick_ai_py.agents.agent import Agent, AgentPair 2 | from hacktrick_ai_py.mdp.hacktrick_mdp import HacktrickState, Recipe 3 | from hacktrick_ai_py.mdp.actions import Action 4 | from hacktrick_rl.rllib.rllib import RlLibAgent, load_agent_pair 5 | 6 | 7 | class MainAgent(Agent): 8 | 9 | def __init__(self): 10 | super().__init__() 11 | 12 | def action(self, state): 13 | # Implement your logic here 14 | # You should change your action value to a compatible Action value from the Action class in Hacktric_ai 15 | # You do not need to implement the action_probs but it is basically the probability distribution of actions 16 | action, action_probs = Action.STAY, {} 17 | return action, action_probs 18 | 19 | 20 | class OptionalAgent(Agent): 21 | 22 | def __init__(self): 23 | super().__init__() 24 | 25 | def action(self, state): 26 | # Implement your logic here 27 | action, action_probs = Action.STAY, {} 28 | return action, action_probs 29 | 30 | 31 | class HacktrickAgent(object): 32 | # Enable this flag if you are using reinforcement learning from the included ppo ray support library 33 | RL = False 34 | # Rplace with the directory for the trained agent 35 | # Note that `agent_dir` is the full path to the checkpoint FILE, not the checkpoint directory 36 | agent_dir = '' 37 | # If you do not plan to use the same agent logic for both agents and use the OptionalAgent set it to False 38 | # Does not matter if you are using RL as this is controlled by the RL agent 39 | share_agent_logic = True 40 | 41 | def __init__(self): 42 | Recipe.configure({}) 43 | 44 | if self.RL: 45 | agent_pair = load_agent_pair(self.agent_dir) 46 | self.agent0 = agent_pair.a0 47 | self.agent1 = agent_pair.a1 48 | else: 49 | self.agent0 = MainAgent() 50 | self.agent1 = OptionalAgent() 51 | 52 | def set_mode(self, mode): 53 | self.mode = mode 54 | 55 | if "collaborative" in self.mode: 56 | if self.share_agent_logic and not self.RL: 57 | self.agent1 = MainAgent() 58 | self.agent_pair = AgentPair(self.agent0, self.agent1) 59 | else: 60 | self.agent1 =None 61 | self.agent_pair =None 62 | 63 | def map_action(self, action): 64 | action_map = {(0, 0): 'STAY', (0, -1): 'UP', (0, 1): 'DOWN', (1, 0): 'RIGHT', (-1, 
0): 'LEFT', 'interact': 'SPACE'} 65 | action_str = action_map[action[0]] 66 | return action_str 67 | 68 | def action(self, state_dict): 69 | state = HacktrickState.from_dict(state_dict['state']['state']) 70 | 71 | if "collaborative" in self.mode: 72 | (action0, action1) = self.agent_pair.joint_action(state) 73 | action0 = self.map_action(action0) 74 | action1 = self.map_action(action1) 75 | action = [action0, action1] 76 | else: 77 | action0 = self.agent0.action(state) 78 | action0 = self.map_action(action0) 79 | action = action0 80 | 81 | return action -------------------------------------------------------------------------------- /client.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import socketio 4 | import random 5 | import signal 6 | import sys 7 | from hacktrick_agent import HacktrickAgent 8 | 9 | 10 | sio = socketio.AsyncClient() 11 | 12 | settings = {} 13 | agent = HacktrickAgent() 14 | 15 | @sio.event 16 | async def connect(): 17 | print('connection established') 18 | 19 | 20 | @sio.event 21 | async def start_game(data): 22 | print('start_game received with ', data) 23 | # await sio.emit('my response', {'response': 'my response'}) 24 | 25 | @sio.event 26 | async def end_game(data): 27 | print('end_game received with ', data) 28 | # await sio.emit('my response', {'response': 'my response'}) 29 | 30 | @sio.event 31 | async def state_pong(data): 32 | action = agent.action(data) 33 | 34 | if "collaborative" in settings['mode']: 35 | print("actions", action) 36 | await sio.emit('action_collaborative', {'actions':action,'team_name': settings['team_name']}) 37 | 38 | else: 39 | print("action", action) 40 | await sio.emit('action', {'action': action}) 41 | 42 | score = data['state']['score'] 43 | state = data['state']['state'] 44 | print("score:", score) 45 | 46 | 47 | @sio.event 48 | async def end_game(data): 49 | print('end_game received with ', data) 50 | # await sio.emit('my response', {'response': 'my response'}) 51 | await sio.disconnect() 52 | 53 | @sio.event 54 | async def waiting(data): 55 | print('waiting received with ', data) 56 | 57 | @sio.event 58 | async def creation_failed(data): 59 | print('Failed to create game') 60 | print('Received the following error', data['error']) 61 | 62 | @sio.event 63 | async def reset_game(data): 64 | print('creation_failed received with ', data) 65 | 66 | @sio.event 67 | async def disconnect(): 68 | print('disconnected from server') 69 | 70 | @sio.event 71 | async def authentication_error(data): 72 | print('authentication_error received') 73 | 74 | 75 | async def main(): 76 | await sio.connect('http://ec2-3-14-245-107.us-east-2.compute.amazonaws.com/') ## Change here to aws url 77 | await sio.emit('create', {'mode': settings['mode'],'team_name': settings['team_name'], 'password':settings['password'], 'layout':settings['layout']}) 78 | await sio.wait() 79 | 80 | 81 | async def signal_handler(signal, frame): 82 | print ('You pressed Ctrl+C - or killed me with -2') 83 | #.... Put your logic here ..... 
84 | await sio.disconnect() 85 | sys.exit(0) 86 | 87 | if __name__ == '__main__': 88 | modes = ["single" ,"collaborative"] 89 | layouts = [ 90 | "leaderboard_single", 91 | "leaderboard_collaborative", 92 | "round_of_16_single", 93 | "round_of_16_collaborative", 94 | "quarter_final_single", 95 | "quarter_final_collaborative", 96 | "semi_final_single", 97 | "semi_final_collaborative", 98 | "final_single", 99 | "final_collaborative" 100 | ] 101 | parser = argparse.ArgumentParser() 102 | parser.add_argument('--team_name', type=str, required=True) 103 | parser.add_argument('--password', type=str, required=True) 104 | parser.add_argument('--mode', type=str, required=True) 105 | parser.add_argument('--layout', type=str, required=True) 106 | args = parser.parse_args() 107 | 108 | if args.mode not in modes or \ 109 | args.layout not in layouts: 110 | print("invalid parameters have been entered. Please ensure mode and layout are correct") 111 | sys.exit(0) 112 | settings['team_name'] = args.team_name 113 | settings['password'] = args.password 114 | settings['mode'] = args.mode 115 | settings['layout'] = args.layout 116 | 117 | print(settings) 118 | # signal.signal(signal.SIGINT, signal_handler) 119 | agent.set_mode(settings['mode']) 120 | asyncio.run(main()) 121 | # asyncio.get_event_loop().run_until_complete(main(args.host, args.team_name, args.password)) -------------------------------------------------------------------------------- /hackathon_tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "DCLyD1xhb9X2" 7 | }, 8 | "source": [ 9 | "# How to test and visualise your agents.\n", 10 | "\n", 11 | "---\n", 12 | "\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": { 18 | "id": "XCClqutxV1Xq" 19 | }, 20 | "source": [ 21 | "## Imports\n", 22 | "\n", 23 | "\n", 24 | "\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": { 31 | "id": "jKsFs6UfDWJG" 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "from hacktrick_ai.src.hacktrick_ai_py.agents.benchmarking import AgentEvaluator, LayoutGenerator\n", 36 | "from hacktrick_ai.src.hacktrick_ai_py.visualization.state_visualizer import StateVisualizer\n", 37 | "from hacktrick_ai_py.agents.agent import AgentPair, StayAgent\n", 38 | "from hacktrick_agent import HacktrickAgent" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "id": "IM5MAxKLWEDa" 45 | }, 46 | "source": [ 47 | "## How to Run\n", 48 | "\n", 49 | "1. Set game mode to either single or collaborative.\n", 50 | "2. Set timesteps (We will be evaluating on 1200 timesteps).\n", 51 | "3. Set layout name.\n", 52 | "4. Create a HacktrickAgent instance, it will automatically include your algorith or RL agent if used from the `hacktrick_agent.py` file.\n", 53 | "5. Call run_agent() and pass the required parameters.\n", 54 | "6. run_agent() will return the trajectories of the played game.\n", 55 | "7. 
Call visualize() and pass the trajectories returned from running the agent to graphically view the game.\n", 56 | "\n", 57 | "(basically just run the notebook ;))" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 2, 63 | "metadata": { 64 | "id": "6bJJmpl_EsZU" 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "def run_agent(mode, timesteps, layout_name, hacktrick_agent):\n", 69 | " hacktrick_agent.set_mode(mode)\n", 70 | " if mode == 'collaborative':\n", 71 | " agent0 = hacktrick_agent.agent0\n", 72 | " agent1 = hacktrick_agent.agent1\n", 73 | " agent = AgentPair(agent0, agent1)\n", 74 | " elif mode == 'single':\n", 75 | " agent0 = hacktrick_agent.agent0\n", 76 | " agent1 = StayAgent()\n", 77 | " agent = AgentPair(agent0, agent1)\n", 78 | " mdp_gen_params = {\"layout_name\": layout_name}\n", 79 | " mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(mdp_gen_params)\n", 80 | " env_params = {\"horizon\": timesteps}\n", 81 | " agent_eval = AgentEvaluator(env_params=env_params, mdp_fn=mdp_fn)\n", 82 | " trajectories = agent_eval.evaluate_agent_pair(agent, num_games=1)\n", 83 | " return trajectories" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "id": "WGYCS4fsQgk4" 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "# Parameters to be changed\n", 95 | "\n", 96 | "mode = 'single'\n", 97 | "timesteps = 200\n", 98 | "layout_name = 'leaderboard_single'\n", 99 | "agent = HacktrickAgent()\n", 100 | "trajectories = run_agent(mode, timesteps, layout_name, agent)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 7, 106 | "metadata": { 107 | "id": "CfmIXTEYJc_M" 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "def visualize(trajectories):\n", 112 | " img_dir_path = StateVisualizer().display_rendered_trajectory(trajectories, trajectory_idx=0, ipython_display=True)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "id": "UFaj9yXPVfuN" 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "visualize(trajectories)" 124 | ] 125 | } 126 | ], 127 | "metadata": { 128 | "colab": { 129 | "collapsed_sections": [], 130 | "name": "hackathon-tutorial.ipynb", 131 | "provenance": [] 132 | }, 133 | "kernelspec": { 134 | "display_name": "Python 3", 135 | "name": "python3" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.8.10" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 0 152 | } 153 | -------------------------------------------------------------------------------- /hacktrick_rl/hacktrick_rl/rllib/tests.py: -------------------------------------------------------------------------------- 1 | from hacktrick_rl.rllib.rllib import HacktrickMultiAgent 2 | from hacktrick_rl.rllib.utils import softmax, get_required_arguments, iterable_equal 3 | from math import isclose 4 | import unittest, copy 5 | import numpy as np 6 | 7 | class RllibEnvTest(unittest.TestCase): 8 | 9 | def setUp(self): 10 | self.params = copy.deepcopy(HacktrickMultiAgent.DEFAULT_CONFIG) 11 | self.timesteps = [0, 10, 100, 500, 1000, 1500, 2000, 2500] 12 | 13 | def tearDown(self): 14 | pass 15 | 16 | def _assert_lists_almost_equal(self, first, second, places=7): 17 | for a, b in zip(first, second): 18 | self.assertAlmostEqual(a, b, 
places=places) 19 | 20 | def _test_bc_schedule(self, bc_schedule, expected_bc_factors): 21 | self.params['multi_agent_params']['bc_schedule'] = bc_schedule 22 | env = HacktrickMultiAgent.from_config(self.params) 23 | actual_bc_factors = [] 24 | 25 | for t in self.timesteps: 26 | env.anneal_bc_factor(t) 27 | actual_bc_factors.append(env.bc_factor) 28 | 29 | self._assert_lists_almost_equal(expected_bc_factors, actual_bc_factors) 30 | 31 | def _test_bc_creation_proportion(self, env, factor, trials=10000): 32 | env.bc_factor = factor 33 | tot_bc = 0 34 | for _ in range(trials): 35 | env.reset(regen_mdp=False) 36 | num_bc = sum(map(lambda agent : int(agent.startswith('bc')), env.curr_agents)) 37 | self.assertLessEqual(num_bc, 1) 38 | tot_bc += num_bc 39 | actual_factor = tot_bc / trials 40 | self.assertAlmostEqual(actual_factor, factor, places=1) 41 | 42 | 43 | def test_env_creation(self): 44 | # Valid creation 45 | env = HacktrickMultiAgent.from_config(self.params) 46 | for param, expected in self.params['multi_agent_params'].items(): 47 | self.assertEqual(expected, getattr(env, param)) 48 | 49 | # Invalid bc_schedules 50 | invalid_schedules = [[(-1, 0.0), (1.0, 1e5)], [(0.0, 0.0), (10, 1), (5, 0.5)], [(0, 0), (5, 1), (10, 1.5)]] 51 | for sched in invalid_schedules: 52 | self.params['multi_agent_params']['bc_schedule'] = sched 53 | self.assertRaises(AssertionError, HacktrickMultiAgent.from_config, self.params) 54 | 55 | def test_reward_shaping_annealing(self): 56 | self.params['multi_agent_params']['reward_shaping_factor'] = 1 57 | self.params['multi_agent_params']['reward_shaping_horizon'] = 1e3 58 | 59 | expected_rew_factors = [1, 990/1e3, 900/1e3, 500/1e3, 0.0, 0.0, 0.0, 0.0] 60 | actual_rew_factors = [] 61 | 62 | env = HacktrickMultiAgent.from_config(self.params) 63 | 64 | for t in self.timesteps: 65 | env.anneal_reward_shaping_factor(t) 66 | actual_rew_factors.append(env.reward_shaping_factor) 67 | 68 | self._assert_lists_almost_equal(expected_rew_factors, actual_rew_factors) 69 | 70 | def test_bc_annealing(self): 71 | # Test no annealing 72 | self._test_bc_schedule(HacktrickMultiAgent.self_play_bc_schedule, [0.0]*len(self.timesteps)) 73 | 74 | # Test annealing 75 | anneal_bc_schedule = [(0, 0.0), (1e3, 1.0), (2e3, 0.0)] 76 | expected_bc_factors = [0.0, 10/1e3, 100/1e3, 500/1e3, 1.0, 500/1e3, 0.0, 0.0] 77 | self._test_bc_schedule(anneal_bc_schedule, expected_bc_factors) 78 | 79 | def test_agent_creation(self): 80 | env = HacktrickMultiAgent.from_config(self.params) 81 | obs = env.reset() 82 | 83 | # Check that we have the right number of agents with valid names 84 | self.assertEqual(len(env.curr_agents), 2) 85 | self.assertListEqual(list(obs.keys()), env.curr_agents) 86 | 87 | # Ensure that bc agents are created 'factor' percentage of the time 88 | bc_factors = [0.0, 0.1, 0.5, 0.9, 1.0] 89 | for factor in bc_factors: 90 | self._test_bc_creation_proportion(env, factor) 91 | 92 | 93 | class RllibUtilsTest(unittest.TestCase): 94 | 95 | def setUp(self): 96 | pass 97 | 98 | def tearDown(self): 99 | pass 100 | 101 | def test_softmax(self): 102 | logits = np.array([[0.1, 0.1, 0.1], 103 | [-0.1, 0.0, 0.1], 104 | [0.5, -1.2, 3.2], 105 | [-1.6, -2.0, -1.5]]) 106 | expected = np.array([[0.33333333, 0.33333333, 0.33333333], 107 | [0.30060961, 0.33222499, 0.3671654 ], 108 | [0.06225714, 0.01137335, 0.92636951], 109 | [0.36029662, 0.24151404, 0.39818934]]) 110 | 111 | actual = softmax(logits) 112 | 113 | self.assertTrue(np.allclose(expected, actual)) 114 | 115 | def test_iterable_equal(self): 116 | a = 
[(1,), (1, 2)] 117 | b = ([1], [1, 2]) 118 | 119 | self.assertTrue(iterable_equal(a, b)) 120 | 121 | a = [(1, 2), (1)] 122 | b = [(1,), (1, 2)] 123 | 124 | self.assertFalse(iterable_equal(a, b)) 125 | 126 | def test_get_required_arguments(self): 127 | 128 | def foo1(a): 129 | pass 130 | def foo2(a, b): 131 | pass 132 | def foo3(a, b, c): 133 | pass 134 | def foo4(a, b, c='bar'): 135 | pass 136 | def foo5(a, b='bar', d='baz', **kwargs): 137 | pass 138 | 139 | fns = [foo1, foo2, foo3, foo4, foo5] 140 | expected = [1, 2, 3, 2, 1] 141 | 142 | for fn, expected in zip(fns, expected): 143 | self.assertEqual(expected, len(get_required_arguments(fn))) 144 | 145 | 146 | 147 | if __name__ == '__main__': 148 | unittest.main() -------------------------------------------------------------------------------- /hacktrick_rl/hacktrick_rl/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import git 4 | import shutil 5 | import random 6 | import itertools 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | 11 | def delete_dir_if_exists(dir_path, verbose=False): 12 | if os.path.exists(dir_path): 13 | if verbose: 14 | print("Deleting old dir", dir_path) 15 | shutil.rmtree(dir_path) 16 | 17 | def create_dir_if_not_exists(dir_path): 18 | if not os.path.exists(dir_path): 19 | os.makedirs(dir_path) 20 | 21 | def reset_tf(): 22 | """Clean up tensorflow graph and session. 23 | NOTE: this also resets the tensorflow seed""" 24 | tf.reset_default_graph() 25 | if tf.get_default_session() is not None: 26 | tf.get_default_session().close() 27 | 28 | def num_tf_params(): 29 | """Prints number of trainable parameters defined""" 30 | total_parameters = 0 31 | for variable in tf.trainable_variables(): 32 | # shape is an array of tf.Dimension 33 | shape = variable.get_shape() 34 | variable_parameters = 1 35 | for dim in shape: 36 | variable_parameters *= dim.value 37 | total_parameters += variable_parameters 38 | print(total_parameters) 39 | 40 | def get_current_commit_hash(): 41 | repo = git.Repo(search_parent_directories=True) 42 | return repo.head.object.hexsha 43 | 44 | def get_trailing_number(s): 45 | """ 46 | Get the trailing number from a string, 47 | i.e. 'file123' -> '123' 48 | """ 49 | m = re.search(r'\d+$', s) 50 | return int(m.group()) if m else None 51 | 52 | def get_max_iter(agent_folder): 53 | """Return biggest PBT iteration that has been run""" 54 | saved_iters = [] 55 | for folder_s in os.listdir(agent_folder): 56 | folder_iter = get_trailing_number(folder_s) 57 | if folder_iter is not None: 58 | saved_iters.append(folder_iter) 59 | if len(saved_iters) == 0: 60 | raise ValueError("Agent folder {} seemed to not have any pbt_iter subfolders".format(agent_folder)) 61 | return max(saved_iters) 62 | 63 | def cross_entropy(action_probs, y, eps=1e-4): 64 | """ 65 | X is the output from fully connected layer (num_examples x num_classes) 66 | y is labels (num_examples x 1) 67 | Note that y is not one-hot encoded vector. 68 | It can be computed as y.argmax(axis=1) from one-hot encoded vectors of labels if required. 69 | """ 70 | m = y.shape[0] 71 | # We use multidimensional array indexing to extract 72 | # softmax probability of the correct label for each sample. 73 | probs_for_correct = action_probs[range(m), y] 74 | 75 | # NOTE: eps was added to correct for some actions being deterministically removed from 76 | # the human model when it would get stuck. 
It was chosen empirically as to be about an order of 77 | # magnitude less than the smallest probability assigned to any event by the model 78 | probs_for_correct = np.array([p if p > eps else eps for p in probs_for_correct]).astype(float) 79 | 80 | log_likelihood = -np.log(probs_for_correct) 81 | cross_entropy_loss = np.sum(log_likelihood) / m 82 | return cross_entropy_loss 83 | 84 | def accuracy(action_probs, y): 85 | return np.sum(np.argmax(action_probs, axis=1) == y) / len(y) 86 | 87 | def set_global_seed(seed): 88 | random.seed(seed) 89 | np.random.seed(seed) 90 | tf.random.set_seed(seed) 91 | 92 | def prepare_nested_default_dict_for_pickle(nested_defaultdict): 93 | """Need to make all nested defaultdicts into normal dicts to pickle""" 94 | for k,v in nested_defaultdict.items(): 95 | nested_defaultdict[k] = dict(v) 96 | pickleable_dict = dict(nested_defaultdict) 97 | return pickleable_dict 98 | 99 | def set_style(font_scale=1.6): 100 | import seaborn, matplotlib 101 | seaborn.set(font='serif', font_scale=font_scale) 102 | # Make the background white, and specify the specific font family 103 | seaborn.set_style("white", { 104 | "font.family": "serif", 105 | "font.weight": "normal", 106 | "font.serif": ["Times", "Palatino", "serif"], 107 | 'axes.facecolor': 'white', 108 | 'lines.markeredgewidth': 1}) 109 | matplotlib.rcParams['text.usetex'] = True 110 | matplotlib.rc('font',family='serif', serif=['Palatino']) 111 | 112 | def common_keys_equal(dict_a, dict_b): 113 | common_keys = set(dict_a.keys()).intersection(set(dict_b.keys())) 114 | for k in common_keys: 115 | if dict_a[k] != dict_b[k]: return False 116 | return True 117 | 118 | class Node(object): 119 | def __init__(self, agent_name, params, parent=None): 120 | self.agent_name = agent_name 121 | self.params = params 122 | self.parent = parent 123 | 124 | def get_flattened_keys(dictionary): 125 | if type(dictionary) != dict: 126 | return [] 127 | return list(dictionary.keys()) + list(itertools.chain(*[get_flattened_keys(dictionary[key]) for key in dictionary])) 128 | 129 | def recursive_dict_update(map, key, value): 130 | if type(map) != dict: 131 | return False 132 | if key in map: 133 | map[key] = value 134 | return True 135 | return any([recursive_dict_update(child, key, value) for child in map.values()]) 136 | 137 | def equal_dicts(d1, d2, ignore_keys): 138 | ignored = set(ignore_keys) 139 | for k1, v1 in d1.items(): 140 | if k1 not in ignored and (k1 not in d2 or d2[k1] != v1): 141 | if k1 not in d2: 142 | print("d2 missing", k1) 143 | else: 144 | if k1 == "objects": 145 | print("object difference") 146 | for o1 in d1[k1]: 147 | print(o1) 148 | print("----") 149 | for o2 in d2[k1]: 150 | print(o2) 151 | else: 152 | print("different at ", k1, "one is ", d2[k1], "one is ", v1) 153 | return False 154 | for k2, v2 in d2.items(): 155 | if k2 not in ignored and k2 not in d1: 156 | print("d1 missing", k2) 157 | return False 158 | return True 159 | 160 | def get_dict_stats(d): 161 | new_d = d.copy() 162 | for k, v in d.items(): 163 | new_d[k] = { 164 | 'mean': np.mean(v), 165 | 'standard_error': np.std(v) / np.sqrt(len(v)), 166 | 'max': np.max(v), 167 | 'n': len(v) 168 | } 169 | return new_d -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hacktrick 2022 2 | Welcome to Hacktrick! 
3 | In this hackathon, you will be required to implement agents that navigate through different layouts with 4 | lab components scattered around the layout. 5 | Your agents should be able to build four different types of labs, with each lab having different 6 | requirements and specifications. We will be evaluating your agents based on the number of labs they 7 | build in the allotted time. More in-depth technical details are provided in the following sections. 8 | There will be two different types of agents and gameplay: 9 | 1. Single Mode: Only one agent collecting the components and building the labs. 10 | 2. Collaborative Mode: Two agents working together in the same layout to build the required labs. 11 | 12 | Finally, it is worth noting that there are no constraints on how you implement these agents. We will be 13 | providing you with tips on how to implement a reinforcement learning agent in this environment, but by 14 | no means do we require you to submit an RL-based solution. You are free to implement your solutions 15 | using any method you see fitting (Ex: rule-based agent). 16 | 17 | We will be evaluating on **1200 timesteps**. 18 | 19 | # Contents 20 | - [Hacktrick 2022](#hacktrick-2022) 21 | - [Contents](#contents) 22 | - [Installation](#installation) 23 | - [Python Environment Setup](#python-environment-setup) 24 | - [Reinforcement Learning Setup](#reinforcement-learning-setup) 25 | - [PPO Tests](#ppo-tests) 26 | - [Rllib Tests](#rllib-tests) 27 | - [Repo Structure Overview](#repo-structure-overview) 28 | - [Implementation](#implementation) 29 | - [Agents](#agents) 30 | - [Visualizing Locally](#visualizing-locally) 31 | - [Submission](#submission) 32 | - [Reinforcement Learning Modules Usage](#reinforcement-learning-modules-usage) 33 | 34 | 35 | # Installation 36 | When cloning the repository, make sure you also clone the submodules 37 | ``` 38 | $ git clone --recursive https://github.com/hacktrick-hackathon/hacktrick-hackathon-2022.git 39 | ``` 40 | 41 | ## Python Environment Setup 42 | Create a new python environment (this is optional) using any environment manager you want (we will use venv) and run the install script as before 43 | ```bash 44 | $ python -m venv venv 45 | $ source venv/bin/activate 46 | (venv) $ ./install.sh 47 | ``` 48 | 49 | ## Reinforcement Learning Setup 50 | Install the latest stable version of tensorflow (if you don't have it) compatible with rllib. 51 | Make sure to train using a gpu or use google colab. If you are not planning to use reinforcement learning or other machine learning methods, you do not need this. 52 | ```bash 53 | (venv) $ pip install tensorflow 54 | ``` 55 | 56 | Your virtual environment should now be configured to run the rllib training code. Verify it by running the following command 57 | ```bash 58 | (venv) $ python -c "from ray import rllib" 59 | ``` 60 | Note: if you ever get an import error, please first check if you activated the venv 61 | 62 | ### PPO Tests 63 | ```bash 64 | (venv) $ cd hacktrick_rl/ppo 65 | (venv) hacktrick_rl/ppo $ python ppo_rllib_test.py 66 | ``` 67 | 68 | ### Rllib Tests 69 | Tests rllib environments and models, as well as various utility functions. Does not actually test rllib training 70 | ```bash 71 | (venv) $ cd rllib 72 | (venv) rllib $ python tests.py 73 | ``` 74 | You should see all tests passing. 75 | 76 | 77 | # Repo Structure Overview 78 | `hacktrick_rl` 79 | - `ppo/`: 80 | - `ppo_rllib.py`: Primary module where code for training a PPO agent resides. 
This is where you will implement your model architecture for a PPO agent 81 | - `ppo_rllib_client.py` Driver code for configuring and launching the training of an agent. More details about usage below 82 | - `ppo_rllib_test.py` Reproducibility tests for local sanity checks 83 | - `rllib/`: 84 | - `rllib.py`: rllib agent and training utils that utilize Hacktrick APIs 85 | - `utils.py`: utils for the above 86 | - `tests.py`: preliminary tests for the above 87 | - `utils.py`: utils for the repo 88 | 89 | `hacktrick_ai` 90 | - `mdp/`: 91 | - `hacktrick_mdp.py`: main Hacktrick game logic 92 | - `hacktrick_env.py`: environment classes built on top of the Hacktrick MDP 93 | - `layout_generator.py`: functions to generate random layouts programmatically 94 | 95 | - `agents/`: 96 | - `agent.py`: location of agent classes 97 | - `benchmarking.py`: sample trajectories of agents (both trained and planners) and load various models 98 | 99 | - `planning/`: 100 | - This directory contains some logic that might help you in implementing a rule-based agent. 101 | - You are free to disregard this directory and implement your own functions. 102 | - If you find any functions that make your implementation easier, or even as a guide/starter, feel free to use them. 103 | 104 | 105 | # Implementation 106 | ## Agents 107 | You should not need to modify anything in the `hacktrick_ai` directory, as it contains the environment you will use; your implementation and submission are discussed below. The overview above is only included for completeness. 108 | In `hacktrick_agent.py` you will find two base classes, `MainAgent()` and `OptionalAgent()`. Implement them according to the following cases. 109 | - In single mode, implement only the `MainAgent()` class and make sure your logic is correct for the `action()` method. 110 | - In collaborative mode, implement both classes if you want different agent logic, and set `share_agent_logic` to `False`. 111 | - In collaborative mode, implement `MainAgent()` only if you want to apply the same logic to both agents, and set `share_agent_logic` to `True`. 112 | 113 | 114 | ## Visualizing Locally 115 | Follow the steps in the `hackathon_tutorial.ipynb` notebook. 116 | 117 | Note: 118 | - The `horizon` variable corresponds to the number of timesteps. 119 | - Setting `num_games` to more than one will output the average score of these games. Feel free to adjust this parameter when testing, but we will be evaluating on one game only. 120 | 121 | 122 | ## Submission 123 | - In `hacktrick_agent.py` you will find two base classes, `MainAgent()` and `OptionalAgent()`. Implement your logic in these classes. 124 | - Run this command: `python3 client.py --team_name=TEAM_NAME --password=PASSWORD --mode=MODE --layout=LAYOUT_NAME`. Note that `mode` is either `single` or `collaborative`. 125 | 126 | 127 | # Reinforcement Learning Modules Usage 128 | Before proceeding, it is important to note that there are two primary groups of hyperparameter defaults, `local` and `production`. Which one is selected is controlled by the `RUN_ENV` environment variable, which defaults to `production`. In order to use local hyperparameters, run 129 | ```bash 130 | $ export RUN_ENV=local 131 | ``` 132 | 133 | Your model architecture should go in the `ppo_rllib.py` file. You need to develop a PPO model using the boilerplate code provided there, which gives you an idea of the model's inputs and outputs. You do not need to worry about the training loop, as it is handled by the Ray library in the background; a minimal sketch of one possible architecture is shown below.
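To make the boilerplate concrete, here is a minimal sketch of one way the dummy `Flatten` layer in `RllibPPOModel.__init__` could be replaced with a small convolutional torso driven by the custom params (`NUM_CONV_LAYERS`, `NUM_FILTERS`, `NUM_HIDDEN_LAYERS`, `SIZE_HIDDEN_LAYERS`). This is not part of the starter code; the helper name `build_torso`, the 3x3 kernel size, and the ReLU activations are illustrative assumptions.

```python
import tensorflow as tf

def build_torso(inputs, num_convs, num_filters, num_hidden_layers, size_hidden_layers):
    """Convolutional torso over the (h x w x channels) grid observation, followed by dense layers."""
    out = inputs
    for i in range(num_convs):
        # Same-padded 3x3 convolutions keep the grid shape between layers
        out = tf.keras.layers.Conv2D(filters=num_filters, kernel_size=3, padding="same",
                                     activation="relu", name="conv_%d" % i)(out)
    out = tf.keras.layers.Flatten()(out)
    for i in range(num_hidden_layers):
        out = tf.keras.layers.Dense(size_hidden_layers, activation="relu", name="fc_%d" % i)(out)
    return out
```

Inside `RllibPPOModel.__init__` you could then call `out = build_torso(out, num_convs, num_filters, num_hidden_layers, size_hidden_layers)` in place of the dummy `Flatten` layer and keep the existing `layer_out`/`value_out` heads unchanged.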
Your only concern should be the model architecture; if you need to change the reward function, check the `get_dense_reward()` method in `rllib/rllib.py`. 134 | Training of agents is done through the `ppo_rllib_client.py` script. It has the following usage: 135 | ```bash 136 | ppo_rllib_client.py [with [<param>=<value>] ... ] 137 | ``` 138 | 139 | For example, the following snippet trains a self-play PPO agent on seeds 1, 2, and 3, with learning rate `1e-3`, on the `"cramped_room"` layout for `5` iterations without using any GPUs. The rest of the parameters are left at their defaults. 140 | ``` 141 | (venv) ppo $ python ppo_rllib_client.py with seeds="[1, 2, 3]" lr=1e-3 layout_name=cramped_room num_training_iters=5 num_gpus=0 experiment_name="my_agent" 142 | ``` 143 | For a complete list of all hyperparameters as well as their local and production defaults, refer to the `my_config` section of `ppo_rllib_client.py`. 144 | 145 | 146 | Training results and checkpoints are stored in a directory called `~/ray_results/my_agent__`. You can visualize the results using TensorBoard: 147 | ```bash 148 | (venv) $ cd ~/ray_results 149 | (venv) ray_results $ tensorboard --logdir . 150 | ``` 151 | The last command assumes you have TensorBoard installed in a GUI-enabled Linux environment. If you are using WSL or Colab, adapt how you launch TensorBoard accordingly. 152 | -------------------------------------------------------------------------------- /hacktrick_rl/hacktrick_rl/ppo/ppo_rllib_client.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore", category=DeprecationWarning) 3 | #!/usr/bin/env python -W ignore::DeprecationWarning 4 | 5 | # All imports except rllib 6 | import argparse, os, sys 7 | from hacktrick_ai_py.agents.benchmarking import AgentEvaluator 8 | import numpy as np 9 | 10 | # environment variable that tells us whether this code is running on the server or not 11 | LOCAL_TESTING = os.getenv('RUN_ENV', 'production') == 'local' 12 | 13 | # Sacred setup (must be before rllib imports) 14 | from sacred import Experiment 15 | ex = Experiment("PPO RLLib") 16 | 17 | # Necessary work-around to make sacred pickling compatible with rllib 18 | from sacred import SETTINGS 19 | SETTINGS.CONFIG.READ_ONLY_CONFIG = False 20 | 21 | # Slack notification configuration 22 | from sacred.observers import SlackObserver 23 | if os.path.exists('slack.json') and not LOCAL_TESTING: 24 | slack_obs = SlackObserver.from_config('slack.json') 25 | ex.observers.append(slack_obs) 26 | 27 | # Necessary for capturing stdout in multiprocessing setting 28 | SETTINGS.CAPTURE_MODE = 'sys' 29 | 30 | # rllib and rllib-dependent imports 31 | # Note: tensorflow and tensorflow dependent imports must also come after rllib imports 32 | # This is because rllib disables eager execution.
Otherwise, it must be manually disabled 33 | import ray 34 | from ray.tune.result import DEFAULT_RESULTS_DIR 35 | from ray.tune.registry import register_env 36 | from ray.rllib.models import ModelCatalog 37 | from ray.rllib.agents.ppo.ppo import PPOTrainer 38 | from hacktrick_rl.ppo.ppo_rllib import RllibPPOModel 39 | from hacktrick_rl.rllib.rllib import HacktrickMultiAgent, save_trainer, gen_trainer_from_params 40 | 41 | 42 | ###################### Temp Documentation ####################### 43 | # run the following command in order to train a PPO self-play # 44 | # agent with the static parameters listed in my_config # 45 | # # 46 | # python ppo_rllib_client.py # 47 | # # 48 | # In order to view the results of training, run the following # 49 | # command # 50 | # # 51 | # tensorboard --log-dir ~/ray_results/ # 52 | # # 53 | ################################################################# 54 | 55 | # Dummy wrapper to pass rllib type checks 56 | def _env_creator(env_config): 57 | # Re-import required here to work with serialization 58 | from hacktrick_rl.rllib.rllib import HacktrickMultiAgent 59 | return HacktrickMultiAgent.from_config(env_config) 60 | 61 | @ex.config 62 | def my_config(): 63 | ### Model params ### 64 | 65 | # Whether the model is for single or collaborative training 66 | mode = 'single' 67 | 68 | # Whether dense reward should come from potential function or not 69 | use_phi = True 70 | 71 | # Base model params 72 | NUM_HIDDEN_LAYERS = 3 73 | SIZE_HIDDEN_LAYERS = 64 74 | NUM_FILTERS = 25 75 | NUM_CONV_LAYERS = 3 76 | 77 | # whether to use D2RL https://arxiv.org/pdf/2010.09163.pdf (concatenation the result of last conv layer to each hidden layer); 78 | D2RL = False 79 | ### Training Params ### 80 | 81 | num_workers = 1 if not LOCAL_TESTING else 1 82 | 83 | # list of all random seeds to use for experiments, used to reproduce results 84 | seeds = [0] 85 | 86 | # Placeholder for random for current trial 87 | seed = 2229 88 | 89 | # Number of gpus the central driver should use 90 | num_gpus = 0 if LOCAL_TESTING else 1 91 | 92 | # How many environment timesteps will be simulated (across all environments) 93 | # for one set of gradient updates. Is divided equally across environments 94 | train_batch_size = 12000 if not LOCAL_TESTING else 800 95 | 96 | # size of minibatches we divide up each batch into before 97 | # performing gradient steps 98 | sgd_minibatch_size = 2000 if not LOCAL_TESTING else 800 99 | 100 | # Rollout length 101 | rollout_fragment_length = 400 102 | 103 | # Whether all PPO agents should share the same policy network 104 | shared_policy = True 105 | 106 | # Number of training iterations to run 107 | num_training_iters = 500 if not LOCAL_TESTING else 2 108 | 109 | # Stepsize of SGD. 110 | lr = 5e-4 111 | 112 | # Learning rate schedule. 
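# (Illustrative note, not from the original config: RLlib's lr_schedule is documented as a list of
# [timestep, lr] pairs that are linearly interpolated, e.g. lr_schedule = [[0, 5e-4], [2000000, 1e-4]]
# to decay from 5e-4 to 1e-4 over 2M timesteps -- verify against your installed Ray version.)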
113 | lr_schedule = None 114 | 115 | # If specified, clip the global norm of gradients by this amount 116 | grad_clip = 0.1 117 | 118 | # Discount factor 119 | gamma = 0.99 120 | 121 | # Exponential decay factor for GAE (how much weight to put on monte carlo samples) 122 | # Reference: https://arxiv.org/pdf/1506.02438.pdf 123 | lmbda = 0.98 124 | 125 | # Whether the value function shares layers with the policy model 126 | vf_share_layers = True 127 | 128 | # How much the loss of the value network is weighted in overall loss 129 | vf_loss_coeff = 1e-4 130 | 131 | # Entropy bonus coefficient, will anneal linearly from _start to _end over _horizon steps 132 | entropy_coeff_start = 0.2 133 | entropy_coeff_end = 1e-2 134 | entropy_coeff_horizon = 3e6 135 | 136 | # Initial coefficient for KL divergence. 137 | kl_coeff = 0.2 138 | 139 | # PPO clipping factor 140 | clip_param = 0.05 141 | 142 | # Number of SGD iterations in each outer loop (i.e., number of epochs to 143 | # execute per train batch). 144 | num_sgd_iter = 8 if not LOCAL_TESTING else 1 145 | 146 | # How many trainind iterations (calls to trainer.train()) to run before saving model checkpoint 147 | save_freq = 25 148 | 149 | # How many training iterations to run between each evaluation 150 | evaluation_interval = 50 if not LOCAL_TESTING else 1 151 | 152 | # How many timesteps should be in an evaluation episode 153 | evaluation_ep_length = 400 154 | 155 | # Number of games to simulation each evaluation 156 | evaluation_num_games = 1 157 | 158 | # Whether to display rollouts in evaluation 159 | evaluation_display = False 160 | 161 | # Where to log the ray dashboard stats 162 | temp_dir = os.path.join(os.path.abspath(os.sep), "tmp", "ray_tmp") 163 | 164 | # Where to store model checkpoints and training stats 165 | results_dir = DEFAULT_RESULTS_DIR 166 | 167 | # Whether tensorflow should execute eagerly or not 168 | eager = False 169 | 170 | # Whether to log training progress and debugging info 171 | verbose = True 172 | 173 | 174 | ### BC Params ### Kept only for backward compatability 175 | # path to pickled policy model for behavior cloning 176 | bc_model_dir = None 177 | 178 | # Whether bc agents should return action logit argmax or sample 179 | bc_stochastic = True 180 | 181 | 182 | 183 | ### Environment Params ### 184 | # Which hacktrick level to use 185 | layout_name = "cramped_room" 186 | 187 | # all_layout_names = '_'.join(layout_names) 188 | 189 | # Name of directory to store training results in (stored in ~/ray_results/) 190 | 191 | params_str = str(use_phi) + "_nw=%d_vf=%f_es=%f_en=%f_kl=%f" % ( 192 | num_workers, 193 | vf_loss_coeff, 194 | entropy_coeff_start, 195 | entropy_coeff_end, 196 | kl_coeff 197 | ) 198 | 199 | experiment_name = "{0}_{1}_{2}".format("PPO", layout_name, params_str) 200 | 201 | # Rewards the agent will receive for intermediate actions 202 | rew_shaping_params = { 203 | "PLACEMENT_IN_CONSTRUCTION_SITE_REW": 3, 204 | "CONTAINER_PICKUP_REWARD": 3, 205 | "SOLARLAB_PICKUP_REWARD": 5, 206 | "CONTAINER_DISP_DISTANCE_REW": 0, 207 | "CONSTRUCTION_SITE_DISTANCE_REW": 0, 208 | "SOLARLAB_DISTANCE_REW": 0 209 | } 210 | 211 | # Max episode length 212 | horizon = 400 213 | 214 | # Constant by which shaped rewards are multiplied by when calculating total reward 215 | reward_shaping_factor = 1.0 216 | 217 | # Linearly anneal the reward shaping factor such that it reaches zero after this number of timesteps 218 | reward_shaping_horizon = 2.5e6 219 | 220 | # Kept only for backward compatability 221 | bc_schedule = 
HacktrickMultiAgent.self_play_bc_schedule 222 | 223 | 224 | # To be passed into rl-lib model/custom_options config 225 | model_params = { 226 | "NUM_HIDDEN_LAYERS" : NUM_HIDDEN_LAYERS, 227 | "SIZE_HIDDEN_LAYERS" : SIZE_HIDDEN_LAYERS, 228 | "NUM_FILTERS" : NUM_FILTERS, 229 | "NUM_CONV_LAYERS" : NUM_CONV_LAYERS, 230 | "D2RL": D2RL 231 | } 232 | 233 | # to be passed into the rllib.PPOTrainer class 234 | training_params = { 235 | "num_workers" : num_workers, 236 | "train_batch_size" : train_batch_size, 237 | "sgd_minibatch_size" : sgd_minibatch_size, 238 | "rollout_fragment_length" : rollout_fragment_length, 239 | "num_sgd_iter" : num_sgd_iter, 240 | "lr" : lr, 241 | "lr_schedule" : lr_schedule, 242 | "grad_clip" : grad_clip, 243 | "gamma" : gamma, 244 | "lambda" : lmbda, 245 | "vf_share_layers" : vf_share_layers, 246 | "vf_loss_coeff" : vf_loss_coeff, 247 | "kl_coeff" : kl_coeff, 248 | "clip_param" : clip_param, 249 | "num_gpus" : num_gpus, 250 | "seed" : seed, 251 | "evaluation_interval" : evaluation_interval, 252 | "entropy_coeff_schedule" : [(0, entropy_coeff_start), (entropy_coeff_horizon, entropy_coeff_end)], 253 | "eager" : eager, 254 | "log_level" : "WARN" if verbose else "ERROR" 255 | } 256 | 257 | # To be passed into AgentEvaluator constructor and _evaluate function 258 | evaluation_params = { 259 | "ep_length" : evaluation_ep_length, 260 | "num_games" : evaluation_num_games, 261 | "display" : evaluation_display, 262 | "mode" : mode 263 | } 264 | 265 | 266 | environment_params = { 267 | # To be passed into HacktrickGridWorld constructor 268 | 269 | "mdp_params" : { 270 | "layout_name": layout_name, 271 | "rew_shaping_params": rew_shaping_params 272 | }, 273 | # To be passed into HacktrickEnv constructor 274 | "env_params" : { 275 | "horizon" : horizon 276 | }, 277 | 278 | # To be passed into HacktrickMultiAgent constructor 279 | "multi_agent_params" : { 280 | "reward_shaping_factor" : reward_shaping_factor, 281 | "reward_shaping_horizon" : reward_shaping_horizon, 282 | "use_phi" : use_phi, 283 | "bc_schedule" : bc_schedule, 284 | "mode" : mode 285 | } 286 | } 287 | 288 | bc_params = { 289 | "bc_policy_cls" : None, #// BehaviorCloningPolicy, 290 | "bc_config" : { 291 | "model_dir" : bc_model_dir, 292 | "stochastic" : bc_stochastic, 293 | "eager" : eager 294 | } 295 | } 296 | 297 | ray_params = { 298 | "custom_model_id" : "MyPPOModel", 299 | "custom_model_cls" : RllibPPOModel, 300 | "temp_dir" : temp_dir, 301 | "env_creator" : _env_creator 302 | } 303 | 304 | params = { 305 | "model_params" : model_params, 306 | "training_params" : training_params, 307 | "environment_params" : environment_params, 308 | "bc_params" : bc_params, 309 | "shared_policy" : shared_policy, 310 | "num_training_iters" : num_training_iters, 311 | "evaluation_params" : evaluation_params, 312 | "experiment_name" : experiment_name, 313 | "save_every" : save_freq, 314 | "seeds" : seeds, 315 | "results_dir" : results_dir, 316 | "ray_params" : ray_params, 317 | "verbose" : verbose 318 | } 319 | 320 | 321 | def run(params): 322 | # Retrieve the tune.Trainable object that is used for the experiment 323 | trainer = gen_trainer_from_params(params) 324 | 325 | # Object to store training results in 326 | result = {} 327 | 328 | # Training loop 329 | for i in range(params['num_training_iters']): 330 | if params['verbose']: 331 | print("Starting training iteration", i) 332 | result = trainer.train() 333 | 334 | if i % params['save_every'] == 0: 335 | save_path = save_trainer(trainer, params) 336 | if params['verbose']: 337 | 
print("saved trainer at", save_path) 338 | 339 | # Save the state of the experiment at end 340 | save_path = save_trainer(trainer, params) 341 | if params['verbose']: 342 | print("saved trainer at", save_path) 343 | 344 | return result 345 | 346 | 347 | @ex.automain 348 | def main(params): 349 | # List of each random seed to run 350 | seeds = params['seeds'] 351 | del params['seeds'] 352 | 353 | # List to store results dicts (to be passed to sacred slack observer) 354 | results = [] 355 | 356 | # Train an agent to completion for each random seed specified 357 | for seed in seeds: 358 | # Override the seed 359 | params['training_params']['seed'] = seed 360 | 361 | # Do the thing 362 | result = run(params) 363 | results.append(result) 364 | 365 | # Return value gets sent to our slack observer for notification 366 | average_sparse_reward = np.mean([res['custom_metrics']['sparse_reward_mean'] for res in results]) 367 | average_episode_reward = np.mean([res['episode_reward_mean'] for res in results]) 368 | return { "average_sparse_reward" : average_sparse_reward, "average_total_reward" : average_episode_reward } -------------------------------------------------------------------------------- /hacktrick_rl/hacktrick_rl/rllib/rllib.py: -------------------------------------------------------------------------------- 1 | from hacktrick_ai_py.mdp.actions import Action 2 | from hacktrick_ai_py.mdp.hacktrick_env import HacktrickEnv 3 | from hacktrick_ai_py.mdp.hacktrick_mdp import HacktrickGridworld, EVENT_TYPES 4 | from hacktrick_ai_py.agents.benchmarking import AgentEvaluator 5 | from hacktrick_ai_py.agents.agent import Agent, AgentPair, StayAgent 6 | from ray.tune.registry import register_env 7 | from ray.tune.logger import UnifiedLogger 8 | from ray.tune.result import DEFAULT_RESULTS_DIR 9 | from ray.rllib.env.multi_agent_env import MultiAgentEnv 10 | from ray.rllib.agents.callbacks import DefaultCallbacks 11 | from ray.rllib.agents.ppo.ppo import PPOTrainer 12 | from ray.rllib.models import ModelCatalog 13 | from hacktrick_rl.rllib.utils import softmax, get_base_ae, get_required_arguments, iterable_equal 14 | from datetime import datetime 15 | import tempfile 16 | import gym 17 | import numpy as np 18 | import os, copy, dill 19 | import ray 20 | import logging 21 | 22 | action_space = gym.spaces.Discrete(len(Action.ALL_ACTIONS)) 23 | obs_space = gym.spaces.Discrete(len(Action.ALL_ACTIONS)) 24 | timestr = datetime.today().strftime("%Y-%m-%d_%H-%M-%S") 25 | 26 | 27 | class RlLibAgent(Agent): 28 | """ 29 | Class for wrapping a trained RLLib Policy object into an Hacktrick compatible Agent 30 | """ 31 | def __init__(self, policy, agent_index, featurize_fn): 32 | self.policy = policy 33 | self.agent_index = agent_index 34 | self.featurize = featurize_fn 35 | 36 | def reset(self): 37 | # Get initial rnn states and add batch dimension to each 38 | if hasattr(self.policy.model, 'get_initial_state'): 39 | self.rnn_state = [np.expand_dims(state, axis=0) for state in self.policy.model.get_initial_state()] 40 | elif hasattr(self.policy, "get_initial_state"): 41 | self.rnn_state = [np.expand_dims(state, axis=0) for state in self.policy.get_initial_state()] 42 | else: 43 | self.rnn_state = [] 44 | 45 | def action_probabilities(self, state): 46 | """ 47 | Arguments: 48 | - state (Hacktrick_mdp.HacktrickState) object encoding the global view of the environment 49 | returns: 50 | - Normalized action probabilities determined by self.policy 51 | """ 52 | # Preprocess the environment state 53 | obs = 
self.featurize(state, debug=False) 54 | my_obs = obs[self.agent_index] 55 | 56 | # Compute non-normalized log probabilities from the underlying model 57 | logits = self.policy.compute_actions(np.array([my_obs]), self.rnn_state)[2]['action_dist_inputs'] 58 | 59 | # Softmax in numpy to convert logits to normalized probabilities 60 | return softmax(logits) 61 | 62 | def action(self, state): 63 | """ 64 | Arguments: 65 | - state (Hacktrick_mdp.HacktrickState) object encoding the global view of the environment 66 | returns: 67 | - the argmax action for a single observation state 68 | - action_info (dict) that stores action probabilities under 'action_probs' key 69 | """ 70 | # Preprocess the environment state 71 | obs = self.featurize(state) 72 | my_obs = obs[self.agent_index] 73 | 74 | # Use Rllib.Policy class to compute action argmax and action probabilities 75 | [action_idx], rnn_state, info = self.policy.compute_actions(np.array([my_obs]), self.rnn_state) 76 | agent_action = Action.INDEX_TO_ACTION[action_idx] 77 | 78 | # Softmax in numpy to convert logits to normalized probabilities 79 | logits = info['action_dist_inputs'] 80 | action_probabilities = softmax(logits) 81 | 82 | agent_action_info = {'action_probs' : action_probabilities} 83 | self.rnn_state = rnn_state 84 | 85 | return agent_action, agent_action_info 86 | 87 | 88 | class HacktrickMultiAgent(MultiAgentEnv): 89 | """ 90 | Class used to wrap HacktrickEnv in an Rllib compatible multi-agent environment 91 | """ 92 | 93 | # List of all agent types currently supported 94 | supported_agents = ['ppo', 'bc'] 95 | 96 | # Default bc_schedule, includes no bc agent at any time 97 | bc_schedule = self_play_bc_schedule = [(0, 0), (float('inf'), 0)] 98 | 99 | # Default environment params used for creation 100 | DEFAULT_CONFIG = { 101 | # To be passed into HacktrickGridWorld constructor 102 | "mdp_params" : { 103 | "layout_name" : "cramped_room", 104 | "rew_shaping_params" : {} 105 | }, 106 | # To be passed into HacktrickEnv constructor 107 | "env_params" : { 108 | "horizon" : 400 109 | }, 110 | # To be passed into HacktrickMultiAgent constructor 111 | "multi_agent_params" : { 112 | "reward_shaping_factor" : 0.0, 113 | "reward_shaping_horizon" : 0, 114 | "bc_schedule" : self_play_bc_schedule, 115 | "use_phi" : True, 116 | "mode" : 'single' 117 | } 118 | } 119 | 120 | def __init__(self, base_env, reward_shaping_factor=0.0, reward_shaping_horizon=0, 121 | bc_schedule=None, use_phi=True, mode='single'): 122 | """ 123 | base_env: HacktrickEnv 124 | reward_shaping_factor (float): Coefficient multiplied by dense reward before adding to sparse reward to determine shaped reward 125 | reward_shaping_horizon (int): Timestep by which the reward_shaping_factor reaches zero through linear annealing 126 | bc_schedule (list[tuple]): List of (t_i, v_i) pairs where v_i represents the value of bc_factor at timestep t_i 127 | with linear interpolation in between the t_i 128 | use_phi (bool): Whether to use 'shaped_r_by_agent' or 'phi_s_prime' - 'phi_s' to determine dense reward 129 | """ 130 | if bc_schedule: 131 | self.bc_schedule = bc_schedule 132 | self._validate_schedule(self.bc_schedule) 133 | self.base_env = base_env 134 | # since we are not passing featurize_fn in as an argument, we create it here and check its validity 135 | self.featurize_fn_map = { 136 | "ppo": lambda state: self.base_env.lossless_state_encoding_mdp(state), 137 | "bc": lambda state: self.base_env.featurize_state_mdp(state) 138 | } 139 | 
self._validate_featurize_fns(self.featurize_fn_map) 140 | self._initial_reward_shaping_factor = reward_shaping_factor 141 | self.reward_shaping_factor = reward_shaping_factor 142 | self.reward_shaping_horizon = reward_shaping_horizon 143 | self.use_phi = use_phi 144 | self.mode = mode 145 | self._setup_observation_space() 146 | self.action_space = gym.spaces.Discrete(len(Action.ALL_ACTIONS)) 147 | self.anneal_bc_factor(0) 148 | self.reset() 149 | 150 | def _validate_featurize_fns(self, mapping): 151 | assert 'ppo' in mapping, "At least one ppo agent must be specified" 152 | for k, v in mapping.items(): 153 | assert k in self.supported_agents, "Unsupported agent type in featurize mapping {0}".format(k) 154 | assert callable(v), "Featurize_fn values must be functions" 155 | assert len(get_required_arguments(v)) == 1, "Featurize_fn value must accept exactly one argument" 156 | 157 | def _validate_schedule(self, schedule): 158 | timesteps = [p[0] for p in schedule] 159 | values = [p[1] for p in schedule] 160 | 161 | assert len(schedule) >= 2, "Need at least 2 points to linearly interpolate schedule" 162 | assert schedule[0][0] == 0, "Schedule must start at timestep 0" 163 | assert all([t >=0 for t in timesteps]), "All timesteps in schedule must be non-negative" 164 | assert all([v >=0 and v <= 1 for v in values]), "All values in schedule must be between 0 and 1" 165 | assert sorted(timesteps) == timesteps, "Timesteps must be in increasing order in schedule" 166 | 167 | # To ensure we flatline after passing last timestep 168 | if (schedule[-1][0] < float('inf')): 169 | schedule.append((float('inf'), schedule[-1][1])) 170 | 171 | def _setup_observation_space(self): 172 | dummy_state = self.base_env.mdp.get_standard_start_state() 173 | 174 | # ppo observation 175 | featurize_fn_ppo = lambda state: self.base_env.lossless_state_encoding_mdp(state) 176 | obs_shape = featurize_fn_ppo(dummy_state)[0].shape 177 | high = np.ones(obs_shape) * float("inf") 178 | low = np.ones(obs_shape) * 0 179 | self.ppo_observation_space = gym.spaces.Box(np.float32(low), np.float32(high), dtype=np.float32) 180 | 181 | # bc observation 182 | # featurize_fn_bc = lambda state: self.base_env.featurize_state_mdp(state) 183 | # obs_shape = featurize_fn_bc(dummy_state)[0].shape 184 | # high = np.ones(obs_shape) * 100 185 | # low = np.ones(obs_shape) * -100 186 | # self.bc_observation_space = gym.spaces.Box(np.float32(low), np.float32(high), dtype=np.float32) 187 | 188 | def _get_featurize_fn(self, agent_id): 189 | if agent_id.startswith('ppo'): 190 | return lambda state: self.base_env.lossless_state_encoding_mdp(state) 191 | if agent_id.startswith('bc'): 192 | return lambda state: self.base_env.featurize_state_mdp(state) 193 | raise ValueError("Unsupported agent type {0}".format(agent_id)) 194 | 195 | def _get_obs(self, state): 196 | ob_p0 = self._get_featurize_fn(self.curr_agents[0])(state)[0] 197 | ob_p1 = self._get_featurize_fn(self.curr_agents[1])(state)[1] 198 | return ob_p0.astype(np.float32), ob_p1.astype(np.float32) 199 | 200 | def _populate_agents(self): 201 | # Always include at least one ppo agent (i.e. 
bc_sp not supported for simplicity) 202 | agents = ['ppo'] 203 | 204 | # Coin flip to determine whether other agent should be ppo or bc 205 | other_agent = 'bc' if np.random.uniform() < self.bc_factor else 'ppo' 206 | agents.append(other_agent) 207 | 208 | # Randomize starting indices 209 | np.random.shuffle(agents) 210 | 211 | # Ensure agent names are unique 212 | agents[0] = agents[0] + '_0' 213 | agents[1] = agents[1] + '_1' 214 | 215 | return agents 216 | 217 | def _anneal(self, start_v, curr_t, end_t, end_v=0, start_t=0): 218 | if end_t == 0: 219 | # No annealing if horizon is zero 220 | return start_v 221 | else: 222 | off_t = curr_t - start_t 223 | # Calculate the new value based on linear annealing formula 224 | fraction = max(1 - float(off_t) / (end_t - start_t), 0) 225 | return fraction * start_v + (1 - fraction) * end_v 226 | 227 | 228 | # Hacktrick Participant Critical Message: ################################ 229 | def get_dense_reward(self, info, next_state): 230 | # To activate the calculation of the custom reward, change this to True 231 | use_custom_reward = False 232 | 233 | if use_custom_reward: 234 | # Implement your reward logic here 235 | # next_state has the next state object of type HacktrickState 236 | # to get current state use self.base_env.state 237 | pass 238 | 239 | elif self.use_phi: 240 | potential = info['phi_s_prime'] - info['phi_s'] 241 | dense_reward = (potential, potential) 242 | else: 243 | dense_reward = info["shaped_r_by_agent"] 244 | 245 | return dense_reward 246 | 247 | # Hacktrick Participant Critical Message: ################################ 248 | def step(self, action_dict): 249 | """ 250 | action: 251 | (agent with index self.agent_idx action, other agent action) 252 | is a tuple with the joint action of the primary and secondary agents in index format 253 | 254 | returns: 255 | observation: formatted to be standard input for self.agent_idx's policy 256 | """ 257 | if self.mode == 'single': 258 | action = [action_dict[self.curr_agents[0]]] 259 | assert all(self.action_space.contains(a) for a in action), "%r (%s) invalid"%(action, type(action)) 260 | joint_action = [Action.INDEX_TO_ACTION[action[0]], Action.STAY] 261 | # take a step in the current base environment 262 | elif self.mode == 'collaborative': 263 | action = [action_dict[self.curr_agents[0]], action_dict[self.curr_agents[1]]] 264 | assert all(self.action_space.contains(a) for a in action), "%r (%s) invalid"%(action, type(action)) 265 | joint_action = [Action.INDEX_TO_ACTION[a] for a in action] 266 | # take a step in the current base environment 267 | else: 268 | raise ValueError('mode has to be either single or collaborative') 269 | 270 | next_state, sparse_reward, done, info = self.base_env.step(joint_action, display_phi=self.use_phi) 271 | dense_reward = self.get_dense_reward(info, next_state) 272 | 273 | ob_p0, ob_p1 = self._get_obs(next_state) 274 | 275 | """ 276 | shaped_reward is the total reward calculated from your score (sparse_reward) plus the dense_reward. 277 | dense_reward is weighted by an annealing factor that decreases over time to let your agent learn 278 | from its raw score only as time progresses. 279 | You can use the provided reward function or implement your own in the get_dense_reward function above. 
280 | If you choose to do so, you can remove or change the annealing factor by changing self.reward_shaping_factor 281 | """ 282 | shaped_reward_p0 = sparse_reward + self.reward_shaping_factor * dense_reward[0] 283 | shaped_reward_p1 = sparse_reward + self.reward_shaping_factor * dense_reward[1] 284 | 285 | obs = { self.curr_agents[0]: ob_p0, self.curr_agents[1]: ob_p1 } 286 | rewards = { self.curr_agents[0]: shaped_reward_p0, self.curr_agents[1]: shaped_reward_p1 } 287 | dones = { self.curr_agents[0]: done, self.curr_agents[1]: done, "__all__": done } 288 | infos = { self.curr_agents[0]: info, self.curr_agents[1]: info } 289 | return obs, rewards, dones, infos 290 | 291 | def reset(self, regen_mdp=True): 292 | """ 293 | When training on individual maps, we want to randomize which agent is assigned to which 294 | starting location, in order to make sure that the agents are trained to be able to 295 | complete the task starting at either of the hardcoded positions. 296 | 297 | NOTE: a nicer way to do this would be to just randomize starting positions, and not 298 | have to deal with randomizing indices. 299 | """ 300 | self.base_env.reset(regen_mdp) 301 | self.curr_agents = self._populate_agents() 302 | ob_p0, ob_p1 = self._get_obs(self.base_env.state) 303 | return {self.curr_agents[0]: ob_p0, self.curr_agents[1]: ob_p1} 304 | 305 | def anneal_reward_shaping_factor(self, timesteps): 306 | """ 307 | Set the current reward shaping factor such that we anneal linearly until self.reward_shaping_horizon 308 | timesteps, given that we are currently at timestep "timesteps" 309 | """ 310 | new_factor = self._anneal(self._initial_reward_shaping_factor, timesteps, self.reward_shaping_horizon) 311 | self.set_reward_shaping_factor(new_factor) 312 | 313 | def anneal_bc_factor(self, timesteps): 314 | """ 315 | Set the current bc factor such that we anneal linearly until self.bc_factor_horizon 316 | timesteps, given that we are currently at timestep "timesteps" 317 | """ 318 | p_0 = self.bc_schedule[0] 319 | p_1 = self.bc_schedule[1] 320 | i = 2 321 | while timesteps > p_1[0] and i < len(self.bc_schedule): 322 | p_0 = p_1 323 | p_1 = self.bc_schedule[i] 324 | i += 1 325 | start_t, start_v = p_0 326 | end_t, end_v = p_1 327 | new_factor = self._anneal(start_v, timesteps, end_t, end_v, start_t) 328 | self.set_bc_factor(new_factor) 329 | 330 | def set_reward_shaping_factor(self, factor): 331 | self.reward_shaping_factor = factor 332 | 333 | def set_bc_factor(self, factor): 334 | self.bc_factor = factor 335 | 336 | def seed(self, seed): 337 | """ 338 | set global random seed to make environment deterministic 339 | """ 340 | # Our environment is already deterministic 341 | pass 342 | 343 | @classmethod 344 | def from_config(cls, env_config): 345 | """ 346 | Factory method for generating environments in line with rllib guidelines 347 | 348 | env_config (dict): Must contain keys 'mdp_params', 'env_params' and 'multi_agent_params', the last of which 349 | gets fed into the HacktrickMultiAgent constructor 350 | 351 | Returns: 352 | HacktrickMultiAgent instance specified by env_config params 353 | """ 354 | assert env_config and "env_params" in env_config and "multi_agent_params" in env_config 355 | assert "mdp_params" in env_config or "mdp_params_schedule_fn" in env_config, \ 356 | "either a fixed set of mdp params or a schedule function needs to be given" 357 | # "layout_name" and "rew_shaping_params" 358 | if "mdp_params" in env_config: 359 | mdp_params = env_config["mdp_params"] 360 | outer_shape = None 361 | 
mdp_params_schedule_fn = None 362 | elif "mdp_params_schedule_fn" in env_config: 363 | mdp_params = None 364 | outer_shape = env_config["outer_shape"] 365 | mdp_params_schedule_fn = env_config["mdp_params_schedule_fn"] 366 | 367 | # "start_state_fn" and "horizon" 368 | env_params = env_config["env_params"] 369 | # "reward_shaping_factor" 370 | multi_agent_params = env_config["multi_agent_params"] 371 | base_ae = get_base_ae(mdp_params, env_params, outer_shape, mdp_params_schedule_fn) 372 | base_env = base_ae.env 373 | 374 | return cls(base_env, **multi_agent_params) 375 | 376 | 377 | 378 | ################## 379 | # Training Utils # 380 | ################## 381 | 382 | class TrainingCallbacks(DefaultCallbacks): 383 | def on_episode_start(self, worker, base_env, policies, episode, **kwargs): 384 | pass 385 | 386 | def on_episode_step(self, worker, base_env, episode, **kwargs): 387 | pass 388 | 389 | def on_episode_end(self, worker, base_env, policies, episode, **kwargs): 390 | """ 391 | Used in order to add custom metrics to our tensorboard data 392 | 393 | sparse_reward (int) - total reward from deliveries the agent earned this episode 394 | shaped_reward (int) - total reward-shaping reward the agent earned this episode 395 | """ 396 | # Get rllib.HacktrickMultiAgentEnv reference from rllib wrapper 397 | env = base_env.get_unwrapped()[0] 398 | # Both agents share the same info so it doesn't matter whose we use, just use 0th agent's 399 | info_dict = episode.last_info_for(env.curr_agents[0]) 400 | 401 | ep_info = info_dict["episode"] 402 | game_stats = ep_info["ep_game_stats"] 403 | 404 | # List of episode stats we'd like to collect by agent 405 | stats_to_collect = EVENT_TYPES 406 | 407 | # Parse info dicts generated by HacktrickEnv 408 | tot_sparse_reward = ep_info["ep_sparse_r"] 409 | tot_shaped_reward = ep_info["ep_shaped_r"] 410 | 411 | 412 | # Store metrics where they will be visible to rllib for tensorboard logging 413 | episode.custom_metrics["sparse_reward"] = tot_sparse_reward 414 | episode.custom_metrics["shaped_reward"] = tot_shaped_reward 415 | 416 | # Store per-agent game stats to rllib info dicts 417 | for stat in stats_to_collect: 418 | stats = game_stats[stat] 419 | episode.custom_metrics[stat + "_agent_0"] = len(stats[0]) 420 | episode.custom_metrics[stat + "_agent_1"] = len(stats[1]) 421 | 422 | def on_sample_end(self, worker, samples, **kwargs): 423 | pass 424 | 425 | # Executes at the end of a call to Trainer.train; we use this to update environment params (like annealing shaped rewards) 426 | def on_train_result(self, trainer, result, **kwargs): 427 | # Anneal the reward shaping coefficient based on environment parameters and current timestep 428 | timestep = result['timesteps_total'] 429 | trainer.workers.foreach_worker( 430 | lambda ev: ev.foreach_env( 431 | lambda env: env.anneal_reward_shaping_factor(timestep))) 432 | 433 | # Anneal the bc factor based on environment parameters and current timestep 434 | trainer.workers.foreach_worker( 435 | lambda ev: ev.foreach_env( 436 | lambda env: env.anneal_bc_factor(timestep))) 437 | 438 | def on_postprocess_trajectory(self, worker, episode, agent_id, policy_id, policies, postprocessed_batch, original_batches, **kwargs): 439 | pass 440 | 441 | 442 | def get_rllib_eval_function(eval_params, eval_mdp_params, env_params, outer_shape, agent_0_policy_str='ppo', agent_1_policy_str='ppo', verbose=False): 443 | """ 444 | Used to "curry" rllib evaluation function by wrapping additional parameters needed in a local scope, and returning a 445 | 
function with rllib custom_evaluation_function compatible signature 446 | 447 | eval_params (dict): Contains 'num_games' (int), 'display' (bool), and 'ep_length' (int) 448 | mdp_params (dict): Used to create underlying HacktrickMDP (see that class for configuration) 449 | env_params (dict): Used to create underlying HacktrickEnv (see that class for configuration) 450 | outer_shape (list): a list of 2 items specifying the outer shape of the evaluation layout 451 | agent_0_policy_str (str): Key associated with the rllib policy object used to select actions (must be either 'ppo' or 'bc') 452 | agent_1_policy_str (str): Key associated with the rllib policy object used to select actions (must be either 'ppo' or 'bc') 453 | Note: Agent policies are shuffled each time, so agent_0_policy_str and agent_1_policy_str are symmetric 454 | Returns: 455 | _evaluate (func): Runs an evaluation specified by the curried params, ignores the rllib parameter 'evaluation_workers' 456 | """ 457 | 458 | def _evaluate(trainer, evaluation_workers): 459 | if verbose: 460 | print("Computing rollout of current trained policy") 461 | 462 | # Randomize starting indices 463 | policies = [agent_0_policy_str, agent_1_policy_str] 464 | np.random.shuffle(policies) 465 | agent_0_policy, agent_1_policy = policies 466 | 467 | # Get the corresponding rllib policy objects for each policy string name 468 | agent_0_policy = trainer.get_policy(agent_0_policy) 469 | agent_1_policy = trainer.get_policy(agent_1_policy) 470 | 471 | agent_0_feat_fn = agent_1_feat_fn = None 472 | if 'bc' in policies: 473 | base_ae = get_base_ae(eval_mdp_params, env_params) 474 | base_env = base_ae.env 475 | bc_featurize_fn = lambda state : base_env.featurize_state_mdp(state) 476 | if policies[0] == 'bc': 477 | agent_0_feat_fn = bc_featurize_fn 478 | if policies[1] == 'bc': 479 | agent_1_feat_fn = bc_featurize_fn 480 | 481 | # Compute the evaluation rollout. Note this doesn't use the evaluation_workers passed in by rllib, so this 482 | # computation all happens on the CPU. 
Could change this if evaluation becomes a bottleneck 483 | results = evaluate(eval_params, eval_mdp_params, outer_shape, agent_0_policy, agent_1_policy, agent_0_feat_fn, agent_1_feat_fn, verbose=verbose) 484 | 485 | # Log any metrics we care about for rllib tensorboard visualization 486 | metrics = {} 487 | metrics['average_sparse_reward'] = np.mean(results['ep_returns']) 488 | return metrics 489 | 490 | return _evaluate 491 | 492 | 493 | def evaluate(eval_params, mdp_params, outer_shape, agent_0_policy, agent_1_policy, agent_0_featurize_fn=None, agent_1_featurize_fn=None, verbose=False): 494 | """ 495 | Used to visualize rollouts of trained policies 496 | 497 | eval_params (dict): Contains configurations such as the rollout length, number of games, and whether to display rollouts 498 | mdp_params (dict): HacktrickMDP compatible configuration used to create the environment used for evaluation 499 | outer_shape (list): a list of 2 items specifying the outer shape of the evaluation layout 500 | agent_0_policy (rllib.Policy): Policy instance used to map states to action logits for agent 0 501 | agent_1_policy (rllib.Policy): Policy instance used to map states to action logits for agent 1 502 | agent_0_featurize_fn (func): Used to preprocess states for agent 0, defaults to lossless_state_encoding if 'None' 503 | agent_1_featurize_fn (func): Used to preprocess states for agent 1, defaults to lossless_state_encoding if 'None' 504 | """ 505 | if verbose: 506 | print("eval mdp params", mdp_params) 507 | evaluator = get_base_ae(mdp_params, {"horizon" : eval_params['ep_length'], "num_mdp":1}, outer_shape) 508 | 509 | # Override pre-processing functions with defaults if necessary 510 | agent_0_featurize_fn = agent_0_featurize_fn if agent_0_featurize_fn else evaluator.env.lossless_state_encoding_mdp 511 | agent_1_featurize_fn = agent_1_featurize_fn if agent_1_featurize_fn else evaluator.env.lossless_state_encoding_mdp 512 | 513 | # Wrap rllib policies in hacktrick agents to be compatible with Evaluator code 514 | agent0 = RlLibAgent(agent_0_policy, agent_index=0, featurize_fn=agent_0_featurize_fn) 515 | if eval_params['mode'] == 'single': 516 | agent1 = StayAgent() 517 | elif eval_params['mode'] == 'collaborative': 518 | agent1 = RlLibAgent(agent_1_policy, agent_index=1, featurize_fn=agent_1_featurize_fn) 519 | else: 520 | raise ValueError('mode has to be either single or collaborative') 521 | 522 | # Compute rollouts 523 | if 'store_dir' not in eval_params: 524 | eval_params['store_dir'] = None 525 | if 'display_phi' not in eval_params: 526 | eval_params['display_phi'] = False 527 | results = evaluator.evaluate_agent_pair(AgentPair(agent0, agent1), 528 | num_games=eval_params['num_games'], 529 | display=eval_params['display'], 530 | dir=eval_params['store_dir'], 531 | display_phi=eval_params['display_phi'], 532 | info=verbose) 533 | 534 | return results 535 | 536 | 537 | ########################### 538 | # rllib.Trainer functions # 539 | ########################### 540 | 541 | 542 | def gen_trainer_from_params(params): 543 | # All ray environment set-up 544 | if not ray.is_initialized(): 545 | init_params = { 546 | "ignore_reinit_error" : True, 547 | "include_webui" : False, 548 | "temp_dir" : params['ray_params']['temp_dir'], 549 | "log_to_driver" : params['verbose'], 550 | "logging_level" : logging.INFO if params['verbose'] else logging.CRITICAL 551 | } 552 | ray.init(**init_params) 553 | register_env("hacktrick_multi_agent", params['ray_params']['env_creator']) 554 | 
ModelCatalog.register_custom_model(params['ray_params']['custom_model_id'], params['ray_params']['custom_model_cls']) 555 | 556 | # Parse params 557 | model_params = params['model_params'] 558 | training_params = params['training_params'] 559 | environment_params = params['environment_params'] 560 | evaluation_params = params['evaluation_params'] 561 | bc_params = params['bc_params'] 562 | multi_agent_params = params['environment_params']['multi_agent_params'] 563 | 564 | env = HacktrickMultiAgent.from_config(environment_params) 565 | 566 | # Returns a properly formatted policy tuple to be passed into the PPOTrainer config 567 | def gen_policy(policy_type="ppo"): 568 | # supported policy types thus far 569 | #// assert policy_type in ["ppo", "bc"] 570 | assert policy_type == "ppo", 'ppo is the main policy supported, remove this assertion only if you know what you are doing' 571 | 572 | if policy_type == "ppo": 573 | config = { 574 | "model" : { 575 | "custom_options" : model_params, 576 | 577 | "custom_model" : "MyPPOModel" 578 | } 579 | } 580 | return (None, env.ppo_observation_space, env.action_space, config) 581 | elif policy_type == "bc": 582 | bc_cls = bc_params['bc_policy_cls'] 583 | bc_config = bc_params['bc_config'] 584 | return (bc_cls, env.bc_observation_space, env.action_space, bc_config) 585 | 586 | # Rllib compatible way of setting the directory we store agent checkpoints in 587 | logdir_prefix = "{0}_{1}_{2}".format(params["experiment_name"], params['training_params']['seed'], timestr) 588 | def custom_logger_creator(config): 589 | """Creates a Unified logger that stores results in <results_dir>/<experiment_name>_<seed>_<timestamp> 590 | """ 591 | results_dir = params['results_dir'] 592 | if not os.path.exists(results_dir): 593 | try: 594 | os.makedirs(results_dir) 595 | except Exception as e: 596 | print("error creating custom logging dir. 
Falling back to default logdir {}".format(DEFAULT_RESULTS_DIR)) 597 | results_dir = DEFAULT_RESULTS_DIR 598 | logdir = tempfile.mkdtemp( 599 | prefix=logdir_prefix, dir=results_dir) 600 | logger = UnifiedLogger(config, logdir, loggers=None) 601 | return logger 602 | 603 | # Create rllib compatible multi-agent config based on params 604 | multi_agent_config = {} 605 | all_policies = ['ppo'] 606 | 607 | # Whether both agents should be learned 608 | self_play = iterable_equal(multi_agent_params['bc_schedule'], HacktrickMultiAgent.self_play_bc_schedule) 609 | if not self_play: 610 | all_policies.append('bc') 611 | 612 | multi_agent_config['policies'] = { policy : gen_policy(policy) for policy in all_policies } 613 | 614 | def select_policy(agent_id): 615 | if agent_id.startswith('ppo'): 616 | return 'ppo' 617 | if agent_id.startswith('bc'): 618 | return 'bc' 619 | multi_agent_config['policy_mapping_fn'] = select_policy 620 | multi_agent_config['policies_to_train'] = 'ppo' 621 | 622 | if "outer_shape" not in environment_params: 623 | environment_params["outer_shape"] = None 624 | 625 | if "mdp_params" in environment_params: 626 | environment_params["eval_mdp_params"] = environment_params["mdp_params"] 627 | trainer = PPOTrainer(env="hacktrick_multi_agent", config={ 628 | "multiagent": multi_agent_config, 629 | "callbacks" : TrainingCallbacks, 630 | "custom_eval_function" : get_rllib_eval_function(evaluation_params, environment_params['eval_mdp_params'], environment_params['env_params'], 631 | environment_params["outer_shape"], 'ppo', 'ppo' if self_play else 'bc', 632 | verbose=params['verbose']), 633 | "env_config" : environment_params, 634 | "eager" : False, 635 | **training_params 636 | }, logger_creator=custom_logger_creator) 637 | return trainer 638 | 639 | 640 | 641 | ### Serialization ### 642 | 643 | 644 | def save_trainer(trainer, params, path=None): 645 | """ 646 | Saves a serialized trainer checkpoint at `path`. 
If none provided, the default path is 647 | ~/ray_results/<experiment_logdir>/checkpoint_<i>/checkpoint-<i> 648 | 649 | Note that `params` should follow the same schema as the dict passed into `gen_trainer_from_params` 650 | """ 651 | # Save trainer 652 | save_path = trainer.save(path) 653 | 654 | # Save params used to create trainer in /path/to/checkpoint_dir/config.pkl 655 | config = copy.deepcopy(params) 656 | config_path = os.path.join(os.path.dirname(save_path), "config.pkl") 657 | 658 | # Note that we use dill (not pickle) here because it supports function serialization 659 | with open(config_path, "wb") as f: 660 | dill.dump(config, f) 661 | return save_path 662 | 663 | def load_trainer(save_path): 664 | """ 665 | Returns a ray compatible trainer object that was previously saved at `save_path` by a call to `save_trainer` 666 | Note that `save_path` is the full path to the checkpoint FILE, not the checkpoint directory 667 | """ 668 | # Read in params used to create trainer 669 | config_path = os.path.join(os.path.dirname(save_path), "config.pkl") 670 | with open(config_path, "rb") as f: 671 | # We use dill (instead of pickle) here because we must deserialize functions 672 | config = dill.load(f) 673 | 674 | # Override this param to lower overhead in trainer creation 675 | config['training_params']['num_workers'] = 0 676 | 677 | # Get un-trained trainer object with proper config 678 | trainer = gen_trainer_from_params(config) 679 | 680 | # Load weights into dummy object 681 | trainer.restore(save_path) 682 | return trainer 683 | 684 | def get_agent_from_trainer(trainer, policy_id="ppo", agent_index=0): 685 | policy = trainer.get_policy(policy_id) 686 | dummy_env = trainer.env_creator(trainer.config['env_config']) 687 | featurize_fn = dummy_env.featurize_fn_map[policy_id] 688 | agent = RlLibAgent(policy, agent_index, featurize_fn=featurize_fn) 689 | return agent 690 | 691 | def get_agent_pair_from_trainer(trainer, policy_id_0='ppo', policy_id_1='ppo'): 692 | agent0 = get_agent_from_trainer(trainer, policy_id=policy_id_0) 693 | agent1 = get_agent_from_trainer(trainer, policy_id=policy_id_1) 694 | return AgentPair(agent0, agent1) 695 | 696 | 697 | def load_agent_pair(save_path, policy_id_0='ppo', policy_id_1='ppo'): 698 | """ 699 | Returns a Hacktrick AgentPair object whose player 0 and player 1 use the policies with 700 | IDs policy_id_0 and policy_id_1, respectively 701 | """ 702 | trainer = load_trainer(save_path) 703 | return get_agent_pair_from_trainer(trainer, policy_id_0, policy_id_1) 704 | 705 | def load_agent(save_path, policy_id='ppo', agent_index=0): 706 | """ 707 | Returns an RllibAgent (compatible with the Hacktrick Agent API) from the `save_path` to a previously 708 | serialized trainer object created with `save_trainer` 709 | 710 | The trainer can have multiple independent policies, so extract the one with ID `policy_id` to wrap in 711 | an RllibAgent 712 | 713 | Agent index indicates whether the agent is player zero or player one (or player n in the general case) 714 | as the featurization is not symmetric for both players 715 | """ 716 | trainer = load_trainer(save_path) 717 | return get_agent_from_trainer(trainer, policy_id=policy_id, agent_index=agent_index) 718 | 719 | 720 | --------------------------------------------------------------------------------
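Usage sketch: a minimal example of how the serialization helpers in rllib.py above chain together to recover playable agents from a training run. The checkpoint path is a hypothetical placeholder for a file written by save_trainer(); only functions defined above are used.

import os
from hacktrick_rl.rllib.rllib import load_agent, load_agent_pair

# Hypothetical checkpoint FILE produced by save_trainer(); point this at your own run.
checkpoint = os.path.expanduser("~/ray_results/my_experiment/checkpoint_100/checkpoint-100")

# Rebuilds the trainer from the config.pkl stored next to the checkpoint, then wraps
# its 'ppo' policy as a Hacktrick-compatible agent playing at index 0.
agent = load_agent(checkpoint, policy_id='ppo', agent_index=0)

# Or wrap both players at once for AgentPair-based evaluation.
agent_pair = load_agent_pair(checkpoint, policy_id_0='ppo', policy_id_1='ppo')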