├── custom_scripts
│   ├── base
│   ├── ppo.py
│   ├── runner.py
│   └── agent.py
├── RL_main_scripts.zip
├── gym_drone
│   ├── envs
│   │   ├── __init__.py
│   │   ├── drone_env.py
│   │   └── droneint_env.py
│   └── __init__.py
├── setup.py
├── report_tester_mayhem.py
├── report_tester.py
├── README.md
├── runtime.py
└── LICENSE

--------------------------------------------------------------------------------
/custom_scripts/base:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/RL_main_scripts.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JNC96/drone-gym/HEAD/RL_main_scripts.zip
--------------------------------------------------------------------------------
/gym_drone/envs/__init__.py:
--------------------------------------------------------------------------------
from gym_drone.envs.drone_env import DroneEnv
from gym_drone.envs.droneint_env import DroneIntEnv
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

setup(name='gym_drone',
      version='0.0.1',
      install_requires=['gym']  # and any other dependencies the package needs
)
--------------------------------------------------------------------------------
/gym_drone/__init__.py:
--------------------------------------------------------------------------------
from gym.envs.registration import register

register(
    id='drone-v0',
    entry_point='gym_drone.envs:DroneEnv',
    max_episode_steps=50,
    reward_threshold=0.95,
    nondeterministic=False
)

register(
    id='droneInt-v0',
    entry_point='gym_drone.envs:DroneIntEnv',
    max_episode_steps=50,
    reward_threshold=0.95,
    nondeterministic=False,
)
--------------------------------------------------------------------------------
/report_tester_mayhem.py:
--------------------------------------------------------------------------------
# Snippet: PPO agent configuration (expects `env` and `from tensorforce.agents import Agent`).
agent = Agent.create(
    agent='ppo', environment=env,
    # Automatically configured network
    network='auto',
    # Optimization
    batch_size=10, update_frequency=2, learning_rate=1e-3, subsampling_fraction=0.2,
    optimization_steps=5,
    # Reward estimation
    likelihood_ratio_clipping=0.2, discount=0.99, estimate_terminal=False,
    # Critic
    critic_network='auto',
    critic_optimizer=dict(optimizer='adam', multi_step=10, learning_rate=1e-3),
    # Preprocessing
    preprocessing=None,
    # Exploration
    exploration=0.0, variable_noise=0.0,
    # Regularization
    l2_regularization=0.0, entropy_regularization=0.0,
    # TensorFlow etc
    name='agent', device=None, parallel_interactions=1, seed=None, execution=None, saver=None,
    summarizer=dict(
        directory=r"D:\summ",  # raw string so the backslash is not treated as an escape
        labels="all"
    ),
    recorder=None
)
--------------------------------------------------------------------------------
/report_tester.py:
--------------------------------------------------------------------------------
# Snippet: interactive action selection inside Runner.run_episode (see custom_scripts/runner.py).
if self.agent.episodes == 0 and self.interactive:
    print(self.interactive)
    print(self.agent.episodes)
    user_action = self.action_rank(states=states, evaluation=evaluation)
else:
    user_action = 0
# run with selected action
if self.agent.episodes > 0:
    self.interactive = False
actions = self.agent.act(states=states, evaluation=evaluation, int_bool=self.interactive,
                         int_act=user_action)

----------

def action_rank(self, states, evaluation):

    action_buffer = []
    print("*********************")
    print("*********************")
    print("\n%------------------------")
    print("% STATE @ STEP# "+str(states[0]*states[1]))
    print("%------------------------\n")
    print("Slope: "+str(states[2])+" --- @("+str(states[0])+","+str(states[1])+")")

    for _ in range(0, 4):

        # here, independent is True because in the normal pipeline you would have to observe
        # after taking an action, but we are simply sampling actions.
        tmp_action = self.agent.act(states=states, independent=True, evaluation=False)

        print("\n%------------------------")
        print("% ACTION "+str(_+1))
        print("%------------------------\n")

        print("Camera Angle: "+str(tmp_action[0]))
        print("Speed: "+str(tmp_action[1]))
        print("Height: "+str(tmp_action[2]))

        action_buffer.append(tmp_action)

    action_choice = int(input("\nPlease select the optimal action (1-4): ")) - 1
    # valid zero-based indices are 0-3
    while action_choice > 3 or action_choice < 0:
        action_choice = int(input("\nPlease select the optimal action (1-4): ")) - 1

    return action_buffer[action_choice]

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[![BuildStatus][build-status]][ci-server]
[![PackageVersion][pypi-version]][pypi-home]
[![PythonVersion][python-version]][python-home]
[![Stable][pypi-status]][pypi-home]
[![Format][pypi-format]][pypi-home]

[build-status]:
[ci-server]:
[pypi-version]:
[pypi-license]:
[pypi-status]:
[pypi-format]:
[pypi-home]:
[python-version]:
[python-home]: https://python.org



# Drone Gym Environment

This repository contains a pip package providing an OpenAI Gym environment for a drone that learns via reinforcement learning. It also introduces the concept of Interactive Reinforcement Learning (IRL) on top of this environment.

# Installation

Install OpenAI Gym (``pip install gym``).

Then install this package from the repository root via ``pip install -e .``

Then, make the environment:

    import gym
    import gym_drone  # importing gym_drone registers drone-v0 and droneInt-v0

    env = gym.make('drone-v0')

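As a quick smoke test, the sketch below (a minimal sketch, assuming the classic Gym API this package was written against, where ``reset()`` returns the state and ``step()`` returns a 4-tuple) runs one episode with random actions. The observation is ``[x-pos, y-pos, terrain angle]`` and an action is ``[camera angle, speed, height]``, as defined in ``gym_drone/envs/drone_env.py``:

    import gym
    import gym_drone  # noqa: F401 -- registers the custom environments

    env = gym.make('drone-v0')

    state = env.reset()                     # [x-pos, y-pos, terrain angle]
    done, total_reward = False, 0.0
    while not done:
        action = env.action_space.sample()  # [camera angle, speed, height]
        state, reward, done, info = env.step(action)
        total_reward += reward
    print("episode return:", total_reward)
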
See https://github.com/matthiasplappert/keras-rl/tree/master/examples for some examples.

# Dependencies

The project depends heavily on TensorForce (see: https://github.com/tensorforce); the environment itself is built on OpenAI Gym (see: https://gym.openai.com/).

Special thanks to Alexander Kuhnle for his help in developing this.

# The Environment

The environment is a custom environment built on the framework defined by OpenAI Gym. It contains a grid of terrain gradient values. The reward is the predicted coverage, computed as a weighted linear function of the actions taken by the agent (camera angle, speed and height).

# IRL

The main purpose of this system is to investigate how human interaction affects the traditional reinforcement learning loop. Custom scripts were written to facilitate this, and several TensorForce scripts were modified as well. These can be found in the ``custom_scripts`` folder and must be manually copied into the TensorForce package directory (a usage sketch follows the modified ``runner.py`` listing below).

Created by Jia Ning Choo 2019 (https://github.com/jnc96).
--------------------------------------------------------------------------------
/custom_scripts/ppo.py:
--------------------------------------------------------------------------------
# Copyright 2018 Tensorforce Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from tensorforce.agents import PolicyAgent


class ProximalPolicyOptimization(PolicyAgent):
    """
    [Proximal Policy Optimization](https://arxiv.org/abs/1707.06347) agent (specification key:
    `ppo`).
23 | """ 24 | 25 | def __init__( 26 | # Environment 27 | self, states, actions, max_episode_timesteps, 28 | # Network 29 | network='auto', 30 | # Optimization 31 | batch_size=10, update_frequency=None, learning_rate=3e-4, subsampling_fraction=0.33, 32 | optimization_steps=10, 33 | # Reward estimation 34 | likelihood_ratio_clipping=0.2, discount=0.99, estimate_terminal=False, 35 | # Critic 36 | critic_network=None, critic_optimizer=None, 37 | # Preprocessing 38 | preprocessing=None, 39 | # Exploration 40 | exploration=0.0, variable_noise=0.0, 41 | # Regularization 42 | l2_regularization=0.0, entropy_regularization=0.0, 43 | # TensorFlow etc 44 | name='agent', device=None, parallel_interactions=1, seed=None, execution=None, saver=None, 45 | summarizer=None, recorder=None, config=None 46 | ): 47 | memory = dict(type='recent', capacity=((batch_size + 1) * max_episode_timesteps)) 48 | if update_frequency is None: 49 | update = dict(unit='episodes', batch_size=batch_size) 50 | else: 51 | update = dict(unit='episodes', batch_size=batch_size, frequency=update_frequency) 52 | optimizer = dict(type='adam', learning_rate=learning_rate) 53 | optimizer = dict( 54 | type='subsampling_step', optimizer=optimizer, fraction=subsampling_fraction 55 | ) 56 | optimizer = dict(type='multi_step', optimizer=optimizer, num_steps=optimization_steps) 57 | objective = dict( 58 | type='policy_gradient', ratio_based=True, clipping_value=likelihood_ratio_clipping 59 | ) 60 | if critic_network is None: 61 | reward_estimation = dict(horizon='episode', discount=discount) 62 | else: 63 | reward_estimation = dict( 64 | horizon='episode', discount=discount, estimate_horizon='late', 65 | estimate_terminal=estimate_terminal, estimate_advantage=True 66 | ) 67 | if critic_network is None: 68 | baseline_policy = None 69 | baseline_objective = None 70 | else: 71 | # State value doesn't exist for Beta 72 | baseline_policy = dict(network=critic_network, distributions=dict(float='gaussian')) 73 | assert critic_optimizer is not None 74 | baseline_objective = 'state_value' 75 | 76 | super().__init__( 77 | # Agent 78 | states=states, actions=actions, max_episode_timesteps=max_episode_timesteps, 79 | parallel_interactions=parallel_interactions, buffer_observe=True, seed=seed, 80 | recorder=recorder, config=config, 81 | # Model 82 | name=name, device=device, execution=execution, saver=saver, summarizer=summarizer, 83 | preprocessing=preprocessing, exploration=exploration, variable_noise=variable_noise, 84 | l2_regularization=l2_regularization, 85 | # PolicyModel 86 | policy=None, network=network, memory=memory, update=update, optimizer=optimizer, 87 | objective=objective, reward_estimation=reward_estimation, 88 | baseline_policy=baseline_policy, baseline_network=None, 89 | baseline_optimizer=critic_optimizer, baseline_objective=baseline_objective, 90 | entropy_regularization=entropy_regularization 91 | ) 92 | -------------------------------------------------------------------------------- /runtime.py: -------------------------------------------------------------------------------- 1 | ## Main 2 | 3 | """ 4 | OpenAI gym execution. 
5 | """ 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import argparse 12 | import importlib 13 | import json 14 | import logging 15 | import os 16 | import time 17 | import sys 18 | 19 | from tensorforce import TensorForceError 20 | from tensorforce.agents import Agent 21 | from tensorforce.execution import Runner 22 | from tensorforce.contrib.openai_gym import OpenAIGym 23 | 24 | 25 | # python examples/openai_gym.py Pong-ram-v0 -a examples/configs/vpg.json -n examples/configs/mlp2_network.json -e 50000 -m 2000 26 | 27 | # python examples/openai_gym.py CartPole-v0 -a examples/configs/vpg.json -n examples/configs/mlp2_network.json -e 2000 -m 200 28 | 29 | 30 | def main(): 31 | parser = argparse.ArgumentParser() 32 | 33 | parser.add_argument('gym_id', help="Id of the Gym environment") 34 | parser.add_argument('-i', '--import-modules', help="Import module(s) required for environment") 35 | parser.add_argument('-a', '--agent', help="Agent configuration file") 36 | parser.add_argument('-n', '--network', default=None, help="Network specification file") 37 | parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes") 38 | parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps") 39 | parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None, help="Maximum number of timesteps per episode") 40 | parser.add_argument('-d', '--deterministic', action='store_true', default=False, help="Choose actions deterministically") 41 | parser.add_argument('-s', '--save', help="Save agent to this dir") 42 | parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes") 43 | parser.add_argument('-l', '--load', help="Load agent from this dir") 44 | parser.add_argument('--monitor', help="Save results to this directory") 45 | parser.add_argument('--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results") 46 | parser.add_argument('--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)") 47 | parser.add_argument('--visualize', action='store_true', default=False, help="Enable OpenAI Gym's visualization") 48 | parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs") 49 | parser.add_argument('-te', '--test', action='store_true', default=False, help="Test agent without learning.") 50 | parser.add_argument('-sl', '--sleep', type=float, default=None, help="Slow down simulation by sleeping for x seconds (fractions allowed).") 51 | parser.add_argument('--job', type=str, default=None, help="For distributed mode: The job type of this agent.") 52 | parser.add_argument('--task', type=int, default=0, help="For distributed mode: The task index of this agent.") 53 | 54 | args = parser.parse_args() 55 | 56 | logging.basicConfig(level=logging.INFO) 57 | 58 | logger = logging.getLogger() 59 | logger.setLevel(logging.INFO) 60 | 61 | if args.import_modules is not None: 62 | for module in args.import_modules.split(','): 63 | importlib.import_module(name=module) 64 | 65 | environment = OpenAIGym( 66 | gym_id=args.gym_id, 67 | monitor=args.monitor, 68 | monitor_safe=args.monitor_safe, 69 | monitor_video=args.monitor_video, 70 | visualize=args.visualize 71 | ) 72 | 73 | if args.agent is not None: 74 | with open(args.agent, 'r') as fp: 75 | agent = json.load(fp=fp) 76 | else: 77 | raise TensorForceError("No agent configuration 
provided.") 78 | 79 | if args.network is not None: 80 | with open(args.network, 'r') as fp: 81 | network = json.load(fp=fp) 82 | agent = Agent.from_spec( 83 | spec=agent, 84 | kwargs=dict( 85 | states=environment.states, 86 | actions=environment.actions, 87 | network=network 88 | ) 89 | ) 90 | else: 91 | logger.info("No network configuration provided.") 92 | agent = Agent.from_spec( 93 | spec=agent, 94 | kwargs=dict( 95 | states=environment.states, 96 | actions=environment.actions 97 | ) 98 | ) 99 | 100 | if args.load: 101 | load_dir = os.path.dirname(args.load) 102 | if not os.path.isdir(load_dir): 103 | raise OSError("Could not load agent from {}: No such directory.".format(load_dir)) 104 | agent.restore_model(args.load) 105 | 106 | if args.save: 107 | save_dir = os.path.dirname(args.save) 108 | if not os.path.isdir(save_dir): 109 | try: 110 | os.mkdir(save_dir, 0o755) 111 | except OSError: 112 | raise OSError("Cannot save agent to dir {} ()".format(save_dir)) 113 | 114 | if args.debug: 115 | logger.info("-" * 16) 116 | logger.info("Configuration:") 117 | logger.info(agent) 118 | 119 | runner = Runner( 120 | agent=agent, 121 | environment=environment, 122 | repeat_actions=1 123 | ) 124 | 125 | if args.debug: # TODO: Timestep-based reporting 126 | report_episodes = 1 127 | else: 128 | report_episodes = 100 129 | 130 | logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment)) 131 | 132 | def episode_finished(r, id_): 133 | if r.episode % report_episodes == 0: 134 | steps_per_second = r.timestep / (time.time() - r.start_time) 135 | logger.info("Finished episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}".format( 136 | r.agent.episode, r.episode_timestep, steps_per_second 137 | )) 138 | logger.info("Episode reward: {}".format(r.episode_rewards[-1])) 139 | logger.info("Average of last 500 rewards: {:0.2f}". 140 | format(sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards)))) 141 | logger.info("Average of last 100 rewards: {:0.2f}". 142 | format(sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards)))) 143 | if args.save and args.save_episodes is not None and not r.episode % args.save_episodes: 144 | logger.info("Saving agent to {}".format(args.save)) 145 | r.agent.save_model(args.save) 146 | 147 | return True 148 | 149 | runner.run( 150 | num_timesteps=args.timesteps, 151 | num_episodes=args.episodes, 152 | max_episode_timesteps=args.max_episode_timesteps, 153 | deterministic=args.deterministic, 154 | episode_finished=episode_finished, 155 | testing=args.test, 156 | sleep=args.sleep 157 | ) 158 | runner.close() 159 | 160 | logger.info("Learning finished. 
Total episodes: {ep}".format(ep=runner.agent.episode)) 161 | 162 | 163 | if __name__ == '__main__': 164 | main() 165 | -------------------------------------------------------------------------------- /gym_drone/envs/drone_env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import logging 3 | import math 4 | import numpy as np 5 | from gym import error, spaces, utils 6 | from gym.utils import seeding 7 | 8 | 9 | class DroneEnv(gym.Env): 10 | metadata = {'render.modes': ['human']} 11 | 12 | def __init__(self): 13 | 14 | # debug vars 15 | 16 | self.__version__ = "2.1.1" 17 | 18 | # Hyperparameter definition 19 | self.x_min = int(0) 20 | self.x_max = int(4) 21 | self.y_min = int(0) 22 | self.y_max = int(4) 23 | self.min_cam_angle = int(1) 24 | self.max_cam_angle = int(3) 25 | self.min_terr_angle = int(0) 26 | self.max_terr_angle = int(4) #terrain angle - something that is observed 27 | self.min_speed = int(1) 28 | self.max_speed = int(3) #max speed is actually 56 kmh (this is m/s) 29 | self.min_height = int(1) #meter 30 | self.max_height = int(3) #meter 31 | 32 | 33 | # ??? 34 | self.state = None #initiate state holder 35 | self.episode_over = False 36 | self.current_episode = -1 37 | self.current_timestep = 0 # -1 because timestep increments before action 38 | self.current_pos = [0,0] 39 | self.action_episode_memory = [] 40 | self.grid_step_max = (self.x_max+1)*(self.y_max+1) - 1 # number of grid squares 41 | self.max_timestep = 2*self.grid_step_max # Visits all grid squares twice. 42 | #self. 43 | 44 | # Observations are (in this order): current x-pos, current y-pos, terrain angle (from horizontal axis) 45 | # Let's assume that the map is of grid size 5x5. Position of the drone is represented as (grid x index, 46 | # grid y index), where (0,0) is the top left of the grid ((4,4) is max value)). 47 | 48 | # Here, low is the lower limit of observation range, and high is the higher limit. 49 | low_ob = np.array([self.x_min, # x-pos 50 | self.y_min, # y-pos 51 | self.min_terr_angle]) # terrain_angle_deg 52 | high_ob = np.array([self.x_max, # x-pos 53 | self.y_max, # y-pos 54 | self.max_terr_angle]) # terrain_angle_deg 55 | self.observation_space = spaces.Box(low_ob, high_ob, dtype=np.float32) 56 | 57 | # Action space 58 | low_action = np.array([self.min_cam_angle, # cam angle in deg 59 | self.min_speed, # flight speed in m/s 60 | self.min_height]) # flight height in m 61 | high_action = np.array([self.max_cam_angle, # cam angle in deg 62 | self.max_speed, # flight speed in m/s 63 | self.max_height]) # flight height in m 64 | self.action_space = spaces.MultiDiscrete([self.max_cam_angle, self.max_speed, self.max_height]) 65 | 66 | # generate random terrain gradients/create them here 67 | # import random 68 | # list = [111,222,333,444,555] 69 | # print("random.choice() to select random item from list - ", random.choice(list)) 70 | 71 | 72 | self.terr_angle_grid = [0,0,0,0,0, 73 | 0,0,0,0,0, 74 | 1,1,1,1,1, 75 | 0,0,0,0,0, 76 | 0,0,0,0,0 77 | ] 78 | 79 | 80 | def step(self, action): 81 | 82 | """ 83 | The agent (drone) takes a step (flies somewhere) in the environment. 84 | Parameters 85 | ---------- 86 | action : (int,int) - the coordinates, (int) - the terrain gradient 87 | Returns: (int) - terrain angle (observation), (float32) reward, (bool) episode_over, (int,int) - coords 88 | ------- 89 | ob, reward, episode_over, info : tuple 90 | ob (object) : 91 | an environment-specific object representing your observation of 92 | the environment. 
93 | reward (float) : 94 | amount of reward achieved by the previous action. The scale 95 | varies between environments, but the goal is always to increase 96 | your total reward. (This reward per step is normalised to 1.) 97 | episode_over (bool) : 98 | whether it's time to reset the environment again. Most (but not 99 | all) tasks are divided up into well-defined episodes, and done 100 | being True indicates the episode has terminated. (For example, 101 | perhaps the pole tipped too far, or you lost your last life.) 102 | info (dict) : 103 | diagnostic information useful for debugging. It can sometimes 104 | be useful for learning (for example, it might contain the raw 105 | probabilities behind the environment's last state change). 106 | However, official evaluations of your agent are not allowed to 107 | use this for learning. 108 | """ 109 | 110 | if self.episode_over: 111 | raise RuntimeError("Episode is done. You're running step() despite this fact. Or reset the env by calling reset().") #end execution, and finish run 112 | 113 | # Return the reward for action taken given state. Save action to action memory buffer. 114 | self.action_episode_memory[self.current_episode].append(action) 115 | reward = self._get_reward(action) 116 | 117 | # Take a step, and observe environment. 118 | self.current_timestep += 1 119 | self.current_pos = self.index2coord(self.current_timestep) 120 | self.current_pos.append(self.terr_angle_grid[self.current_timestep%self.grid_step_max]) 121 | self.state = list.copy(self.current_pos) 122 | 123 | if self.current_timestep>=50: 124 | self.episode_over = True 125 | 126 | return self.state, reward, self.episode_over, {} 127 | 128 | #def print_action(self,action): 129 | 130 | def index2coord(self, index): 131 | 132 | # converts an index value to x-y coords 133 | # see order of the grid above in __init__ 134 | 135 | if (index<=self.x_max): 136 | return [0, index] 137 | else: 138 | return [(index%(self.grid_step_max+1))//(self.x_max+1), index%(self.x_max+1)] 139 | 140 | # grid step max is n*m (grid dimensions) -1 141 | 142 | def _get_state(self): 143 | 144 | return self.terr_angle_grid[self.current_timestep%self.grid_step_max] 145 | 146 | def _get_info(self): 147 | 148 | return self.index2coord(self.current_timestep) 149 | 150 | def _get_reward(self, action): 151 | 152 | # reward factors 153 | # calculatinng the normalised rewards needs -1 because the max values is actually the number of actions, and actions start from 0. 154 | 155 | gradient_delta_rf = 0.4 156 | speed_rf = 0.3 157 | height_rf = 0.3 158 | 159 | #logging.warning("the current timestep. ="+str(self.current_timestep)) 160 | #logging.warning("self.current_timestep%self.grid_step_max = "+ str(self.current_timestep%self.grid_step_max)) 161 | 162 | gradient_delta = abs(self.terr_angle_grid[(self.current_timestep%self.grid_step_max)] - action[0]) # action [1] is the camera angle 163 | 164 | gradient_delta_norm = 1 - gradient_delta/(self.max_cam_angle-1) # this will give us a normalised value that rewards less difference 165 | 166 | speed_norm = 1 - action[1]/(self.max_speed-1) # speed normalised, and reward less speed 167 | 168 | height_norm = action[2]/(self.max_height-1) # height normalised, and more height is better (FOR NOW) 169 | 170 | tmp_reward = gradient_delta_norm*gradient_delta_rf + speed_norm*speed_rf + height_norm*height_rf 171 | 172 | return tmp_reward 173 | 174 | 175 | def reset(self): 176 | # reset should always run at the end of an episode and before the first run. 
177 | self.current_timestep = 0 178 | self.current_episode += 1 179 | self.action_episode_memory.append([]) 180 | self.episode_over = False 181 | 182 | self.current_pos = self.index2coord(self.current_timestep) 183 | self.current_pos.append(self.terr_angle_grid[self.current_timestep%self.grid_step_max]) 184 | self.state = list.copy(self.current_pos) 185 | 186 | return self.state 187 | 188 | def _render(self, mode='human', close=False): 189 | return 0 190 | def close(self): 191 | return 0 192 | -------------------------------------------------------------------------------- /gym_drone/envs/droneint_env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import logging 3 | import math 4 | import numpy as np 5 | from gym import error, spaces, utils 6 | from gym.utils import seeding 7 | 8 | 9 | class DroneIntEnv(gym.Env): 10 | metadata = {'render.modes': ['human']} 11 | 12 | def __init__(self): 13 | 14 | # debug vars 15 | 16 | self.__version__ = "1.0.0" 17 | 18 | # Hyperparameter definition 19 | self.x_min = int(0) 20 | self.x_max = int(4) 21 | self.y_min = int(0) 22 | self.y_max = int(4) 23 | self.min_cam_angle = int(1) 24 | self.max_cam_angle = int(3) 25 | self.min_terr_angle = int(1) 26 | self.max_terr_angle = int(3) #terrain angle - something that is observed 27 | self.min_speed = int(1) 28 | self.max_speed = int(3) #max speed is actually 56 kmh (this is m/s) 29 | self.min_height = int(1) #meter 30 | self.max_height = int(3) #meter 31 | 32 | 33 | # ??? 34 | self.state = None #initiate state holder 35 | self.episode_over = False 36 | self.current_episode = -1 37 | self.current_timestep = 0 # -1 because timestep increments before action 38 | self.current_pos = [0,0] 39 | self.action_episode_memory = [] 40 | self.grid_step_max = (self.x_max+1)*(self.y_max+1) - 1 # number of grid squares 41 | self.max_timestep = 2*self.grid_step_max # Visits all grid squares twice. 42 | #self. 43 | 44 | # Observations are (in this order): current x-pos, current y-pos, terrain angle (from horizontal axis) 45 | # Let's assume that the map is of grid size 5x5. Position of the drone is represented as (grid x index, 46 | # grid y index), where (0,0) is the top left of the grid ((4,4) is max value)). 47 | 48 | # Here, low is the lower limit of observation range, and high is the higher limit. 49 | low_ob = np.array([self.x_min, # x-pos 50 | self.y_min, # y-pos 51 | self.min_cam_angle]) # terrain_angle_deg 52 | high_ob = np.array([self.x_max, # x-pos 53 | self.y_max, # y-pos 54 | self.max_cam_angle]) # terrain_angle_deg 55 | self.observation_space = spaces.Box(low_ob, high_ob, dtype=np.float32) 56 | 57 | # Action space 58 | low_action = np.array([self.min_cam_angle, # cam angle in deg 59 | self.min_speed, # flight speed in m/s 60 | self.min_height]) # flight height in m 61 | high_action = np.array([self.max_cam_angle, # cam angle in deg 62 | self.max_speed, # flight speed in m/s 63 | self.max_height]) # flight height in m 64 | self.action_space = spaces.MultiDiscrete([self.max_cam_angle, self.max_speed, self.max_height]) 65 | 66 | # generate random terrain gradients/create them here 67 | # import random 68 | # list = [111,222,333,444,555] 69 | # print("random.choice() to select random item from list - ", random.choice(list)) 70 | 71 | 72 | self.terr_angle_grid = [0,0,0,0,0, 73 | 0,0,0,0,0, 74 | 1,1,1,1,1, 75 | 0,0,0,0,0, 76 | 0,0,0,0,0 77 | ] 78 | 79 | def step(self, action): 80 | 81 | """ 82 | The agent (drone) takes a step (flies somewhere) in the environment. 
83 | Parameters 84 | ---------- 85 | action : (int,int) - the coordinates, (int) - the terrain gradient 86 | Returns: (int) - terrain angle (observation), (float32) reward, (bool) episode_over, (int,int) - coords 87 | ------- 88 | ob, reward, episode_over, info : tuple 89 | ob (object) : 90 | an environment-specific object representing your observation of 91 | the environment. 92 | reward (float) : 93 | amount of reward achieved by the previous action. The scale 94 | varies between environments, but the goal is always to increase 95 | your total reward. (This reward per step is normalised to 1.) 96 | episode_over (bool) : 97 | whether it's time to reset the environment again. Most (but not 98 | all) tasks are divided up into well-defined episodes, and done 99 | being True indicates the episode has terminated. (For example, 100 | perhaps the pole tipped too far, or you lost your last life.) 101 | info (dict) : 102 | diagnostic information useful for debugging. It can sometimes 103 | be useful for learning (for example, it might contain the raw 104 | probabilities behind the environment's last state change). 105 | However, official evaluations of your agent are not allowed to 106 | use this for learning. 107 | """ 108 | 109 | if self.episode_over: 110 | raise RuntimeError("Episode is done. You're running step() despite this fact. Or reset the env by calling reset().") #end execution, and finish run 111 | 112 | # Return the reward for action taken given state. Save action to action memory buffer. 113 | self.action_episode_memory[self.current_episode].append(action) 114 | 115 | 116 | # Take a step, and observe environment. 117 | self.current_timestep += 1 118 | self.current_pos = self.index2coord(self.current_timestep) 119 | self.current_pos.append(self.terr_angle_grid[self.current_timestep%self.grid_step_max]) 120 | self.state = list.copy(self.current_pos) 121 | 122 | reward = self.get_user_reward(action,self.state) 123 | 124 | if self.current_timestep>=self.max_timestep: 125 | self.episode_over = True 126 | 127 | return self.state, reward, self.episode_over, {} 128 | 129 | #def print_action(self,action): 130 | 131 | def index2coord(self, index): 132 | 133 | # converts an index value to x-y coords 134 | # see order of the grid above in __init__ 135 | 136 | if (index<=self.x_max): 137 | return [0, index] 138 | else: 139 | return [(index%(self.grid_step_max+1))//(self.x_max+1), index%(self.x_max+1)] 140 | 141 | # grid step max is n*m (grid dimensions) -1 142 | 143 | def _get_state(self): 144 | 145 | return self.terr_angle_grid[self.current_timestep%self.grid_step_max] 146 | 147 | def _get_info(self): 148 | 149 | return self.index2coord(self.current_timestep) 150 | 151 | def _get_reward(self,action): 152 | 153 | # reward factors 154 | 155 | gradient_delta_rf = 0.3 156 | speed_rf = 0.35 157 | height_rf = 0.35 158 | 159 | #logging.warning("the current timestep. 
="+str(self.current_timestep)) 160 | #logging.warning("self.current_timestep%self.grid_step_max = "+ str(self.current_timestep%self.grid_step_max)) 161 | 162 | gradient_delta = abs(self.terr_angle_grid[(self.current_timestep%self.grid_step_max)] - action[0]) # action [1] is the camera angle 163 | 164 | gradient_delta_norm = 1 - gradient_delta/self.max_cam_angle # this will give us a normalised value that rewards less difference 165 | 166 | speed_norm = 1 - action[1]/self.max_speed # speed normalised, and reward less speed 167 | 168 | height_norm = action[2]/self.max_height # height normalised, and more height is better (FOR NOW) 169 | 170 | tmp_reward = gradient_delta_norm*gradient_delta_rf + speed_norm*speed_rf + height_norm*height_rf 171 | 172 | return tmp_reward 173 | 174 | def get_user_reward(self, action, state): 175 | 176 | #init variables. 177 | uinput_reward = None 178 | 179 | print("Given that the:\n\n") 180 | print("Slope is"+str(state[2])+".\n") 181 | print("@("+str(state[0])+","+str(state[1])+")\n\n") 182 | 183 | print("Agent takes actions:\n\n") 184 | print("Camera Angle: "+str(action[0])+"\n") 185 | print("Speed: "+str(action[1])+"\n") 186 | print("Height: "+str(action[2])+"\n\n") 187 | 188 | print("This returns a predicted coverage of:") 189 | tmp = _get_reward(action) 190 | print("\n\n") 191 | 192 | uinput_reward = input("How would you rate the most recent action?") 193 | return uinput_reward 194 | 195 | def reset(self): 196 | # reset should always run at the end of an episode and before the first run. 197 | self.current_timestep = 0 198 | self.current_episode += 1 199 | self.action_episode_memory.append([]) 200 | self.episode_over = False 201 | 202 | self.current_pos = self.index2coord(self.current_timestep) 203 | self.current_pos.append(self.terr_angle_grid[self.current_timestep%self.grid_step_max]) 204 | self.state = list.copy(self.current_pos) 205 | 206 | return self.state 207 | 208 | def _render(self, mode='human', close=False): 209 | return 0 210 | def close(self): 211 | return 0 212 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /custom_scripts/runner.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Tensorforce Team. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY K , either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import time 17 | from tqdm import tqdm 18 | 19 | import numpy as np 20 | 21 | from tensorforce import util 22 | from tensorforce.agents import Agent 23 | from tensorforce.environments import Environment 24 | 25 | 26 | class Runner(object): 27 | 28 | def __init__(self, agent, environment, evaluation_environment=None, save_best_agent=False): 29 | # save_best overwrites saver... 30 | self.is_environment_external = isinstance(environment, Environment) 31 | self.environment = Environment.create(environment=environment) 32 | 33 | self.is_eval_environment_external = isinstance(evaluation_environment, Environment) 34 | if evaluation_environment is None: 35 | self.evaluation_environment = None 36 | else: 37 | self.evaluation_environment = Environment.create(environment=evaluation_environment) 38 | 39 | self.save_best_agent = save_best_agent 40 | self.is_agent_external = isinstance(agent, Agent) 41 | kwargs = dict() 42 | # warning: save_best_agent 43 | if not self.is_agent_external and self.save_best_agent: 44 | # Disable periodic saving 45 | kwargs = dict(saver=dict(seconds=None, steps=None)) 46 | self.agent = Agent.create(agent=agent, environment=self.environment, **kwargs) 47 | if not self.agent.model.is_initialized: 48 | self.agent.initialize() 49 | 50 | self.global_episodes = self.agent.episodes 51 | self.global_timesteps = self.agent.timesteps 52 | self.global_updates = self.agent.updates 53 | self.episode_rewards = list() 54 | self.episode_timesteps = list() 55 | self.episode_seconds = list() 56 | self.episode_agent_seconds = list() 57 | 58 | def close(self): 59 | if hasattr(self, 'tqdm'): 60 | self.tqdm.close() 61 | if not self.is_agent_external: 62 | self.agent.close() 63 | if not self.is_environment_external: 64 | self.environment.close() 65 | if self.evaluation_environment is not None and not self.is_eval_environment_external: 66 | self.evaluation_environment.close() 67 | 68 | # TODO: make average reward another possible criteria for runner-termination 69 | def run( 70 | self, 71 | # General 72 | num_episodes=None, num_timesteps=None, num_updates=None, max_episode_timesteps=None, 73 | num_repeat_actions=1, 74 | # Callback 75 | callback=None, callback_episode_frequency=None, callback_timestep_frequency=None, 76 | # Tqdm 77 | use_tqdm=True, mean_horizon=10, 78 | # Evaluation 79 | evaluation=False, evaluation_callback=None, evaluation_frequency=None, 80 | max_evaluation_timesteps=None, num_evaluation_iterations=1 81 | ): 82 | # General 83 | if num_episodes is None: 84 | self.num_episodes = float('inf') 85 | else: 86 | self.num_episodes = num_episodes 87 | if 
num_timesteps is None: 88 | self.num_timesteps = float('inf') 89 | else: 90 | self.num_timesteps = num_timesteps 91 | if num_updates is None: 92 | self.num_updates = float('inf') 93 | else: 94 | self.num_updates = num_updates 95 | if max_episode_timesteps is None: 96 | self.max_episode_timesteps = float('inf') 97 | else: 98 | self.max_episode_timesteps = max_episode_timesteps 99 | self.num_repeat_actions = num_repeat_actions 100 | 101 | # Callback 102 | assert callback_episode_frequency is None or callback_timestep_frequency is None 103 | if callback_episode_frequency is None and callback_timestep_frequency is None: 104 | callback_episode_frequency = 1 105 | if callback_episode_frequency is None: 106 | self.callback_episode_frequency = float('inf') 107 | else: 108 | self.callback_episode_frequency = callback_episode_frequency 109 | if callback_timestep_frequency is None: 110 | self.callback_timestep_frequency = float('inf') 111 | else: 112 | self.callback_timestep_frequency = callback_timestep_frequency 113 | if callback is None: 114 | self.callback = (lambda r: True) 115 | elif util.is_iterable(x=callback): 116 | def sequential_callback(runner): 117 | result = True 118 | for fn in callback: 119 | x = fn(runner) 120 | if isinstance(result, bool): 121 | result = result and x 122 | return result 123 | self.callback = sequential_callback 124 | else: 125 | def boolean_callback(runner): 126 | result = callback(runner) 127 | if isinstance(result, bool): 128 | return result 129 | else: 130 | return True 131 | self.callback = boolean_callback 132 | 133 | # Tqdm 134 | if use_tqdm: 135 | if hasattr(self, 'tqdm'): 136 | self.tqdm.close() 137 | 138 | assert self.num_episodes != float('inf') or self.num_timesteps != float('inf') 139 | inner_callback = self.callback 140 | 141 | if self.num_episodes != float('inf'): 142 | # Episode-based tqdm (default option if both num_episodes and num_timesteps set) 143 | assert self.num_episodes != float('inf') 144 | bar_format = ( 145 | '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}, reward={postfix[0]:.2f}, ts/ep=' 146 | '{postfix[1]}, sec/ep={postfix[2]:.2f}, ms/ts={postfix[3]:.1f}, agent=' 147 | '{postfix[4]:.1f}%]' 148 | ) 149 | postfix = [0.0, 0, 0.0, 0.0, 0.0] 150 | self.tqdm = tqdm( 151 | desc='Episodes', total=self.num_episodes, bar_format=bar_format, 152 | initial=self.global_episodes, postfix=postfix 153 | ) 154 | self.tqdm_last_update = self.global_episodes 155 | 156 | def tqdm_callback(runner): 157 | mean_reward = float(np.mean(runner.episode_rewards[-mean_horizon:])) 158 | mean_ts_per_ep = int(np.mean(runner.episode_timesteps[-mean_horizon:])) 159 | mean_sec_per_ep = float(np.mean(runner.episode_seconds[-mean_horizon:])) 160 | mean_agent_sec = float(np.mean(runner.episode_agent_seconds[-mean_horizon:])) 161 | mean_ms_per_ts = mean_sec_per_ep * 1000.0 / mean_ts_per_ep 162 | mean_rel_agent = mean_agent_sec * 100.0 / mean_sec_per_ep 163 | runner.tqdm.postfix[0] = mean_reward 164 | runner.tqdm.postfix[1] = mean_ts_per_ep 165 | runner.tqdm.postfix[2] = mean_sec_per_ep 166 | runner.tqdm.postfix[3] = mean_ms_per_ts 167 | runner.tqdm.postfix[4] = mean_rel_agent 168 | runner.tqdm.update(n=(runner.global_episodes - runner.tqdm_last_update)) 169 | runner.tqdm_last_update = runner.global_episodes 170 | return inner_callback(runner) 171 | 172 | else: 173 | # Timestep-based tqdm 174 | assert self.num_timesteps != float('inf') 175 | self.tqdm = tqdm( 176 | desc='Timesteps', total=self.num_timesteps, initial=self.global_timesteps, 177 | postfix=dict(mean_reward='n/a') 
178 | ) 179 | self.tqdm_last_update = self.global_timesteps 180 | 181 | def tqdm_callback(runner): 182 | # sum_timesteps_reward = sum(runner.timestep_rewards[num_mean_reward:]) 183 | # num_timesteps = min(num_mean_reward, runner.episode_timestep) 184 | # mean_reward = sum_timesteps_reward / num_episodes 185 | runner.tqdm.set_postfix(mean_reward='n/a') 186 | runner.tqdm.update(n=(runner.global_timesteps - runner.tqdm_last_update)) 187 | runner.tqdm_last_update = runner.global_timesteps 188 | return inner_callback(runner) 189 | 190 | self.callback = tqdm_callback 191 | 192 | # Evaluation 193 | self.evaluation = evaluation 194 | if evaluation_callback is None: 195 | self.evaluation_callback = (lambda r: None) 196 | else: 197 | assert not self.evaluation 198 | self.evaluation_callback = evaluation_callback 199 | self.evaluation_frequency = evaluation_frequency 200 | if max_evaluation_timesteps is None: 201 | self.max_evaluation_timesteps = float('inf') 202 | else: 203 | assert not self.evaluation 204 | self.max_evaluation_timesteps = max_evaluation_timesteps 205 | self.num_evaluation_iterations = num_evaluation_iterations 206 | if self.save_best_agent: 207 | assert not self.evaluation 208 | inner_evaluation_callback = self.evaluation_callback 209 | 210 | def mean_reward_callback(runner): 211 | result = inner_evaluation_callback(runner) 212 | if result is None: 213 | return float(np.mean(runner.evaluation_rewards)) 214 | else: 215 | return result 216 | 217 | self.evaluation_callback = mean_reward_callback 218 | self.best_evaluation_score = None 219 | 220 | # Reset agent 221 | self.agent.reset() 222 | 223 | # Timestep/episode/update counter 224 | self.timesteps = 0 225 | self.episodes = 0 226 | self.updates = 0 227 | self.interactive = bool(int(input("\nWould you like this run to use user inputs? 
0 - No, 1 - Yes -- "))) 228 | 229 | # Episode loop 230 | while True: 231 | # Run episode 232 | if not self.run_episode( 233 | environment=self.environment, max_timesteps=self.max_episode_timesteps, 234 | evaluation=self.evaluation 235 | ): 236 | return 237 | 238 | # Increment episode counter (after calling callback) 239 | self.episodes += 1 240 | 241 | # Update experiment statistics 242 | self.episode_rewards.append(self.episode_reward) 243 | self.episode_timesteps.append(self.episode_timestep) 244 | self.episode_seconds.append(self.episode_second) 245 | self.episode_agent_seconds.append(self.episode_agent_second) 246 | 247 | # Run evaluation 248 | if self.evaluation_frequency is None: 249 | is_evaluation = self.episode_updated 250 | else: 251 | is_evaluation = (self.episodes % self.evaluation_frequency == 0) 252 | if is_evaluation: 253 | if self.evaluation_environment is None: 254 | environment = self.environment 255 | else: 256 | environment = self.evaluation_environment 257 | 258 | self.evaluation_rewards = list() 259 | self.evaluation_timesteps = list() 260 | self.evaluation_seconds = list() 261 | self.evaluation_agent_seconds = list() 262 | 263 | # Evaluation loop 264 | for _ in range(self.num_evaluation_iterations): 265 | self.run_episode( 266 | environment=environment, max_timesteps=self.max_evaluation_timesteps, 267 | evaluation=True 268 | ) 269 | 270 | self.evaluation_rewards.append(self.episode_reward) 271 | self.evaluation_timesteps.append(self.episode_timestep) 272 | self.evaluation_seconds.append(self.episode_second) 273 | self.evaluation_agent_seconds.append(self.episode_agent_second) 274 | 275 | # Evaluation callback 276 | if self.save_best_agent: 277 | evaluation_score = self.evaluation_callback(self) 278 | assert isinstance(evaluation_score, float) 279 | if self.best_evaluation_score is None: 280 | self.best_evaluation_score = evaluation_score 281 | elif evaluation_score > self.best_evaluation_score: 282 | self.best_evaluation_score = evaluation_score 283 | self.agent.save(filename='best-model', append_timestep=False) 284 | else: 285 | self.evaluation_callback(self) 286 | 287 | # Update global timestep/episode/update 288 | self.global_timesteps = self.agent.timesteps 289 | self.global_episodes = self.agent.episodes 290 | self.global_updates = self.agent.updates 291 | 292 | # Callback 293 | if self.episodes % self.callback_episode_frequency == 0 and not self.callback(self): 294 | return 295 | 296 | # Terminate experiment if too long 297 | if self.global_timesteps >= self.num_timesteps: 298 | return 299 | elif self.evaluation and self.timesteps >= self.num_timesteps: 300 | return 301 | elif self.global_episodes >= self.num_episodes: 302 | return 303 | elif self.evaluation and self.episodes >= self.num_episodes: 304 | return 305 | elif self.global_updates >= self.num_updates: 306 | return 307 | elif self.evaluation and self.updates >= self.num_updates: 308 | return 309 | elif self.agent.should_stop(): 310 | return 311 | 312 | def action_rank(self, states, evaluation): 313 | 314 | action_buffer = [] 315 | print("*********************") 316 | print("*********************") 317 | print("\n%------------------------") 318 | print("% STATE @ STEP# "+str(states[0]*states[1])) 319 | print("%------------------------\n") 320 | print("Slope: "+str(states[2])+" --- @("+str(states[0])+","+str(states[1])+")") 321 | 322 | for _ in range(0,4): 323 | 324 | # here,independent is TRUE because in the normal pipeline you would have to observe 325 | # after taking an action, but we are simply 
sampling actions. 326 | tmp_action = self.agent.act(states=states, independent = True, evaluation = False) 327 | 328 | print("\n%------------------------") 329 | print("% ACTION "+str(_+1)) 330 | print("%------------------------\n") 331 | 332 | print("Camera Angle: "+str(tmp_action[0])) 333 | print("Speed: "+str(tmp_action[1])) 334 | print("Height: "+str(tmp_action[2])) 335 | 336 | action_buffer.append(tmp_action) 337 | 338 | action_choice = int(input("\nPlease select the optimal action (1-4): ")) - 1 339 | while action_choice>4 or action_choice<0: 340 | action_choice = int(input("\nPlease select the optimal action (1-4): ")) - 1 341 | 342 | 343 | return action_buffer[action_choice] 344 | 345 | def run_episode(self, environment, max_timesteps, evaluation): 346 | # Episode statistics 347 | self.episode_reward = 0 348 | self.episode_timestep = 0 349 | self.episode_updated = False 350 | self.episode_agent_second = 0.0 351 | episode_start = time.time() 352 | 353 | # Start environment episode 354 | states = environment.reset() 355 | 356 | # Timestep loop 357 | while True: 358 | # Retrieve actions from agent 359 | agent_start = time.time() 360 | # user action only runs for the first episodes: only 50 steps 361 | if self.agent.episodes == 0 and self.interactive: 362 | print(self.interactive) 363 | print(self.agent.episodes) 364 | user_action = self.action_rank(states=states, evaluation=evaluation) 365 | else: 366 | user_action = 0 367 | # run with selected action 368 | if self.agent.episodes > 0: 369 | self.interactive = False 370 | actions = self.agent.act(states=states, evaluation=evaluation, int_bool = self.interactive, int_act = user_action) 371 | self.timesteps += 1 372 | self.episode_agent_second += time.time() - agent_start 373 | self.episode_timestep += 1 374 | # Execute actions in environment (optional repeated execution) 375 | reward = 0.0 376 | for _ in range(self.num_repeat_actions): 377 | states, terminal, step_reward = environment.execute(actions=actions) 378 | if isinstance(terminal, bool): 379 | terminal = int(terminal) 380 | reward += step_reward 381 | if terminal > 0: 382 | break 383 | self.episode_reward += reward 384 | 385 | # Terminate episode if too long 386 | if self.episode_timestep >= max_timesteps: 387 | terminal = 2 388 | 389 | # Observe unless evaluation 390 | if not evaluation: 391 | agent_start = time.time() 392 | updated = self.agent.observe(terminal=terminal, reward=reward) 393 | self.updates += int(updated) 394 | self.episode_agent_second += time.time() - agent_start 395 | self.episode_updated = self.episode_updated or updated 396 | 397 | 398 | 399 | # Callback 400 | if self.episode_timestep % self.callback_timestep_frequency == 0 and \ 401 | not self.callback(self): 402 | return False 403 | 404 | # Episode termination check 405 | if terminal > 0: 406 | break 407 | 408 | # No callbacks for evaluation 409 | if evaluation: 410 | continue 411 | 412 | # Update global timestep/episode/update 413 | self.global_timesteps = self.agent.timesteps 414 | self.global_episodes = self.agent.episodes 415 | self.global_updates = self.agent.updates 416 | 417 | # Terminate experiment if too long 418 | if self.global_timesteps >= self.num_timesteps: 419 | return 420 | elif self.global_episodes >= self.num_episodes: 421 | return 422 | elif self.global_updates >= self.num_updates: 423 | return 424 | elif self.agent.should_stop(): 425 | return False 426 | 427 | # Update episode statistics 428 | self.episode_second = time.time() - episode_start 429 | 430 | return True 431 | 
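
A note on the selection prompt in `action_rank` above: after subtracting 1, a valid choice must map to an index 0-3, yet the guard `action_choice>4 or action_choice<0` still accepts an entry of 5 (index 4), which would raise an IndexError on the four-element `action_buffer`, and a non-numeric entry crashes the bare `int(input(...))` call. A minimal sketch of a stricter prompt helper, assuming the same 1-4 convention (`prompt_action_choice` is illustrative, not part of this repository):

    def prompt_action_choice(num_options=4):
        # Keep prompting until the user enters an integer in [1, num_options];
        # return the corresponding zero-based index into action_buffer.
        while True:
            raw = input("\nPlease select the optimal action (1-{}): ".format(num_options))
            try:
                choice = int(raw) - 1
            except ValueError:
                continue
            if 0 <= choice < num_options:
                return choice

With such a helper, the tail of `action_rank` reduces to `return action_buffer[prompt_action_choice()]` while tolerating malformed input.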
-------------------------------------------------------------------------------- /custom_scripts/agent.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Tensorforce Team. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | from collections import OrderedDict 17 | import importlib 18 | import json 19 | import logging 20 | import os 21 | import random 22 | import time 23 | 24 | import numpy as np 25 | import tensorflow as tf 26 | 27 | from tensorforce import util, TensorforceError 28 | import tensorforce.agents 29 | 30 | 31 | class Agent(object): 32 | """ 33 | Tensorforce agent interface. 34 | """ 35 | 36 | @staticmethod 37 | def create(agent=None, environment=None, **kwargs): 38 | """ 39 | Creates an agent from a specification. 40 | 41 | Args: 42 | agent (specification): JSON file, specification key, configuration dictionary, 43 | library module, or `Agent` subclass 44 | (default: Policy agent). 45 | environment (Environment): Environment which the agent is supposed to be trained on, 46 | environment-related arguments like state/action space specifications will be 47 | extract if given. 48 | kwargs: Additional arguments. 49 | """ 50 | if agent is None: 51 | agent = 'default' 52 | 53 | if isinstance(agent, Agent): 54 | # TODO: asserts??????? 55 | return agent 56 | 57 | elif isinstance(agent, dict): 58 | # Dictionary specification 59 | util.deep_disjoint_update(target=kwargs, source=agent) 60 | agent = kwargs.pop('agent', kwargs.pop('type', 'default')) 61 | 62 | return Agent.create(agent=agent, environment=environment, **kwargs) 63 | 64 | elif isinstance(agent, str): 65 | if os.path.isfile(agent): 66 | # JSON file specification 67 | with open(agent, 'r') as fp: 68 | agent = json.load(fp=fp) 69 | 70 | util.deep_disjoint_update(target=kwargs, source=agent) 71 | agent = kwargs.pop('agent', kwargs.pop('type', 'default')) 72 | 73 | return Agent.create(agent=agent, environment=environment, **kwargs) 74 | 75 | elif '.' 
in agent: 76 | # Library specification 77 | library_name, module_name = agent.rsplit('.', 1) 78 | library = importlib.import_module(name=library_name) 79 | agent = getattr(library, module_name) 80 | 81 | if environment is not None: 82 | env_spec = dict(states=environment.states(), actions=environment.actions()) 83 | if environment.max_episode_timesteps() is not None: 84 | env_spec['max_episode_timesteps'] = environment.max_episode_timesteps() 85 | util.deep_disjoint_update(target=kwargs, source=env_spec) 86 | 87 | agent = agent(**kwargs) 88 | assert isinstance(agent, Agent) 89 | 90 | return agent 91 | 92 | else: 93 | # Keyword specification 94 | if environment is not None: 95 | env_spec = dict(states=environment.states(), actions=environment.actions()) 96 | if environment.max_episode_timesteps() is not None: 97 | env_spec['max_episode_timesteps'] = environment.max_episode_timesteps() 98 | util.deep_disjoint_update(target=kwargs, source=env_spec) 99 | 100 | agent = tensorforce.agents.agents[agent](**kwargs) 101 | assert isinstance(agent, Agent) 102 | 103 | return agent 104 | 105 | else: 106 | assert False 107 | 108 | def __init__( 109 | # Environment 110 | self, states, actions, max_episode_timesteps=None, 111 | # TensorFlow etc 112 | parallel_interactions=1, buffer_observe=True, seed=None, recorder=None 113 | ): 114 | if seed is not None: 115 | assert isinstance(seed, int) 116 | random.seed(n=seed) 117 | np.random.seed(seed=seed) 118 | tf.random.set_random_seed(seed=seed) 119 | 120 | # States/actions specification 121 | self.states_spec = util.valid_values_spec( 122 | values_spec=states, value_type='state', return_normalized=True 123 | ) 124 | self.actions_spec = util.valid_values_spec( 125 | values_spec=actions, value_type='action', return_normalized=True 126 | ) 127 | self.max_episode_timesteps = max_episode_timesteps 128 | 129 | # Check for name overlap 130 | for name in self.states_spec: 131 | if name in self.actions_spec: 132 | TensorforceError.collision( 133 | name='name', value=name, group1='states', group2='actions' 134 | ) 135 | 136 | # Parallel episodes 137 | if isinstance(parallel_interactions, int): 138 | if parallel_interactions <= 0: 139 | raise TensorforceError.value( 140 | name='parallel_interactions', value=parallel_interactions 141 | ) 142 | self.parallel_interactions = parallel_interactions 143 | else: 144 | raise TensorforceError.type(name='parallel_interactions', value=parallel_interactions) 145 | 146 | # Buffer observe 147 | if isinstance(buffer_observe, bool): 148 | if not buffer_observe and self.parallel_interactions > 1: 149 | raise TensorforceError.unexpected() 150 | if self.max_episode_timesteps is None and self.parallel_interactions > 1: 151 | raise TensorforceError.unexpected() 152 | if not buffer_observe: 153 | self.buffer_observe = 1 154 | elif self.max_episode_timesteps is None: 155 | self.buffer_observe = 100 156 | else: 157 | self.buffer_observe = self.max_episode_timesteps 158 | elif isinstance(buffer_observe, int): 159 | if buffer_observe <= 0: 160 | raise TensorforceError.value(name='buffer_observe', value=buffer_observe) 161 | if self.parallel_interactions > 1: 162 | raise TensorforceError.unexpected() 163 | if self.max_episode_timesteps is None: 164 | self.buffer_observe = buffer_observe 165 | else: 166 | self.buffer_observe = min(buffer_observe, self.max_episode_timesteps) 167 | else: 168 | raise TensorforceError.type(name='buffer_observe', value=buffer_observe) 169 | 170 | # Parallel terminal/reward buffers 171 | self.terminal_buffers = np.ndarray( 
172 | shape=(self.parallel_interactions, self.buffer_observe), 173 | dtype=util.np_dtype(dtype='long') 174 | ) 175 | self.reward_buffers = np.ndarray( 176 | shape=(self.parallel_interactions, self.buffer_observe), 177 | dtype=util.np_dtype(dtype='float') 178 | ) 179 | 180 | # Parallel buffer indices 181 | self.buffer_indices = np.zeros( 182 | shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='int') 183 | ) 184 | 185 | self.timesteps = 0 186 | self.episodes = 0 187 | self.updates = 0 188 | 189 | # Recorder 190 | if recorder is None: 191 | pass 192 | elif not all(key in ('directory', 'frequency', 'max-traces') for key in recorder): 193 | raise TensorforceError.value(name='recorder', value=list(recorder)) 194 | self.recorder_spec = recorder 195 | if self.recorder_spec is not None: 196 | self.record_states = OrderedDict(((name, list()) for name in self.states_spec)) 197 | for name, spec in self.actions_spec.items(): 198 | if spec['type'] == 'int': 199 | self.record_states[name + '_mask'] = list() 200 | self.record_actions = OrderedDict(((name, list()) for name in self.actions_spec)) 201 | self.record_terminal = list() 202 | self.record_reward = list() 203 | self.num_episodes = 0 204 | 205 | def __str__(self): 206 | return self.__class__.__name__ 207 | 208 | def initialize(self): 209 | """ 210 | Initializes the agent. 211 | """ 212 | if not hasattr(self, 'model'): 213 | raise TensorforceError.missing(name='Agent', value='model') 214 | 215 | # Setup Model 216 | # (create and build graph (local and global if distributed), server, session, etc..). 217 | self.model.initialize() 218 | self.reset() 219 | 220 | def close(self): 221 | """ 222 | Closes the agent. 223 | """ 224 | self.model.close() 225 | 226 | def reset(self): 227 | """ 228 | Resets the agent to start a new episode. 229 | """ 230 | self.buffer_indices = np.zeros( 231 | shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='int') 232 | ) 233 | self.timesteps, self.episodes, self.updates = self.model.reset() 234 | 235 | def act( 236 | self, states, int_bool=False, int_act=None, parallel=0, deterministic=False, independent=False, evaluation=False, 237 | query=None, **kwargs 238 | ): 239 | """ 240 | Returns action(s) for the given state(s), needs to be followed by `observe(...)` unless 241 | `independent` is true. 242 | 243 | Args: 244 | states (dict[state]): Dictionary containing state(s) to be acted on 245 | (required). 246 | parallel (int): Parallel execution index 247 | (default: 0). 248 | deterministic (bool): Whether to apply exploration and sampling 249 | (default: false). 250 | independent (bool): Whether action is not remembered, and this call is thus not 251 | followed by observe 252 | (default: false). 253 | evaluation (bool): Whether the agent is currently evaluated, implies and overwrites 254 | deterministic and independent 255 | (default: false). 256 | query (list[str]): Names of tensors to retrieve 257 | (default: none). 258 | kwargs: Additional input values, for instance, for dynamic hyperparameters. 259 | 260 | Returns: 261 | (dict[action], plus optional list[str]): Dictionary containing action(s), plus queried 262 | tensor values if requested. 
263 | """ 264 | assert util.reduce_all(predicate=util.not_nan_inf, xs=states) 265 | 266 | ''' 267 | # ***** NING EDIT **** 268 | if kwargs is not None: 269 | extra = dict(kwargs) 270 | inre = False 271 | for item in extra: 272 | if item=='inre': 273 | inre = True 274 | interactive_action = extra['interactive_action'] 275 | else: 276 | inre = False 277 | # ****** NING EDIT END *** 278 | ''' 279 | 280 | # self.current_internals = self.next_internals 281 | if evaluation: 282 | if deterministic or independent: 283 | raise TensorforceError.unexpected() 284 | deterministic = independent = True 285 | 286 | # Auxiliaries 287 | auxiliaries = OrderedDict() 288 | if isinstance(states, dict): 289 | states = dict(states) 290 | for name, spec in self.actions_spec.items(): 291 | if spec['type'] == 'int' and name + '_mask' in states: 292 | auxiliaries[name + '_mask'] = states.pop(name + '_mask') 293 | 294 | # Normalize states dictionary 295 | states = util.normalize_values( 296 | value_type='state', values=states, values_spec=self.states_spec 297 | ) 298 | 299 | # Batch states 300 | states = util.fmap(function=(lambda x: np.asarray([x])), xs=states, depth=1) 301 | auxiliaries = util.fmap(function=(lambda x: np.asarray([x])), xs=auxiliaries, depth=1) 302 | 303 | # Model.act() 304 | if query is None: 305 | actions, self.timesteps = self.model.act( 306 | states=states, auxiliaries=auxiliaries, parallel=parallel, 307 | deterministic=deterministic, independent=independent, **kwargs 308 | ) 309 | 310 | else: 311 | actions, self.timesteps, queried = self.model.act( 312 | states=states, auxiliaries=auxiliaries, parallel=parallel, 313 | deterministic=deterministic, independent=independent, query=query, **kwargs 314 | ) 315 | 316 | 317 | 318 | if int_bool: 319 | print("\nModel action is:") 320 | print(actions['action']) 321 | actions['action'] = [int_act] 322 | print("\nUser action is:") 323 | print(actions['action']) 324 | 325 | 326 | if self.recorder_spec is not None and not independent: 327 | for name in self.states_spec: 328 | self.record_states[name].append(states[name]) 329 | for name, spec in self.actions_spec.items(): 330 | self.record_actions[name].append(actions[name]) 331 | if spec['type'] == 'int': 332 | if name + '_mask' in auxiliaries: 333 | self.record_states[name].append(auxiliaries[name + '_mask']) 334 | else: 335 | shape = (1,) + spec['shape'] + (spec['num_values'],) 336 | self.record_states[name].append( 337 | np.full(shape, True, dtype=util.np_dtype(dtype='bool')) 338 | ) 339 | 340 | # Unbatch actions 341 | actions = util.fmap(function=(lambda x: x[0]), xs=actions, depth=1) 342 | 343 | # Reverse normalized actions dictionary 344 | actions = util.unpack_values( 345 | value_type='action', values=actions, values_spec=self.actions_spec 346 | ) 347 | 348 | # if independent, return processed state as well? 349 | 350 | if query is None: 351 | return actions 352 | else: 353 | return actions, queried 354 | 355 | def observe(self, reward, terminal=False, parallel=0, query=None, **kwargs): 356 | """ 357 | Observes reward and whether a terminal state is reached, needs to be preceded by 358 | `act(...)`. 359 | 360 | Args: 361 | reward (float): Reward 362 | (required). 363 | terminal (bool | 0 | 1 | 2): Whether a terminal state is reached or 2 if the 364 | episode was aborted (default: false). 365 | parallel (int): Parallel execution index 366 | (default: 0). 367 | query (list[str]): Names of tensors to retrieve 368 | (default: none). 
369 | kwargs: Additional input values, for instance, for dynamic hyperparameters. 370 | 371 | Returns: 372 | (bool, optional list[str]): Whether an update was performed, plus queried tensor values 373 | if requested. 374 | """ 375 | assert util.reduce_all(predicate=util.not_nan_inf, xs=reward) 376 | 377 | if query is not None and self.parallel_interactions > 1: 378 | raise TensorforceError.unexpected() 379 | 380 | if isinstance(terminal, bool): 381 | terminal = int(terminal) 382 | 383 | if self.recorder_spec is not None: 384 | self.record_terminal.append(terminal) 385 | self.record_reward.append(reward) 386 | if terminal > 0: 387 | self.num_episodes += 1 388 | 389 | if self.num_episodes == self.recorder_spec.get('frequency', 1): 390 | directory = self.recorder_spec['directory'] 391 | if os.path.isdir(directory): 392 | files = sorted( 393 | f for f in os.listdir(directory) 394 | if os.path.isfile(os.path.join(directory, f)) 395 | and f.startswith('trace-') 396 | ) 397 | else: 398 | os.makedirs(directory) 399 | files = list() 400 | max_traces = self.recorder_spec.get('max-traces') 401 | if max_traces is not None and len(files) > max_traces - 1: 402 | for filename in files[:-max_traces + 1]: 403 | filename = os.path.join(directory, filename) 404 | os.remove(filename) 405 | 406 | filename = 'trace-{}-{}.npz'.format( 407 | self.episodes, time.strftime('%Y%m%d-%H%M%S') 408 | ) 409 | filename = os.path.join(directory, filename) 410 | self.record_states = util.fmap( 411 | function=np.concatenate, xs=self.record_states, depth=1 412 | ) 413 | self.record_actions = util.fmap( 414 | function=np.concatenate, xs=self.record_actions, depth=1 415 | ) 416 | self.record_terminal = np.asarray(self.record_terminal) 417 | self.record_reward = np.asarray(self.record_reward) 418 | np.savez_compressed( 419 | filename, **self.record_states, **self.record_actions, 420 | terminal=self.record_terminal, reward=self.record_reward 421 | ) 422 | self.record_states = util.fmap( 423 | function=(lambda x: list()), xs=self.record_states, depth=1 424 | ) 425 | self.record_actions = util.fmap( 426 | function=(lambda x: list()), xs=self.record_actions, depth=1 427 | ) 428 | self.record_terminal = list() 429 | self.record_reward = list() 430 | self.num_episodes = 0 431 | 432 | # Update terminal/reward buffer 433 | index = self.buffer_indices[parallel] 434 | self.terminal_buffers[parallel, index] = terminal 435 | self.reward_buffers[parallel, index] = reward 436 | index += 1 437 | 438 | if self.max_episode_timesteps is not None and index > self.max_episode_timesteps: 439 | raise TensorforceError.unexpected() 440 | 441 | if terminal > 0 or index == self.buffer_observe or query is not None: 442 | # Model.observe() 443 | if query is None: 444 | updated, self.episodes, self.updates = self.model.observe( 445 | terminal=self.terminal_buffers[parallel, :index], 446 | reward=self.reward_buffers[parallel, :index], parallel=parallel, **kwargs 447 | ) 448 | 449 | else: 450 | updated, self.episodes, self.updates, queried = self.model.observe( 451 | terminal=self.terminal_buffers[parallel, :index], 452 | reward=self.reward_buffers[parallel, :index], parallel=parallel, query=query, 453 | **kwargs 454 | ) 455 | 456 | # Reset buffer index 457 | self.buffer_indices[parallel] = 0 458 | 459 | else: 460 | # Increment buffer index 461 | self.buffer_indices[parallel] = index 462 | updated = False 463 | 464 | if query is None: 465 | return updated 466 | else: 467 | return updated, queried 468 | 469 | def save(self, directory=None, filename=None, 
append_timestep=True): 470 | """ 471 | Saves the current state of the agent. 472 | 473 | Args: 474 | directory (str): Checkpoint directory 475 | (default: directory specified for 476 | TensorFlow saver). 477 | filename (str): Checkpoint filename 478 | (default: filename specified for 479 | TensorFlow saver). 480 | append_timestep: Whether to append the current timestep to the checkpoint file 481 | (default: true). 482 | 483 | Returns: 484 | str: Checkpoint path. 485 | """ 486 | # TODO: Messes with required parallel disentangling, better to remove unfinished episodes 487 | # from memory, but currently entire episode buffered anyway... 488 | # # Empty buffers before saving 489 | # for parallel in range(self.parallel_interactions): 490 | # index = self.buffer_indices[parallel] 491 | # if index > 0: 492 | # # if self.parallel_interactions > 1: 493 | # # raise TensorforceError.unexpected() 494 | # self.episode = self.model.observe( 495 | # terminal=self.terminal_buffers[parallel, :index], 496 | # reward=self.reward_buffers[parallel, :index], parallel=parallel 497 | # ) 498 | # self.buffer_indices[parallel] = 0 499 | 500 | return self.model.save( 501 | directory=directory, filename=filename, append_timestep=append_timestep 502 | ) 503 | 504 | def restore(self, directory=None, filename=None): 505 | """ 506 | Restores the agent. 507 | 508 | Args: 509 | directory (str): Checkpoint directory 510 | (default: directory specified for 511 | TensorFlow saver). 512 | filename (str): Checkpoint filename 513 | (default: latest checkpoint in 514 | directory). 515 | """ 516 | if not hasattr(self, 'model'): 517 | raise TensorforceError.missing(name='Agent', value='model') 518 | 519 | if not self.model.is_initialized: 520 | self.model.initialize() 521 | 522 | self.timesteps, self.episodes, self.updates = self.model.restore( 523 | directory=directory, filename=filename 524 | ) 525 | 526 | def get_output_tensors(self, function): 527 | """ 528 | Returns the names of output tensors for the given function. 529 | 530 | Args: 531 | function (str): Function name 532 | (required). 533 | 534 | Returns: 535 | list[str]: Names of output tensors. 536 | """ 537 | if function in self.model.output_tensors: 538 | return self.model.output_tensors[function] 539 | else: 540 | raise TensorforceError.unexpected() 541 | 542 | def get_query_tensors(self, function): 543 | """ 544 | Returns the names of queryable tensors for the given function. 545 | 546 | Args: 547 | function (str): Function name 548 | (required). 549 | 550 | Returns: 551 | list[str]: Names of queryable tensors. 552 | """ 553 | if function in self.model.query_tensors: 554 | return self.model.query_tensors[function] 555 | else: 556 | raise TensorforceError.unexpected() 557 | 558 | def get_available_summaries(self): 559 | """ 560 | Returns the summary labels provided by the agent. 561 | 562 | Returns: 563 | list[str]: Available summary labels. 564 | """ 565 | return self.model.get_available_summaries() 566 | 567 | def should_stop(self): 568 | return self.model.monitored_session.should_stop() 569 | --------------------------------------------------------------------------------
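
Taken together, the `int_bool`/`int_act` arguments added to `Agent.act(...)` above are what let the runner substitute a user-ranked action for the model's sampled one during the first, interactive episode. The following is a minimal single-episode sketch of that act/observe cycle; the function name `run_interactive_episode` and the `agent`, `runner`, and `env` objects it takes are illustrative placeholders (the modified Agent, the custom Runner, and a Tensorforce-style environment exposing reset()/execute()), not part of this repository:

    def run_interactive_episode(agent, runner, env, interactive=True):
        # Sketch of the timestep loop in Runner.run_episode, reduced to its essentials.
        states = env.reset()
        terminal = 0
        while terminal == 0:
            if interactive and agent.episodes == 0:
                # Let the user rank sampled actions, as in Runner.action_rank above,
                # and have act() substitute the chosen action via int_bool/int_act.
                user_action = runner.action_rank(states=states, evaluation=False)
                actions = agent.act(states=states, int_bool=True, int_act=user_action)
            else:
                actions = agent.act(states=states)
            states, terminal, reward = env.execute(actions=actions)
            if isinstance(terminal, bool):
                terminal = int(terminal)
            # observe() buffers the result and triggers an update when appropriate.
            agent.observe(terminal=terminal, reward=reward)

Because `observe()` only flushes its buffer on a terminal signal or a full buffer, the update frequency seen by the caller depends on `buffer_observe` and `max_episode_timesteps` as configured in the Agent constructor above.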