├── custom_scripts
│   ├── base
│   ├── ppo.py
│   ├── runner.py
│   └── agent.py
├── RL_main_scripts.zip
├── gym_drone
│   ├── envs
│   │   ├── __init__.py
│   │   ├── drone_env.py
│   │   └── droneint_env.py
│   └── __init__.py
├── setup.py
├── report_tester_mayhem.py
├── report_tester.py
├── README.md
├── runtime.py
└── LICENSE

--------------------------------------------------------------------------------
/custom_scripts/base:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/RL_main_scripts.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JNC96/drone-gym/HEAD/RL_main_scripts.zip
--------------------------------------------------------------------------------
/gym_drone/envs/__init__.py:
--------------------------------------------------------------------------------
from gym_drone.envs.drone_env import DroneEnv
from gym_drone.envs.droneint_env import DroneIntEnv
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

setup(name='gym_drone',
      version='0.0.1',
      install_requires=['gym']  # and any other dependencies the package needs
)
--------------------------------------------------------------------------------
/gym_drone/__init__.py:
--------------------------------------------------------------------------------
from gym.envs.registration import register

register(
    id='drone-v0',
    entry_point='gym_drone.envs:DroneEnv',
    max_episode_steps=50,
    reward_threshold=0.95,
    nondeterministic=False
)

register(
    id='droneInt-v0',
    entry_point='gym_drone.envs:DroneIntEnv',
    max_episode_steps=50,
    reward_threshold=0.95,
    nondeterministic=False,
)
--------------------------------------------------------------------------------
/report_tester_mayhem.py:
--------------------------------------------------------------------------------
# Snippet: PPO agent configuration (expects `env` and `from tensorforce.agents import Agent`).
agent = Agent.create(
    agent='ppo', environment=env,
    # Automatically configured network
    network='auto',
    # Optimization
    batch_size=10, update_frequency=2, learning_rate=1e-3, subsampling_fraction=0.2,
    optimization_steps=5,
    # Reward estimation
    likelihood_ratio_clipping=0.2, discount=0.99, estimate_terminal=False,
    # Critic
    critic_network='auto',
    critic_optimizer=dict(optimizer='adam', multi_step=10, learning_rate=1e-3),
    # Preprocessing
    preprocessing=None,
    # Exploration
    exploration=0.0, variable_noise=0.0,
    # Regularization
    l2_regularization=0.0, entropy_regularization=0.0,
    # TensorFlow etc
    name='agent', device=None, parallel_interactions=1, seed=None, execution=None, saver=None,
    summarizer=dict(
        directory=r"D:\summ",  # raw string so the backslash is not treated as an escape
        labels="all"
    ),
    recorder=None
)
--------------------------------------------------------------------------------
/report_tester.py:
--------------------------------------------------------------------------------
# Snippet: interactive action selection inside Runner.run_episode (see custom_scripts/runner.py).
if self.agent.episodes == 0 and self.interactive:
    print(self.interactive)
    print(self.agent.episodes)
    user_action = self.action_rank(states=states, evaluation=evaluation)
else:
    user_action = 0
# run with selected action
if self.agent.episodes > 0:
    self.interactive = False
actions = self.agent.act(states=states, evaluation=evaluation, int_bool=self.interactive,
                         int_act=user_action)

----------

def action_rank(self, states, evaluation):

    action_buffer = []
    print("*********************")
    print("*********************")
    print("\n%------------------------")
    print("% STATE @ STEP# "+str(states[0]*states[1]))
    print("%------------------------\n")
    print("Slope: "+str(states[2])+" --- @("+str(states[0])+","+str(states[1])+")")

    for _ in range(0, 4):

        # here, independent is True because in the normal pipeline you would have to observe
        # after taking an action, but we are simply sampling actions.
        tmp_action = self.agent.act(states=states, independent=True, evaluation=False)

        print("\n%------------------------")
        print("% ACTION "+str(_+1))
        print("%------------------------\n")

        print("Camera Angle: "+str(tmp_action[0]))
        print("Speed: "+str(tmp_action[1]))
        print("Height: "+str(tmp_action[2]))

        action_buffer.append(tmp_action)

    action_choice = int(input("\nPlease select the optimal action (1-4): ")) - 1
    # valid zero-based indices are 0-3
    while action_choice > 3 or action_choice < 0:
        action_choice = int(input("\nPlease select the optimal action (1-4): ")) - 1

    return action_buffer[action_choice]

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[![BuildStatus][build-status]][ci-server]
[![PackageVersion][pypi-version]][pypi-home]
[![PythonVersion][python-version]][python-home]
[![Stable][pypi-status]][pypi-home]
[![Format][pypi-format]][pypi-home]

[build-status]:
[ci-server]:
[pypi-version]:
[pypi-license]:
[pypi-status]:
[pypi-format]:
[pypi-home]:
[python-version]:
[python-home]: https://python.org



# Drone Gym Environment

This repository contains a pip package providing an OpenAI Gym environment for a drone that learns via reinforcement learning. It also introduces the concept of Interactive Reinforcement Learning (IRL) on top of this environment.

# Installation

Install OpenAI Gym (``pip install gym``).

Then install this package from the repository root via ``pip install -e .``

Then, make the environment:

    import gym
    import gym_drone  # importing gym_drone registers drone-v0 and droneInt-v0

    env = gym.make('drone-v0')

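As a quick smoke test, the sketch below (a minimal sketch, assuming the classic Gym API this package was written against, where ``reset()`` returns the state and ``step()`` returns a 4-tuple) runs one episode with random actions. The observation is ``[x-pos, y-pos, terrain angle]`` and an action is ``[camera angle, speed, height]``, as defined in ``gym_drone/envs/drone_env.py``:

    import gym
    import gym_drone  # noqa: F401 -- registers the custom environments

    env = gym.make('drone-v0')

    state = env.reset()                     # [x-pos, y-pos, terrain angle]
    done, total_reward = False, 0.0
    while not done:
        action = env.action_space.sample()  # [camera angle, speed, height]
        state, reward, done, info = env.step(action)
        total_reward += reward
    print("episode return:", total_reward)
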
See https://github.com/matthiasplappert/keras-rl/tree/master/examples for some examples.

# Dependencies

The project depends heavily on TensorForce (see: https://github.com/tensorforce); the environment itself is built on OpenAI Gym (see: https://gym.openai.com/).

Special thanks to Alexander Kuhnle for his help in developing this.

# The Environment

The environment is a custom environment built on the framework defined by OpenAI Gym. It contains a grid of terrain gradient values. The reward is the predicted coverage, computed as a weighted linear function of the actions taken by the agent (camera angle, speed and height).

# IRL

The main purpose of this system is to investigate how human interaction affects the traditional reinforcement learning loop. Custom scripts were written to facilitate this, and several TensorForce scripts were modified as well. These can be found in the ``custom_scripts`` folder and must be manually copied into the TensorForce package directory (a usage sketch follows the modified ``runner.py`` listing below).

Created by Jia Ning Choo 2019 (https://github.com/jnc96).
--------------------------------------------------------------------------------
/custom_scripts/ppo.py:
--------------------------------------------------------------------------------
# Copyright 2018 Tensorforce Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from tensorforce.agents import PolicyAgent


class ProximalPolicyOptimization(PolicyAgent):
    """
    [Proximal Policy Optimization](https://arxiv.org/abs/1707.06347) agent (specification key:
    `ppo`).
23 | """ 24 | 25 | def __init__( 26 | # Environment 27 | self, states, actions, max_episode_timesteps, 28 | # Network 29 | network='auto', 30 | # Optimization 31 | batch_size=10, update_frequency=None, learning_rate=3e-4, subsampling_fraction=0.33, 32 | optimization_steps=10, 33 | # Reward estimation 34 | likelihood_ratio_clipping=0.2, discount=0.99, estimate_terminal=False, 35 | # Critic 36 | critic_network=None, critic_optimizer=None, 37 | # Preprocessing 38 | preprocessing=None, 39 | # Exploration 40 | exploration=0.0, variable_noise=0.0, 41 | # Regularization 42 | l2_regularization=0.0, entropy_regularization=0.0, 43 | # TensorFlow etc 44 | name='agent', device=None, parallel_interactions=1, seed=None, execution=None, saver=None, 45 | summarizer=None, recorder=None, config=None 46 | ): 47 | memory = dict(type='recent', capacity=((batch_size + 1) * max_episode_timesteps)) 48 | if update_frequency is None: 49 | update = dict(unit='episodes', batch_size=batch_size) 50 | else: 51 | update = dict(unit='episodes', batch_size=batch_size, frequency=update_frequency) 52 | optimizer = dict(type='adam', learning_rate=learning_rate) 53 | optimizer = dict( 54 | type='subsampling_step', optimizer=optimizer, fraction=subsampling_fraction 55 | ) 56 | optimizer = dict(type='multi_step', optimizer=optimizer, num_steps=optimization_steps) 57 | objective = dict( 58 | type='policy_gradient', ratio_based=True, clipping_value=likelihood_ratio_clipping 59 | ) 60 | if critic_network is None: 61 | reward_estimation = dict(horizon='episode', discount=discount) 62 | else: 63 | reward_estimation = dict( 64 | horizon='episode', discount=discount, estimate_horizon='late', 65 | estimate_terminal=estimate_terminal, estimate_advantage=True 66 | ) 67 | if critic_network is None: 68 | baseline_policy = None 69 | baseline_objective = None 70 | else: 71 | # State value doesn't exist for Beta 72 | baseline_policy = dict(network=critic_network, distributions=dict(float='gaussian')) 73 | assert critic_optimizer is not None 74 | baseline_objective = 'state_value' 75 | 76 | super().__init__( 77 | # Agent 78 | states=states, actions=actions, max_episode_timesteps=max_episode_timesteps, 79 | parallel_interactions=parallel_interactions, buffer_observe=True, seed=seed, 80 | recorder=recorder, config=config, 81 | # Model 82 | name=name, device=device, execution=execution, saver=saver, summarizer=summarizer, 83 | preprocessing=preprocessing, exploration=exploration, variable_noise=variable_noise, 84 | l2_regularization=l2_regularization, 85 | # PolicyModel 86 | policy=None, network=network, memory=memory, update=update, optimizer=optimizer, 87 | objective=objective, reward_estimation=reward_estimation, 88 | baseline_policy=baseline_policy, baseline_network=None, 89 | baseline_optimizer=critic_optimizer, baseline_objective=baseline_objective, 90 | entropy_regularization=entropy_regularization 91 | ) 92 | -------------------------------------------------------------------------------- /runtime.py: -------------------------------------------------------------------------------- 1 | ## Main 2 | 3 | """ 4 | OpenAI gym execution. 
5 | """ 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import argparse 12 | import importlib 13 | import json 14 | import logging 15 | import os 16 | import time 17 | import sys 18 | 19 | from tensorforce import TensorForceError 20 | from tensorforce.agents import Agent 21 | from tensorforce.execution import Runner 22 | from tensorforce.contrib.openai_gym import OpenAIGym 23 | 24 | 25 | # python examples/openai_gym.py Pong-ram-v0 -a examples/configs/vpg.json -n examples/configs/mlp2_network.json -e 50000 -m 2000 26 | 27 | # python examples/openai_gym.py CartPole-v0 -a examples/configs/vpg.json -n examples/configs/mlp2_network.json -e 2000 -m 200 28 | 29 | 30 | def main(): 31 | parser = argparse.ArgumentParser() 32 | 33 | parser.add_argument('gym_id', help="Id of the Gym environment") 34 | parser.add_argument('-i', '--import-modules', help="Import module(s) required for environment") 35 | parser.add_argument('-a', '--agent', help="Agent configuration file") 36 | parser.add_argument('-n', '--network', default=None, help="Network specification file") 37 | parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes") 38 | parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps") 39 | parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None, help="Maximum number of timesteps per episode") 40 | parser.add_argument('-d', '--deterministic', action='store_true', default=False, help="Choose actions deterministically") 41 | parser.add_argument('-s', '--save', help="Save agent to this dir") 42 | parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes") 43 | parser.add_argument('-l', '--load', help="Load agent from this dir") 44 | parser.add_argument('--monitor', help="Save results to this directory") 45 | parser.add_argument('--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results") 46 | parser.add_argument('--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)") 47 | parser.add_argument('--visualize', action='store_true', default=False, help="Enable OpenAI Gym's visualization") 48 | parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs") 49 | parser.add_argument('-te', '--test', action='store_true', default=False, help="Test agent without learning.") 50 | parser.add_argument('-sl', '--sleep', type=float, default=None, help="Slow down simulation by sleeping for x seconds (fractions allowed).") 51 | parser.add_argument('--job', type=str, default=None, help="For distributed mode: The job type of this agent.") 52 | parser.add_argument('--task', type=int, default=0, help="For distributed mode: The task index of this agent.") 53 | 54 | args = parser.parse_args() 55 | 56 | logging.basicConfig(level=logging.INFO) 57 | 58 | logger = logging.getLogger() 59 | logger.setLevel(logging.INFO) 60 | 61 | if args.import_modules is not None: 62 | for module in args.import_modules.split(','): 63 | importlib.import_module(name=module) 64 | 65 | environment = OpenAIGym( 66 | gym_id=args.gym_id, 67 | monitor=args.monitor, 68 | monitor_safe=args.monitor_safe, 69 | monitor_video=args.monitor_video, 70 | visualize=args.visualize 71 | ) 72 | 73 | if args.agent is not None: 74 | with open(args.agent, 'r') as fp: 75 | agent = json.load(fp=fp) 76 | else: 77 | raise TensorForceError("No agent configuration 
provided.") 78 | 79 | if args.network is not None: 80 | with open(args.network, 'r') as fp: 81 | network = json.load(fp=fp) 82 | agent = Agent.from_spec( 83 | spec=agent, 84 | kwargs=dict( 85 | states=environment.states, 86 | actions=environment.actions, 87 | network=network 88 | ) 89 | ) 90 | else: 91 | logger.info("No network configuration provided.") 92 | agent = Agent.from_spec( 93 | spec=agent, 94 | kwargs=dict( 95 | states=environment.states, 96 | actions=environment.actions 97 | ) 98 | ) 99 | 100 | if args.load: 101 | load_dir = os.path.dirname(args.load) 102 | if not os.path.isdir(load_dir): 103 | raise OSError("Could not load agent from {}: No such directory.".format(load_dir)) 104 | agent.restore_model(args.load) 105 | 106 | if args.save: 107 | save_dir = os.path.dirname(args.save) 108 | if not os.path.isdir(save_dir): 109 | try: 110 | os.mkdir(save_dir, 0o755) 111 | except OSError: 112 | raise OSError("Cannot save agent to dir {} ()".format(save_dir)) 113 | 114 | if args.debug: 115 | logger.info("-" * 16) 116 | logger.info("Configuration:") 117 | logger.info(agent) 118 | 119 | runner = Runner( 120 | agent=agent, 121 | environment=environment, 122 | repeat_actions=1 123 | ) 124 | 125 | if args.debug: # TODO: Timestep-based reporting 126 | report_episodes = 1 127 | else: 128 | report_episodes = 100 129 | 130 | logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment)) 131 | 132 | def episode_finished(r, id_): 133 | if r.episode % report_episodes == 0: 134 | steps_per_second = r.timestep / (time.time() - r.start_time) 135 | logger.info("Finished episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}".format( 136 | r.agent.episode, r.episode_timestep, steps_per_second 137 | )) 138 | logger.info("Episode reward: {}".format(r.episode_rewards[-1])) 139 | logger.info("Average of last 500 rewards: {:0.2f}". 140 | format(sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards)))) 141 | logger.info("Average of last 100 rewards: {:0.2f}". 142 | format(sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards)))) 143 | if args.save and args.save_episodes is not None and not r.episode % args.save_episodes: 144 | logger.info("Saving agent to {}".format(args.save)) 145 | r.agent.save_model(args.save) 146 | 147 | return True 148 | 149 | runner.run( 150 | num_timesteps=args.timesteps, 151 | num_episodes=args.episodes, 152 | max_episode_timesteps=args.max_episode_timesteps, 153 | deterministic=args.deterministic, 154 | episode_finished=episode_finished, 155 | testing=args.test, 156 | sleep=args.sleep 157 | ) 158 | runner.close() 159 | 160 | logger.info("Learning finished. 
Total episodes: {ep}".format(ep=runner.agent.episode)) 161 | 162 | 163 | if __name__ == '__main__': 164 | main() 165 | -------------------------------------------------------------------------------- /gym_drone/envs/drone_env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import logging 3 | import math 4 | import numpy as np 5 | from gym import error, spaces, utils 6 | from gym.utils import seeding 7 | 8 | 9 | class DroneEnv(gym.Env): 10 | metadata = {'render.modes': ['human']} 11 | 12 | def __init__(self): 13 | 14 | # debug vars 15 | 16 | self.__version__ = "2.1.1" 17 | 18 | # Hyperparameter definition 19 | self.x_min = int(0) 20 | self.x_max = int(4) 21 | self.y_min = int(0) 22 | self.y_max = int(4) 23 | self.min_cam_angle = int(1) 24 | self.max_cam_angle = int(3) 25 | self.min_terr_angle = int(0) 26 | self.max_terr_angle = int(4) #terrain angle - something that is observed 27 | self.min_speed = int(1) 28 | self.max_speed = int(3) #max speed is actually 56 kmh (this is m/s) 29 | self.min_height = int(1) #meter 30 | self.max_height = int(3) #meter 31 | 32 | 33 | # ??? 34 | self.state = None #initiate state holder 35 | self.episode_over = False 36 | self.current_episode = -1 37 | self.current_timestep = 0 # -1 because timestep increments before action 38 | self.current_pos = [0,0] 39 | self.action_episode_memory = [] 40 | self.grid_step_max = (self.x_max+1)*(self.y_max+1) - 1 # number of grid squares 41 | self.max_timestep = 2*self.grid_step_max # Visits all grid squares twice. 42 | #self. 43 | 44 | # Observations are (in this order): current x-pos, current y-pos, terrain angle (from horizontal axis) 45 | # Let's assume that the map is of grid size 5x5. Position of the drone is represented as (grid x index, 46 | # grid y index), where (0,0) is the top left of the grid ((4,4) is max value)). 47 | 48 | # Here, low is the lower limit of observation range, and high is the higher limit. 49 | low_ob = np.array([self.x_min, # x-pos 50 | self.y_min, # y-pos 51 | self.min_terr_angle]) # terrain_angle_deg 52 | high_ob = np.array([self.x_max, # x-pos 53 | self.y_max, # y-pos 54 | self.max_terr_angle]) # terrain_angle_deg 55 | self.observation_space = spaces.Box(low_ob, high_ob, dtype=np.float32) 56 | 57 | # Action space 58 | low_action = np.array([self.min_cam_angle, # cam angle in deg 59 | self.min_speed, # flight speed in m/s 60 | self.min_height]) # flight height in m 61 | high_action = np.array([self.max_cam_angle, # cam angle in deg 62 | self.max_speed, # flight speed in m/s 63 | self.max_height]) # flight height in m 64 | self.action_space = spaces.MultiDiscrete([self.max_cam_angle, self.max_speed, self.max_height]) 65 | 66 | # generate random terrain gradients/create them here 67 | # import random 68 | # list = [111,222,333,444,555] 69 | # print("random.choice() to select random item from list - ", random.choice(list)) 70 | 71 | 72 | self.terr_angle_grid = [0,0,0,0,0, 73 | 0,0,0,0,0, 74 | 1,1,1,1,1, 75 | 0,0,0,0,0, 76 | 0,0,0,0,0 77 | ] 78 | 79 | 80 | def step(self, action): 81 | 82 | """ 83 | The agent (drone) takes a step (flies somewhere) in the environment. 84 | Parameters 85 | ---------- 86 | action : (int,int) - the coordinates, (int) - the terrain gradient 87 | Returns: (int) - terrain angle (observation), (float32) reward, (bool) episode_over, (int,int) - coords 88 | ------- 89 | ob, reward, episode_over, info : tuple 90 | ob (object) : 91 | an environment-specific object representing your observation of 92 | the environment. 
93 | reward (float) : 94 | amount of reward achieved by the previous action. The scale 95 | varies between environments, but the goal is always to increase 96 | your total reward. (This reward per step is normalised to 1.) 97 | episode_over (bool) : 98 | whether it's time to reset the environment again. Most (but not 99 | all) tasks are divided up into well-defined episodes, and done 100 | being True indicates the episode has terminated. (For example, 101 | perhaps the pole tipped too far, or you lost your last life.) 102 | info (dict) : 103 | diagnostic information useful for debugging. It can sometimes 104 | be useful for learning (for example, it might contain the raw 105 | probabilities behind the environment's last state change). 106 | However, official evaluations of your agent are not allowed to 107 | use this for learning. 108 | """ 109 | 110 | if self.episode_over: 111 | raise RuntimeError("Episode is done. You're running step() despite this fact. Or reset the env by calling reset().") #end execution, and finish run 112 | 113 | # Return the reward for action taken given state. Save action to action memory buffer. 114 | self.action_episode_memory[self.current_episode].append(action) 115 | reward = self._get_reward(action) 116 | 117 | # Take a step, and observe environment. 118 | self.current_timestep += 1 119 | self.current_pos = self.index2coord(self.current_timestep) 120 | self.current_pos.append(self.terr_angle_grid[self.current_timestep%self.grid_step_max]) 121 | self.state = list.copy(self.current_pos) 122 | 123 | if self.current_timestep>=50: 124 | self.episode_over = True 125 | 126 | return self.state, reward, self.episode_over, {} 127 | 128 | #def print_action(self,action): 129 | 130 | def index2coord(self, index): 131 | 132 | # converts an index value to x-y coords 133 | # see order of the grid above in __init__ 134 | 135 | if (index<=self.x_max): 136 | return [0, index] 137 | else: 138 | return [(index%(self.grid_step_max+1))//(self.x_max+1), index%(self.x_max+1)] 139 | 140 | # grid step max is n*m (grid dimensions) -1 141 | 142 | def _get_state(self): 143 | 144 | return self.terr_angle_grid[self.current_timestep%self.grid_step_max] 145 | 146 | def _get_info(self): 147 | 148 | return self.index2coord(self.current_timestep) 149 | 150 | def _get_reward(self, action): 151 | 152 | # reward factors 153 | # calculatinng the normalised rewards needs -1 because the max values is actually the number of actions, and actions start from 0. 154 | 155 | gradient_delta_rf = 0.4 156 | speed_rf = 0.3 157 | height_rf = 0.3 158 | 159 | #logging.warning("the current timestep. ="+str(self.current_timestep)) 160 | #logging.warning("self.current_timestep%self.grid_step_max = "+ str(self.current_timestep%self.grid_step_max)) 161 | 162 | gradient_delta = abs(self.terr_angle_grid[(self.current_timestep%self.grid_step_max)] - action[0]) # action [1] is the camera angle 163 | 164 | gradient_delta_norm = 1 - gradient_delta/(self.max_cam_angle-1) # this will give us a normalised value that rewards less difference 165 | 166 | speed_norm = 1 - action[1]/(self.max_speed-1) # speed normalised, and reward less speed 167 | 168 | height_norm = action[2]/(self.max_height-1) # height normalised, and more height is better (FOR NOW) 169 | 170 | tmp_reward = gradient_delta_norm*gradient_delta_rf + speed_norm*speed_rf + height_norm*height_rf 171 | 172 | return tmp_reward 173 | 174 | 175 | def reset(self): 176 | # reset should always run at the end of an episode and before the first run. 
177 | self.current_timestep = 0 178 | self.current_episode += 1 179 | self.action_episode_memory.append([]) 180 | self.episode_over = False 181 | 182 | self.current_pos = self.index2coord(self.current_timestep) 183 | self.current_pos.append(self.terr_angle_grid[self.current_timestep%self.grid_step_max]) 184 | self.state = list.copy(self.current_pos) 185 | 186 | return self.state 187 | 188 | def _render(self, mode='human', close=False): 189 | return 0 190 | def close(self): 191 | return 0 192 | -------------------------------------------------------------------------------- /gym_drone/envs/droneint_env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import logging 3 | import math 4 | import numpy as np 5 | from gym import error, spaces, utils 6 | from gym.utils import seeding 7 | 8 | 9 | class DroneIntEnv(gym.Env): 10 | metadata = {'render.modes': ['human']} 11 | 12 | def __init__(self): 13 | 14 | # debug vars 15 | 16 | self.__version__ = "1.0.0" 17 | 18 | # Hyperparameter definition 19 | self.x_min = int(0) 20 | self.x_max = int(4) 21 | self.y_min = int(0) 22 | self.y_max = int(4) 23 | self.min_cam_angle = int(1) 24 | self.max_cam_angle = int(3) 25 | self.min_terr_angle = int(1) 26 | self.max_terr_angle = int(3) #terrain angle - something that is observed 27 | self.min_speed = int(1) 28 | self.max_speed = int(3) #max speed is actually 56 kmh (this is m/s) 29 | self.min_height = int(1) #meter 30 | self.max_height = int(3) #meter 31 | 32 | 33 | # ??? 34 | self.state = None #initiate state holder 35 | self.episode_over = False 36 | self.current_episode = -1 37 | self.current_timestep = 0 # -1 because timestep increments before action 38 | self.current_pos = [0,0] 39 | self.action_episode_memory = [] 40 | self.grid_step_max = (self.x_max+1)*(self.y_max+1) - 1 # number of grid squares 41 | self.max_timestep = 2*self.grid_step_max # Visits all grid squares twice. 42 | #self. 43 | 44 | # Observations are (in this order): current x-pos, current y-pos, terrain angle (from horizontal axis) 45 | # Let's assume that the map is of grid size 5x5. Position of the drone is represented as (grid x index, 46 | # grid y index), where (0,0) is the top left of the grid ((4,4) is max value)). 47 | 48 | # Here, low is the lower limit of observation range, and high is the higher limit. 49 | low_ob = np.array([self.x_min, # x-pos 50 | self.y_min, # y-pos 51 | self.min_cam_angle]) # terrain_angle_deg 52 | high_ob = np.array([self.x_max, # x-pos 53 | self.y_max, # y-pos 54 | self.max_cam_angle]) # terrain_angle_deg 55 | self.observation_space = spaces.Box(low_ob, high_ob, dtype=np.float32) 56 | 57 | # Action space 58 | low_action = np.array([self.min_cam_angle, # cam angle in deg 59 | self.min_speed, # flight speed in m/s 60 | self.min_height]) # flight height in m 61 | high_action = np.array([self.max_cam_angle, # cam angle in deg 62 | self.max_speed, # flight speed in m/s 63 | self.max_height]) # flight height in m 64 | self.action_space = spaces.MultiDiscrete([self.max_cam_angle, self.max_speed, self.max_height]) 65 | 66 | # generate random terrain gradients/create them here 67 | # import random 68 | # list = [111,222,333,444,555] 69 | # print("random.choice() to select random item from list - ", random.choice(list)) 70 | 71 | 72 | self.terr_angle_grid = [0,0,0,0,0, 73 | 0,0,0,0,0, 74 | 1,1,1,1,1, 75 | 0,0,0,0,0, 76 | 0,0,0,0,0 77 | ] 78 | 79 | def step(self, action): 80 | 81 | """ 82 | The agent (drone) takes a step (flies somewhere) in the environment. 
83 | Parameters 84 | ---------- 85 | action : (int,int) - the coordinates, (int) - the terrain gradient 86 | Returns: (int) - terrain angle (observation), (float32) reward, (bool) episode_over, (int,int) - coords 87 | ------- 88 | ob, reward, episode_over, info : tuple 89 | ob (object) : 90 | an environment-specific object representing your observation of 91 | the environment. 92 | reward (float) : 93 | amount of reward achieved by the previous action. The scale 94 | varies between environments, but the goal is always to increase 95 | your total reward. (This reward per step is normalised to 1.) 96 | episode_over (bool) : 97 | whether it's time to reset the environment again. Most (but not 98 | all) tasks are divided up into well-defined episodes, and done 99 | being True indicates the episode has terminated. (For example, 100 | perhaps the pole tipped too far, or you lost your last life.) 101 | info (dict) : 102 | diagnostic information useful for debugging. It can sometimes 103 | be useful for learning (for example, it might contain the raw 104 | probabilities behind the environment's last state change). 105 | However, official evaluations of your agent are not allowed to 106 | use this for learning. 107 | """ 108 | 109 | if self.episode_over: 110 | raise RuntimeError("Episode is done. You're running step() despite this fact. Or reset the env by calling reset().") #end execution, and finish run 111 | 112 | # Return the reward for action taken given state. Save action to action memory buffer. 113 | self.action_episode_memory[self.current_episode].append(action) 114 | 115 | 116 | # Take a step, and observe environment. 117 | self.current_timestep += 1 118 | self.current_pos = self.index2coord(self.current_timestep) 119 | self.current_pos.append(self.terr_angle_grid[self.current_timestep%self.grid_step_max]) 120 | self.state = list.copy(self.current_pos) 121 | 122 | reward = self.get_user_reward(action,self.state) 123 | 124 | if self.current_timestep>=self.max_timestep: 125 | self.episode_over = True 126 | 127 | return self.state, reward, self.episode_over, {} 128 | 129 | #def print_action(self,action): 130 | 131 | def index2coord(self, index): 132 | 133 | # converts an index value to x-y coords 134 | # see order of the grid above in __init__ 135 | 136 | if (index<=self.x_max): 137 | return [0, index] 138 | else: 139 | return [(index%(self.grid_step_max+1))//(self.x_max+1), index%(self.x_max+1)] 140 | 141 | # grid step max is n*m (grid dimensions) -1 142 | 143 | def _get_state(self): 144 | 145 | return self.terr_angle_grid[self.current_timestep%self.grid_step_max] 146 | 147 | def _get_info(self): 148 | 149 | return self.index2coord(self.current_timestep) 150 | 151 | def _get_reward(self,action): 152 | 153 | # reward factors 154 | 155 | gradient_delta_rf = 0.3 156 | speed_rf = 0.35 157 | height_rf = 0.35 158 | 159 | #logging.warning("the current timestep. 
="+str(self.current_timestep)) 160 | #logging.warning("self.current_timestep%self.grid_step_max = "+ str(self.current_timestep%self.grid_step_max)) 161 | 162 | gradient_delta = abs(self.terr_angle_grid[(self.current_timestep%self.grid_step_max)] - action[0]) # action [1] is the camera angle 163 | 164 | gradient_delta_norm = 1 - gradient_delta/self.max_cam_angle # this will give us a normalised value that rewards less difference 165 | 166 | speed_norm = 1 - action[1]/self.max_speed # speed normalised, and reward less speed 167 | 168 | height_norm = action[2]/self.max_height # height normalised, and more height is better (FOR NOW) 169 | 170 | tmp_reward = gradient_delta_norm*gradient_delta_rf + speed_norm*speed_rf + height_norm*height_rf 171 | 172 | return tmp_reward 173 | 174 | def get_user_reward(self, action, state): 175 | 176 | #init variables. 177 | uinput_reward = None 178 | 179 | print("Given that the:\n\n") 180 | print("Slope is"+str(state[2])+".\n") 181 | print("@("+str(state[0])+","+str(state[1])+")\n\n") 182 | 183 | print("Agent takes actions:\n\n") 184 | print("Camera Angle: "+str(action[0])+"\n") 185 | print("Speed: "+str(action[1])+"\n") 186 | print("Height: "+str(action[2])+"\n\n") 187 | 188 | print("This returns a predicted coverage of:") 189 | tmp = _get_reward(action) 190 | print("\n\n") 191 | 192 | uinput_reward = input("How would you rate the most recent action?") 193 | return uinput_reward 194 | 195 | def reset(self): 196 | # reset should always run at the end of an episode and before the first run. 197 | self.current_timestep = 0 198 | self.current_episode += 1 199 | self.action_episode_memory.append([]) 200 | self.episode_over = False 201 | 202 | self.current_pos = self.index2coord(self.current_timestep) 203 | self.current_pos.append(self.terr_angle_grid[self.current_timestep%self.grid_step_max]) 204 | self.state = list.copy(self.current_pos) 205 | 206 | return self.state 207 | 208 | def _render(self, mode='human', close=False): 209 | return 0 210 | def close(self): 211 | return 0 212 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /custom_scripts/runner.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Tensorforce Team. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY K , either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import time 17 | from tqdm import tqdm 18 | 19 | import numpy as np 20 | 21 | from tensorforce import util 22 | from tensorforce.agents import Agent 23 | from tensorforce.environments import Environment 24 | 25 | 26 | class Runner(object): 27 | 28 | def __init__(self, agent, environment, evaluation_environment=None, save_best_agent=False): 29 | # save_best overwrites saver... 30 | self.is_environment_external = isinstance(environment, Environment) 31 | self.environment = Environment.create(environment=environment) 32 | 33 | self.is_eval_environment_external = isinstance(evaluation_environment, Environment) 34 | if evaluation_environment is None: 35 | self.evaluation_environment = None 36 | else: 37 | self.evaluation_environment = Environment.create(environment=evaluation_environment) 38 | 39 | self.save_best_agent = save_best_agent 40 | self.is_agent_external = isinstance(agent, Agent) 41 | kwargs = dict() 42 | # warning: save_best_agent 43 | if not self.is_agent_external and self.save_best_agent: 44 | # Disable periodic saving 45 | kwargs = dict(saver=dict(seconds=None, steps=None)) 46 | self.agent = Agent.create(agent=agent, environment=self.environment, **kwargs) 47 | if not self.agent.model.is_initialized: 48 | self.agent.initialize() 49 | 50 | self.global_episodes = self.agent.episodes 51 | self.global_timesteps = self.agent.timesteps 52 | self.global_updates = self.agent.updates 53 | self.episode_rewards = list() 54 | self.episode_timesteps = list() 55 | self.episode_seconds = list() 56 | self.episode_agent_seconds = list() 57 | 58 | def close(self): 59 | if hasattr(self, 'tqdm'): 60 | self.tqdm.close() 61 | if not self.is_agent_external: 62 | self.agent.close() 63 | if not self.is_environment_external: 64 | self.environment.close() 65 | if self.evaluation_environment is not None and not self.is_eval_environment_external: 66 | self.evaluation_environment.close() 67 | 68 | # TODO: make average reward another possible criteria for runner-termination 69 | def run( 70 | self, 71 | # General 72 | num_episodes=None, num_timesteps=None, num_updates=None, max_episode_timesteps=None, 73 | num_repeat_actions=1, 74 | # Callback 75 | callback=None, callback_episode_frequency=None, callback_timestep_frequency=None, 76 | # Tqdm 77 | use_tqdm=True, mean_horizon=10, 78 | # Evaluation 79 | evaluation=False, evaluation_callback=None, evaluation_frequency=None, 80 | max_evaluation_timesteps=None, num_evaluation_iterations=1 81 | ): 82 | # General 83 | if num_episodes is None: 84 | self.num_episodes = float('inf') 85 | else: 86 | self.num_episodes = num_episodes 87 | if 
num_timesteps is None: 88 | self.num_timesteps = float('inf') 89 | else: 90 | self.num_timesteps = num_timesteps 91 | if num_updates is None: 92 | self.num_updates = float('inf') 93 | else: 94 | self.num_updates = num_updates 95 | if max_episode_timesteps is None: 96 | self.max_episode_timesteps = float('inf') 97 | else: 98 | self.max_episode_timesteps = max_episode_timesteps 99 | self.num_repeat_actions = num_repeat_actions 100 | 101 | # Callback 102 | assert callback_episode_frequency is None or callback_timestep_frequency is None 103 | if callback_episode_frequency is None and callback_timestep_frequency is None: 104 | callback_episode_frequency = 1 105 | if callback_episode_frequency is None: 106 | self.callback_episode_frequency = float('inf') 107 | else: 108 | self.callback_episode_frequency = callback_episode_frequency 109 | if callback_timestep_frequency is None: 110 | self.callback_timestep_frequency = float('inf') 111 | else: 112 | self.callback_timestep_frequency = callback_timestep_frequency 113 | if callback is None: 114 | self.callback = (lambda r: True) 115 | elif util.is_iterable(x=callback): 116 | def sequential_callback(runner): 117 | result = True 118 | for fn in callback: 119 | x = fn(runner) 120 | if isinstance(result, bool): 121 | result = result and x 122 | return result 123 | self.callback = sequential_callback 124 | else: 125 | def boolean_callback(runner): 126 | result = callback(runner) 127 | if isinstance(result, bool): 128 | return result 129 | else: 130 | return True 131 | self.callback = boolean_callback 132 | 133 | # Tqdm 134 | if use_tqdm: 135 | if hasattr(self, 'tqdm'): 136 | self.tqdm.close() 137 | 138 | assert self.num_episodes != float('inf') or self.num_timesteps != float('inf') 139 | inner_callback = self.callback 140 | 141 | if self.num_episodes != float('inf'): 142 | # Episode-based tqdm (default option if both num_episodes and num_timesteps set) 143 | assert self.num_episodes != float('inf') 144 | bar_format = ( 145 | '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}, reward={postfix[0]:.2f}, ts/ep=' 146 | '{postfix[1]}, sec/ep={postfix[2]:.2f}, ms/ts={postfix[3]:.1f}, agent=' 147 | '{postfix[4]:.1f}%]' 148 | ) 149 | postfix = [0.0, 0, 0.0, 0.0, 0.0] 150 | self.tqdm = tqdm( 151 | desc='Episodes', total=self.num_episodes, bar_format=bar_format, 152 | initial=self.global_episodes, postfix=postfix 153 | ) 154 | self.tqdm_last_update = self.global_episodes 155 | 156 | def tqdm_callback(runner): 157 | mean_reward = float(np.mean(runner.episode_rewards[-mean_horizon:])) 158 | mean_ts_per_ep = int(np.mean(runner.episode_timesteps[-mean_horizon:])) 159 | mean_sec_per_ep = float(np.mean(runner.episode_seconds[-mean_horizon:])) 160 | mean_agent_sec = float(np.mean(runner.episode_agent_seconds[-mean_horizon:])) 161 | mean_ms_per_ts = mean_sec_per_ep * 1000.0 / mean_ts_per_ep 162 | mean_rel_agent = mean_agent_sec * 100.0 / mean_sec_per_ep 163 | runner.tqdm.postfix[0] = mean_reward 164 | runner.tqdm.postfix[1] = mean_ts_per_ep 165 | runner.tqdm.postfix[2] = mean_sec_per_ep 166 | runner.tqdm.postfix[3] = mean_ms_per_ts 167 | runner.tqdm.postfix[4] = mean_rel_agent 168 | runner.tqdm.update(n=(runner.global_episodes - runner.tqdm_last_update)) 169 | runner.tqdm_last_update = runner.global_episodes 170 | return inner_callback(runner) 171 | 172 | else: 173 | # Timestep-based tqdm 174 | assert self.num_timesteps != float('inf') 175 | self.tqdm = tqdm( 176 | desc='Timesteps', total=self.num_timesteps, initial=self.global_timesteps, 177 | postfix=dict(mean_reward='n/a') 
178 | ) 179 | self.tqdm_last_update = self.global_timesteps 180 | 181 | def tqdm_callback(runner): 182 | # sum_timesteps_reward = sum(runner.timestep_rewards[num_mean_reward:]) 183 | # num_timesteps = min(num_mean_reward, runner.episode_timestep) 184 | # mean_reward = sum_timesteps_reward / num_episodes 185 | runner.tqdm.set_postfix(mean_reward='n/a') 186 | runner.tqdm.update(n=(runner.global_timesteps - runner.tqdm_last_update)) 187 | runner.tqdm_last_update = runner.global_timesteps 188 | return inner_callback(runner) 189 | 190 | self.callback = tqdm_callback 191 | 192 | # Evaluation 193 | self.evaluation = evaluation 194 | if evaluation_callback is None: 195 | self.evaluation_callback = (lambda r: None) 196 | else: 197 | assert not self.evaluation 198 | self.evaluation_callback = evaluation_callback 199 | self.evaluation_frequency = evaluation_frequency 200 | if max_evaluation_timesteps is None: 201 | self.max_evaluation_timesteps = float('inf') 202 | else: 203 | assert not self.evaluation 204 | self.max_evaluation_timesteps = max_evaluation_timesteps 205 | self.num_evaluation_iterations = num_evaluation_iterations 206 | if self.save_best_agent: 207 | assert not self.evaluation 208 | inner_evaluation_callback = self.evaluation_callback 209 | 210 | def mean_reward_callback(runner): 211 | result = inner_evaluation_callback(runner) 212 | if result is None: 213 | return float(np.mean(runner.evaluation_rewards)) 214 | else: 215 | return result 216 | 217 | self.evaluation_callback = mean_reward_callback 218 | self.best_evaluation_score = None 219 | 220 | # Reset agent 221 | self.agent.reset() 222 | 223 | # Timestep/episode/update counter 224 | self.timesteps = 0 225 | self.episodes = 0 226 | self.updates = 0 227 | self.interactive = bool(int(input("\nWould you like this run to use user inputs? 
0 - No, 1 - Yes -- "))) 228 | 229 | # Episode loop 230 | while True: 231 | # Run episode 232 | if not self.run_episode( 233 | environment=self.environment, max_timesteps=self.max_episode_timesteps, 234 | evaluation=self.evaluation 235 | ): 236 | return 237 | 238 | # Increment episode counter (after calling callback) 239 | self.episodes += 1 240 | 241 | # Update experiment statistics 242 | self.episode_rewards.append(self.episode_reward) 243 | self.episode_timesteps.append(self.episode_timestep) 244 | self.episode_seconds.append(self.episode_second) 245 | self.episode_agent_seconds.append(self.episode_agent_second) 246 | 247 | # Run evaluation 248 | if self.evaluation_frequency is None: 249 | is_evaluation = self.episode_updated 250 | else: 251 | is_evaluation = (self.episodes % self.evaluation_frequency == 0) 252 | if is_evaluation: 253 | if self.evaluation_environment is None: 254 | environment = self.environment 255 | else: 256 | environment = self.evaluation_environment 257 | 258 | self.evaluation_rewards = list() 259 | self.evaluation_timesteps = list() 260 | self.evaluation_seconds = list() 261 | self.evaluation_agent_seconds = list() 262 | 263 | # Evaluation loop 264 | for _ in range(self.num_evaluation_iterations): 265 | self.run_episode( 266 | environment=environment, max_timesteps=self.max_evaluation_timesteps, 267 | evaluation=True 268 | ) 269 | 270 | self.evaluation_rewards.append(self.episode_reward) 271 | self.evaluation_timesteps.append(self.episode_timestep) 272 | self.evaluation_seconds.append(self.episode_second) 273 | self.evaluation_agent_seconds.append(self.episode_agent_second) 274 | 275 | # Evaluation callback 276 | if self.save_best_agent: 277 | evaluation_score = self.evaluation_callback(self) 278 | assert isinstance(evaluation_score, float) 279 | if self.best_evaluation_score is None: 280 | self.best_evaluation_score = evaluation_score 281 | elif evaluation_score > self.best_evaluation_score: 282 | self.best_evaluation_score = evaluation_score 283 | self.agent.save(filename='best-model', append_timestep=False) 284 | else: 285 | self.evaluation_callback(self) 286 | 287 | # Update global timestep/episode/update 288 | self.global_timesteps = self.agent.timesteps 289 | self.global_episodes = self.agent.episodes 290 | self.global_updates = self.agent.updates 291 | 292 | # Callback 293 | if self.episodes % self.callback_episode_frequency == 0 and not self.callback(self): 294 | return 295 | 296 | # Terminate experiment if too long 297 | if self.global_timesteps >= self.num_timesteps: 298 | return 299 | elif self.evaluation and self.timesteps >= self.num_timesteps: 300 | return 301 | elif self.global_episodes >= self.num_episodes: 302 | return 303 | elif self.evaluation and self.episodes >= self.num_episodes: 304 | return 305 | elif self.global_updates >= self.num_updates: 306 | return 307 | elif self.evaluation and self.updates >= self.num_updates: 308 | return 309 | elif self.agent.should_stop(): 310 | return 311 | 312 | def action_rank(self, states, evaluation): 313 | 314 | action_buffer = [] 315 | print("*********************") 316 | print("*********************") 317 | print("\n%------------------------") 318 | print("% STATE @ STEP# "+str(states[0]*states[1])) 319 | print("%------------------------\n") 320 | print("Slope: "+str(states[2])+" --- @("+str(states[0])+","+str(states[1])+")") 321 | 322 | for _ in range(0,4): 323 | 324 | # here,independent is TRUE because in the normal pipeline you would have to observe 325 | # after taking an action, but we are simply 
sampling actions. 326 | tmp_action = self.agent.act(states=states, independent = True, evaluation = False) 327 | 328 | print("\n%------------------------") 329 | print("% ACTION "+str(_+1)) 330 | print("%------------------------\n") 331 | 332 | print("Camera Angle: "+str(tmp_action[0])) 333 | print("Speed: "+str(tmp_action[1])) 334 | print("Height: "+str(tmp_action[2])) 335 | 336 | action_buffer.append(tmp_action) 337 | 338 | action_choice = int(input("\nPlease select the optimal action (1-4): ")) - 1 339 | while action_choice>4 or action_choice<0: 340 | action_choice = int(input("\nPlease select the optimal action (1-4): ")) - 1 341 | 342 | 343 | return action_buffer[action_choice] 344 | 345 | def run_episode(self, environment, max_timesteps, evaluation): 346 | # Episode statistics 347 | self.episode_reward = 0 348 | self.episode_timestep = 0 349 | self.episode_updated = False 350 | self.episode_agent_second = 0.0 351 | episode_start = time.time() 352 | 353 | # Start environment episode 354 | states = environment.reset() 355 | 356 | # Timestep loop 357 | while True: 358 | # Retrieve actions from agent 359 | agent_start = time.time() 360 | # user action only runs for the first episodes: only 50 steps 361 | if self.agent.episodes == 0 and self.interactive: 362 | print(self.interactive) 363 | print(self.agent.episodes) 364 | user_action = self.action_rank(states=states, evaluation=evaluation) 365 | else: 366 | user_action = 0 367 | # run with selected action 368 | if self.agent.episodes > 0: 369 | self.interactive = False 370 | actions = self.agent.act(states=states, evaluation=evaluation, int_bool = self.interactive, int_act = user_action) 371 | self.timesteps += 1 372 | self.episode_agent_second += time.time() - agent_start 373 | self.episode_timestep += 1 374 | # Execute actions in environment (optional repeated execution) 375 | reward = 0.0 376 | for _ in range(self.num_repeat_actions): 377 | states, terminal, step_reward = environment.execute(actions=actions) 378 | if isinstance(terminal, bool): 379 | terminal = int(terminal) 380 | reward += step_reward 381 | if terminal > 0: 382 | break 383 | self.episode_reward += reward 384 | 385 | # Terminate episode if too long 386 | if self.episode_timestep >= max_timesteps: 387 | terminal = 2 388 | 389 | # Observe unless evaluation 390 | if not evaluation: 391 | agent_start = time.time() 392 | updated = self.agent.observe(terminal=terminal, reward=reward) 393 | self.updates += int(updated) 394 | self.episode_agent_second += time.time() - agent_start 395 | self.episode_updated = self.episode_updated or updated 396 | 397 | 398 | 399 | # Callback 400 | if self.episode_timestep % self.callback_timestep_frequency == 0 and \ 401 | not self.callback(self): 402 | return False 403 | 404 | # Episode termination check 405 | if terminal > 0: 406 | break 407 | 408 | # No callbacks for evaluation 409 | if evaluation: 410 | continue 411 | 412 | # Update global timestep/episode/update 413 | self.global_timesteps = self.agent.timesteps 414 | self.global_episodes = self.agent.episodes 415 | self.global_updates = self.agent.updates 416 | 417 | # Terminate experiment if too long 418 | if self.global_timesteps >= self.num_timesteps: 419 | return 420 | elif self.global_episodes >= self.num_episodes: 421 | return 422 | elif self.global_updates >= self.num_updates: 423 | return 424 | elif self.agent.should_stop(): 425 | return False 426 | 427 | # Update episode statistics 428 | self.episode_second = time.time() - episode_start 429 | 430 | return True 431 | 
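
A note on the selection prompt in `action_rank` above: after subtracting 1, a valid choice must map to an index 0-3, yet the guard `action_choice>4 or action_choice<0` still accepts an entry of 5 (index 4), which would raise an IndexError on the four-element `action_buffer`, and a non-numeric entry crashes the bare `int(input(...))` call. A minimal sketch of a stricter prompt helper, assuming the same 1-4 convention (`prompt_action_choice` is illustrative, not part of this repository):

    def prompt_action_choice(num_options=4):
        # Keep prompting until the user enters an integer in [1, num_options];
        # return the corresponding zero-based index into action_buffer.
        while True:
            raw = input("\nPlease select the optimal action (1-{}): ".format(num_options))
            try:
                choice = int(raw) - 1
            except ValueError:
                continue
            if 0 <= choice < num_options:
                return choice

With such a helper, the tail of `action_rank` reduces to `return action_buffer[prompt_action_choice()]` while tolerating malformed input.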
-------------------------------------------------------------------------------- /custom_scripts/agent.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Tensorforce Team. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | from collections import OrderedDict 17 | import importlib 18 | import json 19 | import logging 20 | import os 21 | import random 22 | import time 23 | 24 | import numpy as np 25 | import tensorflow as tf 26 | 27 | from tensorforce import util, TensorforceError 28 | import tensorforce.agents 29 | 30 | 31 | class Agent(object): 32 | """ 33 | Tensorforce agent interface. 34 | """ 35 | 36 | @staticmethod 37 | def create(agent=None, environment=None, **kwargs): 38 | """ 39 | Creates an agent from a specification. 40 | 41 | Args: 42 | agent (specification): JSON file, specification key, configuration dictionary, 43 | library module, or `Agent` subclass 44 | (default: Policy agent). 45 | environment (Environment): Environment which the agent is supposed to be trained on, 46 | environment-related arguments like state/action space specifications will be 47 | extract if given. 48 | kwargs: Additional arguments. 49 | """ 50 | if agent is None: 51 | agent = 'default' 52 | 53 | if isinstance(agent, Agent): 54 | # TODO: asserts??????? 55 | return agent 56 | 57 | elif isinstance(agent, dict): 58 | # Dictionary specification 59 | util.deep_disjoint_update(target=kwargs, source=agent) 60 | agent = kwargs.pop('agent', kwargs.pop('type', 'default')) 61 | 62 | return Agent.create(agent=agent, environment=environment, **kwargs) 63 | 64 | elif isinstance(agent, str): 65 | if os.path.isfile(agent): 66 | # JSON file specification 67 | with open(agent, 'r') as fp: 68 | agent = json.load(fp=fp) 69 | 70 | util.deep_disjoint_update(target=kwargs, source=agent) 71 | agent = kwargs.pop('agent', kwargs.pop('type', 'default')) 72 | 73 | return Agent.create(agent=agent, environment=environment, **kwargs) 74 | 75 | elif '.' 
in agent: 76 | # Library specification 77 | library_name, module_name = agent.rsplit('.', 1) 78 | library = importlib.import_module(name=library_name) 79 | agent = getattr(library, module_name) 80 | 81 | if environment is not None: 82 | env_spec = dict(states=environment.states(), actions=environment.actions()) 83 | if environment.max_episode_timesteps() is not None: 84 | env_spec['max_episode_timesteps'] = environment.max_episode_timesteps() 85 | util.deep_disjoint_update(target=kwargs, source=env_spec) 86 | 87 | agent = agent(**kwargs) 88 | assert isinstance(agent, Agent) 89 | 90 | return agent 91 | 92 | else: 93 | # Keyword specification 94 | if environment is not None: 95 | env_spec = dict(states=environment.states(), actions=environment.actions()) 96 | if environment.max_episode_timesteps() is not None: 97 | env_spec['max_episode_timesteps'] = environment.max_episode_timesteps() 98 | util.deep_disjoint_update(target=kwargs, source=env_spec) 99 | 100 | agent = tensorforce.agents.agents[agent](**kwargs) 101 | assert isinstance(agent, Agent) 102 | 103 | return agent 104 | 105 | else: 106 | assert False 107 | 108 | def __init__( 109 | # Environment 110 | self, states, actions, max_episode_timesteps=None, 111 | # TensorFlow etc 112 | parallel_interactions=1, buffer_observe=True, seed=None, recorder=None 113 | ): 114 | if seed is not None: 115 | assert isinstance(seed, int) 116 | random.seed(n=seed) 117 | np.random.seed(seed=seed) 118 | tf.random.set_random_seed(seed=seed) 119 | 120 | # States/actions specification 121 | self.states_spec = util.valid_values_spec( 122 | values_spec=states, value_type='state', return_normalized=True 123 | ) 124 | self.actions_spec = util.valid_values_spec( 125 | values_spec=actions, value_type='action', return_normalized=True 126 | ) 127 | self.max_episode_timesteps = max_episode_timesteps 128 | 129 | # Check for name overlap 130 | for name in self.states_spec: 131 | if name in self.actions_spec: 132 | TensorforceError.collision( 133 | name='name', value=name, group1='states', group2='actions' 134 | ) 135 | 136 | # Parallel episodes 137 | if isinstance(parallel_interactions, int): 138 | if parallel_interactions <= 0: 139 | raise TensorforceError.value( 140 | name='parallel_interactions', value=parallel_interactions 141 | ) 142 | self.parallel_interactions = parallel_interactions 143 | else: 144 | raise TensorforceError.type(name='parallel_interactions', value=parallel_interactions) 145 | 146 | # Buffer observe 147 | if isinstance(buffer_observe, bool): 148 | if not buffer_observe and self.parallel_interactions > 1: 149 | raise TensorforceError.unexpected() 150 | if self.max_episode_timesteps is None and self.parallel_interactions > 1: 151 | raise TensorforceError.unexpected() 152 | if not buffer_observe: 153 | self.buffer_observe = 1 154 | elif self.max_episode_timesteps is None: 155 | self.buffer_observe = 100 156 | else: 157 | self.buffer_observe = self.max_episode_timesteps 158 | elif isinstance(buffer_observe, int): 159 | if buffer_observe <= 0: 160 | raise TensorforceError.value(name='buffer_observe', value=buffer_observe) 161 | if self.parallel_interactions > 1: 162 | raise TensorforceError.unexpected() 163 | if self.max_episode_timesteps is None: 164 | self.buffer_observe = buffer_observe 165 | else: 166 | self.buffer_observe = min(buffer_observe, self.max_episode_timesteps) 167 | else: 168 | raise TensorforceError.type(name='buffer_observe', value=buffer_observe) 169 | 170 | # Parallel terminal/reward buffers 171 | self.terminal_buffers = np.ndarray( 
172 | shape=(self.parallel_interactions, self.buffer_observe), 173 | dtype=util.np_dtype(dtype='long') 174 | ) 175 | self.reward_buffers = np.ndarray( 176 | shape=(self.parallel_interactions, self.buffer_observe), 177 | dtype=util.np_dtype(dtype='float') 178 | ) 179 | 180 | # Parallel buffer indices 181 | self.buffer_indices = np.zeros( 182 | shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='int') 183 | ) 184 | 185 | self.timesteps = 0 186 | self.episodes = 0 187 | self.updates = 0 188 | 189 | # Recorder 190 | if recorder is None: 191 | pass 192 | elif not all(key in ('directory', 'frequency', 'max-traces') for key in recorder): 193 | raise TensorforceError.value(name='recorder', value=list(recorder)) 194 | self.recorder_spec = recorder 195 | if self.recorder_spec is not None: 196 | self.record_states = OrderedDict(((name, list()) for name in self.states_spec)) 197 | for name, spec in self.actions_spec.items(): 198 | if spec['type'] == 'int': 199 | self.record_states[name + '_mask'] = list() 200 | self.record_actions = OrderedDict(((name, list()) for name in self.actions_spec)) 201 | self.record_terminal = list() 202 | self.record_reward = list() 203 | self.num_episodes = 0 204 | 205 | def __str__(self): 206 | return self.__class__.__name__ 207 | 208 | def initialize(self): 209 | """ 210 | Initializes the agent. 211 | """ 212 | if not hasattr(self, 'model'): 213 | raise TensorforceError.missing(name='Agent', value='model') 214 | 215 | # Setup Model 216 | # (create and build graph (local and global if distributed), server, session, etc..). 217 | self.model.initialize() 218 | self.reset() 219 | 220 | def close(self): 221 | """ 222 | Closes the agent. 223 | """ 224 | self.model.close() 225 | 226 | def reset(self): 227 | """ 228 | Resets the agent to start a new episode. 229 | """ 230 | self.buffer_indices = np.zeros( 231 | shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='int') 232 | ) 233 | self.timesteps, self.episodes, self.updates = self.model.reset() 234 | 235 | def act( 236 | self, states, int_bool=False, int_act=None, parallel=0, deterministic=False, independent=False, evaluation=False, 237 | query=None, **kwargs 238 | ): 239 | """ 240 | Returns action(s) for the given state(s), needs to be followed by `observe(...)` unless 241 | `independent` is true. 242 | 243 | Args: 244 | states (dict[state]): Dictionary containing state(s) to be acted on 245 | (required). 246 | parallel (int): Parallel execution index 247 | (default: 0). 248 | deterministic (bool): Whether to apply exploration and sampling 249 | (default: false). 250 | independent (bool): Whether action is not remembered, and this call is thus not 251 | followed by observe 252 | (default: false). 253 | evaluation (bool): Whether the agent is currently evaluated, implies and overwrites 254 | deterministic and independent 255 | (default: false). 256 | query (list[str]): Names of tensors to retrieve 257 | (default: none). 258 | kwargs: Additional input values, for instance, for dynamic hyperparameters. 259 | 260 | Returns: 261 | (dict[action], plus optional list[str]): Dictionary containing action(s), plus queried 262 | tensor values if requested. 
263 | """ 264 | assert util.reduce_all(predicate=util.not_nan_inf, xs=states) 265 | 266 | ''' 267 | # ***** NING EDIT **** 268 | if kwargs is not None: 269 | extra = dict(kwargs) 270 | inre = False 271 | for item in extra: 272 | if item=='inre': 273 | inre = True 274 | interactive_action = extra['interactive_action'] 275 | else: 276 | inre = False 277 | # ****** NING EDIT END *** 278 | ''' 279 | 280 | # self.current_internals = self.next_internals 281 | if evaluation: 282 | if deterministic or independent: 283 | raise TensorforceError.unexpected() 284 | deterministic = independent = True 285 | 286 | # Auxiliaries 287 | auxiliaries = OrderedDict() 288 | if isinstance(states, dict): 289 | states = dict(states) 290 | for name, spec in self.actions_spec.items(): 291 | if spec['type'] == 'int' and name + '_mask' in states: 292 | auxiliaries[name + '_mask'] = states.pop(name + '_mask') 293 | 294 | # Normalize states dictionary 295 | states = util.normalize_values( 296 | value_type='state', values=states, values_spec=self.states_spec 297 | ) 298 | 299 | # Batch states 300 | states = util.fmap(function=(lambda x: np.asarray([x])), xs=states, depth=1) 301 | auxiliaries = util.fmap(function=(lambda x: np.asarray([x])), xs=auxiliaries, depth=1) 302 | 303 | # Model.act() 304 | if query is None: 305 | actions, self.timesteps = self.model.act( 306 | states=states, auxiliaries=auxiliaries, parallel=parallel, 307 | deterministic=deterministic, independent=independent, **kwargs 308 | ) 309 | 310 | else: 311 | actions, self.timesteps, queried = self.model.act( 312 | states=states, auxiliaries=auxiliaries, parallel=parallel, 313 | deterministic=deterministic, independent=independent, query=query, **kwargs 314 | ) 315 | 316 | 317 | 318 | if int_bool: 319 | print("\nModel action is:") 320 | print(actions['action']) 321 | actions['action'] = [int_act] 322 | print("\nUser action is:") 323 | print(actions['action']) 324 | 325 | 326 | if self.recorder_spec is not None and not independent: 327 | for name in self.states_spec: 328 | self.record_states[name].append(states[name]) 329 | for name, spec in self.actions_spec.items(): 330 | self.record_actions[name].append(actions[name]) 331 | if spec['type'] == 'int': 332 | if name + '_mask' in auxiliaries: 333 | self.record_states[name].append(auxiliaries[name + '_mask']) 334 | else: 335 | shape = (1,) + spec['shape'] + (spec['num_values'],) 336 | self.record_states[name].append( 337 | np.full(shape, True, dtype=util.np_dtype(dtype='bool')) 338 | ) 339 | 340 | # Unbatch actions 341 | actions = util.fmap(function=(lambda x: x[0]), xs=actions, depth=1) 342 | 343 | # Reverse normalized actions dictionary 344 | actions = util.unpack_values( 345 | value_type='action', values=actions, values_spec=self.actions_spec 346 | ) 347 | 348 | # if independent, return processed state as well? 349 | 350 | if query is None: 351 | return actions 352 | else: 353 | return actions, queried 354 | 355 | def observe(self, reward, terminal=False, parallel=0, query=None, **kwargs): 356 | """ 357 | Observes reward and whether a terminal state is reached, needs to be preceded by 358 | `act(...)`. 359 | 360 | Args: 361 | reward (float): Reward 362 | (required). 363 | terminal (bool | 0 | 1 | 2): Whether a terminal state is reached or 2 if the 364 | episode was aborted (default: false). 365 | parallel (int): Parallel execution index 366 | (default: 0). 367 | query (list[str]): Names of tensors to retrieve 368 | (default: none). 
369 | kwargs: Additional input values, for instance, for dynamic hyperparameters. 370 | 371 | Returns: 372 | (bool, optional list[str]): Whether an update was performed, plus queried tensor values 373 | if requested. 374 | """ 375 | assert util.reduce_all(predicate=util.not_nan_inf, xs=reward) 376 | 377 | if query is not None and self.parallel_interactions > 1: 378 | raise TensorforceError.unexpected() 379 | 380 | if isinstance(terminal, bool): 381 | terminal = int(terminal) 382 | 383 | if self.recorder_spec is not None: 384 | self.record_terminal.append(terminal) 385 | self.record_reward.append(reward) 386 | if terminal > 0: 387 | self.num_episodes += 1 388 | 389 | if self.num_episodes == self.recorder_spec.get('frequency', 1): 390 | directory = self.recorder_spec['directory'] 391 | if os.path.isdir(directory): 392 | files = sorted( 393 | f for f in os.listdir(directory) 394 | if os.path.isfile(os.path.join(directory, f)) 395 | and f.startswith('trace-') 396 | ) 397 | else: 398 | os.makedirs(directory) 399 | files = list() 400 | max_traces = self.recorder_spec.get('max-traces') 401 | if max_traces is not None and len(files) > max_traces - 1: 402 | for filename in files[:-max_traces + 1]: 403 | filename = os.path.join(directory, filename) 404 | os.remove(filename) 405 | 406 | filename = 'trace-{}-{}.npz'.format( 407 | self.episodes, time.strftime('%Y%m%d-%H%M%S') 408 | ) 409 | filename = os.path.join(directory, filename) 410 | self.record_states = util.fmap( 411 | function=np.concatenate, xs=self.record_states, depth=1 412 | ) 413 | self.record_actions = util.fmap( 414 | function=np.concatenate, xs=self.record_actions, depth=1 415 | ) 416 | self.record_terminal = np.asarray(self.record_terminal) 417 | self.record_reward = np.asarray(self.record_reward) 418 | np.savez_compressed( 419 | filename, **self.record_states, **self.record_actions, 420 | terminal=self.record_terminal, reward=self.record_reward 421 | ) 422 | self.record_states = util.fmap( 423 | function=(lambda x: list()), xs=self.record_states, depth=1 424 | ) 425 | self.record_actions = util.fmap( 426 | function=(lambda x: list()), xs=self.record_actions, depth=1 427 | ) 428 | self.record_terminal = list() 429 | self.record_reward = list() 430 | self.num_episodes = 0 431 | 432 | # Update terminal/reward buffer 433 | index = self.buffer_indices[parallel] 434 | self.terminal_buffers[parallel, index] = terminal 435 | self.reward_buffers[parallel, index] = reward 436 | index += 1 437 | 438 | if self.max_episode_timesteps is not None and index > self.max_episode_timesteps: 439 | raise TensorforceError.unexpected() 440 | 441 | if terminal > 0 or index == self.buffer_observe or query is not None: 442 | # Model.observe() 443 | if query is None: 444 | updated, self.episodes, self.updates = self.model.observe( 445 | terminal=self.terminal_buffers[parallel, :index], 446 | reward=self.reward_buffers[parallel, :index], parallel=parallel, **kwargs 447 | ) 448 | 449 | else: 450 | updated, self.episodes, self.updates, queried = self.model.observe( 451 | terminal=self.terminal_buffers[parallel, :index], 452 | reward=self.reward_buffers[parallel, :index], parallel=parallel, query=query, 453 | **kwargs 454 | ) 455 | 456 | # Reset buffer index 457 | self.buffer_indices[parallel] = 0 458 | 459 | else: 460 | # Increment buffer index 461 | self.buffer_indices[parallel] = index 462 | updated = False 463 | 464 | if query is None: 465 | return updated 466 | else: 467 | return updated, queried 468 | 469 | def save(self, directory=None, filename=None, 
append_timestep=True): 470 | """ 471 | Saves the current state of the agent. 472 | 473 | Args: 474 | directory (str): Checkpoint directory 475 | (default: directory specified for 476 | TensorFlow saver). 477 | filename (str): Checkpoint filename 478 | (default: filename specified for 479 | TensorFlow saver). 480 | append_timestep: Whether to append the current timestep to the checkpoint file 481 | (default: true). 482 | 483 | Returns: 484 | str: Checkpoint path. 485 | """ 486 | # TODO: Messes with required parallel disentangling, better to remove unfinished episodes 487 | # from memory, but currently entire episode buffered anyway... 488 | # # Empty buffers before saving 489 | # for parallel in range(self.parallel_interactions): 490 | # index = self.buffer_indices[parallel] 491 | # if index > 0: 492 | # # if self.parallel_interactions > 1: 493 | # # raise TensorforceError.unexpected() 494 | # self.episode = self.model.observe( 495 | # terminal=self.terminal_buffers[parallel, :index], 496 | # reward=self.reward_buffers[parallel, :index], parallel=parallel 497 | # ) 498 | # self.buffer_indices[parallel] = 0 499 | 500 | return self.model.save( 501 | directory=directory, filename=filename, append_timestep=append_timestep 502 | ) 503 | 504 | def restore(self, directory=None, filename=None): 505 | """ 506 | Restores the agent. 507 | 508 | Args: 509 | directory (str): Checkpoint directory 510 | (default: directory specified for 511 | TensorFlow saver). 512 | filename (str): Checkpoint filename 513 | (default: latest checkpoint in 514 | directory). 515 | """ 516 | if not hasattr(self, 'model'): 517 | raise TensorforceError.missing(name='Agent', value='model') 518 | 519 | if not self.model.is_initialized: 520 | self.model.initialize() 521 | 522 | self.timesteps, self.episodes, self.updates = self.model.restore( 523 | directory=directory, filename=filename 524 | ) 525 | 526 | def get_output_tensors(self, function): 527 | """ 528 | Returns the names of output tensors for the given function. 529 | 530 | Args: 531 | function (str): Function name 532 | (required). 533 | 534 | Returns: 535 | list[str]: Names of output tensors. 536 | """ 537 | if function in self.model.output_tensors: 538 | return self.model.output_tensors[function] 539 | else: 540 | raise TensorforceError.unexpected() 541 | 542 | def get_query_tensors(self, function): 543 | """ 544 | Returns the names of queryable tensors for the given function. 545 | 546 | Args: 547 | function (str): Function name 548 | (required). 549 | 550 | Returns: 551 | list[str]: Names of queryable tensors. 552 | """ 553 | if function in self.model.query_tensors: 554 | return self.model.query_tensors[function] 555 | else: 556 | raise TensorforceError.unexpected() 557 | 558 | def get_available_summaries(self): 559 | """ 560 | Returns the summary labels provided by the agent. 561 | 562 | Returns: 563 | list[str]: Available summary labels. 564 | """ 565 | return self.model.get_available_summaries() 566 | 567 | def should_stop(self): 568 | return self.model.monitored_session.should_stop() 569 | --------------------------------------------------------------------------------
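
Taken together, the `int_bool`/`int_act` arguments added to `Agent.act(...)` above are what let the runner substitute a user-ranked action for the model's sampled one during the first, interactive episode. The following is a minimal single-episode sketch of that act/observe cycle; the function name `run_interactive_episode` and the `agent`, `runner`, and `env` objects it takes are illustrative placeholders (the modified Agent, the custom Runner, and a Tensorforce-style environment exposing reset()/execute()), not part of this repository:

    def run_interactive_episode(agent, runner, env, interactive=True):
        # Sketch of the timestep loop in Runner.run_episode, reduced to its essentials.
        states = env.reset()
        terminal = 0
        while terminal == 0:
            if interactive and agent.episodes == 0:
                # Let the user rank sampled actions, as in Runner.action_rank above,
                # and have act() substitute the chosen action via int_bool/int_act.
                user_action = runner.action_rank(states=states, evaluation=False)
                actions = agent.act(states=states, int_bool=True, int_act=user_action)
            else:
                actions = agent.act(states=states)
            states, terminal, reward = env.execute(actions=actions)
            if isinstance(terminal, bool):
                terminal = int(terminal)
            # observe() buffers the result and triggers an update when appropriate.
            agent.observe(terminal=terminal, reward=reward)

Because `observe()` only flushes its buffer on a terminal signal or a full buffer, the update frequency seen by the caller depends on `buffer_observe` and `max_episode_timesteps` as configured in the Agent constructor above.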