├── .gitignore
├── A3C
│   ├── .gitignore
│   ├── README.md
│   ├── a3c_ff.py
│   ├── a3c_lstm.py
│   └── run_a3c.py
├── CEM
│   ├── .gitignore
│   ├── README.md
│   ├── cem_gym.py
│   └── continuous_cem_gym.py
├── DP
│   ├── README.md
│   ├── policy_iteration_gym.py
│   └── value_iteration_gym.py
├── DQN
│   ├── .gitignore
│   ├── README.md
│   ├── dqn.py
│   ├── drqn.py
│   └── run_dqn.py
├── FA
│   ├── .gitignore
│   ├── README.md
│   └── q_learning_gym.py
├── GEN
│   ├── README.md
│   └── genetic_gym.py
├── LICENSE
├── MC
│   ├── .directory
│   ├── Blackjack Playground.ipynb
│   ├── MC Control with Epsilon-Greedy Policies.ipynb
│   ├── MC Prediction.ipynb
│   ├── Off-Policy MC Control with Weighted Importance Sampling.ipynb
│   └── README.md
├── PG
│   ├── .gitignore
│   ├── README.md
│   ├── reinforce.py
│   └── run_reinforce.py
├── README.md
├── TD
│   ├── README.md
│   ├── evsarsa.py
│   ├── qlearning.py
│   ├── run.py
│   └── sarsa.py
├── agents
│   ├── README.md
│   ├── __init__.py
│   ├── agent_networks.py
│   └── agent_states.py
├── common
│   ├── __init__.py
│   ├── buffer.py
│   ├── networks.py
│   ├── schedules.py
│   └── segment_tree.py
├── environment_corners.ipynb
├── jedi_upload.py
├── requirements.txt
├── wrappers
│   ├── __init__.py
│   ├── gym_wrappers.py
│   └── run_wrappers.py
└── xvfb

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
91 | .idea
92 | .code
93 | 
94 | *.png
95 | *.pid
96 | 
97 | *tmp*
98 | *logs*
99 | *.log
100 | 
101 | solutions.md
102 | *.swp
103 | *.ipynb
104 | vizdoom.ini
--------------------------------------------------------------------------------
/A3C/.gitignore:
--------------------------------------------------------------------------------
1 | *.pkl
2 | logs*
--------------------------------------------------------------------------------
/A3C/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/rl-course-experiments/47e7666e2bc618ca4007372f411c3626fd7efc2b/A3C/README.md
--------------------------------------------------------------------------------
/A3C/a3c_ff.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | 
3 | import tensorflow as tf
4 | from rstools.tf.optimization import build_model_optimization, build_scope_optimization
5 | 
6 | from agents.agent_networks import FeatureNet, PolicyNet, ValueNet
7 | from agents.agent_states import LinearHiddenState
8 | 
9 | 
10 | class A3CFFAgent(object):
11 |     def __init__(self, state_shape, n_actions, network, special=None):
12 |         self.special = special or {}
13 |         self.state_shape = state_shape
14 |         self.n_actions = n_actions
15 | 
16 | 
17 |         # `special` defaults to an empty dict above, so `.get()` is always safe
18 |         self.scope = self.special.get("scope", "a3c_ff")
19 | 
20 |         with tf.variable_scope(self.scope):
21 |             self._build_graph(network)
22 | 
23 |     def _build_graph(self, network):
24 |         self.feature_net = FeatureNet(
25 |             self.state_shape, network,
26 |             self.special.get("feature_net", None))
27 | 
28 |         self.hidden_state = LinearHiddenState(
29 |             self.feature_net.feature_state,
30 |             self.special.get("hidden_size", 512),
31 |             self.special.get("hidden_activation", tf.nn.elu))
32 | 
33 |         self.policy_net = PolicyNet(
34 |             self.hidden_state.state, self.n_actions,
35 |             self.special.get("policy_net", None))
36 |         self.value_net = ValueNet(
37 |             self.hidden_state.state,
38 |             self.special.get("value_net", None))
39 | 
40 |         build_model_optimization(
41 |             self.policy_net,
42 |             self.special.get("policy_net_optimization", None))
43 |         build_model_optimization(
44 |             self.value_net,
45 |             self.special.get("value_net_optimization", None))
46 |         build_model_optimization(
47 |             self.hidden_state,
48 |             self.special.get("hidden_state_optimization", None),
49 |             loss=0.5 * (self.policy_net.loss + self.value_net.loss))
50 |         build_model_optimization(
51 |             self.feature_net,
52 |             self.special.get("feature_net_optimization", None),
53 |             loss=0.5 * (self.policy_net.loss + self.value_net.loss))
54 | 
55 |     def predict_values(self, sess, state_batch):
56 |         return sess.run(
57 |             self.value_net.predicted_values_for_actions,
58 |             feed_dict={
59 |                 self.feature_net.states: state_batch,
60 |                 self.feature_net.is_training: False})
61 | 
62 |     def predict_probs(self, sess, state_batch):
63 |         return sess.run(
64 |             self.policy_net.predicted_probs,
65 |             feed_dict={
66 |                 self.feature_net.states: state_batch,
67 |                 self.feature_net.is_training: False})
68 | 
--------------------------------------------------------------------------------
/A3C/a3c_lstm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | 
3 | import tensorflow as tf
4 | from rstools.tf.optimization import build_model_optimization, build_scope_optimization
5 | from tensorflow.contrib import rnn
6 | 
7 | from agents.agent_networks import FeatureNet, PolicyNet, ValueNet
8 | from agents.agent_states import RecurrentHiddenState, get_state_variables, get_state_update_op
9 | 
10 | 
11 | class A3CLstmAgent(object):
12 |     def __init__(self, state_shape, n_actions, network, special=None):
13 |         self.state_shape = state_shape
14 |         self.n_actions = n_actions
15 | 
16 |         self.special = special or {}
17 |         self.scope = self.special.get("scope", "a3c_lstm")
18 | 
19 |         with tf.variable_scope(self.scope):
20 |             self._build_graph(network)
21 | 
22 |     def _build_graph(self, network):
23 |         self.feature_net = FeatureNet(
24 |             self.state_shape, network,
25 |             self.special.get("feature_net", None))
26 | 
27 |         self.hidden_state = RecurrentHiddenState(
28 |             self.feature_net.feature_state,
29 |             self.special.get("hidden_size", 512),
30 |             self.special.get("hidden_activation", tf.tanh),
31 |             self.special.get("batch_size", 1))
32 | 
33 |         self.policy_net = 
PolicyNet( 34 | self.hidden_state.state, self.n_actions, 35 | self.special.get("policy_net", {})) 36 | self.value_net = ValueNet( 37 | self.hidden_state.state, 38 | self.special.get("value_net", {})) 39 | 40 | build_model_optimization( 41 | self.policy_net, 42 | self.special.get("policy_net_optimization", None)) 43 | build_model_optimization( 44 | self.value_net, 45 | self.special.get("value_net_optimization", None)) 46 | build_model_optimization( 47 | self.hidden_state, 48 | self.special.get("hidden_state_optimization", None), 49 | loss=0.5 * (self.value_net.loss + self.policy_net.loss)) 50 | build_model_optimization( 51 | self.feature_net, self.special.get("feature_net_optimization", None), 52 | loss=0.5 * (self.value_net.loss + self.policy_net.loss)) 53 | 54 | def predict_values(self, sess, state_batch): 55 | return sess.run( 56 | self.value_net.predicted_values_for_actions, 57 | feed_dict={ 58 | self.feature_net.states: state_batch, 59 | self.feature_net.is_training: False 60 | }) 61 | 62 | def predict_probs(self, sess, state_batch): 63 | return sess.run( 64 | self.policy_net.predicted_probs, 65 | feed_dict={ 66 | self.feature_net.states: state_batch, 67 | self.feature_net.is_training: False 68 | }) 69 | 70 | def update_belief_state(self, sess, state_batch, done_batch): 71 | _ = sess.run( 72 | self.hidden_state.belief_update, 73 | feed_dict={ 74 | self.feature_net.states: state_batch, 75 | self.hidden_state.is_end: done_batch, 76 | self.feature_net.is_training: False 77 | }) 78 | 79 | def assign_belief_state(self, sess, new_belief): 80 | _ = sess.run( 81 | self.hidden_state.belief_assign, 82 | feed_dict={ 83 | self.hidden_state.belief_out: new_belief 84 | }) 85 | 86 | def get_belief_state(self, sess): 87 | return sess.run(self.hidden_state.belief_state) 88 | -------------------------------------------------------------------------------- /A3C/run_a3c.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | from rstools.utils.batch_utils import iterate_minibatches, merge_generators 5 | from tqdm import trange 6 | 7 | from A3C.a3c_ff import A3CFFAgent 8 | from A3C.a3c_lstm import A3CLstmAgent 9 | from common.networks import activations 10 | from wrappers.gym_wrappers import Transition 11 | from wrappers.run_wrappers import typical_args, typical_argsparse, run_wrapper, update_wraper, \ 12 | epsilon_greedy_policy, play_session 13 | 14 | 15 | def update(sess, a3c_agent, transitions, initial_state=None, 16 | discount_factor=0.99, reward_norm=1.0, 17 | batch_size=32, time_major=True): 18 | policy_targets = [] 19 | value_targets = [] 20 | state_history = [] 21 | action_history = [] 22 | done_history = [] 23 | 24 | cumulative_reward = np.zeros_like(transitions[-1].reward) + \ 25 | np.invert(transitions[-1].done) * \ 26 | a3c_agent.predict_values(sess, transitions[-1].next_state) 27 | for transition in reversed(transitions): 28 | cumulative_reward = reward_norm * transition.reward + \ 29 | np.invert(transition.done) * discount_factor * cumulative_reward 30 | policy_target = cumulative_reward - a3c_agent.predict_values(sess, transition.state) 31 | 32 | value_targets.append(cumulative_reward) 33 | policy_targets.append(policy_target) 34 | state_history.append(transition.state) 35 | action_history.append(transition.action) 36 | done_history.append(transition.done) 37 | 38 | value_targets = np.array(value_targets[::-1]) # time-major 39 | policy_targets = np.array(policy_targets[::-1]) 40 | state_history = 
np.array(state_history[::-1]) 41 | action_history = np.array(action_history[::-1]) 42 | done_history = np.array(done_history[::-1]) 43 | 44 | if isinstance(a3c_agent, A3CLstmAgent): 45 | a3c_agent.assign_belief_state(sess, initial_state) 46 | 47 | time_len = state_history.shape[0] 48 | value_loss, policy_loss = 0.0, 0.0 49 | for state_axis, action_axis, value_target_axis, policy_target_axis, done_axis in \ 50 | zip(state_history, action_history, value_targets, policy_targets, done_history): 51 | axis_len = state_axis.shape[0] 52 | axis_value_loss, axis_policy_loss = 0.0, 0.0 53 | 54 | state_axis = iterate_minibatches(state_axis, batch_size) 55 | action_axis = iterate_minibatches(action_axis, batch_size) 56 | value_target_axis = iterate_minibatches(value_target_axis, batch_size) 57 | policy_target_axis = iterate_minibatches(policy_target_axis, batch_size) 58 | done_axis = iterate_minibatches(done_axis, batch_size) 59 | 60 | batch_generator = merge_generators( 61 | [state_axis, action_axis, value_target_axis, policy_target_axis, done_axis]) 62 | 63 | for state_batch, action_batch, value_target, policy_target, done_batch in batch_generator: 64 | run_params = [ 65 | a3c_agent.policy_net.loss, 66 | a3c_agent.value_net.loss, 67 | a3c_agent.policy_net.train_op, 68 | a3c_agent.value_net.train_op, 69 | a3c_agent.feature_net.train_op] 70 | feed_params = { 71 | a3c_agent.feature_net.states: state_batch, 72 | a3c_agent.feature_net.is_training: True, 73 | a3c_agent.policy_net.actions: action_batch, 74 | a3c_agent.policy_net.cumulative_rewards: policy_target, 75 | a3c_agent.policy_net.is_training: True, 76 | a3c_agent.value_net.td_target: value_target, 77 | a3c_agent.value_net.is_training: True 78 | } 79 | 80 | if isinstance(a3c_agent, A3CLstmAgent): 81 | run_params += [a3c_agent.hidden_state.belief_update] 82 | feed_params[a3c_agent.hidden_state.is_end] = done_batch 83 | 84 | run_result = sess.run( 85 | run_params, 86 | feed_dict=feed_params) 87 | 88 | batch_loss_policy = run_result[0] 89 | batch_loss_state = run_result[1] 90 | 91 | axis_value_loss += batch_loss_state 92 | axis_policy_loss += batch_loss_policy 93 | 94 | policy_loss += axis_policy_loss / axis_len 95 | value_loss += axis_value_loss / axis_len 96 | 97 | return policy_loss / time_len, value_loss / time_len 98 | 99 | 100 | def generate_sessions(sess, a3c_agent, env_pool, update_fn, t_max=1000): 101 | total_reward = 0.0 102 | total_games = 0.0 103 | 104 | transitions = [] 105 | init_state = None 106 | if isinstance(a3c_agent, A3CLstmAgent): 107 | init_state = a3c_agent.get_belief_state(sess) 108 | 109 | states = env_pool.pool_states() 110 | for t in range(t_max): 111 | actions = epsilon_greedy_policy(a3c_agent, sess, states) 112 | next_states, rewards, dones, _ = env_pool.step(actions) 113 | 114 | transitions.append(Transition( 115 | state=states, action=actions, reward=rewards, next_state=next_states, done=dones)) 116 | 117 | if isinstance(a3c_agent, A3CLstmAgent): 118 | a3c_agent.update_belief_state(sess, states, dones) 119 | 120 | states = next_states 121 | 122 | total_reward += rewards.sum() 123 | total_games += dones.sum() 124 | 125 | if env_pool.n_envs == 1 and total_games > 0: 126 | break 127 | 128 | total_policy_loss, total_value_loss = update_fn(sess, a3c_agent, transitions, init_state) 129 | 130 | return total_reward / env_pool.n_envs, \ 131 | total_policy_loss, total_value_loss, \ 132 | t / (total_games / env_pool.n_envs) 133 | 134 | 135 | def a3c_learning( 136 | sess, agent, env, update_fn, 137 | n_epochs=1000, n_sessions=100, 
t_max=1000): 138 | tr = trange( 139 | n_epochs, 140 | desc="", 141 | leave=True) 142 | 143 | history = { 144 | "reward": np.zeros(n_epochs), 145 | "policy_loss": np.zeros(n_epochs), 146 | "value_loss": np.zeros(n_epochs), 147 | "steps": np.zeros(n_epochs), 148 | } 149 | 150 | for i in tr: 151 | sessions = [ 152 | generate_sessions(sess, agent, env, update_fn, t_max) 153 | for _ in range(n_sessions)] 154 | session_rewards, session_policy_loss, session_value_loss, session_steps = \ 155 | map(np.array, zip(*sessions)) 156 | 157 | history["reward"][i] = np.mean(session_rewards) 158 | history["policy_loss"][i] = np.mean(session_policy_loss) 159 | history["value_loss"][i] = np.mean(session_value_loss) 160 | history["steps"][i] = np.mean(session_steps) 161 | 162 | desc = "\t".join( 163 | ["{} = {:.3f}".format(key, value[i]) for key, value in history.items()]) 164 | tr.set_description(desc) 165 | 166 | return history 167 | 168 | 169 | def run(env_name, make_env_fn, agent_cls, 170 | run_args, update_args, agent_agrs, 171 | log_dir=None, episode_limit=None, 172 | plot_stats=False, api_key=None, 173 | load=False, gpu_option=0.4, 174 | n_games=10): 175 | run_wrapper( 176 | n_games, a3c_learning, update_wraper(update, **update_args), 177 | play_session, epsilon_greedy_policy, 178 | env_name, make_env_fn, agent_cls, 179 | run_args, agent_agrs, 180 | log_dir=log_dir, episode_limit=episode_limit, 181 | plot_stats=plot_stats, api_key=api_key, 182 | load=load, gpu_option=gpu_option) 183 | 184 | 185 | def _parse_args(): 186 | parser = argparse.ArgumentParser(description='A3C Agent Learning') 187 | # typical params 188 | parser.add_argument( 189 | '--agent', 190 | type=str, 191 | default="feed_forward", 192 | choices=["feed_forward", "recurrent"], 193 | help='Which agent to use. (default: %(default)s)') 194 | 195 | parser = typical_args(parser) 196 | 197 | # agent special params & optimization 198 | parser.add_argument( 199 | '--policy_lr', 200 | type=float, 201 | default=1e-5, 202 | help='Learning rate for policy network. (default: %(default)s)') 203 | parser.add_argument( 204 | '--value_lr', 205 | type=float, 206 | default=1e-5, 207 | help='Learning rate for value network. (default: %(default)s)') 208 | 209 | parser.add_argument( 210 | '--entropy_factor', 211 | type=float, 212 | default=1e-2, 213 | help='Entropy factor for policy network. 
(default: %(default)s)') 214 | 215 | args = parser.parse_args() 216 | return args 217 | 218 | 219 | def main(): 220 | args = _parse_args() 221 | 222 | assert args.time_major, "Please, use time_major flag for updates" 223 | 224 | network, run_args, update_args, optimization_params, make_env_fn = typical_argsparse(args) 225 | 226 | policy_optimization_params = { 227 | **optimization_params, 228 | **{"initial_lr": args.policy_lr} 229 | } 230 | 231 | value_optimization_params = { 232 | **optimization_params, 233 | **{"initial_lr": args.value_lr} 234 | } 235 | policy_net_params = { 236 | "entropy_factor": args.entropy_factor 237 | } 238 | 239 | agent_cls = A3CFFAgent if args.agent == "feed_forward" else A3CLstmAgent 240 | 241 | special = { 242 | "policy_net": policy_net_params, 243 | "hidden_size": args.hidden_size, 244 | "hidden_activation": activations[args.hidden_activation], 245 | "feature_net_optimization": optimization_params, 246 | "hidden_state_optimization": optimization_params, 247 | "value_net_optimization": value_optimization_params, 248 | "policy_net_optimization": policy_optimization_params, 249 | } 250 | 251 | agent_args = { 252 | "network": network, 253 | "special": special 254 | } 255 | 256 | run(args.env, make_env_fn, agent_cls, 257 | run_args, update_args, agent_args, 258 | args.log_dir, args.episode_limit, 259 | args.plot_history, args.api_key, 260 | args.load, args.gpu_option, 261 | args.n_games) 262 | 263 | 264 | if __name__ == '__main__': 265 | main() 266 | -------------------------------------------------------------------------------- /CEM/.gitignore: -------------------------------------------------------------------------------- 1 | *.pkl 2 | logs* 3 | -------------------------------------------------------------------------------- /CEM/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Scitator/rl-course-experiments/47e7666e2bc618ca4007372f411c3626fd7efc2b/CEM/README.md -------------------------------------------------------------------------------- /CEM/cem_gym.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import gym 4 | from gym import wrappers 5 | import argparse 6 | import pickle 7 | import numpy as np 8 | from tqdm import trange 9 | from joblib import Parallel, delayed 10 | import collections 11 | import sklearn.pipeline 12 | import sklearn.preprocessing 13 | from sklearn.neural_network import MLPClassifier 14 | from sklearn.kernel_approximation import RBFSampler 15 | 16 | from matplotlib import pyplot as plt 17 | 18 | plt.style.use("ggplot") 19 | 20 | 21 | def plot_unimetric(history, metric, save_dir): 22 | plt.figure() 23 | plt.plot(history[metric]) 24 | plt.title('model {}'.format(metric)) 25 | plt.ylabel(metric) 26 | plt.xlabel('epoch') 27 | plt.savefig("{}/{}.png".format(save_dir, metric), 28 | format='png', dpi=300) 29 | 30 | 31 | class Estimator(object): 32 | """ 33 | Value Function approximator. 34 | """ 35 | 36 | def __init__(self, env, layers): 37 | self.n_actions = env.action_space.n 38 | self._prepare_estimator_for_env(env) 39 | self.model = MLPClassifier( 40 | hidden_layer_sizes=layers, 41 | activation='tanh', 42 | warm_start=True, 43 | max_iter=1) 44 | # We need to call partial_fit once to initialize the model 45 | # or we get a NotFittedError when trying to make a prediction 46 | # This is quite hacky. 
47 | self.model.fit( 48 | [self.featurize_state(env.reset())] * self.n_actions, 49 | range(self.n_actions)) 50 | 51 | def _prepare_estimator_for_env(self, env): 52 | observation_examples = np.array( 53 | [env.observation_space.sample() for _ in range(10000)]) 54 | observation_examples = self._vectorise_state(observation_examples) 55 | 56 | scaler = sklearn.preprocessing.StandardScaler() 57 | scaler.fit(observation_examples) 58 | self.scaler = scaler 59 | 60 | featurizer = sklearn.pipeline.FeatureUnion([ 61 | ("rbf1", RBFSampler(gamma=5.0, n_components=100)), 62 | ("rbf2", RBFSampler(gamma=2.0, n_components=100)), 63 | ("rbf3", RBFSampler(gamma=1.0, n_components=100)), 64 | ("rbf4", RBFSampler(gamma=0.5, n_components=100)) 65 | ]) 66 | featurizer.fit(scaler.transform(observation_examples)) 67 | self.featurizer = featurizer 68 | 69 | def _vectorise_state(self, states): 70 | obs_shape = states.shape 71 | if len(obs_shape) < 2: # just one observation 72 | states = np.expand_dims(states, 0) 73 | elif len(obs_shape) > 2: # some many states magic 74 | states = states.reshape((obs_shape[0], -1)) 75 | return states 76 | 77 | def featurize_state(self, state): 78 | """ 79 | Returns the featurized representation for a state. 80 | """ 81 | state = self._vectorise_state(state) 82 | scaled = self.scaler.transform(state) 83 | featurized = self.featurizer.transform(scaled) 84 | if featurized.shape[0] == 1: 85 | return featurized[0] 86 | else: 87 | return featurized 88 | 89 | def predict_proba(self, s): 90 | features = self.featurize_state(s) 91 | return self.model.predict_proba([features]) 92 | 93 | def fit(self, s, y): 94 | features = self.featurize_state(s) 95 | self.model.partial_fit(features, y) 96 | 97 | 98 | def generate_session(env, agent, t_max=int(1e5), step_penalty=0.01): 99 | states, actions = [], [] 100 | total_reward = 0 101 | 102 | s = env.reset() 103 | 104 | for t in range(t_max): 105 | 106 | # predict array of action probabilities 107 | probs = agent.predict_proba([s])[0] 108 | 109 | a = np.random.choice(env.action_space.n, p=probs) 110 | 111 | new_s, r, done, info = env.step(a) 112 | 113 | # record sessions like you did before 114 | states.append(s) 115 | actions.append(a) 116 | total_reward += r 117 | 118 | s = new_s 119 | if done: 120 | break 121 | 122 | total_reward -= t * step_penalty 123 | 124 | return states, actions, total_reward, t 125 | 126 | 127 | glob_env = None 128 | glob_agent = None 129 | 130 | 131 | def generate_parallel_session(t_max=int(1e5), step_penalty=0.01): 132 | states, actions = [], [] 133 | total_reward = 0 134 | 135 | s = glob_env.reset() 136 | 137 | for t in range(t_max): 138 | 139 | # predict array of action probabilities 140 | probs = glob_agent.predict_proba([s])[0] 141 | 142 | a = np.random.choice(glob_env.action_space.n, p=probs) 143 | 144 | new_s, r, done, info = glob_env.step(a) 145 | 146 | # record sessions like you did before 147 | states.append(s) 148 | actions.append(a) 149 | total_reward += r 150 | 151 | s = new_s 152 | if done: 153 | break 154 | 155 | total_reward -= t * step_penalty 156 | 157 | return states, actions, total_reward, t 158 | 159 | 160 | def generate_parallel_sessions(n, t_max, step_penalty, n_jobs=-1): 161 | return Parallel(n_jobs)(n * [delayed(generate_parallel_session)(t_max, step_penalty)]) 162 | 163 | 164 | def cem(env, agent, num_episodes, max_steps=int(1e6), step_penalty=0.01, 165 | n_samples=200, percentile=50, n_jobs=-1, verbose=False): 166 | global glob_env, glob_agent 167 | init_n_samples = n_samples 168 | final_n_samples = 
n_samples // 5 169 | plays_to_decay = num_episodes // 2 170 | 171 | states_deque = collections.deque(maxlen=int(init_n_samples * 2)) 172 | actions_deque = collections.deque(maxlen=int(init_n_samples * 2)) 173 | rewards_deque = collections.deque(maxlen=int(init_n_samples * 2)) 174 | 175 | glob_env = env # NEVER DO LIKE THIS PLEASE! 176 | glob_agent = agent 177 | 178 | # Keeps track of useful statistics 179 | history = { 180 | "threshold": np.zeros(num_episodes), 181 | "reward": np.zeros(num_episodes), 182 | "n_steps": np.zeros(num_episodes), 183 | } 184 | 185 | tr = trange( 186 | num_episodes, 187 | desc="mean reward = {:.3f}\tthreshold = {:.3f}\tmean n_steps = {:.3f}".format( 188 | 0.0, 0.0, 0.0), 189 | leave=True) 190 | 191 | for i in tr: 192 | # generate new sessions 193 | sessions = generate_parallel_sessions(n_samples, max_steps, step_penalty, n_jobs) 194 | if i < plays_to_decay: 195 | n_samples -= (init_n_samples - final_n_samples) // plays_to_decay 196 | 197 | batch_states, batch_actions, batch_rewards, batch_steps = map(np.array, zip(*sessions)) 198 | # batch_states: a list of lists of states in each session 199 | # batch_actions: a list of lists of actions in each session 200 | # batch_rewards: a list of floats - total rewards at each session 201 | states_deque.extend(batch_states) 202 | actions_deque.extend(batch_actions) 203 | rewards_deque.extend(batch_rewards) 204 | 205 | batch_states = np.array(states_deque) 206 | batch_actions = np.array(actions_deque) 207 | batch_rewards = np.array(rewards_deque) 208 | 209 | threshold = np.percentile(batch_rewards, percentile) 210 | 211 | history["threshold"][i] = threshold 212 | history["reward"][i] = np.mean(batch_rewards) 213 | history["n_steps"][i] = np.mean(batch_steps) 214 | 215 | # look like > better, cause >= refer to reuse of bad examples 216 | if i < plays_to_decay: 217 | elite_states = batch_states[batch_rewards > threshold] 218 | elite_actions = batch_actions[batch_rewards > threshold] 219 | else: 220 | elite_states = batch_states[batch_rewards >= threshold] 221 | elite_actions = batch_actions[batch_rewards >= threshold] 222 | 223 | if len(elite_actions) > 0: 224 | elite_states, elite_actions = map(np.concatenate, [elite_states, elite_actions]) 225 | # elite_states: a list of states from top games 226 | # elite_actions: a list of actions from top games 227 | try: 228 | agent.fit(elite_states, elite_actions) 229 | except: 230 | # just a hack 231 | addition = np.array([env.reset()] * env.action_space.n) 232 | elite_states = np.vstack((elite_states, addition)) 233 | elite_actions = np.hstack((elite_actions, list(range(env.action_space.n)))) 234 | agent.fit(elite_states, elite_actions) 235 | 236 | tr.set_description( 237 | "mean reward = {:.3f}\tthreshold = {:.3f}\tmean n_steps = {:.3f}".format( 238 | np.mean(batch_rewards) + step_penalty * np.mean(batch_steps), 239 | threshold, np.mean(batch_steps))) 240 | 241 | return history 242 | 243 | 244 | def _parse_args(): 245 | parser = argparse.ArgumentParser(description='Policy iteration example') 246 | parser.add_argument( 247 | '--env', 248 | type=str, 249 | default='CartPole-v0', # CartPole-v0, MountainCar-v0, LunarLander-v2 250 | help='The environment to use') 251 | parser.add_argument( 252 | '--num_episodes', 253 | type=int, 254 | default=200, 255 | help='Number of episodes') 256 | parser.add_argument( 257 | '--max_steps', 258 | type=int, 259 | default=int(1e5), 260 | help='Number of steps per episode') 261 | parser.add_argument( 262 | '--n_samples', 263 | type=int, 264 | 
default=1000, 265 | help='Games per epoch') 266 | parser.add_argument( 267 | '--step_penalty', 268 | type=float, 269 | default=0.01) 270 | parser.add_argument( 271 | '--percentile', 272 | type=int, 273 | default=80, 274 | help='percentile') 275 | parser.add_argument( 276 | '--verbose', 277 | action='store_true', 278 | default=False) 279 | parser.add_argument( 280 | '--plot_stats', 281 | action='store_true', 282 | default=False) 283 | parser.add_argument( 284 | '--features', 285 | action='store_true', 286 | default=False) 287 | parser.add_argument( 288 | '--layers', 289 | type=str, 290 | default=None) 291 | parser.add_argument( 292 | '--api_key', 293 | type=str, 294 | default=None) 295 | parser.add_argument( 296 | '--n_jobs', 297 | type=int, 298 | default=-1) 299 | parser.add_argument( 300 | '--seed', 301 | type=int, 302 | default=42) 303 | parser.add_argument( 304 | '--resume', 305 | action='store_true', 306 | default=False) 307 | 308 | args, _ = parser.parse_known_args() 309 | return args 310 | 311 | 312 | def save_stats(stats, save_dir="./"): 313 | for key in stats: 314 | plot_unimetric(stats, key, save_dir) 315 | 316 | 317 | def run(env, n_episodes=200, max_steps=int(1e5), n_samples=1000, step_penalty=0.01, 318 | percentile=80, features=False, layers=None, 319 | verbose=False, plot_stats=False, api_key=None, n_jobs=-1, seed=42, resume=False): 320 | env_name = env 321 | if env_name == "MountainCar-v0": 322 | env = gym.make(env).env 323 | layers = layers or (20, 10, 20) 324 | else: 325 | env = gym.make(env) 326 | layers = layers or (256, 256, 128) 327 | 328 | if features: 329 | agent = Estimator(env, layers) 330 | else: 331 | agent = MLPClassifier( 332 | hidden_layer_sizes=layers, 333 | activation='tanh', 334 | warm_start=True, 335 | max_iter=1) 336 | agent.fit([env.reset()] * env.action_space.n, range(env.action_space.n)) 337 | 338 | if resume: 339 | agent = pickle.load(open("agent.pkl", "rb")) 340 | 341 | env.seed(seed) 342 | np.random.seed(seed) 343 | 344 | stats = cem(env, agent, n_episodes, 345 | max_steps=max_steps, step_penalty=step_penalty, 346 | n_samples=n_samples, percentile=percentile, 347 | n_jobs=n_jobs, verbose=verbose) 348 | if plot_stats: 349 | save_stats(stats) 350 | 351 | pickle.dump(agent, open("agent.pkl", "wb")) 352 | 353 | if api_key is not None: 354 | env = gym.wrappers.Monitor(env, "/tmp/" + env_name, force=True) 355 | sessions = [generate_session(env, agent, int(1e10), 0.0) for _ in range(1000)] 356 | env.close() 357 | gym.upload("/tmp/" + env_name, api_key=api_key) 358 | 359 | 360 | def main(): 361 | args = _parse_args() 362 | try: 363 | layers = tuple(map(int, args.layers.split("-"))) 364 | except: 365 | layers = None 366 | run(args.env, args.num_episodes, args.max_steps, args.n_samples, args.step_penalty, 367 | args.percentile, args.features, layers, 368 | args.verbose, args.plot_stats, args.api_key, args.n_jobs, args.seed, args.resume) 369 | 370 | 371 | if __name__ == '__main__': 372 | main() 373 | -------------------------------------------------------------------------------- /CEM/continuous_cem_gym.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import gym 4 | from gym import wrappers 5 | import pickle 6 | import argparse 7 | import numpy as np 8 | from tqdm import trange 9 | from joblib import Parallel, delayed 10 | import collections 11 | from sklearn.neural_network import MLPRegressor 12 | 13 | from matplotlib import pyplot as plt 14 | 15 | plt.style.use("ggplot") 16 | 17 | 18 | def 
plot_unimetric(history, metric, save_dir): 19 | plt.figure() 20 | plt.plot(history[metric]) 21 | plt.title('model {}'.format(metric)) 22 | plt.ylabel(metric) 23 | plt.xlabel('epoch') 24 | plt.savefig("{}/{}.png".format(save_dir, metric), 25 | format='png', dpi=300) 26 | 27 | 28 | def generate_session(env, agent, t_max=int(1e5), step_penalty=0.01): 29 | states, actions = [], [] 30 | total_reward = 0 31 | 32 | s = env.reset() 33 | 34 | for t in range(t_max): 35 | a = agent.predict([s])[0] 36 | a = np.array(list(map( 37 | lambda x: min( 38 | max(x[1], env.action_space.low[x[0]]), 39 | env.action_space.high[x[0]]), 40 | enumerate(a)))) 41 | 42 | new_s, r, done, info = env.step(a) 43 | 44 | # record sessions like you did before 45 | states.append(s) 46 | actions.append(a) 47 | total_reward += r 48 | 49 | s = new_s 50 | if done: 51 | break 52 | 53 | total_reward -= t * step_penalty 54 | 55 | return states, actions, total_reward, t 56 | 57 | 58 | glob_env = None 59 | glob_agent = None 60 | 61 | 62 | def generate_parallel_session(t_max=int(1e5), step_penalty=0.01): 63 | states, actions = [], [] 64 | total_reward = 0 65 | 66 | s = glob_env.reset() 67 | 68 | for t in range(t_max): 69 | a = glob_agent.predict([s])[0] 70 | a = np.array(list(map( 71 | lambda x: min( 72 | max(x[1], glob_env.action_space.low[x[0]]), 73 | glob_env.action_space.high[x[0]]), 74 | enumerate(a)))) 75 | 76 | new_s, r, done, info = glob_env.step(a) 77 | 78 | # record sessions like you did before 79 | states.append(s) 80 | actions.append(a) 81 | total_reward += r 82 | 83 | s = new_s 84 | if done: 85 | break 86 | 87 | total_reward -= t * step_penalty 88 | 89 | return states, actions, total_reward, t 90 | 91 | 92 | def generate_parallel_sessions(n, t_max, step_penalty, n_jobs=-1): 93 | return Parallel(n_jobs)(n * [delayed(generate_parallel_session)(t_max, step_penalty)]) 94 | 95 | 96 | def cem(env, agent, num_episodes, max_steps=int(1e6), step_penalty=0.01, 97 | n_samples=200, percentile=50, n_jobs=-1, verbose=False): 98 | global glob_env, glob_agent 99 | init_n_samples = n_samples 100 | final_n_samples = n_samples // 5 101 | plays_to_decay = num_episodes // 2 102 | 103 | states_deque = collections.deque(maxlen=int(init_n_samples * 2)) 104 | actions_deque = collections.deque(maxlen=int(init_n_samples * 2)) 105 | rewards_deque = collections.deque(maxlen=int(init_n_samples * 2)) 106 | 107 | glob_env = env # NEVER DO LIKE THIS PLEASE! 
108 | glob_agent = agent 109 | 110 | # Keeps track of useful statistics 111 | history = { 112 | "threshold": np.zeros(num_episodes), 113 | "reward": np.zeros(num_episodes), 114 | "n_steps": np.zeros(num_episodes), 115 | } 116 | 117 | tr = trange( 118 | num_episodes, 119 | desc="mean reward = {:.3f}\tthreshold = {:.3f}\tmean n_steps = {:.3f}".format(0.0, 0.0, 120 | 0.0), 121 | leave=True) 122 | 123 | for i in tr: 124 | # generate new sessions 125 | # sessions = [ 126 | # generate_session(env, agent, max_steps, step_penalty) 127 | # for _ in range(n_samples)] 128 | sessions = generate_parallel_sessions(n_samples, max_steps, step_penalty, n_jobs) 129 | if i < plays_to_decay: 130 | n_samples -= (init_n_samples - final_n_samples) // plays_to_decay 131 | 132 | batch_states, batch_actions, batch_rewards, batch_steps = map(np.array, zip(*sessions)) 133 | # batch_states: a list of lists of states in each session 134 | # batch_actions: a list of lists of actions in each session 135 | # batch_rewards: a list of floats - total rewards at each session 136 | states_deque.extend(batch_states) 137 | actions_deque.extend(batch_actions) 138 | rewards_deque.extend(batch_rewards) 139 | 140 | batch_states = np.array(states_deque) 141 | batch_actions = np.array(actions_deque) 142 | batch_rewards = np.array(rewards_deque) 143 | 144 | threshold = np.percentile(batch_rewards, percentile) 145 | 146 | history["threshold"][i] = threshold 147 | history["reward"][i] = np.mean(batch_rewards) 148 | history["n_steps"][i] = np.mean(batch_steps) 149 | 150 | # look like > better, cause >= refer to reuse of bad examples 151 | if i < plays_to_decay: 152 | elite_states = batch_states[batch_rewards > threshold] 153 | elite_actions = batch_actions[batch_rewards > threshold] 154 | else: 155 | elite_states = batch_states[batch_rewards >= threshold] 156 | elite_actions = batch_actions[batch_rewards >= threshold] 157 | 158 | if len(elite_actions) > 0: 159 | elite_states, elite_actions = map(np.concatenate, [elite_states, elite_actions]) 160 | # elite_states: a list of states from top games 161 | # elite_actions: a list of actions from top games 162 | try: 163 | agent.fit(elite_states, elite_actions) 164 | except: 165 | # just a hack 166 | addition = np.array([env.reset()] * env.action_space.n) 167 | elite_states = np.vstack((elite_states, addition)) 168 | elite_actions = np.hstack((elite_actions, list(range(env.action_space.n)))) 169 | agent.fit(elite_states, elite_actions) 170 | 171 | tr.set_description( 172 | "mean reward = {:.3f}\tthreshold = {:.3f}\tmean n_steps = {:.3f}".format( 173 | np.mean(batch_rewards) + step_penalty * np.mean(batch_steps), 174 | threshold, np.mean(batch_steps))) 175 | 176 | return history 177 | 178 | 179 | def _parse_args(): 180 | parser = argparse.ArgumentParser(description='Policy iteration example') 181 | parser.add_argument( 182 | '--env', 183 | type=str, 184 | default='MountainCarContinuous-v0', # MountainCar-v0, LunarLander-v2 185 | help='The environment to use') 186 | parser.add_argument( 187 | '--num_episodes', 188 | type=int, 189 | default=200, 190 | help='Number of episodes') 191 | parser.add_argument( 192 | '--max_steps', 193 | type=int, 194 | default=int(1e5), 195 | help='Number of steps per episode') 196 | parser.add_argument( 197 | '--n_samples', 198 | type=int, 199 | default=1000, 200 | help='Games per epoch') 201 | parser.add_argument( 202 | '--step_penalty', 203 | type=float, 204 | default=0.01) 205 | parser.add_argument( 206 | '--percentile', 207 | type=int, 208 | default=80, 209 | 
help='percentile') 210 | parser.add_argument( 211 | '--verbose', 212 | action='store_true', 213 | default=False) 214 | parser.add_argument( 215 | '--plot_stats', 216 | action='store_true', 217 | default=False) 218 | parser.add_argument( 219 | '--layers', 220 | type=str, 221 | default=None) 222 | parser.add_argument( 223 | '--api_key', 224 | type=str, 225 | default=None) 226 | parser.add_argument( 227 | '--n_jobs', 228 | type=int, 229 | default=-1) 230 | parser.add_argument( 231 | '--seed', 232 | type=int, 233 | default=42) 234 | parser.add_argument( 235 | '--resume', 236 | action='store_true', 237 | default=False) 238 | 239 | args, _ = parser.parse_known_args() 240 | return args 241 | 242 | 243 | def save_stats(stats, save_dir="./"): 244 | for key in stats: 245 | plot_unimetric(stats, key, save_dir) 246 | 247 | 248 | def run(env, n_episodes=200, max_steps=int(1e5), n_samples=1000, step_penalty=0.01, 249 | percentile=80, layers=None, 250 | verbose=False, plot_stats=False, api_key=None, n_jobs=-1, seed=42, resume=False): 251 | env_name = env 252 | if env_name == "MountainCarContinuous-v0": 253 | env = gym.make(env).env 254 | else: 255 | env = gym.make(env) 256 | layers = layers or (256, 256, 128) 257 | 258 | agent = MLPRegressor( 259 | hidden_layer_sizes=layers, 260 | activation='tanh', 261 | warm_start=True, 262 | max_iter=1) 263 | agent.fit( 264 | np.zeros(env.observation_space.shape).reshape(1, -1), 265 | np.zeros(env.action_space.shape).reshape(1, -1)) 266 | 267 | if resume: 268 | agent = pickle.load(open("agent.pkl", "rb")) 269 | 270 | env.seed(seed) 271 | np.random.seed(seed) 272 | 273 | stats = cem(env, agent, n_episodes, 274 | max_steps=max_steps, step_penalty=step_penalty, 275 | n_samples=n_samples, percentile=percentile, 276 | n_jobs=n_jobs, verbose=verbose) 277 | if plot_stats: 278 | save_stats(stats) 279 | 280 | pickle.dump(agent, open("agent.pkl", "wb")) 281 | 282 | if api_key is not None: 283 | env = gym.wrappers.Monitor(env, "/tmp/" + env_name, force=True) 284 | sessions = [generate_session(env, agent, int(1e10), 0.0) for _ in range(300)] 285 | env.close() 286 | gym.upload("/tmp/" + env_name, api_key=api_key) 287 | 288 | 289 | def main(): 290 | args = _parse_args() 291 | try: 292 | layers = tuple(map(int, args.layers.split("-"))) 293 | except: 294 | layers = None 295 | run(args.env, args.num_episodes, args.max_steps, args.n_samples, args.step_penalty, 296 | args.percentile, layers, 297 | args.verbose, args.plot_stats, args.api_key, args.n_jobs, args.seed, args.resume) 298 | 299 | 300 | if __name__ == '__main__': 301 | main() 302 | -------------------------------------------------------------------------------- /DP/README.md: -------------------------------------------------------------------------------- 1 | ## Model-Based RL: Policy and Value Iteration using Dynamic Programming 2 | 3 | ### Learning Goals 4 | 5 | - Understand the difference between Policy Evaluation and Policy Improvement and how these processes interact 6 | - Understand the Policy Iteration Algorithm 7 | - Understand the Value Iteration Algorithm 8 | - Understand the Limitations of Dynamic Programming Approaches 9 | 10 | 11 | ### Summary 12 | 13 | - Dynamic Programming (DP) methods assume that we have a perfect model of the environment's Markov Decision Process (MDP). That's usually not the case in practice, but it's important to study DP anyway. 14 | - Policy Evaluation: Calculates the state-value function `V(s)` for a given policy. In DP this is done using a "full backup". 
At each state, we look ahead one step at each possible action and next state. We can only do this because we have a perfect model of the environment. 15 | - Full backups are basically the Bellman equations turned into updates. 16 | - Policy Improvement: Given the correct state-value function for a policy we can act greedily with respect to it (i.e. pick the best action at each state). Then we are guaranteed to improve the policy or keep it fixed if it's already optimal. 17 | - Policy Iteration: Iteratively perform Policy Evaluation and Policy Improvement until we reach the optimal policy. 18 | - Value Iteration: Instead of doing multiple steps of Policy Evaluation to find the "correct" V(s) we only do a single step and improve the policy immediately. In practice, this converges faster. 19 | - Generalized Policy Iteration: The process of iteratively doing policy evaluation and improvement. We can pick different algorithms for each of these steps but the basic idea stays the same. 20 | - DP methods bootstrap: They update estimates based on other estimates (one step ahead). 21 | 22 | 23 | ### Lectures & Readings 24 | 25 | **Required:** 26 | 27 | - David Silver's RL Course Lecture 3 - Planning by Dynamic Programming ([video](https://www.youtube.com/watch?v=Nd1-UUMVfz4), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/DP.pdf)) 28 | 29 | **Optional:** 30 | 31 | - [Reinforcement Learning: An Introduction](https://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf) - Chapter 4: Dynamic Programming 32 | 33 | [source](https://github.com/dennybritz/reinforcement-learning) 34 | -------------------------------------------------------------------------------- /DP/policy_iteration_gym.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import gym 4 | from gym import wrappers 5 | import argparse 6 | import numpy as np 7 | 8 | 9 | def policy_eval(policy, env, discount_factor=1.0, theta=0.00001): 10 | """ 11 | Evaluate a policy given an environment 12 | and a full description of the environment's dynamics. 13 | 14 | Args: 15 | policy: [S, A] shaped matrix representing the policy. 16 | env: OpenAI env. 17 | env.P represents the transition probabilities of the environment. 18 | env.P[s][a] is a (prob, next_state, reward, done) tuple. 19 | theta: We stop evaluation 20 | one our value function change is less than theta for all states. 21 | discount_factor: lambda discount factor. 22 | 23 | Returns: 24 | Vector of length env.observation_space.n representing the value function. 25 | """ 26 | # Start with a random (all 0) value function 27 | V = np.zeros(env.observation_space.n) 28 | while True: 29 | delta = 0 30 | # For each state, perform a "full backup" 31 | for s in range(env.observation_space.n): 32 | v = 0 33 | # Look at the possible next actions 34 | for a, action_prob in enumerate(policy[s]): 35 | # For each action, look at the possible next states... 36 | for prob, next_state, reward, done in env.P[s][a]: 37 | # Calculate the expected value 38 | v += action_prob * prob * ( 39 | reward + discount_factor * V[next_state]) 40 | # How much our value function changed (across any states) 41 | delta = max(delta, np.abs(v - V[s])) 42 | V[s] = v 43 | # Stop evaluating once our value function change is below a threshold 44 | if delta < theta: 45 | break 46 | return np.array(V) 47 | 48 | 49 | def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=1.0): 50 | """ 51 | Policy Improvement Algorithm. 
Iteratively evaluates and improves a policy
52 |     until an optimal policy is found.
53 | 
54 |     Args:
55 |         env: The OpenAI environment.
56 |         policy_eval_fn: Policy Evaluation function that takes 3 arguments:
57 |             policy, env, discount_factor.
58 |         discount_factor: Gamma discount factor.
59 | 
60 |     Returns:
61 |         A tuple (policy, V).
62 |         policy is the optimal policy,
63 |         a matrix of shape [S, A] where each state s
64 |         contains a valid probability distribution over actions.
65 |         V is the value function for the optimal policy.
66 | 
67 |     """
68 |     # Start with a random policy
69 |     policy = np.ones([env.observation_space.n, env.action_space.n]) / env.action_space.n
70 | 
71 |     while True:
72 |         # Evaluate the current policy
73 |         V = policy_eval_fn(policy, env, discount_factor)
74 | 
75 |         # Will be set to false if we make any changes to the policy
76 |         policy_stable = True
77 | 
78 |         # For each state...
79 |         for s in range(env.observation_space.n):
80 |             # The best action we would take under the current policy
81 |             chosen_a = np.argmax(policy[s])
82 | 
83 |             # Find the best action by one-step lookahead
84 |             # Ties are resolved arbitrarily
85 |             action_values = np.zeros(env.action_space.n)
86 |             for a in range(env.action_space.n):
87 |                 for prob, next_state, reward, done in env.P[s][a]:
88 |                     action_values[a] += prob * (
89 |                         reward + discount_factor * V[next_state])
90 |             best_a = np.argmax(action_values)
91 | 
92 |             # Greedily update the policy
93 |             if chosen_a != best_a:
94 |                 policy_stable = False
95 |             policy[s] = np.eye(env.action_space.n)[best_a]
96 | 
97 |         # If the policy is stable we've found an optimal policy. Return it
98 |         if policy_stable:
99 |             return policy, V
100 | 
101 | 
102 | def env_description(env, policy, v):
103 |     print("Policy Probability Distribution:")
104 |     print(policy)
105 |     print("")
106 | 
107 | 
108 | 
109 |     print("Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):")
110 |     print(np.reshape(np.argmax(policy, axis=1), (env.nrow, env.ncol)))
111 |     print("")
112 | 
113 |     print("Value Function:")
114 |     print(v)
115 |     print("")
116 | 
117 |     print("Reshaped Grid Value Function:")
118 |     print(v.reshape((env.nrow, env.ncol)))
119 |     print("")
120 | 
121 | 
122 | def env_run(env, n_episodes, policy, verbose=False):
123 |     rewards = []
124 |     for ep in range(n_episodes):
125 |         done = False
126 |         epoch_reward = 0
127 |         s = env.reset()
128 |         while not done:
129 |             if verbose:
130 |                 env.render()
131 |             action = np.argmax(policy[s])
132 |             s, reward, done, info = env.step(action)
133 |             epoch_reward += reward
134 |         rewards.append(epoch_reward)
135 |     return rewards
136 | 
137 | 
138 | def _parse_args():
139 |     parser = argparse.ArgumentParser(description='Policy iteration example')
140 |     parser.add_argument('--env',
141 |                         type=str,
142 |                         default='Taxi-v1',
143 |                         help='The environment to use')
144 |     parser.add_argument('--num_episodes',
145 |                         type=int,
146 |                         default=1000,
147 |                         help='Number of episodes')
148 |     parser.add_argument('--gamma',
149 |                         type=float,
150 |                         default=0.99,
151 |                         help='Gamma discount factor')
152 |     parser.add_argument('--verbose',
153 |                         action='store_true',
154 |                         default=False)
155 |     parser.add_argument('--api_key',
156 |                         type=str,
157 |                         default=None)
158 | 
159 |     args, _ = parser.parse_known_args()
160 |     return args
161 | 
162 | 
163 | def run(env, n_episodes, discount_factor, verbose=False, api_key=None):
164 |     env_name = env
165 |     env = gym.make(env)
166 |     if api_key is not None:
167 |         env = gym.wrappers.Monitor(env, "/tmp/" + env_name, force=True)
168 |     policy, v = 
policy_improvement(env, discount_factor=discount_factor) 169 | if verbose: 170 | try: 171 | env_description(env, policy, v) 172 | except: 173 | print("Sorry, something go wrong.") 174 | rewards = env_run(env, n_episodes, policy, verbose) 175 | print("Avg rewards over {} episodes: {:.4f} +/-{:.4f}".format( 176 | n_episodes, np.mean(rewards), np.std(rewards))) 177 | if api_key is not None: 178 | env.close() 179 | gym.upload("/tmp/" + env_name, api_key=api_key) 180 | 181 | 182 | def main(): 183 | args = _parse_args() 184 | run(args.env, args.num_episodes, args.gamma, args.verbose, args.api_key) 185 | 186 | 187 | if __name__ == '__main__': 188 | main() 189 | -------------------------------------------------------------------------------- /DP/value_iteration_gym.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import gym 4 | from gym import wrappers 5 | import argparse 6 | import numpy as np 7 | 8 | 9 | def value_iteration(env, theta=0.0001, discount_factor=1.0): 10 | """ 11 | Value Iteration Algorithm. 12 | 13 | Args: 14 | env: OpenAI environment. 15 | env.P represents the transition probabilities of the environment. 16 | theta: Stopping threshold. 17 | If the value of all states changes less than theta 18 | in one iteration we are done. 19 | discount_factor: lambda time discount factor. 20 | 21 | Returns: 22 | A tuple (policy, V) of the optimal policy and the optimal value function. 23 | """ 24 | 25 | def one_step_lookahead(state, V): 26 | """ 27 | Helper function to calculate the value for all action in a given state. 28 | 29 | Args: 30 | state: The state to consider (int) 31 | V: The value to use as an estimator, Vector of length env.observation_space.n 32 | 33 | Returns: 34 | A vector of length env.action_space.n` 35 | containing the expected value of each action. 36 | """ 37 | A = np.zeros(env.action_space.n) 38 | for a in range(env.action_space.n): 39 | for prob, next_state, reward, done in env.env.env.P[state][a]: 40 | A[a] += prob * (reward + discount_factor * V[next_state]) 41 | return A 42 | 43 | V = np.zeros(env.observation_space.n) 44 | while True: 45 | # Stopping condition 46 | delta = 0 47 | # Update each state... 
48 | for s in range(env.observation_space.n): 49 | # Do a one-step lookahead to find the best action 50 | A = one_step_lookahead(s, V) 51 | best_action_value = np.max(A) 52 | # Calculate delta across all states seen so far 53 | delta = max(delta, np.abs(best_action_value - V[s])) 54 | # Update the value function 55 | V[s] = best_action_value 56 | # Check if we can stop 57 | if delta < theta: 58 | break 59 | 60 | # Create a deterministic policy using the optimal value function 61 | policy = np.zeros([env.observation_space.n, env.action_space.n]) 62 | for s in range(env.observation_space.n): 63 | # One step lookahead to find the best action for this state 64 | A = one_step_lookahead(s, V) 65 | best_action = np.argmax(A) 66 | # Always take the best action 67 | policy[s, best_action] = 1.0 68 | 69 | return policy, V 70 | 71 | 72 | def env_description(env, policy, v): 73 | print("Policy Probability Distribution:") 74 | print(policy) 75 | print("") 76 | 77 | print("Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):") 78 | print(np.reshape(np.argmax(policy, axis=1), (env.nrow, env.ncol))) 79 | print("") 80 | 81 | print("Value Function:") 82 | print(v) 83 | print("") 84 | 85 | print("Reshaped Grid Value Function:") 86 | print(v.reshape((env.nrow, env.ncol))) 87 | print("") 88 | 89 | 90 | def env_run(env, n_episodes, policy, versbose=False): 91 | rewards = [] 92 | for ep in range(n_episodes): 93 | done = False 94 | epoch_reward = 0 95 | s = env.reset() 96 | while not done: 97 | if versbose: 98 | env.render() 99 | action = np.argmax(policy[s]) 100 | s, reward, done, info = env.step(action) 101 | epoch_reward += reward 102 | rewards.append(epoch_reward) 103 | return rewards 104 | 105 | 106 | def _parse_args(): 107 | parser = argparse.ArgumentParser(description='Policy iteration example') 108 | parser.add_argument('--env', 109 | type=str, 110 | default='Taxi-v1', 111 | help='The environment to use') 112 | parser.add_argument('--num_episodes', 113 | type=int, 114 | default=1000, 115 | help='Number of episodes') 116 | parser.add_argument('--gamma', 117 | type=float, 118 | default=0.99, 119 | help='Gamma discount factor') 120 | parser.add_argument('--verbose', 121 | action='store_true', 122 | default=False) 123 | parser.add_argument('--api_key', 124 | type=str, 125 | default=None) 126 | 127 | args, _ = parser.parse_known_args() 128 | return args 129 | 130 | 131 | def run(env, n_episodes, discount_factor, verbose=False, api_key=None): 132 | env_name = env 133 | env = gym.make(env) 134 | if api_key is not None: 135 | env = gym.wrappers.Monitor(env, "/tmp/" + env_name, force=True) 136 | policy, v = value_iteration(env, discount_factor=discount_factor) 137 | if verbose: 138 | try: 139 | env_description(env, policy, v) 140 | except: 141 | print("Sorry, something go wrong.") 142 | rewards = env_run(env, n_episodes, policy, verbose) 143 | print("Avg rewards over {} episodes: {:.4f} +/-{:.4f}".format( 144 | n_episodes, np.mean(rewards), np.std(rewards))) 145 | if api_key is not None: 146 | env.close() 147 | gym.upload("/tmp/" + env_name, api_key=api_key) 148 | 149 | 150 | def main(): 151 | args = _parse_args() 152 | run(args.env, args.num_episodes, args.gamma, args.verbose, args.api_key) 153 | 154 | 155 | if __name__ == '__main__': 156 | main() 157 | -------------------------------------------------------------------------------- /DQN/.gitignore: -------------------------------------------------------------------------------- 1 | *.pkl 2 | logs* 3 | 
-------------------------------------------------------------------------------- /DQN/README.md: -------------------------------------------------------------------------------- 1 | ## Deep Q-Learning 2 | 3 | ### Algorithms & Readings 4 | 5 | - [Deep Q-Learning (DQN)](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) 6 | - [Human-Level Control through Deep Reinforcement Learning](http://www.davidqiu.com:8888/research/nature14236.pdf) 7 | - [Deep Reinforcement Learning with Double Q-learning](http://arxiv.org/abs/1509.06461) 8 | - [Dueling Network Architectures for Deep Reinforcement Learning](https://arxiv.org/abs/1511.06581) 9 | - [Deep Recurrent Q-Learning for Partially Observable MDPs (DRQN)](https://arxiv.org/abs/1507.06527) 10 | - [Prioritized Experience Replay](http://arxiv.org/abs/1511.05952) 11 | 12 | 13 | ### Summary 14 | 15 | - DQN: Q-Learning but with a Deep Neural Network as a function approximator. 16 | - Using a non-linear Deep Neural Network is powerful, but training is unstable if we apply it naively. 17 | - Trick 1 - Experience Replay: Store experience `(S, A, R, S_next)` in a replay buffer and sample minibatches from it to train the network. This decorrelates the data and leads to better data efficiency. In the beginning, the replay buffer is filled with random experience. 18 | - Trick 2 - Target Network: Use a separate network to estimate the TD target. This target network has the same architecture as the function approximator but with frozen parameters. Every T steps (a hyperparameter) the parameters from the Q network are copied to the target network. This leads to more stable training because it keeps the target function fixed (for a while). 19 | - Double DQN: Just like regular Q-Learning, DQN tends to overestimate values due to its max operation applied to both selecting and estimating actions. We get around this by using the Q network for selection and the target network for estimation when making updates. 
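
The Double DQN target reduces to a few lines of NumPy. The sketch below is only illustrative — the function name and arguments are ours, not part of this repo — but it mirrors the selection/evaluation split that `run_dqn.py` performs inside its `update` function:

```python
import numpy as np

def double_dqn_target(q_online_next, q_target_next, rewards, dones, gamma=0.99):
    """Bootstrap target for a batch of transitions (hypothetical helper).

    q_online_next, q_target_next: [batch, n_actions] Q-values for the next states
    from the online network and the frozen target network, respectively.
    """
    # action selection with the online network ...
    best_actions = q_online_next.argmax(axis=1)
    # ... action evaluation with the target network
    bootstrap = q_target_next[np.arange(len(best_actions)), best_actions]
    # do not bootstrap through terminal states
    return rewards + gamma * (1.0 - dones.astype(np.float32)) * bootstrap
```
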
20 | -------------------------------------------------------------------------------- /DQN/dqn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import tensorflow as tf 4 | from agents.agent_states import LinearHiddenState 5 | from rstools.tf.optimization import build_model_optimization 6 | 7 | from agents.agent_networks import FeatureNet, QvalueNet, ValueNet 8 | 9 | 10 | class DqnAgent(object): 11 | def __init__(self, state_shape, n_actions, network, special=None): 12 | self.special = special or {} 13 | self.state_shape = state_shape 14 | self.n_actions = n_actions 15 | self.special = special 16 | 17 | self.scope = tf.get_variable_scope().name + "/" + special.get("scope", "dqn") \ 18 | if tf.get_variable_scope().name else special.get("scope", "dqn") 19 | 20 | with tf.variable_scope(self.scope): 21 | self._build_graph(network) 22 | 23 | def _build_graph(self, network): 24 | self.feature_net = FeatureNet( 25 | self.state_shape, network, 26 | self.special.get("feature_net", {})) 27 | 28 | self.hidden_state = LinearHiddenState( 29 | self.feature_net.feature_state, 30 | self.special.get("hidden_size", 512), 31 | self.special.get("hidden_activation", tf.nn.elu)) 32 | 33 | if self.special.get("dueling_network", False): 34 | self.qvalue_net = QvalueNet( 35 | self.hidden_state.state, self.n_actions, 36 | dict(**self.special.get("qvalue_net", {}), **{"advantage": True})) 37 | self.value_net = ValueNet( 38 | self.hidden_state.state, 39 | self.special.get("value_net", {})) 40 | 41 | # a bit hacky way 42 | self.predicted_qvalues = self.value_net.predicted_values + \ 43 | self.qvalue_net.predicted_qvalues 44 | self.predicted_qvalues_for_action = self.value_net.predicted_values_for_actions + \ 45 | self.qvalue_net.predicted_qvalues_for_actions 46 | self.agent_loss = tf.losses.mean_squared_error( 47 | labels=self.qvalue_net.td_target, 48 | predictions=self.predicted_qvalues_for_action) 49 | 50 | build_model_optimization( 51 | self.value_net, 52 | self.special.get("value_net_optimization", None), 53 | loss=self.agent_loss) 54 | else: 55 | self.qvalue_net = QvalueNet( 56 | self.hidden_state.state, self.n_actions, 57 | self.special.get("qvalue_net", {})) 58 | self.predicted_qvalues = self.qvalue_net.predicted_qvalues 59 | self.predicted_qvalues_for_action = self.qvalue_net.predicted_qvalues_for_actions 60 | self.agent_loss = self.qvalue_net.loss 61 | 62 | build_model_optimization( 63 | self.qvalue_net, 64 | self.special.get("qvalue_net_optimization", None)) 65 | 66 | build_model_optimization( 67 | self.hidden_state, 68 | self.special.get("hidden_state_optimization", None), 69 | loss=self.agent_loss) 70 | build_model_optimization( 71 | self.feature_net, 72 | self.special.get("feature_net_optimization", None), 73 | loss=self.agent_loss) 74 | 75 | def predict_qvalues(self, sess, state_batch): 76 | return sess.run( 77 | self.predicted_qvalues, 78 | feed_dict={ 79 | self.feature_net.states: state_batch, 80 | self.feature_net.is_training: False}) 81 | -------------------------------------------------------------------------------- /DQN/drqn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import tensorflow as tf 4 | 5 | from rstools.tf.optimization import build_model_optimization 6 | 7 | from agents.agent_networks import FeatureNet, QvalueNet, ValueNet 8 | from agents.agent_states import RecurrentHiddenState 9 | 10 | 11 | class DrqnAgent(object): 12 | def __init__(self, state_shape, 
n_actions, network, special=None): 13 | self.state_shape = state_shape 14 | self.n_actions = n_actions 15 | 16 | self.special = special 17 | self.scope = special.get("scope", "drqn") 18 | 19 | with tf.variable_scope(self.scope): 20 | self._build_graph(network) 21 | 22 | def _build_graph(self, network): 23 | self.feature_net = FeatureNet( 24 | self.state_shape, network, 25 | self.special.get("feature_net", {})) 26 | 27 | self.hidden_state = RecurrentHiddenState( 28 | self.feature_net.feature_state, 29 | self.special.get("hidden_size", 512), 30 | self.special.get("hidden_activation", tf.tanh), 31 | self.special.get("batch_size", 1)) 32 | 33 | if self.special.get("dueling_network", False): 34 | self.qvalue_net = QvalueNet( 35 | self.hidden_state.state, self.n_actions, 36 | dict(**self.special.get("qvalue_net", {}), **{"advantage": True})) 37 | self.value_net = ValueNet( 38 | self.hidden_state.state, 39 | self.special.get("value_net", {})) 40 | 41 | # a bit hacky way 42 | self.predicted_qvalues = self.value_net.predicted_values + \ 43 | self.qvalue_net.predicted_qvalues 44 | self.predicted_qvalues_for_action = self.value_net.predicted_values_for_actions + \ 45 | self.qvalue_net.predicted_qvalues_for_actions 46 | self.agent_loss = tf.losses.mean_squared_error( 47 | labels=self.qvalue_net.td_target, 48 | predictions=self.value_net.predicted_values_for_actions + 49 | self.qvalue_net.predicted_qvalues_for_actions) 50 | 51 | build_model_optimization( 52 | self.value_net, 53 | self.special.get("value_net_optimization", None), 54 | loss=self.agent_loss) 55 | else: 56 | self.qvalue_net = QvalueNet( 57 | self.hidden_state.state, self.n_actions, 58 | self.special.get("qvalue_net", {})) 59 | self.predicted_qvalues = self.qvalue_net.predicted_qvalues 60 | self.predicted_qvalues_for_action = self.qvalue_net.predicted_qvalues_for_actions 61 | self.agent_loss = self.qvalue_net.loss 62 | 63 | build_model_optimization( 64 | self.qvalue_net, 65 | self.special.get("qvalue_net_optimization", None)) 66 | 67 | build_model_optimization( 68 | self.hidden_state, 69 | self.special.get("hidden_state_optimization", None), 70 | loss=self.agent_loss) 71 | build_model_optimization( 72 | self.feature_net, 73 | self.special.get("feature_net_optimization", None), 74 | loss=self.agent_loss) 75 | 76 | def predict_qvalues(self, sess, state_batch): 77 | return sess.run( 78 | self.predicted_qvalues, 79 | feed_dict={ 80 | self.feature_net.states: state_batch, 81 | self.feature_net.is_training: False}) 82 | 83 | def update_belief_state(self, sess, state_batch, done_batch): 84 | _ = sess.run( 85 | self.hidden_state.belief_update, 86 | feed_dict={ 87 | self.feature_net.states: state_batch, 88 | self.hidden_state.is_end: done_batch, 89 | self.feature_net.is_training: False 90 | }) 91 | 92 | def assign_belief_state(self, sess, new_belief): 93 | _ = sess.run( 94 | self.hidden_state.belief_assign, 95 | feed_dict={ 96 | self.hidden_state.belief_out: new_belief 97 | }) 98 | 99 | def get_belief_state(self, sess): 100 | return sess.run(self.hidden_state.belief_state) 101 | -------------------------------------------------------------------------------- /DQN/run_dqn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | from rstools.utils.batch_utils import iterate_minibatches 5 | from tqdm import trange 6 | 7 | from DQN.dqn import DqnAgent 8 | from DQN.drqn import DrqnAgent 9 | from agents.agent_networks import copy_model_parameters 10 | from common.networks 
import activations 11 | from common.buffer import buffers 12 | from wrappers.gym_wrappers import Transition 13 | from wrappers.run_wrappers import typical_args, typical_argsparse, run_wrapper, update_wraper, \ 14 | epsilon_greedy_actions, play_session 15 | 16 | 17 | def update(sess, agent, target_agent, transitions, init_state=None, 18 | discount_factor=0.99, reward_norm=1.0, batch_size=32, time_major=False, 19 | replay_buffer=None): 20 | loss = 0.0 21 | if replay_buffer is not None: 22 | for transition in zip( 23 | transitions.state, transitions.action, transitions.reward, 24 | transitions.next_state, transitions.done.astype(np.float32)): 25 | replay_buffer.add(*transition) 26 | states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size) 27 | transitions = Transition( 28 | state=states, action=actions, reward=rewards, 29 | next_state=next_states, done=dones.astype(bool)) 30 | 31 | time_len = transitions.state.shape[0] 32 | transitions_it = zip( 33 | iterate_minibatches(transitions.state, batch_size), 34 | iterate_minibatches(transitions.action, batch_size), 35 | iterate_minibatches(transitions.reward, batch_size), 36 | iterate_minibatches(transitions.next_state, batch_size), 37 | iterate_minibatches(transitions.done, batch_size)) 38 | 39 | for states, actions, rewards, next_states, dones in transitions_it: 40 | qvalues_next = agent.predict_qvalues(sess, next_states) 41 | best_actions = qvalues_next.argmax(axis=1) 42 | qvalues_next_target = target_agent.predict_qvalues(sess, next_states) 43 | qvalues_next_target = qvalues_next_target[np.arange(batch_size), best_actions] 44 | 45 | td_target = rewards * reward_norm + \ 46 | np.invert(dones).astype(np.float32) * \ 47 | discount_factor * qvalues_next_target 48 | 49 | run_params = [ 50 | agent.qvalue_net.loss, 51 | agent.qvalue_net.train_op, agent.hidden_state.train_op, agent.feature_net.train_op 52 | ] 53 | 54 | feed_params = { 55 | agent.feature_net.states: states, 56 | agent.feature_net.is_training: True, 57 | agent.qvalue_net.actions: actions, 58 | agent.qvalue_net.td_target: td_target, 59 | agent.qvalue_net.is_training: True, 60 | } 61 | 62 | if agent.special.get("dueling_network", False): 63 | run_params[0] = agent.agent_loss 64 | run_params += [agent.value_net.train_op] 65 | feed_params[agent.value_net.td_target] = td_target # @TODO: why need to feed? 
66 | feed_params[agent.value_net.is_training] = True 67 | 68 | if isinstance(agent, DrqnAgent): 69 | run_params += [agent.hidden_state.belief_update] 70 | feed_params[agent.hidden_state.is_end] = dones 71 | 72 | run_results = sess.run( 73 | run_params, 74 | feed_dict=feed_params) 75 | 76 | batch_loss = run_results[0] 77 | loss += batch_loss 78 | return loss / time_len 79 | 80 | 81 | def generate_sessions( 82 | sess, agent, target_agent, env_pool, update_fn, 83 | t_max=1000, epsilon=0.01): 84 | total_reward = 0.0 85 | total_qvalue_loss = 0.0 86 | total_games = 0.0 87 | 88 | states = env_pool.pool_states() 89 | for t in range(t_max): 90 | actions = epsilon_greedy_actions(agent, sess, states, epsilon=epsilon) 91 | next_states, rewards, dones, _ = env_pool.step(actions) 92 | transition = Transition( 93 | state=states, action=actions, reward=rewards, next_state=next_states, done=dones) 94 | total_qvalue_loss += update_fn(sess, agent, target_agent, transition) 95 | 96 | states = next_states 97 | 98 | total_reward += rewards.sum() 99 | total_games += dones.sum() 100 | 101 | if env_pool.n_envs == 1 and total_games > 0: 102 | break 103 | 104 | return total_reward / env_pool.n_envs, \ 105 | total_qvalue_loss / t, \ 106 | t / (total_games / env_pool.n_envs) 107 | 108 | 109 | def dqn_learning( 110 | sess, agent, env, update_fn, 111 | n_epochs=1000, n_sessions=100, t_max=1000, 112 | initial_epsilon=0.5, final_epsilon=0.01, 113 | use_target_net=False, copy_n_epoch=5): 114 | tr = trange( 115 | n_epochs, 116 | desc="", 117 | leave=True) 118 | 119 | if use_target_net: 120 | agent, target_agent = agent 121 | # copy_model_parameters(sess, agent, target_agent) 122 | else: 123 | target_agent = agent 124 | 125 | history = { 126 | "reward": np.zeros(n_epochs), 127 | "qvalue_loss": np.zeros(n_epochs), 128 | "steps": np.zeros(n_epochs), 129 | "epsilon": np.zeros(n_epochs) 130 | } 131 | 132 | epsilon = initial_epsilon 133 | n_epochs_decay = n_epochs * 0.8 134 | 135 | for i in tr: 136 | sessions = [ 137 | generate_sessions( 138 | sess, agent, target_agent, env, update_fn, t_max=t_max, epsilon=epsilon) 139 | for _ in range(n_sessions)] 140 | session_rewards, session_qvalue_loss, session_steps = map(np.array, zip(*sessions)) 141 | 142 | history["reward"][i] = np.mean(session_rewards) 143 | history["qvalue_loss"][i] = np.mean(session_qvalue_loss) 144 | history["steps"][i] = np.mean(session_steps) 145 | history["epsilon"][i] = epsilon 146 | 147 | if i < n_epochs_decay: 148 | epsilon -= (initial_epsilon - final_epsilon) / float(n_epochs_decay) 149 | 150 | if use_target_net and (i + 1) % copy_n_epoch == 0: 151 | copy_model_parameters(sess, agent, target_agent) 152 | 153 | desc = "\t".join( 154 | ["{} = {:.3f}".format(key, value[i]) for key, value in history.items()]) 155 | tr.set_description(desc) 156 | 157 | if use_target_net: 158 | copy_model_parameters(sess, agent, target_agent) 159 | 160 | return history 161 | 162 | 163 | def run(env_name, make_env_fn, agent_cls, 164 | run_args, update_args, agent_agrs, 165 | log_dir=None, episode_limit=None, 166 | plot_stats=False, api_key=None, 167 | load=False, gpu_option=0.4, 168 | n_games=10, 169 | use_target_net=False): 170 | run_wrapper( 171 | n_games, dqn_learning, 172 | update_wraper(update, **update_args), 173 | play_session, epsilon_greedy_actions, 174 | env_name, make_env_fn, agent_cls, 175 | run_args, agent_agrs, 176 | log_dir=log_dir, episode_limit=episode_limit, 177 | plot_stats=plot_stats, api_key=api_key, 178 | load=load, gpu_option=gpu_option, 179 | 
use_target_network=use_target_net) 180 | 181 | 182 | def _parse_args(): 183 | parser = argparse.ArgumentParser(description='DQN Agent Learning') 184 | # typical params 185 | parser.add_argument( 186 | '--agent', 187 | type=str, 188 | default="dqn", 189 | choices=["dqn", "drqn"], 190 | help='Which agent to use. (default: %(default)s)') 191 | 192 | parser.add_argument( 193 | '--replay_buffer', 194 | type=str, 195 | choices=["none", "simple", "prioritized"], 196 | default="none", 197 | help="ReplayBuffer to use for training") 198 | parser.add_argument( 199 | '--replay_buffer_size', 200 | type=int, 201 | default=5000, 202 | help="Number of transitions to store in replay buffer.") 203 | 204 | # special exploration params 205 | parser.add_argument( 206 | '--initial_epsilon', 207 | type=float, 208 | default=0.5, 209 | help='DQN exploration: initial epsilon. (default: %(default)s)') 210 | parser.add_argument( 211 | '--final_epsilon', 212 | type=float, 213 | default=0.01, 214 | help='DQN exploration: final epsilon at 0.8*epochs. (default: %(default)s)') 215 | 216 | parser.add_argument( 217 | '--copy_n_epoch', 218 | type=int, 219 | default=5, 220 | help='Target DQN: copy parameters every %(default)s epoch') 221 | 222 | # special optimization params 223 | parser.add_argument( 224 | '--qvalue_lr', 225 | type=float, 226 | default=1e-5, 227 | help='Learning rate for qvalue network. (default: %(default)s)') 228 | parser.add_argument( 229 | '--value_lr', 230 | type=float, 231 | default=1e-5, 232 | help='Learning rate for value network. (default: %(default)s)') 233 | 234 | # agent special params & optimization 235 | parser.add_argument( 236 | '--use_target_net', 237 | action='store_true', 238 | default=False, 239 | help='Flag for target network use.') 240 | parser.add_argument( 241 | '--dueling_dqn', 242 | action='store_true', 243 | default=False, 244 | help='Flag for dueling network architecture use.') 245 | 246 | parser = typical_args(parser) 247 | 248 | args = parser.parse_args() 249 | return args 250 | 251 | 252 | def main(): 253 | args = _parse_args() 254 | 255 | network, run_args, update_args, optimization_params, make_env_fn = typical_argsparse(args) 256 | 257 | special_run_args = { 258 | "use_target_net": args.use_target_net, 259 | "initial_epsilon": args.initial_epsilon, 260 | "final_epsilon": args.final_epsilon, 261 | "copy_n_epoch": args.copy_n_epoch 262 | } 263 | run_args = {**run_args, **special_run_args} 264 | 265 | buffer = buffers[args.replay_buffer](args.replay_buffer_size) \ 266 | if args.replay_buffer != "none" \ 267 | else None 268 | special_update_args = { 269 | "replay_buffer": buffer 270 | } 271 | 272 | update_args = {**update_args, **special_update_args} 273 | 274 | qvalue_optimization_params = { 275 | **optimization_params, 276 | **{"initial_lr": args.qvalue_lr} 277 | } 278 | value_optimization_params = { 279 | **optimization_params, 280 | **{"initial_lr": args.value_lr} 281 | } 282 | 283 | agent_cls = DqnAgent if args.agent == "dqn" else DrqnAgent 284 | 285 | special = { 286 | "dueling_network": args.dueling_dqn, 287 | "hidden_size": args.hidden_size, 288 | "hidden_activation": activations[args.hidden_activation], 289 | "feature_net_optimization": optimization_params, 290 | "hidden_state_optimization": optimization_params, 291 | "value_net_optimization": value_optimization_params, 292 | "qvalue_net_optimization": qvalue_optimization_params, 293 | } 294 | 295 | agent_args = { 296 | "network": network, 297 | "special": special 298 | } 299 | 300 | run(args.env, make_env_fn, 
agent_cls, 301 | run_args, update_args, agent_args, 302 | args.log_dir, args.episode_limit, 303 | args.plot_history, args.api_key, 304 | args.load, args.gpu_option, 305 | args.n_games, 306 | args.use_target_net) 307 | 308 | 309 | if __name__ == '__main__': 310 | main() 311 | -------------------------------------------------------------------------------- /FA/.gitignore: -------------------------------------------------------------------------------- 1 | *.pkl 2 | logs* 3 | -------------------------------------------------------------------------------- /FA/README.md: -------------------------------------------------------------------------------- 1 | ## Function Approximation 2 | 3 | ### Learning Goals 4 | 5 | - Understand the motivation for Function Approximation over Table Lookup 6 | - Understand how to incorporate function approximation into existing algorithms 7 | - Understand convergence properties of function approximators and RL algorithms 8 | - Understand batching using experience replay 9 | 10 | 11 | ### Summary 12 | 13 | - Building a big table, one value for each state or state-action pair, is memory- and data-inefficient. Function Approximation can generalize to unseen states by using a featurized state representation. 14 | - Treat RL as a supervised learning problem with the MC- or TD-target as the label and the current state/action as the input. Often the target also depends on the function estimator, but we simply ignore its gradient. That's why these methods are called semi-gradient methods (see the sketch below). 15 | - Challenge: We have non-stationary (policy changes, bootstrapping) and non-iid (correlated in time) data. 16 | - Many methods assume that our action space is discrete because they rely on calculating the argmax over all actions. Large and continuous action spaces are ongoing research. 17 | - For Control, very few convergence guarantees exist. For non-linear approximators there are basically no guarantees at all. But it works in practice. 18 | - Experience Replay: Store experience as a dataset, randomize it, and repeatedly apply minibatch SGD. 19 | - Tricks to stabilize non-linear function approximators: Fixed Targets. The target is calculated based on frozen parameter values from a previous time step. 20 | - For the non-episodic (continuing) case function approximation is more complex and we need to give up discounting and use an "average reward" formulation.
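To make the semi-gradient idea above concrete, here is a minimal, self-contained sketch (not taken from this repository) of one Q-learning update with a linear function approximator; the names `w`, `phi_s`, `phi_next` and the hyperparameter values are assumptions made up for the example.

```python
import numpy as np


def semi_gradient_q_learning_step(w, phi_s, action, reward, phi_next, done,
                                  alpha=0.01, gamma=0.99):
    """One semi-gradient Q-learning step for Q(s, a) = w[a] . phi(s).

    w        : weights, shape (n_actions, n_features)
    phi_s    : features of the current state, shape (n_features,)
    phi_next : features of the next state, shape (n_features,)
    """
    # The TD target bootstraps from the current estimate of the next state's
    # values, but we treat it as a constant label (no gradient through it).
    td_target = reward + (0.0 if done else gamma * np.max(w @ phi_next))
    td_error = td_target - w[action] @ phi_s
    # For a linear approximator, grad_w[a] Q(s, a) is simply phi(s).
    w[action] += alpha * td_error * phi_s
    return w
```

The same loop structure appears in `q_learning_gym.py`, where the linear model and the RBF features are provided by scikit-learn's `SGDRegressor` and `RBFSampler` instead of an explicit weight matrix.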
21 | 22 | 23 | ### Lectures & Readings 24 | 25 | **Required:** 26 | 27 | - David Silver's RL Course Lecture 6 - Value Function Approximation ([video](https://www.youtube.com/watch?v=UoPei5o4fps), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/FA.pdf)) 28 | - [Reinforcement Learning: An Introduction](https://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf) - Chapter 9: On-policy Prediction with Approximation 29 | - [Reinforcement Learning: An Introduction](https://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf) - Chapter 10: On-policy Control with Approximation 30 | 31 | **Optional:** 32 | 33 | - [Tutorial: Introduction to Reinforcement Learning with Function Approximation](https://www.youtube.com/watch?v=ggqnxyjaKe4) 34 | 35 | 36 | ### Exercises 37 | 38 | - Solve Mountain Car Problem using Q-Learning with Linear Function Approximation 39 | - [Exercise](Q-Learning with Value Function Approximation.ipynb) 40 | - [Solution](Q-Learning with Value Function Approximation Solution.ipynb) 41 | -------------------------------------------------------------------------------- /FA/q_learning_gym.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import gym 4 | from gym import wrappers 5 | import sys 6 | import argparse 7 | import numpy as np 8 | import sklearn.pipeline 9 | import sklearn.preprocessing 10 | from sklearn.linear_model import SGDRegressor 11 | from sklearn.kernel_approximation import RBFSampler 12 | 13 | from matplotlib import pyplot as plt 14 | 15 | plt.style.use("ggplot") 16 | 17 | 18 | def plot_unimetric(history, metric, save_dir): 19 | plt.figure() 20 | plt.plot(history[metric]) 21 | plt.title('model {}'.format(metric)) 22 | plt.ylabel(metric) 23 | plt.xlabel('epoch') 24 | plt.savefig("{}/{}.png".format(save_dir, metric), 25 | format='png', dpi=300) 26 | 27 | 28 | def make_epsilon_greedy_policy(estimator, epsilon, nA): 29 | """ 30 | Creates an epsilon-greedy policy based on a given Q-function approximator and epsilon. 31 | 32 | Args: 33 | estimator: An estimator that returns q values for a given state 34 | epsilon: The probability to select a random action . float between 0 and 1. 35 | nA: Number of actions in the environment. 36 | 37 | Returns: 38 | A function that takes the observation as an argument and returns 39 | the probabilities for each action in the form of a numpy array of length nA. 40 | 41 | """ 42 | 43 | def policy_fn(observation): 44 | A = np.ones(nA, dtype=float) * epsilon / nA 45 | q_values = estimator.predict(observation) 46 | best_action = np.argmax(q_values) 47 | A[best_action] += (1.0 - epsilon) 48 | return A 49 | 50 | return policy_fn 51 | 52 | 53 | class Estimator(object): 54 | """ 55 | Value Function approximator. 56 | """ 57 | 58 | def __init__(self, env): 59 | self._prepare_estimator_for_env(env) 60 | # We create a separate model for each action in the environment's 61 | # action space. Alternatively we could somehow encode the action 62 | # into the features, but this way it's easier to code up. 63 | self.models = [] 64 | for _ in range(env.action_space.n): 65 | model = SGDRegressor(learning_rate="constant") 66 | # We need to call partial_fit once to initialize the model 67 | # or we get a NotFittedError when trying to make a prediction 68 | # This is quite hacky. 
69 | model.partial_fit([self.featurize_state(env.reset())], [0]) 70 | self.models.append(model) 71 | 72 | def _prepare_estimator_for_env(self, env): 73 | observation_examples = np.array( 74 | [env.observation_space.sample() for _ in range(1000)]) 75 | observation_examples = self._vectorise_state(observation_examples) 76 | 77 | scaler = sklearn.preprocessing.StandardScaler() 78 | scaler.fit(observation_examples) 79 | self.scaler = scaler 80 | 81 | featurizer = sklearn.pipeline.FeatureUnion([ 82 | ("rbf1", RBFSampler(gamma=5.0, n_components=100)), 83 | ("rbf2", RBFSampler(gamma=2.0, n_components=100)), 84 | ("rbf3", RBFSampler(gamma=1.0, n_components=100)), 85 | ("rbf4", RBFSampler(gamma=0.5, n_components=100)) 86 | ]) 87 | featurizer.fit(scaler.transform(observation_examples)) 88 | self.featurizer = featurizer 89 | 90 | def _vectorise_state(self, states): 91 | obs_shape = states.shape 92 | if len(obs_shape) > 2: 93 | states = states.reshape((obs_shape[0], -1)) 94 | return states 95 | 96 | def featurize_state(self, state): 97 | """ 98 | Returns the featurized representation for a state. 99 | """ 100 | state = self._vectorise_state(np.array([state])) 101 | scaled = self.scaler.transform(state) 102 | featurized = self.featurizer.transform(scaled) 103 | return featurized[0] 104 | 105 | def predict(self, s, a=None): 106 | """ 107 | Makes value function predictions. 108 | 109 | Args: 110 | s: state to make a prediction for 111 | a: (Optional) action to make a prediction for 112 | 113 | Returns 114 | If an action a is given this returns a single number as the prediction. 115 | If no action is given this returns a vector or predictions for all actions 116 | in the environment where pred[i] is the prediction for action i. 117 | 118 | """ 119 | features = self.featurize_state(s) 120 | return self.models[a].predict([features])[0] if a \ 121 | else np.array([model.predict([features])[0] for model in self.models]) 122 | 123 | def update(self, s, a, y): 124 | """ 125 | Updates the estimator parameters for a given state and action towards 126 | the target y. 127 | """ 128 | features = self.featurize_state(s) 129 | self.models[a].partial_fit([features], [y]) 130 | 131 | 132 | def q_learning(env, estimator, num_episodes, discount_factor=1.0, epsilon=0.1, epsilon_decay=1.0, 133 | verbose=False): 134 | """ 135 | Q-Learning algorithm for fff-policy TD control using Function Approximation. 136 | Finds the optimal greedy policy while following an epsilon-greedy policy. 137 | 138 | Args: 139 | env: OpenAI environment. 140 | estimator: Action-Value function estimator 141 | num_episodes: Number of episodes to run for. 142 | discount_factor: Lambda time discount factor. 143 | epsilon: Chance the sample a random action. Float betwen 0 and 1. 144 | epsilon_decay: Each episode, epsilon is decayed by this factor 145 | 146 | Returns: 147 | An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards. 148 | """ 149 | 150 | # Keeps track of useful statistics 151 | episode_lengths = np.zeros(num_episodes) 152 | episode_rewards = np.zeros(num_episodes) 153 | 154 | for i_episode in range(num_episodes): 155 | 156 | # The policy we're following 157 | policy = make_epsilon_greedy_policy( 158 | estimator, epsilon * epsilon_decay ** i_episode, env.action_space.n) 159 | 160 | # Print out which episode we're on, useful for debugging. 
161 | # Also print reward for last episode 162 | if verbose: 163 | last_reward = episode_rewards[i_episode - 1] 164 | print("\rEpisode {}/{} ({})".format(i_episode + 1, num_episodes, last_reward), end="") 165 | sys.stdout.flush() 166 | 167 | state = env.reset() 168 | n_action = None 169 | 170 | len_counter = 0 171 | reward_counter = 0 172 | done = False 173 | while not done: 174 | if verbose: 175 | pass 176 | # env.render() 177 | if n_action is None: 178 | probs = policy(state) 179 | action = np.random.choice(np.arange(len(probs)), p=probs) 180 | else: 181 | action = n_action 182 | 183 | n_state, reward, done, info = env.step(action) 184 | reward_counter += reward 185 | len_counter += 1 186 | 187 | q_val_next = estimator.predict(n_state) 188 | td_target = reward + discount_factor * np.max(q_val_next) 189 | 190 | estimator.update(state, action, td_target) 191 | 192 | state = n_state 193 | 194 | episode_rewards[i_episode] = reward_counter 195 | episode_lengths[i_episode] = len_counter 196 | 197 | return {"episode_rewards": episode_rewards, "episode_lengths": episode_lengths} 198 | 199 | 200 | def _parse_args(): 201 | parser = argparse.ArgumentParser(description='Policy iteration example') 202 | parser.add_argument( 203 | '--env', 204 | type=str, 205 | default='MountainCar-v0', # CartPole-v0, MountainCar-v0 206 | help='The environment to use') 207 | parser.add_argument( 208 | '--num_episodes', 209 | type=int, 210 | default=1000, 211 | help='Number of episodes') 212 | parser.add_argument( 213 | '--gamma', 214 | type=float, 215 | default=0.99, 216 | help='Gamma discount factor') 217 | parser.add_argument( 218 | '--verbose', 219 | action='store_true', 220 | default=False) 221 | parser.add_argument( 222 | '--plot_stats', 223 | action='store_true', 224 | default=False) 225 | parser.add_argument( 226 | '--api_key', 227 | type=str, 228 | default=None) 229 | 230 | args, _ = parser.parse_known_args() 231 | return args 232 | 233 | 234 | def save_stats(stats, save_dir="./"): 235 | for key in stats: 236 | plot_unimetric(stats, key, save_dir) 237 | 238 | 239 | def run(env, n_episodes, discount_factor, verbose=False, plot_stats=False, api_key=None): 240 | env_name = env 241 | env = gym.make(env) 242 | 243 | estimator = Estimator(env) 244 | 245 | if api_key is not None: 246 | env = gym.wrappers.Monitor(env, "/tmp/" + env_name, force=True) 247 | 248 | stats = q_learning(env, estimator, n_episodes, 249 | discount_factor=discount_factor, epsilon=0.0, 250 | verbose=verbose) 251 | if plot_stats: 252 | save_stats(stats) 253 | 254 | if api_key is not None: 255 | env.close() 256 | gym.upload("/tmp/" + env_name, api_key=api_key) 257 | 258 | 259 | def main(): 260 | args = _parse_args() 261 | run(args.env, args.num_episodes, args.gamma, 262 | args.verbose, args.plot_stats, args.api_key) 263 | 264 | 265 | if __name__ == '__main__': 266 | main() 267 | -------------------------------------------------------------------------------- /GEN/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Scitator/rl-course-experiments/47e7666e2bc618ca4007372f411c3626fd7efc2b/GEN/README.md -------------------------------------------------------------------------------- /GEN/genetic_gym.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import gym 4 | from gym import wrappers 5 | import argparse 6 | import numpy as np 7 | import random 8 | from tqdm import trange 9 | 10 | 11 | def get_random_policy(env): 
12 | """ 13 | Build a numpy array representing agent policy. 14 | This array must have one element per each of 16 environment states. 15 | Element must be an integer from 0 to 3, representing action 16 | to take from that state. 17 | """ 18 | return np.random.randint(0, int(env.action_space.n), int(env.observation_space.n)) 19 | 20 | 21 | def sample_reward(env, policy, t_max=100): 22 | """ 23 | Interact with an environment, return sum of all rewards. 24 | If game doesn't end on t_max (e.g. agent walks into a wall), 25 | force end the game and return whatever reward you got so far. 26 | Tip: see signature of env.step(...) method above. 27 | """ 28 | s = env.reset() 29 | total_reward = 0 30 | 31 | for _ in range(t_max): 32 | action = policy[s] 33 | s, reward, done, info = env.step(action) 34 | total_reward += reward 35 | if done: 36 | break 37 | return total_reward 38 | 39 | 40 | def evaluate(sample_func, env, policy, n_times=100): 41 | """Run several evaluations and average the score the policy gets.""" 42 | rewards = [sample_func(env, policy) for _ in range(n_times)] 43 | return float(np.mean(rewards)) 44 | 45 | 46 | def crossover(env, policy1, policy2, p=0.5, prioritize_func=None): 47 | """ 48 | for each state, with probability p take action from policy1, else policy2 49 | """ 50 | if prioritize_func is not None: 51 | p = prioritize_func(env, policy1, policy2, p) 52 | return np.choose( 53 | (np.random.random_sample(policy1.shape[0]) <= p).astype(int), [policy1, policy2]) 54 | 55 | 56 | def mutation(env, policy, p=0.1): 57 | """ 58 | for each state, with probability p replace action with random action 59 | Tip: mutation can be written as crossover with random policy 60 | """ 61 | return crossover(env, get_random_policy(env), policy, p) 62 | 63 | 64 | def run(env, n_episodes, max_steps, 65 | pool_size, n_crossovers, n_mutations, 66 | seed=42, verbose=False, api_key=None): 67 | random.seed(seed) 68 | np.random.seed(seed) 69 | 70 | env_name = env 71 | env = gym.make(env).env 72 | env.reset() 73 | 74 | if api_key is not None: 75 | env = gym.wrappers.Monitor(env, "/tmp/" + env_name, force=True) 76 | 77 | if verbose: 78 | print("initializing...") 79 | pool = [get_random_policy(env) for _ in range(pool_size)] 80 | 81 | rewards = np.zeros(n_episodes) 82 | 83 | tr = trange( 84 | n_episodes, 85 | desc="best score: {:.4}".format(0.0), 86 | leave=True) 87 | 88 | def sample_func(env, policy): 89 | return sample_reward( 90 | env, policy, t_max=max_steps if api_key is None else int(1e10)) 91 | 92 | def prioritize_func(env, policy1, policy2, p): 93 | return min( 94 | p * evaluate(sample_func, env, policy1) / (evaluate(sample_func, env, policy2) + 0.001), 95 | 1.0) 96 | 97 | for i_epoch in tr: 98 | crossovered = [ 99 | crossover(env, random.choice(pool), random.choice(pool), 100 | prioritize_func=prioritize_func) 101 | for _ in range(n_crossovers)] 102 | mutated = [mutation(env, random.choice(pool)) for _ in range(n_mutations)] 103 | 104 | assert type(crossovered) == type(mutated) == list 105 | 106 | # add new policies to the pool 107 | pool = pool + crossovered + mutated 108 | pool_scores = list(map(lambda x: evaluate(sample_func, env, x), pool)) 109 | 110 | # select pool_size best policies 111 | selected_indices = np.argsort(pool_scores)[-pool_size:] 112 | pool = [pool[i] for i in selected_indices] 113 | pool_scores = [pool_scores[i] for i in selected_indices] 114 | 115 | # print the best policy so far (last in ascending score order) 116 | tr.set_description("best score: {:.4}".format(pool_scores[-1])) 
117 | rewards[i_epoch] = pool_scores[-1] 118 | 119 | print("Avg rewards over {} episodes: {:.4f} +/-{:.4f}".format( 120 | n_episodes, np.mean(rewards), np.std(rewards))) 121 | if api_key is not None: 122 | env.close() 123 | gym.upload("/tmp/" + env_name, api_key=api_key) 124 | 125 | 126 | def _parse_args(): 127 | parser = argparse.ArgumentParser(description='Policy iteration example') 128 | parser.add_argument( 129 | '--env', 130 | type=str, 131 | default='FrozenLake8x8-v0', 132 | help='The environment to use') 133 | parser.add_argument( 134 | '--num_episodes', 135 | type=int, 136 | default=200, 137 | help='Number of episodes') 138 | parser.add_argument( 139 | '--max_steps', 140 | type=int, 141 | default=200, 142 | help='Max number per episode') 143 | parser.add_argument( 144 | '--pool_size', 145 | type=int, 146 | default=200, 147 | help='Population size') 148 | parser.add_argument( 149 | '--n_crossovers', 150 | type=int, 151 | default=100, 152 | help='Number of crossovers per episode') 153 | parser.add_argument( 154 | '--n_mutations', 155 | type=int, 156 | default=100, 157 | help='Number of mutations per episode') 158 | parser.add_argument( 159 | '--seed', 160 | type=int, 161 | default=42) 162 | parser.add_argument( 163 | '--verbose', 164 | action='store_true', 165 | default=False) 166 | parser.add_argument( 167 | '--api_key', 168 | type=str, 169 | default=None) 170 | 171 | args, _ = parser.parse_known_args() 172 | return args 173 | 174 | 175 | def main(): 176 | args = _parse_args() 177 | run(args.env, args.num_episodes, args.max_steps, 178 | args.pool_size, args.n_crossovers, args.n_mutations, 179 | args.seed, args.verbose, args.api_key) 180 | 181 | 182 | if __name__ == '__main__': 183 | main() 184 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Sergey Kolesnikov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MC/.directory: -------------------------------------------------------------------------------- 1 | [Dolphin] 2 | HeaderColumnWidths=570,72,107 3 | Timestamp=2016,12,13,9,50,27 4 | Version=3 5 | ViewMode=1 6 | -------------------------------------------------------------------------------- /MC/Blackjack Playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 419, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import sys\n", 13 | "if \"../\" not in sys.path:\n", 14 | " sys.path.append(\"../\") \n", 15 | "from lib.envs.blackjack import BlackjackEnv" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 420, 21 | "metadata": { 22 | "collapsed": false 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "env = BlackjackEnv()" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 422, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "Player Score: 17 (Usable Ace: False), Dealer Score: 10\n", 41 | "Taking action: Hit\n", 42 | "Player Score: 18 (Usable Ace: False), Dealer Score: 10\n", 43 | "Taking action: Hit\n", 44 | "Player Score: 28 (Usable Ace: False), Dealer Score: 10\n", 45 | "Game end. Reward: -1.0\n", 46 | "\n", 47 | "Player Score: 6 (Usable Ace: False), Dealer Score: 9\n", 48 | "Taking action: Hit\n", 49 | "Player Score: 16 (Usable Ace: False), Dealer Score: 9\n", 50 | "Taking action: Hit\n", 51 | "Player Score: 26 (Usable Ace: False), Dealer Score: 9\n", 52 | "Game end. Reward: -1.0\n", 53 | "\n", 54 | "Player Score: 12 (Usable Ace: False), Dealer Score: 6\n", 55 | "Taking action: Hit\n", 56 | "Player Score: 21 (Usable Ace: False), Dealer Score: 6\n", 57 | "Taking action: Stick\n", 58 | "Player Score: 21 (Usable Ace: False), Dealer Score: 6\n", 59 | "Game end. Reward: 1.0\n", 60 | "\n", 61 | "Player Score: 17 (Usable Ace: True), Dealer Score: 8\n", 62 | "Taking action: Hit\n", 63 | "Player Score: 17 (Usable Ace: False), Dealer Score: 8\n", 64 | "Taking action: Hit\n", 65 | "Player Score: 22 (Usable Ace: False), Dealer Score: 8\n", 66 | "Game end. Reward: -1.0\n", 67 | "\n", 68 | "Player Score: 17 (Usable Ace: False), Dealer Score: 8\n", 69 | "Taking action: Hit\n", 70 | "Player Score: 27 (Usable Ace: False), Dealer Score: 8\n", 71 | "Game end. Reward: -1.0\n", 72 | "\n", 73 | "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", 74 | "Taking action: Hit\n", 75 | "Player Score: 19 (Usable Ace: False), Dealer Score: 10\n", 76 | "Taking action: Hit\n", 77 | "Player Score: 28 (Usable Ace: False), Dealer Score: 10\n", 78 | "Game end. Reward: -1.0\n", 79 | "\n", 80 | "Player Score: 13 (Usable Ace: False), Dealer Score: 7\n", 81 | "Taking action: Hit\n", 82 | "Player Score: 14 (Usable Ace: False), Dealer Score: 7\n", 83 | "Taking action: Hit\n", 84 | "Player Score: 24 (Usable Ace: False), Dealer Score: 7\n", 85 | "Game end. Reward: -1.0\n", 86 | "\n", 87 | "Player Score: 17 (Usable Ace: False), Dealer Score: 5\n", 88 | "Taking action: Hit\n", 89 | "Player Score: 25 (Usable Ace: False), Dealer Score: 5\n", 90 | "Game end. 
Reward: -1.0\n", 91 | "\n", 92 | "Player Score: 20 (Usable Ace: False), Dealer Score: 5\n", 93 | "Taking action: Stick\n", 94 | "Player Score: 20 (Usable Ace: False), Dealer Score: 5\n", 95 | "Game end. Reward: 1.0\n", 96 | "\n", 97 | "Player Score: 12 (Usable Ace: True), Dealer Score: 10\n", 98 | "Taking action: Hit\n", 99 | "Player Score: 20 (Usable Ace: True), Dealer Score: 10\n", 100 | "Taking action: Stick\n", 101 | "Player Score: 20 (Usable Ace: True), Dealer Score: 10\n", 102 | "Game end. Reward: 0.0\n", 103 | "\n", 104 | "Player Score: 12 (Usable Ace: False), Dealer Score: 10\n", 105 | "Taking action: Hit\n", 106 | "Player Score: 19 (Usable Ace: False), Dealer Score: 10\n", 107 | "Taking action: Hit\n", 108 | "Player Score: 24 (Usable Ace: False), Dealer Score: 10\n", 109 | "Game end. Reward: -1.0\n", 110 | "\n", 111 | "Player Score: 19 (Usable Ace: False), Dealer Score: 4\n", 112 | "Taking action: Hit\n", 113 | "Player Score: 22 (Usable Ace: False), Dealer Score: 4\n", 114 | "Game end. Reward: -1.0\n", 115 | "\n", 116 | "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", 117 | "Taking action: Hit\n", 118 | "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", 119 | "Taking action: Stick\n", 120 | "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", 121 | "Game end. Reward: 0.0\n", 122 | "\n", 123 | "Player Score: 4 (Usable Ace: False), Dealer Score: 3\n", 124 | "Taking action: Hit\n", 125 | "Player Score: 14 (Usable Ace: False), Dealer Score: 3\n", 126 | "Taking action: Hit\n", 127 | "Player Score: 24 (Usable Ace: False), Dealer Score: 3\n", 128 | "Game end. Reward: -1.0\n", 129 | "\n", 130 | "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", 131 | "Taking action: Stick\n", 132 | "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", 133 | "Game end. Reward: 1.0\n", 134 | "\n", 135 | "Player Score: 16 (Usable Ace: True), Dealer Score: 10\n", 136 | "Taking action: Hit\n", 137 | "Player Score: 12 (Usable Ace: False), Dealer Score: 10\n", 138 | "Taking action: Hit\n", 139 | "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", 140 | "Taking action: Stick\n", 141 | "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", 142 | "Game end. Reward: 1.0\n", 143 | "\n", 144 | "Player Score: 9 (Usable Ace: False), Dealer Score: 10\n", 145 | "Taking action: Hit\n", 146 | "Player Score: 19 (Usable Ace: False), Dealer Score: 10\n", 147 | "Taking action: Hit\n", 148 | "Player Score: 26 (Usable Ace: False), Dealer Score: 10\n", 149 | "Game end. Reward: -1.0\n", 150 | "\n", 151 | "Player Score: 12 (Usable Ace: False), Dealer Score: 5\n", 152 | "Taking action: Hit\n", 153 | "Player Score: 15 (Usable Ace: False), Dealer Score: 5\n", 154 | "Taking action: Hit\n", 155 | "Player Score: 21 (Usable Ace: False), Dealer Score: 5\n", 156 | "Taking action: Stick\n", 157 | "Player Score: 21 (Usable Ace: False), Dealer Score: 5\n", 158 | "Game end. Reward: 1.0\n", 159 | "\n", 160 | "Player Score: 11 (Usable Ace: False), Dealer Score: 9\n", 161 | "Taking action: Hit\n", 162 | "Player Score: 13 (Usable Ace: False), Dealer Score: 9\n", 163 | "Taking action: Hit\n", 164 | "Player Score: 17 (Usable Ace: False), Dealer Score: 9\n", 165 | "Taking action: Hit\n", 166 | "Player Score: 19 (Usable Ace: False), Dealer Score: 9\n", 167 | "Taking action: Hit\n", 168 | "Player Score: 29 (Usable Ace: False), Dealer Score: 9\n", 169 | "Game end. 
Reward: -1.0\n", 170 | "\n", 171 | "Player Score: 14 (Usable Ace: False), Dealer Score: 7\n", 172 | "Taking action: Hit\n", 173 | "Player Score: 19 (Usable Ace: False), Dealer Score: 7\n", 174 | "Taking action: Hit\n", 175 | "Player Score: 29 (Usable Ace: False), Dealer Score: 7\n", 176 | "Game end. Reward: -1.0\n", 177 | "\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "def print_observation(observation):\n", 183 | " score, dealer_score, usable_ace = observation\n", 184 | " print(\"Player Score: {} (Usable Ace: {}), Dealer Score: {}\".format(\n", 185 | " score, usable_ace, dealer_score))\n", 186 | "\n", 187 | "def strategy(observation):\n", 188 | " score, dealer_score, usable_ace = observation\n", 189 | " # Stick (action 0) if the score is > 20, hit (action 1) otherwise\n", 190 | " return 0 if score >= 20 else 1\n", 191 | "\n", 192 | "for i_episode in range(20):\n", 193 | " observation = env.reset()\n", 194 | " for t in range(100):\n", 195 | " print_observation(observation)\n", 196 | " action = strategy(observation)\n", 197 | " print(\"Taking action: {}\".format( [\"Stick\", \"Hit\"][action]))\n", 198 | " observation, reward, done, _ = env.step(action)\n", 199 | " if done:\n", 200 | " print_observation(observation)\n", 201 | " print(\"Game end. Reward: {}\\n\".format(float(reward)))\n", 202 | " break" 203 | ] 204 | } 205 | ], 206 | "metadata": { 207 | "kernelspec": { 208 | "display_name": "Python 3", 209 | "language": "python", 210 | "name": "python3" 211 | }, 212 | "language_info": { 213 | "codemirror_mode": { 214 | "name": "ipython", 215 | "version": 3 216 | }, 217 | "file_extension": ".py", 218 | "mimetype": "text/x-python", 219 | "name": "python", 220 | "nbconvert_exporter": "python", 221 | "pygments_lexer": "ipython3", 222 | "version": "3.5.1" 223 | } 224 | }, 225 | "nbformat": 4, 226 | "nbformat_minor": 0 227 | } 228 | -------------------------------------------------------------------------------- /MC/README.md: -------------------------------------------------------------------------------- 1 | ## Model-Free Prediction & Control with Monte Carlo (MC) 2 | 3 | 4 | ### Learning Goals 5 | 6 | - Understand the difference between Prediction and Control 7 | - Know how to use the MC method for predicting state values and state-action values 8 | - Understand the on-policy first-visit MC control algorithm 9 | - Understand off-policy MC control algorithms 10 | - Understand Weighted Importance Sampling 11 | - Understand the benefits of MC algorithms over the Dynamic Programming approach 12 | 13 | 14 | ### Summary 15 | 16 | - Dynamic Programming approaches assume complete knowledge of the environment (the MDP). In practice, we often don't have full knowledge of how the world works. 17 | - Monte Carlo (MC) methods can learn directly from experience collected by interacting with the environment. An episode of experience is a series of `(State, Action, Reward, Next State)` tuples. 18 | - MC methods work based on episodes. We sample episodes of experience and make updates to our estimates at the end of each episode. MC methods have high variance (due to lots of random decisions within an episode) but are unbiased. 19 | - MC Policy Evaluation: Given a policy, we want to estimate the state-value function V(s). Sample episodes of experience and estimate V(s) to be the reward received from that state onwards averaged across all of your experience. The same technique works for the action-value function Q(s, a). Given enough samples, this is proven to converge. 
20 | - MC Control: Idea is the same as for Dynamic Programming. Use MC Policy Evaluation to evaluate the current policy then improve the policy greedily. The Problem: How do we ensure that we explore all states if we don't know the full environment? 21 | - Solution to exploration problem: Use epsilon-greedy policies instead of full greedy policies. When making a decision act randomly with probability epsilon. This will learn the optimal epsilon-greedy policy. 22 | - Off-Policy Learning: How can we learn about the actual optimal (greedy) policy while following an exploratory (epsilon-greedy) policy? We can use importance sampling, which weighs returns by their probability of occurring under the policy we want to learn about. 23 | 24 | 25 | ### Lectures & Readings 26 | 27 | **Required:** 28 | 29 | - [Reinforcement Learning: An Introduction](https://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf) - Chapter 5: Monte Carlo Methods 30 | 31 | 32 | **Optional:** 33 | 34 | - David Silver's RL Course Lecture 4 - Model-Free Prediction ([video](https://www.youtube.com/watch?v=PnHCvfgC_ZA), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MC-TD.pdf)) 35 | - David Silver's RL Course Lecture 5 - Model-Free Control ([video](https://www.youtube.com/watch?v=0g4j2k_Ggc4), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/control.pdf)) 36 | 37 | 38 | ### Exercises 39 | 40 | - [Get familiar with the Blackjack environment (Blackjack-v0)](Blackjack Playground.ipynb) 41 | - Implement the Monte Carlo Prediction to estimate state-action values 42 | - [Exercise](MC Prediction.ipynb) 43 | - [Solution](MC Prediction Solution.ipynb) 44 | - Implement the on-policy first-visit Monte Carlo Control algorithm 45 | - [Exercise](MC Control with Epsilon-Greedy Policies.ipynb) 46 | - [Solution](MC Control with Epsilon-Greedy Policies Solution.ipynb) 47 | - Implement the off-policy every-visit Monte Carlo Control using Weighted Important Sampling algorithm 48 | - [Exercise](Off-Policy MC Control with Weighted Importance Sampling.ipynb) 49 | - [Solution](Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb) -------------------------------------------------------------------------------- /PG/.gitignore: -------------------------------------------------------------------------------- 1 | *.pkl 2 | logs* 3 | -------------------------------------------------------------------------------- /PG/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Scitator/rl-course-experiments/47e7666e2bc618ca4007372f411c3626fd7efc2b/PG/README.md -------------------------------------------------------------------------------- /PG/reinforce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import tensorflow as tf 4 | from agents.agent_states import LinearHiddenState 5 | from rstools.tf.optimization import build_model_optimization 6 | 7 | from agents.agent_networks import FeatureNet, PolicyNet 8 | 9 | 10 | class ReinforceAgent(object): 11 | def __init__(self, state_shape, n_actions, network, special=None): 12 | self.special = special or {} 13 | self.state_shape = state_shape 14 | self.n_actions = n_actions 15 | self.special = special 16 | 17 | self.scope = tf.get_variable_scope().name + "/" + special.get("scope", "dqn") \ 18 | if tf.get_variable_scope().name else special.get("scope", "dqn") 19 | 20 | with tf.variable_scope(self.scope): 21 | 
self._build_graph(network) 22 | 23 | def _build_graph(self, network): 24 | self.feature_net = FeatureNet( 25 | self.state_shape, network, 26 | self.special.get("feature_net", {})) 27 | 28 | self.hidden_state = LinearHiddenState( 29 | self.feature_net.feature_state, 30 | self.special.get("hidden_size", 512), 31 | self.special.get("hidden_activation", tf.nn.elu)) 32 | 33 | self.policy_net = PolicyNet( 34 | self.hidden_state.state, self.n_actions, 35 | self.special.get("policy_net", {})) 36 | 37 | build_model_optimization( 38 | self.policy_net, 39 | self.special.get("policy_net_optimization", None)) 40 | build_model_optimization( 41 | self.hidden_state, 42 | self.special.get("hidden_state_optimization", None), 43 | loss=self.policy_net.loss) 44 | build_model_optimization( 45 | self.feature_net, 46 | self.special.get("feature_net_optimization", None), 47 | loss=self.policy_net.loss) 48 | 49 | def predict_probs(self, sess, state_batch, is_training=False): 50 | return sess.run( 51 | self.policy_net.predicted_probs, 52 | feed_dict={ 53 | self.feature_net.states: state_batch, 54 | self.feature_net.is_training: is_training}) 55 | -------------------------------------------------------------------------------- /PG/run_reinforce.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | from rstools.utils.batch_utils import iterate_minibatches 5 | from tqdm import trange 6 | 7 | from PG.reinforce import ReinforceAgent 8 | from common.networks import activations 9 | from wrappers.gym_wrappers import Transition 10 | from wrappers.run_wrappers import typical_args, typical_argsparse, run_wrapper, update_wraper, \ 11 | epsilon_greedy_policy, play_session 12 | 13 | 14 | def update(sess, reinforce_agent, transitions, initial_state=None, 15 | discount_factor=0.99, reward_norm=1.0, batch_size=32, time_major=True): 16 | policy_targets = [] 17 | state_history = [] 18 | action_history = [] 19 | 20 | cumulative_reward = np.zeros_like(transitions[-1].reward) 21 | for transition in reversed(transitions): 22 | cumulative_reward = reward_norm * transition.reward + \ 23 | np.invert(transition.done) * discount_factor * cumulative_reward 24 | 25 | policy_targets.append(cumulative_reward) 26 | state_history.append(transition.state) 27 | action_history.append(transition.action) 28 | 29 | # time-major 30 | policy_targets = np.array(policy_targets[::-1]) 31 | state_history = np.array(state_history[::-1]) 32 | action_history = np.array(action_history[::-1]) 33 | 34 | time_len = state_history.shape[0] 35 | 36 | policy_loss = 0.0 37 | for state_axis, action_axis, policy_target_axis in \ 38 | zip(state_history, action_history, policy_targets): 39 | axis_len = state_axis.shape[0] 40 | axis_policy_loss = 0.0 41 | 42 | state_axis = iterate_minibatches(state_axis, batch_size) 43 | action_axis = iterate_minibatches(action_axis, batch_size) 44 | policy_target_axis = iterate_minibatches(policy_target_axis, batch_size) 45 | 46 | for state_batch, action_batch, policy_target in \ 47 | zip(state_axis, action_axis, policy_target_axis): 48 | run_params = [ 49 | reinforce_agent.policy_net.loss, 50 | reinforce_agent.policy_net.train_op, 51 | reinforce_agent.feature_net.train_op] 52 | feed_params = { 53 | reinforce_agent.feature_net.states: state_batch, 54 | reinforce_agent.feature_net.is_training: True, 55 | reinforce_agent.policy_net.actions: action_batch, 56 | reinforce_agent.policy_net.cumulative_rewards: policy_target, 57 | reinforce_agent.policy_net.is_training: 
True 58 | } 59 | 60 | run_result = sess.run( 61 | run_params, 62 | feed_dict=feed_params) 63 | 64 | batch_loss_policy = run_result[0] 65 | 66 | axis_policy_loss += batch_loss_policy 67 | 68 | policy_loss += axis_policy_loss / axis_len 69 | 70 | return policy_loss / time_len 71 | 72 | 73 | def generate_sessions(sess, a3c_agent, env_pool, update_fn, t_max=1000): 74 | total_reward = 0.0 75 | total_games = 0.0 76 | 77 | transitions = [] 78 | 79 | states = env_pool.pool_states() 80 | for t in range(t_max): 81 | actions = epsilon_greedy_policy(a3c_agent, sess, states) 82 | next_states, rewards, dones, _ = env_pool.step(actions) 83 | 84 | transitions.append(Transition( 85 | state=states, action=actions, reward=rewards, next_state=next_states, done=dones)) 86 | states = next_states 87 | 88 | total_reward += rewards.sum() 89 | total_games += dones.sum() 90 | 91 | if env_pool.n_envs == 1 and total_games > 0: 92 | break 93 | 94 | total_policy_loss = update_fn(sess, a3c_agent, transitions) 95 | 96 | return total_reward / env_pool.n_envs, \ 97 | total_policy_loss, \ 98 | t / (total_games / env_pool.n_envs) 99 | 100 | 101 | def reinforce_learning( 102 | sess, agent, env, update_fn, 103 | n_epochs=1000, n_sessions=100, t_max=1000): 104 | tr = trange( 105 | n_epochs, 106 | desc="", 107 | leave=True) 108 | 109 | history = { 110 | "reward": np.zeros(n_epochs), 111 | "policy_loss": np.zeros(n_epochs), 112 | "steps": np.zeros(n_epochs), 113 | } 114 | 115 | for i in tr: 116 | sessions = [ 117 | generate_sessions(sess, agent, env, update_fn, t_max) 118 | for _ in range(n_sessions)] 119 | session_rewards, session_policy_loss, session_steps = \ 120 | map(np.array, zip(*sessions)) 121 | 122 | history["reward"][i] = np.mean(session_rewards) 123 | history["policy_loss"][i] = np.mean(session_policy_loss) 124 | history["steps"][i] = np.mean(session_steps) 125 | 126 | desc = "\t".join( 127 | ["{} = {:.3f}".format(key, value[i]) for key, value in history.items()]) 128 | tr.set_description(desc) 129 | 130 | return history 131 | 132 | 133 | def run(env_name, make_env_fn, agent_cls, 134 | run_args, update_args, agent_agrs, 135 | log_dir=None, episode_limit=None, 136 | plot_stats=False, api_key=None, 137 | load=False, gpu_option=0.4, 138 | n_games=10): 139 | run_wrapper( 140 | n_games, reinforce_learning, update_wraper(update, **update_args), 141 | play_session, epsilon_greedy_policy, 142 | env_name, make_env_fn, agent_cls, 143 | run_args, agent_agrs, 144 | log_dir=log_dir, episode_limit=episode_limit, 145 | plot_stats=plot_stats, api_key=api_key, 146 | load=load, gpu_option=gpu_option) 147 | 148 | 149 | def _parse_args(): 150 | parser = argparse.ArgumentParser(description='Reinforce Agent Learning') 151 | # typical params 152 | parser = typical_args(parser) 153 | 154 | # agent special params & optimization 155 | parser.add_argument( 156 | '--policy_lr', 157 | type=float, 158 | default=1e-5, 159 | help='Learning rate for policy network. (default: %(default)s)') 160 | 161 | parser.add_argument( 162 | '--entropy_factor', 163 | type=float, 164 | default=1e-2, 165 | help='Entropy factor for policy network. 
(default: %(default)s)') 166 | 167 | args = parser.parse_args() 168 | return args 169 | 170 | 171 | def main(): 172 | args = _parse_args() 173 | 174 | assert args.time_major, "Please, use time_major flag for updates" 175 | 176 | network, run_args, update_args, optimization_params, make_env_fn = typical_argsparse(args) 177 | 178 | policy_optimization_params = { 179 | **optimization_params, 180 | **{"initial_lr": args.policy_lr} 181 | } 182 | 183 | policy_net_params = { 184 | "entropy_factor": args.entropy_factor 185 | } 186 | 187 | agent_cls = ReinforceAgent 188 | 189 | special = { 190 | "policy_net": policy_net_params, 191 | "hidden_size": args.hidden_size, 192 | "hidden_activation": activations[args.hidden_activation], 193 | "feature_net_optimization": optimization_params, 194 | "hidden_state_optimization": optimization_params, 195 | "policy_net_optimization": policy_optimization_params, 196 | } 197 | 198 | agent_args = { 199 | "network": network, 200 | "special": special 201 | } 202 | 203 | run(args.env, make_env_fn, agent_cls, 204 | run_args, update_args, agent_args, 205 | args.log_dir, args.episode_limit, 206 | args.plot_history, args.api_key, 207 | args.load, args.gpu_option, 208 | args.n_games) 209 | 210 | 211 | if __name__ == '__main__': 212 | main() 213 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RL course experiments 2 | 3 | ### Overview 4 | This repository provides code implementations for popular Reinforcement Learning algorithms. 5 | 6 | The main idea is to generalise the main RL algorithms and provide a unified interface for testing them on any gym environment. 7 | For example, you can now create your own Double Dueling Deep Recurrent Q-Learning agent (let's name it 3DRQ); see the example usage below. 8 | For simplicity, all the main agent building blocks are in the `agents` folder. 9 | 10 | For now, the repository is under after-course refactoring, so more documentation is still needed. 11 | 12 | All code is written in Python 3 and uses RL environments from OpenAI Gym. 13 | Advanced techniques use Tensorflow for neural network implementations.
14 | 15 | ### Inspired by: 16 | * [Berkeley CS188x](http://ai.berkeley.edu/home.html) 17 | * [David Silver's Reinforcement Learning Course](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) 18 | * [dennybritz/reinforcement-learning](https://github.com/dennybritz/reinforcement-learning) 19 | * [yandexdataschool/Practical_RL](https://github.com/yandexdataschool/Practical_RL) 20 | * [yandexdataschool/AgentNet](https://github.com/yandexdataschool/AgentNet) 21 | 22 | ##### Additional thanks to [JustHeuristic](https://github.com/justheuristic) for the Practical_RL course 23 | 24 | ### Table of Contents 25 | * [Genetic algorithm](https://github.com/Scitator/rl-course-experiments/tree/master/GEN) 26 | * [Dynamic Programming](https://github.com/Scitator/rl-course-experiments/tree/master/DP) 27 | * [Cross Entropy Method](https://github.com/Scitator/rl-course-experiments/tree/master/CEM) 28 | * [Monte Carlo Control](https://github.com/Scitator/rl-course-experiments/tree/master/MC) 29 | * [Temporal Difference](https://github.com/Scitator/rl-course-experiments/tree/master/TD) 30 | * [Deep Q-Networks](https://github.com/Scitator/rl-course-experiments/tree/master/DQN) 31 | * [Policy Gradient](https://github.com/Scitator/rl-course-experiments/tree/master/PG) 32 | * [Asynchronous Advantage Actor-Critic](https://github.com/Scitator/rl-course-experiments/tree/master/A3C) 33 | * [Optimality Tightening](https://arxiv.org/abs/1611.01606) [TODO] 34 | * [Trust Region Policy Optimization](https://arxiv.org/abs/1502.05477) [TODO] 35 | * Continuous action space [TODO] 36 | * Monte Carlo Tree Search [TODO] 37 | 38 | For more information, look at each folder's README. 39 | 40 | #### Special requirements 41 | 42 | To run the scripts, you need to install an additional [repo](https://github.com/Scitator/rstools) with neural network optimization utilities: 43 | 44 | `pip install git+https://github.com/Scitator/rstools` 45 | 46 | #### Example usage 47 | 48 | DQN: 49 | 50 | ``` 51 | PYTHONPATH=. python DQN/run_dqn.py --plot_history --env CartPole-v0 \ 52 | --feature_network linear --layers 128-128 --hidden_size 64 \ 53 | --n_epochs 1000 --n_games 4 --batch_size 128 --t_max 500 --episode_limit 500 \ 54 | --replay_buffer simple --replay_buffer_size 2000 \ 55 | --qvalue_lr 0.0001 --feature_lr 0.0001 --value_lr 0.0001 \ 56 | --initial_epsilon 0.8 --final_epsilon 0.1 \ 57 | --gpu_option 0.25 \ 58 | --api_key 59 | ``` 60 | 61 | Reinforce: 62 | 63 | ``` 64 | PYTHONPATH=. python PG/run_reinforce.py --plot_history --env CartPole-v0 \ 65 | --feature_network linear --layers 128-128 --hidden_size 64 \ 66 | --n_epochs 10000 --n_games 1 --batch_size 1 --t_max 500 --episode_limit 500 \ 67 | --entropy_factor 0.005 --policy_lr 0.0000001 --feature_lr 0.0000001 --grad_clip 10.0 \ 68 | --gpu_option 0.25 --time_major \ 69 | --api_key 70 | ``` 71 | 72 | Feed-Forward Asynchronous Advantage Actor-Critic: 73 | 74 | ``` 75 | PYTHONPATH=. python A3C/run_a3c.py --plot_history --env CartPole-v0 \ 76 | --feature_network linear --layers 128-128 --hidden_size 64 \ 77 | --n_epochs 500 --n_games 1 --batch_size 1 --t_max 100 --episode_limit 500 \ 78 | --entropy_factor 0.005 --policy_lr 0.00001 --feature_lr 0.00001 --value_lr 0.00001 --grad_clip 10.0 \ 79 | --gpu_option 0.25 --time_major \ 80 | --api_key 81 | ``` 82 | 83 | If the agent starts to play well, you can always stop training with the `Ctrl+C` hotkey. 84 | If something goes wrong, you can always evaluate the agent through the magic `--load --n_epochs 0` 85 | combination.
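The building blocks above also compose. For example, the Double Dueling Deep Recurrent Q-Learning ("3DRQ") agent mentioned in the overview can be requested from the same DQN runner by combining the existing flags (`--agent drqn`, `--dueling_dqn`, `--use_target_net`). This is an untested sketch; the hyperparameters are only a starting point:

```
PYTHONPATH=. python DQN/run_dqn.py --plot_history --env CartPole-v0 \
  --agent drqn --dueling_dqn --use_target_net --copy_n_epoch 5 \
  --feature_network linear --layers 128-128 --hidden_size 64 \
  --n_epochs 1000 --n_games 4 --batch_size 128 --t_max 500 --episode_limit 500 \
  --qvalue_lr 0.0001 --feature_lr 0.0001 --value_lr 0.0001 \
  --initial_epsilon 0.8 --final_epsilon 0.1 \
  --gpu_option 0.25 \
  --api_key
```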
86 | 
87 | ##### Metrics
88 | 
89 | - loss - typical neural network loss
90 | - reward - typical environment reward;
91 |   because an Environment Pool is always used, it is not very informative for now
92 | - steps - mean number of finished games per epoch session
93 | 
94 | ##### If you have Linux with an NVIDIA GPU and no X server, but want to try gym
95 | 
96 | You need to reinstall the NVIDIA drivers.
97 | 
98 | [issue source](https://github.com/openai/gym/issues/366)
99 | [how-to guide](https://davidsanwald.github.io/2016/11/13/building-tensorflow-with-gpu-support.html)
100 | 
101 | and add `bash xvfb start; DISPLAY=:1` before the run command.
102 | 
103 | #### Contributing
104 | 
105 | ##### write code
106 | 
107 | Found a bug or know how to write something more simply?
108 | Or maybe you want to create your own agent?
109 | Just follow PEP8 and make a merge request.
110 | 
111 | ##### ...or play a game
112 | 
113 | We have a lot of RL algorithms, and even more gym environments to test them on.
114 | So, play a game and save:
115 | * the agent parameters (so anyone can reproduce the result)
116 | * the agent itself (`model.ckpt*`)
117 | * the plots (they will be generated automatically with the `--plot_history` flag)
118 | * the gym link (main results)
119 | * then make a merge request (solutions should go to `<folder>/solutions.md`, for example `DQN/solutions.md`)
120 | 
--------------------------------------------------------------------------------
/TD/README.md:
--------------------------------------------------------------------------------
1 | ## Model-Free Prediction & Control with Temporal Difference (TD) and Q-Learning
2 | 
3 | 
4 | ### Learning Goals
5 | 
6 | - Understand TD(0) for prediction
7 | - Understand SARSA for on-policy control
8 | - Understand Q-Learning for off-policy control
9 | - Understand the benefits of TD algorithms over MC and DP approaches
10 | - Understand how n-step methods unify MC and TD approaches
11 | - Understand the backward and forward view of TD-Lambda
12 | 
13 | 
14 | ### Summary
15 | 
16 | - TD-Learning is a combination of Monte Carlo and Dynamic Programming ideas. Like Monte Carlo, TD works from samples and doesn't require a model of the environment. Like Dynamic Programming, TD uses bootstrapping to make updates.
17 | - Whether MC or TD is better depends on the problem, and there are no theoretical results that prove a clear winner.
18 | - General Update Rule: `Q[s,a] += learning_rate * (td_target - Q[s,a])`. `td_target - Q[s,a]` is also called the TD Error (see the sketch after this list).
19 | - SARSA: On-Policy TD Control
20 | - TD Target for SARSA: `R[t+1] + discount_factor * Q[next_state][next_action]`
21 | - Q-Learning: Off-Policy TD Control
22 | - TD Target for Q-Learning: `R[t+1] + discount_factor * max(Q[next_state])`
23 | - Q-Learning has a positive bias because it uses the maximum of the estimated Q values to estimate the maximum action value, all from the same experience. Double Q-Learning gets around this by splitting the experience and using different Q functions for maximization and estimation.
24 | - N-step methods unify the MC and TD approaches. They make updates based on n steps instead of a single step (TD(0)) or a full episode (MC).
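
To make the general update rule and the two TD targets above concrete, here is a minimal tabular sketch (a toy illustration only, not the agents implemented in this folder — those follow in `qlearning.py`, `sarsa.py` and `evsarsa.py`):

```python
import random
from collections import defaultdict

Q = defaultdict(lambda: defaultdict(float))  # tabular Q[s][a], defaults to 0.0
actions = [0, 1]
learning_rate, discount_factor = 0.05, 0.99


def td_update(state, action, td_target):
    # General Update Rule: Q[s,a] += learning_rate * (td_target - Q[s,a])
    Q[state][action] += learning_rate * (td_target - Q[state][action])


# dummy transition (state, action, reward, next_state)
s, a, r, next_s = 0, 1, 1.0, 2

# SARSA (on-policy): bootstrap from the action the behaviour policy actually takes next
next_a = random.choice(actions)
sarsa_target = r + discount_factor * Q[next_s][next_a]
td_update(s, a, sarsa_target)

# Q-Learning (off-policy): bootstrap from the greedy action in the next state
qlearning_target = r + discount_factor * max(Q[next_s][b] for b in actions)
td_update(s, a, qlearning_target)
```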
25 | 
26 | 
27 | ### Lectures & Readings
28 | 
29 | **Required:**
30 | 
31 | - [Reinforcement Learning: An Introduction](https://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf) - Chapter 6: Temporal-Difference Learning
32 | - David Silver's RL Course Lecture 4 - Model-Free Prediction ([video](https://www.youtube.com/watch?v=PnHCvfgC_ZA), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MC-TD.pdf))
33 | - David Silver's RL Course Lecture 5 - Model-Free Control ([video](https://www.youtube.com/watch?v=0g4j2k_Ggc4), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/control.pdf))
34 | 
35 | **Optional:**
36 | 
37 | - [Reinforcement Learning: An Introduction](https://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf) - Chapter 7: Multi-Step Bootstrapping
38 | - [Reinforcement Learning: An Introduction](https://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf) - Chapter 12: Eligibility Traces
39 | 
40 | 
--------------------------------------------------------------------------------
/TD/evsarsa.py:
--------------------------------------------------------------------------------
1 | """
2 | Expected Value SARSA
3 | This file builds upon the same functions as the Q-learning agent (qlearning.py).
4 | 
5 | Here's a usage example:
6 | from evsarsa import EVSarsaAgent
7 | 
8 | agent = EVSarsaAgent(
9 |     alpha=0.5, epsilon=0.25, discount=0.99,
10 |     getLegalActions=lambda s: actions_from_that_state)
11 | action = agent.getAction(state)
12 | agent.update(state, action, next_state, reward)
13 | agent.epsilon *= 0.99
14 | """
15 | 
16 | import random
17 | 
18 | import numpy as np
19 | from collections import defaultdict
20 | 
21 | 
22 | class EVSarsaAgent(object):
23 |     """
24 |     Expected Value SARSA Agent.
25 | 
26 |     The two main methods are
27 |     - self.getAction(state) - returns agent's action in that state
28 |     - self.update(state,action,nextState,reward) - updates the agent's Q-values
29 | 
30 |     Instance variables you have access to
31 |     - self.epsilon (exploration prob)
32 |     - self.alpha (learning rate)
33 |     - self.discount (discount rate aka gamma)
34 | 
35 |     """
36 | 
37 |     def __init__(self, alpha, epsilon, discount, getLegalActions):
38 |         "We initialize agent and Q-values here."
39 |         self.getLegalActions = getLegalActions
40 |         self._qValues = defaultdict(lambda: defaultdict(lambda: 0))
41 |         self.alpha = alpha
42 |         self.epsilon = epsilon
43 |         self.discount = discount
44 | 
45 |     def getQValue(self, state, action):
46 |         """
47 |         Returns Q(state,action)
48 |         """
49 |         return self._qValues[state][action]
50 | 
51 |     def setQValue(self, state, action, value):
52 |         """
53 |         Sets the Qvalue for [state,action] to the given value
54 |         """
55 |         self._qValues[state][action] = value
56 | 
57 |     def getValue(self, state):
58 |         """
59 |         Returns the expected value of Q(state,action) under the current
60 |         epsilon-greedy policy, where the expectation is over legal actions.
61 |         """
62 | 
63 |         possibleActions = self.getLegalActions(state)
64 |         # If there are no legal actions, return 0.0
65 |         if len(possibleActions) == 0:
66 |             return 0.0
67 | 
68 |         # Action probabilities: epsilon/|A| per action, plus (1 - epsilon) on the greedy one
69 |         epsilon = self.epsilon
70 | 
71 |         value = np.array([self.getQValue(state, a) for a in possibleActions])
72 |         value = (value * epsilon / float(len(possibleActions))).sum() + \
73 |             value.max() * (1.0 - epsilon)
74 |         return value
75 | 
76 |     def getPolicy(self, state):
77 |         """
78 |         Compute the best action to take in a state.
79 | """ 80 | possibleActions = self.getLegalActions(state) 81 | 82 | # If there are no legal actions, return None 83 | if len(possibleActions) == 0: 84 | return None 85 | 86 | best_action = possibleActions[ 87 | np.argmax([self.getQValue(state, a) for a in possibleActions])] 88 | return best_action 89 | 90 | def getAction(self, state): 91 | """ 92 | Compute the action to take in the current state, including exploration. 93 | 94 | With probability self.epsilon, we should take a random action. 95 | otherwise - the best policy action (self.getPolicy). 96 | """ 97 | 98 | # Pick Action 99 | possibleActions = self.getLegalActions(state) 100 | action = None 101 | 102 | # If there are no legal actions, return None 103 | if len(possibleActions) == 0: 104 | return None 105 | 106 | # agent parameters: 107 | epsilon = self.epsilon 108 | 109 | if np.random.random() <= epsilon: 110 | action = random.choice(possibleActions) 111 | else: 112 | action = self.getPolicy(state) 113 | return action 114 | 115 | def update(self, state, action, nextState, reward): 116 | """ 117 | You should do your Q-Value update here 118 | """ 119 | # agent parameters 120 | gamma = self.discount 121 | learning_rate = self.alpha 122 | 123 | reference_qvalue = reward + gamma * self.getValue(nextState) 124 | updated_qvalue = (1 - learning_rate) * self.getQValue(state, action) + \ 125 | learning_rate * reference_qvalue 126 | self.setQValue(state, action, updated_qvalue) 127 | -------------------------------------------------------------------------------- /TD/qlearning.py: -------------------------------------------------------------------------------- 1 | """ 2 | Q-learning Agent 3 | 4 | Here's an example: 5 | from qlearning import QLearningAgent 6 | 7 | agent = QLearningAgent( 8 | alpha=0.5,epsilon=0.25,discount=0.99, 9 | getLegalActions = lambda s: actions_from_that_state) 10 | action = agent.getAction(state) 11 | agent.update(state,action, next_state,reward) 12 | agent.epsilon *= 0.99 13 | """ 14 | 15 | import random 16 | 17 | import numpy as np 18 | from collections import defaultdict 19 | 20 | 21 | class QLearningAgent(object): 22 | """ 23 | Q-Learning Agent 24 | 25 | The two main methods are 26 | - self.getAction(state) - returns agent's action in that state 27 | - self.update(state,action,nextState,reward) - returns agent's next action 28 | 29 | Functions you should use 30 | - self.getLegalActions(state) 31 | which returns legal actions for a state 32 | - self.getQValue(state,action) 33 | which returns Q(state,action) 34 | - self.setQValue(state,action,value) 35 | which sets Q(state,action) := value 36 | 37 | !!!Important!!! 38 | NOTE: please avoid using self._qValues directly to make code cleaner 39 | """ 40 | 41 | def __init__(self, alpha, epsilon, discount, getLegalActions): 42 | "We initialize agent and Q-values here." 43 | self.getLegalActions = getLegalActions 44 | self._qValues = defaultdict(lambda: defaultdict(lambda: 0)) 45 | self.alpha = alpha 46 | self.epsilon = epsilon 47 | self.discount = discount 48 | 49 | def getQValue(self, state, action): 50 | """ 51 | Returns Q(state,action) 52 | """ 53 | return self._qValues[state][action] 54 | 55 | def setQValue(self, state, action, value): 56 | """ 57 | Sets the Qvalue for [state,action] to the given value 58 | """ 59 | self._qValues[state][action] = value 60 | 61 | def getValue(self, state): 62 | """ 63 | Returns max_action Q(state,action) 64 | where the max is over legal actions. 
65 | """ 66 | 67 | possibleActions = self.getLegalActions(state) 68 | # If there are no legal actions, return 0.0 69 | if len(possibleActions) == 0: 70 | return 0.0 71 | 72 | return max([self.getQValue(state, a) for a in possibleActions]) 73 | 74 | def getPolicy(self, state): 75 | """ 76 | Compute the best action to take in a state. 77 | 78 | """ 79 | possibleActions = self.getLegalActions(state) 80 | 81 | # If there are no legal actions, return None 82 | if len(possibleActions) == 0: 83 | return None 84 | 85 | best_action = possibleActions[ 86 | np.argmax([self.getQValue(state, a) for a in possibleActions])] 87 | return best_action 88 | 89 | def getAction(self, state): 90 | """ 91 | Compute the action to take in the current state, including exploration. 92 | 93 | With probability self.epsilon, we should take a random action. 94 | otherwise - the best policy action (self.getPolicy). 95 | 96 | """ 97 | 98 | # Pick Action 99 | possibleActions = self.getLegalActions(state) 100 | action = None 101 | 102 | # If there are no legal actions, return None 103 | if len(possibleActions) == 0: 104 | return None 105 | 106 | # agent parameters: 107 | epsilon = self.epsilon 108 | 109 | if np.random.random() <= epsilon: 110 | action = random.choice(possibleActions) 111 | else: 112 | action = self.getPolicy(state) 113 | return action 114 | 115 | def update(self, state, action, nextState, reward): 116 | """ 117 | You should do your Q-Value update here 118 | """ 119 | # agent parameters 120 | gamma = self.discount 121 | learning_rate = self.alpha 122 | 123 | reference_qvalue = reward + gamma * self.getValue(nextState) 124 | updated_qvalue = (1 - learning_rate) * self.getQValue(state, action) + \ 125 | learning_rate * reference_qvalue 126 | self.setQValue(state, action, updated_qvalue) 127 | -------------------------------------------------------------------------------- /TD/run.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | 3 | matplotlib.use('Agg') 4 | import matplotlib.pyplot as plt # noqa: E402 5 | import matplotlib.cm as cm 6 | 7 | plt.style.use("ggplot") 8 | import seaborn as sns # noqa: E402 9 | 10 | sns.set(color_codes=True) 11 | 12 | import numpy as np 13 | import argparse 14 | import gym 15 | from gym.core import ObservationWrapper 16 | import os 17 | import pickle 18 | from tqdm import trange 19 | 20 | from qlearning import QLearningAgent 21 | from sarsa import SarsaAgent 22 | from evsarsa import EVSarsaAgent 23 | 24 | 25 | def plot_unimetric(history, metric, save_dir): 26 | plt.figure() 27 | plt.plot(history[metric]) 28 | plt.title('model {}'.format(metric)) 29 | plt.ylabel(metric) 30 | plt.xlabel('epoch') 31 | plt.savefig("{}/{}.png".format(save_dir, metric), 32 | format='png', dpi=300) 33 | 34 | 35 | def save_stats(stats, save_dir="./"): 36 | for key in stats: 37 | plot_unimetric(stats, key, save_dir) 38 | 39 | 40 | class Binarizer(ObservationWrapper): 41 | def __init__(self, env, bins=None): 42 | super().__init__(env) 43 | self.n_bins = (bins or [10] * env.action_space.n) 44 | 45 | def _state_encoder(self, i, s_i): 46 | return int(self.n_bins[i] * s_i) 47 | 48 | def _observation(self, state): 49 | state = map(lambda x: self._state_encoder(x[0], x[1]), enumerate(state)) 50 | 51 | return tuple(state) 52 | 53 | 54 | def play_and_train_qlearning(env, agent, t_max=10 ** 3): 55 | total_reward = 0.0 56 | s = env.reset() 57 | 58 | for t in range(t_max): 59 | a = agent.getAction(s) 60 | 61 | next_s, r, done, _ = env.step(a) 62 | 63 | 
agent.update(s, a, next_s, r) 64 | 65 | s = next_s 66 | total_reward += r 67 | if done: 68 | break 69 | 70 | return total_reward 71 | 72 | 73 | def play_and_train_sarsa(env, agent, t_max=10 ** 3): 74 | total_reward = 0.0 75 | s = env.reset() 76 | 77 | for t in range(t_max): 78 | a = agent.getAction(s) 79 | 80 | next_s, r, done, _ = env.step(a) 81 | 82 | agent.update(s, a, next_s, agent.getAction(next_s), r) 83 | 84 | s = next_s 85 | total_reward += r 86 | if done: 87 | break 88 | 89 | return total_reward 90 | 91 | 92 | def play_and_train_evsarsa(env, agent, t_max=10 ** 3): 93 | total_reward = 0.0 94 | s = env.reset() 95 | 96 | for t in range(t_max): 97 | a = agent.getAction(s) 98 | 99 | next_s, r, done, _ = env.step(a) 100 | 101 | agent.update(s, a, next_s, r) 102 | 103 | s = next_s 104 | total_reward += r 105 | if done: 106 | break 107 | 108 | return total_reward 109 | 110 | 111 | def agent_runner( 112 | env, agent_fn, agent_play_fn, 113 | n_epochs=int(2e5), alpha=0.05, discount=0.99, 114 | initial_epsilon=0.25, final_epsilon=0.01): 115 | n_actions = env.action_space.n 116 | 117 | agent = agent_fn( 118 | alpha=alpha, epsilon=initial_epsilon, discount=discount, 119 | getLegalActions=lambda s: range(n_actions)) 120 | 121 | n_epochs_decay = n_epochs * 0.8 122 | 123 | tr = trange( 124 | n_epochs, 125 | desc="", 126 | leave=True) 127 | 128 | rewards = np.zeros(n_epochs) 129 | eps = np.zeros(n_epochs) 130 | epoch_rewards = np.zeros(n_epochs // 1000) 131 | agent.epsilon = initial_epsilon 132 | for i in tr: 133 | rewards[i] = agent_play_fn(env, agent) 134 | eps[i] = agent.epsilon 135 | 136 | if i < n_epochs_decay: 137 | agent.epsilon -= (initial_epsilon - final_epsilon) / float(n_epochs_decay) 138 | 139 | if i % 1000 == 0: 140 | epoch_rewards[i // 1000] = np.mean(rewards[i - 1000:i]) 141 | desc = "reward: {}\tepsilon: {}".format(epoch_rewards[i // 1000], agent.epsilon) 142 | tr.set_description(desc) 143 | 144 | return { 145 | "reward": rewards, 146 | "epoch_reward": epoch_rewards, 147 | "epsilon": eps 148 | } 149 | 150 | 151 | AGENTS = { 152 | "qlearning": QLearningAgent, 153 | "sarsa": SarsaAgent, 154 | "evsarsa": EVSarsaAgent, 155 | } 156 | 157 | AGENTS_FN = { 158 | "qlearning": play_and_train_qlearning, 159 | "sarsa": play_and_train_sarsa, 160 | "evsarsa": play_and_train_evsarsa, 161 | } 162 | 163 | 164 | def run(env, agent, bins=None, 165 | lr=0.05, discount_factor=0.99, n_steps=1, initial_epsilon=0.25, 166 | n_epochs=1000, t_max=1000, 167 | plot_stats=False, api_key=None): 168 | env_name = env 169 | env = Binarizer(gym.make(env).env, bins=bins) 170 | agent = AGENTS[agent] 171 | agent_fn = lambda env, agent: AGENTS_FN[agent](env, agent, t_max=t_max) 172 | 173 | history = agent_runner( 174 | env, agent, agent_fn, 175 | n_epochs, lr, discount_factor, 176 | initial_epsilon) 177 | 178 | if plot_stats: 179 | save_stats(history) 180 | 181 | if api_key is not None: 182 | env = gym.wrappers.Monitor(env, "/tmp/" + env_name, force=True) 183 | for _ in range(200): 184 | agent_fn(env, agent) 185 | env.close() 186 | gym.upload("/tmp/" + env_name, api_key=api_key) 187 | 188 | 189 | def _parse_args(): 190 | parser = argparse.ArgumentParser(description='Policy iteration example') 191 | parser.add_argument( 192 | '--env', 193 | type=str, 194 | default='CartPole-v0', # CartPole-v0, MountainCar-v0 195 | help='The environment to use') 196 | parser.add_argument( 197 | '--agent', 198 | type=str, 199 | default='qlearning', # qlearning, sarsa, evsarsa 200 | help='The agent to use') 201 | parser.add_argument( 202 | 
'--n_epochs', 203 | type=int, 204 | default=1000) 205 | parser.add_argument( 206 | '--t_max', 207 | type=int, 208 | default=1000) 209 | parser.add_argument( 210 | '--lr', 211 | type=float, 212 | default=0.05, 213 | help='Agent learning rate') 214 | parser.add_argument( 215 | '--initial_epsilon', 216 | type=float, 217 | default=0.99, 218 | help='Agent start exploration factor') 219 | parser.add_argument( 220 | '--gamma', 221 | type=float, 222 | default=0.99, 223 | help='Gamma discount factor') 224 | parser.add_argument( 225 | '--plot_stats', 226 | action='store_true', 227 | default=False) 228 | parser.add_argument( 229 | '--api_key', 230 | type=str, 231 | default=None) 232 | parser.add_argument( 233 | '--n_steps', 234 | type=int, 235 | default=1) 236 | parser.add_argument( 237 | '--bins', 238 | type=str, 239 | default=None) 240 | 241 | args, _ = parser.parse_known_args() 242 | return args 243 | 244 | 245 | def main(): 246 | args = _parse_args() 247 | try: 248 | bins = tuple(map(int, args.bins.split("-"))) 249 | except: 250 | bins = None 251 | run(args.env, args.agent, bins, 252 | args.lr, args.gamma, args.n_steps, args.initial_epsilon, 253 | args.n_epochs, args.t_max, 254 | args.plot_stats, args.api_key) 255 | 256 | 257 | if __name__ == '__main__': 258 | main() 259 | -------------------------------------------------------------------------------- /TD/sarsa.py: -------------------------------------------------------------------------------- 1 | """ 2 | SARSA Agent 3 | This file builds upon the same functions as Q-learning agent (qlearning.py). 4 | 5 | Here's usage example: 6 | from sarsa import SarsaAgent 7 | 8 | agent = SarsaAgent( 9 | alpha=0.1,epsilon=0.25,discount=0.99, 10 | getLegalActions = lambda s: actions_from_that_state) 11 | action = agent.getAction(state) 12 | agent.update(state, action, next_state, reward) 13 | agent.epsilon *= 0.99 14 | """ 15 | import random 16 | 17 | import numpy as np 18 | from collections import defaultdict 19 | 20 | 21 | class SarsaAgent(object): 22 | """ 23 | Classical SARSA agent. 24 | 25 | The two main methods are 26 | - self.getAction(state) - returns agent's action in that state 27 | - self.update(state,action,reward,nextState,nextAction) - returns agent's next action 28 | 29 | Instance variables you have access to 30 | - self.epsilon (exploration prob) 31 | - self.alpha (learning rate) 32 | - self.discount (discount rate aka gamma) 33 | 34 | """ 35 | 36 | def __init__(self, alpha, epsilon, discount, getLegalActions): 37 | "We initialize agent and Q-values here." 38 | self.getLegalActions = getLegalActions 39 | self._qValues = defaultdict(lambda: defaultdict(lambda: 0)) 40 | self.alpha = alpha 41 | self.epsilon = epsilon 42 | self.discount = discount 43 | 44 | def getQValue(self, state, action): 45 | """ 46 | Returns Q(state,action) 47 | """ 48 | return self._qValues[state][action] 49 | 50 | def setQValue(self, state, action, value): 51 | """ 52 | Sets the Qvalue for [state,action] to the given value 53 | """ 54 | self._qValues[state][action] = value 55 | 56 | def getPolicy(self, state): 57 | """ 58 | Compute the best action to take in a state. 
59 | """ 60 | possibleActions = self.getLegalActions(state) 61 | 62 | # If there are no legal actions, return None 63 | if len(possibleActions) == 0: 64 | return None 65 | 66 | "*** this code works exactly as Q-learning ***" 67 | best_action = possibleActions[ 68 | np.argmax([self.getQValue(state, a) for a in possibleActions])] 69 | return best_action 70 | 71 | def getAction(self, state): 72 | """ 73 | Compute the action to take in the current state, including exploration. 74 | """ 75 | 76 | # Pick Action 77 | possibleActions = self.getLegalActions(state) 78 | action = None 79 | 80 | # If there are no legal actions, return None 81 | if len(possibleActions) == 0: 82 | return None 83 | 84 | # agent parameters: 85 | epsilon = self.epsilon 86 | 87 | "*** Epsilon-greedy strategy exactly as Q-learning ***" 88 | if np.random.random() <= epsilon: 89 | action = random.choice(possibleActions) 90 | else: 91 | action = self.getPolicy(state) 92 | return action 93 | 94 | def update(self, state, action, nextState, nextAction, reward): 95 | """ 96 | You should do your Q-Value update here 97 | """ 98 | # agent parameters 99 | gamma = self.discount 100 | learning_rate = self.alpha 101 | 102 | "*** YOUR CODE HERE ***" 103 | reference_qvalue = reward + gamma * self.getQValue(nextState, nextAction) 104 | 105 | updated_qvalue = (1 - learning_rate) * self.getQValue(state, action) + \ 106 | learning_rate * reference_qvalue 107 | 108 | self.setQValue(state, action, updated_qvalue) 109 | -------------------------------------------------------------------------------- /agents/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Scitator/rl-course-experiments/47e7666e2bc618ca4007372f411c3626fd7efc2b/agents/README.md -------------------------------------------------------------------------------- /agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Scitator/rl-course-experiments/47e7666e2bc618ca4007372f411c3626fd7efc2b/agents/__init__.py -------------------------------------------------------------------------------- /agents/agent_networks.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib import rnn 4 | 5 | 6 | class FeatureNet(object): 7 | def __init__(self, state_shape, network, special=None): 8 | self.special = special or {} 9 | self.state_shape = state_shape 10 | 11 | self.states = tf.placeholder(shape=(None,) + state_shape, dtype=tf.float32, name="states") 12 | self.is_training = tf.placeholder(dtype=tf.bool, name="is_training") 13 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 14 | 15 | self.loss = None 16 | self.optimizer = None 17 | self.train_op = None 18 | 19 | self.relative_scope = self.special.get("scope", "feature_network") 20 | self.scope = tf.get_variable_scope().name + "/" + self.relative_scope 21 | 22 | self.feature_state = network( 23 | self.states, 24 | scope=self.relative_scope + "/feature", 25 | reuse=self.special.get("reuse_feature", False), 26 | is_training=self.is_training) 27 | 28 | 29 | class PolicyNet(object): 30 | def __init__(self, hidden_state, n_actions, special=None): 31 | self.special = special or {} 32 | self.n_actions = n_actions 33 | 34 | self.actions = tf.placeholder(shape=[None], dtype=tf.int32, name="actions") 35 | self.cumulative_rewards = tf.placeholder(shape=[None], dtype=tf.float32, 
name="rewards") 36 | self.is_training = tf.placeholder(dtype=tf.bool, name="is_training") 37 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 38 | 39 | self.optimizer = None 40 | self.train_op = None 41 | 42 | self.relative_scope = self.special.get("scope", "policy_network") 43 | self.scope = tf.get_variable_scope().name + "/" + self.relative_scope 44 | 45 | self.predicted_probs = self._probs( 46 | hidden_state, 47 | scope=self.relative_scope + "/probs", 48 | reuse=self.special.get("reuse_probs", False)) + 1e-8 49 | 50 | batch_size = tf.shape(self.actions)[0] 51 | predicted_ids = tf.range(batch_size) * tf.shape(self.predicted_probs)[1] + self.actions 52 | 53 | self.predicted_probs_for_actions = tf.gather( 54 | tf.reshape(self.predicted_probs, [-1]), predicted_ids) 55 | 56 | J = -tf.reduce_mean(tf.log(self.predicted_probs_for_actions) * self.cumulative_rewards) 57 | self.loss = J 58 | 59 | # a bit of regularization 60 | if self.special.get("entropy_loss", True): 61 | entropy = tf.reduce_mean( 62 | tf.reduce_sum( 63 | self.predicted_probs * tf.log(self.predicted_probs), 64 | axis=-1)) 65 | entropy *= self.special.get("entropy_factor", 0.01) 66 | self.loss += entropy 67 | 68 | def _probs(self, hidden_state, scope, reuse=False): 69 | with tf.variable_scope(scope, reuse=reuse): 70 | probs = tf.layers.dense( 71 | hidden_state, 72 | units=self.n_actions, 73 | activation=tf.nn.softmax) 74 | return probs 75 | 76 | 77 | class ValueNet(object): 78 | def __init__(self, hidden_state, special=None): 79 | self.special = special or {} 80 | 81 | self.td_target = tf.placeholder(shape=[None], dtype=tf.float32, name="td_target") 82 | self.is_training = tf.placeholder(dtype=tf.bool, name="is_training") 83 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 84 | 85 | self.optimizer = None 86 | self.train_op = None 87 | 88 | self.relative_scope = self.special.get("scope", "value_network") 89 | self.scope = tf.get_variable_scope().name + "/" + self.relative_scope 90 | 91 | self.predicted_values = self._state_value( 92 | hidden_state, 93 | scope=self.relative_scope + "/state_value", 94 | reuse=self.special.get("reuse_state_value", False)) 95 | 96 | self.predicted_values_for_actions = tf.squeeze(self.predicted_values, axis=1) 97 | 98 | self.loss = tf.losses.mean_squared_error( 99 | labels=self.td_target, 100 | predictions=self.predicted_values_for_actions) 101 | 102 | def _state_value(self, hidden_state, scope, reuse=False): 103 | with tf.variable_scope(scope, reuse=reuse): 104 | state_values = tf.layers.dense( 105 | hidden_state, 106 | units=1, 107 | activation=None) 108 | return state_values 109 | 110 | 111 | class QvalueNet(object): 112 | def __init__(self, hidden_state, n_actions, special=None): 113 | self.special = special or {} 114 | self.n_actions = n_actions 115 | 116 | self.actions = tf.placeholder(shape=[None], dtype=tf.int32, name="actions") 117 | self.td_target = tf.placeholder(shape=[None], dtype=tf.float32, name="td_target") 118 | self.is_training = tf.placeholder(dtype=tf.bool, name="is_training") 119 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 120 | 121 | self.optimizer = None 122 | self.train_op = None 123 | 124 | self.relative_scope = self.special.get("scope", "qvalue_network") 125 | self.scope = tf.get_variable_scope().name + "/" + self.relative_scope 126 | 127 | self.predicted_qvalues = self._qvalues( 128 | hidden_state, 129 | scope=self.relative_scope + "/qvalue", 130 | reuse=self.special.get("reuse_state_value", False)) 
131 | 132 | batch_size = tf.shape(self.actions)[0] 133 | predicted_ids = tf.range(batch_size) * tf.shape(self.predicted_qvalues)[1] + self.actions 134 | 135 | self.predicted_qvalues_for_actions = tf.gather( 136 | tf.reshape(self.predicted_qvalues, [-1]), predicted_ids) 137 | 138 | self.loss = tf.losses.mean_squared_error( 139 | labels=self.td_target, 140 | predictions=self.predicted_qvalues_for_actions) 141 | 142 | def _qvalues(self, hidden_state, scope, reuse=False): 143 | with tf.variable_scope(scope, reuse=reuse): 144 | qvalues = tf.layers.dense( 145 | hidden_state, 146 | units=self.n_actions, 147 | activation=None) 148 | if self.special.get("advantage", False): 149 | qvalues -= tf.reduce_mean(qvalues, axis=-1, keep_dims=True) 150 | return qvalues 151 | 152 | 153 | def copy_scope_parameters(sess, net1_scope, net2_scope): 154 | net1_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=net1_scope) 155 | net2_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=net2_scope) 156 | net1_params = sorted(net1_params, key=lambda v: v.name) 157 | net2_params = sorted(net2_params, key=lambda v: v.name) 158 | 159 | update_ops = [] 160 | for net1_v, net2_v in zip(net1_params, net2_params): 161 | op = net2_v.assign(net1_v) 162 | update_ops.append(op) 163 | 164 | sess.run(update_ops) 165 | 166 | 167 | def copy_model_parameters(sess, net1, net2): 168 | """ 169 | Copies the model parameters of one net to another. 170 | 171 | Args: 172 | sess: Tensorflow session instance 173 | net1: net to copy the parameters from 174 | net2: net to copy the parameters to 175 | """ 176 | 177 | copy_scope_parameters(sess, net1.scope, net2.scope) 178 | -------------------------------------------------------------------------------- /agents/agent_states.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib import rnn 3 | 4 | # 5 | # def get_state_variables(batch_size, cell): 6 | # zero_states = cell.zero_state(1, tf.float32) 7 | # if isinstance(zero_states, list): 8 | # state_variables = [] 9 | # for i, (state_c, state_h) in enumerate(zero_states): 10 | # init_state_c = tf.get_variable( 11 | # name="initial_state_vector_c:{}".format(i), 12 | # dtype=tf.float32, 13 | # initializer=state_c, 14 | # trainable=False) 15 | # init_state_h = tf.get_variable( 16 | # name="initial_state_vector_h:{}".format(i), 17 | # dtype=tf.float32, 18 | # initializer=state_h, 19 | # trainable=False) 20 | # init_state_c = tf.tile(init_state_c, [batch_size, 1]) 21 | # init_state_h = tf.tile(init_state_h, [batch_size, 1]) 22 | # state_variables.append( 23 | # rnn.LSTMStateTuple( 24 | # init_state_c, 25 | # init_state_h)) 26 | # # Return as a tuple, so that it can be fed to dynamic_rnn as an initial state 27 | # return tuple(state_variables) 28 | # elif isinstance(zero_states, tuple): 29 | # state_c, state_h = zero_states 30 | # init_state_c = tf.get_variable( 31 | # name="initial_state_vector_c", 32 | # dtype=tf.float32, 33 | # initializer=state_c, 34 | # trainable=False) 35 | # init_state_h = tf.get_variable( 36 | # name="initial_state_vector_h", 37 | # dtype=tf.float32, 38 | # initializer=state_h, 39 | # trainable=False) 40 | # import pdb; pdb.set_trace() 41 | # init_state_c = tf.tile(init_state_c, [batch_size, 1]) 42 | # init_state_h = tf.tile(init_state_h, [batch_size, 1]) 43 | # return rnn.LSTMStateTuple(init_state_c, init_state_h) 44 | # 45 | # 46 | # def get_state_update_op(state_variables, new_states, mask=None): 47 | # # Add an 
operation to update the train states with the last state tensors 48 | # update_ops = [] 49 | # for state_variable, new_state in zip(state_variables, new_states): 50 | # # Assign the new state to the state variables on this layer 51 | # if mask is None: 52 | # # @TODO: error here, tiled Tensor has no assign 53 | # update_ops.extend([ 54 | # state_variable[0].assign(new_state[0]), 55 | # state_variable[1].assign(new_state[1])]) 56 | # else: 57 | # update_ops.extend([ 58 | # state_variable[0].assign( 59 | # tf.where(mask, tf.zeros_like(new_state[0]), new_state[0])), 60 | # state_variable[1].assign( 61 | # tf.where(mask, tf.zeros_like(new_state[1]), new_state[1]))]) 62 | # # Return a tuple in order to combine all update_ops into a single operation. 63 | # # The tuple's actual value should not be used. 64 | # return tf.tuple(update_ops) 65 | # 66 | # 67 | # @TODO: rewrite for any cell, and refactor it 68 | # @TODO: Not working without known batch_size, so static! 69 | # class RecurrentHiddenState(object): 70 | # def __init__(self, feature_state, size=512, activation=tf.tanh): 71 | # self.is_end = tf.placeholder(shape=[None], dtype=tf.bool, name="is_end") 72 | # 73 | # self.is_training = tf.placeholder(dtype=tf.bool, name="is_training") 74 | # self.global_step = tf.Variable(0, name='global_step', trainable=False) 75 | # 76 | # self.loss = None 77 | # self.optimizer = None 78 | # self.train_op = None 79 | # 80 | # self.relative_scope = "hidden_state" 81 | # self.scope = tf.get_variable_scope().name + "/" + self.relative_scope 82 | # batch_size = tf.unstack(tf.shape(feature_state))[0] 83 | # 84 | # with tf.variable_scope(self.relative_scope): 85 | # self.cell = rnn.LSTMCell(size, activation=activation) 86 | # self.belief_state = get_state_variables(batch_size, self.cell) 87 | # # very bad dark magic, need to refactor all of this 88 | # # supports only ine layer cell 89 | # self.belief_out = tf.placeholder( 90 | # tf.float32, [2, None, self.cell.output_size]) 91 | # l = tf.unstack(self.belief_out, axis=0) 92 | # rnn_tuple_state = rnn.LSTMStateTuple(l[0], l[1]) 93 | # import pdb; pdb.set_trace() 94 | # self.belief_assign = get_state_update_op([self.belief_state], [rnn_tuple_state]) 95 | # 96 | # logits, rnn_states = tf.nn.dynamic_rnn( 97 | # self.cell, tf.expand_dims(feature_state, 1), 98 | # sequence_length=[1] * batch_size, initial_state=self.belief_state) 99 | # 100 | # self.state = tf.squeeze(logits, 1) 101 | # 102 | # # @TODO: very hacky 2 103 | # self.belief_update = get_state_update_op( 104 | # [self.belief_state], [rnn_states], self.is_end) 105 | 106 | 107 | def get_state_variables(batch_size, cell): 108 | zero_states = cell.zero_state(batch_size, tf.float32) 109 | if isinstance(zero_states, list): 110 | state_variables = [] 111 | for i, (state_c, state_h) in enumerate(zero_states): 112 | init_state_c = tf.get_variable( 113 | name="initial_state_vector_c:{}".format(i), 114 | dtype=tf.float32, 115 | initializer=state_c, 116 | trainable=False) 117 | init_state_h = tf.get_variable( 118 | name="initial_state_vector_h:{}".format(i), 119 | dtype=tf.float32, 120 | initializer=state_h, 121 | trainable=False) 122 | state_variables.append( 123 | rnn.LSTMStateTuple( 124 | init_state_c, 125 | init_state_h)) 126 | # Return as a tuple, so that it can be fed to dynamic_rnn as an initial state 127 | return tuple(state_variables) 128 | elif isinstance(zero_states, tuple): 129 | state_c, state_h = zero_states 130 | init_state_c = tf.get_variable( 131 | name="initial_state_vector_c", 132 | dtype=tf.float32, 
133 | initializer=state_c, 134 | trainable=False) 135 | init_state_h = tf.get_variable( 136 | name="initial_state_vector_h", 137 | dtype=tf.float32, 138 | initializer=state_h, 139 | trainable=False) 140 | return rnn.LSTMStateTuple(init_state_c, init_state_h) 141 | 142 | 143 | def get_state_update_op(state_variables, new_states, mask=None): 144 | # Add an operation to update the train states with the last state tensors 145 | update_ops = [] 146 | for state_variable, new_state in zip(state_variables, new_states): 147 | # Assign the new state to the state variables on this layer 148 | if mask is None: 149 | update_ops.extend([ 150 | state_variable[0].assign(new_state[0]), 151 | state_variable[1].assign(new_state[1])]) 152 | else: 153 | update_ops.extend([ 154 | state_variable[0].assign( 155 | tf.where(mask, tf.zeros_like(new_state[0]), new_state[0])), 156 | state_variable[1].assign( 157 | tf.where(mask, tf.zeros_like(new_state[1]), new_state[1]))]) 158 | # Return a tuple in order to combine all update_ops into a single operation. 159 | # The tuple's actual value should not be used. 160 | return tf.tuple(update_ops) 161 | 162 | 163 | class LinearHiddenState(object): 164 | def __init__(self, feature_state, size=512, activation=None): 165 | 166 | self.is_training = tf.placeholder(dtype=tf.bool, name="is_training") 167 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 168 | 169 | self.loss = None 170 | self.optimizer = None 171 | self.train_op = None 172 | 173 | self.relative_scope = "hidden_state" 174 | self.scope = tf.get_variable_scope().name + "/" + self.relative_scope 175 | 176 | with tf.variable_scope(self.relative_scope): 177 | self.state = tf.layers.dense( 178 | feature_state, 179 | size, 180 | activation=activation) 181 | 182 | 183 | class RecurrentHiddenState(object): 184 | def __init__(self, feature_state, size=512, activation=tf.tanh, batch_size=1): 185 | self.is_end = tf.placeholder(shape=[None], dtype=tf.bool, name="is_end") 186 | 187 | self.is_training = tf.placeholder(dtype=tf.bool, name="is_training") 188 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 189 | 190 | self.loss = None 191 | self.optimizer = None 192 | self.train_op = None 193 | 194 | self.relative_scope = "hidden_state" 195 | self.scope = tf.get_variable_scope().name + "/" + self.relative_scope 196 | 197 | with tf.variable_scope(self.relative_scope): 198 | self.cell = rnn.LSTMCell(size, activation=activation) 199 | self.belief_state = get_state_variables(batch_size, self.cell) 200 | # very bad dark magic, need to refactor all of this 201 | # supports only ine layer cell 202 | self.belief_out = tf.placeholder( 203 | tf.float32, [2, batch_size, self.cell.output_size]) 204 | l = tf.unstack(self.belief_out, axis=0) 205 | rnn_tuple_state = rnn.LSTMStateTuple(l[0], l[1]) 206 | self.belief_assign = get_state_update_op([self.belief_state], [rnn_tuple_state]) 207 | 208 | logits, rnn_states = tf.nn.dynamic_rnn( 209 | self.cell, tf.expand_dims(feature_state, 1), 210 | sequence_length=[1] * batch_size, initial_state=self.belief_state) 211 | 212 | self.state = tf.squeeze(logits, 1) 213 | 214 | # @TODO: very hacky 2 215 | self.belief_update = get_state_update_op([self.belief_state], [rnn_states], self.is_end) -------------------------------------------------------------------------------- /common/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Scitator/rl-course-experiments/47e7666e2bc618ca4007372f411c3626fd7efc2b/common/__init__.py -------------------------------------------------------------------------------- /common/buffer.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | 5 | from common.segment_tree import SumSegmentTree, MinSegmentTree 6 | 7 | 8 | class ReplayBuffer(object): 9 | def __init__(self, size): 10 | """Create Prioritized Replay buffer. 11 | 12 | Parameters 13 | ---------- 14 | size: int 15 | Max number of transitions to store in the buffer. When the buffer 16 | overflows the old memories are dropped. 17 | """ 18 | self._storage = [] 19 | self._maxsize = size 20 | self._next_idx = 0 21 | 22 | def __len__(self): 23 | return len(self._storage) 24 | 25 | def add(self, obs_t, action, reward, obs_tp1, done): 26 | data = (obs_t, action, reward, obs_tp1, done) 27 | 28 | if self._next_idx >= len(self._storage): 29 | self._storage.append(data) 30 | else: 31 | self._storage[self._next_idx] = data 32 | self._next_idx = (self._next_idx + 1) % self._maxsize 33 | 34 | def _encode_sample(self, idxes): 35 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] 36 | for i in idxes: 37 | data = self._storage[i] 38 | obs_t, action, reward, obs_tp1, done = data 39 | obses_t.append(np.array(obs_t, copy=False)) 40 | actions.append(np.array(action, copy=False)) 41 | rewards.append(reward) 42 | obses_tp1.append(np.array(obs_tp1, copy=False)) 43 | dones.append(done) 44 | return np.array(obses_t), \ 45 | np.array(actions), \ 46 | np.array(rewards), \ 47 | np.array(obses_tp1), \ 48 | np.array(dones) 49 | 50 | def sample(self, batch_size): 51 | """Sample a batch of experiences. 52 | 53 | Parameters 54 | ---------- 55 | batch_size: int 56 | How many transitions to sample. 57 | 58 | Returns 59 | ------- 60 | obs_batch: np.array 61 | batch of observations 62 | act_batch: np.array 63 | batch of actions executed given obs_batch 64 | rew_batch: np.array 65 | rewards received as results of executing act_batch 66 | next_obs_batch: np.array 67 | next set of observations seen after executing act_batch 68 | done_mask: np.array 69 | done_mask[i] = 1 if executing act_batch[i] resulted in 70 | the end of an episode and 0 otherwise. 71 | """ 72 | idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] 73 | return self._encode_sample(idxes) 74 | 75 | 76 | class PrioritizedReplayBuffer(ReplayBuffer): 77 | def __init__(self, size, alpha=0.5): 78 | """Create Prioritized Replay buffer. 79 | 80 | Parameters 81 | ---------- 82 | size: int 83 | Max number of transitions to store in the buffer. When the buffer 84 | overflows the old memories are dropped. 
85 | alpha: float 86 | how much prioritization is used 87 | (0 - no prioritization, 1 - full prioritization) 88 | 89 | See Also 90 | -------- 91 | ReplayBuffer.__init__ 92 | """ 93 | super(PrioritizedReplayBuffer, self).__init__(size) 94 | assert alpha > 0 95 | self._alpha = alpha 96 | 97 | it_capacity = 1 98 | while it_capacity < size: 99 | it_capacity *= 2 100 | 101 | self._it_sum = SumSegmentTree(it_capacity) 102 | self._it_min = MinSegmentTree(it_capacity) 103 | self._max_priority = 1.0 104 | 105 | def add(self, *args, **kwargs): 106 | """See ReplayBuffer.store_effect""" 107 | idx = self._next_idx 108 | super().add(*args, **kwargs) 109 | self._it_sum[idx] = self._max_priority ** self._alpha 110 | self._it_min[idx] = self._max_priority ** self._alpha 111 | 112 | def _sample_proportional(self, batch_size): 113 | res = [] 114 | for _ in range(batch_size): 115 | # TODO(szymon): should we ensure no repeats? 116 | mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1) 117 | idx = self._it_sum.find_prefixsum_idx(mass) 118 | res.append(idx) 119 | return res 120 | 121 | def sample(self, batch_size, beta=0.5): 122 | """Sample a batch of experiences. 123 | 124 | compared to ReplayBuffer.sample 125 | it also returns importance weights and idxes 126 | of sampled experiences. 127 | 128 | 129 | Parameters 130 | ---------- 131 | batch_size: int 132 | How many transitions to sample. 133 | beta: float 134 | To what degree to use importance weights 135 | (0 - no corrections, 1 - full correction) 136 | 137 | Returns 138 | ------- 139 | obs_batch: np.array 140 | batch of observations 141 | act_batch: np.array 142 | batch of actions executed given obs_batch 143 | rew_batch: np.array 144 | rewards received as results of executing act_batch 145 | next_obs_batch: np.array 146 | next set of observations seen after executing act_batch 147 | done_mask: np.array 148 | done_mask[i] = 1 if executing act_batch[i] resulted in 149 | the end of an episode and 0 otherwise. 150 | weights: np.array 151 | Array of shape (batch_size,) and dtype np.float32 152 | denoting importance weight of each sampled transition 153 | idxes: np.array 154 | Array of shape (batch_size,) and dtype np.int32 155 | idexes in buffer of sampled experiences 156 | """ 157 | assert beta > 0 158 | 159 | idxes = self._sample_proportional(batch_size) 160 | 161 | weights = [] 162 | p_min = self._it_min.min() / self._it_sum.sum() 163 | max_weight = (p_min * len(self._storage)) ** (-beta) 164 | 165 | for idx in idxes: 166 | p_sample = self._it_sum[idx] / self._it_sum.sum() 167 | weight = (p_sample * len(self._storage)) ** (-beta) 168 | weights.append(weight / max_weight) 169 | weights = np.array(weights) 170 | encoded_sample = self._encode_sample(idxes) 171 | return tuple(list(encoded_sample) + [weights, idxes]) 172 | 173 | def update_priorities(self, idxes, priorities): 174 | """Update priorities of sampled transitions. 175 | 176 | sets priority of transition at index idxes[i] in buffer 177 | to priorities[i]. 178 | 179 | Parameters 180 | ---------- 181 | idxes: [int] 182 | List of idxes of sampled transitions 183 | priorities: [float] 184 | List of updated priorities corresponding to 185 | transitions at the sampled idxes denoted by 186 | variable `idxes`. 
187 | """ 188 | assert len(idxes) == len(priorities) 189 | for idx, priority in zip(idxes, priorities): 190 | assert priority > 0 191 | assert 0 <= idx < len(self._storage) 192 | self._it_sum[idx] = priority ** self._alpha 193 | self._it_min[idx] = priority ** self._alpha 194 | 195 | self._max_priority = max(self._max_priority, priority) 196 | 197 | buffers = { 198 | "none": None, 199 | "simple": ReplayBuffer, 200 | "prioritized": PrioritizedReplayBuffer 201 | } -------------------------------------------------------------------------------- /common/networks.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | activations = { 5 | "sigmoid": tf.sigmoid, 6 | "tanh": tf.tanh, 7 | "relu": tf.nn.relu, 8 | "relu6": tf.nn.relu6, 9 | "elu": tf.nn.elu, 10 | "softplus": tf.nn.softplus 11 | } 12 | 13 | 14 | def linear_network( 15 | states, is_training=False, scope=None, reuse=False, 16 | layers=None, activation_fn=tf.nn.elu, use_bn=False, dropout=-1): 17 | layers = layers or [16, 16] 18 | x = states 19 | with tf.variable_scope(scope or "linear_network", reuse=reuse): 20 | for n_out in layers: 21 | x = tf.layers.dense(x, n_out, activation=None) 22 | if use_bn: 23 | x = tf.layers.batch_normalization(x, training=is_training) 24 | x = activation_fn(x) 25 | if dropout > 0: 26 | x = tf.layers.dropout(x, rate=dropout, training=is_training) 27 | return x 28 | 29 | 30 | def convolution_network( 31 | states, is_training=False, scope=None, reuse=False, 32 | n_filters=None, kernels=None, strides=None, 33 | activation_fn=tf.nn.elu, use_bn=False, dropout=-1): 34 | n_filters = n_filters or [32, 64, 64] 35 | kernels = kernels or [8, 4, 4] 36 | strides = strides or [4, 2, 1] 37 | x = states 38 | with tf.variable_scope(scope or "convolution_network", reuse=reuse): 39 | for n_filter, kernel, stride in zip(n_filters, kernels, strides): 40 | x = tf.layers.conv2d(x, n_filter, kernel, stride, activation=None) 41 | if use_bn: 42 | x = tf.layers.batch_normalization(x, training=is_training) 43 | x = activation_fn(x) 44 | if dropout > 0: 45 | x = tf.layers.dropout(x, rate=dropout, training=is_training) 46 | x = tf.contrib.layers.flatten(x) 47 | return x 48 | 49 | 50 | networks = { 51 | "linear": linear_network, 52 | "convolution": convolution_network 53 | } 54 | 55 | 56 | def network_wrapper(network, params): 57 | def wrapper(states, is_training=False, scope=None, reuse=False,): 58 | return network(states, is_training, scope, reuse, **params) 59 | return wrapper 60 | -------------------------------------------------------------------------------- /common/schedules.py: -------------------------------------------------------------------------------- 1 | """This file is used for specifying various schedules that evolve over 2 | time throughout the execution of the algorithm, such as: 3 | - learning rate for the optimizer 4 | - exploration epsilon for the epsilon greedy exploration strategy 5 | - beta parameter for beta parameter in prioritized replay 6 | 7 | Each schedule has a function `value(t)` which returns the current value 8 | of the parameter given the timestep t of the optimization procedure. 9 | """ 10 | 11 | 12 | class Schedule(object): 13 | def value(self, t): 14 | """Value of the schedule at time t""" 15 | raise NotImplementedError() 16 | 17 | 18 | class ConstantSchedule(object): 19 | def __init__(self, value): 20 | """Value remains constant over time. 
21 | 22 | Parameters 23 | ---------- 24 | value: float 25 | Constant value of the schedule 26 | """ 27 | self._v = value 28 | 29 | def value(self, t): 30 | """See Schedule.value""" 31 | return self._v 32 | 33 | 34 | def linear_interpolation(l, r, alpha): 35 | return l + alpha * (r - l) 36 | 37 | 38 | class PiecewiseSchedule(object): 39 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 40 | """Piecewise schedule. 41 | 42 | endpoints: [(int, int)] 43 | list of pairs `(time, value)` meanining that schedule should output 44 | `value` when `t==time`. All the values for time must be sorted in 45 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 46 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 47 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 48 | time passed between `time_a` and `time_b` for time `t`. 49 | interpolation: lambda float, float, float: float 50 | a function that takes value to the left and to the right of t according 51 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 52 | right endpoint that t has covered. See linear_interpolation for example. 53 | outside_value: float 54 | if the value is requested outside of all the intervals sepecified in 55 | `endpoints` this value is returned. If None then AssertionError is 56 | raised when outside value is requested. 57 | """ 58 | idxes = [e[0] for e in endpoints] 59 | assert idxes == sorted(idxes) 60 | self._interpolation = interpolation 61 | self._outside_value = outside_value 62 | self._endpoints = endpoints 63 | 64 | def value(self, t): 65 | """See Schedule.value""" 66 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 67 | if l_t <= t < r_t: 68 | alpha = float(t - l_t) / (r_t - l_t) 69 | return self._interpolation(l, r, alpha) 70 | 71 | # t does not belong to any of the pieces, so doom. 72 | assert self._outside_value is not None 73 | return self._outside_value 74 | 75 | 76 | class LinearSchedule(object): 77 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 78 | """Linear interpolation between initial_p and final_p over 79 | schedule_timesteps. After this many timesteps pass final_p is 80 | returned. 81 | 82 | Parameters 83 | ---------- 84 | schedule_timesteps: int 85 | Number of timesteps for which to linearly anneal initial_p 86 | to final_p 87 | initial_p: float 88 | initial output value 89 | final_p: float 90 | final output value 91 | """ 92 | self.schedule_timesteps = schedule_timesteps 93 | self.final_p = final_p 94 | self.initial_p = initial_p 95 | 96 | def value(self, t): 97 | """See Schedule.value""" 98 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 99 | return self.initial_p + fraction * (self.final_p - self.initial_p) 100 | -------------------------------------------------------------------------------- /common/segment_tree.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | 4 | class SegmentTree(object): 5 | def __init__(self, capacity, operation, neutral_element): 6 | """Build a Segment Tree data structure. 7 | 8 | https://en.wikipedia.org/wiki/Segment_tree 9 | 10 | Can be used as regular array, but with two 11 | important differences: 12 | 13 | a) setting item's value is slightly slower. 14 | It is O(lg capacity) instead of O(1). 
15 | b) user has access to an efficient `reduce` 16 | operation which reduces `operation` over 17 | a contiguous subsequence of items in the 18 | array. 19 | 20 | Paramters 21 | --------- 22 | capacity: int 23 | Total size of the array - must be a power of two. 24 | operation: lambda obj, obj -> obj 25 | and operation for combining elements (eg. sum, max) 26 | must for a mathematical group together with the set of 27 | possible values for array elements. 28 | neutral_element: obj 29 | neutral element for the operation above. eg. float('-inf') 30 | for max and 0 for sum. 31 | """ 32 | assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2." 33 | self._capacity = capacity 34 | self._value = [neutral_element for _ in range(2 * capacity)] 35 | self._operation = operation 36 | 37 | def _reduce_helper(self, start, end, node, node_start, node_end): 38 | if start == node_start and end == node_end: 39 | return self._value[node] 40 | mid = (node_start + node_end) // 2 41 | if end <= mid: 42 | return self._reduce_helper(start, end, 2 * node, node_start, mid) 43 | else: 44 | if mid + 1 <= start: 45 | return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end) 46 | else: 47 | return self._operation( 48 | self._reduce_helper(start, mid, 2 * node, node_start, mid), 49 | self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) 50 | ) 51 | 52 | def reduce(self, start=0, end=None): 53 | """Returns result of applying `self.operation` 54 | to a contiguous subsequence of the array. 55 | 56 | self.operation(arr[start], operation(arr[start+1], operation(... arr[end]))) 57 | 58 | Parameters 59 | ---------- 60 | start: int 61 | beginning of the subsequence 62 | end: int 63 | end of the subsequences 64 | 65 | Returns 66 | ------- 67 | reduced: obj 68 | result of reducing self.operation over the specified range of array elements. 69 | """ 70 | if end is None: 71 | end = self._capacity 72 | if end < 0: 73 | end += self._capacity 74 | end -= 1 75 | return self._reduce_helper(start, end, 1, 0, self._capacity - 1) 76 | 77 | def __setitem__(self, idx, val): 78 | # index of the leaf 79 | idx += self._capacity 80 | self._value[idx] = val 81 | idx //= 2 82 | while idx >= 1: 83 | self._value[idx] = self._operation( 84 | self._value[2 * idx], 85 | self._value[2 * idx + 1] 86 | ) 87 | idx //= 2 88 | 89 | def __getitem__(self, idx): 90 | assert 0 <= idx < self._capacity 91 | return self._value[self._capacity + idx] 92 | 93 | 94 | class SumSegmentTree(SegmentTree): 95 | def __init__(self, capacity): 96 | super(SumSegmentTree, self).__init__( 97 | capacity=capacity, 98 | operation=operator.add, 99 | neutral_element=0.0 100 | ) 101 | 102 | def sum(self, start=0, end=None): 103 | """Returns arr[start] + ... + arr[end]""" 104 | return super(SumSegmentTree, self).reduce(start, end) 105 | 106 | def find_prefixsum_idx(self, prefixsum): 107 | """Find the highest index `i` in the array such that 108 | sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum 109 | 110 | if array values are probabilities, this function 111 | allows to sample indexes according to the discrete 112 | probability efficiently. 
113 | 114 | Parameters 115 | ---------- 116 | perfixsum: float 117 | upperbound on the sum of array prefix 118 | 119 | Returns 120 | ------- 121 | idx: int 122 | highest index satisfying the prefixsum constraint 123 | """ 124 | assert 0 <= prefixsum <= self.sum() + 1e-5 125 | idx = 1 126 | while idx < self._capacity: # while non-leaf 127 | if self._value[2 * idx] > prefixsum: 128 | idx = 2 * idx 129 | else: 130 | prefixsum -= self._value[2 * idx] 131 | idx = 2 * idx + 1 132 | return idx - self._capacity 133 | 134 | 135 | class MinSegmentTree(SegmentTree): 136 | def __init__(self, capacity): 137 | super(MinSegmentTree, self).__init__( 138 | capacity=capacity, 139 | operation=min, 140 | neutral_element=float('inf') 141 | ) 142 | 143 | def min(self, start=0, end=None): 144 | """Returns min(arr[start], ..., arr[end])""" 145 | 146 | return super(MinSegmentTree, self).reduce(start, end) 147 | -------------------------------------------------------------------------------- /jedi_upload.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Use your power, Luke! 5 | """ 6 | 7 | import gym 8 | import argparse 9 | import json 10 | from glob import glob 11 | 12 | 13 | def force_upload(monitor_dir, correct_name, api_key): 14 | f_name = glob("{}/*manifest.json".format(monitor_dir))[0] 15 | with open(f_name, "r") as fin: 16 | data = json.load(fin) 17 | data["env_info"]["env_id"] = correct_name 18 | with open(f_name, "w") as fout: 19 | json.dump(data, fout, ensure_ascii=False, indent=4) 20 | gym.upload(monitor_dir, api_key=api_key) 21 | 22 | 23 | def _parse_args(): 24 | parser = argparse.ArgumentParser(description='') 25 | parser.add_argument('--monitor_dir', 26 | type=str) 27 | parser.add_argument('--correct_name', 28 | type=str) 29 | parser.add_argument('--api_key', 30 | type=str) 31 | 32 | args, _ = parser.parse_known_args() 33 | return args 34 | 35 | 36 | def main(): 37 | args = _parse_args() 38 | force_upload(args.monitor_dir, args.correct_name, args.api_key) 39 | 40 | if __name__ == '__main__': 41 | main() 42 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scipy 2 | gym 3 | tensorflow==1.1.0 -------------------------------------------------------------------------------- /wrappers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Scitator/rl-course-experiments/47e7666e2bc618ca4007372f411c3626fd7efc2b/wrappers/__init__.py -------------------------------------------------------------------------------- /wrappers/gym_wrappers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.misc import imresize 3 | import gym 4 | from gym.core import ObservationWrapper, Wrapper 5 | from gym.spaces.box import Box 6 | from gym.wrappers import SkipWrapper, TimeLimit 7 | from copy import copy 8 | import collections 9 | 10 | try: 11 | import ppaquette_gym_doom 12 | from ppaquette_gym_doom.wrappers.action_space import ToDiscrete 13 | except ImportError: 14 | print("no doom envs") 15 | 16 | 17 | Transition = collections.namedtuple( 18 | "Transition", 19 | ["state", "action", "reward", "next_state", "done"]) 20 | 21 | 22 | class PreprocessImage(ObservationWrapper): 23 | def __init__(self, env, height=64, width=64, grayscale=True, crop=None): 24 | """ 25 | A gym wrapper 
that crops and scales the observation to the desired shape and optionally converts it to grayscale. 26 | """ 27 | super(PreprocessImage, self).__init__(env) 28 | self.img_size = (height, width) 29 | self.grayscale = grayscale 30 | no_crop = lambda img: img 31 | self.crop = crop or no_crop 32 | 33 | n_colors = 1 if self.grayscale else 3 34 | self.observation_space = Box(0.0, 1.0, [height, width, n_colors]) 35 | 36 | def _observation(self, img): 37 | """crops, resizes, optionally grayscales and rescales the observation to [0, 1]""" 38 | img = self.crop(img) 39 | img = imresize(img, self.img_size) 40 | if self.grayscale: 41 | img = img.mean(-1, keepdims=True) 42 | img = img.astype('float32') / 255. 43 | return img 44 | 45 | 46 | class FrameBuffer(Wrapper): 47 | def __init__(self, env, n_frames=4, reshape_fn=None): 48 | """A gym wrapper that returns the last n_frames observations as a single observation. 49 | Useful for games like Atari and Doom with screen as input.""" 50 | super(FrameBuffer, self).__init__(env) 51 | self.framebuffer = np.zeros([n_frames, ] + list(env.observation_space.shape)) 52 | 53 | # default reshape fn: move the frame axis last and merge it into the channel axis 54 | if reshape_fn is None: 55 | shape_dims = list(range(len(self.framebuffer.shape))) 56 | shape_dims = shape_dims[1:] + [shape_dims[0]] 57 | 58 | result_shape = list(env.observation_space.shape) 59 | if len(result_shape) == 1: 60 | # vector observation, add a channel axis for the stacked frames 61 | result_shape += [1] 62 | result_shape[-1] = result_shape[-1] * n_frames 63 | 64 | reshape_fn = lambda x: np.transpose(x, shape_dims).reshape(result_shape) 65 | 66 | self.reshape_fn = reshape_fn 67 | self.observation_space = Box(0.0, 1.0, self.reshape_fn(self.framebuffer).shape) 68 | 69 | def reset(self): 70 | """resets the wrapped environment, returns the initial frame buffer""" 71 | self.framebuffer = np.zeros_like(self.framebuffer) 72 | self.update_buffer(self.env.reset()) 73 | return self.reshape_fn(self.framebuffer) 74 | 75 | def step(self, action): 76 | """steps the wrapped environment once, returns the updated frame buffer""" 77 | new_obs, r, done, info = self.env.step(action) 78 | self.update_buffer(new_obs) 79 | return self.reshape_fn(self.framebuffer), r, done, info 80 | 81 | def update_buffer(self, obs): 82 | """pushes the new observation into the buffer, drops the oldest one""" 83 | self.framebuffer = np.vstack([obs[None], self.framebuffer[:-1]]) 84 | 85 | 86 | class EnvPool(Wrapper): 87 | """ 88 | A pool of environment copies stepped in lockstep; environments that finish an episode are reset in place.
89 | """ 90 | 91 | def __init__(self, env, n_envs=16, autoreload_envs=False): 92 | super(EnvPool, self).__init__(env) 93 | self.initial_env = env 94 | self.n_envs = n_envs 95 | self.env_shape = env.observation_space.shape 96 | self.envs = [] 97 | self.recreate_envs() 98 | self.reset() 99 | 100 | def recreate_envs(self): 101 | self.close() 102 | self.envs = np.array([copy(self.initial_env) for _ in range(self.n_envs)]) 103 | 104 | def reset(self): 105 | self._states = np.zeros(shape=(self.n_envs,) + tuple(self.env_shape), dtype=np.float32) 106 | self._rewards = np.zeros(shape=self.n_envs, dtype=np.float32) 107 | self._dones = np.zeros(shape=self.n_envs, dtype=np.bool) 108 | for i, env in enumerate(self.envs): 109 | self._states[i] = env.reset() 110 | return self._states.copy() 111 | 112 | def step(self, actions): 113 | 114 | for i, (action, env) in enumerate(zip(actions, self.envs)): 115 | new_s, r, done, _ = env.step(action) 116 | self._rewards[i] = r 117 | self._dones[i] = done 118 | if not done: 119 | self._states[i] = new_s 120 | else: 121 | self._states[i] = env.reset() 122 | return self._states.copy(), self._rewards.copy(), self._dones.copy(), None 123 | 124 | def close(self): 125 | for env in self.envs: 126 | env.close() 127 | 128 | def pool_states(self): 129 | return self._states.copy() 130 | 131 | 132 | def make_env(env_name, n_games=1, episode_limit=None, n_frames=1, autoreload_envs=False): 133 | env = gym.make(env_name) if episode_limit is None else gym.make(env_name).env 134 | env = FrameBuffer(env, n_frames=n_frames) if n_frames > 1 else env 135 | if episode_limit is not None: 136 | env = TimeLimit(env, max_episode_steps=episode_limit) 137 | return EnvPool(env, n_games, autoreload_envs) if n_games > 0 else env 138 | 139 | 140 | def make_image_env( 141 | env_name, n_games=1, episode_limit=None, 142 | n_frames=1, autoreload_envs=False, 143 | width=64, height=64, 144 | grayscale=True, crop=None): 145 | env = gym.make(env_name) if episode_limit is None else gym.make(env_name).env 146 | if "ppaquette" in env_name: 147 | env = SkipWrapper(4)(ToDiscrete("minimal")(env)) 148 | env = PreprocessImage(env, width=width, height=height, grayscale=grayscale, crop=crop) 149 | env = FrameBuffer(env, n_frames=n_frames) if n_frames > 1 else env 150 | if episode_limit is not None: 151 | env = TimeLimit(env, max_episode_steps=episode_limit) 152 | return EnvPool(env, n_games, autoreload_envs) if n_games > 0 else env 153 | 154 | 155 | def make_env_wrapper(make_env_fn, params): 156 | def wrapper(env, n_games, episode_limit=None): 157 | return make_env_fn(env, n_games, episode_limit=episode_limit, **params) 158 | 159 | return wrapper 160 | -------------------------------------------------------------------------------- /wrappers/run_wrappers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | 4 | import gym 5 | import numpy as np 6 | import tensorflow as tf 7 | from rstools.utils.os_utils import save_history, save_model, create_if_need 8 | from rstools.visualization.plotter import plot_all_metrics 9 | 10 | from common.networks import activations, networks, network_wrapper 11 | from wrappers.gym_wrappers import make_env, make_image_env, make_env_wrapper 12 | 13 | try: 14 | import ppaquette_gym_doom 15 | except ImportError: 16 | print("no doom envs") 17 | 18 | 19 | def str2params(string, delimeter="-"): 20 | try: 21 | result = tuple(map(int, string.split(delimeter))) 22 | except: 23 | result = None 24 | return result 25 | 26 | 27 | def 
epsilon_greedy_policy(agent, sess, observations): 28 | probs = agent.predict_probs(sess, observations)  # samples from the predicted policy distribution (no epsilon is used here) 29 | actions = [np.random.choice(len(row), p=row) for row in probs] 30 | return actions 31 | 32 | 33 | # @TODO: rewrite in a vectorized numpy way (no for loop) 34 | def epsilon_greedy_actions(agent, sess, observations, epsilon=0.01): 35 | qvalues = agent.predict_qvalues(sess, observations) 36 | probs = np.ones_like(qvalues, dtype=float) * epsilon / agent.qvalue_net.n_actions 37 | best_action = np.argmax(qvalues, axis=-1) 38 | for i, action in enumerate(best_action): 39 | probs[i, action] += (1.0 - epsilon) 40 | actions = [np.random.choice(len(row), p=row) for row in probs] 41 | return actions 42 | 43 | 44 | def play_session(sess, agent, env, action_fn, t_max=int(1e10)): 45 | total_reward = 0 46 | 47 | s = env.reset() 48 | for t in range(t_max): 49 | a = action_fn(agent, sess, np.array([s], dtype=np.float32))[0] 50 | next_s, r, done, _ = env.step(a) 51 | total_reward += r 52 | 53 | if hasattr(agent, "update_belief_state"): 54 | agent.update_belief_state(sess, [s], [done]) 55 | 56 | s = next_s 57 | if done: 58 | break 59 | 60 | return total_reward, t 61 | 62 | 63 | def update_wraper( 64 | update_fn, 65 | **kwargs): 66 | def wrapper(*args): 67 | return update_fn(*args, **kwargs) 68 | 69 | return wrapper 70 | 71 | 72 | def create_agent(agent_cls, state_shape, n_actions, agent_agrs, use_target_network): 73 | agent = agent_cls( 74 | state_shape, n_actions, **agent_agrs) 75 | 76 | if use_target_network: 77 | targets_special = {**agent_agrs["special"], **{"scope": "target_" + agent.scope}} 78 | agent_agrs["special"] = targets_special 79 | target_agent = agent_cls( 80 | state_shape, n_actions, **agent_agrs) 81 | agent = (agent, target_agent) 82 | 83 | from pprint import pprint 84 | pprint([(v.name, v.get_shape().as_list()) for v in tf.trainable_variables()]) 85 | 86 | return agent 87 | 88 | 89 | def run_wrapper( 90 | n_games, learning_fn, update_fn, play_fn, action_fn, 91 | env_name, make_env_fn, agent_cls, 92 | run_args, agent_agrs, 93 | log_dir=None, episode_limit=None, 94 | plot_stats=False, api_key=None, 95 | load=False, gpu_option=0.4, 96 | use_target_network=False): 97 | env = make_env_fn(env_name, n_games, episode_limit=episode_limit) 98 | 99 | n_actions = env.action_space.n 100 | state_shape = env.observation_space.shape 101 | 102 | # hack, I know 103 | agent_agrs["special"]["batch_size"] = n_games 104 | agent = create_agent(agent_cls, state_shape, n_actions, agent_agrs, use_target_network) 105 | 106 | log_dir = log_dir or "./logs_" + "".join("_" if c in string.punctuation else c for c in env_name) 107 | create_if_need(log_dir) 108 | 109 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_option) 110 | with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess: 111 | saver = tf.train.Saver( 112 | var_list=tf.trainable_variables(), 113 | keep_checkpoint_every_n_hours=1) 114 | 115 | sess.run(tf.global_variables_initializer()) 116 | if load: 117 | saver.restore(sess, "{}/model.ckpt".format(log_dir)) 118 | 119 | save_model(sess, saver, log_dir) 120 | try: 121 | history = learning_fn( 122 | sess, agent, env, 123 | update_fn=update_fn, 124 | **run_args) 125 | save_history(history, log_dir) 126 | if plot_stats: 127 | plotter_dir = os.path.join(log_dir, "plotter") 128 | plot_all_metrics(history, save_dir=plotter_dir) 129 | except KeyboardInterrupt: 130 | print("Exiting training procedure") 131 | save_model(sess, saver, log_dir) 132 | 133 | if api_key is not None: 134 | tf.reset_default_graph() 135 | 
agent_agrs["special"]["batch_size"] = 1 136 | agent = create_agent(agent_cls, state_shape, n_actions, agent_agrs, use_target_network) 137 | 138 | env_name = env_name.replace("Deterministic", "") 139 | env = make_env_fn(env_name, -1, episode_limit=None) 140 | monitor_dir = os.path.join(log_dir, "monitor") 141 | 142 | env = gym.wrappers.Monitor(env, monitor_dir, force=True) 143 | 144 | with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess: 145 | saver = tf.train.Saver( 146 | var_list=tf.trainable_variables(), 147 | keep_checkpoint_every_n_hours=1) 148 | sess.run(tf.global_variables_initializer()) 149 | saver.restore(sess, "{}/model.ckpt".format(log_dir)) 150 | 151 | sessions = [play_fn(sess, agent, env, action_fn=action_fn) for _ in range(300)] 152 | 153 | env.close() 154 | gym.upload(monitor_dir, api_key=api_key) 155 | 156 | 157 | def typical_args(parser): 158 | parser.add_argument( 159 | '--env', 160 | type=str, 161 | default='CartPole-v0', # BreakoutDeterministic-v0 162 | help='The environment to use. (default: %(default)s)') 163 | 164 | # env pool params 165 | parser.add_argument( 166 | '--n_games', 167 | type=int, 168 | default=10, 169 | help='Number of parallel games to play during training. (default: %(default)s)') 170 | parser.add_argument( 171 | '--reload_envs', 172 | action='store_true', 173 | default=False, 174 | help='Flag for auto-reloading environments when they are done. (default: %(default)s)') 175 | 176 | parser.add_argument( 177 | '--n_epochs', 178 | type=int, 179 | default=100, 180 | help='Number of epochs to train. (default: %(default)s)') 181 | parser.add_argument( 182 | '--n_sessions', 183 | type=int, 184 | default=10, 185 | help='Number of game sessions to play per epoch. (default: %(default)s)') 186 | parser.add_argument( 187 | '--t_max', 188 | type=int, 189 | default=1000, 190 | help='Max steps to play per game session. (default: %(default)s)') 191 | parser.add_argument( 192 | '--episode_limit', 193 | type=int, 194 | default=None, 195 | help='Max environment steps per episode. (default: %(default)s)') 196 | parser.add_argument( 197 | '--plot_history', 198 | action='store_true', 199 | default=False, 200 | help='Plot graph with main train statistics (reward, loss, steps). (default: %(default)s)') 201 | parser.add_argument( 202 | '--api_key', 203 | type=str, 204 | default=None, 205 | help='Your API key to submit to gym. (default: %(default)s)') 206 | parser.add_argument( 207 | '--log_dir', 208 | type=str, 209 | default=None, 210 | help='Directory to store logs, checkpoints and monitor results. (default: %(default)s)') 211 | parser.add_argument( 212 | '--load', 213 | action='store_true', 214 | default=False, 215 | help='Flag to load previous model from log_dir. (default: %(default)s)') 216 | parser.add_argument( 217 | '--gpu_option', 218 | type=float, 219 | default=0.45, 220 | help='Fraction of GPU memory to use. (default: %(default)s)') 221 | 222 | # feature network params 223 | parser.add_argument( 224 | '--feature_network', 225 | type=str, 226 | choices=["linear", "convolution"], 227 | default="linear", 228 | help='Feature network type, used to create a vector representation of the state. ' 229 | '(default: %(default)s)') 230 | parser.add_argument( 231 | '--activation', 232 | type=str, 233 | default="elu", 234 | help='Typical activation for feature network. (default: %(default)s)') 235 | parser.add_argument( 236 | '--use_bn', 237 | action='store_true', 238 | default=False, 239 | help='Batchnorm usage flag. 
(default: %(default)s) - no batchnorm') 240 | parser.add_argument( 241 | '--dropout', 242 | type=float, 243 | default=-1, 244 | help='Dropout keep probability. (default: %(default)s) - no dropout') 245 | 246 | # special args for linear network 247 | parser.add_argument( 248 | '--layers', 249 | type=str, 250 | default=None, 251 | help='Linear feature network layers, split by \'-\'.') 252 | 253 | # special args for convolution network: 254 | parser.add_argument( 255 | '--n_filters', 256 | type=str, 257 | default=None, 258 | help='Convolution feature network filters, split by \'-\'.') 259 | parser.add_argument( 260 | '--kernels', 261 | type=str, 262 | default=None, 263 | help='Convolution feature network kernels, split by \'-\'.') 264 | parser.add_argument( 265 | '--strides', 266 | type=str, 267 | default=None, 268 | help='Convolution feature network strides, split by \'-\'.') 269 | 270 | # typical hidden state params 271 | parser.add_argument( 272 | '--hidden_size', 273 | type=int, 274 | default=512, 275 | help='Hidden state size. (default: %(default)s)') 276 | parser.add_argument( 277 | '--hidden_activation', 278 | type=str, 279 | default="elu", 280 | help='Hidden state activation. (default: %(default)s)') 281 | 282 | # typical optimization params 283 | parser.add_argument( 284 | '--feature_lr', 285 | type=float, 286 | default=1e-5, 287 | help='Learning rate for feature network. (default: %(default)s)') 288 | parser.add_argument( 289 | '--lr_decay_steps', 290 | type=float, 291 | default=1e5, 292 | help='Learning rate decay steps. (default: %(default)s)') 293 | parser.add_argument( 294 | '--lr_decay', 295 | type=float, 296 | default=0.999, 297 | help='Learning rate decay factor. (default: %(default)s)') 298 | parser.add_argument( 299 | '--grad_clip', 300 | type=float, 301 | default=1.0, 302 | help='Gradient clip factor. (default: %(default)s)') 303 | 304 | # update args 305 | parser.add_argument( 306 | '--gamma', 307 | type=float, 308 | default=0.99, 309 | help='Gamma discount factor. (default: %(default)s)') 310 | parser.add_argument( 311 | '--reward_norm', 312 | type=float, 313 | default=1.0, 314 | help='Reward norm factor. (default: %(default)s)') 315 | parser.add_argument( 316 | '--batch_size', 317 | type=int, 318 | default=10, 319 | help='Batch size for update; should be larger than the number of parallel games. ' 320 | '(default: %(default)s)') 321 | parser.add_argument( 322 | '--time_major', 323 | action='store_true', 324 | default=False, 325 | help='Time-major flag for update. (default: %(default)s) - batch-major') 326 | 327 | # preprocess args for image envs 328 | parser.add_argument( 329 | '--image_width', 330 | type=int, 331 | default=64, 332 | help='Image-based environments preprocessing, resize to this width. ' 333 | '(default: %(default)s)') 334 | parser.add_argument( 335 | '--image_height', 336 | type=int, 337 | default=64, 338 | help='Image-based environments preprocessing, resize to this height. ' 339 | '(default: %(default)s)') 340 | parser.add_argument( 341 | '--image_grayscale', 342 | action='store_true', 343 | default=False, 344 | help='Image-based environments preprocessing, flag to convert the state image to grayscale.') 345 | parser.add_argument( 346 | '--image_corners', 347 | type=str, 348 | default=None, 349 | help='Image-based environments preprocessing, crop corners (x1-x2-y1-y2) split by \'-\'.') 350 | parser.add_argument( 351 | '--n_frames', 352 | type=int, 353 | default=1, 354 | help='Number of memory frames to use. 
(default: %(default)s)') 355 | 356 | return parser 357 | 358 | 359 | def typical_argsparse(args): 360 | if args.feature_network == "linear": 361 | network_args = { 362 | "layers": str2params(args.layers) 363 | } 364 | 365 | env_args = { 366 | "n_frames": args.n_frames, 367 | "autoreload_envs": args.reload_envs  # matches the --reload_envs flag defined in typical_args 368 | } 369 | make_env_fn = make_env_wrapper(make_env, env_args) 370 | elif args.feature_network == "convolution": 371 | network_args = { 372 | "n_filters": str2params(args.n_filters), 373 | "kernels": str2params(args.kernels), 374 | "strides": str2params(args.strides) 375 | } 376 | 377 | corners = str2params(args.image_corners) 378 | if corners is not None: 379 | image_crop_x1, image_crop_x2, image_crop_y1, image_crop_y2 = corners 380 | crop_fn = lambda img: img[image_crop_x1:image_crop_x2, image_crop_y1:image_crop_y2] 381 | else: 382 | crop_fn = None 383 | 384 | image_preprocessing_params = { 385 | "width": int(args.image_width), 386 | "height": int(args.image_height), 387 | "grayscale": args.image_grayscale, 388 | "crop": crop_fn, 389 | "n_frames": int(args.n_frames), 390 | "autoreload_envs": args.reload_envs 391 | } 392 | 393 | make_env_fn = make_env_wrapper(make_image_env, image_preprocessing_params) 394 | else: 395 | raise NotImplementedError() 396 | 397 | network = network_wrapper( 398 | networks[args.feature_network], { 399 | **network_args, **{ 400 | "activation_fn": activations[args.activation], 401 | "use_bn": args.use_bn, 402 | "dropout": args.dropout 403 | }}) 404 | 405 | run_args = { 406 | "n_epochs": int(args.n_epochs), 407 | "n_sessions": int(args.n_sessions), 408 | "t_max": int(args.t_max) 409 | } 410 | update_args = { 411 | "discount_factor": float(args.gamma), 412 | "reward_norm": float(args.reward_norm), 413 | "batch_size": int(args.batch_size), 414 | "time_major": args.time_major, 415 | } 416 | optimization_params = { 417 | "initial_lr": float(args.feature_lr), 418 | "decay_steps": int(args.lr_decay_steps), 419 | "lr_decay": float(args.lr_decay), 420 | "grad_clip": float(args.grad_clip) 421 | } 422 | 423 | return network, run_args, update_args, optimization_params, make_env_fn 424 | -------------------------------------------------------------------------------- /xvfb: -------------------------------------------------------------------------------- 1 | # taken from https://gist.github.com/jterrace/2911875 2 | XVFB=/usr/bin/Xvfb 3 | XVFBARGS=":1 -screen 0 1024x768x24 -ac +extension GLX +render -noreset" 4 | PIDFILE=./xvfb.pid 5 | case "$1" in 6 | start) 7 | echo -n "Starting virtual X frame buffer: Xvfb" 8 | start-stop-daemon --start --quiet --pidfile $PIDFILE --make-pidfile --background --exec $XVFB -- $XVFBARGS 9 | echo "." 10 | ;; 11 | stop) 12 | echo -n "Stopping virtual X frame buffer: Xvfb" 13 | start-stop-daemon --stop --quiet --pidfile $PIDFILE 14 | echo "." 15 | ;; 16 | restart) 17 | $0 stop 18 | $0 start 19 | ;; 20 | *) 21 | echo "Usage: /etc/init.d/xvfb {start|stop|restart}" 22 | exit 1 23 | esac 24 | 25 | exit 0 26 | --------------------------------------------------------------------------------