├── .gitignore
├── A3C
│   ├── .gitignore
│   ├── README.md
│   ├── a3c_ff.py
│   ├── a3c_lstm.py
│   └── run_a3c.py
├── CEM
│   ├── .gitignore
│   ├── README.md
│   ├── cem_gym.py
│   └── continuous_cem_gym.py
├── DP
│   ├── README.md
│   ├── policy_iteration_gym.py
│   └── value_iteration_gym.py
├── DQN
│   ├── .gitignore
│   ├── README.md
│   ├── dqn.py
│   ├── drqn.py
│   └── run_dqn.py
├── FA
│   ├── .gitignore
│   ├── README.md
│   └── q_learning_gym.py
├── GEN
│   ├── README.md
│   └── genetic_gym.py
├── LICENSE
├── MC
│   ├── .directory
│   ├── Blackjack Playground.ipynb
│   ├── MC Control with Epsilon-Greedy Policies.ipynb
│   ├── MC Prediction.ipynb
│   ├── Off-Policy MC Control with Weighted Importance Sampling.ipynb
│   └── README.md
├── PG
│   ├── .gitignore
│   ├── README.md
│   ├── reinforce.py
│   └── run_reinforce.py
├── README.md
├── TD
│   ├── README.md
│   ├── evsarsa.py
│   ├── qlearning.py
│   ├── run.py
│   └── sarsa.py
├── agents
│   ├── README.md
│   ├── __init__.py
│   ├── agent_networks.py
│   └── agent_states.py
├── common
│   ├── __init__.py
│   ├── buffer.py
│   ├── networks.py
│   ├── schedules.py
│   └── segment_tree.py
├── environment_corners.ipynb
├── jedi_upload.py
├── requirements.txt
├── wrappers
│   ├── __init__.py
│   ├── gym_wrappers.py
│   └── run_wrappers.py
└── xvfb

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
91 | .idea
92 | .code
93 | 
94 | *.png
95 | *.pid
96 | 
97 | *tmp*
98 | *logs*
99 | *.log
100 | 
101 | solutions.md
102 | *.swp
103 | *.ipynb
104 | vizdoom.ini
--------------------------------------------------------------------------------
/A3C/.gitignore:
--------------------------------------------------------------------------------
1 | *.pkl
2 | logs*
--------------------------------------------------------------------------------
/A3C/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/rl-course-experiments/47e7666e2bc618ca4007372f411c3626fd7efc2b/A3C/README.md
--------------------------------------------------------------------------------
/A3C/a3c_ff.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | 
3 | import tensorflow as tf
4 | from rstools.tf.optimization import build_model_optimization, build_scope_optimization
5 | 
6 | from agents.agent_networks import FeatureNet, PolicyNet, ValueNet
7 | from agents.agent_states import LinearHiddenState
8 | 
9 | 
10 | class A3CFFAgent(object):
11 |     def __init__(self, state_shape, n_actions, network, special=None):
12 |         self.special = special or {}
13 |         self.state_shape = state_shape
14 |         self.n_actions = n_actions
15 | 
16 | 
17 |         # `special` defaults to an empty dict above, so `.get()` is always safe
18 |         self.scope = self.special.get("scope", "a3c_ff")
19 | 
20 |         with tf.variable_scope(self.scope):
21 |             self._build_graph(network)
22 | 
23 |     def _build_graph(self, network):
24 |         self.feature_net = FeatureNet(
25 |             self.state_shape, network,
26 |             self.special.get("feature_net", None))
27 | 
28 |         self.hidden_state = LinearHiddenState(
29 |             self.feature_net.feature_state,
30 |             self.special.get("hidden_size", 512),
31 |             self.special.get("hidden_activation", tf.nn.elu))
32 | 
33 |         self.policy_net = PolicyNet(
34 |             self.hidden_state.state, self.n_actions,
35 |             self.special.get("policy_net", None))
36 |         self.value_net = ValueNet(
37 |             self.hidden_state.state,
38 |             self.special.get("value_net", None))
39 | 
40 |         build_model_optimization(
41 |             self.policy_net,
42 |             self.special.get("policy_net_optimization", None))
43 |         build_model_optimization(
44 |             self.value_net,
45 |             self.special.get("value_net_optimization", None))
46 |         build_model_optimization(
47 |             self.hidden_state,
48 |             self.special.get("hidden_state_optimization", None),
49 |             loss=0.5 * (self.policy_net.loss + self.value_net.loss))
50 |         build_model_optimization(
51 |             self.feature_net,
52 |             self.special.get("feature_net_optimization", None),
53 |             loss=0.5 * (self.policy_net.loss + self.value_net.loss))
54 | 
55 |     def predict_values(self, sess, state_batch):
56 |         return sess.run(
57 |             self.value_net.predicted_values_for_actions,
58 |             feed_dict={
59 |                 self.feature_net.states: state_batch,
60 |                 self.feature_net.is_training: False})
61 | 
62 |     def predict_probs(self, sess, state_batch):
63 |         return sess.run(
64 |             self.policy_net.predicted_probs,
65 |             feed_dict={
66 |                 self.feature_net.states: state_batch,
67 |                 self.feature_net.is_training: False})
68 | 
--------------------------------------------------------------------------------
/A3C/a3c_lstm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | 
3 | import tensorflow as tf
4 | from rstools.tf.optimization import build_model_optimization, build_scope_optimization
5 | from tensorflow.contrib import rnn
6 | 
7 | from agents.agent_networks import FeatureNet, PolicyNet, ValueNet
8 | from agents.agent_states import RecurrentHiddenState, get_state_variables, get_state_update_op
9 | 
10 | 
11 | class A3CLstmAgent(object):
12 |     def __init__(self, state_shape, n_actions, network, special=None):
13 |         self.state_shape = state_shape
14 |         self.n_actions = n_actions
15 | 
16 |         self.special = special or {}
17 |         self.scope = self.special.get("scope", "a3c_lstm")
18 | 
19 |         with tf.variable_scope(self.scope):
20 |             self._build_graph(network)
21 | 
22 |     def _build_graph(self, network):
23 |         self.feature_net = FeatureNet(
24 |             self.state_shape, network,
25 |             self.special.get("feature_net", None))
26 | 
27 |         self.hidden_state = RecurrentHiddenState(
28 |             self.feature_net.feature_state,
29 |             self.special.get("hidden_size", 512),
30 |             self.special.get("hidden_activation", tf.tanh),
31 |             self.special.get("batch_size", 1))
32 | 
33 |         self.policy_net = 
PolicyNet( 34 | self.hidden_state.state, self.n_actions, 35 | self.special.get("policy_net", {})) 36 | self.value_net = ValueNet( 37 | self.hidden_state.state, 38 | self.special.get("value_net", {})) 39 | 40 | build_model_optimization( 41 | self.policy_net, 42 | self.special.get("policy_net_optimization", None)) 43 | build_model_optimization( 44 | self.value_net, 45 | self.special.get("value_net_optimization", None)) 46 | build_model_optimization( 47 | self.hidden_state, 48 | self.special.get("hidden_state_optimization", None), 49 | loss=0.5 * (self.value_net.loss + self.policy_net.loss)) 50 | build_model_optimization( 51 | self.feature_net, self.special.get("feature_net_optimization", None), 52 | loss=0.5 * (self.value_net.loss + self.policy_net.loss)) 53 | 54 | def predict_values(self, sess, state_batch): 55 | return sess.run( 56 | self.value_net.predicted_values_for_actions, 57 | feed_dict={ 58 | self.feature_net.states: state_batch, 59 | self.feature_net.is_training: False 60 | }) 61 | 62 | def predict_probs(self, sess, state_batch): 63 | return sess.run( 64 | self.policy_net.predicted_probs, 65 | feed_dict={ 66 | self.feature_net.states: state_batch, 67 | self.feature_net.is_training: False 68 | }) 69 | 70 | def update_belief_state(self, sess, state_batch, done_batch): 71 | _ = sess.run( 72 | self.hidden_state.belief_update, 73 | feed_dict={ 74 | self.feature_net.states: state_batch, 75 | self.hidden_state.is_end: done_batch, 76 | self.feature_net.is_training: False 77 | }) 78 | 79 | def assign_belief_state(self, sess, new_belief): 80 | _ = sess.run( 81 | self.hidden_state.belief_assign, 82 | feed_dict={ 83 | self.hidden_state.belief_out: new_belief 84 | }) 85 | 86 | def get_belief_state(self, sess): 87 | return sess.run(self.hidden_state.belief_state) 88 | -------------------------------------------------------------------------------- /A3C/run_a3c.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | from rstools.utils.batch_utils import iterate_minibatches, merge_generators 5 | from tqdm import trange 6 | 7 | from A3C.a3c_ff import A3CFFAgent 8 | from A3C.a3c_lstm import A3CLstmAgent 9 | from common.networks import activations 10 | from wrappers.gym_wrappers import Transition 11 | from wrappers.run_wrappers import typical_args, typical_argsparse, run_wrapper, update_wraper, \ 12 | epsilon_greedy_policy, play_session 13 | 14 | 15 | def update(sess, a3c_agent, transitions, initial_state=None, 16 | discount_factor=0.99, reward_norm=1.0, 17 | batch_size=32, time_major=True): 18 | policy_targets = [] 19 | value_targets = [] 20 | state_history = [] 21 | action_history = [] 22 | done_history = [] 23 | 24 | cumulative_reward = np.zeros_like(transitions[-1].reward) + \ 25 | np.invert(transitions[-1].done) * \ 26 | a3c_agent.predict_values(sess, transitions[-1].next_state) 27 | for transition in reversed(transitions): 28 | cumulative_reward = reward_norm * transition.reward + \ 29 | np.invert(transition.done) * discount_factor * cumulative_reward 30 | policy_target = cumulative_reward - a3c_agent.predict_values(sess, transition.state) 31 | 32 | value_targets.append(cumulative_reward) 33 | policy_targets.append(policy_target) 34 | state_history.append(transition.state) 35 | action_history.append(transition.action) 36 | done_history.append(transition.done) 37 | 38 | value_targets = np.array(value_targets[::-1]) # time-major 39 | policy_targets = np.array(policy_targets[::-1]) 40 | state_history = 
np.array(state_history[::-1]) 41 | action_history = np.array(action_history[::-1]) 42 | done_history = np.array(done_history[::-1]) 43 | 44 | if isinstance(a3c_agent, A3CLstmAgent): 45 | a3c_agent.assign_belief_state(sess, initial_state) 46 | 47 | time_len = state_history.shape[0] 48 | value_loss, policy_loss = 0.0, 0.0 49 | for state_axis, action_axis, value_target_axis, policy_target_axis, done_axis in \ 50 | zip(state_history, action_history, value_targets, policy_targets, done_history): 51 | axis_len = state_axis.shape[0] 52 | axis_value_loss, axis_policy_loss = 0.0, 0.0 53 | 54 | state_axis = iterate_minibatches(state_axis, batch_size) 55 | action_axis = iterate_minibatches(action_axis, batch_size) 56 | value_target_axis = iterate_minibatches(value_target_axis, batch_size) 57 | policy_target_axis = iterate_minibatches(policy_target_axis, batch_size) 58 | done_axis = iterate_minibatches(done_axis, batch_size) 59 | 60 | batch_generator = merge_generators( 61 | [state_axis, action_axis, value_target_axis, policy_target_axis, done_axis]) 62 | 63 | for state_batch, action_batch, value_target, policy_target, done_batch in batch_generator: 64 | run_params = [ 65 | a3c_agent.policy_net.loss, 66 | a3c_agent.value_net.loss, 67 | a3c_agent.policy_net.train_op, 68 | a3c_agent.value_net.train_op, 69 | a3c_agent.feature_net.train_op] 70 | feed_params = { 71 | a3c_agent.feature_net.states: state_batch, 72 | a3c_agent.feature_net.is_training: True, 73 | a3c_agent.policy_net.actions: action_batch, 74 | a3c_agent.policy_net.cumulative_rewards: policy_target, 75 | a3c_agent.policy_net.is_training: True, 76 | a3c_agent.value_net.td_target: value_target, 77 | a3c_agent.value_net.is_training: True 78 | } 79 | 80 | if isinstance(a3c_agent, A3CLstmAgent): 81 | run_params += [a3c_agent.hidden_state.belief_update] 82 | feed_params[a3c_agent.hidden_state.is_end] = done_batch 83 | 84 | run_result = sess.run( 85 | run_params, 86 | feed_dict=feed_params) 87 | 88 | batch_loss_policy = run_result[0] 89 | batch_loss_state = run_result[1] 90 | 91 | axis_value_loss += batch_loss_state 92 | axis_policy_loss += batch_loss_policy 93 | 94 | policy_loss += axis_policy_loss / axis_len 95 | value_loss += axis_value_loss / axis_len 96 | 97 | return policy_loss / time_len, value_loss / time_len 98 | 99 | 100 | def generate_sessions(sess, a3c_agent, env_pool, update_fn, t_max=1000): 101 | total_reward = 0.0 102 | total_games = 0.0 103 | 104 | transitions = [] 105 | init_state = None 106 | if isinstance(a3c_agent, A3CLstmAgent): 107 | init_state = a3c_agent.get_belief_state(sess) 108 | 109 | states = env_pool.pool_states() 110 | for t in range(t_max): 111 | actions = epsilon_greedy_policy(a3c_agent, sess, states) 112 | next_states, rewards, dones, _ = env_pool.step(actions) 113 | 114 | transitions.append(Transition( 115 | state=states, action=actions, reward=rewards, next_state=next_states, done=dones)) 116 | 117 | if isinstance(a3c_agent, A3CLstmAgent): 118 | a3c_agent.update_belief_state(sess, states, dones) 119 | 120 | states = next_states 121 | 122 | total_reward += rewards.sum() 123 | total_games += dones.sum() 124 | 125 | if env_pool.n_envs == 1 and total_games > 0: 126 | break 127 | 128 | total_policy_loss, total_value_loss = update_fn(sess, a3c_agent, transitions, init_state) 129 | 130 | return total_reward / env_pool.n_envs, \ 131 | total_policy_loss, total_value_loss, \ 132 | t / (total_games / env_pool.n_envs) 133 | 134 | 135 | def a3c_learning( 136 | sess, agent, env, update_fn, 137 | n_epochs=1000, n_sessions=100, 
t_max=1000): 138 | tr = trange( 139 | n_epochs, 140 | desc="", 141 | leave=True) 142 | 143 | history = { 144 | "reward": np.zeros(n_epochs), 145 | "policy_loss": np.zeros(n_epochs), 146 | "value_loss": np.zeros(n_epochs), 147 | "steps": np.zeros(n_epochs), 148 | } 149 | 150 | for i in tr: 151 | sessions = [ 152 | generate_sessions(sess, agent, env, update_fn, t_max) 153 | for _ in range(n_sessions)] 154 | session_rewards, session_policy_loss, session_value_loss, session_steps = \ 155 | map(np.array, zip(*sessions)) 156 | 157 | history["reward"][i] = np.mean(session_rewards) 158 | history["policy_loss"][i] = np.mean(session_policy_loss) 159 | history["value_loss"][i] = np.mean(session_value_loss) 160 | history["steps"][i] = np.mean(session_steps) 161 | 162 | desc = "\t".join( 163 | ["{} = {:.3f}".format(key, value[i]) for key, value in history.items()]) 164 | tr.set_description(desc) 165 | 166 | return history 167 | 168 | 169 | def run(env_name, make_env_fn, agent_cls, 170 | run_args, update_args, agent_agrs, 171 | log_dir=None, episode_limit=None, 172 | plot_stats=False, api_key=None, 173 | load=False, gpu_option=0.4, 174 | n_games=10): 175 | run_wrapper( 176 | n_games, a3c_learning, update_wraper(update, **update_args), 177 | play_session, epsilon_greedy_policy, 178 | env_name, make_env_fn, agent_cls, 179 | run_args, agent_agrs, 180 | log_dir=log_dir, episode_limit=episode_limit, 181 | plot_stats=plot_stats, api_key=api_key, 182 | load=load, gpu_option=gpu_option) 183 | 184 | 185 | def _parse_args(): 186 | parser = argparse.ArgumentParser(description='A3C Agent Learning') 187 | # typical params 188 | parser.add_argument( 189 | '--agent', 190 | type=str, 191 | default="feed_forward", 192 | choices=["feed_forward", "recurrent"], 193 | help='Which agent to use. (default: %(default)s)') 194 | 195 | parser = typical_args(parser) 196 | 197 | # agent special params & optimization 198 | parser.add_argument( 199 | '--policy_lr', 200 | type=float, 201 | default=1e-5, 202 | help='Learning rate for policy network. (default: %(default)s)') 203 | parser.add_argument( 204 | '--value_lr', 205 | type=float, 206 | default=1e-5, 207 | help='Learning rate for value network. (default: %(default)s)') 208 | 209 | parser.add_argument( 210 | '--entropy_factor', 211 | type=float, 212 | default=1e-2, 213 | help='Entropy factor for policy network. 
(default: %(default)s)') 214 | 215 | args = parser.parse_args() 216 | return args 217 | 218 | 219 | def main(): 220 | args = _parse_args() 221 | 222 | assert args.time_major, "Please, use time_major flag for updates" 223 | 224 | network, run_args, update_args, optimization_params, make_env_fn = typical_argsparse(args) 225 | 226 | policy_optimization_params = { 227 | **optimization_params, 228 | **{"initial_lr": args.policy_lr} 229 | } 230 | 231 | value_optimization_params = { 232 | **optimization_params, 233 | **{"initial_lr": args.value_lr} 234 | } 235 | policy_net_params = { 236 | "entropy_factor": args.entropy_factor 237 | } 238 | 239 | agent_cls = A3CFFAgent if args.agent == "feed_forward" else A3CLstmAgent 240 | 241 | special = { 242 | "policy_net": policy_net_params, 243 | "hidden_size": args.hidden_size, 244 | "hidden_activation": activations[args.hidden_activation], 245 | "feature_net_optimization": optimization_params, 246 | "hidden_state_optimization": optimization_params, 247 | "value_net_optimization": value_optimization_params, 248 | "policy_net_optimization": policy_optimization_params, 249 | } 250 | 251 | agent_args = { 252 | "network": network, 253 | "special": special 254 | } 255 | 256 | run(args.env, make_env_fn, agent_cls, 257 | run_args, update_args, agent_args, 258 | args.log_dir, args.episode_limit, 259 | args.plot_history, args.api_key, 260 | args.load, args.gpu_option, 261 | args.n_games) 262 | 263 | 264 | if __name__ == '__main__': 265 | main() 266 | -------------------------------------------------------------------------------- /CEM/.gitignore: -------------------------------------------------------------------------------- 1 | *.pkl 2 | logs* 3 | -------------------------------------------------------------------------------- /CEM/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Scitator/rl-course-experiments/47e7666e2bc618ca4007372f411c3626fd7efc2b/CEM/README.md -------------------------------------------------------------------------------- /CEM/cem_gym.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import gym 4 | from gym import wrappers 5 | import argparse 6 | import pickle 7 | import numpy as np 8 | from tqdm import trange 9 | from joblib import Parallel, delayed 10 | import collections 11 | import sklearn.pipeline 12 | import sklearn.preprocessing 13 | from sklearn.neural_network import MLPClassifier 14 | from sklearn.kernel_approximation import RBFSampler 15 | 16 | from matplotlib import pyplot as plt 17 | 18 | plt.style.use("ggplot") 19 | 20 | 21 | def plot_unimetric(history, metric, save_dir): 22 | plt.figure() 23 | plt.plot(history[metric]) 24 | plt.title('model {}'.format(metric)) 25 | plt.ylabel(metric) 26 | plt.xlabel('epoch') 27 | plt.savefig("{}/{}.png".format(save_dir, metric), 28 | format='png', dpi=300) 29 | 30 | 31 | class Estimator(object): 32 | """ 33 | Value Function approximator. 34 | """ 35 | 36 | def __init__(self, env, layers): 37 | self.n_actions = env.action_space.n 38 | self._prepare_estimator_for_env(env) 39 | self.model = MLPClassifier( 40 | hidden_layer_sizes=layers, 41 | activation='tanh', 42 | warm_start=True, 43 | max_iter=1) 44 | # We need to call partial_fit once to initialize the model 45 | # or we get a NotFittedError when trying to make a prediction 46 | # This is quite hacky. 
47 | self.model.fit( 48 | [self.featurize_state(env.reset())] * self.n_actions, 49 | range(self.n_actions)) 50 | 51 | def _prepare_estimator_for_env(self, env): 52 | observation_examples = np.array( 53 | [env.observation_space.sample() for _ in range(10000)]) 54 | observation_examples = self._vectorise_state(observation_examples) 55 | 56 | scaler = sklearn.preprocessing.StandardScaler() 57 | scaler.fit(observation_examples) 58 | self.scaler = scaler 59 | 60 | featurizer = sklearn.pipeline.FeatureUnion([ 61 | ("rbf1", RBFSampler(gamma=5.0, n_components=100)), 62 | ("rbf2", RBFSampler(gamma=2.0, n_components=100)), 63 | ("rbf3", RBFSampler(gamma=1.0, n_components=100)), 64 | ("rbf4", RBFSampler(gamma=0.5, n_components=100)) 65 | ]) 66 | featurizer.fit(scaler.transform(observation_examples)) 67 | self.featurizer = featurizer 68 | 69 | def _vectorise_state(self, states): 70 | obs_shape = states.shape 71 | if len(obs_shape) < 2: # just one observation 72 | states = np.expand_dims(states, 0) 73 | elif len(obs_shape) > 2: # some many states magic 74 | states = states.reshape((obs_shape[0], -1)) 75 | return states 76 | 77 | def featurize_state(self, state): 78 | """ 79 | Returns the featurized representation for a state. 80 | """ 81 | state = self._vectorise_state(state) 82 | scaled = self.scaler.transform(state) 83 | featurized = self.featurizer.transform(scaled) 84 | if featurized.shape[0] == 1: 85 | return featurized[0] 86 | else: 87 | return featurized 88 | 89 | def predict_proba(self, s): 90 | features = self.featurize_state(s) 91 | return self.model.predict_proba([features]) 92 | 93 | def fit(self, s, y): 94 | features = self.featurize_state(s) 95 | self.model.partial_fit(features, y) 96 | 97 | 98 | def generate_session(env, agent, t_max=int(1e5), step_penalty=0.01): 99 | states, actions = [], [] 100 | total_reward = 0 101 | 102 | s = env.reset() 103 | 104 | for t in range(t_max): 105 | 106 | # predict array of action probabilities 107 | probs = agent.predict_proba([s])[0] 108 | 109 | a = np.random.choice(env.action_space.n, p=probs) 110 | 111 | new_s, r, done, info = env.step(a) 112 | 113 | # record sessions like you did before 114 | states.append(s) 115 | actions.append(a) 116 | total_reward += r 117 | 118 | s = new_s 119 | if done: 120 | break 121 | 122 | total_reward -= t * step_penalty 123 | 124 | return states, actions, total_reward, t 125 | 126 | 127 | glob_env = None 128 | glob_agent = None 129 | 130 | 131 | def generate_parallel_session(t_max=int(1e5), step_penalty=0.01): 132 | states, actions = [], [] 133 | total_reward = 0 134 | 135 | s = glob_env.reset() 136 | 137 | for t in range(t_max): 138 | 139 | # predict array of action probabilities 140 | probs = glob_agent.predict_proba([s])[0] 141 | 142 | a = np.random.choice(glob_env.action_space.n, p=probs) 143 | 144 | new_s, r, done, info = glob_env.step(a) 145 | 146 | # record sessions like you did before 147 | states.append(s) 148 | actions.append(a) 149 | total_reward += r 150 | 151 | s = new_s 152 | if done: 153 | break 154 | 155 | total_reward -= t * step_penalty 156 | 157 | return states, actions, total_reward, t 158 | 159 | 160 | def generate_parallel_sessions(n, t_max, step_penalty, n_jobs=-1): 161 | return Parallel(n_jobs)(n * [delayed(generate_parallel_session)(t_max, step_penalty)]) 162 | 163 | 164 | def cem(env, agent, num_episodes, max_steps=int(1e6), step_penalty=0.01, 165 | n_samples=200, percentile=50, n_jobs=-1, verbose=False): 166 | global glob_env, glob_agent 167 | init_n_samples = n_samples 168 | final_n_samples = 
n_samples // 5 169 | plays_to_decay = num_episodes // 2 170 | 171 | states_deque = collections.deque(maxlen=int(init_n_samples * 2)) 172 | actions_deque = collections.deque(maxlen=int(init_n_samples * 2)) 173 | rewards_deque = collections.deque(maxlen=int(init_n_samples * 2)) 174 | 175 | glob_env = env # NEVER DO LIKE THIS PLEASE! 176 | glob_agent = agent 177 | 178 | # Keeps track of useful statistics 179 | history = { 180 | "threshold": np.zeros(num_episodes), 181 | "reward": np.zeros(num_episodes), 182 | "n_steps": np.zeros(num_episodes), 183 | } 184 | 185 | tr = trange( 186 | num_episodes, 187 | desc="mean reward = {:.3f}\tthreshold = {:.3f}\tmean n_steps = {:.3f}".format( 188 | 0.0, 0.0, 0.0), 189 | leave=True) 190 | 191 | for i in tr: 192 | # generate new sessions 193 | sessions = generate_parallel_sessions(n_samples, max_steps, step_penalty, n_jobs) 194 | if i < plays_to_decay: 195 | n_samples -= (init_n_samples - final_n_samples) // plays_to_decay 196 | 197 | batch_states, batch_actions, batch_rewards, batch_steps = map(np.array, zip(*sessions)) 198 | # batch_states: a list of lists of states in each session 199 | # batch_actions: a list of lists of actions in each session 200 | # batch_rewards: a list of floats - total rewards at each session 201 | states_deque.extend(batch_states) 202 | actions_deque.extend(batch_actions) 203 | rewards_deque.extend(batch_rewards) 204 | 205 | batch_states = np.array(states_deque) 206 | batch_actions = np.array(actions_deque) 207 | batch_rewards = np.array(rewards_deque) 208 | 209 | threshold = np.percentile(batch_rewards, percentile) 210 | 211 | history["threshold"][i] = threshold 212 | history["reward"][i] = np.mean(batch_rewards) 213 | history["n_steps"][i] = np.mean(batch_steps) 214 | 215 | # look like > better, cause >= refer to reuse of bad examples 216 | if i < plays_to_decay: 217 | elite_states = batch_states[batch_rewards > threshold] 218 | elite_actions = batch_actions[batch_rewards > threshold] 219 | else: 220 | elite_states = batch_states[batch_rewards >= threshold] 221 | elite_actions = batch_actions[batch_rewards >= threshold] 222 | 223 | if len(elite_actions) > 0: 224 | elite_states, elite_actions = map(np.concatenate, [elite_states, elite_actions]) 225 | # elite_states: a list of states from top games 226 | # elite_actions: a list of actions from top games 227 | try: 228 | agent.fit(elite_states, elite_actions) 229 | except: 230 | # just a hack 231 | addition = np.array([env.reset()] * env.action_space.n) 232 | elite_states = np.vstack((elite_states, addition)) 233 | elite_actions = np.hstack((elite_actions, list(range(env.action_space.n)))) 234 | agent.fit(elite_states, elite_actions) 235 | 236 | tr.set_description( 237 | "mean reward = {:.3f}\tthreshold = {:.3f}\tmean n_steps = {:.3f}".format( 238 | np.mean(batch_rewards) + step_penalty * np.mean(batch_steps), 239 | threshold, np.mean(batch_steps))) 240 | 241 | return history 242 | 243 | 244 | def _parse_args(): 245 | parser = argparse.ArgumentParser(description='Policy iteration example') 246 | parser.add_argument( 247 | '--env', 248 | type=str, 249 | default='CartPole-v0', # CartPole-v0, MountainCar-v0, LunarLander-v2 250 | help='The environment to use') 251 | parser.add_argument( 252 | '--num_episodes', 253 | type=int, 254 | default=200, 255 | help='Number of episodes') 256 | parser.add_argument( 257 | '--max_steps', 258 | type=int, 259 | default=int(1e5), 260 | help='Number of steps per episode') 261 | parser.add_argument( 262 | '--n_samples', 263 | type=int, 264 | 
default=1000, 265 | help='Games per epoch') 266 | parser.add_argument( 267 | '--step_penalty', 268 | type=float, 269 | default=0.01) 270 | parser.add_argument( 271 | '--percentile', 272 | type=int, 273 | default=80, 274 | help='percentile') 275 | parser.add_argument( 276 | '--verbose', 277 | action='store_true', 278 | default=False) 279 | parser.add_argument( 280 | '--plot_stats', 281 | action='store_true', 282 | default=False) 283 | parser.add_argument( 284 | '--features', 285 | action='store_true', 286 | default=False) 287 | parser.add_argument( 288 | '--layers', 289 | type=str, 290 | default=None) 291 | parser.add_argument( 292 | '--api_key', 293 | type=str, 294 | default=None) 295 | parser.add_argument( 296 | '--n_jobs', 297 | type=int, 298 | default=-1) 299 | parser.add_argument( 300 | '--seed', 301 | type=int, 302 | default=42) 303 | parser.add_argument( 304 | '--resume', 305 | action='store_true', 306 | default=False) 307 | 308 | args, _ = parser.parse_known_args() 309 | return args 310 | 311 | 312 | def save_stats(stats, save_dir="./"): 313 | for key in stats: 314 | plot_unimetric(stats, key, save_dir) 315 | 316 | 317 | def run(env, n_episodes=200, max_steps=int(1e5), n_samples=1000, step_penalty=0.01, 318 | percentile=80, features=False, layers=None, 319 | verbose=False, plot_stats=False, api_key=None, n_jobs=-1, seed=42, resume=False): 320 | env_name = env 321 | if env_name == "MountainCar-v0": 322 | env = gym.make(env).env 323 | layers = layers or (20, 10, 20) 324 | else: 325 | env = gym.make(env) 326 | layers = layers or (256, 256, 128) 327 | 328 | if features: 329 | agent = Estimator(env, layers) 330 | else: 331 | agent = MLPClassifier( 332 | hidden_layer_sizes=layers, 333 | activation='tanh', 334 | warm_start=True, 335 | max_iter=1) 336 | agent.fit([env.reset()] * env.action_space.n, range(env.action_space.n)) 337 | 338 | if resume: 339 | agent = pickle.load(open("agent.pkl", "rb")) 340 | 341 | env.seed(seed) 342 | np.random.seed(seed) 343 | 344 | stats = cem(env, agent, n_episodes, 345 | max_steps=max_steps, step_penalty=step_penalty, 346 | n_samples=n_samples, percentile=percentile, 347 | n_jobs=n_jobs, verbose=verbose) 348 | if plot_stats: 349 | save_stats(stats) 350 | 351 | pickle.dump(agent, open("agent.pkl", "wb")) 352 | 353 | if api_key is not None: 354 | env = gym.wrappers.Monitor(env, "/tmp/" + env_name, force=True) 355 | sessions = [generate_session(env, agent, int(1e10), 0.0) for _ in range(1000)] 356 | env.close() 357 | gym.upload("/tmp/" + env_name, api_key=api_key) 358 | 359 | 360 | def main(): 361 | args = _parse_args() 362 | try: 363 | layers = tuple(map(int, args.layers.split("-"))) 364 | except: 365 | layers = None 366 | run(args.env, args.num_episodes, args.max_steps, args.n_samples, args.step_penalty, 367 | args.percentile, args.features, layers, 368 | args.verbose, args.plot_stats, args.api_key, args.n_jobs, args.seed, args.resume) 369 | 370 | 371 | if __name__ == '__main__': 372 | main() 373 | -------------------------------------------------------------------------------- /CEM/continuous_cem_gym.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import gym 4 | from gym import wrappers 5 | import pickle 6 | import argparse 7 | import numpy as np 8 | from tqdm import trange 9 | from joblib import Parallel, delayed 10 | import collections 11 | from sklearn.neural_network import MLPRegressor 12 | 13 | from matplotlib import pyplot as plt 14 | 15 | plt.style.use("ggplot") 16 | 17 | 18 | def 
plot_unimetric(history, metric, save_dir): 19 | plt.figure() 20 | plt.plot(history[metric]) 21 | plt.title('model {}'.format(metric)) 22 | plt.ylabel(metric) 23 | plt.xlabel('epoch') 24 | plt.savefig("{}/{}.png".format(save_dir, metric), 25 | format='png', dpi=300) 26 | 27 | 28 | def generate_session(env, agent, t_max=int(1e5), step_penalty=0.01): 29 | states, actions = [], [] 30 | total_reward = 0 31 | 32 | s = env.reset() 33 | 34 | for t in range(t_max): 35 | a = agent.predict([s])[0] 36 | a = np.array(list(map( 37 | lambda x: min( 38 | max(x[1], env.action_space.low[x[0]]), 39 | env.action_space.high[x[0]]), 40 | enumerate(a)))) 41 | 42 | new_s, r, done, info = env.step(a) 43 | 44 | # record sessions like you did before 45 | states.append(s) 46 | actions.append(a) 47 | total_reward += r 48 | 49 | s = new_s 50 | if done: 51 | break 52 | 53 | total_reward -= t * step_penalty 54 | 55 | return states, actions, total_reward, t 56 | 57 | 58 | glob_env = None 59 | glob_agent = None 60 | 61 | 62 | def generate_parallel_session(t_max=int(1e5), step_penalty=0.01): 63 | states, actions = [], [] 64 | total_reward = 0 65 | 66 | s = glob_env.reset() 67 | 68 | for t in range(t_max): 69 | a = glob_agent.predict([s])[0] 70 | a = np.array(list(map( 71 | lambda x: min( 72 | max(x[1], glob_env.action_space.low[x[0]]), 73 | glob_env.action_space.high[x[0]]), 74 | enumerate(a)))) 75 | 76 | new_s, r, done, info = glob_env.step(a) 77 | 78 | # record sessions like you did before 79 | states.append(s) 80 | actions.append(a) 81 | total_reward += r 82 | 83 | s = new_s 84 | if done: 85 | break 86 | 87 | total_reward -= t * step_penalty 88 | 89 | return states, actions, total_reward, t 90 | 91 | 92 | def generate_parallel_sessions(n, t_max, step_penalty, n_jobs=-1): 93 | return Parallel(n_jobs)(n * [delayed(generate_parallel_session)(t_max, step_penalty)]) 94 | 95 | 96 | def cem(env, agent, num_episodes, max_steps=int(1e6), step_penalty=0.01, 97 | n_samples=200, percentile=50, n_jobs=-1, verbose=False): 98 | global glob_env, glob_agent 99 | init_n_samples = n_samples 100 | final_n_samples = n_samples // 5 101 | plays_to_decay = num_episodes // 2 102 | 103 | states_deque = collections.deque(maxlen=int(init_n_samples * 2)) 104 | actions_deque = collections.deque(maxlen=int(init_n_samples * 2)) 105 | rewards_deque = collections.deque(maxlen=int(init_n_samples * 2)) 106 | 107 | glob_env = env # NEVER DO LIKE THIS PLEASE! 
108 | glob_agent = agent 109 | 110 | # Keeps track of useful statistics 111 | history = { 112 | "threshold": np.zeros(num_episodes), 113 | "reward": np.zeros(num_episodes), 114 | "n_steps": np.zeros(num_episodes), 115 | } 116 | 117 | tr = trange( 118 | num_episodes, 119 | desc="mean reward = {:.3f}\tthreshold = {:.3f}\tmean n_steps = {:.3f}".format(0.0, 0.0, 120 | 0.0), 121 | leave=True) 122 | 123 | for i in tr: 124 | # generate new sessions 125 | # sessions = [ 126 | # generate_session(env, agent, max_steps, step_penalty) 127 | # for _ in range(n_samples)] 128 | sessions = generate_parallel_sessions(n_samples, max_steps, step_penalty, n_jobs) 129 | if i < plays_to_decay: 130 | n_samples -= (init_n_samples - final_n_samples) // plays_to_decay 131 | 132 | batch_states, batch_actions, batch_rewards, batch_steps = map(np.array, zip(*sessions)) 133 | # batch_states: a list of lists of states in each session 134 | # batch_actions: a list of lists of actions in each session 135 | # batch_rewards: a list of floats - total rewards at each session 136 | states_deque.extend(batch_states) 137 | actions_deque.extend(batch_actions) 138 | rewards_deque.extend(batch_rewards) 139 | 140 | batch_states = np.array(states_deque) 141 | batch_actions = np.array(actions_deque) 142 | batch_rewards = np.array(rewards_deque) 143 | 144 | threshold = np.percentile(batch_rewards, percentile) 145 | 146 | history["threshold"][i] = threshold 147 | history["reward"][i] = np.mean(batch_rewards) 148 | history["n_steps"][i] = np.mean(batch_steps) 149 | 150 | # look like > better, cause >= refer to reuse of bad examples 151 | if i < plays_to_decay: 152 | elite_states = batch_states[batch_rewards > threshold] 153 | elite_actions = batch_actions[batch_rewards > threshold] 154 | else: 155 | elite_states = batch_states[batch_rewards >= threshold] 156 | elite_actions = batch_actions[batch_rewards >= threshold] 157 | 158 | if len(elite_actions) > 0: 159 | elite_states, elite_actions = map(np.concatenate, [elite_states, elite_actions]) 160 | # elite_states: a list of states from top games 161 | # elite_actions: a list of actions from top games 162 | try: 163 | agent.fit(elite_states, elite_actions) 164 | except: 165 | # just a hack 166 | addition = np.array([env.reset()] * env.action_space.n) 167 | elite_states = np.vstack((elite_states, addition)) 168 | elite_actions = np.hstack((elite_actions, list(range(env.action_space.n)))) 169 | agent.fit(elite_states, elite_actions) 170 | 171 | tr.set_description( 172 | "mean reward = {:.3f}\tthreshold = {:.3f}\tmean n_steps = {:.3f}".format( 173 | np.mean(batch_rewards) + step_penalty * np.mean(batch_steps), 174 | threshold, np.mean(batch_steps))) 175 | 176 | return history 177 | 178 | 179 | def _parse_args(): 180 | parser = argparse.ArgumentParser(description='Policy iteration example') 181 | parser.add_argument( 182 | '--env', 183 | type=str, 184 | default='MountainCarContinuous-v0', # MountainCar-v0, LunarLander-v2 185 | help='The environment to use') 186 | parser.add_argument( 187 | '--num_episodes', 188 | type=int, 189 | default=200, 190 | help='Number of episodes') 191 | parser.add_argument( 192 | '--max_steps', 193 | type=int, 194 | default=int(1e5), 195 | help='Number of steps per episode') 196 | parser.add_argument( 197 | '--n_samples', 198 | type=int, 199 | default=1000, 200 | help='Games per epoch') 201 | parser.add_argument( 202 | '--step_penalty', 203 | type=float, 204 | default=0.01) 205 | parser.add_argument( 206 | '--percentile', 207 | type=int, 208 | default=80, 209 | 
help='percentile') 210 | parser.add_argument( 211 | '--verbose', 212 | action='store_true', 213 | default=False) 214 | parser.add_argument( 215 | '--plot_stats', 216 | action='store_true', 217 | default=False) 218 | parser.add_argument( 219 | '--layers', 220 | type=str, 221 | default=None) 222 | parser.add_argument( 223 | '--api_key', 224 | type=str, 225 | default=None) 226 | parser.add_argument( 227 | '--n_jobs', 228 | type=int, 229 | default=-1) 230 | parser.add_argument( 231 | '--seed', 232 | type=int, 233 | default=42) 234 | parser.add_argument( 235 | '--resume', 236 | action='store_true', 237 | default=False) 238 | 239 | args, _ = parser.parse_known_args() 240 | return args 241 | 242 | 243 | def save_stats(stats, save_dir="./"): 244 | for key in stats: 245 | plot_unimetric(stats, key, save_dir) 246 | 247 | 248 | def run(env, n_episodes=200, max_steps=int(1e5), n_samples=1000, step_penalty=0.01, 249 | percentile=80, layers=None, 250 | verbose=False, plot_stats=False, api_key=None, n_jobs=-1, seed=42, resume=False): 251 | env_name = env 252 | if env_name == "MountainCarContinuous-v0": 253 | env = gym.make(env).env 254 | else: 255 | env = gym.make(env) 256 | layers = layers or (256, 256, 128) 257 | 258 | agent = MLPRegressor( 259 | hidden_layer_sizes=layers, 260 | activation='tanh', 261 | warm_start=True, 262 | max_iter=1) 263 | agent.fit( 264 | np.zeros(env.observation_space.shape).reshape(1, -1), 265 | np.zeros(env.action_space.shape).reshape(1, -1)) 266 | 267 | if resume: 268 | agent = pickle.load(open("agent.pkl", "rb")) 269 | 270 | env.seed(seed) 271 | np.random.seed(seed) 272 | 273 | stats = cem(env, agent, n_episodes, 274 | max_steps=max_steps, step_penalty=step_penalty, 275 | n_samples=n_samples, percentile=percentile, 276 | n_jobs=n_jobs, verbose=verbose) 277 | if plot_stats: 278 | save_stats(stats) 279 | 280 | pickle.dump(agent, open("agent.pkl", "wb")) 281 | 282 | if api_key is not None: 283 | env = gym.wrappers.Monitor(env, "/tmp/" + env_name, force=True) 284 | sessions = [generate_session(env, agent, int(1e10), 0.0) for _ in range(300)] 285 | env.close() 286 | gym.upload("/tmp/" + env_name, api_key=api_key) 287 | 288 | 289 | def main(): 290 | args = _parse_args() 291 | try: 292 | layers = tuple(map(int, args.layers.split("-"))) 293 | except: 294 | layers = None 295 | run(args.env, args.num_episodes, args.max_steps, args.n_samples, args.step_penalty, 296 | args.percentile, layers, 297 | args.verbose, args.plot_stats, args.api_key, args.n_jobs, args.seed, args.resume) 298 | 299 | 300 | if __name__ == '__main__': 301 | main() 302 | -------------------------------------------------------------------------------- /DP/README.md: -------------------------------------------------------------------------------- 1 | ## Model-Based RL: Policy and Value Iteration using Dynamic Programming 2 | 3 | ### Learning Goals 4 | 5 | - Understand the difference between Policy Evaluation and Policy Improvement and how these processes interact 6 | - Understand the Policy Iteration Algorithm 7 | - Understand the Value Iteration Algorithm 8 | - Understand the Limitations of Dynamic Programming Approaches 9 | 10 | 11 | ### Summary 12 | 13 | - Dynamic Programming (DP) methods assume that we have a perfect model of the environment's Markov Decision Process (MDP). That's usually not the case in practice, but it's important to study DP anyway. 14 | - Policy Evaluation: Calculates the state-value function `V(s)` for a given policy. In DP this is done using a "full backup". 
At each state, we look ahead one step at each possible action and next state. We can only do this because we have a perfect model of the environment. 15 | - Full backups are basically the Bellman equations turned into updates. 16 | - Policy Improvement: Given the correct state-value function for a policy we can act greedily with respect to it (i.e. pick the best action at each state). Then we are guaranteed to improve the policy or keep it fixed if it's already optimal. 17 | - Policy Iteration: Iteratively perform Policy Evaluation and Policy Improvement until we reach the optimal policy. 18 | - Value Iteration: Instead of doing multiple steps of Policy Evaluation to find the "correct" V(s) we only do a single step and improve the policy immediately. In practice, this converges faster. 19 | - Generalized Policy Iteration: The process of iteratively doing policy evaluation and improvement. We can pick different algorithms for each of these steps but the basic idea stays the same. 20 | - DP methods bootstrap: They update estimates based on other estimates (one step ahead). 21 | 22 | 23 | ### Lectures & Readings 24 | 25 | **Required:** 26 | 27 | - David Silver's RL Course Lecture 3 - Planning by Dynamic Programming ([video](https://www.youtube.com/watch?v=Nd1-UUMVfz4), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/DP.pdf)) 28 | 29 | **Optional:** 30 | 31 | - [Reinforcement Learning: An Introduction](https://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf) - Chapter 4: Dynamic Programming 32 | 33 | [source](https://github.com/dennybritz/reinforcement-learning) 34 | -------------------------------------------------------------------------------- /DP/policy_iteration_gym.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import gym 4 | from gym import wrappers 5 | import argparse 6 | import numpy as np 7 | 8 | 9 | def policy_eval(policy, env, discount_factor=1.0, theta=0.00001): 10 | """ 11 | Evaluate a policy given an environment 12 | and a full description of the environment's dynamics. 13 | 14 | Args: 15 | policy: [S, A] shaped matrix representing the policy. 16 | env: OpenAI env. 17 | env.P represents the transition probabilities of the environment. 18 | env.P[s][a] is a (prob, next_state, reward, done) tuple. 19 | theta: We stop evaluation 20 | one our value function change is less than theta for all states. 21 | discount_factor: lambda discount factor. 22 | 23 | Returns: 24 | Vector of length env.observation_space.n representing the value function. 25 | """ 26 | # Start with a random (all 0) value function 27 | V = np.zeros(env.observation_space.n) 28 | while True: 29 | delta = 0 30 | # For each state, perform a "full backup" 31 | for s in range(env.observation_space.n): 32 | v = 0 33 | # Look at the possible next actions 34 | for a, action_prob in enumerate(policy[s]): 35 | # For each action, look at the possible next states... 36 | for prob, next_state, reward, done in env.P[s][a]: 37 | # Calculate the expected value 38 | v += action_prob * prob * ( 39 | reward + discount_factor * V[next_state]) 40 | # How much our value function changed (across any states) 41 | delta = max(delta, np.abs(v - V[s])) 42 | V[s] = v 43 | # Stop evaluating once our value function change is below a threshold 44 | if delta < theta: 45 | break 46 | return np.array(V) 47 | 48 | 49 | def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=1.0): 50 | """ 51 | Policy Improvement Algorithm. 
Iteratively evaluates and improves a policy
52 |     until an optimal policy is found.
53 | 
54 |     Args:
55 |         env: The OpenAI environment.
56 |         policy_eval_fn: Policy Evaluation function that takes 3 arguments:
57 |             policy, env, discount_factor.
58 |         discount_factor: Gamma discount factor.
59 | 
60 |     Returns:
61 |         A tuple (policy, V).
62 |         policy is the optimal policy,
63 |         a matrix of shape [S, A] where each state s
64 |         contains a valid probability distribution over actions.
65 |         V is the value function for the optimal policy.
66 | 
67 |     """
68 |     # Start with a random policy
69 |     policy = np.ones([env.observation_space.n, env.action_space.n]) / env.action_space.n
70 | 
71 |     while True:
72 |         # Evaluate the current policy
73 |         V = policy_eval_fn(policy, env, discount_factor)
74 | 
75 |         # Will be set to false if we make any changes to the policy
76 |         policy_stable = True
77 | 
78 |         # For each state...
79 |         for s in range(env.observation_space.n):
80 |             # The best action we would take under the current policy
81 |             chosen_a = np.argmax(policy[s])
82 | 
83 |             # Find the best action by one-step lookahead
84 |             # Ties are resolved arbitrarily
85 |             action_values = np.zeros(env.action_space.n)
86 |             for a in range(env.action_space.n):
87 |                 for prob, next_state, reward, done in env.P[s][a]:
88 |                     action_values[a] += prob * (
89 |                         reward + discount_factor * V[next_state])
90 |             best_a = np.argmax(action_values)
91 | 
92 |             # Greedily update the policy
93 |             if chosen_a != best_a:
94 |                 policy_stable = False
95 |             policy[s] = np.eye(env.action_space.n)[best_a]
96 | 
97 |         # If the policy is stable we've found an optimal policy. Return it
98 |         if policy_stable:
99 |             return policy, V
100 | 
101 | 
102 | def env_description(env, policy, v):
103 |     print("Policy Probability Distribution:")
104 |     print(policy)
105 |     print("")
106 | 
107 | 
108 | 
109 |     print("Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):")
110 |     print(np.reshape(np.argmax(policy, axis=1), (env.nrow, env.ncol)))
111 |     print("")
112 | 
113 |     print("Value Function:")
114 |     print(v)
115 |     print("")
116 | 
117 |     print("Reshaped Grid Value Function:")
118 |     print(v.reshape((env.nrow, env.ncol)))
119 |     print("")
120 | 
121 | 
122 | def env_run(env, n_episodes, policy, verbose=False):
123 |     rewards = []
124 |     for ep in range(n_episodes):
125 |         done = False
126 |         epoch_reward = 0
127 |         s = env.reset()
128 |         while not done:
129 |             if verbose:
130 |                 env.render()
131 |             action = np.argmax(policy[s])
132 |             s, reward, done, info = env.step(action)
133 |             epoch_reward += reward
134 |         rewards.append(epoch_reward)
135 |     return rewards
136 | 
137 | 
138 | def _parse_args():
139 |     parser = argparse.ArgumentParser(description='Policy iteration example')
140 |     parser.add_argument('--env',
141 |                         type=str,
142 |                         default='Taxi-v1',
143 |                         help='The environment to use')
144 |     parser.add_argument('--num_episodes',
145 |                         type=int,
146 |                         default=1000,
147 |                         help='Number of episodes')
148 |     parser.add_argument('--gamma',
149 |                         type=float,
150 |                         default=0.99,
151 |                         help='Gamma discount factor')
152 |     parser.add_argument('--verbose',
153 |                         action='store_true',
154 |                         default=False)
155 |     parser.add_argument('--api_key',
156 |                         type=str,
157 |                         default=None)
158 | 
159 |     args, _ = parser.parse_known_args()
160 |     return args
161 | 
162 | 
163 | def run(env, n_episodes, discount_factor, verbose=False, api_key=None):
164 |     env_name = env
165 |     env = gym.make(env)
166 |     if api_key is not None:
167 |         env = gym.wrappers.Monitor(env, "/tmp/" + env_name, force=True)
168 |     policy, v = 
policy_improvement(env, discount_factor=discount_factor) 169 | if verbose: 170 | try: 171 | env_description(env, policy, v) 172 | except: 173 | print("Sorry, something go wrong.") 174 | rewards = env_run(env, n_episodes, policy, verbose) 175 | print("Avg rewards over {} episodes: {:.4f} +/-{:.4f}".format( 176 | n_episodes, np.mean(rewards), np.std(rewards))) 177 | if api_key is not None: 178 | env.close() 179 | gym.upload("/tmp/" + env_name, api_key=api_key) 180 | 181 | 182 | def main(): 183 | args = _parse_args() 184 | run(args.env, args.num_episodes, args.gamma, args.verbose, args.api_key) 185 | 186 | 187 | if __name__ == '__main__': 188 | main() 189 | -------------------------------------------------------------------------------- /DP/value_iteration_gym.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import gym 4 | from gym import wrappers 5 | import argparse 6 | import numpy as np 7 | 8 | 9 | def value_iteration(env, theta=0.0001, discount_factor=1.0): 10 | """ 11 | Value Iteration Algorithm. 12 | 13 | Args: 14 | env: OpenAI environment. 15 | env.P represents the transition probabilities of the environment. 16 | theta: Stopping threshold. 17 | If the value of all states changes less than theta 18 | in one iteration we are done. 19 | discount_factor: lambda time discount factor. 20 | 21 | Returns: 22 | A tuple (policy, V) of the optimal policy and the optimal value function. 23 | """ 24 | 25 | def one_step_lookahead(state, V): 26 | """ 27 | Helper function to calculate the value for all action in a given state. 28 | 29 | Args: 30 | state: The state to consider (int) 31 | V: The value to use as an estimator, Vector of length env.observation_space.n 32 | 33 | Returns: 34 | A vector of length env.action_space.n` 35 | containing the expected value of each action. 36 | """ 37 | A = np.zeros(env.action_space.n) 38 | for a in range(env.action_space.n): 39 | for prob, next_state, reward, done in env.env.env.P[state][a]: 40 | A[a] += prob * (reward + discount_factor * V[next_state]) 41 | return A 42 | 43 | V = np.zeros(env.observation_space.n) 44 | while True: 45 | # Stopping condition 46 | delta = 0 47 | # Update each state... 
48 | for s in range(env.observation_space.n): 49 | # Do a one-step lookahead to find the best action 50 | A = one_step_lookahead(s, V) 51 | best_action_value = np.max(A) 52 | # Calculate delta across all states seen so far 53 | delta = max(delta, np.abs(best_action_value - V[s])) 54 | # Update the value function 55 | V[s] = best_action_value 56 | # Check if we can stop 57 | if delta < theta: 58 | break 59 | 60 | # Create a deterministic policy using the optimal value function 61 | policy = np.zeros([env.observation_space.n, env.action_space.n]) 62 | for s in range(env.observation_space.n): 63 | # One step lookahead to find the best action for this state 64 | A = one_step_lookahead(s, V) 65 | best_action = np.argmax(A) 66 | # Always take the best action 67 | policy[s, best_action] = 1.0 68 | 69 | return policy, V 70 | 71 | 72 | def env_description(env, policy, v): 73 | print("Policy Probability Distribution:") 74 | print(policy) 75 | print("") 76 | 77 | print("Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):") 78 | print(np.reshape(np.argmax(policy, axis=1), (env.nrow, env.ncol))) 79 | print("") 80 | 81 | print("Value Function:") 82 | print(v) 83 | print("") 84 | 85 | print("Reshaped Grid Value Function:") 86 | print(v.reshape((env.nrow, env.ncol))) 87 | print("") 88 | 89 | 90 | def env_run(env, n_episodes, policy, versbose=False): 91 | rewards = [] 92 | for ep in range(n_episodes): 93 | done = False 94 | epoch_reward = 0 95 | s = env.reset() 96 | while not done: 97 | if versbose: 98 | env.render() 99 | action = np.argmax(policy[s]) 100 | s, reward, done, info = env.step(action) 101 | epoch_reward += reward 102 | rewards.append(epoch_reward) 103 | return rewards 104 | 105 | 106 | def _parse_args(): 107 | parser = argparse.ArgumentParser(description='Policy iteration example') 108 | parser.add_argument('--env', 109 | type=str, 110 | default='Taxi-v1', 111 | help='The environment to use') 112 | parser.add_argument('--num_episodes', 113 | type=int, 114 | default=1000, 115 | help='Number of episodes') 116 | parser.add_argument('--gamma', 117 | type=float, 118 | default=0.99, 119 | help='Gamma discount factor') 120 | parser.add_argument('--verbose', 121 | action='store_true', 122 | default=False) 123 | parser.add_argument('--api_key', 124 | type=str, 125 | default=None) 126 | 127 | args, _ = parser.parse_known_args() 128 | return args 129 | 130 | 131 | def run(env, n_episodes, discount_factor, verbose=False, api_key=None): 132 | env_name = env 133 | env = gym.make(env) 134 | if api_key is not None: 135 | env = gym.wrappers.Monitor(env, "/tmp/" + env_name, force=True) 136 | policy, v = value_iteration(env, discount_factor=discount_factor) 137 | if verbose: 138 | try: 139 | env_description(env, policy, v) 140 | except: 141 | print("Sorry, something go wrong.") 142 | rewards = env_run(env, n_episodes, policy, verbose) 143 | print("Avg rewards over {} episodes: {:.4f} +/-{:.4f}".format( 144 | n_episodes, np.mean(rewards), np.std(rewards))) 145 | if api_key is not None: 146 | env.close() 147 | gym.upload("/tmp/" + env_name, api_key=api_key) 148 | 149 | 150 | def main(): 151 | args = _parse_args() 152 | run(args.env, args.num_episodes, args.gamma, args.verbose, args.api_key) 153 | 154 | 155 | if __name__ == '__main__': 156 | main() 157 | -------------------------------------------------------------------------------- /DQN/.gitignore: -------------------------------------------------------------------------------- 1 | *.pkl 2 | logs* 3 | 
-------------------------------------------------------------------------------- /DQN/README.md: -------------------------------------------------------------------------------- 1 | ## Deep Q-Learning 2 | 3 | ### Algorithms & Readings 4 | 5 | - [Deep Q-Learning (DQN)](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) 6 | - [Human-Level Control through Deep Reinforcement Learning](http://www.davidqiu.com:8888/research/nature14236.pdf) 7 | - [Deep Reinforcement Learning with Double Q-learning](http://arxiv.org/abs/1509.06461) 8 | - [Dueling Network Architectures for Deep Reinforcement Learning](https://arxiv.org/abs/1511.06581) 9 | - [Deep Recurrent Q-Learning for Partially Observable MDPs (DRQN)](https://arxiv.org/abs/1507.06527) 10 | - [Prioritized Experience Replay](http://arxiv.org/abs/1511.05952) 11 | 12 | 13 | ### Summary 14 | 15 | - DQN: Q-Learning but with a Deep Neural Network as a function approximator. 16 | - Using a non-linear Deep Neural Network is powerful, but training is unstable if we apply it naively. 17 | - Trick 1 - Experience Replay: Store experience `(S, A, R, S_next)` in a replay buffer and sample minibatches from it to train the network. This decorrelates the data and leads to better data efficiency. In the beginning, the replay buffer is filled with random experience. 18 | - Trick 2 - Target Network: Use a separate network to estimate the TD target. This target network has the same architecture as the function approximator but with frozen parameters. Every T steps (a hyperparameter) the parameters from the Q network are copied to the target network. This leads to more stable training because it keeps the target function fixed (for a while). 19 | - Double DQN: Just like regular Q-Learning, DQN tends to overestimate values due to its max operation applied to both selecting and estimating actions. We get around this by using the Q network for selection and the target network for estimation when making updates. 
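
The Double DQN target reduces to a few lines of NumPy. The sketch below is only illustrative — the function name and arguments are ours, not part of this repo — but it mirrors the selection/evaluation split that `run_dqn.py` performs inside its `update` function:

```python
import numpy as np

def double_dqn_target(q_online_next, q_target_next, rewards, dones, gamma=0.99):
    """Bootstrap target for a batch of transitions (hypothetical helper).

    q_online_next, q_target_next: [batch, n_actions] Q-values for the next states
    from the online network and the frozen target network, respectively.
    """
    # action selection with the online network ...
    best_actions = q_online_next.argmax(axis=1)
    # ... action evaluation with the target network
    bootstrap = q_target_next[np.arange(len(best_actions)), best_actions]
    # do not bootstrap through terminal states
    return rewards + gamma * (1.0 - dones.astype(np.float32)) * bootstrap
```
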
20 | -------------------------------------------------------------------------------- /DQN/dqn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import tensorflow as tf 4 | from agents.agent_states import LinearHiddenState 5 | from rstools.tf.optimization import build_model_optimization 6 | 7 | from agents.agent_networks import FeatureNet, QvalueNet, ValueNet 8 | 9 | 10 | class DqnAgent(object): 11 | def __init__(self, state_shape, n_actions, network, special=None): 12 | self.special = special or {} 13 | self.state_shape = state_shape 14 | self.n_actions = n_actions 15 | self.special = special 16 | 17 | self.scope = tf.get_variable_scope().name + "/" + special.get("scope", "dqn") \ 18 | if tf.get_variable_scope().name else special.get("scope", "dqn") 19 | 20 | with tf.variable_scope(self.scope): 21 | self._build_graph(network) 22 | 23 | def _build_graph(self, network): 24 | self.feature_net = FeatureNet( 25 | self.state_shape, network, 26 | self.special.get("feature_net", {})) 27 | 28 | self.hidden_state = LinearHiddenState( 29 | self.feature_net.feature_state, 30 | self.special.get("hidden_size", 512), 31 | self.special.get("hidden_activation", tf.nn.elu)) 32 | 33 | if self.special.get("dueling_network", False): 34 | self.qvalue_net = QvalueNet( 35 | self.hidden_state.state, self.n_actions, 36 | dict(**self.special.get("qvalue_net", {}), **{"advantage": True})) 37 | self.value_net = ValueNet( 38 | self.hidden_state.state, 39 | self.special.get("value_net", {})) 40 | 41 | # a bit hacky way 42 | self.predicted_qvalues = self.value_net.predicted_values + \ 43 | self.qvalue_net.predicted_qvalues 44 | self.predicted_qvalues_for_action = self.value_net.predicted_values_for_actions + \ 45 | self.qvalue_net.predicted_qvalues_for_actions 46 | self.agent_loss = tf.losses.mean_squared_error( 47 | labels=self.qvalue_net.td_target, 48 | predictions=self.predicted_qvalues_for_action) 49 | 50 | build_model_optimization( 51 | self.value_net, 52 | self.special.get("value_net_optimization", None), 53 | loss=self.agent_loss) 54 | else: 55 | self.qvalue_net = QvalueNet( 56 | self.hidden_state.state, self.n_actions, 57 | self.special.get("qvalue_net", {})) 58 | self.predicted_qvalues = self.qvalue_net.predicted_qvalues 59 | self.predicted_qvalues_for_action = self.qvalue_net.predicted_qvalues_for_actions 60 | self.agent_loss = self.qvalue_net.loss 61 | 62 | build_model_optimization( 63 | self.qvalue_net, 64 | self.special.get("qvalue_net_optimization", None)) 65 | 66 | build_model_optimization( 67 | self.hidden_state, 68 | self.special.get("hidden_state_optimization", None), 69 | loss=self.agent_loss) 70 | build_model_optimization( 71 | self.feature_net, 72 | self.special.get("feature_net_optimization", None), 73 | loss=self.agent_loss) 74 | 75 | def predict_qvalues(self, sess, state_batch): 76 | return sess.run( 77 | self.predicted_qvalues, 78 | feed_dict={ 79 | self.feature_net.states: state_batch, 80 | self.feature_net.is_training: False}) 81 | -------------------------------------------------------------------------------- /DQN/drqn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import tensorflow as tf 4 | 5 | from rstools.tf.optimization import build_model_optimization 6 | 7 | from agents.agent_networks import FeatureNet, QvalueNet, ValueNet 8 | from agents.agent_states import RecurrentHiddenState 9 | 10 | 11 | class DrqnAgent(object): 12 | def __init__(self, state_shape, 
n_actions, network, special=None): 13 | self.state_shape = state_shape 14 | self.n_actions = n_actions 15 | 16 | self.special = special 17 | self.scope = special.get("scope", "drqn") 18 | 19 | with tf.variable_scope(self.scope): 20 | self._build_graph(network) 21 | 22 | def _build_graph(self, network): 23 | self.feature_net = FeatureNet( 24 | self.state_shape, network, 25 | self.special.get("feature_net", {})) 26 | 27 | self.hidden_state = RecurrentHiddenState( 28 | self.feature_net.feature_state, 29 | self.special.get("hidden_size", 512), 30 | self.special.get("hidden_activation", tf.tanh), 31 | self.special.get("batch_size", 1)) 32 | 33 | if self.special.get("dueling_network", False): 34 | self.qvalue_net = QvalueNet( 35 | self.hidden_state.state, self.n_actions, 36 | dict(**self.special.get("qvalue_net", {}), **{"advantage": True})) 37 | self.value_net = ValueNet( 38 | self.hidden_state.state, 39 | self.special.get("value_net", {})) 40 | 41 | # a bit hacky way 42 | self.predicted_qvalues = self.value_net.predicted_values + \ 43 | self.qvalue_net.predicted_qvalues 44 | self.predicted_qvalues_for_action = self.value_net.predicted_values_for_actions + \ 45 | self.qvalue_net.predicted_qvalues_for_actions 46 | self.agent_loss = tf.losses.mean_squared_error( 47 | labels=self.qvalue_net.td_target, 48 | predictions=self.value_net.predicted_values_for_actions + 49 | self.qvalue_net.predicted_qvalues_for_actions) 50 | 51 | build_model_optimization( 52 | self.value_net, 53 | self.special.get("value_net_optimization", None), 54 | loss=self.agent_loss) 55 | else: 56 | self.qvalue_net = QvalueNet( 57 | self.hidden_state.state, self.n_actions, 58 | self.special.get("qvalue_net", {})) 59 | self.predicted_qvalues = self.qvalue_net.predicted_qvalues 60 | self.predicted_qvalues_for_action = self.qvalue_net.predicted_qvalues_for_actions 61 | self.agent_loss = self.qvalue_net.loss 62 | 63 | build_model_optimization( 64 | self.qvalue_net, 65 | self.special.get("qvalue_net_optimization", None)) 66 | 67 | build_model_optimization( 68 | self.hidden_state, 69 | self.special.get("hidden_state_optimization", None), 70 | loss=self.agent_loss) 71 | build_model_optimization( 72 | self.feature_net, 73 | self.special.get("feature_net_optimization", None), 74 | loss=self.agent_loss) 75 | 76 | def predict_qvalues(self, sess, state_batch): 77 | return sess.run( 78 | self.predicted_qvalues, 79 | feed_dict={ 80 | self.feature_net.states: state_batch, 81 | self.feature_net.is_training: False}) 82 | 83 | def update_belief_state(self, sess, state_batch, done_batch): 84 | _ = sess.run( 85 | self.hidden_state.belief_update, 86 | feed_dict={ 87 | self.feature_net.states: state_batch, 88 | self.hidden_state.is_end: done_batch, 89 | self.feature_net.is_training: False 90 | }) 91 | 92 | def assign_belief_state(self, sess, new_belief): 93 | _ = sess.run( 94 | self.hidden_state.belief_assign, 95 | feed_dict={ 96 | self.hidden_state.belief_out: new_belief 97 | }) 98 | 99 | def get_belief_state(self, sess): 100 | return sess.run(self.hidden_state.belief_state) 101 | -------------------------------------------------------------------------------- /DQN/run_dqn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | from rstools.utils.batch_utils import iterate_minibatches 5 | from tqdm import trange 6 | 7 | from DQN.dqn import DqnAgent 8 | from DQN.drqn import DrqnAgent 9 | from agents.agent_networks import copy_model_parameters 10 | from common.networks 
import activations 11 | from common.buffer import buffers 12 | from wrappers.gym_wrappers import Transition 13 | from wrappers.run_wrappers import typical_args, typical_argsparse, run_wrapper, update_wraper, \ 14 | epsilon_greedy_actions, play_session 15 | 16 | 17 | def update(sess, agent, target_agent, transitions, init_state=None, 18 | discount_factor=0.99, reward_norm=1.0, batch_size=32, time_major=False, 19 | replay_buffer=None): 20 | loss = 0.0 21 | if replay_buffer is not None: 22 | for transition in zip( 23 | transitions.state, transitions.action, transitions.reward, 24 | transitions.next_state, transitions.done.astype(np.float32)): 25 | replay_buffer.add(*transition) 26 | states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size) 27 | transitions = Transition( 28 | state=states, action=actions, reward=rewards, 29 | next_state=next_states, done=dones.astype(bool)) 30 | 31 | time_len = transitions.state.shape[0] 32 | transitions_it = zip( 33 | iterate_minibatches(transitions.state, batch_size), 34 | iterate_minibatches(transitions.action, batch_size), 35 | iterate_minibatches(transitions.reward, batch_size), 36 | iterate_minibatches(transitions.next_state, batch_size), 37 | iterate_minibatches(transitions.done, batch_size)) 38 | 39 | for states, actions, rewards, next_states, dones in transitions_it: 40 | qvalues_next = agent.predict_qvalues(sess, next_states) 41 | best_actions = qvalues_next.argmax(axis=1) 42 | qvalues_next_target = target_agent.predict_qvalues(sess, next_states) 43 | qvalues_next_target = qvalues_next_target[np.arange(batch_size), best_actions] 44 | 45 | td_target = rewards * reward_norm + \ 46 | np.invert(dones).astype(np.float32) * \ 47 | discount_factor * qvalues_next_target 48 | 49 | run_params = [ 50 | agent.qvalue_net.loss, 51 | agent.qvalue_net.train_op, agent.hidden_state.train_op, agent.feature_net.train_op 52 | ] 53 | 54 | feed_params = { 55 | agent.feature_net.states: states, 56 | agent.feature_net.is_training: True, 57 | agent.qvalue_net.actions: actions, 58 | agent.qvalue_net.td_target: td_target, 59 | agent.qvalue_net.is_training: True, 60 | } 61 | 62 | if agent.special.get("dueling_network", False): 63 | run_params[0] = agent.agent_loss 64 | run_params += [agent.value_net.train_op] 65 | feed_params[agent.value_net.td_target] = td_target # @TODO: why need to feed? 
66 | feed_params[agent.value_net.is_training] = True 67 | 68 | if isinstance(agent, DrqnAgent): 69 | run_params += [agent.hidden_state.belief_update] 70 | feed_params[agent.hidden_state.is_end] = dones 71 | 72 | run_results = sess.run( 73 | run_params, 74 | feed_dict=feed_params) 75 | 76 | batch_loss = run_results[0] 77 | loss += batch_loss 78 | return loss / time_len 79 | 80 | 81 | def generate_sessions( 82 | sess, agent, target_agent, env_pool, update_fn, 83 | t_max=1000, epsilon=0.01): 84 | total_reward = 0.0 85 | total_qvalue_loss = 0.0 86 | total_games = 0.0 87 | 88 | states = env_pool.pool_states() 89 | for t in range(t_max): 90 | actions = epsilon_greedy_actions(agent, sess, states, epsilon=epsilon) 91 | next_states, rewards, dones, _ = env_pool.step(actions) 92 | transition = Transition( 93 | state=states, action=actions, reward=rewards, next_state=next_states, done=dones) 94 | total_qvalue_loss += update_fn(sess, agent, target_agent, transition) 95 | 96 | states = next_states 97 | 98 | total_reward += rewards.sum() 99 | total_games += dones.sum() 100 | 101 | if env_pool.n_envs == 1 and total_games > 0: 102 | break 103 | 104 | return total_reward / env_pool.n_envs, \ 105 | total_qvalue_loss / t, \ 106 | t / (total_games / env_pool.n_envs) 107 | 108 | 109 | def dqn_learning( 110 | sess, agent, env, update_fn, 111 | n_epochs=1000, n_sessions=100, t_max=1000, 112 | initial_epsilon=0.5, final_epsilon=0.01, 113 | use_target_net=False, copy_n_epoch=5): 114 | tr = trange( 115 | n_epochs, 116 | desc="", 117 | leave=True) 118 | 119 | if use_target_net: 120 | agent, target_agent = agent 121 | # copy_model_parameters(sess, agent, target_agent) 122 | else: 123 | target_agent = agent 124 | 125 | history = { 126 | "reward": np.zeros(n_epochs), 127 | "qvalue_loss": np.zeros(n_epochs), 128 | "steps": np.zeros(n_epochs), 129 | "epsilon": np.zeros(n_epochs) 130 | } 131 | 132 | epsilon = initial_epsilon 133 | n_epochs_decay = n_epochs * 0.8 134 | 135 | for i in tr: 136 | sessions = [ 137 | generate_sessions( 138 | sess, agent, target_agent, env, update_fn, t_max=t_max, epsilon=epsilon) 139 | for _ in range(n_sessions)] 140 | session_rewards, session_qvalue_loss, session_steps = map(np.array, zip(*sessions)) 141 | 142 | history["reward"][i] = np.mean(session_rewards) 143 | history["qvalue_loss"][i] = np.mean(session_qvalue_loss) 144 | history["steps"][i] = np.mean(session_steps) 145 | history["epsilon"][i] = epsilon 146 | 147 | if i < n_epochs_decay: 148 | epsilon -= (initial_epsilon - final_epsilon) / float(n_epochs_decay) 149 | 150 | if use_target_net and (i + 1) % copy_n_epoch == 0: 151 | copy_model_parameters(sess, agent, target_agent) 152 | 153 | desc = "\t".join( 154 | ["{} = {:.3f}".format(key, value[i]) for key, value in history.items()]) 155 | tr.set_description(desc) 156 | 157 | if use_target_net: 158 | copy_model_parameters(sess, agent, target_agent) 159 | 160 | return history 161 | 162 | 163 | def run(env_name, make_env_fn, agent_cls, 164 | run_args, update_args, agent_agrs, 165 | log_dir=None, episode_limit=None, 166 | plot_stats=False, api_key=None, 167 | load=False, gpu_option=0.4, 168 | n_games=10, 169 | use_target_net=False): 170 | run_wrapper( 171 | n_games, dqn_learning, 172 | update_wraper(update, **update_args), 173 | play_session, epsilon_greedy_actions, 174 | env_name, make_env_fn, agent_cls, 175 | run_args, agent_agrs, 176 | log_dir=log_dir, episode_limit=episode_limit, 177 | plot_stats=plot_stats, api_key=api_key, 178 | load=load, gpu_option=gpu_option, 179 | 
use_target_network=use_target_net) 180 | 181 | 182 | def _parse_args(): 183 | parser = argparse.ArgumentParser(description='DQN Agent Learning') 184 | # typical params 185 | parser.add_argument( 186 | '--agent', 187 | type=str, 188 | default="dqn", 189 | choices=["dqn", "drqn"], 190 | help='Which agent to use. (default: %(default)s)') 191 | 192 | parser.add_argument( 193 | '--replay_buffer', 194 | type=str, 195 | choices=["none", "simple", "prioritized"], 196 | default="none", 197 | help="ReplayBuffer to use for training") 198 | parser.add_argument( 199 | '--replay_buffer_size', 200 | type=int, 201 | default=5000, 202 | help="Number of transitions to store in replay buffer.") 203 | 204 | # special exploration params 205 | parser.add_argument( 206 | '--initial_epsilon', 207 | type=float, 208 | default=0.5, 209 | help='DQN exploration: initial epsilon. (default: %(default)s)') 210 | parser.add_argument( 211 | '--final_epsilon', 212 | type=float, 213 | default=0.01, 214 | help='DQN exploration: final epsilon at 0.8*epochs. (default: %(default)s)') 215 | 216 | parser.add_argument( 217 | '--copy_n_epoch', 218 | type=int, 219 | default=5, 220 | help='Target DQN: copy parameters every %(default)s epoch') 221 | 222 | # special optimization params 223 | parser.add_argument( 224 | '--qvalue_lr', 225 | type=float, 226 | default=1e-5, 227 | help='Learning rate for qvalue network. (default: %(default)s)') 228 | parser.add_argument( 229 | '--value_lr', 230 | type=float, 231 | default=1e-5, 232 | help='Learning rate for value network. (default: %(default)s)') 233 | 234 | # agent special params & optimization 235 | parser.add_argument( 236 | '--use_target_net', 237 | action='store_true', 238 | default=False, 239 | help='Flag for target network use.') 240 | parser.add_argument( 241 | '--dueling_dqn', 242 | action='store_true', 243 | default=False, 244 | help='Flag for dueling network architecture use.') 245 | 246 | parser = typical_args(parser) 247 | 248 | args = parser.parse_args() 249 | return args 250 | 251 | 252 | def main(): 253 | args = _parse_args() 254 | 255 | network, run_args, update_args, optimization_params, make_env_fn = typical_argsparse(args) 256 | 257 | special_run_args = { 258 | "use_target_net": args.use_target_net, 259 | "initial_epsilon": args.initial_epsilon, 260 | "final_epsilon": args.final_epsilon, 261 | "copy_n_epoch": args.copy_n_epoch 262 | } 263 | run_args = {**run_args, **special_run_args} 264 | 265 | buffer = buffers[args.replay_buffer](args.replay_buffer_size) \ 266 | if args.replay_buffer != "none" \ 267 | else None 268 | special_update_args = { 269 | "replay_buffer": buffer 270 | } 271 | 272 | update_args = {**update_args, **special_update_args} 273 | 274 | qvalue_optimization_params = { 275 | **optimization_params, 276 | **{"initial_lr": args.qvalue_lr} 277 | } 278 | value_optimization_params = { 279 | **optimization_params, 280 | **{"initial_lr": args.value_lr} 281 | } 282 | 283 | agent_cls = DqnAgent if args.agent == "dqn" else DrqnAgent 284 | 285 | special = { 286 | "dueling_network": args.dueling_dqn, 287 | "hidden_size": args.hidden_size, 288 | "hidden_activation": activations[args.hidden_activation], 289 | "feature_net_optimization": optimization_params, 290 | "hidden_state_optimization": optimization_params, 291 | "value_net_optimization": value_optimization_params, 292 | "qvalue_net_optimization": qvalue_optimization_params, 293 | } 294 | 295 | agent_args = { 296 | "network": network, 297 | "special": special 298 | } 299 | 300 | run(args.env, make_env_fn, 
agent_cls, 301 | run_args, update_args, agent_args, 302 | args.log_dir, args.episode_limit, 303 | args.plot_history, args.api_key, 304 | args.load, args.gpu_option, 305 | args.n_games, 306 | args.use_target_net) 307 | 308 | 309 | if __name__ == '__main__': 310 | main() 311 | -------------------------------------------------------------------------------- /FA/.gitignore: -------------------------------------------------------------------------------- 1 | *.pkl 2 | logs* 3 | -------------------------------------------------------------------------------- /FA/README.md: -------------------------------------------------------------------------------- 1 | ## Function Approximation 2 | 3 | ### Learning Goals 4 | 5 | - Understand the motivation for Function Approximation over Table Lookup 6 | - Understand how to incorporate function approximation into existing algorithms 7 | - Understand convergence properties of function approximators and RL algorithms 8 | - Understand batching using experience replay 9 | 10 | 11 | ### Summary 12 | 13 | - Building a big table, one value for each state or state-action pair, is memory- and data-inefficient. Function Approximation can generalize to unseen states by using a featurized state representation. 14 | - Treat RL as a supervised learning problem with the MC- or TD-target as the label and the current state/action as the input. Often the target also depends on the function estimator, but we simply ignore its gradient. That's why these methods are called semi-gradient methods (see the sketch below). 15 | - Challenge: We have non-stationary (policy changes, bootstrapping) and non-iid (correlated in time) data. 16 | - Many methods assume that our action space is discrete because they rely on calculating the argmax over all actions. Large and continuous action spaces are ongoing research. 17 | - For Control, very few convergence guarantees exist. For non-linear approximators there are basically no guarantees at all. But it works in practice. 18 | - Experience Replay: Store experience as a dataset, randomize it, and repeatedly apply minibatch SGD. 19 | - Tricks to stabilize non-linear function approximators: Fixed Targets. The target is calculated based on frozen parameter values from a previous time step. 20 | - For the non-episodic (continuing) case function approximation is more complex and we need to give up discounting and use an "average reward" formulation.
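To make the semi-gradient idea above concrete, here is a minimal, self-contained sketch (not taken from this repository) of one Q-learning update with a linear function approximator; the names `w`, `phi_s`, `phi_next` and the hyperparameter values are assumptions made up for the example.

```python
import numpy as np


def semi_gradient_q_learning_step(w, phi_s, action, reward, phi_next, done,
                                  alpha=0.01, gamma=0.99):
    """One semi-gradient Q-learning step for Q(s, a) = w[a] . phi(s).

    w        : weights, shape (n_actions, n_features)
    phi_s    : features of the current state, shape (n_features,)
    phi_next : features of the next state, shape (n_features,)
    """
    # The TD target bootstraps from the current estimate of the next state's
    # values, but we treat it as a constant label (no gradient through it).
    td_target = reward + (0.0 if done else gamma * np.max(w @ phi_next))
    td_error = td_target - w[action] @ phi_s
    # For a linear approximator, grad_w[a] Q(s, a) is simply phi(s).
    w[action] += alpha * td_error * phi_s
    return w
```

The same loop structure appears in `q_learning_gym.py`, where the linear model and the RBF features are provided by scikit-learn's `SGDRegressor` and `RBFSampler` instead of an explicit weight matrix.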
21 | 22 | 23 | ### Lectures & Readings 24 | 25 | **Required:** 26 | 27 | - David Silver's RL Course Lecture 6 - Value Function Approximation ([video](https://www.youtube.com/watch?v=UoPei5o4fps), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/FA.pdf)) 28 | - [Reinforcement Learning: An Introduction](https://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf) - Chapter 9: On-policy Prediction with Approximation 29 | - [Reinforcement Learning: An Introduction](https://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf) - Chapter 10: On-policy Control with Approximation 30 | 31 | **Optional:** 32 | 33 | - [Tutorial: Introduction to Reinforcement Learning with Function Approximation](https://www.youtube.com/watch?v=ggqnxyjaKe4) 34 | 35 | 36 | ### Exercises 37 | 38 | - Solve Mountain Car Problem using Q-Learning with Linear Function Approximation 39 | - [Exercise](Q-Learning with Value Function Approximation.ipynb) 40 | - [Solution](Q-Learning with Value Function Approximation Solution.ipynb) 41 | -------------------------------------------------------------------------------- /FA/q_learning_gym.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import gym 4 | from gym import wrappers 5 | import sys 6 | import argparse 7 | import numpy as np 8 | import sklearn.pipeline 9 | import sklearn.preprocessing 10 | from sklearn.linear_model import SGDRegressor 11 | from sklearn.kernel_approximation import RBFSampler 12 | 13 | from matplotlib import pyplot as plt 14 | 15 | plt.style.use("ggplot") 16 | 17 | 18 | def plot_unimetric(history, metric, save_dir): 19 | plt.figure() 20 | plt.plot(history[metric]) 21 | plt.title('model {}'.format(metric)) 22 | plt.ylabel(metric) 23 | plt.xlabel('epoch') 24 | plt.savefig("{}/{}.png".format(save_dir, metric), 25 | format='png', dpi=300) 26 | 27 | 28 | def make_epsilon_greedy_policy(estimator, epsilon, nA): 29 | """ 30 | Creates an epsilon-greedy policy based on a given Q-function approximator and epsilon. 31 | 32 | Args: 33 | estimator: An estimator that returns q values for a given state 34 | epsilon: The probability to select a random action . float between 0 and 1. 35 | nA: Number of actions in the environment. 36 | 37 | Returns: 38 | A function that takes the observation as an argument and returns 39 | the probabilities for each action in the form of a numpy array of length nA. 40 | 41 | """ 42 | 43 | def policy_fn(observation): 44 | A = np.ones(nA, dtype=float) * epsilon / nA 45 | q_values = estimator.predict(observation) 46 | best_action = np.argmax(q_values) 47 | A[best_action] += (1.0 - epsilon) 48 | return A 49 | 50 | return policy_fn 51 | 52 | 53 | class Estimator(object): 54 | """ 55 | Value Function approximator. 56 | """ 57 | 58 | def __init__(self, env): 59 | self._prepare_estimator_for_env(env) 60 | # We create a separate model for each action in the environment's 61 | # action space. Alternatively we could somehow encode the action 62 | # into the features, but this way it's easier to code up. 63 | self.models = [] 64 | for _ in range(env.action_space.n): 65 | model = SGDRegressor(learning_rate="constant") 66 | # We need to call partial_fit once to initialize the model 67 | # or we get a NotFittedError when trying to make a prediction 68 | # This is quite hacky. 
69 | model.partial_fit([self.featurize_state(env.reset())], [0]) 70 | self.models.append(model) 71 | 72 | def _prepare_estimator_for_env(self, env): 73 | observation_examples = np.array( 74 | [env.observation_space.sample() for _ in range(1000)]) 75 | observation_examples = self._vectorise_state(observation_examples) 76 | 77 | scaler = sklearn.preprocessing.StandardScaler() 78 | scaler.fit(observation_examples) 79 | self.scaler = scaler 80 | 81 | featurizer = sklearn.pipeline.FeatureUnion([ 82 | ("rbf1", RBFSampler(gamma=5.0, n_components=100)), 83 | ("rbf2", RBFSampler(gamma=2.0, n_components=100)), 84 | ("rbf3", RBFSampler(gamma=1.0, n_components=100)), 85 | ("rbf4", RBFSampler(gamma=0.5, n_components=100)) 86 | ]) 87 | featurizer.fit(scaler.transform(observation_examples)) 88 | self.featurizer = featurizer 89 | 90 | def _vectorise_state(self, states): 91 | obs_shape = states.shape 92 | if len(obs_shape) > 2: 93 | states = states.reshape((obs_shape[0], -1)) 94 | return states 95 | 96 | def featurize_state(self, state): 97 | """ 98 | Returns the featurized representation for a state. 99 | """ 100 | state = self._vectorise_state(np.array([state])) 101 | scaled = self.scaler.transform(state) 102 | featurized = self.featurizer.transform(scaled) 103 | return featurized[0] 104 | 105 | def predict(self, s, a=None): 106 | """ 107 | Makes value function predictions. 108 | 109 | Args: 110 | s: state to make a prediction for 111 | a: (Optional) action to make a prediction for 112 | 113 | Returns 114 | If an action a is given this returns a single number as the prediction. 115 | If no action is given this returns a vector or predictions for all actions 116 | in the environment where pred[i] is the prediction for action i. 117 | 118 | """ 119 | features = self.featurize_state(s) 120 | return self.models[a].predict([features])[0] if a \ 121 | else np.array([model.predict([features])[0] for model in self.models]) 122 | 123 | def update(self, s, a, y): 124 | """ 125 | Updates the estimator parameters for a given state and action towards 126 | the target y. 127 | """ 128 | features = self.featurize_state(s) 129 | self.models[a].partial_fit([features], [y]) 130 | 131 | 132 | def q_learning(env, estimator, num_episodes, discount_factor=1.0, epsilon=0.1, epsilon_decay=1.0, 133 | verbose=False): 134 | """ 135 | Q-Learning algorithm for fff-policy TD control using Function Approximation. 136 | Finds the optimal greedy policy while following an epsilon-greedy policy. 137 | 138 | Args: 139 | env: OpenAI environment. 140 | estimator: Action-Value function estimator 141 | num_episodes: Number of episodes to run for. 142 | discount_factor: Lambda time discount factor. 143 | epsilon: Chance the sample a random action. Float betwen 0 and 1. 144 | epsilon_decay: Each episode, epsilon is decayed by this factor 145 | 146 | Returns: 147 | An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards. 148 | """ 149 | 150 | # Keeps track of useful statistics 151 | episode_lengths = np.zeros(num_episodes) 152 | episode_rewards = np.zeros(num_episodes) 153 | 154 | for i_episode in range(num_episodes): 155 | 156 | # The policy we're following 157 | policy = make_epsilon_greedy_policy( 158 | estimator, epsilon * epsilon_decay ** i_episode, env.action_space.n) 159 | 160 | # Print out which episode we're on, useful for debugging. 
161 | # Also print reward for last episode 162 | if verbose: 163 | last_reward = episode_rewards[i_episode - 1] 164 | print("\rEpisode {}/{} ({})".format(i_episode + 1, num_episodes, last_reward), end="") 165 | sys.stdout.flush() 166 | 167 | state = env.reset() 168 | n_action = None 169 | 170 | len_counter = 0 171 | reward_counter = 0 172 | done = False 173 | while not done: 174 | if verbose: 175 | pass 176 | # env.render() 177 | if n_action is None: 178 | probs = policy(state) 179 | action = np.random.choice(np.arange(len(probs)), p=probs) 180 | else: 181 | action = n_action 182 | 183 | n_state, reward, done, info = env.step(action) 184 | reward_counter += reward 185 | len_counter += 1 186 | 187 | q_val_next = estimator.predict(n_state) 188 | td_target = reward + discount_factor * np.max(q_val_next) 189 | 190 | estimator.update(state, action, td_target) 191 | 192 | state = n_state 193 | 194 | episode_rewards[i_episode] = reward_counter 195 | episode_lengths[i_episode] = len_counter 196 | 197 | return {"episode_rewards": episode_rewards, "episode_lengths": episode_lengths} 198 | 199 | 200 | def _parse_args(): 201 | parser = argparse.ArgumentParser(description='Policy iteration example') 202 | parser.add_argument( 203 | '--env', 204 | type=str, 205 | default='MountainCar-v0', # CartPole-v0, MountainCar-v0 206 | help='The environment to use') 207 | parser.add_argument( 208 | '--num_episodes', 209 | type=int, 210 | default=1000, 211 | help='Number of episodes') 212 | parser.add_argument( 213 | '--gamma', 214 | type=float, 215 | default=0.99, 216 | help='Gamma discount factor') 217 | parser.add_argument( 218 | '--verbose', 219 | action='store_true', 220 | default=False) 221 | parser.add_argument( 222 | '--plot_stats', 223 | action='store_true', 224 | default=False) 225 | parser.add_argument( 226 | '--api_key', 227 | type=str, 228 | default=None) 229 | 230 | args, _ = parser.parse_known_args() 231 | return args 232 | 233 | 234 | def save_stats(stats, save_dir="./"): 235 | for key in stats: 236 | plot_unimetric(stats, key, save_dir) 237 | 238 | 239 | def run(env, n_episodes, discount_factor, verbose=False, plot_stats=False, api_key=None): 240 | env_name = env 241 | env = gym.make(env) 242 | 243 | estimator = Estimator(env) 244 | 245 | if api_key is not None: 246 | env = gym.wrappers.Monitor(env, "/tmp/" + env_name, force=True) 247 | 248 | stats = q_learning(env, estimator, n_episodes, 249 | discount_factor=discount_factor, epsilon=0.0, 250 | verbose=verbose) 251 | if plot_stats: 252 | save_stats(stats) 253 | 254 | if api_key is not None: 255 | env.close() 256 | gym.upload("/tmp/" + env_name, api_key=api_key) 257 | 258 | 259 | def main(): 260 | args = _parse_args() 261 | run(args.env, args.num_episodes, args.gamma, 262 | args.verbose, args.plot_stats, args.api_key) 263 | 264 | 265 | if __name__ == '__main__': 266 | main() 267 | -------------------------------------------------------------------------------- /GEN/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Scitator/rl-course-experiments/47e7666e2bc618ca4007372f411c3626fd7efc2b/GEN/README.md -------------------------------------------------------------------------------- /GEN/genetic_gym.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import gym 4 | from gym import wrappers 5 | import argparse 6 | import numpy as np 7 | import random 8 | from tqdm import trange 9 | 10 | 11 | def get_random_policy(env): 
12 | """ 13 | Build a numpy array representing agent policy. 14 | This array must have one element per each of 16 environment states. 15 | Element must be an integer from 0 to 3, representing action 16 | to take from that state. 17 | """ 18 | return np.random.randint(0, int(env.action_space.n), int(env.observation_space.n)) 19 | 20 | 21 | def sample_reward(env, policy, t_max=100): 22 | """ 23 | Interact with an environment, return sum of all rewards. 24 | If game doesn't end on t_max (e.g. agent walks into a wall), 25 | force end the game and return whatever reward you got so far. 26 | Tip: see signature of env.step(...) method above. 27 | """ 28 | s = env.reset() 29 | total_reward = 0 30 | 31 | for _ in range(t_max): 32 | action = policy[s] 33 | s, reward, done, info = env.step(action) 34 | total_reward += reward 35 | if done: 36 | break 37 | return total_reward 38 | 39 | 40 | def evaluate(sample_func, env, policy, n_times=100): 41 | """Run several evaluations and average the score the policy gets.""" 42 | rewards = [sample_func(env, policy) for _ in range(n_times)] 43 | return float(np.mean(rewards)) 44 | 45 | 46 | def crossover(env, policy1, policy2, p=0.5, prioritize_func=None): 47 | """ 48 | for each state, with probability p take action from policy1, else policy2 49 | """ 50 | if prioritize_func is not None: 51 | p = prioritize_func(env, policy1, policy2, p) 52 | return np.choose( 53 | (np.random.random_sample(policy1.shape[0]) <= p).astype(int), [policy1, policy2]) 54 | 55 | 56 | def mutation(env, policy, p=0.1): 57 | """ 58 | for each state, with probability p replace action with random action 59 | Tip: mutation can be written as crossover with random policy 60 | """ 61 | return crossover(env, get_random_policy(env), policy, p) 62 | 63 | 64 | def run(env, n_episodes, max_steps, 65 | pool_size, n_crossovers, n_mutations, 66 | seed=42, verbose=False, api_key=None): 67 | random.seed(seed) 68 | np.random.seed(seed) 69 | 70 | env_name = env 71 | env = gym.make(env).env 72 | env.reset() 73 | 74 | if api_key is not None: 75 | env = gym.wrappers.Monitor(env, "/tmp/" + env_name, force=True) 76 | 77 | if verbose: 78 | print("initializing...") 79 | pool = [get_random_policy(env) for _ in range(pool_size)] 80 | 81 | rewards = np.zeros(n_episodes) 82 | 83 | tr = trange( 84 | n_episodes, 85 | desc="best score: {:.4}".format(0.0), 86 | leave=True) 87 | 88 | def sample_func(env, policy): 89 | return sample_reward( 90 | env, policy, t_max=max_steps if api_key is None else int(1e10)) 91 | 92 | def prioritize_func(env, policy1, policy2, p): 93 | return min( 94 | p * evaluate(sample_func, env, policy1) / (evaluate(sample_func, env, policy2) + 0.001), 95 | 1.0) 96 | 97 | for i_epoch in tr: 98 | crossovered = [ 99 | crossover(env, random.choice(pool), random.choice(pool), 100 | prioritize_func=prioritize_func) 101 | for _ in range(n_crossovers)] 102 | mutated = [mutation(env, random.choice(pool)) for _ in range(n_mutations)] 103 | 104 | assert type(crossovered) == type(mutated) == list 105 | 106 | # add new policies to the pool 107 | pool = pool + crossovered + mutated 108 | pool_scores = list(map(lambda x: evaluate(sample_func, env, x), pool)) 109 | 110 | # select pool_size best policies 111 | selected_indices = np.argsort(pool_scores)[-pool_size:] 112 | pool = [pool[i] for i in selected_indices] 113 | pool_scores = [pool_scores[i] for i in selected_indices] 114 | 115 | # print the best policy so far (last in ascending score order) 116 | tr.set_description("best score: {:.4}".format(pool_scores[-1])) 
117 | rewards[i_epoch] = pool_scores[-1] 118 | 119 | print("Avg rewards over {} episodes: {:.4f} +/-{:.4f}".format( 120 | n_episodes, np.mean(rewards), np.std(rewards))) 121 | if api_key is not None: 122 | env.close() 123 | gym.upload("/tmp/" + env_name, api_key=api_key) 124 | 125 | 126 | def _parse_args(): 127 | parser = argparse.ArgumentParser(description='Policy iteration example') 128 | parser.add_argument( 129 | '--env', 130 | type=str, 131 | default='FrozenLake8x8-v0', 132 | help='The environment to use') 133 | parser.add_argument( 134 | '--num_episodes', 135 | type=int, 136 | default=200, 137 | help='Number of episodes') 138 | parser.add_argument( 139 | '--max_steps', 140 | type=int, 141 | default=200, 142 | help='Max number per episode') 143 | parser.add_argument( 144 | '--pool_size', 145 | type=int, 146 | default=200, 147 | help='Population size') 148 | parser.add_argument( 149 | '--n_crossovers', 150 | type=int, 151 | default=100, 152 | help='Number of crossovers per episode') 153 | parser.add_argument( 154 | '--n_mutations', 155 | type=int, 156 | default=100, 157 | help='Number of mutations per episode') 158 | parser.add_argument( 159 | '--seed', 160 | type=int, 161 | default=42) 162 | parser.add_argument( 163 | '--verbose', 164 | action='store_true', 165 | default=False) 166 | parser.add_argument( 167 | '--api_key', 168 | type=str, 169 | default=None) 170 | 171 | args, _ = parser.parse_known_args() 172 | return args 173 | 174 | 175 | def main(): 176 | args = _parse_args() 177 | run(args.env, args.num_episodes, args.max_steps, 178 | args.pool_size, args.n_crossovers, args.n_mutations, 179 | args.seed, args.verbose, args.api_key) 180 | 181 | 182 | if __name__ == '__main__': 183 | main() 184 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Sergey Kolesnikov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MC/.directory: -------------------------------------------------------------------------------- 1 | [Dolphin] 2 | HeaderColumnWidths=570,72,107 3 | Timestamp=2016,12,13,9,50,27 4 | Version=3 5 | ViewMode=1 6 | -------------------------------------------------------------------------------- /MC/Blackjack Playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 419, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import sys\n", 13 | "if \"../\" not in sys.path:\n", 14 | " sys.path.append(\"../\") \n", 15 | "from lib.envs.blackjack import BlackjackEnv" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 420, 21 | "metadata": { 22 | "collapsed": false 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "env = BlackjackEnv()" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 422, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "Player Score: 17 (Usable Ace: False), Dealer Score: 10\n", 41 | "Taking action: Hit\n", 42 | "Player Score: 18 (Usable Ace: False), Dealer Score: 10\n", 43 | "Taking action: Hit\n", 44 | "Player Score: 28 (Usable Ace: False), Dealer Score: 10\n", 45 | "Game end. Reward: -1.0\n", 46 | "\n", 47 | "Player Score: 6 (Usable Ace: False), Dealer Score: 9\n", 48 | "Taking action: Hit\n", 49 | "Player Score: 16 (Usable Ace: False), Dealer Score: 9\n", 50 | "Taking action: Hit\n", 51 | "Player Score: 26 (Usable Ace: False), Dealer Score: 9\n", 52 | "Game end. Reward: -1.0\n", 53 | "\n", 54 | "Player Score: 12 (Usable Ace: False), Dealer Score: 6\n", 55 | "Taking action: Hit\n", 56 | "Player Score: 21 (Usable Ace: False), Dealer Score: 6\n", 57 | "Taking action: Stick\n", 58 | "Player Score: 21 (Usable Ace: False), Dealer Score: 6\n", 59 | "Game end. Reward: 1.0\n", 60 | "\n", 61 | "Player Score: 17 (Usable Ace: True), Dealer Score: 8\n", 62 | "Taking action: Hit\n", 63 | "Player Score: 17 (Usable Ace: False), Dealer Score: 8\n", 64 | "Taking action: Hit\n", 65 | "Player Score: 22 (Usable Ace: False), Dealer Score: 8\n", 66 | "Game end. Reward: -1.0\n", 67 | "\n", 68 | "Player Score: 17 (Usable Ace: False), Dealer Score: 8\n", 69 | "Taking action: Hit\n", 70 | "Player Score: 27 (Usable Ace: False), Dealer Score: 8\n", 71 | "Game end. Reward: -1.0\n", 72 | "\n", 73 | "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", 74 | "Taking action: Hit\n", 75 | "Player Score: 19 (Usable Ace: False), Dealer Score: 10\n", 76 | "Taking action: Hit\n", 77 | "Player Score: 28 (Usable Ace: False), Dealer Score: 10\n", 78 | "Game end. Reward: -1.0\n", 79 | "\n", 80 | "Player Score: 13 (Usable Ace: False), Dealer Score: 7\n", 81 | "Taking action: Hit\n", 82 | "Player Score: 14 (Usable Ace: False), Dealer Score: 7\n", 83 | "Taking action: Hit\n", 84 | "Player Score: 24 (Usable Ace: False), Dealer Score: 7\n", 85 | "Game end. Reward: -1.0\n", 86 | "\n", 87 | "Player Score: 17 (Usable Ace: False), Dealer Score: 5\n", 88 | "Taking action: Hit\n", 89 | "Player Score: 25 (Usable Ace: False), Dealer Score: 5\n", 90 | "Game end. 
Reward: -1.0\n", 91 | "\n", 92 | "Player Score: 20 (Usable Ace: False), Dealer Score: 5\n", 93 | "Taking action: Stick\n", 94 | "Player Score: 20 (Usable Ace: False), Dealer Score: 5\n", 95 | "Game end. Reward: 1.0\n", 96 | "\n", 97 | "Player Score: 12 (Usable Ace: True), Dealer Score: 10\n", 98 | "Taking action: Hit\n", 99 | "Player Score: 20 (Usable Ace: True), Dealer Score: 10\n", 100 | "Taking action: Stick\n", 101 | "Player Score: 20 (Usable Ace: True), Dealer Score: 10\n", 102 | "Game end. Reward: 0.0\n", 103 | "\n", 104 | "Player Score: 12 (Usable Ace: False), Dealer Score: 10\n", 105 | "Taking action: Hit\n", 106 | "Player Score: 19 (Usable Ace: False), Dealer Score: 10\n", 107 | "Taking action: Hit\n", 108 | "Player Score: 24 (Usable Ace: False), Dealer Score: 10\n", 109 | "Game end. Reward: -1.0\n", 110 | "\n", 111 | "Player Score: 19 (Usable Ace: False), Dealer Score: 4\n", 112 | "Taking action: Hit\n", 113 | "Player Score: 22 (Usable Ace: False), Dealer Score: 4\n", 114 | "Game end. Reward: -1.0\n", 115 | "\n", 116 | "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", 117 | "Taking action: Hit\n", 118 | "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", 119 | "Taking action: Stick\n", 120 | "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", 121 | "Game end. Reward: 0.0\n", 122 | "\n", 123 | "Player Score: 4 (Usable Ace: False), Dealer Score: 3\n", 124 | "Taking action: Hit\n", 125 | "Player Score: 14 (Usable Ace: False), Dealer Score: 3\n", 126 | "Taking action: Hit\n", 127 | "Player Score: 24 (Usable Ace: False), Dealer Score: 3\n", 128 | "Game end. Reward: -1.0\n", 129 | "\n", 130 | "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", 131 | "Taking action: Stick\n", 132 | "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", 133 | "Game end. Reward: 1.0\n", 134 | "\n", 135 | "Player Score: 16 (Usable Ace: True), Dealer Score: 10\n", 136 | "Taking action: Hit\n", 137 | "Player Score: 12 (Usable Ace: False), Dealer Score: 10\n", 138 | "Taking action: Hit\n", 139 | "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", 140 | "Taking action: Stick\n", 141 | "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", 142 | "Game end. Reward: 1.0\n", 143 | "\n", 144 | "Player Score: 9 (Usable Ace: False), Dealer Score: 10\n", 145 | "Taking action: Hit\n", 146 | "Player Score: 19 (Usable Ace: False), Dealer Score: 10\n", 147 | "Taking action: Hit\n", 148 | "Player Score: 26 (Usable Ace: False), Dealer Score: 10\n", 149 | "Game end. Reward: -1.0\n", 150 | "\n", 151 | "Player Score: 12 (Usable Ace: False), Dealer Score: 5\n", 152 | "Taking action: Hit\n", 153 | "Player Score: 15 (Usable Ace: False), Dealer Score: 5\n", 154 | "Taking action: Hit\n", 155 | "Player Score: 21 (Usable Ace: False), Dealer Score: 5\n", 156 | "Taking action: Stick\n", 157 | "Player Score: 21 (Usable Ace: False), Dealer Score: 5\n", 158 | "Game end. Reward: 1.0\n", 159 | "\n", 160 | "Player Score: 11 (Usable Ace: False), Dealer Score: 9\n", 161 | "Taking action: Hit\n", 162 | "Player Score: 13 (Usable Ace: False), Dealer Score: 9\n", 163 | "Taking action: Hit\n", 164 | "Player Score: 17 (Usable Ace: False), Dealer Score: 9\n", 165 | "Taking action: Hit\n", 166 | "Player Score: 19 (Usable Ace: False), Dealer Score: 9\n", 167 | "Taking action: Hit\n", 168 | "Player Score: 29 (Usable Ace: False), Dealer Score: 9\n", 169 | "Game end. 
Reward: -1.0\n", 170 | "\n", 171 | "Player Score: 14 (Usable Ace: False), Dealer Score: 7\n", 172 | "Taking action: Hit\n", 173 | "Player Score: 19 (Usable Ace: False), Dealer Score: 7\n", 174 | "Taking action: Hit\n", 175 | "Player Score: 29 (Usable Ace: False), Dealer Score: 7\n", 176 | "Game end. Reward: -1.0\n", 177 | "\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "def print_observation(observation):\n", 183 | " score, dealer_score, usable_ace = observation\n", 184 | " print(\"Player Score: {} (Usable Ace: {}), Dealer Score: {}\".format(\n", 185 | " score, usable_ace, dealer_score))\n", 186 | "\n", 187 | "def strategy(observation):\n", 188 | " score, dealer_score, usable_ace = observation\n", 189 | " # Stick (action 0) if the score is > 20, hit (action 1) otherwise\n", 190 | " return 0 if score >= 20 else 1\n", 191 | "\n", 192 | "for i_episode in range(20):\n", 193 | " observation = env.reset()\n", 194 | " for t in range(100):\n", 195 | " print_observation(observation)\n", 196 | " action = strategy(observation)\n", 197 | " print(\"Taking action: {}\".format( [\"Stick\", \"Hit\"][action]))\n", 198 | " observation, reward, done, _ = env.step(action)\n", 199 | " if done:\n", 200 | " print_observation(observation)\n", 201 | " print(\"Game end. Reward: {}\\n\".format(float(reward)))\n", 202 | " break" 203 | ] 204 | } 205 | ], 206 | "metadata": { 207 | "kernelspec": { 208 | "display_name": "Python 3", 209 | "language": "python", 210 | "name": "python3" 211 | }, 212 | "language_info": { 213 | "codemirror_mode": { 214 | "name": "ipython", 215 | "version": 3 216 | }, 217 | "file_extension": ".py", 218 | "mimetype": "text/x-python", 219 | "name": "python", 220 | "nbconvert_exporter": "python", 221 | "pygments_lexer": "ipython3", 222 | "version": "3.5.1" 223 | } 224 | }, 225 | "nbformat": 4, 226 | "nbformat_minor": 0 227 | } 228 | -------------------------------------------------------------------------------- /MC/README.md: -------------------------------------------------------------------------------- 1 | ## Model-Free Prediction & Control with Monte Carlo (MC) 2 | 3 | 4 | ### Learning Goals 5 | 6 | - Understand the difference between Prediction and Control 7 | - Know how to use the MC method for predicting state values and state-action values 8 | - Understand the on-policy first-visit MC control algorithm 9 | - Understand off-policy MC control algorithms 10 | - Understand Weighted Importance Sampling 11 | - Understand the benefits of MC algorithms over the Dynamic Programming approach 12 | 13 | 14 | ### Summary 15 | 16 | - Dynamic Programming approaches assume complete knowledge of the environment (the MDP). In practice, we often don't have full knowledge of how the world works. 17 | - Monte Carlo (MC) methods can learn directly from experience collected by interacting with the environment. An episode of experience is a series of `(State, Action, Reward, Next State)` tuples. 18 | - MC methods work based on episodes. We sample episodes of experience and make updates to our estimates at the end of each episode. MC methods have high variance (due to lots of random decisions within an episode) but are unbiased. 19 | - MC Policy Evaluation: Given a policy, we want to estimate the state-value function V(s). Sample episodes of experience and estimate V(s) to be the reward received from that state onwards averaged across all of your experience. The same technique works for the action-value function Q(s, a). Given enough samples, this is proven to converge. 
20 | - MC Control: Idea is the same as for Dynamic Programming. Use MC Policy Evaluation to evaluate the current policy then improve the policy greedily. The Problem: How do we ensure that we explore all states if we don't know the full environment? 21 | - Solution to exploration problem: Use epsilon-greedy policies instead of full greedy policies. When making a decision act randomly with probability epsilon. This will learn the optimal epsilon-greedy policy. 22 | - Off-Policy Learning: How can we learn about the actual optimal (greedy) policy while following an exploratory (epsilon-greedy) policy? We can use importance sampling, which weighs returns by their probability of occurring under the policy we want to learn about. 23 | 24 | 25 | ### Lectures & Readings 26 | 27 | **Required:** 28 | 29 | - [Reinforcement Learning: An Introduction](https://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf) - Chapter 5: Monte Carlo Methods 30 | 31 | 32 | **Optional:** 33 | 34 | - David Silver's RL Course Lecture 4 - Model-Free Prediction ([video](https://www.youtube.com/watch?v=PnHCvfgC_ZA), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MC-TD.pdf)) 35 | - David Silver's RL Course Lecture 5 - Model-Free Control ([video](https://www.youtube.com/watch?v=0g4j2k_Ggc4), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/control.pdf)) 36 | 37 | 38 | ### Exercises 39 | 40 | - [Get familiar with the Blackjack environment (Blackjack-v0)](Blackjack Playground.ipynb) 41 | - Implement the Monte Carlo Prediction to estimate state-action values 42 | - [Exercise](MC Prediction.ipynb) 43 | - [Solution](MC Prediction Solution.ipynb) 44 | - Implement the on-policy first-visit Monte Carlo Control algorithm 45 | - [Exercise](MC Control with Epsilon-Greedy Policies.ipynb) 46 | - [Solution](MC Control with Epsilon-Greedy Policies Solution.ipynb) 47 | - Implement the off-policy every-visit Monte Carlo Control using Weighted Important Sampling algorithm 48 | - [Exercise](Off-Policy MC Control with Weighted Importance Sampling.ipynb) 49 | - [Solution](Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb) -------------------------------------------------------------------------------- /PG/.gitignore: -------------------------------------------------------------------------------- 1 | *.pkl 2 | logs* 3 | -------------------------------------------------------------------------------- /PG/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Scitator/rl-course-experiments/47e7666e2bc618ca4007372f411c3626fd7efc2b/PG/README.md -------------------------------------------------------------------------------- /PG/reinforce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import tensorflow as tf 4 | from agents.agent_states import LinearHiddenState 5 | from rstools.tf.optimization import build_model_optimization 6 | 7 | from agents.agent_networks import FeatureNet, PolicyNet 8 | 9 | 10 | class ReinforceAgent(object): 11 | def __init__(self, state_shape, n_actions, network, special=None): 12 | self.special = special or {} 13 | self.state_shape = state_shape 14 | self.n_actions = n_actions 15 | self.special = special 16 | 17 | self.scope = tf.get_variable_scope().name + "/" + special.get("scope", "dqn") \ 18 | if tf.get_variable_scope().name else special.get("scope", "dqn") 19 | 20 | with tf.variable_scope(self.scope): 21 | 
self._build_graph(network) 22 | 23 | def _build_graph(self, network): 24 | self.feature_net = FeatureNet( 25 | self.state_shape, network, 26 | self.special.get("feature_net", {})) 27 | 28 | self.hidden_state = LinearHiddenState( 29 | self.feature_net.feature_state, 30 | self.special.get("hidden_size", 512), 31 | self.special.get("hidden_activation", tf.nn.elu)) 32 | 33 | self.policy_net = PolicyNet( 34 | self.hidden_state.state, self.n_actions, 35 | self.special.get("policy_net", {})) 36 | 37 | build_model_optimization( 38 | self.policy_net, 39 | self.special.get("policy_net_optimization", None)) 40 | build_model_optimization( 41 | self.hidden_state, 42 | self.special.get("hidden_state_optimization", None), 43 | loss=self.policy_net.loss) 44 | build_model_optimization( 45 | self.feature_net, 46 | self.special.get("feature_net_optimization", None), 47 | loss=self.policy_net.loss) 48 | 49 | def predict_probs(self, sess, state_batch, is_training=False): 50 | return sess.run( 51 | self.policy_net.predicted_probs, 52 | feed_dict={ 53 | self.feature_net.states: state_batch, 54 | self.feature_net.is_training: is_training}) 55 | -------------------------------------------------------------------------------- /PG/run_reinforce.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | from rstools.utils.batch_utils import iterate_minibatches 5 | from tqdm import trange 6 | 7 | from PG.reinforce import ReinforceAgent 8 | from common.networks import activations 9 | from wrappers.gym_wrappers import Transition 10 | from wrappers.run_wrappers import typical_args, typical_argsparse, run_wrapper, update_wraper, \ 11 | epsilon_greedy_policy, play_session 12 | 13 | 14 | def update(sess, reinforce_agent, transitions, initial_state=None, 15 | discount_factor=0.99, reward_norm=1.0, batch_size=32, time_major=True): 16 | policy_targets = [] 17 | state_history = [] 18 | action_history = [] 19 | 20 | cumulative_reward = np.zeros_like(transitions[-1].reward) 21 | for transition in reversed(transitions): 22 | cumulative_reward = reward_norm * transition.reward + \ 23 | np.invert(transition.done) * discount_factor * cumulative_reward 24 | 25 | policy_targets.append(cumulative_reward) 26 | state_history.append(transition.state) 27 | action_history.append(transition.action) 28 | 29 | # time-major 30 | policy_targets = np.array(policy_targets[::-1]) 31 | state_history = np.array(state_history[::-1]) 32 | action_history = np.array(action_history[::-1]) 33 | 34 | time_len = state_history.shape[0] 35 | 36 | policy_loss = 0.0 37 | for state_axis, action_axis, policy_target_axis in \ 38 | zip(state_history, action_history, policy_targets): 39 | axis_len = state_axis.shape[0] 40 | axis_policy_loss = 0.0 41 | 42 | state_axis = iterate_minibatches(state_axis, batch_size) 43 | action_axis = iterate_minibatches(action_axis, batch_size) 44 | policy_target_axis = iterate_minibatches(policy_target_axis, batch_size) 45 | 46 | for state_batch, action_batch, policy_target in \ 47 | zip(state_axis, action_axis, policy_target_axis): 48 | run_params = [ 49 | reinforce_agent.policy_net.loss, 50 | reinforce_agent.policy_net.train_op, 51 | reinforce_agent.feature_net.train_op] 52 | feed_params = { 53 | reinforce_agent.feature_net.states: state_batch, 54 | reinforce_agent.feature_net.is_training: True, 55 | reinforce_agent.policy_net.actions: action_batch, 56 | reinforce_agent.policy_net.cumulative_rewards: policy_target, 57 | reinforce_agent.policy_net.is_training: 
True 58 | } 59 | 60 | run_result = sess.run( 61 | run_params, 62 | feed_dict=feed_params) 63 | 64 | batch_loss_policy = run_result[0] 65 | 66 | axis_policy_loss += batch_loss_policy 67 | 68 | policy_loss += axis_policy_loss / axis_len 69 | 70 | return policy_loss / time_len 71 | 72 | 73 | def generate_sessions(sess, a3c_agent, env_pool, update_fn, t_max=1000): 74 | total_reward = 0.0 75 | total_games = 0.0 76 | 77 | transitions = [] 78 | 79 | states = env_pool.pool_states() 80 | for t in range(t_max): 81 | actions = epsilon_greedy_policy(a3c_agent, sess, states) 82 | next_states, rewards, dones, _ = env_pool.step(actions) 83 | 84 | transitions.append(Transition( 85 | state=states, action=actions, reward=rewards, next_state=next_states, done=dones)) 86 | states = next_states 87 | 88 | total_reward += rewards.sum() 89 | total_games += dones.sum() 90 | 91 | if env_pool.n_envs == 1 and total_games > 0: 92 | break 93 | 94 | total_policy_loss = update_fn(sess, a3c_agent, transitions) 95 | 96 | return total_reward / env_pool.n_envs, \ 97 | total_policy_loss, \ 98 | t / (total_games / env_pool.n_envs) 99 | 100 | 101 | def reinforce_learning( 102 | sess, agent, env, update_fn, 103 | n_epochs=1000, n_sessions=100, t_max=1000): 104 | tr = trange( 105 | n_epochs, 106 | desc="", 107 | leave=True) 108 | 109 | history = { 110 | "reward": np.zeros(n_epochs), 111 | "policy_loss": np.zeros(n_epochs), 112 | "steps": np.zeros(n_epochs), 113 | } 114 | 115 | for i in tr: 116 | sessions = [ 117 | generate_sessions(sess, agent, env, update_fn, t_max) 118 | for _ in range(n_sessions)] 119 | session_rewards, session_policy_loss, session_steps = \ 120 | map(np.array, zip(*sessions)) 121 | 122 | history["reward"][i] = np.mean(session_rewards) 123 | history["policy_loss"][i] = np.mean(session_policy_loss) 124 | history["steps"][i] = np.mean(session_steps) 125 | 126 | desc = "\t".join( 127 | ["{} = {:.3f}".format(key, value[i]) for key, value in history.items()]) 128 | tr.set_description(desc) 129 | 130 | return history 131 | 132 | 133 | def run(env_name, make_env_fn, agent_cls, 134 | run_args, update_args, agent_agrs, 135 | log_dir=None, episode_limit=None, 136 | plot_stats=False, api_key=None, 137 | load=False, gpu_option=0.4, 138 | n_games=10): 139 | run_wrapper( 140 | n_games, reinforce_learning, update_wraper(update, **update_args), 141 | play_session, epsilon_greedy_policy, 142 | env_name, make_env_fn, agent_cls, 143 | run_args, agent_agrs, 144 | log_dir=log_dir, episode_limit=episode_limit, 145 | plot_stats=plot_stats, api_key=api_key, 146 | load=load, gpu_option=gpu_option) 147 | 148 | 149 | def _parse_args(): 150 | parser = argparse.ArgumentParser(description='Reinforce Agent Learning') 151 | # typical params 152 | parser = typical_args(parser) 153 | 154 | # agent special params & optimization 155 | parser.add_argument( 156 | '--policy_lr', 157 | type=float, 158 | default=1e-5, 159 | help='Learning rate for policy network. (default: %(default)s)') 160 | 161 | parser.add_argument( 162 | '--entropy_factor', 163 | type=float, 164 | default=1e-2, 165 | help='Entropy factor for policy network. 
(default: %(default)s)') 166 | 167 | args = parser.parse_args() 168 | return args 169 | 170 | 171 | def main(): 172 | args = _parse_args() 173 | 174 | assert args.time_major, "Please, use time_major flag for updates" 175 | 176 | network, run_args, update_args, optimization_params, make_env_fn = typical_argsparse(args) 177 | 178 | policy_optimization_params = { 179 | **optimization_params, 180 | **{"initial_lr": args.policy_lr} 181 | } 182 | 183 | policy_net_params = { 184 | "entropy_factor": args.entropy_factor 185 | } 186 | 187 | agent_cls = ReinforceAgent 188 | 189 | special = { 190 | "policy_net": policy_net_params, 191 | "hidden_size": args.hidden_size, 192 | "hidden_activation": activations[args.hidden_activation], 193 | "feature_net_optimization": optimization_params, 194 | "hidden_state_optimization": optimization_params, 195 | "policy_net_optimization": policy_optimization_params, 196 | } 197 | 198 | agent_args = { 199 | "network": network, 200 | "special": special 201 | } 202 | 203 | run(args.env, make_env_fn, agent_cls, 204 | run_args, update_args, agent_args, 205 | args.log_dir, args.episode_limit, 206 | args.plot_history, args.api_key, 207 | args.load, args.gpu_option, 208 | args.n_games) 209 | 210 | 211 | if __name__ == '__main__': 212 | main() 213 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RL course experiments 2 | 3 | ### Overview 4 | This repository provides code implementations for popular Reinforcement Learning algorithms. 5 | 6 | The main idea is to generalise the main RL algorithms and provide a unified interface for testing them on any gym environment. 7 | For example, you can now create your own Double Dueling Deep Recurrent Q-Learning agent (let's name it 3DRQ); see the example usage below. 8 | For simplicity, all the main agent building blocks are in the `agents` folder. 9 | 10 | For now, the repository is under after-course refactoring, so more documentation is still needed. 11 | 12 | All code is written in Python 3 and uses RL environments from OpenAI Gym. 13 | Advanced techniques use Tensorflow for neural network implementations.
14 | 15 | ### Inspired by: 16 | * [Berkeley CS188x](http://ai.berkeley.edu/home.html) 17 | * [David Silver's Reinforcement Learning Course](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) 18 | * [dennybritz/reinforcement-learning](https://github.com/dennybritz/reinforcement-learning) 19 | * [yandexdataschool/Practical_RL](https://github.com/yandexdataschool/Practical_RL) 20 | * [yandexdataschool/AgentNet](https://github.com/yandexdataschool/AgentNet) 21 | 22 | ##### Additional thanks to [JustHeuristic](https://github.com/justheuristic) for the Practical_RL course 23 | 24 | ### Table of Contents 25 | * [Genetic algorithm](https://github.com/Scitator/rl-course-experiments/tree/master/GEN) 26 | * [Dynamic Programming](https://github.com/Scitator/rl-course-experiments/tree/master/DP) 27 | * [Cross Entropy Method](https://github.com/Scitator/rl-course-experiments/tree/master/CEM) 28 | * [Monte Carlo Control](https://github.com/Scitator/rl-course-experiments/tree/master/MC) 29 | * [Temporal Difference](https://github.com/Scitator/rl-course-experiments/tree/master/TD) 30 | * [Deep Q-Networks](https://github.com/Scitator/rl-course-experiments/tree/master/DQN) 31 | * [Policy Gradient](https://github.com/Scitator/rl-course-experiments/tree/master/PG) 32 | * [Asynchronous Advantage Actor-Critic](https://github.com/Scitator/rl-course-experiments/tree/master/A3C) 33 | * [Optimality Tightening](https://arxiv.org/abs/1611.01606) [TODO] 34 | * [Trust Region Policy Optimization](https://arxiv.org/abs/1502.05477) [TODO] 35 | * Continuous action space [TODO] 36 | * Monte Carlo Tree Search [TODO] 37 | 38 | For more information, look at each folder's README. 39 | 40 | #### Special requirements 41 | 42 | To run the scripts, you need to install an additional [repo](https://github.com/Scitator/rstools) with neural network optimization utilities: 43 | 44 | `pip install git+https://github.com/Scitator/rstools` 45 | 46 | #### Example usage 47 | 48 | DQN: 49 | 50 | ``` 51 | PYTHONPATH=. python DQN/run_dqn.py --plot_history --env CartPole-v0 \ 52 | --feature_network linear --layers 128-128 --hidden_size 64 \ 53 | --n_epochs 1000 --n_games 4 --batch_size 128 --t_max 500 --episode_limit 500 \ 54 | --replay_buffer simple --replay_buffer_size 2000 \ 55 | --qvalue_lr 0.0001 --feature_lr 0.0001 --value_lr 0.0001 \ 56 | --initial_epsilon 0.8 --final_epsilon 0.1 \ 57 | --gpu_option 0.25 \ 58 | --api_key 59 | ``` 60 | 61 | Reinforce: 62 | 63 | ``` 64 | PYTHONPATH=. python PG/run_reinforce.py --plot_history --env CartPole-v0 \ 65 | --feature_network linear --layers 128-128 --hidden_size 64 \ 66 | --n_epochs 10000 --n_games 1 --batch_size 1 --t_max 500 --episode_limit 500 \ 67 | --entropy_factor 0.005 --policy_lr 0.0000001 --feature_lr 0.0000001 --grad_clip 10.0 \ 68 | --gpu_option 0.25 --time_major \ 69 | --api_key 70 | ``` 71 | 72 | Feed-Forward Asynchronous Advantage Actor-Critic: 73 | 74 | ``` 75 | PYTHONPATH=. python A3C/run_a3c.py --plot_history --env CartPole-v0 \ 76 | --feature_network linear --layers 128-128 --hidden_size 64 \ 77 | --n_epochs 500 --n_games 1 --batch_size 1 --t_max 100 --episode_limit 500 \ 78 | --entropy_factor 0.005 --policy_lr 0.00001 --feature_lr 0.00001 --value_lr 0.00001 --grad_clip 10.0 \ 79 | --gpu_option 0.25 --time_major \ 80 | --api_key 81 | ``` 82 | 83 | If the agent starts to play well, you can always stop training with the `Ctrl+C` hotkey. 84 | If something goes wrong, you can always evaluate the agent through the magic `--load --n_epochs 0` 85 | combination.
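The building blocks above also compose. For example, the Double Dueling Deep Recurrent Q-Learning ("3DRQ") agent mentioned in the overview can be requested from the same DQN runner by combining the existing flags (`--agent drqn`, `--dueling_dqn`, `--use_target_net`). This is an untested sketch; the hyperparameters are only a starting point:

```
PYTHONPATH=. python DQN/run_dqn.py --plot_history --env CartPole-v0 \
  --agent drqn --dueling_dqn --use_target_net --copy_n_epoch 5 \
  --feature_network linear --layers 128-128 --hidden_size 64 \
  --n_epochs 1000 --n_games 4 --batch_size 128 --t_max 500 --episode_limit 500 \
  --qvalue_lr 0.0001 --feature_lr 0.0001 --value_lr 0.0001 \
  --initial_epsilon 0.8 --final_epsilon 0.1 \
  --gpu_option 0.25 \
  --api_key
```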
86 | 
87 | ##### Metrics
88 | 
89 | - loss - typical neural network loss
90 | - reward - typical environment reward;
91 |   because an Environment Pool is always used, it is not very informative for now
92 | - steps - mean number of finished games per epoch session
93 | 
94 | ##### If you have Linux with an NVIDIA GPU and no X server, but want to try gym
95 | 
96 | You need to reinstall the NVIDIA drivers.
97 | 
98 | [issue source](https://github.com/openai/gym/issues/366)
99 | [how-to guide](https://davidsanwald.github.io/2016/11/13/building-tensorflow-with-gpu-support.html)
100 | 
101 | and add `bash xvfb start; DISPLAY=:1` before the run command.
102 | 
103 | #### Contributing
104 | 
105 | ##### write code
106 | 
107 | Found a bug or know how to write something more simply?
108 | Or maybe you want to create your own agent?
109 | Just follow PEP8 and make a merge request.
110 | 
111 | ##### ...or play a game
112 | 
113 | We have a lot of RL algorithms, and even more gym environments to test them on.
114 | So, play a game and save:
115 | * the agent parameters (so anyone can reproduce the result)
116 | * the agent itself (`model.ckpt*`)
117 | * the plots (they will be generated automatically with the `--plot_history` flag)
118 | * the gym link (main results)
119 | * then make a merge request (solutions should go to `<folder>/solutions.md`, for example `DQN/solutions.md`)
120 | 
--------------------------------------------------------------------------------
/TD/README.md:
--------------------------------------------------------------------------------
1 | ## Model-Free Prediction & Control with Temporal Difference (TD) and Q-Learning
2 | 
3 | 
4 | ### Learning Goals
5 | 
6 | - Understand TD(0) for prediction
7 | - Understand SARSA for on-policy control
8 | - Understand Q-Learning for off-policy control
9 | - Understand the benefits of TD algorithms over MC and DP approaches
10 | - Understand how n-step methods unify MC and TD approaches
11 | - Understand the backward and forward view of TD-Lambda
12 | 
13 | 
14 | ### Summary
15 | 
16 | - TD-Learning is a combination of Monte Carlo and Dynamic Programming ideas. Like Monte Carlo, TD works from samples and doesn't require a model of the environment. Like Dynamic Programming, TD uses bootstrapping to make updates.
17 | - Whether MC or TD is better depends on the problem, and there are no theoretical results that prove a clear winner.
18 | - General Update Rule: `Q[s,a] += learning_rate * (td_target - Q[s,a])`. `td_target - Q[s,a]` is also called the TD Error (see the sketch after this list).
19 | - SARSA: On-Policy TD Control
20 | - TD Target for SARSA: `R[t+1] + discount_factor * Q[next_state][next_action]`
21 | - Q-Learning: Off-Policy TD Control
22 | - TD Target for Q-Learning: `R[t+1] + discount_factor * max(Q[next_state])`
23 | - Q-Learning has a positive bias because it uses the maximum of the estimated Q values to estimate the maximum action value, all from the same experience. Double Q-Learning gets around this by splitting the experience and using different Q functions for maximization and estimation.
24 | - N-step methods unify the MC and TD approaches. They make updates based on n steps instead of a single step (TD(0)) or a full episode (MC).
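
To make the general update rule and the two TD targets above concrete, here is a minimal tabular sketch (a toy illustration only, not the agents implemented in this folder — those follow in `qlearning.py`, `sarsa.py` and `evsarsa.py`):

```python
import random
from collections import defaultdict

Q = defaultdict(lambda: defaultdict(float))  # tabular Q[s][a], defaults to 0.0
actions = [0, 1]
learning_rate, discount_factor = 0.05, 0.99


def td_update(state, action, td_target):
    # General Update Rule: Q[s,a] += learning_rate * (td_target - Q[s,a])
    Q[state][action] += learning_rate * (td_target - Q[state][action])


# dummy transition (state, action, reward, next_state)
s, a, r, next_s = 0, 1, 1.0, 2

# SARSA (on-policy): bootstrap from the action the behaviour policy actually takes next
next_a = random.choice(actions)
sarsa_target = r + discount_factor * Q[next_s][next_a]
td_update(s, a, sarsa_target)

# Q-Learning (off-policy): bootstrap from the greedy action in the next state
qlearning_target = r + discount_factor * max(Q[next_s][b] for b in actions)
td_update(s, a, qlearning_target)
```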
25 | 
26 | 
27 | ### Lectures & Readings
28 | 
29 | **Required:**
30 | 
31 | - [Reinforcement Learning: An Introduction](https://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf) - Chapter 6: Temporal-Difference Learning
32 | - David Silver's RL Course Lecture 4 - Model-Free Prediction ([video](https://www.youtube.com/watch?v=PnHCvfgC_ZA), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MC-TD.pdf))
33 | - David Silver's RL Course Lecture 5 - Model-Free Control ([video](https://www.youtube.com/watch?v=0g4j2k_Ggc4), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/control.pdf))
34 | 
35 | **Optional:**
36 | 
37 | - [Reinforcement Learning: An Introduction](https://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf) - Chapter 7: Multi-Step Bootstrapping
38 | - [Reinforcement Learning: An Introduction](https://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf) - Chapter 12: Eligibility Traces
39 | 
40 | 
--------------------------------------------------------------------------------
/TD/evsarsa.py:
--------------------------------------------------------------------------------
1 | """
2 | Expected Value SARSA
3 | This file builds upon the same functions as the Q-learning agent (qlearning.py).
4 | 
5 | Here's a usage example:
6 | from evsarsa import EVSarsaAgent
7 | 
8 | agent = EVSarsaAgent(
9 |     alpha=0.5, epsilon=0.25, discount=0.99,
10 |     getLegalActions=lambda s: actions_from_that_state)
11 | action = agent.getAction(state)
12 | agent.update(state, action, next_state, reward)
13 | agent.epsilon *= 0.99
14 | """
15 | 
16 | import random
17 | 
18 | import numpy as np
19 | from collections import defaultdict
20 | 
21 | 
22 | class EVSarsaAgent(object):
23 |     """
24 |     Expected Value SARSA Agent.
25 | 
26 |     The two main methods are
27 |     - self.getAction(state) - returns agent's action in that state
28 |     - self.update(state,action,nextState,reward) - updates the agent's Q-values
29 | 
30 |     Instance variables you have access to
31 |     - self.epsilon (exploration prob)
32 |     - self.alpha (learning rate)
33 |     - self.discount (discount rate aka gamma)
34 | 
35 |     """
36 | 
37 |     def __init__(self, alpha, epsilon, discount, getLegalActions):
38 |         "We initialize agent and Q-values here."
39 |         self.getLegalActions = getLegalActions
40 |         self._qValues = defaultdict(lambda: defaultdict(lambda: 0))
41 |         self.alpha = alpha
42 |         self.epsilon = epsilon
43 |         self.discount = discount
44 | 
45 |     def getQValue(self, state, action):
46 |         """
47 |         Returns Q(state,action)
48 |         """
49 |         return self._qValues[state][action]
50 | 
51 |     def setQValue(self, state, action, value):
52 |         """
53 |         Sets the Qvalue for [state,action] to the given value
54 |         """
55 |         self._qValues[state][action] = value
56 | 
57 |     def getValue(self, state):
58 |         """
59 |         Returns the expected value of Q(state,action) under the current
60 |         epsilon-greedy policy, where the expectation is over legal actions.
61 |         """
62 | 
63 |         possibleActions = self.getLegalActions(state)
64 |         # If there are no legal actions, return 0.0
65 |         if len(possibleActions) == 0:
66 |             return 0.0
67 | 
68 |         # Action probabilities: epsilon/|A| per action, plus (1 - epsilon) on the greedy one
69 |         epsilon = self.epsilon
70 | 
71 |         value = np.array([self.getQValue(state, a) for a in possibleActions])
72 |         value = (value * epsilon / float(len(possibleActions))).sum() + \
73 |             value.max() * (1.0 - epsilon)
74 |         return value
75 | 
76 |     def getPolicy(self, state):
77 |         """
78 |         Compute the best action to take in a state.
79 | """ 80 | possibleActions = self.getLegalActions(state) 81 | 82 | # If there are no legal actions, return None 83 | if len(possibleActions) == 0: 84 | return None 85 | 86 | best_action = possibleActions[ 87 | np.argmax([self.getQValue(state, a) for a in possibleActions])] 88 | return best_action 89 | 90 | def getAction(self, state): 91 | """ 92 | Compute the action to take in the current state, including exploration. 93 | 94 | With probability self.epsilon, we should take a random action. 95 | otherwise - the best policy action (self.getPolicy). 96 | """ 97 | 98 | # Pick Action 99 | possibleActions = self.getLegalActions(state) 100 | action = None 101 | 102 | # If there are no legal actions, return None 103 | if len(possibleActions) == 0: 104 | return None 105 | 106 | # agent parameters: 107 | epsilon = self.epsilon 108 | 109 | if np.random.random() <= epsilon: 110 | action = random.choice(possibleActions) 111 | else: 112 | action = self.getPolicy(state) 113 | return action 114 | 115 | def update(self, state, action, nextState, reward): 116 | """ 117 | You should do your Q-Value update here 118 | """ 119 | # agent parameters 120 | gamma = self.discount 121 | learning_rate = self.alpha 122 | 123 | reference_qvalue = reward + gamma * self.getValue(nextState) 124 | updated_qvalue = (1 - learning_rate) * self.getQValue(state, action) + \ 125 | learning_rate * reference_qvalue 126 | self.setQValue(state, action, updated_qvalue) 127 | -------------------------------------------------------------------------------- /TD/qlearning.py: -------------------------------------------------------------------------------- 1 | """ 2 | Q-learning Agent 3 | 4 | Here's an example: 5 | from qlearning import QLearningAgent 6 | 7 | agent = QLearningAgent( 8 | alpha=0.5,epsilon=0.25,discount=0.99, 9 | getLegalActions = lambda s: actions_from_that_state) 10 | action = agent.getAction(state) 11 | agent.update(state,action, next_state,reward) 12 | agent.epsilon *= 0.99 13 | """ 14 | 15 | import random 16 | 17 | import numpy as np 18 | from collections import defaultdict 19 | 20 | 21 | class QLearningAgent(object): 22 | """ 23 | Q-Learning Agent 24 | 25 | The two main methods are 26 | - self.getAction(state) - returns agent's action in that state 27 | - self.update(state,action,nextState,reward) - returns agent's next action 28 | 29 | Functions you should use 30 | - self.getLegalActions(state) 31 | which returns legal actions for a state 32 | - self.getQValue(state,action) 33 | which returns Q(state,action) 34 | - self.setQValue(state,action,value) 35 | which sets Q(state,action) := value 36 | 37 | !!!Important!!! 38 | NOTE: please avoid using self._qValues directly to make code cleaner 39 | """ 40 | 41 | def __init__(self, alpha, epsilon, discount, getLegalActions): 42 | "We initialize agent and Q-values here." 43 | self.getLegalActions = getLegalActions 44 | self._qValues = defaultdict(lambda: defaultdict(lambda: 0)) 45 | self.alpha = alpha 46 | self.epsilon = epsilon 47 | self.discount = discount 48 | 49 | def getQValue(self, state, action): 50 | """ 51 | Returns Q(state,action) 52 | """ 53 | return self._qValues[state][action] 54 | 55 | def setQValue(self, state, action, value): 56 | """ 57 | Sets the Qvalue for [state,action] to the given value 58 | """ 59 | self._qValues[state][action] = value 60 | 61 | def getValue(self, state): 62 | """ 63 | Returns max_action Q(state,action) 64 | where the max is over legal actions. 
65 | """ 66 | 67 | possibleActions = self.getLegalActions(state) 68 | # If there are no legal actions, return 0.0 69 | if len(possibleActions) == 0: 70 | return 0.0 71 | 72 | return max([self.getQValue(state, a) for a in possibleActions]) 73 | 74 | def getPolicy(self, state): 75 | """ 76 | Compute the best action to take in a state. 77 | 78 | """ 79 | possibleActions = self.getLegalActions(state) 80 | 81 | # If there are no legal actions, return None 82 | if len(possibleActions) == 0: 83 | return None 84 | 85 | best_action = possibleActions[ 86 | np.argmax([self.getQValue(state, a) for a in possibleActions])] 87 | return best_action 88 | 89 | def getAction(self, state): 90 | """ 91 | Compute the action to take in the current state, including exploration. 92 | 93 | With probability self.epsilon, we should take a random action. 94 | otherwise - the best policy action (self.getPolicy). 95 | 96 | """ 97 | 98 | # Pick Action 99 | possibleActions = self.getLegalActions(state) 100 | action = None 101 | 102 | # If there are no legal actions, return None 103 | if len(possibleActions) == 0: 104 | return None 105 | 106 | # agent parameters: 107 | epsilon = self.epsilon 108 | 109 | if np.random.random() <= epsilon: 110 | action = random.choice(possibleActions) 111 | else: 112 | action = self.getPolicy(state) 113 | return action 114 | 115 | def update(self, state, action, nextState, reward): 116 | """ 117 | You should do your Q-Value update here 118 | """ 119 | # agent parameters 120 | gamma = self.discount 121 | learning_rate = self.alpha 122 | 123 | reference_qvalue = reward + gamma * self.getValue(nextState) 124 | updated_qvalue = (1 - learning_rate) * self.getQValue(state, action) + \ 125 | learning_rate * reference_qvalue 126 | self.setQValue(state, action, updated_qvalue) 127 | -------------------------------------------------------------------------------- /TD/run.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | 3 | matplotlib.use('Agg') 4 | import matplotlib.pyplot as plt # noqa: E402 5 | import matplotlib.cm as cm 6 | 7 | plt.style.use("ggplot") 8 | import seaborn as sns # noqa: E402 9 | 10 | sns.set(color_codes=True) 11 | 12 | import numpy as np 13 | import argparse 14 | import gym 15 | from gym.core import ObservationWrapper 16 | import os 17 | import pickle 18 | from tqdm import trange 19 | 20 | from qlearning import QLearningAgent 21 | from sarsa import SarsaAgent 22 | from evsarsa import EVSarsaAgent 23 | 24 | 25 | def plot_unimetric(history, metric, save_dir): 26 | plt.figure() 27 | plt.plot(history[metric]) 28 | plt.title('model {}'.format(metric)) 29 | plt.ylabel(metric) 30 | plt.xlabel('epoch') 31 | plt.savefig("{}/{}.png".format(save_dir, metric), 32 | format='png', dpi=300) 33 | 34 | 35 | def save_stats(stats, save_dir="./"): 36 | for key in stats: 37 | plot_unimetric(stats, key, save_dir) 38 | 39 | 40 | class Binarizer(ObservationWrapper): 41 | def __init__(self, env, bins=None): 42 | super().__init__(env) 43 | self.n_bins = (bins or [10] * env.action_space.n) 44 | 45 | def _state_encoder(self, i, s_i): 46 | return int(self.n_bins[i] * s_i) 47 | 48 | def _observation(self, state): 49 | state = map(lambda x: self._state_encoder(x[0], x[1]), enumerate(state)) 50 | 51 | return tuple(state) 52 | 53 | 54 | def play_and_train_qlearning(env, agent, t_max=10 ** 3): 55 | total_reward = 0.0 56 | s = env.reset() 57 | 58 | for t in range(t_max): 59 | a = agent.getAction(s) 60 | 61 | next_s, r, done, _ = env.step(a) 62 | 63 | 
agent.update(s, a, next_s, r) 64 | 65 | s = next_s 66 | total_reward += r 67 | if done: 68 | break 69 | 70 | return total_reward 71 | 72 | 73 | def play_and_train_sarsa(env, agent, t_max=10 ** 3): 74 | total_reward = 0.0 75 | s = env.reset() 76 | 77 | for t in range(t_max): 78 | a = agent.getAction(s) 79 | 80 | next_s, r, done, _ = env.step(a) 81 | 82 | agent.update(s, a, next_s, agent.getAction(next_s), r) 83 | 84 | s = next_s 85 | total_reward += r 86 | if done: 87 | break 88 | 89 | return total_reward 90 | 91 | 92 | def play_and_train_evsarsa(env, agent, t_max=10 ** 3): 93 | total_reward = 0.0 94 | s = env.reset() 95 | 96 | for t in range(t_max): 97 | a = agent.getAction(s) 98 | 99 | next_s, r, done, _ = env.step(a) 100 | 101 | agent.update(s, a, next_s, r) 102 | 103 | s = next_s 104 | total_reward += r 105 | if done: 106 | break 107 | 108 | return total_reward 109 | 110 | 111 | def agent_runner( 112 | env, agent_fn, agent_play_fn, 113 | n_epochs=int(2e5), alpha=0.05, discount=0.99, 114 | initial_epsilon=0.25, final_epsilon=0.01): 115 | n_actions = env.action_space.n 116 | 117 | agent = agent_fn( 118 | alpha=alpha, epsilon=initial_epsilon, discount=discount, 119 | getLegalActions=lambda s: range(n_actions)) 120 | 121 | n_epochs_decay = n_epochs * 0.8 122 | 123 | tr = trange( 124 | n_epochs, 125 | desc="", 126 | leave=True) 127 | 128 | rewards = np.zeros(n_epochs) 129 | eps = np.zeros(n_epochs) 130 | epoch_rewards = np.zeros(n_epochs // 1000) 131 | agent.epsilon = initial_epsilon 132 | for i in tr: 133 | rewards[i] = agent_play_fn(env, agent) 134 | eps[i] = agent.epsilon 135 | 136 | if i < n_epochs_decay: 137 | agent.epsilon -= (initial_epsilon - final_epsilon) / float(n_epochs_decay) 138 | 139 | if i % 1000 == 0: 140 | epoch_rewards[i // 1000] = np.mean(rewards[i - 1000:i]) 141 | desc = "reward: {}\tepsilon: {}".format(epoch_rewards[i // 1000], agent.epsilon) 142 | tr.set_description(desc) 143 | 144 | return { 145 | "reward": rewards, 146 | "epoch_reward": epoch_rewards, 147 | "epsilon": eps 148 | } 149 | 150 | 151 | AGENTS = { 152 | "qlearning": QLearningAgent, 153 | "sarsa": SarsaAgent, 154 | "evsarsa": EVSarsaAgent, 155 | } 156 | 157 | AGENTS_FN = { 158 | "qlearning": play_and_train_qlearning, 159 | "sarsa": play_and_train_sarsa, 160 | "evsarsa": play_and_train_evsarsa, 161 | } 162 | 163 | 164 | def run(env, agent, bins=None, 165 | lr=0.05, discount_factor=0.99, n_steps=1, initial_epsilon=0.25, 166 | n_epochs=1000, t_max=1000, 167 | plot_stats=False, api_key=None): 168 | env_name = env 169 | env = Binarizer(gym.make(env).env, bins=bins) 170 | agent = AGENTS[agent] 171 | agent_fn = lambda env, agent: AGENTS_FN[agent](env, agent, t_max=t_max) 172 | 173 | history = agent_runner( 174 | env, agent, agent_fn, 175 | n_epochs, lr, discount_factor, 176 | initial_epsilon) 177 | 178 | if plot_stats: 179 | save_stats(history) 180 | 181 | if api_key is not None: 182 | env = gym.wrappers.Monitor(env, "/tmp/" + env_name, force=True) 183 | for _ in range(200): 184 | agent_fn(env, agent) 185 | env.close() 186 | gym.upload("/tmp/" + env_name, api_key=api_key) 187 | 188 | 189 | def _parse_args(): 190 | parser = argparse.ArgumentParser(description='Policy iteration example') 191 | parser.add_argument( 192 | '--env', 193 | type=str, 194 | default='CartPole-v0', # CartPole-v0, MountainCar-v0 195 | help='The environment to use') 196 | parser.add_argument( 197 | '--agent', 198 | type=str, 199 | default='qlearning', # qlearning, sarsa, evsarsa 200 | help='The agent to use') 201 | parser.add_argument( 202 | 
'--n_epochs', 203 | type=int, 204 | default=1000) 205 | parser.add_argument( 206 | '--t_max', 207 | type=int, 208 | default=1000) 209 | parser.add_argument( 210 | '--lr', 211 | type=float, 212 | default=0.05, 213 | help='Agent learning rate') 214 | parser.add_argument( 215 | '--initial_epsilon', 216 | type=float, 217 | default=0.99, 218 | help='Agent start exploration factor') 219 | parser.add_argument( 220 | '--gamma', 221 | type=float, 222 | default=0.99, 223 | help='Gamma discount factor') 224 | parser.add_argument( 225 | '--plot_stats', 226 | action='store_true', 227 | default=False) 228 | parser.add_argument( 229 | '--api_key', 230 | type=str, 231 | default=None) 232 | parser.add_argument( 233 | '--n_steps', 234 | type=int, 235 | default=1) 236 | parser.add_argument( 237 | '--bins', 238 | type=str, 239 | default=None) 240 | 241 | args, _ = parser.parse_known_args() 242 | return args 243 | 244 | 245 | def main(): 246 | args = _parse_args() 247 | try: 248 | bins = tuple(map(int, args.bins.split("-"))) 249 | except: 250 | bins = None 251 | run(args.env, args.agent, bins, 252 | args.lr, args.gamma, args.n_steps, args.initial_epsilon, 253 | args.n_epochs, args.t_max, 254 | args.plot_stats, args.api_key) 255 | 256 | 257 | if __name__ == '__main__': 258 | main() 259 | -------------------------------------------------------------------------------- /TD/sarsa.py: -------------------------------------------------------------------------------- 1 | """ 2 | SARSA Agent 3 | This file builds upon the same functions as Q-learning agent (qlearning.py). 4 | 5 | Here's usage example: 6 | from sarsa import SarsaAgent 7 | 8 | agent = SarsaAgent( 9 | alpha=0.1,epsilon=0.25,discount=0.99, 10 | getLegalActions = lambda s: actions_from_that_state) 11 | action = agent.getAction(state) 12 | agent.update(state, action, next_state, reward) 13 | agent.epsilon *= 0.99 14 | """ 15 | import random 16 | 17 | import numpy as np 18 | from collections import defaultdict 19 | 20 | 21 | class SarsaAgent(object): 22 | """ 23 | Classical SARSA agent. 24 | 25 | The two main methods are 26 | - self.getAction(state) - returns agent's action in that state 27 | - self.update(state,action,reward,nextState,nextAction) - returns agent's next action 28 | 29 | Instance variables you have access to 30 | - self.epsilon (exploration prob) 31 | - self.alpha (learning rate) 32 | - self.discount (discount rate aka gamma) 33 | 34 | """ 35 | 36 | def __init__(self, alpha, epsilon, discount, getLegalActions): 37 | "We initialize agent and Q-values here." 38 | self.getLegalActions = getLegalActions 39 | self._qValues = defaultdict(lambda: defaultdict(lambda: 0)) 40 | self.alpha = alpha 41 | self.epsilon = epsilon 42 | self.discount = discount 43 | 44 | def getQValue(self, state, action): 45 | """ 46 | Returns Q(state,action) 47 | """ 48 | return self._qValues[state][action] 49 | 50 | def setQValue(self, state, action, value): 51 | """ 52 | Sets the Qvalue for [state,action] to the given value 53 | """ 54 | self._qValues[state][action] = value 55 | 56 | def getPolicy(self, state): 57 | """ 58 | Compute the best action to take in a state. 
59 | """ 60 | possibleActions = self.getLegalActions(state) 61 | 62 | # If there are no legal actions, return None 63 | if len(possibleActions) == 0: 64 | return None 65 | 66 | "*** this code works exactly as Q-learning ***" 67 | best_action = possibleActions[ 68 | np.argmax([self.getQValue(state, a) for a in possibleActions])] 69 | return best_action 70 | 71 | def getAction(self, state): 72 | """ 73 | Compute the action to take in the current state, including exploration. 74 | """ 75 | 76 | # Pick Action 77 | possibleActions = self.getLegalActions(state) 78 | action = None 79 | 80 | # If there are no legal actions, return None 81 | if len(possibleActions) == 0: 82 | return None 83 | 84 | # agent parameters: 85 | epsilon = self.epsilon 86 | 87 | "*** Epsilon-greedy strategy exactly as Q-learning ***" 88 | if np.random.random() <= epsilon: 89 | action = random.choice(possibleActions) 90 | else: 91 | action = self.getPolicy(state) 92 | return action 93 | 94 | def update(self, state, action, nextState, nextAction, reward): 95 | """ 96 | You should do your Q-Value update here 97 | """ 98 | # agent parameters 99 | gamma = self.discount 100 | learning_rate = self.alpha 101 | 102 | "*** YOUR CODE HERE ***" 103 | reference_qvalue = reward + gamma * self.getQValue(nextState, nextAction) 104 | 105 | updated_qvalue = (1 - learning_rate) * self.getQValue(state, action) + \ 106 | learning_rate * reference_qvalue 107 | 108 | self.setQValue(state, action, updated_qvalue) 109 | -------------------------------------------------------------------------------- /agents/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Scitator/rl-course-experiments/47e7666e2bc618ca4007372f411c3626fd7efc2b/agents/README.md -------------------------------------------------------------------------------- /agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Scitator/rl-course-experiments/47e7666e2bc618ca4007372f411c3626fd7efc2b/agents/__init__.py -------------------------------------------------------------------------------- /agents/agent_networks.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib import rnn 4 | 5 | 6 | class FeatureNet(object): 7 | def __init__(self, state_shape, network, special=None): 8 | self.special = special or {} 9 | self.state_shape = state_shape 10 | 11 | self.states = tf.placeholder(shape=(None,) + state_shape, dtype=tf.float32, name="states") 12 | self.is_training = tf.placeholder(dtype=tf.bool, name="is_training") 13 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 14 | 15 | self.loss = None 16 | self.optimizer = None 17 | self.train_op = None 18 | 19 | self.relative_scope = self.special.get("scope", "feature_network") 20 | self.scope = tf.get_variable_scope().name + "/" + self.relative_scope 21 | 22 | self.feature_state = network( 23 | self.states, 24 | scope=self.relative_scope + "/feature", 25 | reuse=self.special.get("reuse_feature", False), 26 | is_training=self.is_training) 27 | 28 | 29 | class PolicyNet(object): 30 | def __init__(self, hidden_state, n_actions, special=None): 31 | self.special = special or {} 32 | self.n_actions = n_actions 33 | 34 | self.actions = tf.placeholder(shape=[None], dtype=tf.int32, name="actions") 35 | self.cumulative_rewards = tf.placeholder(shape=[None], dtype=tf.float32, 
name="rewards") 36 | self.is_training = tf.placeholder(dtype=tf.bool, name="is_training") 37 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 38 | 39 | self.optimizer = None 40 | self.train_op = None 41 | 42 | self.relative_scope = self.special.get("scope", "policy_network") 43 | self.scope = tf.get_variable_scope().name + "/" + self.relative_scope 44 | 45 | self.predicted_probs = self._probs( 46 | hidden_state, 47 | scope=self.relative_scope + "/probs", 48 | reuse=self.special.get("reuse_probs", False)) + 1e-8 49 | 50 | batch_size = tf.shape(self.actions)[0] 51 | predicted_ids = tf.range(batch_size) * tf.shape(self.predicted_probs)[1] + self.actions 52 | 53 | self.predicted_probs_for_actions = tf.gather( 54 | tf.reshape(self.predicted_probs, [-1]), predicted_ids) 55 | 56 | J = -tf.reduce_mean(tf.log(self.predicted_probs_for_actions) * self.cumulative_rewards) 57 | self.loss = J 58 | 59 | # a bit of regularization 60 | if self.special.get("entropy_loss", True): 61 | entropy = tf.reduce_mean( 62 | tf.reduce_sum( 63 | self.predicted_probs * tf.log(self.predicted_probs), 64 | axis=-1)) 65 | entropy *= self.special.get("entropy_factor", 0.01) 66 | self.loss += entropy 67 | 68 | def _probs(self, hidden_state, scope, reuse=False): 69 | with tf.variable_scope(scope, reuse=reuse): 70 | probs = tf.layers.dense( 71 | hidden_state, 72 | units=self.n_actions, 73 | activation=tf.nn.softmax) 74 | return probs 75 | 76 | 77 | class ValueNet(object): 78 | def __init__(self, hidden_state, special=None): 79 | self.special = special or {} 80 | 81 | self.td_target = tf.placeholder(shape=[None], dtype=tf.float32, name="td_target") 82 | self.is_training = tf.placeholder(dtype=tf.bool, name="is_training") 83 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 84 | 85 | self.optimizer = None 86 | self.train_op = None 87 | 88 | self.relative_scope = self.special.get("scope", "value_network") 89 | self.scope = tf.get_variable_scope().name + "/" + self.relative_scope 90 | 91 | self.predicted_values = self._state_value( 92 | hidden_state, 93 | scope=self.relative_scope + "/state_value", 94 | reuse=self.special.get("reuse_state_value", False)) 95 | 96 | self.predicted_values_for_actions = tf.squeeze(self.predicted_values, axis=1) 97 | 98 | self.loss = tf.losses.mean_squared_error( 99 | labels=self.td_target, 100 | predictions=self.predicted_values_for_actions) 101 | 102 | def _state_value(self, hidden_state, scope, reuse=False): 103 | with tf.variable_scope(scope, reuse=reuse): 104 | state_values = tf.layers.dense( 105 | hidden_state, 106 | units=1, 107 | activation=None) 108 | return state_values 109 | 110 | 111 | class QvalueNet(object): 112 | def __init__(self, hidden_state, n_actions, special=None): 113 | self.special = special or {} 114 | self.n_actions = n_actions 115 | 116 | self.actions = tf.placeholder(shape=[None], dtype=tf.int32, name="actions") 117 | self.td_target = tf.placeholder(shape=[None], dtype=tf.float32, name="td_target") 118 | self.is_training = tf.placeholder(dtype=tf.bool, name="is_training") 119 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 120 | 121 | self.optimizer = None 122 | self.train_op = None 123 | 124 | self.relative_scope = self.special.get("scope", "qvalue_network") 125 | self.scope = tf.get_variable_scope().name + "/" + self.relative_scope 126 | 127 | self.predicted_qvalues = self._qvalues( 128 | hidden_state, 129 | scope=self.relative_scope + "/qvalue", 130 | reuse=self.special.get("reuse_state_value", False)) 
131 | 132 | batch_size = tf.shape(self.actions)[0] 133 | predicted_ids = tf.range(batch_size) * tf.shape(self.predicted_qvalues)[1] + self.actions 134 | 135 | self.predicted_qvalues_for_actions = tf.gather( 136 | tf.reshape(self.predicted_qvalues, [-1]), predicted_ids) 137 | 138 | self.loss = tf.losses.mean_squared_error( 139 | labels=self.td_target, 140 | predictions=self.predicted_qvalues_for_actions) 141 | 142 | def _qvalues(self, hidden_state, scope, reuse=False): 143 | with tf.variable_scope(scope, reuse=reuse): 144 | qvalues = tf.layers.dense( 145 | hidden_state, 146 | units=self.n_actions, 147 | activation=None) 148 | if self.special.get("advantage", False): 149 | qvalues -= tf.reduce_mean(qvalues, axis=-1, keep_dims=True) 150 | return qvalues 151 | 152 | 153 | def copy_scope_parameters(sess, net1_scope, net2_scope): 154 | net1_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=net1_scope) 155 | net2_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=net2_scope) 156 | net1_params = sorted(net1_params, key=lambda v: v.name) 157 | net2_params = sorted(net2_params, key=lambda v: v.name) 158 | 159 | update_ops = [] 160 | for net1_v, net2_v in zip(net1_params, net2_params): 161 | op = net2_v.assign(net1_v) 162 | update_ops.append(op) 163 | 164 | sess.run(update_ops) 165 | 166 | 167 | def copy_model_parameters(sess, net1, net2): 168 | """ 169 | Copies the model parameters of one net to another. 170 | 171 | Args: 172 | sess: Tensorflow session instance 173 | net1: net to copy the parameters from 174 | net2: net to copy the parameters to 175 | """ 176 | 177 | copy_scope_parameters(sess, net1.scope, net2.scope) 178 | -------------------------------------------------------------------------------- /agents/agent_states.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib import rnn 3 | 4 | # 5 | # def get_state_variables(batch_size, cell): 6 | # zero_states = cell.zero_state(1, tf.float32) 7 | # if isinstance(zero_states, list): 8 | # state_variables = [] 9 | # for i, (state_c, state_h) in enumerate(zero_states): 10 | # init_state_c = tf.get_variable( 11 | # name="initial_state_vector_c:{}".format(i), 12 | # dtype=tf.float32, 13 | # initializer=state_c, 14 | # trainable=False) 15 | # init_state_h = tf.get_variable( 16 | # name="initial_state_vector_h:{}".format(i), 17 | # dtype=tf.float32, 18 | # initializer=state_h, 19 | # trainable=False) 20 | # init_state_c = tf.tile(init_state_c, [batch_size, 1]) 21 | # init_state_h = tf.tile(init_state_h, [batch_size, 1]) 22 | # state_variables.append( 23 | # rnn.LSTMStateTuple( 24 | # init_state_c, 25 | # init_state_h)) 26 | # # Return as a tuple, so that it can be fed to dynamic_rnn as an initial state 27 | # return tuple(state_variables) 28 | # elif isinstance(zero_states, tuple): 29 | # state_c, state_h = zero_states 30 | # init_state_c = tf.get_variable( 31 | # name="initial_state_vector_c", 32 | # dtype=tf.float32, 33 | # initializer=state_c, 34 | # trainable=False) 35 | # init_state_h = tf.get_variable( 36 | # name="initial_state_vector_h", 37 | # dtype=tf.float32, 38 | # initializer=state_h, 39 | # trainable=False) 40 | # import pdb; pdb.set_trace() 41 | # init_state_c = tf.tile(init_state_c, [batch_size, 1]) 42 | # init_state_h = tf.tile(init_state_h, [batch_size, 1]) 43 | # return rnn.LSTMStateTuple(init_state_c, init_state_h) 44 | # 45 | # 46 | # def get_state_update_op(state_variables, new_states, mask=None): 47 | # # Add an 
operation to update the train states with the last state tensors 48 | # update_ops = [] 49 | # for state_variable, new_state in zip(state_variables, new_states): 50 | # # Assign the new state to the state variables on this layer 51 | # if mask is None: 52 | # # @TODO: error here, tiled Tensor has no assign 53 | # update_ops.extend([ 54 | # state_variable[0].assign(new_state[0]), 55 | # state_variable[1].assign(new_state[1])]) 56 | # else: 57 | # update_ops.extend([ 58 | # state_variable[0].assign( 59 | # tf.where(mask, tf.zeros_like(new_state[0]), new_state[0])), 60 | # state_variable[1].assign( 61 | # tf.where(mask, tf.zeros_like(new_state[1]), new_state[1]))]) 62 | # # Return a tuple in order to combine all update_ops into a single operation. 63 | # # The tuple's actual value should not be used. 64 | # return tf.tuple(update_ops) 65 | # 66 | # 67 | # @TODO: rewrite for any cell, and refactor it 68 | # @TODO: Not working without known batch_size, so static! 69 | # class RecurrentHiddenState(object): 70 | # def __init__(self, feature_state, size=512, activation=tf.tanh): 71 | # self.is_end = tf.placeholder(shape=[None], dtype=tf.bool, name="is_end") 72 | # 73 | # self.is_training = tf.placeholder(dtype=tf.bool, name="is_training") 74 | # self.global_step = tf.Variable(0, name='global_step', trainable=False) 75 | # 76 | # self.loss = None 77 | # self.optimizer = None 78 | # self.train_op = None 79 | # 80 | # self.relative_scope = "hidden_state" 81 | # self.scope = tf.get_variable_scope().name + "/" + self.relative_scope 82 | # batch_size = tf.unstack(tf.shape(feature_state))[0] 83 | # 84 | # with tf.variable_scope(self.relative_scope): 85 | # self.cell = rnn.LSTMCell(size, activation=activation) 86 | # self.belief_state = get_state_variables(batch_size, self.cell) 87 | # # very bad dark magic, need to refactor all of this 88 | # # supports only ine layer cell 89 | # self.belief_out = tf.placeholder( 90 | # tf.float32, [2, None, self.cell.output_size]) 91 | # l = tf.unstack(self.belief_out, axis=0) 92 | # rnn_tuple_state = rnn.LSTMStateTuple(l[0], l[1]) 93 | # import pdb; pdb.set_trace() 94 | # self.belief_assign = get_state_update_op([self.belief_state], [rnn_tuple_state]) 95 | # 96 | # logits, rnn_states = tf.nn.dynamic_rnn( 97 | # self.cell, tf.expand_dims(feature_state, 1), 98 | # sequence_length=[1] * batch_size, initial_state=self.belief_state) 99 | # 100 | # self.state = tf.squeeze(logits, 1) 101 | # 102 | # # @TODO: very hacky 2 103 | # self.belief_update = get_state_update_op( 104 | # [self.belief_state], [rnn_states], self.is_end) 105 | 106 | 107 | def get_state_variables(batch_size, cell): 108 | zero_states = cell.zero_state(batch_size, tf.float32) 109 | if isinstance(zero_states, list): 110 | state_variables = [] 111 | for i, (state_c, state_h) in enumerate(zero_states): 112 | init_state_c = tf.get_variable( 113 | name="initial_state_vector_c:{}".format(i), 114 | dtype=tf.float32, 115 | initializer=state_c, 116 | trainable=False) 117 | init_state_h = tf.get_variable( 118 | name="initial_state_vector_h:{}".format(i), 119 | dtype=tf.float32, 120 | initializer=state_h, 121 | trainable=False) 122 | state_variables.append( 123 | rnn.LSTMStateTuple( 124 | init_state_c, 125 | init_state_h)) 126 | # Return as a tuple, so that it can be fed to dynamic_rnn as an initial state 127 | return tuple(state_variables) 128 | elif isinstance(zero_states, tuple): 129 | state_c, state_h = zero_states 130 | init_state_c = tf.get_variable( 131 | name="initial_state_vector_c", 132 | dtype=tf.float32, 
133 | initializer=state_c, 134 | trainable=False) 135 | init_state_h = tf.get_variable( 136 | name="initial_state_vector_h", 137 | dtype=tf.float32, 138 | initializer=state_h, 139 | trainable=False) 140 | return rnn.LSTMStateTuple(init_state_c, init_state_h) 141 | 142 | 143 | def get_state_update_op(state_variables, new_states, mask=None): 144 | # Add an operation to update the train states with the last state tensors 145 | update_ops = [] 146 | for state_variable, new_state in zip(state_variables, new_states): 147 | # Assign the new state to the state variables on this layer 148 | if mask is None: 149 | update_ops.extend([ 150 | state_variable[0].assign(new_state[0]), 151 | state_variable[1].assign(new_state[1])]) 152 | else: 153 | update_ops.extend([ 154 | state_variable[0].assign( 155 | tf.where(mask, tf.zeros_like(new_state[0]), new_state[0])), 156 | state_variable[1].assign( 157 | tf.where(mask, tf.zeros_like(new_state[1]), new_state[1]))]) 158 | # Return a tuple in order to combine all update_ops into a single operation. 159 | # The tuple's actual value should not be used. 160 | return tf.tuple(update_ops) 161 | 162 | 163 | class LinearHiddenState(object): 164 | def __init__(self, feature_state, size=512, activation=None): 165 | 166 | self.is_training = tf.placeholder(dtype=tf.bool, name="is_training") 167 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 168 | 169 | self.loss = None 170 | self.optimizer = None 171 | self.train_op = None 172 | 173 | self.relative_scope = "hidden_state" 174 | self.scope = tf.get_variable_scope().name + "/" + self.relative_scope 175 | 176 | with tf.variable_scope(self.relative_scope): 177 | self.state = tf.layers.dense( 178 | feature_state, 179 | size, 180 | activation=activation) 181 | 182 | 183 | class RecurrentHiddenState(object): 184 | def __init__(self, feature_state, size=512, activation=tf.tanh, batch_size=1): 185 | self.is_end = tf.placeholder(shape=[None], dtype=tf.bool, name="is_end") 186 | 187 | self.is_training = tf.placeholder(dtype=tf.bool, name="is_training") 188 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 189 | 190 | self.loss = None 191 | self.optimizer = None 192 | self.train_op = None 193 | 194 | self.relative_scope = "hidden_state" 195 | self.scope = tf.get_variable_scope().name + "/" + self.relative_scope 196 | 197 | with tf.variable_scope(self.relative_scope): 198 | self.cell = rnn.LSTMCell(size, activation=activation) 199 | self.belief_state = get_state_variables(batch_size, self.cell) 200 | # very bad dark magic, need to refactor all of this 201 | # supports only ine layer cell 202 | self.belief_out = tf.placeholder( 203 | tf.float32, [2, batch_size, self.cell.output_size]) 204 | l = tf.unstack(self.belief_out, axis=0) 205 | rnn_tuple_state = rnn.LSTMStateTuple(l[0], l[1]) 206 | self.belief_assign = get_state_update_op([self.belief_state], [rnn_tuple_state]) 207 | 208 | logits, rnn_states = tf.nn.dynamic_rnn( 209 | self.cell, tf.expand_dims(feature_state, 1), 210 | sequence_length=[1] * batch_size, initial_state=self.belief_state) 211 | 212 | self.state = tf.squeeze(logits, 1) 213 | 214 | # @TODO: very hacky 2 215 | self.belief_update = get_state_update_op([self.belief_state], [rnn_states], self.is_end) -------------------------------------------------------------------------------- /common/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Scitator/rl-course-experiments/47e7666e2bc618ca4007372f411c3626fd7efc2b/common/__init__.py -------------------------------------------------------------------------------- /common/buffer.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | 5 | from common.segment_tree import SumSegmentTree, MinSegmentTree 6 | 7 | 8 | class ReplayBuffer(object): 9 | def __init__(self, size): 10 | """Create Prioritized Replay buffer. 11 | 12 | Parameters 13 | ---------- 14 | size: int 15 | Max number of transitions to store in the buffer. When the buffer 16 | overflows the old memories are dropped. 17 | """ 18 | self._storage = [] 19 | self._maxsize = size 20 | self._next_idx = 0 21 | 22 | def __len__(self): 23 | return len(self._storage) 24 | 25 | def add(self, obs_t, action, reward, obs_tp1, done): 26 | data = (obs_t, action, reward, obs_tp1, done) 27 | 28 | if self._next_idx >= len(self._storage): 29 | self._storage.append(data) 30 | else: 31 | self._storage[self._next_idx] = data 32 | self._next_idx = (self._next_idx + 1) % self._maxsize 33 | 34 | def _encode_sample(self, idxes): 35 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] 36 | for i in idxes: 37 | data = self._storage[i] 38 | obs_t, action, reward, obs_tp1, done = data 39 | obses_t.append(np.array(obs_t, copy=False)) 40 | actions.append(np.array(action, copy=False)) 41 | rewards.append(reward) 42 | obses_tp1.append(np.array(obs_tp1, copy=False)) 43 | dones.append(done) 44 | return np.array(obses_t), \ 45 | np.array(actions), \ 46 | np.array(rewards), \ 47 | np.array(obses_tp1), \ 48 | np.array(dones) 49 | 50 | def sample(self, batch_size): 51 | """Sample a batch of experiences. 52 | 53 | Parameters 54 | ---------- 55 | batch_size: int 56 | How many transitions to sample. 57 | 58 | Returns 59 | ------- 60 | obs_batch: np.array 61 | batch of observations 62 | act_batch: np.array 63 | batch of actions executed given obs_batch 64 | rew_batch: np.array 65 | rewards received as results of executing act_batch 66 | next_obs_batch: np.array 67 | next set of observations seen after executing act_batch 68 | done_mask: np.array 69 | done_mask[i] = 1 if executing act_batch[i] resulted in 70 | the end of an episode and 0 otherwise. 71 | """ 72 | idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] 73 | return self._encode_sample(idxes) 74 | 75 | 76 | class PrioritizedReplayBuffer(ReplayBuffer): 77 | def __init__(self, size, alpha=0.5): 78 | """Create Prioritized Replay buffer. 79 | 80 | Parameters 81 | ---------- 82 | size: int 83 | Max number of transitions to store in the buffer. When the buffer 84 | overflows the old memories are dropped. 
85 | alpha: float 86 | how much prioritization is used 87 | (0 - no prioritization, 1 - full prioritization) 88 | 89 | See Also 90 | -------- 91 | ReplayBuffer.__init__ 92 | """ 93 | super(PrioritizedReplayBuffer, self).__init__(size) 94 | assert alpha > 0 95 | self._alpha = alpha 96 | 97 | it_capacity = 1 98 | while it_capacity < size: 99 | it_capacity *= 2 100 | 101 | self._it_sum = SumSegmentTree(it_capacity) 102 | self._it_min = MinSegmentTree(it_capacity) 103 | self._max_priority = 1.0 104 | 105 | def add(self, *args, **kwargs): 106 | """See ReplayBuffer.store_effect""" 107 | idx = self._next_idx 108 | super().add(*args, **kwargs) 109 | self._it_sum[idx] = self._max_priority ** self._alpha 110 | self._it_min[idx] = self._max_priority ** self._alpha 111 | 112 | def _sample_proportional(self, batch_size): 113 | res = [] 114 | for _ in range(batch_size): 115 | # TODO(szymon): should we ensure no repeats? 116 | mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1) 117 | idx = self._it_sum.find_prefixsum_idx(mass) 118 | res.append(idx) 119 | return res 120 | 121 | def sample(self, batch_size, beta=0.5): 122 | """Sample a batch of experiences. 123 | 124 | compared to ReplayBuffer.sample 125 | it also returns importance weights and idxes 126 | of sampled experiences. 127 | 128 | 129 | Parameters 130 | ---------- 131 | batch_size: int 132 | How many transitions to sample. 133 | beta: float 134 | To what degree to use importance weights 135 | (0 - no corrections, 1 - full correction) 136 | 137 | Returns 138 | ------- 139 | obs_batch: np.array 140 | batch of observations 141 | act_batch: np.array 142 | batch of actions executed given obs_batch 143 | rew_batch: np.array 144 | rewards received as results of executing act_batch 145 | next_obs_batch: np.array 146 | next set of observations seen after executing act_batch 147 | done_mask: np.array 148 | done_mask[i] = 1 if executing act_batch[i] resulted in 149 | the end of an episode and 0 otherwise. 150 | weights: np.array 151 | Array of shape (batch_size,) and dtype np.float32 152 | denoting importance weight of each sampled transition 153 | idxes: np.array 154 | Array of shape (batch_size,) and dtype np.int32 155 | idexes in buffer of sampled experiences 156 | """ 157 | assert beta > 0 158 | 159 | idxes = self._sample_proportional(batch_size) 160 | 161 | weights = [] 162 | p_min = self._it_min.min() / self._it_sum.sum() 163 | max_weight = (p_min * len(self._storage)) ** (-beta) 164 | 165 | for idx in idxes: 166 | p_sample = self._it_sum[idx] / self._it_sum.sum() 167 | weight = (p_sample * len(self._storage)) ** (-beta) 168 | weights.append(weight / max_weight) 169 | weights = np.array(weights) 170 | encoded_sample = self._encode_sample(idxes) 171 | return tuple(list(encoded_sample) + [weights, idxes]) 172 | 173 | def update_priorities(self, idxes, priorities): 174 | """Update priorities of sampled transitions. 175 | 176 | sets priority of transition at index idxes[i] in buffer 177 | to priorities[i]. 178 | 179 | Parameters 180 | ---------- 181 | idxes: [int] 182 | List of idxes of sampled transitions 183 | priorities: [float] 184 | List of updated priorities corresponding to 185 | transitions at the sampled idxes denoted by 186 | variable `idxes`. 
187 | """ 188 | assert len(idxes) == len(priorities) 189 | for idx, priority in zip(idxes, priorities): 190 | assert priority > 0 191 | assert 0 <= idx < len(self._storage) 192 | self._it_sum[idx] = priority ** self._alpha 193 | self._it_min[idx] = priority ** self._alpha 194 | 195 | self._max_priority = max(self._max_priority, priority) 196 | 197 | buffers = { 198 | "none": None, 199 | "simple": ReplayBuffer, 200 | "prioritized": PrioritizedReplayBuffer 201 | } -------------------------------------------------------------------------------- /common/networks.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | activations = { 5 | "sigmoid": tf.sigmoid, 6 | "tanh": tf.tanh, 7 | "relu": tf.nn.relu, 8 | "relu6": tf.nn.relu6, 9 | "elu": tf.nn.elu, 10 | "softplus": tf.nn.softplus 11 | } 12 | 13 | 14 | def linear_network( 15 | states, is_training=False, scope=None, reuse=False, 16 | layers=None, activation_fn=tf.nn.elu, use_bn=False, dropout=-1): 17 | layers = layers or [16, 16] 18 | x = states 19 | with tf.variable_scope(scope or "linear_network", reuse=reuse): 20 | for n_out in layers: 21 | x = tf.layers.dense(x, n_out, activation=None) 22 | if use_bn: 23 | x = tf.layers.batch_normalization(x, training=is_training) 24 | x = activation_fn(x) 25 | if dropout > 0: 26 | x = tf.layers.dropout(x, rate=dropout, training=is_training) 27 | return x 28 | 29 | 30 | def convolution_network( 31 | states, is_training=False, scope=None, reuse=False, 32 | n_filters=None, kernels=None, strides=None, 33 | activation_fn=tf.nn.elu, use_bn=False, dropout=-1): 34 | n_filters = n_filters or [32, 64, 64] 35 | kernels = kernels or [8, 4, 4] 36 | strides = strides or [4, 2, 1] 37 | x = states 38 | with tf.variable_scope(scope or "convolution_network", reuse=reuse): 39 | for n_filter, kernel, stride in zip(n_filters, kernels, strides): 40 | x = tf.layers.conv2d(x, n_filter, kernel, stride, activation=None) 41 | if use_bn: 42 | x = tf.layers.batch_normalization(x, training=is_training) 43 | x = activation_fn(x) 44 | if dropout > 0: 45 | x = tf.layers.dropout(x, rate=dropout, training=is_training) 46 | x = tf.contrib.layers.flatten(x) 47 | return x 48 | 49 | 50 | networks = { 51 | "linear": linear_network, 52 | "convolution": convolution_network 53 | } 54 | 55 | 56 | def network_wrapper(network, params): 57 | def wrapper(states, is_training=False, scope=None, reuse=False,): 58 | return network(states, is_training, scope, reuse, **params) 59 | return wrapper 60 | -------------------------------------------------------------------------------- /common/schedules.py: -------------------------------------------------------------------------------- 1 | """This file is used for specifying various schedules that evolve over 2 | time throughout the execution of the algorithm, such as: 3 | - learning rate for the optimizer 4 | - exploration epsilon for the epsilon greedy exploration strategy 5 | - beta parameter for beta parameter in prioritized replay 6 | 7 | Each schedule has a function `value(t)` which returns the current value 8 | of the parameter given the timestep t of the optimization procedure. 9 | """ 10 | 11 | 12 | class Schedule(object): 13 | def value(self, t): 14 | """Value of the schedule at time t""" 15 | raise NotImplementedError() 16 | 17 | 18 | class ConstantSchedule(object): 19 | def __init__(self, value): 20 | """Value remains constant over time. 
21 | 22 | Parameters 23 | ---------- 24 | value: float 25 | Constant value of the schedule 26 | """ 27 | self._v = value 28 | 29 | def value(self, t): 30 | """See Schedule.value""" 31 | return self._v 32 | 33 | 34 | def linear_interpolation(l, r, alpha): 35 | return l + alpha * (r - l) 36 | 37 | 38 | class PiecewiseSchedule(object): 39 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 40 | """Piecewise schedule. 41 | 42 | endpoints: [(int, int)] 43 | list of pairs `(time, value)` meanining that schedule should output 44 | `value` when `t==time`. All the values for time must be sorted in 45 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 46 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 47 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 48 | time passed between `time_a` and `time_b` for time `t`. 49 | interpolation: lambda float, float, float: float 50 | a function that takes value to the left and to the right of t according 51 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 52 | right endpoint that t has covered. See linear_interpolation for example. 53 | outside_value: float 54 | if the value is requested outside of all the intervals sepecified in 55 | `endpoints` this value is returned. If None then AssertionError is 56 | raised when outside value is requested. 57 | """ 58 | idxes = [e[0] for e in endpoints] 59 | assert idxes == sorted(idxes) 60 | self._interpolation = interpolation 61 | self._outside_value = outside_value 62 | self._endpoints = endpoints 63 | 64 | def value(self, t): 65 | """See Schedule.value""" 66 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 67 | if l_t <= t < r_t: 68 | alpha = float(t - l_t) / (r_t - l_t) 69 | return self._interpolation(l, r, alpha) 70 | 71 | # t does not belong to any of the pieces, so doom. 72 | assert self._outside_value is not None 73 | return self._outside_value 74 | 75 | 76 | class LinearSchedule(object): 77 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 78 | """Linear interpolation between initial_p and final_p over 79 | schedule_timesteps. After this many timesteps pass final_p is 80 | returned. 81 | 82 | Parameters 83 | ---------- 84 | schedule_timesteps: int 85 | Number of timesteps for which to linearly anneal initial_p 86 | to final_p 87 | initial_p: float 88 | initial output value 89 | final_p: float 90 | final output value 91 | """ 92 | self.schedule_timesteps = schedule_timesteps 93 | self.final_p = final_p 94 | self.initial_p = initial_p 95 | 96 | def value(self, t): 97 | """See Schedule.value""" 98 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 99 | return self.initial_p + fraction * (self.final_p - self.initial_p) 100 | -------------------------------------------------------------------------------- /common/segment_tree.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | 4 | class SegmentTree(object): 5 | def __init__(self, capacity, operation, neutral_element): 6 | """Build a Segment Tree data structure. 7 | 8 | https://en.wikipedia.org/wiki/Segment_tree 9 | 10 | Can be used as regular array, but with two 11 | important differences: 12 | 13 | a) setting item's value is slightly slower. 14 | It is O(lg capacity) instead of O(1). 
15 | b) user has access to an efficient `reduce` 16 | operation which reduces `operation` over 17 | a contiguous subsequence of items in the 18 | array. 19 | 20 | Paramters 21 | --------- 22 | capacity: int 23 | Total size of the array - must be a power of two. 24 | operation: lambda obj, obj -> obj 25 | and operation for combining elements (eg. sum, max) 26 | must for a mathematical group together with the set of 27 | possible values for array elements. 28 | neutral_element: obj 29 | neutral element for the operation above. eg. float('-inf') 30 | for max and 0 for sum. 31 | """ 32 | assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2." 33 | self._capacity = capacity 34 | self._value = [neutral_element for _ in range(2 * capacity)] 35 | self._operation = operation 36 | 37 | def _reduce_helper(self, start, end, node, node_start, node_end): 38 | if start == node_start and end == node_end: 39 | return self._value[node] 40 | mid = (node_start + node_end) // 2 41 | if end <= mid: 42 | return self._reduce_helper(start, end, 2 * node, node_start, mid) 43 | else: 44 | if mid + 1 <= start: 45 | return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end) 46 | else: 47 | return self._operation( 48 | self._reduce_helper(start, mid, 2 * node, node_start, mid), 49 | self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) 50 | ) 51 | 52 | def reduce(self, start=0, end=None): 53 | """Returns result of applying `self.operation` 54 | to a contiguous subsequence of the array. 55 | 56 | self.operation(arr[start], operation(arr[start+1], operation(... arr[end]))) 57 | 58 | Parameters 59 | ---------- 60 | start: int 61 | beginning of the subsequence 62 | end: int 63 | end of the subsequences 64 | 65 | Returns 66 | ------- 67 | reduced: obj 68 | result of reducing self.operation over the specified range of array elements. 69 | """ 70 | if end is None: 71 | end = self._capacity 72 | if end < 0: 73 | end += self._capacity 74 | end -= 1 75 | return self._reduce_helper(start, end, 1, 0, self._capacity - 1) 76 | 77 | def __setitem__(self, idx, val): 78 | # index of the leaf 79 | idx += self._capacity 80 | self._value[idx] = val 81 | idx //= 2 82 | while idx >= 1: 83 | self._value[idx] = self._operation( 84 | self._value[2 * idx], 85 | self._value[2 * idx + 1] 86 | ) 87 | idx //= 2 88 | 89 | def __getitem__(self, idx): 90 | assert 0 <= idx < self._capacity 91 | return self._value[self._capacity + idx] 92 | 93 | 94 | class SumSegmentTree(SegmentTree): 95 | def __init__(self, capacity): 96 | super(SumSegmentTree, self).__init__( 97 | capacity=capacity, 98 | operation=operator.add, 99 | neutral_element=0.0 100 | ) 101 | 102 | def sum(self, start=0, end=None): 103 | """Returns arr[start] + ... + arr[end]""" 104 | return super(SumSegmentTree, self).reduce(start, end) 105 | 106 | def find_prefixsum_idx(self, prefixsum): 107 | """Find the highest index `i` in the array such that 108 | sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum 109 | 110 | if array values are probabilities, this function 111 | allows to sample indexes according to the discrete 112 | probability efficiently. 
113 | 114 | Parameters 115 | ---------- 116 | perfixsum: float 117 | upperbound on the sum of array prefix 118 | 119 | Returns 120 | ------- 121 | idx: int 122 | highest index satisfying the prefixsum constraint 123 | """ 124 | assert 0 <= prefixsum <= self.sum() + 1e-5 125 | idx = 1 126 | while idx < self._capacity: # while non-leaf 127 | if self._value[2 * idx] > prefixsum: 128 | idx = 2 * idx 129 | else: 130 | prefixsum -= self._value[2 * idx] 131 | idx = 2 * idx + 1 132 | return idx - self._capacity 133 | 134 | 135 | class MinSegmentTree(SegmentTree): 136 | def __init__(self, capacity): 137 | super(MinSegmentTree, self).__init__( 138 | capacity=capacity, 139 | operation=min, 140 | neutral_element=float('inf') 141 | ) 142 | 143 | def min(self, start=0, end=None): 144 | """Returns min(arr[start], ..., arr[end])""" 145 | 146 | return super(MinSegmentTree, self).reduce(start, end) 147 | -------------------------------------------------------------------------------- /jedi_upload.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Use your power, Luke! 5 | """ 6 | 7 | import gym 8 | import argparse 9 | import json 10 | from glob import glob 11 | 12 | 13 | def force_upload(monitor_dir, correct_name, api_key): 14 | f_name = glob("{}/*manifest.json".format(monitor_dir))[0] 15 | with open(f_name, "r") as fin: 16 | data = json.load(fin) 17 | data["env_info"]["env_id"] = correct_name 18 | with open(f_name, "w") as fout: 19 | json.dump(data, fout, ensure_ascii=False, indent=4) 20 | gym.upload(monitor_dir, api_key=api_key) 21 | 22 | 23 | def _parse_args(): 24 | parser = argparse.ArgumentParser(description='') 25 | parser.add_argument('--monitor_dir', 26 | type=str) 27 | parser.add_argument('--correct_name', 28 | type=str) 29 | parser.add_argument('--api_key', 30 | type=str) 31 | 32 | args, _ = parser.parse_known_args() 33 | return args 34 | 35 | 36 | def main(): 37 | args = _parse_args() 38 | force_upload(args.monitor_dir, args.correct_name, args.api_key) 39 | 40 | if __name__ == '__main__': 41 | main() 42 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scipy 2 | gym 3 | tensorflow==1.1.0 -------------------------------------------------------------------------------- /wrappers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Scitator/rl-course-experiments/47e7666e2bc618ca4007372f411c3626fd7efc2b/wrappers/__init__.py -------------------------------------------------------------------------------- /wrappers/gym_wrappers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.misc import imresize 3 | import gym 4 | from gym.core import ObservationWrapper, Wrapper 5 | from gym.spaces.box import Box 6 | from gym.wrappers import SkipWrapper, TimeLimit 7 | from copy import copy 8 | import collections 9 | 10 | try: 11 | import ppaquette_gym_doom 12 | from ppaquette_gym_doom.wrappers.action_space import ToDiscrete 13 | except ImportError: 14 | print("no doom envs") 15 | 16 | 17 | Transition = collections.namedtuple( 18 | "Transition", 19 | ["state", "action", "reward", "next_state", "done"]) 20 | 21 | 22 | class PreprocessImage(ObservationWrapper): 23 | def __init__(self, env, height=64, width=64, grayscale=True, crop=None): 24 | """ 25 | A gym wrapper 
that crops and scales the observation to the desired shape and optionally converts it to grayscale. 26 | """ 27 | super(PreprocessImage, self).__init__(env) 28 | self.img_size = (height, width) 29 | self.grayscale = grayscale 30 | no_crop = lambda img: img 31 | self.crop = crop or no_crop 32 | 33 | n_colors = 1 if self.grayscale else 3 34 | self.observation_space = Box(0.0, 1.0, [height, width, n_colors]) 35 | 36 | def _observation(self, img): 37 | """crops, resizes, optionally grayscales and rescales the observation to [0, 1]""" 38 | img = self.crop(img) 39 | img = imresize(img, self.img_size) 40 | if self.grayscale: 41 | img = img.mean(-1, keepdims=True) 42 | img = img.astype('float32') / 255. 43 | return img 44 | 45 | 46 | class FrameBuffer(Wrapper): 47 | def __init__(self, env, n_frames=4, reshape_fn=None): 48 | """A gym wrapper that returns the last n_frames observations as a single observation. 49 | Useful for games like Atari and Doom with screen as input.""" 50 | super(FrameBuffer, self).__init__(env) 51 | self.framebuffer = np.zeros([n_frames, ] + list(env.observation_space.shape)) 52 | 53 | # default reshape fn: move the frame axis last and merge it into the channel axis 54 | if reshape_fn is None: 55 | shape_dims = list(range(len(self.framebuffer.shape))) 56 | shape_dims = shape_dims[1:] + [shape_dims[0]] 57 | 58 | result_shape = list(env.observation_space.shape) 59 | if len(result_shape) == 1: 60 | # vector observation, add a channel axis for the stacked frames 61 | result_shape += [1] 62 | result_shape[-1] = result_shape[-1] * n_frames 63 | 64 | reshape_fn = lambda x: np.transpose(x, shape_dims).reshape(result_shape) 65 | 66 | self.reshape_fn = reshape_fn 67 | self.observation_space = Box(0.0, 1.0, self.reshape_fn(self.framebuffer).shape) 68 | 69 | def reset(self): 70 | """resets the wrapped environment, returns the initial frame buffer""" 71 | self.framebuffer = np.zeros_like(self.framebuffer) 72 | self.update_buffer(self.env.reset()) 73 | return self.reshape_fn(self.framebuffer) 74 | 75 | def step(self, action): 76 | """steps the wrapped environment once, returns the updated frame buffer""" 77 | new_obs, r, done, info = self.env.step(action) 78 | self.update_buffer(new_obs) 79 | return self.reshape_fn(self.framebuffer), r, done, info 80 | 81 | def update_buffer(self, obs): 82 | """pushes the new observation into the buffer, drops the oldest one""" 83 | self.framebuffer = np.vstack([obs[None], self.framebuffer[:-1]]) 84 | 85 | 86 | class EnvPool(Wrapper): 87 | """ 88 | A pool of environment copies stepped in lockstep; environments that finish an episode are reset in place.
89 | """ 90 | 91 | def __init__(self, env, n_envs=16, autoreload_envs=False): 92 | super(EnvPool, self).__init__(env) 93 | self.initial_env = env 94 | self.n_envs = n_envs 95 | self.env_shape = env.observation_space.shape 96 | self.envs = [] 97 | self.recreate_envs() 98 | self.reset() 99 | 100 | def recreate_envs(self): 101 | self.close() 102 | self.envs = np.array([copy(self.initial_env) for _ in range(self.n_envs)]) 103 | 104 | def reset(self): 105 | self._states = np.zeros(shape=(self.n_envs,) + tuple(self.env_shape), dtype=np.float32) 106 | self._rewards = np.zeros(shape=self.n_envs, dtype=np.float32) 107 | self._dones = np.zeros(shape=self.n_envs, dtype=np.bool) 108 | for i, env in enumerate(self.envs): 109 | self._states[i] = env.reset() 110 | return self._states.copy() 111 | 112 | def step(self, actions): 113 | 114 | for i, (action, env) in enumerate(zip(actions, self.envs)): 115 | new_s, r, done, _ = env.step(action) 116 | self._rewards[i] = r 117 | self._dones[i] = done 118 | if not done: 119 | self._states[i] = new_s 120 | else: 121 | self._states[i] = env.reset() 122 | return self._states.copy(), self._rewards.copy(), self._dones.copy(), None 123 | 124 | def close(self): 125 | for env in self.envs: 126 | env.close() 127 | 128 | def pool_states(self): 129 | return self._states.copy() 130 | 131 | 132 | def make_env(env_name, n_games=1, episode_limit=None, n_frames=1, autoreload_envs=False): 133 | env = gym.make(env_name) if episode_limit is None else gym.make(env_name).env 134 | env = FrameBuffer(env, n_frames=n_frames) if n_frames > 1 else env 135 | if episode_limit is not None: 136 | env = TimeLimit(env, max_episode_steps=episode_limit) 137 | return EnvPool(env, n_games, autoreload_envs) if n_games > 0 else env 138 | 139 | 140 | def make_image_env( 141 | env_name, n_games=1, episode_limit=None, 142 | n_frames=1, autoreload_envs=False, 143 | width=64, height=64, 144 | grayscale=True, crop=None): 145 | env = gym.make(env_name) if episode_limit is None else gym.make(env_name).env 146 | if "ppaquette" in env_name: 147 | env = SkipWrapper(4)(ToDiscrete("minimal")(env)) 148 | env = PreprocessImage(env, width=width, height=height, grayscale=grayscale, crop=crop) 149 | env = FrameBuffer(env, n_frames=n_frames) if n_frames > 1 else env 150 | if episode_limit is not None: 151 | env = TimeLimit(env, max_episode_steps=episode_limit) 152 | return EnvPool(env, n_games, autoreload_envs) if n_games > 0 else env 153 | 154 | 155 | def make_env_wrapper(make_env_fn, params): 156 | def wrapper(env, n_games, episode_limit=None): 157 | return make_env_fn(env, n_games, episode_limit=episode_limit, **params) 158 | 159 | return wrapper 160 | -------------------------------------------------------------------------------- /wrappers/run_wrappers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | 4 | import gym 5 | import numpy as np 6 | import tensorflow as tf 7 | from rstools.utils.os_utils import save_history, save_model, create_if_need 8 | from rstools.visualization.plotter import plot_all_metrics 9 | 10 | from common.networks import activations, networks, network_wrapper 11 | from wrappers.gym_wrappers import make_env, make_image_env, make_env_wrapper 12 | 13 | try: 14 | import ppaquette_gym_doom 15 | except ImportError: 16 | print("no doom envs") 17 | 18 | 19 | def str2params(string, delimeter="-"): 20 | try: 21 | result = tuple(map(int, string.split(delimeter))) 22 | except: 23 | result = None 24 | return result 25 | 26 | 27 | def 
epsilon_greedy_policy(agent, sess, observations): 28 | probs = agent.predict_probs(sess, observations)  # samples from the predicted policy distribution (no epsilon is used here) 29 | actions = [np.random.choice(len(row), p=row) for row in probs] 30 | return actions 31 | 32 | 33 | # @TODO: rewrite in a vectorized numpy way (no for loop) 34 | def epsilon_greedy_actions(agent, sess, observations, epsilon=0.01): 35 | qvalues = agent.predict_qvalues(sess, observations) 36 | probs = np.ones_like(qvalues, dtype=float) * epsilon / agent.qvalue_net.n_actions 37 | best_action = np.argmax(qvalues, axis=-1) 38 | for i, action in enumerate(best_action): 39 | probs[i, action] += (1.0 - epsilon) 40 | actions = [np.random.choice(len(row), p=row) for row in probs] 41 | return actions 42 | 43 | 44 | def play_session(sess, agent, env, action_fn, t_max=int(1e10)): 45 | total_reward = 0 46 | 47 | s = env.reset() 48 | for t in range(t_max): 49 | a = action_fn(agent, sess, np.array([s], dtype=np.float32))[0] 50 | next_s, r, done, _ = env.step(a) 51 | total_reward += r 52 | 53 | if hasattr(agent, "update_belief_state"): 54 | agent.update_belief_state(sess, [s], [done]) 55 | 56 | s = next_s 57 | if done: 58 | break 59 | 60 | return total_reward, t 61 | 62 | 63 | def update_wraper( 64 | update_fn, 65 | **kwargs): 66 | def wrapper(*args): 67 | return update_fn(*args, **kwargs) 68 | 69 | return wrapper 70 | 71 | 72 | def create_agent(agent_cls, state_shape, n_actions, agent_agrs, use_target_network): 73 | agent = agent_cls( 74 | state_shape, n_actions, **agent_agrs) 75 | 76 | if use_target_network: 77 | targets_special = {**agent_agrs["special"], **{"scope": "target_" + agent.scope}} 78 | agent_agrs["special"] = targets_special 79 | target_agent = agent_cls( 80 | state_shape, n_actions, **agent_agrs) 81 | agent = (agent, target_agent) 82 | 83 | from pprint import pprint 84 | pprint([(v.name, v.get_shape().as_list()) for v in tf.trainable_variables()]) 85 | 86 | return agent 87 | 88 | 89 | def run_wrapper( 90 | n_games, learning_fn, update_fn, play_fn, action_fn, 91 | env_name, make_env_fn, agent_cls, 92 | run_args, agent_agrs, 93 | log_dir=None, episode_limit=None, 94 | plot_stats=False, api_key=None, 95 | load=False, gpu_option=0.4, 96 | use_target_network=False): 97 | env = make_env_fn(env_name, n_games, episode_limit=episode_limit) 98 | 99 | n_actions = env.action_space.n 100 | state_shape = env.observation_space.shape 101 | 102 | # hack, I know 103 | agent_agrs["special"]["batch_size"] = n_games 104 | agent = create_agent(agent_cls, state_shape, n_actions, agent_agrs, use_target_network) 105 | 106 | log_dir = log_dir or "./logs_" + "".join("_" if c in string.punctuation else c for c in env_name) 107 | create_if_need(log_dir) 108 | 109 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_option) 110 | with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess: 111 | saver = tf.train.Saver( 112 | var_list=tf.trainable_variables(), 113 | keep_checkpoint_every_n_hours=1) 114 | 115 | sess.run(tf.global_variables_initializer()) 116 | if load: 117 | saver.restore(sess, "{}/model.ckpt".format(log_dir)) 118 | 119 | save_model(sess, saver, log_dir) 120 | try: 121 | history = learning_fn( 122 | sess, agent, env, 123 | update_fn=update_fn, 124 | **run_args) 125 | save_history(history, log_dir) 126 | if plot_stats: 127 | plotter_dir = os.path.join(log_dir, "plotter") 128 | plot_all_metrics(history, save_dir=plotter_dir) 129 | except KeyboardInterrupt: 130 | print("Exiting training procedure") 131 | save_model(sess, saver, log_dir) 132 | 133 | if api_key is not None: 134 | tf.reset_default_graph() 135 | 
agent_agrs["special"]["batch_size"] = 1 136 | agent = create_agent(agent_cls, state_shape, n_actions, agent_agrs, use_target_network) 137 | 138 | env_name = env_name.replace("Deterministic", "") 139 | env = make_env_fn(env_name, -1, episode_limit=None) 140 | monitor_dir = os.path.join(log_dir, "monitor") 141 | 142 | env = gym.wrappers.Monitor(env, monitor_dir, force=True) 143 | 144 | with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess: 145 | saver = tf.train.Saver( 146 | var_list=tf.trainable_variables(), 147 | keep_checkpoint_every_n_hours=1) 148 | sess.run(tf.global_variables_initializer()) 149 | saver.restore(sess, "{}/model.ckpt".format(log_dir)) 150 | 151 | sessions = [play_fn(sess, agent, env, action_fn=action_fn) for _ in range(300)] 152 | 153 | env.close() 154 | gym.upload(monitor_dir, api_key=api_key) 155 | 156 | 157 | def typical_args(parser): 158 | parser.add_argument( 159 | '--env', 160 | type=str, 161 | default='CartPole-v0', # BreakoutDeterministic-v0 162 | help='The environment to use. (default: %(default)s)') 163 | 164 | # env pool params 165 | parser.add_argument( 166 | '--n_games', 167 | type=int, 168 | default=10, 169 | help='Number of parallel games to play during training. (default: %(default)s)') 170 | parser.add_argument( 171 | '--reload_envs', 172 | action='store_true', 173 | default=False, 174 | help='Flag for auto-reloading environments when they are done. (default: %(default)s)') 175 | 176 | parser.add_argument( 177 | '--n_epochs', 178 | type=int, 179 | default=100, 180 | help='Number of epochs to train. (default: %(default)s)') 181 | parser.add_argument( 182 | '--n_sessions', 183 | type=int, 184 | default=10, 185 | help='Number of game sessions to play per epoch. (default: %(default)s)') 186 | parser.add_argument( 187 | '--t_max', 188 | type=int, 189 | default=1000, 190 | help='Max steps to play per game session. (default: %(default)s)') 191 | parser.add_argument( 192 | '--episode_limit', 193 | type=int, 194 | default=None, 195 | help='Max environment steps per episode. (default: %(default)s)') 196 | parser.add_argument( 197 | '--plot_history', 198 | action='store_true', 199 | default=False, 200 | help='Plot graph with main train statistics (reward, loss, steps). (default: %(default)s)') 201 | parser.add_argument( 202 | '--api_key', 203 | type=str, 204 | default=None, 205 | help='Your API key to submit to gym. (default: %(default)s)') 206 | parser.add_argument( 207 | '--log_dir', 208 | type=str, 209 | default=None, 210 | help='Directory to store logs, checkpoints and monitor results. (default: %(default)s)') 211 | parser.add_argument( 212 | '--load', 213 | action='store_true', 214 | default=False, 215 | help='Flag to load previous model from log_dir. (default: %(default)s)') 216 | parser.add_argument( 217 | '--gpu_option', 218 | type=float, 219 | default=0.45, 220 | help='Fraction of GPU memory to use. (default: %(default)s)') 221 | 222 | # feature network params 223 | parser.add_argument( 224 | '--feature_network', 225 | type=str, 226 | choices=["linear", "convolution"], 227 | default="linear", 228 | help='Feature network type, used to create a vector representation of the state. ' 229 | '(default: %(default)s)') 230 | parser.add_argument( 231 | '--activation', 232 | type=str, 233 | default="elu", 234 | help='Typical activation for feature network. (default: %(default)s)') 235 | parser.add_argument( 236 | '--use_bn', 237 | action='store_true', 238 | default=False, 239 | help='Batchnorm usage flag. 
(default: %(default)s) - no batchnorm') 240 | parser.add_argument( 241 | '--dropout', 242 | type=float, 243 | default=-1, 244 | help='Dropout keep probability. (default: %(default)s) - no dropout') 245 | 246 | # special args for linear network 247 | parser.add_argument( 248 | '--layers', 249 | type=str, 250 | default=None, 251 | help='Linear feature network layers, split by \'-\'.') 252 | 253 | # special args for convolution network: 254 | parser.add_argument( 255 | '--n_filters', 256 | type=str, 257 | default=None, 258 | help='Convolution feature network filters, split by \'-\'.') 259 | parser.add_argument( 260 | '--kernels', 261 | type=str, 262 | default=None, 263 | help='Convolution feature network kernels, split by \'-\'.') 264 | parser.add_argument( 265 | '--strides', 266 | type=str, 267 | default=None, 268 | help='Convolution feature network strides, split by \'-\'.') 269 | 270 | # typical hidden state params 271 | parser.add_argument( 272 | '--hidden_size', 273 | type=int, 274 | default=512, 275 | help='Hidden state size. (default: %(default)s)') 276 | parser.add_argument( 277 | '--hidden_activation', 278 | type=str, 279 | default="elu", 280 | help='Hidden state activation. (default: %(default)s)') 281 | 282 | # typical optimization params 283 | parser.add_argument( 284 | '--feature_lr', 285 | type=float, 286 | default=1e-5, 287 | help='Learning rate for feature network. (default: %(default)s)') 288 | parser.add_argument( 289 | '--lr_decay_steps', 290 | type=float, 291 | default=1e5, 292 | help='Learning rate decay steps. (default: %(default)s)') 293 | parser.add_argument( 294 | '--lr_decay', 295 | type=float, 296 | default=0.999, 297 | help='Learning rate decay factor. (default: %(default)s)') 298 | parser.add_argument( 299 | '--grad_clip', 300 | type=float, 301 | default=1.0, 302 | help='Gradient clip factor. (default: %(default)s)') 303 | 304 | # update args 305 | parser.add_argument( 306 | '--gamma', 307 | type=float, 308 | default=0.99, 309 | help='Gamma discount factor. (default: %(default)s)') 310 | parser.add_argument( 311 | '--reward_norm', 312 | type=float, 313 | default=1.0, 314 | help='Reward norm factor. (default: %(default)s)') 315 | parser.add_argument( 316 | '--batch_size', 317 | type=int, 318 | default=10, 319 | help='Batch size for update; should be larger than the number of parallel games. ' 320 | '(default: %(default)s)') 321 | parser.add_argument( 322 | '--time_major', 323 | action='store_true', 324 | default=False, 325 | help='Time-major flag for update. (default: %(default)s) - batch-major') 326 | 327 | # preprocess args for image envs 328 | parser.add_argument( 329 | '--image_width', 330 | type=int, 331 | default=64, 332 | help='Image-based environments preprocessing, resize to this width. ' 333 | '(default: %(default)s)') 334 | parser.add_argument( 335 | '--image_height', 336 | type=int, 337 | default=64, 338 | help='Image-based environments preprocessing, resize to this height. ' 339 | '(default: %(default)s)') 340 | parser.add_argument( 341 | '--image_grayscale', 342 | action='store_true', 343 | default=False, 344 | help='Image-based environments preprocessing, flag to convert the state image to grayscale.') 345 | parser.add_argument( 346 | '--image_corners', 347 | type=str, 348 | default=None, 349 | help='Image-based environments preprocessing, crop corners (x1-x2-y1-y2) split by \'-\'.') 350 | parser.add_argument( 351 | '--n_frames', 352 | type=int, 353 | default=1, 354 | help='Number of memory frames to use. 
(default: %(default)s)') 355 | 356 | return parser 357 | 358 | 359 | def typical_argsparse(args): 360 | if args.feature_network == "linear": 361 | network_args = { 362 | "layers": str2params(args.layers) 363 | } 364 | 365 | env_args = { 366 | "n_frames": args.n_frames, 367 | "autoreload_envs": args.reload_envs  # matches the --reload_envs flag defined in typical_args 368 | } 369 | make_env_fn = make_env_wrapper(make_env, env_args) 370 | elif args.feature_network == "convolution": 371 | network_args = { 372 | "n_filters": str2params(args.n_filters), 373 | "kernels": str2params(args.kernels), 374 | "strides": str2params(args.strides) 375 | } 376 | 377 | corners = str2params(args.image_corners) 378 | if corners is not None: 379 | image_crop_x1, image_crop_x2, image_crop_y1, image_crop_y2 = corners 380 | crop_fn = lambda img: img[image_crop_x1:image_crop_x2, image_crop_y1:image_crop_y2] 381 | else: 382 | crop_fn = None 383 | 384 | image_preprocessing_params = { 385 | "width": int(args.image_width), 386 | "height": int(args.image_height), 387 | "grayscale": args.image_grayscale, 388 | "crop": crop_fn, 389 | "n_frames": int(args.n_frames), 390 | "autoreload_envs": args.reload_envs 391 | } 392 | 393 | make_env_fn = make_env_wrapper(make_image_env, image_preprocessing_params) 394 | else: 395 | raise NotImplementedError() 396 | 397 | network = network_wrapper( 398 | networks[args.feature_network], { 399 | **network_args, **{ 400 | "activation_fn": activations[args.activation], 401 | "use_bn": args.use_bn, 402 | "dropout": args.dropout 403 | }}) 404 | 405 | run_args = { 406 | "n_epochs": int(args.n_epochs), 407 | "n_sessions": int(args.n_sessions), 408 | "t_max": int(args.t_max) 409 | } 410 | update_args = { 411 | "discount_factor": float(args.gamma), 412 | "reward_norm": float(args.reward_norm), 413 | "batch_size": int(args.batch_size), 414 | "time_major": args.time_major, 415 | } 416 | optimization_params = { 417 | "initial_lr": float(args.feature_lr), 418 | "decay_steps": int(args.lr_decay_steps), 419 | "lr_decay": float(args.lr_decay), 420 | "grad_clip": float(args.grad_clip) 421 | } 422 | 423 | return network, run_args, update_args, optimization_params, make_env_fn 424 | -------------------------------------------------------------------------------- /xvfb: -------------------------------------------------------------------------------- 1 | # taken from https://gist.github.com/jterrace/2911875 2 | XVFB=/usr/bin/Xvfb 3 | XVFBARGS=":1 -screen 0 1024x768x24 -ac +extension GLX +render -noreset" 4 | PIDFILE=./xvfb.pid 5 | case "$1" in 6 | start) 7 | echo -n "Starting virtual X frame buffer: Xvfb" 8 | start-stop-daemon --start --quiet --pidfile $PIDFILE --make-pidfile --background --exec $XVFB -- $XVFBARGS 9 | echo "." 10 | ;; 11 | stop) 12 | echo -n "Stopping virtual X frame buffer: Xvfb" 13 | start-stop-daemon --stop --quiet --pidfile $PIDFILE 14 | echo "." 15 | ;; 16 | restart) 17 | $0 stop 18 | $0 start 19 | ;; 20 | *) 21 | echo "Usage: /etc/init.d/xvfb {start|stop|restart}" 22 | exit 1 23 | esac 24 | 25 | exit 0 26 | --------------------------------------------------------------------------------