├── Chapter02 └── Code.ipynb ├── Chapter03 ├── frozenlake8x8_policyiteration.py └── frozenlake8x8_valueiteration.py ├── Chapter04 └── SARSA Q_learning Taxi-v2.py ├── Chapter05 ├── .ipynb_checkpoints │ └── Untitled-checkpoint.ipynb ├── DQN_Atari.py ├── DQN_variations_Atari.py ├── Untitled.ipynb ├── atari_wrappers.py └── untitled ├── Chapter06 ├── AC.py ├── REINFORCE.py └── REINFORCE_baseline.py ├── Chapter07 ├── PPO.py └── TRPO.py ├── Chapter08 ├── DDPG.py └── TD3.py ├── Chapter09 └── ME-TRPO.py ├── Chapter10 ├── DAgger.py └── expert │ ├── checkpoint │ ├── model.ckpt.data-00000-of-00001 │ ├── model.ckpt.index │ └── model.ckpt.meta ├── Chapter11 └── ES.py ├── Chapter12 └── ESBAS.py └── README.md /Chapter02/Code.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#### TensorFlow installation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "`pip3 install tensorflow`\n", 15 | "\n", 16 | "or\n", 17 | "\n", 18 | "`pip3 install tensorflow-gpu`" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "#### OpenAI Gym installation" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "On OSX: \n", 33 | "\n", 34 | "`brew install cmake boost boost-python sdl2 swig wget`\n", 35 | " \n", 36 | "On Ubuntu 16.04:\n", 37 | "\n", 38 | "`apt-get install -y python-pyglet python3-opengl zlib1g-dev libjpeg-dev patchelf cmake swig libboost-all-dev libsdl2-dev libosmesa6-dev xvfb ffmpeg`\n", 39 | "\n", 40 | "On Ubuntu 18.04\n", 41 | "\n", 42 | "`sudo apt install -y python3-dev zlib1g-dev libjpeg-dev cmake swig python-pyglet python3-opengl libboost-all-dev libsdl2-dev libosmesa6-dev patchelf ffmpeg xvfb `\n", 43 | "\n", 44 | "Then:\n", 45 | "\n", 46 | "```\n", 47 | "git clone https://github.com/openai/gym.git \n", 48 | "\n", 49 | "cd gym\n", 50 | "\n", 51 | "pip install -e '.[all]'\n", 52 | "```\n", 53 | "\n", 54 | "PyBox2D:\n", 55 | "\n", 56 | "```\n", 57 | "git clone https://github.com/pybox2d/pybox2d\n", 58 | "cd pybox2d\n", 59 | "pip3 install -e .\n", 60 | "```\n" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "#### Duckietown installation\n", 68 | "\n", 69 | "```\n", 70 | "git clone https://github.com/duckietown/gym-duckietown.git\n", 71 | "cd gym-duckietown\n", 72 | "pip3 install -e .\n", 73 | "```" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "#### Roboschool installation\n", 81 | "\n", 82 | "```\n", 83 | "git clone https://github.com/openai/roboschool\n", 84 | "cd roboschool\n", 85 | "ROBOSCHOOL_PATH=`pwd`\n", 86 | "git clone https://github.com/olegklimov/bullet3 -b roboschool_self_collision\n", 87 | "mkdir bullet3/build\n", 88 | "cd bullet3/build\n", 89 | "cmake -DBUILD_SHARED_LIBS=ON -DUSE_DOUBLE_PRECISION=1 -DCMAKE_INSTALL_PREFIX:PATH=$ROBOSCHOOL_PATH/roboschool/cpp-household/bullet_local_install -DBUILD_CPU_DEMOS=OFF -DBUILD_BULLET2_DEMOS=OFF -DBUILD_EXTRAS=OFF -DBUILD_UNIT_TESTS=OFF -DBUILD_CLSOCKET=OFF -DBUILD_ENET=OFF -DBUILD_OPENGL3_DEMOS=OFF ..\n", 90 | "\n", 91 | "make -j4\n", 92 | "make install\n", 93 | "cd ../..\n", 94 | "pip3 install -e $ROBOSCHOOL_PATH\n", 95 | "```" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## RL cycle" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | 
"execution_count": 1, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "name": "stdout", 112 | "output_type": "stream", 113 | "text": [ 114 | "\u001b[33mWARN: gym.spaces.Box autodetected dtype as . Please provide explicit dtype.\u001b[0m\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "import gym\n", 120 | "\n", 121 | "# create the environment \n", 122 | "env = gym.make(\"CartPole-v1\")\n", 123 | "# reset the environment before starting\n", 124 | "env.reset()\n", 125 | "\n", 126 | "# loop 10 times\n", 127 | "for i in range(10):\n", 128 | " # take a random action\n", 129 | " env.step(env.action_space.sample())\n", 130 | " # render the game\n", 131 | " env.render()\n", 132 | "\n", 133 | "# close the environment\n", 134 | "env.close()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 2, 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "\u001b[33mWARN: gym.spaces.Box autodetected dtype as . Please provide explicit dtype.\u001b[0m\n", 147 | "Episode 0 finished, reward:15\n", 148 | "Episode 1 finished, reward:13\n", 149 | "Episode 2 finished, reward:20\n", 150 | "Episode 3 finished, reward:22\n", 151 | "Episode 4 finished, reward:13\n", 152 | "Episode 5 finished, reward:18\n", 153 | "Episode 6 finished, reward:15\n", 154 | "Episode 7 finished, reward:12\n", 155 | "Episode 8 finished, reward:58\n", 156 | "Episode 9 finished, reward:15\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "import gym\n", 162 | "\n", 163 | "# create and initialize the environment\n", 164 | "env = gym.make(\"CartPole-v1\")\n", 165 | "env.reset()\n", 166 | "\n", 167 | "# play 10 games\n", 168 | "for i in range(10):\n", 169 | " # initialize the variables\n", 170 | " done = False\n", 171 | " game_rew = 0\n", 172 | "\n", 173 | " while not done:\n", 174 | " # choose a random action\n", 175 | " action = env.action_space.sample()\n", 176 | " # take a step in the environment\n", 177 | " new_obs, rew, done, info = env.step(action)\n", 178 | " game_rew += rew\n", 179 | " \n", 180 | " # when is done, print the cumulative reward of the game and reset the environment\n", 181 | " if done:\n", 182 | " print('Episode %d finished, reward:%d' % (i, game_rew))\n", 183 | " env.reset()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 3, 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "name": "stdout", 193 | "output_type": "stream", 194 | "text": [ 195 | "\u001b[33mWARN: gym.spaces.Box autodetected dtype as . 
Please provide explicit dtype.\u001b[0m\n", 196 | "Box(4,)\n" 197 | ] 198 | } 199 | ], 200 | "source": [ 201 | "import gym\n", 202 | "\n", 203 | "env = gym.make('CartPole-v1')\n", 204 | "print(env.observation_space)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 4, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stdout", 214 | "output_type": "stream", 215 | "text": [ 216 | "Discrete(2)\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "print(env.action_space)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 5, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "name": "stdout", 231 | "output_type": "stream", 232 | "text": [ 233 | "1\n", 234 | "0\n", 235 | "0\n" 236 | ] 237 | } 238 | ], 239 | "source": [ 240 | "print(env.action_space.sample())\n", 241 | "print(env.action_space.sample())\n", 242 | "print(env.action_space.sample())" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 6, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "name": "stdout", 252 | "output_type": "stream", 253 | "text": [ 254 | "[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]\n" 255 | ] 256 | } 257 | ], 258 | "source": [ 259 | "print(env.observation_space.low)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 7, 265 | "metadata": {}, 266 | "outputs": [ 267 | { 268 | "name": "stdout", 269 | "output_type": "stream", 270 | "text": [ 271 | "[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]\n" 272 | ] 273 | } 274 | ], 275 | "source": [ 276 | "print(env.observation_space.high)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "## TensorFlow" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 8, 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "name": "stderr", 293 | "output_type": "stream", 294 | "text": [ 295 | "c:\\users\\andrea\\appdata\\local\\programs\\python\\python35\\lib\\site-packages\\h5py\\__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 296 | " from ._conv import register_converters as _register_converters\n" 297 | ] 298 | }, 299 | { 300 | "name": "stdout", 301 | "output_type": "stream", 302 | "text": [ 303 | "Tensor(\"add:0\", shape=(), dtype=int32)\n", 304 | "7\n" 305 | ] 306 | } 307 | ], 308 | "source": [ 309 | "import tensorflow as tf\n", 310 | "\n", 311 | "# create two constants: a and b\n", 312 | "a = tf.constant(4)\n", 313 | "b = tf.constant(3)\n", 314 | "\n", 315 | "# perform a computation\n", 316 | "c = a + b\n", 317 | "print(c) # print the shape of c\n", 318 | "\n", 319 | "# create a session\n", 320 | "session = tf.Session()\n", 321 | "# run the session. 
It compute the sum\n", 322 | "res = session.run(c)\n", 323 | "print(res) # print the actual result" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 9, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "# reset the graph\n", 333 | "tf.reset_default_graph()" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "### Tensor" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 10, 346 | "metadata": {}, 347 | "outputs": [ 348 | { 349 | "name": "stdout", 350 | "output_type": "stream", 351 | "text": [ 352 | "()\n" 353 | ] 354 | } 355 | ], 356 | "source": [ 357 | "a = tf.constant(1)\n", 358 | "print(a.shape)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 11, 364 | "metadata": {}, 365 | "outputs": [ 366 | { 367 | "name": "stdout", 368 | "output_type": "stream", 369 | "text": [ 370 | "(5,)\n" 371 | ] 372 | } 373 | ], 374 | "source": [ 375 | "# array of five elements\n", 376 | "b = tf.constant([1,2,3,4,5])\n", 377 | "print(b.shape)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 12, 383 | "metadata": {}, 384 | "outputs": [ 385 | { 386 | "name": "stdout", 387 | "output_type": "stream", 388 | "text": [ 389 | "[1 2 3]\n" 390 | ] 391 | } 392 | ], 393 | "source": [ 394 | "#NB: a can be of any type of tensor\n", 395 | "a = tf.constant([1,2,3,4,5])\n", 396 | "first_three_elem = a[:3]\n", 397 | "fourth_elem = a[3]\n", 398 | "\n", 399 | "sess = tf.Session()\n", 400 | "print(sess.run(first_three_elem))" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 13, 406 | "metadata": {}, 407 | "outputs": [ 408 | { 409 | "name": "stdout", 410 | "output_type": "stream", 411 | "text": [ 412 | "4\n" 413 | ] 414 | } 415 | ], 416 | "source": [ 417 | "print(sess.run(fourth_elem))" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "#### Constant" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 14, 430 | "metadata": {}, 431 | "outputs": [ 432 | { 433 | "name": "stdout", 434 | "output_type": "stream", 435 | "text": [ 436 | "Tensor(\"a_const:0\", shape=(4,), dtype=float32)\n" 437 | ] 438 | } 439 | ], 440 | "source": [ 441 | "a = tf.constant([1.0, 1.1, 2.1, 3.1], dtype=tf.float32, name='a_const')\n", 442 | "print(a)" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": {}, 448 | "source": [ 449 | "#### Placeholder" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 15, 455 | "metadata": {}, 456 | "outputs": [ 457 | { 458 | "name": "stdout", 459 | "output_type": "stream", 460 | "text": [ 461 | "[[10.1 10.2 10.3]]\n" 462 | ] 463 | } 464 | ], 465 | "source": [ 466 | "a = tf.placeholder(shape=(1,3), dtype=tf.float32)\n", 467 | "b = tf.constant([[10,10,10]], dtype=tf.float32)\n", 468 | "\n", 469 | "c = a + b\n", 470 | "\n", 471 | "sess = tf.Session()\n", 472 | "res = sess.run(c, feed_dict={a:[[0.1,0.2,0.3]]})\n", 473 | "print(res)" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 16, 479 | "metadata": {}, 480 | "outputs": [], 481 | "source": [ 482 | "tf.reset_default_graph()" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": 17, 488 | "metadata": {}, 489 | "outputs": [ 490 | { 491 | "name": "stdout", 492 | "output_type": "stream", 493 | "text": [ 494 | "Tensor(\"Placeholder:0\", shape=(?, 3), dtype=float32)\n", 495 | "[[10.1 10.2 10.3]]\n", 496 | "[[7. 
7. 7.]\n", 497 | " [7. 7. 7.]]\n" 498 | ] 499 | } 500 | ], 501 | "source": [ 502 | "import numpy as np\n", 503 | "\n", 504 | "# NB: the fist dimension is 'None', meaning that it can be of any lenght\n", 505 | "a = tf.placeholder(shape=(None,3), dtype=tf.float32)\n", 506 | "b = tf.placeholder(shape=(None,3), dtype=tf.float32)\n", 507 | "\n", 508 | "c = a + b\n", 509 | "\n", 510 | "print(a)\n", 511 | "\n", 512 | "sess = tf.Session()\n", 513 | "print(sess.run(c, feed_dict={a:[[0.1,0.2,0.3]], b:[[10,10,10]]}))\n", 514 | "\n", 515 | "v_a = np.array([[1,2,3],[4,5,6]])\n", 516 | "v_b = np.array([[6,5,4],[3,2,1]])\n", 517 | "print(sess.run(c, feed_dict={a:v_a, b:v_b}))" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": 18, 523 | "metadata": {}, 524 | "outputs": [ 525 | { 526 | "name": "stdout", 527 | "output_type": "stream", 528 | "text": [ 529 | "[[10.1 10.2 10.3]]\n" 530 | ] 531 | } 532 | ], 533 | "source": [ 534 | "sess = tf.Session()\n", 535 | "print(sess.run(c, feed_dict={a:[[0.1,0.2,0.3]], b:[[10,10,10]]}))" 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "metadata": {}, 541 | "source": [ 542 | "#### Variable" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": 19, 548 | "metadata": {}, 549 | "outputs": [ 550 | { 551 | "name": "stdout", 552 | "output_type": "stream", 553 | "text": [ 554 | "[[0.4478302 0.7014905 0.36300516]]\n", 555 | "[[4 5]]\n" 556 | ] 557 | } 558 | ], 559 | "source": [ 560 | "tf.reset_default_graph()\n", 561 | "\n", 562 | "# variable initialized using the glorot uniform initializer\n", 563 | "var = tf.get_variable(\"first_variable\", shape=[1,3], dtype=tf.float32, initializer=tf.glorot_uniform_initializer)\n", 564 | "\n", 565 | "# variable initialized with constant values\n", 566 | "init_val = np.array([4,5])\n", 567 | "var2 = tf.get_variable(\"second_variable\", shape=[1,2], dtype=tf.int32, initializer=tf.constant_initializer(init_val))\n", 568 | "\n", 569 | "# create the session\n", 570 | "sess = tf.Session()\n", 571 | "# initialize all the variables\n", 572 | "sess.run(tf.global_variables_initializer())\n", 573 | "\n", 574 | "print(sess.run(var))\n", 575 | "\n", 576 | "print(sess.run(var2))" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 20, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [ 585 | "# not trainable variable\n", 586 | "var2 = tf.get_variable(\"variable\", shape=[1,2], trainable=False, dtype=tf.int32)" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": 21, 592 | "metadata": {}, 593 | "outputs": [ 594 | { 595 | "name": "stdout", 596 | "output_type": "stream", 597 | "text": [ 598 | "[, , ]\n" 599 | ] 600 | } 601 | ], 602 | "source": [ 603 | "print(tf.global_variables())" 604 | ] 605 | }, 606 | { 607 | "cell_type": "markdown", 608 | "metadata": {}, 609 | "source": [ 610 | "#### Graph" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 22, 616 | "metadata": {}, 617 | "outputs": [ 618 | { 619 | "data": { 620 | "text/plain": [ 621 | "-0.015899599" 622 | ] 623 | }, 624 | "execution_count": 22, 625 | "metadata": {}, 626 | "output_type": "execute_result" 627 | } 628 | ], 629 | "source": [ 630 | "tf.reset_default_graph()\n", 631 | "\n", 632 | "const1 = tf.constant(3.0, name='constant1')\n", 633 | "\n", 634 | "var = tf.get_variable(\"variable1\", shape=[1,2], dtype=tf.float32)\n", 635 | "var2 = tf.get_variable(\"variable2\", shape=[1,2], trainable=False, dtype=tf.float32)\n", 636 | "\n", 637 | "op1 = const1 * 
var\n", 638 | "op2 = op1 + var2\n", 639 | "op3 = tf.reduce_mean(op2)\n", 640 | "\n", 641 | "sess = tf.Session()\n", 642 | "sess.run(tf.global_variables_initializer())\n", 643 | "sess.run(op3)" 644 | ] 645 | }, 646 | { 647 | "cell_type": "markdown", 648 | "metadata": {}, 649 | "source": [ 650 | "### Simple Linear Regression Example\n" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": 23, 656 | "metadata": {}, 657 | "outputs": [ 658 | { 659 | "name": "stdout", 660 | "output_type": "stream", 661 | "text": [ 662 | "Epoch: 0, MSE: 4617.4390, W: 1.295, b: -0.407\n", 663 | "Epoch: 40, MSE: 5.3334, W: 0.496, b: -0.727\n", 664 | "Epoch: 80, MSE: 4.5894, W: 0.529, b: -0.012\n", 665 | "Epoch: 120, MSE: 4.1029, W: 0.512, b: 0.608\n", 666 | "Epoch: 160, MSE: 3.8552, W: 0.506, b: 1.092\n", 667 | "Epoch: 200, MSE: 3.7597, W: 0.501, b: 1.418\n", 668 | "Final weight: 0.500, bias: 1.473\n" 669 | ] 670 | } 671 | ], 672 | "source": [ 673 | "tf.reset_default_graph()\n", 674 | "\n", 675 | "np.random.seed(10)\n", 676 | "tf.set_random_seed(10)\n", 677 | "\n", 678 | "W, b = 0.5, 1.4\n", 679 | "# create a dataset of 100 examples\n", 680 | "X = np.linspace(0,100, num=100)\n", 681 | "# add random noise to the y labels\n", 682 | "y = np.random.normal(loc=W * X + b, scale=2.0, size=len(X))\n", 683 | "\n", 684 | "# create the placeholders\n", 685 | "x_ph = tf.placeholder(shape=[None,], dtype=tf.float32)\n", 686 | "y_ph = tf.placeholder(shape=[None,], dtype=tf.float32)\n", 687 | "\n", 688 | "# create the variables.\n", 689 | "v_weight = tf.get_variable(\"weight\", shape=[1], dtype=tf.float32)\n", 690 | "v_bias = tf.get_variable(\"bias\", shape=[1], dtype=tf.float32)\n", 691 | "\n", 692 | "# linear computation\n", 693 | "out = v_weight * x_ph + v_bias\n", 694 | "\n", 695 | "# compute the Mean Squared Error\n", 696 | "loss = tf.reduce_mean((out - y_ph)**2)\n", 697 | "\n", 698 | "# optimizer\n", 699 | "opt = tf.train.AdamOptimizer(0.4).minimize(loss)\n", 700 | "\n", 701 | "# create the session\n", 702 | "session = tf.Session()\n", 703 | "session.run(tf.global_variables_initializer())\n", 704 | "\n", 705 | "# loop to train the parameters\n", 706 | "for ep in range(210):\n", 707 | " # run the optimizer and get the loss\n", 708 | " train_loss, _ = session.run([loss, opt], feed_dict={x_ph:X, y_ph:y})\n", 709 | " \n", 710 | " # print epoch number and loss\n", 711 | " if ep % 40 == 0:\n", 712 | " print('Epoch: %3d, MSE: %.4f, W: %.3f, b: %.3f' % (ep, train_loss, session.run(v_weight), session.run(v_bias)))\n", 713 | " \n", 714 | "print('Final weight: %.3f, bias: %.3f' % (session.run(v_weight), session.run(v_bias)))" 715 | ] 716 | }, 717 | { 718 | "cell_type": "markdown", 719 | "metadata": {}, 720 | "source": [ 721 | "#### .. 
with TensorBoard" 722 | ] 723 | }, 724 | { 725 | "cell_type": "code", 726 | "execution_count": 24, 727 | "metadata": {}, 728 | "outputs": [ 729 | { 730 | "name": "stdout", 731 | "output_type": "stream", 732 | "text": [ 733 | "Epoch: 0, MSE: 4617.4390, W: 1.295, b: -0.407\n", 734 | "Epoch: 40, MSE: 5.3334, W: 0.496, b: -0.727\n", 735 | "Epoch: 80, MSE: 4.5894, W: 0.529, b: -0.012\n", 736 | "Epoch: 120, MSE: 4.1029, W: 0.512, b: 0.608\n", 737 | "Epoch: 160, MSE: 3.8552, W: 0.506, b: 1.092\n", 738 | "Epoch: 200, MSE: 3.7597, W: 0.501, b: 1.418\n", 739 | "Final weight: 0.500, bias: 1.473\n" 740 | ] 741 | } 742 | ], 743 | "source": [ 744 | "from datetime import datetime\n", 745 | "\n", 746 | "tf.reset_default_graph()\n", 747 | "\n", 748 | "np.random.seed(10)\n", 749 | "tf.set_random_seed(10)\n", 750 | "\n", 751 | "W, b = 0.5, 1.4\n", 752 | "# create a dataset of 100 examples\n", 753 | "X = np.linspace(0,100, num=100)\n", 754 | "# add random noise to the y labels\n", 755 | "y = np.random.normal(loc=W * X + b, scale=2.0, size=len(X))\n", 756 | "\n", 757 | "# create the placeholders\n", 758 | "x_ph = tf.placeholder(shape=[None,], dtype=tf.float32)\n", 759 | "y_ph = tf.placeholder(shape=[None,], dtype=tf.float32)\n", 760 | "\n", 761 | "# create the variables.\n", 762 | "v_weight = tf.get_variable(\"weight\", shape=[1], dtype=tf.float32)\n", 763 | "v_bias = tf.get_variable(\"bias\", shape=[1], dtype=tf.float32)\n", 764 | "\n", 765 | "# linear computation\n", 766 | "out = v_weight * x_ph + v_bias\n", 767 | "\n", 768 | "# compute the Mean Squared Error\n", 769 | "loss = tf.reduce_mean((out - y_ph)**2)\n", 770 | "\n", 771 | "# optimizer\n", 772 | "opt = tf.train.AdamOptimizer(0.4).minimize(loss)\n", 773 | "\n", 774 | "\n", 775 | "tf.summary.scalar('MSEloss', loss)\n", 776 | "tf.summary.histogram('model_weight', v_weight)\n", 777 | "tf.summary.histogram('model_bias', v_bias)\n", 778 | "all_summary = tf.summary.merge_all()\n", 779 | "\n", 780 | "now = datetime.now()\n", 781 | "clock_time = \"{}_{}.{}.{}\".format(now.day, now.hour, now.minute, now.second)\n", 782 | "file_writer = tf.summary.FileWriter('log_dir/'+clock_time, tf.get_default_graph())\n", 783 | "\n", 784 | "\n", 785 | "# create the session\n", 786 | "session = tf.Session()\n", 787 | "session.run(tf.global_variables_initializer())\n", 788 | "\n", 789 | "# loop to train the parameters\n", 790 | "for ep in range(210):\n", 791 | " # run the optimizer and get the loss\n", 792 | " train_loss, _, train_summary = session.run([loss, opt, all_summary], feed_dict={x_ph:X, y_ph:y})\n", 793 | " file_writer.add_summary(train_summary, ep)\n", 794 | " \n", 795 | " # print epoch number and loss\n", 796 | " if ep % 40 == 0:\n", 797 | " print('Epoch: %3d, MSE: %.4f, W: %.3f, b: %.3f' % (ep, train_loss, session.run(v_weight), session.run(v_bias)))\n", 798 | " \n", 799 | "print('Final weight: %.3f, bias: %.3f' % (session.run(v_weight), session.run(v_bias)))\n", 800 | "file_writer.close()" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": null, 806 | "metadata": {}, 807 | "outputs": [], 808 | "source": [] 809 | } 810 | ], 811 | "metadata": { 812 | "kernelspec": { 813 | "display_name": "Python 3", 814 | "language": "python", 815 | "name": "python3" 816 | }, 817 | "language_info": { 818 | "codemirror_mode": { 819 | "name": "ipython", 820 | "version": 3 821 | }, 822 | "file_extension": ".py", 823 | "mimetype": "text/x-python", 824 | "name": "python", 825 | "nbconvert_exporter": "python", 826 | "pygments_lexer": "ipython3", 827 | "version": 
"3.5.2" 828 | } 829 | }, 830 | "nbformat": 4, 831 | "nbformat_minor": 2 832 | } 833 | -------------------------------------------------------------------------------- /Chapter03/frozenlake8x8_policyiteration.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | def eval_state_action(V, s, a, gamma=0.99): 5 | return np.sum([p * (rew + gamma*V[next_s]) for p, next_s, rew, _ in env.P[s][a]]) 6 | 7 | def policy_evaluation(V, policy, eps=0.0001): 8 | ''' 9 | Policy evaluation. Update the value function until it reach a steady state 10 | ''' 11 | while True: 12 | delta = 0 13 | # loop over all states 14 | for s in range(nS): 15 | old_v = V[s] 16 | # update V[s] using the Bellman equation 17 | V[s] = eval_state_action(V, s, policy[s]) 18 | delta = max(delta, np.abs(old_v - V[s])) 19 | 20 | if delta < eps: 21 | break 22 | 23 | def policy_improvement(V, policy): 24 | ''' 25 | Policy improvement. Update the policy based on the value function 26 | ''' 27 | policy_stable = True 28 | for s in range(nS): 29 | old_a = policy[s] 30 | # update the policy with the action that bring to the highest state value 31 | policy[s] = np.argmax([eval_state_action(V, s, a) for a in range(nA)]) 32 | if old_a != policy[s]: 33 | policy_stable = False 34 | 35 | return policy_stable 36 | 37 | 38 | def run_episodes(env, policy, num_games=100): 39 | ''' 40 | Run some games to test a policy 41 | ''' 42 | tot_rew = 0 43 | state = env.reset() 44 | 45 | for _ in range(num_games): 46 | done = False 47 | while not done: 48 | # select the action accordingly to the policy 49 | next_state, reward, done, _ = env.step(policy[state]) 50 | 51 | state = next_state 52 | tot_rew += reward 53 | if done: 54 | state = env.reset() 55 | 56 | print('Won %i of %i games!'%(tot_rew, num_games)) 57 | 58 | 59 | if __name__ == '__main__': 60 | # create the environment 61 | env = gym.make('FrozenLake-v0') 62 | # enwrap it to have additional information from it 63 | env = env.unwrapped 64 | 65 | # spaces dimension 66 | nA = env.action_space.n 67 | nS = env.observation_space.n 68 | 69 | # initializing value function and policy 70 | V = np.zeros(nS) 71 | policy = np.zeros(nS) 72 | 73 | # some useful variable 74 | policy_stable = False 75 | it = 0 76 | 77 | while not policy_stable: 78 | policy_evaluation(V, policy) 79 | policy_stable = policy_improvement(V, policy) 80 | it += 1 81 | 82 | print('Converged after %i policy iterations'%(it)) 83 | run_episodes(env, policy) 84 | print(V.reshape((4,4))) 85 | print(policy.reshape((4,4))) -------------------------------------------------------------------------------- /Chapter03/frozenlake8x8_valueiteration.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | def eval_state_action(V, s, a, gamma=0.99): 5 | return np.sum([p * (rew + gamma*V[next_s]) for p, next_s, rew, _ in env.P[s][a]]) 6 | 7 | def value_iteration(eps=0.0001): 8 | ''' 9 | Value iteration algorithm 10 | ''' 11 | V = np.zeros(nS) 12 | it = 0 13 | 14 | while True: 15 | delta = 0 16 | # update the value of each state using as "policy" the max operator 17 | for s in range(nS): 18 | old_v = V[s] 19 | V[s] = np.max([eval_state_action(V, s, a) for a in range(nA)]) 20 | delta = max(delta, np.abs(old_v - V[s])) 21 | 22 | if delta < eps: 23 | break 24 | else: 25 | print('Iter:', it, ' delta:', np.round(delta, 5)) 26 | it += 1 27 | 28 | return V 29 | 30 | def run_episodes(env, V, num_games=100): 31 | ''' 32 | 
Run some test games 33 | ''' 34 | tot_rew = 0 35 | state = env.reset() 36 | 37 | for _ in range(num_games): 38 | done = False 39 | while not done: 40 | action = np.argmax([eval_state_action(V, state, a) for a in range(nA)]) 41 | next_state, reward, done, _ = env.step(action) 42 | 43 | state = next_state 44 | tot_rew += reward 45 | if done: 46 | state = env.reset() 47 | 48 | print('Won %i of %i games!'%(tot_rew, num_games)) 49 | 50 | 51 | if __name__ == '__main__': 52 | # create the environment 53 | env = gym.make('FrozenLake-v0') 54 | # enwrap it to have additional information from it 55 | env = env.unwrapped 56 | 57 | # spaces dimension 58 | nA = env.action_space.n 59 | nS = env.observation_space.n 60 | 61 | # Value iteration 62 | V = value_iteration(eps=0.0001) 63 | # test the value function on 100 games 64 | run_episodes(env, V, 100) 65 | # print the state values 66 | print(V.reshape((4,4))) 67 | 68 | -------------------------------------------------------------------------------- /Chapter04/SARSA Q_learning Taxi-v2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | 5 | def eps_greedy(Q, s, eps=0.1): 6 | ''' 7 | Epsilon greedy policy 8 | ''' 9 | if np.random.uniform(0,1) < eps: 10 | # Choose a random action 11 | return np.random.randint(Q.shape[1]) 12 | else: 13 | # Choose the action of a greedy policy 14 | return greedy(Q, s) 15 | 16 | 17 | def greedy(Q, s): 18 | ''' 19 | Greedy policy 20 | 21 | return the index corresponding to the maximum action-state value 22 | ''' 23 | return np.argmax(Q[s]) 24 | 25 | 26 | def run_episodes(env, Q, num_episodes=100, to_print=False): 27 | ''' 28 | Run some episodes to test the policy 29 | ''' 30 | tot_rew = [] 31 | state = env.reset() 32 | 33 | for _ in range(num_episodes): 34 | done = False 35 | game_rew = 0 36 | 37 | while not done: 38 | # select a greedy action 39 | next_state, rew, done, _ = env.step(greedy(Q, state)) 40 | 41 | state = next_state 42 | game_rew += rew 43 | if done: 44 | state = env.reset() 45 | tot_rew.append(game_rew) 46 | 47 | if to_print: 48 | print('Mean score: %.3f of %i games!'%(np.mean(tot_rew), num_episodes)) 49 | 50 | return np.mean(tot_rew) 51 | 52 | def Q_learning(env, lr=0.01, num_episodes=10000, eps=0.3, gamma=0.95, eps_decay=0.00005): 53 | nA = env.action_space.n 54 | nS = env.observation_space.n 55 | 56 | # Initialize the Q matrix 57 | # Q: matrix nS*nA where each row represent a state and each colums represent a different action 58 | Q = np.zeros((nS, nA)) 59 | games_reward = [] 60 | test_rewards = [] 61 | 62 | for ep in range(num_episodes): 63 | state = env.reset() 64 | done = False 65 | tot_rew = 0 66 | 67 | # decay the epsilon value until it reaches the threshold of 0.01 68 | if eps > 0.01: 69 | eps -= eps_decay 70 | 71 | # loop the main body until the environment stops 72 | while not done: 73 | # select an action following the eps-greedy policy 74 | action = eps_greedy(Q, state, eps) 75 | 76 | next_state, rew, done, _ = env.step(action) # Take one step in the environment 77 | 78 | # Q-learning update the state-action value (get the max Q value for the next state) 79 | Q[state][action] = Q[state][action] + lr*(rew + gamma*np.max(Q[next_state]) - Q[state][action]) 80 | 81 | state = next_state 82 | tot_rew += rew 83 | if done: 84 | games_reward.append(tot_rew) 85 | 86 | # Test the policy every 300 episodes and print the results 87 | if (ep % 300) == 0: 88 | test_rew = run_episodes(env, Q, 1000) 89 | print("Episode:{:5d} Eps:{:2.4f} 
Rew:{:2.4f}".format(ep, eps, test_rew)) 90 | test_rewards.append(test_rew) 91 | 92 | return Q 93 | 94 | 95 | def SARSA(env, lr=0.01, num_episodes=10000, eps=0.3, gamma=0.95, eps_decay=0.00005): 96 | nA = env.action_space.n 97 | nS = env.observation_space.n 98 | 99 | # Initialize the Q matrix 100 | # Q: matrix nS*nA where each row represent a state and each colums represent a different action 101 | Q = np.zeros((nS, nA)) 102 | games_reward = [] 103 | test_rewards = [] 104 | 105 | for ep in range(num_episodes): 106 | state = env.reset() 107 | done = False 108 | tot_rew = 0 109 | 110 | # decay the epsilon value until it reaches the threshold of 0.01 111 | if eps > 0.01: 112 | eps -= eps_decay 113 | 114 | 115 | action = eps_greedy(Q, state, eps) 116 | 117 | # loop the main body until the environment stops 118 | while not done: 119 | next_state, rew, done, _ = env.step(action) # Take one step in the environment 120 | 121 | # choose the next action (needed for the SARSA update) 122 | next_action = eps_greedy(Q, next_state, eps) 123 | # SARSA update 124 | Q[state][action] = Q[state][action] + lr*(rew + gamma*Q[next_state][next_action] - Q[state][action]) 125 | 126 | state = next_state 127 | action = next_action 128 | tot_rew += rew 129 | if done: 130 | games_reward.append(tot_rew) 131 | 132 | # Test the policy every 300 episodes and print the results 133 | if (ep % 300) == 0: 134 | test_rew = run_episodes(env, Q, 1000) 135 | print("Episode:{:5d} Eps:{:2.4f} Rew:{:2.4f}".format(ep, eps, test_rew)) 136 | test_rewards.append(test_rew) 137 | 138 | return Q 139 | 140 | 141 | if __name__ == '__main__': 142 | env = gym.make('Taxi-v2') 143 | 144 | Q_qlearning = Q_learning(env, lr=.1, num_episodes=5000, eps=0.4, gamma=0.95, eps_decay=0.001) 145 | 146 | Q_sarsa = SARSA(env, lr=.1, num_episodes=5000, eps=0.4, gamma=0.95, eps_decay=0.001) -------------------------------------------------------------------------------- /Chapter05/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /Chapter05/DQN_Atari.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from datetime import datetime 5 | from collections import deque 6 | import time 7 | import sys 8 | 9 | from atari_wrappers import make_env 10 | 11 | 12 | gym.logger.set_level(40) 13 | 14 | current_milli_time = lambda: int(round(time.time() * 1000)) 15 | 16 | def cnn(x): 17 | ''' 18 | Convolutional neural network 19 | ''' 20 | x = tf.layers.conv2d(x, filters=16, kernel_size=8, strides=4, padding='valid', activation='relu') 21 | x = tf.layers.conv2d(x, filters=32, kernel_size=4, strides=2, padding='valid', activation='relu') 22 | return tf.layers.conv2d(x, filters=32, kernel_size=3, strides=1, padding='valid', activation='relu') 23 | 24 | 25 | def fnn(x, hidden_layers, output_layer, activation=tf.nn.relu, last_activation=None): 26 | ''' 27 | Feed-forward neural network 28 | ''' 29 | for l in hidden_layers: 30 | x = tf.layers.dense(x, units=l, activation=activation) 31 | return tf.layers.dense(x, units=output_layer, activation=last_activation) 32 | 33 | def qnet(x, hidden_layers, output_size, fnn_activation=tf.nn.relu, last_activation=None): 34 | ''' 35 | Deep Q network: CNN followed by FNN 36 | ''' 37 | x = cnn(x) 
38 | x = tf.layers.flatten(x) 39 | 40 | return fnn(x, hidden_layers, output_size, fnn_activation, last_activation) 41 | 42 | 43 | class ExperienceBuffer(): 44 | ''' 45 | Experience Replay Buffer 46 | ''' 47 | def __init__(self, buffer_size): 48 | self.obs_buf = deque(maxlen=buffer_size) 49 | self.rew_buf = deque(maxlen=buffer_size) 50 | self.act_buf = deque(maxlen=buffer_size) 51 | self.obs2_buf = deque(maxlen=buffer_size) 52 | self.done_buf = deque(maxlen=buffer_size) 53 | 54 | 55 | def add(self, obs, rew, act, obs2, done): 56 | # Add a new transition to the buffers 57 | self.obs_buf.append(obs) 58 | self.rew_buf.append(rew) 59 | self.act_buf.append(act) 60 | self.obs2_buf.append(obs2) 61 | self.done_buf.append(done) 62 | 63 | 64 | def sample_minibatch(self, batch_size): 65 | # Sample a minibatch of size batch_size 66 | mb_indices = np.random.randint(len(self.obs_buf), size=batch_size) 67 | 68 | mb_obs = scale_frames([self.obs_buf[i] for i in mb_indices]) 69 | mb_rew = [self.rew_buf[i] for i in mb_indices] 70 | mb_act = [self.act_buf[i] for i in mb_indices] 71 | mb_obs2 = scale_frames([self.obs2_buf[i] for i in mb_indices]) 72 | mb_done = [self.done_buf[i] for i in mb_indices] 73 | 74 | return mb_obs, mb_rew, mb_act, mb_obs2, mb_done 75 | 76 | def __len__(self): 77 | return len(self.obs_buf) 78 | 79 | 80 | def q_target_values(mini_batch_rw, mini_batch_done, av, discounted_value): 81 | ''' 82 | Calculate the target value y for each transition 83 | ''' 84 | max_av = np.max(av, axis=1) 85 | 86 | # if episode terminate, y take value r 87 | # otherwise, q-learning step 88 | 89 | ys = [] 90 | for r, d, av in zip(mini_batch_rw, mini_batch_done, max_av): 91 | if d: 92 | ys.append(r) 93 | else: 94 | q_step = r + discounted_value * av 95 | ys.append(q_step) 96 | 97 | assert len(ys) == len(mini_batch_rw) 98 | return ys 99 | 100 | def greedy(action_values): 101 | ''' 102 | Greedy policy 103 | ''' 104 | return np.argmax(action_values) 105 | 106 | def eps_greedy(action_values, eps=0.1): 107 | ''' 108 | Eps-greedy policy 109 | ''' 110 | if np.random.uniform(0,1) < eps: 111 | # Choose a uniform random action 112 | return np.random.randint(len(action_values)) 113 | else: 114 | # Choose the greedy action 115 | return np.argmax(action_values) 116 | 117 | def test_agent(env_test, agent_op, num_games=20): 118 | ''' 119 | Test an agent 120 | ''' 121 | games_r = [] 122 | 123 | for _ in range(num_games): 124 | d = False 125 | game_r = 0 126 | o = env_test.reset() 127 | 128 | while not d: 129 | # Use an eps-greedy policy with eps=0.05 (to add stochasticity to the policy) 130 | # Needed because Atari envs are deterministic 131 | # If you would use a greedy policy, the results will be always the same 132 | a = eps_greedy(np.squeeze(agent_op(o)), eps=0.05) 133 | o, r, d, _ = env_test.step(a) 134 | 135 | game_r += r 136 | 137 | games_r.append(game_r) 138 | 139 | return games_r 140 | 141 | def scale_frames(frames): 142 | ''' 143 | Scale the frame with number between 0 and 1 144 | ''' 145 | return np.array(frames, dtype=np.float32) / 255.0 146 | 147 | def DQN(env_name, hidden_sizes=[32], lr=1e-2, num_epochs=2000, buffer_size=100000, discount=0.99, render_cycle=100, update_target_net=1000, 148 | batch_size=64, update_freq=4, frames_num=2, min_buffer_size=5000, test_frequency=20, start_explor=1, end_explor=0.1, explor_steps=100000): 149 | 150 | # Create the environment both for train and test 151 | env = make_env(env_name, frames_num=frames_num, skip_frames=True, noop_num=20) 152 | env_test = make_env(env_name, 
frames_num=frames_num, skip_frames=True, noop_num=20) 153 | # Add a monitor to the test env to store the videos 154 | env_test = gym.wrappers.Monitor(env_test, "VIDEOS/TEST_VIDEOS"+env_name+str(current_milli_time()),force=True, video_callable=lambda x: x%20==0) 155 | 156 | tf.reset_default_graph() 157 | 158 | obs_dim = env.observation_space.shape 159 | act_dim = env.action_space.n 160 | 161 | # Create all the placeholders 162 | obs_ph = tf.placeholder(shape=(None, obs_dim[0], obs_dim[1], obs_dim[2]), dtype=tf.float32, name='obs') 163 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act') 164 | y_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='y') 165 | 166 | # Create the target network 167 | with tf.variable_scope('target_network'): 168 | target_qv = qnet(obs_ph, hidden_sizes, act_dim) 169 | target_vars = tf.trainable_variables() 170 | 171 | # Create the online network (i.e. the behavior policy) 172 | with tf.variable_scope('online_network'): 173 | online_qv = qnet(obs_ph, hidden_sizes, act_dim) 174 | train_vars = tf.trainable_variables() 175 | 176 | # Update the target network by assigning to it the variables of the online network 177 | # Note that the target network and the online network have the same exact architecture 178 | update_target = [train_vars[i].assign(train_vars[i+len(target_vars)]) for i in range(len(train_vars) - len(target_vars))] 179 | update_target_op = tf.group(*update_target) 180 | 181 | # One hot encoding of the action 182 | act_onehot = tf.one_hot(act_ph, depth=act_dim) 183 | # We are interested only in the Q-values of those actions 184 | q_values = tf.reduce_sum(act_onehot * online_qv, axis=1) 185 | 186 | # MSE loss function 187 | v_loss = tf.reduce_mean((y_ph - q_values)**2) 188 | # Adam optimize that minimize the loss v_loss 189 | v_opt = tf.train.AdamOptimizer(lr).minimize(v_loss) 190 | 191 | def agent_op(o): 192 | ''' 193 | Forward pass to obtain the Q-values from the online network of a single observation 194 | ''' 195 | # Scale the frames 196 | o = scale_frames(o) 197 | return sess.run(online_qv, feed_dict={obs_ph:[o]}) 198 | 199 | # Time 200 | now = datetime.now() 201 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, int(now.second)) 202 | print('Time:', clock_time) 203 | 204 | mr_v = tf.Variable(0.0) 205 | ml_v = tf.Variable(0.0) 206 | 207 | 208 | # TensorBoard summaries 209 | tf.summary.scalar('v_loss', v_loss) 210 | tf.summary.scalar('Q-value', tf.reduce_mean(q_values)) 211 | tf.summary.histogram('Q-values', q_values) 212 | 213 | scalar_summary = tf.summary.merge_all() 214 | reward_summary = tf.summary.scalar('test_rew', mr_v) 215 | mean_loss_summary = tf.summary.scalar('mean_loss', ml_v) 216 | 217 | LOG_DIR = 'log_dir/'+env_name 218 | hyp_str = "-lr_{}-upTN_{}-upF_{}-frms_{}" .format(lr, update_target_net, update_freq, frames_num) 219 | 220 | # initialize the File Writer for writing TensorBoard summaries 221 | file_writer = tf.summary.FileWriter(LOG_DIR+'/DQN_'+clock_time+'_'+hyp_str, tf.get_default_graph()) 222 | 223 | # open a session 224 | sess = tf.Session() 225 | # and initialize all the variables 226 | sess.run(tf.global_variables_initializer()) 227 | 228 | render_the_game = False 229 | step_count = 0 230 | last_update_loss = [] 231 | ep_time = current_milli_time() 232 | batch_rew = [] 233 | old_step_count = 0 234 | 235 | obs = env.reset() 236 | 237 | # Initialize the experience buffer 238 | buffer = ExperienceBuffer(buffer_size) 239 | 240 | # Copy the online network in the target network 241 | 
sess.run(update_target_op) 242 | 243 | ########## EXPLORATION INITIALIZATION ###### 244 | eps = start_explor 245 | eps_decay = (start_explor - end_explor) / explor_steps 246 | 247 | for ep in range(num_epochs): 248 | g_rew = 0 249 | done = False 250 | 251 | # Until the environment does not end.. 252 | while not done: 253 | 254 | # Epsilon decay 255 | if eps > end_explor: 256 | eps -= eps_decay 257 | 258 | # Choose an eps-greedy action 259 | act = eps_greedy(np.squeeze(agent_op(obs)), eps=eps) 260 | 261 | # execute the action in the environment 262 | obs2, rew, done, _ = env.step(act) 263 | 264 | # Render the game if you want to 265 | if render_the_game: 266 | env.render() 267 | 268 | # Add the transition to the replay buffer 269 | buffer.add(obs, rew, act, obs2, done) 270 | 271 | obs = obs2 272 | g_rew += rew 273 | step_count += 1 274 | 275 | ################ TRAINING ############### 276 | # If it's time to train the network: 277 | if len(buffer) > min_buffer_size and (step_count % update_freq == 0): 278 | 279 | # sample a minibatch from the buffer 280 | mb_obs, mb_rew, mb_act, mb_obs2, mb_done = buffer.sample_minibatch(batch_size) 281 | 282 | 283 | mb_trg_qv = sess.run(target_qv, feed_dict={obs_ph:mb_obs2}) 284 | y_r = q_target_values(mb_rew, mb_done, mb_trg_qv, discount) 285 | 286 | # TRAINING STEP 287 | # optimize, compute the loss and return the TB summary 288 | train_summary, train_loss, _ = sess.run([scalar_summary, v_loss, v_opt], feed_dict={obs_ph:mb_obs, y_ph:y_r, act_ph: mb_act}) 289 | 290 | # Add the train summary to the file_writer 291 | file_writer.add_summary(train_summary, step_count) 292 | last_update_loss.append(train_loss) 293 | 294 | # Every update_target_net steps, update the target network 295 | if (len(buffer) > min_buffer_size) and (step_count % update_target_net == 0): 296 | 297 | # run the session to update the target network and get the mean loss sumamry 298 | _, train_summary = sess.run([update_target_op, mean_loss_summary], feed_dict={ml_v:np.mean(last_update_loss)}) 299 | file_writer.add_summary(train_summary, step_count) 300 | last_update_loss = [] 301 | 302 | 303 | # If the environment is ended, reset it and initialize the variables 304 | if done: 305 | obs = env.reset() 306 | batch_rew.append(g_rew) 307 | g_rew, render_the_game = 0, False 308 | 309 | # every test_frequency episodes, test the agent and write some stats in TensorBoard 310 | if ep % test_frequency == 0: 311 | # Test the agent to 10 games 312 | test_rw = test_agent(env_test, agent_op, num_games=10) 313 | 314 | # Run the test stats and add them to the file_writer 315 | test_summary = sess.run(reward_summary, feed_dict={mr_v: np.mean(test_rw)}) 316 | file_writer.add_summary(test_summary, step_count) 317 | 318 | # Print some useful stats 319 | ep_sec_time = int((current_milli_time()-ep_time) / 1000) 320 | print('Ep:%4d Rew:%4.2f, Eps:%2.2f -- Step:%5d -- Test:%4.2f %4.2f -- Time:%d -- Ep_Steps:%d' % 321 | (ep,np.mean(batch_rew), eps, step_count, np.mean(test_rw), np.std(test_rw), ep_sec_time, (step_count-old_step_count)/test_frequency)) 322 | 323 | ep_time = current_milli_time() 324 | batch_rew = [] 325 | old_step_count = step_count 326 | 327 | if ep % render_cycle == 0: 328 | render_the_game = True 329 | 330 | file_writer.close() 331 | env.close() 332 | 333 | 334 | if __name__ == '__main__': 335 | 336 | DQN('PongNoFrameskip-v4', hidden_sizes=[128], lr=2e-4, buffer_size=100000, update_target_net=1000, batch_size=32, 337 | update_freq=2, frames_num=2, min_buffer_size=10000, render_cycle=10000) 
-------------------------------------------------------------------------------- /Chapter05/DQN_variations_Atari.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from datetime import datetime 5 | from collections import deque 6 | import time 7 | import sys 8 | 9 | from atari_wrappers import make_env 10 | 11 | 12 | gym.logger.set_level(40) 13 | 14 | current_milli_time = lambda: int(round(time.time() * 1000)) 15 | 16 | 17 | def cnn(x): 18 | ''' 19 | Convolutional neural network 20 | ''' 21 | x = tf.layers.conv2d(x, filters=16, kernel_size=8, strides=4, padding='valid', activation='relu') 22 | x = tf.layers.conv2d(x, filters=32, kernel_size=4, strides=2, padding='valid', activation='relu') 23 | return tf.layers.conv2d(x, filters=32, kernel_size=3, strides=1, padding='valid', activation='relu') 24 | 25 | 26 | def fnn(x, hidden_layers, output_layer, activation=tf.nn.relu, last_activation=None): 27 | ''' 28 | Feed-forward neural network 29 | ''' 30 | for l in hidden_layers: 31 | x = tf.layers.dense(x, units=l, activation=activation) 32 | return tf.layers.dense(x, units=output_layer, activation=last_activation) 33 | 34 | def qnet(x, hidden_layers, output_size, fnn_activation=tf.nn.relu, last_activation=None): 35 | ''' 36 | Deep Q network: CNN followed by FNN 37 | ''' 38 | x = cnn(x) 39 | x = tf.layers.flatten(x) 40 | 41 | return fnn(x, hidden_layers, output_size, fnn_activation, last_activation) 42 | 43 | def greedy(action_values): 44 | ''' 45 | Greedy policy 46 | ''' 47 | return np.argmax(action_values) 48 | 49 | def eps_greedy(action_values, eps=0.1): 50 | ''' 51 | Eps-greedy policy 52 | ''' 53 | if np.random.uniform(0,1) < eps: 54 | # Choose a uniform random action 55 | return np.random.randint(len(action_values)) 56 | else: 57 | # Choose the greedy action 58 | return np.argmax(action_values) 59 | 60 | def q_target_values(mini_batch_rw, mini_batch_done, av, discounted_value): 61 | ''' 62 | Calculate the target value y for each transition 63 | ''' 64 | max_av = np.max(av, axis=1) 65 | 66 | # if episode terminate, y take value r 67 | # otherwise, q-learning step 68 | 69 | ys = [] 70 | for r, d, av in zip(mini_batch_rw, mini_batch_done, max_av): 71 | if d: 72 | ys.append(r) 73 | else: 74 | q_step = r + discounted_value * av 75 | ys.append(q_step) 76 | 77 | assert len(ys) == len(mini_batch_rw) 78 | return ys 79 | 80 | def test_agent(env_test, agent_op, num_games=20): 81 | ''' 82 | Test an agent 83 | ''' 84 | games_r = [] 85 | 86 | for _ in range(num_games): 87 | d = False 88 | game_r = 0 89 | o = env_test.reset() 90 | 91 | while not d: 92 | # Use an eps-greedy policy with eps=0.05 (to add stochasticity to the policy) 93 | # Needed because Atari envs are deterministic 94 | # If you would use a greedy policy, the results will be always the same 95 | a = eps_greedy(np.squeeze(agent_op(o)), eps=0.05) 96 | o, r, d, _ = env_test.step(a) 97 | 98 | game_r += r 99 | 100 | games_r.append(game_r) 101 | 102 | return games_r 103 | 104 | def scale_frames(frames): 105 | ''' 106 | Scale the frame with number between 0 and 1 107 | ''' 108 | return np.array(frames, dtype=np.float32) / 255.0 109 | 110 | 111 | def dueling_qnet(x, hidden_layers, output_size, fnn_activation=tf.nn.relu, last_activation=None): 112 | ''' 113 | Dueling neural network 114 | ''' 115 | x = cnn(x) 116 | x = tf.layers.flatten(x) 117 | 118 | qf = fnn(x, hidden_layers, 1, fnn_activation, last_activation) 119 | aaqf = fnn(x, hidden_layers, 
output_size, fnn_activation, last_activation) 120 | 121 | return qf + aaqf - tf.reduce_mean(aaqf) 122 | 123 | def double_q_target_values(mini_batch_rw, mini_batch_done, target_qv, online_qv, discounted_value): ## IS THE NAME CORRECT??? 124 | ''' 125 | Calculate the target value y following the double Q-learning update 126 | ''' 127 | argmax_online_qv = np.argmax(online_qv, axis=1) 128 | 129 | # if episode terminate, y take value r 130 | # otherwise, q-learning step 131 | 132 | ys = [] 133 | assert len(mini_batch_rw) == len(mini_batch_done) == len(target_qv) == len(argmax_online_qv) 134 | for r, d, t_av, arg_a in zip(mini_batch_rw, mini_batch_done, target_qv, argmax_online_qv): 135 | if d: 136 | ys.append(r) 137 | else: 138 | q_value = r + discounted_value * t_av[arg_a] 139 | ys.append(q_value) 140 | 141 | assert len(ys) == len(mini_batch_rw) 142 | 143 | return ys 144 | 145 | class MultiStepExperienceBuffer(): 146 | ''' 147 | Experience Replay Buffer for multi-step learning 148 | ''' 149 | def __init__(self, buffer_size, n_step, gamma): 150 | self.obs_buf = deque(maxlen=buffer_size) 151 | self.act_buf = deque(maxlen=buffer_size) 152 | 153 | self.n_obs_buf = deque(maxlen=buffer_size) 154 | self.n_done_buf = deque(maxlen=buffer_size) 155 | self.n_rew_buf = deque(maxlen=buffer_size) 156 | 157 | self.n_step = n_step 158 | self.last_rews = deque(maxlen=self.n_step+1) 159 | self.gamma = gamma 160 | 161 | 162 | def add(self, obs, rew, act, obs2, done): 163 | self.obs_buf.append(obs) 164 | self.act_buf.append(act) 165 | # the following buffers will be updated in the next n_step steps 166 | # their values are not known, yet 167 | self.n_obs_buf.append(None) 168 | self.n_rew_buf.append(None) 169 | self.n_done_buf.append(None) 170 | 171 | self.last_rews.append(rew) 172 | 173 | ln = len(self.obs_buf) 174 | len_rews = len(self.last_rews) 175 | 176 | # Update the indices of the buffer that are n_steps old 177 | if done: 178 | # In case it's the last step, update up to the n_steps indices fo the buffer 179 | # it cannot update more than len(last_rews), otherwise will update the previous traj 180 | for i in range(len_rews): 181 | self.n_obs_buf[ln-(len_rews-i-1)-1] = obs2 182 | self.n_done_buf[ln-(len_rews-i-1)-1] = done 183 | rgt = np.sum([(self.gamma**k)*r for k,r in enumerate(np.array(self.last_rews)[i:len_rews])]) 184 | self.n_rew_buf[ln-(len_rews-i-1)-1] = rgt 185 | 186 | # reset the reward deque 187 | self.last_rews = deque(maxlen=self.n_step+1) 188 | else: 189 | # Update the elements of the buffer that has been added n_step steps ago 190 | # Add only if the multi-step values are updated 191 | if len(self.last_rews) >= (self.n_step+1): 192 | self.n_obs_buf[ln-self.n_step-1] = obs2 193 | self.n_done_buf[ln-self.n_step-1] = done 194 | rgt = np.sum([(self.gamma**k)*r for k,r in enumerate(np.array(self.last_rews)[:len_rews])]) 195 | self.n_rew_buf[ln-self.n_step-1] = rgt 196 | 197 | 198 | def sample_minibatch(self, batch_size): 199 | # Sample a minibatch of size batch_size 200 | # Note: the samples should be at least of n_step steps ago 201 | mb_indices = np.random.randint(len(self.obs_buf)-self.n_step, size=batch_size) 202 | 203 | mb_obs = scale_frames([self.obs_buf[i] for i in mb_indices]) 204 | mb_rew = [self.n_rew_buf[i] for i in mb_indices] 205 | mb_act = [self.act_buf[i] for i in mb_indices] 206 | mb_obs2 = scale_frames([self.n_obs_buf[i] for i in mb_indices]) 207 | mb_done = [self.n_done_buf[i] for i in mb_indices] 208 | 209 | return mb_obs, mb_rew, mb_act, mb_obs2, mb_done 210 | 211 | def 
__len__(self): 212 | return len(self.obs_buf) 213 | 214 | def DQN_with_variations(env_name, extensions_hyp, hidden_sizes=[32], lr=1e-2, num_epochs=2000, buffer_size=100000, discount=0.99, render_cycle=100, update_target_net=1000, 215 | batch_size=64, update_freq=4, frames_num=2, min_buffer_size=5000, test_frequency=20, start_explor=1, end_explor=0.1, explor_steps=100000): 216 | 217 | # Create the environment both for train and test 218 | env = make_env(env_name, frames_num=frames_num, skip_frames=True, noop_num=20) 219 | env_test = make_env(env_name, frames_num=frames_num, skip_frames=True, noop_num=20) 220 | # Add a monitor to the test env to store the videos 221 | env_test = gym.wrappers.Monitor(env_test, "VIDEOS/TEST_VIDEOS"+env_name+str(current_milli_time()),force=True, video_callable=lambda x: x%20==0) 222 | 223 | tf.reset_default_graph() 224 | 225 | obs_dim = env.observation_space.shape 226 | act_dim = env.action_space.n 227 | 228 | # Create all the placeholders 229 | obs_ph = tf.placeholder(shape=(None, obs_dim[0], obs_dim[1], obs_dim[2]), dtype=tf.float32, name='obs') 230 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act') 231 | y_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='y') 232 | 233 | # Create the target network 234 | with tf.variable_scope('target_network'): 235 | if extensions_hyp['dueling']: 236 | target_qv = dueling_qnet(obs_ph, hidden_sizes, act_dim) 237 | else: 238 | target_qv = qnet(obs_ph, hidden_sizes, act_dim) 239 | target_vars = tf.trainable_variables() 240 | 241 | # Create the online network (i.e. the behavior policy) 242 | with tf.variable_scope('online_network'): 243 | if extensions_hyp['dueling']: 244 | online_qv = dueling_qnet(obs_ph, hidden_sizes, act_dim) 245 | else: 246 | online_qv = qnet(obs_ph, hidden_sizes, act_dim) 247 | train_vars = tf.trainable_variables() 248 | 249 | # Update the target network by assigning to it the variables of the online network 250 | # Note that the target network and the online network have the same exact architecture 251 | update_target = [train_vars[i].assign(train_vars[i+len(target_vars)]) for i in range(len(train_vars) - len(target_vars))] 252 | update_target_op = tf.group(*update_target) 253 | 254 | # One hot encoding of the action 255 | act_onehot = tf.one_hot(act_ph, depth=act_dim) 256 | # We are interested only in the Q-values of those actions 257 | q_values = tf.reduce_sum(act_onehot * online_qv, axis=1) 258 | 259 | # MSE loss function 260 | v_loss = tf.reduce_mean((y_ph - q_values)**2) 261 | # Adam optimize that minimize the loss v_loss 262 | v_opt = tf.train.AdamOptimizer(lr).minimize(v_loss) 263 | 264 | def agent_op(o): 265 | ''' 266 | Forward pass to obtain the Q-values from the online network of a single observation 267 | ''' 268 | # Scale the frames 269 | o = scale_frames(o) 270 | return sess.run(online_qv, feed_dict={obs_ph:[o]}) 271 | 272 | # Time 273 | now = datetime.now() 274 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, int(now.second)) 275 | print('Time:', clock_time) 276 | 277 | mr_v = tf.Variable(0.0) 278 | ml_v = tf.Variable(0.0) 279 | 280 | 281 | # TensorBoard summaries 282 | tf.summary.scalar('v_loss', v_loss) 283 | tf.summary.scalar('Q-value', tf.reduce_mean(q_values)) 284 | tf.summary.histogram('Q-values', q_values) 285 | 286 | scalar_summary = tf.summary.merge_all() 287 | reward_summary = tf.summary.scalar('test_rew', mr_v) 288 | mean_loss_summary = tf.summary.scalar('mean_loss', ml_v) 289 | 290 | LOG_DIR = 'log_dir/'+env_name 291 | hyp_str = 
"-lr_{}-upTN_{}-upF_{}-frms_{}-ddqn_{}-duel_{}-nstep_{}" \ 292 | .format(lr, update_target_net, update_freq, frames_num, extensions_hyp['DDQN'], extensions_hyp['dueling'], extensions_hyp['multi_step']) 293 | 294 | # initialize the File Writer for writing TensorBoard summaries 295 | file_writer = tf.summary.FileWriter(LOG_DIR+'/DQN_'+clock_time+'_'+hyp_str, tf.get_default_graph()) 296 | 297 | # open a session 298 | sess = tf.Session() 299 | # and initialize all the variables 300 | sess.run(tf.global_variables_initializer()) 301 | 302 | render_the_game = False 303 | step_count = 0 304 | last_update_loss = [] 305 | ep_time = current_milli_time() 306 | batch_rew = [] 307 | old_step_count = 0 308 | 309 | obs = env.reset() 310 | 311 | # Initialize the experience buffer 312 | #buffer = ExperienceBuffer(buffer_size) 313 | buffer = MultiStepExperienceBuffer(buffer_size, extensions_hyp['multi_step'], discount) 314 | 315 | # Copy the online network in the target network 316 | sess.run(update_target_op) 317 | 318 | ########## EXPLORATION INITIALIZATION ###### 319 | eps = start_explor 320 | eps_decay = (start_explor - end_explor) / explor_steps 321 | 322 | for ep in range(num_epochs): 323 | g_rew = 0 324 | done = False 325 | 326 | # Until the environment does not end.. 327 | while not done: 328 | 329 | # Epsilon decay 330 | if eps > end_explor: 331 | eps -= eps_decay 332 | 333 | # Choose an eps-greedy action 334 | act = eps_greedy(np.squeeze(agent_op(obs)), eps=eps) 335 | 336 | # execute the action in the environment 337 | obs2, rew, done, _ = env.step(act) 338 | 339 | # Render the game if you want to 340 | if render_the_game: 341 | env.render() 342 | 343 | # Add the transition to the replay buffer 344 | buffer.add(obs, rew, act, obs2, done) 345 | 346 | obs = obs2 347 | g_rew += rew 348 | step_count += 1 349 | 350 | ################ TRAINING ############### 351 | # If it's time to train the network: 352 | if len(buffer) > min_buffer_size and (step_count % update_freq == 0): 353 | 354 | # sample a minibatch from the buffer 355 | mb_obs, mb_rew, mb_act, mb_obs2, mb_done = buffer.sample_minibatch(batch_size) 356 | 357 | if extensions_hyp['DDQN']: 358 | mb_onl_qv, mb_trg_qv = sess.run([online_qv,target_qv], feed_dict={obs_ph:mb_obs2}) 359 | y_r = double_q_target_values(mb_rew, mb_done, mb_trg_qv, mb_onl_qv, discount) 360 | else: 361 | mb_trg_qv = sess.run(target_qv, feed_dict={obs_ph:mb_obs2}) 362 | y_r = q_target_values(mb_rew, mb_done, mb_trg_qv, discount) 363 | 364 | # optimize, compute the loss and return the TB summary 365 | train_summary, train_loss, _ = sess.run([scalar_summary, v_loss, v_opt], feed_dict={obs_ph:mb_obs, y_ph:y_r, act_ph: mb_act}) 366 | 367 | # Add the train summary to the file_writer 368 | file_writer.add_summary(train_summary, step_count) 369 | last_update_loss.append(train_loss) 370 | 371 | # Every update_target_net steps, update the target network 372 | if (len(buffer) > min_buffer_size) and (step_count % update_target_net == 0): 373 | 374 | # run the session to update the target network and get the mean loss sumamry 375 | _, train_summary = sess.run([update_target_op, mean_loss_summary], feed_dict={ml_v:np.mean(last_update_loss)}) 376 | file_writer.add_summary(train_summary, step_count) 377 | last_update_loss = [] 378 | 379 | 380 | # If the environment is ended, reset it and initialize the variables 381 | if done: 382 | obs = env.reset() 383 | batch_rew.append(g_rew) 384 | g_rew, render_the_game = 0, False 385 | 386 | # every test_frequency episodes, test the agent and write 
some stats in TensorBoard 387 | if ep % test_frequency == 0: 388 | # Test the agent to 10 games 389 | test_rw = test_agent(env_test, agent_op, num_games=10) 390 | 391 | # Run the test stats and add them to the file_writer 392 | test_summary = sess.run(reward_summary, feed_dict={mr_v: np.mean(test_rw)}) 393 | file_writer.add_summary(test_summary, step_count) 394 | 395 | # Print some useful stats 396 | ep_sec_time = int((current_milli_time()-ep_time) / 1000) 397 | print('Ep:%4d Rew:%4.2f, Eps:%2.2f -- Step:%5d -- Test:%4.2f %4.2f -- Time:%d -- Ep_Steps:%d' % 398 | (ep,np.mean(batch_rew), eps, step_count, np.mean(test_rw), np.std(test_rw), ep_sec_time, (step_count-old_step_count)/test_frequency)) 399 | 400 | ep_time = current_milli_time() 401 | batch_rew = [] 402 | old_step_count = step_count 403 | 404 | if ep % render_cycle == 0: 405 | render_the_game = True 406 | 407 | file_writer.close() 408 | env.close() 409 | 410 | 411 | if __name__ == '__main__': 412 | 413 | extensions_hyp={ 414 | 'DDQN':False, 415 | 'dueling':False, 416 | 'multi_step':1 417 | } 418 | DQN_with_variations('PongNoFrameskip-v4', extensions_hyp, hidden_sizes=[128], lr=2e-4, buffer_size=100000, update_target_net=1000, batch_size=32, 419 | update_freq=2, frames_num=2, min_buffer_size=10000, render_cycle=10000) -------------------------------------------------------------------------------- /Chapter05/Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /Chapter05/atari_wrappers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | from collections import deque 4 | import gym 5 | from gym import spaces 6 | import cv2 7 | 8 | ''' 9 | Atari Wrapper copied from https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py 10 | ''' 11 | 12 | class NoopResetEnv(gym.Wrapper): 13 | def __init__(self, env, noop_max=30): 14 | """Sample initial states by taking random number of no-ops on reset. 15 | No-op is assumed to be action 0. 16 | """ 17 | gym.Wrapper.__init__(self, env) 18 | self.noop_max = noop_max 19 | self.override_num_noops = None 20 | self.noop_action = 0 21 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 22 | 23 | def reset(self, **kwargs): 24 | """ Do no-op action for a number of steps in [1, noop_max].""" 25 | self.env.reset(**kwargs) 26 | if self.override_num_noops is not None: 27 | noops = self.override_num_noops 28 | else: 29 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 30 | assert noops > 0 31 | obs = None 32 | for _ in range(noops): 33 | obs, _, done, _ = self.env.step(self.noop_action) 34 | if done: 35 | obs = self.env.reset(**kwargs) 36 | return obs 37 | 38 | def step(self, ac): 39 | return self.env.step(ac) 40 | 41 | class LazyFrames(object): 42 | def __init__(self, frames): 43 | """This object ensures that common frames between the observations are only stored once. 44 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 45 | buffers. 46 | This object should only be converted to numpy array before being passed to the model. 
47 | You'd not believe how complex the previous solution was.""" 48 | self._frames = frames 49 | self._out = None 50 | 51 | def _force(self): 52 | if self._out is None: 53 | self._out = np.concatenate(self._frames, axis=2) 54 | self._frames = None 55 | return self._out 56 | 57 | def __array__(self, dtype=None): 58 | out = self._force() 59 | if dtype is not None: 60 | out = out.astype(dtype) 61 | return out 62 | 63 | def __len__(self): 64 | return len(self._force()) 65 | 66 | def __getitem__(self, i): 67 | return self._force()[i] 68 | 69 | class FireResetEnv(gym.Wrapper): 70 | def __init__(self, env): 71 | """Take action on reset for environments that are fixed until firing.""" 72 | gym.Wrapper.__init__(self, env) 73 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 74 | assert len(env.unwrapped.get_action_meanings()) >= 3 75 | 76 | def reset(self, **kwargs): 77 | self.env.reset(**kwargs) 78 | obs, _, done, _ = self.env.step(1) 79 | if done: 80 | self.env.reset(**kwargs) 81 | obs, _, done, _ = self.env.step(2) 82 | if done: 83 | self.env.reset(**kwargs) 84 | return obs 85 | 86 | def step(self, ac): 87 | return self.env.step(ac) 88 | 89 | 90 | class MaxAndSkipEnv(gym.Wrapper): 91 | def __init__(self, env, skip=4): 92 | """Return only every `skip`-th frame""" 93 | gym.Wrapper.__init__(self, env) 94 | # most recent raw observations (for max pooling across time steps) 95 | self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) 96 | self._skip = skip 97 | 98 | def step(self, action): 99 | """Repeat action, sum reward, and max over last observations.""" 100 | total_reward = 0.0 101 | done = None 102 | for i in range(self._skip): 103 | obs, reward, done, info = self.env.step(action) 104 | if i == self._skip - 2: self._obs_buffer[0] = obs 105 | if i == self._skip - 1: self._obs_buffer[1] = obs 106 | total_reward += reward 107 | if done: 108 | break 109 | # Note that the observation on the done=True frame 110 | # doesn't matter 111 | max_frame = self._obs_buffer.max(axis=0) 112 | 113 | return max_frame, total_reward, done, info 114 | 115 | def reset(self, **kwargs): 116 | return self.env.reset(**kwargs) 117 | 118 | 119 | 120 | class WarpFrame(gym.ObservationWrapper): 121 | def __init__(self, env): 122 | """Warp frames to 84x84 as done in the Nature paper and later work.""" 123 | gym.ObservationWrapper.__init__(self, env) 124 | self.width = 84 125 | self.height = 84 126 | self.observation_space = spaces.Box(low=0, high=255, 127 | shape=(self.height, self.width, 1), dtype=np.uint8) 128 | 129 | def observation(self, frame): 130 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 131 | frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA) 132 | return frame[:, :, None] 133 | 134 | 135 | 136 | class FrameStack(gym.Wrapper): 137 | def __init__(self, env, k): 138 | """Stack k last frames. 139 | Returns lazy array, which is much more memory efficient. 
140 | See Also 141 | baselines.common.atari_wrappers.LazyFrames 142 | """ 143 | gym.Wrapper.__init__(self, env) 144 | self.k = k 145 | self.frames = deque([], maxlen=k) 146 | shp = env.observation_space.shape 147 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=env.observation_space.dtype) 148 | 149 | def reset(self): 150 | ob = self.env.reset() 151 | for _ in range(self.k): 152 | self.frames.append(ob) 153 | return self._get_ob() 154 | 155 | def step(self, action): 156 | ob, reward, done, info = self.env.step(action) 157 | self.frames.append(ob) 158 | return self._get_ob(), reward, done, info 159 | 160 | def _get_ob(self): 161 | assert len(self.frames) == self.k 162 | return LazyFrames(list(self.frames)) 163 | 164 | 165 | class ScaledFloatFrame(gym.ObservationWrapper): 166 | def __init__(self, env): 167 | gym.ObservationWrapper.__init__(self, env) 168 | self.observation_space = gym.spaces.Box(low=0, high=1, shape=env.observation_space.shape, dtype=np.float32) 169 | 170 | def observation(self, observation): 171 | # careful! This undoes the memory optimization, use 172 | # with smaller replay buffers only. 173 | return np.array(observation).astype(np.float32) / 255.0 174 | 175 | 176 | def make_env(env_name, fire=True, frames_num=2, noop_num=30, skip_frames=True): 177 | env = gym.make(env_name) 178 | 179 | if skip_frames: 180 | env = MaxAndSkipEnv(env) ## Return only every `skip`-th frame 181 | if fire: 182 | env = FireResetEnv(env) ## Fire at the beginning 183 | env = NoopResetEnv(env, noop_max=noop_num) 184 | env = WarpFrame(env) ## Reshape image 185 | env = FrameStack(env, frames_num) ## Stack last 4 frames 186 | #env = ScaledFloatFrame(env) ## Scale frames 187 | return env -------------------------------------------------------------------------------- /Chapter05/untitled: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Reinforcement-Learning-Algorithms-with-Python/d144d314b3b5b91e9e8c37d4e0970af5d8379d1b/Chapter05/untitled -------------------------------------------------------------------------------- /Chapter06/AC.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from datetime import datetime 5 | import time 6 | 7 | 8 | def mlp(x, hidden_layers, output_size, activation=tf.nn.relu, last_activation=None): 9 | ''' 10 | Multi-layer perceptron 11 | ''' 12 | for l in hidden_layers: 13 | x = tf.layers.dense(x, units=l, activation=activation) 14 | return tf.layers.dense(x, units=output_size, activation=last_activation) 15 | 16 | def softmax_entropy(logits): 17 | ''' 18 | Softmax Entropy 19 | ''' 20 | return tf.reduce_sum(tf.nn.softmax(logits, axis=-1) * tf.nn.log_softmax(logits, axis=-1), axis=-1) 21 | 22 | def discounted_rewards(rews, last_sv, gamma): 23 | ''' 24 | Discounted reward to go 25 | 26 | Parameters: 27 | ---------- 28 | rews: list of rewards 29 | last_sv: value of the last state 30 | gamma: discount value 31 | ''' 32 | rtg = np.zeros_like(rews, dtype=np.float32) 33 | rtg[-1] = rews[-1] + gamma*last_sv 34 | for i in reversed(range(len(rews)-1)): 35 | rtg[i] = rews[i] + gamma*rtg[i+1] 36 | return rtg 37 | 38 | class Buffer(): 39 | ''' 40 | Buffer class to store the experience from a unique policy 41 | ''' 42 | def __init__(self, gamma=0.99): 43 | self.gamma = gamma 44 | self.obs = [] 45 | self.act = [] 46 | self.ret = [] 47 | self.rtg = [] 48 | 49 | 
def store(self, temp_traj, last_sv): 50 | ''' 51 | Add temp_traj values to the buffers and compute the advantage and reward to go 52 | 53 | Parameters: 54 | ----------- 55 | temp_traj: list where each element is a list that contains: observation, reward, action, state-value 56 | last_sv: value of the last state (Used to Bootstrap) 57 | ''' 58 | # store only if the temp_traj list is not empty 59 | if len(temp_traj) > 0: 60 | self.obs.extend(temp_traj[:,0]) 61 | rtg = discounted_rewards(temp_traj[:,1], last_sv, self.gamma) 62 | self.ret.extend(rtg - temp_traj[:,3]) 63 | self.rtg.extend(rtg) 64 | self.act.extend(temp_traj[:,2]) 65 | 66 | def get_batch(self): 67 | return self.obs, self.act, self.ret, self.rtg 68 | 69 | def __len__(self): 70 | assert(len(self.obs) == len(self.act) == len(self.ret) == len(self.rtg)) 71 | return len(self.obs) 72 | 73 | def AC(env_name, hidden_sizes=[32], ac_lr=5e-3, cr_lr=8e-3, num_epochs=50, gamma=0.99, steps_per_epoch=100, steps_to_print=100): 74 | ''' 75 | Actor-Critic Algorithm 76 | s 77 | Parameters: 78 | ----------- 79 | env_name: Name of the environment 80 | hidden_size: list of the number of hidden units for each layer 81 | ac_lr: actor learning rate 82 | cr_lr: critic learning rate 83 | num_epochs: number of training epochs 84 | gamma: discount factor 85 | steps_per_epoch: number of steps per epoch 86 | ''' 87 | tf.reset_default_graph() 88 | 89 | env = gym.make(env_name) 90 | 91 | 92 | obs_dim = env.observation_space.shape 93 | act_dim = env.action_space.n 94 | 95 | # Placeholders 96 | obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs') 97 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act') 98 | ret_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='ret') 99 | rtg_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='rtg') 100 | 101 | ##################################################### 102 | ########### COMPUTE THE PG LOSS FUNCTIONS ########### 103 | ##################################################### 104 | 105 | # policy 106 | p_logits = mlp(obs_ph, hidden_sizes, act_dim, activation=tf.tanh) 107 | 108 | act_multn = tf.squeeze(tf.random.multinomial(p_logits, 1)) 109 | actions_mask = tf.one_hot(act_ph, depth=act_dim) 110 | p_log = tf.reduce_sum(actions_mask * tf.nn.log_softmax(p_logits), axis=1) 111 | # entropy useful to study the algorithms 112 | entropy = -tf.reduce_mean(softmax_entropy(p_logits)) 113 | p_loss = -tf.reduce_mean(p_log*ret_ph) 114 | 115 | # policy optimization 116 | p_opt = tf.train.AdamOptimizer(ac_lr).minimize(p_loss) 117 | 118 | ####################################### 119 | ########### VALUE FUNCTION ########### 120 | ####################################### 121 | 122 | # value function 123 | s_values = tf.squeeze(mlp(obs_ph, hidden_sizes, 1, activation=tf.tanh)) 124 | # MSE loss function 125 | v_loss = tf.reduce_mean((rtg_ph - s_values)**2) 126 | # value function optimization 127 | v_opt = tf.train.AdamOptimizer(cr_lr).minimize(v_loss) 128 | 129 | # Time 130 | now = datetime.now() 131 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second) 132 | print('Time:', clock_time) 133 | 134 | 135 | # Set scalars and hisograms for TensorBoard 136 | tf.summary.scalar('p_loss', p_loss, collections=['train']) 137 | tf.summary.scalar('v_loss', v_loss, collections=['train']) 138 | tf.summary.scalar('entropy', entropy, collections=['train']) 139 | tf.summary.scalar('s_values', tf.reduce_mean(s_values), collections=['train']) 140 | tf.summary.histogram('p_soft', 
tf.nn.softmax(p_logits), collections=['train']) 141 | tf.summary.histogram('p_log', p_log, collections=['train']) 142 | tf.summary.histogram('act_multn', act_multn, collections=['train']) 143 | tf.summary.histogram('p_logits', p_logits, collections=['train']) 144 | tf.summary.histogram('ret_ph', ret_ph, collections=['train']) 145 | tf.summary.histogram('rtg_ph', rtg_ph, collections=['train']) 146 | tf.summary.histogram('s_values', s_values, collections=['train']) 147 | train_summary = tf.summary.merge_all('train') 148 | 149 | tf.summary.scalar('old_v_loss', v_loss, collections=['pre_train']) 150 | tf.summary.scalar('old_p_loss', p_loss, collections=['pre_train']) 151 | pre_scalar_summary = tf.summary.merge_all('pre_train') 152 | 153 | hyp_str = '-steps_{}-aclr_{}-crlr_{}'.format(steps_per_epoch, ac_lr, cr_lr) 154 | file_writer = tf.summary.FileWriter('log_dir/{}/AC_{}_{}'.format(env_name, clock_time, hyp_str), tf.get_default_graph()) 155 | 156 | # create a session 157 | sess = tf.Session() 158 | # initialize the variables 159 | sess.run(tf.global_variables_initializer()) 160 | 161 | # few variables 162 | step_count = 0 163 | train_rewards = [] 164 | train_ep_len = [] 165 | timer = time.time() 166 | last_print_step = 0 167 | 168 | # Reset the environment at the beginning of the cycle 169 | obs = env.reset() 170 | ep_rews = [] 171 | 172 | # main cycle 173 | for ep in range(num_epochs): 174 | 175 | # initialize the buffer and other variables for the new epoch 176 | buffer = Buffer(gamma) 177 | env_buf = [] 178 | 179 | # always iterate over a fixed number of steps 180 | for _ in range(steps_per_epoch): 181 | 182 | # run the policy 183 | act, val = sess.run([act_multn, s_values], feed_dict={obs_ph:[obs]}) 184 | # take a step in the environment 185 | obs2, rew, done, _ = env.step(np.squeeze(act)) 186 | 187 | # add the new transition 188 | env_buf.append([obs.copy(), rew, act, np.squeeze(val)]) 189 | 190 | obs = obs2.copy() 191 | 192 | step_count += 1 193 | last_print_step += 1 194 | ep_rews.append(rew) 195 | 196 | if done: 197 | # store the trajectory just completed 198 | # Changed from REINFORCE! The second parameter is the estimated value of the next state. Because the episode is done, 199 | # we pass a value of 0 200 | buffer.store(np.array(env_buf), 0) 201 | env_buf = [] 202 | # store additional information about the episode 203 | train_rewards.append(np.sum(ep_rews)) 204 | train_ep_len.append(len(ep_rews)) 205 | # reset the environment 206 | obs = env.reset() 207 | ep_rews = [] 208 | 209 | # Bootstrap with the estimated state value of the next state!
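As a quick illustration of what this bootstrap does (a minimal sketch with made-up numbers, restating the discounted_rewards helper defined at the top of this file):

import numpy as np

def discounted_rewards(rews, last_sv, gamma):
    # same recursion as the helper above: reward-to-go, seeded with the
    # critic's estimate of the value of the state that follows the cut
    rtg = np.zeros_like(rews, dtype=np.float32)
    rtg[-1] = rews[-1] + gamma * last_sv
    for i in reversed(range(len(rews) - 1)):
        rtg[i] = rews[i] + gamma * rtg[i + 1]
    return rtg

# a partial trajectory cut at the end of an epoch: three rewards of 1,
# bootstrapped with an assumed critic estimate of 10 for the next state
print(discounted_rewards([1.0, 1.0, 1.0], last_sv=10.0, gamma=0.99))
# approximately [12.67, 11.79, 10.9]: the critic's value propagates
# backwards through the discount in place of the unobserved future rewards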
210 | if len(env_buf) > 0: 211 | last_sv = sess.run(s_values, feed_dict={obs_ph:[obs]}) 212 | buffer.store(np.array(env_buf), last_sv) 213 | 214 | # collect the episodes' information 215 | obs_batch, act_batch, ret_batch, rtg_batch = buffer.get_batch() 216 | 217 | # run pre_scalar_summary before the optimization phase 218 | old_p_loss, old_v_loss, epochs_summary = sess.run([p_loss, v_loss, pre_scalar_summary], feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch, rtg_ph:rtg_batch}) 219 | file_writer.add_summary(epochs_summary, step_count) 220 | 221 | # Optimize the actor and the critic 222 | sess.run([p_opt, v_opt], feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch, rtg_ph:rtg_batch}) 223 | 224 | # run train_summary to save the summary after the optimization 225 | new_p_loss, new_v_loss, train_summary_run = sess.run([p_loss, v_loss, train_summary], feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch, rtg_ph:rtg_batch}) 226 | file_writer.add_summary(train_summary_run, step_count) 227 | summary = tf.Summary() 228 | summary.value.add(tag='diff/p_loss', simple_value=(old_p_loss - new_p_loss)) 229 | summary.value.add(tag='diff/v_loss', simple_value=(old_v_loss - new_v_loss)) 230 | file_writer.add_summary(summary, step_count) 231 | file_writer.flush() 232 | 233 | # it's time to print some useful information 234 | if last_print_step > steps_to_print: 235 | print('Ep:%d MnRew:%.2f MxRew:%.1f EpLen:%.1f Buffer:%d -- Step:%d -- Time:%d' % (ep, np.mean(train_rewards), np.max(train_rewards), np.mean(train_ep_len), len(buffer), step_count,time.time()-timer)) 236 | 237 | summary = tf.Summary() 238 | summary.value.add(tag='supplementary/len', simple_value=np.mean(train_ep_len)) 239 | summary.value.add(tag='supplementary/train_rew', simple_value=np.mean(train_rewards)) 240 | file_writer.add_summary(summary, step_count) 241 | file_writer.flush() 242 | 243 | timer = time.time() 244 | train_rewards = [] 245 | train_ep_len = [] 246 | last_print_step = 0 247 | 248 | env.close() 249 | file_writer.close() 250 | 251 | 252 | if __name__ == '__main__': 253 | AC('LunarLander-v2', hidden_sizes=[64], ac_lr=4e-3, cr_lr=1.5e-2, gamma=0.99, steps_per_epoch=100, steps_to_print=5000, num_epochs=8000) 254 | -------------------------------------------------------------------------------- /Chapter06/REINFORCE.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from datetime import datetime 5 | import time 6 | 7 | 8 | def mlp(x, hidden_layers, output_size, activation=tf.nn.relu, last_activation=None): 9 | ''' 10 | Multi-layer perceptron 11 | ''' 12 | for l in hidden_layers: 13 | x = tf.layers.dense(x, units=l, activation=activation) 14 | return tf.layers.dense(x, units=output_size, activation=last_activation) 15 | 16 | def softmax_entropy(logits): 17 | ''' 18 | Softmax Entropy 19 | ''' 20 | return tf.reduce_sum(tf.nn.softmax(logits, axis=-1) * tf.nn.log_softmax(logits, axis=-1), axis=-1) 21 | 22 | 23 | def discounted_rewards(rews, gamma): 24 | ''' 25 | Discounted reward to go 26 | 27 | Parameters: 28 | ---------- 29 | rews: list of rewards 30 | gamma: discount value 31 | ''' 32 | rtg = np.zeros_like(rews, dtype=np.float32) 33 | rtg[-1] = rews[-1] 34 | for i in reversed(range(len(rews)-1)): 35 | rtg[i] = rews[i] + gamma*rtg[i+1] 36 | return rtg 37 | 38 | class Buffer(): 39 | ''' 40 | Buffer class to store the experience from a unique policy 41 | ''' 42 | def __init__(self, gamma=0.99): 43 | 
self.gamma = gamma 44 | self.obs = [] 45 | self.act = [] 46 | self.ret = [] 47 | 48 | def store(self, temp_traj): 49 | ''' 50 | Add temp_traj values to the buffers and compute the advantage and reward to go 51 | 52 | Parameters: 53 | ----------- 54 | temp_traj: list where each element is a list that contains: observation, reward, action, state-value 55 | ''' 56 | # store only if the temp_traj list is not empty 57 | if len(temp_traj) > 0: 58 | self.obs.extend(temp_traj[:,0]) 59 | rtg = discounted_rewards(temp_traj[:,1], self.gamma) 60 | self.ret.extend(rtg) 61 | self.act.extend(temp_traj[:,2]) 62 | 63 | def get_batch(self): 64 | b_ret = self.ret 65 | return self.obs, self.act, b_ret 66 | 67 | def __len__(self): 68 | assert(len(self.obs) == len(self.act) == len(self.ret)) 69 | return len(self.obs) 70 | 71 | 72 | def REINFORCE(env_name, hidden_sizes=[32], lr=5e-3, num_epochs=50, gamma=0.99, steps_per_epoch=100): 73 | ''' 74 | REINFORCE Algorithm 75 | 76 | Parameters: 77 | ----------- 78 | env_name: Name of the environment 79 | hidden_size: list of the number of hidden units for each layer 80 | lr: policy learning rate 81 | gamma: discount factor 82 | steps_per_epoch: number of steps per epoch 83 | num_epochs: number train epochs (Note: they aren't properly epochs) 84 | ''' 85 | tf.reset_default_graph() 86 | 87 | env = gym.make(env_name) 88 | 89 | 90 | obs_dim = env.observation_space.shape 91 | act_dim = env.action_space.n 92 | 93 | # Placeholders 94 | obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs') 95 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act') 96 | ret_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='ret') 97 | 98 | ################################################## 99 | ########### COMPUTE THE LOSS FUNCTIONS ########### 100 | ################################################## 101 | 102 | 103 | # policy 104 | p_logits = mlp(obs_ph, hidden_sizes, act_dim, activation=tf.tanh) 105 | 106 | 107 | act_multn = tf.squeeze(tf.random.multinomial(p_logits, 1)) 108 | actions_mask = tf.one_hot(act_ph, depth=act_dim) 109 | 110 | p_log = tf.reduce_sum(actions_mask * tf.nn.log_softmax(p_logits), axis=1) 111 | 112 | # entropy useful to study the algorithms 113 | entropy = -tf.reduce_mean(softmax_entropy(p_logits)) 114 | p_loss = -tf.reduce_mean(p_log*ret_ph) 115 | 116 | # policy optimization 117 | p_opt = tf.train.AdamOptimizer(lr).minimize(p_loss) 118 | 119 | # Time 120 | now = datetime.now() 121 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second) 122 | print('Time:', clock_time) 123 | 124 | 125 | # Set scalars and hisograms for TensorBoard 126 | tf.summary.scalar('p_loss', p_loss, collections=['train']) 127 | tf.summary.scalar('entropy', entropy, collections=['train']) 128 | tf.summary.histogram('p_soft', tf.nn.softmax(p_logits), collections=['train']) 129 | tf.summary.histogram('p_log', p_log, collections=['train']) 130 | tf.summary.histogram('act_multn', act_multn, collections=['train']) 131 | tf.summary.histogram('p_logits', p_logits, collections=['train']) 132 | tf.summary.histogram('ret_ph', ret_ph, collections=['train']) 133 | train_summary = tf.summary.merge_all('train') 134 | 135 | tf.summary.scalar('old_p_loss', p_loss, collections=['pre_train']) 136 | pre_scalar_summary = tf.summary.merge_all('pre_train') 137 | 138 | hyp_str = '-steps_{}-aclr_{}'.format(steps_per_epoch, lr) 139 | file_writer = tf.summary.FileWriter('log_dir/{}/REINFORCE_{}_{}'.format(env_name, clock_time, hyp_str), 
tf.get_default_graph()) 140 | 141 | # create a session 142 | sess = tf.Session() 143 | # initialize the variables 144 | sess.run(tf.global_variables_initializer()) 145 | 146 | # few variables 147 | step_count = 0 148 | train_rewards = [] 149 | train_ep_len = [] 150 | timer = time.time() 151 | 152 | # main cycle 153 | for ep in range(num_epochs): 154 | 155 | # initialize environment for the new epochs 156 | obs = env.reset() 157 | 158 | # intiaizlie buffer and other variables for the new epochs 159 | buffer = Buffer(gamma) 160 | env_buf = [] 161 | ep_rews = [] 162 | 163 | while len(buffer) < steps_per_epoch: 164 | 165 | # run the policy 166 | act = sess.run(act_multn, feed_dict={obs_ph:[obs]}) 167 | # take a step in the environment 168 | obs2, rew, done, _ = env.step(np.squeeze(act)) 169 | 170 | # add the new transition 171 | env_buf.append([obs.copy(), rew, act]) 172 | 173 | obs = obs2.copy() 174 | 175 | step_count += 1 176 | ep_rews.append(rew) 177 | 178 | if done: 179 | # store the trajectory just completed 180 | buffer.store(np.array(env_buf)) 181 | env_buf = [] 182 | # store additionl information about the episode 183 | train_rewards.append(np.sum(ep_rews)) 184 | train_ep_len.append(len(ep_rews)) 185 | # reset the environment 186 | obs = env.reset() 187 | ep_rews = [] 188 | 189 | # collect the episodes' information 190 | obs_batch, act_batch, ret_batch = buffer.get_batch() 191 | 192 | # run pre_scalar_summary before the optimization phase 193 | epochs_summary = sess.run(pre_scalar_summary, feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch}) 194 | file_writer.add_summary(epochs_summary, step_count) 195 | 196 | # Optimize the policy 197 | sess.run(p_opt, feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch}) 198 | 199 | # run train_summary to save the summary after the optimization 200 | train_summary_run = sess.run(train_summary, feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch}) 201 | file_writer.add_summary(train_summary_run, step_count) 202 | 203 | # it's time to print some useful information 204 | if ep % 10 == 0: 205 | print('Ep:%d MnRew:%.2f MxRew:%.1f EpLen:%.1f Buffer:%d -- Step:%d -- Time:%d' % (ep, np.mean(train_rewards), np.max(train_rewards), np.mean(train_ep_len), len(buffer), step_count,time.time()-timer)) 206 | 207 | summary = tf.Summary() 208 | summary.value.add(tag='supplementary/len', simple_value=np.mean(train_ep_len)) 209 | summary.value.add(tag='supplementary/train_rew', simple_value=np.mean(train_rewards)) 210 | file_writer.add_summary(summary, step_count) 211 | file_writer.flush() 212 | 213 | timer = time.time() 214 | train_rewards = [] 215 | train_ep_len = [] 216 | 217 | 218 | env.close() 219 | file_writer.close() 220 | 221 | 222 | if __name__ == '__main__': 223 | REINFORCE('LunarLander-v2', hidden_sizes=[64], lr=8e-3, gamma=0.99, num_epochs=1000, steps_per_epoch=1000) -------------------------------------------------------------------------------- /Chapter06/REINFORCE_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from datetime import datetime 5 | import time 6 | 7 | 8 | def mlp(x, hidden_layers, output_size, activation=tf.nn.relu, last_activation=None): 9 | ''' 10 | Multi-layer perceptron 11 | ''' 12 | for l in hidden_layers: 13 | x = tf.layers.dense(x, units=l, activation=activation) 14 | return tf.layers.dense(x, units=output_size, activation=last_activation) 15 | 16 | def softmax_entropy(logits): 17 | 
''' 18 | Softmax Entropy 19 | ''' 20 | return tf.reduce_sum(tf.nn.softmax(logits, axis=-1) * tf.nn.log_softmax(logits, axis=-1), axis=-1) 21 | 22 | 23 | def discounted_rewards(rews, gamma): 24 | ''' 25 | Discounted reward to go 26 | 27 | Parameters: 28 | ---------- 29 | rews: list of rewards 30 | gamma: discount value 31 | ''' 32 | rtg = np.zeros_like(rews, dtype=np.float32) 33 | rtg[-1] = rews[-1] 34 | for i in reversed(range(len(rews)-1)): 35 | rtg[i] = rews[i] + gamma*rtg[i+1] 36 | return rtg 37 | 38 | class Buffer(): 39 | ''' 40 | Buffer class to store the experience from a unique policy 41 | ''' 42 | def __init__(self, gamma=0.99): 43 | self.gamma = gamma 44 | self.obs = [] 45 | self.act = [] 46 | self.ret = [] 47 | self.rtg = [] 48 | 49 | def store(self, temp_traj): 50 | ''' 51 | Add temp_traj values to the buffers and compute the advantage and reward to go 52 | 53 | Parameters: 54 | ----------- 55 | temp_traj: list where each element is a list that contains: observation, reward, action, state-value 56 | ''' 57 | # store only if the temp_traj list is not empty 58 | if len(temp_traj) > 0: 59 | self.obs.extend(temp_traj[:,0]) 60 | rtg = discounted_rewards(temp_traj[:,1], self.gamma) 61 | # NEW 62 | self.ret.extend(rtg - temp_traj[:,3]) 63 | self.rtg.extend(rtg) 64 | self.act.extend(temp_traj[:,2]) 65 | 66 | def get_batch(self): 67 | # MODIFIED 68 | return self.obs, self.act, self.ret, self.rtg 69 | 70 | def __len__(self): 71 | assert(len(self.obs) == len(self.act) == len(self.ret) == len(self.rtg)) 72 | return len(self.obs) 73 | 74 | 75 | def REINFORCE_baseline(env_name, hidden_sizes=[32], p_lr=5e-3, vf_lr=8e-3, gamma=0.99, steps_per_epoch=100, num_epochs=1000): 76 | ''' 77 | REINFORCE with baseline Algorithm 78 | 79 | Parameters: 80 | ----------- 81 | env_name: Name of the environment 82 | hidden_size: list of the number of hidden units for each layer 83 | p_lr: policy learning rate 84 | vf_lr: value function learning rate 85 | gamma: discount factor 86 | steps_per_epoch: number of steps per epoch 87 | num_epochs: number train epochs (Note: they aren't properly epochs) 88 | ''' 89 | tf.reset_default_graph() 90 | 91 | env = gym.make(env_name) 92 | 93 | obs_dim = env.observation_space.shape 94 | act_dim = env.action_space.n 95 | 96 | # Placeholders 97 | obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs') 98 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act') 99 | ret_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='ret') 100 | rtg_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='rtg') 101 | 102 | ##################################################### 103 | ########### COMPUTE THE PG LOSS FUNCTIONS ########### 104 | ##################################################### 105 | 106 | # policy 107 | p_logits = mlp(obs_ph, hidden_sizes, act_dim, activation=tf.tanh) 108 | 109 | act_multn = tf.squeeze(tf.random.multinomial(p_logits, 1)) 110 | actions_mask = tf.one_hot(act_ph, depth=act_dim) 111 | p_log = tf.reduce_sum(actions_mask * tf.nn.log_softmax(p_logits), axis=1) 112 | # entropy useful to study the algorithms 113 | entropy = -tf.reduce_mean(softmax_entropy(p_logits)) 114 | p_loss = -tf.reduce_mean(p_log*ret_ph) 115 | 116 | # policy optimization 117 | p_opt = tf.train.AdamOptimizer(p_lr).minimize(p_loss) 118 | 119 | ####################################### 120 | ########### VALUE FUNCTION ########### 121 | ####################################### 122 | 123 | ########### NEW ########### 124 | # value function 125 | s_values = 
tf.squeeze(mlp(obs_ph, hidden_sizes, 1, activation=tf.tanh)) 126 | 127 | # MSE loss function 128 | v_loss = tf.reduce_mean((rtg_ph - s_values)**2) 129 | 130 | # value function optimization 131 | v_opt = tf.train.AdamOptimizer(vf_lr).minimize(v_loss) 132 | 133 | # Time 134 | now = datetime.now() 135 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second) 136 | print('Time:', clock_time) 137 | 138 | 139 | # Set scalars and hisograms for TensorBoard 140 | tf.summary.scalar('p_loss', p_loss, collections=['train']) 141 | tf.summary.scalar('v_loss', v_loss, collections=['train']) 142 | tf.summary.scalar('entropy', entropy, collections=['train']) 143 | tf.summary.scalar('s_values', tf.reduce_mean(s_values), collections=['train']) 144 | tf.summary.histogram('p_soft', tf.nn.softmax(p_logits), collections=['train']) 145 | tf.summary.histogram('p_log', p_log, collections=['train']) 146 | tf.summary.histogram('act_multn', act_multn, collections=['train']) 147 | tf.summary.histogram('p_logits', p_logits, collections=['train']) 148 | tf.summary.histogram('ret_ph', ret_ph, collections=['train']) 149 | tf.summary.histogram('rtg_ph', rtg_ph, collections=['train']) 150 | tf.summary.histogram('s_values', s_values, collections=['train']) 151 | train_summary = tf.summary.merge_all('train') 152 | 153 | tf.summary.scalar('old_v_loss', v_loss, collections=['pre_train']) 154 | tf.summary.scalar('old_p_loss', p_loss, collections=['pre_train']) 155 | pre_scalar_summary = tf.summary.merge_all('pre_train') 156 | 157 | hyp_str = '-steps_{}-plr_{}-vflr_{}'.format(steps_per_epoch, p_lr, vf_lr) 158 | file_writer = tf.summary.FileWriter('log_dir/{}/REINFORCE_basel_{}_{}'.format(env_name, clock_time, hyp_str), tf.get_default_graph()) 159 | 160 | # create a session 161 | sess = tf.Session() 162 | # initialize the variables 163 | sess.run(tf.global_variables_initializer()) 164 | 165 | # few variables 166 | step_count = 0 167 | train_rewards = [] 168 | train_ep_len = [] 169 | timer = time.time() 170 | 171 | # main cycle 172 | for ep in range(num_epochs): 173 | 174 | # initialize environment for the new epochs 175 | obs = env.reset() 176 | 177 | # intiaizlie buffer and other variables for the new epochs 178 | buffer = Buffer(gamma) 179 | env_buf = [] 180 | ep_rews = [] 181 | 182 | while len(buffer) < steps_per_epoch: 183 | 184 | # run the policy 185 | act, val = sess.run([act_multn, s_values], feed_dict={obs_ph:[obs]}) 186 | # take a step in the environment 187 | obs2, rew, done, _ = env.step(np.squeeze(act)) 188 | 189 | # add the new transition 190 | env_buf.append([obs.copy(), rew, act, np.squeeze(val)]) 191 | 192 | obs = obs2.copy() 193 | 194 | step_count += 1 195 | ep_rews.append(rew) 196 | 197 | if done: 198 | # store the trajectory just completed 199 | buffer.store(np.array(env_buf)) 200 | env_buf = [] 201 | # store additionl information about the episode 202 | train_rewards.append(np.sum(ep_rews)) 203 | train_ep_len.append(len(ep_rews)) 204 | # reset the environment 205 | obs = env.reset() 206 | ep_rews = [] 207 | 208 | # collect the episodes' information 209 | obs_batch, act_batch, ret_batch, rtg_batch = buffer.get_batch() 210 | 211 | # run pre_scalar_summary before the optimization phase 212 | epochs_summary = sess.run(pre_scalar_summary, feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch, rtg_ph:rtg_batch}) 213 | file_writer.add_summary(epochs_summary, step_count) 214 | 215 | # Optimize the NN policy and the NN value function 216 | sess.run([p_opt, v_opt], 
feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch, rtg_ph:rtg_batch}) 217 | 218 | # run train_summary to save the summary after the optimization 219 | train_summary_run = sess.run(train_summary, feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch, rtg_ph:rtg_batch}) 220 | file_writer.add_summary(train_summary_run, step_count) 221 | 222 | # it's time to print some useful information 223 | if ep % 10 == 0: 224 | print('Ep:%d MnRew:%.2f MxRew:%.1f EpLen:%.1f Buffer:%d -- Step:%d -- Time:%d' % (ep, np.mean(train_rewards), np.max(train_rewards), np.mean(train_ep_len), len(buffer), step_count,time.time()-timer)) 225 | 226 | summary = tf.Summary() 227 | summary.value.add(tag='supplementary/len', simple_value=np.mean(train_ep_len)) 228 | summary.value.add(tag='supplementary/train_rew', simple_value=np.mean(train_rewards)) 229 | file_writer.add_summary(summary, step_count) 230 | file_writer.flush() 231 | 232 | timer = time.time() 233 | train_rewards = [] 234 | train_ep_len = [] 235 | 236 | 237 | env.close() 238 | file_writer.close() 239 | 240 | 241 | if __name__ == '__main__': 242 | REINFORCE_baseline('LunarLander-v2', hidden_sizes=[64], p_lr=8e-3, vf_lr=7e-3, gamma=0.99, steps_per_epoch=1000, num_epochs=1000) -------------------------------------------------------------------------------- /Chapter07/PPO.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from datetime import datetime 5 | import time 6 | import roboschool 7 | 8 | def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activation=None): 9 | ''' 10 | Multi-layer perceptron 11 | ''' 12 | for l in hidden_layers: 13 | x = tf.layers.dense(x, units=l, activation=activation) 14 | return tf.layers.dense(x, units=output_layer, activation=last_activation) 15 | 16 | def softmax_entropy(logits): 17 | ''' 18 | Softmax Entropy 19 | ''' 20 | return -tf.reduce_sum(tf.nn.softmax(logits, axis=-1) * tf.nn.log_softmax(logits, axis=-1), axis=-1) 21 | 22 | def clipped_surrogate_obj(new_p, old_p, adv, eps): 23 | ''' 24 | Clipped surrogate objective function 25 | ''' 26 | rt = tf.exp(new_p - old_p) # i.e. pi / old_pi 27 | return -tf.reduce_mean(tf.minimum(rt*adv, tf.clip_by_value(rt, 1-eps, 1+eps)*adv)) 28 | 29 | def GAE(rews, v, v_last, gamma=0.99, lam=0.95): 30 | ''' 31 | Generalized Advantage Estimation 32 | ''' 33 | assert len(rews) == len(v) 34 | vs = np.append(v, v_last) 35 | delta = np.array(rews) + gamma*vs[1:] - vs[:-1] 36 | gae_advantage = discounted_rewards(delta, 0, gamma*lam) 37 | return gae_advantage 38 | 39 | def discounted_rewards(rews, last_sv, gamma): 40 | ''' 41 | Discounted reward to go 42 | 43 | Parameters: 44 | ---------- 45 | rews: list of rewards 46 | last_sv: value of the last state 47 | gamma: discount value 48 | ''' 49 | rtg = np.zeros_like(rews, dtype=np.float32) 50 | rtg[-1] = rews[-1] + gamma*last_sv 51 | for i in reversed(range(len(rews)-1)): 52 | rtg[i] = rews[i] + gamma*rtg[i+1] 53 | return rtg 54 | 55 | 56 | class StructEnv(gym.Wrapper): 57 | ''' 58 | Gym Wrapper to store information like number of steps and total reward of the last espisode. 
59 | ''' 60 | def __init__(self, env): 61 | gym.Wrapper.__init__(self, env) 62 | self.n_obs = self.env.reset() 63 | self.rew_episode = 0 64 | self.len_episode = 0 65 | 66 | def reset(self, **kwargs): 67 | self.n_obs = self.env.reset(**kwargs) 68 | self.rew_episode = 0 69 | self.len_episode = 0 70 | return self.n_obs.copy() 71 | 72 | def step(self, action): 73 | ob, reward, done, info = self.env.step(action) 74 | self.rew_episode += reward 75 | self.len_episode += 1 76 | return ob, reward, done, info 77 | 78 | def get_episode_reward(self): 79 | return self.rew_episode 80 | 81 | def get_episode_length(self): 82 | return self.len_episode 83 | 84 | class Buffer(): 85 | ''' 86 | Class to store the experience from a unique policy 87 | ''' 88 | def __init__(self, gamma=0.99, lam=0.95): 89 | self.gamma = gamma 90 | self.lam = lam 91 | self.adv = [] 92 | self.ob = [] 93 | self.ac = [] 94 | self.rtg = [] 95 | 96 | def store(self, temp_traj, last_sv): 97 | ''' 98 | Add temp_traj values to the buffers and compute the advantage and reward to go 99 | 100 | Parameters: 101 | ----------- 102 | temp_traj: list where each element is a list that contains: observation, reward, action, state-value 103 | last_sv: value of the last state (Used to Bootstrap) 104 | ''' 105 | # store only if there are temporary trajectories 106 | if len(temp_traj) > 0: 107 | self.ob.extend(temp_traj[:,0]) 108 | rtg = discounted_rewards(temp_traj[:,1], last_sv, self.gamma) 109 | self.adv.extend(GAE(temp_traj[:,1], temp_traj[:,3], last_sv, self.gamma, self.lam)) 110 | self.rtg.extend(rtg) 111 | self.ac.extend(temp_traj[:,2]) 112 | 113 | def get_batch(self): 114 | # standardize the advantage values 115 | norm_adv = (self.adv - np.mean(self.adv)) / (np.std(self.adv) + 1e-10) 116 | return np.array(self.ob), np.array(self.ac), np.array(norm_adv), np.array(self.rtg) 117 | 118 | def __len__(self): 119 | assert(len(self.adv) == len(self.ob) == len(self.ac) == len(self.rtg)) 120 | return len(self.ob) 121 | 122 | def gaussian_log_likelihood(x, mean, log_std): 123 | ''' 124 | Gaussian Log Likelihood 125 | ''' 126 | log_p = -0.5 *((x-mean)**2 / (tf.exp(log_std)**2+1e-9) + 2*log_std + np.log(2*np.pi)) 127 | return tf.reduce_sum(log_p, axis=-1) 128 | 129 | def PPO(env_name, hidden_sizes=[32], cr_lr=5e-3, ac_lr=5e-3, num_epochs=50, minibatch_size=5000, gamma=0.99, lam=0.95, number_envs=1, eps=0.1, 130 | actor_iter=5, critic_iter=10, steps_per_env=100, action_type='Discrete'): 131 | ''' 132 | Proximal Policy Optimization 133 | 134 | Parameters: 135 | ----------- 136 | env_name: Name of the environment 137 | hidden_size: list of the number of hidden units for each layer 138 | ac_lr: actor learning rate 139 | cr_lr: critic learning rate 140 | num_epochs: number of training epochs 141 | minibatch_size: Batch size used to train the critic and actor 142 | gamma: discount factor 143 | lam: lambda parameter for computing the GAE 144 | number_envs: number of parallel synchronous environments 145 | # NB: it isn't distributed across multiple CPUs 146 | eps: Clip threshold. Max deviation from previous policy. 
147 | actor_iter: Number of SGD iterations on the actor per epoch 148 | critic_iter: Number of SGD iterations on the critic per epoch 149 | steps_per_env: number of steps per environment 150 | # NB: the total number of steps per epoch will be: steps_per_env*number_envs 151 | action_type: class name of the action space: Either 'Discrete' or 'Box' 152 | ''' 153 | 154 | tf.reset_default_graph() 155 | 156 | # Create some environments to collect the trajectories 157 | envs = [StructEnv(gym.make(env_name)) for _ in range(number_envs)] 158 | 159 | obs_dim = envs[0].observation_space.shape 160 | 161 | # Placeholders 162 | if action_type == 'Discrete': 163 | act_dim = envs[0].action_space.n 164 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act') 165 | 166 | elif action_type == 'Box': 167 | low_action_space = envs[0].action_space.low 168 | high_action_space = envs[0].action_space.high 169 | act_dim = envs[0].action_space.shape[0] 170 | act_ph = tf.placeholder(shape=(None,act_dim), dtype=tf.float32, name='act') 171 | 172 | obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs') 173 | ret_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='ret') 174 | adv_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='adv') 175 | old_p_log_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='old_p_log') 176 | 177 | # Computational graph for the policy in case of a discrete action space 178 | if action_type == 'Discrete': 179 | with tf.variable_scope('actor_nn'): 180 | p_logits = mlp(obs_ph, hidden_sizes, act_dim, tf.nn.relu, last_activation=tf.tanh) 181 | 182 | act_smp = tf.squeeze(tf.random.multinomial(p_logits, 1)) 183 | act_onehot = tf.one_hot(act_ph, depth=act_dim) 184 | p_log = tf.reduce_sum(act_onehot * tf.nn.log_softmax(p_logits), axis=-1) 185 | 186 | # Computational graph for the policy in case of a continuous action space 187 | else: 188 | with tf.variable_scope('actor_nn'): 189 | p_logits = mlp(obs_ph, hidden_sizes, act_dim, tf.tanh, last_activation=tf.tanh) 190 | log_std = tf.get_variable(name='log_std', initializer=np.zeros(act_dim, dtype=np.float32)-0.5) 191 | 192 | # Add noise to the mean values predicted 193 | # The noise is proportional to the standard deviation 194 | p_noisy = p_logits + tf.random_normal(tf.shape(p_logits), 0, 1) * tf.exp(log_std) 195 | # Clip the noisy actions 196 | act_smp = tf.clip_by_value(p_noisy, low_action_space, high_action_space) 197 | # Compute the gaussian log likelihood 198 | p_log = gaussian_log_likelihood(act_ph, p_logits, log_std) 199 | 200 | # Neural network value function approximator 201 | with tf.variable_scope('critic_nn'): 202 | s_values = mlp(obs_ph, hidden_sizes, 1, tf.tanh, last_activation=None) 203 | s_values = tf.squeeze(s_values) 204 | 205 | # PPO loss function 206 | p_loss = clipped_surrogate_obj(p_log, old_p_log_ph, adv_ph, eps) 207 | # MSE loss function 208 | v_loss = tf.reduce_mean((ret_ph - s_values)**2) 209 | 210 | # policy optimizer 211 | p_opt = tf.train.AdamOptimizer(ac_lr).minimize(p_loss) 212 | # value function optimizer 213 | v_opt = tf.train.AdamOptimizer(cr_lr).minimize(v_loss) 214 | 215 | # Time 216 | now = datetime.now() 217 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second) 218 | print('Time:', clock_time) 219 | 220 | # Set scalars and histograms for TensorBoard 221 | tf.summary.scalar('p_loss', p_loss, collections=['train']) 222 | tf.summary.scalar('v_loss', v_loss, collections=['train']) 223 | tf.summary.scalar('s_values_m', 
tf.reduce_mean(s_values), collections=['train']) 224 | 225 | if action_type == 'Box': 226 | tf.summary.scalar('p_std', tf.reduce_mean(tf.exp(log_std)), collections=['train']) 227 | tf.summary.histogram('log_std',log_std, collections=['train']) 228 | tf.summary.histogram('p_log', p_log, collections=['train']) 229 | tf.summary.histogram('p_logits', p_logits, collections=['train']) 230 | tf.summary.histogram('s_values', s_values, collections=['train']) 231 | tf.summary.histogram('adv_ph',adv_ph, collections=['train']) 232 | scalar_summary = tf.summary.merge_all('train') 233 | 234 | # .. summary to run before the optimization steps 235 | tf.summary.scalar('old_v_loss', v_loss, collections=['pre_train']) 236 | tf.summary.scalar('old_p_loss', p_loss, collections=['pre_train']) 237 | pre_scalar_summary = tf.summary.merge_all('pre_train') 238 | 239 | hyp_str = '-bs_'+str(minibatch_size)+'-envs_'+str(number_envs)+'-ac_lr_'+str(ac_lr)+'-cr_lr'+str(cr_lr)+'-act_it_'+str(actor_iter)+'-crit_it_'+str(critic_iter) 240 | file_writer = tf.summary.FileWriter('log_dir/'+env_name+'/PPO_'+clock_time+'_'+hyp_str, tf.get_default_graph()) 241 | 242 | # create a session 243 | sess = tf.Session() 244 | # initialize the variables 245 | sess.run(tf.global_variables_initializer()) 246 | 247 | # variable to store the total number of steps 248 | step_count = 0 249 | 250 | print('Env batch size:',steps_per_env, ' Batch size:',steps_per_env*number_envs) 251 | 252 | for ep in range(num_epochs): 253 | # Create the buffer that will contain the trajectories (full or partial) 254 | # run with the last policy 255 | buffer = Buffer(gamma, lam) 256 | # lists to store rewards and length of the trajectories completed 257 | batch_rew = [] 258 | batch_len = [] 259 | 260 | # Execute in serial the environments, storing temporarily the trajectories. 261 | for env in envs: 262 | temp_buf = [] 263 | 264 | #iterate over a fixed number of steps 265 | for _ in range(steps_per_env): 266 | 267 | # run the policy 268 | act, val = sess.run([act_smp, s_values], feed_dict={obs_ph:[env.n_obs]}) 269 | act = np.squeeze(act) 270 | 271 | # take a step in the environment 272 | obs2, rew, done, _ = env.step(act) 273 | 274 | # add the new transition to the temporary buffer 275 | temp_buf.append([env.n_obs.copy(), rew, act, np.squeeze(val)]) 276 | 277 | env.n_obs = obs2.copy() 278 | step_count += 1 279 | 280 | if done: 281 | # Store the full trajectory in the buffer 282 | # (the value of the last state is 0 as the trajectory is completed) 283 | buffer.store(np.array(temp_buf), 0) 284 | 285 | # Empty temporary buffer 286 | temp_buf = [] 287 | 288 | batch_rew.append(env.get_episode_reward()) 289 | batch_len.append(env.get_episode_length()) 290 | 291 | # reset the environment 292 | env.reset() 293 | 294 | # Bootstrap with the estimated state value of the next state! 295 | last_v = sess.run(s_values, feed_dict={obs_ph:[env.n_obs]}) 296 | buffer.store(np.array(temp_buf), np.squeeze(last_v)) 297 | 298 | 299 | # Gather the entire batch from the buffer 300 | # NB: all the batch is used and deleted after the optimization. 
That is because PPO is on-policy 301 | obs_batch, act_batch, adv_batch, rtg_batch = buffer.get_batch() 302 | 303 | old_p_log = sess.run(p_log, feed_dict={obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch}) 304 | old_p_batch = np.array(old_p_log) 305 | 306 | summary = sess.run(pre_scalar_summary, feed_dict={obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch, old_p_log_ph:old_p_batch}) 307 | file_writer.add_summary(summary, step_count) 308 | 309 | lb = len(buffer) 310 | shuffled_batch = np.arange(lb) 311 | 312 | # Policy optimization steps 313 | for _ in range(actor_iter): 314 | # shuffle the batch on every iteration 315 | np.random.shuffle(shuffled_batch) 316 | for idx in range(0,lb, minibatch_size): 317 | minib = shuffled_batch[idx:min(idx+minibatch_size,lb)] 318 | sess.run(p_opt, feed_dict={obs_ph:obs_batch[minib], act_ph:act_batch[minib], adv_ph:adv_batch[minib], old_p_log_ph:old_p_batch[minib]}) 319 | 320 | # Value function optimization steps 321 | for _ in range(critic_iter): 322 | # shuffle the batch on every iteration 323 | np.random.shuffle(shuffled_batch) 324 | for idx in range(0,lb, minibatch_size): 325 | minib = shuffled_batch[idx:min(idx+minibatch_size,lb)] 326 | sess.run(v_opt, feed_dict={obs_ph:obs_batch[minib], ret_ph:rtg_batch[minib]}) 327 | 328 | 329 | # print some statistics and run the summary for visualizing it on TB 330 | if len(batch_rew) > 0: 331 | train_summary = sess.run(scalar_summary, feed_dict={obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, 332 | old_p_log_ph:old_p_batch, ret_ph:rtg_batch}) 333 | file_writer.add_summary(train_summary, step_count) 334 | 335 | summary = tf.Summary() 336 | summary.value.add(tag='supplementary/performance', simple_value=np.mean(batch_rew)) 337 | summary.value.add(tag='supplementary/len', simple_value=np.mean(batch_len)) 338 | file_writer.add_summary(summary, step_count) 339 | file_writer.flush() 340 | 341 | print('Ep:%d Rew:%.2f -- Step:%d' % (ep, np.mean(batch_rew), step_count)) 342 | 343 | # closing environments.. 
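Before the environments are closed below, the clipping that the actor minibatch loop above relies on can be restated in plain NumPy; this is only an illustrative sketch with made-up numbers, mirroring clipped_surrogate_obj defined at the top of this file:

import numpy as np

def clipped_surrogate_np(new_logp, old_logp, adv, eps):
    # probability ratio pi_new / pi_old, computed from log-probabilities
    rt = np.exp(new_logp - old_logp)
    # the min() removes any gain once the ratio leaves [1-eps, 1+eps]
    return -np.mean(np.minimum(rt * adv, np.clip(rt, 1 - eps, 1 + eps) * adv))

old_logp = np.array([-1.0, -1.0])
adv = np.array([1.0, 1.0])            # positive advantages, purely illustrative
small_step = np.array([-0.9, -0.9])   # ratio ~ 1.105, inside the clip range
large_step = np.array([-0.3, -0.3])   # ratio ~ 2.014, outside the clip range
print(clipped_surrogate_np(small_step, old_logp, adv, eps=0.15))  # ~ -1.105
print(clipped_surrogate_np(large_step, old_logp, adv, eps=0.15))  # ~ -1.15
# the larger policy change gains nothing beyond a ratio of 1+eps, so the
# objective gives no incentive for overly large policy updates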
344 | for env in envs: 345 | env.close() 346 | 347 | # Close the writer 348 | file_writer.close() 349 | 350 | 351 | if __name__ == '__main__': 352 | PPO('RoboschoolWalker2d-v1', hidden_sizes=[64,64], cr_lr=5e-4, ac_lr=2e-4, gamma=0.99, lam=0.95, steps_per_env=5000, 353 | number_envs=1, eps=0.15, actor_iter=6, critic_iter=10, action_type='Box', num_epochs=5000, minibatch_size=256) 354 | -------------------------------------------------------------------------------- /Chapter07/TRPO.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from datetime import datetime 5 | import roboschool 6 | 7 | def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activation=None): 8 | ''' 9 | Multi-layer perceptron 10 | ''' 11 | for l in hidden_layers: 12 | x = tf.layers.dense(x, units=l, activation=activation) 13 | return tf.layers.dense(x, units=output_layer, activation=last_activation) 14 | 15 | def softmax_entropy(logits): 16 | ''' 17 | Softmax Entropy 18 | ''' 19 | return -tf.reduce_sum(tf.nn.softmax(logits, axis=-1) * tf.nn.log_softmax(logits, axis=-1), axis=-1) 20 | 21 | 22 | def gaussian_log_likelihood(ac, mean, log_std): 23 | ''' 24 | Gaussian Log Likelihood 25 | ''' 26 | log_p = ((ac-mean)**2 / (tf.exp(log_std)**2+1e-9) + 2*log_std) + np.log(2*np.pi) 27 | return -0.5 * tf.reduce_sum(log_p, axis=-1) 28 | 29 | 30 | def conjugate_gradient(A, b, x=None, iters=10): 31 | ''' 32 | Conjugate gradient method: approximate the solution of Ax=b 33 | It solves Ax=b without forming the full matrix; it only needs the matrix-vector product (the Fisher-vector product) 34 | 35 | NB: here A is not the full matrix but a function that returns the product of the averaged Fisher information matrix with an arbitrary vector 36 | Described in Appendix C.1 of the TRPO paper 37 | ''' 38 | if x is None: 39 | x = np.zeros_like(b) 40 | 41 | r = A(x) - b 42 | p = -r 43 | for _ in range(iters): 44 | a = np.dot(r, r) / (np.dot(p, A(p))+1e-8) 45 | x += a*p 46 | r_n = r + a*A(p) 47 | b = np.dot(r_n, r_n) / (np.dot(r, r)+1e-8) 48 | p = -r_n + b*p 49 | r = r_n 50 | return x 51 | 52 | def gaussian_DKL(mu_q, log_std_q, mu_p, log_std_p): 53 | ''' 54 | Gaussian KL divergence in case of a diagonal covariance matrix 55 | ''' 56 | return tf.reduce_mean(tf.reduce_sum(0.5 * (log_std_p - log_std_q + tf.exp(log_std_q - log_std_p) + (mu_q - mu_p)**2 / tf.exp(log_std_p) - 1), axis=1)) 57 | 58 | 59 | def backtracking_line_search(Dkl, delta, old_loss, p=0.8): 60 | ''' 61 | Backtracking line search. It looks for a coefficient s.t. 
the constraint on the DKL is satisfied 62 | It has both to 63 | - improve the non-linear objective 64 | - satisfy the constraint 65 | 66 | ''' 67 | ## Explained in Appendix C of the TRPO paper 68 | a = 1 69 | it = 0 70 | 71 | new_dkl, new_loss = Dkl(a) 72 | while (new_dkl > delta) or (new_loss > old_loss): 73 | a *= p 74 | it += 1 75 | new_dkl, new_loss = Dkl(a) 76 | 77 | return a 78 | 79 | 80 | 81 | def GAE(rews, v, v_last, gamma=0.99, lam=0.95): 82 | ''' 83 | Generalized Advantage Estimation 84 | ''' 85 | assert len(rews) == len(v) 86 | vs = np.append(v, v_last) 87 | d = np.array(rews) + gamma*vs[1:] - vs[:-1] 88 | gae_advantage = discounted_rewards(d, 0, gamma*lam) 89 | return gae_advantage 90 | 91 | def discounted_rewards(rews, last_sv, gamma): 92 | ''' 93 | Discounted reward to go 94 | 95 | Parameters: 96 | ---------- 97 | rews: list of rewards 98 | last_sv: value of the last state 99 | gamma: discount value 100 | ''' 101 | rtg = np.zeros_like(rews, dtype=np.float32) 102 | rtg[-1] = rews[-1] + gamma*last_sv 103 | for i in reversed(range(len(rews)-1)): 104 | rtg[i] = rews[i] + gamma*rtg[i+1] 105 | return rtg 106 | 107 | class Buffer(): 108 | ''' 109 | Class to store the experience from a unique policy 110 | ''' 111 | def __init__(self, gamma=0.99, lam=0.95): 112 | self.gamma = gamma 113 | self.lam = lam 114 | self.adv = [] 115 | self.ob = [] 116 | self.ac = [] 117 | self.rtg = [] 118 | 119 | def store(self, temp_traj, last_sv): 120 | ''' 121 | Add temp_traj values to the buffers and compute the advantage and reward to go 122 | 123 | Parameters: 124 | ----------- 125 | temp_traj: list where each element is a list that contains: observation, reward, action, state-value 126 | last_sv: value of the last state (Used to Bootstrap) 127 | ''' 128 | # store only if there are temporary trajectories 129 | if len(temp_traj) > 0: 130 | self.ob.extend(temp_traj[:,0]) 131 | rtg = discounted_rewards(temp_traj[:,1], last_sv, self.gamma) 132 | self.adv.extend(GAE(temp_traj[:,1], temp_traj[:,3], last_sv, self.gamma, self.lam)) 133 | self.rtg.extend(rtg) 134 | self.ac.extend(temp_traj[:,2]) 135 | 136 | def get_batch(self): 137 | # standardize the advantage values 138 | norm_adv = (self.adv - np.mean(self.adv)) / (np.std(self.adv) + 1e-10) 139 | return np.array(self.ob), np.array(self.ac), np.array(norm_adv), np.array(self.rtg) 140 | 141 | def __len__(self): 142 | assert(len(self.adv) == len(self.ob) == len(self.ac) == len(self.rtg)) 143 | return len(self.ob) 144 | 145 | def flatten_list(tensor_list): 146 | ''' 147 | Flatten a list of tensors 148 | ''' 149 | return tf.concat([flatten(t) for t in tensor_list], axis=0) 150 | 151 | def flatten(tensor): 152 | ''' 153 | Flatten a tensor 154 | ''' 155 | return tf.reshape(tensor, shape=(-1,)) 156 | 157 | 158 | class StructEnv(gym.Wrapper): 159 | ''' 160 | Gym Wrapper to store information like number of steps and total reward of the last espisode. 
161 | ''' 162 | def __init__(self, env): 163 | gym.Wrapper.__init__(self, env) 164 | self.n_obs = self.env.reset() 165 | self.total_rew = 0 166 | self.len_episode = 0 167 | 168 | def reset(self, **kwargs): 169 | self.n_obs = self.env.reset(**kwargs) 170 | self.total_rew = 0 171 | self.len_episode = 0 172 | return self.n_obs.copy() 173 | 174 | def step(self, action): 175 | ob, reward, done, info = self.env.step(action) 176 | self.total_rew += reward 177 | self.len_episode += 1 178 | return ob, reward, done, info 179 | 180 | def get_episode_reward(self): 181 | return self.total_rew 182 | 183 | def get_episode_length(self): 184 | return self.len_episode 185 | 186 | 187 | def TRPO(env_name, hidden_sizes=[32], cr_lr=5e-3, num_epochs=50, gamma=0.99, lam=0.95, number_envs=1, 188 | critic_iter=10, steps_per_env=100, delta=0.002, algorithm='TRPO', conj_iters=10, minibatch_size=1000): 189 | ''' 190 | Trust Region Policy Optimization 191 | 192 | Parameters: 193 | ----------- 194 | env_name: Name of the environment 195 | hidden_sizes: list of the number of hidden units for each layer 196 | cr_lr: critic learning rate 197 | num_epochs: number of training epochs 198 | gamma: discount factor 199 | lam: lambda parameter for computing the GAE 200 | number_envs: number of "parallel" synchronous environments 201 | # NB: it isn't distributed across multiple CPUs 202 | critic_iter: NUmber of SGD iterations on the critic per epoch 203 | steps_per_env: number of steps per environment 204 | # NB: the total number of steps per epoch will be: steps_per_env*number_envs 205 | delta: Maximum KL divergence between two policies. Scalar value 206 | algorithm: type of algorithm. Either 'TRPO' or 'NPO' 207 | conj_iters: number of conjugate gradient iterations 208 | minibatch_size: Batch size used to train the critic 209 | ''' 210 | 211 | tf.reset_default_graph() 212 | 213 | # Create a few environments to collect the trajectories 214 | envs = [StructEnv(gym.make(env_name)) for _ in range(number_envs)] 215 | 216 | low_action_space = envs[0].action_space.low 217 | high_action_space = envs[0].action_space.high 218 | 219 | obs_dim = envs[0].observation_space.shape 220 | act_dim = envs[0].action_space.shape[0] 221 | 222 | # Placeholders 223 | act_ph = tf.placeholder(shape=(None,act_dim), dtype=tf.float32, name='act') 224 | obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs') 225 | ret_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='ret') 226 | adv_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='adv') 227 | old_p_log_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='old_p_log') 228 | old_mu_ph = tf.placeholder(shape=(None, act_dim), dtype=tf.float32, name='old_mu') 229 | old_log_std_ph = tf.placeholder(shape=(act_dim), dtype=tf.float32, name='old_log_std') 230 | p_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='p_ph') 231 | # result of the conjugate gradient algorithm 232 | cg_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='cg') 233 | 234 | # Neural network that represent the policy 235 | with tf.variable_scope('actor_nn'): 236 | p_means = mlp(obs_ph, hidden_sizes, act_dim, tf.tanh, last_activation=tf.tanh) 237 | log_std = tf.get_variable(name='log_std', initializer=np.zeros(act_dim, dtype=np.float32) - 0.5) 238 | 239 | # Neural network that represent the value function 240 | with tf.variable_scope('critic_nn'): 241 | s_values = mlp(obs_ph, hidden_sizes, 1, tf.tanh, last_activation=None) 242 | s_values = tf.squeeze(s_values) 243 | 244 | # Add "noise" 
to the predicted mean following the Gaussian distribution with standard deviation e^(log_std) 245 | p_noisy = p_means + tf.random_normal(tf.shape(p_means), 0, 1) * tf.exp(log_std) 246 | # Clip the noisy actions 247 | a_sampl = tf.clip_by_value(p_noisy, low_action_space, high_action_space) 248 | # Compute the Gaussian log likelihood 249 | p_log = gaussian_log_likelihood(act_ph, p_means, log_std) 250 | 251 | # Measure the divergence 252 | diverg = tf.reduce_mean(tf.exp(old_p_log_ph - p_log)) 253 | 254 | # ratio 255 | ratio_new_old = tf.exp(p_log - old_p_log_ph) 256 | # TRPO surrogate loss function 257 | p_loss = - tf.reduce_mean(ratio_new_old * adv_ph) 258 | 259 | # MSE loss function 260 | v_loss = tf.reduce_mean((ret_ph - s_values)**2) 261 | # Critic optimization 262 | v_opt = tf.train.AdamOptimizer(cr_lr).minimize(v_loss) 263 | 264 | def variables_in_scope(scope): 265 | # get all trainable variables in 'scope' 266 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope) 267 | 268 | # Gather and flatten the actor parameters 269 | p_variables = variables_in_scope('actor_nn') 270 | p_var_flatten = flatten_list(p_variables) 271 | 272 | # Gradient of the policy loss with respect to the actor parameters 273 | p_grads = tf.gradients(p_loss, p_variables) 274 | p_grads_flatten = flatten_list(p_grads) 275 | 276 | ########### RESTORE ACTOR PARAMETERS ########### 277 | p_old_variables = tf.placeholder(shape=(None,), dtype=tf.float32, name='p_old_variables') 278 | # variable used as index for restoring the actor's parameters 279 | it_v1 = tf.Variable(0, trainable=False) 280 | restore_params = [] 281 | 282 | for p_v in p_variables: 283 | upd_rsh = tf.reshape(p_old_variables[it_v1 : it_v1+tf.reduce_prod(p_v.shape)], shape=p_v.shape) 284 | restore_params.append(p_v.assign(upd_rsh)) 285 | it_v1 += tf.reduce_prod(p_v.shape) 286 | 287 | restore_params = tf.group(*restore_params) 288 | 289 | # Gaussian KL divergence of the two policies 290 | dkl_diverg = gaussian_DKL(old_mu_ph, old_log_std_ph, p_means, log_std) 291 | 292 | # Jacobian of the KL divergence (Needed for the Fisher matrix-vector product) 293 | dkl_diverg_grad = tf.gradients(dkl_diverg, p_variables) 294 | 295 | dkl_matrix_product = tf.reduce_sum(flatten_list(dkl_diverg_grad) * p_ph) 296 | print('dkl_matrix_product', dkl_matrix_product.shape) 297 | # Fisher vector product 298 | # The Fisher-vector product is a way to compute the product of A with a vector without building the full A matrix 299 | Fx = flatten_list(tf.gradients(dkl_matrix_product, p_variables)) 300 | 301 | ## Step length 302 | beta_ph = tf.placeholder(shape=(), dtype=tf.float32, name='beta') 303 | # NPG update 304 | npg_update = beta_ph * cg_ph 305 | 306 | ## alpha is found through line search 307 | alpha = tf.Variable(1., trainable=False) 308 | # TRPO update 309 | trpo_update = alpha * npg_update 310 | 311 | #################### POLICY UPDATE ################### 312 | # variable used as an index 313 | it_v = tf.Variable(0, trainable=False) 314 | p_opt = [] 315 | # Apply the updates to the policy 316 | for p_v in p_variables: 317 | upd_rsh = tf.reshape(trpo_update[it_v : it_v+tf.reduce_prod(p_v.shape)], shape=p_v.shape) 318 | p_opt.append(p_v.assign_sub(upd_rsh)) 319 | it_v += tf.reduce_prod(p_v.shape) 320 | 321 | p_opt = tf.group(*p_opt) 322 | 323 | # Time 324 | now = datetime.now() 325 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second) 326 | print('Time:', clock_time) 327 | 328 | 329 | # Set scalars and histograms for TensorBoard 330 | tf.summary.scalar('p_loss',
p_loss, collections=['train']) 331 | tf.summary.scalar('v_loss', v_loss, collections=['train']) 332 | tf.summary.scalar('p_divergence', diverg, collections=['train']) 333 | tf.summary.scalar('ratio_new_old',tf.reduce_mean(ratio_new_old), collections=['train']) 334 | tf.summary.scalar('dkl_diverg', dkl_diverg, collections=['train']) 335 | tf.summary.scalar('alpha', alpha, collections=['train']) 336 | tf.summary.scalar('beta', beta_ph, collections=['train']) 337 | tf.summary.scalar('p_std_mn', tf.reduce_mean(tf.exp(log_std)), collections=['train']) 338 | tf.summary.scalar('s_values_mn', tf.reduce_mean(s_values), collections=['train']) 339 | tf.summary.histogram('p_log', p_log, collections=['train']) 340 | tf.summary.histogram('p_means', p_means, collections=['train']) 341 | tf.summary.histogram('s_values', s_values, collections=['train']) 342 | tf.summary.histogram('adv_ph',adv_ph, collections=['train']) 343 | tf.summary.histogram('log_std',log_std, collections=['train']) 344 | scalar_summary = tf.summary.merge_all('train') 345 | 346 | tf.summary.scalar('old_v_loss', v_loss, collections=['pre_train']) 347 | tf.summary.scalar('old_p_loss', p_loss, collections=['pre_train']) 348 | pre_scalar_summary = tf.summary.merge_all('pre_train') 349 | 350 | hyp_str = '-spe_'+str(steps_per_env)+'-envs_'+str(number_envs)+'-cr_lr'+str(cr_lr)+'-crit_it_'+str(critic_iter)+'-delta_'+str(delta)+'-conj_iters_'+str(conj_iters) 351 | file_writer = tf.summary.FileWriter('log_dir/'+env_name+'/'+algorithm+'_'+clock_time+'_'+hyp_str, tf.get_default_graph()) 352 | 353 | # create a session 354 | sess = tf.Session() 355 | # initialize the variables 356 | sess.run(tf.global_variables_initializer()) 357 | 358 | # variable to store the total number of steps 359 | step_count = 0 360 | 361 | print('Env batch size:',steps_per_env, ' Batch size:',steps_per_env*number_envs) 362 | 363 | for ep in range(num_epochs): 364 | # Create the buffer that will contain the trajectories (full or partial) 365 | # run with the last policy 366 | buffer = Buffer(gamma, lam) 367 | # lists to store rewards and length of the trajectories completed 368 | batch_rew = [] 369 | batch_len = [] 370 | 371 | # Execute in serial the environment, storing temporarily the trajectories. 372 | for env in envs: 373 | temp_buf = [] 374 | 375 | # iterate over a fixed number of steps 376 | for _ in range(steps_per_env): 377 | # run the policy 378 | act, val = sess.run([a_sampl, s_values], feed_dict={obs_ph:[env.n_obs]}) 379 | act = np.squeeze(act) 380 | 381 | # take a step in the environment 382 | obs2, rew, done, _ = env.step(act) 383 | 384 | # add the new transition to the temporary buffer 385 | temp_buf.append([env.n_obs.copy(), rew, act, np.squeeze(val)]) 386 | 387 | env.n_obs = obs2.copy() 388 | step_count += 1 389 | 390 | if done: 391 | # Store the full trajectory in the buffer 392 | # (the value of the last state is 0 as the trajectory is completed) 393 | buffer.store(np.array(temp_buf), 0) 394 | # Empty temporary buffer 395 | temp_buf = [] 396 | 397 | batch_rew.append(env.get_episode_reward()) 398 | batch_len.append(env.get_episode_length()) 399 | 400 | env.reset() 401 | 402 | # Bootstrap with the estimated state value of the next state! 403 | lsv = sess.run(s_values, feed_dict={obs_ph:[env.n_obs]}) 404 | buffer.store(np.array(temp_buf), np.squeeze(lsv)) 405 | 406 | 407 | # Get the entire batch from the buffer 408 | # NB: all the batch is used and deleted after the optimization. 
This is because TRPO is on-policy 409 | obs_batch, act_batch, adv_batch, rtg_batch = buffer.get_batch() 410 | 411 | # log probabilities, logits and log std of the "old" policy 412 | # "old" policy refers to the policy being optimized, which has been used to sample from the environment 413 | old_p_log, old_p_means, old_log_std = sess.run([p_log, p_means, log_std], feed_dict={obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch}) 414 | # get also the "old" parameters 415 | old_actor_params = sess.run(p_var_flatten) 416 | 417 | # old_p_loss is later used in the line search 418 | # run pre_scalar_summary for a summary before the optimization 419 | old_p_loss, summary = sess.run([p_loss,pre_scalar_summary], feed_dict={obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch, old_p_log_ph:old_p_log}) 420 | file_writer.add_summary(summary, step_count) 421 | 422 | def H_f(p): 423 | ''' 424 | Run the Fisher-Vector product on 'p' to approximate the Hessian of the DKL 425 | ''' 426 | return sess.run(Fx, feed_dict={old_mu_ph:old_p_means, old_log_std_ph:old_log_std, p_ph:p, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch}) 427 | 428 | g_f = sess.run(p_grads_flatten, feed_dict={old_mu_ph:old_p_means,obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch, old_p_log_ph:old_p_log}) 429 | ## Compute the Conjugate Gradient so as to obtain an approximation of H^(-1)*g 430 | # Where H isn't the true Hessian of the KL divergence but an approximation of it computed via the Fisher-Vector product (F) 431 | conj_grad = conjugate_gradient(H_f, g_f, iters=conj_iters) 432 | 433 | # Compute the step length 434 | beta_np = np.sqrt(2*delta / np.sum(conj_grad * H_f(conj_grad))) 435 | 436 | def DKL(alpha_v): 437 | ''' 438 | Compute the KL divergence. 439 | It temporarily applies the policy update to compute the DKL and the policy loss, then it restores the old parameters. 440 | ''' 441 | sess.run(p_opt, feed_dict={beta_ph:beta_np, alpha:alpha_v, cg_ph:conj_grad, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, old_p_log_ph:old_p_log}) 442 | a_res = sess.run([dkl_diverg, p_loss], feed_dict={old_mu_ph:old_p_means, old_log_std_ph:old_log_std, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch, old_p_log_ph:old_p_log}) 443 | sess.run(restore_params, feed_dict={p_old_variables: old_actor_params}) 444 | return a_res 445 | 446 | # Actor optimization step 447 | # Different for TRPO or NPG 448 | if algorithm=='TRPO': 449 | # Backtracking line search to find the maximum alpha coefficient s.t.
the constraint is valid 450 | best_alpha = backtracking_line_search(DKL, delta, old_p_loss, p=0.8) 451 | sess.run(p_opt, feed_dict={beta_ph:beta_np, alpha:best_alpha, cg_ph:conj_grad, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, old_p_log_ph:old_p_log}) 452 | elif algorithm=='NPG': 453 | # In case of NPG, no line search 454 | sess.run(p_opt, feed_dict={beta_ph:beta_np, alpha:1, cg_ph:conj_grad, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, old_p_log_ph:old_p_log}) 455 | 456 | 457 | lb = len(buffer) 458 | shuffled_batch = np.arange(lb) 459 | np.random.shuffle(shuffled_batch) 460 | 461 | # Value function optimization steps 462 | for _ in range(critic_iter): 463 | # shuffle the batch on every iteration 464 | np.random.shuffle(shuffled_batch) 465 | for idx in range(0,lb, minibatch_size): 466 | minib = shuffled_batch[idx:min(idx+minibatch_size,lb)] 467 | sess.run(v_opt, feed_dict={obs_ph:obs_batch[minib], ret_ph:rtg_batch[minib]}) 468 | 469 | # print some statistics and run the summary for visualizing it on TB 470 | if len(batch_rew) > 0: 471 | train_summary = sess.run(scalar_summary, feed_dict={beta_ph:beta_np, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, cg_ph:conj_grad, 472 | old_p_log_ph:old_p_log, ret_ph:rtg_batch, old_mu_ph:old_p_means, old_log_std_ph:old_log_std}) 473 | file_writer.add_summary(train_summary, step_count) 474 | 475 | summary = tf.Summary() 476 | summary.value.add(tag='supplementary/performance', simple_value=np.mean(batch_rew)) 477 | summary.value.add(tag='supplementary/len', simple_value=np.mean(batch_len)) 478 | file_writer.add_summary(summary, step_count) 479 | file_writer.flush() 480 | 481 | print('Ep:%d Rew:%.2f -- Step:%d' % (ep, np.mean(batch_rew), step_count)) 482 | 483 | # closing environments.. 484 | for env in envs: 485 | env.close() 486 | 487 | file_writer.close() 488 | 489 | if __name__ == '__main__': 490 | TRPO('RoboschoolWalker2d-v1', hidden_sizes=[64,64], cr_lr=2e-3, gamma=0.99, lam=0.95, num_epochs=1000, steps_per_env=6000, 491 | number_envs=1, critic_iter=10, delta=0.01, algorithm='TRPO', conj_iters=10, minibatch_size=1000) 492 | -------------------------------------------------------------------------------- /Chapter08/DDPG.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from datetime import datetime 5 | from collections import deque 6 | import time 7 | 8 | current_milli_time = lambda: int(round(time.time() * 1000)) 9 | 10 | def mlp(x, hidden_layers, output_layer, activation=tf.nn.relu, last_activation=None): 11 | ''' 12 | Multi-layer perceptron 13 | ''' 14 | for l in hidden_layers: 15 | x = tf.layers.dense(x, units=l, activation=activation) 16 | return tf.layers.dense(x, units=output_layer, activation=last_activation) 17 | 18 | def deterministic_actor_critic(x, a, hidden_sizes, act_dim, max_act): 19 | ''' 20 | Deterministic Actor-Critic 21 | ''' 22 | # Actor 23 | with tf.variable_scope('p_mlp'): 24 | p_means = max_act * mlp(x, hidden_sizes, act_dim, last_activation=tf.tanh) 25 | 26 | # Critic with as input the deterministic action of the actor 27 | with tf.variable_scope('q_mlp'): 28 | q_d = mlp(tf.concat([x,p_means], axis=-1), hidden_sizes, 1, last_activation=None) 29 | 30 | # Critic with as input an arbirtary action 31 | with tf.variable_scope('q_mlp', reuse=True): # Use the weights of the mlp just defined 32 | q_a = mlp(tf.concat([x,a], axis=-1), hidden_sizes, 1, last_activation=None) 33 | 34 | return p_means, 
tf.squeeze(q_d), tf.squeeze(q_a) 35 | 36 | class ExperiencedBuffer(): 37 | ''' 38 | Experienced buffer 39 | ''' 40 | def __init__(self, buffer_size): 41 | # Contains up to 'buffer_size' experience 42 | self.obs_buf = deque(maxlen=buffer_size) 43 | self.rew_buf = deque(maxlen=buffer_size) 44 | self.act_buf = deque(maxlen=buffer_size) 45 | self.obs2_buf = deque(maxlen=buffer_size) 46 | self.done_buf = deque(maxlen=buffer_size) 47 | 48 | 49 | def add(self, obs, rew, act, obs2, done): 50 | ''' 51 | Add a new transition to the buffers 52 | ''' 53 | self.obs_buf.append(obs) 54 | self.rew_buf.append(rew) 55 | self.act_buf.append(act) 56 | self.obs2_buf.append(obs2) 57 | self.done_buf.append(done) 58 | 59 | 60 | def sample_minibatch(self, batch_size): 61 | ''' 62 | Sample a mini-batch of size 'batch_size' 63 | ''' 64 | mb_indices = np.random.randint(len(self.obs_buf), size=batch_size) 65 | 66 | mb_obs = [self.obs_buf[i] for i in mb_indices] 67 | mb_rew = [self.rew_buf[i] for i in mb_indices] 68 | mb_act = [self.act_buf[i] for i in mb_indices] 69 | mb_obs2 = [self.obs2_buf[i] for i in mb_indices] 70 | mb_done = [self.done_buf[i] for i in mb_indices] 71 | 72 | return mb_obs, mb_rew, mb_act, mb_obs2, mb_done 73 | 74 | def __len__(self): 75 | return len(self.obs_buf) 76 | 77 | def test_agent(env_test, agent_op, num_games=10): 78 | ''' 79 | Test an agent 'agent_op', 'num_games' times 80 | Return mean and std 81 | ''' 82 | games_r = [] 83 | for _ in range(num_games): 84 | d = False 85 | game_r = 0 86 | o = env_test.reset() 87 | 88 | while not d: 89 | a_s = agent_op(o) 90 | o, r, d, _ = env_test.step(a_s) 91 | game_r += r 92 | 93 | games_r.append(game_r) 94 | return np.mean(games_r), np.std(games_r) 95 | 96 | 97 | 98 | def DDPG(env_name, hidden_sizes=[32], ac_lr=1e-2, cr_lr=1e-2, num_epochs=2000, buffer_size=5000, discount=0.99, render_cycle=100, mean_summaries_steps=1000, 99 | batch_size=128, min_buffer_size=5000, tau=0.005): 100 | 101 | # Create an environment for training 102 | env = gym.make(env_name) 103 | # Create an environment for testing the actor 104 | env_test = gym.make(env_name) 105 | 106 | tf.reset_default_graph() 107 | 108 | obs_dim = env.observation_space.shape 109 | act_dim = env.action_space.shape 110 | print('-- Observation space:', obs_dim, ' Action space:', act_dim, '--') 111 | 112 | # Create some placeholders 113 | obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs') 114 | act_ph = tf.placeholder(shape=(None, act_dim[0]), dtype=tf.float32, name='act') 115 | y_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='y') 116 | 117 | # Create an online deterministic actor-critic 118 | with tf.variable_scope('online'): 119 | p_onl, qd_onl, qa_onl = deterministic_actor_critic(obs_ph, act_ph, hidden_sizes, act_dim[0], np.max(env.action_space.high)) 120 | # and a target one 121 | with tf.variable_scope('target'): 122 | _, qd_tar, _ = deterministic_actor_critic(obs_ph, act_ph, hidden_sizes, act_dim[0], np.max(env.action_space.high)) 123 | 124 | def variables_in_scope(scope): 125 | ''' 126 | Retrieve all the variables in the scope 'scope' 127 | ''' 128 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope) 129 | 130 | # Copy all the online variables to the target networks i.e. 
target = online 131 | # Needed only at the beginning 132 | init_target = [target_var.assign(online_var) for target_var, online_var in zip(variables_in_scope('target'), variables_in_scope('online'))] 133 | init_target_op = tf.group(*init_target) 134 | 135 | # Soft update 136 | update_target = [target_var.assign(tau*online_var + (1-tau)*target_var) for target_var, online_var in zip(variables_in_scope('target'), variables_in_scope('online'))] 137 | update_target_op = tf.group(*update_target) 138 | 139 | # Critic loss (MSE) 140 | q_loss = tf.reduce_mean((qa_onl - y_ph)**2) 141 | # Actor loss 142 | p_loss = -tf.reduce_mean(qd_onl) 143 | 144 | # Optimize the critic 145 | q_opt = tf.train.AdamOptimizer(cr_lr).minimize(q_loss) 146 | # Optimize the actor 147 | p_opt = tf.train.AdamOptimizer(ac_lr).minimize(p_loss, var_list=variables_in_scope('online/p_mlp')) 148 | 149 | 150 | def agent_op(o): 151 | a = np.squeeze(sess.run(p_onl, feed_dict={obs_ph:[o]})) 152 | return np.clip(a, env.action_space.low, env.action_space.high) 153 | 154 | def agent_noisy_op(o, scale): 155 | action = agent_op(o) 156 | noisy_action = action + np.random.normal(loc=0.0, scale=scale, size=action.shape) 157 | return np.clip(noisy_action, env.action_space.low, env.action_space.high) 158 | 159 | 160 | # Time 161 | now = datetime.now() 162 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, int(now.second)) 163 | print('Time:', clock_time) 164 | 165 | 166 | # Set TensorBoard 167 | tf.summary.scalar('loss/q', q_loss) 168 | tf.summary.scalar('loss/p', p_loss) 169 | scalar_summary = tf.summary.merge_all() 170 | 171 | hyp_str = '-aclr_'+str(ac_lr)+'-crlr_'+str(cr_lr)+'-tau_'+str(tau) 172 | 173 | file_writer = tf.summary.FileWriter('log_dir/'+env_name+'/DDPG_'+clock_time+'_'+hyp_str, tf.get_default_graph()) 174 | 175 | # Create a session and initialize the variables 176 | sess = tf.Session() 177 | sess.run(tf.global_variables_initializer()) 178 | sess.run(init_target_op) 179 | 180 | # Some useful variables.. 
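# Summary of the training loop below (descriptive comments only, using the variables defined above):
# - until the buffer holds min_buffer_size transitions, actions are sampled randomly; afterwards agent_noisy_op adds Gaussian exploration noise
# - each transition (obs, rew, act, obs2, done) is stored in the replay buffer
# - once the buffer is large enough, every step samples a minibatch and performs one DDPG update:
#     y = rew + discount * (1 - done) * Q_target(obs2, mu_target(obs2))        (computed from qd_tar)
#     the critic minimizes the MSE between Q_online(obs, act) and y            (q_loss / q_opt)
#     the actor maximizes Q_online(obs, mu(obs)), i.e. minimizes p_loss, updating only the 'online/p_mlp' weights
#     the target networks are moved towards the online ones with the soft update of coefficient tau (update_target_op)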
181 | render_the_game = False 182 | step_count = 0 183 | last_q_update_loss = [] 184 | last_p_update_loss = [] 185 | ep_time = current_milli_time() 186 | batch_rew = [] 187 | 188 | # Reset the environment 189 | obs = env.reset() 190 | # Initialize the buffer 191 | buffer = ExperiencedBuffer(buffer_size) 192 | 193 | 194 | for ep in range(num_epochs): 195 | g_rew = 0 196 | done = False 197 | 198 | while not done: 199 | # If not gathered enough experience yet, act randomly 200 | if len(buffer) < min_buffer_size: 201 | act = env.action_space.sample() 202 | else: 203 | act = agent_noisy_op(obs, 0.1) 204 | 205 | # Take a step in the environment 206 | obs2, rew, done, _ = env.step(act) 207 | 208 | if render_the_game: 209 | env.render() 210 | 211 | # Add the transition in the buffer 212 | buffer.add(obs.copy(), rew, act, obs2.copy(), done) 213 | 214 | obs = obs2 215 | g_rew += rew 216 | step_count += 1 217 | 218 | if len(buffer) > min_buffer_size: 219 | # sample a mini batch from the buffer 220 | mb_obs, mb_rew, mb_act, mb_obs2, mb_done = buffer.sample_minibatch(batch_size) 221 | 222 | # Compute the target values 223 | q_target_mb = sess.run(qd_tar, feed_dict={obs_ph:mb_obs2}) 224 | y_r = np.array(mb_rew) + discount*(1-np.array(mb_done))*q_target_mb 225 | 226 | # optimize the critic 227 | train_summary, _, q_train_loss = sess.run([scalar_summary, q_opt, q_loss], feed_dict={obs_ph:mb_obs, y_ph:y_r, act_ph: mb_act}) 228 | 229 | # optimize the actor 230 | _, p_train_loss = sess.run([p_opt, p_loss], feed_dict={obs_ph:mb_obs}) 231 | 232 | # summaries.. 233 | file_writer.add_summary(train_summary, step_count) 234 | last_q_update_loss.append(q_train_loss) 235 | last_p_update_loss.append(p_train_loss) 236 | 237 | # Soft update of the target networks 238 | sess.run(update_target_op) 239 | 240 | # some 'mean' summaries to plot more smooth functions 241 | if step_count % mean_summaries_steps == 0: 242 | summary = tf.Summary() 243 | summary.value.add(tag='loss/mean_q', simple_value=np.mean(last_q_update_loss)) 244 | summary.value.add(tag='loss/mean_p', simple_value=np.mean(last_p_update_loss)) 245 | file_writer.add_summary(summary, step_count) 246 | file_writer.flush() 247 | 248 | last_q_update_loss = [] 249 | last_p_update_loss = [] 250 | 251 | 252 | if done: 253 | obs = env.reset() 254 | batch_rew.append(g_rew) 255 | g_rew, render_the_game = 0, False 256 | 257 | # Test the actor every 10 epochs 258 | if ep % 10 == 0: 259 | test_mn_rw, test_std_rw = test_agent(env_test, agent_op) 260 | 261 | summary = tf.Summary() 262 | summary.value.add(tag='test/reward', simple_value=test_mn_rw) 263 | file_writer.add_summary(summary, step_count) 264 | file_writer.flush() 265 | 266 | ep_sec_time = int((current_milli_time()-ep_time) / 1000) 267 | print('Ep:%4d Rew:%4.2f -- Step:%5d -- Test:%4.2f %4.2f -- Time:%d' % (ep,np.mean(batch_rew), step_count, test_mn_rw, test_std_rw, ep_sec_time)) 268 | 269 | ep_time = current_milli_time() 270 | batch_rew = [] 271 | 272 | if ep % render_cycle == 0: 273 | render_the_game = True 274 | 275 | # close everything 276 | file_writer.close() 277 | env.close() 278 | env_test.close() 279 | 280 | 281 | if __name__ == '__main__': 282 | DDPG('BipedalWalker-v2', hidden_sizes=[64,64], ac_lr=3e-4, cr_lr=4e-4, buffer_size=200000, mean_summaries_steps=100, batch_size=64, 283 | min_buffer_size=10000, tau=0.003) 284 | 285 | -------------------------------------------------------------------------------- /Chapter08/TD3.py: -------------------------------------------------------------------------------- 1 
| import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from datetime import datetime 5 | from collections import deque 6 | import time 7 | 8 | current_milli_time = lambda: int(round(time.time() * 1000)) 9 | 10 | def mlp(x, hidden_layers, output_layer, activation=tf.nn.relu, last_activation=None): 11 | ''' 12 | Multi-layer perceptron 13 | ''' 14 | for l in hidden_layers: 15 | x = tf.layers.dense(x, units=l, activation=activation) 16 | return tf.layers.dense(x, units=output_layer, activation=last_activation) 17 | 18 | # CHANGED FROM DDPG! 19 | def deterministic_actor_double_critic(x, a, hidden_sizes, act_dim, max_act=1): 20 | ''' 21 | Deterministic Actor-Critic 22 | ''' 23 | # Actor 24 | with tf.variable_scope('p_mlp'): 25 | p_means = max_act * mlp(x, hidden_sizes, act_dim, last_activation=tf.tanh) 26 | 27 | # First critic 28 | with tf.variable_scope('q1_mlp'): 29 | q1_d = mlp(tf.concat([x,p_means], axis=-1), hidden_sizes, 1, last_activation=None) 30 | 31 | with tf.variable_scope('q1_mlp', reuse=True): # Use the weights of the mlp just defined 32 | q1_a = mlp(tf.concat([x,a], axis=-1), hidden_sizes, 1, last_activation=None) 33 | 34 | # Second critic 35 | with tf.variable_scope('q2_mlp'): 36 | q2_d = mlp(tf.concat([x,p_means], axis=-1), hidden_sizes, 1, last_activation=None) 37 | with tf.variable_scope('q2_mlp', reuse=True): 38 | q2_a = mlp(tf.concat([x,a], axis=-1), hidden_sizes, 1, last_activation=None) 39 | 40 | return p_means, tf.squeeze(q1_d), tf.squeeze(q1_a), tf.squeeze(q2_d), tf.squeeze(q2_a) 41 | 42 | class ExperiencedBuffer(): 43 | ''' 44 | Experienced buffer 45 | ''' 46 | def __init__(self, buffer_size): 47 | # Contains up to 'buffer_size' experience 48 | self.obs_buf = deque(maxlen=buffer_size) 49 | self.rew_buf = deque(maxlen=buffer_size) 50 | self.act_buf = deque(maxlen=buffer_size) 51 | self.obs2_buf = deque(maxlen=buffer_size) 52 | self.done_buf = deque(maxlen=buffer_size) 53 | 54 | 55 | def add(self, obs, rew, act, obs2, done): 56 | ''' 57 | Add a new transition to the buffers 58 | ''' 59 | self.obs_buf.append(obs) 60 | self.rew_buf.append(rew) 61 | self.act_buf.append(act) 62 | self.obs2_buf.append(obs2) 63 | self.done_buf.append(done) 64 | 65 | 66 | def sample_minibatch(self, batch_size): 67 | ''' 68 | Sample a mini-batch of size 'batch_size' 69 | ''' 70 | mb_indices = np.random.randint(len(self.obs_buf), size=batch_size) 71 | 72 | mb_obs = [self.obs_buf[i] for i in mb_indices] 73 | mb_rew = [self.rew_buf[i] for i in mb_indices] 74 | mb_act = [self.act_buf[i] for i in mb_indices] 75 | mb_obs2 = [self.obs2_buf[i] for i in mb_indices] 76 | mb_done = [self.done_buf[i] for i in mb_indices] 77 | 78 | return mb_obs, mb_rew, mb_act, mb_obs2, mb_done 79 | 80 | def __len__(self): 81 | return len(self.obs_buf) 82 | 83 | def test_agent(env_test, agent_op, num_games=10): 84 | ''' 85 | Test an agent 'agent_op', 'num_games' times 86 | Return mean and std 87 | ''' 88 | games_r = [] 89 | 90 | for _ in range(num_games): 91 | d = False 92 | game_r = 0 93 | o = env_test.reset() 94 | 95 | while not d: 96 | a_s = agent_op(o) 97 | o, r, d, _ = env_test.step(a_s) 98 | 99 | game_r += r 100 | 101 | games_r.append(game_r) 102 | 103 | return np.mean(games_r), np.std(games_r) 104 | 105 | 106 | 107 | def TD3(env_name, hidden_sizes=[32], ac_lr=1e-2, cr_lr=1e-2, num_epochs=2000, buffer_size=5000, discount=0.99, render_cycle=10000, mean_summaries_steps=1000, 108 | batch_size=128, min_buffer_size=5000, tau=0.005, target_noise=0.2, expl_noise=0.1, policy_update_freq=2): 109 | 110 | # Create an 
environment for training 111 | env = gym.make(env_name) 112 | # Create an environment for testing the actor 113 | env_test = gym.make(env_name) 114 | 115 | tf.reset_default_graph() 116 | 117 | obs_dim = env.observation_space.shape 118 | act_dim = env.action_space.shape 119 | print('-- Observation space:', obs_dim, ' Action space:', act_dim, '--') 120 | 121 | # Create some placeholders 122 | obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs') 123 | act_ph = tf.placeholder(shape=(None, act_dim[0]), dtype=tf.float32, name='act') 124 | y_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='y') 125 | 126 | # Create an online deterministic actor and a double critic 127 | with tf.variable_scope('online'): 128 | p_onl, qd1_onl, qa1_onl, _, qa2_onl = deterministic_actor_double_critic(obs_ph, act_ph, hidden_sizes, act_dim[0], np.max(env.action_space.high)) 129 | 130 | # and a target actor and double critic 131 | with tf.variable_scope('target'): 132 | p_tar, _, qa1_tar, _, qa2_tar = deterministic_actor_double_critic(obs_ph, act_ph, hidden_sizes, act_dim[0], np.max(env.action_space.high)) 133 | 134 | def variables_in_scope(scope): 135 | ''' 136 | Retrieve all the variables in the scope 'scope' 137 | ''' 138 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope) 139 | 140 | # Copy all the online variables to the target networks i.e. target = online 141 | # Needed only at the beginning 142 | init_target = [target_var.assign(online_var) for target_var, online_var in zip(variables_in_scope('target'), variables_in_scope('online'))] 143 | init_target_op = tf.group(*init_target) 144 | 145 | # Soft update 146 | update_target = [target_var.assign(tau*online_var + (1-tau)*target_var) for target_var, online_var in zip(variables_in_scope('target'), variables_in_scope('online'))] 147 | update_target_op = tf.group(*update_target) 148 | 149 | # Critics loss (MSE) 150 | q1_loss = tf.reduce_mean((qa1_onl - y_ph)**2) 151 | q2_loss = tf.reduce_mean((qa2_onl - y_ph)**2) 152 | 153 | # Actor loss 154 | p_loss = -tf.reduce_mean(qd1_onl) 155 | 156 | # Optimize the critics 157 | q1_opt = tf.train.AdamOptimizer(cr_lr).minimize(q1_loss) 158 | q2_opt = tf.train.AdamOptimizer(cr_lr).minimize(q2_loss) 159 | 160 | # Optimize the actor 161 | p_opt = tf.train.AdamOptimizer(ac_lr).minimize(p_loss, var_list=variables_in_scope('online/p_mlp')) 162 | 163 | 164 | def add_normal_noise(x, scale, low_lim=-0.5, high_lim=0.5): 165 | return x + np.clip(np.random.normal(loc=0.0, scale=scale, size=x.shape), low_lim, high_lim) 166 | 167 | def agent_op(o): 168 | ac = np.squeeze(sess.run(p_onl, feed_dict={obs_ph:[o]})) 169 | return np.clip(ac, env.action_space.low, env.action_space.high) 170 | 171 | def agent_noisy_op(o, scale): 172 | ac = agent_op(o) 173 | return np.clip(add_normal_noise(ac, scale, env.action_space.low, env.action_space.high), env.action_space.low, env.action_space.high) 174 | 175 | 176 | # Time 177 | now = datetime.now() 178 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, int(now.second)) 179 | print('Time:', clock_time) 180 | 181 | # Set TensorBoard 182 | tf.summary.scalar('loss/q1', q1_loss) 183 | tf.summary.scalar('loss/q2', q2_loss) 184 | tf.summary.scalar('loss/p', p_loss) 185 | scalar_summary = tf.summary.merge_all() 186 | 187 | hyp_str = '-aclr_'+str(ac_lr)+'-crlr_'+str(cr_lr)+'-tau_'+str(tau) 188 | 189 | file_writer = tf.summary.FileWriter('log_dir/'+env_name+'/TD3_'+clock_time+'_'+hyp_str, tf.get_default_graph()) 190 | 191 | # Create a session and initialize 
the variables 192 | sess = tf.Session() 193 | sess.run(tf.global_variables_initializer()) 194 | sess.run(init_target_op) 195 | 196 | # Some useful variables.. 197 | render_the_game = False 198 | step_count = 0 199 | last_q1_update_loss = [] 200 | last_q2_update_loss = [] 201 | last_p_update_loss = [] 202 | ep_time = current_milli_time() 203 | batch_rew = [] 204 | 205 | # Reset the environment 206 | obs = env.reset() 207 | # Initialize the buffer 208 | buffer = ExperiencedBuffer(buffer_size) 209 | 210 | 211 | for ep in range(num_epochs): 212 | g_rew = 0 213 | done = False 214 | 215 | while not done: 216 | # If not gathered enough experience yet, act randomly 217 | if len(buffer) < min_buffer_size: 218 | act = env.action_space.sample() 219 | else: 220 | act = agent_noisy_op(obs, expl_noise) 221 | 222 | # Take a step in the environment 223 | obs2, rew, done, _ = env.step(act) 224 | 225 | if render_the_game: 226 | env.render() 227 | 228 | # Add the transition in the buffer 229 | buffer.add(obs.copy(), rew, act, obs2.copy(), done) 230 | 231 | obs = obs2 232 | g_rew += rew 233 | step_count += 1 234 | 235 | if len(buffer) > min_buffer_size: 236 | # sample a mini batch from the buffer 237 | mb_obs, mb_rew, mb_act, mb_obs2, mb_done = buffer.sample_minibatch(batch_size) 238 | 239 | 240 | double_actions = sess.run(p_tar, feed_dict={obs_ph:mb_obs2}) 241 | # Target regularization 242 | double_noisy_actions = np.clip(add_normal_noise(double_actions, target_noise), env.action_space.low, env.action_space.high) 243 | 244 | # Clipped Double Q-learning 245 | q1_target_mb, q2_target_mb = sess.run([qa1_tar,qa2_tar], feed_dict={obs_ph:mb_obs2, act_ph:double_noisy_actions}) 246 | q_target_mb = np.min([q1_target_mb, q2_target_mb], axis=0) 247 | assert(len(q1_target_mb) == len(q_target_mb)) 248 | 249 | # Compute the target values 250 | y_r = np.array(mb_rew) + discount*(1-np.array(mb_done))*q_target_mb 251 | 252 | # Optimize the critics 253 | train_summary, _, q1_train_loss, _, q2_train_loss = sess.run([scalar_summary, q1_opt, q1_loss, q2_opt, q2_loss], feed_dict={obs_ph:mb_obs, y_ph:y_r, act_ph: mb_act}) 254 | 255 | # Delayed policy update 256 | if step_count % policy_update_freq == 0: 257 | # Optimize the policy 258 | _, p_train_loss = sess.run([p_opt, p_loss], feed_dict={obs_ph:mb_obs}) 259 | 260 | # Soft update of the target networks 261 | sess.run(update_target_op) 262 | 263 | file_writer.add_summary(train_summary, step_count) 264 | last_q1_update_loss.append(q1_train_loss) 265 | last_q2_update_loss.append(q2_train_loss) 266 | last_p_update_loss.append(p_train_loss) 267 | 268 | 269 | # some 'mean' summaries to plot more smooth functions 270 | if step_count % mean_summaries_steps == 0: 271 | summary = tf.Summary() 272 | summary.value.add(tag='loss/mean_q1', simple_value=np.mean(last_q1_update_loss)) 273 | summary.value.add(tag='loss/mean_q2', simple_value=np.mean(last_q2_update_loss)) 274 | summary.value.add(tag='loss/mean_p', simple_value=np.mean(last_p_update_loss)) 275 | file_writer.add_summary(summary, step_count) 276 | file_writer.flush() 277 | 278 | last_q1_update_loss = [] 279 | last_q2_update_loss = [] 280 | last_p_update_loss = [] 281 | 282 | 283 | if done: 284 | obs = env.reset() 285 | batch_rew.append(g_rew) 286 | g_rew, render_the_game = 0, False 287 | 288 | # Test the actor every 10 epochs 289 | if ep % 10 == 0: 290 | test_mn_rw, test_std_rw = test_agent(env_test, agent_op) 291 | summary = tf.Summary() 292 | summary.value.add(tag='test/reward', simple_value=test_mn_rw) 293 | 
file_writer.add_summary(summary, step_count) 294 | file_writer.flush() 295 | 296 | ep_sec_time = int((current_milli_time()-ep_time) / 1000) 297 | print('Ep:%4d Rew:%4.2f -- Step:%5d -- Test:%4.2f %4.2f -- Time:%d' % (ep,np.mean(batch_rew), step_count, test_mn_rw, test_std_rw, ep_sec_time)) 298 | 299 | ep_time = current_milli_time() 300 | batch_rew = [] 301 | 302 | if ep % render_cycle == 0: 303 | render_the_game = True 304 | 305 | # close everything 306 | file_writer.close() 307 | env.close() 308 | env_test.close() 309 | 310 | 311 | if __name__ == '__main__': 312 | TD3('BipedalWalker-v2', hidden_sizes=[64,64], ac_lr=4e-4, cr_lr=4e-4, buffer_size=200000, mean_summaries_steps=100, batch_size=64, 313 | min_buffer_size=10000, tau=0.005, policy_update_freq=2, target_noise=0.1) -------------------------------------------------------------------------------- /Chapter10/DAgger.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from datetime import datetime 4 | import time 5 | from ple.games.flappybird import FlappyBird 6 | from ple import PLE 7 | 8 | 9 | def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activation=None): 10 | ''' 11 | Multi-layer perceptron 12 | ''' 13 | for l in hidden_layers: 14 | x = tf.layers.dense(x, units=l, activation=activation) 15 | return tf.layers.dense(x, units=output_layer, activation=last_activation) 16 | 17 | def flappy_to_list(fd): 18 | ''' 19 | Return the state dictionary as a list 20 | ''' 21 | return fd['player_y'], fd['player_vel'], fd['next_pipe_dist_to_player'], fd['next_pipe_top_y'], \ 22 | fd['next_pipe_bottom_y'], fd['next_next_pipe_dist_to_player'], fd['next_next_pipe_top_y'], \ 23 | fd['next_next_pipe_bottom_y'] 24 | 25 | def flappy_game_state(bol): 26 | ''' 27 | Normalize the game state 28 | ''' 29 | stat = flappy_to_list(bol.getGameState()) 30 | stat = (np.array(stat, dtype=np.float32) / 300.0) - 0.5 31 | return stat 32 | 33 | def no_op(env, n_act=5): 34 | for _ in range(n_act): 35 | env.act(119 if np.random.randn() < 0.5 else None) 36 | 37 | 38 | def expert(): 39 | ''' 40 | Load the computational graph and pretarined weights of the expert 41 | ''' 42 | graph = tf.get_default_graph() 43 | 44 | sess_expert = tf.Session(graph=graph) 45 | 46 | saver = tf.train.import_meta_graph('expert/model.ckpt.meta') 47 | saver.restore(sess_expert,tf.train.latest_checkpoint('expert/')) 48 | 49 | p_argmax = graph.get_tensor_by_name('actor_nn/max_act:0') 50 | obs_ph = graph.get_tensor_by_name('obs:0') 51 | 52 | def expert_policy(state): 53 | act = sess_expert.run(p_argmax, feed_dict={obs_ph:[state]}) 54 | return np.squeeze(act) 55 | 56 | return expert_policy 57 | 58 | def test_agent(policy, file_writer=None, test_games=10, step=0): 59 | game = FlappyBird() 60 | env = PLE(game, fps=30, display_screen=False) 61 | env.init() 62 | 63 | test_rewards = [] 64 | for _ in range(test_games): 65 | env.reset_game() 66 | no_op(env) 67 | 68 | game_rew = 0 69 | 70 | while not env.game_over(): 71 | 72 | state = flappy_game_state(env) 73 | 74 | action = 119 if policy(state) == 1 else None 75 | 76 | for _ in range(2): 77 | game_rew += env.act(action) 78 | 79 | test_rewards.append(game_rew) 80 | 81 | if file_writer is not None: 82 | summary = tf.Summary() 83 | summary.value.add(tag='test_performance', simple_value=game_rew) 84 | file_writer.add_summary(summary, step) 85 | file_writer.flush() 86 | 87 | return test_rewards 88 | 89 | 90 | def DAgger(hidden_sizes=[32,32], 
dagger_iterations=20, p_lr=1e-3, step_iterations=1000, batch_size=128, train_epochs=20, obs_dim=8, act_dim=2): 91 | 92 | tf.reset_default_graph() 93 | 94 | ############################## EXPERT ############################### 95 | # load the expert and return a function that predicts the expert action given a state 96 | expert_policy = expert() 97 | print('Expert performance: ', np.mean(test_agent(expert_policy))) 98 | 99 | 100 | #################### LEARNER COMPUTATIONAL GRAPH #################### 101 | obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32, name='obs') 102 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act') 103 | 104 | # Multi-layer perceptron 105 | p_logits = mlp(obs_ph, hidden_sizes, act_dim, tf.nn.relu, last_activation=None) 106 | 107 | act_max = tf.math.argmax(p_logits, axis=1) 108 | act_onehot = tf.one_hot(act_ph, depth=act_dim) 109 | 110 | # softmax cross entropy loss 111 | p_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=act_onehot, logits=p_logits)) 112 | # Adam optimizer 113 | p_opt = tf.train.AdamOptimizer(p_lr).minimize(p_loss) 114 | 115 | 116 | now = datetime.now() 117 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second) 118 | file_writer = tf.summary.FileWriter('log_dir/FlappyBird/DAgger_'+clock_time, tf.get_default_graph()) 119 | 120 | sess = tf.Session() 121 | sess.run(tf.global_variables_initializer()) 122 | 123 | def learner_policy(state): 124 | action = sess.run(act_max, feed_dict={obs_ph:[state]}) 125 | return np.squeeze(action) 126 | 127 | X = [] 128 | y = [] 129 | 130 | env = FlappyBird() 131 | 132 | env = PLE(env, fps=30, display_screen=False) 133 | env.init() 134 | 135 | #################### DAgger iterations #################### 136 | 137 | for it in range(dagger_iterations): 138 | sess.run(tf.global_variables_initializer()) 139 | env.reset_game() 140 | no_op(env) 141 | 142 | game_rew = 0 143 | rewards = [] 144 | 145 | ###################### Populate the dataset ##################### 146 | 147 | for _ in range(step_iterations): 148 | # get the current state from the environment 149 | state = flappy_game_state(env) 150 | 151 | # As the iterations continue, use more and more actions sampled from the learner 152 | if np.random.rand() < (1 - it/5): 153 | action = expert_policy(state) 154 | else: 155 | action = learner_policy(state) 156 | 157 | action = 119 if action == 1 else None 158 | 159 | rew = env.act(action) 160 | rew += env.act(action) 161 | 162 | # Add the state and the expert action to the dataset 163 | X.append(state) 164 | y.append(expert_policy(state)) 165 | 166 | game_rew += rew 167 | 168 | # Whenever the game stops, reset the environment and initialize the variables 169 | if env.game_over(): 170 | env.reset_game() 171 | no_op(env) 172 | 173 | rewards.append(game_rew) 174 | game_rew = 0 175 | 176 | ##################### Training ##################### 177 | 178 | # Calculate the number of minibatches 179 | n_batches = int(np.floor(len(X)/batch_size)) 180 | 181 | # shuffle the dataset 182 | shuffle = np.arange(len(X)) 183 | np.random.shuffle(shuffle) 184 | 185 | 186 | shuffled_X = np.array(X)[shuffle] 187 | shuffled_y = np.array(y)[shuffle] 188 | 189 | 190 | for _ in range(train_epochs): 191 | ep_loss = [] 192 | # Train the model on each minibatch in the dataset 193 | for b in range(n_batches): 194 | p_start = b*batch_size 195 | 196 | # mini-batch training 197 | tr_loss, _ = sess.run([p_loss, p_opt], feed_dict={ 198 | obs_ph:shuffled_X[p_start:p_start+batch_size], 199
| act_ph:shuffled_y[p_start:p_start+batch_size]}) 200 | 201 | ep_loss.append(tr_loss) 202 | 203 | agent_tests = test_agent(learner_policy, file_writer, step=len(X)) 204 | 205 | print('Ep:', it, np.mean(ep_loss), 'Test:', np.mean(agent_tests)) 206 | 207 | 208 | 209 | 210 | if __name__ == "__main__": 211 | DAgger(hidden_sizes=[16,16], dagger_iterations=10, p_lr=1e-4, step_iterations=100, batch_size=50, train_epochs=2000) -------------------------------------------------------------------------------- /Chapter10/expert/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model.ckpt" 2 | all_model_checkpoint_paths: "model.ckpt" 3 | -------------------------------------------------------------------------------- /Chapter10/expert/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Reinforcement-Learning-Algorithms-with-Python/d144d314b3b5b91e9e8c37d4e0970af5d8379d1b/Chapter10/expert/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /Chapter10/expert/model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Reinforcement-Learning-Algorithms-with-Python/d144d314b3b5b91e9e8c37d4e0970af5d8379d1b/Chapter10/expert/model.ckpt.index -------------------------------------------------------------------------------- /Chapter10/expert/model.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Reinforcement-Learning-Algorithms-with-Python/d144d314b3b5b91e9e8c37d4e0970af5d8379d1b/Chapter10/expert/model.ckpt.meta -------------------------------------------------------------------------------- /Chapter11/ES.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from datetime import datetime 4 | import time 5 | import gym 6 | 7 | import multiprocessing as mp 8 | import scipy.stats as ss 9 | import contextlib 10 | import numpy as np 11 | 12 | @contextlib.contextmanager 13 | def temp_seed(seed): 14 | state = np.random.get_state() 15 | np.random.seed(seed) 16 | try: 17 | yield 18 | finally: 19 | np.random.set_state(state) 20 | 21 | def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activation=None): 22 | ''' 23 | Multi-layer perceptron 24 | ''' 25 | for l in hidden_layers: 26 | x = tf.layers.dense(x, units=l, activation=activation) 27 | 28 | return tf.layers.dense(x, units=output_layer, activation=last_activation) 29 | 30 | 31 | def test_agent(env_test, agent_op, num_games=1): 32 | ''' 33 | Test an agent 'agent_op', 'num_games' times 34 | Return mean and std 35 | ''' 36 | games_r = [] 37 | steps = 0 38 | for _ in range(num_games): 39 | d = False 40 | game_r = 0 41 | o = env_test.reset() 42 | 43 | while not d: 44 | a_s = agent_op(o) 45 | o, r, d, _ = env_test.step(a_s) 46 | game_r += r 47 | steps += 1 48 | 49 | games_r.append(game_r) 50 | return games_r, steps 51 | 52 | 53 | def worker(env_name, initial_seed, hidden_sizes, lr, std_noise, indiv_per_worker, worker_name, params_queue, output_queue): 54 | 55 | env = gym.make(env_name) 56 | obs_dim = env.observation_space.shape[0] 57 | act_dim = env.action_space.shape[0] 58 | 59 | import tensorflow as tf 60 | 61 | # set an initial seed common to all the workers 
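# Sharing the same initial seed (and, later, the per-perturbation seeds pushed on output_queue) lets every
# worker regenerate the other workers' noise vectors locally with temp_seed, so only scalar seeds and returns
# travel between processes instead of full parameter-sized perturbations.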
62 | tf.random.set_random_seed(initial_seed) 63 | np.random.seed(initial_seed) 64 | 65 | 66 | with tf.device("/cpu:" + worker_name): 67 | 68 | obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32, name='obs_ph') 69 | new_weights_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='new_weights_ph') 70 | 71 | def variables_in_scope(scope): 72 | # get all trainable variables in 'scope' 73 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope) 74 | 75 | with tf.variable_scope('nn_' + worker_name): 76 | acts = mlp(obs_ph, hidden_sizes, act_dim, tf.tanh, last_activation=tf.tanh) 77 | 78 | agent_variables = variables_in_scope('nn_' + worker_name) 79 | agent_variables_flatten = flatten_list(agent_variables) 80 | 81 | # Update the agent parameters with new weights new_weights_ph 82 | it_v1 = tf.Variable(0, trainable=False) 83 | update_weights = [] 84 | for a_v in agent_variables: 85 | upd_rsh = tf.reshape(new_weights_ph[it_v1 : it_v1+tf.reduce_prod(a_v.shape)], shape=a_v.shape) 86 | update_weights.append(a_v.assign(upd_rsh)) 87 | it_v1 += tf.reduce_prod(a_v.shape) 88 | 89 | 90 | # Reshape the new_weights_ph following the neural network shape 91 | it_v2 = tf.Variable(0, trainable=False) 92 | vars_grads_list = [] 93 | for a_v in agent_variables: 94 | vars_grads_list.append(tf.reshape(new_weights_ph[it_v2 : it_v2+tf.reduce_prod(a_v.shape)], shape=a_v.shape)) 95 | it_v2 += tf.reduce_prod(a_v.shape) 96 | 97 | # Create the optimizer 98 | opt = tf.train.AdamOptimizer(lr) 99 | # Apply the "gradients" using Adam 100 | apply_g = opt.apply_gradients([(g, v) for g, v in zip(vars_grads_list, agent_variables)]) 101 | 102 | def agent_op(o): 103 | a = np.squeeze(sess.run(acts, feed_dict={obs_ph:[o]})) 104 | return np.clip(a, env.action_space.low, env.action_space.high) 105 | 106 | 107 | def evaluation_on_noise(noise): 108 | ''' 109 | Evaluate the agent with the noise 110 | ''' 111 | # Get the original weights that will be restored after the evaluation 112 | original_weights = sess.run(agent_variables_flatten) 113 | 114 | # Update the weights of the agent/individual by adding the extra noise noise*STD_NOISE 115 | sess.run(update_weights, feed_dict={new_weights_ph:original_weights + noise*std_noise}) 116 | 117 | # Test the agent with the new weights 118 | rewards, steps = test_agent(env, agent_op) 119 | 120 | # Restore the original weights 121 | sess.run(update_weights, feed_dict={new_weights_ph:original_weights}) 122 | 123 | return np.mean(rewards), steps 124 | 125 | config_proto = tf.ConfigProto(device_count={'CPU': 4}, allow_soft_placement=True) 126 | sess = tf.Session(config=config_proto) 127 | sess.run(tf.global_variables_initializer()) 128 | 129 | 130 | agent_flatten_shape = sess.run(agent_variables_flatten).shape 131 | 132 | while True: 133 | 134 | for _ in range(indiv_per_worker): 135 | seed = np.random.randint(1e7) 136 | 137 | with temp_seed(seed): 138 | # sample, for each weight of the agent, from a normal distribution 139 | sampled_noise = np.random.normal(size=agent_flatten_shape) 140 | 141 | # Mirrored sampling 142 | pos_rew, stp1 = evaluation_on_noise(sampled_noise) 143 | neg_rew, stp2 = evaluation_on_noise(-sampled_noise) 144 | 145 | # Put the returns and seeds on the queue 146 | # Note that here we are just sending the seed (a scalar value), not the complete perturbation sampled_noise 147 | output_queue.put([[pos_rew, neg_rew], seed, stp1+stp2]) 148 | 149 | # Get all the returns and seed from each other worker 150 | batch_return, batch_seed = params_queue.get() 151 | 152 
| batch_noise = [] 153 | for seed in batch_seed: 154 | 155 | # reconstruct the perturbations from the seed 156 | with temp_seed(seed): 157 | sampled_noise = np.random.normal(size=agent_flatten_shape) 158 | 159 | batch_noise.append(sampled_noise) 160 | batch_noise.append(-sampled_noise) 161 | 162 | 163 | # Compute the stochastic gradient estimate 164 | vars_grads = np.zeros(agent_flatten_shape) 165 | for n, r in zip(batch_noise, batch_return): 166 | vars_grads += n * r 167 | vars_grads /= len(batch_noise) * std_noise 168 | 169 | # run Adam optimization on the estimated gradient just computed 170 | sess.run(apply_g, feed_dict={new_weights_ph:-vars_grads}) 171 | 172 | 173 | def normalized_rank(rewards): 174 | ''' 175 | Rank the rewards and normalize them. 176 | ''' 177 | ranked = ss.rankdata(rewards) 178 | norm = (ranked - 1) / (len(ranked) - 1) 179 | norm -= 0.5 180 | return norm 181 | 182 | 183 | def flatten(tensor): 184 | ''' 185 | Flatten a tensor 186 | ''' 187 | return tf.reshape(tensor, shape=(-1,)) 188 | 189 | def flatten_list(tensor_list): 190 | ''' 191 | Flatten a list of tensors 192 | ''' 193 | return tf.concat([flatten(t) for t in tensor_list], axis=0) 194 | 195 | 196 | 197 | def ES(env_name, hidden_sizes=[8,8], number_iter=1000, num_workers=4, lr=0.01, indiv_per_worker=10, std_noise=0.01): 198 | 199 | 200 | initial_seed = np.random.randint(1e7) 201 | 202 | # Create a queue for the output values (single returns and seed values) 203 | output_queue = mp.Queue(maxsize=num_workers*indiv_per_worker) 204 | # Create a queue for the input parameters (batch return and batch seeds) 205 | params_queue = mp.Queue(maxsize=num_workers) 206 | 207 | 208 | now = datetime.now() 209 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second) 210 | hyp_str = '-numworkers_'+str(num_workers)+'-lr_'+str(lr) 211 | file_writer = tf.summary.FileWriter('log_dir/'+env_name+'/'+clock_time+'_'+hyp_str, tf.get_default_graph()) 212 | 213 | processes = [] 214 | # Create a parallel process for each worker 215 | for widx in range(num_workers): 216 | p = mp.Process(target=worker, args=(env_name, initial_seed, hidden_sizes, lr, std_noise, indiv_per_worker, str(widx), params_queue, output_queue)) 217 | p.start() 218 | processes.append(p) 219 | 220 | tot_steps = 0 221 | # Iterate over all the training iterations 222 | for n_iter in range(number_iter): 223 | 224 | batch_seed = [] 225 | batch_return = [] 226 | 227 | # Wait until enough candidate individuals are evaluated 228 | for _ in range(num_workers*indiv_per_worker): 229 | p_rews, p_seed, p_steps = output_queue.get() 230 | 231 | batch_seed.append(p_seed) 232 | batch_return.extend(p_rews) 233 | tot_steps += p_steps 234 | 235 | print('Iter: {} Reward: {:.2f}'.format(n_iter, np.mean(batch_return))) 236 | 237 | # Let's save the population's performance 238 | summary = tf.Summary() 239 | for r in batch_return: 240 | summary.value.add(tag='performance', simple_value=r) 241 | file_writer.add_summary(summary, tot_steps) 242 | file_writer.flush() 243 | 244 | # Rank and normalize the returns 245 | batch_return = normalized_rank(batch_return) 246 | 247 | # Put on the queue all the returns and seeds so that each worker can optimize the neural network 248 | for _ in range(num_workers): 249 | params_queue.put([batch_return, batch_seed]) 250 | 251 | # terminate all workers 252 | for p in processes: 253 | p.terminate() 254 | 255 | 256 | 257 | if __name__ == '__main__': 258 | ES('LunarLanderContinuous-v2', hidden_sizes=[32,32], number_iter=200, num_workers=4,
lr=0.02, indiv_per_worker=12, std_noise=0.05) 259 | -------------------------------------------------------------------------------- /Chapter12/ESBAS.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from datetime import datetime 5 | from collections import deque 6 | import time 7 | import sys 8 | 9 | 10 | gym.logger.set_level(40) 11 | 12 | current_milli_time = lambda: int(round(time.time() * 1000)) 13 | 14 | 15 | def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activation=None): 16 | ''' 17 | Multi-layer perceptron 18 | ''' 19 | for l in hidden_layers: 20 | x = tf.layers.dense(x, units=l, activation=activation) 21 | 22 | return tf.layers.dense(x, units=output_layer, activation=last_activation) 23 | 24 | class ExperienceBuffer(): 25 | ''' 26 | Experience Replay Buffer 27 | ''' 28 | def __init__(self, buffer_size): 29 | self.obs_buf = deque(maxlen=buffer_size) 30 | self.rew_buf = deque(maxlen=buffer_size) 31 | self.act_buf = deque(maxlen=buffer_size) 32 | self.obs2_buf = deque(maxlen=buffer_size) 33 | self.done_buf = deque(maxlen=buffer_size) 34 | 35 | 36 | def add(self, obs, rew, act, obs2, done): 37 | # Add a new transition to the buffers 38 | self.obs_buf.append(obs) 39 | self.rew_buf.append(rew) 40 | self.act_buf.append(act) 41 | self.obs2_buf.append(obs2) 42 | self.done_buf.append(done) 43 | 44 | 45 | def sample_minibatch(self, batch_size): 46 | # Sample a minibatch of size batch_size 47 | mb_indices = np.random.randint(len(self.obs_buf), size=batch_size) 48 | 49 | mb_obs = [self.obs_buf[i] for i in mb_indices] 50 | mb_rew = [self.rew_buf[i] for i in mb_indices] 51 | mb_act = [self.act_buf[i] for i in mb_indices] 52 | mb_obs2 = [self.obs2_buf[i] for i in mb_indices] 53 | mb_done = [self.done_buf[i] for i in mb_indices] 54 | 55 | return mb_obs, mb_rew, mb_act, mb_obs2, mb_done 56 | 57 | def __len__(self): 58 | return len(self.obs_buf) 59 | 60 | 61 | def q_target_values(mini_batch_rw, mini_batch_done, av, discounted_value): 62 | ''' 63 | Calculate the target value y for each transition 64 | ''' 65 | max_av = np.max(av, axis=1) 66 | 67 | # if episode terminate, y take value r 68 | # otherwise, q-learning step 69 | ys = [] 70 | for r, d, av in zip(mini_batch_rw, mini_batch_done, max_av): 71 | if d: 72 | ys.append(r) 73 | else: 74 | q_step = r + discounted_value * av 75 | ys.append(q_step) 76 | 77 | assert len(ys) == len(mini_batch_rw) 78 | return ys 79 | 80 | def greedy(action_values): 81 | ''' 82 | Greedy policy 83 | ''' 84 | return np.argmax(action_values) 85 | 86 | def eps_greedy(action_values, eps=0.1): 87 | ''' 88 | Eps-greedy policy 89 | ''' 90 | if np.random.uniform(0,1) < eps: 91 | # Choose a uniform random action 92 | return np.random.randint(len(action_values)) 93 | else: 94 | # Choose the greedy action 95 | return np.argmax(action_values) 96 | 97 | def test_agent(env_test, agent_op, num_games=20, summary=None): 98 | ''' 99 | Test an agent 100 | ''' 101 | games_r = [] 102 | 103 | for _ in range(num_games): 104 | d = False 105 | game_r = 0 106 | o = env_test.reset() 107 | 108 | while not d: 109 | a = greedy(np.squeeze(agent_op(o))) 110 | o, r, d, _ = env_test.step(a) 111 | 112 | game_r += r 113 | 114 | if summary is not None: 115 | summary.value.add(tag='test_performance', simple_value=game_r) 116 | 117 | games_r.append(game_r) 118 | 119 | return games_r 120 | 121 | 122 | class DQN_optimization: 123 | def __init__(self, obs_dim, act_dim, hidden_layers, lr, 
discount): 124 | self.obs_dim = obs_dim 125 | self.act_dim = act_dim 126 | self.hidden_layers = hidden_layers 127 | self.lr = lr 128 | self.discount = discount 129 | 130 | self.__build_graph() 131 | 132 | 133 | def __build_graph(self): 134 | 135 | self.g = tf.Graph() 136 | with self.g.as_default(): 137 | # Create all the placeholders 138 | self.obs_ph = tf.placeholder(shape=(None, self.obs_dim[0]), dtype=tf.float32, name='obs') 139 | self.act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act') 140 | self.y_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='y') 141 | 142 | # Create the target network 143 | with tf.variable_scope('target_network'): 144 | self.target_qv = mlp(self.obs_ph, self.hidden_layers, self.act_dim, tf.nn.relu, last_activation=None) 145 | target_vars = tf.trainable_variables() 146 | 147 | # Create the online network (i.e. the behavior policy) 148 | with tf.variable_scope('online_network'): 149 | self.online_qv = mlp(self.obs_ph, self.hidden_layers, self.act_dim, tf.nn.relu, last_activation=None) 150 | train_vars = tf.trainable_variables() 151 | 152 | # Update the target network by assigning to it the variables of the online network 153 | # Note that the target network and the online network have the same exact architecture 154 | update_target = [train_vars[i].assign(train_vars[i+len(target_vars)]) for i in range(len(train_vars) - len(target_vars))] 155 | self.update_target_op = tf.group(*update_target) 156 | 157 | # One hot encoding of the action 158 | act_onehot = tf.one_hot(self.act_ph, depth=self.act_dim) 159 | # We are interested only in the Q-values of those actions 160 | q_values = tf.reduce_sum(act_onehot * self.online_qv, axis=1) 161 | 162 | # MSE loss function 163 | self.v_loss = tf.reduce_mean((self.y_ph - q_values)**2) 164 | # Adam optimize that minimize the loss v_loss 165 | self.v_opt = tf.train.AdamOptimizer(self.lr).minimize(self.v_loss) 166 | 167 | self.__create_session() 168 | 169 | # Copy the online network in the target network 170 | self.sess.run(self.update_target_op) 171 | 172 | def __create_session(self): 173 | # open a session 174 | self.sess = tf.Session(graph=self.g) 175 | # and initialize all the variables 176 | self.sess.run(tf.global_variables_initializer()) 177 | 178 | 179 | def act(self, o): 180 | ''' 181 | Forward pass to obtain the Q-values from the online network of a single observation 182 | ''' 183 | return self.sess.run(self.online_qv, feed_dict={self.obs_ph:[o]}) 184 | 185 | def optimize(self, mb_obs, mb_rew, mb_act, mb_obs2, mb_done): 186 | mb_trg_qv = self.sess.run(self.target_qv, feed_dict={self.obs_ph:mb_obs2}) 187 | y_r = q_target_values(mb_rew, mb_done, mb_trg_qv, self.discount) 188 | 189 | # training step 190 | # optimize, compute the loss and return the TB summary 191 | self.sess.run(self.v_opt, feed_dict={self.obs_ph:mb_obs, self.y_ph:y_r, self.act_ph: mb_act}) 192 | 193 | def update_target_network(self): 194 | # run the session to update the target network and get the mean loss sumamry 195 | self.sess.run(self.update_target_op) 196 | 197 | 198 | class UCB1: 199 | def __init__(self, algos, epsilon): 200 | self.n = 0 201 | self.epsilon = epsilon 202 | self.algos = algos 203 | 204 | self.nk = np.zeros(len(algos)) 205 | self.xk = np.zeros(len(algos)) 206 | 207 | def choose_algorithm(self): 208 | # take the best algorithm following UCB1 209 | current_best = np.argmax([self.xk[i] + np.sqrt(self.epsilon * np.log(self.n) / self.nk[i]) for i in range(len(self.algos))]) 210 | for i in range(len(self.algos)): 211 | 
211 |             if self.nk[i] < 5:
212 |                 return np.random.randint(len(self.algos))
213 | 
214 |         return current_best
215 | 
216 |     def update(self, idx_algo, traj_return):
217 |         # Update the mean return of the chosen algorithm
218 |         self.xk[idx_algo] = (self.nk[idx_algo] * self.xk[idx_algo] + traj_return) / (self.nk[idx_algo] + 1)
219 |         # increase the number of trajectories run
220 |         self.nk[idx_algo] += 1
221 |         self.n += 1
222 | 
223 | 
224 | def ESBAS(env_name, hidden_sizes=[32], lr=1e-2, num_epochs=2000, buffer_size=100000, discount=0.99, render_cycle=100, update_target_net=1000,
225 |           batch_size=64, update_freq=4, min_buffer_size=5000, test_frequency=20, start_explor=1, end_explor=0.1, explor_steps=100000,
226 |           xi=1):
227 | 
228 |     # reset the default graph
229 |     tf.reset_default_graph()
230 | 
231 |     # Create the environments for both training and testing
232 |     env = gym.make(env_name)
233 |     # Add a monitor to the test env to store the videos
234 |     env_test = gym.wrappers.Monitor(gym.make(env_name), "VIDEOS/TEST_VIDEOS"+env_name+str(current_milli_time()), force=True, video_callable=lambda x: x%20==0)
235 | 
236 |     dqns = []
237 |     for l in hidden_sizes:
238 |         dqns.append(DQN_optimization(env.observation_space.shape, env.action_space.n, l, lr, discount))
239 | 
240 |     # Time
241 |     now = datetime.now()
242 |     clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, int(now.second))
243 |     print('Time:', clock_time)
244 | 
245 |     LOG_DIR = 'log_dir/'+env_name
246 |     hyp_str = "-lr_{}-upTN_{}-upF_{}-xi_{}".format(lr, update_target_net, update_freq, xi)
247 | 
248 |     # initialize the File Writer for writing TensorBoard summaries
249 |     file_writer = tf.summary.FileWriter(LOG_DIR+'/ESBAS_'+clock_time+'_'+hyp_str, tf.get_default_graph())
250 | 
251 |     def DQNs_update(step_counter):
252 |         # If it's time to train the networks:
253 |         if len(buffer) > min_buffer_size and (step_counter % update_freq == 0):
254 | 
255 |             # sample a minibatch from the buffer
256 |             mb_obs, mb_rew, mb_act, mb_obs2, mb_done = buffer.sample_minibatch(batch_size)
257 | 
258 |             for dqn in dqns:
259 |                 dqn.optimize(mb_obs, mb_rew, mb_act, mb_obs2, mb_done)
260 | 
261 |         # Every update_target_net steps, update the target networks
262 |         if len(buffer) > min_buffer_size and (step_counter % update_target_net == 0):
263 | 
264 |             for dqn in dqns:
265 |                 dqn.update_target_network()
266 | 
267 | 
268 |     step_count = 0
269 |     episode = 0
270 |     beta = 1
271 | 
272 |     # Initialize the experience buffer
273 |     buffer = ExperienceBuffer(buffer_size)
274 | 
275 |     obs = env.reset()
276 | 
277 |     # policy exploration initialization
278 |     eps = start_explor
279 |     eps_decay = (start_explor - end_explor) / explor_steps
280 | 
281 | 
282 |     for ep in range(num_epochs):
283 | 
284 |         # Policies' training
285 |         for i in range(2**(beta-1), 2**beta):
286 |             DQNs_update(i)
287 | 
288 |         ucb1 = UCB1(dqns, xi)
289 |         list_bests = []
290 |         ep_rew = []
291 |         beta += 1
292 | 
293 |         while step_count < 2**beta:
294 | 
295 |             # Choose the best policy (i.e. the algorithm) that will run the next trajectory
296 |             best_dqn = ucb1.choose_algorithm()
297 |             list_bests.append(best_dqn)
298 | 
299 |             summary = tf.Summary()
300 |             summary.value.add(tag='algorithm_selected', simple_value=best_dqn)
301 |             file_writer.add_summary(summary, step_count)
302 |             file_writer.flush()
303 | 
304 |             g_rew = 0
305 |             done = False
306 | 
307 |             while not done:
308 |                 # Epsilon decay
309 |                 if eps > end_explor:
310 |                     eps -= eps_decay
311 | 
312 | 
313 |                 # Choose an eps-greedy action
314 |                 act = eps_greedy(np.squeeze(dqns[best_dqn].act(obs)), eps=eps)
315 | 
316 |                 # execute the action in the environment
317 |                 obs2, rew, done, _ = env.step(act)
318 | 
319 |                 # Add the transition to the replay buffer
320 |                 buffer.add(obs, rew, act, obs2, done)
321 | 
322 |                 obs = obs2
323 |                 g_rew += rew
324 |                 step_count += 1
325 | 
326 | 
327 |             # Update the UCB parameters of the algorithm that was just used
328 |             ucb1.update(best_dqn, g_rew)
329 | 
330 |             # The episode has ended: reset the environment and re-initialize the variables
331 |             obs = env.reset()
332 |             ep_rew.append(g_rew)
333 |             g_rew = 0
334 |             episode += 1
335 | 
336 | 
337 |         # Print some stats and test the best policy
338 |         summary = tf.Summary()
339 |         summary.value.add(tag='train_performance', simple_value=np.mean(ep_rew))
340 | 
341 |         if episode % 10 == 0:
342 |             unique, counts = np.unique(list_bests, return_counts=True)
343 |             print(dict(zip(unique, counts)))
344 | 
345 |         test_agent_results = test_agent(env_test, dqns[best_dqn].act, num_games=10, summary=summary)
346 |         print('Epoch:%4d Episode:%4d Rew:%4.2f, Eps:%2.2f -- Step:%5d -- Test:%4.2f Best:%2d Last:%2d' % (ep, episode, np.mean(ep_rew), eps, step_count, np.mean(test_agent_results), best_dqn, g_rew))
347 | 
348 |         file_writer.add_summary(summary, step_count)
349 |         file_writer.flush()
350 | 
351 | 
352 |     file_writer.close()
353 |     env.close()
354 | 
355 | 
356 | if __name__ == '__main__':
357 | 
358 |     #ESBAS('Acrobot-v1', hidden_sizes=[[64, 64]], lr=4e-4, buffer_size=100000, update_target_net=100, batch_size=32,
359 |     #      update_freq=4, min_buffer_size=100, render_cycle=10000, explor_steps=50000, num_epochs=20000, end_explor=0.1)
360 | 
361 |     ESBAS('Acrobot-v1', hidden_sizes=[[64], [16, 16], [64, 64]], lr=4e-4, buffer_size=100000, update_target_net=100, batch_size=32,
362 |           update_freq=4, min_buffer_size=100, render_cycle=10000, explor_steps=50000, num_epochs=20000, end_explor=0.1,
363 |           xi=1./4)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | # Reinforcement Learning Algorithms with Python
5 | 
6 | Reinforcement Learning Algorithms with Python
7 | 
8 | This is the code repository for [Reinforcement Learning Algorithms with Python](https://www.packtpub.com/data/hands-on-reinforcement-learning-algorithms-with-python), published by Packt.
9 | 
10 | **Learn, understand, and develop smart algorithms for addressing AI challenges**
11 | 
12 | ## What is this book about?
13 | Reinforcement Learning (RL) is a popular and promising branch of AI that involves making smarter models and agents that can automatically determine ideal behavior based on changing requirements. This book will help you master RL algorithms and understand their implementation as you build self-learning agents.
14 | Starting with an introduction to the tools, libraries, and setup needed to work in the RL environment, this book covers the building blocks of RL and delves into value-based methods, such as the application of Q-learning and SARSA algorithms. You'll learn how to use a combination of Q-learning and neural networks to solve complex problems. Furthermore, you'll study the policy gradient methods, TRPO, and PPO, to improve performance and stability, before moving on to the DDPG and TD3 deterministic algorithms. This book also covers how imitation learning techniques work and how DAgger can teach an agent to drive. You'll discover evolutionary strategies and black-box optimization techniques, and see how they can improve RL algorithms.
Finally, you'll get to grips with exploration approaches, such as UCB and UCB1, and develop a meta-algorithm called ESBAS.
15 | By the end of the book, you'll have worked with key RL algorithms to overcome challenges in real-world applications, and be part of the RL research community.
16 | 
17 | 
18 | This book covers the following exciting features:
19 | * Develop an agent to play CartPole using the OpenAI Gym interface
20 | * Discover the model-based reinforcement learning paradigm
21 | * Solve the Frozen Lake problem with dynamic programming
22 | * Explore Q-learning and SARSA with a view to playing a taxi game
23 | * Apply Deep Q-Networks (DQNs) to Atari games using Gym
24 | * Study policy gradient algorithms, including Actor-Critic and REINFORCE
25 | * Understand and apply PPO and TRPO in continuous locomotion environments
26 | * Get to grips with evolution strategies for solving the lunar lander problem
27 | 
28 | If you feel this book is for you, get your [copy](https://www.amazon.com/Reinforcement-Learning-Algorithms-Python-understand/dp/1789131111/) today!
29 | 
30 | 
31 | https://www.packtpub.com/
32 | 
33 | ## Instructions and Navigations
34 | All of the code is organized into folders. For example, Chapter02.
35 | 
36 | The code will look like the following:
37 | ```
38 | import gym
39 | 
40 | # create the environment
41 | env = gym.make("CartPole-v1")
42 | # reset the environment before starting
43 | env.reset()
44 | 
45 | # loop 10 times
46 | for i in range(10):
47 |     # take a random action
48 |     env.step(env.action_space.sample())
49 |     # render the game
50 |     env.render()
51 | 
52 | # close the environment
53 | env.close()
54 | ```
55 | 
56 | **Following is what you need for this book:**
57 | If you are an AI researcher, deep learning user, or anyone who wants to learn reinforcement learning from scratch, this book is for you. You'll also find this reinforcement learning book useful if you want to learn about the advancements in the field. Working knowledge of Python is necessary.
58 | 
59 | 
60 | With the following software and hardware list you can run all code files present in the book (Chapters 1-11).
61 | ### Software and Hardware List
62 | | Chapter | Software required | OS required |
63 | | -------- | ------------------------------------ | ----------------------------------- |
64 | | All | Python 3.6 or higher | Windows, Mac OS X, and Linux (Any) |
65 | | All | TensorFlow 1.14 or higher | Windows, Mac OS X, and Linux (Any) |
66 | 
67 | We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](http://www.packtpub.com/sites/default/files/downloads/9781789131116_ColorImages.pdf).
68 | 
69 | ### Related products
70 | * Hands-On Reinforcement Learning with Python [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/hands-reinforcement-learning-python) [[Amazon]](https://www.amazon.com/Hands-Reinforcement-Learning-Python-reinforcement-ebook/dp/B079Q3WLM4/)
71 | 
72 | * Python Reinforcement Learning Projects [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/python-reinforcement-learning-projects) [[Amazon]](https://www.amazon.com/Python-Reinforcement-Learning-Projects-hands-ebook/dp/B07F2S82W3/)
73 | 
74 | ## Get to Know the Author
75 | **Andrea Lonza** is a deep learning engineer with a great passion for artificial intelligence and a desire to create machines that act intelligently.
He has acquired expert knowledge in reinforcement learning, natural language processing, and computer vision through academic and industrial machine learning projects. He has also participated in several Kaggle competitions, achieving high results. He is always looking for compelling challenges and loves to prove himself.
76 | 
77 | 
78 | 
79 | ### Suggestions and Feedback
80 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions.
81 | 
82 | 
83 | ### Download a free PDF
84 | 
85 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
86 | https://packt.link/free-ebook/9781789131116
--------------------------------------------------------------------------------