├── Chapter02 └── Code.ipynb ├── Chapter03 ├── frozenlake8x8_policyiteration.py └── frozenlake8x8_valueiteration.py ├── Chapter04 └── SARSA Q_learning Taxi-v2.py ├── Chapter05 ├── .ipynb_checkpoints │ └── Untitled-checkpoint.ipynb ├── DQN_Atari.py ├── DQN_variations_Atari.py ├── Untitled.ipynb ├── atari_wrappers.py └── untitled ├── Chapter06 ├── AC.py ├── REINFORCE.py └── REINFORCE_baseline.py ├── Chapter07 ├── PPO.py └── TRPO.py ├── Chapter08 ├── DDPG.py └── TD3.py ├── Chapter09 └── ME-TRPO.py ├── Chapter10 ├── DAgger.py └── expert │ ├── checkpoint │ ├── model.ckpt.data-00000-of-00001 │ ├── model.ckpt.index │ └── model.ckpt.meta ├── Chapter11 └── ES.py ├── Chapter12 └── ESBAS.py └── README.md /Chapter02/Code.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#### TensorFlow installation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "`pip3 install tensorflow`\n", 15 | "\n", 16 | "or\n", 17 | "\n", 18 | "`pip3 install tensorflow-gpu`" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "#### OpenAI Gym installation" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "On OSX: \n", 33 | "\n", 34 | "`brew install cmake boost boost-python sdl2 swig wget`\n", 35 | " \n", 36 | "On Ubuntu 16.04:\n", 37 | "\n", 38 | "`apt-get install -y python-pyglet python3-opengl zlib1g-dev libjpeg-dev patchelf cmake swig libboost-all-dev libsdl2-dev libosmesa6-dev xvfb ffmpeg`\n", 39 | "\n", 40 | "On Ubuntu 18.04\n", 41 | "\n", 42 | "`sudo apt install -y python3-dev zlib1g-dev libjpeg-dev cmake swig python-pyglet python3-opengl libboost-all-dev libsdl2-dev libosmesa6-dev patchelf ffmpeg xvfb `\n", 43 | "\n", 44 | "Then:\n", 45 | "\n", 46 | "```\n", 47 | "git clone https://github.com/openai/gym.git \n", 48 | "\n", 49 | "cd gym\n", 50 | "\n", 51 | "pip install -e '.[all]'\n", 52 | "```\n", 53 | "\n", 54 | "PyBox2D:\n", 55 | "\n", 56 | "```\n", 57 | "git clone https://github.com/pybox2d/pybox2d\n", 58 | "cd pybox2d\n", 59 | "pip3 install -e .\n", 60 | "```\n" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "#### Duckietown installation\n", 68 | "\n", 69 | "```\n", 70 | "git clone https://github.com/duckietown/gym-duckietown.git\n", 71 | "cd gym-duckietown\n", 72 | "pip3 install -e .\n", 73 | "```" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "#### Roboschool installation\n", 81 | "\n", 82 | "```\n", 83 | "git clone https://github.com/openai/roboschool\n", 84 | "cd roboschool\n", 85 | "ROBOSCHOOL_PATH=`pwd`\n", 86 | "git clone https://github.com/olegklimov/bullet3 -b roboschool_self_collision\n", 87 | "mkdir bullet3/build\n", 88 | "cd bullet3/build\n", 89 | "cmake -DBUILD_SHARED_LIBS=ON -DUSE_DOUBLE_PRECISION=1 -DCMAKE_INSTALL_PREFIX:PATH=$ROBOSCHOOL_PATH/roboschool/cpp-household/bullet_local_install -DBUILD_CPU_DEMOS=OFF -DBUILD_BULLET2_DEMOS=OFF -DBUILD_EXTRAS=OFF -DBUILD_UNIT_TESTS=OFF -DBUILD_CLSOCKET=OFF -DBUILD_ENET=OFF -DBUILD_OPENGL3_DEMOS=OFF ..\n", 90 | "\n", 91 | "make -j4\n", 92 | "make install\n", 93 | "cd ../..\n", 94 | "pip3 install -e $ROBOSCHOOL_PATH\n", 95 | "```" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## RL cycle" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | 
"execution_count": 1, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "name": "stdout", 112 | "output_type": "stream", 113 | "text": [ 114 | "\u001b[33mWARN: gym.spaces.Box autodetected dtype as . Please provide explicit dtype.\u001b[0m\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "import gym\n", 120 | "\n", 121 | "# create the environment \n", 122 | "env = gym.make(\"CartPole-v1\")\n", 123 | "# reset the environment before starting\n", 124 | "env.reset()\n", 125 | "\n", 126 | "# loop 10 times\n", 127 | "for i in range(10):\n", 128 | " # take a random action\n", 129 | " env.step(env.action_space.sample())\n", 130 | " # render the game\n", 131 | " env.render()\n", 132 | "\n", 133 | "# close the environment\n", 134 | "env.close()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 2, 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "\u001b[33mWARN: gym.spaces.Box autodetected dtype as . Please provide explicit dtype.\u001b[0m\n", 147 | "Episode 0 finished, reward:15\n", 148 | "Episode 1 finished, reward:13\n", 149 | "Episode 2 finished, reward:20\n", 150 | "Episode 3 finished, reward:22\n", 151 | "Episode 4 finished, reward:13\n", 152 | "Episode 5 finished, reward:18\n", 153 | "Episode 6 finished, reward:15\n", 154 | "Episode 7 finished, reward:12\n", 155 | "Episode 8 finished, reward:58\n", 156 | "Episode 9 finished, reward:15\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "import gym\n", 162 | "\n", 163 | "# create and initialize the environment\n", 164 | "env = gym.make(\"CartPole-v1\")\n", 165 | "env.reset()\n", 166 | "\n", 167 | "# play 10 games\n", 168 | "for i in range(10):\n", 169 | " # initialize the variables\n", 170 | " done = False\n", 171 | " game_rew = 0\n", 172 | "\n", 173 | " while not done:\n", 174 | " # choose a random action\n", 175 | " action = env.action_space.sample()\n", 176 | " # take a step in the environment\n", 177 | " new_obs, rew, done, info = env.step(action)\n", 178 | " game_rew += rew\n", 179 | " \n", 180 | " # when is done, print the cumulative reward of the game and reset the environment\n", 181 | " if done:\n", 182 | " print('Episode %d finished, reward:%d' % (i, game_rew))\n", 183 | " env.reset()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 3, 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "name": "stdout", 193 | "output_type": "stream", 194 | "text": [ 195 | "\u001b[33mWARN: gym.spaces.Box autodetected dtype as . 
Please provide explicit dtype.\u001b[0m\n", 196 | "Box(4,)\n" 197 | ] 198 | } 199 | ], 200 | "source": [ 201 | "import gym\n", 202 | "\n", 203 | "env = gym.make('CartPole-v1')\n", 204 | "print(env.observation_space)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 4, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stdout", 214 | "output_type": "stream", 215 | "text": [ 216 | "Discrete(2)\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "print(env.action_space)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 5, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "name": "stdout", 231 | "output_type": "stream", 232 | "text": [ 233 | "1\n", 234 | "0\n", 235 | "0\n" 236 | ] 237 | } 238 | ], 239 | "source": [ 240 | "print(env.action_space.sample())\n", 241 | "print(env.action_space.sample())\n", 242 | "print(env.action_space.sample())" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 6, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "name": "stdout", 252 | "output_type": "stream", 253 | "text": [ 254 | "[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]\n" 255 | ] 256 | } 257 | ], 258 | "source": [ 259 | "print(env.observation_space.low)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 7, 265 | "metadata": {}, 266 | "outputs": [ 267 | { 268 | "name": "stdout", 269 | "output_type": "stream", 270 | "text": [ 271 | "[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]\n" 272 | ] 273 | } 274 | ], 275 | "source": [ 276 | "print(env.observation_space.high)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "## TensorFlow" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 8, 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "name": "stderr", 293 | "output_type": "stream", 294 | "text": [ 295 | "c:\\users\\andrea\\appdata\\local\\programs\\python\\python35\\lib\\site-packages\\h5py\\__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 296 | " from ._conv import register_converters as _register_converters\n" 297 | ] 298 | }, 299 | { 300 | "name": "stdout", 301 | "output_type": "stream", 302 | "text": [ 303 | "Tensor(\"add:0\", shape=(), dtype=int32)\n", 304 | "7\n" 305 | ] 306 | } 307 | ], 308 | "source": [ 309 | "import tensorflow as tf\n", 310 | "\n", 311 | "# create two constants: a and b\n", 312 | "a = tf.constant(4)\n", 313 | "b = tf.constant(3)\n", 314 | "\n", 315 | "# perform a computation\n", 316 | "c = a + b\n", 317 | "print(c) # print the shape of c\n", 318 | "\n", 319 | "# create a session\n", 320 | "session = tf.Session()\n", 321 | "# run the session. 
It compute the sum\n", 322 | "res = session.run(c)\n", 323 | "print(res) # print the actual result" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 9, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "# reset the graph\n", 333 | "tf.reset_default_graph()" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "### Tensor" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 10, 346 | "metadata": {}, 347 | "outputs": [ 348 | { 349 | "name": "stdout", 350 | "output_type": "stream", 351 | "text": [ 352 | "()\n" 353 | ] 354 | } 355 | ], 356 | "source": [ 357 | "a = tf.constant(1)\n", 358 | "print(a.shape)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 11, 364 | "metadata": {}, 365 | "outputs": [ 366 | { 367 | "name": "stdout", 368 | "output_type": "stream", 369 | "text": [ 370 | "(5,)\n" 371 | ] 372 | } 373 | ], 374 | "source": [ 375 | "# array of five elements\n", 376 | "b = tf.constant([1,2,3,4,5])\n", 377 | "print(b.shape)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 12, 383 | "metadata": {}, 384 | "outputs": [ 385 | { 386 | "name": "stdout", 387 | "output_type": "stream", 388 | "text": [ 389 | "[1 2 3]\n" 390 | ] 391 | } 392 | ], 393 | "source": [ 394 | "#NB: a can be of any type of tensor\n", 395 | "a = tf.constant([1,2,3,4,5])\n", 396 | "first_three_elem = a[:3]\n", 397 | "fourth_elem = a[3]\n", 398 | "\n", 399 | "sess = tf.Session()\n", 400 | "print(sess.run(first_three_elem))" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 13, 406 | "metadata": {}, 407 | "outputs": [ 408 | { 409 | "name": "stdout", 410 | "output_type": "stream", 411 | "text": [ 412 | "4\n" 413 | ] 414 | } 415 | ], 416 | "source": [ 417 | "print(sess.run(fourth_elem))" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "#### Constant" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 14, 430 | "metadata": {}, 431 | "outputs": [ 432 | { 433 | "name": "stdout", 434 | "output_type": "stream", 435 | "text": [ 436 | "Tensor(\"a_const:0\", shape=(4,), dtype=float32)\n" 437 | ] 438 | } 439 | ], 440 | "source": [ 441 | "a = tf.constant([1.0, 1.1, 2.1, 3.1], dtype=tf.float32, name='a_const')\n", 442 | "print(a)" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": {}, 448 | "source": [ 449 | "#### Placeholder" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 15, 455 | "metadata": {}, 456 | "outputs": [ 457 | { 458 | "name": "stdout", 459 | "output_type": "stream", 460 | "text": [ 461 | "[[10.1 10.2 10.3]]\n" 462 | ] 463 | } 464 | ], 465 | "source": [ 466 | "a = tf.placeholder(shape=(1,3), dtype=tf.float32)\n", 467 | "b = tf.constant([[10,10,10]], dtype=tf.float32)\n", 468 | "\n", 469 | "c = a + b\n", 470 | "\n", 471 | "sess = tf.Session()\n", 472 | "res = sess.run(c, feed_dict={a:[[0.1,0.2,0.3]]})\n", 473 | "print(res)" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 16, 479 | "metadata": {}, 480 | "outputs": [], 481 | "source": [ 482 | "tf.reset_default_graph()" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": 17, 488 | "metadata": {}, 489 | "outputs": [ 490 | { 491 | "name": "stdout", 492 | "output_type": "stream", 493 | "text": [ 494 | "Tensor(\"Placeholder:0\", shape=(?, 3), dtype=float32)\n", 495 | "[[10.1 10.2 10.3]]\n", 496 | "[[7. 
7. 7.]\n", 497 | " [7. 7. 7.]]\n" 498 | ] 499 | } 500 | ], 501 | "source": [ 502 | "import numpy as np\n", 503 | "\n", 504 | "# NB: the fist dimension is 'None', meaning that it can be of any lenght\n", 505 | "a = tf.placeholder(shape=(None,3), dtype=tf.float32)\n", 506 | "b = tf.placeholder(shape=(None,3), dtype=tf.float32)\n", 507 | "\n", 508 | "c = a + b\n", 509 | "\n", 510 | "print(a)\n", 511 | "\n", 512 | "sess = tf.Session()\n", 513 | "print(sess.run(c, feed_dict={a:[[0.1,0.2,0.3]], b:[[10,10,10]]}))\n", 514 | "\n", 515 | "v_a = np.array([[1,2,3],[4,5,6]])\n", 516 | "v_b = np.array([[6,5,4],[3,2,1]])\n", 517 | "print(sess.run(c, feed_dict={a:v_a, b:v_b}))" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": 18, 523 | "metadata": {}, 524 | "outputs": [ 525 | { 526 | "name": "stdout", 527 | "output_type": "stream", 528 | "text": [ 529 | "[[10.1 10.2 10.3]]\n" 530 | ] 531 | } 532 | ], 533 | "source": [ 534 | "sess = tf.Session()\n", 535 | "print(sess.run(c, feed_dict={a:[[0.1,0.2,0.3]], b:[[10,10,10]]}))" 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "metadata": {}, 541 | "source": [ 542 | "#### Variable" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": 19, 548 | "metadata": {}, 549 | "outputs": [ 550 | { 551 | "name": "stdout", 552 | "output_type": "stream", 553 | "text": [ 554 | "[[0.4478302 0.7014905 0.36300516]]\n", 555 | "[[4 5]]\n" 556 | ] 557 | } 558 | ], 559 | "source": [ 560 | "tf.reset_default_graph()\n", 561 | "\n", 562 | "# variable initialized using the glorot uniform initializer\n", 563 | "var = tf.get_variable(\"first_variable\", shape=[1,3], dtype=tf.float32, initializer=tf.glorot_uniform_initializer)\n", 564 | "\n", 565 | "# variable initialized with constant values\n", 566 | "init_val = np.array([4,5])\n", 567 | "var2 = tf.get_variable(\"second_variable\", shape=[1,2], dtype=tf.int32, initializer=tf.constant_initializer(init_val))\n", 568 | "\n", 569 | "# create the session\n", 570 | "sess = tf.Session()\n", 571 | "# initialize all the variables\n", 572 | "sess.run(tf.global_variables_initializer())\n", 573 | "\n", 574 | "print(sess.run(var))\n", 575 | "\n", 576 | "print(sess.run(var2))" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 20, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [ 585 | "# not trainable variable\n", 586 | "var2 = tf.get_variable(\"variable\", shape=[1,2], trainable=False, dtype=tf.int32)" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": 21, 592 | "metadata": {}, 593 | "outputs": [ 594 | { 595 | "name": "stdout", 596 | "output_type": "stream", 597 | "text": [ 598 | "[, , ]\n" 599 | ] 600 | } 601 | ], 602 | "source": [ 603 | "print(tf.global_variables())" 604 | ] 605 | }, 606 | { 607 | "cell_type": "markdown", 608 | "metadata": {}, 609 | "source": [ 610 | "#### Graph" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 22, 616 | "metadata": {}, 617 | "outputs": [ 618 | { 619 | "data": { 620 | "text/plain": [ 621 | "-0.015899599" 622 | ] 623 | }, 624 | "execution_count": 22, 625 | "metadata": {}, 626 | "output_type": "execute_result" 627 | } 628 | ], 629 | "source": [ 630 | "tf.reset_default_graph()\n", 631 | "\n", 632 | "const1 = tf.constant(3.0, name='constant1')\n", 633 | "\n", 634 | "var = tf.get_variable(\"variable1\", shape=[1,2], dtype=tf.float32)\n", 635 | "var2 = tf.get_variable(\"variable2\", shape=[1,2], trainable=False, dtype=tf.float32)\n", 636 | "\n", 637 | "op1 = const1 * 
var\n", 638 | "op2 = op1 + var2\n", 639 | "op3 = tf.reduce_mean(op2)\n", 640 | "\n", 641 | "sess = tf.Session()\n", 642 | "sess.run(tf.global_variables_initializer())\n", 643 | "sess.run(op3)" 644 | ] 645 | }, 646 | { 647 | "cell_type": "markdown", 648 | "metadata": {}, 649 | "source": [ 650 | "### Simple Linear Regression Example\n" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": 23, 656 | "metadata": {}, 657 | "outputs": [ 658 | { 659 | "name": "stdout", 660 | "output_type": "stream", 661 | "text": [ 662 | "Epoch: 0, MSE: 4617.4390, W: 1.295, b: -0.407\n", 663 | "Epoch: 40, MSE: 5.3334, W: 0.496, b: -0.727\n", 664 | "Epoch: 80, MSE: 4.5894, W: 0.529, b: -0.012\n", 665 | "Epoch: 120, MSE: 4.1029, W: 0.512, b: 0.608\n", 666 | "Epoch: 160, MSE: 3.8552, W: 0.506, b: 1.092\n", 667 | "Epoch: 200, MSE: 3.7597, W: 0.501, b: 1.418\n", 668 | "Final weight: 0.500, bias: 1.473\n" 669 | ] 670 | } 671 | ], 672 | "source": [ 673 | "tf.reset_default_graph()\n", 674 | "\n", 675 | "np.random.seed(10)\n", 676 | "tf.set_random_seed(10)\n", 677 | "\n", 678 | "W, b = 0.5, 1.4\n", 679 | "# create a dataset of 100 examples\n", 680 | "X = np.linspace(0,100, num=100)\n", 681 | "# add random noise to the y labels\n", 682 | "y = np.random.normal(loc=W * X + b, scale=2.0, size=len(X))\n", 683 | "\n", 684 | "# create the placeholders\n", 685 | "x_ph = tf.placeholder(shape=[None,], dtype=tf.float32)\n", 686 | "y_ph = tf.placeholder(shape=[None,], dtype=tf.float32)\n", 687 | "\n", 688 | "# create the variables.\n", 689 | "v_weight = tf.get_variable(\"weight\", shape=[1], dtype=tf.float32)\n", 690 | "v_bias = tf.get_variable(\"bias\", shape=[1], dtype=tf.float32)\n", 691 | "\n", 692 | "# linear computation\n", 693 | "out = v_weight * x_ph + v_bias\n", 694 | "\n", 695 | "# compute the Mean Squared Error\n", 696 | "loss = tf.reduce_mean((out - y_ph)**2)\n", 697 | "\n", 698 | "# optimizer\n", 699 | "opt = tf.train.AdamOptimizer(0.4).minimize(loss)\n", 700 | "\n", 701 | "# create the session\n", 702 | "session = tf.Session()\n", 703 | "session.run(tf.global_variables_initializer())\n", 704 | "\n", 705 | "# loop to train the parameters\n", 706 | "for ep in range(210):\n", 707 | " # run the optimizer and get the loss\n", 708 | " train_loss, _ = session.run([loss, opt], feed_dict={x_ph:X, y_ph:y})\n", 709 | " \n", 710 | " # print epoch number and loss\n", 711 | " if ep % 40 == 0:\n", 712 | " print('Epoch: %3d, MSE: %.4f, W: %.3f, b: %.3f' % (ep, train_loss, session.run(v_weight), session.run(v_bias)))\n", 713 | " \n", 714 | "print('Final weight: %.3f, bias: %.3f' % (session.run(v_weight), session.run(v_bias)))" 715 | ] 716 | }, 717 | { 718 | "cell_type": "markdown", 719 | "metadata": {}, 720 | "source": [ 721 | "#### .. 
with TensorBoard" 722 | ] 723 | }, 724 | { 725 | "cell_type": "code", 726 | "execution_count": 24, 727 | "metadata": {}, 728 | "outputs": [ 729 | { 730 | "name": "stdout", 731 | "output_type": "stream", 732 | "text": [ 733 | "Epoch: 0, MSE: 4617.4390, W: 1.295, b: -0.407\n", 734 | "Epoch: 40, MSE: 5.3334, W: 0.496, b: -0.727\n", 735 | "Epoch: 80, MSE: 4.5894, W: 0.529, b: -0.012\n", 736 | "Epoch: 120, MSE: 4.1029, W: 0.512, b: 0.608\n", 737 | "Epoch: 160, MSE: 3.8552, W: 0.506, b: 1.092\n", 738 | "Epoch: 200, MSE: 3.7597, W: 0.501, b: 1.418\n", 739 | "Final weight: 0.500, bias: 1.473\n" 740 | ] 741 | } 742 | ], 743 | "source": [ 744 | "from datetime import datetime\n", 745 | "\n", 746 | "tf.reset_default_graph()\n", 747 | "\n", 748 | "np.random.seed(10)\n", 749 | "tf.set_random_seed(10)\n", 750 | "\n", 751 | "W, b = 0.5, 1.4\n", 752 | "# create a dataset of 100 examples\n", 753 | "X = np.linspace(0,100, num=100)\n", 754 | "# add random noise to the y labels\n", 755 | "y = np.random.normal(loc=W * X + b, scale=2.0, size=len(X))\n", 756 | "\n", 757 | "# create the placeholders\n", 758 | "x_ph = tf.placeholder(shape=[None,], dtype=tf.float32)\n", 759 | "y_ph = tf.placeholder(shape=[None,], dtype=tf.float32)\n", 760 | "\n", 761 | "# create the variables.\n", 762 | "v_weight = tf.get_variable(\"weight\", shape=[1], dtype=tf.float32)\n", 763 | "v_bias = tf.get_variable(\"bias\", shape=[1], dtype=tf.float32)\n", 764 | "\n", 765 | "# linear computation\n", 766 | "out = v_weight * x_ph + v_bias\n", 767 | "\n", 768 | "# compute the Mean Squared Error\n", 769 | "loss = tf.reduce_mean((out - y_ph)**2)\n", 770 | "\n", 771 | "# optimizer\n", 772 | "opt = tf.train.AdamOptimizer(0.4).minimize(loss)\n", 773 | "\n", 774 | "\n", 775 | "tf.summary.scalar('MSEloss', loss)\n", 776 | "tf.summary.histogram('model_weight', v_weight)\n", 777 | "tf.summary.histogram('model_bias', v_bias)\n", 778 | "all_summary = tf.summary.merge_all()\n", 779 | "\n", 780 | "now = datetime.now()\n", 781 | "clock_time = \"{}_{}.{}.{}\".format(now.day, now.hour, now.minute, now.second)\n", 782 | "file_writer = tf.summary.FileWriter('log_dir/'+clock_time, tf.get_default_graph())\n", 783 | "\n", 784 | "\n", 785 | "# create the session\n", 786 | "session = tf.Session()\n", 787 | "session.run(tf.global_variables_initializer())\n", 788 | "\n", 789 | "# loop to train the parameters\n", 790 | "for ep in range(210):\n", 791 | " # run the optimizer and get the loss\n", 792 | " train_loss, _, train_summary = session.run([loss, opt, all_summary], feed_dict={x_ph:X, y_ph:y})\n", 793 | " file_writer.add_summary(train_summary, ep)\n", 794 | " \n", 795 | " # print epoch number and loss\n", 796 | " if ep % 40 == 0:\n", 797 | " print('Epoch: %3d, MSE: %.4f, W: %.3f, b: %.3f' % (ep, train_loss, session.run(v_weight), session.run(v_bias)))\n", 798 | " \n", 799 | "print('Final weight: %.3f, bias: %.3f' % (session.run(v_weight), session.run(v_bias)))\n", 800 | "file_writer.close()" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": null, 806 | "metadata": {}, 807 | "outputs": [], 808 | "source": [] 809 | } 810 | ], 811 | "metadata": { 812 | "kernelspec": { 813 | "display_name": "Python 3", 814 | "language": "python", 815 | "name": "python3" 816 | }, 817 | "language_info": { 818 | "codemirror_mode": { 819 | "name": "ipython", 820 | "version": 3 821 | }, 822 | "file_extension": ".py", 823 | "mimetype": "text/x-python", 824 | "name": "python", 825 | "nbconvert_exporter": "python", 826 | "pygments_lexer": "ipython3", 827 | "version": 
"3.5.2" 828 | } 829 | }, 830 | "nbformat": 4, 831 | "nbformat_minor": 2 832 | } 833 | -------------------------------------------------------------------------------- /Chapter03/frozenlake8x8_policyiteration.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | def eval_state_action(V, s, a, gamma=0.99): 5 | return np.sum([p * (rew + gamma*V[next_s]) for p, next_s, rew, _ in env.P[s][a]]) 6 | 7 | def policy_evaluation(V, policy, eps=0.0001): 8 | ''' 9 | Policy evaluation. Update the value function until it reach a steady state 10 | ''' 11 | while True: 12 | delta = 0 13 | # loop over all states 14 | for s in range(nS): 15 | old_v = V[s] 16 | # update V[s] using the Bellman equation 17 | V[s] = eval_state_action(V, s, policy[s]) 18 | delta = max(delta, np.abs(old_v - V[s])) 19 | 20 | if delta < eps: 21 | break 22 | 23 | def policy_improvement(V, policy): 24 | ''' 25 | Policy improvement. Update the policy based on the value function 26 | ''' 27 | policy_stable = True 28 | for s in range(nS): 29 | old_a = policy[s] 30 | # update the policy with the action that bring to the highest state value 31 | policy[s] = np.argmax([eval_state_action(V, s, a) for a in range(nA)]) 32 | if old_a != policy[s]: 33 | policy_stable = False 34 | 35 | return policy_stable 36 | 37 | 38 | def run_episodes(env, policy, num_games=100): 39 | ''' 40 | Run some games to test a policy 41 | ''' 42 | tot_rew = 0 43 | state = env.reset() 44 | 45 | for _ in range(num_games): 46 | done = False 47 | while not done: 48 | # select the action accordingly to the policy 49 | next_state, reward, done, _ = env.step(policy[state]) 50 | 51 | state = next_state 52 | tot_rew += reward 53 | if done: 54 | state = env.reset() 55 | 56 | print('Won %i of %i games!'%(tot_rew, num_games)) 57 | 58 | 59 | if __name__ == '__main__': 60 | # create the environment 61 | env = gym.make('FrozenLake-v0') 62 | # enwrap it to have additional information from it 63 | env = env.unwrapped 64 | 65 | # spaces dimension 66 | nA = env.action_space.n 67 | nS = env.observation_space.n 68 | 69 | # initializing value function and policy 70 | V = np.zeros(nS) 71 | policy = np.zeros(nS) 72 | 73 | # some useful variable 74 | policy_stable = False 75 | it = 0 76 | 77 | while not policy_stable: 78 | policy_evaluation(V, policy) 79 | policy_stable = policy_improvement(V, policy) 80 | it += 1 81 | 82 | print('Converged after %i policy iterations'%(it)) 83 | run_episodes(env, policy) 84 | print(V.reshape((4,4))) 85 | print(policy.reshape((4,4))) -------------------------------------------------------------------------------- /Chapter03/frozenlake8x8_valueiteration.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | def eval_state_action(V, s, a, gamma=0.99): 5 | return np.sum([p * (rew + gamma*V[next_s]) for p, next_s, rew, _ in env.P[s][a]]) 6 | 7 | def value_iteration(eps=0.0001): 8 | ''' 9 | Value iteration algorithm 10 | ''' 11 | V = np.zeros(nS) 12 | it = 0 13 | 14 | while True: 15 | delta = 0 16 | # update the value of each state using as "policy" the max operator 17 | for s in range(nS): 18 | old_v = V[s] 19 | V[s] = np.max([eval_state_action(V, s, a) for a in range(nA)]) 20 | delta = max(delta, np.abs(old_v - V[s])) 21 | 22 | if delta < eps: 23 | break 24 | else: 25 | print('Iter:', it, ' delta:', np.round(delta, 5)) 26 | it += 1 27 | 28 | return V 29 | 30 | def run_episodes(env, V, num_games=100): 31 | ''' 32 | 
Run some test games 33 | ''' 34 | tot_rew = 0 35 | state = env.reset() 36 | 37 | for _ in range(num_games): 38 | done = False 39 | while not done: 40 | action = np.argmax([eval_state_action(V, state, a) for a in range(nA)]) 41 | next_state, reward, done, _ = env.step(action) 42 | 43 | state = next_state 44 | tot_rew += reward 45 | if done: 46 | state = env.reset() 47 | 48 | print('Won %i of %i games!'%(tot_rew, num_games)) 49 | 50 | 51 | if __name__ == '__main__': 52 | # create the environment 53 | env = gym.make('FrozenLake-v0') 54 | # enwrap it to have additional information from it 55 | env = env.unwrapped 56 | 57 | # spaces dimension 58 | nA = env.action_space.n 59 | nS = env.observation_space.n 60 | 61 | # Value iteration 62 | V = value_iteration(eps=0.0001) 63 | # test the value function on 100 games 64 | run_episodes(env, V, 100) 65 | # print the state values 66 | print(V.reshape((4,4))) 67 | 68 | -------------------------------------------------------------------------------- /Chapter04/SARSA Q_learning Taxi-v2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | 5 | def eps_greedy(Q, s, eps=0.1): 6 | ''' 7 | Epsilon greedy policy 8 | ''' 9 | if np.random.uniform(0,1) < eps: 10 | # Choose a random action 11 | return np.random.randint(Q.shape[1]) 12 | else: 13 | # Choose the action of a greedy policy 14 | return greedy(Q, s) 15 | 16 | 17 | def greedy(Q, s): 18 | ''' 19 | Greedy policy 20 | 21 | return the index corresponding to the maximum action-state value 22 | ''' 23 | return np.argmax(Q[s]) 24 | 25 | 26 | def run_episodes(env, Q, num_episodes=100, to_print=False): 27 | ''' 28 | Run some episodes to test the policy 29 | ''' 30 | tot_rew = [] 31 | state = env.reset() 32 | 33 | for _ in range(num_episodes): 34 | done = False 35 | game_rew = 0 36 | 37 | while not done: 38 | # select a greedy action 39 | next_state, rew, done, _ = env.step(greedy(Q, state)) 40 | 41 | state = next_state 42 | game_rew += rew 43 | if done: 44 | state = env.reset() 45 | tot_rew.append(game_rew) 46 | 47 | if to_print: 48 | print('Mean score: %.3f of %i games!'%(np.mean(tot_rew), num_episodes)) 49 | 50 | return np.mean(tot_rew) 51 | 52 | def Q_learning(env, lr=0.01, num_episodes=10000, eps=0.3, gamma=0.95, eps_decay=0.00005): 53 | nA = env.action_space.n 54 | nS = env.observation_space.n 55 | 56 | # Initialize the Q matrix 57 | # Q: matrix nS*nA where each row represent a state and each colums represent a different action 58 | Q = np.zeros((nS, nA)) 59 | games_reward = [] 60 | test_rewards = [] 61 | 62 | for ep in range(num_episodes): 63 | state = env.reset() 64 | done = False 65 | tot_rew = 0 66 | 67 | # decay the epsilon value until it reaches the threshold of 0.01 68 | if eps > 0.01: 69 | eps -= eps_decay 70 | 71 | # loop the main body until the environment stops 72 | while not done: 73 | # select an action following the eps-greedy policy 74 | action = eps_greedy(Q, state, eps) 75 | 76 | next_state, rew, done, _ = env.step(action) # Take one step in the environment 77 | 78 | # Q-learning update the state-action value (get the max Q value for the next state) 79 | Q[state][action] = Q[state][action] + lr*(rew + gamma*np.max(Q[next_state]) - Q[state][action]) 80 | 81 | state = next_state 82 | tot_rew += rew 83 | if done: 84 | games_reward.append(tot_rew) 85 | 86 | # Test the policy every 300 episodes and print the results 87 | if (ep % 300) == 0: 88 | test_rew = run_episodes(env, Q, 1000) 89 | print("Episode:{:5d} Eps:{:2.4f} 
Rew:{:2.4f}".format(ep, eps, test_rew)) 90 | test_rewards.append(test_rew) 91 | 92 | return Q 93 | 94 | 95 | def SARSA(env, lr=0.01, num_episodes=10000, eps=0.3, gamma=0.95, eps_decay=0.00005): 96 | nA = env.action_space.n 97 | nS = env.observation_space.n 98 | 99 | # Initialize the Q matrix 100 | # Q: matrix nS*nA where each row represent a state and each colums represent a different action 101 | Q = np.zeros((nS, nA)) 102 | games_reward = [] 103 | test_rewards = [] 104 | 105 | for ep in range(num_episodes): 106 | state = env.reset() 107 | done = False 108 | tot_rew = 0 109 | 110 | # decay the epsilon value until it reaches the threshold of 0.01 111 | if eps > 0.01: 112 | eps -= eps_decay 113 | 114 | 115 | action = eps_greedy(Q, state, eps) 116 | 117 | # loop the main body until the environment stops 118 | while not done: 119 | next_state, rew, done, _ = env.step(action) # Take one step in the environment 120 | 121 | # choose the next action (needed for the SARSA update) 122 | next_action = eps_greedy(Q, next_state, eps) 123 | # SARSA update 124 | Q[state][action] = Q[state][action] + lr*(rew + gamma*Q[next_state][next_action] - Q[state][action]) 125 | 126 | state = next_state 127 | action = next_action 128 | tot_rew += rew 129 | if done: 130 | games_reward.append(tot_rew) 131 | 132 | # Test the policy every 300 episodes and print the results 133 | if (ep % 300) == 0: 134 | test_rew = run_episodes(env, Q, 1000) 135 | print("Episode:{:5d} Eps:{:2.4f} Rew:{:2.4f}".format(ep, eps, test_rew)) 136 | test_rewards.append(test_rew) 137 | 138 | return Q 139 | 140 | 141 | if __name__ == '__main__': 142 | env = gym.make('Taxi-v2') 143 | 144 | Q_qlearning = Q_learning(env, lr=.1, num_episodes=5000, eps=0.4, gamma=0.95, eps_decay=0.001) 145 | 146 | Q_sarsa = SARSA(env, lr=.1, num_episodes=5000, eps=0.4, gamma=0.95, eps_decay=0.001) -------------------------------------------------------------------------------- /Chapter05/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /Chapter05/DQN_Atari.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from datetime import datetime 5 | from collections import deque 6 | import time 7 | import sys 8 | 9 | from atari_wrappers import make_env 10 | 11 | 12 | gym.logger.set_level(40) 13 | 14 | current_milli_time = lambda: int(round(time.time() * 1000)) 15 | 16 | def cnn(x): 17 | ''' 18 | Convolutional neural network 19 | ''' 20 | x = tf.layers.conv2d(x, filters=16, kernel_size=8, strides=4, padding='valid', activation='relu') 21 | x = tf.layers.conv2d(x, filters=32, kernel_size=4, strides=2, padding='valid', activation='relu') 22 | return tf.layers.conv2d(x, filters=32, kernel_size=3, strides=1, padding='valid', activation='relu') 23 | 24 | 25 | def fnn(x, hidden_layers, output_layer, activation=tf.nn.relu, last_activation=None): 26 | ''' 27 | Feed-forward neural network 28 | ''' 29 | for l in hidden_layers: 30 | x = tf.layers.dense(x, units=l, activation=activation) 31 | return tf.layers.dense(x, units=output_layer, activation=last_activation) 32 | 33 | def qnet(x, hidden_layers, output_size, fnn_activation=tf.nn.relu, last_activation=None): 34 | ''' 35 | Deep Q network: CNN followed by FNN 36 | ''' 37 | x = cnn(x) 
38 | x = tf.layers.flatten(x) 39 | 40 | return fnn(x, hidden_layers, output_size, fnn_activation, last_activation) 41 | 42 | 43 | class ExperienceBuffer(): 44 | ''' 45 | Experience Replay Buffer 46 | ''' 47 | def __init__(self, buffer_size): 48 | self.obs_buf = deque(maxlen=buffer_size) 49 | self.rew_buf = deque(maxlen=buffer_size) 50 | self.act_buf = deque(maxlen=buffer_size) 51 | self.obs2_buf = deque(maxlen=buffer_size) 52 | self.done_buf = deque(maxlen=buffer_size) 53 | 54 | 55 | def add(self, obs, rew, act, obs2, done): 56 | # Add a new transition to the buffers 57 | self.obs_buf.append(obs) 58 | self.rew_buf.append(rew) 59 | self.act_buf.append(act) 60 | self.obs2_buf.append(obs2) 61 | self.done_buf.append(done) 62 | 63 | 64 | def sample_minibatch(self, batch_size): 65 | # Sample a minibatch of size batch_size 66 | mb_indices = np.random.randint(len(self.obs_buf), size=batch_size) 67 | 68 | mb_obs = scale_frames([self.obs_buf[i] for i in mb_indices]) 69 | mb_rew = [self.rew_buf[i] for i in mb_indices] 70 | mb_act = [self.act_buf[i] for i in mb_indices] 71 | mb_obs2 = scale_frames([self.obs2_buf[i] for i in mb_indices]) 72 | mb_done = [self.done_buf[i] for i in mb_indices] 73 | 74 | return mb_obs, mb_rew, mb_act, mb_obs2, mb_done 75 | 76 | def __len__(self): 77 | return len(self.obs_buf) 78 | 79 | 80 | def q_target_values(mini_batch_rw, mini_batch_done, av, discounted_value): 81 | ''' 82 | Calculate the target value y for each transition 83 | ''' 84 | max_av = np.max(av, axis=1) 85 | 86 | # if episode terminate, y take value r 87 | # otherwise, q-learning step 88 | 89 | ys = [] 90 | for r, d, av in zip(mini_batch_rw, mini_batch_done, max_av): 91 | if d: 92 | ys.append(r) 93 | else: 94 | q_step = r + discounted_value * av 95 | ys.append(q_step) 96 | 97 | assert len(ys) == len(mini_batch_rw) 98 | return ys 99 | 100 | def greedy(action_values): 101 | ''' 102 | Greedy policy 103 | ''' 104 | return np.argmax(action_values) 105 | 106 | def eps_greedy(action_values, eps=0.1): 107 | ''' 108 | Eps-greedy policy 109 | ''' 110 | if np.random.uniform(0,1) < eps: 111 | # Choose a uniform random action 112 | return np.random.randint(len(action_values)) 113 | else: 114 | # Choose the greedy action 115 | return np.argmax(action_values) 116 | 117 | def test_agent(env_test, agent_op, num_games=20): 118 | ''' 119 | Test an agent 120 | ''' 121 | games_r = [] 122 | 123 | for _ in range(num_games): 124 | d = False 125 | game_r = 0 126 | o = env_test.reset() 127 | 128 | while not d: 129 | # Use an eps-greedy policy with eps=0.05 (to add stochasticity to the policy) 130 | # Needed because Atari envs are deterministic 131 | # If you would use a greedy policy, the results will be always the same 132 | a = eps_greedy(np.squeeze(agent_op(o)), eps=0.05) 133 | o, r, d, _ = env_test.step(a) 134 | 135 | game_r += r 136 | 137 | games_r.append(game_r) 138 | 139 | return games_r 140 | 141 | def scale_frames(frames): 142 | ''' 143 | Scale the frame with number between 0 and 1 144 | ''' 145 | return np.array(frames, dtype=np.float32) / 255.0 146 | 147 | def DQN(env_name, hidden_sizes=[32], lr=1e-2, num_epochs=2000, buffer_size=100000, discount=0.99, render_cycle=100, update_target_net=1000, 148 | batch_size=64, update_freq=4, frames_num=2, min_buffer_size=5000, test_frequency=20, start_explor=1, end_explor=0.1, explor_steps=100000): 149 | 150 | # Create the environment both for train and test 151 | env = make_env(env_name, frames_num=frames_num, skip_frames=True, noop_num=20) 152 | env_test = make_env(env_name, 
frames_num=frames_num, skip_frames=True, noop_num=20) 153 | # Add a monitor to the test env to store the videos 154 | env_test = gym.wrappers.Monitor(env_test, "VIDEOS/TEST_VIDEOS"+env_name+str(current_milli_time()),force=True, video_callable=lambda x: x%20==0) 155 | 156 | tf.reset_default_graph() 157 | 158 | obs_dim = env.observation_space.shape 159 | act_dim = env.action_space.n 160 | 161 | # Create all the placeholders 162 | obs_ph = tf.placeholder(shape=(None, obs_dim[0], obs_dim[1], obs_dim[2]), dtype=tf.float32, name='obs') 163 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act') 164 | y_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='y') 165 | 166 | # Create the target network 167 | with tf.variable_scope('target_network'): 168 | target_qv = qnet(obs_ph, hidden_sizes, act_dim) 169 | target_vars = tf.trainable_variables() 170 | 171 | # Create the online network (i.e. the behavior policy) 172 | with tf.variable_scope('online_network'): 173 | online_qv = qnet(obs_ph, hidden_sizes, act_dim) 174 | train_vars = tf.trainable_variables() 175 | 176 | # Update the target network by assigning to it the variables of the online network 177 | # Note that the target network and the online network have the same exact architecture 178 | update_target = [train_vars[i].assign(train_vars[i+len(target_vars)]) for i in range(len(train_vars) - len(target_vars))] 179 | update_target_op = tf.group(*update_target) 180 | 181 | # One hot encoding of the action 182 | act_onehot = tf.one_hot(act_ph, depth=act_dim) 183 | # We are interested only in the Q-values of those actions 184 | q_values = tf.reduce_sum(act_onehot * online_qv, axis=1) 185 | 186 | # MSE loss function 187 | v_loss = tf.reduce_mean((y_ph - q_values)**2) 188 | # Adam optimize that minimize the loss v_loss 189 | v_opt = tf.train.AdamOptimizer(lr).minimize(v_loss) 190 | 191 | def agent_op(o): 192 | ''' 193 | Forward pass to obtain the Q-values from the online network of a single observation 194 | ''' 195 | # Scale the frames 196 | o = scale_frames(o) 197 | return sess.run(online_qv, feed_dict={obs_ph:[o]}) 198 | 199 | # Time 200 | now = datetime.now() 201 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, int(now.second)) 202 | print('Time:', clock_time) 203 | 204 | mr_v = tf.Variable(0.0) 205 | ml_v = tf.Variable(0.0) 206 | 207 | 208 | # TensorBoard summaries 209 | tf.summary.scalar('v_loss', v_loss) 210 | tf.summary.scalar('Q-value', tf.reduce_mean(q_values)) 211 | tf.summary.histogram('Q-values', q_values) 212 | 213 | scalar_summary = tf.summary.merge_all() 214 | reward_summary = tf.summary.scalar('test_rew', mr_v) 215 | mean_loss_summary = tf.summary.scalar('mean_loss', ml_v) 216 | 217 | LOG_DIR = 'log_dir/'+env_name 218 | hyp_str = "-lr_{}-upTN_{}-upF_{}-frms_{}" .format(lr, update_target_net, update_freq, frames_num) 219 | 220 | # initialize the File Writer for writing TensorBoard summaries 221 | file_writer = tf.summary.FileWriter(LOG_DIR+'/DQN_'+clock_time+'_'+hyp_str, tf.get_default_graph()) 222 | 223 | # open a session 224 | sess = tf.Session() 225 | # and initialize all the variables 226 | sess.run(tf.global_variables_initializer()) 227 | 228 | render_the_game = False 229 | step_count = 0 230 | last_update_loss = [] 231 | ep_time = current_milli_time() 232 | batch_rew = [] 233 | old_step_count = 0 234 | 235 | obs = env.reset() 236 | 237 | # Initialize the experience buffer 238 | buffer = ExperienceBuffer(buffer_size) 239 | 240 | # Copy the online network in the target network 241 | 
sess.run(update_target_op) 242 | 243 | ########## EXPLORATION INITIALIZATION ###### 244 | eps = start_explor 245 | eps_decay = (start_explor - end_explor) / explor_steps 246 | 247 | for ep in range(num_epochs): 248 | g_rew = 0 249 | done = False 250 | 251 | # Until the environment does not end.. 252 | while not done: 253 | 254 | # Epsilon decay 255 | if eps > end_explor: 256 | eps -= eps_decay 257 | 258 | # Choose an eps-greedy action 259 | act = eps_greedy(np.squeeze(agent_op(obs)), eps=eps) 260 | 261 | # execute the action in the environment 262 | obs2, rew, done, _ = env.step(act) 263 | 264 | # Render the game if you want to 265 | if render_the_game: 266 | env.render() 267 | 268 | # Add the transition to the replay buffer 269 | buffer.add(obs, rew, act, obs2, done) 270 | 271 | obs = obs2 272 | g_rew += rew 273 | step_count += 1 274 | 275 | ################ TRAINING ############### 276 | # If it's time to train the network: 277 | if len(buffer) > min_buffer_size and (step_count % update_freq == 0): 278 | 279 | # sample a minibatch from the buffer 280 | mb_obs, mb_rew, mb_act, mb_obs2, mb_done = buffer.sample_minibatch(batch_size) 281 | 282 | 283 | mb_trg_qv = sess.run(target_qv, feed_dict={obs_ph:mb_obs2}) 284 | y_r = q_target_values(mb_rew, mb_done, mb_trg_qv, discount) 285 | 286 | # TRAINING STEP 287 | # optimize, compute the loss and return the TB summary 288 | train_summary, train_loss, _ = sess.run([scalar_summary, v_loss, v_opt], feed_dict={obs_ph:mb_obs, y_ph:y_r, act_ph: mb_act}) 289 | 290 | # Add the train summary to the file_writer 291 | file_writer.add_summary(train_summary, step_count) 292 | last_update_loss.append(train_loss) 293 | 294 | # Every update_target_net steps, update the target network 295 | if (len(buffer) > min_buffer_size) and (step_count % update_target_net == 0): 296 | 297 | # run the session to update the target network and get the mean loss sumamry 298 | _, train_summary = sess.run([update_target_op, mean_loss_summary], feed_dict={ml_v:np.mean(last_update_loss)}) 299 | file_writer.add_summary(train_summary, step_count) 300 | last_update_loss = [] 301 | 302 | 303 | # If the environment is ended, reset it and initialize the variables 304 | if done: 305 | obs = env.reset() 306 | batch_rew.append(g_rew) 307 | g_rew, render_the_game = 0, False 308 | 309 | # every test_frequency episodes, test the agent and write some stats in TensorBoard 310 | if ep % test_frequency == 0: 311 | # Test the agent to 10 games 312 | test_rw = test_agent(env_test, agent_op, num_games=10) 313 | 314 | # Run the test stats and add them to the file_writer 315 | test_summary = sess.run(reward_summary, feed_dict={mr_v: np.mean(test_rw)}) 316 | file_writer.add_summary(test_summary, step_count) 317 | 318 | # Print some useful stats 319 | ep_sec_time = int((current_milli_time()-ep_time) / 1000) 320 | print('Ep:%4d Rew:%4.2f, Eps:%2.2f -- Step:%5d -- Test:%4.2f %4.2f -- Time:%d -- Ep_Steps:%d' % 321 | (ep,np.mean(batch_rew), eps, step_count, np.mean(test_rw), np.std(test_rw), ep_sec_time, (step_count-old_step_count)/test_frequency)) 322 | 323 | ep_time = current_milli_time() 324 | batch_rew = [] 325 | old_step_count = step_count 326 | 327 | if ep % render_cycle == 0: 328 | render_the_game = True 329 | 330 | file_writer.close() 331 | env.close() 332 | 333 | 334 | if __name__ == '__main__': 335 | 336 | DQN('PongNoFrameskip-v4', hidden_sizes=[128], lr=2e-4, buffer_size=100000, update_target_net=1000, batch_size=32, 337 | update_freq=2, frames_num=2, min_buffer_size=10000, render_cycle=10000) 
-------------------------------------------------------------------------------- /Chapter05/DQN_variations_Atari.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from datetime import datetime 5 | from collections import deque 6 | import time 7 | import sys 8 | 9 | from atari_wrappers import make_env 10 | 11 | 12 | gym.logger.set_level(40) 13 | 14 | current_milli_time = lambda: int(round(time.time() * 1000)) 15 | 16 | 17 | def cnn(x): 18 | ''' 19 | Convolutional neural network 20 | ''' 21 | x = tf.layers.conv2d(x, filters=16, kernel_size=8, strides=4, padding='valid', activation='relu') 22 | x = tf.layers.conv2d(x, filters=32, kernel_size=4, strides=2, padding='valid', activation='relu') 23 | return tf.layers.conv2d(x, filters=32, kernel_size=3, strides=1, padding='valid', activation='relu') 24 | 25 | 26 | def fnn(x, hidden_layers, output_layer, activation=tf.nn.relu, last_activation=None): 27 | ''' 28 | Feed-forward neural network 29 | ''' 30 | for l in hidden_layers: 31 | x = tf.layers.dense(x, units=l, activation=activation) 32 | return tf.layers.dense(x, units=output_layer, activation=last_activation) 33 | 34 | def qnet(x, hidden_layers, output_size, fnn_activation=tf.nn.relu, last_activation=None): 35 | ''' 36 | Deep Q network: CNN followed by FNN 37 | ''' 38 | x = cnn(x) 39 | x = tf.layers.flatten(x) 40 | 41 | return fnn(x, hidden_layers, output_size, fnn_activation, last_activation) 42 | 43 | def greedy(action_values): 44 | ''' 45 | Greedy policy 46 | ''' 47 | return np.argmax(action_values) 48 | 49 | def eps_greedy(action_values, eps=0.1): 50 | ''' 51 | Eps-greedy policy 52 | ''' 53 | if np.random.uniform(0,1) < eps: 54 | # Choose a uniform random action 55 | return np.random.randint(len(action_values)) 56 | else: 57 | # Choose the greedy action 58 | return np.argmax(action_values) 59 | 60 | def q_target_values(mini_batch_rw, mini_batch_done, av, discounted_value): 61 | ''' 62 | Calculate the target value y for each transition 63 | ''' 64 | max_av = np.max(av, axis=1) 65 | 66 | # if episode terminate, y take value r 67 | # otherwise, q-learning step 68 | 69 | ys = [] 70 | for r, d, av in zip(mini_batch_rw, mini_batch_done, max_av): 71 | if d: 72 | ys.append(r) 73 | else: 74 | q_step = r + discounted_value * av 75 | ys.append(q_step) 76 | 77 | assert len(ys) == len(mini_batch_rw) 78 | return ys 79 | 80 | def test_agent(env_test, agent_op, num_games=20): 81 | ''' 82 | Test an agent 83 | ''' 84 | games_r = [] 85 | 86 | for _ in range(num_games): 87 | d = False 88 | game_r = 0 89 | o = env_test.reset() 90 | 91 | while not d: 92 | # Use an eps-greedy policy with eps=0.05 (to add stochasticity to the policy) 93 | # Needed because Atari envs are deterministic 94 | # If you would use a greedy policy, the results will be always the same 95 | a = eps_greedy(np.squeeze(agent_op(o)), eps=0.05) 96 | o, r, d, _ = env_test.step(a) 97 | 98 | game_r += r 99 | 100 | games_r.append(game_r) 101 | 102 | return games_r 103 | 104 | def scale_frames(frames): 105 | ''' 106 | Scale the frame with number between 0 and 1 107 | ''' 108 | return np.array(frames, dtype=np.float32) / 255.0 109 | 110 | 111 | def dueling_qnet(x, hidden_layers, output_size, fnn_activation=tf.nn.relu, last_activation=None): 112 | ''' 113 | Dueling neural network 114 | ''' 115 | x = cnn(x) 116 | x = tf.layers.flatten(x) 117 | 118 | qf = fnn(x, hidden_layers, 1, fnn_activation, last_activation) 119 | aaqf = fnn(x, hidden_layers, 
output_size, fnn_activation, last_activation) 120 | 121 | return qf + aaqf - tf.reduce_mean(aaqf) 122 | 123 | def double_q_target_values(mini_batch_rw, mini_batch_done, target_qv, online_qv, discounted_value): ## IS THE NAME CORRECT??? 124 | ''' 125 | Calculate the target value y following the double Q-learning update 126 | ''' 127 | argmax_online_qv = np.argmax(online_qv, axis=1) 128 | 129 | # if episode terminate, y take value r 130 | # otherwise, q-learning step 131 | 132 | ys = [] 133 | assert len(mini_batch_rw) == len(mini_batch_done) == len(target_qv) == len(argmax_online_qv) 134 | for r, d, t_av, arg_a in zip(mini_batch_rw, mini_batch_done, target_qv, argmax_online_qv): 135 | if d: 136 | ys.append(r) 137 | else: 138 | q_value = r + discounted_value * t_av[arg_a] 139 | ys.append(q_value) 140 | 141 | assert len(ys) == len(mini_batch_rw) 142 | 143 | return ys 144 | 145 | class MultiStepExperienceBuffer(): 146 | ''' 147 | Experience Replay Buffer for multi-step learning 148 | ''' 149 | def __init__(self, buffer_size, n_step, gamma): 150 | self.obs_buf = deque(maxlen=buffer_size) 151 | self.act_buf = deque(maxlen=buffer_size) 152 | 153 | self.n_obs_buf = deque(maxlen=buffer_size) 154 | self.n_done_buf = deque(maxlen=buffer_size) 155 | self.n_rew_buf = deque(maxlen=buffer_size) 156 | 157 | self.n_step = n_step 158 | self.last_rews = deque(maxlen=self.n_step+1) 159 | self.gamma = gamma 160 | 161 | 162 | def add(self, obs, rew, act, obs2, done): 163 | self.obs_buf.append(obs) 164 | self.act_buf.append(act) 165 | # the following buffers will be updated in the next n_step steps 166 | # their values are not known, yet 167 | self.n_obs_buf.append(None) 168 | self.n_rew_buf.append(None) 169 | self.n_done_buf.append(None) 170 | 171 | self.last_rews.append(rew) 172 | 173 | ln = len(self.obs_buf) 174 | len_rews = len(self.last_rews) 175 | 176 | # Update the indices of the buffer that are n_steps old 177 | if done: 178 | # In case it's the last step, update up to the n_steps indices fo the buffer 179 | # it cannot update more than len(last_rews), otherwise will update the previous traj 180 | for i in range(len_rews): 181 | self.n_obs_buf[ln-(len_rews-i-1)-1] = obs2 182 | self.n_done_buf[ln-(len_rews-i-1)-1] = done 183 | rgt = np.sum([(self.gamma**k)*r for k,r in enumerate(np.array(self.last_rews)[i:len_rews])]) 184 | self.n_rew_buf[ln-(len_rews-i-1)-1] = rgt 185 | 186 | # reset the reward deque 187 | self.last_rews = deque(maxlen=self.n_step+1) 188 | else: 189 | # Update the elements of the buffer that has been added n_step steps ago 190 | # Add only if the multi-step values are updated 191 | if len(self.last_rews) >= (self.n_step+1): 192 | self.n_obs_buf[ln-self.n_step-1] = obs2 193 | self.n_done_buf[ln-self.n_step-1] = done 194 | rgt = np.sum([(self.gamma**k)*r for k,r in enumerate(np.array(self.last_rews)[:len_rews])]) 195 | self.n_rew_buf[ln-self.n_step-1] = rgt 196 | 197 | 198 | def sample_minibatch(self, batch_size): 199 | # Sample a minibatch of size batch_size 200 | # Note: the samples should be at least of n_step steps ago 201 | mb_indices = np.random.randint(len(self.obs_buf)-self.n_step, size=batch_size) 202 | 203 | mb_obs = scale_frames([self.obs_buf[i] for i in mb_indices]) 204 | mb_rew = [self.n_rew_buf[i] for i in mb_indices] 205 | mb_act = [self.act_buf[i] for i in mb_indices] 206 | mb_obs2 = scale_frames([self.n_obs_buf[i] for i in mb_indices]) 207 | mb_done = [self.n_done_buf[i] for i in mb_indices] 208 | 209 | return mb_obs, mb_rew, mb_act, mb_obs2, mb_done 210 | 211 | def 
__len__(self): 212 | return len(self.obs_buf) 213 | 214 | def DQN_with_variations(env_name, extensions_hyp, hidden_sizes=[32], lr=1e-2, num_epochs=2000, buffer_size=100000, discount=0.99, render_cycle=100, update_target_net=1000, 215 | batch_size=64, update_freq=4, frames_num=2, min_buffer_size=5000, test_frequency=20, start_explor=1, end_explor=0.1, explor_steps=100000): 216 | 217 | # Create the environment both for train and test 218 | env = make_env(env_name, frames_num=frames_num, skip_frames=True, noop_num=20) 219 | env_test = make_env(env_name, frames_num=frames_num, skip_frames=True, noop_num=20) 220 | # Add a monitor to the test env to store the videos 221 | env_test = gym.wrappers.Monitor(env_test, "VIDEOS/TEST_VIDEOS"+env_name+str(current_milli_time()),force=True, video_callable=lambda x: x%20==0) 222 | 223 | tf.reset_default_graph() 224 | 225 | obs_dim = env.observation_space.shape 226 | act_dim = env.action_space.n 227 | 228 | # Create all the placeholders 229 | obs_ph = tf.placeholder(shape=(None, obs_dim[0], obs_dim[1], obs_dim[2]), dtype=tf.float32, name='obs') 230 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act') 231 | y_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='y') 232 | 233 | # Create the target network 234 | with tf.variable_scope('target_network'): 235 | if extensions_hyp['dueling']: 236 | target_qv = dueling_qnet(obs_ph, hidden_sizes, act_dim) 237 | else: 238 | target_qv = qnet(obs_ph, hidden_sizes, act_dim) 239 | target_vars = tf.trainable_variables() 240 | 241 | # Create the online network (i.e. the behavior policy) 242 | with tf.variable_scope('online_network'): 243 | if extensions_hyp['dueling']: 244 | online_qv = dueling_qnet(obs_ph, hidden_sizes, act_dim) 245 | else: 246 | online_qv = qnet(obs_ph, hidden_sizes, act_dim) 247 | train_vars = tf.trainable_variables() 248 | 249 | # Update the target network by assigning to it the variables of the online network 250 | # Note that the target network and the online network have the same exact architecture 251 | update_target = [train_vars[i].assign(train_vars[i+len(target_vars)]) for i in range(len(train_vars) - len(target_vars))] 252 | update_target_op = tf.group(*update_target) 253 | 254 | # One hot encoding of the action 255 | act_onehot = tf.one_hot(act_ph, depth=act_dim) 256 | # We are interested only in the Q-values of those actions 257 | q_values = tf.reduce_sum(act_onehot * online_qv, axis=1) 258 | 259 | # MSE loss function 260 | v_loss = tf.reduce_mean((y_ph - q_values)**2) 261 | # Adam optimize that minimize the loss v_loss 262 | v_opt = tf.train.AdamOptimizer(lr).minimize(v_loss) 263 | 264 | def agent_op(o): 265 | ''' 266 | Forward pass to obtain the Q-values from the online network of a single observation 267 | ''' 268 | # Scale the frames 269 | o = scale_frames(o) 270 | return sess.run(online_qv, feed_dict={obs_ph:[o]}) 271 | 272 | # Time 273 | now = datetime.now() 274 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, int(now.second)) 275 | print('Time:', clock_time) 276 | 277 | mr_v = tf.Variable(0.0) 278 | ml_v = tf.Variable(0.0) 279 | 280 | 281 | # TensorBoard summaries 282 | tf.summary.scalar('v_loss', v_loss) 283 | tf.summary.scalar('Q-value', tf.reduce_mean(q_values)) 284 | tf.summary.histogram('Q-values', q_values) 285 | 286 | scalar_summary = tf.summary.merge_all() 287 | reward_summary = tf.summary.scalar('test_rew', mr_v) 288 | mean_loss_summary = tf.summary.scalar('mean_loss', ml_v) 289 | 290 | LOG_DIR = 'log_dir/'+env_name 291 | hyp_str = 
"-lr_{}-upTN_{}-upF_{}-frms_{}-ddqn_{}-duel_{}-nstep_{}" \ 292 | .format(lr, update_target_net, update_freq, frames_num, extensions_hyp['DDQN'], extensions_hyp['dueling'], extensions_hyp['multi_step']) 293 | 294 | # initialize the File Writer for writing TensorBoard summaries 295 | file_writer = tf.summary.FileWriter(LOG_DIR+'/DQN_'+clock_time+'_'+hyp_str, tf.get_default_graph()) 296 | 297 | # open a session 298 | sess = tf.Session() 299 | # and initialize all the variables 300 | sess.run(tf.global_variables_initializer()) 301 | 302 | render_the_game = False 303 | step_count = 0 304 | last_update_loss = [] 305 | ep_time = current_milli_time() 306 | batch_rew = [] 307 | old_step_count = 0 308 | 309 | obs = env.reset() 310 | 311 | # Initialize the experience buffer 312 | #buffer = ExperienceBuffer(buffer_size) 313 | buffer = MultiStepExperienceBuffer(buffer_size, extensions_hyp['multi_step'], discount) 314 | 315 | # Copy the online network in the target network 316 | sess.run(update_target_op) 317 | 318 | ########## EXPLORATION INITIALIZATION ###### 319 | eps = start_explor 320 | eps_decay = (start_explor - end_explor) / explor_steps 321 | 322 | for ep in range(num_epochs): 323 | g_rew = 0 324 | done = False 325 | 326 | # Until the environment does not end.. 327 | while not done: 328 | 329 | # Epsilon decay 330 | if eps > end_explor: 331 | eps -= eps_decay 332 | 333 | # Choose an eps-greedy action 334 | act = eps_greedy(np.squeeze(agent_op(obs)), eps=eps) 335 | 336 | # execute the action in the environment 337 | obs2, rew, done, _ = env.step(act) 338 | 339 | # Render the game if you want to 340 | if render_the_game: 341 | env.render() 342 | 343 | # Add the transition to the replay buffer 344 | buffer.add(obs, rew, act, obs2, done) 345 | 346 | obs = obs2 347 | g_rew += rew 348 | step_count += 1 349 | 350 | ################ TRAINING ############### 351 | # If it's time to train the network: 352 | if len(buffer) > min_buffer_size and (step_count % update_freq == 0): 353 | 354 | # sample a minibatch from the buffer 355 | mb_obs, mb_rew, mb_act, mb_obs2, mb_done = buffer.sample_minibatch(batch_size) 356 | 357 | if extensions_hyp['DDQN']: 358 | mb_onl_qv, mb_trg_qv = sess.run([online_qv,target_qv], feed_dict={obs_ph:mb_obs2}) 359 | y_r = double_q_target_values(mb_rew, mb_done, mb_trg_qv, mb_onl_qv, discount) 360 | else: 361 | mb_trg_qv = sess.run(target_qv, feed_dict={obs_ph:mb_obs2}) 362 | y_r = q_target_values(mb_rew, mb_done, mb_trg_qv, discount) 363 | 364 | # optimize, compute the loss and return the TB summary 365 | train_summary, train_loss, _ = sess.run([scalar_summary, v_loss, v_opt], feed_dict={obs_ph:mb_obs, y_ph:y_r, act_ph: mb_act}) 366 | 367 | # Add the train summary to the file_writer 368 | file_writer.add_summary(train_summary, step_count) 369 | last_update_loss.append(train_loss) 370 | 371 | # Every update_target_net steps, update the target network 372 | if (len(buffer) > min_buffer_size) and (step_count % update_target_net == 0): 373 | 374 | # run the session to update the target network and get the mean loss sumamry 375 | _, train_summary = sess.run([update_target_op, mean_loss_summary], feed_dict={ml_v:np.mean(last_update_loss)}) 376 | file_writer.add_summary(train_summary, step_count) 377 | last_update_loss = [] 378 | 379 | 380 | # If the environment is ended, reset it and initialize the variables 381 | if done: 382 | obs = env.reset() 383 | batch_rew.append(g_rew) 384 | g_rew, render_the_game = 0, False 385 | 386 | # every test_frequency episodes, test the agent and write 
some stats in TensorBoard 387 | if ep % test_frequency == 0: 388 | # Test the agent to 10 games 389 | test_rw = test_agent(env_test, agent_op, num_games=10) 390 | 391 | # Run the test stats and add them to the file_writer 392 | test_summary = sess.run(reward_summary, feed_dict={mr_v: np.mean(test_rw)}) 393 | file_writer.add_summary(test_summary, step_count) 394 | 395 | # Print some useful stats 396 | ep_sec_time = int((current_milli_time()-ep_time) / 1000) 397 | print('Ep:%4d Rew:%4.2f, Eps:%2.2f -- Step:%5d -- Test:%4.2f %4.2f -- Time:%d -- Ep_Steps:%d' % 398 | (ep,np.mean(batch_rew), eps, step_count, np.mean(test_rw), np.std(test_rw), ep_sec_time, (step_count-old_step_count)/test_frequency)) 399 | 400 | ep_time = current_milli_time() 401 | batch_rew = [] 402 | old_step_count = step_count 403 | 404 | if ep % render_cycle == 0: 405 | render_the_game = True 406 | 407 | file_writer.close() 408 | env.close() 409 | 410 | 411 | if __name__ == '__main__': 412 | 413 | extensions_hyp={ 414 | 'DDQN':False, 415 | 'dueling':False, 416 | 'multi_step':1 417 | } 418 | DQN_with_variations('PongNoFrameskip-v4', extensions_hyp, hidden_sizes=[128], lr=2e-4, buffer_size=100000, update_target_net=1000, batch_size=32, 419 | update_freq=2, frames_num=2, min_buffer_size=10000, render_cycle=10000) -------------------------------------------------------------------------------- /Chapter05/Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /Chapter05/atari_wrappers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | from collections import deque 4 | import gym 5 | from gym import spaces 6 | import cv2 7 | 8 | ''' 9 | Atari Wrapper copied from https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py 10 | ''' 11 | 12 | class NoopResetEnv(gym.Wrapper): 13 | def __init__(self, env, noop_max=30): 14 | """Sample initial states by taking random number of no-ops on reset. 15 | No-op is assumed to be action 0. 16 | """ 17 | gym.Wrapper.__init__(self, env) 18 | self.noop_max = noop_max 19 | self.override_num_noops = None 20 | self.noop_action = 0 21 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 22 | 23 | def reset(self, **kwargs): 24 | """ Do no-op action for a number of steps in [1, noop_max].""" 25 | self.env.reset(**kwargs) 26 | if self.override_num_noops is not None: 27 | noops = self.override_num_noops 28 | else: 29 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 30 | assert noops > 0 31 | obs = None 32 | for _ in range(noops): 33 | obs, _, done, _ = self.env.step(self.noop_action) 34 | if done: 35 | obs = self.env.reset(**kwargs) 36 | return obs 37 | 38 | def step(self, ac): 39 | return self.env.step(ac) 40 | 41 | class LazyFrames(object): 42 | def __init__(self, frames): 43 | """This object ensures that common frames between the observations are only stored once. 44 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 45 | buffers. 46 | This object should only be converted to numpy array before being passed to the model. 
47 | You'd not believe how complex the previous solution was.""" 48 | self._frames = frames 49 | self._out = None 50 | 51 | def _force(self): 52 | if self._out is None: 53 | self._out = np.concatenate(self._frames, axis=2) 54 | self._frames = None 55 | return self._out 56 | 57 | def __array__(self, dtype=None): 58 | out = self._force() 59 | if dtype is not None: 60 | out = out.astype(dtype) 61 | return out 62 | 63 | def __len__(self): 64 | return len(self._force()) 65 | 66 | def __getitem__(self, i): 67 | return self._force()[i] 68 | 69 | class FireResetEnv(gym.Wrapper): 70 | def __init__(self, env): 71 | """Take action on reset for environments that are fixed until firing.""" 72 | gym.Wrapper.__init__(self, env) 73 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 74 | assert len(env.unwrapped.get_action_meanings()) >= 3 75 | 76 | def reset(self, **kwargs): 77 | self.env.reset(**kwargs) 78 | obs, _, done, _ = self.env.step(1) 79 | if done: 80 | self.env.reset(**kwargs) 81 | obs, _, done, _ = self.env.step(2) 82 | if done: 83 | self.env.reset(**kwargs) 84 | return obs 85 | 86 | def step(self, ac): 87 | return self.env.step(ac) 88 | 89 | 90 | class MaxAndSkipEnv(gym.Wrapper): 91 | def __init__(self, env, skip=4): 92 | """Return only every `skip`-th frame""" 93 | gym.Wrapper.__init__(self, env) 94 | # most recent raw observations (for max pooling across time steps) 95 | self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) 96 | self._skip = skip 97 | 98 | def step(self, action): 99 | """Repeat action, sum reward, and max over last observations.""" 100 | total_reward = 0.0 101 | done = None 102 | for i in range(self._skip): 103 | obs, reward, done, info = self.env.step(action) 104 | if i == self._skip - 2: self._obs_buffer[0] = obs 105 | if i == self._skip - 1: self._obs_buffer[1] = obs 106 | total_reward += reward 107 | if done: 108 | break 109 | # Note that the observation on the done=True frame 110 | # doesn't matter 111 | max_frame = self._obs_buffer.max(axis=0) 112 | 113 | return max_frame, total_reward, done, info 114 | 115 | def reset(self, **kwargs): 116 | return self.env.reset(**kwargs) 117 | 118 | 119 | 120 | class WarpFrame(gym.ObservationWrapper): 121 | def __init__(self, env): 122 | """Warp frames to 84x84 as done in the Nature paper and later work.""" 123 | gym.ObservationWrapper.__init__(self, env) 124 | self.width = 84 125 | self.height = 84 126 | self.observation_space = spaces.Box(low=0, high=255, 127 | shape=(self.height, self.width, 1), dtype=np.uint8) 128 | 129 | def observation(self, frame): 130 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 131 | frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA) 132 | return frame[:, :, None] 133 | 134 | 135 | 136 | class FrameStack(gym.Wrapper): 137 | def __init__(self, env, k): 138 | """Stack k last frames. 139 | Returns lazy array, which is much more memory efficient. 
140 | See Also 141 | baselines.common.atari_wrappers.LazyFrames 142 | """ 143 | gym.Wrapper.__init__(self, env) 144 | self.k = k 145 | self.frames = deque([], maxlen=k) 146 | shp = env.observation_space.shape 147 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=env.observation_space.dtype) 148 | 149 | def reset(self): 150 | ob = self.env.reset() 151 | for _ in range(self.k): 152 | self.frames.append(ob) 153 | return self._get_ob() 154 | 155 | def step(self, action): 156 | ob, reward, done, info = self.env.step(action) 157 | self.frames.append(ob) 158 | return self._get_ob(), reward, done, info 159 | 160 | def _get_ob(self): 161 | assert len(self.frames) == self.k 162 | return LazyFrames(list(self.frames)) 163 | 164 | 165 | class ScaledFloatFrame(gym.ObservationWrapper): 166 | def __init__(self, env): 167 | gym.ObservationWrapper.__init__(self, env) 168 | self.observation_space = gym.spaces.Box(low=0, high=1, shape=env.observation_space.shape, dtype=np.float32) 169 | 170 | def observation(self, observation): 171 | # careful! This undoes the memory optimization, use 172 | # with smaller replay buffers only. 173 | return np.array(observation).astype(np.float32) / 255.0 174 | 175 | 176 | def make_env(env_name, fire=True, frames_num=2, noop_num=30, skip_frames=True): 177 | env = gym.make(env_name) 178 | 179 | if skip_frames: 180 | env = MaxAndSkipEnv(env) ## Return only every `skip`-th frame 181 | if fire: 182 | env = FireResetEnv(env) ## Fire at the beginning 183 | env = NoopResetEnv(env, noop_max=noop_num) 184 | env = WarpFrame(env) ## Reshape image 185 | env = FrameStack(env, frames_num) ## Stack last 4 frames 186 | #env = ScaledFloatFrame(env) ## Scale frames 187 | return env -------------------------------------------------------------------------------- /Chapter05/untitled: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Reinforcement-Learning-Algorithms-with-Python/d144d314b3b5b91e9e8c37d4e0970af5d8379d1b/Chapter05/untitled -------------------------------------------------------------------------------- /Chapter06/AC.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from datetime import datetime 5 | import time 6 | 7 | 8 | def mlp(x, hidden_layers, output_size, activation=tf.nn.relu, last_activation=None): 9 | ''' 10 | Multi-layer perceptron 11 | ''' 12 | for l in hidden_layers: 13 | x = tf.layers.dense(x, units=l, activation=activation) 14 | return tf.layers.dense(x, units=output_size, activation=last_activation) 15 | 16 | def softmax_entropy(logits): 17 | ''' 18 | Softmax Entropy 19 | ''' 20 | return tf.reduce_sum(tf.nn.softmax(logits, axis=-1) * tf.nn.log_softmax(logits, axis=-1), axis=-1) 21 | 22 | def discounted_rewards(rews, last_sv, gamma): 23 | ''' 24 | Discounted reward to go 25 | 26 | Parameters: 27 | ---------- 28 | rews: list of rewards 29 | last_sv: value of the last state 30 | gamma: discount value 31 | ''' 32 | rtg = np.zeros_like(rews, dtype=np.float32) 33 | rtg[-1] = rews[-1] + gamma*last_sv 34 | for i in reversed(range(len(rews)-1)): 35 | rtg[i] = rews[i] + gamma*rtg[i+1] 36 | return rtg 37 | 38 | class Buffer(): 39 | ''' 40 | Buffer class to store the experience from a unique policy 41 | ''' 42 | def __init__(self, gamma=0.99): 43 | self.gamma = gamma 44 | self.obs = [] 45 | self.act = [] 46 | self.ret = [] 47 | self.rtg = [] 48 | 49 | 
def store(self, temp_traj, last_sv): 50 | ''' 51 | Add temp_traj values to the buffers and compute the advantage and reward to go 52 | 53 | Parameters: 54 | ----------- 55 | temp_traj: list where each element is a list that contains: observation, reward, action, state-value 56 | last_sv: value of the last state (Used to Bootstrap) 57 | ''' 58 | # store only if the temp_traj list is not empty 59 | if len(temp_traj) > 0: 60 | self.obs.extend(temp_traj[:,0]) 61 | rtg = discounted_rewards(temp_traj[:,1], last_sv, self.gamma) 62 | self.ret.extend(rtg - temp_traj[:,3]) 63 | self.rtg.extend(rtg) 64 | self.act.extend(temp_traj[:,2]) 65 | 66 | def get_batch(self): 67 | return self.obs, self.act, self.ret, self.rtg 68 | 69 | def __len__(self): 70 | assert(len(self.obs) == len(self.act) == len(self.ret) == len(self.rtg)) 71 | return len(self.obs) 72 | 73 | def AC(env_name, hidden_sizes=[32], ac_lr=5e-3, cr_lr=8e-3, num_epochs=50, gamma=0.99, steps_per_epoch=100, steps_to_print=100): 74 | ''' 75 | Actor-Critic Algorithm 76 | s 77 | Parameters: 78 | ----------- 79 | env_name: Name of the environment 80 | hidden_size: list of the number of hidden units for each layer 81 | ac_lr: actor learning rate 82 | cr_lr: critic learning rate 83 | num_epochs: number of training epochs 84 | gamma: discount factor 85 | steps_per_epoch: number of steps per epoch 86 | ''' 87 | tf.reset_default_graph() 88 | 89 | env = gym.make(env_name) 90 | 91 | 92 | obs_dim = env.observation_space.shape 93 | act_dim = env.action_space.n 94 | 95 | # Placeholders 96 | obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs') 97 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act') 98 | ret_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='ret') 99 | rtg_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='rtg') 100 | 101 | ##################################################### 102 | ########### COMPUTE THE PG LOSS FUNCTIONS ########### 103 | ##################################################### 104 | 105 | # policy 106 | p_logits = mlp(obs_ph, hidden_sizes, act_dim, activation=tf.tanh) 107 | 108 | act_multn = tf.squeeze(tf.random.multinomial(p_logits, 1)) 109 | actions_mask = tf.one_hot(act_ph, depth=act_dim) 110 | p_log = tf.reduce_sum(actions_mask * tf.nn.log_softmax(p_logits), axis=1) 111 | # entropy useful to study the algorithms 112 | entropy = -tf.reduce_mean(softmax_entropy(p_logits)) 113 | p_loss = -tf.reduce_mean(p_log*ret_ph) 114 | 115 | # policy optimization 116 | p_opt = tf.train.AdamOptimizer(ac_lr).minimize(p_loss) 117 | 118 | ####################################### 119 | ########### VALUE FUNCTION ########### 120 | ####################################### 121 | 122 | # value function 123 | s_values = tf.squeeze(mlp(obs_ph, hidden_sizes, 1, activation=tf.tanh)) 124 | # MSE loss function 125 | v_loss = tf.reduce_mean((rtg_ph - s_values)**2) 126 | # value function optimization 127 | v_opt = tf.train.AdamOptimizer(cr_lr).minimize(v_loss) 128 | 129 | # Time 130 | now = datetime.now() 131 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second) 132 | print('Time:', clock_time) 133 | 134 | 135 | # Set scalars and hisograms for TensorBoard 136 | tf.summary.scalar('p_loss', p_loss, collections=['train']) 137 | tf.summary.scalar('v_loss', v_loss, collections=['train']) 138 | tf.summary.scalar('entropy', entropy, collections=['train']) 139 | tf.summary.scalar('s_values', tf.reduce_mean(s_values), collections=['train']) 140 | tf.summary.histogram('p_soft', 
tf.nn.softmax(p_logits), collections=['train']) 141 | tf.summary.histogram('p_log', p_log, collections=['train']) 142 | tf.summary.histogram('act_multn', act_multn, collections=['train']) 143 | tf.summary.histogram('p_logits', p_logits, collections=['train']) 144 | tf.summary.histogram('ret_ph', ret_ph, collections=['train']) 145 | tf.summary.histogram('rtg_ph', rtg_ph, collections=['train']) 146 | tf.summary.histogram('s_values', s_values, collections=['train']) 147 | train_summary = tf.summary.merge_all('train') 148 | 149 | tf.summary.scalar('old_v_loss', v_loss, collections=['pre_train']) 150 | tf.summary.scalar('old_p_loss', p_loss, collections=['pre_train']) 151 | pre_scalar_summary = tf.summary.merge_all('pre_train') 152 | 153 | hyp_str = '-steps_{}-aclr_{}-crlr_{}'.format(steps_per_epoch, ac_lr, cr_lr) 154 | file_writer = tf.summary.FileWriter('log_dir/{}/AC_{}_{}'.format(env_name, clock_time, hyp_str), tf.get_default_graph()) 155 | 156 | # create a session 157 | sess = tf.Session() 158 | # initialize the variables 159 | sess.run(tf.global_variables_initializer()) 160 | 161 | # few variables 162 | step_count = 0 163 | train_rewards = [] 164 | train_ep_len = [] 165 | timer = time.time() 166 | last_print_step = 0 167 | 168 | # Reset the environment at the beginning of the cycle 169 | obs = env.reset() 170 | ep_rews = [] 171 | 172 | # main cycle 173 | for ep in range(num_epochs): 174 | 175 | # initialize the buffer and other variables for the new epoch 176 | buffer = Buffer(gamma) 177 | env_buf = [] 178 | 179 | # always iterate over a fixed number of steps 180 | for _ in range(steps_per_epoch): 181 | 182 | # run the policy 183 | act, val = sess.run([act_multn, s_values], feed_dict={obs_ph:[obs]}) 184 | # take a step in the environment 185 | obs2, rew, done, _ = env.step(np.squeeze(act)) 186 | 187 | # add the new transition 188 | env_buf.append([obs.copy(), rew, act, np.squeeze(val)]) 189 | 190 | obs = obs2.copy() 191 | 192 | step_count += 1 193 | last_print_step += 1 194 | ep_rews.append(rew) 195 | 196 | if done: 197 | # store the trajectory just completed 198 | # Changed from REINFORCE! The second parameter is the estimated value of the next state. Because the episode is done, 199 | # we pass a value of 0 200 | buffer.store(np.array(env_buf), 0) 201 | env_buf = [] 202 | # store additional information about the episode 203 | train_rewards.append(np.sum(ep_rews)) 204 | train_ep_len.append(len(ep_rews)) 205 | # reset the environment 206 | obs = env.reset() 207 | ep_rews = [] 208 | 209 | # Bootstrap with the estimated state value of the next state!
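As a quick illustration of what this bootstrap does (a minimal sketch with made-up numbers, restating the discounted_rewards helper defined at the top of this file):

import numpy as np

def discounted_rewards(rews, last_sv, gamma):
    # same recursion as the helper above: reward-to-go, seeded with the
    # critic's estimate of the value of the state that follows the cut
    rtg = np.zeros_like(rews, dtype=np.float32)
    rtg[-1] = rews[-1] + gamma * last_sv
    for i in reversed(range(len(rews) - 1)):
        rtg[i] = rews[i] + gamma * rtg[i + 1]
    return rtg

# a partial trajectory cut at the end of an epoch: three rewards of 1,
# bootstrapped with an assumed critic estimate of 10 for the next state
print(discounted_rewards([1.0, 1.0, 1.0], last_sv=10.0, gamma=0.99))
# approximately [12.67, 11.79, 10.9]: the critic's value propagates
# backwards through the discount in place of the unobserved future rewards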
210 | if len(env_buf) > 0: 211 | last_sv = sess.run(s_values, feed_dict={obs_ph:[obs]}) 212 | buffer.store(np.array(env_buf), last_sv) 213 | 214 | # collect the episodes' information 215 | obs_batch, act_batch, ret_batch, rtg_batch = buffer.get_batch() 216 | 217 | # run pre_scalar_summary before the optimization phase 218 | old_p_loss, old_v_loss, epochs_summary = sess.run([p_loss, v_loss, pre_scalar_summary], feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch, rtg_ph:rtg_batch}) 219 | file_writer.add_summary(epochs_summary, step_count) 220 | 221 | # Optimize the actor and the critic 222 | sess.run([p_opt, v_opt], feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch, rtg_ph:rtg_batch}) 223 | 224 | # run train_summary to save the summary after the optimization 225 | new_p_loss, new_v_loss, train_summary_run = sess.run([p_loss, v_loss, train_summary], feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch, rtg_ph:rtg_batch}) 226 | file_writer.add_summary(train_summary_run, step_count) 227 | summary = tf.Summary() 228 | summary.value.add(tag='diff/p_loss', simple_value=(old_p_loss - new_p_loss)) 229 | summary.value.add(tag='diff/v_loss', simple_value=(old_v_loss - new_v_loss)) 230 | file_writer.add_summary(summary, step_count) 231 | file_writer.flush() 232 | 233 | # it's time to print some useful information 234 | if last_print_step > steps_to_print: 235 | print('Ep:%d MnRew:%.2f MxRew:%.1f EpLen:%.1f Buffer:%d -- Step:%d -- Time:%d' % (ep, np.mean(train_rewards), np.max(train_rewards), np.mean(train_ep_len), len(buffer), step_count,time.time()-timer)) 236 | 237 | summary = tf.Summary() 238 | summary.value.add(tag='supplementary/len', simple_value=np.mean(train_ep_len)) 239 | summary.value.add(tag='supplementary/train_rew', simple_value=np.mean(train_rewards)) 240 | file_writer.add_summary(summary, step_count) 241 | file_writer.flush() 242 | 243 | timer = time.time() 244 | train_rewards = [] 245 | train_ep_len = [] 246 | last_print_step = 0 247 | 248 | env.close() 249 | file_writer.close() 250 | 251 | 252 | if __name__ == '__main__': 253 | AC('LunarLander-v2', hidden_sizes=[64], ac_lr=4e-3, cr_lr=1.5e-2, gamma=0.99, steps_per_epoch=100, steps_to_print=5000, num_epochs=8000) 254 | -------------------------------------------------------------------------------- /Chapter06/REINFORCE.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from datetime import datetime 5 | import time 6 | 7 | 8 | def mlp(x, hidden_layers, output_size, activation=tf.nn.relu, last_activation=None): 9 | ''' 10 | Multi-layer perceptron 11 | ''' 12 | for l in hidden_layers: 13 | x = tf.layers.dense(x, units=l, activation=activation) 14 | return tf.layers.dense(x, units=output_size, activation=last_activation) 15 | 16 | def softmax_entropy(logits): 17 | ''' 18 | Softmax Entropy 19 | ''' 20 | return tf.reduce_sum(tf.nn.softmax(logits, axis=-1) * tf.nn.log_softmax(logits, axis=-1), axis=-1) 21 | 22 | 23 | def discounted_rewards(rews, gamma): 24 | ''' 25 | Discounted reward to go 26 | 27 | Parameters: 28 | ---------- 29 | rews: list of rewards 30 | gamma: discount value 31 | ''' 32 | rtg = np.zeros_like(rews, dtype=np.float32) 33 | rtg[-1] = rews[-1] 34 | for i in reversed(range(len(rews)-1)): 35 | rtg[i] = rews[i] + gamma*rtg[i+1] 36 | return rtg 37 | 38 | class Buffer(): 39 | ''' 40 | Buffer class to store the experience from a unique policy 41 | ''' 42 | def __init__(self, gamma=0.99): 43 | 
self.gamma = gamma 44 | self.obs = [] 45 | self.act = [] 46 | self.ret = [] 47 | 48 | def store(self, temp_traj): 49 | ''' 50 | Add temp_traj values to the buffers and compute the advantage and reward to go 51 | 52 | Parameters: 53 | ----------- 54 | temp_traj: list where each element is a list that contains: observation, reward, action, state-value 55 | ''' 56 | # store only if the temp_traj list is not empty 57 | if len(temp_traj) > 0: 58 | self.obs.extend(temp_traj[:,0]) 59 | rtg = discounted_rewards(temp_traj[:,1], self.gamma) 60 | self.ret.extend(rtg) 61 | self.act.extend(temp_traj[:,2]) 62 | 63 | def get_batch(self): 64 | b_ret = self.ret 65 | return self.obs, self.act, b_ret 66 | 67 | def __len__(self): 68 | assert(len(self.obs) == len(self.act) == len(self.ret)) 69 | return len(self.obs) 70 | 71 | 72 | def REINFORCE(env_name, hidden_sizes=[32], lr=5e-3, num_epochs=50, gamma=0.99, steps_per_epoch=100): 73 | ''' 74 | REINFORCE Algorithm 75 | 76 | Parameters: 77 | ----------- 78 | env_name: Name of the environment 79 | hidden_size: list of the number of hidden units for each layer 80 | lr: policy learning rate 81 | gamma: discount factor 82 | steps_per_epoch: number of steps per epoch 83 | num_epochs: number train epochs (Note: they aren't properly epochs) 84 | ''' 85 | tf.reset_default_graph() 86 | 87 | env = gym.make(env_name) 88 | 89 | 90 | obs_dim = env.observation_space.shape 91 | act_dim = env.action_space.n 92 | 93 | # Placeholders 94 | obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs') 95 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act') 96 | ret_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='ret') 97 | 98 | ################################################## 99 | ########### COMPUTE THE LOSS FUNCTIONS ########### 100 | ################################################## 101 | 102 | 103 | # policy 104 | p_logits = mlp(obs_ph, hidden_sizes, act_dim, activation=tf.tanh) 105 | 106 | 107 | act_multn = tf.squeeze(tf.random.multinomial(p_logits, 1)) 108 | actions_mask = tf.one_hot(act_ph, depth=act_dim) 109 | 110 | p_log = tf.reduce_sum(actions_mask * tf.nn.log_softmax(p_logits), axis=1) 111 | 112 | # entropy useful to study the algorithms 113 | entropy = -tf.reduce_mean(softmax_entropy(p_logits)) 114 | p_loss = -tf.reduce_mean(p_log*ret_ph) 115 | 116 | # policy optimization 117 | p_opt = tf.train.AdamOptimizer(lr).minimize(p_loss) 118 | 119 | # Time 120 | now = datetime.now() 121 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second) 122 | print('Time:', clock_time) 123 | 124 | 125 | # Set scalars and hisograms for TensorBoard 126 | tf.summary.scalar('p_loss', p_loss, collections=['train']) 127 | tf.summary.scalar('entropy', entropy, collections=['train']) 128 | tf.summary.histogram('p_soft', tf.nn.softmax(p_logits), collections=['train']) 129 | tf.summary.histogram('p_log', p_log, collections=['train']) 130 | tf.summary.histogram('act_multn', act_multn, collections=['train']) 131 | tf.summary.histogram('p_logits', p_logits, collections=['train']) 132 | tf.summary.histogram('ret_ph', ret_ph, collections=['train']) 133 | train_summary = tf.summary.merge_all('train') 134 | 135 | tf.summary.scalar('old_p_loss', p_loss, collections=['pre_train']) 136 | pre_scalar_summary = tf.summary.merge_all('pre_train') 137 | 138 | hyp_str = '-steps_{}-aclr_{}'.format(steps_per_epoch, lr) 139 | file_writer = tf.summary.FileWriter('log_dir/{}/REINFORCE_{}_{}'.format(env_name, clock_time, hyp_str), 
tf.get_default_graph()) 140 | 141 | # create a session 142 | sess = tf.Session() 143 | # initialize the variables 144 | sess.run(tf.global_variables_initializer()) 145 | 146 | # few variables 147 | step_count = 0 148 | train_rewards = [] 149 | train_ep_len = [] 150 | timer = time.time() 151 | 152 | # main cycle 153 | for ep in range(num_epochs): 154 | 155 | # initialize environment for the new epochs 156 | obs = env.reset() 157 | 158 | # intiaizlie buffer and other variables for the new epochs 159 | buffer = Buffer(gamma) 160 | env_buf = [] 161 | ep_rews = [] 162 | 163 | while len(buffer) < steps_per_epoch: 164 | 165 | # run the policy 166 | act = sess.run(act_multn, feed_dict={obs_ph:[obs]}) 167 | # take a step in the environment 168 | obs2, rew, done, _ = env.step(np.squeeze(act)) 169 | 170 | # add the new transition 171 | env_buf.append([obs.copy(), rew, act]) 172 | 173 | obs = obs2.copy() 174 | 175 | step_count += 1 176 | ep_rews.append(rew) 177 | 178 | if done: 179 | # store the trajectory just completed 180 | buffer.store(np.array(env_buf)) 181 | env_buf = [] 182 | # store additionl information about the episode 183 | train_rewards.append(np.sum(ep_rews)) 184 | train_ep_len.append(len(ep_rews)) 185 | # reset the environment 186 | obs = env.reset() 187 | ep_rews = [] 188 | 189 | # collect the episodes' information 190 | obs_batch, act_batch, ret_batch = buffer.get_batch() 191 | 192 | # run pre_scalar_summary before the optimization phase 193 | epochs_summary = sess.run(pre_scalar_summary, feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch}) 194 | file_writer.add_summary(epochs_summary, step_count) 195 | 196 | # Optimize the policy 197 | sess.run(p_opt, feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch}) 198 | 199 | # run train_summary to save the summary after the optimization 200 | train_summary_run = sess.run(train_summary, feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch}) 201 | file_writer.add_summary(train_summary_run, step_count) 202 | 203 | # it's time to print some useful information 204 | if ep % 10 == 0: 205 | print('Ep:%d MnRew:%.2f MxRew:%.1f EpLen:%.1f Buffer:%d -- Step:%d -- Time:%d' % (ep, np.mean(train_rewards), np.max(train_rewards), np.mean(train_ep_len), len(buffer), step_count,time.time()-timer)) 206 | 207 | summary = tf.Summary() 208 | summary.value.add(tag='supplementary/len', simple_value=np.mean(train_ep_len)) 209 | summary.value.add(tag='supplementary/train_rew', simple_value=np.mean(train_rewards)) 210 | file_writer.add_summary(summary, step_count) 211 | file_writer.flush() 212 | 213 | timer = time.time() 214 | train_rewards = [] 215 | train_ep_len = [] 216 | 217 | 218 | env.close() 219 | file_writer.close() 220 | 221 | 222 | if __name__ == '__main__': 223 | REINFORCE('LunarLander-v2', hidden_sizes=[64], lr=8e-3, gamma=0.99, num_epochs=1000, steps_per_epoch=1000) -------------------------------------------------------------------------------- /Chapter06/REINFORCE_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from datetime import datetime 5 | import time 6 | 7 | 8 | def mlp(x, hidden_layers, output_size, activation=tf.nn.relu, last_activation=None): 9 | ''' 10 | Multi-layer perceptron 11 | ''' 12 | for l in hidden_layers: 13 | x = tf.layers.dense(x, units=l, activation=activation) 14 | return tf.layers.dense(x, units=output_size, activation=last_activation) 15 | 16 | def softmax_entropy(logits): 17 | 
''' 18 | Softmax Entropy 19 | ''' 20 | return tf.reduce_sum(tf.nn.softmax(logits, axis=-1) * tf.nn.log_softmax(logits, axis=-1), axis=-1) 21 | 22 | 23 | def discounted_rewards(rews, gamma): 24 | ''' 25 | Discounted reward to go 26 | 27 | Parameters: 28 | ---------- 29 | rews: list of rewards 30 | gamma: discount value 31 | ''' 32 | rtg = np.zeros_like(rews, dtype=np.float32) 33 | rtg[-1] = rews[-1] 34 | for i in reversed(range(len(rews)-1)): 35 | rtg[i] = rews[i] + gamma*rtg[i+1] 36 | return rtg 37 | 38 | class Buffer(): 39 | ''' 40 | Buffer class to store the experience from a unique policy 41 | ''' 42 | def __init__(self, gamma=0.99): 43 | self.gamma = gamma 44 | self.obs = [] 45 | self.act = [] 46 | self.ret = [] 47 | self.rtg = [] 48 | 49 | def store(self, temp_traj): 50 | ''' 51 | Add temp_traj values to the buffers and compute the advantage and reward to go 52 | 53 | Parameters: 54 | ----------- 55 | temp_traj: list where each element is a list that contains: observation, reward, action, state-value 56 | ''' 57 | # store only if the temp_traj list is not empty 58 | if len(temp_traj) > 0: 59 | self.obs.extend(temp_traj[:,0]) 60 | rtg = discounted_rewards(temp_traj[:,1], self.gamma) 61 | # NEW 62 | self.ret.extend(rtg - temp_traj[:,3]) 63 | self.rtg.extend(rtg) 64 | self.act.extend(temp_traj[:,2]) 65 | 66 | def get_batch(self): 67 | # MODIFIED 68 | return self.obs, self.act, self.ret, self.rtg 69 | 70 | def __len__(self): 71 | assert(len(self.obs) == len(self.act) == len(self.ret) == len(self.rtg)) 72 | return len(self.obs) 73 | 74 | 75 | def REINFORCE_baseline(env_name, hidden_sizes=[32], p_lr=5e-3, vf_lr=8e-3, gamma=0.99, steps_per_epoch=100, num_epochs=1000): 76 | ''' 77 | REINFORCE with baseline Algorithm 78 | 79 | Parameters: 80 | ----------- 81 | env_name: Name of the environment 82 | hidden_size: list of the number of hidden units for each layer 83 | p_lr: policy learning rate 84 | vf_lr: value function learning rate 85 | gamma: discount factor 86 | steps_per_epoch: number of steps per epoch 87 | num_epochs: number train epochs (Note: they aren't properly epochs) 88 | ''' 89 | tf.reset_default_graph() 90 | 91 | env = gym.make(env_name) 92 | 93 | obs_dim = env.observation_space.shape 94 | act_dim = env.action_space.n 95 | 96 | # Placeholders 97 | obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs') 98 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act') 99 | ret_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='ret') 100 | rtg_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='rtg') 101 | 102 | ##################################################### 103 | ########### COMPUTE THE PG LOSS FUNCTIONS ########### 104 | ##################################################### 105 | 106 | # policy 107 | p_logits = mlp(obs_ph, hidden_sizes, act_dim, activation=tf.tanh) 108 | 109 | act_multn = tf.squeeze(tf.random.multinomial(p_logits, 1)) 110 | actions_mask = tf.one_hot(act_ph, depth=act_dim) 111 | p_log = tf.reduce_sum(actions_mask * tf.nn.log_softmax(p_logits), axis=1) 112 | # entropy useful to study the algorithms 113 | entropy = -tf.reduce_mean(softmax_entropy(p_logits)) 114 | p_loss = -tf.reduce_mean(p_log*ret_ph) 115 | 116 | # policy optimization 117 | p_opt = tf.train.AdamOptimizer(p_lr).minimize(p_loss) 118 | 119 | ####################################### 120 | ########### VALUE FUNCTION ########### 121 | ####################################### 122 | 123 | ########### NEW ########### 124 | # value function 125 | s_values = 
tf.squeeze(mlp(obs_ph, hidden_sizes, 1, activation=tf.tanh)) 126 | 127 | # MSE loss function 128 | v_loss = tf.reduce_mean((rtg_ph - s_values)**2) 129 | 130 | # value function optimization 131 | v_opt = tf.train.AdamOptimizer(vf_lr).minimize(v_loss) 132 | 133 | # Time 134 | now = datetime.now() 135 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second) 136 | print('Time:', clock_time) 137 | 138 | 139 | # Set scalars and hisograms for TensorBoard 140 | tf.summary.scalar('p_loss', p_loss, collections=['train']) 141 | tf.summary.scalar('v_loss', v_loss, collections=['train']) 142 | tf.summary.scalar('entropy', entropy, collections=['train']) 143 | tf.summary.scalar('s_values', tf.reduce_mean(s_values), collections=['train']) 144 | tf.summary.histogram('p_soft', tf.nn.softmax(p_logits), collections=['train']) 145 | tf.summary.histogram('p_log', p_log, collections=['train']) 146 | tf.summary.histogram('act_multn', act_multn, collections=['train']) 147 | tf.summary.histogram('p_logits', p_logits, collections=['train']) 148 | tf.summary.histogram('ret_ph', ret_ph, collections=['train']) 149 | tf.summary.histogram('rtg_ph', rtg_ph, collections=['train']) 150 | tf.summary.histogram('s_values', s_values, collections=['train']) 151 | train_summary = tf.summary.merge_all('train') 152 | 153 | tf.summary.scalar('old_v_loss', v_loss, collections=['pre_train']) 154 | tf.summary.scalar('old_p_loss', p_loss, collections=['pre_train']) 155 | pre_scalar_summary = tf.summary.merge_all('pre_train') 156 | 157 | hyp_str = '-steps_{}-plr_{}-vflr_{}'.format(steps_per_epoch, p_lr, vf_lr) 158 | file_writer = tf.summary.FileWriter('log_dir/{}/REINFORCE_basel_{}_{}'.format(env_name, clock_time, hyp_str), tf.get_default_graph()) 159 | 160 | # create a session 161 | sess = tf.Session() 162 | # initialize the variables 163 | sess.run(tf.global_variables_initializer()) 164 | 165 | # few variables 166 | step_count = 0 167 | train_rewards = [] 168 | train_ep_len = [] 169 | timer = time.time() 170 | 171 | # main cycle 172 | for ep in range(num_epochs): 173 | 174 | # initialize environment for the new epochs 175 | obs = env.reset() 176 | 177 | # intiaizlie buffer and other variables for the new epochs 178 | buffer = Buffer(gamma) 179 | env_buf = [] 180 | ep_rews = [] 181 | 182 | while len(buffer) < steps_per_epoch: 183 | 184 | # run the policy 185 | act, val = sess.run([act_multn, s_values], feed_dict={obs_ph:[obs]}) 186 | # take a step in the environment 187 | obs2, rew, done, _ = env.step(np.squeeze(act)) 188 | 189 | # add the new transition 190 | env_buf.append([obs.copy(), rew, act, np.squeeze(val)]) 191 | 192 | obs = obs2.copy() 193 | 194 | step_count += 1 195 | ep_rews.append(rew) 196 | 197 | if done: 198 | # store the trajectory just completed 199 | buffer.store(np.array(env_buf)) 200 | env_buf = [] 201 | # store additionl information about the episode 202 | train_rewards.append(np.sum(ep_rews)) 203 | train_ep_len.append(len(ep_rews)) 204 | # reset the environment 205 | obs = env.reset() 206 | ep_rews = [] 207 | 208 | # collect the episodes' information 209 | obs_batch, act_batch, ret_batch, rtg_batch = buffer.get_batch() 210 | 211 | # run pre_scalar_summary before the optimization phase 212 | epochs_summary = sess.run(pre_scalar_summary, feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch, rtg_ph:rtg_batch}) 213 | file_writer.add_summary(epochs_summary, step_count) 214 | 215 | # Optimize the NN policy and the NN value function 216 | sess.run([p_opt, v_opt], 
feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch, rtg_ph:rtg_batch}) 217 | 218 | # run train_summary to save the summary after the optimization 219 | train_summary_run = sess.run(train_summary, feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch, rtg_ph:rtg_batch}) 220 | file_writer.add_summary(train_summary_run, step_count) 221 | 222 | # it's time to print some useful information 223 | if ep % 10 == 0: 224 | print('Ep:%d MnRew:%.2f MxRew:%.1f EpLen:%.1f Buffer:%d -- Step:%d -- Time:%d' % (ep, np.mean(train_rewards), np.max(train_rewards), np.mean(train_ep_len), len(buffer), step_count,time.time()-timer)) 225 | 226 | summary = tf.Summary() 227 | summary.value.add(tag='supplementary/len', simple_value=np.mean(train_ep_len)) 228 | summary.value.add(tag='supplementary/train_rew', simple_value=np.mean(train_rewards)) 229 | file_writer.add_summary(summary, step_count) 230 | file_writer.flush() 231 | 232 | timer = time.time() 233 | train_rewards = [] 234 | train_ep_len = [] 235 | 236 | 237 | env.close() 238 | file_writer.close() 239 | 240 | 241 | if __name__ == '__main__': 242 | REINFORCE_baseline('LunarLander-v2', hidden_sizes=[64], p_lr=8e-3, vf_lr=7e-3, gamma=0.99, steps_per_epoch=1000, num_epochs=1000) -------------------------------------------------------------------------------- /Chapter07/PPO.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from datetime import datetime 5 | import time 6 | import roboschool 7 | 8 | def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activation=None): 9 | ''' 10 | Multi-layer perceptron 11 | ''' 12 | for l in hidden_layers: 13 | x = tf.layers.dense(x, units=l, activation=activation) 14 | return tf.layers.dense(x, units=output_layer, activation=last_activation) 15 | 16 | def softmax_entropy(logits): 17 | ''' 18 | Softmax Entropy 19 | ''' 20 | return -tf.reduce_sum(tf.nn.softmax(logits, axis=-1) * tf.nn.log_softmax(logits, axis=-1), axis=-1) 21 | 22 | def clipped_surrogate_obj(new_p, old_p, adv, eps): 23 | ''' 24 | Clipped surrogate objective function 25 | ''' 26 | rt = tf.exp(new_p - old_p) # i.e. pi / old_pi 27 | return -tf.reduce_mean(tf.minimum(rt*adv, tf.clip_by_value(rt, 1-eps, 1+eps)*adv)) 28 | 29 | def GAE(rews, v, v_last, gamma=0.99, lam=0.95): 30 | ''' 31 | Generalized Advantage Estimation 32 | ''' 33 | assert len(rews) == len(v) 34 | vs = np.append(v, v_last) 35 | delta = np.array(rews) + gamma*vs[1:] - vs[:-1] 36 | gae_advantage = discounted_rewards(delta, 0, gamma*lam) 37 | return gae_advantage 38 | 39 | def discounted_rewards(rews, last_sv, gamma): 40 | ''' 41 | Discounted reward to go 42 | 43 | Parameters: 44 | ---------- 45 | rews: list of rewards 46 | last_sv: value of the last state 47 | gamma: discount value 48 | ''' 49 | rtg = np.zeros_like(rews, dtype=np.float32) 50 | rtg[-1] = rews[-1] + gamma*last_sv 51 | for i in reversed(range(len(rews)-1)): 52 | rtg[i] = rews[i] + gamma*rtg[i+1] 53 | return rtg 54 | 55 | 56 | class StructEnv(gym.Wrapper): 57 | ''' 58 | Gym Wrapper to store information like number of steps and total reward of the last espisode. 
59 | ''' 60 | def __init__(self, env): 61 | gym.Wrapper.__init__(self, env) 62 | self.n_obs = self.env.reset() 63 | self.rew_episode = 0 64 | self.len_episode = 0 65 | 66 | def reset(self, **kwargs): 67 | self.n_obs = self.env.reset(**kwargs) 68 | self.rew_episode = 0 69 | self.len_episode = 0 70 | return self.n_obs.copy() 71 | 72 | def step(self, action): 73 | ob, reward, done, info = self.env.step(action) 74 | self.rew_episode += reward 75 | self.len_episode += 1 76 | return ob, reward, done, info 77 | 78 | def get_episode_reward(self): 79 | return self.rew_episode 80 | 81 | def get_episode_length(self): 82 | return self.len_episode 83 | 84 | class Buffer(): 85 | ''' 86 | Class to store the experience from a unique policy 87 | ''' 88 | def __init__(self, gamma=0.99, lam=0.95): 89 | self.gamma = gamma 90 | self.lam = lam 91 | self.adv = [] 92 | self.ob = [] 93 | self.ac = [] 94 | self.rtg = [] 95 | 96 | def store(self, temp_traj, last_sv): 97 | ''' 98 | Add temp_traj values to the buffers and compute the advantage and reward to go 99 | 100 | Parameters: 101 | ----------- 102 | temp_traj: list where each element is a list that contains: observation, reward, action, state-value 103 | last_sv: value of the last state (Used to Bootstrap) 104 | ''' 105 | # store only if there are temporary trajectories 106 | if len(temp_traj) > 0: 107 | self.ob.extend(temp_traj[:,0]) 108 | rtg = discounted_rewards(temp_traj[:,1], last_sv, self.gamma) 109 | self.adv.extend(GAE(temp_traj[:,1], temp_traj[:,3], last_sv, self.gamma, self.lam)) 110 | self.rtg.extend(rtg) 111 | self.ac.extend(temp_traj[:,2]) 112 | 113 | def get_batch(self): 114 | # standardize the advantage values 115 | norm_adv = (self.adv - np.mean(self.adv)) / (np.std(self.adv) + 1e-10) 116 | return np.array(self.ob), np.array(self.ac), np.array(norm_adv), np.array(self.rtg) 117 | 118 | def __len__(self): 119 | assert(len(self.adv) == len(self.ob) == len(self.ac) == len(self.rtg)) 120 | return len(self.ob) 121 | 122 | def gaussian_log_likelihood(x, mean, log_std): 123 | ''' 124 | Gaussian Log Likelihood 125 | ''' 126 | log_p = -0.5 *((x-mean)**2 / (tf.exp(log_std)**2+1e-9) + 2*log_std + np.log(2*np.pi)) 127 | return tf.reduce_sum(log_p, axis=-1) 128 | 129 | def PPO(env_name, hidden_sizes=[32], cr_lr=5e-3, ac_lr=5e-3, num_epochs=50, minibatch_size=5000, gamma=0.99, lam=0.95, number_envs=1, eps=0.1, 130 | actor_iter=5, critic_iter=10, steps_per_env=100, action_type='Discrete'): 131 | ''' 132 | Proximal Policy Optimization 133 | 134 | Parameters: 135 | ----------- 136 | env_name: Name of the environment 137 | hidden_size: list of the number of hidden units for each layer 138 | ac_lr: actor learning rate 139 | cr_lr: critic learning rate 140 | num_epochs: number of training epochs 141 | minibatch_size: Batch size used to train the critic and actor 142 | gamma: discount factor 143 | lam: lambda parameter for computing the GAE 144 | number_envs: number of parallel synchronous environments 145 | # NB: it isn't distributed across multiple CPUs 146 | eps: Clip threshold. Max deviation from previous policy. 
147 | actor_iter: Number of SGD iterations on the actor per epoch 148 | critic_iter: Number of SGD iterations on the critic per epoch 149 | steps_per_env: number of steps per environment 150 | # NB: the total number of steps per epoch will be: steps_per_env*number_envs 151 | action_type: class name of the action space: Either 'Discrete' or 'Box' 152 | ''' 153 | 154 | tf.reset_default_graph() 155 | 156 | # Create some environments to collect the trajectories 157 | envs = [StructEnv(gym.make(env_name)) for _ in range(number_envs)] 158 | 159 | obs_dim = envs[0].observation_space.shape 160 | 161 | # Placeholders 162 | if action_type == 'Discrete': 163 | act_dim = envs[0].action_space.n 164 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act') 165 | 166 | elif action_type == 'Box': 167 | low_action_space = envs[0].action_space.low 168 | high_action_space = envs[0].action_space.high 169 | act_dim = envs[0].action_space.shape[0] 170 | act_ph = tf.placeholder(shape=(None,act_dim), dtype=tf.float32, name='act') 171 | 172 | obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs') 173 | ret_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='ret') 174 | adv_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='adv') 175 | old_p_log_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='old_p_log') 176 | 177 | # Computational graph for the policy in case of a discrete action space 178 | if action_type == 'Discrete': 179 | with tf.variable_scope('actor_nn'): 180 | p_logits = mlp(obs_ph, hidden_sizes, act_dim, tf.nn.relu, last_activation=tf.tanh) 181 | 182 | act_smp = tf.squeeze(tf.random.multinomial(p_logits, 1)) 183 | act_onehot = tf.one_hot(act_ph, depth=act_dim) 184 | p_log = tf.reduce_sum(act_onehot * tf.nn.log_softmax(p_logits), axis=-1) 185 | 186 | # Computational graph for the policy in case of a continuous action space 187 | else: 188 | with tf.variable_scope('actor_nn'): 189 | p_logits = mlp(obs_ph, hidden_sizes, act_dim, tf.tanh, last_activation=tf.tanh) 190 | log_std = tf.get_variable(name='log_std', initializer=np.zeros(act_dim, dtype=np.float32)-0.5) 191 | 192 | # Add noise to the mean values predicted 193 | # The noise is proportional to the standard deviation 194 | p_noisy = p_logits + tf.random_normal(tf.shape(p_logits), 0, 1) * tf.exp(log_std) 195 | # Clip the noisy actions 196 | act_smp = tf.clip_by_value(p_noisy, low_action_space, high_action_space) 197 | # Compute the gaussian log likelihood 198 | p_log = gaussian_log_likelihood(act_ph, p_logits, log_std) 199 | 200 | # Neural network value function approximator 201 | with tf.variable_scope('critic_nn'): 202 | s_values = mlp(obs_ph, hidden_sizes, 1, tf.tanh, last_activation=None) 203 | s_values = tf.squeeze(s_values) 204 | 205 | # PPO loss function 206 | p_loss = clipped_surrogate_obj(p_log, old_p_log_ph, adv_ph, eps) 207 | # MSE loss function 208 | v_loss = tf.reduce_mean((ret_ph - s_values)**2) 209 | 210 | # policy optimizer 211 | p_opt = tf.train.AdamOptimizer(ac_lr).minimize(p_loss) 212 | # value function optimizer 213 | v_opt = tf.train.AdamOptimizer(cr_lr).minimize(v_loss) 214 | 215 | # Time 216 | now = datetime.now() 217 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second) 218 | print('Time:', clock_time) 219 | 220 | # Set scalars and histograms for TensorBoard 221 | tf.summary.scalar('p_loss', p_loss, collections=['train']) 222 | tf.summary.scalar('v_loss', v_loss, collections=['train']) 223 | tf.summary.scalar('s_values_m', 
tf.reduce_mean(s_values), collections=['train']) 224 | 225 | if action_type == 'Box': 226 | tf.summary.scalar('p_std', tf.reduce_mean(tf.exp(log_std)), collections=['train']) 227 | tf.summary.histogram('log_std',log_std, collections=['train']) 228 | tf.summary.histogram('p_log', p_log, collections=['train']) 229 | tf.summary.histogram('p_logits', p_logits, collections=['train']) 230 | tf.summary.histogram('s_values', s_values, collections=['train']) 231 | tf.summary.histogram('adv_ph',adv_ph, collections=['train']) 232 | scalar_summary = tf.summary.merge_all('train') 233 | 234 | # .. summary to run before the optimization steps 235 | tf.summary.scalar('old_v_loss', v_loss, collections=['pre_train']) 236 | tf.summary.scalar('old_p_loss', p_loss, collections=['pre_train']) 237 | pre_scalar_summary = tf.summary.merge_all('pre_train') 238 | 239 | hyp_str = '-bs_'+str(minibatch_size)+'-envs_'+str(number_envs)+'-ac_lr_'+str(ac_lr)+'-cr_lr'+str(cr_lr)+'-act_it_'+str(actor_iter)+'-crit_it_'+str(critic_iter) 240 | file_writer = tf.summary.FileWriter('log_dir/'+env_name+'/PPO_'+clock_time+'_'+hyp_str, tf.get_default_graph()) 241 | 242 | # create a session 243 | sess = tf.Session() 244 | # initialize the variables 245 | sess.run(tf.global_variables_initializer()) 246 | 247 | # variable to store the total number of steps 248 | step_count = 0 249 | 250 | print('Env batch size:',steps_per_env, ' Batch size:',steps_per_env*number_envs) 251 | 252 | for ep in range(num_epochs): 253 | # Create the buffer that will contain the trajectories (full or partial) 254 | # run with the last policy 255 | buffer = Buffer(gamma, lam) 256 | # lists to store rewards and length of the trajectories completed 257 | batch_rew = [] 258 | batch_len = [] 259 | 260 | # Execute in serial the environments, storing temporarily the trajectories. 261 | for env in envs: 262 | temp_buf = [] 263 | 264 | #iterate over a fixed number of steps 265 | for _ in range(steps_per_env): 266 | 267 | # run the policy 268 | act, val = sess.run([act_smp, s_values], feed_dict={obs_ph:[env.n_obs]}) 269 | act = np.squeeze(act) 270 | 271 | # take a step in the environment 272 | obs2, rew, done, _ = env.step(act) 273 | 274 | # add the new transition to the temporary buffer 275 | temp_buf.append([env.n_obs.copy(), rew, act, np.squeeze(val)]) 276 | 277 | env.n_obs = obs2.copy() 278 | step_count += 1 279 | 280 | if done: 281 | # Store the full trajectory in the buffer 282 | # (the value of the last state is 0 as the trajectory is completed) 283 | buffer.store(np.array(temp_buf), 0) 284 | 285 | # Empty temporary buffer 286 | temp_buf = [] 287 | 288 | batch_rew.append(env.get_episode_reward()) 289 | batch_len.append(env.get_episode_length()) 290 | 291 | # reset the environment 292 | env.reset() 293 | 294 | # Bootstrap with the estimated state value of the next state! 295 | last_v = sess.run(s_values, feed_dict={obs_ph:[env.n_obs]}) 296 | buffer.store(np.array(temp_buf), np.squeeze(last_v)) 297 | 298 | 299 | # Gather the entire batch from the buffer 300 | # NB: all the batch is used and deleted after the optimization. 
That is because PPO is on-policy 301 | obs_batch, act_batch, adv_batch, rtg_batch = buffer.get_batch() 302 | 303 | old_p_log = sess.run(p_log, feed_dict={obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch}) 304 | old_p_batch = np.array(old_p_log) 305 | 306 | summary = sess.run(pre_scalar_summary, feed_dict={obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch, old_p_log_ph:old_p_batch}) 307 | file_writer.add_summary(summary, step_count) 308 | 309 | lb = len(buffer) 310 | shuffled_batch = np.arange(lb) 311 | 312 | # Policy optimization steps 313 | for _ in range(actor_iter): 314 | # shuffle the batch on every iteration 315 | np.random.shuffle(shuffled_batch) 316 | for idx in range(0,lb, minibatch_size): 317 | minib = shuffled_batch[idx:min(idx+minibatch_size,lb)] 318 | sess.run(p_opt, feed_dict={obs_ph:obs_batch[minib], act_ph:act_batch[minib], adv_ph:adv_batch[minib], old_p_log_ph:old_p_batch[minib]}) 319 | 320 | # Value function optimization steps 321 | for _ in range(critic_iter): 322 | # shuffle the batch on every iteration 323 | np.random.shuffle(shuffled_batch) 324 | for idx in range(0,lb, minibatch_size): 325 | minib = shuffled_batch[idx:min(idx+minibatch_size,lb)] 326 | sess.run(v_opt, feed_dict={obs_ph:obs_batch[minib], ret_ph:rtg_batch[minib]}) 327 | 328 | 329 | # print some statistics and run the summary for visualizing it on TB 330 | if len(batch_rew) > 0: 331 | train_summary = sess.run(scalar_summary, feed_dict={obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, 332 | old_p_log_ph:old_p_batch, ret_ph:rtg_batch}) 333 | file_writer.add_summary(train_summary, step_count) 334 | 335 | summary = tf.Summary() 336 | summary.value.add(tag='supplementary/performance', simple_value=np.mean(batch_rew)) 337 | summary.value.add(tag='supplementary/len', simple_value=np.mean(batch_len)) 338 | file_writer.add_summary(summary, step_count) 339 | file_writer.flush() 340 | 341 | print('Ep:%d Rew:%.2f -- Step:%d' % (ep, np.mean(batch_rew), step_count)) 342 | 343 | # closing environments.. 
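Before the environments are closed below, the clipping that the actor minibatch loop above relies on can be restated in plain NumPy; this is only an illustrative sketch with made-up numbers, mirroring clipped_surrogate_obj defined at the top of this file:

import numpy as np

def clipped_surrogate_np(new_logp, old_logp, adv, eps):
    # probability ratio pi_new / pi_old, computed from log-probabilities
    rt = np.exp(new_logp - old_logp)
    # the min() removes any gain once the ratio leaves [1-eps, 1+eps]
    return -np.mean(np.minimum(rt * adv, np.clip(rt, 1 - eps, 1 + eps) * adv))

old_logp = np.array([-1.0, -1.0])
adv = np.array([1.0, 1.0])            # positive advantages, purely illustrative
small_step = np.array([-0.9, -0.9])   # ratio ~ 1.105, inside the clip range
large_step = np.array([-0.3, -0.3])   # ratio ~ 2.014, outside the clip range
print(clipped_surrogate_np(small_step, old_logp, adv, eps=0.15))  # ~ -1.105
print(clipped_surrogate_np(large_step, old_logp, adv, eps=0.15))  # ~ -1.15
# the larger policy change gains nothing beyond a ratio of 1+eps, so the
# objective gives no incentive for overly large policy updates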
344 | for env in envs: 345 | env.close() 346 | 347 | # Close the writer 348 | file_writer.close() 349 | 350 | 351 | if __name__ == '__main__': 352 | PPO('RoboschoolWalker2d-v1', hidden_sizes=[64,64], cr_lr=5e-4, ac_lr=2e-4, gamma=0.99, lam=0.95, steps_per_env=5000, 353 | number_envs=1, eps=0.15, actor_iter=6, critic_iter=10, action_type='Box', num_epochs=5000, minibatch_size=256) 354 | -------------------------------------------------------------------------------- /Chapter07/TRPO.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from datetime import datetime 5 | import roboschool 6 | 7 | def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activation=None): 8 | ''' 9 | Multi-layer perceptron 10 | ''' 11 | for l in hidden_layers: 12 | x = tf.layers.dense(x, units=l, activation=activation) 13 | return tf.layers.dense(x, units=output_layer, activation=last_activation) 14 | 15 | def softmax_entropy(logits): 16 | ''' 17 | Softmax Entropy 18 | ''' 19 | return -tf.reduce_sum(tf.nn.softmax(logits, axis=-1) * tf.nn.log_softmax(logits, axis=-1), axis=-1) 20 | 21 | 22 | def gaussian_log_likelihood(ac, mean, log_std): 23 | ''' 24 | Gaussian Log Likelihood 25 | ''' 26 | log_p = ((ac-mean)**2 / (tf.exp(log_std)**2+1e-9) + 2*log_std) + np.log(2*np.pi) 27 | return -0.5 * tf.reduce_sum(log_p, axis=-1) 28 | 29 | 30 | def conjugate_gradient(A, b, x=None, iters=10): 31 | ''' 32 | Conjugate gradient method: approximate the solution of Ax=b 33 | It solves Ax=b without forming the full matrix; it only needs the matrix-vector product (the Fisher-vector product) 34 | 35 | NB: here A is not the full matrix but a function that returns the product of the averaged Fisher information matrix with an arbitrary vector 36 | Described in Appendix C.1 of the TRPO paper 37 | ''' 38 | if x is None: 39 | x = np.zeros_like(b) 40 | 41 | r = A(x) - b 42 | p = -r 43 | for _ in range(iters): 44 | a = np.dot(r, r) / (np.dot(p, A(p))+1e-8) 45 | x += a*p 46 | r_n = r + a*A(p) 47 | b = np.dot(r_n, r_n) / (np.dot(r, r)+1e-8) 48 | p = -r_n + b*p 49 | r = r_n 50 | return x 51 | 52 | def gaussian_DKL(mu_q, log_std_q, mu_p, log_std_p): 53 | ''' 54 | Gaussian KL divergence in case of a diagonal covariance matrix 55 | ''' 56 | return tf.reduce_mean(tf.reduce_sum(0.5 * (log_std_p - log_std_q + tf.exp(log_std_q - log_std_p) + (mu_q - mu_p)**2 / tf.exp(log_std_p) - 1), axis=1)) 57 | 58 | 59 | def backtracking_line_search(Dkl, delta, old_loss, p=0.8): 60 | ''' 61 | Backtracking line search. It looks for a coefficient s.t. 
the constraint on the DKL is satisfied 62 | It has both to 63 | - improve the non-linear objective 64 | - satisfy the constraint 65 | 66 | ''' 67 | ## Explained in Appendix C of the TRPO paper 68 | a = 1 69 | it = 0 70 | 71 | new_dkl, new_loss = Dkl(a) 72 | while (new_dkl > delta) or (new_loss > old_loss): 73 | a *= p 74 | it += 1 75 | new_dkl, new_loss = Dkl(a) 76 | 77 | return a 78 | 79 | 80 | 81 | def GAE(rews, v, v_last, gamma=0.99, lam=0.95): 82 | ''' 83 | Generalized Advantage Estimation 84 | ''' 85 | assert len(rews) == len(v) 86 | vs = np.append(v, v_last) 87 | d = np.array(rews) + gamma*vs[1:] - vs[:-1] 88 | gae_advantage = discounted_rewards(d, 0, gamma*lam) 89 | return gae_advantage 90 | 91 | def discounted_rewards(rews, last_sv, gamma): 92 | ''' 93 | Discounted reward to go 94 | 95 | Parameters: 96 | ---------- 97 | rews: list of rewards 98 | last_sv: value of the last state 99 | gamma: discount value 100 | ''' 101 | rtg = np.zeros_like(rews, dtype=np.float32) 102 | rtg[-1] = rews[-1] + gamma*last_sv 103 | for i in reversed(range(len(rews)-1)): 104 | rtg[i] = rews[i] + gamma*rtg[i+1] 105 | return rtg 106 | 107 | class Buffer(): 108 | ''' 109 | Class to store the experience from a unique policy 110 | ''' 111 | def __init__(self, gamma=0.99, lam=0.95): 112 | self.gamma = gamma 113 | self.lam = lam 114 | self.adv = [] 115 | self.ob = [] 116 | self.ac = [] 117 | self.rtg = [] 118 | 119 | def store(self, temp_traj, last_sv): 120 | ''' 121 | Add temp_traj values to the buffers and compute the advantage and reward to go 122 | 123 | Parameters: 124 | ----------- 125 | temp_traj: list where each element is a list that contains: observation, reward, action, state-value 126 | last_sv: value of the last state (Used to Bootstrap) 127 | ''' 128 | # store only if there are temporary trajectories 129 | if len(temp_traj) > 0: 130 | self.ob.extend(temp_traj[:,0]) 131 | rtg = discounted_rewards(temp_traj[:,1], last_sv, self.gamma) 132 | self.adv.extend(GAE(temp_traj[:,1], temp_traj[:,3], last_sv, self.gamma, self.lam)) 133 | self.rtg.extend(rtg) 134 | self.ac.extend(temp_traj[:,2]) 135 | 136 | def get_batch(self): 137 | # standardize the advantage values 138 | norm_adv = (self.adv - np.mean(self.adv)) / (np.std(self.adv) + 1e-10) 139 | return np.array(self.ob), np.array(self.ac), np.array(norm_adv), np.array(self.rtg) 140 | 141 | def __len__(self): 142 | assert(len(self.adv) == len(self.ob) == len(self.ac) == len(self.rtg)) 143 | return len(self.ob) 144 | 145 | def flatten_list(tensor_list): 146 | ''' 147 | Flatten a list of tensors 148 | ''' 149 | return tf.concat([flatten(t) for t in tensor_list], axis=0) 150 | 151 | def flatten(tensor): 152 | ''' 153 | Flatten a tensor 154 | ''' 155 | return tf.reshape(tensor, shape=(-1,)) 156 | 157 | 158 | class StructEnv(gym.Wrapper): 159 | ''' 160 | Gym Wrapper to store information like number of steps and total reward of the last espisode. 
161 | ''' 162 | def __init__(self, env): 163 | gym.Wrapper.__init__(self, env) 164 | self.n_obs = self.env.reset() 165 | self.total_rew = 0 166 | self.len_episode = 0 167 | 168 | def reset(self, **kwargs): 169 | self.n_obs = self.env.reset(**kwargs) 170 | self.total_rew = 0 171 | self.len_episode = 0 172 | return self.n_obs.copy() 173 | 174 | def step(self, action): 175 | ob, reward, done, info = self.env.step(action) 176 | self.total_rew += reward 177 | self.len_episode += 1 178 | return ob, reward, done, info 179 | 180 | def get_episode_reward(self): 181 | return self.total_rew 182 | 183 | def get_episode_length(self): 184 | return self.len_episode 185 | 186 | 187 | def TRPO(env_name, hidden_sizes=[32], cr_lr=5e-3, num_epochs=50, gamma=0.99, lam=0.95, number_envs=1, 188 | critic_iter=10, steps_per_env=100, delta=0.002, algorithm='TRPO', conj_iters=10, minibatch_size=1000): 189 | ''' 190 | Trust Region Policy Optimization 191 | 192 | Parameters: 193 | ----------- 194 | env_name: Name of the environment 195 | hidden_sizes: list of the number of hidden units for each layer 196 | cr_lr: critic learning rate 197 | num_epochs: number of training epochs 198 | gamma: discount factor 199 | lam: lambda parameter for computing the GAE 200 | number_envs: number of "parallel" synchronous environments 201 | # NB: it isn't distributed across multiple CPUs 202 | critic_iter: NUmber of SGD iterations on the critic per epoch 203 | steps_per_env: number of steps per environment 204 | # NB: the total number of steps per epoch will be: steps_per_env*number_envs 205 | delta: Maximum KL divergence between two policies. Scalar value 206 | algorithm: type of algorithm. Either 'TRPO' or 'NPO' 207 | conj_iters: number of conjugate gradient iterations 208 | minibatch_size: Batch size used to train the critic 209 | ''' 210 | 211 | tf.reset_default_graph() 212 | 213 | # Create a few environments to collect the trajectories 214 | envs = [StructEnv(gym.make(env_name)) for _ in range(number_envs)] 215 | 216 | low_action_space = envs[0].action_space.low 217 | high_action_space = envs[0].action_space.high 218 | 219 | obs_dim = envs[0].observation_space.shape 220 | act_dim = envs[0].action_space.shape[0] 221 | 222 | # Placeholders 223 | act_ph = tf.placeholder(shape=(None,act_dim), dtype=tf.float32, name='act') 224 | obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs') 225 | ret_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='ret') 226 | adv_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='adv') 227 | old_p_log_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='old_p_log') 228 | old_mu_ph = tf.placeholder(shape=(None, act_dim), dtype=tf.float32, name='old_mu') 229 | old_log_std_ph = tf.placeholder(shape=(act_dim), dtype=tf.float32, name='old_log_std') 230 | p_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='p_ph') 231 | # result of the conjugate gradient algorithm 232 | cg_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='cg') 233 | 234 | # Neural network that represent the policy 235 | with tf.variable_scope('actor_nn'): 236 | p_means = mlp(obs_ph, hidden_sizes, act_dim, tf.tanh, last_activation=tf.tanh) 237 | log_std = tf.get_variable(name='log_std', initializer=np.zeros(act_dim, dtype=np.float32) - 0.5) 238 | 239 | # Neural network that represent the value function 240 | with tf.variable_scope('critic_nn'): 241 | s_values = mlp(obs_ph, hidden_sizes, 1, tf.tanh, last_activation=None) 242 | s_values = tf.squeeze(s_values) 243 | 244 | # Add "noise" 
to the predicted mean following the Gaussian distribution with standard deviation e^(log_std) 245 | p_noisy = p_means + tf.random_normal(tf.shape(p_means), 0, 1) * tf.exp(log_std) 246 | # Clip the noisy actions 247 | a_sampl = tf.clip_by_value(p_noisy, low_action_space, high_action_space) 248 | # Compute the Gaussian log likelihood 249 | p_log = gaussian_log_likelihood(act_ph, p_means, log_std) 250 | 251 | # Measure the divergence 252 | diverg = tf.reduce_mean(tf.exp(old_p_log_ph - p_log)) 253 | 254 | # ratio 255 | ratio_new_old = tf.exp(p_log - old_p_log_ph) 256 | # TRPO surrogate loss function 257 | p_loss = - tf.reduce_mean(ratio_new_old * adv_ph) 258 | 259 | # MSE loss function 260 | v_loss = tf.reduce_mean((ret_ph - s_values)**2) 261 | # Critic optimization 262 | v_opt = tf.train.AdamOptimizer(cr_lr).minimize(v_loss) 263 | 264 | def variables_in_scope(scope): 265 | # get all trainable variables in 'scope' 266 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope) 267 | 268 | # Gather and flatten the actor parameters 269 | p_variables = variables_in_scope('actor_nn') 270 | p_var_flatten = flatten_list(p_variables) 271 | 272 | # Gradient of the policy loss with respect to the actor parameters 273 | p_grads = tf.gradients(p_loss, p_variables) 274 | p_grads_flatten = flatten_list(p_grads) 275 | 276 | ########### RESTORE ACTOR PARAMETERS ########### 277 | p_old_variables = tf.placeholder(shape=(None,), dtype=tf.float32, name='p_old_variables') 278 | # variable used as index for restoring the actor's parameters 279 | it_v1 = tf.Variable(0, trainable=False) 280 | restore_params = [] 281 | 282 | for p_v in p_variables: 283 | upd_rsh = tf.reshape(p_old_variables[it_v1 : it_v1+tf.reduce_prod(p_v.shape)], shape=p_v.shape) 284 | restore_params.append(p_v.assign(upd_rsh)) 285 | it_v1 += tf.reduce_prod(p_v.shape) 286 | 287 | restore_params = tf.group(*restore_params) 288 | 289 | # Gaussian KL divergence of the two policies 290 | dkl_diverg = gaussian_DKL(old_mu_ph, old_log_std_ph, p_means, log_std) 291 | 292 | # Jacobian of the KL divergence (Needed for the Fisher matrix-vector product) 293 | dkl_diverg_grad = tf.gradients(dkl_diverg, p_variables) 294 | 295 | dkl_matrix_product = tf.reduce_sum(flatten_list(dkl_diverg_grad) * p_ph) 296 | print('dkl_matrix_product', dkl_matrix_product.shape) 297 | # Fisher vector product 298 | # The Fisher-vector product is a way to compute the product of A with a vector without building the full A matrix 299 | Fx = flatten_list(tf.gradients(dkl_matrix_product, p_variables)) 300 | 301 | ## Step length 302 | beta_ph = tf.placeholder(shape=(), dtype=tf.float32, name='beta') 303 | # NPG update 304 | npg_update = beta_ph * cg_ph 305 | 306 | ## alpha is found through line search 307 | alpha = tf.Variable(1., trainable=False) 308 | # TRPO update 309 | trpo_update = alpha * npg_update 310 | 311 | #################### POLICY UPDATE ################### 312 | # variable used as an index 313 | it_v = tf.Variable(0, trainable=False) 314 | p_opt = [] 315 | # Apply the updates to the policy 316 | for p_v in p_variables: 317 | upd_rsh = tf.reshape(trpo_update[it_v : it_v+tf.reduce_prod(p_v.shape)], shape=p_v.shape) 318 | p_opt.append(p_v.assign_sub(upd_rsh)) 319 | it_v += tf.reduce_prod(p_v.shape) 320 | 321 | p_opt = tf.group(*p_opt) 322 | 323 | # Time 324 | now = datetime.now() 325 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second) 326 | print('Time:', clock_time) 327 | 328 | 329 | # Set scalars and histograms for TensorBoard 330 | tf.summary.scalar('p_loss',
p_loss, collections=['train']) 331 | tf.summary.scalar('v_loss', v_loss, collections=['train']) 332 | tf.summary.scalar('p_divergence', diverg, collections=['train']) 333 | tf.summary.scalar('ratio_new_old',tf.reduce_mean(ratio_new_old), collections=['train']) 334 | tf.summary.scalar('dkl_diverg', dkl_diverg, collections=['train']) 335 | tf.summary.scalar('alpha', alpha, collections=['train']) 336 | tf.summary.scalar('beta', beta_ph, collections=['train']) 337 | tf.summary.scalar('p_std_mn', tf.reduce_mean(tf.exp(log_std)), collections=['train']) 338 | tf.summary.scalar('s_values_mn', tf.reduce_mean(s_values), collections=['train']) 339 | tf.summary.histogram('p_log', p_log, collections=['train']) 340 | tf.summary.histogram('p_means', p_means, collections=['train']) 341 | tf.summary.histogram('s_values', s_values, collections=['train']) 342 | tf.summary.histogram('adv_ph',adv_ph, collections=['train']) 343 | tf.summary.histogram('log_std',log_std, collections=['train']) 344 | scalar_summary = tf.summary.merge_all('train') 345 | 346 | tf.summary.scalar('old_v_loss', v_loss, collections=['pre_train']) 347 | tf.summary.scalar('old_p_loss', p_loss, collections=['pre_train']) 348 | pre_scalar_summary = tf.summary.merge_all('pre_train') 349 | 350 | hyp_str = '-spe_'+str(steps_per_env)+'-envs_'+str(number_envs)+'-cr_lr'+str(cr_lr)+'-crit_it_'+str(critic_iter)+'-delta_'+str(delta)+'-conj_iters_'+str(conj_iters) 351 | file_writer = tf.summary.FileWriter('log_dir/'+env_name+'/'+algorithm+'_'+clock_time+'_'+hyp_str, tf.get_default_graph()) 352 | 353 | # create a session 354 | sess = tf.Session() 355 | # initialize the variables 356 | sess.run(tf.global_variables_initializer()) 357 | 358 | # variable to store the total number of steps 359 | step_count = 0 360 | 361 | print('Env batch size:',steps_per_env, ' Batch size:',steps_per_env*number_envs) 362 | 363 | for ep in range(num_epochs): 364 | # Create the buffer that will contain the trajectories (full or partial) 365 | # run with the last policy 366 | buffer = Buffer(gamma, lam) 367 | # lists to store rewards and length of the trajectories completed 368 | batch_rew = [] 369 | batch_len = [] 370 | 371 | # Execute in serial the environment, storing temporarily the trajectories. 372 | for env in envs: 373 | temp_buf = [] 374 | 375 | # iterate over a fixed number of steps 376 | for _ in range(steps_per_env): 377 | # run the policy 378 | act, val = sess.run([a_sampl, s_values], feed_dict={obs_ph:[env.n_obs]}) 379 | act = np.squeeze(act) 380 | 381 | # take a step in the environment 382 | obs2, rew, done, _ = env.step(act) 383 | 384 | # add the new transition to the temporary buffer 385 | temp_buf.append([env.n_obs.copy(), rew, act, np.squeeze(val)]) 386 | 387 | env.n_obs = obs2.copy() 388 | step_count += 1 389 | 390 | if done: 391 | # Store the full trajectory in the buffer 392 | # (the value of the last state is 0 as the trajectory is completed) 393 | buffer.store(np.array(temp_buf), 0) 394 | # Empty temporary buffer 395 | temp_buf = [] 396 | 397 | batch_rew.append(env.get_episode_reward()) 398 | batch_len.append(env.get_episode_length()) 399 | 400 | env.reset() 401 | 402 | # Bootstrap with the estimated state value of the next state! 403 | lsv = sess.run(s_values, feed_dict={obs_ph:[env.n_obs]}) 404 | buffer.store(np.array(temp_buf), np.squeeze(lsv)) 405 | 406 | 407 | # Get the entire batch from the buffer 408 | # NB: all the batch is used and deleted after the optimization. 
This is because TRPO is on-policy 409 | obs_batch, act_batch, adv_batch, rtg_batch = buffer.get_batch() 410 | 411 | # log probabilities, logits and log std of the "old" policy 412 | # "old" policy refers to the policy being optimized, which has been used to sample from the environment 413 | old_p_log, old_p_means, old_log_std = sess.run([p_log, p_means, log_std], feed_dict={obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch}) 414 | # get also the "old" parameters 415 | old_actor_params = sess.run(p_var_flatten) 416 | 417 | # old_p_loss is later used in the line search 418 | # run pre_scalar_summary for a summary before the optimization 419 | old_p_loss, summary = sess.run([p_loss,pre_scalar_summary], feed_dict={obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch, old_p_log_ph:old_p_log}) 420 | file_writer.add_summary(summary, step_count) 421 | 422 | def H_f(p): 423 | ''' 424 | Run the Fisher-Vector product on 'p' to approximate the Hessian of the DKL 425 | ''' 426 | return sess.run(Fx, feed_dict={old_mu_ph:old_p_means, old_log_std_ph:old_log_std, p_ph:p, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch}) 427 | 428 | g_f = sess.run(p_grads_flatten, feed_dict={old_mu_ph:old_p_means,obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch, old_p_log_ph:old_p_log}) 429 | ## Compute the Conjugate Gradient so as to obtain an approximation of H^(-1)*g 430 | # Where H isn't the true Hessian of the KL divergence but an approximation of it computed via the Fisher-Vector product (F) 431 | conj_grad = conjugate_gradient(H_f, g_f, iters=conj_iters) 432 | 433 | # Compute the step length 434 | beta_np = np.sqrt(2*delta / np.sum(conj_grad * H_f(conj_grad))) 435 | 436 | def DKL(alpha_v): 437 | ''' 438 | Compute the KL divergence. 439 | It temporarily applies the policy update to compute the DKL and the policy loss, then it restores the old parameters. 440 | ''' 441 | sess.run(p_opt, feed_dict={beta_ph:beta_np, alpha:alpha_v, cg_ph:conj_grad, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, old_p_log_ph:old_p_log}) 442 | a_res = sess.run([dkl_diverg, p_loss], feed_dict={old_mu_ph:old_p_means, old_log_std_ph:old_log_std, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch, old_p_log_ph:old_p_log}) 443 | sess.run(restore_params, feed_dict={p_old_variables: old_actor_params}) 444 | return a_res 445 | 446 | # Actor optimization step 447 | # Different for TRPO or NPG 448 | if algorithm=='TRPO': 449 | # Backtracking line search to find the maximum alpha coefficient s.t.
the constraint is valid 450 | best_alpha = backtracking_line_search(DKL, delta, old_p_loss, p=0.8) 451 | sess.run(p_opt, feed_dict={beta_ph:beta_np, alpha:best_alpha, cg_ph:conj_grad, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, old_p_log_ph:old_p_log}) 452 | elif algorithm=='NPG': 453 | # In case of NPG, no line search 454 | sess.run(p_opt, feed_dict={beta_ph:beta_np, alpha:1, cg_ph:conj_grad, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, old_p_log_ph:old_p_log}) 455 | 456 | 457 | lb = len(buffer) 458 | shuffled_batch = np.arange(lb) 459 | np.random.shuffle(shuffled_batch) 460 | 461 | # Value function optimization steps 462 | for _ in range(critic_iter): 463 | # shuffle the batch on every iteration 464 | np.random.shuffle(shuffled_batch) 465 | for idx in range(0,lb, minibatch_size): 466 | minib = shuffled_batch[idx:min(idx+minibatch_size,lb)] 467 | sess.run(v_opt, feed_dict={obs_ph:obs_batch[minib], ret_ph:rtg_batch[minib]}) 468 | 469 | # print some statistics and run the summary for visualizing it on TB 470 | if len(batch_rew) > 0: 471 | train_summary = sess.run(scalar_summary, feed_dict={beta_ph:beta_np, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, cg_ph:conj_grad, 472 | old_p_log_ph:old_p_log, ret_ph:rtg_batch, old_mu_ph:old_p_means, old_log_std_ph:old_log_std}) 473 | file_writer.add_summary(train_summary, step_count) 474 | 475 | summary = tf.Summary() 476 | summary.value.add(tag='supplementary/performance', simple_value=np.mean(batch_rew)) 477 | summary.value.add(tag='supplementary/len', simple_value=np.mean(batch_len)) 478 | file_writer.add_summary(summary, step_count) 479 | file_writer.flush() 480 | 481 | print('Ep:%d Rew:%.2f -- Step:%d' % (ep, np.mean(batch_rew), step_count)) 482 | 483 | # closing environments.. 484 | for env in envs: 485 | env.close() 486 | 487 | file_writer.close() 488 | 489 | if __name__ == '__main__': 490 | TRPO('RoboschoolWalker2d-v1', hidden_sizes=[64,64], cr_lr=2e-3, gamma=0.99, lam=0.95, num_epochs=1000, steps_per_env=6000, 491 | number_envs=1, critic_iter=10, delta=0.01, algorithm='TRPO', conj_iters=10, minibatch_size=1000) 492 | -------------------------------------------------------------------------------- /Chapter08/DDPG.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from datetime import datetime 5 | from collections import deque 6 | import time 7 | 8 | current_milli_time = lambda: int(round(time.time() * 1000)) 9 | 10 | def mlp(x, hidden_layers, output_layer, activation=tf.nn.relu, last_activation=None): 11 | ''' 12 | Multi-layer perceptron 13 | ''' 14 | for l in hidden_layers: 15 | x = tf.layers.dense(x, units=l, activation=activation) 16 | return tf.layers.dense(x, units=output_layer, activation=last_activation) 17 | 18 | def deterministic_actor_critic(x, a, hidden_sizes, act_dim, max_act): 19 | ''' 20 | Deterministic Actor-Critic 21 | ''' 22 | # Actor 23 | with tf.variable_scope('p_mlp'): 24 | p_means = max_act * mlp(x, hidden_sizes, act_dim, last_activation=tf.tanh) 25 | 26 | # Critic with as input the deterministic action of the actor 27 | with tf.variable_scope('q_mlp'): 28 | q_d = mlp(tf.concat([x,p_means], axis=-1), hidden_sizes, 1, last_activation=None) 29 | 30 | # Critic with as input an arbirtary action 31 | with tf.variable_scope('q_mlp', reuse=True): # Use the weights of the mlp just defined 32 | q_a = mlp(tf.concat([x,a], axis=-1), hidden_sizes, 1, last_activation=None) 33 | 34 | return p_means, 
tf.squeeze(q_d), tf.squeeze(q_a) 35 | 36 | class ExperiencedBuffer(): 37 | ''' 38 | Experienced buffer 39 | ''' 40 | def __init__(self, buffer_size): 41 | # Contains up to 'buffer_size' experience 42 | self.obs_buf = deque(maxlen=buffer_size) 43 | self.rew_buf = deque(maxlen=buffer_size) 44 | self.act_buf = deque(maxlen=buffer_size) 45 | self.obs2_buf = deque(maxlen=buffer_size) 46 | self.done_buf = deque(maxlen=buffer_size) 47 | 48 | 49 | def add(self, obs, rew, act, obs2, done): 50 | ''' 51 | Add a new transition to the buffers 52 | ''' 53 | self.obs_buf.append(obs) 54 | self.rew_buf.append(rew) 55 | self.act_buf.append(act) 56 | self.obs2_buf.append(obs2) 57 | self.done_buf.append(done) 58 | 59 | 60 | def sample_minibatch(self, batch_size): 61 | ''' 62 | Sample a mini-batch of size 'batch_size' 63 | ''' 64 | mb_indices = np.random.randint(len(self.obs_buf), size=batch_size) 65 | 66 | mb_obs = [self.obs_buf[i] for i in mb_indices] 67 | mb_rew = [self.rew_buf[i] for i in mb_indices] 68 | mb_act = [self.act_buf[i] for i in mb_indices] 69 | mb_obs2 = [self.obs2_buf[i] for i in mb_indices] 70 | mb_done = [self.done_buf[i] for i in mb_indices] 71 | 72 | return mb_obs, mb_rew, mb_act, mb_obs2, mb_done 73 | 74 | def __len__(self): 75 | return len(self.obs_buf) 76 | 77 | def test_agent(env_test, agent_op, num_games=10): 78 | ''' 79 | Test an agent 'agent_op', 'num_games' times 80 | Return mean and std 81 | ''' 82 | games_r = [] 83 | for _ in range(num_games): 84 | d = False 85 | game_r = 0 86 | o = env_test.reset() 87 | 88 | while not d: 89 | a_s = agent_op(o) 90 | o, r, d, _ = env_test.step(a_s) 91 | game_r += r 92 | 93 | games_r.append(game_r) 94 | return np.mean(games_r), np.std(games_r) 95 | 96 | 97 | 98 | def DDPG(env_name, hidden_sizes=[32], ac_lr=1e-2, cr_lr=1e-2, num_epochs=2000, buffer_size=5000, discount=0.99, render_cycle=100, mean_summaries_steps=1000, 99 | batch_size=128, min_buffer_size=5000, tau=0.005): 100 | 101 | # Create an environment for training 102 | env = gym.make(env_name) 103 | # Create an environment for testing the actor 104 | env_test = gym.make(env_name) 105 | 106 | tf.reset_default_graph() 107 | 108 | obs_dim = env.observation_space.shape 109 | act_dim = env.action_space.shape 110 | print('-- Observation space:', obs_dim, ' Action space:', act_dim, '--') 111 | 112 | # Create some placeholders 113 | obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs') 114 | act_ph = tf.placeholder(shape=(None, act_dim[0]), dtype=tf.float32, name='act') 115 | y_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='y') 116 | 117 | # Create an online deterministic actor-critic 118 | with tf.variable_scope('online'): 119 | p_onl, qd_onl, qa_onl = deterministic_actor_critic(obs_ph, act_ph, hidden_sizes, act_dim[0], np.max(env.action_space.high)) 120 | # and a target one 121 | with tf.variable_scope('target'): 122 | _, qd_tar, _ = deterministic_actor_critic(obs_ph, act_ph, hidden_sizes, act_dim[0], np.max(env.action_space.high)) 123 | 124 | def variables_in_scope(scope): 125 | ''' 126 | Retrieve all the variables in the scope 'scope' 127 | ''' 128 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope) 129 | 130 | # Copy all the online variables to the target networks i.e. 
target = online 131 | # Needed only at the beginning 132 | init_target = [target_var.assign(online_var) for target_var, online_var in zip(variables_in_scope('target'), variables_in_scope('online'))] 133 | init_target_op = tf.group(*init_target) 134 | 135 | # Soft update 136 | update_target = [target_var.assign(tau*online_var + (1-tau)*target_var) for target_var, online_var in zip(variables_in_scope('target'), variables_in_scope('online'))] 137 | update_target_op = tf.group(*update_target) 138 | 139 | # Critic loss (MSE) 140 | q_loss = tf.reduce_mean((qa_onl - y_ph)**2) 141 | # Actor loss 142 | p_loss = -tf.reduce_mean(qd_onl) 143 | 144 | # Optimize the critic 145 | q_opt = tf.train.AdamOptimizer(cr_lr).minimize(q_loss) 146 | # Optimize the actor 147 | p_opt = tf.train.AdamOptimizer(ac_lr).minimize(p_loss, var_list=variables_in_scope('online/p_mlp')) 148 | 149 | 150 | def agent_op(o): 151 | a = np.squeeze(sess.run(p_onl, feed_dict={obs_ph:[o]})) 152 | return np.clip(a, env.action_space.low, env.action_space.high) 153 | 154 | def agent_noisy_op(o, scale): 155 | action = agent_op(o) 156 | noisy_action = action + np.random.normal(loc=0.0, scale=scale, size=action.shape) 157 | return np.clip(noisy_action, env.action_space.low, env.action_space.high) 158 | 159 | 160 | # Time 161 | now = datetime.now() 162 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, int(now.second)) 163 | print('Time:', clock_time) 164 | 165 | 166 | # Set TensorBoard 167 | tf.summary.scalar('loss/q', q_loss) 168 | tf.summary.scalar('loss/p', p_loss) 169 | scalar_summary = tf.summary.merge_all() 170 | 171 | hyp_str = '-aclr_'+str(ac_lr)+'-crlr_'+str(cr_lr)+'-tau_'+str(tau) 172 | 173 | file_writer = tf.summary.FileWriter('log_dir/'+env_name+'/DDPG_'+clock_time+'_'+hyp_str, tf.get_default_graph()) 174 | 175 | # Create a session and initialize the variables 176 | sess = tf.Session() 177 | sess.run(tf.global_variables_initializer()) 178 | sess.run(init_target_op) 179 | 180 | # Some useful variables.. 
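# Summary of the training loop below (descriptive comments only, using the variables defined above):
# - until the buffer holds min_buffer_size transitions, actions are sampled randomly; afterwards agent_noisy_op adds Gaussian exploration noise
# - each transition (obs, rew, act, obs2, done) is stored in the replay buffer
# - once the buffer is large enough, every step samples a minibatch and performs one DDPG update:
#     y = rew + discount * (1 - done) * Q_target(obs2, mu_target(obs2))        (computed from qd_tar)
#     the critic minimizes the MSE between Q_online(obs, act) and y            (q_loss / q_opt)
#     the actor maximizes Q_online(obs, mu(obs)), i.e. minimizes p_loss, updating only the 'online/p_mlp' weights
#     the target networks are moved towards the online ones with the soft update of coefficient tau (update_target_op)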
181 | render_the_game = False 182 | step_count = 0 183 | last_q_update_loss = [] 184 | last_p_update_loss = [] 185 | ep_time = current_milli_time() 186 | batch_rew = [] 187 | 188 | # Reset the environment 189 | obs = env.reset() 190 | # Initialize the buffer 191 | buffer = ExperiencedBuffer(buffer_size) 192 | 193 | 194 | for ep in range(num_epochs): 195 | g_rew = 0 196 | done = False 197 | 198 | while not done: 199 | # If not gathered enough experience yet, act randomly 200 | if len(buffer) < min_buffer_size: 201 | act = env.action_space.sample() 202 | else: 203 | act = agent_noisy_op(obs, 0.1) 204 | 205 | # Take a step in the environment 206 | obs2, rew, done, _ = env.step(act) 207 | 208 | if render_the_game: 209 | env.render() 210 | 211 | # Add the transition in the buffer 212 | buffer.add(obs.copy(), rew, act, obs2.copy(), done) 213 | 214 | obs = obs2 215 | g_rew += rew 216 | step_count += 1 217 | 218 | if len(buffer) > min_buffer_size: 219 | # sample a mini batch from the buffer 220 | mb_obs, mb_rew, mb_act, mb_obs2, mb_done = buffer.sample_minibatch(batch_size) 221 | 222 | # Compute the target values 223 | q_target_mb = sess.run(qd_tar, feed_dict={obs_ph:mb_obs2}) 224 | y_r = np.array(mb_rew) + discount*(1-np.array(mb_done))*q_target_mb 225 | 226 | # optimize the critic 227 | train_summary, _, q_train_loss = sess.run([scalar_summary, q_opt, q_loss], feed_dict={obs_ph:mb_obs, y_ph:y_r, act_ph: mb_act}) 228 | 229 | # optimize the actor 230 | _, p_train_loss = sess.run([p_opt, p_loss], feed_dict={obs_ph:mb_obs}) 231 | 232 | # summaries.. 233 | file_writer.add_summary(train_summary, step_count) 234 | last_q_update_loss.append(q_train_loss) 235 | last_p_update_loss.append(p_train_loss) 236 | 237 | # Soft update of the target networks 238 | sess.run(update_target_op) 239 | 240 | # some 'mean' summaries to plot more smooth functions 241 | if step_count % mean_summaries_steps == 0: 242 | summary = tf.Summary() 243 | summary.value.add(tag='loss/mean_q', simple_value=np.mean(last_q_update_loss)) 244 | summary.value.add(tag='loss/mean_p', simple_value=np.mean(last_p_update_loss)) 245 | file_writer.add_summary(summary, step_count) 246 | file_writer.flush() 247 | 248 | last_q_update_loss = [] 249 | last_p_update_loss = [] 250 | 251 | 252 | if done: 253 | obs = env.reset() 254 | batch_rew.append(g_rew) 255 | g_rew, render_the_game = 0, False 256 | 257 | # Test the actor every 10 epochs 258 | if ep % 10 == 0: 259 | test_mn_rw, test_std_rw = test_agent(env_test, agent_op) 260 | 261 | summary = tf.Summary() 262 | summary.value.add(tag='test/reward', simple_value=test_mn_rw) 263 | file_writer.add_summary(summary, step_count) 264 | file_writer.flush() 265 | 266 | ep_sec_time = int((current_milli_time()-ep_time) / 1000) 267 | print('Ep:%4d Rew:%4.2f -- Step:%5d -- Test:%4.2f %4.2f -- Time:%d' % (ep,np.mean(batch_rew), step_count, test_mn_rw, test_std_rw, ep_sec_time)) 268 | 269 | ep_time = current_milli_time() 270 | batch_rew = [] 271 | 272 | if ep % render_cycle == 0: 273 | render_the_game = True 274 | 275 | # close everything 276 | file_writer.close() 277 | env.close() 278 | env_test.close() 279 | 280 | 281 | if __name__ == '__main__': 282 | DDPG('BipedalWalker-v2', hidden_sizes=[64,64], ac_lr=3e-4, cr_lr=4e-4, buffer_size=200000, mean_summaries_steps=100, batch_size=64, 283 | min_buffer_size=10000, tau=0.003) 284 | 285 | -------------------------------------------------------------------------------- /Chapter08/TD3.py: -------------------------------------------------------------------------------- 1 
| import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from datetime import datetime 5 | from collections import deque 6 | import time 7 | 8 | current_milli_time = lambda: int(round(time.time() * 1000)) 9 | 10 | def mlp(x, hidden_layers, output_layer, activation=tf.nn.relu, last_activation=None): 11 | ''' 12 | Multi-layer perceptron 13 | ''' 14 | for l in hidden_layers: 15 | x = tf.layers.dense(x, units=l, activation=activation) 16 | return tf.layers.dense(x, units=output_layer, activation=last_activation) 17 | 18 | # CHANGED FROM DDPG! 19 | def deterministic_actor_double_critic(x, a, hidden_sizes, act_dim, max_act=1): 20 | ''' 21 | Deterministic Actor-Critic 22 | ''' 23 | # Actor 24 | with tf.variable_scope('p_mlp'): 25 | p_means = max_act * mlp(x, hidden_sizes, act_dim, last_activation=tf.tanh) 26 | 27 | # First critic 28 | with tf.variable_scope('q1_mlp'): 29 | q1_d = mlp(tf.concat([x,p_means], axis=-1), hidden_sizes, 1, last_activation=None) 30 | 31 | with tf.variable_scope('q1_mlp', reuse=True): # Use the weights of the mlp just defined 32 | q1_a = mlp(tf.concat([x,a], axis=-1), hidden_sizes, 1, last_activation=None) 33 | 34 | # Second critic 35 | with tf.variable_scope('q2_mlp'): 36 | q2_d = mlp(tf.concat([x,p_means], axis=-1), hidden_sizes, 1, last_activation=None) 37 | with tf.variable_scope('q2_mlp', reuse=True): 38 | q2_a = mlp(tf.concat([x,a], axis=-1), hidden_sizes, 1, last_activation=None) 39 | 40 | return p_means, tf.squeeze(q1_d), tf.squeeze(q1_a), tf.squeeze(q2_d), tf.squeeze(q2_a) 41 | 42 | class ExperiencedBuffer(): 43 | ''' 44 | Experienced buffer 45 | ''' 46 | def __init__(self, buffer_size): 47 | # Contains up to 'buffer_size' experience 48 | self.obs_buf = deque(maxlen=buffer_size) 49 | self.rew_buf = deque(maxlen=buffer_size) 50 | self.act_buf = deque(maxlen=buffer_size) 51 | self.obs2_buf = deque(maxlen=buffer_size) 52 | self.done_buf = deque(maxlen=buffer_size) 53 | 54 | 55 | def add(self, obs, rew, act, obs2, done): 56 | ''' 57 | Add a new transition to the buffers 58 | ''' 59 | self.obs_buf.append(obs) 60 | self.rew_buf.append(rew) 61 | self.act_buf.append(act) 62 | self.obs2_buf.append(obs2) 63 | self.done_buf.append(done) 64 | 65 | 66 | def sample_minibatch(self, batch_size): 67 | ''' 68 | Sample a mini-batch of size 'batch_size' 69 | ''' 70 | mb_indices = np.random.randint(len(self.obs_buf), size=batch_size) 71 | 72 | mb_obs = [self.obs_buf[i] for i in mb_indices] 73 | mb_rew = [self.rew_buf[i] for i in mb_indices] 74 | mb_act = [self.act_buf[i] for i in mb_indices] 75 | mb_obs2 = [self.obs2_buf[i] for i in mb_indices] 76 | mb_done = [self.done_buf[i] for i in mb_indices] 77 | 78 | return mb_obs, mb_rew, mb_act, mb_obs2, mb_done 79 | 80 | def __len__(self): 81 | return len(self.obs_buf) 82 | 83 | def test_agent(env_test, agent_op, num_games=10): 84 | ''' 85 | Test an agent 'agent_op', 'num_games' times 86 | Return mean and std 87 | ''' 88 | games_r = [] 89 | 90 | for _ in range(num_games): 91 | d = False 92 | game_r = 0 93 | o = env_test.reset() 94 | 95 | while not d: 96 | a_s = agent_op(o) 97 | o, r, d, _ = env_test.step(a_s) 98 | 99 | game_r += r 100 | 101 | games_r.append(game_r) 102 | 103 | return np.mean(games_r), np.std(games_r) 104 | 105 | 106 | 107 | def TD3(env_name, hidden_sizes=[32], ac_lr=1e-2, cr_lr=1e-2, num_epochs=2000, buffer_size=5000, discount=0.99, render_cycle=10000, mean_summaries_steps=1000, 108 | batch_size=128, min_buffer_size=5000, tau=0.005, target_noise=0.2, expl_noise=0.1, policy_update_freq=2): 109 | 110 | # Create an 
environment for training 111 | env = gym.make(env_name) 112 | # Create an environment for testing the actor 113 | env_test = gym.make(env_name) 114 | 115 | tf.reset_default_graph() 116 | 117 | obs_dim = env.observation_space.shape 118 | act_dim = env.action_space.shape 119 | print('-- Observation space:', obs_dim, ' Action space:', act_dim, '--') 120 | 121 | # Create some placeholders 122 | obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs') 123 | act_ph = tf.placeholder(shape=(None, act_dim[0]), dtype=tf.float32, name='act') 124 | y_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='y') 125 | 126 | # Create an online deterministic actor and a double critic 127 | with tf.variable_scope('online'): 128 | p_onl, qd1_onl, qa1_onl, _, qa2_onl = deterministic_actor_double_critic(obs_ph, act_ph, hidden_sizes, act_dim[0], np.max(env.action_space.high)) 129 | 130 | # and a target actor and double critic 131 | with tf.variable_scope('target'): 132 | p_tar, _, qa1_tar, _, qa2_tar = deterministic_actor_double_critic(obs_ph, act_ph, hidden_sizes, act_dim[0], np.max(env.action_space.high)) 133 | 134 | def variables_in_scope(scope): 135 | ''' 136 | Retrieve all the variables in the scope 'scope' 137 | ''' 138 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope) 139 | 140 | # Copy all the online variables to the target networks i.e. target = online 141 | # Needed only at the beginning 142 | init_target = [target_var.assign(online_var) for target_var, online_var in zip(variables_in_scope('target'), variables_in_scope('online'))] 143 | init_target_op = tf.group(*init_target) 144 | 145 | # Soft update 146 | update_target = [target_var.assign(tau*online_var + (1-tau)*target_var) for target_var, online_var in zip(variables_in_scope('target'), variables_in_scope('online'))] 147 | update_target_op = tf.group(*update_target) 148 | 149 | # Critics loss (MSE) 150 | q1_loss = tf.reduce_mean((qa1_onl - y_ph)**2) 151 | q2_loss = tf.reduce_mean((qa2_onl - y_ph)**2) 152 | 153 | # Actor loss 154 | p_loss = -tf.reduce_mean(qd1_onl) 155 | 156 | # Optimize the critics 157 | q1_opt = tf.train.AdamOptimizer(cr_lr).minimize(q1_loss) 158 | q2_opt = tf.train.AdamOptimizer(cr_lr).minimize(q2_loss) 159 | 160 | # Optimize the actor 161 | p_opt = tf.train.AdamOptimizer(ac_lr).minimize(p_loss, var_list=variables_in_scope('online/p_mlp')) 162 | 163 | 164 | def add_normal_noise(x, scale, low_lim=-0.5, high_lim=0.5): 165 | return x + np.clip(np.random.normal(loc=0.0, scale=scale, size=x.shape), low_lim, high_lim) 166 | 167 | def agent_op(o): 168 | ac = np.squeeze(sess.run(p_onl, feed_dict={obs_ph:[o]})) 169 | return np.clip(ac, env.action_space.low, env.action_space.high) 170 | 171 | def agent_noisy_op(o, scale): 172 | ac = agent_op(o) 173 | return np.clip(add_normal_noise(ac, scale, env.action_space.low, env.action_space.high), env.action_space.low, env.action_space.high) 174 | 175 | 176 | # Time 177 | now = datetime.now() 178 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, int(now.second)) 179 | print('Time:', clock_time) 180 | 181 | # Set TensorBoard 182 | tf.summary.scalar('loss/q1', q1_loss) 183 | tf.summary.scalar('loss/q2', q2_loss) 184 | tf.summary.scalar('loss/p', p_loss) 185 | scalar_summary = tf.summary.merge_all() 186 | 187 | hyp_str = '-aclr_'+str(ac_lr)+'-crlr_'+str(cr_lr)+'-tau_'+str(tau) 188 | 189 | file_writer = tf.summary.FileWriter('log_dir/'+env_name+'/TD3_'+clock_time+'_'+hyp_str, tf.get_default_graph()) 190 | 191 | # Create a session and initialize 
the variables 192 | sess = tf.Session() 193 | sess.run(tf.global_variables_initializer()) 194 | sess.run(init_target_op) 195 | 196 | # Some useful variables.. 197 | render_the_game = False 198 | step_count = 0 199 | last_q1_update_loss = [] 200 | last_q2_update_loss = [] 201 | last_p_update_loss = [] 202 | ep_time = current_milli_time() 203 | batch_rew = [] 204 | 205 | # Reset the environment 206 | obs = env.reset() 207 | # Initialize the buffer 208 | buffer = ExperiencedBuffer(buffer_size) 209 | 210 | 211 | for ep in range(num_epochs): 212 | g_rew = 0 213 | done = False 214 | 215 | while not done: 216 | # If not gathered enough experience yet, act randomly 217 | if len(buffer) < min_buffer_size: 218 | act = env.action_space.sample() 219 | else: 220 | act = agent_noisy_op(obs, expl_noise) 221 | 222 | # Take a step in the environment 223 | obs2, rew, done, _ = env.step(act) 224 | 225 | if render_the_game: 226 | env.render() 227 | 228 | # Add the transition in the buffer 229 | buffer.add(obs.copy(), rew, act, obs2.copy(), done) 230 | 231 | obs = obs2 232 | g_rew += rew 233 | step_count += 1 234 | 235 | if len(buffer) > min_buffer_size: 236 | # sample a mini batch from the buffer 237 | mb_obs, mb_rew, mb_act, mb_obs2, mb_done = buffer.sample_minibatch(batch_size) 238 | 239 | 240 | double_actions = sess.run(p_tar, feed_dict={obs_ph:mb_obs2}) 241 | # Target regularization 242 | double_noisy_actions = np.clip(add_normal_noise(double_actions, target_noise), env.action_space.low, env.action_space.high) 243 | 244 | # Clipped Double Q-learning 245 | q1_target_mb, q2_target_mb = sess.run([qa1_tar,qa2_tar], feed_dict={obs_ph:mb_obs2, act_ph:double_noisy_actions}) 246 | q_target_mb = np.min([q1_target_mb, q2_target_mb], axis=0) 247 | assert(len(q1_target_mb) == len(q_target_mb)) 248 | 249 | # Compute the target values 250 | y_r = np.array(mb_rew) + discount*(1-np.array(mb_done))*q_target_mb 251 | 252 | # Optimize the critics 253 | train_summary, _, q1_train_loss, _, q2_train_loss = sess.run([scalar_summary, q1_opt, q1_loss, q2_opt, q2_loss], feed_dict={obs_ph:mb_obs, y_ph:y_r, act_ph: mb_act}) 254 | 255 | # Delayed policy update 256 | if step_count % policy_update_freq == 0: 257 | # Optimize the policy 258 | _, p_train_loss = sess.run([p_opt, p_loss], feed_dict={obs_ph:mb_obs}) 259 | 260 | # Soft update of the target networks 261 | sess.run(update_target_op) 262 | 263 | file_writer.add_summary(train_summary, step_count) 264 | last_q1_update_loss.append(q1_train_loss) 265 | last_q2_update_loss.append(q2_train_loss) 266 | last_p_update_loss.append(p_train_loss) 267 | 268 | 269 | # some 'mean' summaries to plot more smooth functions 270 | if step_count % mean_summaries_steps == 0: 271 | summary = tf.Summary() 272 | summary.value.add(tag='loss/mean_q1', simple_value=np.mean(last_q1_update_loss)) 273 | summary.value.add(tag='loss/mean_q2', simple_value=np.mean(last_q2_update_loss)) 274 | summary.value.add(tag='loss/mean_p', simple_value=np.mean(last_p_update_loss)) 275 | file_writer.add_summary(summary, step_count) 276 | file_writer.flush() 277 | 278 | last_q1_update_loss = [] 279 | last_q2_update_loss = [] 280 | last_p_update_loss = [] 281 | 282 | 283 | if done: 284 | obs = env.reset() 285 | batch_rew.append(g_rew) 286 | g_rew, render_the_game = 0, False 287 | 288 | # Test the actor every 10 epochs 289 | if ep % 10 == 0: 290 | test_mn_rw, test_std_rw = test_agent(env_test, agent_op) 291 | summary = tf.Summary() 292 | summary.value.add(tag='test/reward', simple_value=test_mn_rw) 293 | 
file_writer.add_summary(summary, step_count) 294 | file_writer.flush() 295 | 296 | ep_sec_time = int((current_milli_time()-ep_time) / 1000) 297 | print('Ep:%4d Rew:%4.2f -- Step:%5d -- Test:%4.2f %4.2f -- Time:%d' % (ep,np.mean(batch_rew), step_count, test_mn_rw, test_std_rw, ep_sec_time)) 298 | 299 | ep_time = current_milli_time() 300 | batch_rew = [] 301 | 302 | if ep % render_cycle == 0: 303 | render_the_game = True 304 | 305 | # close everything 306 | file_writer.close() 307 | env.close() 308 | env_test.close() 309 | 310 | 311 | if __name__ == '__main__': 312 | TD3('BipedalWalker-v2', hidden_sizes=[64,64], ac_lr=4e-4, cr_lr=4e-4, buffer_size=200000, mean_summaries_steps=100, batch_size=64, 313 | min_buffer_size=10000, tau=0.005, policy_update_freq=2, target_noise=0.1) -------------------------------------------------------------------------------- /Chapter10/DAgger.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from datetime import datetime 4 | import time 5 | from ple.games.flappybird import FlappyBird 6 | from ple import PLE 7 | 8 | 9 | def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activation=None): 10 | ''' 11 | Multi-layer perceptron 12 | ''' 13 | for l in hidden_layers: 14 | x = tf.layers.dense(x, units=l, activation=activation) 15 | return tf.layers.dense(x, units=output_layer, activation=last_activation) 16 | 17 | def flappy_to_list(fd): 18 | ''' 19 | Return the state dictionary as a list 20 | ''' 21 | return fd['player_y'], fd['player_vel'], fd['next_pipe_dist_to_player'], fd['next_pipe_top_y'], \ 22 | fd['next_pipe_bottom_y'], fd['next_next_pipe_dist_to_player'], fd['next_next_pipe_top_y'], \ 23 | fd['next_next_pipe_bottom_y'] 24 | 25 | def flappy_game_state(bol): 26 | ''' 27 | Normalize the game state 28 | ''' 29 | stat = flappy_to_list(bol.getGameState()) 30 | stat = (np.array(stat, dtype=np.float32) / 300.0) - 0.5 31 | return stat 32 | 33 | def no_op(env, n_act=5): 34 | for _ in range(n_act): 35 | env.act(119 if np.random.randn() < 0.5 else None) 36 | 37 | 38 | def expert(): 39 | ''' 40 | Load the computational graph and pretarined weights of the expert 41 | ''' 42 | graph = tf.get_default_graph() 43 | 44 | sess_expert = tf.Session(graph=graph) 45 | 46 | saver = tf.train.import_meta_graph('expert/model.ckpt.meta') 47 | saver.restore(sess_expert,tf.train.latest_checkpoint('expert/')) 48 | 49 | p_argmax = graph.get_tensor_by_name('actor_nn/max_act:0') 50 | obs_ph = graph.get_tensor_by_name('obs:0') 51 | 52 | def expert_policy(state): 53 | act = sess_expert.run(p_argmax, feed_dict={obs_ph:[state]}) 54 | return np.squeeze(act) 55 | 56 | return expert_policy 57 | 58 | def test_agent(policy, file_writer=None, test_games=10, step=0): 59 | game = FlappyBird() 60 | env = PLE(game, fps=30, display_screen=False) 61 | env.init() 62 | 63 | test_rewards = [] 64 | for _ in range(test_games): 65 | env.reset_game() 66 | no_op(env) 67 | 68 | game_rew = 0 69 | 70 | while not env.game_over(): 71 | 72 | state = flappy_game_state(env) 73 | 74 | action = 119 if policy(state) == 1 else None 75 | 76 | for _ in range(2): 77 | game_rew += env.act(action) 78 | 79 | test_rewards.append(game_rew) 80 | 81 | if file_writer is not None: 82 | summary = tf.Summary() 83 | summary.value.add(tag='test_performance', simple_value=game_rew) 84 | file_writer.add_summary(summary, step) 85 | file_writer.flush() 86 | 87 | return test_rewards 88 | 89 | 90 | def DAgger(hidden_sizes=[32,32], 
dagger_iterations=20, p_lr=1e-3, step_iterations=1000, batch_size=128, train_epochs=20, obs_dim=8, act_dim=2): 91 | 92 | tf.reset_default_graph() 93 | 94 | ############################## EXPERT ############################### 95 | # load the expert and return a function that predicts the expert action given a state 96 | expert_policy = expert() 97 | print('Expert performance: ', np.mean(test_agent(expert_policy))) 98 | 99 | 100 | #################### LEARNER COMPUTATIONAL GRAPH #################### 101 | obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32, name='obs') 102 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act') 103 | 104 | # Multi-layer perceptron 105 | p_logits = mlp(obs_ph, hidden_sizes, act_dim, tf.nn.relu, last_activation=None) 106 | 107 | act_max = tf.math.argmax(p_logits, axis=1) 108 | act_onehot = tf.one_hot(act_ph, depth=act_dim) 109 | 110 | # softmax cross entropy loss 111 | p_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=act_onehot, logits=p_logits)) 112 | # Adam optimizer 113 | p_opt = tf.train.AdamOptimizer(p_lr).minimize(p_loss) 114 | 115 | 116 | now = datetime.now() 117 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second) 118 | file_writer = tf.summary.FileWriter('log_dir/FlappyBird/DAgger_'+clock_time, tf.get_default_graph()) 119 | 120 | sess = tf.Session() 121 | sess.run(tf.global_variables_initializer()) 122 | 123 | def learner_policy(state): 124 | action = sess.run(act_max, feed_dict={obs_ph:[state]}) 125 | return np.squeeze(action) 126 | 127 | X = [] 128 | y = [] 129 | 130 | env = FlappyBird() 131 | 132 | env = PLE(env, fps=30, display_screen=False) 133 | env.init() 134 | 135 | #################### DAgger iterations #################### 136 | 137 | for it in range(dagger_iterations): 138 | sess.run(tf.global_variables_initializer()) 139 | env.reset_game() 140 | no_op(env) 141 | 142 | game_rew = 0 143 | rewards = [] 144 | 145 | ###################### Populate the dataset ##################### 146 | 147 | for _ in range(step_iterations): 148 | # get the current state from the environment 149 | state = flappy_game_state(env) 150 | 151 | # As the iterations continue, use more and more actions sampled from the learner 152 | if np.random.rand() < (1 - it/5): 153 | action = expert_policy(state) 154 | else: 155 | action = learner_policy(state) 156 | 157 | action = 119 if action == 1 else None 158 | 159 | rew = env.act(action) 160 | rew += env.act(action) 161 | 162 | # Add the state and the expert action to the dataset 163 | X.append(state) 164 | y.append(expert_policy(state)) 165 | 166 | game_rew += rew 167 | 168 | # Whenever the game stops, reset the environment and initialize the variables 169 | if env.game_over(): 170 | env.reset_game() 171 | no_op(env) 172 | 173 | rewards.append(game_rew) 174 | game_rew = 0 175 | 176 | ##################### Training ##################### 177 | 178 | # Calculate the number of minibatches 179 | n_batches = int(np.floor(len(X)/batch_size)) 180 | 181 | # shuffle the dataset 182 | shuffle = np.arange(len(X)) 183 | np.random.shuffle(shuffle) 184 | 185 | 186 | shuffled_X = np.array(X)[shuffle] 187 | shuffled_y = np.array(y)[shuffle] 188 | 189 | 190 | for _ in range(train_epochs): 191 | ep_loss = [] 192 | # Train the model on each minibatch in the dataset 193 | for b in range(n_batches): 194 | p_start = b*batch_size 195 | 196 | # mini-batch training 197 | tr_loss, _ = sess.run([p_loss, p_opt], feed_dict={ 198 | obs_ph:shuffled_X[p_start:p_start+batch_size], 199
| act_ph:shuffled_y[p_start:p_start+batch_size]}) 200 | 201 | ep_loss.append(tr_loss) 202 | 203 | agent_tests = test_agent(learner_policy, file_writer, step=len(X)) 204 | 205 | print('Ep:', it, np.mean(ep_loss), 'Test:', np.mean(agent_tests)) 206 | 207 | 208 | 209 | 210 | if __name__ == "__main__": 211 | DAgger(hidden_sizes=[16,16], dagger_iterations=10, p_lr=1e-4, step_iterations=100, batch_size=50, train_epochs=2000) -------------------------------------------------------------------------------- /Chapter10/expert/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model.ckpt" 2 | all_model_checkpoint_paths: "model.ckpt" 3 | -------------------------------------------------------------------------------- /Chapter10/expert/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Reinforcement-Learning-Algorithms-with-Python/d144d314b3b5b91e9e8c37d4e0970af5d8379d1b/Chapter10/expert/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /Chapter10/expert/model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Reinforcement-Learning-Algorithms-with-Python/d144d314b3b5b91e9e8c37d4e0970af5d8379d1b/Chapter10/expert/model.ckpt.index -------------------------------------------------------------------------------- /Chapter10/expert/model.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Reinforcement-Learning-Algorithms-with-Python/d144d314b3b5b91e9e8c37d4e0970af5d8379d1b/Chapter10/expert/model.ckpt.meta -------------------------------------------------------------------------------- /Chapter11/ES.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from datetime import datetime 4 | import time 5 | import gym 6 | 7 | import multiprocessing as mp 8 | import scipy.stats as ss 9 | import contextlib 10 | import numpy as np 11 | 12 | @contextlib.contextmanager 13 | def temp_seed(seed): 14 | state = np.random.get_state() 15 | np.random.seed(seed) 16 | try: 17 | yield 18 | finally: 19 | np.random.set_state(state) 20 | 21 | def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activation=None): 22 | ''' 23 | Multi-layer perceptron 24 | ''' 25 | for l in hidden_layers: 26 | x = tf.layers.dense(x, units=l, activation=activation) 27 | 28 | return tf.layers.dense(x, units=output_layer, activation=last_activation) 29 | 30 | 31 | def test_agent(env_test, agent_op, num_games=1): 32 | ''' 33 | Test an agent 'agent_op', 'num_games' times 34 | Return mean and std 35 | ''' 36 | games_r = [] 37 | steps = 0 38 | for _ in range(num_games): 39 | d = False 40 | game_r = 0 41 | o = env_test.reset() 42 | 43 | while not d: 44 | a_s = agent_op(o) 45 | o, r, d, _ = env_test.step(a_s) 46 | game_r += r 47 | steps += 1 48 | 49 | games_r.append(game_r) 50 | return games_r, steps 51 | 52 | 53 | def worker(env_name, initial_seed, hidden_sizes, lr, std_noise, indiv_per_worker, worker_name, params_queue, output_queue): 54 | 55 | env = gym.make(env_name) 56 | obs_dim = env.observation_space.shape[0] 57 | act_dim = env.action_space.shape[0] 58 | 59 | import tensorflow as tf 60 | 61 | # set an initial seed common to all the workers 
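# Sharing the same initial seed (and, later, the per-perturbation seeds pushed on output_queue) lets every
# worker regenerate the other workers' noise vectors locally with temp_seed, so only scalar seeds and returns
# travel between processes instead of full parameter-sized perturbations.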
62 | tf.random.set_random_seed(initial_seed) 63 | np.random.seed(initial_seed) 64 | 65 | 66 | with tf.device("/cpu:" + worker_name): 67 | 68 | obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32, name='obs_ph') 69 | new_weights_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='new_weights_ph') 70 | 71 | def variables_in_scope(scope): 72 | # get all trainable variables in 'scope' 73 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope) 74 | 75 | with tf.variable_scope('nn_' + worker_name): 76 | acts = mlp(obs_ph, hidden_sizes, act_dim, tf.tanh, last_activation=tf.tanh) 77 | 78 | agent_variables = variables_in_scope('nn_' + worker_name) 79 | agent_variables_flatten = flatten_list(agent_variables) 80 | 81 | # Update the agent parameters with new weights new_weights_ph 82 | it_v1 = tf.Variable(0, trainable=False) 83 | update_weights = [] 84 | for a_v in agent_variables: 85 | upd_rsh = tf.reshape(new_weights_ph[it_v1 : it_v1+tf.reduce_prod(a_v.shape)], shape=a_v.shape) 86 | update_weights.append(a_v.assign(upd_rsh)) 87 | it_v1 += tf.reduce_prod(a_v.shape) 88 | 89 | 90 | # Reshape the new_weights_ph following the neural network shape 91 | it_v2 = tf.Variable(0, trainable=False) 92 | vars_grads_list = [] 93 | for a_v in agent_variables: 94 | vars_grads_list.append(tf.reshape(new_weights_ph[it_v2 : it_v2+tf.reduce_prod(a_v.shape)], shape=a_v.shape)) 95 | it_v2 += tf.reduce_prod(a_v.shape) 96 | 97 | # Create the optimizer 98 | opt = tf.train.AdamOptimizer(lr) 99 | # Apply the "gradients" using Adam 100 | apply_g = opt.apply_gradients([(g, v) for g, v in zip(vars_grads_list, agent_variables)]) 101 | 102 | def agent_op(o): 103 | a = np.squeeze(sess.run(acts, feed_dict={obs_ph:[o]})) 104 | return np.clip(a, env.action_space.low, env.action_space.high) 105 | 106 | 107 | def evaluation_on_noise(noise): 108 | ''' 109 | Evaluate the agent with the noise 110 | ''' 111 | # Get the original weights that will be restored after the evaluation 112 | original_weights = sess.run(agent_variables_flatten) 113 | 114 | # Update the weights of the agent/individual by adding the extra noise noise*STD_NOISE 115 | sess.run(update_weights, feed_dict={new_weights_ph:original_weights + noise*std_noise}) 116 | 117 | # Test the agent with the new weights 118 | rewards, steps = test_agent(env, agent_op) 119 | 120 | # Restore the original weights 121 | sess.run(update_weights, feed_dict={new_weights_ph:original_weights}) 122 | 123 | return np.mean(rewards), steps 124 | 125 | config_proto = tf.ConfigProto(device_count={'CPU': 4}, allow_soft_placement=True) 126 | sess = tf.Session(config=config_proto) 127 | sess.run(tf.global_variables_initializer()) 128 | 129 | 130 | agent_flatten_shape = sess.run(agent_variables_flatten).shape 131 | 132 | while True: 133 | 134 | for _ in range(indiv_per_worker): 135 | seed = np.random.randint(1e7) 136 | 137 | with temp_seed(seed): 138 | # sample, for each weight of the agent, from a normal distribution 139 | sampled_noise = np.random.normal(size=agent_flatten_shape) 140 | 141 | # Mirrored sampling 142 | pos_rew, stp1 = evaluation_on_noise(sampled_noise) 143 | neg_rew, stp2 = evaluation_on_noise(-sampled_noise) 144 | 145 | # Put the returns and seeds on the queue 146 | # Note that here we are just sending the seed (a scalar value), not the complete perturbation sampled_noise 147 | output_queue.put([[pos_rew, neg_rew], seed, stp1+stp2]) 148 | 149 | # Get all the returns and seed from each other worker 150 | batch_return, batch_seed = params_queue.get() 151 | 152 
| batch_noise = [] 153 | for seed in batch_seed: 154 | 155 | # reconstruct the perturbations from the seed 156 | with temp_seed(seed): 157 | sampled_noise = np.random.normal(size=agent_flatten_shape) 158 | 159 | batch_noise.append(sampled_noise) 160 | batch_noise.append(-sampled_noise) 161 | 162 | 163 | # Compute the stochastic gradient estimate 164 | vars_grads = np.zeros(agent_flatten_shape) 165 | for n, r in zip(batch_noise, batch_return): 166 | vars_grads += n * r 167 | vars_grads /= len(batch_noise) * std_noise 168 | 169 | # run Adam optimization on the estimated gradient just computed 170 | sess.run(apply_g, feed_dict={new_weights_ph:-vars_grads}) 171 | 172 | 173 | def normalized_rank(rewards): 174 | ''' 175 | Rank the rewards and normalize them. 176 | ''' 177 | ranked = ss.rankdata(rewards) 178 | norm = (ranked - 1) / (len(ranked) - 1) 179 | norm -= 0.5 180 | return norm 181 | 182 | 183 | def flatten(tensor): 184 | ''' 185 | Flatten a tensor 186 | ''' 187 | return tf.reshape(tensor, shape=(-1,)) 188 | 189 | def flatten_list(tensor_list): 190 | ''' 191 | Flatten a list of tensors 192 | ''' 193 | return tf.concat([flatten(t) for t in tensor_list], axis=0) 194 | 195 | 196 | 197 | def ES(env_name, hidden_sizes=[8,8], number_iter=1000, num_workers=4, lr=0.01, indiv_per_worker=10, std_noise=0.01): 198 | 199 | 200 | initial_seed = np.random.randint(1e7) 201 | 202 | # Create a queue for the output values (single returns and seed values) 203 | output_queue = mp.Queue(maxsize=num_workers*indiv_per_worker) 204 | # Create a queue for the input parameters (batch return and batch seeds) 205 | params_queue = mp.Queue(maxsize=num_workers) 206 | 207 | 208 | now = datetime.now() 209 | clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second) 210 | hyp_str = '-numworkers_'+str(num_workers)+'-lr_'+str(lr) 211 | file_writer = tf.summary.FileWriter('log_dir/'+env_name+'/'+clock_time+'_'+hyp_str, tf.get_default_graph()) 212 | 213 | processes = [] 214 | # Create a parallel process for each worker 215 | for widx in range(num_workers): 216 | p = mp.Process(target=worker, args=(env_name, initial_seed, hidden_sizes, lr, std_noise, indiv_per_worker, str(widx), params_queue, output_queue)) 217 | p.start() 218 | processes.append(p) 219 | 220 | tot_steps = 0 221 | # Iterate over all the training iterations 222 | for n_iter in range(number_iter): 223 | 224 | batch_seed = [] 225 | batch_return = [] 226 | 227 | # Wait until enough candidate individuals are evaluated 228 | for _ in range(num_workers*indiv_per_worker): 229 | p_rews, p_seed, p_steps = output_queue.get() 230 | 231 | batch_seed.append(p_seed) 232 | batch_return.extend(p_rews) 233 | tot_steps += p_steps 234 | 235 | print('Iter: {} Reward: {:.2f}'.format(n_iter, np.mean(batch_return))) 236 | 237 | # Let's save the population's performance 238 | summary = tf.Summary() 239 | for r in batch_return: 240 | summary.value.add(tag='performance', simple_value=r) 241 | file_writer.add_summary(summary, tot_steps) 242 | file_writer.flush() 243 | 244 | # Rank and normalize the returns 245 | batch_return = normalized_rank(batch_return) 246 | 247 | # Put on the queue all the returns and seeds so that each worker can optimize the neural network 248 | for _ in range(num_workers): 249 | params_queue.put([batch_return, batch_seed]) 250 | 251 | # terminate all workers 252 | for p in processes: 253 | p.terminate() 254 | 255 | 256 | 257 | if __name__ == '__main__': 258 | ES('LunarLanderContinuous-v2', hidden_sizes=[32,32], number_iter=200, num_workers=4,
lr=0.02, indiv_per_worker=12, std_noise=0.05) 259 | -------------------------------------------------------------------------------- /Chapter12/ESBAS.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from datetime import datetime 5 | from collections import deque 6 | import time 7 | import sys 8 | 9 | 10 | gym.logger.set_level(40) 11 | 12 | current_milli_time = lambda: int(round(time.time() * 1000)) 13 | 14 | 15 | def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activation=None): 16 | ''' 17 | Multi-layer perceptron 18 | ''' 19 | for l in hidden_layers: 20 | x = tf.layers.dense(x, units=l, activation=activation) 21 | 22 | return tf.layers.dense(x, units=output_layer, activation=last_activation) 23 | 24 | class ExperienceBuffer(): 25 | ''' 26 | Experience Replay Buffer 27 | ''' 28 | def __init__(self, buffer_size): 29 | self.obs_buf = deque(maxlen=buffer_size) 30 | self.rew_buf = deque(maxlen=buffer_size) 31 | self.act_buf = deque(maxlen=buffer_size) 32 | self.obs2_buf = deque(maxlen=buffer_size) 33 | self.done_buf = deque(maxlen=buffer_size) 34 | 35 | 36 | def add(self, obs, rew, act, obs2, done): 37 | # Add a new transition to the buffers 38 | self.obs_buf.append(obs) 39 | self.rew_buf.append(rew) 40 | self.act_buf.append(act) 41 | self.obs2_buf.append(obs2) 42 | self.done_buf.append(done) 43 | 44 | 45 | def sample_minibatch(self, batch_size): 46 | # Sample a minibatch of size batch_size 47 | mb_indices = np.random.randint(len(self.obs_buf), size=batch_size) 48 | 49 | mb_obs = [self.obs_buf[i] for i in mb_indices] 50 | mb_rew = [self.rew_buf[i] for i in mb_indices] 51 | mb_act = [self.act_buf[i] for i in mb_indices] 52 | mb_obs2 = [self.obs2_buf[i] for i in mb_indices] 53 | mb_done = [self.done_buf[i] for i in mb_indices] 54 | 55 | return mb_obs, mb_rew, mb_act, mb_obs2, mb_done 56 | 57 | def __len__(self): 58 | return len(self.obs_buf) 59 | 60 | 61 | def q_target_values(mini_batch_rw, mini_batch_done, av, discounted_value): 62 | ''' 63 | Calculate the target value y for each transition 64 | ''' 65 | max_av = np.max(av, axis=1) 66 | 67 | # if episode terminate, y take value r 68 | # otherwise, q-learning step 69 | ys = [] 70 | for r, d, av in zip(mini_batch_rw, mini_batch_done, max_av): 71 | if d: 72 | ys.append(r) 73 | else: 74 | q_step = r + discounted_value * av 75 | ys.append(q_step) 76 | 77 | assert len(ys) == len(mini_batch_rw) 78 | return ys 79 | 80 | def greedy(action_values): 81 | ''' 82 | Greedy policy 83 | ''' 84 | return np.argmax(action_values) 85 | 86 | def eps_greedy(action_values, eps=0.1): 87 | ''' 88 | Eps-greedy policy 89 | ''' 90 | if np.random.uniform(0,1) < eps: 91 | # Choose a uniform random action 92 | return np.random.randint(len(action_values)) 93 | else: 94 | # Choose the greedy action 95 | return np.argmax(action_values) 96 | 97 | def test_agent(env_test, agent_op, num_games=20, summary=None): 98 | ''' 99 | Test an agent 100 | ''' 101 | games_r = [] 102 | 103 | for _ in range(num_games): 104 | d = False 105 | game_r = 0 106 | o = env_test.reset() 107 | 108 | while not d: 109 | a = greedy(np.squeeze(agent_op(o))) 110 | o, r, d, _ = env_test.step(a) 111 | 112 | game_r += r 113 | 114 | if summary is not None: 115 | summary.value.add(tag='test_performance', simple_value=game_r) 116 | 117 | games_r.append(game_r) 118 | 119 | return games_r 120 | 121 | 122 | class DQN_optimization: 123 | def __init__(self, obs_dim, act_dim, hidden_layers, lr, 
discount): 124 | self.obs_dim = obs_dim 125 | self.act_dim = act_dim 126 | self.hidden_layers = hidden_layers 127 | self.lr = lr 128 | self.discount = discount 129 | 130 | self.__build_graph() 131 | 132 | 133 | def __build_graph(self): 134 | 135 | self.g = tf.Graph() 136 | with self.g.as_default(): 137 | # Create all the placeholders 138 | self.obs_ph = tf.placeholder(shape=(None, self.obs_dim[0]), dtype=tf.float32, name='obs') 139 | self.act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act') 140 | self.y_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='y') 141 | 142 | # Create the target network 143 | with tf.variable_scope('target_network'): 144 | self.target_qv = mlp(self.obs_ph, self.hidden_layers, self.act_dim, tf.nn.relu, last_activation=None) 145 | target_vars = tf.trainable_variables() 146 | 147 | # Create the online network (i.e. the behavior policy) 148 | with tf.variable_scope('online_network'): 149 | self.online_qv = mlp(self.obs_ph, self.hidden_layers, self.act_dim, tf.nn.relu, last_activation=None) 150 | train_vars = tf.trainable_variables() 151 | 152 | # Update the target network by assigning to it the variables of the online network 153 | # Note that the target network and the online network have the same exact architecture 154 | update_target = [train_vars[i].assign(train_vars[i+len(target_vars)]) for i in range(len(train_vars) - len(target_vars))] 155 | self.update_target_op = tf.group(*update_target) 156 | 157 | # One hot encoding of the action 158 | act_onehot = tf.one_hot(self.act_ph, depth=self.act_dim) 159 | # We are interested only in the Q-values of those actions 160 | q_values = tf.reduce_sum(act_onehot * self.online_qv, axis=1) 161 | 162 | # MSE loss function 163 | self.v_loss = tf.reduce_mean((self.y_ph - q_values)**2) 164 | # Adam optimize that minimize the loss v_loss 165 | self.v_opt = tf.train.AdamOptimizer(self.lr).minimize(self.v_loss) 166 | 167 | self.__create_session() 168 | 169 | # Copy the online network in the target network 170 | self.sess.run(self.update_target_op) 171 | 172 | def __create_session(self): 173 | # open a session 174 | self.sess = tf.Session(graph=self.g) 175 | # and initialize all the variables 176 | self.sess.run(tf.global_variables_initializer()) 177 | 178 | 179 | def act(self, o): 180 | ''' 181 | Forward pass to obtain the Q-values from the online network of a single observation 182 | ''' 183 | return self.sess.run(self.online_qv, feed_dict={self.obs_ph:[o]}) 184 | 185 | def optimize(self, mb_obs, mb_rew, mb_act, mb_obs2, mb_done): 186 | mb_trg_qv = self.sess.run(self.target_qv, feed_dict={self.obs_ph:mb_obs2}) 187 | y_r = q_target_values(mb_rew, mb_done, mb_trg_qv, self.discount) 188 | 189 | # training step 190 | # optimize, compute the loss and return the TB summary 191 | self.sess.run(self.v_opt, feed_dict={self.obs_ph:mb_obs, self.y_ph:y_r, self.act_ph: mb_act}) 192 | 193 | def update_target_network(self): 194 | # run the session to update the target network and get the mean loss sumamry 195 | self.sess.run(self.update_target_op) 196 | 197 | 198 | class UCB1: 199 | def __init__(self, algos, epsilon): 200 | self.n = 0 201 | self.epsilon = epsilon 202 | self.algos = algos 203 | 204 | self.nk = np.zeros(len(algos)) 205 | self.xk = np.zeros(len(algos)) 206 | 207 | def choose_algorithm(self): 208 | # take the best algorithm following UCB1 209 | current_best = np.argmax([self.xk[i] + np.sqrt(self.epsilon * np.log(self.n) / self.nk[i]) for i in range(len(self.algos))]) 210 | for i in range(len(self.algos)): 211 | 
211 |             if self.nk[i] < 5:
212 |                 return np.random.randint(len(self.algos))
213 | 
214 |         return current_best
215 | 
216 |     def update(self, idx_algo, traj_return):
217 |         # Update the mean return of the chosen algorithm
218 |         self.xk[idx_algo] = (self.nk[idx_algo] * self.xk[idx_algo] + traj_return) / (self.nk[idx_algo] + 1)
219 |         # increase the number of trajectories run
220 |         self.nk[idx_algo] += 1
221 |         self.n += 1
222 | 
223 | 
224 | def ESBAS(env_name, hidden_sizes=[32], lr=1e-2, num_epochs=2000, buffer_size=100000, discount=0.99, render_cycle=100, update_target_net=1000,
225 |           batch_size=64, update_freq=4, min_buffer_size=5000, test_frequency=20, start_explor=1, end_explor=0.1, explor_steps=100000,
226 |           xi=1):
227 | 
228 |     # reset the default graph
229 |     tf.reset_default_graph()
230 | 
231 |     # Create the environments for both training and testing
232 |     env = gym.make(env_name)
233 |     # Add a monitor to the test env to store the videos
234 |     env_test = gym.wrappers.Monitor(gym.make(env_name), "VIDEOS/TEST_VIDEOS"+env_name+str(current_milli_time()), force=True, video_callable=lambda x: x%20==0)
235 | 
236 |     dqns = []
237 |     for l in hidden_sizes:
238 |         dqns.append(DQN_optimization(env.observation_space.shape, env.action_space.n, l, lr, discount))
239 | 
240 |     # Time
241 |     now = datetime.now()
242 |     clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, int(now.second))
243 |     print('Time:', clock_time)
244 | 
245 |     LOG_DIR = 'log_dir/'+env_name
246 |     hyp_str = "-lr_{}-upTN_{}-upF_{}-xi_{}".format(lr, update_target_net, update_freq, xi)
247 | 
248 |     # initialize the File Writer for writing TensorBoard summaries
249 |     file_writer = tf.summary.FileWriter(LOG_DIR+'/ESBAS_'+clock_time+'_'+hyp_str, tf.get_default_graph())
250 | 
251 |     def DQNs_update(step_counter):
252 |         # If it's time to train the networks:
253 |         if len(buffer) > min_buffer_size and (step_counter % update_freq == 0):
254 | 
255 |             # sample a minibatch from the buffer
256 |             mb_obs, mb_rew, mb_act, mb_obs2, mb_done = buffer.sample_minibatch(batch_size)
257 | 
258 |             for dqn in dqns:
259 |                 dqn.optimize(mb_obs, mb_rew, mb_act, mb_obs2, mb_done)
260 | 
261 |         # Every update_target_net steps, update the target networks
262 |         if len(buffer) > min_buffer_size and (step_counter % update_target_net == 0):
263 | 
264 |             for dqn in dqns:
265 |                 dqn.update_target_network()
266 | 
267 | 
268 |     step_count = 0
269 |     episode = 0
270 |     beta = 1
271 | 
272 |     # Initialize the experience buffer
273 |     buffer = ExperienceBuffer(buffer_size)
274 | 
275 |     obs = env.reset()
276 | 
277 |     # policy exploration initialization
278 |     eps = start_explor
279 |     eps_decay = (start_explor - end_explor) / explor_steps
280 | 
281 | 
282 |     for ep in range(num_epochs):
283 | 
284 |         # Policies' training
285 |         for i in range(2**(beta-1), 2**beta):
286 |             DQNs_update(i)
287 | 
288 |         ucb1 = UCB1(dqns, xi)
289 |         list_bests = []
290 |         ep_rew = []
291 |         beta += 1
292 | 
293 |         while step_count < 2**beta:
294 | 
295 |             # Choose the best policy (i.e. the algorithm) that will run the next trajectory
296 |             best_dqn = ucb1.choose_algorithm()
297 |             list_bests.append(best_dqn)
298 | 
299 |             summary = tf.Summary()
300 |             summary.value.add(tag='algorithm_selected', simple_value=best_dqn)
301 |             file_writer.add_summary(summary, step_count)
302 |             file_writer.flush()
303 | 
304 |             g_rew = 0
305 |             done = False
306 | 
307 |             while not done:
308 |                 # Epsilon decay
309 |                 if eps > end_explor:
310 |                     eps -= eps_decay
311 | 
312 | 
313 |                 # Choose an eps-greedy action
314 |                 act = eps_greedy(np.squeeze(dqns[best_dqn].act(obs)), eps=eps)
315 | 
316 |                 # execute the action in the environment
317 |                 obs2, rew, done, _ = env.step(act)
318 | 
319 |                 # Add the transition to the replay buffer
320 |                 buffer.add(obs, rew, act, obs2, done)
321 | 
322 |                 obs = obs2
323 |                 g_rew += rew
324 |                 step_count += 1
325 | 
326 | 
327 |             # Update the UCB parameters of the algorithm that was just used
328 |             ucb1.update(best_dqn, g_rew)
329 | 
330 |             # The episode has ended: reset the environment and re-initialize the variables
331 |             obs = env.reset()
332 |             ep_rew.append(g_rew)
333 |             g_rew = 0
334 |             episode += 1
335 | 
336 | 
337 |         # Print some stats and test the best policy
338 |         summary = tf.Summary()
339 |         summary.value.add(tag='train_performance', simple_value=np.mean(ep_rew))
340 | 
341 |         if episode % 10 == 0:
342 |             unique, counts = np.unique(list_bests, return_counts=True)
343 |             print(dict(zip(unique, counts)))
344 | 
345 |         test_agent_results = test_agent(env_test, dqns[best_dqn].act, num_games=10, summary=summary)
346 |         print('Epoch:%4d Episode:%4d Rew:%4.2f, Eps:%2.2f -- Step:%5d -- Test:%4.2f Best:%2d Last:%2d' % (ep, episode, np.mean(ep_rew), eps, step_count, np.mean(test_agent_results), best_dqn, g_rew))
347 | 
348 |         file_writer.add_summary(summary, step_count)
349 |         file_writer.flush()
350 | 
351 | 
352 |     file_writer.close()
353 |     env.close()
354 | 
355 | 
356 | if __name__ == '__main__':
357 | 
358 |     #ESBAS('Acrobot-v1', hidden_sizes=[[64, 64]], lr=4e-4, buffer_size=100000, update_target_net=100, batch_size=32,
359 |     #      update_freq=4, min_buffer_size=100, render_cycle=10000, explor_steps=50000, num_epochs=20000, end_explor=0.1)
360 | 
361 |     ESBAS('Acrobot-v1', hidden_sizes=[[64], [16, 16], [64, 64]], lr=4e-4, buffer_size=100000, update_target_net=100, batch_size=32,
362 |           update_freq=4, min_buffer_size=100, render_cycle=10000, explor_steps=50000, num_epochs=20000, end_explor=0.1,
363 |           xi=1./4)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | # Reinforcement Learning Algorithms with Python
5 | 
6 | Reinforcement Learning Algorithms with Python
7 | 
8 | This is the code repository for [Reinforcement Learning Algorithms with Python](https://www.packtpub.com/data/hands-on-reinforcement-learning-algorithms-with-python), published by Packt.
9 | 
10 | **Learn, understand, and develop smart algorithms for addressing AI challenges**
11 | 
12 | ## What is this book about?
13 | Reinforcement Learning (RL) is a popular and promising branch of AI that involves making smarter models and agents that can automatically determine ideal behavior based on changing requirements. This book will help you master RL algorithms and understand their implementation as you build self-learning agents.
14 | Starting with an introduction to the tools, libraries, and setup needed to work in the RL environment, this book covers the building blocks of RL and delves into value-based methods, such as the application of Q-learning and SARSA algorithms. You'll learn how to use a combination of Q-learning and neural networks to solve complex problems. Furthermore, you'll study the policy gradient methods, TRPO, and PPO, to improve performance and stability, before moving on to the DDPG and TD3 deterministic algorithms. This book also covers how imitation learning techniques work and how DAgger can teach an agent to drive. You'll discover evolutionary strategies and black-box optimization techniques, and see how they can improve RL algorithms.
Finally, you'll get to grips with exploration approaches, such as UCB and UCB1, and develop a meta-algorithm called ESBAS.
15 | By the end of the book, you'll have worked with key RL algorithms to overcome challenges in real-world applications, and be part of the RL research community.
16 | 
17 | 
18 | This book covers the following exciting features:
19 | * Develop an agent to play CartPole using the OpenAI Gym interface
20 | * Discover the model-based reinforcement learning paradigm
21 | * Solve the Frozen Lake problem with dynamic programming
22 | * Explore Q-learning and SARSA with a view to playing a taxi game
23 | * Apply Deep Q-Networks (DQNs) to Atari games using Gym
24 | * Study policy gradient algorithms, including Actor-Critic and REINFORCE
25 | * Understand and apply PPO and TRPO in continuous locomotion environments
26 | * Get to grips with evolution strategies for solving the lunar lander problem
27 | 
28 | If you feel this book is for you, get your [copy](https://www.amazon.com/Reinforcement-Learning-Algorithms-Python-understand/dp/1789131111/) today!
29 | 
30 | 
31 | https://www.packtpub.com/
32 | 
33 | ## Instructions and Navigations
34 | All of the code is organized into folders. For example, Chapter02.
35 | 
36 | The code will look like the following:
37 | ```
38 | import gym
39 | 
40 | # create the environment
41 | env = gym.make("CartPole-v1")
42 | # reset the environment before starting
43 | env.reset()
44 | 
45 | # loop 10 times
46 | for i in range(10):
47 |     # take a random action
48 |     env.step(env.action_space.sample())
49 |     # render the game
50 |     env.render()
51 | 
52 | # close the environment
53 | env.close()
54 | ```
55 | 
56 | **Following is what you need for this book:**
57 | If you are an AI researcher, deep learning user, or anyone who wants to learn reinforcement learning from scratch, this book is for you. You'll also find this reinforcement learning book useful if you want to learn about the advancements in the field. Working knowledge of Python is necessary.
58 | 
59 | 
60 | With the following software and hardware list you can run all code files present in the book (Chapters 1-11).
61 | ### Software and Hardware List
62 | | Chapter | Software required | OS required |
63 | | -------- | ------------------------------------ | ----------------------------------- |
64 | | All | Python 3.6 or higher | Windows, Mac OS X, and Linux (Any) |
65 | | All | TensorFlow 1.14 or higher | Windows, Mac OS X, and Linux (Any) |
66 | 
67 | We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](http://www.packtpub.com/sites/default/files/downloads/9781789131116_ColorImages.pdf).
68 | 
69 | ### Related products
70 | * Hands-On Reinforcement Learning with Python [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/hands-reinforcement-learning-python) [[Amazon]](https://www.amazon.com/Hands-Reinforcement-Learning-Python-reinforcement-ebook/dp/B079Q3WLM4/)
71 | 
72 | * Python Reinforcement Learning Projects [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/python-reinforcement-learning-projects) [[Amazon]](https://www.amazon.com/Python-Reinforcement-Learning-Projects-hands-ebook/dp/B07F2S82W3/)
73 | 
74 | ## Get to Know the Author
75 | **Andrea Lonza** is a deep learning engineer with a great passion for artificial intelligence and a desire to create machines that act intelligently.
He has acquired expert knowledge in reinforcement learning, natural language processing, and computer vision through academic and industrial machine learning projects. He has also participated in several Kaggle competitions, achieving high results. He is always looking for compelling challenges and loves to prove himself.
76 | 
77 | 
78 | 
79 | ### Suggestions and Feedback
80 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions.
81 | 
82 | 
83 | ### Download a free PDF
84 | 
85 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
86 | https://packt.link/free-ebook/9781789131116
--------------------------------------------------------------------------------