├── Real Time Dynamic Programming ├── README.md ├── lrta_rtdp.mp4 ├── Presentation.pdf ├── syncdp_gauss.mp4 ├── Policy_Evaluation.ipynb ├── gauss_seidel_VI.ipynb ├── LRTA_RTDP.py └── SyncDP_GaussSeidal.py ├── TD Control methods - Expected SARSA ├── 1.png ├── 2.png ├── 3.png ├── 4.png ├── 5.png ├── 6.png ├── 7.png ├── 8.png ├── q.png ├── s.png ├── es.png ├── se.png ├── u_q.png ├── u_sarsa.png ├── exp_sarsa.png ├── u_e_sarsa.png └── README.md ├── Emphatic Temporal-Difference Learning ├── etd0.png ├── etd0_1.png ├── rmsve.png ├── etd_lambda.png └── README.md ├── Q(sigma) and multi-step bootstrapping methods ├── pic_1.png ├── pic_2.png ├── pic_3.png └── pic_4.png ├── Temporal-Difference Learning by Harm van Seijen └── README.md └── README.md /Real Time Dynamic Programming/README.md: -------------------------------------------------------------------------------- 1 | Presentation for 3rd February 2017 by Monica Patel and Pulkit Khandelwal 2 | 3 | -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/1.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/2.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/3.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/4.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/5.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/6.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/7.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/8.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/8.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/q.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/q.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/s.png -------------------------------------------------------------------------------- /Real Time Dynamic Programming/lrta_rtdp.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Real Time Dynamic Programming/lrta_rtdp.mp4 -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/es.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/es.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/se.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/se.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/u_q.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/u_q.png -------------------------------------------------------------------------------- /Emphatic Temporal-Difference Learning/etd0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Emphatic Temporal-Difference Learning/etd0.png -------------------------------------------------------------------------------- /Real Time Dynamic Programming/Presentation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Real Time Dynamic Programming/Presentation.pdf -------------------------------------------------------------------------------- /Real Time Dynamic Programming/syncdp_gauss.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Real Time Dynamic Programming/syncdp_gauss.mp4 -------------------------------------------------------------------------------- /Emphatic Temporal-Difference Learning/etd0_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Emphatic Temporal-Difference 
Learning/etd0_1.png -------------------------------------------------------------------------------- /Emphatic Temporal-Difference Learning/rmsve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Emphatic Temporal-Difference Learning/rmsve.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/u_sarsa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/u_sarsa.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/exp_sarsa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/exp_sarsa.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/u_e_sarsa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/u_e_sarsa.png -------------------------------------------------------------------------------- /Emphatic Temporal-Difference Learning/etd_lambda.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Emphatic Temporal-Difference Learning/etd_lambda.png -------------------------------------------------------------------------------- /Q(sigma) and multi-step bootstrapping methods/pic_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Q(sigma) and multi-step bootstrapping methods/pic_1.png -------------------------------------------------------------------------------- /Q(sigma) and multi-step bootstrapping methods/pic_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Q(sigma) and multi-step bootstrapping methods/pic_2.png -------------------------------------------------------------------------------- /Q(sigma) and multi-step bootstrapping methods/pic_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Q(sigma) and multi-step bootstrapping methods/pic_3.png -------------------------------------------------------------------------------- /Q(sigma) and multi-step bootstrapping methods/pic_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Q(sigma) and multi-step bootstrapping methods/pic_4.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/README.md: -------------------------------------------------------------------------------- 1 | Temporal Difference Learning Methods compared: 2 | Sarsa, Expected Sarsa and Q-learning 
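The three methods above differ only in the target used for the one-step backup. Below is a minimal, illustrative sketch of the three targets for a tabular action-value table (this is not taken from the notebooks; names such as `Q`, `gamma`, and `epsilon` are assumptions):

```python
import numpy as np

def one_step_targets(Q, next_state, next_action, reward, gamma, epsilon):
    """Compare the backup targets of Sarsa, Expected Sarsa and Q-learning."""
    n_actions = Q.shape[1]

    # Sarsa: bootstrap from the action actually selected by the behaviour policy.
    sarsa = reward + gamma * Q[next_state, next_action]

    # Expected Sarsa: bootstrap from the expectation under the epsilon-greedy policy.
    probs = np.full(n_actions, epsilon / n_actions)
    probs[np.argmax(Q[next_state])] += 1.0 - epsilon
    expected_sarsa = reward + gamma * np.dot(probs, Q[next_state])

    # Q-learning: bootstrap from the greedy (max) action.
    q_learning = reward + gamma * np.max(Q[next_state])

    return sarsa, expected_sarsa, q_learning
```

In every case the update itself is the same: `Q[state, action] += alpha * (target - Q[state, action])`.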
3 | 4 | Three environments: 5 | Cliff Walking 6 | Windy Gridworld 7 | Gridworld 8 | 9 | The three IPython Notebooks produce different results even though the underlying theory is the same, so compare the results across them. 10 | 11 | More flavours of environments will be added soon. 12 | -------------------------------------------------------------------------------- /Temporal-Difference Learning by Harm van Seijen/README.md: -------------------------------------------------------------------------------- 1 | 2 | An Empirical Evaluation of True Online TD(Lambda) by Harm van Seijen et al. 3 | True Online Temporal-Difference Learning by Harm van Seijen et al.; 4 | -----Journal of Machine Learning Research 2016---- 5 | 6 | In this IPython Notebook, I walk through various function approximation methods for estimating the value function V(s), i.e., on-policy prediction. Refer to the two papers above and Chapters 9 and 12 of Sutton and Barto's book. I have implemented the following algorithms: 7 | 8 | 1. Gradient Monte Carlo Algorithm for Approximating V 9 | 2. Semi-gradient TD(0) for estimating V_pi 10 | 3. Semi-gradient TD(lambda) 11 | 4. True-online TD(lambda) 12 | 5. TD(0) for prediction. This result will act as V_pi(s) to calculate the RMSVE 13 | 14 | We get introduced to the lambda-return, eligibility traces and Dutch traces! 15 | -------------------------------------------------------------------------------- /Emphatic Temporal-Difference Learning/README.md: -------------------------------------------------------------------------------- 1 | An Emphatic Approach to the Problem of Off-policy Temporal-Difference Learning---- Richard S. Sutton, A. Rupam Mahmood and Martha White 2 | 3 | In this IPython Notebook, I walk through various function approximation methods for estimating the value function V(s), as described in the above-mentioned paper. We get introduced to a different type of trace, the followon trace, to the interest function, and to a new variant of the squared value error that incorporates the interest function! 4 | The algorithms implemented in this notebook are: 5 | 6 | 1. Emphatic TD(lambda) 7 | 2. Emphatic TD(0) 8 | 3. Off-policy Semi-gradient TD(0) for estimating V_pi 9 | 10 | The above algorithms are compared with the following algorithms from the previous assignment (assignment 4): 11 | 12 | 1. Gradient Monte Carlo Algorithm for Approximating V 13 | 2. Semi-gradient TD(0) for estimating V_pi 14 | 3. TD(lambda) 15 | 4. True-online TD(lambda) 16 | 5. TD(0) for prediction. This result will act as V_pi(s) to calculate the RMSVE 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement-Learning-Notebooks 2 | 3 | ## A collection of Reinforcement Learning algorithms from Sutton and Barto's book and other research papers, implemented in Python. 4 | 5 | I wrote these notebooks in March 2017 while I took the *COMP 767: Reinforcement Learning* [5] class by Prof. Doina Precup at McGill, Montréal. I highly recommend going through the class notes and the references to all the papers the instructors have posted on the website. 6 | 7 | These notebooks should be used while you read the book; you can then go beyond it with the referenced papers. I would suggest watching David Silver's videos and reading the book simultaneously. And when you are done with a few chapters, start implementing them.
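To make the shared structure concrete before diving into the notebooks, here is a minimal sketch of the tabular TD(0) prediction update (purely illustrative, not taken from any notebook; it assumes a classic Gym-style interface with `state = env.reset()` and `next_state, reward, done, _ = env.step(action)`, and `policy`, `alpha`, `gamma` are assumed inputs):

```python
import numpy as np

def td0_prediction(env, policy, n_states, episodes=500, alpha=0.1, gamma=1.0):
    """Estimate V(s) for a fixed policy with one-step TD(0) backups."""
    V = np.zeros(n_states)
    for _ in range(episodes):
        state = env.reset()
        done = False
        while not done:
            action = policy(state)
            next_state, reward, done, _ = env.step(action)
            # Move V(s) a step of size alpha towards the bootstrapped target.
            V[state] += alpha * (reward + gamma * V[next_state] - V[state])
            state = next_state
    return V
```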
The algorithms follow a pattern and mostly are variants of each other. I have tried my best to explain each notebook's results and possible future directions. 8 | 9 | 10 | Disclaimer: The code is a little messy. I'd written this when I was not a Pythonista. If you would like to clean them up and want to make it into a nice interface, feel free to contact me. I will be very pleased to collaborate. If you use them then please cite the source and also mention the credits as listed below. Also, email me with ways to improve, let me know if you find any bugs. 11 | 12 | Feel free to reach me at pulkit.khandelwal@mail.mcgill.ca or see my website [here](https://pulkit-khandelwal.github.io/) 13 | 14 | Special Credits: 15 | 16 | [1] [Denny Britz](https://github.com/dennybritz/reinforcement-learning) 17 | 18 | [2] [Monica Patel](https://monicaopatel.com/) 19 | 20 | [3] [Sutton and Barto](https://www.amazon.ca/Reinforcement-Learning-Introduction-Richard-Sutton/dp/0262193981) 21 | 22 | [4] [David Silver](https://www.youtube.com/playlist?list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ) 23 | 24 | [5] [Doina Precup's course](https://www.cs.mcgill.ca/~dprecup/courses/rl.html) 25 | -------------------------------------------------------------------------------- /Real Time Dynamic Programming/Policy_Evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 78, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "\"\"\"\n", 12 | "Define all the libraries and make an environment\n", 13 | "\"\"\"\n", 14 | "\n", 15 | "import numpy as np\n", 16 | "import sys\n", 17 | "if \"../\" not in sys.path:\n", 18 | " sys.path.append(\"../\") \n", 19 | "from lib.envs.gridworld import GridworldEnv\n", 20 | "env = GridworldEnv()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 79, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "def evaluate_policy(policy, env, lambda_value=0.2, theta=0.0001):\n", 32 | " \"\"\"\n", 33 | " \n", 34 | " This function returns a vector of the value function of each state\n", 35 | " \n", 36 | " \"\"\"\n", 37 | " # Start with a random value function. Here, I have used zeros.\n", 38 | " V = np.zeros(env.nS)\n", 39 | " t = 0\n", 40 | " while True:\n", 41 | " delta = 0\n", 42 | " # Iterate over each state\n", 43 | " for s in range(env.nS):\n", 44 | " value = 0\n", 45 | " # See actions given a state\n", 46 | " for a, action_prob in enumerate(policy[s]):\n", 47 | " # Given state and action, see the next sate and immediate reward\n", 48 | " for prob, next_state, reward, done in env.P[s][a]:\n", 49 | " # Calculate the expected value at each iteration\n", 50 | " value += action_prob * prob * (reward + lambda_value * V[next_state])\n", 51 | " # Change in value function over states in each iteration\n", 52 | " delta = max(delta, np.abs(value - V[s]))\n", 53 | " V[s] = value\n", 54 | " # Stop at a certain threshold\n", 55 | " t +=1\n", 56 | " if delta < theta:\n", 57 | " break\n", 58 | " return (np.array(V),t)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 80, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "Value Function\n", 73 | "[ 0. 
-1.18366196 -1.24642763 -1.24959779 -1.18366196 -1.24326209\n", 74 | " -1.24896873 -1.24643287 -1.24642763 -1.24896873 -1.24326352 -1.18366814\n", 75 | " -1.24959779 -1.24643287 -1.18366814 0. ]\n", 76 | "\n", 77 | "Reshaped Grid Value Function\n", 78 | "[[ 0. -1.18366196 -1.24642763 -1.24959779]\n", 79 | " [-1.18366196 -1.24326209 -1.24896873 -1.24643287]\n", 80 | " [-1.24642763 -1.24896873 -1.24326352 -1.18366814]\n", 81 | " [-1.24959779 -1.24643287 -1.18366814 0. ]]\n", 82 | "\n", 83 | "For lambda value of 0.2 and threshold of 0.00001, the number of iterations taken to converge:\n", 84 | "6\n" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "random_policy = np.ones([env.nS, env.nA]) / env.nA\n", 90 | "#Compute the vector of the state value functions\n", 91 | "v = evaluate_policy(random_policy, env)[0]\n", 92 | "\n", 93 | "print(\"Value Function\")\n", 94 | "print(v)\n", 95 | "print(\"\")\n", 96 | "\n", 97 | "#Resahpe into a grid\n", 98 | "print(\"Reshaped Grid Value Function\")\n", 99 | "print(v.reshape(env.shape))\n", 100 | "print(\"\")\n", 101 | "\n", 102 | "#Number of iterations\n", 103 | "print(\"For lambda value of 0.2 and threshold of 0.00001, the number of iterations taken to converge:\")\n", 104 | "print(evaluate_policy(random_policy, env)[1])\n", 105 | "\n", 106 | "#Credits: WildML and OpenAI gym" 107 | ] 108 | } 109 | ], 110 | "metadata": { 111 | "anaconda-cloud": {}, 112 | "kernelspec": { 113 | "display_name": "Python [default]", 114 | "language": "python", 115 | "name": "python2" 116 | }, 117 | "language_info": { 118 | "codemirror_mode": { 119 | "name": "ipython", 120 | "version": 2.0 121 | }, 122 | "file_extension": ".py", 123 | "mimetype": "text/x-python", 124 | "name": "python", 125 | "nbconvert_exporter": "python", 126 | "pygments_lexer": "ipython2", 127 | "version": "2.7.12" 128 | } 129 | }, 130 | "nbformat": 4, 131 | "nbformat_minor": 0 132 | } -------------------------------------------------------------------------------- /Real Time Dynamic Programming/gauss_seidel_VI.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 81, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pprint\n", 13 | "import sys\n", 14 | "if \"../\" not in sys.path:\n", 15 | " sys.path.append(\"../\") \n", 16 | "from lib.envs.gridworld import GridworldEnv\n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 82, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "pp = pprint.PrettyPrinter(indent=2)\n", 28 | "env = GridworldEnv()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 83, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "def value_iteration(env, theta=0.0001, discount_factor=1.0):\n", 40 | " \"\"\"\n", 41 | " Gauss Seidel Variation to the Value Iteration method\n", 42 | " \"\"\"\n", 43 | " \n", 44 | " def one_step_lookahead(state, V,V_old):\n", 45 | " \n", 46 | " A = np.zeros(env.nA)\n", 47 | " for a in range(env.nA):\n", 48 | " for prob, next_state, reward, done in env.P[state][a]:\n", 49 | " \n", 50 | " ####### Gauss Seidel #######\n", 51 | " if next_state < state:\n", 52 | " A[a] += prob * (reward + discount_factor * V[next_state])\n", 53 | " else:\n", 54 | " A[a] += prob * (reward + discount_factor * V_old[next_state])\n", 55 | " \n", 56 | " \n", 57 | " return A\n", 58 | " 
\n", 59 | " V = np.zeros(env.nS)\n", 60 | " V_old = np.zeros(env.nS)\n", 61 | " \n", 62 | " t = 0\n", 63 | "\n", 64 | " while True:\n", 65 | " \n", 66 | " delta = 0\n", 67 | " V_old = V\n", 68 | " # For every state\n", 69 | " for s in range(env.nS):\n", 70 | " # Do a one-step lookahead to find the best action\n", 71 | " A = one_step_lookahead(s, V,V_old)\n", 72 | " best_action_value = np.max(A)\n", 73 | " # Select best action\n", 74 | " delta = max(delta, np.abs(best_action_value - V[s]))\n", 75 | " # Value function update\n", 76 | " \n", 77 | " V[s] = best_action_value \n", 78 | " t +=1\n", 79 | " if delta < theta:\n", 80 | " break\n", 81 | " \n", 82 | " \n", 83 | " # Find optimal policy\n", 84 | " policy = np.zeros([env.nS, env.nA])\n", 85 | " for s in range(env.nS):\n", 86 | " \n", 87 | " A = one_step_lookahead(s, V, V_old)\n", 88 | " best_action = np.argmax(A)\n", 89 | " \n", 90 | " policy[s, best_action] = 1.0\n", 91 | " \n", 92 | " return policy, V,t" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 84, 98 | "metadata": { 99 | "collapsed": false 100 | }, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "Policy Probability Distribution:\n", 107 | "[[ 1. 0. 0. 0.]\n", 108 | " [ 0. 0. 0. 1.]\n", 109 | " [ 0. 0. 0. 1.]\n", 110 | " [ 0. 0. 1. 0.]\n", 111 | " [ 1. 0. 0. 0.]\n", 112 | " [ 1. 0. 0. 0.]\n", 113 | " [ 1. 0. 0. 0.]\n", 114 | " [ 0. 0. 1. 0.]\n", 115 | " [ 1. 0. 0. 0.]\n", 116 | " [ 1. 0. 0. 0.]\n", 117 | " [ 0. 1. 0. 0.]\n", 118 | " [ 0. 0. 1. 0.]\n", 119 | " [ 1. 0. 0. 0.]\n", 120 | " [ 0. 1. 0. 0.]\n", 121 | " [ 0. 1. 0. 0.]\n", 122 | " [ 1. 0. 0. 0.]]\n", 123 | "\n", 124 | "Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\n", 125 | "[[0 3 3 2]\n", 126 | " [0 0 0 2]\n", 127 | " [0 0 1 2]\n", 128 | " [0 1 1 0]]\n", 129 | "\n", 130 | "Value Function:\n", 131 | "[ 0. -1. -2. -3. -1. -2. -3. -2. -2. -3. -2. -1. -3. -2. -1. 0.]\n", 132 | "\n", 133 | "Reshaped Grid Value Function:\n", 134 | "[[ 0. -1. -2. -3.]\n", 135 | " [-1. -2. -3. -2.]\n", 136 | " [-2. -3. -2. -1.]\n", 137 | " [-3. -2. -1. 0.]]\n", 138 | "\n", 139 | "Converged in number of steps:\n", 140 | "4\n", 141 | "\n" 142 | ] 143 | } 144 | ], 145 | "source": [ 146 | "policy, v = value_iteration(env)[0],value_iteration(env)[1]\n", 147 | "\n", 148 | "print(\"Policy Probability Distribution:\")\n", 149 | "print(policy)\n", 150 | "print(\"\")\n", 151 | "\n", 152 | "print(\"Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\")\n", 153 | "print(np.reshape(np.argmax(policy, axis=1), env.shape))\n", 154 | "print(\"\")\n", 155 | "\n", 156 | "print(\"Value Function:\")\n", 157 | "print(v)\n", 158 | "print(\"\")\n", 159 | "\n", 160 | "print(\"Reshaped Grid Value Function:\")\n", 161 | "print(v.reshape(env.shape))\n", 162 | "print(\"\")\n", 163 | "\n", 164 | "print(\"Converged in number of steps:\")\n", 165 | "print(value_iteration(env)[2])\n", 166 | "print(\"\")" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": { 172 | "collapsed": true 173 | }, 174 | "source": [ 175 | "Credits: The code has been adapted from WildML's blog and has been modified to implement the Gauss Sediel Value Iteration." 
176 | ] 177 | } 178 | ], 179 | "metadata": { 180 | "anaconda-cloud": {}, 181 | "kernelspec": { 182 | "display_name": "Python [default]", 183 | "language": "python", 184 | "name": "python2" 185 | }, 186 | "language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 2 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython2", 196 | "version": "2.7.12" 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 0 201 | } 202 | -------------------------------------------------------------------------------- /Real Time Dynamic Programming/LRTA_RTDP.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Wed Feb 1 19:27:41 2017 3 | 4 | @author: monica 5 | """ 6 | #--------------------------- Imports ---------------------------- 7 | import cv2 8 | import itertools 9 | import numpy as np 10 | import random 11 | import math 12 | 13 | #-------------------------Class -------------------------------------- 14 | class Stack: 15 | def __init__(self): 16 | self.items = [] 17 | 18 | def isEmpty(self): 19 | return self.items == [] 20 | 21 | def push(self, item): 22 | self.items.append(item) 23 | 24 | def pop(self): 25 | return self.items.pop() 26 | 27 | def peek(self): 28 | return self.items[len(self.items)-1] 29 | 30 | def size(self): 31 | return len(self.items) 32 | 33 | #---------------------- Global Parameters ---------------------------------- 34 | #img1 = cv2.imread('pic3.jpg',0) 35 | img1 = cv2.imread('pic4.png',0) 36 | 37 | ret,world = cv2.threshold(img1,127,255,cv2.THRESH_BINARY) 38 | height, width = img1.shape 39 | 40 | print "------------" + str(img1.shape) 41 | 42 | X_min = 0 43 | Y_min = 0 44 | X_max = width - 1 45 | Y_max = height - 1 46 | states = dict() 47 | 48 | start = set() 49 | goal = set() 50 | 51 | state_val = np.full((width,height),999.0) 52 | 53 | move = {1,-1} 54 | 55 | admisible_actions = set(itertools.product(move,move)) 56 | 57 | p_correct = 0.9 58 | ci_nonGoal = 1 59 | ci_Goal = 0 60 | 61 | 62 | #----------------------------- Helper FUnctions ------------------------------- 63 | 64 | def generate_startGoal(): 65 | global goal, start, world 66 | 67 | for i in range(0,70): 68 | goal.add((X_max,i)) 69 | goal.add((X_max-1,i)) 70 | 71 | for j in range(Y_max-70,Y_max): 72 | start.add((X_max,j)) 73 | start.add((X_max-1,j)) 74 | 75 | 76 | 77 | def stateNeighbours(state): 78 | """ 79 | Takes in node, (Image point) and returns all the neighboring image points 80 | @param node <-- tuple 81 | @return list(tuple) 82 | """ 83 | neighbours = list() 84 | 85 | x = state[0] 86 | y = state[1] 87 | 88 | 89 | if not ((x+1) > X_max or (y+1) > Y_max ): 90 | neighbours.append((x+1,y+1)) 91 | if not ((x+1) > X_max ): 92 | neighbours.append((x+1,y)) 93 | if not ((y+1) > Y_max ): 94 | neighbours.append((x,y+1)) 95 | if not ((x-1) < X_min or (y-1) < Y_min ): 96 | neighbours.append((x-1,y-1)) 97 | if not ((x-1) < X_min ): 98 | neighbours.append((x-1,y)) 99 | if not ((y-1) < Y_min ): 100 | neighbours.append((x,y-1)) 101 | if not ((x+1) > X_max or (y-1) < Y_min ): 102 | neighbours.append((x+1,y-1)) 103 | if not ((x-1) < X_min or (y+1) > Y_max ): 104 | neighbours.append((x-1,y+1)) 105 | 106 | return neighbours 107 | 108 | def search_heuristic(state,goal): 109 | """ 110 | Hueristic for estimating cost of future nodes. 111 | @param state - tuple, goal - tuple 112 | @return float - estimated cost of goal from the state. 
In this case Euclidean Dist 113 | """ 114 | dist = math.sqrt((goal[0] - state[0])**2 + (goal[1] - state[1])**2) 115 | return dist 116 | 117 | def lrta_star(start,goal): 118 | """ 119 | LRTA* implementation for shortest path problem in grid world 120 | @param start, goal 121 | """ 122 | global world 123 | path = list() 124 | cost = dict() 125 | current_state = start 126 | 127 | while(not(current_state in goal)): 128 | 129 | #Get all the nodes reachable by admisible action 130 | neighbours = stateNeighbours(current_state) 131 | 132 | #For nodes find the cost 133 | for n in neighbours: 134 | cost_n = ci_nonGoal + search_heuristic(n,goal) 135 | cost[cost_n] = n 136 | 137 | min_cost = min(cost.keys()) 138 | 139 | #Update current node cost to min cost of neighbour node, move to neighbour node 140 | if state_val[current_state[0],current_state[1]] > min_cost: 141 | state_val[current_state[0],current_state[1]] = min_cost 142 | new_state = cost[min_cost] 143 | path.append(current_state) 144 | cv2.line(world, (current_state[0],current_state[1]), (new_state[0],new_state[1]), 1, 1) 145 | current_state = new_state 146 | 147 | cv2.imshow('window',world) 148 | 149 | if cv2.waitKey(50)==27: 150 | break 151 | cv2.destroyAllWindows() 152 | 153 | def populate_Bt(frontNode,depth): 154 | """ 155 | Forward search function which gives set for asynchrous DP update 156 | @param current_node, depth upto which the search is to be done 157 | @return subset of state set 158 | """ 159 | count = 0 160 | Bt = set() 161 | s = Stack() 162 | s.push(frontNode) 163 | while( not (s.size() == 0)): 164 | count += 1 165 | node = s.pop() 166 | Bt.add(node) 167 | neighbours = stateNeighbours(node) 168 | if count <= depth: 169 | for n in neighbours: 170 | s.push(n) 171 | return Bt 172 | 173 | def rtdp(start,goal): 174 | global world 175 | path = list() 176 | cost = dict() 177 | costf= dict() 178 | sweep_val = np.full((width,height),999.0) 179 | 180 | frontNode = start 181 | 182 | #While goal is not reached 183 | while(not(frontNode in goal)): 184 | Bt = populate_Bt(frontNode,1) 185 | 186 | #perform asynchrous updates for all nodes in Bt 187 | for item in list(Bt): 188 | neighbours = stateNeighbours(item) 189 | 190 | for n in neighbours: 191 | cost_n = ci_nonGoal + search_heuristic(n,goal) 192 | cost[cost_n] = n 193 | 194 | min_cost = min(cost.keys()) 195 | 196 | if sweep_val[item[0],item[1]] > min_cost: 197 | sweep_val[item[0],item[1]] = min_cost 198 | 199 | #Choose a greedy control action for current node 200 | state_val = sweep_val 201 | neighbours_front = stateNeighbours(frontNode) 202 | 203 | for n in neighbours_front: 204 | cost_nf = state_val[n[0],n[1]] 205 | costf[cost_nf] = n 206 | 207 | min_costf = min(costf.keys()) 208 | 209 | new_state = cost[min_costf] 210 | path.append(frontNode) 211 | cv2.line(world, (frontNode[0],frontNode[1]), (new_state[0],new_state[1]), 1, 1) 212 | frontNode = new_state 213 | 214 | cv2.imshow('window',world) 215 | 216 | if cv2.waitKey(50)==27: 217 | break 218 | cv2.destroyAllWindows() 219 | 220 | if __name__ == '__main__': 221 | generate_startGoal() 222 | start_point = list(start)[random.randint(0,len(start) - 1)] 223 | goal_point = (100,256) 224 | cv2.circle(world, start_point, 2, color=1, thickness=-1, lineType=8, shift=0) 225 | cv2.circle(world, goal_point, 2, color=1, thickness=-1, lineType=8, shift=0) 226 | 227 | ############ PLEASE UNCOMMENT THE ONE TO RUN AND COMMENT ONE NOT NEEDED ########## 228 | lrta_star(start_point,goal_point) 229 | #rtdp(start_point,goal_point) 
-------------------------------------------------------------------------------- /Real Time Dynamic Programming/SyncDP_GaussSeidal.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Wed Feb 1 19:27:41 2017 3 | 4 | @author: Monica Patel 5 | """ 6 | #--------------------------- Imports ---------------------------- 7 | import cv2 8 | import itertools 9 | import numpy as np 10 | import random 11 | import math 12 | #---------------------- Global Parameters ---------------------------------- 13 | world = cv2.imread('pic4.png',0) 14 | 15 | height, width = world.shape 16 | state_space = world 17 | 18 | print "-----World Shape-------" + str(world.shape) + '---------World Shape-----------' 19 | 20 | X_min = 0 21 | Y_min = 0 22 | X_max = width - 1 23 | Y_max = height - 1 24 | states = dict() 25 | 26 | start = set() 27 | goal = set() 28 | 29 | move = {1,-1} 30 | 31 | admisible_actions = set(itertools.product(move,move)) 32 | 33 | p_correct = 0.9 34 | 35 | ci_nonGoal = 1 36 | ci_Goal = 0 37 | ci_wall = 1000 38 | 39 | delta = 0.1 40 | 41 | #----------------------------- Helper FUnctions ------------------------------- 42 | 43 | 44 | def generate_startGoal(): 45 | """ 46 | Generates a set of Start points and Goal points from total State Space 47 | """ 48 | global goal, start, world 49 | 50 | for i in range(31,50): 51 | goal.add((i,0)) 52 | goal.add((i,1)) 53 | 54 | for j in range(0,20): 55 | start.add((j,0)) 56 | start.add((j,1)) 57 | 58 | 59 | def stateNeighbours(state): 60 | """ 61 | Takes in node, (Image point) and returns all the neighboring image points 62 | @param node <-- tuple 63 | @return list(tuple) 64 | """ 65 | neighbours = list() 66 | 67 | x = state[0] 68 | y = state[1] 69 | 70 | if not ((x+1) > X_max or (y+1) > Y_max ): 71 | neighbours.append((x+1,y+1)) 72 | if not ((x+1) > X_max ): 73 | neighbours.append((x+1,y)) 74 | if not ((y+1) > Y_max ): 75 | neighbours.append((x,y+1)) 76 | if not ((x-1) < X_min or (y-1) < Y_min ): 77 | neighbours.append((x-1,y-1)) 78 | if not ((x-1) < X_min ): 79 | neighbours.append((x-1,y)) 80 | if not ((y-1) < Y_min ): 81 | neighbours.append((x,y-1)) 82 | if not ((x+1) > X_max or (y-1) < Y_min ): 83 | neighbours.append((x+1,y-1)) 84 | if not ((x-1) < X_min or (y+1) > Y_max ): 85 | neighbours.append((x-1,y+1)) 86 | 87 | return neighbours 88 | 89 | def search_heuristic(state,goal): 90 | """ 91 | Hueristic for estimating cost of future nodes. 92 | @param state - tuple, goal - tuple 93 | @return float - estimated cost of goal from the state. In this case Euclidean Dist 94 | """ 95 | dist = math.sqrt((goal[0] - state[0])**2 + (goal[1] - state[1])**2) 96 | return dist 97 | 98 | def convergence_condition(state_val, epoch_val, point): 99 | """ 100 | Check if the cost are converged to the optimal values 101 | @param: cost of states from one epoch back and current epoch 102 | @return Bool - True if difference between values are less than fixed delta. 
103 | """ 104 | diffrence = (state_val[point[0],point[1]] - epoch_val[point[0],point[1]]) 105 | 106 | if (diffrence < delta): 107 | return True 108 | else: 109 | return False 110 | 111 | def syncDP(start,goal): 112 | """ 113 | Implementation of the Synchronous version of DP for shortest path problem 114 | @param: start, goal 115 | @return: Optimal Value function 116 | """ 117 | global state_space 118 | 119 | epochs = 0 120 | 121 | state_val = np.full((width+1,height+1),999.0) 122 | epoch_val = np.full((width+1,height+1),999.0) 123 | 124 | condition = False 125 | #Repeat until Convergence 126 | while (not condition): 127 | 128 | for i in range(X_max): 129 | for j in range(Y_max): 130 | # Update costs of the states 131 | cost = dict() 132 | curr_state = (i,j) 133 | neighbours = stateNeighbours((curr_state)) 134 | 135 | for n in neighbours: 136 | cost_n = ci_nonGoal + search_heuristic(n,goal) 137 | cost[cost_n] = n 138 | 139 | if not len(neighbours) == 0: 140 | min_cost = min(cost.keys()) 141 | 142 | 143 | if epoch_val[curr_state[0],curr_state[1]] > min_cost: 144 | epoch_val[curr_state[0],curr_state[1]] = min_cost 145 | 146 | epochs += 1 147 | print '-----------', epochs, '\t epochs Complete ----------------' 148 | x = random.randint(X_min,X_max) 149 | y = random.randint(Y_min,Y_max) 150 | condition = convergence_condition(state_val,epoch_val,(x,y)) 151 | 152 | #Back up the costs of all states at once 153 | state_val = epoch_val 154 | 155 | #state_space, ret = cv2.threshold(state_space,127,1,cv2.THRESH_BINARY) 156 | #cv2.imshow('window',state_space) 157 | #cv2.waitKey(1) 158 | 159 | return state_val 160 | 161 | 162 | def gauss_seidal(start,goal): 163 | """ 164 | Implementation of Gauss_seidal algorithm for solving MDP 165 | @param start goal 166 | @Return: Optimal Value function 167 | """ 168 | epochs = 0 169 | 170 | state_val = np.full((width+1,height+1),999.0) 171 | val_kMinus1 = np.full((width+1,height+1),999.0) 172 | 173 | condition = False 174 | while (not condition): 175 | 176 | for i in range(X_max): 177 | for j in range(Y_max): 178 | cost = dict() 179 | curr_state = (i,j) 180 | 181 | neighbours = stateNeighbours((curr_state)) 182 | 183 | for n in neighbours: 184 | cost_n = ci_nonGoal + search_heuristic(n,goal) 185 | cost[cost_n] = n 186 | 187 | if not len(neighbours) == 0: 188 | min_cost = min(cost.keys()) 189 | 190 | #Keep backing up the costs of states as when the sweep is done 191 | if state_val[curr_state[0],curr_state[1]] > min_cost: 192 | state_val[curr_state[0],curr_state[1]] = min_cost 193 | 194 | #cv2.circle(state_space,curr_state, 2, 1, -1) 195 | #cv2.imshow('window',state_space) 196 | #cv2.waitKey(1) 197 | 198 | epochs += 1 199 | print '-----------', epochs, '\t epochs Complete ----------------' 200 | x = random.randint(X_min,X_max) 201 | y = random.randint(Y_min,Y_max) 202 | condition = convergence_condition(state_val,val_kMinus1,(x,y)) 203 | val_kMinus1 = state_val 204 | 205 | return state_val 206 | 207 | def greedy_policy(start_point,goal_point,state_val): 208 | """ 209 | Generates a greedy policy based on optimal value function 210 | @param start, goal, Optimal value function 211 | @return Path or optimal policy 212 | """ 213 | path = list() 214 | 215 | curr_state = start_point 216 | while not(curr_state == goal_point): 217 | cost = dict() 218 | neighbours = stateNeighbours(curr_state) 219 | 220 | for n in neighbours: 221 | cost_n = state_val[n] 222 | cost[cost_n] = n 223 | 224 | 225 | if not len(neighbours) == 0: 226 | min_cost = min(cost.keys()) 227 | next_state = 
cost[min_cost] 228 | path.append(next_state) 229 | 230 | cv2.line(world, (next_state[0],next_state[1]), (curr_state[0],curr_state[1]), 1, 1) 231 | curr_state = next_state 232 | 233 | cv2.imshow('window',world) 234 | if cv2.waitKey(50) == 27: 235 | break 236 | 237 | cv2.destroyAllWindows() 238 | return path 239 | 240 | if __name__ == '__main__': 241 | generate_startGoal() 242 | 243 | start_point = (50, 60) 244 | goal_point = (305,180) 245 | 246 | cv2.circle(world, start_point, 5, color=1, thickness=-1, lineType=8, shift=0) 247 | cv2.circle(world, goal_point, 5, color=1, thickness=-1, lineType=8, shift=0) 248 | 249 | ############ PLEASE UNCOMMENT THE ONE TO RUN AND COMMENT ONE NOT NEEDED ########## 250 | state_val = syncDP(start_point,goal_point) 251 | #state_val = gauss_seidal(start_point,goal_point) 252 | #---------------------------------------------------------------------------- 253 | print '---------------------Update complete-------------------' 254 | 255 | path = greedy_policy(start_point,goal_point,state_val) 256 | 257 | --------------------------------------------------------------------------------