├── Real Time Dynamic Programming ├── README.md ├── lrta_rtdp.mp4 ├── Presentation.pdf ├── syncdp_gauss.mp4 ├── Policy_Evaluation.ipynb ├── gauss_seidel_VI.ipynb ├── LRTA_RTDP.py └── SyncDP_GaussSeidal.py ├── TD Control methods - Expected SARSA ├── 1.png ├── 2.png ├── 3.png ├── 4.png ├── 5.png ├── 6.png ├── 7.png ├── 8.png ├── q.png ├── s.png ├── es.png ├── se.png ├── u_q.png ├── u_sarsa.png ├── exp_sarsa.png ├── u_e_sarsa.png └── README.md ├── Emphatic Temporal-Difference Learning ├── etd0.png ├── etd0_1.png ├── rmsve.png ├── etd_lambda.png └── README.md ├── Q(sigma) and multi-step bootstrapping methods ├── pic_1.png ├── pic_2.png ├── pic_3.png └── pic_4.png ├── Temporal-Difference Learning by Harm van Seijen └── README.md └── README.md /Real Time Dynamic Programming/README.md: -------------------------------------------------------------------------------- 1 | Presentation for 3rd February 2017 by Monica Patel and Pulkit Khandelwal 2 | 3 | -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/1.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/2.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/3.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/4.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/5.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/6.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/7.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/8.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/8.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/q.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/q.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/s.png -------------------------------------------------------------------------------- /Real Time Dynamic Programming/lrta_rtdp.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Real Time Dynamic Programming/lrta_rtdp.mp4 -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/es.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/es.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/se.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/se.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/u_q.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/u_q.png -------------------------------------------------------------------------------- /Emphatic Temporal-Difference Learning/etd0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Emphatic Temporal-Difference Learning/etd0.png -------------------------------------------------------------------------------- /Real Time Dynamic Programming/Presentation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Real Time Dynamic Programming/Presentation.pdf -------------------------------------------------------------------------------- /Real Time Dynamic Programming/syncdp_gauss.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Real Time Dynamic Programming/syncdp_gauss.mp4 -------------------------------------------------------------------------------- /Emphatic Temporal-Difference Learning/etd0_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Emphatic Temporal-Difference 
Learning/etd0_1.png -------------------------------------------------------------------------------- /Emphatic Temporal-Difference Learning/rmsve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Emphatic Temporal-Difference Learning/rmsve.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/u_sarsa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/u_sarsa.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/exp_sarsa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/exp_sarsa.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/u_e_sarsa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/TD Control methods - Expected SARSA/u_e_sarsa.png -------------------------------------------------------------------------------- /Emphatic Temporal-Difference Learning/etd_lambda.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Emphatic Temporal-Difference Learning/etd_lambda.png -------------------------------------------------------------------------------- /Q(sigma) and multi-step bootstrapping methods/pic_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Q(sigma) and multi-step bootstrapping methods/pic_1.png -------------------------------------------------------------------------------- /Q(sigma) and multi-step bootstrapping methods/pic_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Q(sigma) and multi-step bootstrapping methods/pic_2.png -------------------------------------------------------------------------------- /Q(sigma) and multi-step bootstrapping methods/pic_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Q(sigma) and multi-step bootstrapping methods/pic_3.png -------------------------------------------------------------------------------- /Q(sigma) and multi-step bootstrapping methods/pic_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pulkit-Khandelwal/Reinforcement-Learning-Notebooks/HEAD/Q(sigma) and multi-step bootstrapping methods/pic_4.png -------------------------------------------------------------------------------- /TD Control methods - Expected SARSA/README.md: -------------------------------------------------------------------------------- 1 | Temporal Difference Learning Methods compared: 2 | Sarsa, Expected Sarsa and Q-learning 
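The three methods above differ only in the target used for the one-step backup. Below is a minimal, illustrative sketch of the three targets for a tabular action-value table (this is not taken from the notebooks; names such as `Q`, `gamma`, and `epsilon` are assumptions):

```python
import numpy as np

def one_step_targets(Q, next_state, next_action, reward, gamma, epsilon):
    """Compare the backup targets of Sarsa, Expected Sarsa and Q-learning."""
    n_actions = Q.shape[1]

    # Sarsa: bootstrap from the action actually selected by the behaviour policy.
    sarsa = reward + gamma * Q[next_state, next_action]

    # Expected Sarsa: bootstrap from the expectation under the epsilon-greedy policy.
    probs = np.full(n_actions, epsilon / n_actions)
    probs[np.argmax(Q[next_state])] += 1.0 - epsilon
    expected_sarsa = reward + gamma * np.dot(probs, Q[next_state])

    # Q-learning: bootstrap from the greedy (max) action.
    q_learning = reward + gamma * np.max(Q[next_state])

    return sarsa, expected_sarsa, q_learning
```

In every case the update itself is the same: `Q[state, action] += alpha * (target - Q[state, action])`.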
3 | 4 | Three environments: 5 | Cliff Walking 6 | Windy Gridworld 7 | Gridworld 8 | 9 | The three IPython Notebooks produce different results even though the underlying theory is the same, so compare the results across them. 10 | 11 | More flavours of environments will be added soon. 12 | -------------------------------------------------------------------------------- /Temporal-Difference Learning by Harm van Seijen/README.md: -------------------------------------------------------------------------------- 1 | 2 | An Empirical Evaluation of True Online TD(Lambda) by Harm van Seijen et al. 3 | True Online Temporal-Difference Learning by Harm van Seijen et al.; 4 | -----Journal of Machine Learning Research 2016---- 5 | 6 | In this IPython Notebook, I walk through various function approximation methods for estimating the value function V(s), i.e., on-policy prediction. Refer to the two papers above and Chapters 9 and 12 of Sutton and Barto's book. I have implemented the following algorithms: 7 | 8 | 1. Gradient Monte Carlo Algorithm for Approximating V 9 | 2. Semi-gradient TD(0) for estimating V_pi 10 | 3. Semi-gradient TD(lambda) 11 | 4. True-online TD(lambda) 12 | 5. TD(0) for prediction. This result will act as V_pi(s) to calculate the RMSVE 13 | 14 | We get introduced to the lambda-return, eligibility traces and Dutch traces! 15 | -------------------------------------------------------------------------------- /Emphatic Temporal-Difference Learning/README.md: -------------------------------------------------------------------------------- 1 | An Emphatic Approach to the Problem of Off-policy Temporal-Difference Learning---- Richard S. Sutton, A. Rupam Mahmood and Martha White 2 | 3 | In this IPython Notebook, I walk through various function approximation methods for estimating the value function V(s), as described in the above-mentioned paper. We get introduced to a different type of trace, the followon trace, to the interest function, and to a new variant of the squared value error that incorporates the interest function! 4 | The algorithms implemented in this notebook are: 5 | 6 | 1. Emphatic TD(lambda) 7 | 2. Emphatic TD(0) 8 | 3. Off-policy Semi-gradient TD(0) for estimating V_pi 9 | 10 | The above algorithms are compared with the following algorithms from the previous assignment (assignment 4): 11 | 12 | 1. Gradient Monte Carlo Algorithm for Approximating V 13 | 2. Semi-gradient TD(0) for estimating V_pi 14 | 3. TD(lambda) 15 | 4. True-online TD(lambda) 16 | 5. TD(0) for prediction. This result will act as V_pi(s) to calculate the RMSVE 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement-Learning-Notebooks 2 | 3 | ## A collection of Reinforcement Learning algorithms from Sutton and Barto's book and other research papers, implemented in Python. 4 | 5 | I wrote these notebooks in March 2017 while I took the *COMP 767: Reinforcement Learning* [5] class by Prof. Doina Precup at McGill, Montréal. I highly recommend going through the class notes and the references to all the papers the instructors have posted on the website. 6 | 7 | These notebooks should be used while you read the book; you can then go beyond it with the referenced papers. I would suggest watching David Silver's videos and reading the book simultaneously. And when you are done with a few chapters, start implementing them.
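To make the shared structure concrete before diving into the notebooks, here is a minimal sketch of the tabular TD(0) prediction update (purely illustrative, not taken from any notebook; it assumes a classic Gym-style interface with `state = env.reset()` and `next_state, reward, done, _ = env.step(action)`, and `policy`, `alpha`, `gamma` are assumed inputs):

```python
import numpy as np

def td0_prediction(env, policy, n_states, episodes=500, alpha=0.1, gamma=1.0):
    """Estimate V(s) for a fixed policy with one-step TD(0) backups."""
    V = np.zeros(n_states)
    for _ in range(episodes):
        state = env.reset()
        done = False
        while not done:
            action = policy(state)
            next_state, reward, done, _ = env.step(action)
            # Move V(s) a step of size alpha towards the bootstrapped target.
            V[state] += alpha * (reward + gamma * V[next_state] - V[state])
            state = next_state
    return V
```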
The algorithms follow a pattern and mostly are variants of each other. I have tried my best to explain each notebook's results and possible future directions. 8 | 9 | 10 | Disclaimer: The code is a little messy. I'd written this when I was not a Pythonista. If you would like to clean them up and want to make it into a nice interface, feel free to contact me. I will be very pleased to collaborate. If you use them then please cite the source and also mention the credits as listed below. Also, email me with ways to improve, let me know if you find any bugs. 11 | 12 | Feel free to reach me at pulkit.khandelwal@mail.mcgill.ca or see my website [here](https://pulkit-khandelwal.github.io/) 13 | 14 | Special Credits: 15 | 16 | [1] [Denny Britz](https://github.com/dennybritz/reinforcement-learning) 17 | 18 | [2] [Monica Patel](https://monicaopatel.com/) 19 | 20 | [3] [Sutton and Barto](https://www.amazon.ca/Reinforcement-Learning-Introduction-Richard-Sutton/dp/0262193981) 21 | 22 | [4] [David Silver](https://www.youtube.com/playlist?list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ) 23 | 24 | [5] [Doina Precup's course](https://www.cs.mcgill.ca/~dprecup/courses/rl.html) 25 | -------------------------------------------------------------------------------- /Real Time Dynamic Programming/Policy_Evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 78, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "\"\"\"\n", 12 | "Define all the libraries and make an environment\n", 13 | "\"\"\"\n", 14 | "\n", 15 | "import numpy as np\n", 16 | "import sys\n", 17 | "if \"../\" not in sys.path:\n", 18 | " sys.path.append(\"../\") \n", 19 | "from lib.envs.gridworld import GridworldEnv\n", 20 | "env = GridworldEnv()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 79, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "def evaluate_policy(policy, env, lambda_value=0.2, theta=0.0001):\n", 32 | " \"\"\"\n", 33 | " \n", 34 | " This function returns a vector of the value function of each state\n", 35 | " \n", 36 | " \"\"\"\n", 37 | " # Start with a random value function. Here, I have used zeros.\n", 38 | " V = np.zeros(env.nS)\n", 39 | " t = 0\n", 40 | " while True:\n", 41 | " delta = 0\n", 42 | " # Iterate over each state\n", 43 | " for s in range(env.nS):\n", 44 | " value = 0\n", 45 | " # See actions given a state\n", 46 | " for a, action_prob in enumerate(policy[s]):\n", 47 | " # Given state and action, see the next sate and immediate reward\n", 48 | " for prob, next_state, reward, done in env.P[s][a]:\n", 49 | " # Calculate the expected value at each iteration\n", 50 | " value += action_prob * prob * (reward + lambda_value * V[next_state])\n", 51 | " # Change in value function over states in each iteration\n", 52 | " delta = max(delta, np.abs(value - V[s]))\n", 53 | " V[s] = value\n", 54 | " # Stop at a certain threshold\n", 55 | " t +=1\n", 56 | " if delta < theta:\n", 57 | " break\n", 58 | " return (np.array(V),t)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 80, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "Value Function\n", 73 | "[ 0. 
-1.18366196 -1.24642763 -1.24959779 -1.18366196 -1.24326209\n", 74 | " -1.24896873 -1.24643287 -1.24642763 -1.24896873 -1.24326352 -1.18366814\n", 75 | " -1.24959779 -1.24643287 -1.18366814 0. ]\n", 76 | "\n", 77 | "Reshaped Grid Value Function\n", 78 | "[[ 0. -1.18366196 -1.24642763 -1.24959779]\n", 79 | " [-1.18366196 -1.24326209 -1.24896873 -1.24643287]\n", 80 | " [-1.24642763 -1.24896873 -1.24326352 -1.18366814]\n", 81 | " [-1.24959779 -1.24643287 -1.18366814 0. ]]\n", 82 | "\n", 83 | "For lambda value of 0.2 and threshold of 0.00001, the number of iterations taken to converge:\n", 84 | "6\n" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "random_policy = np.ones([env.nS, env.nA]) / env.nA\n", 90 | "#Compute the vector of the state value functions\n", 91 | "v = evaluate_policy(random_policy, env)[0]\n", 92 | "\n", 93 | "print(\"Value Function\")\n", 94 | "print(v)\n", 95 | "print(\"\")\n", 96 | "\n", 97 | "#Resahpe into a grid\n", 98 | "print(\"Reshaped Grid Value Function\")\n", 99 | "print(v.reshape(env.shape))\n", 100 | "print(\"\")\n", 101 | "\n", 102 | "#Number of iterations\n", 103 | "print(\"For lambda value of 0.2 and threshold of 0.00001, the number of iterations taken to converge:\")\n", 104 | "print(evaluate_policy(random_policy, env)[1])\n", 105 | "\n", 106 | "#Credits: WildML and OpenAI gym" 107 | ] 108 | } 109 | ], 110 | "metadata": { 111 | "anaconda-cloud": {}, 112 | "kernelspec": { 113 | "display_name": "Python [default]", 114 | "language": "python", 115 | "name": "python2" 116 | }, 117 | "language_info": { 118 | "codemirror_mode": { 119 | "name": "ipython", 120 | "version": 2.0 121 | }, 122 | "file_extension": ".py", 123 | "mimetype": "text/x-python", 124 | "name": "python", 125 | "nbconvert_exporter": "python", 126 | "pygments_lexer": "ipython2", 127 | "version": "2.7.12" 128 | } 129 | }, 130 | "nbformat": 4, 131 | "nbformat_minor": 0 132 | } -------------------------------------------------------------------------------- /Real Time Dynamic Programming/gauss_seidel_VI.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 81, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pprint\n", 13 | "import sys\n", 14 | "if \"../\" not in sys.path:\n", 15 | " sys.path.append(\"../\") \n", 16 | "from lib.envs.gridworld import GridworldEnv\n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 82, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "pp = pprint.PrettyPrinter(indent=2)\n", 28 | "env = GridworldEnv()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 83, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "def value_iteration(env, theta=0.0001, discount_factor=1.0):\n", 40 | " \"\"\"\n", 41 | " Gauss Seidel Variation to the Value Iteration method\n", 42 | " \"\"\"\n", 43 | " \n", 44 | " def one_step_lookahead(state, V,V_old):\n", 45 | " \n", 46 | " A = np.zeros(env.nA)\n", 47 | " for a in range(env.nA):\n", 48 | " for prob, next_state, reward, done in env.P[state][a]:\n", 49 | " \n", 50 | " ####### Gauss Seidel #######\n", 51 | " if next_state < state:\n", 52 | " A[a] += prob * (reward + discount_factor * V[next_state])\n", 53 | " else:\n", 54 | " A[a] += prob * (reward + discount_factor * V_old[next_state])\n", 55 | " \n", 56 | " \n", 57 | " return A\n", 58 | " 
\n", 59 | " V = np.zeros(env.nS)\n", 60 | " V_old = np.zeros(env.nS)\n", 61 | " \n", 62 | " t = 0\n", 63 | "\n", 64 | " while True:\n", 65 | " \n", 66 | " delta = 0\n", 67 | " V_old = V\n", 68 | " # For every state\n", 69 | " for s in range(env.nS):\n", 70 | " # Do a one-step lookahead to find the best action\n", 71 | " A = one_step_lookahead(s, V,V_old)\n", 72 | " best_action_value = np.max(A)\n", 73 | " # Select best action\n", 74 | " delta = max(delta, np.abs(best_action_value - V[s]))\n", 75 | " # Value function update\n", 76 | " \n", 77 | " V[s] = best_action_value \n", 78 | " t +=1\n", 79 | " if delta < theta:\n", 80 | " break\n", 81 | " \n", 82 | " \n", 83 | " # Find optimal policy\n", 84 | " policy = np.zeros([env.nS, env.nA])\n", 85 | " for s in range(env.nS):\n", 86 | " \n", 87 | " A = one_step_lookahead(s, V, V_old)\n", 88 | " best_action = np.argmax(A)\n", 89 | " \n", 90 | " policy[s, best_action] = 1.0\n", 91 | " \n", 92 | " return policy, V,t" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 84, 98 | "metadata": { 99 | "collapsed": false 100 | }, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "Policy Probability Distribution:\n", 107 | "[[ 1. 0. 0. 0.]\n", 108 | " [ 0. 0. 0. 1.]\n", 109 | " [ 0. 0. 0. 1.]\n", 110 | " [ 0. 0. 1. 0.]\n", 111 | " [ 1. 0. 0. 0.]\n", 112 | " [ 1. 0. 0. 0.]\n", 113 | " [ 1. 0. 0. 0.]\n", 114 | " [ 0. 0. 1. 0.]\n", 115 | " [ 1. 0. 0. 0.]\n", 116 | " [ 1. 0. 0. 0.]\n", 117 | " [ 0. 1. 0. 0.]\n", 118 | " [ 0. 0. 1. 0.]\n", 119 | " [ 1. 0. 0. 0.]\n", 120 | " [ 0. 1. 0. 0.]\n", 121 | " [ 0. 1. 0. 0.]\n", 122 | " [ 1. 0. 0. 0.]]\n", 123 | "\n", 124 | "Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\n", 125 | "[[0 3 3 2]\n", 126 | " [0 0 0 2]\n", 127 | " [0 0 1 2]\n", 128 | " [0 1 1 0]]\n", 129 | "\n", 130 | "Value Function:\n", 131 | "[ 0. -1. -2. -3. -1. -2. -3. -2. -2. -3. -2. -1. -3. -2. -1. 0.]\n", 132 | "\n", 133 | "Reshaped Grid Value Function:\n", 134 | "[[ 0. -1. -2. -3.]\n", 135 | " [-1. -2. -3. -2.]\n", 136 | " [-2. -3. -2. -1.]\n", 137 | " [-3. -2. -1. 0.]]\n", 138 | "\n", 139 | "Converged in number of steps:\n", 140 | "4\n", 141 | "\n" 142 | ] 143 | } 144 | ], 145 | "source": [ 146 | "policy, v = value_iteration(env)[0],value_iteration(env)[1]\n", 147 | "\n", 148 | "print(\"Policy Probability Distribution:\")\n", 149 | "print(policy)\n", 150 | "print(\"\")\n", 151 | "\n", 152 | "print(\"Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\")\n", 153 | "print(np.reshape(np.argmax(policy, axis=1), env.shape))\n", 154 | "print(\"\")\n", 155 | "\n", 156 | "print(\"Value Function:\")\n", 157 | "print(v)\n", 158 | "print(\"\")\n", 159 | "\n", 160 | "print(\"Reshaped Grid Value Function:\")\n", 161 | "print(v.reshape(env.shape))\n", 162 | "print(\"\")\n", 163 | "\n", 164 | "print(\"Converged in number of steps:\")\n", 165 | "print(value_iteration(env)[2])\n", 166 | "print(\"\")" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": { 172 | "collapsed": true 173 | }, 174 | "source": [ 175 | "Credits: The code has been adapted from WildML's blog and has been modified to implement the Gauss Sediel Value Iteration." 
176 | ] 177 | } 178 | ], 179 | "metadata": { 180 | "anaconda-cloud": {}, 181 | "kernelspec": { 182 | "display_name": "Python [default]", 183 | "language": "python", 184 | "name": "python2" 185 | }, 186 | "language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 2 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython2", 196 | "version": "2.7.12" 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 0 201 | } 202 | -------------------------------------------------------------------------------- /Real Time Dynamic Programming/LRTA_RTDP.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Wed Feb 1 19:27:41 2017 3 | 4 | @author: monica 5 | """ 6 | #--------------------------- Imports ---------------------------- 7 | import cv2 8 | import itertools 9 | import numpy as np 10 | import random 11 | import math 12 | 13 | #-------------------------Class -------------------------------------- 14 | class Stack: 15 | def __init__(self): 16 | self.items = [] 17 | 18 | def isEmpty(self): 19 | return self.items == [] 20 | 21 | def push(self, item): 22 | self.items.append(item) 23 | 24 | def pop(self): 25 | return self.items.pop() 26 | 27 | def peek(self): 28 | return self.items[len(self.items)-1] 29 | 30 | def size(self): 31 | return len(self.items) 32 | 33 | #---------------------- Global Parameters ---------------------------------- 34 | #img1 = cv2.imread('pic3.jpg',0) 35 | img1 = cv2.imread('pic4.png',0) 36 | 37 | ret,world = cv2.threshold(img1,127,255,cv2.THRESH_BINARY) 38 | height, width = img1.shape 39 | 40 | print "------------" + str(img1.shape) 41 | 42 | X_min = 0 43 | Y_min = 0 44 | X_max = width - 1 45 | Y_max = height - 1 46 | states = dict() 47 | 48 | start = set() 49 | goal = set() 50 | 51 | state_val = np.full((width,height),999.0) 52 | 53 | move = {1,-1} 54 | 55 | admisible_actions = set(itertools.product(move,move)) 56 | 57 | p_correct = 0.9 58 | ci_nonGoal = 1 59 | ci_Goal = 0 60 | 61 | 62 | #----------------------------- Helper FUnctions ------------------------------- 63 | 64 | def generate_startGoal(): 65 | global goal, start, world 66 | 67 | for i in range(0,70): 68 | goal.add((X_max,i)) 69 | goal.add((X_max-1,i)) 70 | 71 | for j in range(Y_max-70,Y_max): 72 | start.add((X_max,j)) 73 | start.add((X_max-1,j)) 74 | 75 | 76 | 77 | def stateNeighbours(state): 78 | """ 79 | Takes in node, (Image point) and returns all the neighboring image points 80 | @param node <-- tuple 81 | @return list(tuple) 82 | """ 83 | neighbours = list() 84 | 85 | x = state[0] 86 | y = state[1] 87 | 88 | 89 | if not ((x+1) > X_max or (y+1) > Y_max ): 90 | neighbours.append((x+1,y+1)) 91 | if not ((x+1) > X_max ): 92 | neighbours.append((x+1,y)) 93 | if not ((y+1) > Y_max ): 94 | neighbours.append((x,y+1)) 95 | if not ((x-1) < X_min or (y-1) < Y_min ): 96 | neighbours.append((x-1,y-1)) 97 | if not ((x-1) < X_min ): 98 | neighbours.append((x-1,y)) 99 | if not ((y-1) < Y_min ): 100 | neighbours.append((x,y-1)) 101 | if not ((x+1) > X_max or (y-1) < Y_min ): 102 | neighbours.append((x+1,y-1)) 103 | if not ((x-1) < X_min or (y+1) > Y_max ): 104 | neighbours.append((x-1,y+1)) 105 | 106 | return neighbours 107 | 108 | def search_heuristic(state,goal): 109 | """ 110 | Hueristic for estimating cost of future nodes. 111 | @param state - tuple, goal - tuple 112 | @return float - estimated cost of goal from the state. 
In this case Euclidean Dist 113 | """ 114 | dist = math.sqrt((goal[0] - state[0])**2 + (goal[1] - state[1])**2) 115 | return dist 116 | 117 | def lrta_star(start,goal): 118 | """ 119 | LRTA* implementation for shortest path problem in grid world 120 | @param start, goal 121 | """ 122 | global world 123 | path = list() 124 | cost = dict() 125 | current_state = start 126 | 127 | while(not(current_state in goal)): 128 | 129 | #Get all the nodes reachable by admisible action 130 | neighbours = stateNeighbours(current_state) 131 | 132 | #For nodes find the cost 133 | for n in neighbours: 134 | cost_n = ci_nonGoal + search_heuristic(n,goal) 135 | cost[cost_n] = n 136 | 137 | min_cost = min(cost.keys()) 138 | 139 | #Update current node cost to min cost of neighbour node, move to neighbour node 140 | if state_val[current_state[0],current_state[1]] > min_cost: 141 | state_val[current_state[0],current_state[1]] = min_cost 142 | new_state = cost[min_cost] 143 | path.append(current_state) 144 | cv2.line(world, (current_state[0],current_state[1]), (new_state[0],new_state[1]), 1, 1) 145 | current_state = new_state 146 | 147 | cv2.imshow('window',world) 148 | 149 | if cv2.waitKey(50)==27: 150 | break 151 | cv2.destroyAllWindows() 152 | 153 | def populate_Bt(frontNode,depth): 154 | """ 155 | Forward search function which gives set for asynchrous DP update 156 | @param current_node, depth upto which the search is to be done 157 | @return subset of state set 158 | """ 159 | count = 0 160 | Bt = set() 161 | s = Stack() 162 | s.push(frontNode) 163 | while( not (s.size() == 0)): 164 | count += 1 165 | node = s.pop() 166 | Bt.add(node) 167 | neighbours = stateNeighbours(node) 168 | if count <= depth: 169 | for n in neighbours: 170 | s.push(n) 171 | return Bt 172 | 173 | def rtdp(start,goal): 174 | global world 175 | path = list() 176 | cost = dict() 177 | costf= dict() 178 | sweep_val = np.full((width,height),999.0) 179 | 180 | frontNode = start 181 | 182 | #While goal is not reached 183 | while(not(frontNode in goal)): 184 | Bt = populate_Bt(frontNode,1) 185 | 186 | #perform asynchrous updates for all nodes in Bt 187 | for item in list(Bt): 188 | neighbours = stateNeighbours(item) 189 | 190 | for n in neighbours: 191 | cost_n = ci_nonGoal + search_heuristic(n,goal) 192 | cost[cost_n] = n 193 | 194 | min_cost = min(cost.keys()) 195 | 196 | if sweep_val[item[0],item[1]] > min_cost: 197 | sweep_val[item[0],item[1]] = min_cost 198 | 199 | #Choose a greedy control action for current node 200 | state_val = sweep_val 201 | neighbours_front = stateNeighbours(frontNode) 202 | 203 | for n in neighbours_front: 204 | cost_nf = state_val[n[0],n[1]] 205 | costf[cost_nf] = n 206 | 207 | min_costf = min(costf.keys()) 208 | 209 | new_state = cost[min_costf] 210 | path.append(frontNode) 211 | cv2.line(world, (frontNode[0],frontNode[1]), (new_state[0],new_state[1]), 1, 1) 212 | frontNode = new_state 213 | 214 | cv2.imshow('window',world) 215 | 216 | if cv2.waitKey(50)==27: 217 | break 218 | cv2.destroyAllWindows() 219 | 220 | if __name__ == '__main__': 221 | generate_startGoal() 222 | start_point = list(start)[random.randint(0,len(start) - 1)] 223 | goal_point = (100,256) 224 | cv2.circle(world, start_point, 2, color=1, thickness=-1, lineType=8, shift=0) 225 | cv2.circle(world, goal_point, 2, color=1, thickness=-1, lineType=8, shift=0) 226 | 227 | ############ PLEASE UNCOMMENT THE ONE TO RUN AND COMMENT ONE NOT NEEDED ########## 228 | lrta_star(start_point,goal_point) 229 | #rtdp(start_point,goal_point) 
-------------------------------------------------------------------------------- /Real Time Dynamic Programming/SyncDP_GaussSeidal.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Wed Feb 1 19:27:41 2017 3 | 4 | @author: Monica Patel 5 | """ 6 | #--------------------------- Imports ---------------------------- 7 | import cv2 8 | import itertools 9 | import numpy as np 10 | import random 11 | import math 12 | #---------------------- Global Parameters ---------------------------------- 13 | world = cv2.imread('pic4.png',0) 14 | 15 | height, width = world.shape 16 | state_space = world 17 | 18 | print "-----World Shape-------" + str(world.shape) + '---------World Shape-----------' 19 | 20 | X_min = 0 21 | Y_min = 0 22 | X_max = width - 1 23 | Y_max = height - 1 24 | states = dict() 25 | 26 | start = set() 27 | goal = set() 28 | 29 | move = {1,-1} 30 | 31 | admisible_actions = set(itertools.product(move,move)) 32 | 33 | p_correct = 0.9 34 | 35 | ci_nonGoal = 1 36 | ci_Goal = 0 37 | ci_wall = 1000 38 | 39 | delta = 0.1 40 | 41 | #----------------------------- Helper FUnctions ------------------------------- 42 | 43 | 44 | def generate_startGoal(): 45 | """ 46 | Generates a set of Start points and Goal points from total State Space 47 | """ 48 | global goal, start, world 49 | 50 | for i in range(31,50): 51 | goal.add((i,0)) 52 | goal.add((i,1)) 53 | 54 | for j in range(0,20): 55 | start.add((j,0)) 56 | start.add((j,1)) 57 | 58 | 59 | def stateNeighbours(state): 60 | """ 61 | Takes in node, (Image point) and returns all the neighboring image points 62 | @param node <-- tuple 63 | @return list(tuple) 64 | """ 65 | neighbours = list() 66 | 67 | x = state[0] 68 | y = state[1] 69 | 70 | if not ((x+1) > X_max or (y+1) > Y_max ): 71 | neighbours.append((x+1,y+1)) 72 | if not ((x+1) > X_max ): 73 | neighbours.append((x+1,y)) 74 | if not ((y+1) > Y_max ): 75 | neighbours.append((x,y+1)) 76 | if not ((x-1) < X_min or (y-1) < Y_min ): 77 | neighbours.append((x-1,y-1)) 78 | if not ((x-1) < X_min ): 79 | neighbours.append((x-1,y)) 80 | if not ((y-1) < Y_min ): 81 | neighbours.append((x,y-1)) 82 | if not ((x+1) > X_max or (y-1) < Y_min ): 83 | neighbours.append((x+1,y-1)) 84 | if not ((x-1) < X_min or (y+1) > Y_max ): 85 | neighbours.append((x-1,y+1)) 86 | 87 | return neighbours 88 | 89 | def search_heuristic(state,goal): 90 | """ 91 | Hueristic for estimating cost of future nodes. 92 | @param state - tuple, goal - tuple 93 | @return float - estimated cost of goal from the state. In this case Euclidean Dist 94 | """ 95 | dist = math.sqrt((goal[0] - state[0])**2 + (goal[1] - state[1])**2) 96 | return dist 97 | 98 | def convergence_condition(state_val, epoch_val, point): 99 | """ 100 | Check if the cost are converged to the optimal values 101 | @param: cost of states from one epoch back and current epoch 102 | @return Bool - True if difference between values are less than fixed delta. 
103 | """ 104 | diffrence = (state_val[point[0],point[1]] - epoch_val[point[0],point[1]]) 105 | 106 | if (diffrence < delta): 107 | return True 108 | else: 109 | return False 110 | 111 | def syncDP(start,goal): 112 | """ 113 | Implementation of the Synchronous version of DP for shortest path problem 114 | @param: start, goal 115 | @return: Optimal Value function 116 | """ 117 | global state_space 118 | 119 | epochs = 0 120 | 121 | state_val = np.full((width+1,height+1),999.0) 122 | epoch_val = np.full((width+1,height+1),999.0) 123 | 124 | condition = False 125 | #Repeat until Convergence 126 | while (not condition): 127 | 128 | for i in range(X_max): 129 | for j in range(Y_max): 130 | # Update costs of the states 131 | cost = dict() 132 | curr_state = (i,j) 133 | neighbours = stateNeighbours((curr_state)) 134 | 135 | for n in neighbours: 136 | cost_n = ci_nonGoal + search_heuristic(n,goal) 137 | cost[cost_n] = n 138 | 139 | if not len(neighbours) == 0: 140 | min_cost = min(cost.keys()) 141 | 142 | 143 | if epoch_val[curr_state[0],curr_state[1]] > min_cost: 144 | epoch_val[curr_state[0],curr_state[1]] = min_cost 145 | 146 | epochs += 1 147 | print '-----------', epochs, '\t epochs Complete ----------------' 148 | x = random.randint(X_min,X_max) 149 | y = random.randint(Y_min,Y_max) 150 | condition = convergence_condition(state_val,epoch_val,(x,y)) 151 | 152 | #Back up the costs of all states at once 153 | state_val = epoch_val 154 | 155 | #state_space, ret = cv2.threshold(state_space,127,1,cv2.THRESH_BINARY) 156 | #cv2.imshow('window',state_space) 157 | #cv2.waitKey(1) 158 | 159 | return state_val 160 | 161 | 162 | def gauss_seidal(start,goal): 163 | """ 164 | Implementation of Gauss_seidal algorithm for solving MDP 165 | @param start goal 166 | @Return: Optimal Value function 167 | """ 168 | epochs = 0 169 | 170 | state_val = np.full((width+1,height+1),999.0) 171 | val_kMinus1 = np.full((width+1,height+1),999.0) 172 | 173 | condition = False 174 | while (not condition): 175 | 176 | for i in range(X_max): 177 | for j in range(Y_max): 178 | cost = dict() 179 | curr_state = (i,j) 180 | 181 | neighbours = stateNeighbours((curr_state)) 182 | 183 | for n in neighbours: 184 | cost_n = ci_nonGoal + search_heuristic(n,goal) 185 | cost[cost_n] = n 186 | 187 | if not len(neighbours) == 0: 188 | min_cost = min(cost.keys()) 189 | 190 | #Keep backing up the costs of states as when the sweep is done 191 | if state_val[curr_state[0],curr_state[1]] > min_cost: 192 | state_val[curr_state[0],curr_state[1]] = min_cost 193 | 194 | #cv2.circle(state_space,curr_state, 2, 1, -1) 195 | #cv2.imshow('window',state_space) 196 | #cv2.waitKey(1) 197 | 198 | epochs += 1 199 | print '-----------', epochs, '\t epochs Complete ----------------' 200 | x = random.randint(X_min,X_max) 201 | y = random.randint(Y_min,Y_max) 202 | condition = convergence_condition(state_val,val_kMinus1,(x,y)) 203 | val_kMinus1 = state_val 204 | 205 | return state_val 206 | 207 | def greedy_policy(start_point,goal_point,state_val): 208 | """ 209 | Generates a greedy policy based on optimal value function 210 | @param start, goal, Optimal value function 211 | @return Path or optimal policy 212 | """ 213 | path = list() 214 | 215 | curr_state = start_point 216 | while not(curr_state == goal_point): 217 | cost = dict() 218 | neighbours = stateNeighbours(curr_state) 219 | 220 | for n in neighbours: 221 | cost_n = state_val[n] 222 | cost[cost_n] = n 223 | 224 | 225 | if not len(neighbours) == 0: 226 | min_cost = min(cost.keys()) 227 | next_state = 
cost[min_cost] 228 | path.append(next_state) 229 | 230 | cv2.line(world, (next_state[0],next_state[1]), (curr_state[0],curr_state[1]), 1, 1) 231 | curr_state = next_state 232 | 233 | cv2.imshow('window',world) 234 | if cv2.waitKey(50) == 27: 235 | break 236 | 237 | cv2.destroyAllWindows() 238 | return path 239 | 240 | if __name__ == '__main__': 241 | generate_startGoal() 242 | 243 | start_point = (50, 60) 244 | goal_point = (305,180) 245 | 246 | cv2.circle(world, start_point, 5, color=1, thickness=-1, lineType=8, shift=0) 247 | cv2.circle(world, goal_point, 5, color=1, thickness=-1, lineType=8, shift=0) 248 | 249 | ############ PLEASE UNCOMMENT THE ONE TO RUN AND COMMENT ONE NOT NEEDED ########## 250 | state_val = syncDP(start_point,goal_point) 251 | #state_val = gauss_seidal(start_point,goal_point) 252 | #---------------------------------------------------------------------------- 253 | print '---------------------Update complete-------------------' 254 | 255 | path = greedy_policy(start_point,goal_point,state_val) 256 | 257 | --------------------------------------------------------------------------------