├── .gitignore
├── LICENSE
├── README.md
├── cpp
    ├── ETD.cpp
    └── TOETD.cpp
└── py3
    ├── dvtd.py
    ├── elstd.py
    ├── etd.py
    ├── gtd.py
    ├── htd.py
    ├── idbd.py
    ├── lstd.py
    ├── td.py
    └── totd.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Compiled Object files
 2 | *.slo
 3 | *.lo
 4 | *.o
 5 | *.obj
 6 | 
 7 | # Precompiled Headers
 8 | *.gch
 9 | *.pch
10 | 
11 | # Compiled Dynamic libraries
12 | *.so
13 | *.dylib
14 | *.dll
15 | 
16 | # Fortran module files
17 | *.mod
18 | 
19 | # Compiled Static libraries
20 | *.lai
21 | *.la
22 | *.a
23 | *.lib
24 | 
25 | # Executables
26 | *.exe
27 | *.out
28 | *.app
29 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 rldotai
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # rl-algorithms
 2 | 
 3 | Reinforcement learning algorithms.
 4 | 
 5 | There are many different variants on the basic ideas of reinforcement learning.
 6 | I have implemented some of them, with a focus on linear function approximation.
 7 | 
 8 | Extending these algorithms (for example, with nonlinear function approximators such as neural nets) is relatively straightforward once you are familiar with the underlying ideas.
 9 | 
10 | To facilitate this, the algorithms listed are written in a straightforward style and thoroughly commented, with references to the relevant papers and some explanation of the reasoning behind the code.
11 | 
12 | ## Implemented Algorithms
13 | 
14 | - [TD(λ): Temporal Difference Learning](py3/td.py)
15 | - [LSTD(λ): Least-Squares Temporal Difference Learning](py3/lstd.py)
16 | - [ETD(λ): Emphatic Temporal Difference Learning](py3/etd.py)
17 | - [GTD(λ): Gradient Temporal Difference Learning, AKA TDC(λ)](py3/gtd.py)
18 | - [TOTD(λ): True-Online Temporal Difference Learning, AKA TD with "Dutch Traces"](py3/totd.py)
 19 | - [ELSTD(λ): Least Squares Emphatic Temporal Difference Learning](py3/elstd.py)
20 | - [HTD(λ): Hybrid Temporal Difference Learning](py3/htd.py)
 21 | - [DVTD(λ) or TD-δ^2: Online Variance Estimation via temporal difference errors](py3/dvtd.py)
22 |     - [The paper describing it](https://arxiv.org/abs/1801.08287)
23 | 
24 | ## TODO
25 | 
26 | - [ ] Q-Learning
27 | - [ ] SARSA
28 | - [ ] Distributional RL algorithms
29 | - [ ] Other second-order TD algorithms (e.g., NTD)
30 | - [ ] Actor-Critic algorithms
31 | 
32 | # Contributing
33 | 
34 | Send me a pull request if you have code to contribute.
35 | 
36 | Alternatively, raise an issue and provide me with a link to the paper describing the algorithm, and I will read and implement it when I get a chance.
37 | 


--------------------------------------------------------------------------------
/cpp/ETD.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 | * ETD(lambda): Emphatic Temporal Difference Learning
 3 | *
 4 | * @author Brendan Bennett, Rich Sutton, October 2015.
 5 | *
 6 | * CHANGES FROM TOETD.cpp
 7 | *   - renamed some variables
 8 | *   - removed `gamma` as object variable, since it was unused
 9 | *   - rearranged parameters in `learn()` so that `phi`, `r`, `phi_p` come first
10 | */
11 | 
/*
 * Emphatic TD(lambda) learner with linear function approximation.
 *
 * The object owns two heap-allocated vectors of length `n` (the weights and
 * the eligibility trace) plus two scalars that are carried from one call of
 * `learn()` to the next.
 */
class ETD
{
    int n;          // number of components in every feature/weight vector
    double *theta;  // weight vector (owned, length n)
    double *e;      // eligibility trace vector (owned, length n)
    double F;       // scalar memory for the emphasis computation
    double D;       // auxiliary scalar saved from one step to the next

    // Copy the scalars and vector contents of `other` into this object.
    // Requires that `theta` and `e` already point at buffers of `other.n`
    // doubles.
    void copy_state(const ETD &other) {
        n = other.n;
        F = other.F;
        D = other.D;
        for (int i=0; i<n; i++) {
            e[i] = other.e[i];
            theta[i] = other.theta[i];
        }
    }

public:
    // Create a learner for feature vectors of length `fvec_length`, with all
    // weights, traces and saved scalars set to zero.
    ETD(int fvec_length) {
        n = fvec_length;
        e = new double[n];
        theta = new double[n];

        // initialize weight vector and traces
        for (int i=0; i<n; i++) {
            e[i] = 0;
            theta[i] = 0;
        }
        // initialize scalar variables
        F = 0;
        D = 0;
    }

    // Deep copy: without this the implicit copy would share `theta`/`e`
    // between two objects and both destructors would free the same memory.
    ETD(const ETD &other) {
        e = new double[other.n];
        theta = new double[other.n];
        copy_state(other);
    }

    // Deep-copy assignment, for the same reason as the copy constructor.
    ETD &operator=(const ETD &other) {
        if (this != &other) {
            // allocate the replacements before releasing the old buffers
            double *new_e = new double[other.n];
            double *new_theta = new double[other.n];
            delete [] e;
            delete [] theta;
            e = new_e;
            theta = new_theta;
            copy_state(other);
        }
        return *this;
    }

    // Perform one learning update from the transition (phi, r, phi_p).
    //
    //   phi      feature vector of the current state (length n)
    //   r        reward observed on the transition
    //   phi_p    feature vector of the next state (length n)
    //   alpha    step size
    //   gamma    discount used when decaying the eligibility trace
    //   gamma_p  discount applied to the next state's prediction
    //   I        interest added to `F` on this step
    //   lambda   bootstrapping parameter
    //   rho      importance sampling ratio for this step
    void learn(double phi[], double r, double phi_p[],
               double alpha, double gamma, double gamma_p, double I,
               double lambda, double rho) {
        F = F + I; // avoid keeping track of previous timestep's rho
        double delta = r + gamma_p * dot(theta, phi_p) - dot(theta, phi);
        double M = lambda*I + (1-lambda)*F;
        // uses the trace from the previous step, so compute before the loop
        double S = rho*alpha*M*(1 - rho*gamma*lambda*dot(phi, e));
        double D_p = 0;

        // update weights and traces
        double delta_i; // a scalar, so no temporary vector is needed
        for (int i=0; i<n; i++) {
            e[i] = rho*gamma*lambda*e[i] + S*phi[i];
            delta_i = delta*e[i] + D * (e[i] - rho*alpha*M*phi[i]);
            theta[i] += delta_i;
            D_p += delta_i * phi_p[i];
        }
        // prepare for next iteration
        D = D_p;
        F *= rho*gamma_p;
    }

    // Return the prediction (inner product with the weights) for `fvec`.
    double predict(double fvec[]) {
        return dot(theta, fvec);
    }

    // Inner product of two vectors of `n` components.
    double dot(double v1[], double v2[]) {
        double ret = 0;
        for (int i=0; i<n; i++) {
            ret += v1[i]*v2[i];
        }
        return ret;
    }

    ~ETD() {
        delete [] e;
        delete [] theta;
    }
};  // the terminating semicolon is required after a class definition


--------------------------------------------------------------------------------
/cpp/TOETD.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * TOE-TD(lambda): True-online emphatic TD(lambda), an off-policy learning algorithm.
 3 |  * See external documentation in TOETD.pdf on the web.
 4 |  * @author Rich Sutton, September 2014.
 5 |  * Compile with gcc TOETD.cpp -c
 6 |  */
 7 | 
 8 | class TOETD
 9 | {
10 |   //instance variables:
11 |   double *theta;              // main weight vector
12 |   double *e;                  // eligibility trace vector
13 |   int n;                      // dimensionality of the vectors
14 |   double F;                   // scalar memory for the emphasis algorithm
15 |   double D, gamma;            // auxiliary saved scalars from one step to the next
16 | 
17 | public:
18 | 
19 |   TOETD(int nArg, double I) {
20 |     n = nArg;
21 |     e = new double[n];
22 |     theta = new double[n];
23 |     for (int i=0; i<n; i++) theta[i]=e[i]=0;
24 |     F = D = gamma = 0;
25 |   }
26 | 
27 |   void learn(double alpha, double I, double lambda, double phi[], double rho, double R, double phiPrime[], double gammaPrime)
28 |   {
29 |     double Delta_i; // here a scalar, to avoid allocating an extra vector
30 |     double delta = R + gammaPrime*dot(theta,phiPrime) - dot(theta,phi);
31 |     F = F + I;
32 |     double M = lambda*I + (1-lambda)*F;
33 |     double S = rho*alpha*M * (1 - rho*gamma*lambda*dot(phi,e));
34 |     double newD = 0;
35 |     for (int i=0; i<n; i++) {
36 |       e[i] = rho*gamma*lambda*e[i] + S*phi[i];
37 |       Delta_i = delta*e[i] + D * (e[i] - rho*alpha*M*phi[i]);
38 |       theta[i] += Delta_i;
39 |       newD += Delta_i*phiPrime[i];
40 |     }
41 |     D = newD;
42 |     F *= rho*gammaPrime;
43 |     gamma = gammaPrime;
44 |   }
45 | 
46 |   double predict(double phi[]) {
47 |     return dot(theta,phi);
48 |   }
49 | 
50 |   double dot(double v1[], double v2[]) {
51 |     // inner product of two vectors of n components
52 |     double sum = 0;
53 |     for (int i=0; i<n; i++)
54 |       sum += v1[i]*v2[i];
55 |     return sum;
56 |   }
57 | 
58 |   ~TOETD() {
59 |     delete [] theta;
60 |     delete [] e;
61 |   }
62 | 
63 | };
64 | 
65 | 


--------------------------------------------------------------------------------
/py3/dvtd.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Direct variance TD(λ), also known as TD-δ^2, an online temporal difference
  3 | algorithm for estimating the variance of the return.
  4 | 
  5 | From the paper "Directly Estimating the Variance of the λ-return Using
  6 | Temporal-Difference Methods" (available on ArXiV[0]).
  7 | 
  8 | 
  9 | Summary
 10 | -------
 11 | 
 12 | When an agent learns the value function, it is learning the expected value of
 13 | the return.
 14 | However, learning the variance of the return may also be worthwhile, but until
 15 | relatively recently there was not a lot of research in this area.
 16 | 
 17 | This algorithm estimates the variance of the λ-return online using two
 18 | TD-learners, one which learns the value function, and a second that uses the
 19 | squared TD-errors from the first to generate a new approximation target.
 20 | 
 21 | When the true value function is available, the expected discounted sum of the
 22 | squared TD-errors is equivalent to the variance of the return.
 23 | If the true value function is not available (e.g., because it is not
 24 | representable because of limitations on the function approximation being used)
 25 | it may not target the true variance, but it is often close enough to be useful.
 26 | 
 27 | 
 28 | Update Equations
 29 | ----------------
 30 | 
 31 | Here we provide the update equations in pseudo-LaTeX, which turns out to have
 32 | been a bit of a poor decision.
 33 | If this is difficult to read, either refer to the paper[0] or the code below.
 34 | 
 35 | For a feature function `x(.)` that maps states to real-valued vectors, the
 36 | estimated value of state `s` is given by `v(s)` expressed as `v(s) = w^T x(s)`.
 37 | In a similar fashion we denote the estimated variance of the λ-return by `u(s)`,
 38 | with `u(s) = ŵ^T x(s)`.
 39 | 
 40 | The update equations for the value estimator are just those of
 41 | TD(λ) with accumulating traces:
 42 | 
 43 |     δ_{t}   = R_{t+1} + γ_{t+1} w_{t}^T x_{t+1} - w_{t}^{T} x_{t}
 44 |     e_{t}   = ρ_{t} (λ_{t} γ_{t} e_{t-1} + x_{t})
 45 |     w_{t+1} = w_{t} + α δ_{t} e_{t}
 46 | 
 47 | Where:
 48 |     - δ refers to the temporal difference error
 49 |     - γ is the discount parameter
 50 |     - λ is the bootstrapping parameter
 51 |     - α is the stepsize parameter
 52 |     - w is the weight vector
 53 |     - e is the eligibility trace
 54 |     - x and r are feature vectors and rewards respectively
 55 | 
 56 | The variance estimator is also based on TD(λ), but uses the TD-errors (δ) from
 57 | the value estimator as rewards, and has a discount factor that is dependent on
 58 | the γ and λ used by the value learner.
 59 | We abuse unicode's circumflex accent in order to make the similarity with the
 60 | value update equations maximally apparent.
 61 | 
 62 |     R̂_{t+1}   = δ_{t}^{2}
 63 |     ŷ_{t+1}   = (γ_{t+1} λ_{t+1})^{2}
 64 |     ε_{t}     = R̂_{t+1} + ŷ_{t+1} ŵ_{t}^{T} x_{t+1} - ŵ_{t}^{T} x_{t}
 65 |     ŵ_{t+1}   = ŵ_{t} + α ε_{t} x_{t}
 66 | 
 67 | Where:
 68 | 
 69 |     - R̂ is the "reward"
 70 |     - ŷ is the discount factor
 71 |     - ε is the temporal difference error
 72 |     - ŵ is the weight vector
 73 |     - Other variables are the same as in the value update equations.
 74 | 
 75 | 
 76 | Notes
 77 | -----
 78 | 
 79 | Here we learn the variance of the λ-return using what is effectively TD(0).
 80 | It is simpler to present this way, but more elaborate variations are possible.
 81 | 
 82 | For example, it is possible to use different bootstrapping for each
 83 | algorithm, or even learn the variance for one λ-return using the value estimated
 84 | with a different value of λ.
 85 | This is likely irrelevant to the end-users of this code, but I would feel remiss
 86 | if I failed to make a note of it.
 87 | 
 88 | 
 89 | References
 90 | ----------
 91 | 
 92 | 0. https://arxiv.org/abs/1801.08287
 93 | """
 94 | import numpy as np
 95 | 
 96 | 
 97 | class DVTD:
 98 |     """Direct-Variance Temporal Difference Learning or DVTD(λ).
 99 | 
100 |     Attributes
101 |     ----------
102 |     n : int
103 |         The number of features (and therefore the length of the weight vector).
104 |     z_val : Vector[float]
105 |         The eligibility trace vector for the value estimator.
106 |     w_val : Vector[float]
107 |         The weight vector for the value estimator.
108 |     w_var : Vector[float]
109 |         The weight vector for the variance estimator.
110 | 
111 |     Notes
112 |     -----
113 |     This version is somewhat simplified for pedagogical reasons; see the paper
114 |     referenced in the file's documentation for the full version.
115 | 
116 |     The version implemented here uses general value functions (GVFs), meaning that
117 |     the discount factor, γ, and the bootstrapping factor, λ, may be functions
118 |     of state.
119 |     If that seems excessive for your needs, just use constant values for γ and
120 |     λ.
121 | 
122 |     """
123 |     def __init__(self, n):
124 |         """Initialize the learning algorithm.
125 | 
126 |         Parameters
127 |         -----------
128 |         n : int
129 |             The number of features, i.e. expected length of the feature vector.
130 |         """
131 |         self.n = n
132 |         self.w_val = np.zeros(self.n)
133 |         self.z_val = np.zeros(self.n)
134 |         self.w_var = np.zeros(self.n)
135 | 
136 |     def get_value(self, x):
137 |         """Get the approximate value for feature vector `x`."""
138 |         return np.dot(self.w_val, x)
139 | 
140 |     def get_variance(self, x):
141 |         """Get the approximate variance for feature vector `x`."""
142 |         return np.dot(self.w_var, x)
143 | 
144 |     def update(self, x, r, xp, alpha, gm, gm_p, lm, lm_p):
145 |         """Update from new experience, i.e. from a transition `(x,r,xp)`.
146 | 
147 |         Parameters
148 |         ----------
149 |         x : Vector[float]
150 |             The observation/features from the current timestep.
151 |         r : float
152 |             The reward from the transition.
153 |         xp : Vector[float]
154 |             The observation/features from the next timestep.
155 |         alpha : float
156 |             The step-size parameter for updating the weight vector.
157 |         gm : float
158 |             Gamma, abbreviated `gm`, the discount factor for the current state.
159 |         gm_p : float
160 |             The discount factor for the next state.
161 |         lm : float
162 |             Lambda, abbreviated `lm`, is the bootstrapping parameter for the
163 |             current timestep.
164 |         lm_p: float
165 |             Lambda prime, abbreviated `lm_p`, is the bootstrapping parameter
166 |             for the next timestep.
167 | 
168 |         Returns
169 |         -------
170 |         delta : float
171 |             The temporal difference error from the value update.
172 |         delta_var : float
173 |             The temporal difference error from the variance update.
174 | 
175 |         Notes
176 |         -----
177 |         Features (`x` and `xp`) are assumed to be 1D arrays of length `self.n`.
178 |         Other parameters are floats but are generally expected to be in the
179 |         interval [0, 1].
180 |         """
181 |         delta = r + gm_p*np.dot(self.w, xp) - np.dot(self.w, x)
182 |         self.z = x + gm*lm*self.z
183 |         self.w += alpha*delta*self.z
184 | 
185 |         r_var = delta**2
186 |         γ_var = (gm_p*lm_p)**2
187 |         delta_var = r_var + γ_var*np.dot(self.w_var, xp) - np.dot(self.w_var, x)
188 |         self.w_var += alpha*delta_var*x
189 |         return delta, delta_var
190 | 
191 |     def reset(self):
192 |         """Reset weights, traces, and other parameters."""
193 |         self.z_val = np.zeros(self.n)
194 |         self.w_val = np.zeros(self.n)
195 |         self.w_var = np.zeros(self.n)
196 | 


--------------------------------------------------------------------------------
/py3/elstd.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Emphatic least-squares temporal difference learning, also known as ELSTD(λ).
 3 | 
 4 | TODO: Test the implementation
 5 | TODO: Add documentation
 6 | TODO: Add citations
 7 | """
 8 | import numpy as np 
 9 | 
10 | 
class ELSTD:
    """Emphatic least-squares temporal difference learning.

    Attributes
    ----------
    n : int
        The number of features (and therefore the length of the weight vector).
    z : Vector[float]
        The eligibility trace vector.
    A : Matrix[float]
        A matrix with shape `(n, n)` that acts like a potential matrix.
    b : Vector[float]
        A vector of length `n` that accumulates the trace multiplied by the
        reward over a trajectory.
    F : float
        The followon trace scalar.
    M : float
        The emphasis scalar.
    """
    def __init__(self, num_features=None, epsilon=0):
        """Initialize the learning algorithm.

        Parameters
        -----------
        num_features : int
            The number of features. Must be provided; the `None` default
            exists only to keep the signature unchanged.
        epsilon : float
            To avoid having the `A` matrix be singular, it is sometimes helpful
            to initialize it with the identity matrix multiplied by `epsilon`.

        Raises
        ------
        ValueError
            If `num_features` is not given.
        """
        if num_features is None:
            raise ValueError("`num_features` must be specified")
        self.n = num_features
        self.reset(epsilon)

    def reset(self, epsilon=0):
        """Reset the trace, the accumulated `A` and `b`, and the emphasis
        scalars; `A` restarts as `epsilon` times the identity matrix."""
        self.z = np.zeros(self.n)
        self.A = np.eye(self.n) * epsilon
        self.b = np.zeros(self.n)
        self.F = 0
        self.M = 0

    @property
    def theta(self):
        """Compute the weight vector as `pinv(A) b`.

        The pseudoinverse is used rather than the inverse, so this also
        returns a result when `A` is singular.
        """
        _theta = np.dot(np.linalg.pinv(self.A), self.b)
        return _theta

    def update(self, x, reward, xp, gm, gm_p, lm, interest):
        """Update from new experience, i.e. from a transition `(x,reward,xp)`.

        Parameters
        ----------
        x : array_like
            The observation/features from the current timestep.
        reward : float
            The reward from the transition.
        xp : array_like
            The observation/features from the next timestep.
        gm : float
            Gamma, abbreviated `gm`, the discount factor for the current state.
        gm_p : float
            The discount factor for the next state.
        lm : float
            Lambda, abbreviated `lm`, is the bootstrapping parameter for the
            current timestep.
        interest : float
            The 'interest' in the current state, from which emphasis is derived.
        """
        self.F = gm * self.F + interest
        self.M = (lm * interest) + ((1 - lm) * self.F)
        self.z = (gm * lm * self.z + self.M * x)
        self.A += np.outer(self.z, (x - gm_p*xp))
        self.b += self.z * reward


--------------------------------------------------------------------------------
/py3/etd.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Emphatic Temporal Difference Learning Algorithm (ETD), implemented in Python 3.
 3 | """
 4 | import numpy as np 
 5 | 
 6 | 
 7 | class ETD:
 8 |     """Emphatic Temporal Difference Learning, or ETD(λ).
 9 | 
10 |     Attributes
11 |     ----------
12 |     n : int
13 |         The number of features (and therefore the length of the weight vector).
14 |     z : Vector[float]
15 |         The eligibility trace vector.
16 |     w : Vector[float]
17 |         The weight vector.
18 |     F : float
19 |         The followon trace scalar.
20 |     M : float
21 |         The emphasis scalar.
22 |     """
23 |     def __init__(self, n):
24 |         """Initialize the learning algorithm.
25 | 
26 |         Parameters
27 |         -----------
28 |         n : int
29 |             The number of features
30 |         """
31 |         self.n = n
32 |         self.w = np.zeros(self.n)
33 |         self.z = np.zeros(self.n)
34 |         self.F = 0
35 |         self.M = 0
36 | 
37 |     def get_value(self, x):
38 |         """Get the approximate value for feature vector `x`."""
39 |         return np.dot(self.w, x)
40 | 
41 |     def update(self, x, r, xp, alpha, gm, gm_p, lm, rho, interest):
42 |         """Update from new experience, i.e. from a transition `(x,r,xp)`.
43 | 
44 | 
45 |         Parameters
46 |         ----------
47 |         x : array_like
48 |             The observation/features from the current timestep.
49 |         r : float
50 |             The reward from the transition.
51 |         xp : array_like
52 |             The observation/features from the next timestep.
53 |         alpha : float
54 |             The stepsize parameter for the update.
55 |         gm : float 
56 |             Gamma, abbreviated `gm`, the discount factor for the current state.
57 |         gm_p : float 
58 |             The discount factor for the next state.
59 |         lm : float 
60 |             Lambda, abbreviated `lm`, is the bootstrapping parameter for the 
61 |             current timestep.
62 |         rho : float 
63 |             The importance sampling ratio between the target policy and the 
64 |             behavior policy for the current timestep.
65 |         interest : float 
66 |             The interest for the current timestep.
67 | 
68 |         Returns
69 |         -------
70 |         delta : float
71 |             The temporal difference error from the update.
72 | 
73 |         Notes
74 |         -----
75 |         Features (`x` and `xp`) are assumed to be 1D arrays of length `self.n`.
76 |         Other parameters are floats but are generally expected to be in the 
77 |         interval [0, 1].
78 |         """
79 |         delta = r + gm_p*np.dot(self.w, xp) - np.dot(self.w, x)
80 |         self.F = gm*self.F + interest
81 |         self.M = lm*interest + (1 - lm)*self.F
82 |         self.z = rho*(x*self.M + gm*lm*self.z)
83 |         self.w += alpha*delta*self.z
84 | 
85 |         # prepare for next iteration
86 |         self.F *= rho
87 |         return delta
88 | 
89 |     def reset(self):
90 |         """Reset weights, traces, and other parameters."""
91 |         self.F = 0
92 |         self.M = 0
93 |         self.w = np.zeros(self.n)
94 |         self.z = np.zeros(self.n)


--------------------------------------------------------------------------------
/py3/gtd.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Gradient-TD(λ) Learning Algorithm, via Adam White's doctoral thesis, pg. 47.,
  3 | and Maei's doctoral thesis pg. 74 and 91-92 for the original derivation and
  4 | analysis.
  5 | Note that the algorithm is referred to as TDC(λ) or GTD(λ) in that work,
  6 | whereas GTD and GTD2 refer to variations on the same idea but without
  7 | eligibility traces.
  8 | 
  9 | The advantage of GTD(λ) is its stability in the off-policy setting, at the
 10 | expense of worse sample efficiency and therefore slower learning.
 11 | The step-sizes have to be chosen carefully, however, since there is some risk of
 12 | divergence.
 13 | Rules of thumb might be setting `beta` to `alpha/100` or similar; `alpha` might
 14 | also have to be set to a smaller value than you would use with TD(λ).
 15 | 
 16 | In Latex, the update equations look like:
 17 | 
 18 | δ_{t}   = R_{t+1} + γ_{t+1} w_{t}^T x_{t+1} - w_{t}^{T} x_{t}
 19 | e_{t}   = ρ_{t} (λ_{t} γ_{t} e_{t-1} + x_{t})
 20 | w_{t+1} = w_{t} + α[ δ_{t} e_{t} - γ_{t+1} (1 - λ_{t+1}) ( e_{t}^{T} h_{t} ) x_{t+1} ]
 21 | h_{t+1} = h_{t} + β[ δ_{t} e_{t} - ( h_{t}^{T} x_{t} ) x_{t} ]
 22 | 
 23 | Where:
 24 |     - δ refers to the temporal difference error
 25 |     - γ is the discount parameter
 26 |     - λ is the bootstrapping parameter
 27 |     - α and β are step-size parameters
 28 |     - w and h are weight vectors
 29 |     - e is the eligibility trace
 30 |     - x and r are feature vectors and rewards respectively
 31 | """
 32 | import numpy as np
 33 | 
 34 | 
 35 | class GTD:
 36 |     """Gradient Temporal Difference Learning, or GTD(λ). Suitable for
 37 |     off-policy learning, but with typically lower sample efficiency than TD(λ).
 38 | 
 39 |     Attributes
 40 |     ----------
 41 |     n : int
 42 |         The number of features (and therefore the length of the weight vector).
 43 |     e : Vector[float]
 44 |         The eligibility trace vector.
 45 |     w : Vector[float]
 46 |         The weight vector.
 47 |     h : Vector[float]
 48 |         The gradient adjustment weight vector.
 49 | 
 50 |     Notes
 51 |     -----
 52 |     See page 74 and 91-92 of Maei's thesis for definition of the algorithm.
 53 |     """
 54 |     def __init__(self, n):
 55 |         """Initialize the learning algorithm.
 56 | 
 57 |         Parameters
 58 |         -----------
 59 |         n : int
 60 |             The number of features, i.e. expected length of the feature vector.
 61 |         """
 62 |         self.n = n
 63 |         self.e = np.zeros(self.n)
 64 |         self.w = np.zeros(self.n)
 65 |         self.h = np.zeros(self.n)
 66 | 
 67 |     def get_value(self, x):
 68 |         """Get the approximate value for feature vector `x`."""
 69 |         return np.dot(self.w, x)
 70 | 
 71 |     def update(self, x, r, xp, alpha, beta, gm, gm_p, lm, lm_p, rho):
 72 |         """Update from new experience, i.e. from a transition `(x,r,xp)`.
 73 | 
 74 | 
 75 |         Parameters
 76 |         ----------
 77 |         x : array_like
 78 |             The observation/features from the current timestep.
 79 |         r : float
 80 |             The reward from the transition.
 81 |         xp : array_like
 82 |             The observation/features from the next timestep.
 83 |         alpha : float
 84 |             The step-size parameter for updating the weight vector.
 85 |         beta : float
 86 |             The step-size parameter for updating the correction weights.
 87 |         gm : float
 88 |             Gamma, abbreviated `gm`, the discount factor for the current state.
 89 |         gm_p : float
 90 |             The discount factor for the next state.
 91 |         lm : float
 92 |             Lambda, abbreviated `lm`, is the bootstrapping parameter for the
 93 |             current timestep.
 94 |         lm_p: float
 95 |             The bootstrapping parameter for the next timestep.
 96 |         rho : float
 97 |             The importance sampling ratio between the target policy and the
 98 |             behavior policy for the current timestep.
 99 | 
100 |         Returns
101 |         -------
102 |         delta : float
103 |             The temporal difference error from the update.
104 | 
105 |         Notes
106 |         -----
107 |         Features (`x` and `xp`) are assumed to be 1D arrays of length `self.n`.
108 |         Other parameters are floats but are generally expected to be in the
109 |         interval [0, 1].
110 |         """
111 |         delta = r + gm_p*np.dot(self.w, xp) - np.dot(self.w, x)
112 |         self.e = rho*(lm*gm*self.e + x)
113 |         self.w += alpha*(delta*self.e - gm_p*(1-lm_p)*np.dot(self.e, self.h)*xp)
114 |         self.h += beta*(delta*self.e - np.dot(self.h, x)*x)
115 |         return delta
116 | 
117 |     def reset(self):
118 |         """Reset weights, traces, and other parameters."""
119 |         self.e = np.zeros(self.n)
120 |         self.w = np.zeros(self.n)
121 |         self.h = np.zeros(self.n)
122 | 


--------------------------------------------------------------------------------
/py3/htd.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Hybrid-TD Learning Algorithm, via Adam White's doctoral thesis, pg. 173.
  3 | 
  4 | Like GTD(λ), it doesn't diverge in the off-policy case, while acting like TD(λ)
  5 | in the on-policy case, particularly with regards to good sample efficiency.
  6 | 
  7 | In Latex, the update equations look like:
  8 | 
  9 | δ_{t}   = R_{t+1} + γ_{t+1} w_{t}^T x_{t+1} - w_{t}^{T} x_{t} 
 10 | e_{t}   = ρ_{t} (λ_{t} γ_{t} e_{t-1} + x_{t})
 11 | z_{t}   = λ_{t} γ_{t} z_{t-1} + x_{t}
 12 | w_{t+1} = w_{t} + α[ δ_{t} e_{t} + (γ_{t+1} x_{t+1} - x_{t} ) (z_{t} - e_{t} )^{T} h_{t} ]
 13 | h_{t+1} = h_{t} + β[ δ_{t} e_{t} + (γ_{t+1} x_{t+1} - x_{t} ) z_{t}^{T} h_{t}]
 14 | 
 15 | Where:
 16 |     - δ refers to the temporal difference error; 
 17 |     - γ is the discount parameter,
 18 |     - λ is the bootstrapping parameter
 19 |     - α and β are stepsize parameters, 
 20 |     - w and h are weight vectors
 21 |     - e and z are eligibility traces
 22 |     - x and r are feature vectors and rewards respectively.
 23 | """
 24 | import numpy as np 
 25 | 
 26 | 
 27 | class HTD:
 28 |     """Hybrid Temporal Difference Learning, or HTD(λ).
 29 |     Acts like TD(λ) in the on-policy case, but with GTD(λ)'s stability when
 30 |     updating off-policy.
 31 | 
 32 |     Attributes
 33 |     ----------
 34 |     n : int
 35 |         The number of features (and therefore the length of the weight vector).
 36 |     e : Vector[float]
 37 |         The importance sampling eligibility trace vector.
 38 |     z : Vector[float]
 39 |         The on-policy eligibility trace vector.
 40 |     w : Vector[float]
 41 |         The weight vector.
 42 |     h : Vector[float]
 43 |         The gradient adjustment weight vector.
 44 | 
 45 |     Notes
 46 |     -----
 47 |     See Adam White's PhD thesis, pg. 170-174 for a definition and discussion.
 48 |     """
 49 |     def __init__(self, n):
 50 |         """Initialize the learning algorithm.
 51 | 
 52 |         Parameters
 53 |         -----------
 54 |         n : int
 55 |             The number of features, i.e. expected length of the feature vector.
 56 |         """
 57 |         self.n = n
 58 |         self.e = np.zeros(self.n)
 59 |         self.z = np.zeros(self.n)
 60 |         self.w = np.zeros(self.n)
 61 |         self.h = np.zeros(self.n)
 62 | 
 63 |     def get_value(self, x):
 64 |         """Get the approximate value for feature vector `x`."""
 65 |         return np.dot(self.w, x)
 66 | 
 67 |     def update(self, x, r, xp, alpha, beta, gm, gm_p, lm, rho:
 68 |         """Update from new experience, i.e. from a transition `(x,r,xp)`.
 69 | 
 70 |         
 71 |         Parameters
 72 |         ----------
 73 |         x : array_like
 74 |             The observation/features from the current timestep.
 75 |         r : float
 76 |             The reward from the transition.
 77 |         xp : array_like
 78 |             The observation/features from the next timestep.
 79 |         alpha : float
 80 |             The stepsize parameter for updating the weight vector.
 81 |         beta : float 
 82 |             The stepsize parameter for updating the correction weights.
 83 |         gm : float 
 84 |             Gamma, abbreviated `gm`, the discount factor for the current state.
 85 |         gm_p : float 
 86 |             The discount factor for the next state.
 87 |         lm : float 
 88 |             Lambda, abbreviated `lm`, is the bootstrapping parameter for the 
 89 |             current timestep.
 90 |         rho : float 
 91 |             The importance sampling ratio between the target policy and the 
 92 |             behavior policy for the current timestep.
 93 | 
 94 |         Returns
 95 |         -------
 96 |         delta : float
 97 |             The temporal difference error from the update.
 98 |         
 99 |         Notes
100 |         -----
101 |         Features (`x` and `xp`) are assumed to be 1D arrays of length `self.n`.
102 |         Other parameters are floats but are generally expected to be in the 
103 |         interval [0, 1].
104 |         """
105 |         delta = r + gm_p*np.dot(self.theta, xp) - np.dot(self.theta, x)
106 |         self.e = rho*(lm*gm*self.e + x)
107 |         self.z = lm*gm*self.z + x 
108 |         self.w += alpha*(delta*self.e + (gm_p*xp - x)*np.dot(self.z - self.e), self.h)
109 |         self.h += beta*(delta*self.e + (gm_p*xp - x)*np.dot(self.z, self.h)
110 | 
111 |         return delta
112 | 
113 |     def reset(self):
114 |         """Reset weights, traces, and other parameters."""
115 |         self.e = np.zeros(self.n)
116 |         self.z = np.zeros(self.n)
117 |         self.w = np.zeros(self.n)
118 |         self.h = np.zeros(self.n)
119 | 
120 | 


--------------------------------------------------------------------------------
/py3/idbd.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Incremental Delta-Bar-Delta (IDBD), a stepsize adjustment algorithm.
 3 | Not strictly a reinforcement learning algorithm, since it assumes more of an 
 4 | online learning setting (in contrast to TD(λ), which attempts to solve the 
 5 | Bellman equation, which depends on the current state as well as the next state
 6 | in sequence: δ(t) = r(t+1) + γv(t+1) - v(t), vs. δ(t) = y(t) - x(t) * w(t)).
 7 | 
 8 | Taken from:
 9 | "Adapting Bias by Gradient Descent: An Incremental Version of Delta-Bar-Delta", 
10 | Richard Sutton
11 | Proceedings of Tenth National Conf. on Artificial Intelligence, pp. 171–176, 
12 | MIT Press, 1992.
13 | """
14 | 
import numpy as np


class IDBD:
    """
    Incremental Delta-Bar-Delta, or IDBD.

    Attributes
    ----------
    n : int
        The number of features (the length of every per-weight vector below).
    alpha : Vector[float]
        The vector of per-weight stepsizes.
    beta : Vector[float]
        The vector of logarithmic per-weight stepsizes.
    w : Vector[float]
        Weight vector.
    h : Vector[float]
        Update memory trace.
    eta : float
        Meta stepsize parameter.
    """
    def __init__(self, n, eta=1):
        """Initialize the learner.

        Parameters
        ----------
        n : int
            The number of features, i.e. expected length of the feature vector.
        eta : float
            Meta stepsize used when adapting `beta`.
        """
        self.n = n
        self.eta = eta
        self.reset()

    def reset(self):
        """Reset the stepsizes, memory trace and weights to initial values."""
        # What should beta be initialized to? Should `w` be zeros or random?
        self.beta = np.full(self.n, -1/self.n)
        # Keep `alpha` consistent with `beta` so it exists before any update.
        self.alpha = np.exp(self.beta)
        self.h = np.zeros(self.n)
        self.w = np.zeros(self.n)

    def update(self, x, delta):
        """Adapt the per-weight stepsizes and weights from one example.

        Parameters
        ----------
        x : array_like
            The feature vector for the current example (length `self.n`).
        delta : float
            The prediction error for the current example, supplied by the
            caller.
        """
        x = np.asarray(x, dtype=float)
        self.beta += self.eta * self.h * delta * x
        self.alpha = np.exp(self.beta)
        step = self.alpha * delta * x
        self.w += step
        # Element-wise positive part: the decay factor is clipped at zero.
        decay = np.maximum(0, 1 - self.alpha * x**2)
        self.h = self.h * decay + step


--------------------------------------------------------------------------------
/py3/lstd.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Least-squares temporal difference learning, also known as LSTD(λ).
 3 | 
 4 | TODO: Test the implementation
 5 | TODO: Add documentation
 6 | TODO: Add citations
 7 | """
 8 | import numpy as np 
 9 | 
10 | 
class LSTD:
    """Least-squares temporal difference learning.

    Attributes
    ----------
    n : int
        The number of features (and therefore the length of the weight vector).
    epsilon : float
        The regularization scale given to the constructor; `reset()` reuses it
        when called without an argument.
    z : Vector[float]
        The eligibility trace vector.
    A : Matrix[float]
        A matrix with shape `(n, n)` that acts like a potential matrix.
    b : Vector[float]
        A vector of length `n` that accumulates the trace multiplied by the
        reward over a trajectory.
    """
    def __init__(self, n, epsilon=0):
        """Initialize the learning algorithm.

        Parameters
        -----------
        n : int
            The number of features
        epsilon : float
            To avoid having the `A` matrix be singular, it is sometimes helpful
            to initialize it with the identity matrix multiplied by `epsilon`.
        """
        self.n = n
        self.epsilon = epsilon
        self.reset()

    def reset(self, epsilon=None):
        """Reset weights, traces, and other parameters.

        Parameters
        ----------
        epsilon : float, optional
            Scale of the identity matrix used to initialize `A`. When omitted,
            the value given to the constructor is reused, so resetting does not
            silently drop the regularization. An explicit value applies to this
            reset only.
        """
        if epsilon is None:
            # Fall back to 0 if the instance was built without `__init__`.
            epsilon = getattr(self, "epsilon", 0)
        self.z = np.zeros(self.n)
        self.A = np.eye(self.n) * epsilon
        self.b = np.zeros(self.n)

    @property
    def theta(self):
        """Compute the weight vector via the pseudo-inverse of `A` times `b`."""
        _theta = np.dot(np.linalg.pinv(self.A), self.b)
        return _theta

    def update(self, x, reward, xp, gm, gm_p, lm):
        """Update from new experience, i.e. from a transition `(x,reward,xp)`.

        Parameters
        ----------
        x : array_like
            The observation/features from the current timestep.
        reward : float
            The reward from the transition.
        xp : array_like
            The observation/features from the next timestep.
        gm : float
            Gamma, abbreviated `gm`, the discount factor for the current state.
        gm_p : float
            The discount factor for the next state.
        lm : float
            Lambda, abbreviated `lm`, is the bootstrapping parameter for the
            current timestep.
        """
        # Decay the trace, then add the current features.
        self.z = (gm * lm * self.z + x)
        # Accumulate the outer product of the trace with the discounted
        # feature difference, and the reward-weighted trace.
        self.A += np.outer(self.z, (x - gm_p*xp))
        self.b += self.z * reward


--------------------------------------------------------------------------------
/py3/td.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Temporal difference learning, AKA TD(λ), an on-policy linear-time online
  3 | learning algorithm
  4 | 
  5 | 
  6 | Summary
  7 | -------
  8 | 
  9 | This is one of the foundational algorithms of reinforcement learning.
 10 | See the book "Reinforcement Learning: An Introduction" by Sutton and Barto for
 11 | a full introduction, in particular Chapter 7.
 12 | 
 13 | The algorithm is given in pseudocode on Rich Sutton's website[0].
 14 | 
 15 | It is known to converge in the on-policy setting under mild technical conditions,
 16 | although the fixed-point it converges to changes depending on the bootstrapping
 17 | parameter, λ.
 18 | For λ=0 we bootstrap the value of each state from the reward and the value of its
 19 | successor; this tends to converge quickly but its solution may be different from
 20 | the true value function (and its least-squares approximation).
 21 | With λ=1 we get effectively an online, every-visit Monte-Carlo method for
 22 | estimating state value which may be more accurate, but tends to have a higher
 23 | variance.
 24 | 
 25 | 
 26 | Update Equations
 27 | ----------------
 28 | 
 29 | In pseudo-LaTeX, the update equations look like:
 30 | 
 31 |     δ_{t}   = R_{t+1} + γ_{t+1} w_{t}^T x_{t+1} - w_{t}^{T} x_{t}
 32 |     e_{t}   = λ_{t} γ_{t} e_{t-1} + x_{t}
 33 |     w_{t+1} = w_{t} + α δ_{t} e_{t}
 34 | 
 35 | Where:
 36 |     - δ refers to the temporal difference error
 37 |     - γ is the discount parameter
 38 |     - λ is the bootstrapping parameter
 39 |     - α is the stepsize parameter
 40 |     - w is the weight vector
 41 |     - e is the eligibility trace
 42 |     - x and r are feature vectors and rewards respectively
 43 | 
 44 | This version of TD(λ) is an on-policy algorithm, so it doesn't respond
 45 | well to updates from trajectories generated via policies other than the one
 46 | it is currently evaluating.
 47 | There are a slew of modifications that can allow for off-policy evaluation,
 48 | for example: GTD(λ), ETD(λ), and other importance sampling methods.
 49 | Here, we employ accumulating traces (vs. replacing traces or dutch traces),
 50 | although modifying the code for different traces is straightforward.
 51 | 
 52 | 
 53 | References
 54 | ----------
 55 | 
 56 | 0: https://webdocs.cs.ualberta.ca/~sutton/book/ebook/node75.html
 57 | """
 58 | import numpy as np
 59 | 
 60 | 
 61 | class TD:
 62 |     """Temporal Difference Learning or TD(λ) with accumulating traces.
 63 | 
 64 |     The version implemented here uses general value functions (GVFs), meaning that
 65 |     the discount factor, γ, and the bootstrapping factor, λ, may be functions
 66 |     of state.
 67 |     If that doesn't seem germane to your problem, just use a constant value for them.
 68 | 
 69 |     Attributes
 70 |     ----------
 71 |     n : int
 72 |         The number of features (and therefore the length of the weight vector).
 73 |     z : Vector[float]
 74 |         The eligibility trace vector.
 75 |     w : Vector[float]
 76 |         The weight vector.
 77 |     """
 78 |     def __init__(self, n):
 79 |         """Initialize the learning algorithm.
 80 | 
 81 |         Parameters
 82 |         -----------
 83 |         n : int
 84 |             The number of features, i.e. expected length of the feature vector.
 85 |         """
 86 |         self.n = n
 87 |         self.w = np.zeros(self.n)
 88 |         self.z = np.zeros(self.n)
 89 | 
 90 |     def get_value(self, x):
 91 |         """Get the approximate value for feature vector `x`."""
 92 |         return np.dot(self.w, x)
 93 | 
 94 |     def update(self, x, r, xp, alpha, gm, gm_p, lm):
 95 |         """Update from new experience, i.e. from a transition `(x,r,xp)`.
 96 | 
 97 |         Parameters
 98 |         ----------
 99 |         x : Vector[float]
100 |             The observation/features from the current timestep.
101 |         r : float
102 |             The reward from the transition.
103 |         xp : Vector[float]
104 |             The observation/features from the next timestep.
105 |         alpha : float
106 |             The step-size parameter for updating the weight vector.
107 |         gm : float
108 |             Gamma, abbreviated `gm`, the discount factor for the current state.
109 |         gm_p : float
110 |             The discount factor for the next state.
111 |         lm : float
112 |             Lambda, abbreviated `lm`, is the bootstrapping parameter for the
113 |             current timestep.
114 | 
115 |         Returns
116 |         -------
117 |         delta : float
118 |             The temporal difference error from the update.
119 | 
120 |         Notes
121 |         -----
122 |         Features (`x` and `xp`) are assumed to be 1D arrays of length `self.n`.
123 |         Other parameters are floats but are generally expected to be in the
124 |         interval [0, 1].
125 |         """
126 |         delta = r + gm_p*np.dot(self.w, xp) - np.dot(self.w, x)
127 |         self.z = x + gm*lm*self.z
128 |         self.w += alpha*delta*self.z
129 |         return delta
130 | 
131 |     def reset(self):
132 |         """Reset weights, traces, and other parameters."""
133 |         self.w = np.zeros(self.n)
134 |         self.z = np.zeros(self.n)
135 | 


--------------------------------------------------------------------------------
/py3/totd.py:
--------------------------------------------------------------------------------
 1 | """
 2 | True-online TD(λ), sometimes known as temporal difference learning with 'Dutch traces'.
 3 | 
 4 | ---
 5 | 
 6 | 0 : [van Seijen, Harm, and Richard S. Sutton. "True Online TD (lambda)." 
 7 | ICML. Vol. 14. 2014.](http://www.jmlr.org/proceedings/papers/v32/seijen14.pdf)
 8 | 
 9 | TODO: Test the implementation
10 | TODO: Add documentation
11 | """
12 | 
13 | 
import numpy as np


class TOTD:
    """True-online temporal difference learning with linear function approximation.

    Attributes
    ----------
    n : int
        The number of features, i.e. expected length of the feature vector.
    w : Vector[float]
        The current weight vector.
    w_old : Vector[float]
        The previous time-step's weight vector.
    z : Vector[float]
        The array of the eligibility traces (the stepsize is folded into it).

    TODO: Test this code against a reference implementation
    TODO: Consider modifying the update function to remove alpha from the trace.
    """
    def __init__(self, n):
        """Initialize the learning algorithm.

        Parameters
        -----------
        n : int
            The number of features, i.e. expected length of the feature vector.
        """
        self.n      = n
        self.w      = np.zeros(self.n)
        self.w_old  = np.zeros(self.n)
        self.z      = np.zeros(self.n)

    def get_value(self, x):
        """Get the approximate value for feature vector `x`."""
        return np.dot(self.w, x)

    def update(self, x, r, xp, alpha, gm, gm_p, lm):
        """Update from new experience, i.e. from a transition `(x,r,xp)`.

        Parameters
        ----------
        x : Vector
            The observation/features from the current timestep.
        r : float
            The reward from the transition.
        xp : Vector
            The observation/features from the next timestep.
        alpha : float
            The step-size parameter for updating the weight vector.
        gm : float
            Gamma, abbreviated `gm`, the discount factor for the current state.
        gm_p : float
            The discount factor for the next state.
        lm : float
            Lambda, abbreviated `lm`, is the bootstrapping parameter for the
            current timestep.

        Returns
        -------
        delta : float
            The conventional temporal difference error,
            `r + gm_p*w.xp - w.x`, computed with the weights from before this
            update.

        Notes
        -----
        Features (`x` and `xp`) are assumed to be 1D arrays of length `self.n`.
        Other parameters are floats but are generally expected to be in the
        interval [0, 1].
        """
        x = np.asarray(x, dtype=float)
        xp = np.asarray(xp, dtype=float)

        # Value of the current state under the current and previous weights.
        v = np.dot(self.w, x)
        v_old = np.dot(self.w_old, x)
        delta = r + gm_p*np.dot(self.w, xp) - v

        # Dutch-style trace; the dot product uses the trace from before this
        # step because the right-hand side is evaluated before assignment.
        decay = gm*lm
        self.z = decay*self.z + alpha*x - alpha*decay*np.dot(self.z, x)*x

        # `self.w` is modified in place below, so snapshot it first; the
        # snapshot becomes `w_old` for the next call.
        w_prev = self.w.copy()
        # The trace is driven by `delta + v - v_old`, i.e. the TD error taken
        # against the previous weights' estimate of the current state, and the
        # last term corrects for the change in that estimate.
        self.w += (delta + v - v_old)*self.z - alpha*(v - v_old)*x
        self.w_old = w_prev
        return delta

    def reset(self):
        """Reset weights, traces, and other parameters."""
        self.w      = np.zeros(self.n)
        self.w_old  = np.zeros(self.n)
        self.z      = np.zeros(self.n)


--------------------------------------------------------------------------------