├── LICENSE ├── Method.py ├── README.md ├── features ├── FourierBasisFeatures.py └── RandomFourierFeatures.py ├── gymEnvs ├── CartpoleSwingup.py ├── DoublePendulum.py ├── Hopper.py ├── LunarLander.py ├── MountainCar.py ├── Reacher.py ├── SinglePendulum.py ├── SparseCartpoleSwingup.py ├── SparseCartpoleSwingupDisc.py ├── SparseDoublePendulum.py ├── SparseDoublePendulumDisc.py ├── SparseHopper.py ├── SparseHopperDisc.py ├── SparseLunarLander.py ├── SparseLunarLanderDisc.py ├── SparseMountainCar.py ├── SparseMountainCarDisc.py ├── SparseReacher.py ├── SparseReacherDisc.py ├── SparseSinglePendulum.py ├── SparseSinglePendulumDisc.py └── __init__.py ├── main.py └── rlutils ├── ABLR.py ├── AgentHelper.py ├── Agents.py ├── EnvGlue.py ├── Envs.py ├── LR_SGD.py ├── Normaliser.py ├── Policies.py ├── Runners.py ├── TransitionMemory.py ├── __init__.py └── utils.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Method.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tqdm import tqdm 3 | 4 | from rlutils.ABLR import ABLR, fixedMeanABLR 5 | from rlutils.Policies import eGreedyPolicy 6 | from rlutils.Agents import AbsTDAgent 7 | 8 | 9 | class Method(AbsTDAgent): 10 | def __init__(self, agtHlp, alphaQ=0.1, betaQ=0.1, alphaU=0.1, betaU=0.1, 11 | kappa=1.0, gamma=0.99, maxVIiter=30, tm=None): 12 | self.agtHlp = agtHlp 13 | self.modelQ = ABLR(agtHlp.nFeatures, alphaQ, betaQ, computeSninv=False) 14 | sigma = agtHlp.ftmap.sigma 15 | self.modelU = fixedMeanABLR(agtHlp.nFeatures, alphaU, betaU, 0.0, 16 | sigma[0]) 17 | self.kappa = kappa 18 | self.gamma = gamma 19 | self.tm = tm 20 | self.maxVIiter = maxVIiter 21 | self.epsLSTD = 0.1 # Stop LSTD loop if model change under this value 22 | self.greedyPolicy = eGreedyPolicy(agtHlp, self.score, 0.0) 23 | self.nSampsACont = 10 24 | self.varMax = 1.0 / alphaU 25 | self.kappa /= self.varMax 26 | print("Varmax={}, Kappa={}".format(self.varMax, self.kappa)) 27 | 28 | def update(self, s, a, r, sp): 29 | """ 30 | Adds one data point at a time. 
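For orientation, the per-transition update above boils down to two coupled regression targets and one optimistic acquisition. The float-level schematic below mirrors `update()` and `score()` with placeholder numbers standing in for the two ABLR models' predictions; it is an illustration, not code from the repository.

```python
# Placeholder predictions for the next state-action pair (s', a'):
gamma = 0.99
alpha_u = 0.1
var_max = 1.0 / alpha_u               # maximum predictive variance, as set in __init__
kappa = 1.0 / var_max                 # __init__ rescales kappa by var_max

q_next_mean = 0.3                     # modelQ.predictMean(phi(s', a'))
u_next_mean = -2.0                    # modelU.predictMean(phi(s', a'))
mean_var_q_next = 4.0                 # mean over actions of modelQ.predictVar at s'

r = 0.0                               # goal-only reward: zero almost everywhere
q_target = r + gamma * q_next_mean    # target fed to modelQ.update(phi(s, a), .)
u_reward = mean_var_q_next - var_max  # "exploration reward", negative once Q is certain
u_target = u_reward + gamma * u_next_mean  # target fed to modelU.update(phi(s, a), .)

# Greedy action selection maximises the optimistic score computed by score():
score = q_next_mean + kappa * u_next_mean
print(q_target, u_target, score)
```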
31 | """ 32 | ap = self.greedyPolicy.pick(sp) 33 | ftsa = self.agtHlp.toStateActionPair(s, a) 34 | ftsap = self.agtHlp.toStateActionPair(sp, ap) 35 | 36 | # Update Q 37 | qTarget = r + self.gamma * self.modelQ.predictMean(ftsap) 38 | self.modelQ.update(ftsa, qTarget) 39 | 40 | # Update U 41 | self._updateU(ftsa, ftsap, s, a, sp) 42 | 43 | def _updateU(self, ftsa, ftsap, s, a, sp): 44 | if self.agtHlp.isDiscA(): 45 | allAs = self.agtHlp.allDiscA().reshape(-1, 1) 46 | else: 47 | allAs = self.agtHlp.sampleContA(self.nSampsACont) 48 | repSp = np.repeat(sp, len(allAs), axis=0) 49 | sampsFtsap = self.agtHlp.toStateActionPair(repSp, allAs) 50 | varQsp = np.mean(self.modelQ.predictVar(sampsFtsap)) 51 | 52 | Usap = self.modelU.predictMean(ftsap) 53 | 54 | usp = varQsp - self.varMax 55 | self.modelU.update(ftsa, usp + self.gamma * Usap) 56 | 57 | def score(self, s, a): 58 | sa = self.agtHlp.toStateActionPair(s, a) 59 | m = self.modelQ.predictMean(sa) 60 | u = self.modelU.predictMean(sa) 61 | return m + self.kappa * u 62 | 63 | def endOfEpUpdate(self): 64 | if not self.tm: 65 | return 66 | print("End of episode update") 67 | # Retrieve all data 68 | data_s, data_a, data_r, data_sp = self.tm.getTransitions() 69 | 70 | # update for Q 71 | ftsa = self.agtHlp.toStateActionPair(data_s, data_a) 72 | # Init 73 | it = 0 74 | w = self.modelQ.mn 75 | prevW = w + 2 * self.epsLSTD 76 | pbar = tqdm(total=self.maxVIiter) 77 | while np.linalg.norm(prevW - w, 2) > self.epsLSTD and \ 78 | it < self.maxVIiter: 79 | it += 1 80 | prevW = w 81 | 82 | data_ap = self.greedyPolicy.pick(data_sp) 83 | ftsap = self.agtHlp.toStateActionPair(data_sp, data_ap) 84 | qTargets = data_r + self.gamma * self.modelQ.predictMean(ftsap) 85 | self.modelQ.updateTargets(ftsa, qTargets) 86 | self.modelQ._recompute() 87 | 88 | w = self.modelQ.mn 89 | pbar.update(1) 90 | pbar.update(self.maxVIiter-it) 91 | pbar.close() 92 | print("") 93 | print("\tVI iterations for Q:{}".format(it)) 94 | 95 | if self.kappa == 0: 96 | return 97 | # update for U 98 | data_ap = self.greedyPolicy.pick(data_sp) 99 | ftsap = self.agtHlp.toStateActionPair(data_sp, data_ap) 100 | lenS = data_s.shape[0] 101 | if self.agtHlp.isDiscA(): 102 | allA = self.agtHlp.allDiscA().reshape(-1, 1) 103 | else: 104 | allA = self.agtHlp.sampleContA(self.nSampsACont) 105 | repAllA = np.vstack([allA] * lenS) 106 | repSp = np.repeat(data_sp, len(allA), 0) 107 | sampsFtsap = self.agtHlp.toStateActionPair(repSp, repAllA) 108 | allVarQsp = self.modelQ.predictVar(sampsFtsap).reshape(-1, lenS) 109 | varQ_sp = np.mean(allVarQsp, 0).reshape(-1, 1) 110 | # Init 111 | it = 0 112 | w = self.modelU.mn 113 | prevW = w + 2 * self.epsLSTD 114 | pbar = tqdm(total=self.maxVIiter) 115 | while np.linalg.norm(prevW - w, 2) > self.epsLSTD and \ 116 | it < self.maxVIiter: 117 | it += 1 118 | prevW = w 119 | 120 | self._endEpUpdateU(ftsa, ftsap, data_s, data_a, data_sp, varQ_sp) 121 | self.modelU._recompute() 122 | 123 | w = self.modelU.mn 124 | pbar.update(1) 125 | pbar.update(self.maxVIiter-it) 126 | pbar.close() 127 | print("") 128 | print("\tVI iterations for U:{}".format(it)) 129 | 130 | def _endEpUpdateU(self, ftsa, ftsap, data_s, data_a, data_sp, varQ_sp): 131 | rExpl = varQ_sp - self.varMax 132 | Usap = self.modelU.predictMean(ftsap) 133 | self.modelU.updateTargets(ftsa, rExpl + self.gamma * Usap) 134 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EMU-Q 2 | Exploring by 
Minimizing Uncertainty of Q values (EMU-Q) as presented in "Bayesian RL for Goal-Only Rewards" at CoRL'18 by *P. Morere and F. Ramos* [\[PDF\]](http://proceedings.mlr.press/v87/morere18a/morere18a.pdf). 3 | 4 | If you use any of the code related to this repository in a paper, research etc., please cite: 5 | 6 | ```bibtex 7 | @inproceedings{ 8 | morere2018bayesian, 9 | title={Bayesian {RL} for Goal-Only Rewards}, 10 | author={Morere, Philippe and Ramos, Fabio}, 11 | booktitle={Conference on Robot Learning}, 12 | year={2018}, 13 | } 14 | ``` 15 | 16 | ## Dependencies 17 | This code is written for python3. The dependencies (pip packages) are: 18 | * numpy 19 | * scipy 20 | * gym 21 | * nlopt 22 | * ghalton 23 | * tqdm 24 | 25 | ## Running the code 26 | The code entry point is `main.py`. Try run `python3 main.py --help` for available options. 27 | ### Running our method 28 | ``` 29 | python3 main.py --agent=method --sparseGymEnv=MountainCar-v0 --nStep=300 --nEp=10 --nRFF=300 --sigmaS=0.35 --sigmaA=10 -vv 30 | ``` 31 | 32 | ### Running RFF-Q 33 | ``` 34 | python3 main.py --agent=QLearning --gymEnv=MountainCar-v0 --nStep=300 --nEp=30 --nRFF=300 --sigmaS=0.35 --sigmaA=10 -vv 35 | ``` 36 | 37 | ## goal-only discrete and continuous gym environments 38 | All goal-only discrete and continuous gym environments presented in the main paper are located in the `gymEnvs` folder. To use them, these environments need to be registered in gym as described in . 39 | These environments can then be called from `main.py` with `--gymEnv=SparseMountainCar-v0` for example. 40 | -------------------------------------------------------------------------------- /features/FourierBasisFeatures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class FourierBasisFeatures: 5 | def __init__(self, dim, featureOrder, verbose=1): 6 | """ 7 | Fourier basis features up to order N 8 | :param dim: input dimension 9 | :param featureOrder: Fourier feature order 10 | """ 11 | 12 | freqs = tuple([list(range(featureOrder + 1))] * dim) 13 | # Cartesian product of arrays: 14 | prod = np.array(np.meshgrid(*freqs)).T.reshape(-1, dim) 15 | self.featureCoeff = np.pi * prod 16 | self.nFeatures = len(self.featureCoeff) 17 | if verbose >= 2: 18 | print("Feature coefficients({}): \n{}".format( 19 | len(self.featureCoeff), self.featureCoeff)) 20 | 21 | def toFeatures(self, s): 22 | return np.cos(self.featureCoeff.dot(s.T)).T 23 | -------------------------------------------------------------------------------- /features/RandomFourierFeatures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numbers 3 | from scipy.special import erfinv 4 | import ghalton 5 | 6 | 7 | class RFF: 8 | """ 9 | Random Fourier Features, Vanilla or quasi-random 10 | Note: make sure input space is normalised 11 | """ 12 | def toFeatures(self, x): 13 | pass 14 | 15 | def __init__(self, m, d, sigma, cosOnly=False, quasiRandom=True, 16 | kernel="RBF"): 17 | """ 18 | :param m: number of features 19 | :param d: input dimension 20 | :param m: feature lengthscale (can be scalar of vector of size d) 21 | :param cosOnly: Using cos-only formulation of RFF (Default=False) 22 | :param quasiRandom: Using quasi-random sequence to generate RFF 23 | (Default=True) 24 | :param kernel: Type of kernel to approximate: RBF, Laplace/Matern12, 25 | Matern32, Matern52 (Default=RBF) 26 | RFF for RBF kernel. 
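A minimal usage sketch for this class, assuming the repository root is on the Python path (main.py imports it the same way); the dimensions and lengthscales echo the README's MountainCar example but are otherwise arbitrary, and `quasiRandom=False` avoids the ghalton dependency:

```python
import numpy as np
from features.RandomFourierFeatures import RFF

d_state, d_action = 2, 1                    # MountainCar state plus a 1-D action
sigma = np.array([0.35, 0.35, 10.0])        # per-dimension lengthscales (sigmaS, sigmaA)
ftmap = RFF(m=300, d=d_state + d_action, sigma=sigma,
            quasiRandom=False, kernel="RBF")

sa = np.random.rand(5, d_state + d_action)  # 5 normalised state-action pairs in [0, 1]
phi = ftmap.toFeatures(sa)
print(phi.shape)                            # (5, 300): 150 cosine + 150 sine features
```

Passing `sigma` as an array (rather than a scalar) matches how main.py constructs it and is what `_drawCoeff` expects.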
27 | """ 28 | self.m = int(m) 29 | self.nFeatures = self.m 30 | self.sigma = sigma 31 | self.d = int(d) 32 | self.coeff = None 33 | self.offset = None 34 | self.a = 1.0 35 | 36 | # Fix sigma 37 | if isinstance(sigma, numbers.Number): 38 | sigma = np.ones(d) * sigma 39 | elif isinstance(sigma, list): 40 | sigma = np.array(sigma) 41 | 42 | if kernel == "RBF": 43 | rffKernel = RFFKernelRBF() 44 | elif kernel == "Laplace" or kernel == "Matern12": 45 | rffKernel = RFFKernelMatern12() 46 | elif kernel == "Matern32": 47 | rffKernel = RFFKernelMatern32() 48 | elif kernel == "Matern52": 49 | rffKernel = RFFKernelMatern52() 50 | else: 51 | raise ValueError("Kernel {} is not recognised.".format(kernel)) 52 | 53 | self.quasiRandom = quasiRandom 54 | self.cosOnly = cosOnly 55 | if self.cosOnly: # cos only features 56 | self.coeff = self._drawCoeff(rffKernel, m) 57 | self.offset = 2.0 * np.pi * np.random.rand(1, m) 58 | self.a = np.sqrt(1.0/float(self.m)) 59 | self.toFeatures = self._toCosOnlyFeatures 60 | else: # "cossin" 61 | assert m % 2 == 0 and "RFF: Number of fts must be multiple of 2." 62 | self.coeff = self._drawCoeff(rffKernel, int(m//2)) 63 | self.a = np.sqrt(1.0/float(self.m/2)) 64 | self.toFeatures = self._toCosSinFeatures 65 | 66 | def _drawCoeff(self, rffKernel, m): 67 | if self.quasiRandom: 68 | perms = ghalton.EA_PERMS[:self.d] 69 | sequencer = ghalton.GeneralizedHalton(perms) 70 | points = np.array(sequencer.get(m+1))[1:] 71 | freqs = rffKernel.invCDF(points) 72 | return freqs / self.sigma.reshape(1, len(self.sigma)) 73 | 74 | else: 75 | freqs = rffKernel.sampleFreqs((m, self.d)) 76 | return freqs / self.sigma.reshape(1, len(self.sigma)) 77 | 78 | def _toCosOnlyFeatures(self, x): 79 | inner = x.dot(self.coeff.T) 80 | return self.a * np.cos(inner + self.offset) 81 | 82 | def _toCosSinFeatures(self, x): 83 | inner = x.dot(self.coeff.T) 84 | return self.a * np.hstack((np.cos(inner), np.sin(inner))) 85 | 86 | 87 | class RFFKernel: 88 | def sampleFreqs(self, shape): 89 | raise NotImplementedError 90 | 91 | def invCDF(self, x): 92 | raise NotImplementedError 93 | 94 | 95 | class RFFKernelRBF(RFFKernel): 96 | def sampleFreqs(self, shape): 97 | return np.random.normal(0.0, 1.0, shape) 98 | 99 | def invCDF(self, x): 100 | return erfinv(2*x-1) * np.sqrt(2) 101 | 102 | 103 | class RFFKernelMatern12(RFFKernel): 104 | def sampleFreqs(self, shape): 105 | return np.random.normal(0, 1, shape) * \ 106 | np.sqrt(1/np.random.chisquare(1, shape)) 107 | 108 | def invCDF(self, x): 109 | # This formula comes from the inv cdf of a standard cauchy 110 | # distribution (see Laplace RFF). 
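An aside on the `invCDF` methods used by `_drawCoeff`: pushing uniform (or Halton) points through a kernel's inverse spectral CDF is inverse-transform sampling, so the resulting frequencies follow that kernel's spectral density. A quick self-contained check for the RBF case, whose spectral density is a standard normal:

```python
import numpy as np
from scipy.special import erfinv

u = np.random.rand(100_000)              # uniform points, as the Halton sequence provides
freqs = erfinv(2 * u - 1) * np.sqrt(2)   # same formula as RFFKernelRBF.invCDF
print(freqs.mean(), freqs.std())         # approximately 0 and 1, i.e. N(0, 1) samples
```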
111 | return np.tan(np.pi*(x-0.5)) 112 | 113 | 114 | class RFFKernelMatern32(RFFKernel): 115 | def sampleFreqs(self, shape): 116 | return np.random.normal(0, 1, shape) * \ 117 | np.sqrt(3/np.random.chisquare(3, shape)) 118 | 119 | def invCDF(self, x): 120 | # From https://www.researchgate.net/profile/William_Shaw9/publication/247441442_Sampling_Student%27%27s_T_distribution-use_of_the_inverse_cumulative_distribution_function/links/55bbbc7908ae9289a09574f6/Sampling-Students-T-distribution-use-of-the-inverse-cumulative-distribution-function.pdf 121 | return (2*x - 1) / np.sqrt(2*x*(1-x)) 122 | 123 | 124 | class RFFKernelMatern52(RFFKernel): 125 | def sampleFreqs(self, shape): 126 | return np.random.normal(0, 1, shape) * \ 127 | np.sqrt(5/np.random.chisquare(5, shape)) 128 | 129 | def invCDF(self, x): 130 | # From https://www.researchgate.net/profile/William_Shaw9/publication/247441442_Sampling_Student%27%27s_T_distribution-use_of_the_inverse_cumulative_distribution_function/links/55bbbc7908ae9289a09574f6/Sampling-Students-T-distribution-use-of-the-inverse-cumulative-distribution-function.pdf 131 | alpha = 4*x*(1-x) 132 | p = 4 * np.cos(np.arccos(np.sqrt(alpha))/3) / np.sqrt(alpha) 133 | return np.sign(x-0.5)*np.sqrt(p-4) 134 | -------------------------------------------------------------------------------- /gymEnvs/CartpoleSwingup.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class CartpoleSwingup(gym.Env): 7 | 8 | def __init__(self, render=False): 9 | self.env = gym.make("CartPole-v1") 10 | lowS = [-3.4, -8.0, -5.4, -8.7] 11 | highS = [3.4, 8.3, 6.35, 8.1] 12 | lowA = [-10] 13 | highA = [10] 14 | self.max_cart_pos = 3 15 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 16 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 17 | 18 | def reset(self): 19 | s = self.env.reset() 20 | s2 = np.array([s[0], s[1], s[2] + np.pi, s[3]]) 21 | print("b") 22 | self.env.env.state = s2 23 | return np.array(s2) 24 | 25 | def step(self, a): 26 | step = self.env.step(1 * (int(np.sign(a)) > 0)) 27 | ss = step[0] 28 | 29 | if abs(ss[0]) > self.max_cart_pos: 30 | reward = -100 31 | done = True 32 | elif np.cos(ss[2]) > 0.8: 33 | reward = 0 34 | done = True 35 | else: 36 | reward = np.cos(ss[2]) - 1 37 | done = False 38 | 39 | self.env.env.steps_beyond_done = None 40 | 41 | return ss, reward, done, {} 42 | -------------------------------------------------------------------------------- /gymEnvs/DoublePendulum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class DoublePendulum(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("Acrobot-v1") 9 | lowS = [-1, -1, -42, -1, -1, -74] 10 | highS = [1, 1, 42, 1, 1, 74] 11 | lowA = [-50] 12 | highA = [50] 13 | 14 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 15 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 16 | 17 | def reset(self): 18 | return self.env.reset() 19 | 20 | def step(self, a): 21 | step = self.env.step(1 * (int(np.sign(a)) > 0)) 22 | return step[0], step[1], step[2], {} 23 | -------------------------------------------------------------------------------- /gymEnvs/Hopper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class 
Hopper(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("Hopper-v2") 9 | lowS = [-0.1, -0.25, -0.4, -0.4, -0.32] + [-7] * 6 10 | highS = [1.3, 0.02, 0.025, 0.03, 0.7] + [7] * 6 11 | lowA = [-1, -1, -1] 12 | highA = [1, 1, 1] 13 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | step = self.env.step(np.array(a)) 21 | next_obs = step[0] 22 | posafter, height, ang = self.env.env.sim.data.qpos[0:3] 23 | 24 | if height > 1.3: 25 | reward = 0 26 | done = True 27 | elif abs(ang) > 0.2 or height < 0.7: 28 | reward = -1000 29 | done = True 30 | else: 31 | reward = -1 32 | done = False 33 | 34 | return next_obs, reward, done, {} 35 | -------------------------------------------------------------------------------- /gymEnvs/LunarLander.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class LunarLander(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("LunarLanderContinuous-v2") 9 | lowS = [-1.2, -0.2, -3.3, -2.8, -3.4, -4, 0, 0] 10 | highS = [1.2, 1.6, 2, 0.8, 2.5, 9, 1.0, 1.0] 11 | lowA = [-1, -1] 12 | highA = [1, 1] 13 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | next_obs, _, done, __ = self.env.step(np.array(a).reshape(-1)) 21 | 22 | if done: 23 | reward = -100 24 | elif abs(next_obs[0]) < 0.05 and abs(next_obs[1]) < 0.05: 25 | reward = 0 26 | done = True 27 | else: 28 | reward = -1 29 | 30 | return next_obs, reward, done, {} 31 | -------------------------------------------------------------------------------- /gymEnvs/MountainCar.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class MountainCar(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("MountainCar-v0") 9 | lowS = [-1.2, -0.07] 10 | highS = [0.6, 0.07] 11 | lowA = [-1.0] 12 | highA = [1.0] 13 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | if np.abs(a) < 0.33: 21 | a2 = 1 22 | else: 23 | a2 = int(np.sign(a)) + 1 24 | step = self.env.step(a2) 25 | done = bool(step[0][0] >= self.env.env.goal_position) 26 | reward = 0.0 if done else -1.0 27 | return step[0], reward, done, {} 28 | -------------------------------------------------------------------------------- /gymEnvs/Reacher.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class Reacher(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("Reacher-v2") 9 | lowS = [-1, -1, -1, -0.17, -0.2, -0.2, -49, -5.6, -0.22, -0.2, 0] 10 | highS = [1, 1, 1, 1, 0.2, 0.2, 87, 33, 0.4, 0.35, 0.0001] 11 | lowA = [-1, 1] 12 | highA = [1, 1] 13 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | step = self.env.step(np.array(a)) 21 | 
next_obs = step[0] 22 | vec = self.env.env.get_body_com("fingertip") - \ 23 | self.env.env.get_body_com("target") 24 | dist = np.linalg.norm(vec) 25 | 26 | done = (dist <= 0.015) 27 | reward = -dist 28 | if done: 29 | reward = 0.0 30 | return next_obs, reward, done, {} 31 | -------------------------------------------------------------------------------- /gymEnvs/SinglePendulum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SinglePendulum(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("Pendulum-v0") 9 | lowS = [-1, -1, -8] 10 | highS = [1, 1, 8] 11 | lowA = [-2] 12 | highA = [2] 13 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | step = self.env.step(np.array(a)) 21 | s = step[0] 22 | done = (s[0] > 0.95 and abs(s[1]) < 0.05) 23 | reward = 0.0 if done else step[1] 24 | return step[0], reward, done, {} 25 | -------------------------------------------------------------------------------- /gymEnvs/SparseCartpoleSwingup.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseCartpoleSwingup(gym.Env): 7 | 8 | def __init__(self, render=False): 9 | self.env = gym.make("CartPole-v1") 10 | lowS = [-3.4, -8.0, -5.4, -8.7] 11 | highS = [3.4, 8.3, 6.35, 8.1] 12 | lowA = [-10] 13 | highA = [10] 14 | self.max_cart_pos = 3 15 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 16 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 17 | 18 | def reset(self): 19 | s = self.env.reset() 20 | s2 = np.array([s[0], s[1], s[2] + np.pi, s[3]]) 21 | self.env.env.state = s2 22 | return np.array(s2) 23 | 24 | def step(self, a): 25 | step = self.env.step(1 * (int(np.sign(a)) > 0)) 26 | ss = step[0] 27 | 28 | if abs(ss[0]) > self.max_cart_pos: 29 | reward = -100 30 | done = True 31 | elif np.cos(ss[2]) > 0.8: 32 | reward = 1 33 | done = True 34 | else: 35 | reward = 0 36 | done = False 37 | 38 | self.env.env.steps_beyond_done = None 39 | 40 | return ss, reward, done, {} 41 | -------------------------------------------------------------------------------- /gymEnvs/SparseCartpoleSwingupDisc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseCartpoleSwingupDisc(gym.Env): 7 | 8 | def __init__(self, render=False): 9 | self.env = gym.make("CartPole-v1") 10 | lowS = [-3.4, -8.0, -5.4, -8.7] 11 | highS = [3.4, 8.3, 6.35, 8.1] 12 | self.max_cart_pos = 3 13 | self.action_space = spaces.Discrete(2) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | s = self.env.reset() 18 | s2 = np.array([s[0], s[1], s[2] + np.pi, s[3]]) 19 | self.env.env.state = s2 20 | return np.array(s2) 21 | 22 | def step(self, a): 23 | step = self.env.step(int(a)) 24 | ss = step[0] 25 | 26 | if abs(ss[0]) > self.max_cart_pos: 27 | reward = -100 28 | done = True 29 | elif np.cos(ss[2]) > 0.8: 30 | reward = 1 31 | done = True 32 | else: 33 | reward = 0 34 | done = False 35 | 36 | self.env.env.steps_beyond_done = None 37 | 38 | return ss, reward, done, {} 39 | 
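The README states that these environments must be registered with gym before they can be created by name, but the link describing how is missing. Below is a sketch of one conventional way to do so, assuming the `gymEnvs` folder is importable from the repository root (the shipped `__init__.py` instead imports from `gym.envs.phil`, i.e. the folder copied into gym's own source tree); the id follows the `SparseMountainCar-v0` naming used in the README, and `max_episode_steps` is an illustrative choice:

```python
from gym.envs.registration import register

register(
    id="SparseMountainCar-v0",
    entry_point="gymEnvs.SparseMountainCar:SparseMountainCar",
    max_episode_steps=300,
)
# The other Sparse*/ *Disc environments follow the same pattern with their own ids.
```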
-------------------------------------------------------------------------------- /gymEnvs/SparseDoublePendulum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseDoublePendulum(gym.Env): 7 | 8 | def __init__(self): 9 | self.env = gym.make("Acrobot-v1") 10 | lowS = [-1, -1, -42, -1, -1, -74] 11 | highS = [1, 1, 42, 1, 1, 74] 12 | lowA = [-50] 13 | highA = [50] 14 | 15 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 16 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 17 | 18 | def reset(self): 19 | return self.env.reset() 20 | 21 | def step(self, a): 22 | step = self.env.step(1 * (int(np.sign(a)) > 0)) 23 | done = (step[1] == 0) 24 | reward = step[1] + 1 25 | return step[0], reward, done, {} 26 | -------------------------------------------------------------------------------- /gymEnvs/SparseDoublePendulumDisc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseDoublePendulumDisc(gym.Env): 7 | 8 | def __init__(self): 9 | self.env = gym.make("Acrobot-v1") 10 | lowS = [-1, -1, -42, -1, -1, -74] 11 | highS = [1, 1, 42, 1, 1, 74] 12 | 13 | self.action_space = spaces.Discrete(3) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | step = self.env.step(int(a)) 21 | done = (step[1] == 0) 22 | reward = step[1] + 1 23 | return step[0], reward, done, {} 24 | -------------------------------------------------------------------------------- /gymEnvs/SparseHopper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseHopper(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("Hopper-v2") 9 | lowS = [-0.1, -0.25, -0.4, -0.4, -0.32] + [-7] * 6 10 | highS = [1.3, 0.02, 0.025, 0.03, 0.7] + [7] * 6 11 | lowA = [-1, -1, -1] 12 | highA = [1, 1, 1] 13 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | step = self.env.step(np.array(a)) 21 | next_obs = step[0] 22 | posafter, height, ang = self.env.env.sim.data.qpos[0:3] 23 | 24 | if height > 1.3: 25 | reward = 1 26 | done = True 27 | elif abs(ang) > 0.2 or height < 0.7: 28 | reward = -1 29 | done = True 30 | else: 31 | reward = 0 32 | done = False 33 | 34 | return next_obs, reward, done, {} 35 | -------------------------------------------------------------------------------- /gymEnvs/SparseHopperDisc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseHopperDisc(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("Hopper-v2") 9 | lowS = [-0.1, -0.25, -0.4, -0.4, -0.32] + [-7] * 6 10 | highS = [1.3, 0.02, 0.025, 0.03, 0.7] + [7] * 6 11 | self.action_space = spaces.Discrete(27) 12 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 13 | 14 | def reset(self): 15 | return self.env.reset() 16 | 17 | def step(self, a): 18 | a2 = np.array([(a // 9) - 1, ((a % 9) // 3) - 1, ((a % 9) % 3) - 1]) 19 | step = self.env.step(a2) 20 | next_obs = 
step[0] 21 | posafter, height, ang = self.env.env.sim.data.qpos[0:3] 22 | 23 | if height > 1.3: 24 | reward = 1 25 | done = True 26 | elif abs(ang) > 0.2 or height < 0.7: 27 | reward = -1 28 | done = True 29 | else: 30 | reward = 0 31 | done = False 32 | 33 | return next_obs, reward, done, {} 34 | -------------------------------------------------------------------------------- /gymEnvs/SparseLunarLander.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseLunarLander(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("LunarLanderContinuous-v2") 9 | lowS = [-1.2, -0.2, -3.3, -2.8, -3.4, -4, 0, 0] 10 | highS = [1.2, 1.6, 2, 0.8, 2.5, 9, 1.0, 1.0] 11 | lowA = [-1, -1] 12 | highA = [1, 1] 13 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | next_obs, _, done, __ = self.env.step(np.array(a).reshape(-1)) 21 | if done: 22 | reward = -1 23 | elif abs(next_obs[0]) < 0.05 and abs(next_obs[1]) < 0.05: 24 | reward = 1 25 | done = True 26 | else: 27 | reward = 0 28 | 29 | return next_obs, reward, done, {} 30 | -------------------------------------------------------------------------------- /gymEnvs/SparseLunarLanderDisc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseLunarLanderDisc(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("LunarLanderContinuous-v2") 9 | lowS = [-1.2, -0.2, -3.3, -2.8, -3.4, -4, 0, 0] 10 | highS = [1.2, 1.6, 2, 0.8, 2.5, 9, 1.0, 1.0] 11 | self.action_space = spaces.Discrete(9) 12 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 13 | 14 | def reset(self): 15 | return self.env.reset() 16 | 17 | def step(self, a): 18 | a2 = np.array([(a // 3) - 1, (a % 3) - 1]) 19 | next_obs, _, done, __ = self.env.step(a2) 20 | if done: 21 | reward = -1 22 | elif abs(next_obs[0]) < 0.05 and abs(next_obs[1]) < 0.05: 23 | reward = 1 24 | done = True 25 | else: 26 | reward = 0 27 | 28 | return next_obs, reward, done, {} 29 | -------------------------------------------------------------------------------- /gymEnvs/SparseMountainCar.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseMountainCar(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("MountainCar-v0") 9 | lowS = [-1.2, -0.07] 10 | highS = [0.6, 0.07] 11 | lowA = [-1] 12 | highA = [1] 13 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | if np.abs(a) < 0.33: 21 | a2 = 1 22 | else: 23 | a2 = int(np.sign(a)) + 1 24 | step = self.env.step(a2) 25 | done = bool(step[0][0] >= self.env.env.goal_position) 26 | return step[0], 1.0 * done, done, {} 27 | -------------------------------------------------------------------------------- /gymEnvs/SparseMountainCarDisc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseMountainCarDisc(gym.Env): 7 | def __init__(self): 8 | 
self.env = gym.make("MountainCar-v0") 9 | lowS = [-1.2, -0.07] 10 | highS = [0.6, 0.07] 11 | self.action_space = spaces.Discrete(3) 12 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 13 | 14 | def reset(self): 15 | return self.env.reset() 16 | 17 | def step(self, a): 18 | step = self.env.step(a) 19 | done = bool(step[0][0] >= self.env.env.goal_position) 20 | return step[0], 1.0 * done, done, {} 21 | -------------------------------------------------------------------------------- /gymEnvs/SparseReacher.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseReacher(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("Reacher-v2") 9 | lowS = [-1, -1, -1, -0.17, -0.2, -0.2, -49, -5.6, -0.22, -0.2, 0] 10 | highS = [1, 1, 1, 1, 0.2, 0.2, 87, 33, 0.4, 0.35, 0.0001] 11 | lowA = [-1, 1] 12 | highA = [1, 1] 13 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | step = self.env.step(np.array(a)) 21 | next_obs = step[0] 22 | vec = self.env.env.get_body_com("fingertip") - \ 23 | self.env.env.get_body_com("target") 24 | dist = np.linalg.norm(vec) 25 | 26 | done = (dist <= 0.015) 27 | reward = 1 * (dist <= 0.015) 28 | return next_obs, reward, done, {} 29 | -------------------------------------------------------------------------------- /gymEnvs/SparseReacherDisc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseReacherDisc(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("Reacher-v2") 9 | lowS = [-1, -1, -1, -0.17, -0.2, -0.2, -49, -5.6, -0.22, -0.2, 0] 10 | highS = [1, 1, 1, 1, 0.2, 0.2, 87, 33, 0.4, 0.35, 0.0001] 11 | self.action_space = spaces.Discrete(9) 12 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 13 | 14 | def reset(self): 15 | return self.env.reset() 16 | 17 | def step(self, a): 18 | a2 = np.array([(a // 3) - 1, (a % 3) - 1]) 19 | step = self.env.step(a2) 20 | next_obs = step[0] 21 | vec = self.env.env.get_body_com("fingertip") - \ 22 | self.env.env.get_body_com("target") 23 | dist = np.linalg.norm(vec) 24 | 25 | done = (dist <= 0.015) 26 | reward = 1 * (dist <= 0.015) 27 | return next_obs, reward, done, {} 28 | -------------------------------------------------------------------------------- /gymEnvs/SparseSinglePendulum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseSinglePendulum(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("Pendulum-v0") 9 | lowS = [-1, -1, -8] 10 | highS = [1, 1, 8] 11 | lowA = [-2] 12 | highA = [2] 13 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | s = self.env.env.state 21 | step = self.env.step(np.array(a)) 22 | done = (s[0] > 0.95 and abs(s[1]) < 0.05) 23 | return step[0], 1.0 * done, done, {} 24 | -------------------------------------------------------------------------------- /gymEnvs/SparseSinglePendulumDisc.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseSinglePendulumDisc(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("Pendulum-v0") 9 | lowS = [-1, -1, -8] 10 | highS = [1, 1, 8] 11 | self.action_space = spaces.Discrete(5) 12 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 13 | 14 | def reset(self): 15 | return self.env.reset() 16 | 17 | def step(self, a): 18 | s = self.env.env.state 19 | a2 = (a - 2) * 2 20 | step = self.env.step(np.array([a2])) 21 | done = (s[0] > 0.95 and abs(s[1]) < 0.05) 22 | return step[0], 1.0 * done, done, {} 23 | -------------------------------------------------------------------------------- /gymEnvs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.phil.SparseCartpoleSwingup import SparseCartpoleSwingup 2 | from gym.envs.phil.SparseDoublePendulum import SparseDoublePendulum 3 | from gym.envs.phil.SparseHopper import SparseHopper 4 | from gym.envs.phil.SparseLunarLander import SparseLunarLander 5 | from gym.envs.phil.SparseMountainCar import SparseMountainCar 6 | from gym.envs.phil.SparseReacher import SparseReacher 7 | from gym.envs.phil.SparseSinglePendulum import SparseSinglePendulum 8 | 9 | from gym.envs.phil.CartpoleSwingup import CartpoleSwingup 10 | from gym.envs.phil.DoublePendulum import DoublePendulum 11 | from gym.envs.phil.Hopper import Hopper 12 | from gym.envs.phil.LunarLander import LunarLander 13 | from gym.envs.phil.MountainCar import MountainCar 14 | from gym.envs.phil.Reacher import Reacher 15 | from gym.envs.phil.SinglePendulum import SinglePendulum 16 | 17 | from gym.envs.phil.SparseCartpoleSwingupDisc import SparseCartpoleSwingupDisc 18 | from gym.envs.phil.SparseDoublePendulumDisc import SparseDoublePendulumDisc 19 | from gym.envs.phil.SparseHopperDisc import SparseHopperDisc 20 | from gym.envs.phil.SparseLunarLanderDisc import SparseLunarLanderDisc 21 | from gym.envs.phil.SparseMountainCarDisc import SparseMountainCarDisc 22 | from gym.envs.phil.SparseReacherDisc import SparseReacherDisc 23 | from gym.envs.phil.SparseSinglePendulumDisc import SparseSinglePendulumDisc 24 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import uuid 3 | import json 4 | import os 5 | import argparse 6 | 7 | from rlutils.TransitionMemory import TransitionMemory 8 | from rlutils.Runners import EnvRunner 9 | from rlutils.EnvGlue import EnvGlue 10 | from rlutils.Envs import SparseGymEnv, GymEnv 11 | from rlutils.AgentHelper import AgentHelper 12 | from Method import Method 13 | from rlutils.Agents import QLearning 14 | from rlutils.LR_SGD import ConstantRate 15 | from rlutils.Policies import eGreedyPolicy 16 | 17 | from features.RandomFourierFeatures import RFF 18 | from features.FourierBasisFeatures import FourierBasisFeatures as FBF 19 | 20 | 21 | def fix_spaces(env): 22 | """ 23 | Set arbitrary space bounds for states variables missing bounds (inf) 24 | """ 25 | l, h = env.lowS, env.highS 26 | for i in range(len(l)): 27 | if l[i] < -100: 28 | l[i] = 100 29 | if h[i] > 100: 30 | h[i] = 100 31 | return l, h 32 | 33 | 34 | def gen_env(opt): 35 | nStep = opt.nStep 36 | nEp = opt.nEp 37 | nRFF = opt.nRFF 38 | kernelType = opt.kernelType 39 | if opt.gymEnv is not None: 40 | env = GymEnv(opt.gymEnv, 
render=opt.render) 41 | envName = opt.gymEnv 42 | elif opt.sparseGymEnv is not None: 43 | env = SparseGymEnv(opt.sparseGymEnv, render=opt.render) 44 | envName = opt.sparseGymEnv 45 | else: 46 | raise ValueError("An environment needs to be specified") 47 | 48 | lowS, highS = fix_spaces(env) 49 | glue = EnvGlue(env, lowS, highS) 50 | sigma = np.array([opt.sigmaS] * glue.dS() + [opt.sigmaA] * glue.dA()) 51 | alphaQ, betaQ, alphaU, betaU = opt.alphaQ, opt.betaQ, opt.alphaU, opt.betaU 52 | 53 | return env, envName, nEp, nStep, nRFF, kernelType, lowS, highS, glue, sigma, alphaQ, betaQ, alphaU, betaU 54 | 55 | 56 | def main(opt): 57 | # Load default values and env 58 | env, envName, nEp, nStep, nRFF, kernelType, lowS, highS, glue, sigma, alphaQ, betaQ, alphaU, betaU = gen_env(opt) 59 | 60 | print(opt) 61 | # Generate directory for experiment 62 | if opt.enableLogFile: 63 | expName, i = "{}_{}".format(opt.expName, envName), 1 64 | while os.path.exists("exps/{}{}".format(expName, i)): 65 | i += 1 66 | dirName = "exps/{}{}".format(expName, i) 67 | os.makedirs(dirName) 68 | 69 | # Run experiment 70 | allRets = [] 71 | for repI in range(opt.nRepeat): 72 | tm = TransitionMemory(nEp, nStep, glue.dS(), glue.dA()) 73 | 74 | # Agt, Pol 75 | if opt.featureType == "RFF": # Random Fourier Features 76 | ftmap = RFF(nRFF, glue.dS() + glue.dA(), sigma, kernelType) 77 | elif opt.featureType == "FBF": # Fourier basis features 78 | ftmap = FBF(glue.dS() + glue.dA(), opt.fourierBasisOrder) 79 | agtHlp = AgentHelper(glue, ftmap) 80 | 81 | if opt.agent == "method": 82 | agent = Method(agtHlp, gamma=opt.gamma, alphaQ=alphaQ, betaQ=betaQ, 83 | alphaU=alphaU, betaU=betaU, tm=tm, 84 | maxVIiter=opt.maxVIiterQ, kappa=opt.kappa) 85 | elif opt.agent == "QLearning": 86 | lr = ConstantRate(opt.learningRate) 87 | agent = QLearning(agtHlp, gamma=opt.gamma, learningRate=lr) 88 | else: 89 | raise ValueError("Unknown agent {}".format(opt.agent)) 90 | policy = eGreedyPolicy(agtHlp, agent.score, opt.epsilon) 91 | 92 | # Runner 93 | runner = EnvRunner(glue, policy, agent, tm, opt.verbose) 94 | 95 | # Go 96 | runner.run(nEp, nStep, opt.stopWhenSolved) 97 | 98 | # Keep track of returns 99 | idx = np.cumsum([0] + tm.getEpisodeLengths()) 100 | ss, _, rr, __ = tm.getTransitions() 101 | sMin, sMax = np.min(ss, 0), np.max(ss, 0) 102 | print("State min/max:\n{}\n{}".format(sMin, sMax)) 103 | rets = [np.sum(rr[idx[i-1]:idx[i]]) for i in range(1, len(idx))] 104 | print("Repeat", repI, "finished.") 105 | print("Returns:\n", rets) 106 | allRets.append(rets) 107 | 108 | # Parse all variables to file 109 | del rr, _, ss, __ 110 | if opt.enableLogFile: 111 | filename = "{}/vars_{}.json".format(dirName, uuid.uuid4().hex) 112 | with open(filename, 'w') as f: 113 | json.dump({k: repr(v) for k, v in vars().items()}, f, 114 | indent=4, sort_keys=True) 115 | 116 | 117 | def parse_args(): 118 | parser = argparse.ArgumentParser() 119 | parser.add_argument("-a", "--agent", help="method, QLearning", 120 | default="method") 121 | parser.add_argument("--gymEnv", help="Gym Enviornment to run", 122 | default=None, type=str) 123 | parser.add_argument("--sparseGymEnv", help="Sparse Gym Enviornment to run", 124 | default=None, type=str) 125 | parser.add_argument("--nStep", help="Number of steps per episode", 126 | default=500, type=int) 127 | parser.add_argument("--nEp", help="Number of episodes", default=20, 128 | type=int) 129 | parser.add_argument("--nRepeat", help="number of repetitions", default=1, 130 | type=int) 131 | parser.add_argument("--stopWhenSolved", 
help="Stop repeat after goal is " 132 | "reahced for the first time.", 133 | action="store_true", default=False) 134 | parser.add_argument("--gamma", help="Reward discount value", default=0.99, 135 | type=float) 136 | 137 | # Feature parameters 138 | parser.add_argument("--featureType", help="Type of features to use: " 139 | "RFF for Random Fourier Features, " 140 | "FBF for Fourier Basis Features.", 141 | default="RFF", type=str) 142 | parser.add_argument("--nRFF", help="Number of RFF features", default=300, 143 | type=int) 144 | parser.add_argument("--kernelType", help="RFF kernel type", 145 | default="RBF", type=str) 146 | parser.add_argument("--sigmaS", help="State RFF features lengthscale", 147 | default=0.35, type=float) 148 | parser.add_argument("--sigmaA", help="Action RFF features lengthscale", 149 | default=1.0, type=float) 150 | parser.add_argument("--fourierBasisOrder", type=int, 151 | help="Fourier basis feature order", default=3) 152 | 153 | # Algorithm parameters 154 | parser.add_argument("--kappa", help="Exploration-exploitation balance", 155 | default=1.0, type=float) 156 | parser.add_argument("--epsilon", help="epsilon-greedy policy parameter", 157 | default=0.0, type=float) 158 | parser.add_argument("--alphaQ", help="BLR weight prior precision for Q", 159 | default=0.1, type=float) 160 | parser.add_argument("--betaQ", help="BLR noise precision for Q", 161 | default=1.0, type=float) 162 | parser.add_argument("--alphaU", help="BLR weight prior precision for U", 163 | default=0.1, type=float) 164 | parser.add_argument("--betaU", help="BLR noise precision for U", 165 | default=1.0, type=float) 166 | parser.add_argument("--maxVIiterQ", help="Maximum number of VI iterations" 167 | " at the end of each episode for Q", default=30, 168 | type=int) 169 | parser.add_argument("--learningRate", help="QLearning learning rate", 170 | default=0.5, type=float) 171 | 172 | # Logging 173 | parser.add_argument("--expName", help="Experiment name", default="dummy", 174 | type=str) 175 | parser.add_argument("--enableLogFile", help="Log transitions in file", 176 | default=False, action="store_true") 177 | parser.add_argument("--render", help="Render agent while learning", 178 | action="store_true", default=False) 179 | parser.add_argument("-v", "--verbose", help="Verbose", action="count", 180 | default=0) 181 | args = parser.parse_args() 182 | return args 183 | 184 | 185 | if __name__ == '__main__': 186 | args = parse_args() 187 | main(args) 188 | -------------------------------------------------------------------------------- /rlutils/ABLR.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rlutils.utils import smw_inv_correction 4 | from rlutils.utils import batch_generator 5 | 6 | 7 | class ABLR: 8 | """ 9 | Purely analytic BLR with rank-k streaming updates 10 | """ 11 | 12 | def __init__(self, M, alpha=1.0, beta=1.0, dOut=1, computeSninv=True): 13 | """ 14 | Initialize model. 15 | :param M: Number of weights 16 | :param alpha: Weight prior precision 17 | :param beta: Noise precision 18 | :param dOut: Number of output dimensions 19 | :param computeSninv: Whether to compute the inverse of Sn. Useful for 20 | NLML computations (default=True). 
21 | """ 22 | self.N_trn = None # The total number of training samples 23 | self.M = M 24 | self.dOut = dOut 25 | self.computeSninv = computeSninv 26 | 27 | # dimensionality 28 | self.alpha = alpha 29 | self.beta = beta 30 | self.reset() 31 | 32 | def reset(self): 33 | self.Sninv_tgt = np.zeros(shape=(self.M, self.dOut)) 34 | self.mn = np.zeros(shape=(self.M, self.dOut)) 35 | self.tgt0 = np.zeros(shape=(self.M, self.dOut)) 36 | self.Sn = np.identity(self.M) / self.alpha # a.k.a. Ainv 37 | if self.computeSninv: 38 | self.Sninv = np.identity(self.M) * self.alpha # a.k.a. A 39 | self.needRecomputeMean = True 40 | 41 | def update(self, phi, y): 42 | """ 43 | Update BLR model with one a set of data points, performing a rank-k 44 | sherman-morisson-woodburry update. 45 | :param phi: Feature map for new points 46 | :param y: Target value for new points 47 | """ 48 | 49 | self.Sn -= smw_inv_correction(A_inv=self.Sn, 50 | U=np.sqrt(self.beta) * phi.T, 51 | V=np.sqrt(self.beta) * phi) 52 | 53 | self.Sninv_tgt += self.beta * np.dot(phi.T, y) 54 | if self.computeSninv: 55 | self.Sninv += self.beta * np.dot(phi.T, phi) # / 56 | self.needRecomputeMean = True 57 | 58 | def learn_from_history(self, all_phi, all_y, batch_size=None): 59 | """ 60 | Train model on dataset by cutting it into batches for faster learning. 61 | :param all_phi: data features 62 | :param all_y: data targets 63 | :param batch_size: size of batches data set is cut into. Set to None 64 | for automatically computing optimal batch size (default=None). 65 | """ 66 | # Define the batch data generator. This maintains an internal counter 67 | # and also allows wraparound for multiple epochs 68 | 69 | # Compute optimal batch size 70 | if batch_size is None: 71 | batch_size = int(np.cbrt(self.M ** 2 / 2)) 72 | 73 | data_batch_gen = batch_generator(arrays=[all_phi, all_y], 74 | batch_size=batch_size, 75 | wrapLastBatch=False) 76 | 77 | N = all_phi.shape[0] # Alias for the total number of training samples 78 | n_batches = int(np.ceil(N / batch_size)) # The number of batches 79 | 80 | """ Run the batched inference """ 81 | for _ in range(n_batches): 82 | phi_batch, Y_batch = next(data_batch_gen) 83 | self.update(phi=phi_batch, y=Y_batch) 84 | 85 | def _recompute(self): 86 | self.mn = np.dot(self.Sn, self.Sninv_tgt) 87 | self.needRecomputeMean = False 88 | 89 | def predictMean(self, phi): 90 | """ 91 | Model predictive mean. 92 | :param phi: Feature map for test data point 93 | :returns: predictive mean values for each test data point 94 | """ 95 | if self.needRecomputeMean: 96 | self._recompute() 97 | Y_pred = np.dot(phi, self.mn) 98 | return np.atleast_2d(Y_pred) 99 | 100 | def predictVar(self, phi, includeBetaVar=True): 101 | """ 102 | Model predictive variance. 103 | :param phi: Feature map for test data point 104 | :param includeBetaVar: Whether to include the 1/beta offset in variance 105 | :returns: predictive mean variances for each test data point 106 | """ 107 | var = np.sum(np.dot(phi, self.Sn) * phi, axis=1, keepdims=True) 108 | if includeBetaVar: 109 | var += 1.0/self.beta 110 | return var 111 | 112 | def predict(self, phi): 113 | return self.predictMean(phi), self.predictVar(phi) 114 | 115 | def updateTargets(self, all_phi, all_t): 116 | """ 117 | Update target for all datapoints. 
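`smw_inv_correction` lives in `rlutils/utils.py` and is not reproduced in this listing. Judging from how `update()` calls it, it presumably returns the Sherman-Morrison-Woodbury correction term, so that `Sn` stays equal to the inverse of `alpha*I + beta*Phi.T@Phi` after each rank-k batch. A hypothetical sketch of such a helper, plus an identity check:

```python
import numpy as np

def smw_inv_correction_sketch(A_inv, U, V):
    # (A + U V)^-1 = A^-1 - A^-1 U (I + V A^-1 U)^-1 V A^-1
    k = V.shape[0]
    inner = np.linalg.inv(np.eye(k) + V @ A_inv @ U)
    return A_inv @ U @ inner @ V @ A_inv

rng = np.random.default_rng(0)
M, k, alpha, beta = 8, 3, 0.1, 1.0
A_inv = np.eye(M) / alpha                          # prior Sn, as set in reset()
phi = rng.normal(size=(k, M))                      # a batch of k feature vectors
U, V = np.sqrt(beta) * phi.T, np.sqrt(beta) * phi  # arguments as passed by update()
Sn_new = A_inv - smw_inv_correction_sketch(A_inv, U, V)
print(np.allclose(Sn_new, np.linalg.inv(alpha * np.eye(M) + beta * phi.T @ phi)))  # True
```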
118 | :param all_phi: Feature map for all data points 119 | :param all_t: Targets for all data points 120 | """ 121 | self.Sninv_tgt = self.tgt0 + self.beta * np.dot(all_phi.T, all_t) 122 | self.needRecomputeMean = True 123 | 124 | 125 | class fixedMeanABLR(ABLR): 126 | """ Variant of BLR, where the predictive mean returns to a fixed value 127 | away from data points. This uses the predictive variance, minimum and 128 | maximum variance to interpolate between true predictive mean and fixed 129 | mean. 130 | Note: While the value for maximum variance is correct, the one for 131 | minimum variance is only an empirical estimate, and depends on the feature 132 | map used. This implementation works for RFF. 133 | """ 134 | def __init__(self, M, alpha, beta, fixedMean, sigRFF): 135 | super().__init__(M, alpha, beta) 136 | 137 | self.maxVar = 1.0 / alpha # Not including 1.0/beta in computation 138 | # This is an empirical observation 139 | self.minVar = 1.0 / np.mean(sigRFF * (8*beta+alpha)) 140 | self.fixedMean = fixedMean 141 | 142 | def predictMean(self, x_tst): 143 | mean = super().predictMean(x_tst) 144 | varRatio = (self.predictVar(x_tst, False) - self.minVar) / \ 145 | (self.maxVar - self.minVar) 146 | varRatio[varRatio < 0] = 0 147 | return mean * (1-varRatio) + self.fixedMean * varRatio 148 | -------------------------------------------------------------------------------- /rlutils/AgentHelper.py: -------------------------------------------------------------------------------- 1 | # import numbers 2 | import numpy as np 3 | 4 | 5 | """ 6 | AgentHelpers provide Agents with a series of function to access environment 7 | state and action spaces, and Random Fourier Features. 8 | """ 9 | 10 | 11 | class AgentHelper: 12 | def __init__(self, glue, ftmap=None): 13 | if ftmap is None: 14 | self.nFeatures = glue.dS() + glue.dA() 15 | else: 16 | self.nFeatures = ftmap.nFeatures 17 | self.glue = glue 18 | self.ftmap = ftmap 19 | 20 | def toStateActionPair(self, ss, aa): 21 | ssaa = np.hstack([ss, aa]) 22 | if self.ftmap is None: 23 | return ssaa 24 | else: 25 | return self.ftmap.toFeatures(ssaa) 26 | 27 | def isDiscA(self): 28 | return self.glue.env.discA 29 | 30 | def allDiscA(self): 31 | return np.linspace(0, 1, self.nA()) 32 | 33 | def randDiscA(self, shape): 34 | nA = self.nA() 35 | return np.random.randint(0, nA, shape) / (nA-1) 36 | 37 | def sampleContA(self, nSamps): 38 | r = np.random.random((nSamps, self.dA())) 39 | low, high = self.glue.boundsA() 40 | return np.multiply(r, high - low) + low 41 | 42 | def randA(self, n): 43 | if self.isDiscA(): 44 | return self.randDiscA(n) 45 | else: 46 | return self.sampleContA(n) 47 | 48 | def nA(self): 49 | return self.glue.nA() 50 | 51 | def boundsA(self): 52 | return self.glue.boundsA() 53 | 54 | def boundsS(self): 55 | return self.glue.boundsS() 56 | 57 | def dS(self): 58 | return self.glue.dS() 59 | 60 | def dA(self): 61 | return self.glue.dA() 62 | -------------------------------------------------------------------------------- /rlutils/Agents.py: -------------------------------------------------------------------------------- 1 | from rlutils.LR_SGD import LR_SGD 2 | from rlutils.Policies import eGreedyPolicy 3 | 4 | """ 5 | Provides an implementation of QLearning. 
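`QLearning` delegates its parameter updates to `LR_SGD` through `model.update(phi, target)` and `model.predictMean(phi)`. As a rough sketch of the general technique (an assumption, not the actual `LR_SGD` code), a constant-rate SGD step for a linear Q model looks like this:

```python
import numpy as np

class LinearTDSketch:
    """w <- w + eta * (target - w.phi) * phi, i.e. TD(0) with linear features."""
    def __init__(self, n_features, eta=0.5):
        self.w = np.zeros(n_features)
        self.eta = eta

    def predictMean(self, phi):
        return phi @ self.w

    def update(self, phi, target):
        phi = np.asarray(phi).reshape(-1)
        td_error = float(target) - phi @ self.w
        self.w += self.eta * td_error * phi

# One Q-learning step on made-up features (gamma and eta match main.py defaults):
gamma = 0.99
model = LinearTDSketch(n_features=4, eta=0.5)
phi_sa, phi_spap, r = np.full(4, 0.5), np.full(4, 0.25), 1.0
model.update(phi_sa, r + gamma * model.predictMean(phi_spap))
print(model.w)
```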
6 | """ 7 | 8 | 9 | class AbsAgent: 10 | def update(self, s, a, r, sp): 11 | raise NotImplementedError 12 | 13 | def endOfEpUpdate(self): 14 | raise NotImplementedError 15 | 16 | 17 | class AbsTDAgent(AbsAgent): 18 | def score(self, s, a): 19 | raise NotImplementedError 20 | 21 | 22 | class QLearning(AbsTDAgent): 23 | def __init__(self, agtHlp, gamma=0.99, **kwargs): 24 | """ 25 | Parameters 26 | :param agtHlp: agentHelper object. 27 | :param learningRate: stochasticOptimisers object (optional. 28 | :param gamma: discount factor (optional). 29 | """ 30 | self.agtHlp = agtHlp 31 | self.gamma = gamma 32 | self.greedyPolicy = eGreedyPolicy(agtHlp, self.score, 0.0) 33 | self.model = LR_SGD(M=agtHlp.nFeatures, **kwargs) 34 | 35 | def update(self, s, a, r, sp): 36 | """ 37 | Adds one data point at a time. 38 | """ 39 | ftsa = self.agtHlp.toStateActionPair(s, a) 40 | ap = self.greedyPolicy.pick(sp) 41 | sap = self.agtHlp.toStateActionPair(sp, ap) 42 | self.model.update(ftsa, r + self.gamma * self.model.predictMean(sap)) 43 | 44 | def score(self, s, a): 45 | sa = self.agtHlp.toStateActionPair(s, a) 46 | return self.model.predictMean(sa) 47 | 48 | def endOfEpUpdate(self): 49 | pass 50 | -------------------------------------------------------------------------------- /rlutils/EnvGlue.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rlutils.Normaliser import Normaliser 4 | 5 | """ 6 | This is the glue (wrapper) between RL environments from Env.py and the runner. 7 | EnvGlue takes care of state and action space normalisation. 8 | """ 9 | 10 | 11 | class AbsEnvGlue: 12 | def __init__(self, env): 13 | self.env = env 14 | 15 | def resetEnv(self): 16 | raise NotImplementedError 17 | 18 | def stepEnv(self, a): 19 | raise NotImplementedError 20 | 21 | def nA(self): 22 | if not self.env.discA: 23 | raise ValueError("Environment action space is continuous.") 24 | if self.env.dA > 1: 25 | raise ValueError("Environment action space is multi dimensional.") 26 | return self.env.nA 27 | 28 | def boundsS(self): 29 | return np.array(self.env.lowS), np.array(self.env.highS) 30 | 31 | def boundsA(self): 32 | return np.array(self.env.lowA), np.array(self.env.highA) 33 | 34 | def dS(self): 35 | return self.env.dS 36 | 37 | def dA(self): 38 | return self.env.dA 39 | 40 | 41 | class EnvGlue(AbsEnvGlue): 42 | def __init__(self, env, lowS=None, highS=None, normalised=True): 43 | super(EnvGlue, self).__init__(env) 44 | 45 | self.highA = [h - (1 if self.env.discA else 0) for h in self.env.highA] 46 | if lowS is None: 47 | lowS = self.env.lowS 48 | if highS is None: 49 | highS = self.env.highS 50 | self.normalised = normalised 51 | if normalised: 52 | self.nrms = Normaliser(lowS, highS, True) 53 | self.nrma = Normaliser(self.env.lowA, self.highA, True) 54 | 55 | def stepEnv(self, a): 56 | if self.normalised: 57 | rawA = self.nrma.unnormalise(a) 58 | else: 59 | rawA = a 60 | if self.env.discA: 61 | rawA = max(0, min(int(np.round(rawA)), self.highA[0])) 62 | rawS, r, done = self.env.step(rawA) 63 | rawS = rawS.reshape(-1) 64 | if self.normalised: 65 | return self.nrms.normalise(np.atleast_2d(rawS)), r, done 66 | else: 67 | return np.atleast_2d(rawS), r, done 68 | 69 | def stepsEnv(self, ss, aa): 70 | if self.normalised: 71 | rawAA = self.nrma.unnormalise(aa) 72 | rawSS = self.nrms.unnormalise(ss) 73 | else: 74 | rawAA = aa 75 | rawSS = ss 76 | if self.env.discA: 77 | rawAA = np.clip(np.round(rawAA), 0, self.highA[0]) 78 | rawSSP, RR, DONE = 
self.env.steps(rawSS, rawAA) 79 | if self.normalised: 80 | return self.nrms.normalise(rawSSP), RR, DONE 81 | else: 82 | return rawSSP, RR, DONE 83 | 84 | def costEnv(self, ss): 85 | if self.normalised: 86 | rawSS = self.nrms.unnormalise(ss) 87 | else: 88 | rawSS = ss 89 | return self.env.cost(rawSS) 90 | 91 | def resetEnv(self): 92 | rawS = self.env.reset() 93 | if self.normalised: 94 | return self.nrms.normalise(np.atleast_2d(rawS)) 95 | else: 96 | return np.atleast_2d(rawS) 97 | 98 | def boundsA(self): 99 | if self.normalised: 100 | return self.nrma.boundsNormalised() 101 | else: 102 | return super().boundsA() 103 | 104 | def boundsS(self): 105 | if self.normalised: 106 | return self.nrms.boundsNormalised() 107 | else: 108 | return super().boundsS() 109 | -------------------------------------------------------------------------------- /rlutils/Envs.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import math 3 | import random 4 | import numpy as np 5 | 6 | """ 7 | Envs provides a classic interface for all RL environments. It also makes it 8 | easy to transform Gym environments to sparse reward problems. 9 | """ 10 | 11 | 12 | class AbsEnv: 13 | def __init__(self, lowS, highS, lowA, highA, discA=False): 14 | self.lowS, self.highS, self.lowA, self.highA = lowS, highS, lowA, highA 15 | self.dS, self.dA = len(lowS), len(lowA) 16 | self.discA = discA 17 | self.nA = int(highA[0]) if discA else -1 18 | 19 | def step(self, a): 20 | raise NotImplementedError 21 | 22 | def reset(self): 23 | raise NotImplementedError 24 | 25 | 26 | class GymEnv(AbsEnv): 27 | def __init__(self, envName, render=False): 28 | self.env = gym.make(envName) 29 | 30 | # Get env bounds 31 | lowS = self.env.observation_space.low 32 | highS = self.env.observation_space.high 33 | # Discrete actions 34 | if isinstance(self.env.action_space, gym.spaces.Discrete): 35 | discA = True 36 | lowA = [0] 37 | highA = [self.env.action_space.n] 38 | # Continuous actions 39 | else: 40 | discA = False 41 | lowA = self.env.action_space.low 42 | highA = self.env.action_space.high 43 | super(GymEnv, self).__init__(lowS, highS, lowA, highA, discA) 44 | 45 | self.render = render 46 | 47 | def step(self, a): 48 | if self.render: 49 | self.env.render() 50 | 51 | s, r, done, _ = self.env.step(a) 52 | return s, r, done 53 | 54 | def reset(self): 55 | return self.env.reset() 56 | 57 | 58 | class SparseGymEnv(GymEnv): 59 | def checkSolved(self, s): 60 | raise NotImplementedError 61 | 62 | def __init__(self, envName, render=False): 63 | super(SparseGymEnv, self).__init__(envName, render) 64 | checkReward = None 65 | if envName == "MountainCar-v0": 66 | def checkReward(s): 67 | return bool(s[0] > 0.45) 68 | elif envName == "Acrobot-v1": 69 | def checkReward(s): 70 | ns = [math.atan2(s[1], s[0]), math.atan2(s[3], s[2])] 71 | return bool(-s[0] - np.cos(ns[0] + ns[1]) > 1.) 
72 | elif envName == "Pendulum-v0": 73 | def checkReward(s): 74 | return s[0] >= 0.97 75 | 76 | def _checkDone(s, a, r): 77 | return False 78 | self.checkDone = _checkDone 79 | elif envName == "CartPole-v0": 80 | def checkReward(s): 81 | fallen = s[0] < -2.4 or s[0] > 2.4 \ 82 | or s[2] < -0.2094 or s[2] > 0.2094 83 | return not fallen 84 | 85 | def _checkDone(s, a, r): 86 | return r < 1 87 | self.checkDone = _checkDone 88 | 89 | elif envName == "Reacher-v2": 90 | def checkReward(s): 91 | vec = self.env.env.get_body_com("fingertip") - \ 92 | self.env.env.get_body_com("target") 93 | dist = np.linalg.norm(vec) 94 | return dist <= 0.015 95 | 96 | def _checkDone(s, a, r): 97 | return False 98 | self.checkDone = _checkDone 99 | 100 | def resetEnv(): 101 | s = self.env.reset() 102 | print("d=", np.linalg.norm(self.env.env.goal)) 103 | while np.linalg.norm(self.env.env.goal) > 0.18: 104 | s = self.env.reset() 105 | return s 106 | self.reset = resetEnv 107 | else: 108 | raise ValueError("No solved criterion defined to create sparse", 109 | "reward for environment {}".format(envName)) 110 | self.checkSolved = checkReward 111 | 112 | def step(self, a): 113 | s, r, done = super(SparseGymEnv, self).step(a) 114 | r = self.checkSolved(s) * 1 115 | done = self.checkDone(s, a, r) 116 | return s, r, done 117 | 118 | def checkDone(self, s, a, r): 119 | return r == 1 120 | 121 | 122 | class SparseOsbandChain(AbsEnv): 123 | def __init__(self, chainLen): 124 | self.chainLen = chainLen 125 | self.p = 1.0 - 1.0 / self.chainLen 126 | discA = True 127 | lowS, highS = [1], [chainLen] 128 | lowA, highA = [0], [2] 129 | super().__init__(lowS, highS, lowA, highA, discA) 130 | self.reset() 131 | 132 | def step(self, a): 133 | r = 0 134 | ss = self.s 135 | 136 | # Transitiion 137 | if a == 0: 138 | ss -= 1 139 | elif a == 1: 140 | if random.random() < self.p: 141 | ss += 1 142 | else: 143 | ss -= 1 144 | if ss > self.chainLen: 145 | ss = self.chainLen 146 | elif ss < 1: 147 | ss = 1 148 | 149 | # Reward 150 | if ss == self.chainLen: 151 | r = 1 152 | 153 | self.s = ss 154 | return np.array([self.s]), r, r == 1 155 | 156 | def reset(self): 157 | self.s = 1 158 | return np.array([self.s]) 159 | 160 | 161 | class SemiSparseOsbandChain(SparseOsbandChain): 162 | def __init__(self, chainLen, rewardSparsity): 163 | """ 164 | param rewardSparsity: indicates how sparse the domain is (0 to 1) 165 | """ 166 | self.nRwds = int((1 - rewardSparsity) * (chainLen - 1)) 167 | self.rewardIdx = None 168 | super().__init__(chainLen) 169 | 170 | def step(self, a): 171 | s, r, done = super().step(a) 172 | if s in self.rewardIdx: 173 | r = -1 174 | return s, r, done 175 | 176 | def reset(self): 177 | idx = np.arange(1, self.chainLen) 178 | np.random.shuffle(idx) 179 | self.rewardIdx = idx[0:self.nRwds] 180 | return super().reset() 181 | 182 | 183 | class SparseExplorationChain(AbsEnv): 184 | """ 185 | Typical problem to test exploration. This chain of specified length is 186 | hard to explore. Action 1 goes to the next state (increasing). Action 0 187 | always goes to state 1. A reward of 1 is given for the right-most state, 188 | otherwise 0 reward is given. Problem starts in state 1. 
189 | """ 190 | def __init__(self, chainLen): 191 | self.chainLen = chainLen 192 | discA = True 193 | lowS, highS = [1], [chainLen] 194 | lowA, highA = [0], [2] 195 | super().__init__(lowS, highS, lowA, highA, discA) 196 | self.reset() 197 | 198 | def step(self, a): 199 | r = 0 200 | 201 | # Transition 202 | if a == 0: 203 | self.s = 1 204 | elif a == 1 and self.s < self.chainLen: 205 | self.s += 1 206 | 207 | # Reward 208 | if self.s == self.chainLen and a == 1: 209 | r = 1 210 | 211 | return np.array([self.s]), r, r == 1 212 | 213 | def reset(self): 214 | self.s = 1 215 | return np.array([self.s]) 216 | 217 | 218 | class SemiSparseExplorationChain(SparseExplorationChain): 219 | def __init__(self, chainLen, rewardSparsity): 220 | """ 221 | param rewardSparsity: indicates how sparse the domain is (0 to 1) 222 | """ 223 | self.nRwds = int((1 - rewardSparsity) * (chainLen - 1)) 224 | self.rewardIdx = None 225 | super().__init__(chainLen) 226 | 227 | def step(self, a): 228 | s, r, done = super().step(a) 229 | if s in self.rewardIdx: 230 | r = -1 231 | return s, r, done 232 | 233 | def reset(self): 234 | idx = np.arange(1, self.chainLen) 235 | np.random.shuffle(idx) 236 | self.rewardIdx = idx[0:self.nRwds] 237 | return super().reset() 238 | -------------------------------------------------------------------------------- /rlutils/LR_SGD.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class BaseOptimizer(object): 5 | """Base (Stochastic) gradient descent optimizer 6 | Parameters 7 | ---------- 8 | params : list, length = len(coefs_) + len(intercepts_) 9 | The concatenated list containing coefs_ and intercepts_ in MLP model. 10 | Used for initializing velocities and updating params 11 | learning_rate_init : float, optional, default 0.1 12 | The initial learning rate used. It controls the step-size in updating 13 | the weights 14 | Attributes 15 | ---------- 16 | learning_rate : float 17 | the current learning rate 18 | """ 19 | 20 | def __init__(self, params, learning_rate_init=0.1): 21 | self.params = params 22 | self.learning_rate_init = learning_rate_init 23 | self.learning_rate = float(learning_rate_init) 24 | 25 | def update_params(self, grads): 26 | """Update parameters with given gradients 27 | Parameters 28 | ---------- 29 | grads : list, length = len(params) 30 | Containing gradients with respect to coefs_ and intercepts_ in MLP 31 | model. So length should be aligned with params 32 | """ 33 | return self._get_updates(grads) 34 | 35 | def iteration_ends(self, time_step): 36 | """Perform update to learning rate and potentially other states at the 37 | end of an iteration 38 | """ 39 | pass 40 | 41 | def trigger_stopping(self, msg, verbose): 42 | """Decides whether it is time to stop training 43 | Parameters 44 | ---------- 45 | msg : str 46 | Message passed in for verbose output 47 | verbose : bool 48 | Print message to stdin if True 49 | Returns 50 | ------- 51 | is_stopping : bool 52 | True if training needs to stop 53 | """ 54 | if verbose: 55 | print(msg + " Stopping.") 56 | return True 57 | 58 | def reset(self): 59 | """Resets object. 60 | """ 61 | pass 62 | 63 | 64 | class ConstantRate(BaseOptimizer): 65 | """Constant learning rate for gradient descent 66 | parameters 67 | --------- 68 | learning_rate: float, optional, default 0.01 69 | The constant learning rate used. 
70 | """ 71 | 72 | def __init__(self, learning_rate=0.01): 73 | self.learning_rate = learning_rate 74 | 75 | def _get_updates(self, grads): 76 | """Get the values used to update params with given gradients 77 | Parameters 78 | ---------- 79 | grads : list, length = len(coefs_) + len(intercepts_) 80 | Containing gradients with respect to coefs_ and intercepts_ in MLP 81 | model. So length should be aligned with params 82 | Returns 83 | ------- 84 | updates : list, length = len(grads) 85 | The values to add to params 86 | """ 87 | 88 | return self.learning_rate * grads 89 | 90 | 91 | class SGDOptimizer(BaseOptimizer): 92 | """Stochastic gradient descent optimizer with momentum 93 | Parameters 94 | ---------- 95 | params : list, length = len(coefs_) + len(intercepts_) 96 | The concatenated list containing coefs_ and intercepts_ in MLP model. 97 | Used for initializing velocities and updating params 98 | learning_rate_init : float, optional, default 0.1 99 | The initial learning rate used. It controls the step-size in updating 100 | the weights 101 | lr_schedule : {'constant', 'adaptive', 'invscaling'}, default 'constant' 102 | Learning rate schedule for weight updates. 103 | -'constant', is a constant learning rate given by 104 | 'learning_rate_init'. 105 | -'invscaling' gradually decreases the learning rate 'learning_rate_' at 106 | each time step 't' using an inverse scaling exponent of 'power_t'. 107 | learning_rate_ = learning_rate_init / pow(t, power_t) 108 | -'adaptive', keeps the learning rate constant to 109 | 'learning_rate_init' as long as the training keeps decreasing. 110 | Each time 2 consecutive epochs fail to decrease the training loss by 111 | tol, or fail to increase validation score by tol if 'early_stopping' 112 | is on, the current learning rate is divided by 5. 113 | momentum : float, optional, default 0.9 114 | Value of momentum used, must be larger than or equal to 0 115 | nesterov : bool, optional, default True 116 | Whether to use nesterov's momentum or not. Use nesterov's if True 117 | Attributes 118 | ---------- 119 | learning_rate : float 120 | the current learning rate 121 | velocities : list, length = len(params) 122 | velocities that are used to update params 123 | """ 124 | 125 | def __init__(self, params, learning_rate_init=0.1, lr_schedule='constant', 126 | momentum=0.9, nesterov=True, power_t=0.5): 127 | super(SGDOptimizer, self).__init__(params, learning_rate_init) 128 | 129 | self.lr_schedule = lr_schedule 130 | self.momentum = momentum 131 | self.nesterov = nesterov 132 | self.power_t = power_t 133 | self.velocities = np.zeros_like(params).reshape(-1, 1) 134 | 135 | def iteration_ends(self, time_step): 136 | """Perform updates to learning rate and potential other states at the 137 | end of an iteration 138 | Parameters 139 | ---------- 140 | time_step : int 141 | number of training samples trained on so far, used to update 142 | learning rate for 'invscaling' 143 | """ 144 | if self.lr_schedule == 'invscaling': 145 | self.learning_rate = (float(self.learning_rate_init) / 146 | (time_step + 1) ** self.power_t) 147 | 148 | def trigger_stopping(self, msg, verbose): 149 | if self.lr_schedule == 'adaptive': 150 | if self.learning_rate > 1e-6: 151 | self.learning_rate /= 5. 152 | if verbose: 153 | print(msg + " Setting learning rate to %f" % 154 | self.learning_rate) 155 | return False 156 | else: 157 | if verbose: 158 | print(msg + " Learning rate too small. 
Stopping.") 159 | return True 160 | else: 161 | if verbose: 162 | print(msg + " Stopping.") 163 | return True 164 | 165 | def _get_updates(self, grads): 166 | """Get the values used to update params with given gradients 167 | Parameters 168 | ---------- 169 | grads : list, length = len(coefs_) + len(intercepts_) 170 | Containing gradients with respect to coefs_ and intercepts_ in MLP 171 | model. So length should be aligned with params 172 | Returns 173 | ------- 174 | updates : list, length = len(grads) 175 | The values to add to params 176 | """ 177 | updates = self.momentum * self.velocities - self.learning_rate * grads 178 | self.velocities = updates 179 | 180 | if self.nesterov: 181 | updates = self.momentum * self.velocities \ 182 | - self.learning_rate * grads 183 | 184 | return updates 185 | 186 | def reset(self): 187 | self.learning_rate = float(self.learning_rate_init) 188 | 189 | 190 | class LR_SGD: 191 | def __init__(self, M, learningRate=ConstantRate(0.01), dOut=1): 192 | """ 193 | :param M: Number of weights. 194 | :param learningRate: a opt.stochasticOptimiser.BaseOptimizer object 195 | :param dOut: Number of output dimensions. 196 | """ 197 | self.M = M 198 | self.dOut = dOut 199 | self.learningRate = learningRate 200 | self.isModelInit = False 201 | self.reset() 202 | 203 | def reset(self): 204 | self.learningRate.reset() 205 | self.w = np.random.normal(0.0, 1.0, (self.M, self.dOut)) 206 | self.time = 0 207 | 208 | def isInit(self): 209 | return self.isModelInit 210 | 211 | def update(self, phi, y): 212 | self.isModelInit = True 213 | grads = np.dot(phi.T, y - np.dot(phi, self.w)) 214 | deltaw = self.learningRate.update_params(grads) 215 | self.w += deltaw.reshape(self.w.shape) 216 | 217 | if isinstance(self.learningRate, SGDOptimizer): 218 | self.time += 1 219 | self.learningRate.iteration_ends(self.time) 220 | 221 | def predictMean(self, phi): 222 | if self.isModelInit: 223 | y = np.dot(phi, self.w).reshape(-1, self.dOut) 224 | return y 225 | else: 226 | return np.zeros((phi.shape[0], self.dOut)) 227 | 228 | def predict(self, phi): 229 | return self.predictMean, None 230 | 231 | def optimise(self, max_evals=200): 232 | # TODO optimise parameters 233 | pass 234 | -------------------------------------------------------------------------------- /rlutils/Normaliser.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Normaliser: 5 | """ 6 | Normalise/unnormalise data to [0,1] or [-1,1]. 
7 | """ 8 | def __init__(self, low, high, zeroOneInterval=True): 9 | """ 10 | :param low: List of lower-bounds for each dimension 11 | :param high: List of upper-bounds for each dimension 12 | :param zeroOneInterval: whether normalised interval should be [0,1] 13 | (default) or [-1,1] 14 | """ 15 | assert(len(low) == len(high) and 16 | "Upper and lower bounds much be same dimension.") 17 | assert(np.isfinite(np.sum(low)) and 18 | "Lower bound elements must be numbers.") 19 | assert(np.isfinite(np.sum(high)) and 20 | "Upper bound elements must be numbers.") 21 | 22 | spaceRange = np.array(high) - np.array(low) 23 | 24 | if np.sum(spaceRange > 100) > 0: 25 | print("Warning: normalising over large space.") 26 | 27 | self.factor = (1.0 if zeroOneInterval else 2.0) * spaceRange 28 | self.invFactor = (1.0 if zeroOneInterval else 2.0) / spaceRange 29 | self.offset = -np.array(low) 30 | self.finalOffset = 0.0 if zeroOneInterval else -1.0 31 | self.boundsNorm = (spaceRange * 0 - (0 if zeroOneInterval else 1), 32 | spaceRange * 0 + 1) 33 | self.boundsOrig = (np.array(low), np.array(high)) 34 | 35 | def normalise(self, x): 36 | """ 37 | Normalise x. 38 | :param x: list with 1 element, or N*D numpy matrix with N elements 39 | :return: numpy matrix with shape of input 40 | """ 41 | _x = np.array(x) 42 | if len(_x.shape) == 1: 43 | assert(_x.shape == self.offset.shape and 44 | "Data must be same dimension as lower/upper bounds") 45 | else: 46 | assert(_x.shape[1] == self.offset.shape[0] and 47 | "Data must be same dimension as lower/upper bounds") 48 | 49 | return (_x + self.offset) * self.invFactor + self.finalOffset 50 | 51 | def unnormalise(self, x): 52 | """ 53 | Unnormalise x. 54 | :param x: list with 1 element, or N*D numpy matrix with N elements 55 | :return: numpy matrix with shape of input 56 | """ 57 | _x = np.array(x) 58 | if len(_x.shape) == 1: 59 | assert(_x.shape == self.offset.shape and 60 | "Data must be same dimension as lower/upper bounds") 61 | else: 62 | assert(_x.shape[1] == self.offset.shape[0] and 63 | "Data must be same dimension as lower/upper bounds") 64 | 65 | return (_x - self.finalOffset) * self.factor - self.offset 66 | 67 | def boundsNormalised(self): 68 | return self.boundsNorm 69 | 70 | def boundsOriginal(self): 71 | return self.boundsOrig 72 | 73 | 74 | if __name__ == "__main__": 75 | nrm = Normaliser([5, -10], [6, 100], True) 76 | 77 | # Test for single element in list 78 | x = [5.5, 4] 79 | y = nrm.normalise(x) 80 | z = nrm.unnormalise(y) 81 | print(x, y, z) 82 | assert(np.isclose(0, np.linalg.norm(x-z))) 83 | 84 | # Test for numpy array of elements 85 | x = np.hstack((np.arange(5, 6, 0.1).reshape(-1, 1), 86 | np.arange(-10, 100, 11).reshape(-1, 1))) 87 | y = nrm.normalise(x) 88 | z = nrm.unnormalise(y) 89 | assert(np.isclose(0, np.linalg.norm(x-z))) 90 | -------------------------------------------------------------------------------- /rlutils/Policies.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import nlopt 3 | 4 | 5 | class AbsPolicy: 6 | def pick(self, s): 7 | raise NotImplementedError 8 | 9 | 10 | class RandomPolicy(AbsPolicy): 11 | def __init__(self, agentHelper): 12 | self.agentHelper = agentHelper 13 | if self.agentHelper.isDiscA(): 14 | self.dA = self.agentHelper 15 | self.pick = self.pickDisc 16 | else: 17 | self.pick = self.pickCont 18 | 19 | def pickDisc(self, s): 20 | return self.agentHelper.randDiscA((len(s), self.dA)) 21 | 22 | def pickCont(self, s): 23 | return 
self.agentHelper.sampleContA(len(s)) 24 | 25 | 26 | class eGreedyPolicy(AbsPolicy): 27 | def __init__(self, agentHelper, scoreFn, epsilon=0.0): 28 | self.scoreFn = scoreFn 29 | self.agentHelper = agentHelper 30 | self.epsilon = epsilon 31 | 32 | self.dA = self.agentHelper.dA() 33 | 34 | if self.agentHelper.isDiscA(): 35 | self.allAs = self.agentHelper.allDiscA().reshape(-1, 1) 36 | self.pick = self.pickDisc 37 | else: 38 | self.noSampsApprox = 15 39 | self.pick = self.pickCont 40 | 41 | # Setup nlopt 42 | self.currentState = None 43 | 44 | def __acq_fun_maximize(_x, grad): 45 | s = self.currentState.reshape(1, -1) 46 | a = _x.reshape(1, -1) 47 | score = float(self.scoreFn(s, a)) 48 | return score 49 | 50 | opt_maxeval = 8 51 | self.opt = nlopt.opt(nlopt.LN_COBYLA, self.agentHelper.dA()) 52 | boundsA = self.agentHelper.boundsA() 53 | self.opt.set_lower_bounds(boundsA[0]) 54 | self.opt.set_upper_bounds(boundsA[1]) 55 | self.opt.set_maxeval(opt_maxeval) 56 | self.opt.set_max_objective(__acq_fun_maximize) 57 | 58 | def pickDisc(self, s): 59 | randMask = np.random.random((len(s), )) < self.epsilon 60 | 61 | # Greedy 62 | repAs = np.vstack([self.allAs]*len(s)) 63 | scores = self.scoreFn(np.repeat(s, len(self.allAs), axis=0), 64 | repAs).reshape(len(s), -1) 65 | maxId = np.argmax(scores, axis=1) 66 | aa = self.allAs[maxId].reshape(-1, self.dA) 67 | 68 | # Combine greedy and random 69 | if np.any(randMask): 70 | randa = self.agentHelper.randDiscA((len(s), self.dA)) 71 | return randa * randMask + aa * ~randMask 72 | return aa 73 | 74 | def pickCont2(self, s): 75 | aa = [] 76 | for si in s: 77 | a = self.agentHelper.sampleContA(1) 78 | if np.random.random() >= self.epsilon: 79 | self.currentState = si 80 | a = a.reshape(-1) 81 | a = self.opt.optimize(a).reshape(1, -1) 82 | aa.append(a) 83 | return np.vstack(aa) 84 | 85 | def pickCont(self, s): 86 | randMask = np.random.random((len(s), )) < self.epsilon 87 | 88 | # Greedy 89 | sampsAs = self.agentHelper.sampleContA(self.noSampsApprox) 90 | repAs = np.vstack([sampsAs]*len(s)) 91 | scores = self.scoreFn(np.repeat(s, len(sampsAs), axis=0), 92 | repAs).reshape(len(s), -1) 93 | maxId = np.argmax(scores, axis=1) 94 | aa = sampsAs[maxId].reshape(-1, self.dA) 95 | 96 | # Combine greedy and random 97 | if np.any(randMask): 98 | randa = self.agentHelper.sampleContA(len(s)) 99 | return randa * randMask + aa * ~randMask 100 | return aa 101 | -------------------------------------------------------------------------------- /rlutils/Runners.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from tqdm import tqdm 3 | from rlutils.EnvGlue import AbsEnvGlue 4 | from rlutils.Agents import AbsAgent 5 | from rlutils.Policies import AbsPolicy 6 | 7 | 8 | class EnvRunner: 9 | def __init__(self, envGlue, policy, agent, tm=None, verbose=1, 10 | updateAgent=True): 11 | self.tm = tm # transition memory 12 | self.verbose = verbose 13 | 14 | # Environment 15 | if not isinstance(envGlue, AbsEnvGlue): 16 | print(envGlue, "/", AbsEnvGlue) 17 | raise ValueError("Environment glue must be of type AbsEnvGlue") 18 | self.envGlue = envGlue 19 | 20 | # Policy 21 | if not isinstance(policy, AbsPolicy): 22 | raise ValueError("Policy must be of type AbsPolicy") 23 | self.policy = policy 24 | 25 | # Agent 26 | if not isinstance(agent, AbsAgent): 27 | raise ValueError("Agent must be of type AbsAgent") 28 | self.agent = agent 29 | self.updateAgent = updateAgent 30 | 31 | def run(self, nEp, nStep, stopAtPosReturn=False): 32 | if self.verbose 
== 1: 33 | gen = tqdm(range(nEp)) 34 | else: 35 | gen = range(nEp) 36 | for i in gen: 37 | if self.verbose > 1: 38 | sys.stdout.write("Episode {}".format(i+1)) 39 | sys.stdout.flush() 40 | 41 | # Run episode 42 | epRet, lastStepDone, nStepFinish, lastR = self.runEpisode(nStep) 43 | 44 | # Save transitions 45 | if self.tm: 46 | self.tm.endEpisode() 47 | 48 | if stopAtPosReturn: 49 | if lastStepDone and nStepFinish < nStep and lastR >= 0: 50 | break 51 | 52 | # Potential update at the end of the episode 53 | if self.updateAgent: 54 | self.agent.endOfEpUpdate() 55 | 56 | if self.verbose > 1: 57 | print("Finished") 58 | 59 | def runEpisode(self, nStep): 60 | s = self.envGlue.resetEnv() 61 | ret = 0 62 | done = False 63 | for i in range(nStep): 64 | a = self.policy.pick(s) 65 | sp, r, done = self.envGlue.stepEnv(a) 66 | ret += r 67 | if self.verbose > 2: 68 | print("\nTransition:\ts:{}\n\t\ta:{}\n\t\tr:{}\n\t\ts':{}". 69 | format(s, a, r, sp)) 70 | 71 | if self.tm: 72 | self.tm.addStep(s, a, r, sp) 73 | 74 | if self.updateAgent: 75 | self.agent.update(s, a, r, sp) 76 | 77 | s = sp 78 | if done: 79 | break 80 | if self.verbose > 1: 81 | print("... finished in", i+1, "steps with return", ret) 82 | return ret, done, i+1, r 83 | -------------------------------------------------------------------------------- /rlutils/TransitionMemory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class TransitionMemory: 5 | class EpisodeMemory: 6 | def __init__(self, noStepsMax, dS, dA, dR, onlyRewards=False): 7 | self.onlyRewards = onlyRewards 8 | if not onlyRewards: 9 | self.memS = np.zeros((noStepsMax, dS)) 10 | self.memS2 = np.zeros((noStepsMax, dS)) 11 | self.memA = np.zeros((noStepsMax, dA)) 12 | self.memR = np.zeros((noStepsMax, dR)) 13 | self.memId = 0 14 | 15 | def addStep(self, s, a, r, s2): 16 | if not self.onlyRewards: 17 | self.memS[self.memId] = s 18 | self.memS2[self.memId] = s2 19 | self.memA[self.memId] = a 20 | self.memR[self.memId] = r 21 | self.memId += 1 22 | 23 | def getS(self): 24 | if self.onlyRewards: 25 | raise NotImplementedError 26 | return self.memS[:self.memId, :] 27 | 28 | def getS2(self): 29 | if self.onlyRewards: 30 | raise NotImplementedError 31 | return self.memS2[:self.memId, :] 32 | 33 | def getA(self): 34 | if self.onlyRewards: 35 | raise NotImplementedError 36 | return self.memA[:self.memId, :] 37 | 38 | def getR(self): 39 | return self.memR[:self.memId, :] 40 | 41 | def __init__(self, noEpisiodes, noStepsMax, dS, dA, dR=1, 42 | onlyRewards=False): 43 | """ 44 | Memory storing all transitions (state, action, reward, new state) for 45 | all episodes. 46 | :param noEpisiodes: number of episodes 47 | :param noStepsMax: maximum number of steps per episode 48 | :param dS: state space dimension 49 | :param dA: action space dimension 50 | :param dR: reward space dimension (default=1) 51 | :param onlyRewards: Wheter to only record rewards (default=False). 52 | """ 53 | self.memEps = [self.EpisodeMemory(noStepsMax, dS, dA, dR, onlyRewards) 54 | for _ in range(noEpisiodes)] 55 | self.onlyRewards = onlyRewards 56 | self.memId = 0 57 | 58 | def addStep(self, s, a, r, s2): 59 | """ 60 | Add step to current episode 61 | :param s: state 62 | :param a: action 63 | :param r: reward 64 | :param s2: new state 65 | """ 66 | self.memEps[self.memId].addStep(s, a, r, s2) 67 | 68 | def endEpisode(self): 69 | """ 70 | End current episode memory. Start new episode memory. 
71 | """ 72 | if self.memId < len(self.memEps): 73 | self.memId += 1 74 | 75 | def getEpisodeTransitions(self, noEp=-1): 76 | """ 77 | Retreive transitions from given episode. 78 | :pram noEp: episode id (default=-1 last episode) 79 | :returns: matrixes of states, actopms, rewards, and new states 80 | """ 81 | if noEp == -1: 82 | noEp = self.memId 83 | 84 | rr = self.memEps[noEp].getR() 85 | if self.onlyRewards: 86 | return rr 87 | 88 | ss = self.memEps[noEp].getS() 89 | ss2 = self.memEps[noEp].getS2() 90 | aa = self.memEps[noEp].getA() 91 | return ss, aa, rr, ss2 92 | 93 | def getTransitions(self): 94 | """ 95 | Retreive transitions from all episodes (concatenated). 96 | :returns: matrixes of states, actopms, rewards, and new states 97 | """ 98 | if self.memId == 0: 99 | rr = self.memEps[0].getR() 100 | if self.onlyRewards: 101 | return rr 102 | ss = self.memEps[0].getS() 103 | ss2 = self.memEps[0].getS2() 104 | aa = self.memEps[0].getA() 105 | else: 106 | rr = np.vstack([m.getR() for m in self.memEps[:self.memId]]) 107 | if self.onlyRewards: 108 | return rr 109 | ss = np.vstack([m.getS() for m in self.memEps[:self.memId]]) 110 | ss2 = np.vstack([m.getS2() for m in self.memEps[:self.memId]]) 111 | aa = np.vstack([m.getA() for m in self.memEps[:self.memId]]) 112 | return ss, aa, rr, ss2 113 | 114 | def getLastTransitions(self, nTransitions): 115 | maxEpId = min(len(self.memEps)-1, self.memId) 116 | # Count transitions 117 | nn = np.cumsum([self.memEps[i].memId for 118 | i in range(maxEpId, -1, -1)]) 119 | 120 | # Not enough transitions in memory 121 | if nn[-1] <= nTransitions: 122 | return self.getTransitions() 123 | 124 | idx = np.where(nn > nTransitions)[0][0] - 1 125 | memIdcs = range(maxEpId, idx-1, -1) 126 | 127 | if len(memIdcs) == 1: 128 | rr = self.memEps[memIdcs[0]].getR() 129 | if self.onlyRewards: 130 | return rr 131 | ss = self.memEps[memIdcs[0]].getS() 132 | ss2 = self.memEps[memIdcs[0]].getS2() 133 | aa = self.memEps[memIdcs[0]].getA() 134 | else: 135 | rr = np.vstack([self.memEps[i].getR() for i in memIdcs]) 136 | if self.onlyRewards: 137 | return rr 138 | ss = np.vstack([self.memEps[i].getS() for i in memIdcs]) 139 | ss2 = np.vstack([self.memEps[i].getS2() for i in memIdcs]) 140 | aa = np.vstack([self.memEps[i].getA() for i in memIdcs]) 141 | 142 | return ss[:nTransitions, :], aa[:nTransitions, :], \ 143 | rr[:nTransitions, :], ss2[:nTransitions, :] 144 | 145 | def getEpisodeLengths(self): 146 | """ 147 | Retreive episode lengths 148 | :returns: list of episode lengths 149 | """ 150 | return [m.memId for m in self.memEps[:self.memId]] 151 | 152 | 153 | if __name__ == "__main__": 154 | tm = TransitionMemory(3, 10, 2, 1) 155 | tm.addStep([0, 1], 0, -1, [0, 2]) 156 | tm.addStep([0, 1], 0, -1, [0, 2]) 157 | tm.addStep([0, 1], 0, -1, [0, 2]) 158 | tm.addStep([0, 1], 0, -1, [0, 2]) 159 | tm.addStep([0, 1], 0, -1, [0, 2]) 160 | tm.endEpisode() 161 | tm.addStep([0, 2], 0, -1, [0, 2]) 162 | tm.addStep([0, 2], 0, -1, [0, 2]) 163 | tm.addStep([0, 2], 0, -1, [0, 2]) 164 | tm.addStep([0, 2], 0, -1, [0, 2]) 165 | tm.addStep([0, 2], 0, -1, [0, 2]) 166 | tm.endEpisode() 167 | tm.addStep([0, 3], 0, -1, [0, 2]) 168 | tm.addStep([0, 3], 0, -1, [0, 2]) 169 | tm.endEpisode() 170 | s, a, r, s2 = tm.getTransitions() 171 | print("ALL:", s, a, r, s2) 172 | s, a, r, s2 = tm.getEpisodeTransitions() 173 | print("ONE:", s, a, r, s2) 174 | -------------------------------------------------------------------------------- /rlutils/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PhilippeMorere/EMU-Q/c7ee795256f343d468dc22f4d48b1288264e743f/rlutils/__init__.py -------------------------------------------------------------------------------- /rlutils/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def smw_inv_correction(A_inv, U, V): 5 | """ 6 | Sherman-Morrison-Woodbury update 7 | For rank k updates to the inverse matrix 8 | 9 | IMPORTANT: This is the correction factor which one must subtract from A_inv 10 | Usage: subtract this value from current A_inv 11 | 12 | ref: http://mathworld.wolfram.com/WoodburyFormula.html 13 | https://en.wikipedia.org/wiki/Woodbury_matrix_identity 14 | :param A_inv: n x n 15 | :param U: n x k 16 | :param V: k x n 17 | :return: 18 | """ 19 | rank = U.shape[1] 20 | SU = np.dot(A_inv, U) 21 | VS = np.dot(V, A_inv) 22 | I_plus_VSU_inv = np.linalg.pinv(np.identity(rank) + np.dot(VS, U)) 23 | SU_I_plus_VSU = np.dot(SU, I_plus_VSU_inv) 24 | return np.dot(SU_I_plus_VSU, VS) 25 | 26 | 27 | def batch_generator(arrays, batch_size, wrapLastBatch=False): 28 | """ 29 | Batch generator() function for yielding [x_train, y_train] batch slices for 30 | numpy arrays 31 | Appropriately deals with looping back around to the start of the dataset 32 | Generate batches, one with respect to each array's first axis. 33 | :param arrays:[array, array] or [array, None]... 34 | e.g. [X_trn, Y_trn] where X_trn and Y_trn are ndarrays 35 | :param batch_size: batch size 36 | :param wrapLastBatch: whether the last batch should wrap around dataset 37 | to include first datapoints (True), or be smaller to stop at the end of 38 | the dataset (False). 39 | :return: 40 | """ 41 | starts = [0] * len( 42 | arrays) # pointers to where we are in iteration --> [0, 0] 43 | while True: 44 | batches = [] 45 | for i, array in enumerate(arrays): 46 | start = starts[i] 47 | stop = start + batch_size 48 | diff = stop - array.shape[0] 49 | if diff <= 0: 50 | batch = array[start:stop] 51 | starts[i] += batch_size 52 | else: 53 | if wrapLastBatch: 54 | batch = np.concatenate((array[start:], array[:diff])) 55 | starts[i] = diff 56 | else: 57 | batch = array[start:] 58 | starts[i] = 0 59 | batches.append(batch) 60 | yield batches 61 | --------------------------------------------------------------------------------
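Usage sketch (not part of the repository): the two helpers in rlutils/utils.py are self-contained, and the short Python snippet below, which assumes the rlutils package is importable from the repository root, checks the Sherman-Morrison-Woodbury correction against a direct inverse and draws a few mini-batches with batch_generator.

import numpy as np
from rlutils.utils import smw_inv_correction, batch_generator

rng = np.random.RandomState(0)

# Woodbury identity: inv(A + U V) = A_inv - correction, where the correction
# returned by smw_inv_correction must be subtracted from the current inverse.
A = rng.randn(5, 5) + 5 * np.eye(5)   # well-conditioned 5x5 matrix
A_inv = np.linalg.inv(A)
U = rng.randn(5, 2)                   # n x k update factors
V = rng.randn(2, 5)                   # k x n
updated_inv = A_inv - smw_inv_correction(A_inv, U, V)
assert np.allclose(updated_inv, np.linalg.inv(A + U.dot(V)))

# batch_generator is an infinite generator, so draw a fixed number of batches;
# with wrapLastBatch=True the third batch wraps around to the start of the data.
X = np.arange(10).reshape(-1, 1)
Y = 2 * np.arange(10).reshape(-1, 1)
gen = batch_generator([X, Y], batch_size=4, wrapLastBatch=True)
for _ in range(3):
    xb, yb = next(gen)
    print(xb.ravel(), yb.ravel())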