├── LICENSE ├── Method.py ├── README.md ├── features ├── FourierBasisFeatures.py └── RandomFourierFeatures.py ├── gymEnvs ├── CartpoleSwingup.py ├── DoublePendulum.py ├── Hopper.py ├── LunarLander.py ├── MountainCar.py ├── Reacher.py ├── SinglePendulum.py ├── SparseCartpoleSwingup.py ├── SparseCartpoleSwingupDisc.py ├── SparseDoublePendulum.py ├── SparseDoublePendulumDisc.py ├── SparseHopper.py ├── SparseHopperDisc.py ├── SparseLunarLander.py ├── SparseLunarLanderDisc.py ├── SparseMountainCar.py ├── SparseMountainCarDisc.py ├── SparseReacher.py ├── SparseReacherDisc.py ├── SparseSinglePendulum.py ├── SparseSinglePendulumDisc.py └── __init__.py ├── main.py └── rlutils ├── ABLR.py ├── AgentHelper.py ├── Agents.py ├── EnvGlue.py ├── Envs.py ├── LR_SGD.py ├── Normaliser.py ├── Policies.py ├── Runners.py ├── TransitionMemory.py ├── __init__.py └── utils.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Method.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tqdm import tqdm 3 | 4 | from rlutils.ABLR import ABLR, fixedMeanABLR 5 | from rlutils.Policies import eGreedyPolicy 6 | from rlutils.Agents import AbsTDAgent 7 | 8 | 9 | class Method(AbsTDAgent): 10 | def __init__(self, agtHlp, alphaQ=0.1, betaQ=0.1, alphaU=0.1, betaU=0.1, 11 | kappa=1.0, gamma=0.99, maxVIiter=30, tm=None): 12 | self.agtHlp = agtHlp 13 | self.modelQ = ABLR(agtHlp.nFeatures, alphaQ, betaQ, computeSninv=False) 14 | sigma = agtHlp.ftmap.sigma 15 | self.modelU = fixedMeanABLR(agtHlp.nFeatures, alphaU, betaU, 0.0, 16 | sigma[0]) 17 | self.kappa = kappa 18 | self.gamma = gamma 19 | self.tm = tm 20 | self.maxVIiter = maxVIiter 21 | self.epsLSTD = 0.1 # Stop LSTD loop if model change under this value 22 | self.greedyPolicy = eGreedyPolicy(agtHlp, self.score, 0.0) 23 | self.nSampsACont = 10 24 | self.varMax = 1.0 / alphaU 25 | self.kappa /= self.varMax 26 | print("Varmax={}, Kappa={}".format(self.varMax, self.kappa)) 27 | 28 | def update(self, s, a, r, sp): 29 | """ 30 | Adds one data point at a time. 
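For orientation, the per-transition update above boils down to two coupled regression targets and one optimistic acquisition. The float-level schematic below mirrors `update()` and `score()` with placeholder numbers standing in for the two ABLR models' predictions; it is an illustration, not code from the repository.

```python
# Placeholder predictions for the next state-action pair (s', a'):
gamma = 0.99
alpha_u = 0.1
var_max = 1.0 / alpha_u               # maximum predictive variance, as set in __init__
kappa = 1.0 / var_max                 # __init__ rescales kappa by var_max

q_next_mean = 0.3                     # modelQ.predictMean(phi(s', a'))
u_next_mean = -2.0                    # modelU.predictMean(phi(s', a'))
mean_var_q_next = 4.0                 # mean over actions of modelQ.predictVar at s'

r = 0.0                               # goal-only reward: zero almost everywhere
q_target = r + gamma * q_next_mean    # target fed to modelQ.update(phi(s, a), .)
u_reward = mean_var_q_next - var_max  # "exploration reward", negative once Q is certain
u_target = u_reward + gamma * u_next_mean  # target fed to modelU.update(phi(s, a), .)

# Greedy action selection maximises the optimistic score computed by score():
score = q_next_mean + kappa * u_next_mean
print(q_target, u_target, score)
```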
31 | """ 32 | ap = self.greedyPolicy.pick(sp) 33 | ftsa = self.agtHlp.toStateActionPair(s, a) 34 | ftsap = self.agtHlp.toStateActionPair(sp, ap) 35 | 36 | # Update Q 37 | qTarget = r + self.gamma * self.modelQ.predictMean(ftsap) 38 | self.modelQ.update(ftsa, qTarget) 39 | 40 | # Update U 41 | self._updateU(ftsa, ftsap, s, a, sp) 42 | 43 | def _updateU(self, ftsa, ftsap, s, a, sp): 44 | if self.agtHlp.isDiscA(): 45 | allAs = self.agtHlp.allDiscA().reshape(-1, 1) 46 | else: 47 | allAs = self.agtHlp.sampleContA(self.nSampsACont) 48 | repSp = np.repeat(sp, len(allAs), axis=0) 49 | sampsFtsap = self.agtHlp.toStateActionPair(repSp, allAs) 50 | varQsp = np.mean(self.modelQ.predictVar(sampsFtsap)) 51 | 52 | Usap = self.modelU.predictMean(ftsap) 53 | 54 | usp = varQsp - self.varMax 55 | self.modelU.update(ftsa, usp + self.gamma * Usap) 56 | 57 | def score(self, s, a): 58 | sa = self.agtHlp.toStateActionPair(s, a) 59 | m = self.modelQ.predictMean(sa) 60 | u = self.modelU.predictMean(sa) 61 | return m + self.kappa * u 62 | 63 | def endOfEpUpdate(self): 64 | if not self.tm: 65 | return 66 | print("End of episode update") 67 | # Retrieve all data 68 | data_s, data_a, data_r, data_sp = self.tm.getTransitions() 69 | 70 | # update for Q 71 | ftsa = self.agtHlp.toStateActionPair(data_s, data_a) 72 | # Init 73 | it = 0 74 | w = self.modelQ.mn 75 | prevW = w + 2 * self.epsLSTD 76 | pbar = tqdm(total=self.maxVIiter) 77 | while np.linalg.norm(prevW - w, 2) > self.epsLSTD and \ 78 | it < self.maxVIiter: 79 | it += 1 80 | prevW = w 81 | 82 | data_ap = self.greedyPolicy.pick(data_sp) 83 | ftsap = self.agtHlp.toStateActionPair(data_sp, data_ap) 84 | qTargets = data_r + self.gamma * self.modelQ.predictMean(ftsap) 85 | self.modelQ.updateTargets(ftsa, qTargets) 86 | self.modelQ._recompute() 87 | 88 | w = self.modelQ.mn 89 | pbar.update(1) 90 | pbar.update(self.maxVIiter-it) 91 | pbar.close() 92 | print("") 93 | print("\tVI iterations for Q:{}".format(it)) 94 | 95 | if self.kappa == 0: 96 | return 97 | # update for U 98 | data_ap = self.greedyPolicy.pick(data_sp) 99 | ftsap = self.agtHlp.toStateActionPair(data_sp, data_ap) 100 | lenS = data_s.shape[0] 101 | if self.agtHlp.isDiscA(): 102 | allA = self.agtHlp.allDiscA().reshape(-1, 1) 103 | else: 104 | allA = self.agtHlp.sampleContA(self.nSampsACont) 105 | repAllA = np.vstack([allA] * lenS) 106 | repSp = np.repeat(data_sp, len(allA), 0) 107 | sampsFtsap = self.agtHlp.toStateActionPair(repSp, repAllA) 108 | allVarQsp = self.modelQ.predictVar(sampsFtsap).reshape(-1, lenS) 109 | varQ_sp = np.mean(allVarQsp, 0).reshape(-1, 1) 110 | # Init 111 | it = 0 112 | w = self.modelU.mn 113 | prevW = w + 2 * self.epsLSTD 114 | pbar = tqdm(total=self.maxVIiter) 115 | while np.linalg.norm(prevW - w, 2) > self.epsLSTD and \ 116 | it < self.maxVIiter: 117 | it += 1 118 | prevW = w 119 | 120 | self._endEpUpdateU(ftsa, ftsap, data_s, data_a, data_sp, varQ_sp) 121 | self.modelU._recompute() 122 | 123 | w = self.modelU.mn 124 | pbar.update(1) 125 | pbar.update(self.maxVIiter-it) 126 | pbar.close() 127 | print("") 128 | print("\tVI iterations for U:{}".format(it)) 129 | 130 | def _endEpUpdateU(self, ftsa, ftsap, data_s, data_a, data_sp, varQ_sp): 131 | rExpl = varQ_sp - self.varMax 132 | Usap = self.modelU.predictMean(ftsap) 133 | self.modelU.updateTargets(ftsa, rExpl + self.gamma * Usap) 134 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EMU-Q 2 | Exploring by 
Minimizing Uncertainty of Q values (EMU-Q) as presented in "Bayesian RL for Goal-Only Rewards" at CoRL'18 by *P. Morere and F. Ramos* [\[PDF\]](http://proceedings.mlr.press/v87/morere18a/morere18a.pdf). 3 | 4 | If you use any of the code related to this repository in a paper, research etc., please cite: 5 | 6 | ```bibtex 7 | @inproceedings{ 8 | morere2018bayesian, 9 | title={Bayesian {RL} for Goal-Only Rewards}, 10 | author={Morere, Philippe and Ramos, Fabio}, 11 | booktitle={Conference on Robot Learning}, 12 | year={2018}, 13 | } 14 | ``` 15 | 16 | ## Dependencies 17 | This code is written for python3. The dependencies (pip packages) are: 18 | * numpy 19 | * scipy 20 | * gym 21 | * nlopt 22 | * ghalton 23 | * tqdm 24 | 25 | ## Running the code 26 | The code entry point is `main.py`. Try run `python3 main.py --help` for available options. 27 | ### Running our method 28 | ``` 29 | python3 main.py --agent=method --sparseGymEnv=MountainCar-v0 --nStep=300 --nEp=10 --nRFF=300 --sigmaS=0.35 --sigmaA=10 -vv 30 | ``` 31 | 32 | ### Running RFF-Q 33 | ``` 34 | python3 main.py --agent=QLearning --gymEnv=MountainCar-v0 --nStep=300 --nEp=30 --nRFF=300 --sigmaS=0.35 --sigmaA=10 -vv 35 | ``` 36 | 37 | ## goal-only discrete and continuous gym environments 38 | All goal-only discrete and continuous gym environments presented in the main paper are located in the `gymEnvs` folder. To use them, these environments need to be registered in gym as described in . 39 | These environments can then be called from `main.py` with `--gymEnv=SparseMountainCar-v0` for example. 40 | -------------------------------------------------------------------------------- /features/FourierBasisFeatures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class FourierBasisFeatures: 5 | def __init__(self, dim, featureOrder, verbose=1): 6 | """ 7 | Fourier basis features up to order N 8 | :param dim: input dimension 9 | :param featureOrder: Fourier feature order 10 | """ 11 | 12 | freqs = tuple([list(range(featureOrder + 1))] * dim) 13 | # Cartesian product of arrays: 14 | prod = np.array(np.meshgrid(*freqs)).T.reshape(-1, dim) 15 | self.featureCoeff = np.pi * prod 16 | self.nFeatures = len(self.featureCoeff) 17 | if verbose >= 2: 18 | print("Feature coefficients({}): \n{}".format( 19 | len(self.featureCoeff), self.featureCoeff)) 20 | 21 | def toFeatures(self, s): 22 | return np.cos(self.featureCoeff.dot(s.T)).T 23 | -------------------------------------------------------------------------------- /features/RandomFourierFeatures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numbers 3 | from scipy.special import erfinv 4 | import ghalton 5 | 6 | 7 | class RFF: 8 | """ 9 | Random Fourier Features, Vanilla or quasi-random 10 | Note: make sure input space is normalised 11 | """ 12 | def toFeatures(self, x): 13 | pass 14 | 15 | def __init__(self, m, d, sigma, cosOnly=False, quasiRandom=True, 16 | kernel="RBF"): 17 | """ 18 | :param m: number of features 19 | :param d: input dimension 20 | :param m: feature lengthscale (can be scalar of vector of size d) 21 | :param cosOnly: Using cos-only formulation of RFF (Default=False) 22 | :param quasiRandom: Using quasi-random sequence to generate RFF 23 | (Default=True) 24 | :param kernel: Type of kernel to approximate: RBF, Laplace/Matern12, 25 | Matern32, Matern52 (Default=RBF) 26 | RFF for RBF kernel. 
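A minimal usage sketch for this class, assuming the repository root is on the Python path (main.py imports it the same way); the dimensions and lengthscales echo the README's MountainCar example but are otherwise arbitrary, and `quasiRandom=False` avoids the ghalton dependency:

```python
import numpy as np
from features.RandomFourierFeatures import RFF

d_state, d_action = 2, 1                    # MountainCar state plus a 1-D action
sigma = np.array([0.35, 0.35, 10.0])        # per-dimension lengthscales (sigmaS, sigmaA)
ftmap = RFF(m=300, d=d_state + d_action, sigma=sigma,
            quasiRandom=False, kernel="RBF")

sa = np.random.rand(5, d_state + d_action)  # 5 normalised state-action pairs in [0, 1]
phi = ftmap.toFeatures(sa)
print(phi.shape)                            # (5, 300): 150 cosine + 150 sine features
```

Passing `sigma` as an array (rather than a scalar) matches how main.py constructs it and is what `_drawCoeff` expects.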
27 | """ 28 | self.m = int(m) 29 | self.nFeatures = self.m 30 | self.sigma = sigma 31 | self.d = int(d) 32 | self.coeff = None 33 | self.offset = None 34 | self.a = 1.0 35 | 36 | # Fix sigma 37 | if isinstance(sigma, numbers.Number): 38 | sigma = np.ones(d) * sigma 39 | elif isinstance(sigma, list): 40 | sigma = np.array(sigma) 41 | 42 | if kernel == "RBF": 43 | rffKernel = RFFKernelRBF() 44 | elif kernel == "Laplace" or kernel == "Matern12": 45 | rffKernel = RFFKernelMatern12() 46 | elif kernel == "Matern32": 47 | rffKernel = RFFKernelMatern32() 48 | elif kernel == "Matern52": 49 | rffKernel = RFFKernelMatern52() 50 | else: 51 | raise ValueError("Kernel {} is not recognised.".format(kernel)) 52 | 53 | self.quasiRandom = quasiRandom 54 | self.cosOnly = cosOnly 55 | if self.cosOnly: # cos only features 56 | self.coeff = self._drawCoeff(rffKernel, m) 57 | self.offset = 2.0 * np.pi * np.random.rand(1, m) 58 | self.a = np.sqrt(1.0/float(self.m)) 59 | self.toFeatures = self._toCosOnlyFeatures 60 | else: # "cossin" 61 | assert m % 2 == 0 and "RFF: Number of fts must be multiple of 2." 62 | self.coeff = self._drawCoeff(rffKernel, int(m//2)) 63 | self.a = np.sqrt(1.0/float(self.m/2)) 64 | self.toFeatures = self._toCosSinFeatures 65 | 66 | def _drawCoeff(self, rffKernel, m): 67 | if self.quasiRandom: 68 | perms = ghalton.EA_PERMS[:self.d] 69 | sequencer = ghalton.GeneralizedHalton(perms) 70 | points = np.array(sequencer.get(m+1))[1:] 71 | freqs = rffKernel.invCDF(points) 72 | return freqs / self.sigma.reshape(1, len(self.sigma)) 73 | 74 | else: 75 | freqs = rffKernel.sampleFreqs((m, self.d)) 76 | return freqs / self.sigma.reshape(1, len(self.sigma)) 77 | 78 | def _toCosOnlyFeatures(self, x): 79 | inner = x.dot(self.coeff.T) 80 | return self.a * np.cos(inner + self.offset) 81 | 82 | def _toCosSinFeatures(self, x): 83 | inner = x.dot(self.coeff.T) 84 | return self.a * np.hstack((np.cos(inner), np.sin(inner))) 85 | 86 | 87 | class RFFKernel: 88 | def sampleFreqs(self, shape): 89 | raise NotImplementedError 90 | 91 | def invCDF(self, x): 92 | raise NotImplementedError 93 | 94 | 95 | class RFFKernelRBF(RFFKernel): 96 | def sampleFreqs(self, shape): 97 | return np.random.normal(0.0, 1.0, shape) 98 | 99 | def invCDF(self, x): 100 | return erfinv(2*x-1) * np.sqrt(2) 101 | 102 | 103 | class RFFKernelMatern12(RFFKernel): 104 | def sampleFreqs(self, shape): 105 | return np.random.normal(0, 1, shape) * \ 106 | np.sqrt(1/np.random.chisquare(1, shape)) 107 | 108 | def invCDF(self, x): 109 | # This formula comes from the inv cdf of a standard cauchy 110 | # distribution (see Laplace RFF). 
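An aside on the `invCDF` methods used by `_drawCoeff`: pushing uniform (or Halton) points through a kernel's inverse spectral CDF is inverse-transform sampling, so the resulting frequencies follow that kernel's spectral density. A quick self-contained check for the RBF case, whose spectral density is a standard normal:

```python
import numpy as np
from scipy.special import erfinv

u = np.random.rand(100_000)              # uniform points, as the Halton sequence provides
freqs = erfinv(2 * u - 1) * np.sqrt(2)   # same formula as RFFKernelRBF.invCDF
print(freqs.mean(), freqs.std())         # approximately 0 and 1, i.e. N(0, 1) samples
```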
111 | return np.tan(np.pi*(x-0.5)) 112 | 113 | 114 | class RFFKernelMatern32(RFFKernel): 115 | def sampleFreqs(self, shape): 116 | return np.random.normal(0, 1, shape) * \ 117 | np.sqrt(3/np.random.chisquare(3, shape)) 118 | 119 | def invCDF(self, x): 120 | # From https://www.researchgate.net/profile/William_Shaw9/publication/247441442_Sampling_Student%27%27s_T_distribution-use_of_the_inverse_cumulative_distribution_function/links/55bbbc7908ae9289a09574f6/Sampling-Students-T-distribution-use-of-the-inverse-cumulative-distribution-function.pdf 121 | return (2*x - 1) / np.sqrt(2*x*(1-x)) 122 | 123 | 124 | class RFFKernelMatern52(RFFKernel): 125 | def sampleFreqs(self, shape): 126 | return np.random.normal(0, 1, shape) * \ 127 | np.sqrt(5/np.random.chisquare(5, shape)) 128 | 129 | def invCDF(self, x): 130 | # From https://www.researchgate.net/profile/William_Shaw9/publication/247441442_Sampling_Student%27%27s_T_distribution-use_of_the_inverse_cumulative_distribution_function/links/55bbbc7908ae9289a09574f6/Sampling-Students-T-distribution-use-of-the-inverse-cumulative-distribution-function.pdf 131 | alpha = 4*x*(1-x) 132 | p = 4 * np.cos(np.arccos(np.sqrt(alpha))/3) / np.sqrt(alpha) 133 | return np.sign(x-0.5)*np.sqrt(p-4) 134 | -------------------------------------------------------------------------------- /gymEnvs/CartpoleSwingup.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class CartpoleSwingup(gym.Env): 7 | 8 | def __init__(self, render=False): 9 | self.env = gym.make("CartPole-v1") 10 | lowS = [-3.4, -8.0, -5.4, -8.7] 11 | highS = [3.4, 8.3, 6.35, 8.1] 12 | lowA = [-10] 13 | highA = [10] 14 | self.max_cart_pos = 3 15 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 16 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 17 | 18 | def reset(self): 19 | s = self.env.reset() 20 | s2 = np.array([s[0], s[1], s[2] + np.pi, s[3]]) 21 | print("b") 22 | self.env.env.state = s2 23 | return np.array(s2) 24 | 25 | def step(self, a): 26 | step = self.env.step(1 * (int(np.sign(a)) > 0)) 27 | ss = step[0] 28 | 29 | if abs(ss[0]) > self.max_cart_pos: 30 | reward = -100 31 | done = True 32 | elif np.cos(ss[2]) > 0.8: 33 | reward = 0 34 | done = True 35 | else: 36 | reward = np.cos(ss[2]) - 1 37 | done = False 38 | 39 | self.env.env.steps_beyond_done = None 40 | 41 | return ss, reward, done, {} 42 | -------------------------------------------------------------------------------- /gymEnvs/DoublePendulum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class DoublePendulum(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("Acrobot-v1") 9 | lowS = [-1, -1, -42, -1, -1, -74] 10 | highS = [1, 1, 42, 1, 1, 74] 11 | lowA = [-50] 12 | highA = [50] 13 | 14 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 15 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 16 | 17 | def reset(self): 18 | return self.env.reset() 19 | 20 | def step(self, a): 21 | step = self.env.step(1 * (int(np.sign(a)) > 0)) 22 | return step[0], step[1], step[2], {} 23 | -------------------------------------------------------------------------------- /gymEnvs/Hopper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class 
Hopper(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("Hopper-v2") 9 | lowS = [-0.1, -0.25, -0.4, -0.4, -0.32] + [-7] * 6 10 | highS = [1.3, 0.02, 0.025, 0.03, 0.7] + [7] * 6 11 | lowA = [-1, -1, -1] 12 | highA = [1, 1, 1] 13 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | step = self.env.step(np.array(a)) 21 | next_obs = step[0] 22 | posafter, height, ang = self.env.env.sim.data.qpos[0:3] 23 | 24 | if height > 1.3: 25 | reward = 0 26 | done = True 27 | elif abs(ang) > 0.2 or height < 0.7: 28 | reward = -1000 29 | done = True 30 | else: 31 | reward = -1 32 | done = False 33 | 34 | return next_obs, reward, done, {} 35 | -------------------------------------------------------------------------------- /gymEnvs/LunarLander.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class LunarLander(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("LunarLanderContinuous-v2") 9 | lowS = [-1.2, -0.2, -3.3, -2.8, -3.4, -4, 0, 0] 10 | highS = [1.2, 1.6, 2, 0.8, 2.5, 9, 1.0, 1.0] 11 | lowA = [-1, -1] 12 | highA = [1, 1] 13 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | next_obs, _, done, __ = self.env.step(np.array(a).reshape(-1)) 21 | 22 | if done: 23 | reward = -100 24 | elif abs(next_obs[0]) < 0.05 and abs(next_obs[1]) < 0.05: 25 | reward = 0 26 | done = True 27 | else: 28 | reward = -1 29 | 30 | return next_obs, reward, done, {} 31 | -------------------------------------------------------------------------------- /gymEnvs/MountainCar.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class MountainCar(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("MountainCar-v0") 9 | lowS = [-1.2, -0.07] 10 | highS = [0.6, 0.07] 11 | lowA = [-1.0] 12 | highA = [1.0] 13 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | if np.abs(a) < 0.33: 21 | a2 = 1 22 | else: 23 | a2 = int(np.sign(a)) + 1 24 | step = self.env.step(a2) 25 | done = bool(step[0][0] >= self.env.env.goal_position) 26 | reward = 0.0 if done else -1.0 27 | return step[0], reward, done, {} 28 | -------------------------------------------------------------------------------- /gymEnvs/Reacher.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class Reacher(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("Reacher-v2") 9 | lowS = [-1, -1, -1, -0.17, -0.2, -0.2, -49, -5.6, -0.22, -0.2, 0] 10 | highS = [1, 1, 1, 1, 0.2, 0.2, 87, 33, 0.4, 0.35, 0.0001] 11 | lowA = [-1, 1] 12 | highA = [1, 1] 13 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | step = self.env.step(np.array(a)) 21 | 
next_obs = step[0] 22 | vec = self.env.env.get_body_com("fingertip") - \ 23 | self.env.env.get_body_com("target") 24 | dist = np.linalg.norm(vec) 25 | 26 | done = (dist <= 0.015) 27 | reward = -dist 28 | if done: 29 | reward = 0.0 30 | return next_obs, reward, done, {} 31 | -------------------------------------------------------------------------------- /gymEnvs/SinglePendulum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SinglePendulum(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("Pendulum-v0") 9 | lowS = [-1, -1, -8] 10 | highS = [1, 1, 8] 11 | lowA = [-2] 12 | highA = [2] 13 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | step = self.env.step(np.array(a)) 21 | s = step[0] 22 | done = (s[0] > 0.95 and abs(s[1]) < 0.05) 23 | reward = 0.0 if done else step[1] 24 | return step[0], reward, done, {} 25 | -------------------------------------------------------------------------------- /gymEnvs/SparseCartpoleSwingup.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseCartpoleSwingup(gym.Env): 7 | 8 | def __init__(self, render=False): 9 | self.env = gym.make("CartPole-v1") 10 | lowS = [-3.4, -8.0, -5.4, -8.7] 11 | highS = [3.4, 8.3, 6.35, 8.1] 12 | lowA = [-10] 13 | highA = [10] 14 | self.max_cart_pos = 3 15 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 16 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 17 | 18 | def reset(self): 19 | s = self.env.reset() 20 | s2 = np.array([s[0], s[1], s[2] + np.pi, s[3]]) 21 | self.env.env.state = s2 22 | return np.array(s2) 23 | 24 | def step(self, a): 25 | step = self.env.step(1 * (int(np.sign(a)) > 0)) 26 | ss = step[0] 27 | 28 | if abs(ss[0]) > self.max_cart_pos: 29 | reward = -100 30 | done = True 31 | elif np.cos(ss[2]) > 0.8: 32 | reward = 1 33 | done = True 34 | else: 35 | reward = 0 36 | done = False 37 | 38 | self.env.env.steps_beyond_done = None 39 | 40 | return ss, reward, done, {} 41 | -------------------------------------------------------------------------------- /gymEnvs/SparseCartpoleSwingupDisc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseCartpoleSwingupDisc(gym.Env): 7 | 8 | def __init__(self, render=False): 9 | self.env = gym.make("CartPole-v1") 10 | lowS = [-3.4, -8.0, -5.4, -8.7] 11 | highS = [3.4, 8.3, 6.35, 8.1] 12 | self.max_cart_pos = 3 13 | self.action_space = spaces.Discrete(2) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | s = self.env.reset() 18 | s2 = np.array([s[0], s[1], s[2] + np.pi, s[3]]) 19 | self.env.env.state = s2 20 | return np.array(s2) 21 | 22 | def step(self, a): 23 | step = self.env.step(int(a)) 24 | ss = step[0] 25 | 26 | if abs(ss[0]) > self.max_cart_pos: 27 | reward = -100 28 | done = True 29 | elif np.cos(ss[2]) > 0.8: 30 | reward = 1 31 | done = True 32 | else: 33 | reward = 0 34 | done = False 35 | 36 | self.env.env.steps_beyond_done = None 37 | 38 | return ss, reward, done, {} 39 | 
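The README states that these environments must be registered with gym before they can be created by name, but the link describing how is missing. Below is a sketch of one conventional way to do so, assuming the `gymEnvs` folder is importable from the repository root (the shipped `__init__.py` instead imports from `gym.envs.phil`, i.e. the folder copied into gym's own source tree); the id follows the `SparseMountainCar-v0` naming used in the README, and `max_episode_steps` is an illustrative choice:

```python
from gym.envs.registration import register

register(
    id="SparseMountainCar-v0",
    entry_point="gymEnvs.SparseMountainCar:SparseMountainCar",
    max_episode_steps=300,
)
# The other Sparse*/ *Disc environments follow the same pattern with their own ids.
```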
-------------------------------------------------------------------------------- /gymEnvs/SparseDoublePendulum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseDoublePendulum(gym.Env): 7 | 8 | def __init__(self): 9 | self.env = gym.make("Acrobot-v1") 10 | lowS = [-1, -1, -42, -1, -1, -74] 11 | highS = [1, 1, 42, 1, 1, 74] 12 | lowA = [-50] 13 | highA = [50] 14 | 15 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 16 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 17 | 18 | def reset(self): 19 | return self.env.reset() 20 | 21 | def step(self, a): 22 | step = self.env.step(1 * (int(np.sign(a)) > 0)) 23 | done = (step[1] == 0) 24 | reward = step[1] + 1 25 | return step[0], reward, done, {} 26 | -------------------------------------------------------------------------------- /gymEnvs/SparseDoublePendulumDisc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseDoublePendulumDisc(gym.Env): 7 | 8 | def __init__(self): 9 | self.env = gym.make("Acrobot-v1") 10 | lowS = [-1, -1, -42, -1, -1, -74] 11 | highS = [1, 1, 42, 1, 1, 74] 12 | 13 | self.action_space = spaces.Discrete(3) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | step = self.env.step(int(a)) 21 | done = (step[1] == 0) 22 | reward = step[1] + 1 23 | return step[0], reward, done, {} 24 | -------------------------------------------------------------------------------- /gymEnvs/SparseHopper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseHopper(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("Hopper-v2") 9 | lowS = [-0.1, -0.25, -0.4, -0.4, -0.32] + [-7] * 6 10 | highS = [1.3, 0.02, 0.025, 0.03, 0.7] + [7] * 6 11 | lowA = [-1, -1, -1] 12 | highA = [1, 1, 1] 13 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | step = self.env.step(np.array(a)) 21 | next_obs = step[0] 22 | posafter, height, ang = self.env.env.sim.data.qpos[0:3] 23 | 24 | if height > 1.3: 25 | reward = 1 26 | done = True 27 | elif abs(ang) > 0.2 or height < 0.7: 28 | reward = -1 29 | done = True 30 | else: 31 | reward = 0 32 | done = False 33 | 34 | return next_obs, reward, done, {} 35 | -------------------------------------------------------------------------------- /gymEnvs/SparseHopperDisc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseHopperDisc(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("Hopper-v2") 9 | lowS = [-0.1, -0.25, -0.4, -0.4, -0.32] + [-7] * 6 10 | highS = [1.3, 0.02, 0.025, 0.03, 0.7] + [7] * 6 11 | self.action_space = spaces.Discrete(27) 12 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 13 | 14 | def reset(self): 15 | return self.env.reset() 16 | 17 | def step(self, a): 18 | a2 = np.array([(a // 9) - 1, ((a % 9) // 3) - 1, ((a % 9) % 3) - 1]) 19 | step = self.env.step(a2) 20 | next_obs = 
step[0] 21 | posafter, height, ang = self.env.env.sim.data.qpos[0:3] 22 | 23 | if height > 1.3: 24 | reward = 1 25 | done = True 26 | elif abs(ang) > 0.2 or height < 0.7: 27 | reward = -1 28 | done = True 29 | else: 30 | reward = 0 31 | done = False 32 | 33 | return next_obs, reward, done, {} 34 | -------------------------------------------------------------------------------- /gymEnvs/SparseLunarLander.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseLunarLander(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("LunarLanderContinuous-v2") 9 | lowS = [-1.2, -0.2, -3.3, -2.8, -3.4, -4, 0, 0] 10 | highS = [1.2, 1.6, 2, 0.8, 2.5, 9, 1.0, 1.0] 11 | lowA = [-1, -1] 12 | highA = [1, 1] 13 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | next_obs, _, done, __ = self.env.step(np.array(a).reshape(-1)) 21 | if done: 22 | reward = -1 23 | elif abs(next_obs[0]) < 0.05 and abs(next_obs[1]) < 0.05: 24 | reward = 1 25 | done = True 26 | else: 27 | reward = 0 28 | 29 | return next_obs, reward, done, {} 30 | -------------------------------------------------------------------------------- /gymEnvs/SparseLunarLanderDisc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseLunarLanderDisc(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("LunarLanderContinuous-v2") 9 | lowS = [-1.2, -0.2, -3.3, -2.8, -3.4, -4, 0, 0] 10 | highS = [1.2, 1.6, 2, 0.8, 2.5, 9, 1.0, 1.0] 11 | self.action_space = spaces.Discrete(9) 12 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 13 | 14 | def reset(self): 15 | return self.env.reset() 16 | 17 | def step(self, a): 18 | a2 = np.array([(a // 3) - 1, (a % 3) - 1]) 19 | next_obs, _, done, __ = self.env.step(a2) 20 | if done: 21 | reward = -1 22 | elif abs(next_obs[0]) < 0.05 and abs(next_obs[1]) < 0.05: 23 | reward = 1 24 | done = True 25 | else: 26 | reward = 0 27 | 28 | return next_obs, reward, done, {} 29 | -------------------------------------------------------------------------------- /gymEnvs/SparseMountainCar.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseMountainCar(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("MountainCar-v0") 9 | lowS = [-1.2, -0.07] 10 | highS = [0.6, 0.07] 11 | lowA = [-1] 12 | highA = [1] 13 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | if np.abs(a) < 0.33: 21 | a2 = 1 22 | else: 23 | a2 = int(np.sign(a)) + 1 24 | step = self.env.step(a2) 25 | done = bool(step[0][0] >= self.env.env.goal_position) 26 | return step[0], 1.0 * done, done, {} 27 | -------------------------------------------------------------------------------- /gymEnvs/SparseMountainCarDisc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseMountainCarDisc(gym.Env): 7 | def __init__(self): 8 | 
self.env = gym.make("MountainCar-v0") 9 | lowS = [-1.2, -0.07] 10 | highS = [0.6, 0.07] 11 | self.action_space = spaces.Discrete(3) 12 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 13 | 14 | def reset(self): 15 | return self.env.reset() 16 | 17 | def step(self, a): 18 | step = self.env.step(a) 19 | done = bool(step[0][0] >= self.env.env.goal_position) 20 | return step[0], 1.0 * done, done, {} 21 | -------------------------------------------------------------------------------- /gymEnvs/SparseReacher.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseReacher(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("Reacher-v2") 9 | lowS = [-1, -1, -1, -0.17, -0.2, -0.2, -49, -5.6, -0.22, -0.2, 0] 10 | highS = [1, 1, 1, 1, 0.2, 0.2, 87, 33, 0.4, 0.35, 0.0001] 11 | lowA = [-1, 1] 12 | highA = [1, 1] 13 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | step = self.env.step(np.array(a)) 21 | next_obs = step[0] 22 | vec = self.env.env.get_body_com("fingertip") - \ 23 | self.env.env.get_body_com("target") 24 | dist = np.linalg.norm(vec) 25 | 26 | done = (dist <= 0.015) 27 | reward = 1 * (dist <= 0.015) 28 | return next_obs, reward, done, {} 29 | -------------------------------------------------------------------------------- /gymEnvs/SparseReacherDisc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseReacherDisc(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("Reacher-v2") 9 | lowS = [-1, -1, -1, -0.17, -0.2, -0.2, -49, -5.6, -0.22, -0.2, 0] 10 | highS = [1, 1, 1, 1, 0.2, 0.2, 87, 33, 0.4, 0.35, 0.0001] 11 | self.action_space = spaces.Discrete(9) 12 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 13 | 14 | def reset(self): 15 | return self.env.reset() 16 | 17 | def step(self, a): 18 | a2 = np.array([(a // 3) - 1, (a % 3) - 1]) 19 | step = self.env.step(a2) 20 | next_obs = step[0] 21 | vec = self.env.env.get_body_com("fingertip") - \ 22 | self.env.env.get_body_com("target") 23 | dist = np.linalg.norm(vec) 24 | 25 | done = (dist <= 0.015) 26 | reward = 1 * (dist <= 0.015) 27 | return next_obs, reward, done, {} 28 | -------------------------------------------------------------------------------- /gymEnvs/SparseSinglePendulum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseSinglePendulum(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("Pendulum-v0") 9 | lowS = [-1, -1, -8] 10 | highS = [1, 1, 8] 11 | lowA = [-2] 12 | highA = [2] 13 | self.action_space = spaces.Box(np.array(lowA), np.array(highA)) 14 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 15 | 16 | def reset(self): 17 | return self.env.reset() 18 | 19 | def step(self, a): 20 | s = self.env.env.state 21 | step = self.env.step(np.array(a)) 22 | done = (s[0] > 0.95 and abs(s[1]) < 0.05) 23 | return step[0], 1.0 * done, done, {} 24 | -------------------------------------------------------------------------------- /gymEnvs/SparseSinglePendulumDisc.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class SparseSinglePendulumDisc(gym.Env): 7 | def __init__(self): 8 | self.env = gym.make("Pendulum-v0") 9 | lowS = [-1, -1, -8] 10 | highS = [1, 1, 8] 11 | self.action_space = spaces.Discrete(5) 12 | self.observation_space = spaces.Box(np.array(lowS), np.array(highS)) 13 | 14 | def reset(self): 15 | return self.env.reset() 16 | 17 | def step(self, a): 18 | s = self.env.env.state 19 | a2 = (a - 2) * 2 20 | step = self.env.step(np.array([a2])) 21 | done = (s[0] > 0.95 and abs(s[1]) < 0.05) 22 | return step[0], 1.0 * done, done, {} 23 | -------------------------------------------------------------------------------- /gymEnvs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.phil.SparseCartpoleSwingup import SparseCartpoleSwingup 2 | from gym.envs.phil.SparseDoublePendulum import SparseDoublePendulum 3 | from gym.envs.phil.SparseHopper import SparseHopper 4 | from gym.envs.phil.SparseLunarLander import SparseLunarLander 5 | from gym.envs.phil.SparseMountainCar import SparseMountainCar 6 | from gym.envs.phil.SparseReacher import SparseReacher 7 | from gym.envs.phil.SparseSinglePendulum import SparseSinglePendulum 8 | 9 | from gym.envs.phil.CartpoleSwingup import CartpoleSwingup 10 | from gym.envs.phil.DoublePendulum import DoublePendulum 11 | from gym.envs.phil.Hopper import Hopper 12 | from gym.envs.phil.LunarLander import LunarLander 13 | from gym.envs.phil.MountainCar import MountainCar 14 | from gym.envs.phil.Reacher import Reacher 15 | from gym.envs.phil.SinglePendulum import SinglePendulum 16 | 17 | from gym.envs.phil.SparseCartpoleSwingupDisc import SparseCartpoleSwingupDisc 18 | from gym.envs.phil.SparseDoublePendulumDisc import SparseDoublePendulumDisc 19 | from gym.envs.phil.SparseHopperDisc import SparseHopperDisc 20 | from gym.envs.phil.SparseLunarLanderDisc import SparseLunarLanderDisc 21 | from gym.envs.phil.SparseMountainCarDisc import SparseMountainCarDisc 22 | from gym.envs.phil.SparseReacherDisc import SparseReacherDisc 23 | from gym.envs.phil.SparseSinglePendulumDisc import SparseSinglePendulumDisc 24 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import uuid 3 | import json 4 | import os 5 | import argparse 6 | 7 | from rlutils.TransitionMemory import TransitionMemory 8 | from rlutils.Runners import EnvRunner 9 | from rlutils.EnvGlue import EnvGlue 10 | from rlutils.Envs import SparseGymEnv, GymEnv 11 | from rlutils.AgentHelper import AgentHelper 12 | from Method import Method 13 | from rlutils.Agents import QLearning 14 | from rlutils.LR_SGD import ConstantRate 15 | from rlutils.Policies import eGreedyPolicy 16 | 17 | from features.RandomFourierFeatures import RFF 18 | from features.FourierBasisFeatures import FourierBasisFeatures as FBF 19 | 20 | 21 | def fix_spaces(env): 22 | """ 23 | Set arbitrary space bounds for states variables missing bounds (inf) 24 | """ 25 | l, h = env.lowS, env.highS 26 | for i in range(len(l)): 27 | if l[i] < -100: 28 | l[i] = 100 29 | if h[i] > 100: 30 | h[i] = 100 31 | return l, h 32 | 33 | 34 | def gen_env(opt): 35 | nStep = opt.nStep 36 | nEp = opt.nEp 37 | nRFF = opt.nRFF 38 | kernelType = opt.kernelType 39 | if opt.gymEnv is not None: 40 | env = GymEnv(opt.gymEnv, 
render=opt.render) 41 | envName = opt.gymEnv 42 | elif opt.sparseGymEnv is not None: 43 | env = SparseGymEnv(opt.sparseGymEnv, render=opt.render) 44 | envName = opt.sparseGymEnv 45 | else: 46 | raise ValueError("An environment needs to be specified") 47 | 48 | lowS, highS = fix_spaces(env) 49 | glue = EnvGlue(env, lowS, highS) 50 | sigma = np.array([opt.sigmaS] * glue.dS() + [opt.sigmaA] * glue.dA()) 51 | alphaQ, betaQ, alphaU, betaU = opt.alphaQ, opt.betaQ, opt.alphaU, opt.betaU 52 | 53 | return env, envName, nEp, nStep, nRFF, kernelType, lowS, highS, glue, sigma, alphaQ, betaQ, alphaU, betaU 54 | 55 | 56 | def main(opt): 57 | # Load default values and env 58 | env, envName, nEp, nStep, nRFF, kernelType, lowS, highS, glue, sigma, alphaQ, betaQ, alphaU, betaU = gen_env(opt) 59 | 60 | print(opt) 61 | # Generate directory for experiment 62 | if opt.enableLogFile: 63 | expName, i = "{}_{}".format(opt.expName, envName), 1 64 | while os.path.exists("exps/{}{}".format(expName, i)): 65 | i += 1 66 | dirName = "exps/{}{}".format(expName, i) 67 | os.makedirs(dirName) 68 | 69 | # Run experiment 70 | allRets = [] 71 | for repI in range(opt.nRepeat): 72 | tm = TransitionMemory(nEp, nStep, glue.dS(), glue.dA()) 73 | 74 | # Agt, Pol 75 | if opt.featureType == "RFF": # Random Fourier Features 76 | ftmap = RFF(nRFF, glue.dS() + glue.dA(), sigma, kernelType) 77 | elif opt.featureType == "FBF": # Fourier basis features 78 | ftmap = FBF(glue.dS() + glue.dA(), opt.fourierBasisOrder) 79 | agtHlp = AgentHelper(glue, ftmap) 80 | 81 | if opt.agent == "method": 82 | agent = Method(agtHlp, gamma=opt.gamma, alphaQ=alphaQ, betaQ=betaQ, 83 | alphaU=alphaU, betaU=betaU, tm=tm, 84 | maxVIiter=opt.maxVIiterQ, kappa=opt.kappa) 85 | elif opt.agent == "QLearning": 86 | lr = ConstantRate(opt.learningRate) 87 | agent = QLearning(agtHlp, gamma=opt.gamma, learningRate=lr) 88 | else: 89 | raise ValueError("Unknown agent {}".format(opt.agent)) 90 | policy = eGreedyPolicy(agtHlp, agent.score, opt.epsilon) 91 | 92 | # Runner 93 | runner = EnvRunner(glue, policy, agent, tm, opt.verbose) 94 | 95 | # Go 96 | runner.run(nEp, nStep, opt.stopWhenSolved) 97 | 98 | # Keep track of returns 99 | idx = np.cumsum([0] + tm.getEpisodeLengths()) 100 | ss, _, rr, __ = tm.getTransitions() 101 | sMin, sMax = np.min(ss, 0), np.max(ss, 0) 102 | print("State min/max:\n{}\n{}".format(sMin, sMax)) 103 | rets = [np.sum(rr[idx[i-1]:idx[i]]) for i in range(1, len(idx))] 104 | print("Repeat", repI, "finished.") 105 | print("Returns:\n", rets) 106 | allRets.append(rets) 107 | 108 | # Parse all variables to file 109 | del rr, _, ss, __ 110 | if opt.enableLogFile: 111 | filename = "{}/vars_{}.json".format(dirName, uuid.uuid4().hex) 112 | with open(filename, 'w') as f: 113 | json.dump({k: repr(v) for k, v in vars().items()}, f, 114 | indent=4, sort_keys=True) 115 | 116 | 117 | def parse_args(): 118 | parser = argparse.ArgumentParser() 119 | parser.add_argument("-a", "--agent", help="method, QLearning", 120 | default="method") 121 | parser.add_argument("--gymEnv", help="Gym Enviornment to run", 122 | default=None, type=str) 123 | parser.add_argument("--sparseGymEnv", help="Sparse Gym Enviornment to run", 124 | default=None, type=str) 125 | parser.add_argument("--nStep", help="Number of steps per episode", 126 | default=500, type=int) 127 | parser.add_argument("--nEp", help="Number of episodes", default=20, 128 | type=int) 129 | parser.add_argument("--nRepeat", help="number of repetitions", default=1, 130 | type=int) 131 | parser.add_argument("--stopWhenSolved", 
help="Stop repeat after goal is " 132 | "reahced for the first time.", 133 | action="store_true", default=False) 134 | parser.add_argument("--gamma", help="Reward discount value", default=0.99, 135 | type=float) 136 | 137 | # Feature parameters 138 | parser.add_argument("--featureType", help="Type of features to use: " 139 | "RFF for Random Fourier Features, " 140 | "FBF for Fourier Basis Features.", 141 | default="RFF", type=str) 142 | parser.add_argument("--nRFF", help="Number of RFF features", default=300, 143 | type=int) 144 | parser.add_argument("--kernelType", help="RFF kernel type", 145 | default="RBF", type=str) 146 | parser.add_argument("--sigmaS", help="State RFF features lengthscale", 147 | default=0.35, type=float) 148 | parser.add_argument("--sigmaA", help="Action RFF features lengthscale", 149 | default=1.0, type=float) 150 | parser.add_argument("--fourierBasisOrder", type=int, 151 | help="Fourier basis feature order", default=3) 152 | 153 | # Algorithm parameters 154 | parser.add_argument("--kappa", help="Exploration-exploitation balance", 155 | default=1.0, type=float) 156 | parser.add_argument("--epsilon", help="epsilon-greedy policy parameter", 157 | default=0.0, type=float) 158 | parser.add_argument("--alphaQ", help="BLR weight prior precision for Q", 159 | default=0.1, type=float) 160 | parser.add_argument("--betaQ", help="BLR noise precision for Q", 161 | default=1.0, type=float) 162 | parser.add_argument("--alphaU", help="BLR weight prior precision for U", 163 | default=0.1, type=float) 164 | parser.add_argument("--betaU", help="BLR noise precision for U", 165 | default=1.0, type=float) 166 | parser.add_argument("--maxVIiterQ", help="Maximum number of VI iterations" 167 | " at the end of each episode for Q", default=30, 168 | type=int) 169 | parser.add_argument("--learningRate", help="QLearning learning rate", 170 | default=0.5, type=float) 171 | 172 | # Logging 173 | parser.add_argument("--expName", help="Experiment name", default="dummy", 174 | type=str) 175 | parser.add_argument("--enableLogFile", help="Log transitions in file", 176 | default=False, action="store_true") 177 | parser.add_argument("--render", help="Render agent while learning", 178 | action="store_true", default=False) 179 | parser.add_argument("-v", "--verbose", help="Verbose", action="count", 180 | default=0) 181 | args = parser.parse_args() 182 | return args 183 | 184 | 185 | if __name__ == '__main__': 186 | args = parse_args() 187 | main(args) 188 | -------------------------------------------------------------------------------- /rlutils/ABLR.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rlutils.utils import smw_inv_correction 4 | from rlutils.utils import batch_generator 5 | 6 | 7 | class ABLR: 8 | """ 9 | Purely analytic BLR with rank-k streaming updates 10 | """ 11 | 12 | def __init__(self, M, alpha=1.0, beta=1.0, dOut=1, computeSninv=True): 13 | """ 14 | Initialize model. 15 | :param M: Number of weights 16 | :param alpha: Weight prior precision 17 | :param beta: Noise precision 18 | :param dOut: Number of output dimensions 19 | :param computeSninv: Whether to compute the inverse of Sn. Useful for 20 | NLML computations (default=True). 
21 | """ 22 | self.N_trn = None # The total number of training samples 23 | self.M = M 24 | self.dOut = dOut 25 | self.computeSninv = computeSninv 26 | 27 | # dimensionality 28 | self.alpha = alpha 29 | self.beta = beta 30 | self.reset() 31 | 32 | def reset(self): 33 | self.Sninv_tgt = np.zeros(shape=(self.M, self.dOut)) 34 | self.mn = np.zeros(shape=(self.M, self.dOut)) 35 | self.tgt0 = np.zeros(shape=(self.M, self.dOut)) 36 | self.Sn = np.identity(self.M) / self.alpha # a.k.a. Ainv 37 | if self.computeSninv: 38 | self.Sninv = np.identity(self.M) * self.alpha # a.k.a. A 39 | self.needRecomputeMean = True 40 | 41 | def update(self, phi, y): 42 | """ 43 | Update BLR model with one a set of data points, performing a rank-k 44 | sherman-morisson-woodburry update. 45 | :param phi: Feature map for new points 46 | :param y: Target value for new points 47 | """ 48 | 49 | self.Sn -= smw_inv_correction(A_inv=self.Sn, 50 | U=np.sqrt(self.beta) * phi.T, 51 | V=np.sqrt(self.beta) * phi) 52 | 53 | self.Sninv_tgt += self.beta * np.dot(phi.T, y) 54 | if self.computeSninv: 55 | self.Sninv += self.beta * np.dot(phi.T, phi) # / 56 | self.needRecomputeMean = True 57 | 58 | def learn_from_history(self, all_phi, all_y, batch_size=None): 59 | """ 60 | Train model on dataset by cutting it into batches for faster learning. 61 | :param all_phi: data features 62 | :param all_y: data targets 63 | :param batch_size: size of batches data set is cut into. Set to None 64 | for automatically computing optimal batch size (default=None). 65 | """ 66 | # Define the batch data generator. This maintains an internal counter 67 | # and also allows wraparound for multiple epochs 68 | 69 | # Compute optimal batch size 70 | if batch_size is None: 71 | batch_size = int(np.cbrt(self.M ** 2 / 2)) 72 | 73 | data_batch_gen = batch_generator(arrays=[all_phi, all_y], 74 | batch_size=batch_size, 75 | wrapLastBatch=False) 76 | 77 | N = all_phi.shape[0] # Alias for the total number of training samples 78 | n_batches = int(np.ceil(N / batch_size)) # The number of batches 79 | 80 | """ Run the batched inference """ 81 | for _ in range(n_batches): 82 | phi_batch, Y_batch = next(data_batch_gen) 83 | self.update(phi=phi_batch, y=Y_batch) 84 | 85 | def _recompute(self): 86 | self.mn = np.dot(self.Sn, self.Sninv_tgt) 87 | self.needRecomputeMean = False 88 | 89 | def predictMean(self, phi): 90 | """ 91 | Model predictive mean. 92 | :param phi: Feature map for test data point 93 | :returns: predictive mean values for each test data point 94 | """ 95 | if self.needRecomputeMean: 96 | self._recompute() 97 | Y_pred = np.dot(phi, self.mn) 98 | return np.atleast_2d(Y_pred) 99 | 100 | def predictVar(self, phi, includeBetaVar=True): 101 | """ 102 | Model predictive variance. 103 | :param phi: Feature map for test data point 104 | :param includeBetaVar: Whether to include the 1/beta offset in variance 105 | :returns: predictive mean variances for each test data point 106 | """ 107 | var = np.sum(np.dot(phi, self.Sn) * phi, axis=1, keepdims=True) 108 | if includeBetaVar: 109 | var += 1.0/self.beta 110 | return var 111 | 112 | def predict(self, phi): 113 | return self.predictMean(phi), self.predictVar(phi) 114 | 115 | def updateTargets(self, all_phi, all_t): 116 | """ 117 | Update target for all datapoints. 
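`smw_inv_correction` lives in `rlutils/utils.py` and is not reproduced in this listing. Judging from how `update()` calls it, it presumably returns the Sherman-Morrison-Woodbury correction term, so that `Sn` stays equal to the inverse of `alpha*I + beta*Phi.T@Phi` after each rank-k batch. A hypothetical sketch of such a helper, plus an identity check:

```python
import numpy as np

def smw_inv_correction_sketch(A_inv, U, V):
    # (A + U V)^-1 = A^-1 - A^-1 U (I + V A^-1 U)^-1 V A^-1
    k = V.shape[0]
    inner = np.linalg.inv(np.eye(k) + V @ A_inv @ U)
    return A_inv @ U @ inner @ V @ A_inv

rng = np.random.default_rng(0)
M, k, alpha, beta = 8, 3, 0.1, 1.0
A_inv = np.eye(M) / alpha                          # prior Sn, as set in reset()
phi = rng.normal(size=(k, M))                      # a batch of k feature vectors
U, V = np.sqrt(beta) * phi.T, np.sqrt(beta) * phi  # arguments as passed by update()
Sn_new = A_inv - smw_inv_correction_sketch(A_inv, U, V)
print(np.allclose(Sn_new, np.linalg.inv(alpha * np.eye(M) + beta * phi.T @ phi)))  # True
```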
118 | :param all_phi: Feature map for all data points 119 | :param all_t: Targets for all data points 120 | """ 121 | self.Sninv_tgt = self.tgt0 + self.beta * np.dot(all_phi.T, all_t) 122 | self.needRecomputeMean = True 123 | 124 | 125 | class fixedMeanABLR(ABLR): 126 | """ Variant of BLR, where the predictive mean returns to a fixed value 127 | away from data points. This uses the predictive variance, minimum and 128 | maximum variance to interpolate between true predictive mean and fixed 129 | mean. 130 | Note: While the value for maximum variance is correct, the one for 131 | minimum variance is only an empirical estimate, and depends on the feature 132 | map used. This implementation works for RFF. 133 | """ 134 | def __init__(self, M, alpha, beta, fixedMean, sigRFF): 135 | super().__init__(M, alpha, beta) 136 | 137 | self.maxVar = 1.0 / alpha # Not including 1.0/beta in computation 138 | # This is an empirical observation 139 | self.minVar = 1.0 / np.mean(sigRFF * (8*beta+alpha)) 140 | self.fixedMean = fixedMean 141 | 142 | def predictMean(self, x_tst): 143 | mean = super().predictMean(x_tst) 144 | varRatio = (self.predictVar(x_tst, False) - self.minVar) / \ 145 | (self.maxVar - self.minVar) 146 | varRatio[varRatio < 0] = 0 147 | return mean * (1-varRatio) + self.fixedMean * varRatio 148 | -------------------------------------------------------------------------------- /rlutils/AgentHelper.py: -------------------------------------------------------------------------------- 1 | # import numbers 2 | import numpy as np 3 | 4 | 5 | """ 6 | AgentHelpers provide Agents with a series of function to access environment 7 | state and action spaces, and Random Fourier Features. 8 | """ 9 | 10 | 11 | class AgentHelper: 12 | def __init__(self, glue, ftmap=None): 13 | if ftmap is None: 14 | self.nFeatures = glue.dS() + glue.dA() 15 | else: 16 | self.nFeatures = ftmap.nFeatures 17 | self.glue = glue 18 | self.ftmap = ftmap 19 | 20 | def toStateActionPair(self, ss, aa): 21 | ssaa = np.hstack([ss, aa]) 22 | if self.ftmap is None: 23 | return ssaa 24 | else: 25 | return self.ftmap.toFeatures(ssaa) 26 | 27 | def isDiscA(self): 28 | return self.glue.env.discA 29 | 30 | def allDiscA(self): 31 | return np.linspace(0, 1, self.nA()) 32 | 33 | def randDiscA(self, shape): 34 | nA = self.nA() 35 | return np.random.randint(0, nA, shape) / (nA-1) 36 | 37 | def sampleContA(self, nSamps): 38 | r = np.random.random((nSamps, self.dA())) 39 | low, high = self.glue.boundsA() 40 | return np.multiply(r, high - low) + low 41 | 42 | def randA(self, n): 43 | if self.isDiscA(): 44 | return self.randDiscA(n) 45 | else: 46 | return self.sampleContA(n) 47 | 48 | def nA(self): 49 | return self.glue.nA() 50 | 51 | def boundsA(self): 52 | return self.glue.boundsA() 53 | 54 | def boundsS(self): 55 | return self.glue.boundsS() 56 | 57 | def dS(self): 58 | return self.glue.dS() 59 | 60 | def dA(self): 61 | return self.glue.dA() 62 | -------------------------------------------------------------------------------- /rlutils/Agents.py: -------------------------------------------------------------------------------- 1 | from rlutils.LR_SGD import LR_SGD 2 | from rlutils.Policies import eGreedyPolicy 3 | 4 | """ 5 | Provides an implementation of QLearning. 
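`QLearning` delegates its parameter updates to `LR_SGD` through `model.update(phi, target)` and `model.predictMean(phi)`. As a rough sketch of the general technique (an assumption, not the actual `LR_SGD` code), a constant-rate SGD step for a linear Q model looks like this:

```python
import numpy as np

class LinearTDSketch:
    """w <- w + eta * (target - w.phi) * phi, i.e. TD(0) with linear features."""
    def __init__(self, n_features, eta=0.5):
        self.w = np.zeros(n_features)
        self.eta = eta

    def predictMean(self, phi):
        return phi @ self.w

    def update(self, phi, target):
        phi = np.asarray(phi).reshape(-1)
        td_error = float(target) - phi @ self.w
        self.w += self.eta * td_error * phi

# One Q-learning step on made-up features (gamma and eta match main.py defaults):
gamma = 0.99
model = LinearTDSketch(n_features=4, eta=0.5)
phi_sa, phi_spap, r = np.full(4, 0.5), np.full(4, 0.25), 1.0
model.update(phi_sa, r + gamma * model.predictMean(phi_spap))
print(model.w)
```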
6 | """ 7 | 8 | 9 | class AbsAgent: 10 | def update(self, s, a, r, sp): 11 | raise NotImplementedError 12 | 13 | def endOfEpUpdate(self): 14 | raise NotImplementedError 15 | 16 | 17 | class AbsTDAgent(AbsAgent): 18 | def score(self, s, a): 19 | raise NotImplementedError 20 | 21 | 22 | class QLearning(AbsTDAgent): 23 | def __init__(self, agtHlp, gamma=0.99, **kwargs): 24 | """ 25 | Parameters 26 | :param agtHlp: agentHelper object. 27 | :param learningRate: stochasticOptimisers object (optional. 28 | :param gamma: discount factor (optional). 29 | """ 30 | self.agtHlp = agtHlp 31 | self.gamma = gamma 32 | self.greedyPolicy = eGreedyPolicy(agtHlp, self.score, 0.0) 33 | self.model = LR_SGD(M=agtHlp.nFeatures, **kwargs) 34 | 35 | def update(self, s, a, r, sp): 36 | """ 37 | Adds one data point at a time. 38 | """ 39 | ftsa = self.agtHlp.toStateActionPair(s, a) 40 | ap = self.greedyPolicy.pick(sp) 41 | sap = self.agtHlp.toStateActionPair(sp, ap) 42 | self.model.update(ftsa, r + self.gamma * self.model.predictMean(sap)) 43 | 44 | def score(self, s, a): 45 | sa = self.agtHlp.toStateActionPair(s, a) 46 | return self.model.predictMean(sa) 47 | 48 | def endOfEpUpdate(self): 49 | pass 50 | -------------------------------------------------------------------------------- /rlutils/EnvGlue.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rlutils.Normaliser import Normaliser 4 | 5 | """ 6 | This is the glue (wrapper) between RL environments from Env.py and the runner. 7 | EnvGlue takes care of state and action space normalisation. 8 | """ 9 | 10 | 11 | class AbsEnvGlue: 12 | def __init__(self, env): 13 | self.env = env 14 | 15 | def resetEnv(self): 16 | raise NotImplementedError 17 | 18 | def stepEnv(self, a): 19 | raise NotImplementedError 20 | 21 | def nA(self): 22 | if not self.env.discA: 23 | raise ValueError("Environment action space is continuous.") 24 | if self.env.dA > 1: 25 | raise ValueError("Environment action space is multi dimensional.") 26 | return self.env.nA 27 | 28 | def boundsS(self): 29 | return np.array(self.env.lowS), np.array(self.env.highS) 30 | 31 | def boundsA(self): 32 | return np.array(self.env.lowA), np.array(self.env.highA) 33 | 34 | def dS(self): 35 | return self.env.dS 36 | 37 | def dA(self): 38 | return self.env.dA 39 | 40 | 41 | class EnvGlue(AbsEnvGlue): 42 | def __init__(self, env, lowS=None, highS=None, normalised=True): 43 | super(EnvGlue, self).__init__(env) 44 | 45 | self.highA = [h - (1 if self.env.discA else 0) for h in self.env.highA] 46 | if lowS is None: 47 | lowS = self.env.lowS 48 | if highS is None: 49 | highS = self.env.highS 50 | self.normalised = normalised 51 | if normalised: 52 | self.nrms = Normaliser(lowS, highS, True) 53 | self.nrma = Normaliser(self.env.lowA, self.highA, True) 54 | 55 | def stepEnv(self, a): 56 | if self.normalised: 57 | rawA = self.nrma.unnormalise(a) 58 | else: 59 | rawA = a 60 | if self.env.discA: 61 | rawA = max(0, min(int(np.round(rawA)), self.highA[0])) 62 | rawS, r, done = self.env.step(rawA) 63 | rawS = rawS.reshape(-1) 64 | if self.normalised: 65 | return self.nrms.normalise(np.atleast_2d(rawS)), r, done 66 | else: 67 | return np.atleast_2d(rawS), r, done 68 | 69 | def stepsEnv(self, ss, aa): 70 | if self.normalised: 71 | rawAA = self.nrma.unnormalise(aa) 72 | rawSS = self.nrms.unnormalise(ss) 73 | else: 74 | rawAA = aa 75 | rawSS = ss 76 | if self.env.discA: 77 | rawAA = np.clip(np.round(rawAA), 0, self.highA[0]) 78 | rawSSP, RR, DONE = 
self.env.steps(rawSS, rawAA) 79 | if self.normalised: 80 | return self.nrms.normalise(rawSSP), RR, DONE 81 | else: 82 | return rawSSP, RR, DONE 83 | 84 | def costEnv(self, ss): 85 | if self.normalised: 86 | rawSS = self.nrms.unnormalise(ss) 87 | else: 88 | rawSS = ss 89 | return self.env.cost(rawSS) 90 | 91 | def resetEnv(self): 92 | rawS = self.env.reset() 93 | if self.normalised: 94 | return self.nrms.normalise(np.atleast_2d(rawS)) 95 | else: 96 | return np.atleast_2d(rawS) 97 | 98 | def boundsA(self): 99 | if self.normalised: 100 | return self.nrma.boundsNormalised() 101 | else: 102 | return super().boundsA() 103 | 104 | def boundsS(self): 105 | if self.normalised: 106 | return self.nrms.boundsNormalised() 107 | else: 108 | return super().boundsS() 109 | -------------------------------------------------------------------------------- /rlutils/Envs.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import math 3 | import random 4 | import numpy as np 5 | 6 | """ 7 | Envs provides a classic interface for all RL environments. It also makes it 8 | easy to transform Gym environments to sparse reward problems. 9 | """ 10 | 11 | 12 | class AbsEnv: 13 | def __init__(self, lowS, highS, lowA, highA, discA=False): 14 | self.lowS, self.highS, self.lowA, self.highA = lowS, highS, lowA, highA 15 | self.dS, self.dA = len(lowS), len(lowA) 16 | self.discA = discA 17 | self.nA = int(highA[0]) if discA else -1 18 | 19 | def step(self, a): 20 | raise NotImplementedError 21 | 22 | def reset(self): 23 | raise NotImplementedError 24 | 25 | 26 | class GymEnv(AbsEnv): 27 | def __init__(self, envName, render=False): 28 | self.env = gym.make(envName) 29 | 30 | # Get env bounds 31 | lowS = self.env.observation_space.low 32 | highS = self.env.observation_space.high 33 | # Discrete actions 34 | if isinstance(self.env.action_space, gym.spaces.Discrete): 35 | discA = True 36 | lowA = [0] 37 | highA = [self.env.action_space.n] 38 | # Continuous actions 39 | else: 40 | discA = False 41 | lowA = self.env.action_space.low 42 | highA = self.env.action_space.high 43 | super(GymEnv, self).__init__(lowS, highS, lowA, highA, discA) 44 | 45 | self.render = render 46 | 47 | def step(self, a): 48 | if self.render: 49 | self.env.render() 50 | 51 | s, r, done, _ = self.env.step(a) 52 | return s, r, done 53 | 54 | def reset(self): 55 | return self.env.reset() 56 | 57 | 58 | class SparseGymEnv(GymEnv): 59 | def checkSolved(self, s): 60 | raise NotImplementedError 61 | 62 | def __init__(self, envName, render=False): 63 | super(SparseGymEnv, self).__init__(envName, render) 64 | checkReward = None 65 | if envName == "MountainCar-v0": 66 | def checkReward(s): 67 | return bool(s[0] > 0.45) 68 | elif envName == "Acrobot-v1": 69 | def checkReward(s): 70 | ns = [math.atan2(s[1], s[0]), math.atan2(s[3], s[2])] 71 | return bool(-s[0] - np.cos(ns[0] + ns[1]) > 1.) 
72 | elif envName == "Pendulum-v0": 73 | def checkReward(s): 74 | return s[0] >= 0.97 75 | 76 | def _checkDone(s, a, r): 77 | return False 78 | self.checkDone = _checkDone 79 | elif envName == "CartPole-v0": 80 | def checkReward(s): 81 | fallen = s[0] < -2.4 or s[0] > 2.4 \ 82 | or s[2] < -0.2094 or s[2] > 0.2094 83 | return not fallen 84 | 85 | def _checkDone(s, a, r): 86 | return r < 1 87 | self.checkDone = _checkDone 88 | 89 | elif envName == "Reacher-v2": 90 | def checkReward(s): 91 | vec = self.env.env.get_body_com("fingertip") - \ 92 | self.env.env.get_body_com("target") 93 | dist = np.linalg.norm(vec) 94 | return dist <= 0.015 95 | 96 | def _checkDone(s, a, r): 97 | return False 98 | self.checkDone = _checkDone 99 | 100 | def resetEnv(): 101 | s = self.env.reset() 102 | print("d=", np.linalg.norm(self.env.env.goal)) 103 | while np.linalg.norm(self.env.env.goal) > 0.18: 104 | s = self.env.reset() 105 | return s 106 | self.reset = resetEnv 107 | else: 108 | raise ValueError("No solved criterion defined to create sparse", 109 | "reward for environment {}".format(envName)) 110 | self.checkSolved = checkReward 111 | 112 | def step(self, a): 113 | s, r, done = super(SparseGymEnv, self).step(a) 114 | r = self.checkSolved(s) * 1 115 | done = self.checkDone(s, a, r) 116 | return s, r, done 117 | 118 | def checkDone(self, s, a, r): 119 | return r == 1 120 | 121 | 122 | class SparseOsbandChain(AbsEnv): 123 | def __init__(self, chainLen): 124 | self.chainLen = chainLen 125 | self.p = 1.0 - 1.0 / self.chainLen 126 | discA = True 127 | lowS, highS = [1], [chainLen] 128 | lowA, highA = [0], [2] 129 | super().__init__(lowS, highS, lowA, highA, discA) 130 | self.reset() 131 | 132 | def step(self, a): 133 | r = 0 134 | ss = self.s 135 | 136 | # Transitiion 137 | if a == 0: 138 | ss -= 1 139 | elif a == 1: 140 | if random.random() < self.p: 141 | ss += 1 142 | else: 143 | ss -= 1 144 | if ss > self.chainLen: 145 | ss = self.chainLen 146 | elif ss < 1: 147 | ss = 1 148 | 149 | # Reward 150 | if ss == self.chainLen: 151 | r = 1 152 | 153 | self.s = ss 154 | return np.array([self.s]), r, r == 1 155 | 156 | def reset(self): 157 | self.s = 1 158 | return np.array([self.s]) 159 | 160 | 161 | class SemiSparseOsbandChain(SparseOsbandChain): 162 | def __init__(self, chainLen, rewardSparsity): 163 | """ 164 | param rewardSparsity: indicates how sparse the domain is (0 to 1) 165 | """ 166 | self.nRwds = int((1 - rewardSparsity) * (chainLen - 1)) 167 | self.rewardIdx = None 168 | super().__init__(chainLen) 169 | 170 | def step(self, a): 171 | s, r, done = super().step(a) 172 | if s in self.rewardIdx: 173 | r = -1 174 | return s, r, done 175 | 176 | def reset(self): 177 | idx = np.arange(1, self.chainLen) 178 | np.random.shuffle(idx) 179 | self.rewardIdx = idx[0:self.nRwds] 180 | return super().reset() 181 | 182 | 183 | class SparseExplorationChain(AbsEnv): 184 | """ 185 | Typical problem to test exploration. This chain of specified length is 186 | hard to explore. Action 1 goes to the next state (increasing). Action 0 187 | always goes to state 1. A reward of 1 is given for the right-most state, 188 | otherwise 0 reward is given. Problem starts in state 1. 
189 | """ 190 | def __init__(self, chainLen): 191 | self.chainLen = chainLen 192 | discA = True 193 | lowS, highS = [1], [chainLen] 194 | lowA, highA = [0], [2] 195 | super().__init__(lowS, highS, lowA, highA, discA) 196 | self.reset() 197 | 198 | def step(self, a): 199 | r = 0 200 | 201 | # Transition 202 | if a == 0: 203 | self.s = 1 204 | elif a == 1 and self.s < self.chainLen: 205 | self.s += 1 206 | 207 | # Reward 208 | if self.s == self.chainLen and a == 1: 209 | r = 1 210 | 211 | return np.array([self.s]), r, r == 1 212 | 213 | def reset(self): 214 | self.s = 1 215 | return np.array([self.s]) 216 | 217 | 218 | class SemiSparseExplorationChain(SparseExplorationChain): 219 | def __init__(self, chainLen, rewardSparsity): 220 | """ 221 | param rewardSparsity: indicates how sparse the domain is (0 to 1) 222 | """ 223 | self.nRwds = int((1 - rewardSparsity) * (chainLen - 1)) 224 | self.rewardIdx = None 225 | super().__init__(chainLen) 226 | 227 | def step(self, a): 228 | s, r, done = super().step(a) 229 | if s in self.rewardIdx: 230 | r = -1 231 | return s, r, done 232 | 233 | def reset(self): 234 | idx = np.arange(1, self.chainLen) 235 | np.random.shuffle(idx) 236 | self.rewardIdx = idx[0:self.nRwds] 237 | return super().reset() 238 | -------------------------------------------------------------------------------- /rlutils/LR_SGD.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class BaseOptimizer(object): 5 | """Base (Stochastic) gradient descent optimizer 6 | Parameters 7 | ---------- 8 | params : list, length = len(coefs_) + len(intercepts_) 9 | The concatenated list containing coefs_ and intercepts_ in MLP model. 10 | Used for initializing velocities and updating params 11 | learning_rate_init : float, optional, default 0.1 12 | The initial learning rate used. It controls the step-size in updating 13 | the weights 14 | Attributes 15 | ---------- 16 | learning_rate : float 17 | the current learning rate 18 | """ 19 | 20 | def __init__(self, params, learning_rate_init=0.1): 21 | self.params = params 22 | self.learning_rate_init = learning_rate_init 23 | self.learning_rate = float(learning_rate_init) 24 | 25 | def update_params(self, grads): 26 | """Update parameters with given gradients 27 | Parameters 28 | ---------- 29 | grads : list, length = len(params) 30 | Containing gradients with respect to coefs_ and intercepts_ in MLP 31 | model. So length should be aligned with params 32 | """ 33 | return self._get_updates(grads) 34 | 35 | def iteration_ends(self, time_step): 36 | """Perform update to learning rate and potentially other states at the 37 | end of an iteration 38 | """ 39 | pass 40 | 41 | def trigger_stopping(self, msg, verbose): 42 | """Decides whether it is time to stop training 43 | Parameters 44 | ---------- 45 | msg : str 46 | Message passed in for verbose output 47 | verbose : bool 48 | Print message to stdin if True 49 | Returns 50 | ------- 51 | is_stopping : bool 52 | True if training needs to stop 53 | """ 54 | if verbose: 55 | print(msg + " Stopping.") 56 | return True 57 | 58 | def reset(self): 59 | """Resets object. 60 | """ 61 | pass 62 | 63 | 64 | class ConstantRate(BaseOptimizer): 65 | """Constant learning rate for gradient descent 66 | parameters 67 | --------- 68 | learning_rate: float, optional, default 0.01 69 | The constant learning rate used. 
70 | """ 71 | 72 | def __init__(self, learning_rate=0.01): 73 | self.learning_rate = learning_rate 74 | 75 | def _get_updates(self, grads): 76 | """Get the values used to update params with given gradients 77 | Parameters 78 | ---------- 79 | grads : list, length = len(coefs_) + len(intercepts_) 80 | Containing gradients with respect to coefs_ and intercepts_ in MLP 81 | model. So length should be aligned with params 82 | Returns 83 | ------- 84 | updates : list, length = len(grads) 85 | The values to add to params 86 | """ 87 | 88 | return self.learning_rate * grads 89 | 90 | 91 | class SGDOptimizer(BaseOptimizer): 92 | """Stochastic gradient descent optimizer with momentum 93 | Parameters 94 | ---------- 95 | params : list, length = len(coefs_) + len(intercepts_) 96 | The concatenated list containing coefs_ and intercepts_ in MLP model. 97 | Used for initializing velocities and updating params 98 | learning_rate_init : float, optional, default 0.1 99 | The initial learning rate used. It controls the step-size in updating 100 | the weights 101 | lr_schedule : {'constant', 'adaptive', 'invscaling'}, default 'constant' 102 | Learning rate schedule for weight updates. 103 | -'constant', is a constant learning rate given by 104 | 'learning_rate_init'. 105 | -'invscaling' gradually decreases the learning rate 'learning_rate_' at 106 | each time step 't' using an inverse scaling exponent of 'power_t'. 107 | learning_rate_ = learning_rate_init / pow(t, power_t) 108 | -'adaptive', keeps the learning rate constant to 109 | 'learning_rate_init' as long as the training keeps decreasing. 110 | Each time 2 consecutive epochs fail to decrease the training loss by 111 | tol, or fail to increase validation score by tol if 'early_stopping' 112 | is on, the current learning rate is divided by 5. 113 | momentum : float, optional, default 0.9 114 | Value of momentum used, must be larger than or equal to 0 115 | nesterov : bool, optional, default True 116 | Whether to use nesterov's momentum or not. Use nesterov's if True 117 | Attributes 118 | ---------- 119 | learning_rate : float 120 | the current learning rate 121 | velocities : list, length = len(params) 122 | velocities that are used to update params 123 | """ 124 | 125 | def __init__(self, params, learning_rate_init=0.1, lr_schedule='constant', 126 | momentum=0.9, nesterov=True, power_t=0.5): 127 | super(SGDOptimizer, self).__init__(params, learning_rate_init) 128 | 129 | self.lr_schedule = lr_schedule 130 | self.momentum = momentum 131 | self.nesterov = nesterov 132 | self.power_t = power_t 133 | self.velocities = np.zeros_like(params).reshape(-1, 1) 134 | 135 | def iteration_ends(self, time_step): 136 | """Perform updates to learning rate and potential other states at the 137 | end of an iteration 138 | Parameters 139 | ---------- 140 | time_step : int 141 | number of training samples trained on so far, used to update 142 | learning rate for 'invscaling' 143 | """ 144 | if self.lr_schedule == 'invscaling': 145 | self.learning_rate = (float(self.learning_rate_init) / 146 | (time_step + 1) ** self.power_t) 147 | 148 | def trigger_stopping(self, msg, verbose): 149 | if self.lr_schedule == 'adaptive': 150 | if self.learning_rate > 1e-6: 151 | self.learning_rate /= 5. 152 | if verbose: 153 | print(msg + " Setting learning rate to %f" % 154 | self.learning_rate) 155 | return False 156 | else: 157 | if verbose: 158 | print(msg + " Learning rate too small. 
Stopping.") 159 | return True 160 | else: 161 | if verbose: 162 | print(msg + " Stopping.") 163 | return True 164 | 165 | def _get_updates(self, grads): 166 | """Get the values used to update params with given gradients 167 | Parameters 168 | ---------- 169 | grads : list, length = len(coefs_) + len(intercepts_) 170 | Containing gradients with respect to coefs_ and intercepts_ in MLP 171 | model. So length should be aligned with params 172 | Returns 173 | ------- 174 | updates : list, length = len(grads) 175 | The values to add to params 176 | """ 177 | updates = self.momentum * self.velocities - self.learning_rate * grads 178 | self.velocities = updates 179 | 180 | if self.nesterov: 181 | updates = self.momentum * self.velocities \ 182 | - self.learning_rate * grads 183 | 184 | return updates 185 | 186 | def reset(self): 187 | self.learning_rate = float(self.learning_rate_init) 188 | 189 | 190 | class LR_SGD: 191 | def __init__(self, M, learningRate=ConstantRate(0.01), dOut=1): 192 | """ 193 | :param M: Number of weights. 194 | :param learningRate: a opt.stochasticOptimiser.BaseOptimizer object 195 | :param dOut: Number of output dimensions. 196 | """ 197 | self.M = M 198 | self.dOut = dOut 199 | self.learningRate = learningRate 200 | self.isModelInit = False 201 | self.reset() 202 | 203 | def reset(self): 204 | self.learningRate.reset() 205 | self.w = np.random.normal(0.0, 1.0, (self.M, self.dOut)) 206 | self.time = 0 207 | 208 | def isInit(self): 209 | return self.isModelInit 210 | 211 | def update(self, phi, y): 212 | self.isModelInit = True 213 | grads = np.dot(phi.T, y - np.dot(phi, self.w)) 214 | deltaw = self.learningRate.update_params(grads) 215 | self.w += deltaw.reshape(self.w.shape) 216 | 217 | if isinstance(self.learningRate, SGDOptimizer): 218 | self.time += 1 219 | self.learningRate.iteration_ends(self.time) 220 | 221 | def predictMean(self, phi): 222 | if self.isModelInit: 223 | y = np.dot(phi, self.w).reshape(-1, self.dOut) 224 | return y 225 | else: 226 | return np.zeros((phi.shape[0], self.dOut)) 227 | 228 | def predict(self, phi): 229 | return self.predictMean, None 230 | 231 | def optimise(self, max_evals=200): 232 | # TODO optimise parameters 233 | pass 234 | -------------------------------------------------------------------------------- /rlutils/Normaliser.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Normaliser: 5 | """ 6 | Normalise/unnormalise data to [0,1] or [-1,1]. 
7 | """ 8 | def __init__(self, low, high, zeroOneInterval=True): 9 | """ 10 | :param low: List of lower-bounds for each dimension 11 | :param high: List of upper-bounds for each dimension 12 | :param zeroOneInterval: whether normalised interval should be [0,1] 13 | (default) or [-1,1] 14 | """ 15 | assert(len(low) == len(high) and 16 | "Upper and lower bounds much be same dimension.") 17 | assert(np.isfinite(np.sum(low)) and 18 | "Lower bound elements must be numbers.") 19 | assert(np.isfinite(np.sum(high)) and 20 | "Upper bound elements must be numbers.") 21 | 22 | spaceRange = np.array(high) - np.array(low) 23 | 24 | if np.sum(spaceRange > 100) > 0: 25 | print("Warning: normalising over large space.") 26 | 27 | self.factor = (1.0 if zeroOneInterval else 2.0) * spaceRange 28 | self.invFactor = (1.0 if zeroOneInterval else 2.0) / spaceRange 29 | self.offset = -np.array(low) 30 | self.finalOffset = 0.0 if zeroOneInterval else -1.0 31 | self.boundsNorm = (spaceRange * 0 - (0 if zeroOneInterval else 1), 32 | spaceRange * 0 + 1) 33 | self.boundsOrig = (np.array(low), np.array(high)) 34 | 35 | def normalise(self, x): 36 | """ 37 | Normalise x. 38 | :param x: list with 1 element, or N*D numpy matrix with N elements 39 | :return: numpy matrix with shape of input 40 | """ 41 | _x = np.array(x) 42 | if len(_x.shape) == 1: 43 | assert(_x.shape == self.offset.shape and 44 | "Data must be same dimension as lower/upper bounds") 45 | else: 46 | assert(_x.shape[1] == self.offset.shape[0] and 47 | "Data must be same dimension as lower/upper bounds") 48 | 49 | return (_x + self.offset) * self.invFactor + self.finalOffset 50 | 51 | def unnormalise(self, x): 52 | """ 53 | Unnormalise x. 54 | :param x: list with 1 element, or N*D numpy matrix with N elements 55 | :return: numpy matrix with shape of input 56 | """ 57 | _x = np.array(x) 58 | if len(_x.shape) == 1: 59 | assert(_x.shape == self.offset.shape and 60 | "Data must be same dimension as lower/upper bounds") 61 | else: 62 | assert(_x.shape[1] == self.offset.shape[0] and 63 | "Data must be same dimension as lower/upper bounds") 64 | 65 | return (_x - self.finalOffset) * self.factor - self.offset 66 | 67 | def boundsNormalised(self): 68 | return self.boundsNorm 69 | 70 | def boundsOriginal(self): 71 | return self.boundsOrig 72 | 73 | 74 | if __name__ == "__main__": 75 | nrm = Normaliser([5, -10], [6, 100], True) 76 | 77 | # Test for single element in list 78 | x = [5.5, 4] 79 | y = nrm.normalise(x) 80 | z = nrm.unnormalise(y) 81 | print(x, y, z) 82 | assert(np.isclose(0, np.linalg.norm(x-z))) 83 | 84 | # Test for numpy array of elements 85 | x = np.hstack((np.arange(5, 6, 0.1).reshape(-1, 1), 86 | np.arange(-10, 100, 11).reshape(-1, 1))) 87 | y = nrm.normalise(x) 88 | z = nrm.unnormalise(y) 89 | assert(np.isclose(0, np.linalg.norm(x-z))) 90 | -------------------------------------------------------------------------------- /rlutils/Policies.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import nlopt 3 | 4 | 5 | class AbsPolicy: 6 | def pick(self, s): 7 | raise NotImplementedError 8 | 9 | 10 | class RandomPolicy(AbsPolicy): 11 | def __init__(self, agentHelper): 12 | self.agentHelper = agentHelper 13 | if self.agentHelper.isDiscA(): 14 | self.dA = self.agentHelper 15 | self.pick = self.pickDisc 16 | else: 17 | self.pick = self.pickCont 18 | 19 | def pickDisc(self, s): 20 | return self.agentHelper.randDiscA((len(s), self.dA)) 21 | 22 | def pickCont(self, s): 23 | return 
self.agentHelper.sampleContA(len(s)) 24 | 25 | 26 | class eGreedyPolicy(AbsPolicy): 27 | def __init__(self, agentHelper, scoreFn, epsilon=0.0): 28 | self.scoreFn = scoreFn 29 | self.agentHelper = agentHelper 30 | self.epsilon = epsilon 31 | 32 | self.dA = self.agentHelper.dA() 33 | 34 | if self.agentHelper.isDiscA(): 35 | self.allAs = self.agentHelper.allDiscA().reshape(-1, 1) 36 | self.pick = self.pickDisc 37 | else: 38 | self.noSampsApprox = 15 39 | self.pick = self.pickCont 40 | 41 | # Setup nlopt 42 | self.currentState = None 43 | 44 | def __acq_fun_maximize(_x, grad): 45 | s = self.currentState.reshape(1, -1) 46 | a = _x.reshape(1, -1) 47 | score = float(self.scoreFn(s, a)) 48 | return score 49 | 50 | opt_maxeval = 8 51 | self.opt = nlopt.opt(nlopt.LN_COBYLA, self.agentHelper.dA()) 52 | boundsA = self.agentHelper.boundsA() 53 | self.opt.set_lower_bounds(boundsA[0]) 54 | self.opt.set_upper_bounds(boundsA[1]) 55 | self.opt.set_maxeval(opt_maxeval) 56 | self.opt.set_max_objective(__acq_fun_maximize) 57 | 58 | def pickDisc(self, s): 59 | randMask = np.random.random((len(s), )) < self.epsilon 60 | 61 | # Greedy 62 | repAs = np.vstack([self.allAs]*len(s)) 63 | scores = self.scoreFn(np.repeat(s, len(self.allAs), axis=0), 64 | repAs).reshape(len(s), -1) 65 | maxId = np.argmax(scores, axis=1) 66 | aa = self.allAs[maxId].reshape(-1, self.dA) 67 | 68 | # Combine greedy and random 69 | if np.any(randMask): 70 | randa = self.agentHelper.randDiscA((len(s), self.dA)) 71 | return randa * randMask + aa * ~randMask 72 | return aa 73 | 74 | def pickCont2(self, s): 75 | aa = [] 76 | for si in s: 77 | a = self.agentHelper.sampleContA(1) 78 | if np.random.random() >= self.epsilon: 79 | self.currentState = si 80 | a = a.reshape(-1) 81 | a = self.opt.optimize(a).reshape(1, -1) 82 | aa.append(a) 83 | return np.vstack(aa) 84 | 85 | def pickCont(self, s): 86 | randMask = np.random.random((len(s), )) < self.epsilon 87 | 88 | # Greedy 89 | sampsAs = self.agentHelper.sampleContA(self.noSampsApprox) 90 | repAs = np.vstack([sampsAs]*len(s)) 91 | scores = self.scoreFn(np.repeat(s, len(sampsAs), axis=0), 92 | repAs).reshape(len(s), -1) 93 | maxId = np.argmax(scores, axis=1) 94 | aa = sampsAs[maxId].reshape(-1, self.dA) 95 | 96 | # Combine greedy and random 97 | if np.any(randMask): 98 | randa = self.agentHelper.sampleContA(len(s)) 99 | return randa * randMask + aa * ~randMask 100 | return aa 101 | -------------------------------------------------------------------------------- /rlutils/Runners.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from tqdm import tqdm 3 | from rlutils.EnvGlue import AbsEnvGlue 4 | from rlutils.Agents import AbsAgent 5 | from rlutils.Policies import AbsPolicy 6 | 7 | 8 | class EnvRunner: 9 | def __init__(self, envGlue, policy, agent, tm=None, verbose=1, 10 | updateAgent=True): 11 | self.tm = tm # transition memory 12 | self.verbose = verbose 13 | 14 | # Environment 15 | if not isinstance(envGlue, AbsEnvGlue): 16 | print(envGlue, "/", AbsEnvGlue) 17 | raise ValueError("Environment glue must be of type AbsEnvGlue") 18 | self.envGlue = envGlue 19 | 20 | # Policy 21 | if not isinstance(policy, AbsPolicy): 22 | raise ValueError("Policy must be of type AbsPolicy") 23 | self.policy = policy 24 | 25 | # Agent 26 | if not isinstance(agent, AbsAgent): 27 | raise ValueError("Agent must be of type AbsAgent") 28 | self.agent = agent 29 | self.updateAgent = updateAgent 30 | 31 | def run(self, nEp, nStep, stopAtPosReturn=False): 32 | if self.verbose 
== 1: 33 | gen = tqdm(range(nEp)) 34 | else: 35 | gen = range(nEp) 36 | for i in gen: 37 | if self.verbose > 1: 38 | sys.stdout.write("Episode {}".format(i+1)) 39 | sys.stdout.flush() 40 | 41 | # Run episode 42 | epRet, lastStepDone, nStepFinish, lastR = self.runEpisode(nStep) 43 | 44 | # Save transitions 45 | if self.tm: 46 | self.tm.endEpisode() 47 | 48 | if stopAtPosReturn: 49 | if lastStepDone and nStepFinish < nStep and lastR >= 0: 50 | break 51 | 52 | # Potential update at the end of the episode 53 | if self.updateAgent: 54 | self.agent.endOfEpUpdate() 55 | 56 | if self.verbose > 1: 57 | print("Finished") 58 | 59 | def runEpisode(self, nStep): 60 | s = self.envGlue.resetEnv() 61 | ret = 0 62 | done = False 63 | for i in range(nStep): 64 | a = self.policy.pick(s) 65 | sp, r, done = self.envGlue.stepEnv(a) 66 | ret += r 67 | if self.verbose > 2: 68 | print("\nTransition:\ts:{}\n\t\ta:{}\n\t\tr:{}\n\t\ts':{}". 69 | format(s, a, r, sp)) 70 | 71 | if self.tm: 72 | self.tm.addStep(s, a, r, sp) 73 | 74 | if self.updateAgent: 75 | self.agent.update(s, a, r, sp) 76 | 77 | s = sp 78 | if done: 79 | break 80 | if self.verbose > 1: 81 | print("... finished in", i+1, "steps with return", ret) 82 | return ret, done, i+1, r 83 | -------------------------------------------------------------------------------- /rlutils/TransitionMemory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class TransitionMemory: 5 | class EpisodeMemory: 6 | def __init__(self, noStepsMax, dS, dA, dR, onlyRewards=False): 7 | self.onlyRewards = onlyRewards 8 | if not onlyRewards: 9 | self.memS = np.zeros((noStepsMax, dS)) 10 | self.memS2 = np.zeros((noStepsMax, dS)) 11 | self.memA = np.zeros((noStepsMax, dA)) 12 | self.memR = np.zeros((noStepsMax, dR)) 13 | self.memId = 0 14 | 15 | def addStep(self, s, a, r, s2): 16 | if not self.onlyRewards: 17 | self.memS[self.memId] = s 18 | self.memS2[self.memId] = s2 19 | self.memA[self.memId] = a 20 | self.memR[self.memId] = r 21 | self.memId += 1 22 | 23 | def getS(self): 24 | if self.onlyRewards: 25 | raise NotImplementedError 26 | return self.memS[:self.memId, :] 27 | 28 | def getS2(self): 29 | if self.onlyRewards: 30 | raise NotImplementedError 31 | return self.memS2[:self.memId, :] 32 | 33 | def getA(self): 34 | if self.onlyRewards: 35 | raise NotImplementedError 36 | return self.memA[:self.memId, :] 37 | 38 | def getR(self): 39 | return self.memR[:self.memId, :] 40 | 41 | def __init__(self, noEpisiodes, noStepsMax, dS, dA, dR=1, 42 | onlyRewards=False): 43 | """ 44 | Memory storing all transitions (state, action, reward, new state) for 45 | all episodes. 46 | :param noEpisiodes: number of episodes 47 | :param noStepsMax: maximum number of steps per episode 48 | :param dS: state space dimension 49 | :param dA: action space dimension 50 | :param dR: reward space dimension (default=1) 51 | :param onlyRewards: Wheter to only record rewards (default=False). 52 | """ 53 | self.memEps = [self.EpisodeMemory(noStepsMax, dS, dA, dR, onlyRewards) 54 | for _ in range(noEpisiodes)] 55 | self.onlyRewards = onlyRewards 56 | self.memId = 0 57 | 58 | def addStep(self, s, a, r, s2): 59 | """ 60 | Add step to current episode 61 | :param s: state 62 | :param a: action 63 | :param r: reward 64 | :param s2: new state 65 | """ 66 | self.memEps[self.memId].addStep(s, a, r, s2) 67 | 68 | def endEpisode(self): 69 | """ 70 | End current episode memory. Start new episode memory. 
71 | """ 72 | if self.memId < len(self.memEps): 73 | self.memId += 1 74 | 75 | def getEpisodeTransitions(self, noEp=-1): 76 | """ 77 | Retreive transitions from given episode. 78 | :pram noEp: episode id (default=-1 last episode) 79 | :returns: matrixes of states, actopms, rewards, and new states 80 | """ 81 | if noEp == -1: 82 | noEp = self.memId 83 | 84 | rr = self.memEps[noEp].getR() 85 | if self.onlyRewards: 86 | return rr 87 | 88 | ss = self.memEps[noEp].getS() 89 | ss2 = self.memEps[noEp].getS2() 90 | aa = self.memEps[noEp].getA() 91 | return ss, aa, rr, ss2 92 | 93 | def getTransitions(self): 94 | """ 95 | Retreive transitions from all episodes (concatenated). 96 | :returns: matrixes of states, actopms, rewards, and new states 97 | """ 98 | if self.memId == 0: 99 | rr = self.memEps[0].getR() 100 | if self.onlyRewards: 101 | return rr 102 | ss = self.memEps[0].getS() 103 | ss2 = self.memEps[0].getS2() 104 | aa = self.memEps[0].getA() 105 | else: 106 | rr = np.vstack([m.getR() for m in self.memEps[:self.memId]]) 107 | if self.onlyRewards: 108 | return rr 109 | ss = np.vstack([m.getS() for m in self.memEps[:self.memId]]) 110 | ss2 = np.vstack([m.getS2() for m in self.memEps[:self.memId]]) 111 | aa = np.vstack([m.getA() for m in self.memEps[:self.memId]]) 112 | return ss, aa, rr, ss2 113 | 114 | def getLastTransitions(self, nTransitions): 115 | maxEpId = min(len(self.memEps)-1, self.memId) 116 | # Count transitions 117 | nn = np.cumsum([self.memEps[i].memId for 118 | i in range(maxEpId, -1, -1)]) 119 | 120 | # Not enough transitions in memory 121 | if nn[-1] <= nTransitions: 122 | return self.getTransitions() 123 | 124 | idx = np.where(nn > nTransitions)[0][0] - 1 125 | memIdcs = range(maxEpId, idx-1, -1) 126 | 127 | if len(memIdcs) == 1: 128 | rr = self.memEps[memIdcs[0]].getR() 129 | if self.onlyRewards: 130 | return rr 131 | ss = self.memEps[memIdcs[0]].getS() 132 | ss2 = self.memEps[memIdcs[0]].getS2() 133 | aa = self.memEps[memIdcs[0]].getA() 134 | else: 135 | rr = np.vstack([self.memEps[i].getR() for i in memIdcs]) 136 | if self.onlyRewards: 137 | return rr 138 | ss = np.vstack([self.memEps[i].getS() for i in memIdcs]) 139 | ss2 = np.vstack([self.memEps[i].getS2() for i in memIdcs]) 140 | aa = np.vstack([self.memEps[i].getA() for i in memIdcs]) 141 | 142 | return ss[:nTransitions, :], aa[:nTransitions, :], \ 143 | rr[:nTransitions, :], ss2[:nTransitions, :] 144 | 145 | def getEpisodeLengths(self): 146 | """ 147 | Retreive episode lengths 148 | :returns: list of episode lengths 149 | """ 150 | return [m.memId for m in self.memEps[:self.memId]] 151 | 152 | 153 | if __name__ == "__main__": 154 | tm = TransitionMemory(3, 10, 2, 1) 155 | tm.addStep([0, 1], 0, -1, [0, 2]) 156 | tm.addStep([0, 1], 0, -1, [0, 2]) 157 | tm.addStep([0, 1], 0, -1, [0, 2]) 158 | tm.addStep([0, 1], 0, -1, [0, 2]) 159 | tm.addStep([0, 1], 0, -1, [0, 2]) 160 | tm.endEpisode() 161 | tm.addStep([0, 2], 0, -1, [0, 2]) 162 | tm.addStep([0, 2], 0, -1, [0, 2]) 163 | tm.addStep([0, 2], 0, -1, [0, 2]) 164 | tm.addStep([0, 2], 0, -1, [0, 2]) 165 | tm.addStep([0, 2], 0, -1, [0, 2]) 166 | tm.endEpisode() 167 | tm.addStep([0, 3], 0, -1, [0, 2]) 168 | tm.addStep([0, 3], 0, -1, [0, 2]) 169 | tm.endEpisode() 170 | s, a, r, s2 = tm.getTransitions() 171 | print("ALL:", s, a, r, s2) 172 | s, a, r, s2 = tm.getEpisodeTransitions() 173 | print("ONE:", s, a, r, s2) 174 | -------------------------------------------------------------------------------- /rlutils/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PhilippeMorere/EMU-Q/c7ee795256f343d468dc22f4d48b1288264e743f/rlutils/__init__.py -------------------------------------------------------------------------------- /rlutils/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def smw_inv_correction(A_inv, U, V): 5 | """ 6 | Sherman-Morrison-Woodbury update 7 | For rank k updates to the inverse matrix 8 | 9 | IMPORTANT: This is the correction factor which one must subtract from A_inv 10 | Usage: subtract this value from current A_inv 11 | 12 | ref: http://mathworld.wolfram.com/WoodburyFormula.html 13 | https://en.wikipedia.org/wiki/Woodbury_matrix_identity 14 | :param A_inv: n x n 15 | :param U: n x k 16 | :param V: k x n 17 | :return: 18 | """ 19 | rank = U.shape[1] 20 | SU = np.dot(A_inv, U) 21 | VS = np.dot(V, A_inv) 22 | I_plus_VSU_inv = np.linalg.pinv(np.identity(rank) + np.dot(VS, U)) 23 | SU_I_plus_VSU = np.dot(SU, I_plus_VSU_inv) 24 | return np.dot(SU_I_plus_VSU, VS) 25 | 26 | 27 | def batch_generator(arrays, batch_size, wrapLastBatch=False): 28 | """ 29 | Batch generator() function for yielding [x_train, y_train] batch slices for 30 | numpy arrays 31 | Appropriately deals with looping back around to the start of the dataset 32 | Generate batches, one with respect to each array's first axis. 33 | :param arrays:[array, array] or [array, None]... 34 | e.g. [X_trn, Y_trn] where X_trn and Y_trn are ndarrays 35 | :param batch_size: batch size 36 | :param wrapLastBatch: whether the last batch should wrap around dataset 37 | to include first datapoints (True), or be smaller to stop at the end of 38 | the dataset (False). 39 | :return: 40 | """ 41 | starts = [0] * len( 42 | arrays) # pointers to where we are in iteration --> [0, 0] 43 | while True: 44 | batches = [] 45 | for i, array in enumerate(arrays): 46 | start = starts[i] 47 | stop = start + batch_size 48 | diff = stop - array.shape[0] 49 | if diff <= 0: 50 | batch = array[start:stop] 51 | starts[i] += batch_size 52 | else: 53 | if wrapLastBatch: 54 | batch = np.concatenate((array[start:], array[:diff])) 55 | starts[i] = diff 56 | else: 57 | batch = array[start:] 58 | starts[i] = 0 59 | batches.append(batch) 60 | yield batches 61 | --------------------------------------------------------------------------------
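Usage sketch (not part of the repository): the two helpers in rlutils/utils.py are self-contained, and the short Python snippet below, which assumes the rlutils package is importable from the repository root, checks the Sherman-Morrison-Woodbury correction against a direct inverse and draws a few mini-batches with batch_generator.

import numpy as np
from rlutils.utils import smw_inv_correction, batch_generator

rng = np.random.RandomState(0)

# Woodbury identity: inv(A + U V) = A_inv - correction, where the correction
# returned by smw_inv_correction must be subtracted from the current inverse.
A = rng.randn(5, 5) + 5 * np.eye(5)   # well-conditioned 5x5 matrix
A_inv = np.linalg.inv(A)
U = rng.randn(5, 2)                   # n x k update factors
V = rng.randn(2, 5)                   # k x n
updated_inv = A_inv - smw_inv_correction(A_inv, U, V)
assert np.allclose(updated_inv, np.linalg.inv(A + U.dot(V)))

# batch_generator is an infinite generator, so draw a fixed number of batches;
# with wrapLastBatch=True the third batch wraps around to the start of the data.
X = np.arange(10).reshape(-1, 1)
Y = 2 * np.arange(10).reshape(-1, 1)
gen = batch_generator([X, Y], batch_size=4, wrapLastBatch=True)
for _ in range(3):
    xb, yb = next(gen)
    print(xb.ravel(), yb.ravel())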