1 | # ThreadID: 140296524760832
2 | File: "/usr/lib/python3.6/threading.py", line 884, in _bootstrap
3 | self._bootstrap_inner()
4 | File: "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
5 | self.run()
6 | File: "/home/richard/rlkit_fresh/stacktracer.py", line 64, in run
7 | self.stacktraces()
8 | File: "/home/richard/rlkit_fresh/stacktracer.py", line 78, in stacktraces
9 | fout.write(stacktraces())
10 | File: "/home/richard/rlkit_fresh/stacktracer.py", line 26, in stacktraces
11 | for filename, lineno, name, line in traceback.extract_stack(stack):
12 |
13 | # ThreadID: 140296657909568
14 | File: "/home/richard/rlkit-relational/examples/relationalrl/train_sequential_transfer.py", line 225, in <module>
15 | File: "/home/richard/rlkit-relational/rlkit/launchers/launcher_util.py", line 590, in run_experiment
16 | **run_experiment_kwargs
17 | File: "/home/richard/rlkit-relational/rlkit/launchers/launcher_util.py", line 168, in run_experiment_here
18 | return experiment_function(variant)
19 | File: "/home/richard/rlkit-relational/examples/relationalrl/train_sequential_transfer.py", line 101, in experiment
20 | File: "/home/richard/rlkit-relational/rlkit/core/rl_algorithm.py", line 169, in train
21 | self.train_batch(start_epoch=start_epoch)
22 | File: "/home/richard/rlkit-relational/rlkit/core/rl_algorithm.py", line 215, in train_batch
23 | self._try_to_train()
24 | File: "/home/richard/rlkit-relational/rlkit/core/rl_algorithm.py", line 282, in _try_to_train
25 | self._do_training()
26 | File: "/home/richard/rlkit-relational/rlkit/torch/sac/twin_sac.py", line 281, in _do_training
27 | self.vf_optimizer.step()
28 | File: "/home/richard/rlkit-relational/rlkit/torch/optim/mpi_adam.py", line 123, in step
29 | self.set_params_from_flat((self.get_flat_params() + step_update).to(device=torch.device("cpu")))
30 | File: "/home/richard/rlkit-relational/rlkit/torch/optim/util.py", line 58, in __call__
31 | param.data.copy_(flattened_parameters[start:start+size].view(shape))
32 |
33 |
--------------------------------------------------------------------------------
/examples/sac.py:
--------------------------------------------------------------------------------
1 | """
2 | Run PyTorch Soft Actor Critic on HalfCheetahEnv.
3 |
4 | NOTE: You need PyTorch 0.3 or later (for torch.distributions)
5 | """
6 | import numpy as np
7 | from gym.envs.mujoco import HalfCheetahEnv
8 |
9 | import rlkit.torch.pytorch_util as ptu
10 | from rlkit.envs.wrappers import NormalizedBoxEnv
11 | from rlkit.launchers.launcher_util import setup_logger
12 | from rlkit.torch.sac.policies import TanhGaussianPolicy
13 | from rlkit.torch.sac.sac import SoftActorCritic
14 | from rlkit.torch.networks import FlattenMlp
15 |
16 |
17 | def experiment(variant):
18 | env = NormalizedBoxEnv(HalfCheetahEnv())
19 | # Or for a specific version:
20 | # import gym
21 | # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
22 |
23 | obs_dim = int(np.prod(env.observation_space.shape))
24 | action_dim = int(np.prod(env.action_space.shape))
25 |
26 | net_size = variant['net_size']
27 | qf = FlattenMlp(
28 | hidden_sizes=[net_size, net_size],
29 | input_size=obs_dim + action_dim,
30 | output_size=1,
31 | )
32 | vf = FlattenMlp(
33 | hidden_sizes=[net_size, net_size],
34 | input_size=obs_dim,
35 | output_size=1,
36 | )
37 | policy = TanhGaussianPolicy(
38 | hidden_sizes=[net_size, net_size],
39 | obs_dim=obs_dim,
40 | action_dim=action_dim,
41 | )
42 | algorithm = SoftActorCritic(
43 | env=env,
44 | policy=policy,
45 | qf=qf,
46 | vf=vf,
47 | **variant['algo_params']
48 | )
49 | algorithm.to(ptu.device)
50 | algorithm.train()
51 |
52 |
53 | if __name__ == "__main__":
54 | # noinspection PyTypeChecker
55 | variant = dict(
56 | algo_params=dict(
57 | num_epochs=1000,
58 | num_steps_per_epoch=1000,
59 | num_steps_per_eval=1000,
60 | batch_size=128,
61 | max_path_length=999,
62 | discount=0.99,
63 | reward_scale=1,
64 |
65 | soft_target_tau=0.001,
66 | policy_lr=3E-4,
67 | qf_lr=3E-4,
68 | vf_lr=3E-4,
69 | ),
70 | net_size=300,
71 | )
72 | setup_logger('name-of-experiment', variant=variant)
73 | # ptu.set_gpu_mode(True) # optionally set the GPU (default=False)
74 | experiment(variant)
75 |
--------------------------------------------------------------------------------
/examples/td3.py:
--------------------------------------------------------------------------------
1 | """
2 | This should result in an average return of ~3000 by the end of training.
3 |
4 | Usually hits 3000 around epoch 80-100. Within a seed, the performance will be
5 | a bit noisy from one epoch to the next (occasionally dips down to ~2000).
6 |
7 | Note that one epoch = 5k steps, so 200 epochs = 1 million steps.
8 | """
9 | from gym.envs.mujoco import HopperEnv
10 |
11 | import rlkit.torch.pytorch_util as ptu
12 | from rlkit.envs.wrappers import NormalizedBoxEnv
13 | from rlkit.exploration_strategies.base import \
14 | PolicyWrappedWithExplorationStrategy
15 | from rlkit.exploration_strategies.gaussian_strategy import GaussianStrategy
16 | from rlkit.launchers.launcher_util import setup_logger
17 | from rlkit.torch.networks import FlattenMlp, TanhMlpPolicy
18 | from rlkit.torch.td3.td3 import TD3
19 |
20 |
21 | def experiment(variant):
22 | env = NormalizedBoxEnv(HopperEnv())
23 | es = GaussianStrategy(
24 | action_space=env.action_space,
25 | max_sigma=0.1,
26 | min_sigma=0.1, # Constant sigma
27 | )
28 | obs_dim = env.observation_space.low.size
29 | action_dim = env.action_space.low.size
30 | qf1 = FlattenMlp(
31 | input_size=obs_dim + action_dim,
32 | output_size=1,
33 | hidden_sizes=[400, 300],
34 | )
35 | qf2 = FlattenMlp(
36 | input_size=obs_dim + action_dim,
37 | output_size=1,
38 | hidden_sizes=[400, 300],
39 | )
40 | policy = TanhMlpPolicy(
41 | input_size=obs_dim,
42 | output_size=action_dim,
43 | hidden_sizes=[400, 300],
44 | )
45 | exploration_policy = PolicyWrappedWithExplorationStrategy(
46 | exploration_strategy=es,
47 | policy=policy,
48 | )
49 | algorithm = TD3(
50 | env,
51 | qf1=qf1,
52 | qf2=qf2,
53 | policy=policy,
54 | exploration_policy=exploration_policy,
55 | **variant['algo_kwargs']
56 | )
57 | algorithm.to(ptu.device)
58 | algorithm.train()
59 |
60 |
61 | if __name__ == "__main__":
62 | variant = dict(
63 | algo_kwargs=dict(
64 | num_epochs=200,
65 | num_steps_per_epoch=5000,
66 | num_steps_per_eval=10000,
67 | max_path_length=1000,
68 | batch_size=100,
69 | discount=0.99,
70 | replay_buffer_size=int(1E6),
71 | ),
72 | )
73 | # ptu.set_gpu_mode(True) # optionally set the GPU (default=False)
74 | setup_logger('name-of-td3-experiment', variant=variant)
75 | experiment(variant)
76 |
--------------------------------------------------------------------------------
/examples/tsac.py:
--------------------------------------------------------------------------------
1 | """
2 | Run PyTorch Soft Actor Critic on HalfCheetahEnv with the "Twin" architecture
3 | from TD3: https://arxiv.org/pdf/1802.09477.pdf
4 | """
5 | import numpy as np
6 |
7 | import rlkit.torch.pytorch_util as ptu
8 | from rlkit.envs.wrappers import NormalizedBoxEnv
9 | from rlkit.launchers.launcher_util import setup_logger
10 | from rlkit.torch.sac.policies import TanhGaussianPolicy
11 | from rlkit.torch.sac.sac import SoftActorCritic
12 | from rlkit.torch.networks import FlattenMlp
13 | from rlkit.torch.sac.twin_sac import TwinSAC
14 |
15 |
16 | def experiment(variant):
17 | import gym
18 | env = NormalizedBoxEnv(gym.make('HalfCheetah-v2'))
19 |
20 | obs_dim = int(np.prod(env.observation_space.shape))
21 | action_dim = int(np.prod(env.action_space.shape))
22 |
23 | net_size = variant['net_size']
24 | qf1 = FlattenMlp(
25 | hidden_sizes=[net_size, net_size],
26 | input_size=obs_dim + action_dim,
27 | output_size=1,
28 | )
29 | qf2 = FlattenMlp(
30 | hidden_sizes=[net_size, net_size],
31 | input_size=obs_dim + action_dim,
32 | output_size=1,
33 | )
34 | vf = FlattenMlp(
35 | hidden_sizes=[net_size, net_size],
36 | input_size=obs_dim,
37 | output_size=1,
38 | )
39 | policy = TanhGaussianPolicy(
40 | hidden_sizes=[net_size, net_size],
41 | obs_dim=obs_dim,
42 | action_dim=action_dim,
43 | )
44 | algorithm = TwinSAC(
45 | env=env,
46 | policy=policy,
47 | qf1=qf1,
48 | qf2=qf2,
49 | vf=vf,
50 | **variant['algo_params']
51 | )
52 | algorithm.to(ptu.device)
53 | algorithm.train()
54 |
55 |
56 | if __name__ == "__main__":
57 | # noinspection PyTypeChecker
58 | variant = dict(
59 | algo_params=dict(
60 | num_epochs=1000,
61 | num_steps_per_epoch=1000,
62 | num_steps_per_eval=1000,
63 | max_path_length=1000,
64 | batch_size=128,
65 | discount=0.99,
66 |
67 | soft_target_tau=0.001,
68 | policy_lr=3E-4,
69 | qf_lr=3E-4,
70 | vf_lr=3E-4,
71 | ),
72 | net_size=300,
73 | )
74 | # ptu.set_gpu_mode(True) # optionally set the GPU (default=False)
75 | setup_logger('name-of-experiment', variant=variant)
76 | experiment(variant)
77 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | atomicwrites==1.2.1
2 | attrs==18.2.0
3 | awscli==1.16.96
4 | backcall==0.1.0
5 | boto==2.49.0
6 | boto3==1.9.84
7 | botocore==1.12.86
8 | certifi==2018.11.29
9 | cffi==1.11.5
10 | chardet==3.0.4
11 | Click==7.0
12 | cloudpickle==0.5.2
13 | colorama==0.3.9
14 | cycler==0.10.0
15 | Cython==0.29.2
16 | decorator==4.4.0
17 | dill==0.2.9
18 | docutils==0.14
19 | -e git+git@github.com:richardrl/doodad-2019-fresh.git@93bf5ff595d10f36b1a72419b434858b94489302#egg=doodad
20 | ffmpeg==1.4
21 | filelock==3.0.10
22 | flatbuffers==1.10
23 | funcsigs==1.0.2
24 | future==0.17.1
25 | gitdb2==2.0.5
26 | GitPython==2.1.11
27 | glfw==1.7.0
28 | graphviz==0.12
29 | gtimer==1.0.0b5
30 | gym==0.10.9
31 | h5py==2.9.0
32 | idna==2.8
33 | imageio==2.4.1
34 | ipdb==0.12
35 | ipykernel==5.1.0
36 | ipython==7.4.0
37 | ipython-genutils==0.2.0
38 | jedi==0.13.3
39 | jmespath==0.9.3
40 | joblib==0.13.1
41 | jupyter-client==5.2.4
42 | jupyter-core==4.4.0
43 | kiwisolver==1.0.1
44 | lockfile==0.12.2
45 | matplotlib==3.0.2
46 | more-itertools==5.0.0
47 | mpi4py==3.0.1
48 | mujoco-py==1.50.1.68
49 | numpy==1.16.0
50 | numpy-stl==2.10.0
51 | opencv-python==4.0.0.21
52 | pandas==0.24.1
53 | parso==0.4.0
54 | pexpect==4.7.0
55 | pickleshare==0.7.5
56 | Pillow==5.4.1
57 | pkg-resources==0.0.0
58 | pluggy==0.8.1
59 | prompt-toolkit==2.0.9
60 | psutil==5.5.0
61 | ptyprocess==0.6.0
62 | py==1.7.0
63 | pyasn1==0.4.5
64 | pycparser==2.19
65 | pygame==1.9.4
66 | pyglet==1.3.2
67 | Pygments==2.3.1
68 | pyparsing==2.3.1
69 | pyquaternion==0.9.5
70 | pytest==4.2.0
71 | python-dateutil==2.7.5
72 | python-utils==2.3.0
73 | pytz==2018.9
74 | PyYAML==3.13
75 | pyzmq==18.0.1
76 | ray==0.6.2
77 | redis==3.1.0
78 | requests==2.21.0
79 | rsa==3.4.2
80 | s3transfer==0.1.13
81 | scikit-video==1.1.11
82 | scipy==1.2.0
83 | seaborn==0.9.0
84 | six==1.12.0
85 | smmap2==2.0.5
86 | tk==0.1.0
87 | torch==1.1.0
88 | torchtest==0.4
89 | torchvision==0.2.1
90 | -e git+git@github.com:szagoruyko/pytorchviz.git@46add7f2c071b6d29fc3d56e9d2d21e1c0a3af1d#egg=torchviz
91 | tornado==6.0.2
92 | traitlets==4.3.2
93 | urllib3==1.24.1
94 | wcwidth==0.1.7
--------------------------------------------------------------------------------
/rlkit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/richardrl/rlkit-relational/e01973d0a7e393cb31fbd48e8180ab0d3d8d2a2e/rlkit/__init__.py
--------------------------------------------------------------------------------
/rlkit/core/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | General classes, functions, utilities that are used throughout rlkit.
3 | """
4 | from rlkit.core.logging import logger
5 |
6 | __all__ = ['logger']
7 |
8 |
--------------------------------------------------------------------------------
/rlkit/core/eval_util.py:
--------------------------------------------------------------------------------
1 | """
2 | Common evaluation utilities.
3 | """
4 |
5 | from collections import OrderedDict
6 | from numbers import Number
7 |
8 | import numpy as np
9 |
10 |
11 | def get_generic_path_information(paths, stat_prefix='', num_blocks=None):
12 | """
13 | Get an OrderedDict with a bunch of statistic names and values.
14 | """
15 | # assert num_blocks is not None
16 | # assert len(paths) == 21, len(paths)
17 | statistics = OrderedDict()
18 | returns = [sum(path["rewards"]) for path in paths]
19 | statistics.update(create_stats_ordered_dict('Returns', returns,
20 | stat_prefix=stat_prefix))
21 |
22 | rewards = np.vstack([path["rewards"] for path in paths])
23 | statistics.update(create_stats_ordered_dict('Rewards', rewards,
24 | stat_prefix=stat_prefix))
25 |
26 | assert np.all([path['mask'][0] == path['mask'][x] for path in paths for x in range(len(path))])
27 |
28 | final_num_blocks_stacked = [path['mask'][0].sum() - np.clip(np.abs(path["rewards"][-1]), None, path['mask'][0].sum()) for path in paths]
29 | statistics[F'{stat_prefix} Final Num Blocks Stacked'] = np.mean(final_num_blocks_stacked)
30 |
31 | mean_num_blocks_stacked = [(path['mask'][0].sum() - np.clip(np.abs(path['rewards']), None, path['mask'][0].sum())).mean() for path in paths]
32 | assert all(x >= 0 for x in mean_num_blocks_stacked), mean_num_blocks_stacked
33 | statistics[F'{stat_prefix} Mean Num Blocks Stacked'] = np.mean(mean_num_blocks_stacked)
34 |
35 | if isinstance(paths[0], dict) and num_blocks:
36 |         # Keys are block IDs; each value is the list of final goal
37 |         # distances for that block, one entry per path.
38 | final_goal_dist = dict()
39 | seq = []
40 | for block_id in range(num_blocks):
41 | final_goal_dist[block_id] = [np.linalg.norm(path['observations'][-1]['achieved_goal'][block_id*3:(block_id+1)*3] - path['observations'][-1]['desired_goal'][block_id*3:(block_id+1)*3]) for path in paths]
42 | # statistics.update(create_stats_ordered_dict(F"Fin Goal Dist Blk {block_id}", final_goal_dist[block_id],
43 | # stat_prefix=stat_prefix))
44 | seq.append(np.array([np.linalg.norm(path['observations'][-1]['achieved_goal'][block_id*3:(block_id+1)*3] - path['observations'][-1]['desired_goal'][block_id*3:(block_id+1)*3]) for path in paths]))
45 |
46 | block_dists = np.vstack(seq)
47 | assert len(block_dists.shape) == 2
48 | sorted = np.sort(block_dists, axis=0)
49 | # sorted = block_dists
50 |
51 | # for block_id in range(num_blocks):
52 | # statistics.update(create_stats_ordered_dict(F"Fin Goal Dist Blk {block_id}", sorted[block_id], stat_prefix=stat_prefix))
53 |
54 | total_solved = 0
55 | goal_threshold = .05
56 | for path_fd_tuple_across_blocks in zip(*list(final_goal_dist.values())):
57 |             total_solved += all(fd_blocki < goal_threshold for fd_blocki in path_fd_tuple_across_blocks)
58 |
59 | assert len(paths) == len(final_goal_dist[0])
60 | percent_solved = total_solved/len(paths)
61 | assert 0 <= percent_solved <= 1, (total_solved, len(paths), final_goal_dist)
62 | statistics[F"{stat_prefix} Percent Solved"] = percent_solved
63 |
64 | actions = [path["actions"] for path in paths]
65 | if len(actions[0].shape) == 1:
66 | actions = np.hstack([path["actions"] for path in paths])
67 | else:
68 | actions = np.vstack([path["actions"] for path in paths])
69 | statistics.update(create_stats_ordered_dict(
70 | 'Actions', actions, stat_prefix=stat_prefix
71 | ))
72 | statistics['Num Paths'] = len(paths)
73 |
74 | return statistics
75 |
76 |
77 | def get_average_returns(paths):
78 | returns = [sum(path["rewards"]) for path in paths]
79 | return np.mean(returns)
80 |
81 |
82 | def create_stats_ordered_dict(
83 | name,
84 | data,
85 | stat_prefix=None,
86 | always_show_all_stats=True,
87 | exclude_max_min=False,
88 | ):
89 | if stat_prefix is not None:
90 | name = "{} {}".format(stat_prefix, name)
91 | if isinstance(data, Number):
92 | return OrderedDict({name: data})
93 |
94 | if len(data) == 0:
95 | return OrderedDict()
96 |
97 | if isinstance(data, tuple):
98 | ordered_dict = OrderedDict()
99 | for number, d in enumerate(data):
100 | sub_dict = create_stats_ordered_dict(
101 | "{0}_{1}".format(name, number),
102 | d,
103 | )
104 | ordered_dict.update(sub_dict)
105 | return ordered_dict
106 |
107 | if isinstance(data, list):
108 | try:
109 | iter(data[0])
110 | except TypeError:
111 | pass
112 | else:
113 | data = np.concatenate(data)
114 |
115 | if (isinstance(data, np.ndarray) and data.size == 1
116 | and not always_show_all_stats):
117 | return OrderedDict({name: float(data)})
118 |
119 | stats = OrderedDict([
120 | (name + ' Mean', np.mean(data)),
121 | (name + ' Std', np.std(data)),
122 | ])
123 | if not exclude_max_min:
124 | stats[name + ' Max'] = np.max(data)
125 | stats[name + ' Min'] = np.min(data)
126 | return stats
127 |
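A minimal usage sketch (not from the repository; the values are made up) showing how create_stats_ordered_dict names its output keys:

    from rlkit.core.eval_util import create_stats_ordered_dict

    returns = [120.5, 98.0, 143.2]  # hypothetical per-path returns
    stats = create_stats_ordered_dict('Returns', returns, stat_prefix='Test')
    # Resulting keys: 'Test Returns Mean', 'Test Returns Std',
    #                 'Test Returns Max', 'Test Returns Min'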
--------------------------------------------------------------------------------
/rlkit/core/serializable.py:
--------------------------------------------------------------------------------
1 | """
2 | Based on rllab's serializable.py file
3 |
4 | https://github.com/rll/rllab
5 | """
6 |
7 | import inspect
8 | import sys
9 |
10 |
11 | class Serializable(object):
12 |
13 | def __init__(self, *args, **kwargs):
14 | self.__args = args
15 | self.__kwargs = kwargs
16 |
17 | def quick_init(self, locals_):
18 | if getattr(self, "_serializable_initialized", False):
19 | return
20 | if sys.version_info >= (3, 0):
21 | spec = inspect.getfullargspec(self.__init__)
22 | # Exclude the first "self" parameter
23 | if spec.varkw:
24 | kwargs = locals_[spec.varkw].copy()
25 | else:
26 | kwargs = dict()
27 | if spec.kwonlyargs:
28 | for key in spec.kwonlyargs:
29 | kwargs[key] = locals_[key]
30 | else:
31 | spec = inspect.getargspec(self.__init__)
32 | if spec.keywords:
33 | kwargs = locals_[spec.keywords]
34 | else:
35 | kwargs = dict()
36 | if spec.varargs:
37 | varargs = locals_[spec.varargs]
38 | else:
39 | varargs = tuple()
40 | in_order_args = [locals_[arg] for arg in spec.args][1:]
41 | self.__args = tuple(in_order_args) + varargs
42 | self.__kwargs = kwargs
43 | setattr(self, "_serializable_initialized", True)
44 |
45 | def __getstate__(self):
46 | return {"__args": self.__args, "__kwargs": self.__kwargs}
47 |
48 | def __setstate__(self, d):
49 | # convert all __args to keyword-based arguments
50 | if sys.version_info >= (3, 0):
51 | spec = inspect.getfullargspec(self.__init__)
52 | else:
53 | spec = inspect.getargspec(self.__init__)
54 | in_order_args = spec.args[1:]
55 | out = type(self)(**dict(zip(in_order_args, d["__args"]), **d["__kwargs"]))
56 | self.__dict__.update(out.__dict__)
57 | self.__dict__.update()
58 |
59 | @classmethod
60 | def clone(cls, obj, **kwargs):
61 | assert isinstance(obj, Serializable)
62 | d = obj.__getstate__()
63 | d["__kwargs"] = dict(d["__kwargs"], **kwargs)
64 | out = type(obj).__new__(type(obj))
65 | out.__setstate__(d)
66 | return out
67 |
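A hypothetical sketch of the quick_init pattern (LinearPolicy and its arguments are invented for illustration): the subclass snapshots its constructor arguments so clone() can rebuild it with selected overrides:

    from rlkit.core.serializable import Serializable

    class LinearPolicy(Serializable):
        def __init__(self, obs_dim, action_dim, scale=1.0):
            # Record the constructor arguments for pickling / cloning.
            Serializable.quick_init(self, locals())
            self.obs_dim = obs_dim
            self.action_dim = action_dim
            self.scale = scale

    policy = LinearPolicy(4, 2, scale=0.5)
    copy = Serializable.clone(policy, scale=2.0)  # same args, scale overridden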
--------------------------------------------------------------------------------
/rlkit/data_management/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/richardrl/rlkit-relational/e01973d0a7e393cb31fbd48e8180ab0d3d8d2a2e/rlkit/data_management/__init__.py
--------------------------------------------------------------------------------
/rlkit/data_management/env_replay_buffer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from rlkit.data_management.simple_replay_buffer import SimpleReplayBuffer
3 | from gym.spaces import Box, Discrete, Tuple
4 |
5 |
6 | class EnvReplayBuffer(SimpleReplayBuffer):
7 | def __init__(
8 | self,
9 | max_replay_buffer_size,
10 | env,
11 | ):
12 | """
13 | :param max_replay_buffer_size:
14 | :param env:
15 | """
16 | self.env = env
17 | self._ob_space = env.observation_space
18 | self._action_space = env.action_space
19 | super().__init__(
20 | max_replay_buffer_size=max_replay_buffer_size,
21 | observation_dim=get_dim(self._ob_space),
22 | action_dim=get_dim(self._action_space),
23 | )
24 |
25 | def add_sample(self, observation, action, reward, terminal,
26 | next_observation, **kwargs):
27 |
28 | if isinstance(self._action_space, Discrete):
29 | action = np.eye(self._action_space.n)[action]
30 | super(EnvReplayBuffer, self).add_sample(
31 | observation, action, reward, terminal,
32 | next_observation, **kwargs)
33 |
34 |
35 | def get_dim(space):
36 | if isinstance(space, Box):
37 | return space.low.size
38 | elif isinstance(space, Discrete):
39 | return space.n
40 | elif isinstance(space, Tuple):
41 | return sum(get_dim(subspace) for subspace in space.spaces)
42 | elif hasattr(space, 'flat_dim'):
43 | return space.flat_dim
44 | else:
45 | raise TypeError("Unknown space: {}".format(space))
46 |
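A short sketch (assumed shapes, not from the repository) of what get_dim returns for the gym spaces it handles:

    import numpy as np
    from gym.spaces import Box, Discrete, Tuple
    from rlkit.data_management.env_replay_buffer import get_dim

    box = Box(low=-1.0, high=1.0, shape=(3, 2), dtype=np.float32)
    assert get_dim(box) == 6                         # low.size flattens the shape
    assert get_dim(Discrete(5)) == 5                 # width of the one-hot action
    assert get_dim(Tuple((box, Discrete(5)))) == 11  # sum over sub-spaces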
--------------------------------------------------------------------------------
/rlkit/data_management/normalizer.py:
--------------------------------------------------------------------------------
1 | """
2 | Based on code from Marcin Andrychowicz
3 | """
4 | import numpy as np
5 |
6 |
7 | class Normalizer(object):
8 | def __init__(
9 | self,
10 | size,
11 | eps=1e-8,
12 | default_clip_range=np.inf,
13 | mean=0,
14 | std=1,
15 | ):
16 | self.size = size
17 | self.eps = eps
18 | self.default_clip_range = default_clip_range
19 | self.sum = np.zeros(self.size, np.float32)
20 | self.sumsq = np.zeros(self.size, np.float32)
21 | self.count = np.ones(1, np.float32)
22 | self.mean = mean + np.zeros(self.size, np.float32)
23 | self.std = std * np.ones(self.size, np.float32)
24 | self.synchronized = True
25 |
26 | def update(self, v):
27 | if v.ndim == 1:
28 | v = np.expand_dims(v, 0)
29 | assert v.ndim == 2
30 | assert v.shape[1] == self.size
31 | self.sum += v.sum(axis=0)
32 | self.sumsq += (np.square(v)).sum(axis=0)
33 | self.count[0] += v.shape[0]
34 | self.synchronized = False
35 |
36 | def normalize(self, v, clip_range=None):
37 | if not self.synchronized:
38 | self.synchronize()
39 | if clip_range is None:
40 | clip_range = self.default_clip_range
41 | mean, std = self.mean, self.std
42 | if v.ndim == 2:
43 | mean = mean.reshape(1, -1)
44 | std = std.reshape(1, -1)
45 | return np.clip((v - mean) / std, -clip_range, clip_range)
46 |
47 | def denormalize(self, v):
48 | if not self.synchronized:
49 | self.synchronize()
50 | mean, std = self.mean, self.std
51 | if v.ndim == 2:
52 | mean = mean.reshape(1, -1)
53 | std = std.reshape(1, -1)
54 | return mean + v * std
55 |
56 | def synchronize(self):
57 | self.mean[...] = self.sum / self.count[0]
58 | self.std[...] = np.sqrt(
59 | np.maximum(
60 | np.square(self.eps),
61 | self.sumsq / self.count[0] - np.square(self.mean)
62 | )
63 | )
64 | self.synchronized = True
65 |
66 |
67 | class IdentityNormalizer(object):
68 | def __init__(self, *args, **kwargs):
69 | pass
70 |
71 | def update(self, v):
72 | pass
73 |
74 | def normalize(self, v, clip_range=None):
75 | return v
76 |
77 | def denormalize(self, v):
78 | return v
79 |
80 |
81 | class FixedNormalizer(object):
82 | def __init__(
83 | self,
84 | size,
85 | default_clip_range=np.inf,
86 | mean=0,
87 | std=1,
88 | eps=1e-8,
89 | ):
90 | assert std > 0
91 | std = std + eps
92 | self.size = size
93 | self.default_clip_range = default_clip_range
94 | self.mean = mean + np.zeros(self.size, np.float32)
95 | self.std = std + np.zeros(self.size, np.float32)
96 | self.eps = eps
97 |
98 | def set_mean(self, mean):
99 | self.mean = mean + np.zeros(self.size, np.float32)
100 |
101 | def set_std(self, std):
102 | std = std + self.eps
103 | self.std = std + np.zeros(self.size, np.float32)
104 |
105 | def normalize(self, v, clip_range=None):
106 | if clip_range is None:
107 | clip_range = self.default_clip_range
108 | mean, std = self.mean, self.std
109 | if v.ndim == 2:
110 | mean = mean.reshape(1, -1)
111 | std = std.reshape(1, -1)
112 | return np.clip((v - mean) / std, -clip_range, clip_range)
113 |
114 | def denormalize(self, v):
115 | mean, std = self.mean, self.std
116 | if v.ndim == 2:
117 | mean = mean.reshape(1, -1)
118 | std = std.reshape(1, -1)
119 | return mean + v * std
120 |
121 | def copy_stats(self, other):
122 | self.set_mean(other.mean)
123 | self.set_std(other.std)
124 |
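An illustrative round-trip sketch (assumed usage): update() only accumulates running sums, while normalize()/denormalize() lazily call synchronize() to refresh the mean and std:

    import numpy as np
    from rlkit.data_management.normalizer import Normalizer

    normalizer = Normalizer(size=3)
    batch = np.random.randn(128, 3) * 2.0 + 5.0  # fake observations
    normalizer.update(batch)                     # accumulate sum / sumsq / count
    scaled = normalizer.normalize(batch)         # triggers synchronize() once
    restored = normalizer.denormalize(scaled)
    assert np.allclose(restored, batch)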
--------------------------------------------------------------------------------
/rlkit/data_management/path_builder.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class PathBuilder(dict):
5 | """
6 | Usage:
7 | ```
8 | path_builder = PathBuilder()
9 | path.add_sample(
10 | observations=1,
11 | actions=2,
12 | next_observations=3,
13 | ...
14 | )
15 | path.add_sample(
16 | observations=4,
17 | actions=5,
18 | next_observations=6,
19 | ...
20 | )
21 |
22 | path = path_builder.get_all_stacked()
23 |
24 | path['observations']
25 | # output: [1, 4]
26 | path['actions']
27 | # output: [2, 5]
28 | ```
29 |
30 | Note that the key should be "actions" and not "action" since the
31 | resulting dictionary will have those keys.
32 | """
33 |
34 | def __init__(self):
35 | super().__init__()
36 | self._path_length = 0
37 |
38 | def add_all(self, **key_to_value):
39 | for k, v in key_to_value.items():
40 | if k not in self:
41 | self[k] = [v]
42 | else:
43 | self[k].append(v)
44 | self._path_length += 1
45 |
46 | def get_all_stacked(self):
47 | output_dict = dict()
48 | for k, v in self.items():
49 | output_dict[k] = stack_list(v)
50 | return output_dict
51 |
52 | def __len__(self):
53 | return self._path_length
54 |
55 |
56 | def stack_list(lst):
57 | if isinstance(lst[0], dict):
58 | return lst
59 | else:
60 | return np.array(lst)
61 |
--------------------------------------------------------------------------------
/rlkit/data_management/replay_buffer.py:
--------------------------------------------------------------------------------
1 | import abc
2 |
3 |
4 | class ReplayBuffer(object, metaclass=abc.ABCMeta):
5 | """
6 | A class used to save and replay data.
7 | """
8 |
9 | @abc.abstractmethod
10 | def add_sample(self, observation, action, reward, next_observation,
11 | terminal, **kwargs):
12 | """
13 | Add a transition tuple.
14 | """
15 | pass
16 |
17 | @abc.abstractmethod
18 | def terminate_episode(self):
19 | """
20 | Let the replay buffer know that the episode has terminated in case some
21 | special book-keeping has to happen.
22 | :return:
23 | """
24 | pass
25 |
26 | @abc.abstractmethod
27 | def num_steps_can_sample(self, **kwargs):
28 | """
29 | :return: # of unique items that can be sampled.
30 | """
31 | pass
32 |
33 | def add_path(self, path):
34 | """
35 | Add a path to the replay buffer.
36 |
37 | This default implementation naively goes through every step, but you
38 | may want to optimize this.
39 |
40 | NOTE: You should NOT call "terminate_episode" after calling add_path.
41 | It's assumed that this function handles the episode termination.
42 |
43 | :param path: Dict like one outputted by rlkit.samplers.util.rollout
44 | """
45 | for i, (
46 | obs,
47 | action,
48 | reward,
49 | next_obs,
50 | terminal,
51 | agent_info,
52 | env_info
53 | ) in enumerate(zip(
54 | path["observations"],
55 | path["actions"],
56 | path["rewards"],
57 | path["next_observations"],
58 | path["terminals"],
59 | path["agent_infos"],
60 | path["env_infos"],
61 | )):
62 | self.add_sample(
63 | obs,
64 | action,
65 | reward,
66 | next_obs,
67 | terminal,
68 | agent_info=agent_info,
69 | env_info=env_info,
70 | )
71 | self.terminate_episode()
72 |
73 | @abc.abstractmethod
74 | def random_batch(self, batch_size):
75 | """
76 | Return a batch of size `batch_size`.
77 | :param batch_size:
78 | :return:
79 | """
80 | pass
81 |
--------------------------------------------------------------------------------
/rlkit/data_management/simple_replay_buffer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from rlkit.data_management.replay_buffer import ReplayBuffer
4 |
5 |
6 | class SimpleReplayBuffer(ReplayBuffer):
7 | def __init__(
8 | self, max_replay_buffer_size, observation_dim, action_dim,
9 | ):
10 | self._observation_dim = observation_dim
11 | self._action_dim = action_dim
12 | self._max_replay_buffer_size = max_replay_buffer_size
13 | self._observations = np.zeros((max_replay_buffer_size, observation_dim))
14 | # It's a bit memory inefficient to save the observations twice,
15 | # but it makes the code *much* easier since you no longer have to
16 | # worry about termination conditions.
17 | self._next_obs = np.zeros((max_replay_buffer_size, observation_dim))
18 | self._actions = np.zeros((max_replay_buffer_size, action_dim))
19 | # Make everything a 2D np array to make it easier for other code to
20 | # reason about the shape of the data
21 | self._rewards = np.zeros((max_replay_buffer_size, 1))
22 | # self._terminals[i] = a terminal was received at time i
23 | self._terminals = np.zeros((max_replay_buffer_size, 1), dtype='uint8')
24 | self._top = 0
25 | self._size = 0
26 |
27 | def add_sample(self, observation, action, reward, terminal,
28 | next_observation, **kwargs):
29 | self._observations[self._top] = observation
30 | self._actions[self._top] = action
31 | self._rewards[self._top] = reward
32 | self._terminals[self._top] = terminal
33 | self._next_obs[self._top] = next_observation
34 | self._advance()
35 |
36 | def terminate_episode(self):
37 | pass
38 |
39 | def _advance(self):
40 | self._top = (self._top + 1) % self._max_replay_buffer_size
41 | if self._size < self._max_replay_buffer_size:
42 | self._size += 1
43 |
44 | def random_batch(self, batch_size):
45 | indices = np.random.randint(0, self._size, batch_size)
46 | return dict(
47 | observations=self._observations[indices],
48 | actions=self._actions[indices],
49 | rewards=self._rewards[indices],
50 | terminals=self._terminals[indices],
51 | next_observations=self._next_obs[indices],
52 | )
53 |
54 | def num_steps_can_sample(self):
55 | return self._size
56 |
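A minimal sketch (not part of the repository) of filling a SimpleReplayBuffer with dummy transitions and drawing a training batch:

    import numpy as np
    from rlkit.data_management.simple_replay_buffer import SimpleReplayBuffer

    buffer = SimpleReplayBuffer(
        max_replay_buffer_size=1000, observation_dim=4, action_dim=2,
    )
    for _ in range(200):
        buffer.add_sample(
            observation=np.random.randn(4),
            action=np.random.randn(2),
            reward=0.0,
            terminal=False,
            next_observation=np.random.randn(4),
        )
    batch = buffer.random_batch(64)
    assert batch['observations'].shape == (64, 4)
    assert batch['rewards'].shape == (64, 1)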
--------------------------------------------------------------------------------
/rlkit/envs/ant.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from rlkit.envs.mujoco_env import MujocoEnv
4 |
5 |
6 | class AntEnv(MujocoEnv):
7 | def __init__(self, use_low_gear_ratio=True):
8 | self.init_serialization(locals())
9 | if use_low_gear_ratio:
10 | xml_path = 'low_gear_ratio_ant.xml'
11 | else:
12 | xml_path = 'normal_gear_ratio_ant.xml'
13 | super().__init__(
14 | xml_path,
15 | frame_skip=5,
16 | automatically_set_obs_and_action_space=True,
17 | )
18 |
19 | def step(self, a):
20 | torso_xyz_before = self.get_body_com("torso")
21 | self.do_simulation(a, self.frame_skip)
22 | torso_xyz_after = self.get_body_com("torso")
23 | torso_velocity = torso_xyz_after - torso_xyz_before
24 | forward_reward = torso_velocity[0]/self.dt
25 | ctrl_cost = .5 * np.square(a).sum()
26 | contact_cost = 0.5 * 1e-3 * np.sum(
27 | np.square(np.clip(self.sim.data.cfrc_ext, -1, 1)))
28 | survive_reward = 1.0
29 | reward = forward_reward - ctrl_cost - contact_cost + survive_reward
30 | state = self.state_vector()
31 | notdone = np.isfinite(state).all() \
32 | and state[2] >= 0.2 and state[2] <= 1.0
33 | done = not notdone
34 | ob = self._get_obs()
35 | return ob, reward, done, dict(
36 | reward_forward=forward_reward,
37 | reward_ctrl=-ctrl_cost,
38 | reward_contact=-contact_cost,
39 | reward_survive=survive_reward,
40 | torso_velocity=torso_velocity,
41 | )
42 |
43 | def _get_obs(self):
44 | return np.concatenate([
45 | self.sim.data.qpos.flat[2:],
46 | self.sim.data.qvel.flat,
47 | ])
48 |
49 | def reset_model(self):
50 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1)
51 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
52 | self.set_state(qpos, qvel)
53 | return self._get_obs()
54 |
55 | def viewer_setup(self):
56 | self.viewer.cam.distance = self.model.stat.extent * 0.5
57 |
--------------------------------------------------------------------------------
/rlkit/envs/assets/low_gear_ratio_ant.xml:
--------------------------------------------------------------------------------
1 |
2 |