├── .gitignore ├── README.md ├── docker ├── Dockerfile ├── README.md └── setups │ ├── requirements.txt │ └── setup.sh ├── examples ├── mi_estimate.py ├── multi_step_mdp.py ├── multi_step_mdp_optimality.py └── random_sampling.py ├── pic ├── __init__.py ├── algos │ ├── __init__.py │ └── numpyagent.py ├── gym │ ├── __init__.py │ ├── multi_step │ │ └── multi_step.py │ ├── noisy_dynamics │ │ ├── cartpole_noise.py │ │ ├── halfcheetah_noise.py │ │ └── humanoid_noise.py │ └── reward_shaping │ │ ├── dynamic_mjc.py │ │ ├── maze_model.py │ │ └── reacher_norm.py ├── nn │ ├── __init__.py │ └── numpymlp.py └── sampler │ ├── __init__.py │ └── sampler.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# optional
output/*
.vscode
outputs/*
results
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Policy Information Capacity: Information-Theoretic Measure for Task Complexity in Deep Reinforcement Learning
[[arxiv]](https://arxiv.org/abs/2103.12726)

If you use this codebase for your research, please cite the paper:
```
@inproceedings{furuta2021pic,
  title={Policy Information Capacity: Information-Theoretic Measure for Task Complexity in Deep Reinforcement Learning},
  author={Hiroki Furuta and Tatsuya Matsushima and Tadashi Kozuno and Yutaka Matsuo and Sergey Levine and Ofir Nachum and Shixiang Shane Gu},
  booktitle={International Conference on Machine Learning},
  year={2021}
}
```

## Dependencies
We recommend using Docker. See the [docker README](./docker/README.md) for setup instructions.

## Examples
See [examples](./examples) for details.

For synthetic experiments:
```
python multi_step_mdp_optimality.py --iterations 100 --population_size 1000 --episodes_per_param 1000 --prior_mean 0.0 --prior_sigma 1.0 --horizon 3 --multiprocess 64

python multi_step_mdp.py --iterations 100 --population_size 1000 --episodes_per_param 1000 --prior_mean 0.0 --prior_sigma 1.0 --horizon 2 --multiprocess 64
```

For random sampling:
```
python random_sampling.py --env CartPole-v0 --random_dist normal --multiprocess 64 --n_units 64 --n_layers 2 --n_samples 1000 --n_episodes 1000

python random_sampling.py --env dm2gym:CheetahRun-v0 --random_dist uniform --multiprocess 64 --n_units 64 --n_layers 2 --n_samples 1000 --n_episodes 1000
```

For mutual information estimation (PIC and POIC):
```
python mi_estimate.py --sourse_path ./results/CartPole-v0.npy --env CartPole-v0
```

## Environment List
```
CartPole-v0
Pendulum-v0
MountainCar-v0
MountainCarContinuous-v0
Acrobot-v1
Ant-v2
HalfCheetah-v2
Walker2d-v2
Humanoid-v2
Hopper-v2
dm2gym:CheetahRun-v0
dm2gym:ReacherEasy-v0
dm2gym:Ball_in_cupCatch-v0
```

For reward-shaping experiments, see the registered environments in [pic/gym/__init__.py](./pic/gym/__init__.py) for details.

## Reference
This codebase is based on [RWG](https://github.com/declanoller/RWG_benchmarking). We use the implementation of the pointmaze environment from [D4RL](https://github.com/rail-berkeley/d4rl).
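As a quick reference, PIC is the mutual information between episode returns and policy parameters sampled from the prior, I(R; θ) = H(R) - H(R|θ), while POIC replaces the return with a binary optimality variable whose likelihood is exponentially shaped by a tuned temperature. The snippet below is a condensed, non-authoritative sketch of the PIC part of `examples/mi_estimate.py`; it assumes you have already saved a `[n_samples, n_episodes]` return array with `random_sampling.py` (e.g. `./results/CartPole-v0.npy`). Prefer the full script for reported metrics, since it also tunes the POIC temperature with Optuna and records normalized scores.

```python
import numpy as np

# Assumed input: the return array saved by random_sampling.py,
# shape [n_samples, n_episodes] (one row per sampled parameter).
returns = np.load('./results/CartPole-v0.npy')
flat = returns.flatten()

# Discretize returns into bins, then estimate H(R) and E_theta[ H(R | theta) ].
counts, edges = np.histogram(flat, bins=100000)
p_r = counts / len(flat)
h_r = -np.sum(p_r * np.log(p_r + 1e-12))
p_r_given_theta = [np.histogram(row, bins=edges)[0] / len(row) for row in returns]
h_r_given_theta = -np.mean([np.sum(p * np.log(p + 1e-12)) for p in p_r_given_theta])

pic = h_r - h_r_given_theta  # PIC = H(R) - H(R | theta)
print('PIC:', pic)
```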
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
FROM pytorch/pytorch:1.6.0-cuda10.1-cudnn7-devel

# mujoco
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
    curl \
    git \
    libgl1-mesa-dev \
    libgl1-mesa-glx \
    libglew-dev \
    libosmesa6-dev \
    software-properties-common \
    net-tools \
    unzip \
    vim \
    wget \
    xpra \
    xserver-xorg-dev
RUN curl -o /usr/local/bin/patchelf https://s3-us-west-2.amazonaws.com/openai-sci-artifacts/manual-builds/patchelf_0.9_amd64.elf \
    && chmod +x /usr/local/bin/patchelf
RUN mkdir -p /root/.mujoco && \
    wget https://www.roboti.us/download/mujoco200_linux.zip -O mujoco.zip && \
    unzip mujoco.zip -d /root/.mujoco && \
    rm mujoco.zip
RUN echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/root/.mujoco/mujoco200/bin' >> /root/.bashrc

# install python libraries
COPY setups/ /root/setups/
--------------------------------------------------------------------------------
/docker/README.md:
--------------------------------------------------------------------------------
# How to use
0. `docker build . -t <image-name>`
1. `docker run --runtime=nvidia -it --privileged -v /path/to/yourworkspace/workspace:/root/workspace -p 9999:9999 <image-name>`
2. `cd /root/setups`
3. `source setup.sh`
--------------------------------------------------------------------------------
/docker/setups/requirements.txt:
--------------------------------------------------------------------------------
pillow
gym==0.17.2
mujoco-py==2.0.2.11
dm2gym
optuna
--------------------------------------------------------------------------------
/docker/setups/setup.sh:
--------------------------------------------------------------------------------
ln -s /root/.mujoco/mujoco200_linux /root/.mujoco/mujoco200
pip install -U pip
pip install -U setuptools
pip install -r requirements.txt
pip install git+git://github.com/deepmind/dm_control.git
--------------------------------------------------------------------------------
/examples/mi_estimate.py:
--------------------------------------------------------------------------------
import os
import argparse
import optuna
import numpy as np

ALGORITHM_MAX = {
    'CartPole-v0': 200,
    'Pendulum-v0': -128.6266493,
    'MountainCar-v0': -97.2,
    'MountainCarContinuous-v0': 95.89074929,
    'Acrobot-v1': -64.826,
    'Ant-v2': 6584.2,
    'HalfCheetah-v2': 15266.5,
    'Hopper-v2': 3564.07,
    'Walker2d-v2': 5813,
    'Humanoid-v2': 8264,
    'dm2gym:CheetahRun-v0': 795.0,
    'dm2gym:ReacherEasy-v0': 955.0,
    'dm2gym:Ball_in_cupCatch-v0': 978.2,
}

ALGORITHM_AVG = {
    'CartPole-v0': 194.2,
    'Pendulum-v0': -571.5,
    'MountainCar-v0': -143.1,
    'MountainCarContinuous-v0': 12.9,
    'Acrobot-v1': -162.9,
    'Ant-v2': 2450.782353,
    'HalfCheetah-v2': 6047.226471,
    'Hopper-v2': 2206.747059,
    'Walker2d-v2': 3190.777059,
    'Humanoid-v2': 3880.83,
    'dm2gym:CheetahRun-v0': 441.9663239,
    'dm2gym:ReacherEasy-v0': 600.172,
    'dm2gym:Ball_in_cupCatch-v0': 743.21,
}


def main():
    _basic_columns = (
        "environment",
        "normalized_score_A",
        "normalized_score_R",
        "POIC",
        "optimality_marginal",
        "optimality_conditional",
        "PIC",
        "reward_marginal",
        "reward_conditional",
        "variance",
        "temperatures",
        "r_max",
        "r_min",
        "r_mean",
    )

    parser = argparse.ArgumentParser()
    parser.add_argument("--n_trials", type=int, default=200, help="n_trials for optuna")
    parser.add_argument("--n_bins", type=int, default=100000, help="number of bins")
    parser.add_argument("--algo_max", action="store_true", help="max(r^algo, r^rand)")
    parser.add_argument("--clip_persent", type=float, default=0.0, help="top/bottom x percent clipping")
    parser.add_argument("--sourse_path", type=str, default='./CartPole-v0.npy')
    parser.add_argument("--root_dir", type=str, default='./results/')
    parser.add_argument("--env", type=str, default='CartPole-v0')
    args = parser.parse_args()

    # save dir
    output_dir = os.path.join(
        args.root_dir,
        'n_trials{}_clip_persent{}'.format(args.n_trials, args.clip_persent),
    )
    os.makedirs(output_dir, exist_ok=True)

    with open(os.path.join(output_dir, "{}_metrics.txt".format(args.env)), "w") as f:
        print("\t".join(_basic_columns), file=f)
    with open(os.path.join(output_dir, "{}_tables.txt".format(args.env)), "w") as f:
        print(" & ".join(_basic_columns), file=f)

    all_scores_per_param = np.load(args.sourse_path)
    all_mean_scores = all_scores_per_param.mean(axis=1)

    if args.clip_persent > 0:
        upper = np.percentile(all_mean_scores, 100-args.clip_persent)
        lower = np.percentile(all_mean_scores, args.clip_persent)
        all_scores_per_param = np.clip(all_scores_per_param, lower, upper)

    all_scores = all_scores_per_param.flatten()
    r_max = all_scores.max()
    r_min = all_mean_scores.min()
    r_mean = all_scores.mean()

    variance = 0 if (r_max - r_min) == 0 else all_scores.var()/(r_max - r_min)

    if args.algo_max:
        r_max = max(ALGORITHM_MAX[args.env], r_max)

    def objective(trial):
        temperature = trial.suggest_loguniform('temperature', 1e-4, 2e4)
        p_o1 = np.exp((all_scores-r_max)/temperature).mean()
        p_o1_ts = np.exp((all_scores_per_param-r_max)/temperature).mean(axis=1)
        marginal = -p_o1*np.log(p_o1 + 1e-12) - (1-p_o1)*np.log(1-p_o1 + 1e-12)
        conditional = np.mean(-p_o1_ts*np.log(p_o1_ts + 1e-12) - (1-p_o1_ts)*np.log(1-p_o1_ts + 1e-12))
        mutual_information = marginal - conditional

        return mutual_information

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=args.n_trials)

    # POIC: I(O; theta) evaluated at the best temperature found by Optuna
    trial = study.best_trial
    mi_o = trial.value
    temperature = trial.params['temperature']
    p_o1 = np.exp((all_scores-r_max)/temperature).mean()
    p_o1_ts = np.exp((all_scores_per_param-r_max)/temperature).mean(axis=1)
    h_o = -p_o1*np.log(p_o1) - (1-p_o1)*np.log(1-p_o1)

    h_o_t = np.mean(-p_o1_ts*np.log(p_o1_ts + 1e-12) - (1-p_o1_ts)*np.log(1-p_o1_ts + 1e-12))

    # PIC: I(R; theta) via histogram discretization of episode returns
    bins = args.n_bins
    hist = np.histogram(all_scores, bins=args.n_bins)
    discretization_all = hist[0] / len(all_scores)
    entropy_all = - np.sum(discretization_all * np.log(discretization_all + 1e-12))
    discretization_r_theta = [np.histogram(x, bins=hist[1])[0] / len(x) for x in all_scores_per_param]
    entropy_r_theta = - np.mean([np.sum(p_r_theta * np.log(p_r_theta + 1e-12)) for p_r_theta in discretization_r_theta])
    mi_r = entropy_all - entropy_r_theta

    normalized_score_A = (ALGORITHM_AVG[args.env] - r_min) / (max(ALGORITHM_MAX[args.env], r_max) - r_min)
    normalized_score_R = (r_mean - r_min) / (max(ALGORITHM_MAX[args.env], r_max) - r_min)

    # save in scores.txt
    values = (
        args.env,
        normalized_score_A,
        normalized_score_R,
        mi_o,
        h_o,
        h_o_t,
        mi_r,
        entropy_all,
        entropy_r_theta,
        variance,
        temperature,
        r_max,
        r_min,
        r_mean,
    )

    with open(os.path.join(output_dir, "{}_metrics.txt".format(args.env)), "a+") as f:
        print("\t".join(str(x) for x in values), file=f)
    with open(os.path.join(output_dir, "{}_tables.txt".format(args.env)), "a+") as f:
        print(" & ".join(str(x) for x in values), file=f)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/examples/multi_step_mdp.py:
--------------------------------------------------------------------------------
import argparse
import datetime
import gym
import os

import pic
import multiprocessing as mp
import numpy as np
from multiprocessing import Pool
from scipy.special import expit as sigmoid, logit


_basic_columns = (
    "episodes",
    "mutual_information",
    "marginal",
    "conditional",
    "mean",
    "min",
    "max",
    "var",
    "eval_reward_mean",
    "prior_mean_0",
    "prior_mean_1",
    "prior_mean_2",
)


class Agent(object):
    def __init__(self, state_dim=3):
        self.param = np.zeros(3)

    def set_weight(self, weight):
        self.param = weight  # (3, )

    def sample(self, state):
        b = np.random.uniform(0, 1, 1)
        theta_s = np.dot(self.param.T, state)
        action = int(1 - (theta_s > logit(b)).astype('int'))
        return action


class MultiSampler(object):
    def __init__(self, env, agent):
        self.env = env
        self.agent = agent

    def run_episode(self, index):
        obs = self.env.reset()
        score = 0
        steps = 0
        done = False
        while not done:
            action = self.agent.sample(obs)
            obs, reward, done, _ = self.env.step(action)
            score += reward
        return score

    def set_weight(self, weight):
        self.agent.set_weight(weight)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--prior_sigma", type=float, default=1.0, help=r"std of p(\theta)")
    parser.add_argument("--prior_mean", type=float, default=0.0, help=r"mean of p(\theta)")
    parser.add_argument("--horizon", type=int, default=1, help="horizon of MDP")
    parser.add_argument("--multiprocess", type=int, default=1, help="number of processes for distributed experiments")
    parser.add_argument("--population_size", type=int, default=1000, help="population size for optimization")
    parser.add_argument("--episodes_per_param", type=int, default=1000, help="the number of episodes per parameter")
    parser.add_argument("--iterations", type=int, default=100, help="the number of iterations for optimization")
    parser.add_argument("--learning_rate", type=float, default=0.5, help="learning rate of the parameter")
    parser.add_argument("--decay", type=float, default=0.999, help="decay of learning rate")
    parser.add_argument("--output", type=str, default='./outputs/')
    parser.add_argument("--seed", type=int, default=0, help="random seed")
    args = parser.parse_args()

    np.random.seed(seed=args.seed)

    # save dir
timestamp = datetime.datetime.now().strftime("%Y%m%d-%H-%M-%S") 82 | output_dir = os.path.join( 83 | args.output, 84 | 'multi_step_es_horizon{}'.format(args.horizon), 85 | 'ps{}_pm{}_pop{}_ep{}_seed-{}'.format( 86 | args.prior_sigma, 87 | args.prior_mean, 88 | args.population_size, 89 | args.episodes_per_param, 90 | args.seed 91 | ), 92 | timestamp 93 | ) 94 | os.makedirs(output_dir, exist_ok=True) 95 | 96 | with open(os.path.join(output_dir, "scores.txt"), "w") as f: 97 | print("\t".join(_basic_columns), file=f) 98 | 99 | if args.horizon == 1: 100 | env = gym.make('OneStep-v0') 101 | elif args.horizon == 2: 102 | env = gym.make('TwoStep-v0') 103 | elif args.horizon == 3: 104 | env = gym.make('ThreeStep-v0') 105 | 106 | agent = Agent() 107 | multisampler = MultiSampler(env, agent) 108 | 109 | if args.multiprocess > 0: 110 | num_worker = mp.cpu_count() 111 | if args.multiprocess > num_worker: 112 | args.multiprocess = num_worker 113 | pool = Pool(args.multiprocess) 114 | print("num_worker: {}/{}".format(args.multiprocess, num_worker)) 115 | 116 | # sampling parameters 117 | prior_mu = np.array([args.prior_mean] * 3) 118 | for itr in range(args.iterations): 119 | # sampling parameters 120 | # [population_size, episodes_per_param] 121 | mu = np.ones((args.population_size, 3)) * prior_mu 122 | noise = np.random.randn(args.population_size, 3) 123 | theta = mu + args.prior_sigma * noise 124 | 125 | all_scores = [] 126 | all_scores_per_param = [] 127 | # simulate 128 | for population in theta: 129 | multisampler.set_weight(population) 130 | if args.multiprocess > 0: 131 | scores = pool.map(multisampler.run_episode, range(args.episodes_per_param)) 132 | assert len(scores) == args.episodes_per_param 133 | all_scores += scores 134 | all_scores_per_param.append(scores) 135 | 136 | all_scores = np.array(all_scores) 137 | all_scores_per_param = np.array(all_scores_per_param) 138 | 139 | p = all_scores.sum()/(args.population_size * args.episodes_per_param) 140 | marginal = -p * np.log(p + 1e-10) -(1 - p) * np.log((1 - p) + 1e-10) 141 | ps = all_scores_per_param.sum(axis=1) / args.episodes_per_param 142 | conditional = np.mean(-ps * np.log(ps + 1e-10) -(1 - ps) * np.log((1 - ps) + 1e-10)) 143 | mutual_info = marginal - conditional 144 | reward_mean = all_scores.mean() 145 | reward_variance = all_scores.var() 146 | reward_mean_min = all_scores_per_param.mean(axis=1).min() 147 | reward_mean_max = all_scores_per_param.mean(axis=1).max() 148 | 149 | # update 150 | reward_mean_over_ep = all_scores_per_param.mean(axis=1) 151 | std = reward_mean_over_ep.std() 152 | if std == 0: 153 | std = 1e-10 154 | # normalize 155 | reward_mean_over_ep = (reward_mean_over_ep - reward_mean) / std 156 | update_factor = 1. 
/ (args.population_size * args.prior_sigma) 157 | g = update_factor * np.dot(noise.T, reward_mean_over_ep).T 158 | prior_mu += args.learning_rate * g 159 | 160 | # evaluation 161 | n_eval = 100 162 | if args.multiprocess > 0: 163 | eval_reward = pool.map(multisampler.run_episode, range(n_eval)) 164 | assert len(eval_reward) == n_eval 165 | eval_reward_mean = np.array(eval_reward).mean() 166 | 167 | values = ( 168 | (itr + 1) * args.episodes_per_param * args.population_size, 169 | mutual_info, 170 | marginal, 171 | conditional, 172 | reward_mean, 173 | reward_mean_min, 174 | reward_mean_max, 175 | reward_variance, 176 | eval_reward_mean, 177 | prior_mu[0], 178 | prior_mu[1], 179 | prior_mu[2], 180 | ) 181 | with open(os.path.join(output_dir, "scores.txt"), "a+") as f: 182 | print("\t".join(str(x) for x in values), file=f) 183 | 184 | 185 | if __name__ == "__main__": 186 | main() 187 | -------------------------------------------------------------------------------- /examples/multi_step_mdp_optimality.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import gym 4 | import os 5 | import optuna 6 | 7 | import pic 8 | import multiprocessing as mp 9 | import numpy as np 10 | import scipy.stats as st 11 | from multiprocessing import Pool 12 | from scipy.special import expit as sigmoid, logit 13 | 14 | 15 | _basic_columns = ( 16 | "episodes", 17 | "mutual_infomation", 18 | "marginal", 19 | "conditional", 20 | "mean", 21 | "min", 22 | "max", 23 | "var", 24 | "eval_reward_mean", 25 | "prior_mean_0", 26 | "prior_mean_1", 27 | "prior_mean_2", 28 | ) 29 | 30 | 31 | class Agent(object): 32 | def __init__(self, state_dim=3): 33 | self.param = np.zeros(3) 34 | 35 | def set_weight(self, weight): 36 | self.param = weight # (3, ) 37 | 38 | def sample(self, state): 39 | b = np.random.uniform(0, 1, 1) 40 | theta_s = np.dot(self.param.T, state) 41 | action = int(1 - (theta_s > logit(b)).astype('int')) 42 | return action 43 | 44 | 45 | class MultiSampler(object): 46 | def __init__(self, env, agent): 47 | self.env = env 48 | self.agent = agent 49 | 50 | def run_episode(self, index): 51 | obs = self.env.reset() 52 | score = 0 53 | steps = 0 54 | done = False 55 | while not done: 56 | action = self.agent.sample(obs) 57 | obs, reward, done, _ = self.env.step(action) 58 | score += reward 59 | return score 60 | 61 | def set_weight(self, weight): 62 | self.agent.set_weight(weight) 63 | 64 | 65 | def main(): 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument("--prior_sigma", type=float, default=1.0, help="std of p(\theta)") 68 | parser.add_argument("--prior_mean", type=float, default=0.0, help="mean of p(\theta)") 69 | parser.add_argument("--horizon", type=int, default=1, help="horizon of MDP") 70 | parser.add_argument("--multiprocess", type=int, default=1, help="number of prosess for distrbuted experiments") 71 | parser.add_argument("--population_size", type=int, default=1000, help="number of population for optimization") 72 | parser.add_argument("--episodes_per_param", type=int, default=1000, help="the number of episodes per parameter") 73 | parser.add_argument("--mi_population_size", type=int, default=100, help="population for finite difference") 74 | parser.add_argument("--mi_coefficient", type=float, default=1.0, help="coefficient for MI") 75 | parser.add_argument("--mi_scale", type=float, default=1.0, help="scale parameter for finite difference") 76 | parser.add_argument("--n_trials", type=int, default=50) 77 | 
parser.add_argument("--iterations", type=int, default=100, help="the number of episodes for optimization") 78 | parser.add_argument("--learning_rate", type=float, default=0.5, help="learning rate of the parameter") 79 | parser.add_argument("--decay", type=float, default=0.999, help="decay of learning rate") 80 | parser.add_argument("--output", type=str, default='./outputs/') 81 | parser.add_argument("--seed", type=int, default=0, help="random seed") 82 | args = parser.parse_args() 83 | 84 | np.random.seed(seed=args.seed) 85 | 86 | # save dir 87 | timestamp = datetime.datetime.now().strftime("%Y%m%d-%H-%M-%S") 88 | output_dir = os.path.join( 89 | args.output, 90 | 'optimal_multi_step_es_horizon{}'.format(args.horizon), 91 | 'ps{}_pm{}_pop{}_ep{}_seed-{}'.format(args.prior_sigma, args.prior_mean, args.population_size, args.episodes_per_param, args.seed), 92 | timestamp 93 | ) 94 | os.makedirs(output_dir, exist_ok=True) 95 | 96 | with open(os.path.join(output_dir, "scores.txt"), "w") as f: 97 | print("\t".join(_basic_columns), file=f) 98 | 99 | if args.horizon == 1: 100 | env = gym.make('OneStep-v0') 101 | elif args.horizon == 2: 102 | env = gym.make('TwoStep-v0') 103 | elif args.horizon == 3: 104 | env = gym.make('ThreeStep-v0') 105 | 106 | agent = Agent() 107 | multisampler = MultiSampler(env, agent) 108 | 109 | if args.multiprocess > 0: 110 | num_worker = mp.cpu_count() 111 | if args.multiprocess > num_worker: 112 | args.multiprocess = num_worker 113 | pool = Pool(args.multiprocess) 114 | print("num_worker: {}/{}".format(args.multiprocess, num_worker)) 115 | 116 | # sampling parameters 117 | prior_mu = np.array([args.prior_mean] * 3) 118 | for itr in range(args.iterations): 119 | # sampling parameters 120 | # [population_size, episodes_per_param] 121 | mu = np.ones((args.population_size, 3)) * prior_mu 122 | noise = np.random.randn(args.population_size, 3) 123 | theta = mu + args.prior_sigma * noise 124 | p_theta = st.multivariate_normal(mean=prior_mu, cov=args.prior_sigma) 125 | 126 | all_scores = [] 127 | all_scores_per_param = [] 128 | # simulate 129 | for population in theta: 130 | multisampler.set_weight(population) 131 | if args.multiprocess > 0: 132 | scores = pool.map(multisampler.run_episode, range(args.episodes_per_param)) 133 | assert len(scores) == args.episodes_per_param 134 | all_scores += scores 135 | all_scores_per_param.append(scores) 136 | 137 | all_scores = np.array(all_scores) 138 | all_scores_per_param = np.array(all_scores_per_param) 139 | 140 | r_max = all_scores.max() 141 | 142 | def objective(trial): 143 | temperature = trial.suggest_loguniform('temperature', 1e-4, 2e4) 144 | p_o1 = np.exp((all_scores-r_max)/temperature).mean() 145 | p_o1_ts = np.exp((all_scores_per_param-r_max)/temperature).mean(axis=1) 146 | marginal = -p_o1*np.log(p_o1 + 1e-12) - (1-p_o1)*np.log(1-p_o1 + 1e-12) 147 | conditional = np.mean(-p_o1_ts*np.log(p_o1_ts + 1e-12) - (1-p_o1_ts)*np.log(1-p_o1_ts + 1e-12)) 148 | mutual_information = marginal - conditional 149 | 150 | return mutual_information 151 | 152 | study = optuna.create_study(direction='maximize') 153 | study.optimize(objective, n_trials=args.n_trials) 154 | 155 | trial = study.best_trial 156 | mutual_info = trial.value 157 | temperature = trial.params['temperature'] 158 | p_o1 = np.exp((all_scores-r_max)/temperature).mean() 159 | p_o1_ts = np.exp((all_scores_per_param-r_max)/temperature).mean(axis=1) 160 | marginal = -p_o1*np.log(p_o1) - (1-p_o1)*np.log(1-p_o1) 161 | conditional = np.mean(-p_o1_ts*np.log(p_o1_ts + 1e-12) - 
(1-p_o1_ts)*np.log(1-p_o1_ts + 1e-12)) 162 | 163 | reward_mean = all_scores.mean() 164 | reward_variance = all_scores.var() 165 | reward_mean_min = all_scores_per_param.mean(axis=1).min() 166 | reward_mean_max = all_scores_per_param.mean(axis=1).max() 167 | # update 168 | reward_mean_over_ep = all_scores_per_param.mean(axis=1) 169 | std = reward_mean_over_ep.std() 170 | if std == 0: 171 | std = 1e-10 172 | # normalize 173 | reward_mean_over_ep = (reward_mean_over_ep - reward_mean) / std 174 | update_factor = 1. / (args.population_size * args.prior_sigma) 175 | g = update_factor * np.dot(noise.T, reward_mean_over_ep).T 176 | prior_mu += args.learning_rate * g 177 | 178 | # evaluation 179 | n_eval = 100 180 | if args.multiprocess > 0: 181 | eval_reward = pool.map(multisampler.run_episode, range(n_eval)) 182 | assert len(eval_reward) == n_eval 183 | eval_reward_mean = np.array(eval_reward).mean() 184 | 185 | values = ( 186 | (itr + 1) * args.episodes_per_param * args.population_size, 187 | mutual_info, 188 | marginal, 189 | conditional, 190 | reward_mean, 191 | reward_mean_min, 192 | reward_mean_max, 193 | reward_variance, 194 | eval_reward_mean, 195 | prior_mu[0], 196 | prior_mu[1], 197 | prior_mu[2], 198 | ) 199 | with open(os.path.join(output_dir, "scores.txt"), "a+") as f: 200 | print("\t".join(str(x) for x in values), file=f) 201 | 202 | 203 | if __name__ == "__main__": 204 | main() 205 | -------------------------------------------------------------------------------- /examples/random_sampling.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | 5 | from pic.algos import NumpyAgent 6 | from pic.sampler import Sampler, make_env 7 | 8 | 9 | def main(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--env", type=str, default="CartPole-v0", help="Open AI gym environments") 12 | parser.add_argument("--n_units", type=int, default=64, help="number of hidden units") 13 | parser.add_argument("--n_layers", type=int, default=2, help="number of hidden layers") 14 | parser.add_argument("--use_bias", action="store_true", help="use bias in NN") 15 | parser.add_argument("--n_samples", type=int, default=10**4, help="number of parameters sampled from p(\theta)") 16 | parser.add_argument("--n_episodes", type=int, default=1000, help="number of episode running with parameter \theta") 17 | parser.add_argument("--root_dir", type=str, default='./results/', help="Root dir to save results") 18 | parser.add_argument("--random_dist", type=str, choices=['normal', 'uniform', 'xavier_uniform', 'xavier_normal'], default='normal', help="prior distribution of p(\theta)") 19 | parser.add_argument("--normal_mean", type=float, default=0.0, help="The mean of prior distribution") 20 | parser.add_argument("--normal_sigma", type=float, default=1.0, help="The sigma of prior distribution") 21 | parser.add_argument("--uniform_bound", type=float, default=1.0, help="The bound of prior distribution") 22 | parser.add_argument("--multiprocess", type=int, default=0, help="number of prosess for distrbuted experiments") 23 | args = parser.parse_args() 24 | 25 | sample_env = make_env(args.env, seed=None) 26 | max_episode_steps = sample_env.spec.max_episode_steps 27 | 28 | agent = NumpyAgent( 29 | env=sample_env, 30 | n_hidden_layers=args.n_layers, 31 | n_hidden_units=args.n_units, 32 | random_dist=args.random_dist, 33 | normal_mean=args.normal_mean, 34 | normal_sigma=args.normal_sigma, 35 | uniform_bound=args.uniform_bound, 36 | 
use_bias=args.use_bias, 37 | env_name=args.env, 38 | ) 39 | 40 | # save dir 41 | output_dir = os.path.join( 42 | args.root_dir, 43 | 'dist_{}_layers{}_units{}'.format(args.random_dist, args.n_layers, args.n_units) 44 | ) 45 | os.makedirs(output_dir, exist_ok=True) 46 | 47 | sampler = Sampler( 48 | args.env, 49 | agent, 50 | max_episode_steps, 51 | n_samples=args.n_samples, 52 | n_episodes=args.n_episodes, 53 | multiprocess=args.multiprocess, 54 | ) 55 | all_scores_per_param = sampler.sample() 56 | np.save(os.path.join(output_dir, "{}.npy".format(args.env)), all_scores_per_param) 57 | 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /pic/__init__.py: -------------------------------------------------------------------------------- 1 | from pic import nn # NOQA 2 | from pic import algos # NOQA 3 | from pic import gym # NOQA 4 | from pic import sampler # NOQA 5 | -------------------------------------------------------------------------------- /pic/algos/__init__.py: -------------------------------------------------------------------------------- 1 | from pic.algos.numpyagent import NumpyAgent # NOQA 2 | -------------------------------------------------------------------------------- /pic/algos/numpyagent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from pic.nn import NumpyMLP 4 | 5 | 6 | class NumpyAgent: 7 | def __init__( 8 | self, 9 | env, 10 | n_hidden_layers=2, 11 | n_hidden_units=4, 12 | random_dist="normal", 13 | normal_mean=0.0, 14 | normal_sigma=1.0, 15 | uniform_bound=1.0, 16 | act_fn="tanh", 17 | use_bias=False, 18 | env_name='CartPole-v0', 19 | policy_type="deterministic", 20 | ): 21 | self.env_name = env_name 22 | 23 | if not ("dm2gym" in self.env_name): 24 | self.n_inputs = env.reset().size 25 | else: # for dm2gym 26 | self.n_inputs = env.reset()['observations'].size 27 | 28 | self.policy_type = policy_type 29 | assert self.policy_type in ["deterministic", "stochastic"] 30 | 31 | if type(env.action_space).__name__ == "Discrete": 32 | self.action_space_type = "discrete" 33 | self.n_actions = env.action_space.n 34 | self.n_outputs = self.n_actions 35 | if self.policy_type == "stochastic": 36 | self.output_fn = self.discrete_dist_sample 37 | else: 38 | self.output_fn = np.argmax 39 | elif type(env.action_space).__name__ == "Box": 40 | self.action_space_type = "continuous" 41 | self.action_scale = env.action_space.high.max() 42 | self.n_actions = env.action_space.shape[0] 43 | if self.policy_type == "stochastic": 44 | self.output_fn = self.continuous_dist_sample 45 | self.n_outputs = 2 * self.n_actions 46 | else: 47 | self.output_fn = self.scale_continuous_action 48 | self.n_outputs = self.n_actions 49 | 50 | self.nn = NumpyMLP( 51 | n_inputs=self.n_inputs, 52 | n_outputs=self.n_outputs, 53 | n_hidden_layers=n_hidden_layers, 54 | n_hidden_units=n_hidden_units, 55 | random_dist=random_dist, 56 | normal_mean=normal_mean, 57 | normal_sigma=normal_sigma, 58 | uniform_bound=uniform_bound, 59 | act_fn=act_fn, 60 | use_bias=use_bias 61 | ) 62 | 63 | def get_action(self, state): 64 | if ("dm2gym" in self.env_name): 65 | state = state['observations'] 66 | x = self.nn.forward(state) 67 | return self.output_fn(x) 68 | 69 | def scale_continuous_action(self, x): 70 | return self.action_scale * np.tanh(x) 71 | 72 | def discrete_dist_sample(self, x): 73 | softmax_x = np.exp(x) / np.exp(x).sum() 74 | return 
np.random.choice(list(range(self.N_actions)), p=softmax_x) 75 | 76 | def continuous_dist_sample(self, x): 77 | mus_NN = x[: self.n_actions] 78 | sigmas_NN = x[self.n_actions:] 79 | mus = np.tanh(mus_NN) * self.action_scale 80 | sigmas = np.log(1 + np.exp(sigmas_NN)) 81 | return np.random.normal(loc=mus, scale=sigmas) 82 | 83 | def init_weights(self): 84 | self.nn.init_weights() 85 | -------------------------------------------------------------------------------- /pic/gym/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | from pic.gym.reward_shaping.maze_model import U_MAZE 3 | 4 | 5 | register( 6 | id='OneStep-v0', 7 | entry_point='pic.gym.multi_step.multi_step:MultiStepEnv', 8 | kwargs={'horizon': 1}, 9 | ) 10 | 11 | register( 12 | id='TwoStep-v0', 13 | entry_point='pic.gym.multi_step.multi_step:MultiStepEnv', 14 | kwargs={'horizon': 2}, 15 | ) 16 | 17 | register( 18 | id='ThreeStep-v0', 19 | entry_point='pic.gym.multi_step.multi_step:MultiStepEnv', 20 | kwargs={'horizon': 3}, 21 | ) 22 | 23 | register( 24 | id='ReacherL1-v0', 25 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherL1Env', 26 | max_episode_steps=50, 27 | ) 28 | 29 | register( 30 | id='ReacherL2-v0', 31 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherL2Env', 32 | max_episode_steps=50, 33 | ) 34 | 35 | register( 36 | id='ReacherSparse-v0', 37 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherSparseEnv', 38 | max_episode_steps=50, 39 | ) 40 | 41 | register( 42 | id='ReacherFrac-v0', 43 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherFracEnv', 44 | max_episode_steps=50, 45 | ) 46 | 47 | register( 48 | id='ReacherL1_c05-v0', 49 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherL1Env', 50 | max_episode_steps=50, 51 | kwargs={'coefficent': 0.5}, 52 | ) 53 | 54 | register( 55 | id='ReacherL1_c20-v0', 56 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherL1Env', 57 | max_episode_steps=50, 58 | kwargs={'coefficent': 2.0}, 59 | ) 60 | 61 | register( 62 | id='ReacherL1_c50-v0', 63 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherL1Env', 64 | max_episode_steps=50, 65 | kwargs={'coefficent': 5.0}, 66 | ) 67 | 68 | register( 69 | id='ReacherL2_c05-v0', 70 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherL2Env', 71 | max_episode_steps=50, 72 | kwargs={'coefficent': 0.5}, 73 | ) 74 | 75 | register( 76 | id='ReacherL2_c20-v0', 77 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherL2Env', 78 | max_episode_steps=50, 79 | kwargs={'coefficent': 2.0}, 80 | ) 81 | 82 | register( 83 | id='ReacherL2_c50-v0', 84 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherL2Env', 85 | max_episode_steps=50, 86 | kwargs={'coefficent': 5.0}, 87 | ) 88 | 89 | register( 90 | id='ReacherSparse_d001-v0', 91 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherSparseEnv', 92 | max_episode_steps=50, 93 | kwargs={'distance_threshold': 0.01}, 94 | ) 95 | 96 | register( 97 | id='ReacherSparse_d01-v0', 98 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherSparseEnv', 99 | max_episode_steps=50, 100 | kwargs={'distance_threshold': 0.1}, 101 | ) 102 | 103 | register( 104 | id='ReacherSparse_d015-v0', 105 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherSparseEnv', 106 | max_episode_steps=50, 107 | kwargs={'distance_threshold': 0.15}, 108 | ) 109 | 110 | register( 111 | id='ReacherFrac_m01o01-v0', 112 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherFracEnv', 113 | 
max_episode_steps=50, 114 | kwargs={'multiplier': 0.1, 'offset': 0.1}, 115 | ) 116 | 117 | register( 118 | id='ReacherFrac_m001o001-v0', 119 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherFracEnv', 120 | max_episode_steps=50, 121 | kwargs={'multiplier': 0.01, 'offset': 0.01}, 122 | ) 123 | 124 | register( 125 | id='ReacherFrac_m005o01-v0', 126 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherFracEnv', 127 | max_episode_steps=50, 128 | kwargs={'multiplier': 0.05, 'offset': 0.1}, 129 | ) 130 | 131 | register( 132 | id='maze2d-umaze-negative_sparse-v0', 133 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 134 | max_episode_steps=150, 135 | kwargs={ 136 | 'reward_type': 'negative_sparse', 137 | 'distance_threshold': 0.5, 138 | } 139 | ) 140 | 141 | register( 142 | id='maze2d-umaze-densel2-v0', 143 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 144 | max_episode_steps=150, 145 | kwargs={ 146 | 'reward_type': 'densel2', 147 | 'coefficent': 1.0, 148 | } 149 | ) 150 | 151 | register( 152 | id='maze2d-umaze-densel1-v0', 153 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 154 | max_episode_steps=150, 155 | kwargs={ 156 | 'reward_type': 'densel1', 157 | 'coefficent': 1.0, 158 | } 159 | ) 160 | 161 | register( 162 | id='maze2d-umaze-frac-v0', 163 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 164 | max_episode_steps=150, 165 | kwargs={ 166 | 'reward_type': 'frac', 167 | 'multiplier': 0.01, 168 | 'offset': 0.1, 169 | } 170 | ) 171 | 172 | register( 173 | id='maze2d-umaze-negative_sparse_d10-v0', 174 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 175 | max_episode_steps=150, 176 | kwargs={ 177 | 'reward_type': 'negative_sparse', 178 | 'distance_threshold': 1.0, 179 | } 180 | ) 181 | 182 | register( 183 | id='maze2d-umaze-negative_sparse_d01-v0', 184 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 185 | max_episode_steps=150, 186 | kwargs={ 187 | 'reward_type': 'negative_sparse', 188 | 'distance_threshold': 0.1, 189 | } 190 | ) 191 | 192 | register( 193 | id='maze2d-umaze-negative_sparse_d02-v0', 194 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 195 | max_episode_steps=150, 196 | kwargs={ 197 | 'reward_type': 'negative_sparse', 198 | 'distance_threshold': 0.2, 199 | } 200 | ) 201 | 202 | register( 203 | id='maze2d-umaze-densel1_c05-v0', 204 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 205 | max_episode_steps=150, 206 | kwargs={ 207 | 'reward_type': 'densel1', 208 | 'coefficent': 0.5, 209 | } 210 | ) 211 | 212 | register( 213 | id='maze2d-umaze-densel1_c50-v0', 214 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 215 | max_episode_steps=150, 216 | kwargs={ 217 | 'reward_type': 'densel1', 218 | 'coefficent': 5.0, 219 | } 220 | ) 221 | 222 | register( 223 | id='maze2d-umaze-densel1_c20-v0', 224 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 225 | max_episode_steps=150, 226 | kwargs={ 227 | 'reward_type': 'densel1', 228 | 'coefficent': 2.0, 229 | } 230 | ) 231 | 232 | register( 233 | id='maze2d-umaze-densel2_c05-v0', 234 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 235 | max_episode_steps=150, 236 | kwargs={ 237 | 'reward_type': 'densel2', 238 | 'coefficent': 0.5, 239 | } 240 | ) 241 | 242 | register( 243 | id='maze2d-umaze-densel2_c50-v0', 244 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 245 | max_episode_steps=150, 246 | kwargs={ 247 | 'reward_type': 'densel2', 248 | 'coefficent': 5.0, 249 | } 250 | ) 251 | 252 | register( 253 | 
id='maze2d-umaze-densel2_c20-v0', 254 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 255 | max_episode_steps=150, 256 | kwargs={ 257 | 'reward_type': 'densel2', 258 | 'coefficent': 2.0, 259 | } 260 | ) 261 | 262 | register( 263 | id='maze2d-umaze-frac_m01o01-v0', 264 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 265 | max_episode_steps=150, 266 | kwargs={ 267 | 'reward_type': 'frac', 268 | 'multiplier': 0.1, 269 | 'offset': 0.1, 270 | } 271 | ) 272 | 273 | register( 274 | id='maze2d-umaze-frac_m001o001-v0', 275 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 276 | max_episode_steps=150, 277 | kwargs={ 278 | 'reward_type': 'frac', 279 | 'multiplier': 0.01, 280 | 'offset': 0.01, 281 | } 282 | ) 283 | 284 | register( 285 | id='maze2d-umaze-frac_m005o01-v0', 286 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 287 | max_episode_steps=150, 288 | kwargs={ 289 | 'reward_type': 'frac', 290 | 'multiplier': 0.05, 291 | 'offset': 0.1, 292 | } 293 | ) 294 | 295 | register( 296 | id='CartPoleNoise-v0', 297 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 298 | max_episode_steps=200, 299 | kwargs={ 300 | 'noise_type': 'uniform', 301 | 'noise_scale': 0.0, 302 | 'init_scale': 0.0, 303 | } 304 | ) 305 | 306 | register( 307 | id='HalfCheetahNoise-v2', 308 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 309 | max_episode_steps=1000, 310 | kwargs={ 311 | 'noise_type': 'uniform', 312 | 'noise_scale': 0.0, 313 | 'init_scale': 0.0, 314 | } 315 | ) 316 | 317 | register( 318 | id='HumanoidNoise-v2', 319 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 320 | max_episode_steps=1000, 321 | kwargs={ 322 | 'noise_type': 'uniform', 323 | 'noise_scale': 0.0, 324 | 'init_scale': 0.0, 325 | } 326 | ) 327 | 328 | # noisy_dynamics 329 | # CartPole 330 | register( 331 | id='CartPoleNoiseInit005Dynamics003-v0', 332 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 333 | max_episode_steps=200, 334 | kwargs={ 335 | 'noise_type': 'uniform', 336 | 'noise_scale': 0.03, 337 | 'init_scale': 0.05, 338 | } 339 | ) 340 | 341 | register( 342 | id='CartPoleNoiseInit005Dynamics005-v0', 343 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 344 | max_episode_steps=200, 345 | kwargs={ 346 | 'noise_type': 'uniform', 347 | 'noise_scale': 0.05, 348 | 'init_scale': 0.05, 349 | } 350 | ) 351 | 352 | register( 353 | id='CartPoleNoiseInit005Dynamics010-v0', 354 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 355 | max_episode_steps=200, 356 | kwargs={ 357 | 'noise_type': 'uniform', 358 | 'noise_scale': 0.1, 359 | 'init_scale': 0.05, 360 | } 361 | ) 362 | 363 | register( 364 | id='CartPoleNoiseInit010Dynamics000-v0', 365 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 366 | max_episode_steps=200, 367 | kwargs={ 368 | 'noise_type': 'uniform', 369 | 'noise_scale': 0.0, 370 | 'init_scale': 0.1, 371 | } 372 | ) 373 | 374 | register( 375 | id='CartPoleNoiseInit010Dynamics003-v0', 376 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 377 | max_episode_steps=200, 378 | kwargs={ 379 | 'noise_type': 'uniform', 380 | 'noise_scale': 0.03, 381 | 'init_scale': 0.1, 382 | } 383 | ) 384 | 385 | register( 386 | id='CartPoleNoiseInit010Dynamics005-v0', 387 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 388 | max_episode_steps=200, 389 | kwargs={ 390 | 'noise_type': 'uniform', 391 | 'noise_scale': 0.05, 392 | 
'init_scale': 0.1, 393 | } 394 | ) 395 | 396 | register( 397 | id='CartPoleNoiseInit010Dynamics010-v0', 398 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 399 | max_episode_steps=200, 400 | kwargs={ 401 | 'noise_type': 'uniform', 402 | 'noise_scale': 0.1, 403 | 'init_scale': 0.1, 404 | } 405 | ) 406 | 407 | register( 408 | id='CartPoleNoiseInit015Dynamics000-v0', 409 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 410 | max_episode_steps=200, 411 | kwargs={ 412 | 'noise_type': 'uniform', 413 | 'noise_scale': 0.0, 414 | 'init_scale': 0.15, 415 | } 416 | ) 417 | 418 | register( 419 | id='CartPoleNoiseInit015Dynamics003-v0', 420 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 421 | max_episode_steps=200, 422 | kwargs={ 423 | 'noise_type': 'uniform', 424 | 'noise_scale': 0.03, 425 | 'init_scale': 0.15, 426 | } 427 | ) 428 | 429 | register( 430 | id='CartPoleNoiseInit015Dynamics005-v0', 431 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 432 | max_episode_steps=200, 433 | kwargs={ 434 | 'noise_type': 'uniform', 435 | 'noise_scale': 0.05, 436 | 'init_scale': 0.15, 437 | } 438 | ) 439 | 440 | register( 441 | id='CartPoleNoiseInit015Dynamics010-v0', 442 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 443 | max_episode_steps=200, 444 | kwargs={ 445 | 'noise_type': 'uniform', 446 | 'noise_scale': 0.1, 447 | 'init_scale': 0.15, 448 | } 449 | ) 450 | 451 | # HalfCheetah 452 | register( 453 | id='HalfCheetahNoiseInit010Dynamics003-v2', 454 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 455 | max_episode_steps=1000, 456 | kwargs={ 457 | 'noise_type': 'uniform', 458 | 'noise_scale': 0.03, 459 | 'init_scale': 0.1, 460 | } 461 | ) 462 | 463 | register( 464 | id='HalfCheetahNoiseInit010Dynamics005-v2', 465 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 466 | max_episode_steps=1000, 467 | kwargs={ 468 | 'noise_type': 'uniform', 469 | 'noise_scale': 0.05, 470 | 'init_scale': 0.1, 471 | } 472 | ) 473 | 474 | register( 475 | id='HalfCheetahNoiseInit010Dynamics010-v2', 476 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 477 | max_episode_steps=1000, 478 | kwargs={ 479 | 'noise_type': 'uniform', 480 | 'noise_scale': 0.1, 481 | 'init_scale': 0.1, 482 | } 483 | ) 484 | 485 | register( 486 | id='HalfCheetahNoiseInit030Dynamics000-v2', 487 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 488 | max_episode_steps=1000, 489 | kwargs={ 490 | 'noise_type': 'uniform', 491 | 'noise_scale': 0.00, 492 | 'init_scale': 0.3, 493 | } 494 | ) 495 | 496 | register( 497 | id='HalfCheetahNoiseInit030Dynamics003-v2', 498 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 499 | max_episode_steps=1000, 500 | kwargs={ 501 | 'noise_type': 'uniform', 502 | 'noise_scale': 0.03, 503 | 'init_scale': 0.3, 504 | } 505 | ) 506 | 507 | register( 508 | id='HalfCheetahNoiseInit030Dynamics005-v2', 509 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 510 | max_episode_steps=1000, 511 | kwargs={ 512 | 'noise_type': 'uniform', 513 | 'noise_scale': 0.05, 514 | 'init_scale': 0.3, 515 | } 516 | ) 517 | 518 | register( 519 | id='HalfCheetahNoiseInit030Dynamics010-v2', 520 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 521 | max_episode_steps=1000, 522 | kwargs={ 523 | 'noise_type': 'uniform', 524 | 'noise_scale': 0.1, 525 | 
'init_scale': 0.3, 526 | } 527 | ) 528 | 529 | register( 530 | id='HalfCheetahNoiseInit050Dynamics000-v2', 531 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 532 | max_episode_steps=1000, 533 | kwargs={ 534 | 'noise_type': 'uniform', 535 | 'noise_scale': 0.00, 536 | 'init_scale': 0.5, 537 | } 538 | ) 539 | 540 | register( 541 | id='HalfCheetahNoiseInit050Dynamics003-v2', 542 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 543 | max_episode_steps=1000, 544 | kwargs={ 545 | 'noise_type': 'uniform', 546 | 'noise_scale': 0.03, 547 | 'init_scale': 0.5, 548 | } 549 | ) 550 | 551 | register( 552 | id='HalfCheetahNoiseInit050Dynamics005-v2', 553 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 554 | max_episode_steps=1000, 555 | kwargs={ 556 | 'noise_type': 'uniform', 557 | 'noise_scale': 0.05, 558 | 'init_scale': 0.5, 559 | } 560 | ) 561 | 562 | register( 563 | id='HalfCheetahNoiseInit050Dynamics010-v2', 564 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 565 | max_episode_steps=1000, 566 | kwargs={ 567 | 'noise_type': 'uniform', 568 | 'noise_scale': 0.1, 569 | 'init_scale': 0.5, 570 | } 571 | ) 572 | 573 | # Humanoid 574 | register( 575 | id='HumanoidNoiseInit001Dynamics003-v2', 576 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 577 | max_episode_steps=1000, 578 | kwargs={ 579 | 'noise_type': 'uniform', 580 | 'noise_scale': 0.03, 581 | 'init_scale': 0.01, 582 | } 583 | ) 584 | 585 | register( 586 | id='HumanoidNoiseInit001Dynamics005-v2', 587 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 588 | max_episode_steps=1000, 589 | kwargs={ 590 | 'noise_type': 'uniform', 591 | 'noise_scale': 0.05, 592 | 'init_scale': 0.01, 593 | } 594 | ) 595 | 596 | register( 597 | id='HumanoidNoiseInit001Dynamics010-v2', 598 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 599 | max_episode_steps=1000, 600 | kwargs={ 601 | 'noise_type': 'uniform', 602 | 'noise_scale': 0.1, 603 | 'init_scale': 0.01, 604 | } 605 | ) 606 | 607 | register( 608 | id='HumanoidNoiseInit003Dynamics000-v2', 609 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 610 | max_episode_steps=1000, 611 | kwargs={ 612 | 'noise_type': 'uniform', 613 | 'noise_scale': 0.0, 614 | 'init_scale': 0.03, 615 | } 616 | ) 617 | 618 | register( 619 | id='HumanoidNoiseInit003Dynamics003-v2', 620 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 621 | max_episode_steps=1000, 622 | kwargs={ 623 | 'noise_type': 'uniform', 624 | 'noise_scale': 0.03, 625 | 'init_scale': 0.03, 626 | } 627 | ) 628 | 629 | register( 630 | id='HumanoidNoiseInit003Dynamics005-v2', 631 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 632 | max_episode_steps=1000, 633 | kwargs={ 634 | 'noise_type': 'uniform', 635 | 'noise_scale': 0.05, 636 | 'init_scale': 0.03, 637 | } 638 | ) 639 | 640 | register( 641 | id='HumanoidNoiseInit003Dynamics010-v2', 642 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 643 | max_episode_steps=1000, 644 | kwargs={ 645 | 'noise_type': 'uniform', 646 | 'noise_scale': 0.1, 647 | 'init_scale': 0.03, 648 | } 649 | ) 650 | 651 | register( 652 | id='HumanoidNoiseInit005Dynamics000-v2', 653 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 654 | max_episode_steps=1000, 655 | kwargs={ 656 | 'noise_type': 'uniform', 657 | 'noise_scale': 0.0, 658 | 'init_scale': 0.05, 659 | } 660 
| ) 661 | 662 | register( 663 | id='HumanoidNoiseInit005Dynamics003-v2', 664 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 665 | max_episode_steps=1000, 666 | kwargs={ 667 | 'noise_type': 'uniform', 668 | 'noise_scale': 0.03, 669 | 'init_scale': 0.05, 670 | } 671 | ) 672 | 673 | register( 674 | id='HumanoidNoiseInit005Dynamics005-v2', 675 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 676 | max_episode_steps=1000, 677 | kwargs={ 678 | 'noise_type': 'uniform', 679 | 'noise_scale': 0.05, 680 | 'init_scale': 0.05, 681 | } 682 | ) 683 | 684 | register( 685 | id='HumanoidNoiseInit005Dynamics010-v2', 686 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 687 | max_episode_steps=1000, 688 | kwargs={ 689 | 'noise_type': 'uniform', 690 | 'noise_scale': 0.1, 691 | 'init_scale': 0.05, 692 | } 693 | ) 694 | -------------------------------------------------------------------------------- /pic/gym/multi_step/multi_step.py: -------------------------------------------------------------------------------- 1 | from gym import Env 2 | from gym.spaces import Discrete, Box 3 | from gym.utils import seeding 4 | 5 | import numpy as np 6 | 7 | 8 | class MultiStepEnv(Env): 9 | def __init__(self, horizon=1): 10 | self.horizon = horizon 11 | self.action_space = Discrete(2) 12 | self.observation_space = Box(low=0, high=1, shape=(3,)) 13 | self.state_dict = { 14 | 's_1': np.array([1, 0, 0]), 15 | 's_2': np.array([0, 1, 0]), 16 | 's_3': np.array([0, 0, 1]), 17 | 's_4': np.array([1, 1, 1]), 18 | 's_5': np.array([0, 0, 0]), 19 | } 20 | self.seed() 21 | self.reset() 22 | 23 | def seed(self, seed=None): 24 | self.np_random, seed = seeding.np_random(seed) 25 | return [seed] 26 | 27 | def reset(self): 28 | self.state = 's_1' 29 | self.time_step = 0 30 | return self.state_dict[self.state] 31 | 32 | def step(self, action): 33 | assert self.action_space.contains(action) 34 | reward = 0.0 35 | if self.state == 's_1': 36 | if action == 0: 37 | self.state = 's_2' 38 | elif action == 1: 39 | self.state = 's_3' 40 | elif self.state == 's_2': 41 | if action == 1: 42 | self.state = 's_4' 43 | elif self.state =='s_4': 44 | if action == 0: 45 | self.state = 's_5' 46 | 47 | self.time_step += 1 48 | if self.time_step == self.horizon: 49 | done = True 50 | if self.state == 's_2' and self.time_step == 1: 51 | reward = 1.0 52 | elif self.state == 's_4' and self.time_step == 2: 53 | reward = 1.0 54 | elif self.state == 's_5' and self.time_step == 3: 55 | reward = 1.0 56 | else: 57 | done = False 58 | 59 | return self.state_dict[self.state], reward, done, {'state': self.state} 60 | 61 | 62 | if __name__ == "__main__": 63 | # test 64 | env = MultiStepEnv(horizon=1) 65 | print('=======horizon1=======') 66 | for i in range(2): 67 | print(i) 68 | obs = env.reset() 69 | obs, reward, done, info = env.step(i) 70 | print('======================') 71 | print('obs: {}'.format(obs)) 72 | print('reward: {}'.format(reward)) 73 | print('done: {}'.format(done)) 74 | print('state: {}'.format(info['state'])) 75 | print('======================') 76 | 77 | env = MultiStepEnv(horizon=2) 78 | for actions in [[0,0], [0,1], [1,0], [1,1]]: 79 | print('=======horizon2=======') 80 | print(actions) 81 | obs = env.reset() 82 | for i in range(2): 83 | obs, reward, done, info = env.step(actions[i]) 84 | print('======================') 85 | print('obs: {}'.format(obs)) 86 | print('reward: {}'.format(reward)) 87 | print('done: {}'.format(done)) 88 | print('state: {}'.format(info['state'])) 89 | 
print('======================') 90 | 91 | env = MultiStepEnv(horizon=3) 92 | for actions in [[0,0,0], [0,0,1], [0,1,0], [1,0,0], [0,1,1], [1,0,1], [1,1,0], [1,1,1]]: 93 | print('=======horizon3=======') 94 | print(actions) 95 | obs = env.reset() 96 | for i in range(3): 97 | obs, reward, done, info = env.step(actions[i]) 98 | print('======================') 99 | print('obs: {}'.format(obs)) 100 | print('reward: {}'.format(reward)) 101 | print('done: {}'.format(done)) 102 | print('state: {}'.format(info['state'])) 103 | print('======================') 104 | -------------------------------------------------------------------------------- /pic/gym/noisy_dynamics/cartpole_noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from gym.envs.classic_control import CartPoleEnv 4 | 5 | 6 | class CartPoleNoiseEnv(CartPoleEnv): 7 | def __init__(self, noise_type='uniform', noise_scale=0.0, init_scale=0.0): 8 | self.noise_type = noise_type 9 | assert self.noise_type in ['normal', 'uniform'] 10 | self.noise_scale = noise_scale 11 | self.init_scale = init_scale 12 | 13 | CartPoleEnv.__init__(self) 14 | 15 | def step(self, action): 16 | err_msg = "%r (%s) invalid" % (action, type(action)) 17 | assert self.action_space.contains(action), err_msg 18 | 19 | x, x_dot, theta, theta_dot = self.state 20 | force = self.force_mag if action == 1 else -self.force_mag 21 | costheta = np.cos(theta) 22 | sintheta = np.sin(theta) 23 | 24 | temp = (force + self.polemass_length * theta_dot ** 2 * sintheta) / self.total_mass 25 | thetaacc = (self.gravity * sintheta - costheta * temp) / (self.length * (4.0 / 3.0 - self.masspole * costheta ** 2 / self.total_mass)) 26 | xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass 27 | 28 | if self.noise_scale == 0.0: 29 | noise = np.zeros((1,)) 30 | elif self.noise_type == 'normal': 31 | noise = np.random.normal(loc=0., scale=self.noise_scale, size=1) 32 | elif self.noise_type == 'uniform': 33 | noise = np.random.uniform(-self.noise_scale, self.noise_scale, 1) 34 | 35 | if self.kinematics_integrator == 'euler': 36 | x = x + self.tau * x_dot 37 | x_dot = x_dot + self.tau * xacc 38 | theta = theta + self.tau * theta_dot 39 | theta_dot = theta_dot + self.tau * thetaacc + noise[0] 40 | else: # semi-implicit euler 41 | x_dot = x_dot + self.tau * xacc 42 | x = x + self.tau * x_dot 43 | theta_dot = theta_dot + self.tau * thetaacc + noise[0] 44 | theta = theta + self.tau * theta_dot 45 | 46 | self.state = (x, x_dot, theta, theta_dot) 47 | 48 | done = bool( 49 | x < -self.x_threshold 50 | or x > self.x_threshold 51 | or theta < -self.theta_threshold_radians 52 | or theta > self.theta_threshold_radians 53 | ) 54 | 55 | if not done: 56 | reward = 1.0 57 | elif self.steps_beyond_done is None: 58 | self.steps_beyond_done = 0 59 | reward = 1.0 60 | else: 61 | self.steps_beyond_done += 1 62 | reward = 0.0 63 | 64 | return np.array(self.state), reward, done, {} 65 | 66 | def reset(self): 67 | # original: low=-0.05, high=0.05 68 | self.state = self.np_random.uniform(low=-self.init_scale, high=self.init_scale, size=(4,)) 69 | self.steps_beyond_done = None 70 | return np.array(self.state) 71 | -------------------------------------------------------------------------------- /pic/gym/noisy_dynamics/halfcheetah_noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco import HalfCheetahEnv 3 | 4 | 5 | class 
HalfCheetahNoiseEnv(HalfCheetahEnv): 6 | def __init__(self, noise_type='uniform', noise_scale=0.0, init_scale=0.0): 7 | self.noise_type = noise_type 8 | assert self.noise_type in ['normal', 'uniform'] 9 | self.noise_scale = noise_scale 10 | self.init_scale = init_scale 11 | 12 | HalfCheetahEnv.__init__(self) 13 | 14 | def step(self, action): 15 | xposbefore = self.sim.data.qpos[0] 16 | 17 | # noise 18 | if self.noise_scale == 0.0: 19 | noise = np.zeros((1,)) 20 | elif self.noise_type == 'normal': 21 | noise = np.random.normal(loc=0., scale=self.noise_scale, size=1) 22 | elif self.noise_type == 'uniform': 23 | noise = np.random.uniform(-self.noise_scale, self.noise_scale, 1) 24 | self.do_simulation(action, self.frame_skip) 25 | # add noise 26 | self.sim.data.qvel[0] += noise[0] 27 | 28 | xposafter = self.sim.data.qpos[0] 29 | ob = self._get_obs() 30 | reward_ctrl = - 0.1 * np.square(action).sum() 31 | reward_run = (xposafter - xposbefore)/self.dt 32 | reward = reward_ctrl + reward_run 33 | done = False 34 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 35 | 36 | def _get_obs(self): 37 | return np.concatenate([ 38 | self.sim.data.qpos.flat[1:], 39 | self.sim.data.qvel.flat, 40 | ]) 41 | 42 | def reset_model(self): 43 | # original: self.init_scale=0.1 44 | qpos = self.init_qpos + self.np_random.uniform(low=-self.init_scale, high=self.init_scale, size=self.model.nq) 45 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * self.init_scale 46 | self.set_state(qpos, qvel) 47 | return self._get_obs() 48 | -------------------------------------------------------------------------------- /pic/gym/noisy_dynamics/humanoid_noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco import HumanoidEnv 3 | 4 | 5 | def mass_center(model, sim): 6 | mass = np.expand_dims(model.body_mass, 1) 7 | xpos = sim.data.xipos 8 | return (np.sum(mass * xpos, 0) / np.sum(mass))[0] 9 | 10 | 11 | class HumanoidNoiseEnv(HumanoidEnv): 12 | def __init__(self, noise_type='uniform', noise_scale=0.0, init_scale=0.0): 13 | self.noise_type = noise_type 14 | assert self.noise_type in ['normal', 'uniform'] 15 | self.noise_scale = noise_scale 16 | self.init_scale = init_scale 17 | 18 | HumanoidEnv.__init__(self) 19 | 20 | def _get_obs(self): 21 | data = self.sim.data 22 | return np.concatenate( 23 | [ 24 | data.qpos.flat[2:], 25 | data.qvel.flat, 26 | data.cinert.flat, 27 | data.cvel.flat, 28 | data.qfrc_actuator.flat, 29 | data.cfrc_ext.flat 30 | ] 31 | ) 32 | 33 | def step(self, a): 34 | pos_before = mass_center(self.model, self.sim) 35 | 36 | # noise 37 | if self.noise_scale == 0.0: 38 | noise = np.zeros((1,)) 39 | elif self.noise_type == 'normal': 40 | noise = np.random.normal(loc=0., scale=self.noise_scale, size=1) 41 | elif self.noise_type == 'uniform': 42 | noise = np.random.uniform(-self.noise_scale, self.noise_scale, 1) 43 | self.do_simulation(a, self.frame_skip) 44 | # add noise 45 | self.sim.data.qvel[0] += noise[0] 46 | 47 | pos_after = mass_center(self.model, self.sim) 48 | alive_bonus = 5.0 49 | data = self.sim.data 50 | lin_vel_cost = 1.25 * (pos_after - pos_before) / self.dt 51 | quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum() 52 | quad_impact_cost = .5e-6 * np.square(data.cfrc_ext).sum() 53 | quad_impact_cost = min(quad_impact_cost, 10) 54 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus 55 | qpos = self.sim.data.qpos 56 | done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0)) 
57 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, reward_quadctrl=-quad_ctrl_cost, reward_alive=alive_bonus, reward_impact=-quad_impact_cost) 58 | 59 | def reset_model(self): 60 | # original: self.init_scale = 0.01 61 | self.set_state( 62 | self.init_qpos + self.np_random.uniform(low=-self.init_scale, high=self.init_scale, size=self.model.nq), 63 | self.init_qvel + self.np_random.uniform(low=-self.init_scale, high=self.init_scale, size=self.model.nv,) 64 | ) 65 | return self._get_obs() 66 | -------------------------------------------------------------------------------- /pic/gym/reward_shaping/dynamic_mjc.py: -------------------------------------------------------------------------------- 1 | """ 2 | dynamic_mjc.py 3 | A small library for programmatically building MuJoCo XML files 4 | """ 5 | from contextlib import contextmanager 6 | import tempfile 7 | import numpy as np 8 | 9 | 10 | def default_model(name): 11 | """ 12 | Get a model with basic settings such as gravity and RK4 integration enabled 13 | """ 14 | model = MJCModel(name) 15 | root = model.root 16 | 17 | # Setup 18 | root.compiler(angle="radian", inertiafromgeom="true") 19 | default = root.default() 20 | default.joint(armature=1, damping=1, limited="true") 21 | default.geom(contype=0, friction='1 0.1 0.1', rgba='0.7 0.7 0 1') 22 | root.option(gravity="0 0 -9.81", integrator="RK4", timestep=0.01) 23 | return model 24 | 25 | 26 | def pointmass_model(name): 27 | """ 28 | Get a model with basic settings such as gravity and Euler integration enabled 29 | """ 30 | model = MJCModel(name) 31 | root = model.root 32 | 33 | # Setup 34 | root.compiler(angle="radian", inertiafromgeom="true", coordinate="local") 35 | default = root.default() 36 | default.joint(limited="false", damping=1) 37 | default.geom(contype=2, conaffinity="1", condim="1", friction=".5 .1 .1", density="1000", margin="0.002") 38 | root.option(timestep=0.01, gravity="0 0 0", iterations="20", integrator="Euler") 39 | return model 40 | 41 | 42 | class MJCModel(object): 43 | def __init__(self, name): 44 | self.name = name 45 | self.root = MJCTreeNode("mujoco").add_attr('model', name) 46 | 47 | @contextmanager 48 | def asfile(self): 49 | """ 50 | Usage: 51 | model = MJCModel('reacher') 52 | with model.asfile() as f: 53 | print(f.read()) # prints a dump of the model 54 | """ 55 | with tempfile.NamedTemporaryFile(mode='w+', suffix='.xml', delete=True) as f: 56 | self.root.write(f) 57 | f.seek(0) 58 | yield f 59 | 60 | def open(self): 61 | self.file = tempfile.NamedTemporaryFile(mode='w+', suffix='.xml', delete=True) 62 | self.root.write(self.file) 63 | self.file.seek(0) 64 | return self.file 65 | 66 | def close(self): 67 | self.file.close() 68 | 69 | def find_attr(self, attr, value): 70 | return self.root.find_attr(attr, value) 71 | 72 | def __getstate__(self): 73 | return {} 74 | 75 | def __setstate__(self, state): 76 | pass 77 | 78 | 79 | class MJCTreeNode(object): 80 | def __init__(self, name): 81 | self.name = name 82 | self.attrs = {} 83 | self.children = [] 84 | 85 | def add_attr(self, key, value): 86 | if isinstance(value, str): 87 | pass 88 | elif isinstance(value, list) or isinstance(value, np.ndarray): 89 | value = ' '.join([str(val).lower() for val in value]) 90 | else: 91 | value = str(value).lower() 92 | 93 | self.attrs[key] = value 94 | return self 95 | 96 | def __getattr__(self, name): 97 | def wrapper(**kwargs): 98 | newnode = MJCTreeNode(name) 99 | for (k, v) in kwargs.items(): 100 | newnode.add_attr(k, v) 101 | 
self.children.append(newnode) 102 | return newnode 103 | return wrapper 104 | 105 | def dfs(self): 106 | yield self 107 | if self.children: 108 | for child in self.children: 109 | for node in child.dfs(): 110 | yield node 111 | 112 | def find_attr(self, attr, value): 113 | """ Run DFS to find a matching attr """ 114 | if attr in self.attrs and self.attrs[attr] == value: 115 | return self 116 | for child in self.children: 117 | res = child.find_attr(attr, value) 118 | if res is not None: 119 | return res 120 | return None 121 | 122 | def write(self, ostream, tabs=0): 123 | contents = ' '.join(['%s="%s"'%(k,v) for (k,v) in self.attrs.items()]) 124 | if self.children: 125 | ostream.write('\t'*tabs) 126 | ostream.write('<%s %s>\n' % (self.name, contents)) 127 | for child in self.children: 128 | child.write(ostream, tabs=tabs+1) 129 | ostream.write('\t'*tabs) 130 | ostream.write('</%s>\n' % self.name) # closing tag 131 | else: 132 | ostream.write('\t'*tabs) 133 | ostream.write('<%s %s/>\n' % (self.name, contents)) 134 | 135 | def __str__(self): 136 | s = "<"+self.name 137 | s += ' '.join(['%s="%s"'%(k,v) for (k,v) in self.attrs.items()]) 138 | return s+">" 139 | -------------------------------------------------------------------------------- /pic/gym/reward_shaping/maze_model.py: -------------------------------------------------------------------------------- 1 | """ A pointmass maze env.""" 2 | from gym.envs.mujoco import mujoco_env 3 | from gym import utils 4 | from pic.gym.reward_shaping.dynamic_mjc import MJCModel 5 | import numpy as np 6 | import random 7 | 8 | 9 | WALL = 10 10 | EMPTY = 11 11 | GOAL = 12 12 | 13 | 14 | def parse_maze(maze_str): 15 | lines = maze_str.strip().split('\\') 16 | width, height = len(lines), len(lines[0]) 17 | maze_arr = np.zeros((width, height), dtype=np.int32) 18 | for w in range(width): 19 | for h in range(height): 20 | tile = lines[w][h] 21 | if tile == '#': 22 | maze_arr[w][h] = WALL 23 | elif tile == 'G': 24 | maze_arr[w][h] = GOAL 25 | elif tile == ' ' or tile == 'O' or tile == '0': 26 | maze_arr[w][h] = EMPTY 27 | else: 28 | raise ValueError('Unknown tile type: %s' % tile) 29 | return maze_arr 30 | 31 | 32 | def point_maze(maze_str): 33 | maze_arr = parse_maze(maze_str) 34 | 35 | mjcmodel = MJCModel('point_maze') 36 | mjcmodel.root.compiler(inertiafromgeom="true", angle="radian", coordinate="local") 37 | mjcmodel.root.option(timestep="0.01", gravity="0 0 0", iterations="20", integrator="Euler") 38 | default = mjcmodel.root.default() 39 | default.joint(damping=1, limited='false') 40 | default.geom(friction=".5 .1 .1", density="1000", margin="0.002", condim="1", contype="2", conaffinity="1") 41 | 42 | asset = mjcmodel.root.asset() 43 | asset.texture(type="2d", name="groundplane", builtin="checker", rgb1="0.2 0.3 0.4", rgb2="0.1 0.2 0.3", width=100, height=100) 44 | asset.texture( 45 | name="skybox", type="skybox", builtin="gradient", rgb1=".4 .6 .8", rgb2="0 0 0", 46 | width="800", height="800", mark="random", markrgb="1 1 1" 47 | ) 48 | asset.material(name="groundplane", texture="groundplane", texrepeat="20 20") 49 | asset.material(name="wall", rgba=".7 .5 .3 1") 50 | asset.material(name="target", rgba=".6 .3 .3 1") 51 | 52 | visual = mjcmodel.root.visual() 53 | visual.headlight(ambient=".4 .4 .4", diffuse=".8 .8 .8", specular="0.1 0.1 0.1") 54 | visual.map(znear=.01) 55 | visual.quality(shadowsize=2048) 56 | 57 | worldbody = mjcmodel.root.worldbody() 58 | worldbody.geom(name='ground', size="40 40 0.25", pos="0 0 -0.1", type="plane", contype=1, conaffinity=0, 
material="groundplane") 59 | 60 | particle = worldbody.body(name='particle', pos=[1.2,1.2,0]) 61 | particle.geom(name='particle_geom', type='sphere', size=0.1, rgba='0.0 0.0 1.0 0.0', contype=1) 62 | particle.site(name='particle_site', pos=[0.0,0.0,0], size=0.2, rgba='0.3 0.6 0.3 1') 63 | particle.joint(name='ball_x', type='slide', pos=[0,0,0], axis=[1,0,0]) 64 | particle.joint(name='ball_y', type='slide', pos=[0,0,0], axis=[0,1,0]) 65 | 66 | worldbody.site(name='target_site', pos=[0.0,0.0,0], size=0.2, material='target') 67 | 68 | width, height = maze_arr.shape 69 | for w in range(width): 70 | for h in range(height): 71 | if maze_arr[w,h] == WALL: 72 | worldbody.geom( 73 | conaffinity=1, 74 | type='box', 75 | name='wall_%d_%d'%(w,h), 76 | material='wall', 77 | pos=[w+1.0,h+1.0,0], 78 | size=[0.5,0.5,0.2] 79 | ) 80 | 81 | actuator = mjcmodel.root.actuator() 82 | actuator.motor(joint="ball_x", ctrlrange=[-1.0, 1.0], ctrllimited=True, gear=100) 83 | actuator.motor(joint="ball_y", ctrlrange=[-1.0, 1.0], ctrllimited=True, gear=100) 84 | 85 | return mjcmodel 86 | 87 | 88 | LARGE_MAZE = \ 89 | "############\\"+\ 90 | "#OOOO#OOOOO#\\"+\ 91 | "#O##O#O#O#O#\\"+\ 92 | "#OOOOOO#OOO#\\"+\ 93 | "#O####O###O#\\"+\ 94 | "#OO#O#OOOOO#\\"+\ 95 | "##O#O#O#O###\\"+\ 96 | "#OO#OOO#OGO#\\"+\ 97 | "############" 98 | 99 | LARGE_MAZE_EVAL = \ 100 | "############\\"+\ 101 | "#OO#OOO#OGO#\\"+\ 102 | "##O###O#O#O#\\"+\ 103 | "#OO#O#OOOOO#\\"+\ 104 | "#O##O#OO##O#\\"+\ 105 | "#OOOOOO#OOO#\\"+\ 106 | "#O##O#O#O###\\"+\ 107 | "#OOOO#OOOOO#\\"+\ 108 | "############" 109 | 110 | MEDIUM_MAZE = \ 111 | '########\\'+\ 112 | '#OO##OO#\\'+\ 113 | '#OO#OOO#\\'+\ 114 | '##OOO###\\'+\ 115 | '#OO#OOO#\\'+\ 116 | '#O#OO#O#\\'+\ 117 | '#OOO#OG#\\'+\ 118 | "########" 119 | 120 | MEDIUM_MAZE_EVAL = \ 121 | '########\\'+\ 122 | '#OOOOOG#\\'+\ 123 | '#O#O##O#\\'+\ 124 | '#OOOO#O#\\'+\ 125 | '###OO###\\'+\ 126 | '#OOOOOO#\\'+\ 127 | '#OO##OO#\\'+\ 128 | "########" 129 | 130 | SMALL_MAZE = \ 131 | "######\\"+\ 132 | "#OOOO#\\"+\ 133 | "#O##O#\\"+\ 134 | "#OOOO#\\"+\ 135 | "######" 136 | 137 | U_MAZE = \ 138 | "#####\\"+\ 139 | "#GOO#\\"+\ 140 | "###O#\\"+\ 141 | "#OOO#\\"+\ 142 | "#####" 143 | 144 | U_MAZE_EVAL = \ 145 | "#####\\"+\ 146 | "#OOG#\\"+\ 147 | "#O###\\"+\ 148 | "#OOO#\\"+\ 149 | "#####" 150 | 151 | OPEN = \ 152 | "#######\\"+\ 153 | "#OOOOO#\\"+\ 154 | "#OOGOO#\\"+\ 155 | "#OOOOO#\\"+\ 156 | "#######" 157 | 158 | 159 | class MazeEnv(mujoco_env.MujocoEnv, utils.EzPickle): 160 | def __init__( 161 | self, 162 | reward_type='negative_sparse', 163 | maze_spec=U_MAZE, 164 | reset_target=False, 165 | coefficent=1.0, 166 | distance_threshold=0.5, 167 | multiplier=0.01, 168 | offset=0.1, 169 | ): 170 | self.reset_target = reset_target 171 | self.str_maze_spec = maze_spec 172 | self.maze_arr = parse_maze(maze_spec) 173 | self.reward_type = reward_type 174 | self.coefficent = coefficent 175 | self.distance_threshold = distance_threshold 176 | self.multiplier = multiplier 177 | self.offset = offset 178 | self.reset_locations = list(zip(*np.where(self.maze_arr == EMPTY))) 179 | self.reset_locations.sort() 180 | 181 | self._target = np.array([0.0,0.0]) 182 | 183 | model = point_maze(maze_spec) 184 | with model.asfile() as f: 185 | mujoco_env.MujocoEnv.__init__(self, model_path=f.name, frame_skip=1) 186 | utils.EzPickle.__init__(self) 187 | 188 | # Set the default goal (overriden by a call to set_target) 189 | # Try to find a goal if it exists 190 | self.goal_locations = list(zip(*np.where(self.maze_arr == GOAL))) 191 | if 
len(self.goal_locations) == 1: 192 | self.set_target(self.goal_locations[0]) 193 | elif len(self.goal_locations) > 1: 194 | raise ValueError("More than 1 goal specified!") 195 | else: 196 | # If no goal, use the first empty tile 197 | self.set_target(np.array(self.reset_locations[0]).astype(self.observation_space.dtype)) 198 | self.empty_and_goal_locations = self.reset_locations + self.goal_locations 199 | 200 | def step(self, action): 201 | action = np.clip(action, -1.0, 1.0) 202 | self.clip_velocity() 203 | self.do_simulation(action, self.frame_skip) 204 | self.set_marker() 205 | ob = self._get_obs() 206 | 207 | if self.reward_type == 'negative_sparse': 208 | reward = 0.0 if np.linalg.norm(ob[0:2] - self._target) <= self.distance_threshold else -1.0 209 | assert reward < 0.001 210 | elif self.reward_type == 'densel1': 211 | reward = -self.coefficent * np.linalg.norm(ob[0:2] - self._target, ord=1) 212 | assert reward < 0.0 213 | elif self.reward_type == 'densel2': 214 | reward = -self.coefficent * np.linalg.norm(ob[0:2] - self._target, ord=2) 215 | assert reward < 0.0 216 | elif self.reward_type == 'frac': 217 | d = np.linalg.norm(ob[0:2] - self._target, ord=2) 218 | reward = self.multiplier / (self.offset + d) 219 | else: 220 | raise ValueError('Unknown reward type %s' % self.reward_type) 221 | done = False 222 | return ob, reward, done, {} 223 | 224 | def _get_obs(self): 225 | return np.concatenate([self.sim.data.qpos, self.sim.data.qvel]).ravel() 226 | 227 | def get_target(self): 228 | return self._target 229 | 230 | def set_target(self, target_location=None): 231 | if target_location is None: 232 | idx = self.np_random.choice(len(self.empty_and_goal_locations)) 233 | reset_location = np.array(self.empty_and_goal_locations[idx]).astype(self.observation_space.dtype) 234 | target_location = reset_location + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 235 | self._target = target_location 236 | 237 | def set_marker(self): 238 | self.data.site_xpos[self.model.site_name2id('target_site')] = np.array([self._target[0]+1, self._target[1]+1, 0.0]) 239 | 240 | def clip_velocity(self): 241 | qvel = np.clip(self.sim.data.qvel, -5.0, 5.0) 242 | self.set_state(self.sim.data.qpos, qvel) 243 | 244 | def reset_model(self): 245 | idx = self.np_random.choice(len(self.empty_and_goal_locations)) 246 | reset_location = np.array(self.empty_and_goal_locations[idx]).astype(self.observation_space.dtype) 247 | qpos = reset_location + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 248 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 249 | self.set_state(qpos, qvel) 250 | if self.reset_target: 251 | self.set_target() 252 | return self._get_obs() 253 | 254 | def reset_to_location(self, location): 255 | self.sim.reset() 256 | reset_location = np.array(location).astype(self.observation_space.dtype) 257 | qpos = reset_location + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 258 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 259 | self.set_state(qpos, qvel) 260 | return self._get_obs() 261 | 262 | def viewer_setup(self): 263 | pass 264 | -------------------------------------------------------------------------------- /pic/gym/reward_shaping/reacher_norm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco import ReacherEnv 3 | 4 | 5 | class ReacherL1Env(ReacherEnv): 6 | def __init__(self, coefficent=1.0): 7 | self.coefficent = coefficent 8 | 9 | 
ReacherEnv.__init__(self) 10 | 11 | def step(self, a): 12 | vec = self.get_body_com("fingertip")-self.get_body_com("target") 13 | reward = - self.coefficent * np.linalg.norm(vec, ord=1) # L1 norm 14 | self.do_simulation(a, self.frame_skip) 15 | ob = self._get_obs() 16 | done = False 17 | return ob, reward, done, {} 18 | 19 | 20 | class ReacherL2Env(ReacherEnv): 21 | def __init__(self, coefficent=1.0): 22 | self.coefficent = coefficent 23 | 24 | ReacherEnv.__init__(self) 25 | 26 | def step(self, a): 27 | vec = self.get_body_com("fingertip")-self.get_body_com("target") 28 | reward = - self.coefficent * np.linalg.norm(vec) # L2 norm 29 | self.do_simulation(a, self.frame_skip) 30 | ob = self._get_obs() 31 | done = False 32 | return ob, reward, done, {} 33 | 34 | 35 | class ReacherSparseEnv(ReacherEnv): 36 | def __init__(self, distance_threshold=0.05): 37 | self.distance_threshold = distance_threshold 38 | 39 | ReacherEnv.__init__(self) 40 | 41 | def step(self, a): 42 | vec = self.get_body_com("fingertip")-self.get_body_com("target") 43 | d = np.linalg.norm(vec) 44 | reward = -(d > self.distance_threshold).astype(np.float32) # Sparse 45 | self.do_simulation(a, self.frame_skip) 46 | ob = self._get_obs() 47 | done = False 48 | return ob, reward, done, {} 49 | 50 | 51 | class ReacherFracEnv(ReacherEnv): 52 | def __init__(self, multiplier=0.01, offset=0.1): 53 | self.multiplier = multiplier 54 | self.offset = offset 55 | 56 | ReacherEnv.__init__(self) 57 | 58 | def step(self, a): 59 | vec = self.get_body_com("fingertip")-self.get_body_com("target") 60 | d = np.linalg.norm(vec) 61 | reward = self.multiplier / (self.offset + d) 62 | self.do_simulation(a, self.frame_skip) 63 | ob = self._get_obs() 64 | done = False 65 | return ob, reward, done, {} 66 | -------------------------------------------------------------------------------- /pic/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from pic.nn.numpymlp import NumpyMLP # NOQA 2 | -------------------------------------------------------------------------------- /pic/nn/numpymlp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class NumpyMLP: 5 | def __init__( 6 | self, 7 | n_inputs, 8 | n_outputs, 9 | n_hidden_layers=2, 10 | n_hidden_units=4, 11 | random_dist="normal", 12 | normal_mean=0.0, 13 | normal_sigma=1.0, 14 | uniform_bound=1.0, 15 | act_fn="tanh", 16 | use_bias=False 17 | ): 18 | self.n_inputs = n_inputs 19 | self.n_outputs = n_outputs 20 | self.n_hidden_layers = n_hidden_layers 21 | self.n_hidden_units = n_hidden_units 22 | random_dists = ['normal', 'uniform', 'xavier_uniform', 'xavier_normal'] 23 | assert (random_dist in random_dists) 24 | self.random_dist = random_dist 25 | self.random_dist_scaling = 1.0 26 | self.normal_mean = normal_mean 27 | self.normal_sigma = normal_sigma 28 | self.uniform_bound = uniform_bound 29 | self.use_bias = use_bias 30 | 31 | self.init_weights() 32 | 33 | activation_fn_d = { 34 | "tanh": np.tanh, 35 | "linear": lambda x: x, 36 | "relu": lambda x: np.maximum(0, x), 37 | } 38 | assert (act_fn in activation_fn_d.keys()) 39 | self.act_fn = activation_fn_d[act_fn] 40 | 41 | def init_weights(self): 42 | self.weights_matrix = [] 43 | mat_input_size = self.n_inputs 44 | if self.use_bias: 45 | mat_input_size += 1 46 | 47 | for i in range(self.n_hidden_layers): 48 | mat_output_size = self.n_hidden_units 49 | if self.random_dist == "normal": 50 | mat = np.random.normal(loc=self.normal_mean, 
scale=self.normal_sigma, size=(mat_output_size, mat_input_size)) 51 | elif self.random_dist == "uniform": 52 | mat = np.random.uniform(-self.uniform_bound, self.uniform_bound, (mat_output_size, mat_input_size)) 53 | elif self.random_dist == "xavier_uniform": 54 | bound = 5 / 3 * np.sqrt(6 / (mat_output_size + mat_input_size)) # for tanh 55 | mat = np.random.uniform(-bound, bound, (mat_output_size, mat_input_size)) 56 | elif self.random_dist == "xavier_normal": 57 | bound = 5 / 3 * np.sqrt(2 / (mat_output_size + mat_input_size)) # for tanh 58 | mat = np.random.normal(loc=0.0, scale=bound, size=(mat_output_size, mat_input_size)) 59 | else: 60 | raise ValueError('unknown random_dist: %s' % self.random_dist) 61 | self.weights_matrix.append(self.random_dist_scaling * mat) 62 | mat_input_size = mat_output_size 63 | if self.use_bias: 64 | mat_input_size += 1 65 | # for the last layer: 66 | if self.random_dist == "normal": 67 | mat = np.random.normal(loc=self.normal_mean, scale=self.normal_sigma, size=(self.n_outputs, mat_input_size)) 68 | elif self.random_dist == "uniform": 69 | mat = np.random.uniform(-self.uniform_bound, self.uniform_bound, (self.n_outputs, mat_input_size)) 70 | elif self.random_dist == "xavier_uniform": 71 | bound = 5 / 3 * np.sqrt(6 / (self.n_outputs + mat_input_size)) # for tanh 72 | mat = np.random.uniform(-bound, bound, (self.n_outputs, mat_input_size)) 73 | elif self.random_dist == "xavier_normal": 74 | bound = 5 / 3 * np.sqrt(2 / (self.n_outputs + mat_input_size)) # for tanh 75 | mat = np.random.normal(loc=0., scale=bound, size=(self.n_outputs, mat_input_size)) 76 | self.weights_matrix.append(self.random_dist_scaling * mat) 77 | 78 | self.w_mat_shapes = [w.shape for w in self.weights_matrix] 79 | self.w_mat_lens = [len(w.flatten()) for w in self.weights_matrix] 80 | self.n_weights = sum(self.w_mat_lens) 81 | 82 | def forward(self, x): 83 | for i, w in enumerate(self.weights_matrix): 84 | if self.use_bias: 85 | x = np.concatenate((x, [1.0])) 86 | x = np.dot(w, x) 87 | if i < self.n_hidden_layers: 88 | x = self.act_fn(x) 89 | return x 90 | -------------------------------------------------------------------------------- /pic/sampler/__init__.py: -------------------------------------------------------------------------------- 1 | from pic.sampler.sampler import Sampler # NOQA 2 | from pic.sampler.sampler import make_env # NOQA 3 | -------------------------------------------------------------------------------- /pic/sampler/sampler.py: -------------------------------------------------------------------------------- 1 | import dm2gym 2 | import gym 3 | import itertools 4 | import random 5 | import multiprocessing as mp 6 | import numpy as np 7 | 8 | import pic 9 | 10 | class Sampler(object): 11 | def __init__(self, env_name, agent, max_episode_steps, n_samples=10**4, n_episodes=10**3, multiprocess=0): 12 | self.env_name = env_name 13 | self.agent = agent 14 | self.n_samples = n_samples 15 | self.n_episodes = n_episodes 16 | self.multiprocess = multiprocess 17 | self.max_episode_steps = max_episode_steps 18 | 19 | def sample(self): 20 | all_scores_per_param = [] 21 | if self.multiprocess > 0: 22 | num_worker = mp.cpu_count() 23 | if self.multiprocess > num_worker: 24 | self.multiprocess = num_worker 25 | p = mp.Pool(self.multiprocess) 26 | print("num_worker: {}/{}".format(self.multiprocess, num_worker)) 27 | 28 | for samp_num in range(self.n_samples): 29 | if samp_num % max(1, self.n_samples // 10) == 0: 30 | print(f"Sample {samp_num}/{self.n_samples}") 31 | score_episodes = [] 32 | if self.multiprocess > 0: 33 | episodes_per_worker = max(1, 
int(np.ceil(self.n_episodes / self.multiprocess))) 34 | scores = p.starmap(run_episode_wrapper, [[i, self.env_name, self.agent, self.max_episode_steps, episodes_per_worker] for i in range(self.multiprocess)]) 35 | scores = list(itertools.chain(*scores))[:self.n_episodes] 36 | assert len(scores) == self.n_episodes, f'{len(scores)} != {self.n_episodes}' 37 | score_episodes += scores 38 | else: 39 | env = make_env(self.env_name, seed=None) 40 | for _ in range(self.n_episodes): 41 | score = run_episode(env, self.agent, self.max_episode_steps) 42 | score_episodes.append(score) 43 | all_scores_per_param.append(score_episodes) 44 | self.agent.init_weights() 45 | 46 | if self.multiprocess > 0: 47 | p.close() 48 | 49 | return np.array(all_scores_per_param) 50 | 51 | 52 | def make_env(env_name, seed=None): 53 | if "dm2gym" in env_name: 54 | env = gym.make(env_name, environment_kwargs={'flat_observation': True}) 55 | else: 56 | env = gym.make(env_name) 57 | if seed is not None: 58 | env.seed(seed) 59 | random.seed(seed) 60 | np.random.seed(seed) 61 | return env 62 | 63 | 64 | def run_episode(env, agent, max_episode_steps): 65 | obs = env.reset() 66 | score = 0 67 | steps = 0 68 | done = False 69 | while not done: 70 | action = agent.get_action(obs) 71 | obs, r, done, _ = env.step(action) 72 | score += r 73 | steps += 1 74 | if steps >= max_episode_steps: 75 | done = True 76 | return score 77 | 78 | 79 | def run_episode_wrapper(index, env_name, agent, max_episode_steps, num_episodes): 80 | env = make_env(env_name, index) 81 | return [run_episode(env, agent, max_episode_steps) for _ in range(num_episodes)] 82 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages 2 | from setuptools import setup 3 | 4 | install_requires = [ 5 | 'torch>=1.5.1', 6 | 'gym>=0.17.2', 7 | 'numpy', 8 | 'pillow', 9 | 'optuna', 10 | 'cloudpickle==1.3.0', 11 | 'cycler==0.10.0', 12 | 'future==0.18.2', 13 | 'kiwisolver==1.2.0', 14 | 'matplotlib', 15 | 'pandas', 16 | 'pyglet==1.5.0', 17 | 'pyparsing==2.4.7', 18 | 'python-dateutil==2.8.1', 19 | 'pytz==2020.1', 20 | 'scipy', 21 | 'seaborn', 22 | 'six', 23 | 'tabulate==0.8.7', 24 | ] 25 | 26 | setup( 27 | name='pic', 28 | version='0.0.1', 29 | description='', 30 | author='Hiroki Furuta', 31 | author_email='', 32 | url='', 33 | license='MIT License', 34 | packages=find_packages(), 35 | install_requires=install_requires, 36 | ) 37 | --------------------------------------------------------------------------------