├── .gitignore ├── README.md ├── docker ├── Dockerfile ├── README.md └── setups │ ├── requirements.txt │ └── setup.sh ├── examples ├── mi_estimate.py ├── multi_step_mdp.py ├── multi_step_mdp_optimality.py └── random_sampling.py ├── pic ├── __init__.py ├── algos │ ├── __init__.py │ └── numpyagent.py ├── gym │ ├── __init__.py │ ├── multi_step │ │ └── multi_step.py │ ├── noisy_dynamics │ │ ├── cartpole_noise.py │ │ ├── halfcheetah_noise.py │ │ └── humanoid_noise.py │ └── reward_shaping │ │ ├── dynamic_mjc.py │ │ ├── maze_model.py │ │ └── reacher_norm.py ├── nn │ ├── __init__.py │ └── numpymlp.py └── sampler │ ├── __init__.py │ └── sampler.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# optional
output/*
.vscode
outputs/*
results
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Policy Information Capacity: Information-Theoretic Measure for Task Complexity in Deep Reinforcement Learning
[[arxiv]](https://arxiv.org/abs/2103.12726)

If you use this codebase for your research, please cite the paper:
```
@inproceedings{furuta2021pic,
  title={Policy Information Capacity: Information-Theoretic Measure for Task Complexity in Deep Reinforcement Learning},
  author={Hiroki Furuta and Tatsuya Matsushima and Tadashi Kozuno and Yutaka Matsuo and Sergey Levine and Ofir Nachum and Shixiang Shane Gu},
  booktitle={International Conference on Machine Learning},
  year={2021}
}
```

## Dependencies
We recommend using Docker. See the [docker README](./docker/README.md) for setup instructions.

## Examples
See [examples](./examples) for details.

For synthetic experiments:
```
python multi_step_mdp_optimality.py --iterations 100 --population_size 1000 --episodes_per_param 1000 --prior_mean 0.0 --prior_sigma 1.0 --horizon 3 --multiprocess 64

python multi_step_mdp.py --iterations 100 --population_size 1000 --episodes_per_param 1000 --prior_mean 0.0 --prior_sigma 1.0 --horizon 2 --multiprocess 64
```

For random sampling:
```
python random_sampling.py --env CartPole-v0 --random_dist normal --multiprocess 64 --n_units 64 --n_layers 2 --n_samples 1000 --n_episodes 1000

python random_sampling.py --env dm2gym:CheetahRun-v0 --random_dist uniform --multiprocess 64 --n_units 64 --n_layers 2 --n_samples 1000 --n_episodes 1000
```

For mutual information estimation (PIC and POIC):
```
python mi_estimate.py --sourse_path ./results/CartPole-v0.npy --env CartPole-v0
```

## Environment List
```
CartPole-v0
Pendulum-v0
MountainCar-v0
MountainCarContinuous-v0
Acrobot-v1
Ant-v2
HalfCheetah-v2
Walker2d-v2
Humanoid-v2
Hopper-v2
dm2gym:CheetahRun-v0
dm2gym:ReacherEasy-v0
dm2gym:Ball_in_cupCatch-v0
```

For reward-shaping experiments, see the registered environments in [pic/gym/__init__.py](./pic/gym/__init__.py) for details.

## Reference
This codebase is based on [RWG](https://github.com/declanoller/RWG_benchmarking). We use the implementation of the pointmaze environment from [D4RL](https://github.com/rail-berkeley/d4rl).
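As a quick reference, PIC is the mutual information between episode returns and policy parameters sampled from the prior, I(R; θ) = H(R) - H(R|θ), while POIC replaces the return with a binary optimality variable whose likelihood is exponentially shaped by a tuned temperature. The snippet below is a condensed, non-authoritative sketch of the PIC part of `examples/mi_estimate.py`; it assumes you have already saved a `[n_samples, n_episodes]` return array with `random_sampling.py` (e.g. `./results/CartPole-v0.npy`). Prefer the full script for reported metrics, since it also tunes the POIC temperature with Optuna and records normalized scores.

```python
import numpy as np

# Assumed input: the return array saved by random_sampling.py,
# shape [n_samples, n_episodes] (one row per sampled parameter).
returns = np.load('./results/CartPole-v0.npy')
flat = returns.flatten()

# Discretize returns into bins, then estimate H(R) and E_theta[ H(R | theta) ].
counts, edges = np.histogram(flat, bins=100000)
p_r = counts / len(flat)
h_r = -np.sum(p_r * np.log(p_r + 1e-12))
p_r_given_theta = [np.histogram(row, bins=edges)[0] / len(row) for row in returns]
h_r_given_theta = -np.mean([np.sum(p * np.log(p + 1e-12)) for p in p_r_given_theta])

pic = h_r - h_r_given_theta  # PIC = H(R) - H(R | theta)
print('PIC:', pic)
```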
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
FROM pytorch/pytorch:1.6.0-cuda10.1-cudnn7-devel

# mujoco
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
    curl \
    git \
    libgl1-mesa-dev \
    libgl1-mesa-glx \
    libglew-dev \
    libosmesa6-dev \
    software-properties-common \
    net-tools \
    unzip \
    vim \
    wget \
    xpra \
    xserver-xorg-dev
RUN curl -o /usr/local/bin/patchelf https://s3-us-west-2.amazonaws.com/openai-sci-artifacts/manual-builds/patchelf_0.9_amd64.elf \
    && chmod +x /usr/local/bin/patchelf
RUN mkdir -p /root/.mujoco && \
    wget https://www.roboti.us/download/mujoco200_linux.zip -O mujoco.zip && \
    unzip mujoco.zip -d /root/.mujoco && \
    rm mujoco.zip
RUN echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/root/.mujoco/mujoco200/bin' >> /root/.bashrc

# install python libraries
COPY setups/ /root/setups/
--------------------------------------------------------------------------------
/docker/README.md:
--------------------------------------------------------------------------------
# How to use
0. `docker build . -t <image-name>`
1. `docker run --runtime=nvidia -it --privileged -v /path/to/yourworkspace/workspace:/root/workspace -p 9999:9999 <image-name>`
2. `cd /root/setups`
3. `source setup.sh`
--------------------------------------------------------------------------------
/docker/setups/requirements.txt:
--------------------------------------------------------------------------------
pillow
gym==0.17.2
mujoco-py==2.0.2.11
dm2gym
optuna
--------------------------------------------------------------------------------
/docker/setups/setup.sh:
--------------------------------------------------------------------------------
ln -s /root/.mujoco/mujoco200_linux /root/.mujoco/mujoco200
pip install -U pip
pip install -U setuptools
pip install -r requirements.txt
pip install git+git://github.com/deepmind/dm_control.git
--------------------------------------------------------------------------------
/examples/mi_estimate.py:
--------------------------------------------------------------------------------
import os
import argparse
import optuna
import numpy as np

ALGORITHM_MAX = {
    'CartPole-v0': 200,
    'Pendulum-v0': -128.6266493,
    'MountainCar-v0': -97.2,
    'MountainCarContinuous-v0': 95.89074929,
    'Acrobot-v1': -64.826,
    'Ant-v2': 6584.2,
    'HalfCheetah-v2': 15266.5,
    'Hopper-v2': 3564.07,
    'Walker2d-v2': 5813,
    'Humanoid-v2': 8264,
    'dm2gym:CheetahRun-v0': 795.0,
    'dm2gym:ReacherEasy-v0': 955.0,
    'dm2gym:Ball_in_cupCatch-v0': 978.2,
}

ALGORITHM_AVG = {
    'CartPole-v0': 194.2,
    'Pendulum-v0': -571.5,
    'MountainCar-v0': -143.1,
    'MountainCarContinuous-v0': 12.9,
    'Acrobot-v1': -162.9,
    'Ant-v2': 2450.782353,
    'HalfCheetah-v2': 6047.226471,
    'Hopper-v2': 2206.747059,
    'Walker2d-v2': 3190.777059,
    'Humanoid-v2': 3880.83,
    'dm2gym:CheetahRun-v0': 441.9663239,
    'dm2gym:ReacherEasy-v0': 600.172,
    'dm2gym:Ball_in_cupCatch-v0': 743.21,
}


def main():
    _basic_columns = (
        "environment",
        "normalized_score_A",
        "normalized_score_R",
        "POIC",
        "optimality_marginal",
        "optimality_conditional",
        "PIC",
        "reward_marginal",
        "reward_conditional",
        "variance",
        "temperatures",
        "r_max",
        "r_min",
        "r_mean",
    )

    parser = argparse.ArgumentParser()
    parser.add_argument("--n_trials", type=int, default=200, help="n_trials for optuna")
    parser.add_argument("--n_bins", type=int, default=100000, help="number of bins")
    parser.add_argument("--algo_max", action="store_true", help="max(r^algo, r^rand)")
    parser.add_argument("--clip_persent", type=float, default=0.0, help="top/bottom x percent clipping")
    parser.add_argument("--sourse_path", type=str, default='./CartPole-v0.npy')
    parser.add_argument("--root_dir", type=str, default='./results/')
    parser.add_argument("--env", type=str, default='CartPole-v0')
    args = parser.parse_args()

    # save dir
    output_dir = os.path.join(
        args.root_dir,
        'n_trials{}_clip_persent{}'.format(args.n_trials, args.clip_persent),
    )
    os.makedirs(output_dir, exist_ok=True)

    with open(os.path.join(output_dir, "{}_metrics.txt".format(args.env)), "w") as f:
        print("\t".join(_basic_columns), file=f)
    with open(os.path.join(output_dir, "{}_tables.txt".format(args.env)), "w") as f:
        print(" & ".join(_basic_columns), file=f)

    all_scores_per_param = np.load(args.sourse_path)
    all_mean_scores = all_scores_per_param.mean(axis=1)

    if args.clip_persent > 0:
        upper = np.percentile(all_mean_scores, 100-args.clip_persent)
        lower = np.percentile(all_mean_scores, args.clip_persent)
        all_scores_per_param = np.clip(all_scores_per_param, lower, upper)

    all_scores = all_scores_per_param.flatten()
    r_max = all_scores.max()
    r_min = all_mean_scores.min()
    r_mean = all_scores.mean()

    variance = 0 if (r_max - r_min) == 0 else all_scores.var()/(r_max - r_min)

    if args.algo_max:
        r_max = max(ALGORITHM_MAX[args.env], r_max)

    def objective(trial):
        temperature = trial.suggest_loguniform('temperature', 1e-4, 2e4)
        p_o1 = np.exp((all_scores-r_max)/temperature).mean()
        p_o1_ts = np.exp((all_scores_per_param-r_max)/temperature).mean(axis=1)
        marginal = -p_o1*np.log(p_o1 + 1e-12) - (1-p_o1)*np.log(1-p_o1 + 1e-12)
        conditional = np.mean(-p_o1_ts*np.log(p_o1_ts + 1e-12) - (1-p_o1_ts)*np.log(1-p_o1_ts + 1e-12))
        mutual_information = marginal - conditional

        return mutual_information

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=args.n_trials)

    # POIC: I(O; theta) evaluated at the best temperature found by Optuna
    trial = study.best_trial
    mi_o = trial.value
    temperature = trial.params['temperature']
    p_o1 = np.exp((all_scores-r_max)/temperature).mean()
    p_o1_ts = np.exp((all_scores_per_param-r_max)/temperature).mean(axis=1)
    h_o = -p_o1*np.log(p_o1) - (1-p_o1)*np.log(1-p_o1)

    h_o_t = np.mean(-p_o1_ts*np.log(p_o1_ts + 1e-12) - (1-p_o1_ts)*np.log(1-p_o1_ts + 1e-12))

    # PIC: I(R; theta) via histogram discretization of episode returns
    bins = args.n_bins
    hist = np.histogram(all_scores, bins=args.n_bins)
    discretization_all = hist[0] / len(all_scores)
    entropy_all = - np.sum(discretization_all * np.log(discretization_all + 1e-12))
    discretization_r_theta = [np.histogram(x, bins=hist[1])[0] / len(x) for x in all_scores_per_param]
    entropy_r_theta = - np.mean([np.sum(p_r_theta * np.log(p_r_theta + 1e-12)) for p_r_theta in discretization_r_theta])
    mi_r = entropy_all - entropy_r_theta

    normalized_score_A = (ALGORITHM_AVG[args.env] - r_min) / (max(ALGORITHM_MAX[args.env], r_max) - r_min)
    normalized_score_R = (r_mean - r_min) / (max(ALGORITHM_MAX[args.env], r_max) - r_min)

    # save in scores.txt
    values = (
        args.env,
        normalized_score_A,
        normalized_score_R,
        mi_o,
        h_o,
        h_o_t,
        mi_r,
        entropy_all,
        entropy_r_theta,
        variance,
        temperature,
        r_max,
        r_min,
        r_mean,
    )

    with open(os.path.join(output_dir, "{}_metrics.txt".format(args.env)), "a+") as f:
        print("\t".join(str(x) for x in values), file=f)
    with open(os.path.join(output_dir, "{}_tables.txt".format(args.env)), "a+") as f:
        print(" & ".join(str(x) for x in values), file=f)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/examples/multi_step_mdp.py:
--------------------------------------------------------------------------------
import argparse
import datetime
import gym
import os

import pic
import multiprocessing as mp
import numpy as np
from multiprocessing import Pool
from scipy.special import expit as sigmoid, logit


_basic_columns = (
    "episodes",
    "mutual_information",
    "marginal",
    "conditional",
    "mean",
    "min",
    "max",
    "var",
    "eval_reward_mean",
    "prior_mean_0",
    "prior_mean_1",
    "prior_mean_2",
)


class Agent(object):
    def __init__(self, state_dim=3):
        self.param = np.zeros(3)

    def set_weight(self, weight):
        self.param = weight  # (3, )

    def sample(self, state):
        b = np.random.uniform(0, 1, 1)
        theta_s = np.dot(self.param.T, state)
        action = int(1 - (theta_s > logit(b)).astype('int'))
        return action


class MultiSampler(object):
    def __init__(self, env, agent):
        self.env = env
        self.agent = agent

    def run_episode(self, index):
        obs = self.env.reset()
        score = 0
        steps = 0
        done = False
        while not done:
            action = self.agent.sample(obs)
            obs, reward, done, _ = self.env.step(action)
            score += reward
        return score

    def set_weight(self, weight):
        self.agent.set_weight(weight)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--prior_sigma", type=float, default=1.0, help=r"std of p(\theta)")
    parser.add_argument("--prior_mean", type=float, default=0.0, help=r"mean of p(\theta)")
    parser.add_argument("--horizon", type=int, default=1, help="horizon of MDP")
    parser.add_argument("--multiprocess", type=int, default=1, help="number of processes for distributed experiments")
    parser.add_argument("--population_size", type=int, default=1000, help="population size for optimization")
    parser.add_argument("--episodes_per_param", type=int, default=1000, help="the number of episodes per parameter")
    parser.add_argument("--iterations", type=int, default=100, help="the number of iterations for optimization")
    parser.add_argument("--learning_rate", type=float, default=0.5, help="learning rate of the parameter")
    parser.add_argument("--decay", type=float, default=0.999, help="decay of learning rate")
    parser.add_argument("--output", type=str, default='./outputs/')
    parser.add_argument("--seed", type=int, default=0, help="random seed")
    args = parser.parse_args()

    np.random.seed(seed=args.seed)

    # save dir
timestamp = datetime.datetime.now().strftime("%Y%m%d-%H-%M-%S") 82 | output_dir = os.path.join( 83 | args.output, 84 | 'multi_step_es_horizon{}'.format(args.horizon), 85 | 'ps{}_pm{}_pop{}_ep{}_seed-{}'.format( 86 | args.prior_sigma, 87 | args.prior_mean, 88 | args.population_size, 89 | args.episodes_per_param, 90 | args.seed 91 | ), 92 | timestamp 93 | ) 94 | os.makedirs(output_dir, exist_ok=True) 95 | 96 | with open(os.path.join(output_dir, "scores.txt"), "w") as f: 97 | print("\t".join(_basic_columns), file=f) 98 | 99 | if args.horizon == 1: 100 | env = gym.make('OneStep-v0') 101 | elif args.horizon == 2: 102 | env = gym.make('TwoStep-v0') 103 | elif args.horizon == 3: 104 | env = gym.make('ThreeStep-v0') 105 | 106 | agent = Agent() 107 | multisampler = MultiSampler(env, agent) 108 | 109 | if args.multiprocess > 0: 110 | num_worker = mp.cpu_count() 111 | if args.multiprocess > num_worker: 112 | args.multiprocess = num_worker 113 | pool = Pool(args.multiprocess) 114 | print("num_worker: {}/{}".format(args.multiprocess, num_worker)) 115 | 116 | # sampling parameters 117 | prior_mu = np.array([args.prior_mean] * 3) 118 | for itr in range(args.iterations): 119 | # sampling parameters 120 | # [population_size, episodes_per_param] 121 | mu = np.ones((args.population_size, 3)) * prior_mu 122 | noise = np.random.randn(args.population_size, 3) 123 | theta = mu + args.prior_sigma * noise 124 | 125 | all_scores = [] 126 | all_scores_per_param = [] 127 | # simulate 128 | for population in theta: 129 | multisampler.set_weight(population) 130 | if args.multiprocess > 0: 131 | scores = pool.map(multisampler.run_episode, range(args.episodes_per_param)) 132 | assert len(scores) == args.episodes_per_param 133 | all_scores += scores 134 | all_scores_per_param.append(scores) 135 | 136 | all_scores = np.array(all_scores) 137 | all_scores_per_param = np.array(all_scores_per_param) 138 | 139 | p = all_scores.sum()/(args.population_size * args.episodes_per_param) 140 | marginal = -p * np.log(p + 1e-10) -(1 - p) * np.log((1 - p) + 1e-10) 141 | ps = all_scores_per_param.sum(axis=1) / args.episodes_per_param 142 | conditional = np.mean(-ps * np.log(ps + 1e-10) -(1 - ps) * np.log((1 - ps) + 1e-10)) 143 | mutual_info = marginal - conditional 144 | reward_mean = all_scores.mean() 145 | reward_variance = all_scores.var() 146 | reward_mean_min = all_scores_per_param.mean(axis=1).min() 147 | reward_mean_max = all_scores_per_param.mean(axis=1).max() 148 | 149 | # update 150 | reward_mean_over_ep = all_scores_per_param.mean(axis=1) 151 | std = reward_mean_over_ep.std() 152 | if std == 0: 153 | std = 1e-10 154 | # normalize 155 | reward_mean_over_ep = (reward_mean_over_ep - reward_mean) / std 156 | update_factor = 1. 
/ (args.population_size * args.prior_sigma) 157 | g = update_factor * np.dot(noise.T, reward_mean_over_ep).T 158 | prior_mu += args.learning_rate * g 159 | 160 | # evaluation 161 | n_eval = 100 162 | if args.multiprocess > 0: 163 | eval_reward = pool.map(multisampler.run_episode, range(n_eval)) 164 | assert len(eval_reward) == n_eval 165 | eval_reward_mean = np.array(eval_reward).mean() 166 | 167 | values = ( 168 | (itr + 1) * args.episodes_per_param * args.population_size, 169 | mutual_info, 170 | marginal, 171 | conditional, 172 | reward_mean, 173 | reward_mean_min, 174 | reward_mean_max, 175 | reward_variance, 176 | eval_reward_mean, 177 | prior_mu[0], 178 | prior_mu[1], 179 | prior_mu[2], 180 | ) 181 | with open(os.path.join(output_dir, "scores.txt"), "a+") as f: 182 | print("\t".join(str(x) for x in values), file=f) 183 | 184 | 185 | if __name__ == "__main__": 186 | main() 187 | -------------------------------------------------------------------------------- /examples/multi_step_mdp_optimality.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import gym 4 | import os 5 | import optuna 6 | 7 | import pic 8 | import multiprocessing as mp 9 | import numpy as np 10 | import scipy.stats as st 11 | from multiprocessing import Pool 12 | from scipy.special import expit as sigmoid, logit 13 | 14 | 15 | _basic_columns = ( 16 | "episodes", 17 | "mutual_infomation", 18 | "marginal", 19 | "conditional", 20 | "mean", 21 | "min", 22 | "max", 23 | "var", 24 | "eval_reward_mean", 25 | "prior_mean_0", 26 | "prior_mean_1", 27 | "prior_mean_2", 28 | ) 29 | 30 | 31 | class Agent(object): 32 | def __init__(self, state_dim=3): 33 | self.param = np.zeros(3) 34 | 35 | def set_weight(self, weight): 36 | self.param = weight # (3, ) 37 | 38 | def sample(self, state): 39 | b = np.random.uniform(0, 1, 1) 40 | theta_s = np.dot(self.param.T, state) 41 | action = int(1 - (theta_s > logit(b)).astype('int')) 42 | return action 43 | 44 | 45 | class MultiSampler(object): 46 | def __init__(self, env, agent): 47 | self.env = env 48 | self.agent = agent 49 | 50 | def run_episode(self, index): 51 | obs = self.env.reset() 52 | score = 0 53 | steps = 0 54 | done = False 55 | while not done: 56 | action = self.agent.sample(obs) 57 | obs, reward, done, _ = self.env.step(action) 58 | score += reward 59 | return score 60 | 61 | def set_weight(self, weight): 62 | self.agent.set_weight(weight) 63 | 64 | 65 | def main(): 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument("--prior_sigma", type=float, default=1.0, help="std of p(\theta)") 68 | parser.add_argument("--prior_mean", type=float, default=0.0, help="mean of p(\theta)") 69 | parser.add_argument("--horizon", type=int, default=1, help="horizon of MDP") 70 | parser.add_argument("--multiprocess", type=int, default=1, help="number of prosess for distrbuted experiments") 71 | parser.add_argument("--population_size", type=int, default=1000, help="number of population for optimization") 72 | parser.add_argument("--episodes_per_param", type=int, default=1000, help="the number of episodes per parameter") 73 | parser.add_argument("--mi_population_size", type=int, default=100, help="population for finite difference") 74 | parser.add_argument("--mi_coefficient", type=float, default=1.0, help="coefficient for MI") 75 | parser.add_argument("--mi_scale", type=float, default=1.0, help="scale parameter for finite difference") 76 | parser.add_argument("--n_trials", type=int, default=50) 77 | 
parser.add_argument("--iterations", type=int, default=100, help="the number of episodes for optimization") 78 | parser.add_argument("--learning_rate", type=float, default=0.5, help="learning rate of the parameter") 79 | parser.add_argument("--decay", type=float, default=0.999, help="decay of learning rate") 80 | parser.add_argument("--output", type=str, default='./outputs/') 81 | parser.add_argument("--seed", type=int, default=0, help="random seed") 82 | args = parser.parse_args() 83 | 84 | np.random.seed(seed=args.seed) 85 | 86 | # save dir 87 | timestamp = datetime.datetime.now().strftime("%Y%m%d-%H-%M-%S") 88 | output_dir = os.path.join( 89 | args.output, 90 | 'optimal_multi_step_es_horizon{}'.format(args.horizon), 91 | 'ps{}_pm{}_pop{}_ep{}_seed-{}'.format(args.prior_sigma, args.prior_mean, args.population_size, args.episodes_per_param, args.seed), 92 | timestamp 93 | ) 94 | os.makedirs(output_dir, exist_ok=True) 95 | 96 | with open(os.path.join(output_dir, "scores.txt"), "w") as f: 97 | print("\t".join(_basic_columns), file=f) 98 | 99 | if args.horizon == 1: 100 | env = gym.make('OneStep-v0') 101 | elif args.horizon == 2: 102 | env = gym.make('TwoStep-v0') 103 | elif args.horizon == 3: 104 | env = gym.make('ThreeStep-v0') 105 | 106 | agent = Agent() 107 | multisampler = MultiSampler(env, agent) 108 | 109 | if args.multiprocess > 0: 110 | num_worker = mp.cpu_count() 111 | if args.multiprocess > num_worker: 112 | args.multiprocess = num_worker 113 | pool = Pool(args.multiprocess) 114 | print("num_worker: {}/{}".format(args.multiprocess, num_worker)) 115 | 116 | # sampling parameters 117 | prior_mu = np.array([args.prior_mean] * 3) 118 | for itr in range(args.iterations): 119 | # sampling parameters 120 | # [population_size, episodes_per_param] 121 | mu = np.ones((args.population_size, 3)) * prior_mu 122 | noise = np.random.randn(args.population_size, 3) 123 | theta = mu + args.prior_sigma * noise 124 | p_theta = st.multivariate_normal(mean=prior_mu, cov=args.prior_sigma) 125 | 126 | all_scores = [] 127 | all_scores_per_param = [] 128 | # simulate 129 | for population in theta: 130 | multisampler.set_weight(population) 131 | if args.multiprocess > 0: 132 | scores = pool.map(multisampler.run_episode, range(args.episodes_per_param)) 133 | assert len(scores) == args.episodes_per_param 134 | all_scores += scores 135 | all_scores_per_param.append(scores) 136 | 137 | all_scores = np.array(all_scores) 138 | all_scores_per_param = np.array(all_scores_per_param) 139 | 140 | r_max = all_scores.max() 141 | 142 | def objective(trial): 143 | temperature = trial.suggest_loguniform('temperature', 1e-4, 2e4) 144 | p_o1 = np.exp((all_scores-r_max)/temperature).mean() 145 | p_o1_ts = np.exp((all_scores_per_param-r_max)/temperature).mean(axis=1) 146 | marginal = -p_o1*np.log(p_o1 + 1e-12) - (1-p_o1)*np.log(1-p_o1 + 1e-12) 147 | conditional = np.mean(-p_o1_ts*np.log(p_o1_ts + 1e-12) - (1-p_o1_ts)*np.log(1-p_o1_ts + 1e-12)) 148 | mutual_information = marginal - conditional 149 | 150 | return mutual_information 151 | 152 | study = optuna.create_study(direction='maximize') 153 | study.optimize(objective, n_trials=args.n_trials) 154 | 155 | trial = study.best_trial 156 | mutual_info = trial.value 157 | temperature = trial.params['temperature'] 158 | p_o1 = np.exp((all_scores-r_max)/temperature).mean() 159 | p_o1_ts = np.exp((all_scores_per_param-r_max)/temperature).mean(axis=1) 160 | marginal = -p_o1*np.log(p_o1) - (1-p_o1)*np.log(1-p_o1) 161 | conditional = np.mean(-p_o1_ts*np.log(p_o1_ts + 1e-12) - 
(1-p_o1_ts)*np.log(1-p_o1_ts + 1e-12)) 162 | 163 | reward_mean = all_scores.mean() 164 | reward_variance = all_scores.var() 165 | reward_mean_min = all_scores_per_param.mean(axis=1).min() 166 | reward_mean_max = all_scores_per_param.mean(axis=1).max() 167 | # update 168 | reward_mean_over_ep = all_scores_per_param.mean(axis=1) 169 | std = reward_mean_over_ep.std() 170 | if std == 0: 171 | std = 1e-10 172 | # normalize 173 | reward_mean_over_ep = (reward_mean_over_ep - reward_mean) / std 174 | update_factor = 1. / (args.population_size * args.prior_sigma) 175 | g = update_factor * np.dot(noise.T, reward_mean_over_ep).T 176 | prior_mu += args.learning_rate * g 177 | 178 | # evaluation 179 | n_eval = 100 180 | if args.multiprocess > 0: 181 | eval_reward = pool.map(multisampler.run_episode, range(n_eval)) 182 | assert len(eval_reward) == n_eval 183 | eval_reward_mean = np.array(eval_reward).mean() 184 | 185 | values = ( 186 | (itr + 1) * args.episodes_per_param * args.population_size, 187 | mutual_info, 188 | marginal, 189 | conditional, 190 | reward_mean, 191 | reward_mean_min, 192 | reward_mean_max, 193 | reward_variance, 194 | eval_reward_mean, 195 | prior_mu[0], 196 | prior_mu[1], 197 | prior_mu[2], 198 | ) 199 | with open(os.path.join(output_dir, "scores.txt"), "a+") as f: 200 | print("\t".join(str(x) for x in values), file=f) 201 | 202 | 203 | if __name__ == "__main__": 204 | main() 205 | -------------------------------------------------------------------------------- /examples/random_sampling.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | 5 | from pic.algos import NumpyAgent 6 | from pic.sampler import Sampler, make_env 7 | 8 | 9 | def main(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--env", type=str, default="CartPole-v0", help="Open AI gym environments") 12 | parser.add_argument("--n_units", type=int, default=64, help="number of hidden units") 13 | parser.add_argument("--n_layers", type=int, default=2, help="number of hidden layers") 14 | parser.add_argument("--use_bias", action="store_true", help="use bias in NN") 15 | parser.add_argument("--n_samples", type=int, default=10**4, help="number of parameters sampled from p(\theta)") 16 | parser.add_argument("--n_episodes", type=int, default=1000, help="number of episode running with parameter \theta") 17 | parser.add_argument("--root_dir", type=str, default='./results/', help="Root dir to save results") 18 | parser.add_argument("--random_dist", type=str, choices=['normal', 'uniform', 'xavier_uniform', 'xavier_normal'], default='normal', help="prior distribution of p(\theta)") 19 | parser.add_argument("--normal_mean", type=float, default=0.0, help="The mean of prior distribution") 20 | parser.add_argument("--normal_sigma", type=float, default=1.0, help="The sigma of prior distribution") 21 | parser.add_argument("--uniform_bound", type=float, default=1.0, help="The bound of prior distribution") 22 | parser.add_argument("--multiprocess", type=int, default=0, help="number of prosess for distrbuted experiments") 23 | args = parser.parse_args() 24 | 25 | sample_env = make_env(args.env, seed=None) 26 | max_episode_steps = sample_env.spec.max_episode_steps 27 | 28 | agent = NumpyAgent( 29 | env=sample_env, 30 | n_hidden_layers=args.n_layers, 31 | n_hidden_units=args.n_units, 32 | random_dist=args.random_dist, 33 | normal_mean=args.normal_mean, 34 | normal_sigma=args.normal_sigma, 35 | uniform_bound=args.uniform_bound, 36 | 
use_bias=args.use_bias, 37 | env_name=args.env, 38 | ) 39 | 40 | # save dir 41 | output_dir = os.path.join( 42 | args.root_dir, 43 | 'dist_{}_layers{}_units{}'.format(args.random_dist, args.n_layers, args.n_units) 44 | ) 45 | os.makedirs(output_dir, exist_ok=True) 46 | 47 | sampler = Sampler( 48 | args.env, 49 | agent, 50 | max_episode_steps, 51 | n_samples=args.n_samples, 52 | n_episodes=args.n_episodes, 53 | multiprocess=args.multiprocess, 54 | ) 55 | all_scores_per_param = sampler.sample() 56 | np.save(os.path.join(output_dir, "{}.npy".format(args.env)), all_scores_per_param) 57 | 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /pic/__init__.py: -------------------------------------------------------------------------------- 1 | from pic import nn # NOQA 2 | from pic import algos # NOQA 3 | from pic import gym # NOQA 4 | from pic import sampler # NOQA 5 | -------------------------------------------------------------------------------- /pic/algos/__init__.py: -------------------------------------------------------------------------------- 1 | from pic.algos.numpyagent import NumpyAgent # NOQA 2 | -------------------------------------------------------------------------------- /pic/algos/numpyagent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from pic.nn import NumpyMLP 4 | 5 | 6 | class NumpyAgent: 7 | def __init__( 8 | self, 9 | env, 10 | n_hidden_layers=2, 11 | n_hidden_units=4, 12 | random_dist="normal", 13 | normal_mean=0.0, 14 | normal_sigma=1.0, 15 | uniform_bound=1.0, 16 | act_fn="tanh", 17 | use_bias=False, 18 | env_name='CartPole-v0', 19 | policy_type="deterministic", 20 | ): 21 | self.env_name = env_name 22 | 23 | if not ("dm2gym" in self.env_name): 24 | self.n_inputs = env.reset().size 25 | else: # for dm2gym 26 | self.n_inputs = env.reset()['observations'].size 27 | 28 | self.policy_type = policy_type 29 | assert self.policy_type in ["deterministic", "stochastic"] 30 | 31 | if type(env.action_space).__name__ == "Discrete": 32 | self.action_space_type = "discrete" 33 | self.n_actions = env.action_space.n 34 | self.n_outputs = self.n_actions 35 | if self.policy_type == "stochastic": 36 | self.output_fn = self.discrete_dist_sample 37 | else: 38 | self.output_fn = np.argmax 39 | elif type(env.action_space).__name__ == "Box": 40 | self.action_space_type = "continuous" 41 | self.action_scale = env.action_space.high.max() 42 | self.n_actions = env.action_space.shape[0] 43 | if self.policy_type == "stochastic": 44 | self.output_fn = self.continuous_dist_sample 45 | self.n_outputs = 2 * self.n_actions 46 | else: 47 | self.output_fn = self.scale_continuous_action 48 | self.n_outputs = self.n_actions 49 | 50 | self.nn = NumpyMLP( 51 | n_inputs=self.n_inputs, 52 | n_outputs=self.n_outputs, 53 | n_hidden_layers=n_hidden_layers, 54 | n_hidden_units=n_hidden_units, 55 | random_dist=random_dist, 56 | normal_mean=normal_mean, 57 | normal_sigma=normal_sigma, 58 | uniform_bound=uniform_bound, 59 | act_fn=act_fn, 60 | use_bias=use_bias 61 | ) 62 | 63 | def get_action(self, state): 64 | if ("dm2gym" in self.env_name): 65 | state = state['observations'] 66 | x = self.nn.forward(state) 67 | return self.output_fn(x) 68 | 69 | def scale_continuous_action(self, x): 70 | return self.action_scale * np.tanh(x) 71 | 72 | def discrete_dist_sample(self, x): 73 | softmax_x = np.exp(x) / np.exp(x).sum() 74 | return 
np.random.choice(list(range(self.N_actions)), p=softmax_x) 75 | 76 | def continuous_dist_sample(self, x): 77 | mus_NN = x[: self.n_actions] 78 | sigmas_NN = x[self.n_actions:] 79 | mus = np.tanh(mus_NN) * self.action_scale 80 | sigmas = np.log(1 + np.exp(sigmas_NN)) 81 | return np.random.normal(loc=mus, scale=sigmas) 82 | 83 | def init_weights(self): 84 | self.nn.init_weights() 85 | -------------------------------------------------------------------------------- /pic/gym/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | from pic.gym.reward_shaping.maze_model import U_MAZE 3 | 4 | 5 | register( 6 | id='OneStep-v0', 7 | entry_point='pic.gym.multi_step.multi_step:MultiStepEnv', 8 | kwargs={'horizon': 1}, 9 | ) 10 | 11 | register( 12 | id='TwoStep-v0', 13 | entry_point='pic.gym.multi_step.multi_step:MultiStepEnv', 14 | kwargs={'horizon': 2}, 15 | ) 16 | 17 | register( 18 | id='ThreeStep-v0', 19 | entry_point='pic.gym.multi_step.multi_step:MultiStepEnv', 20 | kwargs={'horizon': 3}, 21 | ) 22 | 23 | register( 24 | id='ReacherL1-v0', 25 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherL1Env', 26 | max_episode_steps=50, 27 | ) 28 | 29 | register( 30 | id='ReacherL2-v0', 31 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherL2Env', 32 | max_episode_steps=50, 33 | ) 34 | 35 | register( 36 | id='ReacherSparse-v0', 37 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherSparseEnv', 38 | max_episode_steps=50, 39 | ) 40 | 41 | register( 42 | id='ReacherFrac-v0', 43 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherFracEnv', 44 | max_episode_steps=50, 45 | ) 46 | 47 | register( 48 | id='ReacherL1_c05-v0', 49 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherL1Env', 50 | max_episode_steps=50, 51 | kwargs={'coefficent': 0.5}, 52 | ) 53 | 54 | register( 55 | id='ReacherL1_c20-v0', 56 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherL1Env', 57 | max_episode_steps=50, 58 | kwargs={'coefficent': 2.0}, 59 | ) 60 | 61 | register( 62 | id='ReacherL1_c50-v0', 63 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherL1Env', 64 | max_episode_steps=50, 65 | kwargs={'coefficent': 5.0}, 66 | ) 67 | 68 | register( 69 | id='ReacherL2_c05-v0', 70 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherL2Env', 71 | max_episode_steps=50, 72 | kwargs={'coefficent': 0.5}, 73 | ) 74 | 75 | register( 76 | id='ReacherL2_c20-v0', 77 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherL2Env', 78 | max_episode_steps=50, 79 | kwargs={'coefficent': 2.0}, 80 | ) 81 | 82 | register( 83 | id='ReacherL2_c50-v0', 84 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherL2Env', 85 | max_episode_steps=50, 86 | kwargs={'coefficent': 5.0}, 87 | ) 88 | 89 | register( 90 | id='ReacherSparse_d001-v0', 91 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherSparseEnv', 92 | max_episode_steps=50, 93 | kwargs={'distance_threshold': 0.01}, 94 | ) 95 | 96 | register( 97 | id='ReacherSparse_d01-v0', 98 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherSparseEnv', 99 | max_episode_steps=50, 100 | kwargs={'distance_threshold': 0.1}, 101 | ) 102 | 103 | register( 104 | id='ReacherSparse_d015-v0', 105 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherSparseEnv', 106 | max_episode_steps=50, 107 | kwargs={'distance_threshold': 0.15}, 108 | ) 109 | 110 | register( 111 | id='ReacherFrac_m01o01-v0', 112 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherFracEnv', 113 | 
max_episode_steps=50, 114 | kwargs={'multiplier': 0.1, 'offset': 0.1}, 115 | ) 116 | 117 | register( 118 | id='ReacherFrac_m001o001-v0', 119 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherFracEnv', 120 | max_episode_steps=50, 121 | kwargs={'multiplier': 0.01, 'offset': 0.01}, 122 | ) 123 | 124 | register( 125 | id='ReacherFrac_m005o01-v0', 126 | entry_point='pic.gym.reward_shaping.reacher_norm:ReacherFracEnv', 127 | max_episode_steps=50, 128 | kwargs={'multiplier': 0.05, 'offset': 0.1}, 129 | ) 130 | 131 | register( 132 | id='maze2d-umaze-negative_sparse-v0', 133 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 134 | max_episode_steps=150, 135 | kwargs={ 136 | 'reward_type': 'negative_sparse', 137 | 'distance_threshold': 0.5, 138 | } 139 | ) 140 | 141 | register( 142 | id='maze2d-umaze-densel2-v0', 143 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 144 | max_episode_steps=150, 145 | kwargs={ 146 | 'reward_type': 'densel2', 147 | 'coefficent': 1.0, 148 | } 149 | ) 150 | 151 | register( 152 | id='maze2d-umaze-densel1-v0', 153 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 154 | max_episode_steps=150, 155 | kwargs={ 156 | 'reward_type': 'densel1', 157 | 'coefficent': 1.0, 158 | } 159 | ) 160 | 161 | register( 162 | id='maze2d-umaze-frac-v0', 163 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 164 | max_episode_steps=150, 165 | kwargs={ 166 | 'reward_type': 'frac', 167 | 'multiplier': 0.01, 168 | 'offset': 0.1, 169 | } 170 | ) 171 | 172 | register( 173 | id='maze2d-umaze-negative_sparse_d10-v0', 174 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 175 | max_episode_steps=150, 176 | kwargs={ 177 | 'reward_type': 'negative_sparse', 178 | 'distance_threshold': 1.0, 179 | } 180 | ) 181 | 182 | register( 183 | id='maze2d-umaze-negative_sparse_d01-v0', 184 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 185 | max_episode_steps=150, 186 | kwargs={ 187 | 'reward_type': 'negative_sparse', 188 | 'distance_threshold': 0.1, 189 | } 190 | ) 191 | 192 | register( 193 | id='maze2d-umaze-negative_sparse_d02-v0', 194 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 195 | max_episode_steps=150, 196 | kwargs={ 197 | 'reward_type': 'negative_sparse', 198 | 'distance_threshold': 0.2, 199 | } 200 | ) 201 | 202 | register( 203 | id='maze2d-umaze-densel1_c05-v0', 204 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 205 | max_episode_steps=150, 206 | kwargs={ 207 | 'reward_type': 'densel1', 208 | 'coefficent': 0.5, 209 | } 210 | ) 211 | 212 | register( 213 | id='maze2d-umaze-densel1_c50-v0', 214 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 215 | max_episode_steps=150, 216 | kwargs={ 217 | 'reward_type': 'densel1', 218 | 'coefficent': 5.0, 219 | } 220 | ) 221 | 222 | register( 223 | id='maze2d-umaze-densel1_c20-v0', 224 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 225 | max_episode_steps=150, 226 | kwargs={ 227 | 'reward_type': 'densel1', 228 | 'coefficent': 2.0, 229 | } 230 | ) 231 | 232 | register( 233 | id='maze2d-umaze-densel2_c05-v0', 234 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 235 | max_episode_steps=150, 236 | kwargs={ 237 | 'reward_type': 'densel2', 238 | 'coefficent': 0.5, 239 | } 240 | ) 241 | 242 | register( 243 | id='maze2d-umaze-densel2_c50-v0', 244 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 245 | max_episode_steps=150, 246 | kwargs={ 247 | 'reward_type': 'densel2', 248 | 'coefficent': 5.0, 249 | } 250 | ) 251 | 252 | register( 253 | 
id='maze2d-umaze-densel2_c20-v0', 254 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 255 | max_episode_steps=150, 256 | kwargs={ 257 | 'reward_type': 'densel2', 258 | 'coefficent': 2.0, 259 | } 260 | ) 261 | 262 | register( 263 | id='maze2d-umaze-frac_m01o01-v0', 264 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 265 | max_episode_steps=150, 266 | kwargs={ 267 | 'reward_type': 'frac', 268 | 'multiplier': 0.1, 269 | 'offset': 0.1, 270 | } 271 | ) 272 | 273 | register( 274 | id='maze2d-umaze-frac_m001o001-v0', 275 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 276 | max_episode_steps=150, 277 | kwargs={ 278 | 'reward_type': 'frac', 279 | 'multiplier': 0.01, 280 | 'offset': 0.01, 281 | } 282 | ) 283 | 284 | register( 285 | id='maze2d-umaze-frac_m005o01-v0', 286 | entry_point='pic.gym.reward_shaping.maze_model:MazeEnv', 287 | max_episode_steps=150, 288 | kwargs={ 289 | 'reward_type': 'frac', 290 | 'multiplier': 0.05, 291 | 'offset': 0.1, 292 | } 293 | ) 294 | 295 | register( 296 | id='CartPoleNoise-v0', 297 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 298 | max_episode_steps=200, 299 | kwargs={ 300 | 'noise_type': 'uniform', 301 | 'noise_scale': 0.0, 302 | 'init_scale': 0.0, 303 | } 304 | ) 305 | 306 | register( 307 | id='HalfCheetahNoise-v2', 308 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 309 | max_episode_steps=1000, 310 | kwargs={ 311 | 'noise_type': 'uniform', 312 | 'noise_scale': 0.0, 313 | 'init_scale': 0.0, 314 | } 315 | ) 316 | 317 | register( 318 | id='HumanoidNoise-v2', 319 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 320 | max_episode_steps=1000, 321 | kwargs={ 322 | 'noise_type': 'uniform', 323 | 'noise_scale': 0.0, 324 | 'init_scale': 0.0, 325 | } 326 | ) 327 | 328 | # noisy_dynamics 329 | # CartPole 330 | register( 331 | id='CartPoleNoiseInit005Dynamics003-v0', 332 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 333 | max_episode_steps=200, 334 | kwargs={ 335 | 'noise_type': 'uniform', 336 | 'noise_scale': 0.03, 337 | 'init_scale': 0.05, 338 | } 339 | ) 340 | 341 | register( 342 | id='CartPoleNoiseInit005Dynamics005-v0', 343 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 344 | max_episode_steps=200, 345 | kwargs={ 346 | 'noise_type': 'uniform', 347 | 'noise_scale': 0.05, 348 | 'init_scale': 0.05, 349 | } 350 | ) 351 | 352 | register( 353 | id='CartPoleNoiseInit005Dynamics010-v0', 354 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 355 | max_episode_steps=200, 356 | kwargs={ 357 | 'noise_type': 'uniform', 358 | 'noise_scale': 0.1, 359 | 'init_scale': 0.05, 360 | } 361 | ) 362 | 363 | register( 364 | id='CartPoleNoiseInit010Dynamics000-v0', 365 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 366 | max_episode_steps=200, 367 | kwargs={ 368 | 'noise_type': 'uniform', 369 | 'noise_scale': 0.0, 370 | 'init_scale': 0.1, 371 | } 372 | ) 373 | 374 | register( 375 | id='CartPoleNoiseInit010Dynamics003-v0', 376 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 377 | max_episode_steps=200, 378 | kwargs={ 379 | 'noise_type': 'uniform', 380 | 'noise_scale': 0.03, 381 | 'init_scale': 0.1, 382 | } 383 | ) 384 | 385 | register( 386 | id='CartPoleNoiseInit010Dynamics005-v0', 387 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 388 | max_episode_steps=200, 389 | kwargs={ 390 | 'noise_type': 'uniform', 391 | 'noise_scale': 0.05, 392 | 
'init_scale': 0.1, 393 | } 394 | ) 395 | 396 | register( 397 | id='CartPoleNoiseInit010Dynamics010-v0', 398 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 399 | max_episode_steps=200, 400 | kwargs={ 401 | 'noise_type': 'uniform', 402 | 'noise_scale': 0.1, 403 | 'init_scale': 0.1, 404 | } 405 | ) 406 | 407 | register( 408 | id='CartPoleNoiseInit015Dynamics000-v0', 409 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 410 | max_episode_steps=200, 411 | kwargs={ 412 | 'noise_type': 'uniform', 413 | 'noise_scale': 0.0, 414 | 'init_scale': 0.15, 415 | } 416 | ) 417 | 418 | register( 419 | id='CartPoleNoiseInit015Dynamics003-v0', 420 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 421 | max_episode_steps=200, 422 | kwargs={ 423 | 'noise_type': 'uniform', 424 | 'noise_scale': 0.03, 425 | 'init_scale': 0.15, 426 | } 427 | ) 428 | 429 | register( 430 | id='CartPoleNoiseInit015Dynamics005-v0', 431 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 432 | max_episode_steps=200, 433 | kwargs={ 434 | 'noise_type': 'uniform', 435 | 'noise_scale': 0.05, 436 | 'init_scale': 0.15, 437 | } 438 | ) 439 | 440 | register( 441 | id='CartPoleNoiseInit015Dynamics010-v0', 442 | entry_point='pic.gym.noisy_dynamics.cartpole_noise:CartPoleNoiseEnv', 443 | max_episode_steps=200, 444 | kwargs={ 445 | 'noise_type': 'uniform', 446 | 'noise_scale': 0.1, 447 | 'init_scale': 0.15, 448 | } 449 | ) 450 | 451 | # HalfCheetah 452 | register( 453 | id='HalfCheetahNoiseInit010Dynamics003-v2', 454 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 455 | max_episode_steps=1000, 456 | kwargs={ 457 | 'noise_type': 'uniform', 458 | 'noise_scale': 0.03, 459 | 'init_scale': 0.1, 460 | } 461 | ) 462 | 463 | register( 464 | id='HalfCheetahNoiseInit010Dynamics005-v2', 465 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 466 | max_episode_steps=1000, 467 | kwargs={ 468 | 'noise_type': 'uniform', 469 | 'noise_scale': 0.05, 470 | 'init_scale': 0.1, 471 | } 472 | ) 473 | 474 | register( 475 | id='HalfCheetahNoiseInit010Dynamics010-v2', 476 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 477 | max_episode_steps=1000, 478 | kwargs={ 479 | 'noise_type': 'uniform', 480 | 'noise_scale': 0.1, 481 | 'init_scale': 0.1, 482 | } 483 | ) 484 | 485 | register( 486 | id='HalfCheetahNoiseInit030Dynamics000-v2', 487 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 488 | max_episode_steps=1000, 489 | kwargs={ 490 | 'noise_type': 'uniform', 491 | 'noise_scale': 0.00, 492 | 'init_scale': 0.3, 493 | } 494 | ) 495 | 496 | register( 497 | id='HalfCheetahNoiseInit030Dynamics003-v2', 498 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 499 | max_episode_steps=1000, 500 | kwargs={ 501 | 'noise_type': 'uniform', 502 | 'noise_scale': 0.03, 503 | 'init_scale': 0.3, 504 | } 505 | ) 506 | 507 | register( 508 | id='HalfCheetahNoiseInit030Dynamics005-v2', 509 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 510 | max_episode_steps=1000, 511 | kwargs={ 512 | 'noise_type': 'uniform', 513 | 'noise_scale': 0.05, 514 | 'init_scale': 0.3, 515 | } 516 | ) 517 | 518 | register( 519 | id='HalfCheetahNoiseInit030Dynamics010-v2', 520 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 521 | max_episode_steps=1000, 522 | kwargs={ 523 | 'noise_type': 'uniform', 524 | 'noise_scale': 0.1, 525 | 
'init_scale': 0.3, 526 | } 527 | ) 528 | 529 | register( 530 | id='HalfCheetahNoiseInit050Dynamics000-v2', 531 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 532 | max_episode_steps=1000, 533 | kwargs={ 534 | 'noise_type': 'uniform', 535 | 'noise_scale': 0.00, 536 | 'init_scale': 0.5, 537 | } 538 | ) 539 | 540 | register( 541 | id='HalfCheetahNoiseInit050Dynamics003-v2', 542 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 543 | max_episode_steps=1000, 544 | kwargs={ 545 | 'noise_type': 'uniform', 546 | 'noise_scale': 0.03, 547 | 'init_scale': 0.5, 548 | } 549 | ) 550 | 551 | register( 552 | id='HalfCheetahNoiseInit050Dynamics005-v2', 553 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 554 | max_episode_steps=1000, 555 | kwargs={ 556 | 'noise_type': 'uniform', 557 | 'noise_scale': 0.05, 558 | 'init_scale': 0.5, 559 | } 560 | ) 561 | 562 | register( 563 | id='HalfCheetahNoiseInit050Dynamics010-v2', 564 | entry_point='pic.gym.noisy_dynamics.halfcheetah_noise:HalfCheetahNoiseEnv', 565 | max_episode_steps=1000, 566 | kwargs={ 567 | 'noise_type': 'uniform', 568 | 'noise_scale': 0.1, 569 | 'init_scale': 0.5, 570 | } 571 | ) 572 | 573 | # Humanoid 574 | register( 575 | id='HumanoidNoiseInit001Dynamics003-v2', 576 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 577 | max_episode_steps=1000, 578 | kwargs={ 579 | 'noise_type': 'uniform', 580 | 'noise_scale': 0.03, 581 | 'init_scale': 0.01, 582 | } 583 | ) 584 | 585 | register( 586 | id='HumanoidNoiseInit001Dynamics005-v2', 587 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 588 | max_episode_steps=1000, 589 | kwargs={ 590 | 'noise_type': 'uniform', 591 | 'noise_scale': 0.05, 592 | 'init_scale': 0.01, 593 | } 594 | ) 595 | 596 | register( 597 | id='HumanoidNoiseInit001Dynamics010-v2', 598 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 599 | max_episode_steps=1000, 600 | kwargs={ 601 | 'noise_type': 'uniform', 602 | 'noise_scale': 0.1, 603 | 'init_scale': 0.01, 604 | } 605 | ) 606 | 607 | register( 608 | id='HumanoidNoiseInit003Dynamics000-v2', 609 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 610 | max_episode_steps=1000, 611 | kwargs={ 612 | 'noise_type': 'uniform', 613 | 'noise_scale': 0.0, 614 | 'init_scale': 0.03, 615 | } 616 | ) 617 | 618 | register( 619 | id='HumanoidNoiseInit003Dynamics003-v2', 620 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 621 | max_episode_steps=1000, 622 | kwargs={ 623 | 'noise_type': 'uniform', 624 | 'noise_scale': 0.03, 625 | 'init_scale': 0.03, 626 | } 627 | ) 628 | 629 | register( 630 | id='HumanoidNoiseInit003Dynamics005-v2', 631 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 632 | max_episode_steps=1000, 633 | kwargs={ 634 | 'noise_type': 'uniform', 635 | 'noise_scale': 0.05, 636 | 'init_scale': 0.03, 637 | } 638 | ) 639 | 640 | register( 641 | id='HumanoidNoiseInit003Dynamics010-v2', 642 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 643 | max_episode_steps=1000, 644 | kwargs={ 645 | 'noise_type': 'uniform', 646 | 'noise_scale': 0.1, 647 | 'init_scale': 0.03, 648 | } 649 | ) 650 | 651 | register( 652 | id='HumanoidNoiseInit005Dynamics000-v2', 653 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 654 | max_episode_steps=1000, 655 | kwargs={ 656 | 'noise_type': 'uniform', 657 | 'noise_scale': 0.0, 658 | 'init_scale': 0.05, 659 | } 660 
| ) 661 | 662 | register( 663 | id='HumanoidNoiseInit005Dynamics003-v2', 664 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 665 | max_episode_steps=1000, 666 | kwargs={ 667 | 'noise_type': 'uniform', 668 | 'noise_scale': 0.03, 669 | 'init_scale': 0.05, 670 | } 671 | ) 672 | 673 | register( 674 | id='HumanoidNoiseInit005Dynamics005-v2', 675 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 676 | max_episode_steps=1000, 677 | kwargs={ 678 | 'noise_type': 'uniform', 679 | 'noise_scale': 0.05, 680 | 'init_scale': 0.05, 681 | } 682 | ) 683 | 684 | register( 685 | id='HumanoidNoiseInit005Dynamics010-v2', 686 | entry_point='pic.gym.noisy_dynamics.humanoid_noise:HumanoidNoiseEnv', 687 | max_episode_steps=1000, 688 | kwargs={ 689 | 'noise_type': 'uniform', 690 | 'noise_scale': 0.1, 691 | 'init_scale': 0.05, 692 | } 693 | ) 694 | -------------------------------------------------------------------------------- /pic/gym/multi_step/multi_step.py: -------------------------------------------------------------------------------- 1 | from gym import Env 2 | from gym.spaces import Discrete, Box 3 | from gym.utils import seeding 4 | 5 | import numpy as np 6 | 7 | 8 | class MultiStepEnv(Env): 9 | def __init__(self, horizon=1): 10 | self.horizon = horizon 11 | self.action_space = Discrete(2) 12 | self.observation_space = Box(low=0, high=1, shape=(3,)) 13 | self.state_dict = { 14 | 's_1': np.array([1, 0, 0]), 15 | 's_2': np.array([0, 1, 0]), 16 | 's_3': np.array([0, 0, 1]), 17 | 's_4': np.array([1, 1, 1]), 18 | 's_5': np.array([0, 0, 0]), 19 | } 20 | self.seed() 21 | self.reset() 22 | 23 | def seed(self, seed=None): 24 | self.np_random, seed = seeding.np_random(seed) 25 | return [seed] 26 | 27 | def reset(self): 28 | self.state = 's_1' 29 | self.time_step = 0 30 | return self.state_dict[self.state] 31 | 32 | def step(self, action): 33 | assert self.action_space.contains(action) 34 | reward = 0.0 35 | if self.state == 's_1': 36 | if action == 0: 37 | self.state = 's_2' 38 | elif action == 1: 39 | self.state = 's_3' 40 | elif self.state == 's_2': 41 | if action == 1: 42 | self.state = 's_4' 43 | elif self.state =='s_4': 44 | if action == 0: 45 | self.state = 's_5' 46 | 47 | self.time_step += 1 48 | if self.time_step == self.horizon: 49 | done = True 50 | if self.state == 's_2' and self.time_step == 1: 51 | reward = 1.0 52 | elif self.state == 's_4' and self.time_step == 2: 53 | reward = 1.0 54 | elif self.state == 's_5' and self.time_step == 3: 55 | reward = 1.0 56 | else: 57 | done = False 58 | 59 | return self.state_dict[self.state], reward, done, {'state': self.state} 60 | 61 | 62 | if __name__ == "__main__": 63 | # test 64 | env = MultiStepEnv(horizon=1) 65 | print('=======horizon1=======') 66 | for i in range(2): 67 | print(i) 68 | obs = env.reset() 69 | obs, reward, done, info = env.step(i) 70 | print('======================') 71 | print('obs: {}'.format(obs)) 72 | print('reward: {}'.format(reward)) 73 | print('done: {}'.format(done)) 74 | print('state: {}'.format(info['state'])) 75 | print('======================') 76 | 77 | env = MultiStepEnv(horizon=2) 78 | for actions in [[0,0], [0,1], [1,0], [1,1]]: 79 | print('=======horizon2=======') 80 | print(actions) 81 | obs = env.reset() 82 | for i in range(2): 83 | obs, reward, done, info = env.step(actions[i]) 84 | print('======================') 85 | print('obs: {}'.format(obs)) 86 | print('reward: {}'.format(reward)) 87 | print('done: {}'.format(done)) 88 | print('state: {}'.format(info['state'])) 89 | 
print('======================') 90 | 91 | env = MultiStepEnv(horizon=3) 92 | for actions in [[0,0,0], [0,0,1], [0,1,0], [1,0,0], [0,1,1], [1,0,1], [1,1,0], [1,1,1]]: 93 | print('=======horizon3=======') 94 | print(actions) 95 | obs = env.reset() 96 | for i in range(3): 97 | obs, reward, done, info = env.step(actions[i]) 98 | print('======================') 99 | print('obs: {}'.format(obs)) 100 | print('reward: {}'.format(reward)) 101 | print('done: {}'.format(done)) 102 | print('state: {}'.format(info['state'])) 103 | print('======================') 104 | -------------------------------------------------------------------------------- /pic/gym/noisy_dynamics/cartpole_noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from gym.envs.classic_control import CartPoleEnv 4 | 5 | 6 | class CartPoleNoiseEnv(CartPoleEnv): 7 | def __init__(self, noise_type='uniform', noise_scale=0.0, init_scale=0.0): 8 | self.noise_type = noise_type 9 | assert self.noise_type in ['normal', 'uniform'] 10 | self.noise_scale = noise_scale 11 | self.init_scale = init_scale 12 | 13 | CartPoleEnv.__init__(self) 14 | 15 | def step(self, action): 16 | err_msg = "%r (%s) invalid" % (action, type(action)) 17 | assert self.action_space.contains(action), err_msg 18 | 19 | x, x_dot, theta, theta_dot = self.state 20 | force = self.force_mag if action == 1 else -self.force_mag 21 | costheta = np.cos(theta) 22 | sintheta = np.sin(theta) 23 | 24 | temp = (force + self.polemass_length * theta_dot ** 2 * sintheta) / self.total_mass 25 | thetaacc = (self.gravity * sintheta - costheta * temp) / (self.length * (4.0 / 3.0 - self.masspole * costheta ** 2 / self.total_mass)) 26 | xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass 27 | 28 | if self.noise_scale == 0.0: 29 | noise = np.zeros((1,)) 30 | elif self.noise_type == 'normal': 31 | noise = np.random.normal(loc=0., scale=self.noise_scale, size=1) 32 | elif self.noise_type == 'uniform': 33 | noise = np.random.uniform(-self.noise_scale, self.noise_scale, 1) 34 | 35 | if self.kinematics_integrator == 'euler': 36 | x = x + self.tau * x_dot 37 | x_dot = x_dot + self.tau * xacc 38 | theta = theta + self.tau * theta_dot 39 | theta_dot = theta_dot + self.tau * thetaacc + noise[0] 40 | else: # semi-implicit euler 41 | x_dot = x_dot + self.tau * xacc 42 | x = x + self.tau * x_dot 43 | theta_dot = theta_dot + self.tau * thetaacc + noise[0] 44 | theta = theta + self.tau * theta_dot 45 | 46 | self.state = (x, x_dot, theta, theta_dot) 47 | 48 | done = bool( 49 | x < -self.x_threshold 50 | or x > self.x_threshold 51 | or theta < -self.theta_threshold_radians 52 | or theta > self.theta_threshold_radians 53 | ) 54 | 55 | if not done: 56 | reward = 1.0 57 | elif self.steps_beyond_done is None: 58 | self.steps_beyond_done = 0 59 | reward = 1.0 60 | else: 61 | self.steps_beyond_done += 1 62 | reward = 0.0 63 | 64 | return np.array(self.state), reward, done, {} 65 | 66 | def reset(self): 67 | # original: low=-0.05, high=0.05 68 | self.state = self.np_random.uniform(low=-self.init_scale, high=self.init_scale, size=(4,)) 69 | self.steps_beyond_done = None 70 | return np.array(self.state) 71 | -------------------------------------------------------------------------------- /pic/gym/noisy_dynamics/halfcheetah_noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco import HalfCheetahEnv 3 | 4 | 5 | class 
HalfCheetahNoiseEnv(HalfCheetahEnv): 6 | def __init__(self, noise_type='uniform', noise_scale=0.0, init_scale=0.0): 7 | self.noise_type = noise_type 8 | assert self.noise_type in ['normal', 'uniform'] 9 | self.noise_scale = noise_scale 10 | self.init_scale = init_scale 11 | 12 | HalfCheetahEnv.__init__(self) 13 | 14 | def step(self, action): 15 | xposbefore = self.sim.data.qpos[0] 16 | 17 | # noise 18 | if self.noise_scale == 0.0: 19 | noise = np.zeros((1,)) 20 | elif self.noise_type == 'normal': 21 | noise = np.random.normal(loc=0., scale=self.noise_scale, size=1) 22 | elif self.noise_type == 'uniform': 23 | noise = np.random.uniform(-self.noise_scale, self.noise_scale, 1) 24 | self.do_simulation(action, self.frame_skip) 25 | # add noise 26 | self.sim.data.qvel[0] += noise[0] 27 | 28 | xposafter = self.sim.data.qpos[0] 29 | ob = self._get_obs() 30 | reward_ctrl = - 0.1 * np.square(action).sum() 31 | reward_run = (xposafter - xposbefore)/self.dt 32 | reward = reward_ctrl + reward_run 33 | done = False 34 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 35 | 36 | def _get_obs(self): 37 | return np.concatenate([ 38 | self.sim.data.qpos.flat[1:], 39 | self.sim.data.qvel.flat, 40 | ]) 41 | 42 | def reset_model(self): 43 | # original: self.init_scale=0.1 44 | qpos = self.init_qpos + self.np_random.uniform(low=-self.init_scale, high=self.init_scale, size=self.model.nq) 45 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * self.init_scale 46 | self.set_state(qpos, qvel) 47 | return self._get_obs() 48 | -------------------------------------------------------------------------------- /pic/gym/noisy_dynamics/humanoid_noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco import HumanoidEnv 3 | 4 | 5 | def mass_center(model, sim): 6 | mass = np.expand_dims(model.body_mass, 1) 7 | xpos = sim.data.xipos 8 | return (np.sum(mass * xpos, 0) / np.sum(mass))[0] 9 | 10 | 11 | class HumanoidNoiseEnv(HumanoidEnv): 12 | def __init__(self, noise_type='uniform', noise_scale=0.0, init_scale=0.0): 13 | self.noise_type = noise_type 14 | assert self.noise_type in ['normal', 'uniform'] 15 | self.noise_scale = noise_scale 16 | self.init_scale = init_scale 17 | 18 | HumanoidEnv.__init__(self) 19 | 20 | def _get_obs(self): 21 | data = self.sim.data 22 | return np.concatenate( 23 | [ 24 | data.qpos.flat[2:], 25 | data.qvel.flat, 26 | data.cinert.flat, 27 | data.cvel.flat, 28 | data.qfrc_actuator.flat, 29 | data.cfrc_ext.flat 30 | ] 31 | ) 32 | 33 | def step(self, a): 34 | pos_before = mass_center(self.model, self.sim) 35 | 36 | # noise 37 | if self.noise_scale == 0.0: 38 | noise = np.zeros((1,)) 39 | elif self.noise_type == 'normal': 40 | noise = np.random.normal(loc=0., scale=self.noise_scale, size=1) 41 | elif self.noise_type == 'uniform': 42 | noise = np.random.uniform(-self.noise_scale, self.noise_scale, 1) 43 | self.do_simulation(a, self.frame_skip) 44 | # add noise 45 | self.sim.data.qvel[0] += noise[0] 46 | 47 | pos_after = mass_center(self.model, self.sim) 48 | alive_bonus = 5.0 49 | data = self.sim.data 50 | lin_vel_cost = 1.25 * (pos_after - pos_before) / self.dt 51 | quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum() 52 | quad_impact_cost = .5e-6 * np.square(data.cfrc_ext).sum() 53 | quad_impact_cost = min(quad_impact_cost, 10) 54 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus 55 | qpos = self.sim.data.qpos 56 | done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0)) 
57 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, reward_quadctrl=-quad_ctrl_cost, reward_alive=alive_bonus, reward_impact=-quad_impact_cost) 58 | 59 | def reset_model(self): 60 | # original: self.init_scale = 0.01 61 | self.set_state( 62 | self.init_qpos + self.np_random.uniform(low=-self.init_scale, high=self.init_scale, size=self.model.nq), 63 | self.init_qvel + self.np_random.uniform(low=-self.init_scale, high=self.init_scale, size=self.model.nv,) 64 | ) 65 | return self._get_obs() 66 | -------------------------------------------------------------------------------- /pic/gym/reward_shaping/dynamic_mjc.py: -------------------------------------------------------------------------------- 1 | """ 2 | dynamic_mjc.py 3 | A small library for programmatically building MuJoCo XML files 4 | """ 5 | from contextlib import contextmanager 6 | import tempfile 7 | import numpy as np 8 | 9 | 10 | def default_model(name): 11 | """ 12 | Get a model with basic settings such as gravity and RK4 integration enabled 13 | """ 14 | model = MJCModel(name) 15 | root = model.root 16 | 17 | # Setup 18 | root.compiler(angle="radian", inertiafromgeom="true") 19 | default = root.default() 20 | default.joint(armature=1, damping=1, limited="true") 21 | default.geom(contype=0, friction='1 0.1 0.1', rgba='0.7 0.7 0 1') 22 | root.option(gravity="0 0 -9.81", integrator="RK4", timestep=0.01) 23 | return model 24 | 25 | 26 | def pointmass_model(name): 27 | """ 28 | Get a model with basic settings such as gravity and Euler integration enabled 29 | """ 30 | model = MJCModel(name) 31 | root = model.root 32 | 33 | # Setup 34 | root.compiler(angle="radian", inertiafromgeom="true", coordinate="local") 35 | default = root.default() 36 | default.joint(limited="false", damping=1) 37 | default.geom(contype=2, conaffinity="1", condim="1", friction=".5 .1 .1", density="1000", margin="0.002") 38 | root.option(timestep=0.01, gravity="0 0 0", iterations="20", integrator="Euler") 39 | return model 40 | 41 | 42 | class MJCModel(object): 43 | def __init__(self, name): 44 | self.name = name 45 | self.root = MJCTreeNode("mujoco").add_attr('model', name) 46 | 47 | @contextmanager 48 | def asfile(self): 49 | """ 50 | Usage: 51 | model = MJCModel('reacher') 52 | with model.asfile() as f: 53 | print(f.read()) # prints a dump of the model 54 | """ 55 | with tempfile.NamedTemporaryFile(mode='w+', suffix='.xml', delete=True) as f: 56 | self.root.write(f) 57 | f.seek(0) 58 | yield f 59 | 60 | def open(self): 61 | self.file = tempfile.NamedTemporaryFile(mode='w+', suffix='.xml', delete=True) 62 | self.root.write(self.file) 63 | self.file.seek(0) 64 | return self.file 65 | 66 | def close(self): 67 | self.file.close() 68 | 69 | def find_attr(self, attr, value): 70 | return self.root.find_attr(attr, value) 71 | 72 | def __getstate__(self): 73 | return {} 74 | 75 | def __setstate__(self, state): 76 | pass 77 | 78 | 79 | class MJCTreeNode(object): 80 | def __init__(self, name): 81 | self.name = name 82 | self.attrs = {} 83 | self.children = [] 84 | 85 | def add_attr(self, key, value): 86 | if isinstance(value, str): 87 | pass 88 | elif isinstance(value, list) or isinstance(value, np.ndarray): 89 | value = ' '.join([str(val).lower() for val in value]) 90 | else: 91 | value = str(value).lower() 92 | 93 | self.attrs[key] = value 94 | return self 95 | 96 | def __getattr__(self, name): 97 | def wrapper(**kwargs): 98 | newnode = MJCTreeNode(name) 99 | for (k, v) in kwargs.items(): 100 | newnode.add_attr(k, v) 101 | 
self.children.append(newnode) 102 | return newnode 103 | return wrapper 104 | 105 | def dfs(self): 106 | yield self 107 | if self.children: 108 | for child in self.children: 109 | for node in child.dfs(): 110 | yield node 111 | 112 | def find_attr(self, attr, value): 113 | """ Run DFS to find a matching attr """ 114 | if attr in self.attrs and self.attrs[attr] == value: 115 | return self 116 | for child in self.children: 117 | res = child.find_attr(attr, value) 118 | if res is not None: 119 | return res 120 | return None 121 | 122 | def write(self, ostream, tabs=0): 123 | contents = ' '.join(['%s="%s"'%(k,v) for (k,v) in self.attrs.items()]) 124 | if self.children: 125 | ostream.write('\t'*tabs) 126 | ostream.write('<%s %s>\n' % (self.name, contents)) 127 | for child in self.children: 128 | child.write(ostream, tabs=tabs+1) 129 | ostream.write('\t'*tabs) 130 | ostream.write('</%s>\n' % self.name) # closing tag 131 | else: 132 | ostream.write('\t'*tabs) 133 | ostream.write('<%s %s/>\n' % (self.name, contents)) 134 | 135 | def __str__(self): 136 | s = "<"+self.name 137 | s += ' '.join(['%s="%s"'%(k,v) for (k,v) in self.attrs.items()]) 138 | return s+">" 139 | -------------------------------------------------------------------------------- /pic/gym/reward_shaping/maze_model.py: -------------------------------------------------------------------------------- 1 | """ A pointmass maze env.""" 2 | from gym.envs.mujoco import mujoco_env 3 | from gym import utils 4 | from pic.gym.reward_shaping.dynamic_mjc import MJCModel 5 | import numpy as np 6 | import random 7 | 8 | 9 | WALL = 10 10 | EMPTY = 11 11 | GOAL = 12 12 | 13 | 14 | def parse_maze(maze_str): 15 | lines = maze_str.strip().split('\\') 16 | width, height = len(lines), len(lines[0]) 17 | maze_arr = np.zeros((width, height), dtype=np.int32) 18 | for w in range(width): 19 | for h in range(height): 20 | tile = lines[w][h] 21 | if tile == '#': 22 | maze_arr[w][h] = WALL 23 | elif tile == 'G': 24 | maze_arr[w][h] = GOAL 25 | elif tile == ' ' or tile == 'O' or tile == '0': 26 | maze_arr[w][h] = EMPTY 27 | else: 28 | raise ValueError('Unknown tile type: %s' % tile) 29 | return maze_arr 30 | 31 | 32 | def point_maze(maze_str): 33 | maze_arr = parse_maze(maze_str) 34 | 35 | mjcmodel = MJCModel('point_maze') 36 | mjcmodel.root.compiler(inertiafromgeom="true", angle="radian", coordinate="local") 37 | mjcmodel.root.option(timestep="0.01", gravity="0 0 0", iterations="20", integrator="Euler") 38 | default = mjcmodel.root.default() 39 | default.joint(damping=1, limited='false') 40 | default.geom(friction=".5 .1 .1", density="1000", margin="0.002", condim="1", contype="2", conaffinity="1") 41 | 42 | asset = mjcmodel.root.asset() 43 | asset.texture(type="2d", name="groundplane", builtin="checker", rgb1="0.2 0.3 0.4", rgb2="0.1 0.2 0.3", width=100, height=100) 44 | asset.texture( 45 | name="skybox", type="skybox", builtin="gradient", rgb1=".4 .6 .8", rgb2="0 0 0", 46 | width="800", height="800", mark="random", markrgb="1 1 1" 47 | ) 48 | asset.material(name="groundplane", texture="groundplane", texrepeat="20 20") 49 | asset.material(name="wall", rgba=".7 .5 .3 1") 50 | asset.material(name="target", rgba=".6 .3 .3 1") 51 | 52 | visual = mjcmodel.root.visual() 53 | visual.headlight(ambient=".4 .4 .4", diffuse=".8 .8 .8", specular="0.1 0.1 0.1") 54 | visual.map(znear=.01) 55 | visual.quality(shadowsize=2048) 56 | 57 | worldbody = mjcmodel.root.worldbody() 58 | worldbody.geom(name='ground', size="40 40 0.25", pos="0 0 -0.1", type="plane", contype=1, conaffinity=0, 
material="groundplane") 59 | 60 | particle = worldbody.body(name='particle', pos=[1.2,1.2,0]) 61 | particle.geom(name='particle_geom', type='sphere', size=0.1, rgba='0.0 0.0 1.0 0.0', contype=1) 62 | particle.site(name='particle_site', pos=[0.0,0.0,0], size=0.2, rgba='0.3 0.6 0.3 1') 63 | particle.joint(name='ball_x', type='slide', pos=[0,0,0], axis=[1,0,0]) 64 | particle.joint(name='ball_y', type='slide', pos=[0,0,0], axis=[0,1,0]) 65 | 66 | worldbody.site(name='target_site', pos=[0.0,0.0,0], size=0.2, material='target') 67 | 68 | width, height = maze_arr.shape 69 | for w in range(width): 70 | for h in range(height): 71 | if maze_arr[w,h] == WALL: 72 | worldbody.geom( 73 | conaffinity=1, 74 | type='box', 75 | name='wall_%d_%d'%(w,h), 76 | material='wall', 77 | pos=[w+1.0,h+1.0,0], 78 | size=[0.5,0.5,0.2] 79 | ) 80 | 81 | actuator = mjcmodel.root.actuator() 82 | actuator.motor(joint="ball_x", ctrlrange=[-1.0, 1.0], ctrllimited=True, gear=100) 83 | actuator.motor(joint="ball_y", ctrlrange=[-1.0, 1.0], ctrllimited=True, gear=100) 84 | 85 | return mjcmodel 86 | 87 | 88 | LARGE_MAZE = \ 89 | "############\\"+\ 90 | "#OOOO#OOOOO#\\"+\ 91 | "#O##O#O#O#O#\\"+\ 92 | "#OOOOOO#OOO#\\"+\ 93 | "#O####O###O#\\"+\ 94 | "#OO#O#OOOOO#\\"+\ 95 | "##O#O#O#O###\\"+\ 96 | "#OO#OOO#OGO#\\"+\ 97 | "############" 98 | 99 | LARGE_MAZE_EVAL = \ 100 | "############\\"+\ 101 | "#OO#OOO#OGO#\\"+\ 102 | "##O###O#O#O#\\"+\ 103 | "#OO#O#OOOOO#\\"+\ 104 | "#O##O#OO##O#\\"+\ 105 | "#OOOOOO#OOO#\\"+\ 106 | "#O##O#O#O###\\"+\ 107 | "#OOOO#OOOOO#\\"+\ 108 | "############" 109 | 110 | MEDIUM_MAZE = \ 111 | '########\\'+\ 112 | '#OO##OO#\\'+\ 113 | '#OO#OOO#\\'+\ 114 | '##OOO###\\'+\ 115 | '#OO#OOO#\\'+\ 116 | '#O#OO#O#\\'+\ 117 | '#OOO#OG#\\'+\ 118 | "########" 119 | 120 | MEDIUM_MAZE_EVAL = \ 121 | '########\\'+\ 122 | '#OOOOOG#\\'+\ 123 | '#O#O##O#\\'+\ 124 | '#OOOO#O#\\'+\ 125 | '###OO###\\'+\ 126 | '#OOOOOO#\\'+\ 127 | '#OO##OO#\\'+\ 128 | "########" 129 | 130 | SMALL_MAZE = \ 131 | "######\\"+\ 132 | "#OOOO#\\"+\ 133 | "#O##O#\\"+\ 134 | "#OOOO#\\"+\ 135 | "######" 136 | 137 | U_MAZE = \ 138 | "#####\\"+\ 139 | "#GOO#\\"+\ 140 | "###O#\\"+\ 141 | "#OOO#\\"+\ 142 | "#####" 143 | 144 | U_MAZE_EVAL = \ 145 | "#####\\"+\ 146 | "#OOG#\\"+\ 147 | "#O###\\"+\ 148 | "#OOO#\\"+\ 149 | "#####" 150 | 151 | OPEN = \ 152 | "#######\\"+\ 153 | "#OOOOO#\\"+\ 154 | "#OOGOO#\\"+\ 155 | "#OOOOO#\\"+\ 156 | "#######" 157 | 158 | 159 | class MazeEnv(mujoco_env.MujocoEnv, utils.EzPickle): 160 | def __init__( 161 | self, 162 | reward_type='negative_sparse', 163 | maze_spec=U_MAZE, 164 | reset_target=False, 165 | coefficent=1.0, 166 | distance_threshold=0.5, 167 | multiplier=0.01, 168 | offset=0.1, 169 | ): 170 | self.reset_target = reset_target 171 | self.str_maze_spec = maze_spec 172 | self.maze_arr = parse_maze(maze_spec) 173 | self.reward_type = reward_type 174 | self.coefficent = coefficent 175 | self.distance_threshold = distance_threshold 176 | self.multiplier = multiplier 177 | self.offset = offset 178 | self.reset_locations = list(zip(*np.where(self.maze_arr == EMPTY))) 179 | self.reset_locations.sort() 180 | 181 | self._target = np.array([0.0,0.0]) 182 | 183 | model = point_maze(maze_spec) 184 | with model.asfile() as f: 185 | mujoco_env.MujocoEnv.__init__(self, model_path=f.name, frame_skip=1) 186 | utils.EzPickle.__init__(self) 187 | 188 | # Set the default goal (overriden by a call to set_target) 189 | # Try to find a goal if it exists 190 | self.goal_locations = list(zip(*np.where(self.maze_arr == GOAL))) 191 | if 
len(self.goal_locations) == 1: 192 | self.set_target(self.goal_locations[0]) 193 | elif len(self.goal_locations) > 1: 194 | raise ValueError("More than 1 goal specified!") 195 | else: 196 | # If no goal, use the first empty tile 197 | self.set_target(np.array(self.reset_locations[0]).astype(self.observation_space.dtype)) 198 | self.empty_and_goal_locations = self.reset_locations + self.goal_locations 199 | 200 | def step(self, action): 201 | action = np.clip(action, -1.0, 1.0) 202 | self.clip_velocity() 203 | self.do_simulation(action, self.frame_skip) 204 | self.set_marker() 205 | ob = self._get_obs() 206 | 207 | if self.reward_type == 'negative_sparse': 208 | reward = 0.0 if np.linalg.norm(ob[0:2] - self._target) <= self.distance_threshold else -1.0 209 | assert reward < 0.001 210 | elif self.reward_type == 'densel1': 211 | reward = -self.coefficent * np.linalg.norm(ob[0:2] - self._target, ord=1) 212 | assert reward < 0.0 213 | elif self.reward_type == 'densel2': 214 | reward = -self.coefficent * np.linalg.norm(ob[0:2] - self._target, ord=2) 215 | assert reward < 0.0 216 | elif self.reward_type == 'frac': 217 | d = np.linalg.norm(ob[0:2] - self._target, ord=2) 218 | reward = self.multiplier / (self.offset + d) 219 | else: 220 | raise ValueError('Unknown reward type %s' % self.reward_type) 221 | done = False 222 | return ob, reward, done, {} 223 | 224 | def _get_obs(self): 225 | return np.concatenate([self.sim.data.qpos, self.sim.data.qvel]).ravel() 226 | 227 | def get_target(self): 228 | return self._target 229 | 230 | def set_target(self, target_location=None): 231 | if target_location is None: 232 | idx = self.np_random.choice(len(self.empty_and_goal_locations)) 233 | reset_location = np.array(self.empty_and_goal_locations[idx]).astype(self.observation_space.dtype) 234 | target_location = reset_location + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 235 | self._target = target_location 236 | 237 | def set_marker(self): 238 | self.data.site_xpos[self.model.site_name2id('target_site')] = np.array([self._target[0]+1, self._target[1]+1, 0.0]) 239 | 240 | def clip_velocity(self): 241 | qvel = np.clip(self.sim.data.qvel, -5.0, 5.0) 242 | self.set_state(self.sim.data.qpos, qvel) 243 | 244 | def reset_model(self): 245 | idx = self.np_random.choice(len(self.empty_and_goal_locations)) 246 | reset_location = np.array(self.empty_and_goal_locations[idx]).astype(self.observation_space.dtype) 247 | qpos = reset_location + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 248 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 249 | self.set_state(qpos, qvel) 250 | if self.reset_target: 251 | self.set_target() 252 | return self._get_obs() 253 | 254 | def reset_to_location(self, location): 255 | self.sim.reset() 256 | reset_location = np.array(location).astype(self.observation_space.dtype) 257 | qpos = reset_location + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 258 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 259 | self.set_state(qpos, qvel) 260 | return self._get_obs() 261 | 262 | def viewer_setup(self): 263 | pass 264 | -------------------------------------------------------------------------------- /pic/gym/reward_shaping/reacher_norm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco import ReacherEnv 3 | 4 | 5 | class ReacherL1Env(ReacherEnv): 6 | def __init__(self, coefficent=1.0): 7 | self.coefficent = coefficent 8 | 9 | 
ReacherEnv.__init__(self) 10 | 11 | def step(self, a): 12 | vec = self.get_body_com("fingertip")-self.get_body_com("target") 13 | reward = - self.coefficent * np.linalg.norm(vec, ord=1) # L1 norm 14 | self.do_simulation(a, self.frame_skip) 15 | ob = self._get_obs() 16 | done = False 17 | return ob, reward, done, {} 18 | 19 | 20 | class ReacherL2Env(ReacherEnv): 21 | def __init__(self, coefficent=1.0): 22 | self.coefficent = coefficent 23 | 24 | ReacherEnv.__init__(self) 25 | 26 | def step(self, a): 27 | vec = self.get_body_com("fingertip")-self.get_body_com("target") 28 | reward = - self.coefficent * np.linalg.norm(vec) # L2 norm 29 | self.do_simulation(a, self.frame_skip) 30 | ob = self._get_obs() 31 | done = False 32 | return ob, reward, done, {} 33 | 34 | 35 | class ReacherSparseEnv(ReacherEnv): 36 | def __init__(self, distance_threshold=0.05): 37 | self.distance_threshold = distance_threshold 38 | 39 | ReacherEnv.__init__(self) 40 | 41 | def step(self, a): 42 | vec = self.get_body_com("fingertip")-self.get_body_com("target") 43 | d = np.linalg.norm(vec) 44 | reward = -(d > self.distance_threshold).astype(np.float32) # Sparse 45 | self.do_simulation(a, self.frame_skip) 46 | ob = self._get_obs() 47 | done = False 48 | return ob, reward, done, {} 49 | 50 | 51 | class ReacherFracEnv(ReacherEnv): 52 | def __init__(self, multiplier=0.01, offset=0.1): 53 | self.multiplier = multiplier 54 | self.offset = offset 55 | 56 | ReacherEnv.__init__(self) 57 | 58 | def step(self, a): 59 | vec = self.get_body_com("fingertip")-self.get_body_com("target") 60 | d = np.linalg.norm(vec) 61 | reward = self.multiplier / (self.offset + d) 62 | self.do_simulation(a, self.frame_skip) 63 | ob = self._get_obs() 64 | done = False 65 | return ob, reward, done, {} 66 | -------------------------------------------------------------------------------- /pic/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from pic.nn.numpymlp import NumpyMLP # NOQA 2 | -------------------------------------------------------------------------------- /pic/nn/numpymlp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class NumpyMLP: 5 | def __init__( 6 | self, 7 | n_inputs, 8 | n_outputs, 9 | n_hidden_layers=2, 10 | n_hidden_units=4, 11 | random_dist="normal", 12 | normal_mean=0.0, 13 | normal_sigma=1.0, 14 | uniform_bound=1.0, 15 | act_fn="tanh", 16 | use_bias=False 17 | ): 18 | self.n_inputs = n_inputs 19 | self.n_outputs = n_outputs 20 | self.n_hidden_layers = n_hidden_layers 21 | self.n_hidden_units = n_hidden_units 22 | random_dists = ['normal', 'uniform', 'xavier_uniform', 'xavier_normal'] 23 | assert (random_dist in random_dists) 24 | self.random_dist = random_dist 25 | self.random_dist_scaling = 1.0 26 | self.normal_mean = normal_mean 27 | self.normal_sigma = normal_sigma 28 | self.uniform_bound = uniform_bound 29 | self.use_bias = use_bias 30 | 31 | self.init_weights() 32 | 33 | activation_fn_d = { 34 | "tanh": np.tanh, 35 | "linear": lambda x: x, 36 | "relu": lambda x: np.maximum(0, x), 37 | } 38 | assert (act_fn in activation_fn_d.keys()) 39 | self.act_fn = activation_fn_d[act_fn] 40 | 41 | def init_weights(self): 42 | self.weights_matrix = [] 43 | mat_input_size = self.n_inputs 44 | if self.use_bias: 45 | mat_input_size += 1 46 | 47 | for i in range(self.n_hidden_layers): 48 | mat_output_size = self.n_hidden_units 49 | if self.random_dist == "normal": 50 | mat = np.random.normal(loc=self.normal_mean, 
scale=self.normal_sigma, size=(mat_output_size, mat_input_size)) 51 | elif self.random_dist == "uniform": 52 | mat = np.random.uniform(-self.uniform_bound, self.uniform_bound, (mat_output_size, mat_input_size)) 53 | elif self.random_dist == "xavier_uniform": 54 | bound = 5 / 3 * np.sqrt(6 / (mat_output_size + mat_input_size)) # for tanh 55 | mat = np.random.uniform(-bound, bound, (mat_output_size, mat_input_size)) 56 | elif self.random_dist == "xavier_normal": 57 | bound = 5 / 3 * np.sqrt(2 / (mat_output_size + mat_input_size)) # for tanh 58 | mat = np.random.normal(loc=0.0, scale=bound, size=(mat_output_size, mat_input_size)) 59 | else: 60 | raise ValueError('unknown random_dist: %s' % self.random_dist) 61 | self.weights_matrix.append(self.random_dist_scaling * mat) 62 | mat_input_size = mat_output_size 63 | if self.use_bias: 64 | mat_input_size += 1 65 | # for the last layer: 66 | if self.random_dist == "normal": 67 | mat = np.random.normal(loc=self.normal_mean, scale=self.normal_sigma, size=(self.n_outputs, mat_input_size)) 68 | elif self.random_dist == "uniform": 69 | mat = np.random.uniform(-self.uniform_bound, self.uniform_bound, (self.n_outputs, mat_input_size)) 70 | elif self.random_dist == "xavier_uniform": 71 | bound = 5 / 3 * np.sqrt(6 / (self.n_outputs + mat_input_size)) # for tanh 72 | mat = np.random.uniform(-bound, bound, (self.n_outputs, mat_input_size)) 73 | elif self.random_dist == "xavier_normal": 74 | bound = 5 / 3 * np.sqrt(2 / (self.n_outputs + mat_input_size)) # for tanh 75 | mat = np.random.normal(loc=0., scale=bound, size=(self.n_outputs, mat_input_size)) 76 | self.weights_matrix.append(self.random_dist_scaling * mat) 77 | 78 | self.w_mat_shapes = [w.shape for w in self.weights_matrix] 79 | self.w_mat_lens = [len(w.flatten()) for w in self.weights_matrix] 80 | self.n_weights = sum(self.w_mat_lens) 81 | 82 | def forward(self, x): 83 | for i, w in enumerate(self.weights_matrix): 84 | if self.use_bias: 85 | x = np.concatenate((x, [1.0])) 86 | x = np.dot(w, x) 87 | if i < self.n_hidden_layers: 88 | x = self.act_fn(x) 89 | return x 90 | -------------------------------------------------------------------------------- /pic/sampler/__init__.py: -------------------------------------------------------------------------------- 1 | from pic.sampler.sampler import Sampler # NOQA 2 | from pic.sampler.sampler import make_env # NOQA 3 | -------------------------------------------------------------------------------- /pic/sampler/sampler.py: -------------------------------------------------------------------------------- 1 | import dm2gym 2 | import gym 3 | import itertools 4 | import random 5 | import multiprocessing as mp 6 | import numpy as np 7 | 8 | import pic 9 | 10 | class Sampler(object): 11 | def __init__(self, env_name, agent, max_episode_steps, n_samples=10**4, n_episodes=10**3, multiprocess=0): 12 | self.env_name = env_name 13 | self.agent = agent 14 | self.n_samples = n_samples 15 | self.n_episodes = n_episodes 16 | self.multiprocess = multiprocess 17 | self.max_episode_steps = max_episode_steps 18 | 19 | def sample(self): 20 | all_scores_per_param = [] 21 | if self.multiprocess > 0: 22 | num_worker = mp.cpu_count() 23 | if self.multiprocess > num_worker: 24 | self.multiprocess = num_worker 25 | p = mp.Pool(self.multiprocess) 26 | print("num_worker: {}/{}".format(self.multiprocess, num_worker)) 27 | 28 | for samp_num in range(self.n_samples): 29 | if samp_num % max(1, self.n_samples // 10) == 0: 30 | print(f"Sample {samp_num}/{self.n_samples}") 31 | score_episodes = [] 32 | if self.multiprocess > 0: 33 | episodes_per_worker = max(1, 
int(np.ceil(self.n_episodes / self.multiprocess))) 34 | scores = p.starmap(run_episode_wrapper, [[i, self.env_name, self.agent, self.max_episode_steps, episodes_per_worker] for i in range(self.multiprocess)]) 35 | scores = list(itertools.chain(*scores))[:self.n_episodes] 36 | assert len(scores) == self.n_episodes, f'{len(scores)} != {self.n_episodes}' 37 | score_episodes += scores 38 | else: 39 | env = make_env(self.env_name, seed=None) 40 | for _ in range(self.n_episodes): 41 | score = run_episode(env, self.agent, self.max_episode_steps) 42 | score_episodes.append(score) 43 | all_scores_per_param.append(score_episodes) 44 | self.agent.init_weights() 45 | 46 | if self.multiprocess > 0: 47 | p.close() 48 | 49 | return np.array(all_scores_per_param) 50 | 51 | 52 | def make_env(env_name, seed=None): 53 | if "dm2gym" in env_name: 54 | env = gym.make(env_name, environment_kwargs={'flat_observation': True}) 55 | else: 56 | env = gym.make(env_name) 57 | if seed is not None: 58 | env.seed(seed) 59 | random.seed(seed) 60 | np.random.seed(seed) 61 | return env 62 | 63 | 64 | def run_episode(env, agent, max_episode_steps): 65 | obs = env.reset() 66 | score = 0 67 | steps = 0 68 | done = False 69 | while not done: 70 | action = agent.get_action(obs) 71 | obs, r, done, _ = env.step(action) 72 | score += r 73 | steps += 1 74 | if steps >= max_episode_steps: 75 | done = True 76 | return score 77 | 78 | 79 | def run_episode_wrapper(index, env_name, agent, max_episode_steps, num_episodes): 80 | env = make_env(env_name, index) 81 | return [run_episode(env, agent, max_episode_steps) for _ in range(num_episodes)] 82 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages 2 | from setuptools import setup 3 | 4 | install_requires = [ 5 | 'torch>=1.5.1', 6 | 'gym>=0.17.2', 7 | 'numpy', 8 | 'pillow', 9 | 'optuna', 10 | 'cloudpickle==1.3.0', 11 | 'cycler==0.10.0', 12 | 'future==0.18.2', 13 | 'kiwisolver==1.2.0', 14 | 'matplotlib', 15 | 'pandas', 16 | 'pyglet==1.5.0', 17 | 'pyparsing==2.4.7', 18 | 'python-dateutil==2.8.1', 19 | 'pytz==2020.1', 20 | 'scipy', 21 | 'seaborn', 22 | 'six', 23 | 'tabulate==0.8.7', 24 | ] 25 | 26 | setup( 27 | name='pic', 28 | version='0.0.1', 29 | description='', 30 | author='Hiroki Furuta', 31 | author_email='', 32 | url='', 33 | license='MIT License', 34 | packages=find_packages(), 35 | install_requires=install_requires, 36 | ) 37 | --------------------------------------------------------------------------------