├── .dockerignore ├── .env ├── .gitignore ├── LICENSE ├── README.md ├── config ├── locals ├── ray-autoscaler-ec2.yaml └── ray-autoscaler-gce.yaml ├── docker ├── Dockerfile.softlearning ├── Dockerfile.softlearning.base.cpu ├── Dockerfile.softlearning.base.gpu ├── cloudbuild.yaml ├── docker-compose.cloud.yml ├── docker-compose.dev.cpu.yml ├── docker-compose.dev.gpu.yml └── entrypoint.sh ├── environment.yml ├── examples ├── __init__.py ├── development │ ├── __init__.py │ ├── main.py │ ├── main_test.py │ ├── simulate_policy.py │ └── variants.py ├── instrument.py ├── multi_goal │ ├── __init__.py │ ├── main.py │ └── variants.py └── utils.py ├── models ├── cross_maze_ant.xml ├── pusher_2d.xml └── simple_maze_ant.xml ├── requirements.txt ├── scripts ├── __init__.py ├── archive_gs.py ├── deploy-aws.sh ├── install_mujoco.py ├── sync_gs.py └── test-cloud-build.sh ├── setup.py └── softlearning ├── __init__.py ├── algorithms ├── __init__.py ├── rl_algorithm.py ├── sac.py └── sql.py ├── distributions ├── __init__.py └── bijectors │ ├── __init__.py │ ├── conditional_scale.py │ ├── conditional_scale_test.py │ ├── conditional_shift.py │ ├── conditional_shift_test.py │ ├── real_nvp_flow.py │ └── real_nvp_flow_test.py ├── environments ├── __init__.py ├── adapters │ ├── __init__.py │ ├── dm_control_adapter.py │ ├── dm_control_adapter_test.py │ ├── gym_adapter.py │ ├── gym_adapter_test.py │ ├── robosuite_adapter.py │ ├── robosuite_adapter_test.py │ ├── softlearning_env.py │ └── softlearning_env_test.py ├── dm_control │ ├── __init__.py │ └── suite │ │ ├── __init__.py │ │ └── wrappers │ │ └── __init__.py ├── gym │ ├── __init__.py │ ├── mujoco │ │ ├── __init__.py │ │ ├── image_pusher_2d.py │ │ └── pusher_2d.py │ ├── multi_goal.py │ ├── robotics │ │ └── __init__.py │ └── wrappers │ │ ├── __init__.py │ │ ├── rescale_observation.py │ │ └── rescale_observation_test.py ├── helpers.py └── utils.py ├── misc ├── __init__.py ├── kernel.py └── plotter.py ├── models ├── __init__.py ├── convnet.py ├── feedforward.py ├── feedforward_test.py └── utils.py ├── policies ├── __init__.py ├── base_policy.py ├── gaussian_policy.py ├── gaussian_policy_test.py ├── real_nvp_policy.py ├── real_nvp_policy_test.py ├── uniform_policy.py ├── uniform_policy_test.py └── utils.py ├── preprocessors └── __init__.py ├── replay_pools ├── __init__.py ├── flexible_replay_pool.py ├── flexible_replay_pool_test.py ├── goal_replay_pool.py ├── hindsight_experience_replay_pool.py ├── hindsight_experience_replay_pool_test.py ├── replay_pool.py ├── simple_replay_pool.py ├── simple_replay_pool_test.py └── union_pool.py ├── samplers ├── __init__.py ├── base_sampler.py ├── dummy_sampler.py ├── goal_sampler.py ├── remote_sampler.py ├── remote_sampler_test.py ├── simple_sampler.py └── utils.py ├── scripts ├── __init__.py └── console_scripts.py ├── utils ├── __init__.py ├── dict.py ├── gcp.py ├── git.py ├── gym.py ├── misc.py ├── numpy.py ├── random.py ├── serialization.py ├── serialization_test.py ├── tensorflow.py ├── times.py ├── tune.py └── video.py └── value_functions ├── __init__.py ├── base_value_function.py ├── base_value_function_test.py └── vanilla.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .dockerignore 2 | Dockerfile 3 | db.sqlite3 4 | __pycache__ 5 | *.pyc 6 | *.pyo 7 | *.pyd 8 | .Python 9 | env 10 | pip-log.txt 11 | pip-delete-this-directory.txt 12 | .tox 13 | .coverage 14 | .coverage.* 15 | .cache 16 | coverage.xml 17 | *,cover 18 | *.log 19 | .git 20 | data/ 21 | tmp/ 22 | vis/ 23 | 
.vscode 24 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | SOFTLEARNING_DEV_TAG=20181212-dev-cpu-v1 2 | SOFTLEARNING_DEV_CPU_TAG=20181212-dev-cpu-v1 3 | SOFTLEARNING_DEV_GPU_TAG=20181212-dev-gpu-v1 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .venv 86 | env/ 87 | venv/ 88 | ENV/ 89 | env.bak/ 90 | venv.bak/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | 105 | # soft learning specific things 106 | *.swp 107 | .idea 108 | *.mp4 109 | data/ 110 | vis/ 111 | tmp/ 112 | vendor/* 113 | .pkl 114 | 115 | 116 | .mujoco/ 117 | .vscode/ 118 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Softlearning authors and contributors 4 | 5 | Softlearning uses a shared copyright model: each contributor holds copyright over 6 | their contributions to Softlearning. The project versioning records all such 7 | contribution and copyright details. 8 | 9 | By contributing to the Softlearning repository through pull-request, comment, 10 | or otherwise, the contributor releases their content to the license and 11 | copyright terms herein. 
12 | 13 | Permission is hereby granted, free of charge, to any person obtaining a copy 14 | of this software and associated documentation files (the "Software"), to deal 15 | in the Software without restriction, including without limitation the rights 16 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 17 | copies of the Software, and to permit persons to whom the Software is 18 | furnished to do so, subject to the following conditions: 19 | 20 | The above copyright notice and this permission notice shall be included in all 21 | copies or substantial portions of the Software. 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 24 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 25 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 26 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 27 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 28 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 29 | SOFTWARE. 30 | -------------------------------------------------------------------------------- /config/locals: -------------------------------------------------------------------------------- 1 | AWS_ECR_REGISTRY_URL="" 2 | -------------------------------------------------------------------------------- /config/ray-autoscaler-ec2.yaml: -------------------------------------------------------------------------------- 1 | # A unique identifier for the head node and workers of this cluster. 2 | cluster_name: softlearning 3 | 4 | # The minimum number of worker nodes to launch in addition to the head 5 | # node. This number should be >= 0. 6 | min_workers: 0 7 | 8 | # The maximum number of worker nodes to launch in addition to the head 9 | # node. This takes precedence over min_workers. 10 | max_workers: 100 11 | 12 | # The initial number of worker nodes to launch in addition to the head 13 | # node. When the cluster is first brought up (or when it is refreshed with a 14 | # subsequent `ray up`) this number of nodes will be started. 15 | initial_workers: 0 16 | 17 | # This executes all commands on all nodes in the docker container, 18 | # and opens all the necessary ports to support the Ray cluster. 19 | # Empty string means disabled. 20 | docker: 21 | image: "" 22 | container_name: "" # e.g. ray_docker 23 | # container_name: "softlearning" 24 | 25 | # The autoscaler will scale up the cluster to this target fraction of resource 26 | # usage. For example, if a cluster of 10 nodes is 100% busy and 27 | # target_utilization is 0.8, it would resize the cluster to 13. This fraction 28 | # can be decreased to increase the aggressiveness of upscaling. 29 | # This value must be less than 1.0 for scaling to happen. 30 | target_utilization_fraction: 0.99 31 | 32 | # If a node is idle for this many minutes, it will be removed. 33 | idle_timeout_minutes: 5 34 | 35 | # Cloud-provider specific configuration. 36 | provider: 37 | type: aws 38 | region: us-west-2 39 | availability_zone: us-west-2a 40 | 41 | # How Ray will authenticate with newly launched nodes. 42 | auth: 43 | ssh_user: ubuntu 44 | # By default Ray creates a new private keypair, but you can also use your own. 45 | # If you do so, make sure to also set "KeyName" in the head and worker node 46 | # configurations below. 47 | # ssh_private_key: /path/to/your/key.pem 48 | 49 | # Provider-specific config for the head node, e.g. instance type.
By default 50 | # Ray will auto-configure unspecified fields such as SubnetId and KeyName. 51 | # For more documentation on available fields, see: 52 | # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances 53 | head_node: 54 | # TODO: pick suitable instance type (https://aws.amazon.com/ec2/instance-types) 55 | InstanceType: c5.2xlarge 56 | ImageId: 57 | 58 | # # You can provision additional disk space with a conf as follows 59 | # BlockDeviceMappings: 60 | # - DeviceName: /dev/sda1 61 | # Ebs: 62 | # VolumeSize: 50 63 | 64 | # Additional options in the boto docs. 65 | 66 | # Provider-specific config for worker nodes, e.g. instance type. By default 67 | # Ray will auto-configure unspecified fields such as SubnetId and KeyName. 68 | # For more documentation on available fields, see: 69 | # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances 70 | worker_nodes: 71 | # TODO: pick suitable instance type (https://aws.amazon.com/ec2/instance-types) 72 | InstanceType: c5.2xlarge 73 | # InstanceType: t2.micro 74 | ImageId: 75 | 76 | # Run workers on spot by default. Comment this out to use on-demand. 77 | InstanceMarketOptions: 78 | MarketType: spot 79 | # Additional options can be found in the boto docs, e.g. 80 | # SpotOptions: 81 | # MaxPrice: 0.5 82 | 83 | # Additional options in the boto docs. 84 | 85 | # Files or directories to copy to the head and worker nodes. The format is a 86 | # dictionary from REMOTE_PATH: LOCAL_PATH, e.g. 87 | file_mounts: { 88 | "/tmp/current_git_HEAD": "/.git/HEAD", 89 | "~/softlearning": "", 90 | "~/.mujoco/mjkey.txt": "~/.mujoco/mjkey.txt", 91 | } 92 | 93 | # List of shell commands to run to set up nodes. 94 | setup_commands: 95 | - >- 96 | pip install -U -e ~/softlearning 97 | 98 | initialization_commands: [] 99 | 100 | # Custom commands that will be run on the head node after common setup. 101 | head_setup_commands: [] 102 | 103 | # Custom commands that will be run on worker nodes after common setup. 104 | worker_setup_commands: [] 105 | 106 | # Command to start ray on the head node. You don't need to change this. 107 | head_start_ray_commands: 108 | - ray stop 109 | - ray start 110 | --head 111 | --redis-port=6379 112 | --object-manager-port=8076 113 | --autoscaling-config=~/ray_bootstrap_config.yaml 114 | --internal-config='{ 115 | "num_heartbeats_timeout":300, 116 | "raylet_heartbeat_timeout_milliseconds":1000 117 | }' 118 | 119 | # Command to start ray on worker nodes. You don't need to change this. 120 | worker_start_ray_commands: 121 | - ray stop 122 | - >- 123 | ray start 124 | --address=$RAY_HEAD_IP:6379 125 | --object-manager-port=8076 126 | -------------------------------------------------------------------------------- /config/ray-autoscaler-gce.yaml: -------------------------------------------------------------------------------- 1 | # A unique identifier for the head node and workers of this cluster. 2 | cluster_name: softlearning 3 | 4 | # The minimum number of worker nodes to launch in addition to the head 5 | # node. This number should be >= 0. 6 | min_workers: 0 7 | 8 | # The maximum number of worker nodes to launch in addition to the head 9 | # node. This takes precedence over min_workers. 10 | max_workers: 100 11 | 12 | # The initial number of worker nodes to launch in addition to the head 13 | # node. When the cluster is first brought up (or when it is refreshed with a 14 | # subsequent `ray up`) this number of nodes will be started.
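#
# A typical lifecycle for this config looks like the following (a sketch; it
# assumes the file is used from its in-repo path and that `ray` is installed
# on the local machine):
#
#   ray up config/ray-autoscaler-gce.yaml      # create or update the cluster
#   ray attach config/ray-autoscaler-gce.yaml  # open a shell on the head node
#   ray down config/ray-autoscaler-gce.yaml    # tear the cluster down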
15 | initial_workers: 0 16 | 17 | # This executes all commands on all nodes in the docker container, 18 | # and opens all the necessary ports to support the Ray cluster. 19 | # Empty string means disabled. 20 | docker: 21 | image: "" 22 | container_name: "" # e.g. ray_docker 23 | # container_name: "softlearning" 24 | 25 | # The autoscaler will scale up the cluster to this target fraction of resource 26 | # usage. For example, if a cluster of 10 nodes is 100% busy and 27 | # target_utilization is 0.8, it would resize the cluster to 13. This fraction 28 | # can be decreased to increase the aggressiveness of upscaling. 29 | # This value must be less than 1.0 for scaling to happen. 30 | target_utilization_fraction: 0.99 31 | 32 | # If a node is idle for this many minutes, it will be removed. 33 | idle_timeout_minutes: 5 34 | 35 | # Cloud-provider specific configuration. 36 | provider: 37 | type: gcp 38 | region: us-west1 39 | availability_zone: us-west1-a 40 | project_id: 41 | 42 | # How Ray will authenticate with newly launched nodes. 43 | auth: 44 | ssh_user: ubuntu 45 | # By default Ray creates a new private keypair, but you can also use your own. 46 | # If you do so, make sure to also set "KeyName" in the head and worker node 47 | # configurations below. 48 | # ssh_private_key: /path/to/your/key.pem 49 | 50 | # Provider-specific config for the head node, e.g. machine type. By default 51 | # Ray will auto-configure unspecified fields. 52 | # For more documentation on available fields, see: 53 | # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert 54 | head_node: 55 | machineType: n1-standard-4 # n1-highcpu-16 56 | disks: 57 | - boot: true 58 | autoDelete: true 59 | type: PERSISTENT 60 | initializeParams: 61 | diskSizeGb: 50 62 | # See https://cloud.google.com/compute/docs/images for more images 63 | sourceImage: projects//global/images/family/ 64 | 65 | # Additional options can be found in the compute docs at 66 | # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert 67 | 68 | # Provider-specific config for worker nodes, e.g. machine type. By default 69 | # Ray will auto-configure unspecified fields. 70 | # For more documentation on available fields, see: 71 | # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert 72 | worker_nodes: 73 | machineType: n1-standard-8 # n1-highcpu-8 74 | disks: 75 | - boot: true 76 | autoDelete: true 77 | type: PERSISTENT 78 | initializeParams: 79 | diskSizeGb: 50 80 | # See https://cloud.google.com/compute/docs/images for more images 81 | sourceImage: projects//global/images/family/ 82 | # Run workers on preemptible instances by default. 83 | # Note that GCP preemptible instances automatically shut down after 24h. 84 | # Comment this out to use on-demand. 85 | scheduling: 86 | - preemptible: true 87 | - onHostMaintenance: TERMINATE 88 | 89 | # Additional options can be found in the compute docs at 90 | # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert 91 | 92 | # Files or directories to copy to the head and worker nodes. The format is a 93 | # dictionary from REMOTE_PATH: LOCAL_PATH, e.g. 94 | file_mounts: { 95 | "/tmp/current_git_HEAD": "/.git/HEAD", 96 | "~/softlearning": "", 97 | "~/.mujoco/mjkey.txt": "~/.mujoco/mjkey.txt", 98 | } 99 | 100 | # List of shell commands to run to set up nodes.
101 | setup_commands: 102 | - >- 103 | pip install -U -e ~/softlearning 104 | 105 | initialization_commands: 106 | - gcloud auth configure-docker 107 | 108 | # Custom commands that will be run on the head node after common setup. 109 | head_setup_commands: [] 110 | 111 | # Custom commands that will be run on worker nodes after common setup. 112 | worker_setup_commands: [] 113 | 114 | # Command to start ray on the head node. You don't need to change this. 115 | head_start_ray_commands: 116 | - ray stop 117 | - ray start 118 | --head 119 | --redis-port=6379 120 | --object-manager-port=8076 121 | --autoscaling-config=~/ray_bootstrap_config.yaml 122 | --internal-config='{ 123 | "num_heartbeats_timeout":300, 124 | "raylet_heartbeat_timeout_milliseconds":1000 125 | }' 126 | 127 | # Command to start ray on worker nodes. You don't need to change this. 128 | worker_start_ray_commands: 129 | - ray stop 130 | - >- 131 | ray start 132 | --address=$RAY_HEAD_IP:6379 133 | --object-manager-port=8076 134 | -------------------------------------------------------------------------------- /docker/Dockerfile.softlearning: -------------------------------------------------------------------------------- 1 | # WIP 2 | 3 | # Dockerfile that clones the softlearning repo into the softlearning base 4 | # image. Should be used for running stuff on the cloud, e.g. with ray. 5 | 6 | # Base container to clone the softlearning-private repo 7 | FROM ubuntu:18.04 as softlearning_cloner 8 | # Note that the SSH_PRIVATE_KEY arg is NOT saved on the final container 9 | 10 | # add credentials on build 11 | ARG SSH_PRIVATE_KEY 12 | 13 | # install git 14 | RUN apt-get update \ 15 | && apt-get install -y git \ 16 | && mkdir /root/.ssh/ \ 17 | && echo "${SSH_PRIVATE_KEY}" > /root/.ssh/id_rsa \ 18 | && chmod 0600 /root/.ssh/id_rsa \ 19 | && touch /root/.ssh/known_hosts \ 20 | && ssh-keyscan github.com >> /root/.ssh/known_hosts \ 21 | && git clone git@github.com:rail-berkeley/softlearning.git /root/softlearning \ 22 | && rm -vf /root/.ssh/id_rsa 23 | 24 | # Base container to clone the sac_envs repo 25 | FROM ubuntu:18.04 as sac_envs_cloner 26 | # Note that the SSH_PRIVATE_KEY arg is NOT saved on the final container 27 | 28 | # add credentials on build 29 | ARG SSH_PRIVATE_KEY 30 | 31 | # install git 32 | RUN apt-get update \ 33 | && apt-get install -y git \ 34 | && mkdir /root/.ssh/ \ 35 | && echo "${SSH_PRIVATE_KEY}" > /root/.ssh/id_rsa \ 36 | && chmod 0600 /root/.ssh/id_rsa \ 37 | && touch /root/.ssh/known_hosts \ 38 | && ssh-keyscan github.com >> /root/.ssh/known_hosts \ 39 | && git clone git@github.com:vikashplus/sac_envs.git /root/sac_envs \ 40 | && rm -vf /root/.ssh/id_rsa 41 | 42 | FROM softlearning-dev 43 | 44 | # ========== Add codebase stub ========== 45 | COPY --from=softlearning_cloner /root/softlearning /root/softlearning 46 | COPY --from=sac_envs_cloner /root/sac_envs /root/sac_envs 47 | WORKDIR /root/softlearning 48 | -------------------------------------------------------------------------------- /docker/Dockerfile.softlearning.base.cpu: -------------------------------------------------------------------------------- 1 | # syntax = docker/dockerfile:1.0-experimental 2 | 3 | # Base softlearning container that contains all softlearning requirements, 4 | # but not the actual softlearning repo. Could be used, for example, when developing 5 | # softlearning, in which case you would mount the softlearning repo into the container 6 | # as a volume, and thus be able to modify code on the host, yet run things inside 7 | # the container.
You are encouraged to use docker-compose (docker-compose.dev.{cpu,gpu}.yml), 8 | # which should allow you to set up your environment with a single command. 9 | # 10 | # Usage: 11 | # 1) Build image. Typically `docker-compose` would handle this automatically for us 12 | # # but due to incompatible secret handling, we have to build the image manually. 13 | # DOCKER_BUILDKIT=1 \ 14 | # docker build \ 15 | # -f ./docker/Dockerfile.softlearning.base.cpu \ 16 | # -t softlearning:latest-cpu \ 17 | # --progress=plain \ 18 | # --secret id=mjkey,src="${HOME}/.mujoco/mjkey.txt" . 19 | # 2) Run: 20 | # docker-compose \ 21 | # -p ${USER} \ 22 | # -f ./docker/docker-compose.dev.cpu.yml \ 23 | # up \ 24 | # -d \ 25 | # --force-recreate 26 | 27 | 28 | ARG UBUNTU_VERSION=18.04 29 | 30 | FROM ubuntu:${UBUNTU_VERSION} as base 31 | 32 | ARG UBUNTU_VERSION 33 | 34 | SHELL ["/bin/bash", "-c"] 35 | 36 | # MAINTAINER Kristian Hartikainen 37 | 38 | ENV DEBIAN_FRONTEND="noninteractive" 39 | # See http://bugs.python.org/issue19846 40 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 41 | ENV PATH /opt/conda/bin:$PATH 42 | 43 | RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ 44 | libglib2.0-0 libxext6 libsm6 libxrender1 \ 45 | git mercurial subversion 46 | 47 | RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh && \ 48 | /bin/bash /tmp/miniconda.sh -b -p /opt/conda && \ 49 | rm /tmp/miniconda.sh && \ 50 | ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ 51 | echo ". /opt/conda/etc/profile.d/conda.sh" >> /etc/bash.bashrc 52 | 53 | RUN apt-get install -y curl grep sed dpkg && \ 54 | TINI_VERSION=`curl https://github.com/krallin/tini/releases/latest | grep -o "/v.*\"" | sed 's:^..\(.*\).$:\1:'` && \ 55 | curl -L "https://github.com/krallin/tini/releases/download/v${TINI_VERSION}/tini_${TINI_VERSION}.deb" > tini.deb && \ 56 | dpkg -i tini.deb && \ 57 | rm tini.deb && \ 58 | apt-get clean \ 59 | && rm -rf /var/lib/apt/lists/* 60 | 61 | 62 | RUN conda update -y --name base conda \ 63 | && conda clean --all -y 64 | 65 | 66 | # ========== Softlearning dependencies ========== 67 | RUN apt-get update -y \ 68 | && apt-get install -y --no-install-recommends \ 69 | build-essential \ 70 | curl \ 71 | git \ 72 | gnupg2 \ 73 | make \ 74 | cmake \ 75 | ffmpeg \ 76 | swig \ 77 | libz-dev \ 78 | unzip \ 79 | zlib1g-dev \ 80 | libglfw3 \ 81 | libglfw3-dev \ 82 | libxrandr2 \ 83 | libxinerama-dev \ 84 | libxi6 \ 85 | libxcursor-dev \ 86 | libgl1-mesa-dev \ 87 | libgl1-mesa-glx \ 88 | libglew-dev \ 89 | libosmesa6-dev \ 90 | lsb-release \ 91 | ack-grep \ 92 | patchelf \ 93 | vim \ 94 | emacs \ 95 | wget \ 96 | xpra \ 97 | xserver-xorg-dev \ 98 | xvfb \ 99 | && export CLOUD_SDK_REPO="cloud-sdk-$(lsb_release -c -s)" \ 100 | && echo "deb http://packages.cloud.google.com/apt $CLOUD_SDK_REPO main" \ 101 | | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list \ 102 | && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg \ 103 | | apt-key add - \ 104 | && apt-get update -y \ 105 | && apt-get install -y google-cloud-sdk \ 106 | && apt-get clean \ 107 | && rm -rf /var/lib/apt/lists/* 108 | 109 | 110 | # ========= MuJoCo =============== 111 | COPY ./scripts/install_mujoco.py /tmp/ 112 | 113 | RUN /tmp/install_mujoco.py --mujoco-path=/root/.mujoco --versions 1.50 2.00 \ 114 | && ln -s /root/.mujoco/mujoco200_linux /root/.mujoco/mujoco200 \ 115 | && rm /tmp/install_mujoco.py 116 | 117 | ENV LD_LIBRARY_PATH /root/.mujoco/mjpro150/bin:${LD_LIBRARY_PATH}
118 | ENV LD_LIBRARY_PATH /root/.mujoco/mujoco200/bin:${LD_LIBRARY_PATH} 119 | ENV LD_LIBRARY_PATH /root/.mujoco/mujoco200_linux/bin:${LD_LIBRARY_PATH} 120 | 121 | # This is a hack required to make mujoco-py compile in GPU mode 122 | RUN mkdir -p /usr/lib/nvidia-000 123 | ENV LD_LIBRARY_PATH ${LD_LIBRARY_PATH}:/usr/lib/nvidia-000 124 | 125 | 126 | # ========== Conda Environment ========== 127 | COPY ./environment.yml /tmp/ 128 | COPY ./requirements.txt /tmp/ 129 | 130 | # NOTE: Fetch `mjkey.txt` from secret mount to avoid writing it to the build 131 | # history. For details, see: 132 | # https://docs.docker.com/develop/develop-images/build_enhancements/#new-docker-build-secret-information 133 | RUN --mount=type=secret,id=mjkey,dst=/root/.mujoco/mjkey.txt \ 134 | conda env update -f /tmp/environment.yml \ 135 | && conda clean --all -y 136 | 137 | RUN echo "conda activate softlearning" >> ~/.bashrc \ 138 | && echo "cd ~/softlearning" >> ~/.bashrc 139 | 140 | 141 | # =========== Container Entrypoint ============= 142 | COPY ./docker/entrypoint.sh /entrypoint.sh 143 | ENTRYPOINT ["/usr/bin/tini", "--", "/entrypoint.sh"] 144 | -------------------------------------------------------------------------------- /docker/cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | # https://cloud.google.com/cloud-build/docs/build-config 2 | steps: 3 | 4 | # 1. Build gpu image 5 | # Build image with docker-compose 6 | - name: 'docker/compose:1.24.0' 7 | args: 8 | - '-f' 9 | - '/workspace/docker/docker-compose.dev.gpu.yml' 10 | - 'build' 11 | - '--force-rm' 12 | - '--parallel' 13 | secretEnv: 14 | - MJKEY 15 | env: 16 | - 'IMAGE_NAME=${REPO_NAME}' 17 | - 'IMAGE_TAG=${SHORT_SHA}' 18 | 19 | # 2. Retag the gpu image into a gcr.io repository. 20 | - name: 'gcr.io/cloud-builders/docker' 21 | args: 22 | - 'tag' 23 | - '${REPO_NAME}:${SHORT_SHA}-gpu' 24 | - 'gcr.io/${PROJECT_ID}/${REPO_NAME}:${SHORT_SHA}-gpu' 25 | 26 | - name: 'gcr.io/cloud-builders/docker' 27 | args: 28 | - 'tag' 29 | - '${REPO_NAME}:${SHORT_SHA}-gpu' 30 | - 'gcr.io/${PROJECT_ID}/${REPO_NAME}:latest-gpu' 31 | 32 | 33 | # 3. Build cpu image 34 | - name: 'docker/compose:1.24.0' 35 | args: 36 | - '-f' 37 | - '/workspace/docker/docker-compose.dev.cpu.yml' 38 | - 'build' 39 | - '--force-rm' 40 | - '--parallel' 41 | secretEnv: 42 | - MJKEY 43 | env: 44 | - 'IMAGE_NAME=${REPO_NAME}' 45 | - 'IMAGE_TAG=${SHORT_SHA}' 46 | 47 | # 4. Retag the cpu image into a gcr.io repository.
48 | - name: 'gcr.io/cloud-builders/docker' 49 | args: 50 | - 'tag' 51 | - '${REPO_NAME}:${SHORT_SHA}-cpu' 52 | - 'gcr.io/${PROJECT_ID}/${REPO_NAME}:${SHORT_SHA}-cpu' 53 | 54 | - name: 'gcr.io/cloud-builders/docker' 55 | args: 56 | - 'tag' 57 | - '${REPO_NAME}:${SHORT_SHA}-cpu' 58 | - 'gcr.io/${PROJECT_ID}/${REPO_NAME}:latest-cpu' 59 | 60 | 61 | options: 62 | substitution_option: 'ALLOW_LOOSE' 63 | secrets: 64 | - kmsKeyName: projects/tuomas-softlearning/locations/global/keyRings/softlearning-ring/cryptoKeys/softlearning-test 65 | secretEnv: 66 | MJKEY: | 67 | CiQAAis3HkqhleV++GZn2GvPx8fsw7lGNoPscpAqqdhMA8T64ZgS7wcAsW+D72srQXBnF3Gxpn/D 68 | WVPtYTiehuCCC+Knnl9MqU/X4/8b1TIqYzPfZMZmP85b4gzWZJ4LPYJVVZbjzZI3vSn8OB0ejXa3 69 | 5AK+NIjonhq/6/f6CeXJxlEXj8OL7PwFFe09yMjmpvcJPsvgJrGseKuZU/Bbl4iR3DTtNqA/0eId 70 | YhPmuq3XL4MCnVQ+OKNFfQZtCHEVXNvtBjZ0j0U6/pQMEOmhbOzp/zMEYeMH4/P553bF3m3L7Qi5 71 | Zt8lIVqIHs4mI9VBKQ2CRlUzIh5+Y8Luk5csErHM9ilg6dViEJAA3+cEijd92x3Df2/NevQdN7FB 72 | c74obb5u87V6GRjYor0HJujH9RJNcFXKs5Wi9x1/8Fw4fNH2fDBEAdOjDsSlL9zF/b6+9D3ncvan 73 | aWhtzNjr14coi9Ay9LoSJZRaCLUIB1VG6w5deQMpKuEs9b54u9UkwgMys5H9sEEnsuc6IQR1Prnz 74 | 7xQN7I8XfiSYwg3xoWgHMNHrcyZ/FLNXhP3j51L1AboSaNfe1SPrtiJie2pjrcaLMNI7iWbUkLgs 75 | JFvszGbbDTFTw6RA+U+uz6S4EnioFJJHHxsM0nB7CU7JC81PQ2m1lKdaKWMcZ5qsIyj7iFZjQWn0 76 | b1LMuPD0xdOt2FQ7BPlX8uE9Qrc3xJRFgscW0O0I1880OrjFaLQlosQjE/Sz43VaBE/xTsnRWAxT 77 | gVK3wK0pok5oiLxwbvXII0T/tv5lQdOxAcbKhEMXSWLZv86tQaOKW6rFPrL2MY2yEOcE3bk1oHzX 78 | vutfuCxdWKz42IPWFEhs69NxgxT1iBLRqR9KjKiTYnnXdTjOxJ0i/M1Q8PoMTB6QwXsyrYwDjsR7 79 | yK8jmKNuFmi4N535bOLg2z+wN4ClHdwlJODrcBFBCI0Xbykd/KBlX+VuStd/E6NOuAEgl3XqBPNw 80 | baHVo9OAyhLFNxGc9mrX3uKywzwEfkiDi0Zo0KLN4hi7J19kGvuKja22sm9aMpmeZKFMEP7bMc/3 81 | YwGvLAMPmGKhDbmFDOkyKwy/RPifVHomCJ0U8s29PSHxjTmukooYLsVHe1OVbkz44Xo68xQ6afLz 82 | LBVLfEIcWClbmDNYxDrCUQXnrZyGHpeNG3rCzqTX6a7ZUDh0locX9f+JphggJrcV05zBNiXeQ+XZ 83 | lxuAI0cnx4euiHZb6MRXA3H6TlS9PIEF4n7eLuIC827w55qMRmJmEY59mZ/1xqs8buln087mcz4b 84 | HIG5KpRwrSC80JHVpdiXrxupOjvknSWmvMo34dmNZvazcVkcWqT8otjwV8FDU7kTlIe+pXbV9YQx 85 | eBMtntxk93yy9vM7RHvMccGObx/iaQ== 86 | 87 | 88 | # logsBucket: 'gs://' 89 | images: 90 | - 'gcr.io/${PROJECT_ID}/${REPO_NAME}:latest-gpu' 91 | - 'gcr.io/${PROJECT_ID}/${REPO_NAME}:latest-cpu' 92 | - 'gcr.io/${PROJECT_ID}/${REPO_NAME}:${SHORT_SHA}-gpu' 93 | - 'gcr.io/${PROJECT_ID}/${REPO_NAME}:${SHORT_SHA}-cpu' 94 | -------------------------------------------------------------------------------- /docker/docker-compose.cloud.yml: -------------------------------------------------------------------------------- 1 | # WIP 2 | 3 | # Docker compose file that builds images and runs the containers needed for 4 | # running softlearning on cloud (e.g. with ray). You need to have your 5 | # MJKEY set in the environment. 6 | # 7 | # docker-compose \ 8 | # -f ./docker/docker-compose.dev.yml \ 9 | # build \ 10 | # --build-arg MJKEY="$(cat ~/.mujoco/mjkey.txt)" 11 | 12 | version: "3" 13 | services: 14 | softlearning-dev: 15 | image: softlearning-dev-compose-test 16 | build: 17 | context: ../. 18 | dockerfile: docker/Dockerfile.softlearning.base.gpu 19 | args: 20 | - MJKEY 21 | ports: 22 | - "6006-6016" # Tensorboard 23 | - "5000-5010" # Viskit 24 | - "8888-8898" # Jupyter 25 | volumes: 26 | - ~/.aws:/root/.aws # If using aws, e.g. for s3 27 | - ~/.config/gcloud:/root/.config/gcloud # If using gcp, e.g. 
for gs 28 | - ~/.mujoco:/root/.mujoco # mjkey.txt 29 | - ..:/root/softlearning-private 30 | -------------------------------------------------------------------------------- /docker/docker-compose.dev.cpu.yml: -------------------------------------------------------------------------------- 1 | # Docker compose file that builds images and runs the containers needed for 2 | # development. 3 | # 4 | # NOTE(hartikainen): This file doesn't currently work since docker-compose 5 | # doesn't support buildkit secrets. 6 | 7 | 8 | version: "2.4" 9 | services: 10 | softlearning-dev-cpu: 11 | image: ${IMAGE_NAME:-softlearning}:${IMAGE_TAG:-latest}-cpu 12 | container_name: softlearning-dev-cpu 13 | init: true 14 | working_dir: /root/softlearning 15 | environment: 16 | - DISPLAY=:0 17 | ports: 18 | - "6006" # Tensorboard 19 | - "5000" # Viskit 20 | - "8888" # Jupyter 21 | volumes: 22 | - ~/.aws:/root/.aws # If using aws, e.g. for s3 23 | - ~/.config/gcloud:/root/.config/gcloud # If using gcp, e.g. for gs 24 | - ~/.mujoco/mjkey.txt:/root/.mujoco/mjkey.txt 25 | - ..:/root/softlearning 26 | - ~/ray_results/softlearning-dev-cpu:/root/ray_results 27 | command: 28 | - bash 29 | stdin_open: true 30 | tty: true 31 | -------------------------------------------------------------------------------- /docker/docker-compose.dev.gpu.yml: -------------------------------------------------------------------------------- 1 | # Docker compose file that builds images and runs the containers needed for 2 | # development. 3 | # 4 | # NOTE(hartikainen): This file doesn't currently work since docker-compose 5 | # doesn't support buildkit secrets. 6 | 7 | 8 | version: "2.4" 9 | services: 10 | softlearning-dev-gpu: 11 | image: ${IMAGE_NAME:-softlearning}:${IMAGE_TAG:-latest}-gpu 12 | container_name: softlearning-dev-gpu 13 | runtime: nvidia 14 | init: true 15 | working_dir: /root/softlearning 16 | environment: 17 | - DISPLAY=:0 18 | ports: 19 | - "6006" # Tensorboard 20 | - "5000" # Viskit 21 | - "8888" # Jupyter 22 | volumes: 23 | - ~/.aws:/root/.aws # If using aws, e.g. for s3 24 | - ~/.config/gcloud:/root/.config/gcloud # If using gcp, e.g. for gs 25 | - ~/.mujoco/mjkey.txt:/root/.mujoco/mjkey.txt 26 | - ..:/root/softlearning 27 | - ~/ray_results/softlearning-dev-gpu:/root/ray_results 28 | command: 29 | - bash 30 | stdin_open: true 31 | tty: true 32 | -------------------------------------------------------------------------------- /docker/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | # Set up display; otherwise rendering will fail 6 | Xvfb -screen 0 320x240x24 & 7 | export DISPLAY=:0 8 | 9 | # Wait for the file to come up 10 | file="/tmp/.X11-unix/X0" 11 | for i in $(seq 1 10); do 12 | if [ -e "$file" ]; then 13 | break 14 | fi 15 | 16 | echo "Waiting for $file to be created (try $i/10)" 17 | sleep "$i" 18 | done 19 | if ! 
[ -e "$file" ]; then 20 | echo "Timing out: $file was not created" 21 | exit 1 22 | fi 23 | 24 | exec "$@" 25 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: softlearning 2 | channels: 3 | - defaults 4 | - conda-forge 5 | dependencies: 6 | - python>=3.8,<3.9 7 | - pip>=20.0 8 | - conda>=4.8 9 | # - cudatoolkit==11.0.221 10 | # - nvidia::cudnn==8.0.4 11 | - pip: 12 | - -r ./requirements.txt 13 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rail-berkeley/softlearning/13cf187cc93d90f7c217ea2845067491c3c65464/examples/__init__.py -------------------------------------------------------------------------------- /examples/development/__init__.py: -------------------------------------------------------------------------------- 1 | """Provides functions that are utilized by the command line interface. 2 | 3 | In particular, the examples are exposed to the command line interface 4 | (defined in `softlearning.scripts.console_scripts`) through the 5 | `get_trainable_class`, `get_variant_spec`, and `get_parser` functions. 6 | """ 7 | 8 | 9 | def get_trainable_class(*args, **kwargs): 10 | from .main import ExperimentRunner 11 | return ExperimentRunner 12 | 13 | 14 | def get_variant_spec(command_line_args, *args, **kwargs): 15 | from .variants import get_variant_spec 16 | variant_spec = get_variant_spec(command_line_args, *args, **kwargs) 17 | return variant_spec 18 | 19 | 20 | def get_parser(): 21 | from examples.utils import get_parser 22 | parser = get_parser() 23 | return parser 24 | -------------------------------------------------------------------------------- /examples/development/simulate_policy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | from pathlib import Path 5 | import pickle 6 | 7 | import pandas as pd 8 | 9 | from softlearning.environments.utils import get_environment_from_params 10 | from softlearning import policies 11 | from softlearning.samplers import rollouts 12 | from softlearning.utils.tensorflow import set_gpu_memory_growth 13 | from softlearning.utils.video import save_video 14 | from .main import ExperimentRunner 15 | 16 | 17 | DEFAULT_RENDER_KWARGS = { 18 | 'mode': 'human', 19 | } 20 | 21 | 22 | def parse_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument('checkpoint_path', 25 | type=str, 26 | help='Path to the checkpoint.') 27 | parser.add_argument('--max-path-length', '-l', type=int, default=1000) 28 | parser.add_argument('--num-rollouts', '-n', type=int, default=10) 29 | parser.add_argument('--render-kwargs', '-r', 30 | type=json.loads, 31 | default='{}', 32 | help="Kwargs for rollouts renderer.") 33 | parser.add_argument('--video-save-path', 34 | type=Path, 35 | default=None) 36 | 37 | args = parser.parse_args() 38 | 39 | return args 40 | 41 | 42 | def load_variant_progress_metadata(checkpoint_path): 43 | checkpoint_path = checkpoint_path.rstrip('/') 44 | trial_path = os.path.dirname(checkpoint_path) 45 | 46 | variant_path = os.path.join(trial_path, 'params.pkl') 47 | with open(variant_path, 'rb') as f: 48 | variant = pickle.load(f) 49 | 50 | metadata_path = os.path.join(checkpoint_path, ".tune_metadata") 51 | if os.path.exists(metadata_path): 52 
| with open(metadata_path, "rb") as f: 53 | metadata = pickle.load(f) 54 | else: 55 | metadata = None 56 | 57 | progress_path = os.path.join(trial_path, 'progress.csv') 58 | progress = pd.read_csv(progress_path) 59 | 60 | return variant, progress, metadata 61 | 62 | 63 | def load_environment(variant): 64 | environment_params = ( 65 | variant['environment_params']['evaluation'] 66 | if 'evaluation' in variant['environment_params'] 67 | else variant['environment_params']['training']) 68 | 69 | environment = get_environment_from_params(environment_params) 70 | return environment 71 | 72 | 73 | def load_policy(checkpoint_dir, variant, environment): 74 | policy_params = variant['policy_params'].copy() 75 | policy_params['config'] = { 76 | **policy_params['config'], 77 | 'action_range': (environment.action_space.low, 78 | environment.action_space.high), 79 | 'input_shapes': environment.observation_shape, 80 | 'output_shape': environment.action_shape, 81 | } 82 | 83 | policy = policies.get(policy_params) 84 | 85 | policy_save_path = ExperimentRunner._policy_save_path(checkpoint_dir) 86 | status = policy.load_weights(policy_save_path) 87 | status.assert_consumed().run_restore_ops() 88 | 89 | return policy 90 | 91 | 92 | def simulate_policy(checkpoint_path, 93 | num_rollouts, 94 | max_path_length, 95 | render_kwargs, 96 | video_save_path=None, 97 | evaluation_environment_params=None): 98 | checkpoint_path = os.path.abspath(checkpoint_path.rstrip('/')) 99 | variant, progress, metadata = load_variant_progress_metadata( 100 | checkpoint_path) 101 | environment = load_environment(variant) 102 | policy = load_policy(checkpoint_path, variant, environment) 103 | render_kwargs = {**DEFAULT_RENDER_KWARGS, **render_kwargs} 104 | 105 | paths = rollouts(num_rollouts, 106 | environment, 107 | policy, 108 | path_length=max_path_length, 109 | render_kwargs=render_kwargs) 110 | 111 | if video_save_path and render_kwargs.get('mode') == 'rgb_array': 112 | fps = 1 // getattr(environment, 'dt', 1/30) 113 | video_save_dir = os.path.expanduser(str(video_save_path)) 114 | for i, path in enumerate(paths): 115 | episode_video_path = os.path.join(video_save_dir, f'episode_{i}.mp4') 116 | save_video(path['images'], episode_video_path, fps=fps) 117 | 118 | return paths 119 | 120 | 121 | if __name__ == '__main__': 122 | set_gpu_memory_growth(True) 123 | args = parse_args() 124 | simulate_policy(**vars(args)) 125 | -------------------------------------------------------------------------------- /examples/multi_goal/__init__.py: -------------------------------------------------------------------------------- 1 | """Provides functions that are utilized by the command line interface. 2 | 3 | In particular, the examples are exposed to the command line interface 4 | (defined in `softlearning.scripts.console_scripts`) through the 5 | `get_trainable_class`, `get_variant_spec`, and `get_parser` functions.
6 | """ 7 | 8 | 9 | def get_trainable_class(*args, **kwargs): 10 | from .main import run_experiment 11 | return run_experiment 12 | 13 | 14 | def get_variant_spec(command_line_args, *args, **kwargs): 15 | from .variants import get_variant_spec 16 | variant_spec = get_variant_spec(command_line_args, *args, **kwargs) 17 | return variant_spec 18 | 19 | 20 | def get_parser(): 21 | from examples.utils import get_parser 22 | parser = get_parser() 23 | 24 | for dest, value in (('universe', 'gym'), 25 | ('task', 'MultiGoal'), 26 | ('domain', 'Default-v0')): 27 | option = next(x for x in parser._actions if x.dest == dest) 28 | option.default = value 29 | option.choices = {value} 30 | 31 | return parser 32 | -------------------------------------------------------------------------------- /examples/multi_goal/main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import numpy as np 4 | 5 | from softlearning import algorithms 6 | from softlearning.environments.utils import get_environment 7 | from softlearning.misc.plotter import QFPolicyPlotter 8 | from softlearning.samplers import SimpleSampler 9 | from softlearning import policies 10 | from softlearning.replay_pools import SimpleReplayPool 11 | from softlearning import value_functions 12 | from examples.instrument import run_example_local 13 | 14 | 15 | def run_experiment(variant, reporter): 16 | training_environment = ( 17 | get_environment('gym', 'MultiGoal', 'Default-v0', { 18 | 'actuation_cost_coeff': 30, 19 | 'distance_cost_coeff': 1, 20 | 'goal_reward': 10, 21 | 'init_sigma': 0.1, 22 | })) 23 | evaluation_environment = training_environment.copy() 24 | 25 | pool = SimpleReplayPool( 26 | environment=training_environment, 27 | max_size=1e6) 28 | 29 | sampler = SimpleSampler(max_path_length=30) 30 | 31 | variant['Q_params']['config'].update({ 32 | 'input_shapes': ( 33 | training_environment.observation_shape, 34 | training_environment.action_shape, 35 | ) 36 | }) 37 | Qs = value_functions.get(variant['Q_params']) 38 | 39 | variant['policy_params']['config'].update({ 40 | 'action_range': (training_environment.action_space.low, 41 | training_environment.action_space.high), 42 | 'input_shapes': training_environment.observation_shape, 43 | 'output_shape': training_environment.action_shape, 44 | }) 45 | policy = policies.get(variant['policy_params']) 46 | 47 | plotter = QFPolicyPlotter( 48 | Q=Qs[0], 49 | policy=policy, 50 | obs_lst=np.array(((-2.5, 0.0), 51 | (0.0, 0.0), 52 | (2.5, 2.5), 53 | (-2.5, -2.5))), 54 | default_action=(np.nan, np.nan), 55 | n_samples=100) 56 | 57 | variant['algorithm_params']['config'].update({ 58 | 'training_environment': training_environment, 59 | 'evaluation_environment': evaluation_environment, 60 | 'policy': policy, 61 | 'Qs': Qs, 62 | 'pool': pool, 63 | 'sampler': sampler, 64 | 'min_pool_size': 100, 65 | 'batch_size': 64, 66 | 'plotter': plotter, 67 | }) 68 | algorithm = algorithms.get(variant['algorithm_params']) 69 | 70 | for train_result in algorithm.train(): 71 | reporter(**train_result) 72 | 73 | 74 | def main(argv=None): 75 | """Run ExperimentRunner locally on ray. 76 | 77 | To run this example on cloud (e.g. gce/ec2), use the setup scripts: 78 | 'softlearning launch_example_{gce,ec2} examples.development '. 79 | 80 | Run 'softlearning launch_example_{gce,ec2} --help' for further 81 | instructions. 
82 | """ 83 | run_example_local('examples.multi_goal', argv) 84 | 85 | 86 | if __name__ == '__main__': 87 | main(argv=sys.argv[1:]) 88 | -------------------------------------------------------------------------------- /examples/multi_goal/variants.py: -------------------------------------------------------------------------------- 1 | from softlearning.utils.dict import deep_update 2 | 3 | ALGORITHM_PARAMS_BASE = { 4 | 'class_name': 'SAC', 5 | 6 | 'config': { 7 | 'epoch_length': 100, 8 | 'n_epochs': 1000, 9 | 'n_train_repeat': 1, 10 | 'eval_render_kwargs': { 11 | 'mode': 'human', 12 | }, 13 | 'eval_n_episodes': 10, 14 | 15 | 'discount': 0.99, 16 | 'reward_scale': 1.0, 17 | 'save_full_state': True, 18 | 'target_update_interval': 1000, 19 | 'tau': 1.0, 20 | } 21 | } 22 | 23 | ALGORITHM_PARAMS_ADDITIONAL = { 24 | 'SAC': { 25 | 'class_name': 'SAC', 26 | 'config': { 27 | 'lr': 3e-4, 28 | 'reward_scale': 0.1, 29 | 'target_entropy': 'auto', 30 | 'initial_exploration_policy': None 31 | } 32 | }, 33 | 'SQL': { 34 | 'class_name': 'SQL', 35 | 'config': { 36 | 'policy_lr': 3e-4, 37 | 'reward_scale': 0.1, 38 | 'value_n_particles': 16, 39 | 'kernel_n_particles': 32, 40 | 'kernel_update_ratio': 0.5, 41 | } 42 | } 43 | } 44 | 45 | 46 | def get_variant_spec(args): 47 | algorithm = args.algorithm 48 | 49 | layer_size = 128 50 | variant_spec = { 51 | 'layer_size': layer_size, 52 | 'policy_params': { 53 | 'class_name': 'FeedforwardGaussianPolicy', 54 | 'config': { 55 | 'hidden_layer_sizes': (layer_size, layer_size), 56 | 'squash': True, 57 | }, 58 | }, 59 | 'algorithm_params': deep_update( 60 | ALGORITHM_PARAMS_BASE, 61 | ALGORITHM_PARAMS_ADDITIONAL.get(algorithm, {}) 62 | ), 63 | 'Q_params': { 64 | 'class_name': 'double_feedforward_Q_function', 65 | 'config': { 66 | 'hidden_layer_sizes': (layer_size, layer_size), 67 | }, 68 | }, 69 | 'run_params': { 70 | 'seed': 1, 71 | }, 72 | } 73 | 74 | return variant_spec 75 | -------------------------------------------------------------------------------- /models/pusher_2d.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 72 | -------------------------------------------------------------------------------- /models/simple_maze_ant.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 90 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.11.0 2 | aiohttp==3.7.4 3 | aiohttp-cors==0.7.0 4 | aioredis==1.3.1 5 | appnope==0.1.2 6 | astunparse==1.6.3 7 | async-timeout==3.0.1 8 | attrs==20.3.0 9 | backcall==0.2.0 10 | blessings==1.7 11 | brotlipy==0.7.0 12 | cachetools==4.2.0 13 | certifi==2020.12.5 14 | cffi==1.14.4 15 | chardet==3.0.4 16 | click==7.1.2 17 | cloudpickle==1.6.0 18 | colorama==0.4.4 19 | colorful==0.5.4 20 | conda==4.9.2 21 | conda-package-handling==1.7.2 22 | cryptography==3.3.2 23 | cycler==0.10.0 24 | Cython==0.29.21 25 | decorator==4.4.2 26 | dm-control==0.0.322773188 27 | dm-env==1.3 28 | dm-tree==0.1.5 29 | fasteners==0.16 30 | filelock==3.0.12 31 | flatbuffers==1.12 32 | future==0.18.2 33 | gast==0.3.3 34 | gitdb==4.0.5 35 | GitPython==3.1.12 36 | glfw==2.0.0 37 | google-api-core==1.25.0 38 | google-auth==1.24.0 39 | google-auth-oauthlib==0.4.2 40 | google-pasta==0.2.0 41 | googleapis-common-protos==1.52.0 42 | gpustat==0.6.0 43 | grpcio==1.32.0 44 | gtimer==1.0.0b5 45 | gym==0.18.0 46 | h5py==2.10.0 47 | 
hiredis==1.1.0 48 | idna==2.10 49 | imageio==2.9.0 50 | iniconfig==1.1.1 51 | ipdb==0.13.4 52 | ipython==7.19.0 53 | ipython-genutils==0.2.0 54 | jedi==0.18.0 55 | joblib==1.0.0 56 | jsonschema==3.2.0 57 | Keras-Preprocessing==1.1.2 58 | kiwisolver==1.3.1 59 | labmaze==1.0.3 60 | lxml==4.6.2 61 | Markdown==3.3.3 62 | matplotlib==3.3.3 63 | msgpack==1.0.2 64 | mujoco-py==2.0.2.13 65 | multidict==5.1.0 66 | networkx==2.5 67 | numpy==1.19.5 68 | nvidia-ml-py3==7.352.0 69 | oauthlib==3.1.0 70 | opencensus==0.7.12 71 | opencensus-context==0.1.2 72 | opt-einsum==3.3.0 73 | packaging==20.8 74 | pandas==1.2.0 75 | parso==0.8.1 76 | pexpect==4.8.0 77 | pickleshare==0.7.5 78 | Pillow==7.2.0 79 | pip==20.3.3 80 | pluggy==0.13.1 81 | prometheus-client==0.9.0 82 | prompt-toolkit==3.0.10 83 | protobuf==3.14.0 84 | psutil==5.8.0 85 | ptyprocess==0.7.0 86 | py==1.10.0 87 | py-spy==0.3.4 88 | pyasn1==0.4.8 89 | pyasn1-modules==0.2.8 90 | pycosat==0.6.3 91 | pycparser==2.20 92 | pyglet==1.5.0 93 | Pygments==2.7.4 94 | PyOpenGL==3.1.5 95 | PyOpenGL-accelerate==3.1.5 96 | pyOpenSSL==20.0.1 97 | pyparsing==2.4.7 98 | pyrsistent==0.17.3 99 | PySocks==1.7.1 100 | pytest==6.2.1 101 | python-dateutil==2.8.1 102 | pytz==2020.5 103 | PyWavelets==1.1.1 104 | PyYAML==5.4 105 | ray[tune]==1.2.0 106 | redis==3.5.3 107 | requests==2.25.1 108 | requests-oauthlib==1.3.0 109 | rsa==4.7 110 | ruamel-yaml==0.15.87 111 | scikit-image==0.18.1 112 | scikit-learn==0.24.1 113 | scikit-video==1.1.11 114 | scipy==1.6.0 115 | setproctitle==1.2.1 116 | six==1.15.0 117 | smmap==3.0.4 118 | tabulate==0.8.7 119 | tensorboard==2.4.1 120 | tensorboard-plugin-wit==1.8.0 121 | tensorboardX==2.1 122 | tensorflow==2.4.1 123 | tensorflow-addons==0.12.1 124 | tensorflow-estimator==2.4.0 125 | tensorflow-probability==0.12.1 126 | termcolor==1.1.0 127 | threadpoolctl==2.1.0 128 | tifffile==2021.1.14 129 | toml==0.10.2 130 | tqdm==4.55.1 131 | traitlets==5.0.5 132 | typeguard==2.10.0 133 | typing-extensions==3.7.4.3 134 | urllib3==1.26.3 135 | wcwidth==0.2.5 136 | Werkzeug==1.0.1 137 | wheel==0.36.2 138 | wrapt==1.12.1 139 | yarl==1.6.3 140 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rail-berkeley/softlearning/13cf187cc93d90f7c217ea2845067491c3c65464/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/archive_gs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import argparse 4 | import os 5 | import subprocess 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('archive_path', type=str, default=None, nargs='?') 11 | parser.add_argument('--unarchive', action='store_true', default=False) 12 | parser.add_argument('--dry', action='store_true', default=False) 13 | args = parser.parse_args() 14 | 15 | return args 16 | 17 | 18 | def archive_gs(args): 19 | """Archive files in google cloud storage bucket. 20 | 21 | Moves files from `/ray/results` to `/archive/ray/results`. 22 | 23 | TODO(hartikainen): Refactor this to use project config instead of 24 | environment variables (e.g. `SAC_GS_BUCKET`). 
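    Example usage (the bucket and experiment names below are hypothetical):

        SAC_GS_BUCKET=gs://my-sac-bucket ./scripts/archive_gs.py my-experiment --dry

    With `--dry`, the underlying `gsutil -m mv` command is only printed;
    without it, the files are actually moved under
    `<bucket>/archive/ray/results`.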
25 | """ 26 | if 'SAC_GS_BUCKET' not in os.environ: 27 | raise ValueError( 28 | "'SAC_GS_BUCKET' environment variable needs to be set.") 29 | 30 | bucket = os.environ['SAC_GS_BUCKET'] 31 | fresh_results_path = os.path.join(bucket, 'ray', 'results') 32 | archive_results_path = os.path.join(bucket, 'archive', 'ray', 'results') 33 | 34 | fresh_url = os.path.join(fresh_results_path, args.archive_path) 35 | archive_url = os.path.join(archive_results_path, args.archive_path) 36 | 37 | src_url, dst_url = ( 38 | (archive_url, fresh_url) 39 | if args.unarchive 40 | else (fresh_url, archive_url)) 41 | 42 | command_parts = ['gsutil', '-m', 'mv', src_url, dst_url] 43 | command = " ".join(command_parts) 44 | 45 | if args.dry: 46 | print(command) 47 | return 48 | 49 | subprocess.call(command, shell=True) 50 | 51 | 52 | def main(): 53 | args = parse_args() 54 | archive_gs(args) 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /scripts/deploy-aws.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | declare -r SCRIPT_DIRECTORY="$(dirname $(realpath ${BASH_SOURCE[0]}))" 4 | declare -r PROJECT_ROOT="$(dirname ${SCRIPT_DIRECTORY})" 5 | 6 | cd "${PROJECT_ROOT}" \ 7 | && . ./.env \ 8 | && . ./config/locals 9 | 10 | if [ -z "${AWS_ECR_REGISTRY_URL}" ]; then 11 | echo "AWS_ECR_REGISTRY_URL variable in 'config/locals' is empty or unset." \ 12 | " Fill in the values in 'config/locals' and rerun this file." 13 | exit 1 14 | fi 15 | 16 | declare -r IMAGE_NAME="softlearning" 17 | declare -r IMAGE_TAG="${SOFTLEARNING_DEV_TAG}" 18 | declare -r TARGET_REGISTRY="${AWS_ECR_REGISTRY_URL}" 19 | 20 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 21 | 22 | build_docker_image() { 23 | 24 | echo "Building Docker image." 25 | 26 | docker-compose \ 27 | -f ./docker/docker-compose.dev.cpu.yml \ 28 | build \ 29 | --build-arg MJKEY="$(cat ~/.mujoco/mjkey.txt)" 30 | 31 | echo "Build successful." 
32 | 33 | } 34 | 35 | push_image_to_aws_ecr() { 36 | 37 | SOURCE_IMAGE="${IMAGE_NAME}:${IMAGE_TAG}" 38 | TARGET_IMAGE="${TARGET_REGISTRY}/${SOURCE_IMAGE}" 39 | 40 | echo "${SOURCE_IMAGE}" 41 | echo "${TARGET_IMAGE}" 42 | 43 | $(aws ecr get-login --no-include-email) 44 | 45 | docker tag "${SOURCE_IMAGE}" "${TARGET_IMAGE}" 46 | docker push "${TARGET_IMAGE}" 47 | 48 | } 49 | 50 | main() { 51 | 52 | build_docker_image 53 | push_image_to_aws_ecr 54 | 55 | } 56 | 57 | main 58 | -------------------------------------------------------------------------------- /scripts/install_mujoco.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from distutils.version import LooseVersion 5 | import os 6 | import subprocess 7 | import sys 8 | 9 | 10 | KNOWN_PLATFORMS = ('linux', 'darwin') 11 | DEFAULT_MUJOCO_PATH = '~/.mujoco' 12 | 13 | 14 | def get_parser(): 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--mujoco-path', type=str, default=DEFAULT_MUJOCO_PATH) 17 | parser.add_argument('--versions', 18 | type=str, 19 | nargs='+', 20 | default=('2.00', )) 21 | return parser 22 | 23 | 24 | def get_mujoco_zip_name(platform, version): 25 | past_150 = LooseVersion(version) > LooseVersion("1.50") 26 | basename = "mujoco" if past_150 else "mjpro" 27 | 28 | if platform == 'darwin': 29 | platform_id = 'macos' if past_150 else 'osx' 30 | elif platform == 'linux': 31 | platform_id = 'linux' 32 | else: 33 | raise ValueError(platform) 34 | 35 | # For example: "mujoco200_linux.zip" 36 | zip_name = f"{basename}{version.replace('.', '')}_{platform_id}.zip" 37 | return zip_name 38 | 39 | 40 | def install_mujoco(platform, version, mujoco_path): 41 | print(f"Installing MuJoCo version {version} to {mujoco_path}") 42 | 43 | mujoco_zip_name = get_mujoco_zip_name(platform, version) 44 | mujoco_dir_name = os.path.splitext(mujoco_zip_name)[0] 45 | if os.path.exists(os.path.join(mujoco_path, mujoco_dir_name)): 46 | print(f"MuJoCo {platform}, {version} already installed.") 47 | return 48 | 49 | mujoco_zip_url = f"https://www.roboti.us/download/{mujoco_zip_name}" 50 | 51 | if subprocess.call("command -v wget", shell=True) == 0: 52 | subprocess.check_call([ 53 | "wget", 54 | "--progress=bar:force", 55 | "--show-progress", 56 | "--timestamping", 57 | "--directory-prefix", 58 | mujoco_path, 59 | mujoco_zip_url]) 60 | elif subprocess.call("command -v curl", shell=True) == 0: 61 | subprocess.check_call([ 62 | "curl", 63 | "--location", 64 | "--show-error", 65 | "--output", 66 | os.path.join(mujoco_path, mujoco_zip_name), 67 | mujoco_zip_url]) 68 | else: 69 | raise ValueError("Need either `wget` or `curl` to download mujoco.") 70 | 71 | subprocess.call([ 72 | "unzip", 73 | "-n", 74 | os.path.join(mujoco_path, mujoco_zip_name), 75 | "-d", 76 | mujoco_path]) 77 | subprocess.call(["rm", os.path.join(mujoco_path, mujoco_zip_name)]) 78 | 79 | if LooseVersion(version) == LooseVersion('2.0'): 80 | subprocess.call([ 81 | "ln", 82 | "-s", 83 | os.path.join(mujoco_path, mujoco_dir_name), 84 | os.path.join(mujoco_path, "mujoco200"), 85 | ]) 86 | 87 | 88 | def main(): 89 | parser = get_parser() 90 | args = parser.parse_args() 91 | mujoco_path = os.path.expanduser(args.mujoco_path) 92 | 93 | if not os.path.exists(mujoco_path): 94 | os.makedirs(mujoco_path) 95 | 96 | platform = sys.platform 97 | assert platform in KNOWN_PLATFORMS, (platform, KNOWN_PLATFORMS) 98 | 99 | for version in args.versions: 100 | install_mujoco(platform, version,
mujoco_path) 101 | 102 | 103 | if __name__ == '__main__': 104 | main() 105 | -------------------------------------------------------------------------------- /scripts/sync_gs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import argparse 4 | import os 5 | import shlex 6 | import subprocess 7 | 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser() 11 | 12 | parser.add_argument( 13 | 'sync_path', type=str, default=None, nargs='?') 14 | parser.add_argument( 15 | '--sync-checkpoints', action='store_true', default=False) 16 | parser.add_argument( 17 | '--dry', action='store_true', default=False) 18 | args = parser.parse_args() 19 | 20 | return args 21 | 22 | 23 | def sync_gs(args): 24 | """Sync files from google cloud storage bucket to local machine. 25 | 26 | TODO(hartikainen): Refactor this to use project config instead of 27 | environment variables (e.g. `SAC_GS_BUCKET`). 28 | """ 29 | if 'SAC_GS_BUCKET' not in os.environ: 30 | raise ValueError( 31 | "'SAC_GS_BUCKET' environment variable needs to be set.") 32 | 33 | bucket = os.environ['SAC_GS_BUCKET'] 34 | 35 | remote_gs_parts = [bucket, 'ray', 'results'] 36 | local_gs_parts = [os.path.expanduser('~/ray_results/gs/')] 37 | 38 | if args.sync_path is not None: 39 | remote_gs_parts.append(args.sync_path) 40 | local_gs_parts.append(args.sync_path) 41 | 42 | remote_gs_path = os.path.join(*remote_gs_parts) 43 | local_gs_path = os.path.join(*local_gs_parts) 44 | 45 | if not os.path.exists(local_gs_path): 46 | os.makedirs(local_gs_path) 47 | 48 | command_parts = ['gsutil', '-m', 'rsync', '-r'] 49 | 50 | if not args.sync_checkpoints: 51 | command_parts += ['-x', '".*./checkpoint_.*./.*"'] 52 | 53 | if args.dry: 54 | command_parts += ["-n"] 55 | 56 | command_parts += [shlex.quote(remote_gs_path), shlex.quote(local_gs_path)] 57 | 58 | command = " ".join(command_parts) 59 | 60 | subprocess.call(command, shell=True) 61 | 62 | 63 | def main(): 64 | args = parse_args() 65 | sync_gs(args) 66 | 67 | 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /scripts/test-cloud-build.sh: -------------------------------------------------------------------------------- 1 | cloud-build-local \ 2 | --config=./docker/cloudbuild.yaml \ 3 | --dryrun=false \ 4 | --push \ 5 | --write-workspace=/tmp/workspace \ 6 | --substitutions=REPO_NAME="softlearning",BRANCH_NAME="$(git rev-parse --abbrev-ref HEAD)",COMMIT_SHA="$(git rev-parse HEAD)",SHORT_SHA="$(git rev-parse --short HEAD)" \ 7 | . 
8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from setuptools import find_packages 3 | 4 | 5 | NAME = 'softlearning' 6 | VERSION = '0.0.1' 7 | DESCRIPTION = ( 8 | "Softlearning is a deep reinforcement learning toolbox for training" 9 | " maximum entropy policies in continuous domains.") 10 | 11 | 12 | setup( 13 | name=NAME, 14 | packages=find_packages( 15 | exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), 16 | version=VERSION, 17 | description=DESCRIPTION, 18 | long_description=open('./README.md').read(), 19 | author='Kristian Hartikainen', 20 | author_email='kristian.hartikainen@gmail.com', 21 | url='https://github.com/rail-berkeley/softlearning', 22 | keywords=( 23 | 'softlearning', 24 | 'soft-actor-critic', 25 | 'sac', 26 | 'soft-q-learning', 27 | 'sql', 28 | 'machine-learning', 29 | 'reinforcement-learning', 30 | 'deep-learning', 31 | 'robotics', 32 | 'tensorflow', 33 | 'tensorflow-2', 34 | ), 35 | entry_points={ 36 | 'console_scripts': ( 37 | 'softlearning=softlearning.scripts.console_scripts:main', 38 | ), 39 | }, 40 | install_requires=( 41 | 'Click>=7.0', 42 | 'GitPython==3.1.2', 43 | 'dm-control>=0.0.322773188', 44 | 'gtimer>=1.0.0b5', 45 | 'gym>=0.17.2', 46 | 'mujoco-py>=2.0.2.10', 47 | 'numpy>=1.17.5', 48 | 'pandas', 49 | 'ray[tune]>=1.0.0', 50 | 'scikit-image>=0.17.2', 51 | 'scikit-video>=1.1.11', 52 | 'scipy>=1.4.1', 53 | 'tensorflow>=2.2.0', 54 | 'tensorflow-probability>=0.10.0', 55 | ), 56 | zip_safe=True, 57 | license='MIT' 58 | ) 59 | -------------------------------------------------------------------------------- /softlearning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rail-berkeley/softlearning/13cf187cc93d90f7c217ea2845067491c3c65464/softlearning/__init__.py -------------------------------------------------------------------------------- /softlearning/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | from softlearning.utils.serialization import ( 2 | serialize_softlearning_object, deserialize_softlearning_object) 3 | 4 | from .sql import SQL # noqa: unused-import 5 | from .sac import SAC # noqa: unused-import 6 | 7 | 8 | def serialize(algorithm): 9 | return serialize_softlearning_object(algorithm) 10 | 11 | 12 | def deserialize(name, custom_objects=None): 13 | """Returns a algorithm function or class denoted by input string. 14 | 15 | Arguments: 16 | name : String 17 | 18 | Returns: 19 | Algorithm function or class denoted by input string. 20 | 21 | For example: 22 | >>> softlearning.algorithms.get({'class_name': 'SAC', ...}) 23 | 24 | >>> softlearning.algorithms.get('abcd') 25 | Traceback (most recent call last): 26 | ... 27 | ValueError: Unknown algorithm: abcd 28 | 29 | Args: 30 | name: The name of the algorithm. 31 | 32 | Raises: 33 | ValueError: `Unknown algorithm` if the input string does not 34 | denote any defined algorithm. 35 | """ 36 | return deserialize_softlearning_object( 37 | name, 38 | module_objects=globals(), 39 | custom_objects=custom_objects, 40 | printable_module_name='algorithm') 41 | 42 | 43 | def get(identifier): 44 | """Returns a algorithm. 45 | 46 | Arguments: 47 | identifier: function, string, or dict. 48 | 49 | Returns: 50 | A algorithm denoted by identifier. 
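    (Accepted forms, matching the dispatch below: a callable is returned
    unchanged; a string or a dict such as {'class_name': 'SAC', ...} is
    passed on to `deserialize`.)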
51 | 52 | For example: 53 | >>> softlearning.algorithms.get({'class_name': 'SAC', ...}) 54 | 55 | >>> softlearning.algorithms.get('abcd') 56 | Traceback (most recent call last): 57 | ... 58 | ValueError: Unknown algorithm: abcd 59 | 60 | Raises: 61 | ValueError: Input is an unknown function or string, i.e., the 62 | identifier does not denote any defined algorithm. 63 | """ 64 | if identifier is None: 65 | return None 66 | if isinstance(identifier, str): 67 | return deserialize(identifier) 68 | elif isinstance(identifier, dict): 69 | return deserialize(identifier) 70 | elif callable(identifier): 71 | return identifier 72 | else: 73 | raise TypeError( 74 | f"Could not interpret algorithm function identifier:" 75 | " {repr(identifier)}.") 76 | -------------------------------------------------------------------------------- /softlearning/distributions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rail-berkeley/softlearning/13cf187cc93d90f7c217ea2845067491c3c65464/softlearning/distributions/__init__.py -------------------------------------------------------------------------------- /softlearning/distributions/bijectors/__init__.py: -------------------------------------------------------------------------------- 1 | from .conditional_scale import ConditionalScale 2 | from .conditional_shift import ConditionalShift 3 | 4 | 5 | __all__ = ( 6 | "ConditionalScale", 7 | "ConditionalShift", 8 | ) 9 | -------------------------------------------------------------------------------- /softlearning/distributions/bijectors/conditional_scale.py: -------------------------------------------------------------------------------- 1 | """Scale bijector.""" 2 | 3 | import tensorflow.compat.v2 as tf 4 | 5 | from tensorflow_probability.python.bijectors import bijector 6 | from tensorflow_probability.python.internal import assert_util 7 | 8 | 9 | __all__ = [ 10 | 'ConditionalScale', 11 | ] 12 | 13 | 14 | class ConditionalScale(bijector.Bijector): 15 | def __init__(self, 16 | dtype=tf.float32, 17 | validate_args=False, 18 | name='conditional_scale'): 19 | """Instantiates the `ConditionalScale` bijector. 20 | 21 | This `Bijector`'s forward operation is: 22 | 23 | ```none 24 | Y = g(X) = scale * X 25 | ``` 26 | 27 | Args: 28 | validate_args: Python `bool` indicating whether arguments should be 29 | checked for correctness. 30 | name: Python `str` name given to ops managed by this object. 
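        Example use (a sketch mirroring `conditional_scale_test.py`; note
        that `scale` is supplied as a call-time kwarg rather than at
        construction):

        ```python
        bijector = ConditionalScale()
        bijector.forward([1., 2., 3.], scale=2.)   # => [2., 4., 6.]
        bijector.inverse([2., 4., 6.], scale=2.)   # => [1., 2., 3.]
        ```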
31 | """ 32 | parameters = dict(locals()) 33 | with tf.name_scope(name) as name: 34 | super(ConditionalScale, self).__init__( 35 | forward_min_event_ndims=0, 36 | is_constant_jacobian=True, 37 | validate_args=validate_args, 38 | dtype=dtype, 39 | parameters=parameters, 40 | name=name) 41 | 42 | def _maybe_assert_valid_scale(self, scale): 43 | if not self.validate_args: 44 | return () 45 | is_non_zero = assert_util.assert_none_equal( 46 | scale, 47 | tf.zeros((), dtype=scale.dtype), 48 | message='Argument `scale` must be non-zero.') 49 | return (is_non_zero, ) 50 | 51 | def _forward(self, x, scale): 52 | with tf.control_dependencies(self._maybe_assert_valid_scale(scale)): 53 | return x * scale 54 | 55 | def _inverse(self, y, scale): 56 | with tf.control_dependencies(self._maybe_assert_valid_scale(scale)): 57 | return y / scale 58 | 59 | def _forward_log_det_jacobian(self, x, scale): 60 | with tf.control_dependencies(self._maybe_assert_valid_scale(scale)): 61 | return tf.math.log(tf.abs(scale)) 62 | -------------------------------------------------------------------------------- /softlearning/distributions/bijectors/conditional_scale_test.py: -------------------------------------------------------------------------------- 1 | """ConditionalScale Tests.""" 2 | 3 | # Dependency imports 4 | from absl.testing import parameterized 5 | import numpy as np 6 | import tensorflow.compat.v2 as tf 7 | 8 | from softlearning.distributions import bijectors 9 | from softlearning.internal import test_util 10 | 11 | 12 | @test_util.test_all_tf_execution_regimes 13 | class ScaleBijectorTest(test_util.TestCase, parameterized.TestCase): 14 | """Tests correctness of the Y = scale @ x transformation.""" 15 | 16 | def testName(self): 17 | bijector = bijectors.ConditionalScale() 18 | self.assertStartsWith(bijector.name, 'conditional_scale') 19 | 20 | @parameterized.named_parameters( 21 | dict(testcase_name='static_float32', is_static=True, dtype=np.float32), 22 | dict(testcase_name='static_float64', is_static=True, dtype=np.float64), 23 | dict(testcase_name='dynamic_float32', is_static=False, dtype=np.float32), 24 | dict(testcase_name='dynamic_float64', is_static=False, dtype=np.float64), 25 | ) 26 | def testNoBatchScale(self, is_static, dtype): 27 | scale = dtype(2.0) 28 | bijector = bijectors.ConditionalScale(dtype=dtype) 29 | x = self.maybe_static(np.array([1., 2, 3], dtype), is_static) 30 | self.assertAllClose([2., 4, 6], bijector.forward(x, scale=scale)) 31 | self.assertAllClose([.5, 1, 1.5], bijector.inverse(x, scale=scale)) 32 | self.assertAllClose( 33 | -np.log(2.), 34 | bijector.inverse_log_det_jacobian(x, scale=scale, event_ndims=0)) 35 | 36 | @parameterized.named_parameters( 37 | dict(testcase_name='static_float32', is_static=True, dtype=np.float32), 38 | dict(testcase_name='static_float64', is_static=True, dtype=np.float64), 39 | dict(testcase_name='dynamic_float32', is_static=False, dtype=np.float32), 40 | dict(testcase_name='dynamic_float64', is_static=False, dtype=np.float64), 41 | ) 42 | def testBatchScale(self, is_static, dtype): 43 | # Batched scale 44 | scale = tf.constant([2., 3.], dtype=dtype) 45 | bijector = bijectors.ConditionalScale(dtype=dtype) 46 | x = self.maybe_static(np.array([1.], dtype=dtype), is_static) 47 | self.assertAllClose([2., 3.], bijector.forward(x, scale=scale)) 48 | self.assertAllClose([0.5, 1./3.], bijector.inverse(x, scale=scale)) 49 | self.assertAllClose( 50 | [-np.log(2.), -np.log(3.)], 51 | bijector.inverse_log_det_jacobian(x, scale=scale, event_ndims=0)) 52 | 53 | 54 | 
if __name__ == '__main__': 55 | tf.test.main() 56 | -------------------------------------------------------------------------------- /softlearning/distributions/bijectors/conditional_shift.py: -------------------------------------------------------------------------------- 1 | """Shift bijector.""" 2 | 3 | import tensorflow.compat.v2 as tf 4 | from tensorflow_probability.python.internal import dtype_util 5 | from tensorflow_probability.python import bijectors as tfb 6 | 7 | 8 | __all__ = [ 9 | 'ConditionalShift', 10 | ] 11 | 12 | 13 | class ConditionalShift(tfb.Bijector): 14 | """Compute `Y = g(X; shift) = X + shift`. 15 | 16 | where `shift` is a numeric `Tensor`. 17 | 18 | Example Use: 19 | 20 | ```python 21 | shift = Shift([-1., 0., 1]) 22 | x = [1., 2, 3] 23 | # `forward` is equivalent to: 24 | # y = x + shift 25 | y = shift.forward(x) # [0., 2., 4.] 26 | ``` 27 | 28 | """ 29 | def __init__(self, 30 | dtype=tf.float32, 31 | validate_args=False, 32 | name='conditional_shift'): 33 | """Instantiates the `ConditionalShift` bijector. 34 | 35 | Args: 36 | validate_args: Python `bool` indicating whether arguments should be 37 | checked for correctness. 38 | name: Python `str` name given to ops managed by this object. 39 | """ 40 | parameters = dict(locals()) 41 | with tf.name_scope(name) as name: 42 | super(ConditionalShift, self).__init__( 43 | forward_min_event_ndims=0, 44 | is_constant_jacobian=True, 45 | dtype=dtype, 46 | validate_args=validate_args, 47 | parameters=parameters, 48 | name=name) 49 | 50 | @classmethod 51 | def _is_increasing(cls): 52 | return True 53 | 54 | def _forward(self, x, shift): 55 | return x + shift 56 | 57 | def _inverse(self, y, shift): 58 | return y - shift 59 | 60 | def _forward_log_det_jacobian(self, x, shift): 61 | # is_constant_jacobian = True for this bijector, hence the 62 | # `log_det_jacobian` need only be specified for a single input, as this will 63 | # be tiled to match `event_ndims`. 
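        # Since dY/dX == 1 everywhere for Y = X + shift, log|det J| is
        # identically zero; a scalar zero in x's dtype is sufficient: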
64 | return tf.zeros((), dtype=dtype_util.base_dtype(x.dtype)) 65 | -------------------------------------------------------------------------------- /softlearning/distributions/bijectors/conditional_shift_test.py: -------------------------------------------------------------------------------- 1 | """ConditionalShift Tests.""" 2 | 3 | # Dependency imports 4 | 5 | from absl.testing import parameterized 6 | import tensorflow.compat.v2 as tf 7 | 8 | from softlearning.distributions import bijectors 9 | from softlearning.internal import test_util 10 | 11 | 12 | @test_util.test_all_tf_execution_regimes 13 | class ShiftTest(test_util.TestCase, parameterized.TestCase): 14 | 15 | @parameterized.named_parameters( 16 | dict(testcase_name='static', is_static=True), 17 | dict(testcase_name='dynamic', is_static=False), 18 | ) 19 | def testNoBatch(self, is_static): 20 | shift = bijectors.ConditionalShift() 21 | x = self.maybe_static([1., 1.], is_static) 22 | self.assertAllClose([2., 0.], shift.forward(x, shift=[1., -1.])) 23 | self.assertAllClose([0., 2.], shift.inverse(x, shift=[1., -1.])) 24 | self.assertAllClose( 25 | 0., shift.inverse_log_det_jacobian(x, shift=[[2., -.5], [1., -3.]], event_ndims=1)) 26 | 27 | @parameterized.named_parameters( 28 | dict(testcase_name='static', is_static=True), 29 | dict(testcase_name='dynamic', is_static=False), 30 | ) 31 | def testBatch(self, is_static): 32 | shift = bijectors.ConditionalShift() 33 | x = self.maybe_static([1., 1.], is_static) 34 | 35 | self.assertAllClose([[3., .5], [2., -2.]], shift.forward( 36 | x, shift=[[2., -.5], [1., -3.]])) 37 | self.assertAllClose([[-1., 1.5], [0., 4.]], shift.inverse( 38 | x, shift=[[2., -.5], [1., -3.]])) 39 | self.assertAllClose(0., shift.inverse_log_det_jacobian( 40 | x, shift=[[2., -.5], [1., -3.]], event_ndims=1)) 41 | 42 | 43 | if __name__ == '__main__': 44 | tf.test.main() 45 | -------------------------------------------------------------------------------- /softlearning/distributions/bijectors/real_nvp_flow_test.py: -------------------------------------------------------------------------------- 1 | """Tests for RealNVPFlow.""" 2 | 3 | import pytest 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow_probability import bijectors 7 | from tensorflow.python.framework import test_util # pylint: disable=g-direct-tensorflow-import,g-import-not-at-top 8 | 9 | from softlearning.distributions.bijectors.real_nvp_flow import RealNVPFlow 10 | 11 | 12 | @pytest.mark.skip(reason="tf2 broke these tests.") 13 | class RealNVPFlowTest(tf.test.TestCase): 14 | def test_build(self): 15 | x_ = np.reshape(np.linspace(-1.0, 1.0, 8, dtype=np.float32), (-1, 4)) 16 | 17 | num_coupling_layers = 10 18 | hidden_layer_sizes = (64, 64) 19 | 20 | flow = RealNVPFlow( 21 | num_coupling_layers=num_coupling_layers, 22 | hidden_layer_sizes=hidden_layer_sizes) 23 | 24 | self.assertFalse(flow._built) 25 | flow.forward(x_) 26 | self.assertTrue(flow._built) 27 | 28 | real_nvp_layers = [ 29 | layer for layer in flow.flow.bijectors 30 | if isinstance(layer, bijectors.RealNVP) 31 | ] 32 | self.assertEqual(len(real_nvp_layers), num_coupling_layers) 33 | 34 | permute_layers = [ 35 | layer for layer in flow.flow.bijectors 36 | if isinstance(layer, bijectors.Permute) 37 | ] 38 | self.assertEqual(len(permute_layers), num_coupling_layers-1) 39 | 40 | batch_normalization_layers = [ 41 | layer for layer in flow.flow.bijectors 42 | if isinstance(layer, bijectors.BatchNormalization) 43 | ] 44 | self.assertEqual(len(batch_normalization_layers), 0) 45 | 
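        # Without batch normalization the chain should contain only the
        # couplings and the interleaved permutations, i.e.
        # num_coupling_layers + (num_coupling_layers - 1) bijectors in total: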
46 | self.assertEqual( 47 | len(flow.flow.bijectors), 48 | len(real_nvp_layers) + len(permute_layers)) 49 | 50 | def test_forward_inverse_returns_identity(self): 51 | x_ = np.reshape(np.linspace(-1.0, 1.0, 8, dtype=np.float32), (-1, 4)) 52 | 53 | flow = RealNVPFlow( 54 | num_coupling_layers=2, 55 | hidden_layer_sizes=(64,)) 56 | 57 | x = tf.constant(x_) 58 | forward_x = flow.forward(x) 59 | # Use identity to invalidate cache. 60 | inverse_y = flow.inverse(tf.identity(forward_x)) 61 | forward_inverse_y = flow.forward(inverse_y) 62 | fldj = flow.forward_log_det_jacobian(x, event_ndims=1) 63 | # Use identity to invalidate cache. 64 | ildj = flow.inverse_log_det_jacobian(tf.identity(forward_x), event_ndims=1) 65 | 66 | forward_x_ = forward_x.numpy() 67 | inverse_y_ = inverse_y.numpy() 68 | forward_inverse_y_ = forward_inverse_y.numpy() 69 | ildj_ = ildj.numpy() 70 | fldj_ = fldj.numpy() 71 | 72 | self.assertEqual("real_nvp_flow", flow.name) 73 | self.assertAllClose(forward_x_, forward_inverse_y_, rtol=1e-4, atol=0.) 74 | self.assertAllClose(x_, inverse_y_, rtol=1e-4, atol=0.0) 75 | self.assertAllClose(ildj_, -fldj_, rtol=1e-6, atol=0.0) 76 | 77 | def test_should_reuse_scale_and_log_scale_variables(self): 78 | x_ = np.reshape(np.linspace(-1.0, 1.0, 8, dtype=np.float32), (-1, 4)) 79 | 80 | flow = RealNVPFlow( 81 | num_coupling_layers=2, 82 | hidden_layer_sizes=(64,)) 83 | 84 | x = tf.constant(x_) 85 | 86 | assert not tf.compat.v1.trainable_variables() 87 | 88 | forward_x = flow.forward(x) 89 | 90 | self.assertEqual( 91 | len(tf.compat.v1.trainable_variables()), 4 * flow._num_coupling_layers) 92 | 93 | inverse_y = flow.inverse(tf.identity(forward_x)) 94 | forward_inverse_y = flow.forward(inverse_y) 95 | fldj = flow.forward_log_det_jacobian(x, event_ndims=1) 96 | ildj = flow.inverse_log_det_jacobian( 97 | tf.identity(forward_x), event_ndims=1) 98 | 99 | self.assertEqual( 100 | len(tf.compat.v1.trainable_variables()), 4 * flow._num_coupling_layers) 101 | 102 | def test_batched_flow_with_mlp_transform(self): 103 | x_ = np.random.normal(0., 1., (3, 8)).astype(np.float32) 104 | flow = RealNVPFlow( 105 | num_coupling_layers=2, 106 | hidden_layer_sizes=(64,), 107 | use_batch_normalization=False) 108 | x = tf.constant(x_) 109 | forward_x = flow.forward(x) 110 | # Use identity to invalidate cache. 111 | inverse_y = flow.inverse(forward_x) 112 | forward_inverse_y = flow.forward(inverse_y) 113 | fldj = flow.forward_log_det_jacobian(x, event_ndims=1) 114 | # Use identity to invalidate cache. 115 | ildj = flow.inverse_log_det_jacobian(forward_x, event_ndims=1) 116 | 117 | [ 118 | forward_x_, 119 | inverse_y_, 120 | forward_inverse_y_, 121 | ildj_, 122 | fldj_, 123 | ] = [ 124 | forward_x.numpy(), 125 | inverse_y.numpy(), 126 | forward_inverse_y.numpy(), 127 | ildj.numpy(), 128 | fldj.numpy(), 129 | ] 130 | 131 | self.assertEqual("real_nvp_flow", flow.name) 132 | self.assertAllClose(forward_x_, forward_inverse_y_, rtol=1e-4, atol=0.) 133 | self.assertAllClose(x_, inverse_y_, rtol=1e-4, atol=0.) 
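        # At corresponding points x and y = f(x) every bijector satisfies
        # inverse_log_det_jacobian(y) == -forward_log_det_jacobian(x):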
134 | self.assertAllClose(ildj_, -fldj_, rtol=1e-6, atol=1e-8) 135 | 136 | def test_with_batch_normalization(self): 137 | x_ = np.reshape(np.linspace(-1.0, 1.0, 8, dtype=np.float32), (-1, 4)) 138 | 139 | with self.assertRaises(NotImplementedError): 140 | flow = RealNVPFlow( 141 | num_coupling_layers=2, 142 | hidden_layer_sizes=(64,), 143 | use_batch_normalization=True) 144 | 145 | 146 | if __name__ == '__main__': 147 | tf.test.main() 148 | -------------------------------------------------------------------------------- /softlearning/environments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rail-berkeley/softlearning/13cf187cc93d90f7c217ea2845067491c3c65464/softlearning/environments/__init__.py -------------------------------------------------------------------------------- /softlearning/environments/adapters/__init__.py: -------------------------------------------------------------------------------- 1 | """Module that provides adapters between SoftlearningEnv and other universes""" 2 | -------------------------------------------------------------------------------- /softlearning/environments/adapters/dm_control_adapter_test.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import unittest 3 | 4 | import numpy as np 5 | from gym import spaces 6 | import pytest 7 | 8 | from .softlearning_env_test import AdapterTestClass 9 | from softlearning.environments.adapters.dm_control_adapter import ( 10 | DmControlAdapter) 11 | 12 | 13 | class TestDmControlAdapter(unittest.TestCase, AdapterTestClass): 14 | def create_adapter(self, 15 | domain='cartpole', 16 | task='swingup', 17 | *args, 18 | **kwargs): 19 | return DmControlAdapter(domain, task, *args, **kwargs) 20 | 21 | def test_environments(self): 22 | # Make sure that all the environments are creatable 23 | TEST_ENVIRONMENTS = ( 24 | ('cartpole', 'swingup'), 25 | ) 26 | 27 | def verify_reset_and_step(domain, task): 28 | env = DmControlAdapter(domain=domain, task=task) 29 | env.reset() 30 | env.step(env.action_space.sample()) 31 | 32 | for domain, task in TEST_ENVIRONMENTS: 33 | print("testing: ", domain, task) 34 | verify_reset_and_step(domain, task) 35 | 36 | def test_render_human(self): 37 | env = self.create_adapter() 38 | with self.assertRaises(NotImplementedError): 39 | result = env.render(mode='human') 40 | self.assertIsNone(result) 41 | 42 | def test_environment_kwargs(self): 43 | # TODO(hartikainen): Figure this out later. 
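        # (Added note: dm_control options travel nested under
        # 'environment_kwargs', as exercised by test_serialize_deserialize
        # below, so there is no gym-style flat-kwargs path to cover here.)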
44 | pass 45 | 46 | def test_serialize_deserialize(self): 47 | domain, task = 'hopper', 'hop' 48 | env_kwargs = { 49 | 'environment_kwargs': { 50 | 'flat_observation': True, 51 | } 52 | } 53 | env1 = self.create_adapter(domain=domain, task=task, **env_kwargs) 54 | env1.reset() 55 | 56 | env2 = pickle.loads(pickle.dumps(env1)) 57 | 58 | self.assertEqual(env1.observation_keys, env2.observation_keys) 59 | for key, value in env_kwargs['environment_kwargs'].items(): 60 | self.assertEqual(getattr(env1.unwrapped, f'_{key}'), value) 61 | self.assertEqual(getattr(env2.unwrapped, f'_{key}'), value) 62 | 63 | def test_copy_environments(self): 64 | domain, task = 'cartpole', 'swingup' 65 | env_kwargs = { 66 | 'environment_kwargs': { 67 | 'flat_observation': False, 68 | } 69 | } 70 | env1 = self.create_adapter(domain=domain, task=task, **env_kwargs) 71 | env1.reset() 72 | env2 = env1.copy() 73 | 74 | self.assertEqual(env1.observation_keys, env2.observation_keys) 75 | for key, value in env_kwargs['environment_kwargs'].items(): 76 | self.assertEqual(getattr(env1.unwrapped, f'_{key}'), value) 77 | self.assertEqual(getattr(env2.unwrapped, f'_{key}'), value) 78 | 79 | def test_rescale_action(self): 80 | environment_kwargs = { 81 | 'domain': 'quadruped', 82 | 'task': 'run', 83 | } 84 | environment = DmControlAdapter(**environment_kwargs, rescale_action_range=None) 85 | new_low, new_high = -1.0, 1.0 86 | 87 | assert isinstance(environment.action_space, spaces.Box) 88 | assert np.any(environment.action_space.low != new_low) 89 | assert np.any(environment.action_space.high != new_high) 90 | 91 | rescaled_environment = DmControlAdapter( 92 | **environment_kwargs, rescale_action_range=(new_low, new_high)) 93 | 94 | np.testing.assert_allclose( 95 | rescaled_environment.action_space.low, new_low) 96 | np.testing.assert_allclose( 97 | rescaled_environment.action_space.high, new_high) 98 | 99 | def test_rescale_observation_raises_exception(self): 100 | environment_kwargs = { 101 | 'domain': 'quadruped', 102 | 'task': 'run', 103 | 'rescale_observation_range': (-1.0, 1.0), 104 | } 105 | with pytest.raises( 106 | NotImplementedError, match=r"Observation rescaling .*"): 107 | environment = DmControlAdapter(**environment_kwargs) 108 | 109 | 110 | if __name__ == '__main__': 111 | unittest.main() 112 | -------------------------------------------------------------------------------- /softlearning/environments/adapters/gym_adapter.py: -------------------------------------------------------------------------------- 1 | """Implements a GymAdapter that converts Gym envs into SoftlearningEnv.""" 2 | 3 | from collections import defaultdict, OrderedDict 4 | import copy 5 | 6 | import gym 7 | from gym import spaces, wrappers 8 | from gym.envs.mujoco.mujoco_env import MujocoEnv 9 | 10 | from .softlearning_env import SoftlearningEnv 11 | from softlearning.environments.gym import register_environments 12 | from softlearning.environments.gym.wrappers import RescaleObservation 13 | from softlearning.utils.gym import is_continuous_space 14 | 15 | 16 | def parse_domain_task(gym_id): 17 | domain_task_parts = gym_id.split('-') 18 | domain = '-'.join(domain_task_parts[:1]) 19 | task = '-'.join(domain_task_parts[1:]) 20 | 21 | return domain, task 22 | 23 | 24 | CUSTOM_GYM_ENVIRONMENT_IDS = register_environments() 25 | CUSTOM_GYM_ENVIRONMENTS = defaultdict(list) 26 | 27 | for gym_id in CUSTOM_GYM_ENVIRONMENT_IDS: 28 | domain, task = parse_domain_task(gym_id) 29 | CUSTOM_GYM_ENVIRONMENTS[domain].append(task) 30 | 31 | CUSTOM_GYM_ENVIRONMENTS = 
dict(CUSTOM_GYM_ENVIRONMENTS) 32 | 33 | GYM_ENVIRONMENT_IDS = tuple(gym.envs.registry.env_specs.keys()) 34 | GYM_ENVIRONMENTS = defaultdict(list) 35 | 36 | 37 | for gym_id in GYM_ENVIRONMENT_IDS: 38 | domain, task = parse_domain_task(gym_id) 39 | GYM_ENVIRONMENTS[domain].append(task) 40 | 41 | GYM_ENVIRONMENTS = dict(GYM_ENVIRONMENTS) 42 | 43 | 44 | DEFAULT_OBSERVATION_KEY = 'observations' 45 | 46 | 47 | class GymAdapter(SoftlearningEnv): 48 | """Adapter that implements the SoftlearningEnv for Gym envs.""" 49 | 50 | def __init__(self, 51 | domain, 52 | task, 53 | *args, 54 | env=None, 55 | rescale_action_range=(-1.0, 1.0), 56 | rescale_observation_range=None, 57 | observation_keys=(), 58 | goal_keys=(), 59 | unwrap_time_limit=True, 60 | pixel_wrapper_kwargs=None, 61 | **kwargs): 62 | assert not args, ( 63 | "Gym environments don't support args. Use kwargs instead.") 64 | 65 | self.rescale_action_range = rescale_action_range 66 | self.rescale_observation_range = rescale_observation_range 67 | self.unwrap_time_limit = unwrap_time_limit 68 | 69 | super(GymAdapter, self).__init__( 70 | domain, task, *args, goal_keys=goal_keys, **kwargs) 71 | 72 | if env is None: 73 | assert (domain is not None and task is not None), (domain, task) 74 | try: 75 | env_id = f"{domain}-{task}" 76 | env = gym.envs.make(env_id, **kwargs) 77 | except gym.error.UnregisteredEnv: 78 | env_id = f"{domain}{task}" 79 | env = gym.envs.make(env_id, **kwargs) 80 | self._env_kwargs = kwargs 81 | else: 82 | assert not kwargs 83 | assert domain is None and task is None, (domain, task) 84 | 85 | if isinstance(env, wrappers.TimeLimit) and unwrap_time_limit: 86 | # Remove the TimeLimit wrapper that sets 'done = True' when 87 | # the time limit specified for each environment has been passed and 88 | # therefore the environment is not Markovian (terminal condition 89 | # depends on time rather than state). 90 | env = env.env 91 | 92 | if rescale_observation_range: 93 | env = RescaleObservation(env, *rescale_observation_range) 94 | 95 | if rescale_action_range and is_continuous_space(env.action_space): 96 | env = wrappers.RescaleAction(env, *rescale_action_range) 97 | 98 | # TODO(hartikainen): We need the clip action wrapper because sometimes 99 | # the tfp.bijectors.Tanh() produces values strictly greater than 1 or 100 | # strictly less than -1, which causes the env fail without clipping. 101 | # The error is in the order of 1e-7, which should not cause issues. 102 | # See https://github.com/tensorflow/probability/issues/664. 
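        # E.g. a tanh-squashed action of 1.0000001 would fail the underlying
        # environment's bounds check; ClipAction maps it back to exactly 1.0.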
103 | env = wrappers.ClipAction(env) 104 | 105 | if pixel_wrapper_kwargs is not None: 106 | env = wrappers.PixelObservationWrapper(env, **pixel_wrapper_kwargs) 107 | 108 | self._env = env 109 | 110 | if isinstance(self._env.observation_space, spaces.Dict): 111 | dict_observation_space = self._env.observation_space 112 | self.observation_keys = ( 113 | observation_keys or (*self.observation_space.spaces.keys(), )) 114 | elif isinstance(self._env.observation_space, spaces.Box): 115 | dict_observation_space = spaces.Dict(OrderedDict(( 116 | (DEFAULT_OBSERVATION_KEY, self._env.observation_space), 117 | ))) 118 | self.observation_keys = (DEFAULT_OBSERVATION_KEY, ) 119 | 120 | self._observation_space = type(dict_observation_space)([ 121 | (name, copy.deepcopy(space)) 122 | for name, space in dict_observation_space.spaces.items() 123 | if name in self.observation_keys + self.goal_keys 124 | ]) 125 | 126 | if len(self._env.action_space.shape) > 1: 127 | raise NotImplementedError( 128 | "Shape of the action space ({}) is not flat, make sure to" 129 | " check the implemenation.".format(self._env.action_space)) 130 | 131 | self._action_space = self._env.action_space 132 | 133 | def step(self, action, *args, **kwargs): 134 | observation, reward, terminal, info = self._env.step( 135 | action, *args, **kwargs) 136 | 137 | if not isinstance(self._env.observation_space, spaces.Dict): 138 | observation = {DEFAULT_OBSERVATION_KEY: observation} 139 | 140 | observation = self._filter_observation(observation) 141 | return observation, reward, terminal, info 142 | 143 | def reset(self, *args, **kwargs): 144 | observation = self._env.reset() 145 | 146 | if not isinstance(self._env.observation_space, spaces.Dict): 147 | observation = {DEFAULT_OBSERVATION_KEY: observation} 148 | 149 | observation = self._filter_observation(observation) 150 | return observation 151 | 152 | def render(self, *args, width=100, height=100, **kwargs): 153 | if isinstance(self._env.unwrapped, MujocoEnv): 154 | self._env.render(*args, width=width, height=height, **kwargs) 155 | 156 | return self._env.render(*args, **kwargs) 157 | 158 | def seed(self, *args, **kwargs): 159 | return self._env.seed(*args, **kwargs) 160 | 161 | @property 162 | def unwrapped(self): 163 | return self._env.unwrapped 164 | -------------------------------------------------------------------------------- /softlearning/environments/adapters/robosuite_adapter_test.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import unittest 3 | 4 | import numpy as np 5 | import pytest 6 | 7 | from .softlearning_env_test import AdapterTestClass 8 | from softlearning.environments.adapters.robosuite_adapter import ( 9 | RobosuiteAdapter) 10 | 11 | 12 | class TestRobosuiteAdapter(unittest.TestCase, AdapterTestClass): 13 | def create_adapter(self, domain='Sawyer', task='Lift', *args, **kwargs): 14 | return RobosuiteAdapter( 15 | domain, 16 | task, 17 | *args, 18 | **{ 19 | 'has_renderer': False, 20 | 'has_offscreen_renderer': False, 21 | 'use_camera_obs': False, 22 | **kwargs 23 | }) 24 | 25 | def test_environments(self): 26 | # Make sure that all the environments are creatable 27 | TEST_ENVIRONMENTS = [('Sawyer', 'Lift')] 28 | 29 | def verify_reset_and_step(domain, task): 30 | env = RobosuiteAdapter( 31 | domain=domain, 32 | task=task, 33 | has_renderer=True, 34 | has_offscreen_renderer=True, 35 | use_camera_obs=False) 36 | env.reset() 37 | env.step(env.action_space.sample()) 38 | 39 | for domain, task in TEST_ENVIRONMENTS: 40 
| verify_reset_and_step(domain, task) 41 | 42 | def test_serialize_deserialize(self): 43 | domain, task = 'Sawyer', 'Lift' 44 | env_kwargs = { 45 | 'has_renderer': False, 46 | 'has_offscreen_renderer': False, 47 | 'use_camera_obs': False, 48 | 'reward_shaping': True, 49 | } 50 | env1 = self.create_adapter(domain=domain, task=task, **env_kwargs) 51 | env1.reset() 52 | 53 | env2 = pickle.loads(pickle.dumps(env1)) 54 | 55 | self.assertEqual(env1.observation_keys, env2.observation_keys) 56 | for key, value in env_kwargs.items(): 57 | self.assertEqual(getattr(env1.unwrapped, f'{key}'), value) 58 | self.assertEqual(getattr(env2.unwrapped, f'{key}'), value) 59 | 60 | def test_copy_environments(self): 61 | domain, task = 'Sawyer', 'Lift' 62 | env_kwargs = { 63 | "gripper_type": "TwoFingerGripper", 64 | "table_full_size": (0.8, 0.8, 0.8) 65 | } 66 | env1 = self.create_adapter(domain=domain, task=task, **env_kwargs) 67 | env1.reset() 68 | env2 = env1.copy() 69 | 70 | self.assertEqual(env1.observation_keys, env2.observation_keys) 71 | for key, value in env_kwargs.items(): 72 | self.assertEqual(getattr(env1.unwrapped, key), value) 73 | self.assertEqual(getattr(env2.unwrapped, key), value) 74 | 75 | domain, task = 'Sawyer', 'Lift' 76 | robosuite_adapter_kwargs = { 77 | 'observation_keys': ('joint_pos', 'joint_vel') 78 | } 79 | env_kwargs = { 80 | "gripper_type": "TwoFingerGripper", 81 | "table_full_size": (0.8, 0.8, 0.8) 82 | } 83 | env1 = self.create_adapter( 84 | domain=domain, task=task, **robosuite_adapter_kwargs, **env_kwargs) 85 | env1.reset() 86 | env2 = env1.copy() 87 | 88 | for key, value in robosuite_adapter_kwargs.items(): 89 | self.assertEqual(getattr(env1, key), value) 90 | self.assertEqual(getattr(env2, key), value) 91 | 92 | for key, value in env_kwargs.items(): 93 | self.assertEqual(getattr(env1.unwrapped, key), value) 94 | self.assertEqual(getattr(env2.unwrapped, key), value) 95 | 96 | def test_fails_with_invalid_environment_kwargs(self): 97 | domain, task = 'Sawyer', 'Lift' 98 | robosuite_adapter_kwargs = { 99 | 'observation_keys': ('joint_pos', 'invalid_key') 100 | } 101 | with self.assertRaises(AssertionError): 102 | env = self.create_adapter( 103 | domain=domain, task=task, **robosuite_adapter_kwargs) 104 | 105 | def test_environment_kwargs(self): 106 | env_kwargs = { 107 | "has_renderer": False, 108 | "has_offscreen_renderer": False, 109 | "use_camera_obs": False, 110 | "control_freq": 10, 111 | "horizon": 1000 112 | } 113 | 114 | env = RobosuiteAdapter( 115 | domain='Sawyer', task='Lift', **env_kwargs) 116 | 117 | observation1, reward, done, info = env.step(env.action_space.sample()) 118 | 119 | self.assertAlmostEqual(reward, 0.0) 120 | 121 | for key, expected_value in env_kwargs.items(): 122 | actual_value = getattr(env.unwrapped, key) 123 | self.assertEqual(actual_value, expected_value) 124 | 125 | def test_render_rgb_array(self): 126 | env = self.create_adapter( 127 | has_renderer=False, 128 | has_offscreen_renderer=True) 129 | env.render(mode='rgb_array', camera_id=0, width=32, height=32) 130 | 131 | def test_render_human(self): 132 | env = self.create_adapter( 133 | has_renderer=True, 134 | has_offscreen_renderer=False) 135 | env.render(mode='human') 136 | 137 | def test_fails_with_unnormalized_action_spec(self): 138 | from robosuite.environments.sawyer_lift import SawyerLift 139 | 140 | class UnnormalizedEnv(SawyerLift): 141 | @property 142 | def dof(self): 143 | return 5 144 | 145 | @property 146 | def action_spec(self): 147 | low, high = np.ones(self.dof) * -2.0, 
np.ones(self.dof) * 2.0 148 | return low, high 149 | 150 | env = UnnormalizedEnv( 151 | has_renderer=False, 152 | has_offscreen_renderer=False, 153 | use_camera_obs=False) 154 | with self.assertRaises(AssertionError): 155 | adapter = RobosuiteAdapter(domain=None, task=None, env=env) 156 | 157 | def test_rescale_observation_raises_exception(self): 158 | environment_kwargs = { 159 | 'domain': 'Sawyer', 160 | 'task': 'Lift', 161 | 'rescale_observation_range': (-1.0, 1.0), 162 | } 163 | with pytest.raises( 164 | NotImplementedError, match=r"Observation rescaling .*"): 165 | environment = RobosuiteAdapter(**environment_kwargs) 166 | 167 | 168 | if __name__ == '__main__': 169 | unittest.main() 170 | -------------------------------------------------------------------------------- /softlearning/environments/adapters/softlearning_env_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import spaces 3 | 4 | 5 | class AdapterTestClass(object): 6 | ENVIRONMENTS = [] 7 | 8 | def test_observation_space(self): 9 | env = self.create_adapter() 10 | observation_space = env.observation_space 11 | self.assertTrue( 12 | isinstance(observation_space, (spaces.Box, spaces.Dict))) 13 | # TODO(hartikainen): Test actual conversion of dimensions and types of 14 | # inside items; not just outside type. 15 | 16 | def test_action_space(self): 17 | env = self.create_adapter() 18 | action_space = env.action_space 19 | self.assertTrue( 20 | isinstance(action_space, spaces.Box)) 21 | 22 | def test_step(self): 23 | env = self.create_adapter() 24 | env.reset() 25 | step = env.step(env.action_space.sample()) 26 | self.assertTrue(isinstance(step, tuple)) 27 | self.assertEqual(len(step), 4) 28 | 29 | observation, reward, done, info = step 30 | self.assertIsInstance(observation, dict) 31 | self.assertIsInstance(reward, np.float) 32 | self.assertIsInstance(done, bool) 33 | self.assertIsInstance(info, dict) 34 | 35 | def test_reset(self): 36 | env = self.create_adapter() 37 | observation = env.reset() 38 | self.assertIsInstance(observation, dict) 39 | 40 | def test_render_rgb_array(self): 41 | env = self.create_adapter() 42 | result = env.render(mode='rgb_array') 43 | self.assertIsInstance(result, np.ndarray) 44 | env.close() 45 | 46 | def test_render_human(self): 47 | env = self.create_adapter() 48 | result = env.render(mode='human') 49 | self.assertIsNone(result) 50 | env.close() 51 | 52 | def test_close(self): 53 | env = self.create_adapter() 54 | env.close() 55 | -------------------------------------------------------------------------------- /softlearning/environments/dm_control/__init__.py: -------------------------------------------------------------------------------- 1 | """Custom DeepMind Control Suite environments. 2 | 3 | Every class inside this module should extend a dm_control.suite.Task class. The 4 | # file structure should be similar to dm_control's file structure. 
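(For example, a hypothetical custom cartpole variant would live at
suite/cartpole.py and subclass the corresponding dm_control suite Task,
mirroring dm_control.suite's own layout.)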
5 | """ 6 | -------------------------------------------------------------------------------- /softlearning/environments/dm_control/suite/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rail-berkeley/softlearning/13cf187cc93d90f7c217ea2845067491c3c65464/softlearning/environments/dm_control/suite/__init__.py -------------------------------------------------------------------------------- /softlearning/environments/dm_control/suite/wrappers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rail-berkeley/softlearning/13cf187cc93d90f7c217ea2845067491c3c65464/softlearning/environments/dm_control/suite/wrappers/__init__.py -------------------------------------------------------------------------------- /softlearning/environments/gym/__init__.py: -------------------------------------------------------------------------------- 1 | """Custom Gym environments. 2 | 3 | Every class inside this module should extend a gym.Env class. The file 4 | structure should be similar to gym.envs file structure, e.g. if you're 5 | implementing a mujoco env, you would implement it under gym.mujoco submodule. 6 | """ 7 | 8 | import gym 9 | 10 | 11 | CUSTOM_GYM_ENVIRONMENTS_PATH = __package__ 12 | MUJOCO_ENVIRONMENTS_PATH = f'{CUSTOM_GYM_ENVIRONMENTS_PATH}.mujoco' 13 | 14 | MUJOCO_ENVIRONMENT_SPECS = ( 15 | { 16 | 'id': 'Swimmer-Parameterizable-v3', 17 | 'entry_point': (f'gym.envs.mujoco.swimmer_v3:SwimmerEnv'), 18 | }, 19 | { 20 | 'id': 'Hopper-Parameterizable-v3', 21 | 'entry_point': (f'gym.envs.mujoco.hopper_v3:HopperEnv'), 22 | }, 23 | { 24 | 'id': 'Walker2d-Parameterizable-v3', 25 | 'entry_point': (f'gym.envs.mujoco.walker2d_v3:Walker2dEnv'), 26 | }, 27 | { 28 | 'id': 'HalfCheetah-Parameterizable-v3', 29 | 'entry_point': (f'gym.envs.mujoco.half_cheetah_v3:HalfCheetahEnv'), 30 | }, 31 | { 32 | 'id': 'Ant-Parameterizable-v3', 33 | 'entry_point': (f'gym.envs.mujoco.ant_v3:AntEnv'), 34 | }, 35 | { 36 | 'id': 'Humanoid-Parameterizable-v3', 37 | 'entry_point': (f'gym.envs.mujoco.humanoid_v3:HumanoidEnv'), 38 | }, 39 | { 40 | 'id': 'Pusher2d-Default-v0', 41 | 'entry_point': (f'{MUJOCO_ENVIRONMENTS_PATH}' 42 | '.pusher_2d:Pusher2dEnv'), 43 | }, 44 | { 45 | 'id': 'Pusher2d-DefaultReach-v0', 46 | 'entry_point': (f'{MUJOCO_ENVIRONMENTS_PATH}' 47 | '.pusher_2d:ForkReacherEnv'), 48 | }, 49 | { 50 | 'id': 'Pusher2d-ImageDefault-v0', 51 | 'entry_point': (f'{MUJOCO_ENVIRONMENTS_PATH}' 52 | '.image_pusher_2d:ImagePusher2dEnv'), 53 | }, 54 | { 55 | 'id': 'Pusher2d-ImageReach-v0', 56 | 'entry_point': (f'{MUJOCO_ENVIRONMENTS_PATH}' 57 | '.image_pusher_2d:ImageForkReacher2dEnv'), 58 | }, 59 | { 60 | 'id': 'Pusher2d-BlindReach-v0', 61 | 'entry_point': (f'{MUJOCO_ENVIRONMENTS_PATH}' 62 | '.image_pusher_2d:BlindForkReacher2dEnv'), 63 | }, 64 | ) 65 | 66 | GENERAL_ENVIRONMENT_SPECS = ( 67 | { 68 | 'id': 'MultiGoal-Default-v0', 69 | 'entry_point': (f'{CUSTOM_GYM_ENVIRONMENTS_PATH}' 70 | '.multi_goal:MultiGoalEnv') 71 | }, 72 | ) 73 | 74 | MUJOCO_ENVIRONMENTS = tuple( 75 | environment_spec['id'] 76 | for environment_spec in MUJOCO_ENVIRONMENT_SPECS) 77 | 78 | 79 | GENERAL_ENVIRONMENTS = tuple( 80 | environment_spec['id'] 81 | for environment_spec in GENERAL_ENVIRONMENT_SPECS) 82 | 83 | 84 | GYM_ENVIRONMENTS = ( 85 | *MUJOCO_ENVIRONMENTS, 86 | *GENERAL_ENVIRONMENTS, 87 | ) 88 | 89 | 90 | def register_mujoco_environments(): 91 | """Register softlearning mujoco environments.""" 92 | for 
mujoco_environment in MUJOCO_ENVIRONMENT_SPECS: 93 | gym.register(**mujoco_environment) 94 | 95 | gym_ids = tuple( 96 | environment_spec['id'] 97 | for environment_spec in MUJOCO_ENVIRONMENT_SPECS) 98 | 99 | return gym_ids 100 | 101 | 102 | def register_general_environments(): 103 | """Register gym environments that don't fall under a specific category.""" 104 | for general_environment in GENERAL_ENVIRONMENT_SPECS: 105 | gym.register(**general_environment) 106 | 107 | gym_ids = tuple( 108 | environment_spec['id'] 109 | for environment_spec in GENERAL_ENVIRONMENT_SPECS) 110 | 111 | return gym_ids 112 | 113 | 114 | def register_environments(): 115 | registered_mujoco_environments = register_mujoco_environments() 116 | registered_general_environments = register_general_environments() 117 | 118 | return ( 119 | *registered_mujoco_environments, 120 | *registered_general_environments, 121 | ) 122 | -------------------------------------------------------------------------------- /softlearning/environments/gym/mujoco/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rail-berkeley/softlearning/13cf187cc93d90f7c217ea2845067491c3c65464/softlearning/environments/gym/mujoco/__init__.py -------------------------------------------------------------------------------- /softlearning/environments/gym/mujoco/image_pusher_2d.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | 4 | from softlearning.environments.helpers import random_point_in_circle 5 | from .pusher_2d import Pusher2dEnv 6 | 7 | 8 | class ImagePusher2dEnv(Pusher2dEnv): 9 | def __init__(self, image_shape, *args, **kwargs): 10 | utils.EzPickle.__init__(**locals()) 11 | self.image_shape = image_shape 12 | Pusher2dEnv.__init__(self, *args, **kwargs) 13 | 14 | def _get_obs(self): 15 | width, height = self.image_shape[:2] 16 | image = self.render(mode='rgb_array', width=width, height=height) 17 | image = ((2.0 / 255.0) * image - 1.0) 18 | 19 | return np.concatenate([ 20 | image.reshape(-1), 21 | self.sim.data.qpos.flat[self.JOINT_INDS], 22 | self.sim.data.qvel.flat[self.JOINT_INDS], 23 | ]).reshape(-1) 24 | 25 | def step(self, action): 26 | """Step, computing reward from 'true' observations and not images.""" 27 | 28 | reward_observations = super(ImagePusher2dEnv, self)._get_obs() 29 | reward, info = self.compute_reward(reward_observations, action) 30 | 31 | self.do_simulation(action, self.frame_skip) 32 | 33 | observation = self._get_obs() 34 | done = False 35 | 36 | return observation, reward, done, info 37 | 38 | def viewer_setup(self): 39 | self.viewer.cam.trackbodyid = 0 40 | self.viewer.cam.lookat[:3] = [0, 0, 0] 41 | self.viewer.cam.distance = 3.5 42 | self.viewer.cam.elevation = -90 43 | self.viewer.cam.azimuth = 0 44 | self.viewer.cam.trackbodyid = -1 45 | 46 | 47 | class ImageForkReacher2dEnv(ImagePusher2dEnv): 48 | def __init__(self, 49 | arm_goal_distance_cost_coeff, 50 | arm_object_distance_cost_coeff, 51 | *args, 52 | **kwargs): 53 | utils.EzPickle.__init__(**locals()) 54 | 55 | self._arm_goal_distance_cost_coeff = arm_goal_distance_cost_coeff 56 | self._arm_object_distance_cost_coeff = arm_object_distance_cost_coeff 57 | 58 | super(ImageForkReacher2dEnv, self).__init__(*args, **kwargs) 59 | 60 | def compute_reward(self, observations, actions): 61 | is_batch = True 62 | if observations.ndim == 1: 63 | observations = observations[None] 64 | actions = actions[None] 65 | 
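            # (Added note: single observations are promoted to a batch of one
            # here and squeezed back before returning; the `else` branch below
            # raises, so the already-batched path is effectively unused.)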
is_batch = False 66 | else: 67 | raise NotImplementedError('Might be broken.') 68 | 69 | arm_pos = observations[:, -6:-4] 70 | goal_pos = self.get_body_com('goal')[:2][None] 71 | object_pos = observations[:, -3:-1] 72 | 73 | arm_goal_dists = np.linalg.norm(arm_pos - goal_pos, axis=1) 74 | arm_object_dists = np.linalg.norm(arm_pos - object_pos, axis=1) 75 | ctrl_costs = np.sum(actions**2, axis=1) 76 | 77 | costs = ( 78 | + self._arm_goal_distance_cost_coeff * arm_goal_dists 79 | + self._arm_object_distance_cost_coeff * arm_object_dists 80 | + self._ctrl_cost_coeff * ctrl_costs) 81 | 82 | rewards = -costs 83 | 84 | if not is_batch: 85 | rewards = rewards.squeeze() 86 | arm_goal_dists = arm_goal_dists.squeeze() 87 | arm_object_dists = arm_object_dists.squeeze() 88 | 89 | return rewards, { 90 | 'arm_goal_distance': arm_goal_dists, 91 | 'arm_object_distance': arm_object_dists, 92 | } 93 | 94 | def reset_model(self): 95 | qpos = np.random.uniform( 96 | low=-0.1, high=0.1, size=self.model.nq) + self.init_qpos.squeeze() 97 | 98 | # qpos[self.JOINT_INDS[0]] = np.random.uniform(-np.pi, np.pi) 99 | # qpos[self.JOINT_INDS[1]] = np.random.uniform( 100 | # -np.pi/2, np.pi/2) + np.pi/4 101 | # qpos[self.JOINT_INDS[2]] = np.random.uniform( 102 | # -np.pi/2, np.pi/2) + np.pi/2 103 | 104 | target_position = np.array(random_point_in_circle( 105 | angle_range=(0, 2*np.pi), radius=(0.6, 1.2))) 106 | target_position[1] += 1.0 107 | 108 | qpos[self.TARGET_INDS] = target_position 109 | # qpos[self.TARGET_INDS] = [1.0, 2.0] 110 | # qpos[self.TARGET_INDS] = self.init_qpos.squeeze()[self.TARGET_INDS] 111 | 112 | puck_position = np.random.uniform([-1.0], [1.0], size=[2]) 113 | puck_position = ( 114 | np.sign(puck_position) 115 | * np.maximum(np.abs(puck_position), 1/2)) 116 | puck_position[np.flatnonzero(puck_position == 0)] = 1.0 117 | # puck_position[1] += 1.0 118 | # puck_position = np.random.uniform( 119 | # low=[0.3, -1.0], high=[1.0, -0.4]), 120 | 121 | qpos[self.PUCK_INDS] = puck_position 122 | 123 | qvel = self.init_qvel.copy().squeeze() 124 | qvel[self.PUCK_INDS] = 0 125 | qvel[self.TARGET_INDS] = 0 126 | 127 | # TODO: remnants from rllab -> gym conversion 128 | # qacc = np.zeros(self.sim.data.qacc.shape[0]) 129 | # ctrl = np.zeros(self.sim.data.ctrl.shape[0]) 130 | # full_state = np.concatenate((qpos, qvel, qacc, ctrl)) 131 | 132 | # super(Pusher2dEnv, self).reset(full_state) 133 | 134 | self.set_state(qpos, qvel) 135 | 136 | return self._get_obs() 137 | 138 | 139 | class BlindForkReacher2dEnv(ImageForkReacher2dEnv): 140 | def _get_obs(self): 141 | return np.concatenate([ 142 | self.sim.data.qpos.flat[self.JOINT_INDS], 143 | self.sim.data.qvel.flat[self.JOINT_INDS], 144 | ]).reshape(-1) 145 | -------------------------------------------------------------------------------- /softlearning/environments/gym/robotics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rail-berkeley/softlearning/13cf187cc93d90f7c217ea2845067491c3c65464/softlearning/environments/gym/robotics/__init__.py -------------------------------------------------------------------------------- /softlearning/environments/gym/wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | from .rescale_observation import RescaleObservation # noqa: unused-import 2 | -------------------------------------------------------------------------------- /softlearning/environments/gym/wrappers/rescale_observation.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import gym 4 | from gym import spaces 5 | 6 | 7 | def rescale_values(values, old_low, old_high, new_low, new_high): 8 | rescaled_values = new_low + (new_high - new_low) * ( 9 | (values - old_low) / (old_high - old_low)) 10 | rescaled_values = np.clip(rescaled_values, new_low, new_high) 11 | return rescaled_values 12 | 13 | 14 | class RescaleObservation(gym.ObservationWrapper): 15 | def __init__(self, env, low, high): 16 | r"""Rescale observation space to a range [`low`, `high`]. 17 | Example: 18 | >>> RescaleObservation(env, low, high).observation_space == Box(low, high) 19 | True 20 | Raises: 21 | TypeError: If `not isinstance(environment.observation_space, spaces.Box)`. 22 | ValueError: If either `low` or `high` is not finite. 23 | ValueError: If any of `observation_space.{low,high}` is not finite. 24 | ValueError: If `high <= low`. 25 | TODO(hartikainen): This should be extended to work with Dict and Tuple spaces. 26 | """ 27 | if np.any(~np.isfinite((low, high))): 28 | raise ValueError( 29 | "Arguments 'low' and 'high' need to be finite." 30 | " Got: low={}, high={}".format(low, high)) 31 | 32 | if np.any(high <= low): 33 | raise ValueError("Argument `low` must be smaller than `high`" 34 | " Got: low={}, high=".format(low, high)) 35 | 36 | super(RescaleObservation, self).__init__(env) 37 | 38 | if not isinstance(env.observation_space, spaces.Box): 39 | raise TypeError("Expected Box observation space. Got: {}" 40 | "".format(type(env.observation_space))) 41 | 42 | if np.any(~np.isfinite(( 43 | env.observation_space.low, env.observation_space.high))): 44 | raise ValueError( 45 | "Observation space 'low' and 'high' need to be finite." 46 | " Got: observation_space.low={}, observation_space.high={}" 47 | "".format(env.observation_space.low, 48 | env.observation_space.high)) 49 | 50 | shape = env.observation_space.shape 51 | dtype = env.observation_space.dtype 52 | 53 | self.low = low + np.zeros(shape, dtype=dtype) 54 | self.high = high + np.zeros(shape, dtype=dtype) 55 | self.observation_space = spaces.Box( 56 | low=self.low, high=self.high, shape=shape, dtype=dtype) 57 | 58 | def observation(self, observation): 59 | rescaled_observation = rescale_values( 60 | observation, 61 | old_low=self.env.observation_space.low, 62 | old_high=self.env.observation_space.high, 63 | new_low=self.low, 64 | new_high=self.high) 65 | 66 | return rescaled_observation 67 | -------------------------------------------------------------------------------- /softlearning/environments/gym/wrappers/rescale_observation_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import numpy as np 4 | 5 | import gym 6 | from gym import spaces 7 | from .rescale_observation import RescaleObservation 8 | 9 | 10 | class FakeEnvironment(gym.Env): 11 | def __init__(self): 12 | """Fake environment whose observation equals broadcasted action.""" 13 | self.observation_space = gym.spaces.Box( 14 | shape=(2, ), 15 | low=np.array((-1.2, -0.07)), 16 | high=np.array((0.6, 0.07)), 17 | dtype=np.float32) 18 | self.action_space = self.observation_space 19 | 20 | def reset(self): 21 | observation = self.observation_space.sample() 22 | return observation 23 | 24 | def step(self, action): 25 | observation = action * np.ones(self.observation_space.shape) 26 | reward, terminal, info = 0.0, False, {} 27 | return observation, reward, terminal, info 28 | 29 | 30 | def 
test_rescale_observation(): 31 | new_low, new_high = -1.0, 1.0 32 | env = FakeEnvironment() 33 | wrapped_env = RescaleObservation(env, new_low, new_high) 34 | 35 | np.testing.assert_allclose(wrapped_env.observation_space.low, new_low) 36 | np.testing.assert_allclose(wrapped_env.observation_space.high, new_high) 37 | 38 | seed = 0 39 | env.seed(seed) 40 | wrapped_env.seed(seed) 41 | 42 | env.reset() 43 | wrapped_env.reset() 44 | 45 | low_observation = env.step(env.observation_space.low)[0] 46 | wrapped_low_observation = wrapped_env.step(env.observation_space.low)[0] 47 | 48 | assert np.allclose(low_observation, env.observation_space.low) 49 | assert np.allclose( 50 | wrapped_low_observation, wrapped_env.observation_space.low) 51 | 52 | high_observation = env.step(env.observation_space.high)[0] 53 | wrapped_high_observation = wrapped_env.step(env.observation_space.high)[0] 54 | 55 | assert np.allclose(high_observation, env.observation_space.high) 56 | assert np.allclose( 57 | wrapped_high_observation, wrapped_env.observation_space.high) 58 | 59 | 60 | def test_raises_on_non_finite_low(): 61 | env = FakeEnvironment() 62 | assert isinstance(env.observation_space, spaces.Box) 63 | 64 | with pytest.raises(ValueError): 65 | RescaleObservation(env, -float('inf'), 1.0) 66 | 67 | with pytest.raises(ValueError): 68 | RescaleObservation(env, -1.0, float('inf')) 69 | 70 | with pytest.raises(ValueError): 71 | RescaleObservation(env, -1.0, np.nan) 72 | 73 | 74 | def test_raises_on_high_less_than_low(): 75 | env = FakeEnvironment() 76 | assert isinstance(env.observation_space, spaces.Box) 77 | with pytest.raises(ValueError): 78 | RescaleObservation(env, 1.0, 1.0) 79 | with pytest.raises(ValueError): 80 | RescaleObservation(env, 1.0, -1.0) 81 | 82 | 83 | def test_raises_on_high_equals_low(): 84 | env = FakeEnvironment() 85 | assert isinstance(env.observation_space, spaces.Box) 86 | with pytest.raises(ValueError): 87 | RescaleObservation(env, 1.0, 1.0) 88 | 89 | 90 | def test_raises_on_non_box_space(): 91 | env = gym.envs.make('Copy-v0') 92 | assert isinstance(env.observation_space, spaces.Discrete) 93 | with pytest.raises(TypeError): 94 | RescaleObservation(env, -1.0, 1.0) 95 | 96 | 97 | def test_raises_on_non_finite_space(): 98 | env = gym.envs.make('Swimmer-v3') 99 | assert np.any(np.isinf(( 100 | env.observation_space.low, env.observation_space.high))) 101 | with pytest.raises(ValueError): 102 | RescaleObservation(env, -1.0, 1.0) 103 | -------------------------------------------------------------------------------- /softlearning/environments/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def random_point_in_circle(angle_range=(0, 2*np.pi), radius=(0, 25)): 5 | angle = np.random.uniform(*angle_range) 6 | radius = radius if np.isscalar(radius) else np.random.uniform(*radius) 7 | x, y = np.cos(angle) * radius, np.sin(angle) * radius 8 | point = np.array([x, y]) 9 | return point 10 | -------------------------------------------------------------------------------- /softlearning/environments/utils.py: -------------------------------------------------------------------------------- 1 | from .adapters.gym_adapter import GymAdapter 2 | 3 | ADAPTERS = { 4 | 'gym': GymAdapter, 5 | } 6 | 7 | try: 8 | from .adapters.dm_control_adapter import DmControlAdapter 9 | ADAPTERS['dm_control'] = DmControlAdapter 10 | except ModuleNotFoundError as e: 11 | if 'dm_control' not in e.msg: 12 | raise 13 | 14 | print("Warning: dm_control package not 
found. Run" 15 | " `pip install git+https://github.com/deepmind/dm_control.git`" 16 | " to use dm_control environments.") 17 | 18 | try: 19 | from .adapters.robosuite_adapter import RobosuiteAdapter 20 | ADAPTERS['robosuite'] = RobosuiteAdapter 21 | except ModuleNotFoundError as e: 22 | if 'robosuite' not in e.msg: 23 | raise 24 | 25 | print("Warning: robosuite package not found. Run `pip install robosuite`" 26 | " to use robosuite environments.") 27 | 28 | UNIVERSES = set(ADAPTERS.keys()) 29 | 30 | 31 | def get_environment(universe, domain, task, environment_params): 32 | return ADAPTERS[universe](domain, task, **environment_params) 33 | 34 | 35 | def get_environment_from_params(environment_params): 36 | universe = environment_params['universe'] 37 | task = environment_params['task'] 38 | domain = environment_params['domain'] 39 | environment_kwargs = environment_params.get('kwargs', {}).copy() 40 | 41 | return get_environment(universe, domain, task, environment_kwargs) 42 | -------------------------------------------------------------------------------- /softlearning/misc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rail-berkeley/softlearning/13cf187cc93d90f7c217ea2845067491c3c65464/softlearning/misc/__init__.py -------------------------------------------------------------------------------- /softlearning/misc/kernel.py: -------------------------------------------------------------------------------- 1 | from distutils.version import LooseVersion 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | 7 | def adaptive_isotropic_gaussian_kernel(xs, ys, h_min=1e-3): 8 | """Gaussian kernel with dynamic bandwidth. 9 | 10 | The bandwidth is adjusted dynamically to match median_distance / log(Kx). 11 | See [2] for more information. 12 | 13 | Args: 14 | xs(`tf.Tensor`): A tensor of shape (N x Kx x D) containing N sets of Kx 15 | particles of dimension D. This is the first kernel argument. 16 | ys(`tf.Tensor`): A tensor of shape (N x Ky x D) containing N sets of Kx 17 | particles of dimension D. This is the second kernel argument. 18 | h_min(`float`): Minimum bandwidth. 19 | 20 | Returns: 21 | `dict`: Returned dictionary has two fields: 22 | 'output': A `tf.Tensor` object of shape (N x Kx x Ky) representing 23 | the kernel matrix for inputs `xs` and `ys`. 24 | 'gradient': A 'tf.Tensor` object of shape (N x Kx x Ky x D) 25 | representing the gradient of the kernel with respect to `xs`. 26 | 27 | Reference: 28 | [2] Qiang Liu,Dilin Wang, "Stein Variational Gradient Descent: A General 29 | Purpose Bayesian Inference Algorithm," Neural Information Processing 30 | Systems (NIPS), 2016. 31 | """ 32 | Kx, D = xs.get_shape().as_list()[-2:] 33 | Ky, D2 = ys.get_shape().as_list()[-2:] 34 | assert D == D2 35 | 36 | leading_shape = tf.shape(input=xs)[:-2] 37 | 38 | # Compute the pairwise distances of left and right particles. 39 | diff = tf.expand_dims(xs, -2) - tf.expand_dims(ys, -3) 40 | # ... x Kx x Ky x D 41 | 42 | if LooseVersion(tf.__version__) <= LooseVersion('1.5.0'): 43 | dist_sq = tf.reduce_sum(input_tensor=diff**2, axis=-1, keepdims=False) 44 | else: 45 | dist_sq = tf.reduce_sum(input_tensor=diff**2, axis=-1, keepdims=False) 46 | # ... x Kx x Ky 47 | 48 | # Get median. 49 | input_shape = tf.concat((leading_shape, [Kx * Ky]), axis=0) 50 | values, _ = tf.nn.top_k( 51 | input=tf.reshape(dist_sq, input_shape), 52 | k=(Kx * Ky // 2 + 1), # This is exactly true only if Kx*Ky is odd. 53 | sorted=True) # ... 
-------------------------------------------------------------------------------- /softlearning/misc/plotter.py: --------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | 
5 | 
6 | class QFPolicyPlotter:
7 |     def __init__(self, Q, policy, obs_lst, default_action, n_samples):
8 |         self._Q = Q
9 |         self._policy = policy
10 |         self._obs_lst = obs_lst
11 |         self._default_action = np.array(default_action)
12 |         self._n_samples = n_samples
13 | 
14 |         self._var_inds = np.flatnonzero(np.isnan(default_action))
15 | 
16 |         assert len(self._var_inds) == 2
17 | 
18 |         n_plots = len(obs_lst)
19 | 
20 |         x_size = 5 * n_plots
21 |         y_size = 5
22 | 
23 |         fig = plt.figure(figsize=(x_size, y_size))
24 |         self._ax_lst = []
25 |         for i in range(n_plots):
26 |             ax = fig.add_subplot(100 + n_plots * 10 + i + 1)
27 |             ax.set_xlim((-1, 1))
28 |             ax.set_ylim((-1, 1))
29 |             ax.grid(True)
30 |             self._ax_lst.append(ax)
31 | 
32 |         self._line_objects = list()
33 | 
34 |     def draw(self):
35 |         # noinspection PyArgumentList
36 |         [h.remove() for h in self._line_objects]
37 |         self._line_objects = list()
38 | 
39 |         self._plot_level_curves()
40 |         self._plot_action_samples()
41 | 
42 |         plt.draw()
43 |         plt.pause(0.001)
44 | 
45 |     def _plot_level_curves(self):
46 |         # Create mesh grid.
47 |         xs = np.linspace(-1, 1, 50)
48 |         ys = np.linspace(-1, 1, 50)
49 |         xgrid, ygrid = np.meshgrid(xs, ys)
50 |         N = len(xs)*len(ys)
51 | 
52 |         # Copy default values along the first axis and replace nans with
53 |         # the mesh grid points.
54 | actions = np.tile(self._default_action.astype(np.float32), (N, 1)) 55 | actions[:, self._var_inds[0]] = xgrid.ravel() 56 | actions[:, self._var_inds[1]] = ygrid.ravel() 57 | 58 | for ax, obs in zip(self._ax_lst, self._obs_lst): 59 | observations = np.tile( 60 | obs[None].astype(np.float32), (actions.shape[0], 1)) 61 | 62 | Q_np = self._Q.values(observations, actions).numpy() 63 | Q_np = np.reshape(Q_np, xgrid.shape) 64 | 65 | cs = ax.contour(xgrid, ygrid, Q_np, 20) 66 | self._line_objects += cs.collections 67 | self._line_objects += ax.clabel( 68 | cs, inline=1, fontsize=10, fmt='%.2f') 69 | 70 | def _plot_action_samples(self): 71 | for ax, obs in zip(self._ax_lst, self._obs_lst): 72 | observations = np.ones((self._n_samples, 1)) * obs[None, :] 73 | actions = self._policy.actions(observations).numpy() 74 | 75 | x, y = actions[:, 0], actions[:, 1] 76 | self._line_objects += ax.plot(x, y, 'b*') 77 | -------------------------------------------------------------------------------- /softlearning/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rail-berkeley/softlearning/13cf187cc93d90f7c217ea2845067491c3c65464/softlearning/models/__init__.py -------------------------------------------------------------------------------- /softlearning/models/convnet.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow_probability as tfp 3 | import tensorflow_addons as tfa 4 | from tensorflow.keras import layers 5 | import tree 6 | 7 | 8 | tfk = tf.keras 9 | tfkl = tf.keras.layers 10 | tfpl = tfp.layers 11 | tfd = tfp.distributions 12 | tfb = tfp.bijectors 13 | 14 | 15 | def convnet_model( 16 | conv_filters=(64, 64, 64), 17 | conv_kernel_sizes=(3, 3, 3), 18 | conv_strides=(2, 2, 2), 19 | padding="SAME", 20 | normalization_type=None, 21 | normalization_kwargs={}, 22 | downsampling_type='conv', 23 | activation=layers.LeakyReLU, 24 | name="convnet", 25 | *args, 26 | **kwargs): 27 | normalization_layer = { 28 | 'batch': layers.BatchNormalization, 29 | 'layer': layers.LayerNormalization, 30 | 'group': tfa.layers.normalizations.GroupNormalization, 31 | 'instance': tfa.layers.normalizations.InstanceNormalization, 32 | None: None, 33 | }[normalization_type] 34 | 35 | def conv_block(conv_filter, conv_kernel_size, conv_stride): 36 | block_parts = [ 37 | layers.Conv2D( 38 | filters=conv_filter, 39 | kernel_size=conv_kernel_size, 40 | strides=(conv_stride if downsampling_type == 'conv' else 1), 41 | padding=padding, 42 | activation='linear', 43 | *args, 44 | **kwargs), 45 | ] 46 | 47 | if normalization_layer is not None: 48 | block_parts += [normalization_layer(**normalization_kwargs)] 49 | 50 | block_parts += [(layers.Activation(activation) 51 | if isinstance(activation, str) 52 | else activation())] 53 | 54 | if downsampling_type == 'pool' and conv_stride > 1: 55 | block_parts += [getattr(layers, 'AvgPool2D')( 56 | pool_size=conv_stride, strides=conv_stride)] 57 | 58 | block = tfk.Sequential(block_parts, name='conv_block') 59 | return block 60 | 61 | def preprocess(x): 62 | """Cast to float, normalize, and concatenate images along last axis.""" 63 | x = tree.map_structure( 64 | lambda image: tf.image.convert_image_dtype(image, tf.float32), x) 65 | x = tree.flatten(x) 66 | x = tf.concat(x, axis=-1) 67 | x = (tf.image.convert_image_dtype(x, tf.float32) - 0.5) * 2.0 68 | return x 69 | 70 | model = tf.keras.Sequential(( 71 | tfkl.Lambda(preprocess), 72 | 
*[ 73 | conv_block(conv_filter, conv_kernel_size, conv_stride) 74 | for (conv_filter, conv_kernel_size, conv_stride) in 75 | zip(conv_filters, conv_kernel_sizes, conv_strides) 76 | ], 77 | tfkl.Flatten(), 78 | 79 | ), name=name) 80 | 81 | return model 82 | -------------------------------------------------------------------------------- /softlearning/models/feedforward.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow_probability as tfp 3 | 4 | from softlearning.utils.tensorflow import cast_and_concat 5 | 6 | 7 | tfk = tf.keras 8 | tfkl = tf.keras.layers 9 | tfpl = tfp.layers 10 | tfd = tfp.distributions 11 | tfb = tfp.bijectors 12 | 13 | 14 | def feedforward_model(hidden_layer_sizes, 15 | output_shape, 16 | activation='relu', 17 | output_activation='linear', 18 | preprocessors=None, 19 | name='feedforward_model', 20 | *args, 21 | **kwargs): 22 | output_size = tf.reduce_prod(output_shape) 23 | if 1 < len(output_shape): 24 | raise NotImplementedError("TODO(hartikainen)") 25 | model = tf.keras.Sequential(( 26 | tfkl.Lambda(cast_and_concat), 27 | *[ 28 | tf.keras.layers.Dense( 29 | hidden_layer_size, *args, activation=activation, **kwargs) 30 | for hidden_layer_size in hidden_layer_sizes 31 | ], 32 | tf.keras.layers.Dense( 33 | output_size, *args, activation=output_activation, **kwargs), 34 | # tf.keras.layers.Reshape(output_shape), 35 | ), name=name) 36 | 37 | return model 38 | -------------------------------------------------------------------------------- /softlearning/models/feedforward_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from softlearning.models.feedforward import feedforward_model 5 | 6 | 7 | class FeedforwardTest(tf.test.TestCase): 8 | 9 | def test_clone_model(self): 10 | """Make sure that cloning works and clones can predict.""" 11 | output_shape = (5, ) 12 | x_np = np.random.uniform(0, 1, (1, 13)).astype(np.float32) 13 | x = tf.constant(x_np) 14 | 15 | fn1 = feedforward_model( 16 | output_shape=output_shape, 17 | hidden_layer_sizes=(6, 4, 2), 18 | name='feedforward_function') 19 | result_1 = fn1([x, x]).numpy() 20 | 21 | fn2 = tf.keras.models.clone_model(fn1) 22 | result_2 = fn2([x, x]).numpy() 23 | 24 | variable_names = [x.name for x in fn1.variables] 25 | for variable_name, variable_1, variable_2 in zip( 26 | variable_names, fn1.get_weights(), fn2.get_weights()): 27 | self.assertEqual(variable_1.shape, variable_2.shape) 28 | 29 | if 'kernel' in variable_name: 30 | self.assertNotAllClose(variable_1, variable_2) 31 | 32 | self.assertEqual( 33 | len(set((v1.experimental_ref() for v1 in fn1.trainable_variables)) 34 | & 35 | set((v2.experimental_ref() for v2 in fn2.trainable_variables))), 36 | 0) 37 | 38 | result_1_predict = fn1.predict((x_np, x_np)) 39 | result_2_predict = fn2.predict((x_np, x_np)) 40 | 41 | self.assertEqual(fn1.name, fn2.name) 42 | self.assertEqual(result_1_predict.shape, result_2_predict.shape) 43 | 44 | self.assertAllEqual(result_1_predict, result_1) 45 | self.assertAllEqual(result_2_predict, result_2) 46 | 47 | def test_without_name(self): 48 | fn = feedforward_model( 49 | output_shape=(1, ), 50 | hidden_layer_sizes=(6, 4, 2)) 51 | 52 | self.assertEqual(fn.name, 'feedforward_model') 53 | 54 | 55 | if __name__ == '__main__': 56 | tf.test.main() 57 | -------------------------------------------------------------------------------- /softlearning/models/utils.py: 
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import tree
3 | 
4 | 
5 | def get_inputs_for_nested_shapes(input_shapes, name=None):
6 |     if isinstance(input_shapes, dict):
7 |         return type(input_shapes)([
8 |             (name, get_inputs_for_nested_shapes(value, name))
9 |             for name, value in input_shapes.items()
10 |         ])
11 |     elif isinstance(input_shapes, (tuple, list)):
12 |         if all(isinstance(x, int) for x in input_shapes):
13 |             return tf.keras.layers.Input(shape=input_shapes, name=name)
14 |         else:
15 |             return type(input_shapes)((
16 |                 get_inputs_for_nested_shapes(input_shape, name=None)
17 |                 for input_shape in input_shapes
18 |             ))
19 |     elif isinstance(input_shapes, tf.TensorShape):
20 |         return tf.keras.layers.Input(shape=input_shapes, name=name)
21 | 
22 |     raise NotImplementedError(input_shapes)
23 | 
24 | 
25 | def flatten_input_structure(inputs):
26 |     inputs_flat = tree.flatten(inputs)
27 |     return inputs_flat
28 | 
29 | 
30 | def create_input(path, shape, dtype=None):
31 |     name = "/".join(str(x) for x in path)
32 | 
33 |     if dtype is None:
34 |         # TODO(hartikainen): This is not a very robust way to handle the
35 |         # dtypes. Need to figure out something better.
36 |         # Try to infer dtype manually.
37 |         dtype = (tf.uint8  # Image observation
38 |                  if len(shape) == 3 and shape[-1] in (1, 3)
39 |                  else tf.float32)  # Non-image
40 | 
41 |     input_ = tf.keras.layers.Input(
42 |         shape=shape,
43 |         name=name,
44 |         dtype=dtype
45 |     )
46 | 
47 |     return input_
48 | 
49 | 
50 | def create_inputs(shapes, dtypes=None):
51 |     """Creates `tf.keras.layers.Input`s based on input shapes.
52 | 
53 |     Args:
54 |         shapes: (possibly nested) list/array/dict structure of
55 |             input shapes.
56 | 
57 |     Returns:
58 |         inputs: nested structure, of same shape as `shapes`, containing
59 |             `tf.keras.layers.Input`s.
60 | 
61 |     TODO(hartikainen): Need to figure out a better way for handling the dtypes.
62 |     """
63 |     if dtypes is None:
64 |         dtypes = tree.map_structure(lambda _: None, shapes)
65 |     inputs = tree.map_structure_with_path(create_input, shapes, dtypes)
66 | 
67 |     return inputs
68 | 
69 | 
70 | def create_sequence_inputs(shapes, dtypes=None):
71 |     """Creates `tf.keras.layers.Input`s usable for sequential models like RNN.
72 | 
73 |     Args:
74 |         See `create_inputs`.
75 | 
76 |     Returns:
77 |         inputs: nested structure, of same shape as `shapes`, containing
78 |             `tf.keras.layers.Input`s, each with shape (None, ...).
79 |     """
80 |     shapes = tree.map_structure(lambda x: tf.TensorShape([None]) + x, shapes)
81 |     sequence_inputs = create_inputs(shapes, dtypes)
82 | 
83 |     return sequence_inputs
84 | 
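A small sketch (not part of the package; the shapes are hypothetical) of the dtype inference that `create_input`/`create_inputs` perform:

    import tensorflow as tf
    from softlearning.models.utils import create_inputs

    shapes = {
        'pixels': tf.TensorShape((32, 32, 3)),    # rank 3 with 3 channels -> tf.uint8
        'proprioception': tf.TensorShape((7, )),  # anything else -> tf.float32
    }
    inputs = create_inputs(shapes)
    assert inputs['pixels'].dtype == tf.uint8
    assert inputs['proprioception'].dtype == tf.float32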
-------------------------------------------------------------------------------- /softlearning/policies/__init__.py: --------------------------------------------------------------------------------
1 | from softlearning.utils.serialization import (
2 |     serialize_softlearning_object, deserialize_softlearning_object)
3 | 
4 | from .base_policy import BasePolicy, LatentSpacePolicy, ContinuousPolicy  # noqa: unused-import
5 | from .gaussian_policy import GaussianPolicy, FeedforwardGaussianPolicy  # noqa: unused-import
6 | from .uniform_policy import UniformPolicyMixin, ContinuousUniformPolicy  # noqa: unused-import
7 | 
8 | 
9 | def serialize(policy):
10 |     return serialize_softlearning_object(policy)
11 | 
12 | 
13 | def deserialize(name, custom_objects=None):
14 |     """Returns a policy function or class denoted by input string.
15 | 
16 |     Arguments:
17 |         name : String
18 | 
19 |     Returns:
20 |         Policy function or class denoted by input string.
21 | 
22 |     For example:
23 |     >>> softlearning.policies.get({
24 |     ...     'class_name': 'ContinuousUniformPolicy',
25 |     ...     'config': {
26 |     ...         'action_range': [[-1], [1]],
27 |     ...         'input_shapes': tf.TensorShape((3, )),
28 |     ...         'output_shape': 2
29 |     ...     }
30 |     ... })
31 | 
32 |     >>> softlearning.policies.get('abcd')
33 |     Traceback (most recent call last):
34 |     ...
35 |     ValueError: Unknown policy: abcd
36 | 
37 |     Raises:
38 |         ValueError: `Unknown policy` if the input string does not
39 |             denote any defined policy.
40 |     """
41 |     return deserialize_softlearning_object(
42 |         name,
43 |         module_objects=globals(),
44 |         custom_objects=custom_objects,
45 |         printable_module_name='policy')
46 | 
47 | 
48 | def get(identifier):
49 |     """Returns a policy.
50 | 
51 |     Arguments:
52 |         identifier: function, string, or dict.
53 | 
54 |     Returns:
55 |         A policy denoted by identifier.
56 | 
57 |     For example:
58 |     >>> softlearning.policies.get({
59 |     ...     'class_name': 'ContinuousUniformPolicy',
60 |     ...     'config': {
61 |     ...         'action_range': [[-1], [1]],
62 |     ...         'input_shapes': tf.TensorShape((3, )),
63 |     ...         'output_shape': 2
64 |     ...     }
65 |     ... })
66 | 
67 |     >>> softlearning.policies.get('abcd')
68 |     Traceback (most recent call last):
69 |     ...
70 |     ValueError: Unknown policy: abcd
71 | 
72 |     Raises:
73 |         ValueError: Input is an unknown function or string, i.e., the
74 |             identifier does not denote any defined policy.
75 |     """
76 |     if identifier is None:
77 |         return None
78 |     if isinstance(identifier, str):
79 |         return deserialize(identifier)
80 |     elif isinstance(identifier, dict):
81 |         return deserialize(identifier)
82 |     elif callable(identifier):
83 |         return identifier
84 |     else:
85 |         raise TypeError(
86 |             "Could not interpret policy function identifier:"
87 |             f" {repr(identifier)}.")
88 | 
-------------------------------------------------------------------------------- /softlearning/policies/real_nvp_policy.py: --------------------------------------------------------------------------------
1 | """RealNVPPolicy."""
2 | 
3 | from collections import OrderedDict
4 | 
5 | import tensorflow as tf
6 | import tensorflow_probability as tfp
7 | import tree
8 | 
9 | from softlearning.distributions.bijectors.real_nvp_flow import RealNVPFlow
10 | 
11 | from .base_policy import LatentSpacePolicy
12 | 
13 | 
14 | class RealNVPPolicy(LatentSpacePolicy):
15 |     def __init__(self,
16 |                  hidden_layer_sizes,
17 |                  num_coupling_layers,
18 |                  *args,
19 |                  activation=tf.nn.relu,
20 |                  use_batch_normalization=False,
21 |                  **kwargs):
22 |         super(RealNVPPolicy, self).__init__(*args, **kwargs)
23 | 
24 |         base_distribution = tfp.distributions.MultivariateNormalDiag(
25 |             loc=tf.zeros(self._output_shape),
26 |             scale_diag=tf.ones(self._output_shape))
27 | 
28 |         self.flow_model = RealNVPFlow(
29 |             num_coupling_layers=num_coupling_layers,
30 |             hidden_layer_sizes=hidden_layer_sizes,
31 |             use_batch_normalization=use_batch_normalization,
32 |             activation=activation)
33 | 
34 |         raw_action_distribution = self.flow_model(base_distribution)
35 | 
36 |         self.base_distribution = base_distribution
37 |         self.raw_action_distribution = raw_action_distribution
38 |         self.action_distribution = self._action_post_processor(
39 |             raw_action_distribution)
40 | 
41 |     @tf.function(experimental_relax_shapes=True)
42 |     def actions(self, observations):
43 |         if 0 < self._smoothing_alpha:
44 |             raise NotImplementedError(
"TODO(hartikainen): Smoothing alpha temporarily dropped on tf2" 46 | " migration. Should add it back. See:" 47 | " https://github.com/rail-berkeley/softlearning/blob/46374df0294b9b5f6dbe65b9471ec491a82b6944/softlearning/policies/base_policy.py#L80") 48 | 49 | observations = self._filter_observations(observations) 50 | 51 | batch_shape = tf.shape(tree.flatten(observations)[0])[:-1] 52 | actions = self.action_distribution.sample( 53 | batch_shape, bijector_kwargs={ 54 | self.flow_model.name: {'observations': observations} 55 | }) 56 | 57 | return actions 58 | 59 | @tf.function(experimental_relax_shapes=True) 60 | def log_probs(self, observations, actions): 61 | observations = self._filter_observations(observations) 62 | log_probs = self.action_distribution.log_prob( 63 | actions, 64 | bijector_kwargs={ 65 | self.flow_model.name: {'observations': observations} 66 | })[..., tf.newaxis] 67 | 68 | return log_probs 69 | 70 | @tf.function(experimental_relax_shapes=True) 71 | def probs(self, observations, actions): 72 | observations = self._filter_observations(observations) 73 | probs = self.action_distribution.prob( 74 | actions, 75 | bijector_kwargs={ 76 | self.flow_model.name: {'observations': observations} 77 | })[..., tf.newaxis] 78 | 79 | return probs 80 | 81 | def get_weights(self): 82 | return self.flow_model.get_weights() 83 | 84 | def set_weights(self, *args, **kwargs): 85 | return self.flow_model.set_weights(*args, **kwargs) 86 | 87 | @property 88 | def trainable_weights(self): 89 | return self.flow_model.trainable_variables 90 | 91 | @property 92 | def non_trainable_weights(self): 93 | return self.flow_model.non_trainable_weights 94 | 95 | @tf.function(experimental_relax_shapes=True) 96 | def get_diagnostics(self, inputs): 97 | """Return diagnostic information of the policy. 98 | 99 | Returns the mean, min, max, and standard deviation of means and 100 | covariances. 
101 | """ 102 | actions = self.actions(inputs) 103 | log_pis = self.log_probs(inputs, actions) 104 | 105 | return OrderedDict(( 106 | ('entropy-mean', tf.reduce_mean(-log_pis)), 107 | ('entropy-std', tf.math.reduce_std(-log_pis)), 108 | 109 | ('actions-mean', tf.reduce_mean(actions)), 110 | ('actions-std', tf.math.reduce_std(actions)), 111 | ('actions-min', tf.reduce_min(actions)), 112 | ('actions-max', tf.reduce_max(actions)), 113 | )) 114 | -------------------------------------------------------------------------------- /softlearning/policies/uniform_policy.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow_probability as tfp 3 | import tree 4 | 5 | from .base_policy import ContinuousPolicy 6 | 7 | 8 | class UniformPolicyMixin: 9 | @tf.function(experimental_relax_shapes=True) 10 | def actions(self, observations): 11 | first_observation = tree.flatten(observations)[0] 12 | first_input_rank = tf.size(tree.flatten(self._input_shapes)[0]) 13 | batch_shape = tf.shape(first_observation)[:-first_input_rank] 14 | 15 | actions = self.distribution.sample(batch_shape) 16 | 17 | return actions 18 | 19 | @tf.function(experimental_relax_shapes=True) 20 | def log_probs(self, observations, actions): 21 | log_probs = self.distribution.log_prob(actions)[..., tf.newaxis] 22 | return log_probs 23 | 24 | @tf.function(experimental_relax_shapes=True) 25 | def probs(self, observations, actions): 26 | probs = self.distribution.prob(actions)[..., tf.newaxis] 27 | return probs 28 | 29 | 30 | class ContinuousUniformPolicy(UniformPolicyMixin, ContinuousPolicy): 31 | def __init__(self, *args, **kwargs): 32 | super(ContinuousUniformPolicy, self).__init__(*args, **kwargs) 33 | low, high = self._action_range 34 | self.distribution = tfp.distributions.Independent( 35 | tfp.distributions.Uniform(low=low, high=high), 36 | reinterpreted_batch_ndims=1) 37 | -------------------------------------------------------------------------------- /softlearning/policies/uniform_policy_test.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | import tensorflow_probability as tfp 6 | import tree 7 | 8 | from softlearning import policies 9 | from softlearning.policies.uniform_policy import ContinuousUniformPolicy 10 | from softlearning.environments.utils import get_environment 11 | from softlearning.samplers import utils as sampler_utils 12 | 13 | 14 | class ContinuousUniformPolicyTest(tf.test.TestCase): 15 | def setUp(self): 16 | self.env = get_environment('gym', 'Swimmer', 'v3', {}) 17 | self.policy = ContinuousUniformPolicy( 18 | action_range=( 19 | self.env.action_space.low, 20 | self.env.action_space.high, 21 | ), 22 | input_shapes=self.env.observation_shape, 23 | output_shape=self.env.action_shape, 24 | observation_keys=self.env.observation_keys) 25 | 26 | def test_actions_and_log_probs(self): 27 | observation1_np = self.env.reset() 28 | observation2_np = self.env.step(self.env.action_space.sample())[0] 29 | 30 | observations_np = type(observation1_np)(( 31 | (key, np.stack(( 32 | observation1_np[key], observation2_np[key] 33 | ), axis=0).astype(np.float32)) 34 | for key in observation1_np.keys() 35 | )) 36 | 37 | observations_tf = tree.map_structure( 38 | lambda x: tf.constant(x, dtype=x.dtype), observations_np) 39 | 40 | for observations in (observations_np, observations_tf): 41 | actions = 
self.policy.actions(observations) 42 | log_pis = self.policy.log_probs(observations, actions) 43 | 44 | self.assertAllEqual( 45 | log_pis, 46 | tfp.distributions.Independent( 47 | tfp.distributions.Uniform( 48 | low=self.env.action_space.low, 49 | high=self.env.action_space.high, 50 | ), 51 | reinterpreted_batch_ndims=1, 52 | ).log_prob(actions)[..., None]) 53 | 54 | self.assertEqual(actions.shape, (2, *self.env.action_shape)) 55 | 56 | def test_env_step_with_actions(self): 57 | observation_np = self.env.reset() 58 | action = self.policy.action(observation_np).numpy() 59 | self.env.step(action) 60 | 61 | def test_trainable_variables(self): 62 | self.assertEqual(len(self.policy.trainable_variables), 0) 63 | 64 | def test_get_diagnostics(self): 65 | observation1_np = self.env.reset() 66 | observation2_np = self.env.step(self.env.action_space.sample())[0] 67 | observations_np = {} 68 | observations_np = type(observation1_np)(( 69 | (key, np.stack(( 70 | observation1_np[key], observation2_np[key] 71 | ), axis=0).astype(np.float32)) 72 | for key in observation1_np.keys() 73 | )) 74 | 75 | diagnostics = self.policy.get_diagnostics(observations_np) 76 | self.assertTrue(isinstance(diagnostics, OrderedDict)) 77 | self.assertFalse(diagnostics) 78 | 79 | def test_serialize_deserialize(self): 80 | policy_1 = ContinuousUniformPolicy( 81 | action_range=( 82 | self.env.action_space.low, 83 | self.env.action_space.high, 84 | ), 85 | input_shapes=self.env.observation_shape, 86 | output_shape=self.env.action_shape, 87 | observation_keys=self.env.observation_keys) 88 | 89 | self.assertFalse(policy_1.trainable_weights) 90 | 91 | config = policies.serialize(policy_1) 92 | policy_2 = policies.deserialize(config) 93 | 94 | self.assertEqual(policy_2._action_range, policy_1._action_range) 95 | self.assertEqual(policy_2._input_shapes, policy_1._input_shapes) 96 | self.assertEqual(policy_2._output_shape, policy_1._output_shape) 97 | self.assertEqual( 98 | policy_2._observation_keys, policy_1._observation_keys) 99 | 100 | path = sampler_utils.rollout( 101 | self.env, 102 | policy_2, 103 | path_length=10, 104 | break_on_terminal=False) 105 | observations = path['observations'] 106 | np.testing.assert_equal( 107 | policy_1.actions(observations).numpy().shape, 108 | policy_2.actions(observations).numpy().shape) 109 | 110 | 111 | if __name__ == '__main__': 112 | tf.test.main() 113 | -------------------------------------------------------------------------------- /softlearning/policies/utils.py: -------------------------------------------------------------------------------- 1 | from gym import spaces 2 | 3 | from .uniform_policy import ContinuousUniformPolicy 4 | 5 | 6 | def get_uniform_policy(environment): 7 | if isinstance(environment.action_space, spaces.Box): 8 | return ContinuousUniformPolicy( 9 | action_range=( 10 | environment.action_space.low, 11 | environment.action_space.high, 12 | ), 13 | input_shapes=environment.observation_shape, 14 | output_shape=environment.action_shape, 15 | observation_keys=environment.observation_keys) 16 | 17 | raise NotImplementedError(( 18 | type(environment.action_space), environment.action_space)) 19 | -------------------------------------------------------------------------------- /softlearning/preprocessors/__init__.py: -------------------------------------------------------------------------------- 1 | from softlearning.utils.serialization import ( 2 | serialize_softlearning_object, deserialize_softlearning_object) 3 | 4 | 5 | def 
convnet_preprocessor(name='convnet_preprocessor', **kwargs):
6 |     from softlearning.models.convnet import convnet_model
7 | 
8 |     preprocessor = convnet_model(name=name, **kwargs)
9 | 
10 |     return preprocessor
11 | 
12 | 
13 | def serialize(preprocessor):
14 |     return serialize_softlearning_object(preprocessor)
15 | 
16 | 
17 | def deserialize(name, custom_objects=None):
18 |     """Returns a preprocessor function or class denoted by input string.
19 | 
20 |     Arguments:
21 |         name : String
22 | 
23 |     Returns:
24 |         Preprocessor function or class denoted by input string.
25 | 
26 |     For example:
27 |     >>> softlearning.preprocessors.get('convnet_preprocessor')
28 | 
29 |     >>> softlearning.preprocessors.get('abcd')
30 |     Traceback (most recent call last):
31 |     ...
32 |     ValueError: Unknown preprocessor: abcd
33 | 
34 |     Raises:
35 |         ValueError: `Unknown preprocessor` if the input string does not
36 |             denote any defined preprocessor.
37 |     """
38 |     return deserialize_softlearning_object(
39 |         name,
40 |         module_objects=globals(),
41 |         custom_objects=custom_objects,
42 |         printable_module_name='preprocessor')
43 | 
44 | 
45 | def get(identifier):
46 |     """Returns a preprocessor.
47 | 
48 |     Arguments:
49 |         identifier: function, string, or dict.
50 | 
51 |     Returns:
52 |         A preprocessor denoted by identifier.
53 | 
54 |     For example:
55 | 
56 |     >>> softlearning.preprocessors.get('convnet_preprocessor')
57 | 
58 |     >>> softlearning.preprocessors.get('abcd')
59 |     Traceback (most recent call last):
60 |     ...
61 |     ValueError: Unknown preprocessor: abcd
62 | 
63 |     Raises:
64 |         ValueError: Input is an unknown function or string, i.e., the
65 |             identifier does not denote any defined preprocessor.
66 |     """
67 |     if identifier is None:
68 |         return None
69 |     if isinstance(identifier, str):
70 |         return deserialize(identifier)
71 |     elif isinstance(identifier, dict):
72 |         return deserialize(identifier)
73 |     elif callable(identifier):
74 |         return identifier
75 |     else:
76 |         raise TypeError(
77 |             "Could not interpret preprocessor function identifier:"
78 |             f" {repr(identifier)}.")
79 | 
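A minimal sketch (not part of the package; the image shape and layer sizes are hypothetical) of building a convnet preprocessor directly via `convnet_model`, which `convnet_preprocessor` above wraps:

    import tensorflow as tf
    from softlearning.models.convnet import convnet_model

    preprocessor = convnet_model(
        conv_filters=(32, ),
        conv_kernel_sizes=(3, ),
        conv_strides=(2, ))
    images = tf.zeros((4, 32, 32, 3), dtype=tf.uint8)  # batch of dummy images
    features = preprocessor(images)  # cast to float, normalized, convolved, flattened
    assert features.shape == (4, 16 * 16 * 32)  # one stride-2 'SAME' conv halves H and W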
-------------------------------------------------------------------------------- /softlearning/replay_pools/__init__.py: --------------------------------------------------------------------------------
1 | from softlearning.utils.serialization import (
2 |     serialize_softlearning_object, deserialize_softlearning_object)
3 | 
4 | from .simple_replay_pool import SimpleReplayPool  # noqa: unused-import
5 | from .goal_replay_pool import GoalReplayPool  # noqa: unused-import
6 | from .union_pool import UnionPool  # noqa: unused-import
7 | from .hindsight_experience_replay_pool import HindsightExperienceReplayPool  # noqa: unused-import
8 | 
9 | 
10 | def serialize(replay_pool):
11 |     return serialize_softlearning_object(replay_pool)
12 | 
13 | 
14 | def deserialize(name, custom_objects=None):
15 |     """Returns a replay pool function or class denoted by input string.
16 | 
17 |     Arguments:
18 |         name : String
19 | 
20 |     Returns:
21 |         Replay pool function or class denoted by input string.
22 | 
23 |     For example:
24 |     >>> softlearning.replay_pools.get({'class_name': 'SimpleReplayPool', ...})
25 | 
26 |     >>> softlearning.replay_pools.get('abcd')
27 |     Traceback (most recent call last):
28 |     ...
29 |     ValueError: Unknown replay pool: abcd
30 | 
31 |     Raises:
32 |         ValueError: `Unknown replay pool` if the input string does not
33 |             denote any defined replay pool.
34 |     """
35 |     return deserialize_softlearning_object(
36 |         name,
37 |         module_objects=globals(),
38 |         custom_objects=custom_objects,
39 |         printable_module_name='replay pool')
40 | 
41 | 
42 | def get(identifier):
43 |     """Returns a replay pool.
44 | 
45 |     Arguments:
46 |         identifier: function, string, or dict.
47 | 
48 |     Returns:
49 |         A replay pool denoted by identifier.
50 | 
51 |     For example:
52 |     >>> softlearning.replay_pools.get({'class_name': 'SimpleReplayPool', ...})
53 | 
54 |     >>> softlearning.replay_pools.get('abcd')
55 |     Traceback (most recent call last):
56 |     ...
57 |     ValueError: Unknown replay pool: abcd
58 | 
59 |     Raises:
60 |         ValueError: Input is an unknown function or string, i.e., the
61 |             identifier does not denote any defined replay pool.
62 |     """
63 |     if identifier is None:
64 |         return None
65 |     if isinstance(identifier, str):
66 |         return deserialize(identifier)
67 |     elif isinstance(identifier, dict):
68 |         return deserialize(identifier)
69 |     elif callable(identifier):
70 |         return identifier
71 |     else:
72 |         raise TypeError(
73 |             "Could not interpret replay pool function identifier:"
74 |             f" {repr(identifier)}.")
75 | 
-------------------------------------------------------------------------------- /softlearning/replay_pools/goal_replay_pool.py: --------------------------------------------------------------------------------
1 | from gym.spaces import Dict
2 | 
3 | from .flexible_replay_pool import FlexibleReplayPool, Field
4 | 
5 | 
6 | class GoalReplayPool(FlexibleReplayPool):
7 |     def __init__(self,
8 |                  environment,
9 |                  observation_fields=None,
10 |                  new_observation_fields=None,
11 |                  *args,
12 |                  extra_fields=None,
13 |                  **kwargs):
14 |         extra_fields = extra_fields or {}
15 |         observation_space = environment.observation_space
16 |         action_space = environment.action_space
17 |         assert isinstance(observation_space, Dict), observation_space
18 | 
19 |         self._environment = environment
20 |         self._observation_space = observation_space
21 |         self._action_space = action_space
22 | 
23 |         fields = {
24 |             'observations': {
25 |                 name: Field(
26 |                     name=name,
27 |                     dtype=observation_space.dtype,
28 |                     shape=observation_space.shape)
29 |                 for name, observation_space
30 |                 in observation_space.spaces.items()
31 |                 if name in environment.observation_keys
32 |             },
33 |             'next_observations': {
34 |                 name: Field(
35 |                     name=name,
36 |                     dtype=observation_space.dtype,
37 |                     shape=observation_space.shape)
38 |                 for name, observation_space
39 |                 in observation_space.spaces.items()
40 |                 if name in environment.observation_keys
41 |             },
42 |             'goals': {
43 |                 name: Field(
44 |                     name=name,
45 |                     dtype=observation_space.dtype,
46 |                     shape=observation_space.shape)
47 |                 for name, observation_space
48 |                 in observation_space.spaces.items()
49 |                 if name in environment.goal_keys
50 |             },
51 |             'actions': Field(
52 |                 name='actions',
53 |                 dtype=action_space.dtype,
54 |                 shape=environment.action_shape),
55 |             'rewards': Field(
56 |                 name='rewards',
57 |                 dtype='float32',
58 |                 shape=(1, )),
59 |             # terminals[i] = a terminal was received at time i
60 |             'terminals': Field(
61 |                 name='terminals',
62 |                 dtype='bool',
63 |                 shape=(1, )),
64 |             **extra_fields
65 |         }
66 | 
67 |         super(GoalReplayPool, self).__init__(*args, fields=fields, **kwargs)
68 | 
69 |     def add_samples(self, samples, *args, **kwargs):
70 |         observations = type(samples['observations'])(
71 |             (key, values)
72 |             for key, values in samples['observations'].items()
73 |             if key in self._environment.observation_keys
74 |         )
75 |         next_observations = type(samples['next_observations'])(
76 |             (key, values)
77 |             for key, values in samples['next_observations'].items()
78 |             if key in self._environment.observation_keys
79 |         )
80 |         goals = type(samples['observations'])(
81 |             (key, values)
82 |             for key, values in samples['observations'].items()
83 |             if key in self._environment.goal_keys
84 |         )
85 | 
86 |         samples.update({
87 |             'observations': observations,
88 |             'next_observations': next_observations,
89 |             'goals': goals,
90 |         })
91 | 
92 |         return super(GoalReplayPool, self).add_samples(
93 |             samples, *args, **kwargs)
94 | 
-------------------------------------------------------------------------------- /softlearning/replay_pools/replay_pool.py: --------------------------------------------------------------------------------
1 | import abc
2 | 
3 | 
4 | class ReplayPool(object):
5 |     """A class used to save and replay data."""
6 | 
7 |     @abc.abstractmethod
8 |     def add_sample(self, sample):
9 |         """Add a transition tuple."""
10 |         pass
11 | 
12 |     @abc.abstractmethod
13 |     def terminate_episode(self):
14 |         """Clean up pool after episode termination."""
15 |         pass
16 | 
17 |     @property
18 |     @abc.abstractmethod
19 |     def size(self, **kwargs):
20 |         pass
21 | 
22 |     @abc.abstractmethod
23 |     def add_path(self, path):
24 |         """Add a rollout to the replay pool."""
25 |         pass
26 | 
27 |     @abc.abstractmethod
28 |     def random_batch(self, batch_size):
29 |         """Return a random batch of size `batch_size`."""
30 |         pass
31 | 
-------------------------------------------------------------------------------- /softlearning/replay_pools/simple_replay_pool.py: --------------------------------------------------------------------------------
1 | from gym import spaces
2 | import tree
3 | 
4 | from .flexible_replay_pool import FlexibleReplayPool, Field
5 | 
6 | 
7 | def field_from_gym_space(name, space):
8 |     if isinstance(space, spaces.Box):
9 |         if isinstance(name, (list, tuple)):
10 |             name = '/'.join(name)
11 |         return Field(name=name, dtype=space.dtype, shape=space.shape)
12 |     elif isinstance(space, spaces.Dict):
13 |         return tree.map_structure_with_path(
14 |             field_from_gym_space, space.spaces)
15 |     else:
16 |         raise NotImplementedError(space)
17 | 
18 | 
19 | class SimpleReplayPool(FlexibleReplayPool):
20 |     def __init__(self,
21 |                  environment,
22 |                  *args,
23 |                  extra_fields=None,
24 |                  **kwargs):
25 |         extra_fields = extra_fields or {}
26 |         observation_space = environment.observation_space
27 |         action_space = environment.action_space
28 | 
29 |         self._environment = environment
30 |         self._observation_space = observation_space
31 |         self._action_space = action_space
32 | 
33 |         fields = {
34 |             'observations': field_from_gym_space(
35 |                 'observations', observation_space),
36 |             'next_observations': field_from_gym_space(
37 |                 'next_observations', observation_space),
38 |             'actions': Field(
39 |                 name='actions',
40 |                 dtype=action_space.dtype,
41 |                 shape=environment.action_space.shape),
42 |             'rewards': Field(
43 |                 name='rewards',
44 |                 dtype='float32',
45 |                 shape=(1, )),
46 |             # terminals[i] = a terminal was received at time i
47 |             'terminals': Field(
48 |                 name='terminals',
49 |                 dtype='bool',
50 |                 shape=(1, )),
51 |             **extra_fields
52 |         }
53 | 
54 |         super(SimpleReplayPool, self).__init__(
55 |             *args, fields=fields, **kwargs)
56 | 
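A minimal round-trip sketch (not part of the package) mirroring the tests that follow; filling the pool is elided since valid sample dicts depend on the observation space structure:

    from softlearning.environments.utils import get_environment
    from softlearning.replay_pools.simple_replay_pool import SimpleReplayPool

    env = get_environment('gym', 'Swimmer', 'v3', {})
    pool = SimpleReplayPool(environment=env, max_size=1000)
    # ... fill via pool.add_path(samples) as in the tests below, then:
    # batch = pool.random_batch(256)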
-------------------------------------------------------------------------------- /softlearning/replay_pools/simple_replay_pool_test.py: --------------------------------------------------------------------------------
1 | import pytest
2 | import unittest
3 | import numpy as np
4 | import gym
5 | 
6 | from softlearning.replay_pools.simple_replay_pool import SimpleReplayPool
7 | from softlearning.replay_pools.flexible_replay_pool import Field
8 | from softlearning.environments.utils import get_environment
9 | 
10 | 
11 | def create_pool(env, max_size=100):
12 |     return SimpleReplayPool(environment=env, max_size=max_size)
13 | 
14 | 
15 | class SimpleReplayPoolTest(unittest.TestCase):
16 |     def test_create_pool(self):
17 |         ENVIRONMENTS = (
18 |             get_environment('gym', 'Swimmer', 'v3', {}),
19 |             gym.make('Swimmer-v3'),
20 |             gym.make('HandManipulateBlock-v0'),
21 |         )
22 |         for environment in ENVIRONMENTS:
23 |             pool = create_pool(env=environment, max_size=100)
24 | 
25 |             def verify_field(field, expected_name, expected_dtype, expected_shape):
26 |                 self.assertIsInstance(field, Field)
27 |                 self.assertEqual(field.name, expected_name)
28 |                 self.assertEqual(field.dtype, expected_dtype)
29 |                 self.assertEqual(field.shape, expected_shape)
30 |                 self.assertEqual(field.initializer, np.zeros)
31 |                 self.assertEqual(field.default_value, 0.0)
32 | 
33 |             if isinstance(environment.observation_space, gym.spaces.Dict):
34 |                 self.assertIsInstance(pool.fields['observations'], dict)
35 |                 for name, space in environment.observation_space.spaces.items():
36 |                     self.assertIn(name, pool.fields['observations'])
37 |                     field = pool.fields['observations'][name]
38 |                     verify_field(field, name, space.dtype, space.shape)
39 | 
40 |             elif isinstance(environment.observation_space, gym.spaces.Box):
41 |                 self.assertIsInstance(pool.fields['observations'], Field)
42 |                 verify_field(pool.fields['observations'],
43 |                              'observations',
44 |                              environment.observation_space.dtype,
45 |                              environment.observation_space.shape)
46 |             else:
47 |                 raise ValueError(environment.observation_space)
48 | 
49 |             verify_field(
50 |                 pool.fields['actions'],
51 |                 'actions',
52 |                 environment.action_space.dtype,
53 |                 environment.action_space.shape)
54 | 
55 |             verify_field(pool.fields['rewards'], 'rewards', 'float32', (1, ))
56 |             verify_field(pool.fields['terminals'], 'terminals', 'bool', (1, ))
57 | 
58 |     def test_add_samples_box_observation(self):
59 |         env = gym.make('Swimmer-v3')
60 |         pool = create_pool(env=env, max_size=100)
61 | 
62 |         env.reset()
63 | 
64 |         num_samples = pool._max_size // 2
65 | 
66 |         samples = {
67 |             'observations': np.empty(
68 |                 (num_samples, *env.observation_space.shape),
69 |                 dtype=env.observation_space.dtype),
70 |             'next_observations': np.empty(
71 |                 (num_samples, *env.observation_space.shape),
72 |                 dtype=env.observation_space.dtype),
73 |             'actions': np.empty((num_samples, *env.action_space.shape)),
74 |             'rewards': np.empty((num_samples, 1), dtype=np.float32),
75 |             'terminals': np.empty((num_samples, 1), dtype=bool),
76 |         }
77 | 
78 |         for i in range(num_samples):
79 |             action = env.action_space.sample()
80 |             observation, reward, terminal, info = env.step(action)
81 |             samples['observations'][i, :] = observation
82 |             samples['next_observations'][i, :] = observation
83 |             samples['actions'][i] = action
84 |             samples['rewards'][i] = reward
85 |             samples['terminals'][i] = terminal
86 | 
87 |         pool.add_path(samples)
88 |         last_n_batch = pool.last_n_batch(num_samples)
89 |         np.testing.assert_equal(
90 |             {
91 |                 key: value
92 |                 for key, value in last_n_batch.items()
93 |                 if key not in
94 |                 ('episode_index_backwards', 'episode_index_forwards')
95 |             },
96 |             samples)
97 | 
98 |     def test_add_samples_dict_observation(self):
99 |         env = get_environment('gym', 'Swimmer', 'v3', {})
100 |         pool = create_pool(env=env, max_size=100)
101 | 
102 |         env.reset()
103 | 
104 |         num_samples = pool._max_size // 2
105 | 
106 | 
samples = { 107 | 'observations': { 108 | name: np.empty((num_samples, *space.shape), dtype=space.dtype) 109 | for name, space in env.observation_space.spaces.items() 110 | }, 111 | 'next_observations': { 112 | name: np.empty((num_samples, *space.shape), dtype=space.dtype) 113 | for name, space in env.observation_space.spaces.items() 114 | }, 115 | 'actions': np.empty((num_samples, *env.action_space.shape)), 116 | 'rewards': np.empty((num_samples, 1), dtype=np.float32), 117 | 'terminals': np.empty((num_samples, 1), dtype=bool), 118 | } 119 | 120 | for i in range(num_samples): 121 | action = env.action_space.sample() 122 | observation, reward, terminal, info = env.step(action) 123 | for name, value in observation.items(): 124 | samples['observations'][name][i, :] = value 125 | samples['next_observations'][name][i, :] = value 126 | samples['actions'][i] = action 127 | samples['rewards'][i] = reward 128 | samples['terminals'][i] = terminal 129 | 130 | pool.add_path(samples) 131 | last_n_batch = pool.last_n_batch(num_samples) 132 | np.testing.assert_equal( 133 | { 134 | key: value 135 | for key, value in last_n_batch.items() 136 | if key not in 137 | ('episode_index_backwards', 'episode_index_forwards') 138 | }, 139 | samples) 140 | 141 | 142 | if __name__ == '__main__': 143 | unittest.main() 144 | -------------------------------------------------------------------------------- /softlearning/replay_pools/union_pool.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .replay_pool import ReplayPool 4 | 5 | 6 | class UnionPool(ReplayPool): 7 | def __init__(self, pools): 8 | pool_sizes = np.array([b.size for b in pools]) 9 | self._total_size = sum(pool_sizes) 10 | self._normalized_pool_sizes = pool_sizes / self._total_size 11 | 12 | self.pools = pools 13 | 14 | def add_sample(self, *args, **kwargs): 15 | raise NotImplementedError 16 | 17 | def terminate_episode(self): 18 | raise NotImplementedError 19 | 20 | @property 21 | def size(self): 22 | return self._total_size 23 | 24 | def add_path(self, **kwargs): 25 | raise NotImplementedError 26 | 27 | def random_batch(self, batch_size): 28 | 29 | # TODO: Hack 30 | partial_batch_sizes = self._normalized_pool_sizes * batch_size 31 | partial_batch_sizes = partial_batch_sizes.astype(int) 32 | partial_batch_sizes[0] = batch_size - sum(partial_batch_sizes[1:]) 33 | 34 | partial_batches = [ 35 | pool.random_batch(partial_batch_size) for pool, 36 | partial_batch_size in zip(self.pools, partial_batch_sizes) 37 | ] 38 | 39 | def all_values(key): 40 | return [partial_batch[key] for partial_batch in partial_batches] 41 | 42 | keys = partial_batches[0].keys() 43 | 44 | return {key: np.concatenate(all_values(key), axis=0) for key in keys} 45 | -------------------------------------------------------------------------------- /softlearning/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from softlearning.utils.serialization import ( 2 | serialize_softlearning_object, deserialize_softlearning_object) 3 | 4 | from .base_sampler import BaseSampler # noqa: unused-import 5 | from .dummy_sampler import DummySampler # noqa: unused-import 6 | from .simple_sampler import SimpleSampler # noqa: unused-import 7 | from .remote_sampler import RemoteSampler # noqa: unused-import 8 | from .utils import rollout, rollouts # noqa: unused-import 9 | 10 | 11 | def serialize(sampler): 12 | return serialize_softlearning_object(sampler) 13 | 14 | 15 | def 
deserialize(name, custom_objects=None):
16 |     """Returns a sampler function or class denoted by input string.
17 | 
18 |     Arguments:
19 |         name : String
20 | 
21 |     Returns:
22 |         Sampler function or class denoted by input string.
23 | 
24 |     For example:
25 |     >>> softlearning.samplers.get({'class_name': 'SimpleSampler', ...})
26 | 
27 |     >>> softlearning.samplers.get('abcd')
28 |     Traceback (most recent call last):
29 |     ...
30 |     ValueError: Unknown sampler: abcd
31 | 
32 |     Raises:
33 |         ValueError: `Unknown sampler` if the input string does not
34 |             denote any defined sampler.
35 |     """
36 |     return deserialize_softlearning_object(
37 |         name,
38 |         module_objects=globals(),
39 |         custom_objects=custom_objects,
40 |         printable_module_name='sampler')
41 | 
42 | 
43 | def get(identifier):
44 |     """Returns a sampler.
45 | 
46 |     Arguments:
47 |         identifier: function, string, or dict.
48 | 
49 |     Returns:
50 |         A sampler denoted by identifier.
51 | 
52 |     For example:
53 |     >>> softlearning.samplers.get({'class_name': 'SimpleSampler', ...})
54 | 
55 |     >>> softlearning.samplers.get('abcd')
56 |     Traceback (most recent call last):
57 |     ...
58 |     ValueError: Unknown sampler: abcd
59 | 
60 |     Raises:
61 |         ValueError: Input is an unknown function or string, i.e., the
62 |             identifier does not denote any defined sampler.
63 |     """
64 |     if identifier is None:
65 |         return None
66 |     if isinstance(identifier, str):
67 |         return deserialize(identifier)
68 |     elif isinstance(identifier, dict):
69 |         return deserialize(identifier)
70 |     elif callable(identifier):
71 |         return identifier
72 |     else:
73 |         raise TypeError(
74 |             "Could not interpret sampler function identifier:"
75 |             f" {repr(identifier)}.")
76 | 
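For illustration (not part of the package): the usual entry point that ties a sampler to an environment, policy, and pool is `rollout` from `softlearning.samplers.utils`, defined later in this tree; this mirrors its use in the policy tests above:

    from softlearning.environments.utils import get_environment
    from softlearning.policies.utils import get_uniform_policy
    from softlearning.samplers import rollout

    env = get_environment('gym', 'Swimmer', 'v3', {})
    policy = get_uniform_policy(env)
    path = rollout(env, policy, path_length=10, break_on_terminal=False)
    # path['observations'], path['actions'], path['rewards'], path['infos'], ...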
-------------------------------------------------------------------------------- /softlearning/samplers/base_sampler.py: --------------------------------------------------------------------------------
1 | from collections import deque, OrderedDict
2 | from itertools import islice
3 | 
4 | 
5 | class BaseSampler(object):
6 |     def __init__(self,
7 |                  max_path_length,
8 |                  environment=None,
9 |                  policy=None,
10 |                  pool=None,
11 |                  store_last_n_paths=10):
12 |         self._max_path_length = max_path_length
13 |         self._store_last_n_paths = store_last_n_paths
14 |         self._last_n_paths = deque(maxlen=store_last_n_paths)
15 | 
16 |         self.environment = environment
17 |         self.policy = policy
18 |         self.pool = pool
19 | 
20 |     def initialize(self, environment, policy, pool):
21 |         self.environment = environment
22 |         self.policy = policy
23 |         self.pool = pool
24 | 
25 |     def reset(self):
26 |         pass
27 | 
28 |     def set_policy(self, policy):
29 |         self.policy = policy
30 | 
31 |     def clear_last_n_paths(self):
32 |         self._last_n_paths.clear()
33 | 
34 |     def get_last_n_paths(self, n=None):
35 |         if n is None:
36 |             n = self._store_last_n_paths
37 | 
38 |         last_n_paths = tuple(islice(self._last_n_paths, None, n))
39 | 
40 |         return last_n_paths
41 | 
42 |     def sample(self):
43 |         raise NotImplementedError
44 | 
45 |     def terminate(self):
46 |         self.environment.close()
47 | 
48 |     def get_diagnostics(self):
49 |         diagnostics = OrderedDict({'pool-size': self.pool.size})
50 |         return diagnostics
51 | 
52 |     def __getstate__(self):
53 |         state = {
54 |             key: value for key, value in self.__dict__.items()
55 |             if key not in (
56 |                 'environment',
57 |                 'policy',
58 |                 'pool',
59 |                 '_last_n_paths',
60 |                 '_current_observation',
61 |                 '_current_path',
62 |                 '_is_first_step',
63 |             )
64 |         }
65 | 
66 |         return state
67 | 
68 |     def __setstate__(self, state):
69 |         self.__dict__.update(state)
70 | 
71 |         self.environment = None
72 |         self.policy = None
73 |         self.pool = None
74 |         # TODO(hartikainen): Maybe try restoring these from the pool?
75 |         self._last_n_paths = deque(maxlen=self._store_last_n_paths)
76 | 
-------------------------------------------------------------------------------- /softlearning/samplers/dummy_sampler.py: --------------------------------------------------------------------------------
1 | from .base_sampler import BaseSampler
2 | 
3 | 
4 | class DummySampler(BaseSampler):
5 |     def sample(self):
6 |         pass
7 | 
-------------------------------------------------------------------------------- /softlearning/samplers/goal_sampler.py: --------------------------------------------------------------------------------
1 | from .simple_sampler import SimpleSampler
2 | 
3 | 
4 | class GoalSampler(SimpleSampler):
5 |     @property
6 |     def _policy_input(self):
7 |         observation = super(GoalSampler, self)._policy_input
8 |         goal = {
9 |             key: self._current_observation[key]
10 |             for key in self.policy.goal_keys
11 |         }
12 | 
13 |         return (observation, goal)
14 | 
15 |     def _process_sample(self,
16 |                         observation,
17 |                         action,
18 |                         reward,
19 |                         terminal,
20 |                         next_observation,
21 |                         info):
22 |         full_observation = observation.copy()
23 |         observation = {
24 |             key: full_observation[key]
25 |             for key in self.policy.observation_keys
26 |         }
27 |         goal = {
28 |             key: full_observation[key]
29 |             for key in self.policy.goal_keys
30 |         }
31 |         processed_observation = {
32 |             'observations': observation,
33 |             'actions': action,
34 |             'rewards': [reward],
35 |             'terminals': [terminal],
36 |             'next_observations': next_observation,
37 |             'goals': goal,
38 |             'infos': info,
39 |         }
40 | 
41 |         return processed_observation
42 | 
-------------------------------------------------------------------------------- /softlearning/samplers/remote_sampler.py: --------------------------------------------------------------------------------
1 | import pickle
2 | from collections import OrderedDict
3 | 
4 | import ray
5 | import tensorflow as tf
6 | import numpy as np
7 | 
8 | 
9 | from .base_sampler import BaseSampler
10 | from .utils import rollout
11 | 
12 | 
13 | class RemoteSampler(BaseSampler):
14 |     def __init__(self, **kwargs):
15 |         raise NotImplementedError(
16 |             "TODO(hartikainen): There's a bug here that causes tf to end up in"
17 |             " a RecursionError. 
This should be fixed/refactored before usage.") 18 | super(RemoteSampler, self).__init__(**kwargs) 19 | 20 | self._remote_environment = None 21 | self._remote_path = None 22 | self._n_episodes = 0 23 | self._total_samples = 0 24 | self._last_path_return = 0 25 | self._max_path_return = -np.inf 26 | 27 | def _create_remote_environment(self, env, policy): 28 | env_pkl = pickle.dumps(env) 29 | policy_pkl = pickle.dumps(policy) 30 | 31 | if not ray.is_initialized(): 32 | ray.init() 33 | 34 | self._remote_environment = _RemoteEnv.remote(env_pkl, policy_pkl) 35 | 36 | # Block until the env and policy is ready 37 | initialized = ray.get(self._remote_environment.initialized.remote()) 38 | assert initialized, initialized 39 | 40 | def initialize(self, environment, policy, pool): 41 | super(RemoteSampler, self).initialize(environment, policy, pool) 42 | self._create_remote_environment(environment, policy) 43 | 44 | def wait_for_path(self, timeout=1): 45 | if self._remote_path is None: 46 | return [True] 47 | 48 | path_ready, _ = ray.wait([self._remote_path], timeout=timeout) 49 | return path_ready 50 | 51 | def sample(self, timeout=0): 52 | if self._remote_path is None: 53 | policy_params = self.policy.get_weights() 54 | self._remote_path = self._remote_environment.rollout.remote( 55 | policy_params, self._max_path_length) 56 | 57 | path_ready = self.wait_for_path(timeout=timeout) 58 | 59 | if len(path_ready) or not self.batch_ready(): 60 | path_samples = ray.get(self._remote_path) 61 | self._last_n_paths.appendleft(path_samples) 62 | 63 | self.pool.add_samples({ 64 | key: value 65 | for key, value in path_samples.items() 66 | if key != 'infos' 67 | }) 68 | 69 | self._remote_path = None 70 | self._total_samples += path_samples['rewards'].shape[0] 71 | self._last_path_return = np.sum(path_samples['rewards']) 72 | self._max_path_return = max(self._max_path_return, 73 | self._last_path_return) 74 | self._n_episodes += 1 75 | 76 | def get_diagnostics(self): 77 | diagnostics = OrderedDict({ 78 | 'max-path-return': self._max_path_return, 79 | 'last-path-return': self._last_path_return, 80 | 'pool-size': self.pool.size, 81 | 'episodes': self._n_episodes, 82 | 'total-samples': self._total_samples, 83 | }) 84 | 85 | return diagnostics 86 | 87 | def __getstate__(self): 88 | super_state = super(RemoteSampler, self).__getstate__() 89 | state = { 90 | key: value for key, value in super_state.items() 91 | if key not in ('_remote_environment', '_remote_path') 92 | } 93 | 94 | return state 95 | 96 | def __setstate__(self, state): 97 | super(RemoteSampler, self).__setstate__(state) 98 | self._remote_path = None 99 | 100 | 101 | @ray.remote 102 | class _RemoteEnv(object): 103 | def __init__(self, env_pkl, policy_pkl): 104 | gpu_options = tf.GPUOptions(allow_growth=True) 105 | self._session = tf.Session( 106 | config=tf.ConfigProto(gpu_options=gpu_options)) 107 | tf.compat.v1.keras.backend.set_session(self._session) 108 | 109 | self._env = pickle.loads(env_pkl) 110 | self._policy = pickle.loads(policy_pkl) 111 | 112 | if hasattr(self._env, 'initialize'): 113 | self._env.initialize() 114 | 115 | self._initialized = True 116 | 117 | def initialized(self): 118 | return self._initialized 119 | 120 | def rollout(self, policy_weights, path_length): 121 | self._policy.set_weights(policy_weights) 122 | path = rollout(self._env, self._policy, path_length) 123 | 124 | return path 125 | -------------------------------------------------------------------------------- /softlearning/samplers/remote_sampler_test.py: 
--------------------------------------------------------------------------------
1 | import pickle
2 | import unittest
3 | import pytest
4 | 
5 | from softlearning.environments.utils import get_environment
6 | from softlearning.samplers.remote_sampler import RemoteSampler
7 | from softlearning.replay_pools.simple_replay_pool import SimpleReplayPool
8 | from softlearning import policies
9 | 
10 | 
11 | @pytest.mark.skip(reason="RemoteSampler is currently broken.")
12 | class RemoteSamplerTest(unittest.TestCase):
13 |     def setUp(self):
14 |         self.env = get_environment('gym', 'Swimmer', 'v3', {})
15 |         self.policy = policies.ContinuousUniformPolicy(
16 |             action_range=(
17 |                 self.env.action_space.low,
18 |                 self.env.action_space.high,
19 |             ),
20 |             input_shapes=self.env.observation_shape,
21 |             output_shape=self.env.action_shape,
22 |             observation_keys=self.env.observation_keys)
23 |         self.pool = SimpleReplayPool(max_size=100, environment=self.env)
24 |         self.remote_sampler = RemoteSampler(max_path_length=10)
25 | 
26 |     def test_initialization(self):
27 |         self.assertEqual(self.pool.size, 0)
28 |         self.remote_sampler.initialize(self.env, self.policy, self.pool)
29 |         self.remote_sampler.sample(timeout=10)
30 |         self.assertEqual(self.pool.size, 10)
31 | 
32 |     def test_serialize_deserialize(self):
33 |         self.assertEqual(self.pool.size, 0)
34 | 
35 |         self.remote_sampler.initialize(self.env, self.policy, self.pool)
36 | 
37 |         self.remote_sampler.sample()
38 | 
39 |         deserialized = pickle.loads(pickle.dumps(self.remote_sampler))
40 |         deserialized.initialize(self.env, self.policy, self.pool)
41 | 
42 |         self.assertEqual(self.pool.size, 10)
43 | 
44 |         self.remote_sampler.sample(timeout=10)
45 |         self.assertEqual(self.pool.size, 20)
46 | 
47 |         deserialized = pickle.loads(pickle.dumps(self.remote_sampler))
48 |         deserialized.initialize(self.env, self.policy, self.pool)
49 | 
50 |         self.assertIsInstance(
51 |             deserialized.environment, type(self.remote_sampler.environment))
52 |         self.assertEqual(
53 |             self.remote_sampler._n_episodes, deserialized._n_episodes)
54 |         self.assertEqual(
55 |             self.remote_sampler._max_path_return,
56 |             deserialized._max_path_return)
57 |         self.assertEqual(
58 |             self.remote_sampler._last_path_return,
59 |             deserialized._last_path_return)
60 |         self.assertEqual(
61 |             len(self.remote_sampler._last_n_paths),
62 |             len(deserialized._last_n_paths))
63 | 
64 |         self.remote_sampler.sample(timeout=10)
65 |         deserialized.sample(timeout=10)
66 | 
67 |         self.assertEqual(
68 |             self.remote_sampler._n_episodes, deserialized._n_episodes)
69 |         self.assertNotEqual(
70 |             self.remote_sampler._last_path_return,
71 |             deserialized._last_path_return)
72 |         self.assertEqual(
73 |             len(self.remote_sampler._last_n_paths),
74 |             len(deserialized._last_n_paths))
75 | 
76 | 
77 | if __name__ == '__main__':
78 |     unittest.main()
79 | 
--------------------------------------------------------------------------------
/softlearning/samplers/simple_sampler.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | 
3 | import numpy as np
4 | import tree
5 | 
6 | from .base_sampler import BaseSampler
7 | 
8 | 
9 | class SimpleSampler(BaseSampler):
10 |     def __init__(self, **kwargs):
11 |         super(SimpleSampler, self).__init__(**kwargs)
12 | 
13 |         self._last_path_return = 0
14 |         self._max_path_return = -np.inf
15 |         self._n_episodes = 0
16 |         self._total_samples = 0
17 | 
18 |         self._is_first_step = True
19 | 
20 |     def reset(self):
21 |         if self.policy is not None:
22 |             self.policy.reset()
23 | 
24 |         self._path_length = 0
25 |         self._path_return = 0
26 |         self._current_path = []
27 |         self._current_observation = self.environment.reset()
28 | 
29 |     @property
30 |     def _policy_input(self):
31 |         return self._current_observation
32 | 
33 |     def _process_sample(self,
34 |                         observation,
35 |                         action,
36 |                         reward,
37 |                         terminal,
38 |                         next_observation,
39 |                         info):
40 |         processed_observation = {
41 |             'observations': observation,
42 |             'actions': action,
43 |             'rewards': np.atleast_1d(reward),
44 |             'terminals': np.atleast_1d(terminal),
45 |             'next_observations': next_observation,
46 |             'infos': info,
47 |         }
48 | 
49 |         return processed_observation
50 | 
51 |     def sample(self):
52 |         if self._is_first_step:
53 |             self.reset()
54 | 
55 |         action = self.policy.action(self._policy_input).numpy()
56 | 
57 |         next_observation, reward, terminal, info = self.environment.step(
58 |             action)
59 |         self._path_length += 1
60 |         self._path_return += reward
61 |         self._total_samples += 1
62 | 
63 |         processed_sample = self._process_sample(
64 |             observation=self._current_observation,
65 |             action=action,
66 |             reward=reward,
67 |             terminal=terminal,
68 |             next_observation=next_observation,
69 |             info=info,
70 |         )
71 | 
72 |         self._current_path.append(processed_sample)
73 | 
74 |         if terminal or self._path_length >= self._max_path_length:
75 |             last_path = tree.map_structure(
76 |                 lambda *x: np.stack(x, axis=0), *self._current_path)
77 | 
78 |             self.pool.add_path({
79 |                 key: value
80 |                 for key, value in last_path.items()
81 |                 if key != 'infos'
82 |             })
83 | 
84 |             self._last_n_paths.appendleft(last_path)
85 | 
86 |             self._max_path_return = max(self._max_path_return,
87 |                                         self._path_return)
88 |             self._last_path_return = self._path_return
89 |             self._n_episodes += 1
90 | 
91 |             self.pool.terminate_episode()
92 | 
93 |             self._is_first_step = True
94 |             # Reset is done at the beginning of the next episode; see above.
95 | 
96 |         else:
97 |             self._current_observation = next_observation
98 |             self._is_first_step = False
99 | 
100 |         return next_observation, reward, terminal, info
101 | 
102 |     def get_diagnostics(self):
103 |         diagnostics = super(SimpleSampler, self).get_diagnostics()
104 |         diagnostics.update({
105 |             'max-path-return': self._max_path_return,
106 |             'last-path-return': self._last_path_return,
107 |             'episodes': self._n_episodes,
108 |             'total-samples': self._total_samples,
109 |         })
110 | 
111 |         return diagnostics
112 | 
--------------------------------------------------------------------------------
/softlearning/samplers/utils.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | 
3 | import numpy as np
4 | 
5 | from softlearning import replay_pools
6 | from . import simple_sampler
7 | 
8 | 
9 | DEFAULT_PIXEL_RENDER_KWARGS = {
10 |     'mode': 'rgb_array',
11 |     'width': 100,
12 |     'height': 100,
13 | }
14 | 
15 | DEFAULT_HUMAN_RENDER_KWARGS = {
16 |     'mode': 'human',
17 |     'width': 500,
18 |     'height': 500,
19 | }
20 | 
21 | 
22 | def rollout(environment,
23 |             policy,
24 |             path_length,
25 |             replay_pool_class=replay_pools.SimpleReplayPool,
26 |             sampler_class=simple_sampler.SimpleSampler,
27 |             render_kwargs=None,
28 |             break_on_terminal=True):
29 |     pool = replay_pool_class(environment, max_size=path_length)
30 |     sampler = sampler_class(
31 |         environment=environment,
32 |         policy=policy,
33 |         pool=pool,
34 |         max_path_length=path_length)
35 | 
36 |     render_mode = (render_kwargs or {}).get('mode', None)
37 |     if render_mode == 'rgb_array':
38 |         render_kwargs = {
39 |             **DEFAULT_PIXEL_RENDER_KWARGS,
40 |             **render_kwargs
41 |         }
42 |     elif render_mode == 'human':
43 |         render_kwargs = {
44 |             **DEFAULT_HUMAN_RENDER_KWARGS,
45 |             **render_kwargs
46 |         }
47 |     else:
48 |         render_kwargs = None
49 | 
50 |     images = []
51 |     infos = defaultdict(list)
52 | 
53 |     t = 0
54 |     for t in range(path_length):
55 |         observation, reward, terminal, info = sampler.sample()
56 |         for key, value in info.items():
57 |             infos[key].append(value)
58 | 
59 |         if render_kwargs:
60 |             image = environment.render(**render_kwargs)
61 |             images.append(image)
62 | 
63 |         if terminal:
64 |             policy.reset()
65 |             if break_on_terminal: break
66 | 
67 |     assert pool._size == t + 1
68 | 
69 |     path = pool.batch_by_indices(np.arange(pool._size))
70 |     path['infos'] = infos
71 | 
72 |     if render_mode == 'rgb_array':
73 |         path['images'] = np.stack(images, axis=0)
74 | 
75 |     return path
76 | 
77 | 
78 | def rollouts(n_paths, *args, **kwargs):
79 |     paths = [rollout(*args, **kwargs) for _ in range(n_paths)]
80 |     return paths
81 | 
--------------------------------------------------------------------------------
/softlearning/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rail-berkeley/softlearning/13cf187cc93d90f7c217ea2845067491c3c65464/softlearning/scripts/__init__.py
--------------------------------------------------------------------------------
/softlearning/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rail-berkeley/softlearning/13cf187cc93d90f7c217ea2845067491c3c65464/softlearning/utils/__init__.py
--------------------------------------------------------------------------------
/softlearning/utils/dict.py:
--------------------------------------------------------------------------------
1 | import collections.abc
2 | 
3 | 
4 | def deep_update(d, *us):
5 |     d = d.copy()
6 | 
7 |     for u in us:
8 |         u = u.copy()
9 |         for k, v in u.items():
10 |             d[k] = (
11 |                 deep_update(d.get(k, {}), v)
12 |                 if isinstance(v, collections.abc.Mapping)
13 |                 else v)
14 | 
15 |     return d
16 | 
--------------------------------------------------------------------------------
/softlearning/utils/gcp.py:
--------------------------------------------------------------------------------
1 | import requests
2 | 
3 | 
4 | def instance_preempted():
5 |     try:
6 |         response = requests.get(
7 |             "http://metadata/computeMetadata/v1/instance/preempted",
8 |             headers={'Metadata-Flavor': 'Google'}
9 |         )
10 |         preempted = (response.status_code == 200
11 |                      and response.text != 'FALSE')
12 |     except Exception:
13 |         preempted = False
14 | 
15 |     return preempted
16 | 
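A quick illustration of `deep_update` (softlearning/utils/dict.py above); this is a minimal sketch, and the nested configuration dicts are made up for the example:

    from softlearning.utils.dict import deep_update

    defaults = {'algorithm': {'kwargs': {'lr': 3e-4, 'tau': 5e-3}}, 'seed': 1}
    overrides = {'algorithm': {'kwargs': {'lr': 1e-4}}}

    merged = deep_update(defaults, overrides)
    # Nested mappings are merged key by key rather than replaced wholesale:
    # merged == {'algorithm': {'kwargs': {'lr': 1e-4, 'tau': 5e-3}}, 'seed': 1}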
--------------------------------------------------------------------------------
/softlearning/utils/git.py:
--------------------------------------------------------------------------------
1 | from softlearning.utils.misc import PROJECT_PATH
2 | 
3 | 
4 | def get_git_rev(path=PROJECT_PATH, search_parent_directories=True):
5 |     try:
6 |         import git
7 |     except ImportError:
8 |         print(
9 |             "Warning: gitpython not installed."
10 |             " Unable to log git rev."
11 |             " Run `pip install gitpython` if you want git revs to be logged.")
12 |         return None
13 | 
14 |     try:
15 |         repo = git.Repo(
16 |             path, search_parent_directories=search_parent_directories)
17 |         if repo.head.is_detached:
18 |             git_rev = repo.head.object.name_rev
19 |         else:
20 |             git_rev = repo.active_branch.commit.name_rev
21 |     except git.InvalidGitRepositoryError:
22 |         git_rev = None
23 | 
24 |     return git_rev
25 | 
--------------------------------------------------------------------------------
/softlearning/utils/gym.py:
--------------------------------------------------------------------------------
1 | from gym import spaces
2 | 
3 | 
4 | DISCRETE_SPACES = (
5 |     spaces.Discrete,
6 |     spaces.MultiBinary,
7 |     spaces.MultiDiscrete,
8 | )
9 | CONTINUOUS_SPACES = (spaces.Box, )
10 | 
11 | 
12 | def is_continuous_space(space):
13 |     return isinstance(space, CONTINUOUS_SPACES)
14 | 
15 | 
16 | def is_discrete_space(space):
17 |     return isinstance(space, DISCRETE_SPACES)
18 | 
--------------------------------------------------------------------------------
/softlearning/utils/misc.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 | 
4 | import tensorflow as tf
5 | import numpy as np
6 | 
7 | 
8 | PROJECT_PATH = os.path.dirname(
9 |     os.path.realpath(os.path.join(__file__, '..', '..')))
10 | 
11 | 
12 | def set_seed(seed):
13 |     seed %= 4294967294
14 |     random.seed(seed)
15 |     np.random.seed(seed)
16 |     tf.random.set_seed(seed)
17 |     print(f"Using seed {seed}")
18 | 
19 | 
20 | def get_host_name():
21 |     try:
22 |         import socket
23 |         return socket.gethostname()
24 |     except Exception as e:
25 |         print(f"Failed to get host name: {e}")
26 |         return None
27 | 
--------------------------------------------------------------------------------
/softlearning/utils/numpy.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | def softmax(x):
5 |     max_x = np.max(x)
6 |     exp_x = np.exp(x - max_x)
7 |     return exp_x / np.sum(exp_x)
8 | 
--------------------------------------------------------------------------------
/softlearning/utils/random.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | def spherical(size=None, ndim=2):
5 |     size = np.atleast_1d(size if size is not None else ())
6 |     random_normal = np.random.standard_normal((ndim, *size))
7 |     normalized = random_normal / np.linalg.norm(random_normal, axis=0)
8 |     return normalized
9 | 
--------------------------------------------------------------------------------
/softlearning/utils/tensorflow.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import tree
3 | 
4 | 
5 | def set_gpu_memory_growth(growth):
6 |     gpus = tf.config.experimental.list_physical_devices('GPU')
7 |     if gpus:
8 |         try:
9 |             # Currently, memory growth needs to be the same across GPUs
10 |             for gpu in gpus:
11 |                 tf.config.experimental.set_memory_growth(gpu, growth)
12 |             logical_gpus = tf.config.experimental.list_logical_devices('GPU')
13 |             print(len(gpus), "Physical GPUs,", len(logical_gpus),
14 |                   "Logical GPUs")
15 |         except RuntimeError as e:
16 |             # Memory growth must be set before GPUs have been initialized
17 |             print(e)
18 | 
19 | 
20 | def apply_preprocessors(preprocessors, inputs):
21 |     tree.assert_same_structure(inputs, preprocessors)
22 |     preprocessed_inputs = tree.map_structure(
23 |         lambda preprocessor, input_: (
24 |             preprocessor(input_) if preprocessor is not None else input_),
25 |         preprocessors,
26 |         inputs,
27 |     )
28 | 
29 |     return preprocessed_inputs
30 | 
31 | 
32 | def cast_and_concat(x):
33 |     x = tree.map_structure(
34 |         lambda element: tf.cast(element, tf.float32), x)
35 |     x = tree.flatten(x)
36 |     x = tf.concat(x, axis=-1)
37 |     return x
38 | 
--------------------------------------------------------------------------------
/softlearning/utils/times.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | 
3 | 
4 | def datetimestamp(divider='-', datetime_divider='T'):
5 |     now = datetime.datetime.now()
6 |     return now.strftime(
7 |         '%Y{d}%m{d}%d{dtd}%H{d}%M{d}%S'
8 |         .format(d=divider, dtd=datetime_divider))
9 | 
10 | 
11 | def datestamp(divider='-'):
12 |     return datetime.date.today().isoformat().replace('-', divider)
13 | 
14 | 
15 | def timestamp(divider='-'):
16 |     now = datetime.datetime.now()
17 |     time_now = datetime.datetime.time(now)
18 |     return time_now.strftime(
19 |         '%H{d}%M{d}%S'.format(d=divider))
20 | 
--------------------------------------------------------------------------------
/softlearning/utils/tune.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | from pprint import pprint
4 | import re
5 | import shutil
6 | 
7 | 
8 | RESULT_FILE_REGEXES = (
9 |     "^result.json$",
10 |     "^progress.csv$",
11 |     "^events.out.tfevents.\\d+\\..+$",
12 | )
13 | 
14 | 
15 | PARAMS_FILE_REGEXES = (
16 |     "^params.json$",
17 |     "^params.pkl$",
18 | )
19 | 
20 | CHECKPOINT_DIRECTORY_REGEXES = (
21 |     "^checkpoint_\\d+$",
22 | )
23 | 
24 | 
25 | def is_result_file(filename):
26 |     return any(
27 |         re.match(result_file_regex, filename)
28 |         for result_file_regex in RESULT_FILE_REGEXES)
29 | 
30 | 
31 | def is_params_file(filename):
32 |     return any(
33 |         re.match(params_file_regex, filename)
34 |         for params_file_regex in PARAMS_FILE_REGEXES)
35 | 
36 | 
37 | def is_checkpoint_directory(dirname):
38 |     # TODO(hartikainen): might want to check the contents of this directory.
39 |     # e.g. check `.tune_metadata`, etc.
40 |     return any(
41 |         re.match(checkpoint_directory_regex, dirname)
42 |         for checkpoint_directory_regex in CHECKPOINT_DIRECTORY_REGEXES)
43 | 
44 | 
45 | def is_trial_directory(root_dir):
46 |     if not os.path.isdir(root_dir):
47 |         return False
48 | 
49 |     root, directories, files = next(os.walk(root_dir))
50 |     # json logger: params.json, result.json, params.pkl
51 |     # csv logger: progress.csv
52 |     # tf logger: events.out.tfevents.1562394433.ray-hopp-2-head-4ba37bcf
53 |     # log_syncxurz09ic.log
54 | 
55 |     result_files = [
56 |         filename
57 |         for filename in files
58 |         if is_result_file(filename)
59 |     ]
60 | 
61 |     params_files = [
62 |         filename
63 |         for filename in files
64 |         if is_params_file(filename)
65 |     ]
66 | 
67 |     # TODO(hartikainen): checkpoint_directories are currently unused here
68 |     checkpoint_directories = [
69 |         directory
70 |         for directory in directories
71 |         if is_checkpoint_directory(os.path.join(root, directory))
72 |     ]
73 | 
74 |     # TODO(hartikainen): might want to check if "^log_sync\\d{8}.log$" exists
75 | 
76 |     return bool(result_files and params_files)
77 | 
78 | 
79 | def is_experiment_directory(root_dir):
80 |     if not os.path.isdir(root_dir):
81 |         return False
82 | 
83 |     root, directories, files = next(os.walk(root_dir))
84 |     # 1) experiment_state.json exists -> is experiment
85 |     experiment_state_paths = glob.glob(
86 |         os.path.join(root, "experiment_state*.json"))
87 | 
88 |     if experiment_state_paths:
89 |         # TODO(hartikainen): This needs to be fixed. In general, a directory
90 |         # can have multiple experiment state files. Softlearning experiment
91 |         # directories shouldn't, though.
92 |         assert len(experiment_state_paths) == 1, experiment_state_paths
93 |         return True
94 | 
95 |     # 2) All the subfolders are trials -> is experiment
96 |     if directories and all(
97 |             is_trial_directory(os.path.join(root, directory))
98 |             for directory in directories):
99 |         return True
100 | 
101 |     return False
102 | 
103 | 
104 | def find_all_experiment_directories(root_dir):
105 |     """Given a directory path, recursively find all experiment directories in it.
106 | 
107 |     TODO(hartikainen): Should maybe have an option for recursive=False?
108 |     """
109 | 
110 |     root_dir = os.path.expanduser(root_dir)
111 | 
112 |     if is_experiment_directory(root_dir):
113 |         return (root_dir, )
114 | 
115 |     directories = next(os.walk(root_dir))[1]
116 |     all_experiment_directories = sum((
117 |         find_all_experiment_directories(os.path.join(root_dir, directory))
118 |         for directory in directories
119 |     ), ())
120 | 
121 |     return all_experiment_directories
122 | 
123 | 
124 | def find_all_trial_directories(experiment_dir):
125 |     """Given a path to an experiment, find all trial directories in it.
126 | 
127 |     Raises an error if the given experiment path is not actually an
128 |     experiment path.
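129 | 
130 |     For example (an illustrative sketch; actual directory names depend on
131 |     the Ray Tune run):
132 | 
133 |     >>> find_all_trial_directories('~/ray_results/my-experiment')
134 |     ['/home/user/ray_results/my-experiment/trial-0', ...]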
135 |     """
136 | 
137 |     assert is_experiment_directory(experiment_dir), experiment_dir
138 | 
139 |     experiment_dir = os.path.expanduser(experiment_dir)
140 |     directories = next(os.walk(experiment_dir))[1]
141 | 
142 |     all_trial_directories = [
143 |         os.path.join(experiment_dir, directory)
144 |         for directory in directories
145 |         if is_trial_directory(os.path.join(experiment_dir, directory))
146 |     ]
147 | 
148 |     return all_trial_directories
149 | 
--------------------------------------------------------------------------------
/softlearning/utils/video.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | import numpy as np
4 | 
5 | 
6 | def _make_dir(filename):
7 |     folder = os.path.dirname(filename)
8 |     if not os.path.exists(folder):
9 |         os.makedirs(folder)
10 | 
11 | 
12 | def save_video(video_frames, filename, fps=60, video_format='mp4'):
13 |     assert fps == int(fps), fps
14 |     import skvideo.io
15 |     _make_dir(filename)
16 | 
17 |     skvideo.io.vwrite(
18 |         filename,
19 |         video_frames,
20 |         inputdict={
21 |             '-r': str(int(fps)),
22 |         },
23 |         outputdict={
24 |             '-f': video_format,
25 |             '-pix_fmt': 'yuv420p',  # '-pix_fmt=yuv420p' needed for osx https://github.com/scikit-video/scikit-video/issues/74
26 |         }
27 |     )
28 | 
29 | 
30 | def create_video_grid(col_and_row_frames):
31 |     video_grid_frames = np.concatenate([
32 |         np.concatenate(row_frames, axis=-2)
33 |         for row_frames in col_and_row_frames
34 |     ], axis=-3)
35 | 
36 |     return video_grid_frames
37 | 
--------------------------------------------------------------------------------
/softlearning/value_functions/__init__.py:
--------------------------------------------------------------------------------
1 | from .vanilla import (  # noqa: unused-import
2 |     feedforward_Q_function,
3 |     double_feedforward_Q_function,
4 |     ensemble_feedforward_Q_function,
5 | )
6 | 
7 | from softlearning.utils.serialization import (
8 |     serialize_softlearning_object, deserialize_softlearning_object)
9 | 
10 | 
11 | def serialize(value_function):
12 |     return serialize_softlearning_object(value_function)
13 | 
14 | 
15 | def deserialize(name, custom_objects=None):
16 |     """Returns a value function or class denoted by input string.
17 | 
18 |     Arguments:
19 |         name: The name of the value function.
20 |         custom_objects: Optional dictionary mapping names to custom
21 |             objects considered during deserialization.
22 | 
23 |     Returns:
24 |         Value function or class denoted by input string.
25 | 
26 |     For example:
27 |     >>> softlearning.value_functions.get('double_feedforward_Q_function')
28 | 
29 |     >>> softlearning.value_functions.get('abcd')
30 |     Traceback (most recent call last):
31 |     ...
32 |     ValueError: Unknown value function: abcd
33 | 
34 |     Raises:
35 |         ValueError: `Unknown value function` if the input string does not
36 |             denote any defined value function.
37 |     """
38 |     return deserialize_softlearning_object(
39 |         name,
40 |         module_objects=globals(),
41 |         custom_objects=custom_objects,
42 |         printable_module_name='value function')
43 | 
44 | 
45 | def get(identifier):
46 |     """Returns a value function.
47 | 
48 |     Arguments:
49 |         identifier: function, string, or dict.
50 | 
51 |     Returns:
52 |         A value function denoted by identifier.
53 | 
54 |     For example:
55 | 
56 |     >>> softlearning.value_functions.get('double_feedforward_Q_function')
57 | 
58 |     >>> softlearning.value_functions.get('abcd')
59 |     Traceback (most recent call last):
60 |     ...
61 |     ValueError: Unknown value function: abcd
62 | 
63 |     Raises:
64 |         ValueError: Input is an unknown function or string, i.e., the
65 |             identifier does not denote any defined value function.
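66 | 
67 |     A `dict` identifier is passed to `deserialize` the same way as a
68 |     string identifier.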
69 |     """
70 |     if identifier is None:
71 |         return None
72 |     if isinstance(identifier, str):
73 |         return deserialize(identifier)
74 |     elif isinstance(identifier, dict):
75 |         return deserialize(identifier)
76 |     elif callable(identifier):
77 |         return identifier
78 |     else:
79 |         raise TypeError(
80 |             "Could not interpret value function identifier: "
81 |             f"{repr(identifier)}.")
82 | 
--------------------------------------------------------------------------------
/softlearning/value_functions/base_value_function.py:
--------------------------------------------------------------------------------
1 | import abc
2 | from collections import OrderedDict
3 | 
4 | import tensorflow as tf
5 | import tree
6 | 
7 | 
8 | class BaseValueFunction:
9 |     def __init__(self, model, observation_keys, name='value_function'):
10 |         self._observation_keys = observation_keys
11 |         self.model = model
12 |         self._name = name
13 | 
14 |     @property
15 |     def name(self):
16 |         return self._name
17 | 
18 |     @property
19 |     def observation_keys(self):
20 |         return self._observation_keys
21 | 
22 |     def reset(self):
23 |         """Reset and clean the value function."""
24 | 
25 |     def get_weights(self, *args, **kwargs):
26 |         return self.model.get_weights(*args, **kwargs)
27 | 
28 |     def set_weights(self, *args, **kwargs):
29 |         return self.model.set_weights(*args, **kwargs)
30 | 
31 |     def save_weights(self, *args, **kwargs):
32 |         self.model.save_weights(*args, **kwargs)
33 | 
34 |     def load_weights(self, *args, **kwargs):
35 |         self.model.load_weights(*args, **kwargs)
36 | 
37 |     @property
38 |     def weights(self):
39 |         """Returns the list of all value function variables/weights.
40 | 
41 |         Returns:
42 |             A list of variables.
43 |         """
44 |         return self.trainable_weights + self.non_trainable_weights
45 | 
46 |     @property
47 |     def trainable_weights(self):
48 |         return self.model.trainable_weights
49 | 
50 |     @property
51 |     def non_trainable_weights(self):
52 |         return self.model.non_trainable_weights
53 | 
54 |     @property
55 |     def variables(self):
56 |         """Returns the list of all value function variables/weights.
57 | 
58 |         Alias of `self.weights`.
59 | 
60 |         Returns:
61 |             A list of variables.
62 |         """
63 |         return self.weights
64 | 
65 |     @property
66 |     def trainable_variables(self):
67 |         return self.trainable_weights
68 | 
69 |     @property
70 |     def non_trainable_variables(self):
71 |         return self.non_trainable_weights
72 | 
73 |     @abc.abstractmethod
74 |     def values(self, inputs):
75 |         """Compute values for given inputs (e.g. observations)."""
76 |         raise NotImplementedError
77 | 
78 |     def value(self, *args, **kwargs):
79 |         """Compute a value for a single input (e.g. a single observation)."""
80 |         args_, kwargs_ = tree.map_structure(
81 |             lambda x: x[None, ...], (args, kwargs))
82 |         values = self.values(*args_, **kwargs_)
83 |         value = tree.map_structure(lambda x: x[0], values)
84 |         return value
85 | 
86 |     def _filter_observations(self, observations):
87 |         if (isinstance(observations, dict)
88 |                 and self._observation_keys is not None):
89 |             observations = type(observations)((
90 |                 (key, observations[key])
91 |                 for key in self.observation_keys
92 |             ))
93 |         return observations
94 | 
95 |     def get_diagnostics(self, *inputs):
96 |         """Return loggable diagnostic information of the value function."""
97 |         diagnostics = OrderedDict()
98 |         return diagnostics
99 | 
100 |     def __getstate__(self):
101 |         state = self.__dict__.copy()
102 |         model = state.pop('model')
103 |         state.update({
104 |             'model_config': model.get_config(),
105 |             'model_weights': model.get_weights(),
106 |         })
107 |         return state
108 | 
109 |     def __setstate__(self, state):
110 |         model_config = state.pop('model_config')
111 |         model_weights = state.pop('model_weights')
112 |         model = tf.keras.Model.from_config(model_config)
113 |         model.set_weights(model_weights)
114 |         state['model'] = model
115 |         self.__dict__ = state
116 | 
117 | 
118 | class StateValueFunction(BaseValueFunction):
119 |     def values(self, observations, **kwargs):
120 |         """Compute values given observations."""
121 |         observations = self._filter_observations(observations)
122 |         values = self.model(observations, **kwargs)
123 |         return values
124 | 
125 | 
126 | class StateActionValueFunction(BaseValueFunction):
127 |     def values(self, observations, actions, **kwargs):
128 |         """Compute values given observations and actions."""
129 |         observations = self._filter_observations(observations)
130 |         values = self.model((observations, actions), **kwargs)
131 |         return values
132 | 
--------------------------------------------------------------------------------
/softlearning/value_functions/base_value_function_test.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | from collections import OrderedDict
3 | 
4 | import numpy as np
5 | import tensorflow as tf
6 | import tree
7 | 
8 | from softlearning.value_functions.vanilla import feedforward_Q_function
9 | from softlearning.environments.utils import get_environment
10 | 
11 | 
12 | class ValueFunctionTest(tf.test.TestCase):
13 |     def setUp(self):
14 |         self.env = get_environment('gym', 'Swimmer', 'v3', {})
15 |         self.hidden_layer_sizes = (8, 8)
16 | 
17 |         observation_shapes = OrderedDict((
18 |             (key, value) for key, value in self.env.observation_shape.items()
19 |         ))
20 |         action_shape = self.env.action_shape
21 |         input_shapes = (observation_shapes, action_shape)
22 |         self.value_function = feedforward_Q_function(
23 |             input_shapes=input_shapes,
24 |             hidden_layer_sizes=self.hidden_layer_sizes,
25 |         )
26 | 
27 |     def test_values(self):
28 |         _ = self.env.reset()
29 |         action1_np = self.env.action_space.sample()
30 |         observation1_np = self.env.step(action1_np)[0]
31 |         action2_np = self.env.action_space.sample()
32 |         observation2_np = self.env.step(action2_np)[0]
33 | 
34 |         observations_np = type(observation1_np)((
35 |             (key, np.stack((
36 |                 observation1_np[key], observation2_np[key]
37 |             ), axis=0).astype(np.float32))
38 |             for key in observation1_np.keys()
39 |         ))
40 | 
41 |         actions_np = np.stack((
42 |             action1_np, action2_np
43 |         ), axis=0).astype(np.float32)
44 | 
45 |         observations_tf = tree.map_structure(
46 |             lambda x: tf.constant(x, dtype=x.dtype), observations_np)
47 |         actions_tf = tree.map_structure(
48 |             lambda x: tf.constant(x, dtype=x.dtype), actions_np)
49 | 
50 |         for observations, actions in (
51 |                 (observations_np, actions_np),
52 |                 (observations_tf, actions_tf)):
53 |             values = self.value_function.values(observations, actions)
54 | 
55 |             tf.debugging.assert_shapes(((values, (2, 1)),))
56 | 
57 |     def test_trainable_variables(self):
58 |         self.assertEqual(
59 |             len(self.value_function.trainable_variables),
60 |             2 * (len(self.hidden_layer_sizes) + 1))
61 | 
62 |     def test_get_diagnostics(self):
63 |         _ = self.env.reset()
64 |         action1 = self.env.action_space.sample()
65 |         observation1 = self.env.step(action1)[0]
66 |         action2 = self.env.action_space.sample()
67 |         observation2 = self.env.step(action2)[0]
68 | 
69 |         observations = type(observation1)((
70 |             (key, np.stack((
71 |                 observation1[key], observation2[key]
72 |             ), axis=0).astype(np.float32))
73 |             for key in observation1.keys()
74 |         ))
75 | 
76 |         actions = np.stack((
77 |             action1, action2
78 |         ), axis=0).astype(np.float32)
79 | 
80 |         diagnostics = self.value_function.get_diagnostics(
81 |             observations, actions)
82 | 
83 |         self.assertIsInstance(diagnostics, OrderedDict)
84 |         self.assertEqual(tuple(diagnostics.keys()), ())
85 | 
86 |         for value in diagnostics.values():
87 |             self.assertTrue(np.isscalar(value))
88 | 
89 |     def test_serialize_deserialize(self):
90 |         _ = self.env.reset()
91 |         action1_np = self.env.action_space.sample()
92 |         observation1_np = self.env.step(action1_np)[0]
93 |         action2_np = self.env.action_space.sample()
94 |         observation2_np = self.env.step(action2_np)[0]
95 | 
96 |         observations = type(observation1_np)((
97 |             (key, np.stack((
98 |                 observation1_np[key], observation2_np[key]
99 |             ), axis=0).astype(np.float32))
100 |             for key in observation1_np.keys()
101 |         ))
102 | 
103 |         actions = np.stack((
104 |             action1_np, action2_np
105 |         ), axis=0).astype(np.float32)
106 | 
107 |         weights_1 = self.value_function.get_weights()
108 | 
109 |         values_1 = self.value_function.values(observations, actions).numpy()
110 | 
111 |         serialized = pickle.dumps(self.value_function)
112 |         deserialized = pickle.loads(serialized)
113 | 
114 |         weights_2 = deserialized.get_weights()
115 |         values_2 = deserialized.values(observations, actions).numpy()
116 | 
117 |         for weight_1, weight_2 in zip(weights_1, weights_2):
118 |             np.testing.assert_array_equal(weight_1, weight_2)
119 | 
120 |         np.testing.assert_array_equal(values_1, values_2)
121 | 
122 | 
123 | if __name__ == '__main__':
124 |     tf.test.main()
125 | 
--------------------------------------------------------------------------------
/softlearning/value_functions/vanilla.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import tree
3 | 
4 | from softlearning.models.feedforward import feedforward_model
5 | from softlearning.models.utils import create_inputs
6 | from softlearning.utils.tensorflow import apply_preprocessors
7 | from softlearning import preprocessors as preprocessors_lib
8 | from softlearning.utils.tensorflow import cast_and_concat
9 | 
10 | from .base_value_function import StateActionValueFunction
11 | 
12 | 
13 | def create_ensemble_value_function(N, value_fn, *args, **kwargs):
14 |     # TODO(hartikainen): The ensemble Q-function should support the same
15 |     # interface as the regular ones. Implement the double min-thing
16 |     # as a Keras layer.
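17 |     # NOTE: downstream algorithms typically reduce this tuple with an
18 |     # elementwise minimum over the Q-values (the clipped double-Q trick).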
19 |     value_fns = tuple(value_fn(*args, **kwargs) for _ in range(N))
20 |     return value_fns
21 | 
22 | 
23 | def double_feedforward_Q_function(*args, **kwargs):
24 |     return create_ensemble_value_function(
25 |         2, feedforward_Q_function, *args, **kwargs)
26 | 
27 | 
28 | def ensemble_feedforward_Q_function(N, *args, **kwargs):
29 |     return create_ensemble_value_function(
30 |         N, feedforward_Q_function, *args, **kwargs)
31 | 
32 | 
33 | def feedforward_Q_function(input_shapes,
34 |                            *args,
35 |                            preprocessors=None,
36 |                            observation_keys=None,
37 |                            name='feedforward_Q',
38 |                            **kwargs):
39 |     inputs = create_inputs(input_shapes)
40 | 
41 |     if preprocessors is None:
42 |         preprocessors = tree.map_structure(lambda _: None, inputs)
43 | 
44 |     preprocessors = tree.map_structure_up_to(
45 |         inputs, preprocessors_lib.deserialize, preprocessors)
46 | 
47 |     preprocessed_inputs = apply_preprocessors(preprocessors, inputs)
48 | 
49 |     # NOTE(hartikainen): `feedforward_model` would do the `cast_and_concat`
50 |     # step for us, but tf2.2 broke the sequential multi-input handling; see:
51 |     # https://github.com/tensorflow/tensorflow/issues/37061.
52 |     out = tf.keras.layers.Lambda(cast_and_concat)(preprocessed_inputs)
53 |     Q_model_body = feedforward_model(
54 |         *args,
55 |         output_shape=[1],
56 |         name=name,
57 |         **kwargs
58 |     )
59 | 
60 |     Q_model = tf.keras.Model(inputs, Q_model_body(out), name=name)
61 | 
62 |     Q_function = StateActionValueFunction(
63 |         model=Q_model, observation_keys=observation_keys, name=name)
64 | 
65 |     return Q_function
66 | 
--------------------------------------------------------------------------------
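A usage sketch tying the value-function pieces together; this is a non-authoritative example that assumes only what the repo itself shows (`get_environment` from softlearning/environments/utils.py, and the `hidden_layer_sizes` keyword forwarded to `feedforward_model`, as exercised in base_value_function_test.py above):

    from softlearning.environments.utils import get_environment
    from softlearning.value_functions.vanilla import double_feedforward_Q_function

    env = get_environment('gym', 'Swimmer', 'v3', {})
    input_shapes = (env.observation_shape, env.action_shape)

    # Two independently initialized Q-functions, each a (256, 256) MLP.
    Qs = double_feedforward_Q_function(
        input_shapes=input_shapes,
        hidden_layer_sizes=(256, 256))

    # Each element supports the StateActionValueFunction interface, e.g.:
    # values = tuple(Q.values(observations, actions) for Q in Qs)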