├── .gitignore ├── LICENSE ├── README.md ├── examples ├── results │ ├── ddpg_on_pendulum.png │ ├── dqn_on_cartpole.png │ └── qlearning_on_cliffwalking.png ├── use_ddpg_in_pendulum.ipynb ├── use_dqn_in_cartpole.ipynb └── use_qlearning_in_cliff_walking.ipynb ├── life ├── base │ ├── __pycache__ │ │ ├── q_learning.cpython-37.pyc │ │ ├── sarsa.cpython-37.pyc │ │ └── trainer.cpython-37.pyc │ ├── q_learning.py │ ├── sarsa.py │ └── trainer.py ├── dqn │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── dqn.cpython-37.pyc │ │ ├── dqn_improved.cpython-37.pyc │ │ └── trainer.cpython-37.pyc │ ├── dqn.py │ ├── dqn_improved.py │ └── trainer.py ├── envs │ ├── __pycache__ │ │ ├── cliffwalking.cpython-37.pyc │ │ ├── con_env_demo.cpython-37.pyc │ │ └── dis_env_demo.cpython-37.pyc │ ├── cliffwalking.py │ ├── con_env_demo.py │ └── dis_env_demo.py ├── imitation │ ├── __init__.py │ ├── bc.py │ ├── gail.py │ └── trainer.py ├── policy │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── ddpg.cpython-37.pyc │ │ ├── ppo.cpython-37.pyc │ │ ├── reinforce.cpython-37.pyc │ │ ├── sac.cpython-37.pyc │ │ └── trainer.cpython-37.pyc │ ├── ac │ │ ├── a3c.py │ │ └── ac.py │ ├── ddpg.py │ ├── ppo.py │ ├── reinforce.py │ ├── sac.py │ └── trainer.py ├── test │ ├── test_dqn.py │ ├── test_off_policy.py │ ├── test_on_policy.py │ └── test_ql.py └── utils │ ├── __pycache__ │ ├── calculator.cpython-37.pyc │ └── cont2disp.cpython-37.pyc │ ├── calculator.py │ ├── cont2disp.py │ └── replay │ ├── __pycache__ │ └── replay_buffer.cpython-37.pyc │ ├── per_replay_buffer.py │ └── replay_buffer.py └── main.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | 
parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 HanggeAi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Life 2 | Life is a library for reinforcement learning implemented in PyTorch. 3 | ![190503058512b5a059717be2719e6a1](https://user-images.githubusercontent.com/106570281/220634585-7f9375f2-599f-479c-bceb-f624f9932528.jpg) 4 | ## 目前,Life库实现的算法有: 5 | - Sarsa 6 | - multi-Sarsa 7 | - Q-Learning 8 | - Dyna-Q 9 | - DQN 10 | - Double-DQN 11 | - Dueling-DQN 12 | - REINFORCE策略梯度 13 | - Actor-Critic 14 | - PPO 15 | - DDPG 16 | - SAC 17 | - BC 18 | - GAIL 19 | - CQL 20 | # 安装(install) 21 | ```bash 22 | pip install rllife 23 | ``` 24 | ## 或者 25 | 你可以在[PyPI](https://pypi.org/project/rllife/#files)上面下载.gz文件,然后通过本地安装。 26 | # requirement 27 | ```bash 28 | pyyaml==6.0 29 | ipykernel==6.15.1 30 | jupyter==1.0.0 31 | matplotlib==3.5.3 32 | seaborn==0.12.1 33 | dill==0.3.5.1 34 | argparse==1.4.0 35 | pandas==1.3.5 36 | pyglet==1.5.26 37 | importlib-metadata<5.0 38 | setuptools==65.2.0 39 | gym==0.25.2 40 | numpy==1.21.6 41 | pandas==1.3.4 42 | torch==1.10.0 43 | tqdm==4.64.1 44 | ``` 45 | ## 主要特征 46 | - 基于目前主流的深度学习框架pytorch,支持gpu加速。 47 | - 简洁易用,仅需寥寥几行代码,即可实现强化学习算法的构建与训练。 48 | - 覆盖面广,从传统的QLearning,到一些最新的强化学习算法都有实现。 49 | - 所有超参均支持自定义,同时可自定义深度神经网络的结构,封装程度低而又简洁易用。 50 | ## 图解Life的结构 51 | ![life_struct](https://user-images.githubusercontent.com/106570281/221387421-566e1444-ea61-48ed-b68e-34ee1725560f.jpg) 52 | ## Life将强化学习算法分为以下几类: 53 | 1. 传统的强化学习算法,如Sarsa; 54 | 2. 只基于值函数的深度强化学习算法,如DQN; 55 | 3. 基于策略函数和值函数的深度强化学习算法,如AC; 56 | 4. 模仿强化学习算法,如BC; 57 | 5. 
离线强化学习算法,如CQL。 58 | ## 对于每一类强化学习算法,都配有一个训练器 59 | 训练器的名称和算法的名称是一一对应的,如要训练```DQN```,则其训练函数的名称为: 60 | ```train_dqn``` 61 | ### 以DQN为例,其结构如下 62 | ![dqn_struct](https://user-images.githubusercontent.com/106570281/221387444-67dc5dc9-4ba1-4707-9bcc-d8ae9abdb7cf.jpg) 63 | 其中: 64 | - dqn.py中为传统DQN算法 65 | - dqn_improved.py中为一些改进的DQN算法 66 | - trainer中包含了以上各种dqn算法的训练函数 67 | # Get Started 68 | 要使用Life进行强化学习,仅需简单的三步,下面以DQN在CartPole环境上的训练为例进行快速入门: 69 | ## 第一步,导入相关的模块 70 | ```python 71 | from life.dqn.dqn import DQN # 导入模型 72 | from life.dqn.trainer import train_dqn # 导入训练器 73 | from life.envs.dis_env_demo import make # 环境的一个例子 74 | from life.utils.replay.replay_buffer import ReplayBuffer # 回放池 75 | import torch 76 | import matplotlib.pyplot as plt 77 | ``` 78 | ## 第二步,设置超参数,并构建模型 79 | ```python 80 | # 设置超参数 81 | lr = 2e-3 82 | num_episodes = 500 83 | hidden_dim = 128 84 | gamma = 0.98 85 | epsilon = 0.01 86 | target_update = 10 87 | buffer_size = 10000 88 | minimal_size = 500 89 | batch_size = 64 90 | device = torch.device("cpu") # 也可指定为gpu : torch.device("cuda") 91 | 92 | env=make() # 建立环境,这里为 CartPole-v0 93 | replay_buffer = ReplayBuffer(buffer_size) # 回放池 94 | state_dim = env.observation_space.shape[0] 95 | action_dim = env.action_space.n 96 | 97 | # 建立模型 98 | agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, 99 | target_update, device) # DQN模型 100 | ``` 101 | 注意,如果你足够细心,你会发现在上述建立DQN的过程中,我们没有传入一个Neural Network,这是因为在建立深度强化学习时,Life提供了一个默认的双层神经网络作为建立DQN的**默认神经网络**。当然,你也可以**使用自己设计的神经网络结构:** 102 | ```python 103 | class YourNet(torch.nn.Module): 104 | """your network for your task""" 105 | pass 106 | 107 | 108 | agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, 109 | target_update, device, q_net=YourNet) # DQN模型 110 | ``` 111 | 注意:DQN内部会以```q_net(state_dim, hidden_dim, action_dim)```的方式实例化网络,因此自定义的网络类需要继承```torch.nn.Module```,并且其构造函数要能接收这三个参数(内部可以不使用state_dim和hidden_dim);action_dim还会用于epsilon-贪婪策略中的随机动作采样,必须与环境的动作个数一致。 112 | ## 第三步,使用训练器训练模型 113 | ```python 114 | result=train_dqn(agent,env,replay_buffer,minimal_size,batch_size,num_episodes) 115 | ``` 116 | ## 
上述训练函数返回的是:训练过程中每个回合的回报,如果你想的话,可以将其可视化出来: 117 | ```python 118 | episodes_list = list(range(len(result))) 119 | plt.figure(figsize=(8,6)) 120 | plt.plot(episodes_list, result) 121 | plt.xlabel("Episodes") 122 | plt.ylabel("Returns") 123 | plt.title("DQN on {}".format("Cart Pole v1")) 124 | plt.show() 125 | ``` 126 | 得到: 127 | ![dqn_on_cartpole](https://user-images.githubusercontent.com/106570281/221387500-714d271b-51fa-43b5-9025-56dd4b5c76b7.png) 128 | ## 当然,如果你需要智能体的话,也可以设置```return_agent=True```,这会返回一个元组```(return_list, agent)``` 129 | 其中,```return_list```为:训练过程中每个回合的回报,```agent```为训练好的智能体。 130 | ```return_agent```默认为```False```。 131 | 132 | **可见,除了超参数的设置之外,我们构建DQN算法只使用了两行代码:** 133 | ```python 134 | from life.dqn.dqn import DQN 135 | agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon,target_update, device) 136 | ``` 137 | **我们训练DQN同样只使用了两行代码:** 138 | ```python 139 | from life.dqn.trainer import train_dqn 140 | result=train_dqn(agent,env,replay_buffer,minimal_size,batch_size,num_episodes) 141 | ``` 142 | ### 这让我们的强化学习实现得相当简洁和方便! 
143 | 144 | ## 上述的例子在项目的examples中 145 | # 关于名称与LOGO 146 | - Life的中文含义为:生命,生活,强化学习本来就是人生的一个过程,我们无时无刻不在进行着强化学习。强化学习不仅是一种科学的决策方法,各种算法的思想也给予我们很多人生的哲理,使人受益匪浅。 147 | - LOGO 底色采用深蓝色,图案和文字采用浅蓝白色,整体端庄严谨,富有科技感。文字部分由项目名称LIFE字样和寄语:RL IS THE PROCESS OF LIFE 即可以理解为强化学习是人生的过程,也可以理解为强化学习是Life库的程序,一语双关。 148 | - LOGO图案部分为4个伸长了的F,同时将F上面一个笔画伸长,使其左旋90°时形成L字样,为LIFE的简写LF; 同时致敬OpenAI的LOGO: 149 | ![image](https://user-images.githubusercontent.com/106570281/221387550-49896c2c-dfa9-4f35-a2d6-56314e8cb44f.png) 150 | 151 | -------------------------------------------------------------------------------- /examples/results/ddpg_on_pendulum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HanggeAi/Life/10c4d37fb1112ac017ca1239f85e7874cb51aa32/examples/results/ddpg_on_pendulum.png -------------------------------------------------------------------------------- /examples/results/dqn_on_cartpole.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HanggeAi/Life/10c4d37fb1112ac017ca1239f85e7874cb51aa32/examples/results/dqn_on_cartpole.png -------------------------------------------------------------------------------- /examples/results/qlearning_on_cliffwalking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HanggeAi/Life/10c4d37fb1112ac017ca1239f85e7874cb51aa32/examples/results/qlearning_on_cliffwalking.png -------------------------------------------------------------------------------- /examples/use_ddpg_in_pendulum.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# 第一步,导入相关的模块" 7 | ], 8 | "metadata": { 9 | "collapsed": false 10 | } 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 3, 15 | "outputs": [], 16 | "source": [ 17 | "from life.policy.ddpg import DDPG\n", 
18 | "from life.policy.trainer import train_ddpg\n", 19 | "from life.envs.con_env_demo import make\n", 20 | "from life.utils.replay.replay_buffer import ReplayBuffer\n", 21 | "import torch\n", 22 | "import matplotlib.pyplot as plt" 23 | ], 24 | "metadata": { 25 | "collapsed": false 26 | } 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "source": [ 31 | "# 第二步,设置超参数,并建立模型" 32 | ], 33 | "metadata": { 34 | "collapsed": false 35 | } 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "outputs": [ 41 | { 42 | "name": "stderr", 43 | "output_type": "stream", 44 | "text": [ 45 | "D:\\anocoodaa\\envs\\zyhrl\\lib\\site-packages\\gym\\core.py:318: DeprecationWarning: \u001B[33mWARN: Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future.\u001B[0m\n", 46 | " \"Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future.\"\n", 47 | "D:\\anocoodaa\\envs\\zyhrl\\lib\\site-packages\\gym\\wrappers\\step_api_compatibility.py:40: DeprecationWarning: \u001B[33mWARN: Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future.\u001B[0m\n", 48 | " \"Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. 
This will be the default behaviour in future.\"\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "# 设置超参数\n", 54 | "actor_lr = 3e-4\n", 55 | "critic_lr = 3e-3\n", 56 | "num_episodes = 200\n", 57 | "hidden_dim = 64\n", 58 | "gamma = 0.98\n", 59 | "tau = 0.005 # 软更新参数\n", 60 | "buffer_size = 10000\n", 61 | "minimal_size = 1000\n", 62 | "batch_size = 64\n", 63 | "sigma = 0.01 # 高斯噪声标准差\n", 64 | "device = torch.device(\"cpu\")\n", 65 | "\n", 66 | "env=make()\n", 67 | "replay_buffer = ReplayBuffer(buffer_size)\n", 68 | "state_dim = env.observation_space.shape[0]\n", 69 | "action_dim = env.action_space.shape[0]\n", 70 | "action_bound = env.action_space.high[0] # 动作最大值\n", 71 | "\n", 72 | "# 建立模型\n", 73 | "agent = DDPG(state_dim, action_dim, state_dim+action_dim,hidden_dim,\n", 74 | " False,action_bound, sigma, actor_lr, critic_lr, tau, gamma, device)" 75 | ], 76 | "metadata": { 77 | "collapsed": false 78 | } 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "source": [ 83 | "# 第三步,训练模型" 84 | ], 85 | "metadata": { 86 | "collapsed": false 87 | } 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 4, 92 | "outputs": [ 93 | { 94 | "name": "stderr", 95 | "output_type": "stream", 96 | "text": [ 97 | "Iteration 0: 0%| | 0/20 [00:00", 135 | "image/png": "\n" 136 | }, 137 | "metadata": {}, 138 | "output_type": "display_data" 139 | } 140 | ], 141 | "source": [ 142 | "episodes_list = list(range(len(result)))\n", 143 | "plt.figure(figsize=(8,6))\n", 144 | "plt.plot(episodes_list, result)\n", 145 | "plt.xlabel(\"Episodes\")\n", 146 | "plt.ylabel(\"Returns\")\n", 147 | "plt.title(\"DDPG on {}\".format(\"Pendulum v1\"))\n", 148 | "plt.show()" 149 | ], 150 | "metadata": { 151 | "collapsed": false 152 | } 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "outputs": [], 158 | "source": [], 159 | "metadata": { 160 | "collapsed": false 161 | } 162 | } 163 | ], 164 | "metadata": { 165 | "kernelspec": { 166 | "display_name": "Python 3", 167 | 
"language": "python", 168 | "name": "python3" 169 | }, 170 | "language_info": { 171 | "codemirror_mode": { 172 | "name": "ipython", 173 | "version": 2 174 | }, 175 | "file_extension": ".py", 176 | "mimetype": "text/x-python", 177 | "name": "python", 178 | "nbconvert_exporter": "python", 179 | "pygments_lexer": "ipython2", 180 | "version": "2.7.6" 181 | } 182 | }, 183 | "nbformat": 4, 184 | "nbformat_minor": 0 185 | } 186 | -------------------------------------------------------------------------------- /life/base/__pycache__/q_learning.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HanggeAi/Life/10c4d37fb1112ac017ca1239f85e7874cb51aa32/life/base/__pycache__/q_learning.cpython-37.pyc -------------------------------------------------------------------------------- /life/base/__pycache__/sarsa.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HanggeAi/Life/10c4d37fb1112ac017ca1239f85e7874cb51aa32/life/base/__pycache__/sarsa.cpython-37.pyc -------------------------------------------------------------------------------- /life/base/__pycache__/trainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HanggeAi/Life/10c4d37fb1112ac017ca1239f85e7874cb51aa32/life/base/__pycache__/trainer.cpython-37.pyc -------------------------------------------------------------------------------- /life/base/q_learning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | class QLearning: 6 | """Q-Learning算法""" 7 | 8 | def __init__(self, n_state, epsilon, alpha, gamma, n_action=4): 9 | self.Q_table = np.zeros((n_state, n_action)) 10 | self.n_action = n_action 11 | self.epsilon = epsilon 12 | self.alpha = alpha 13 | self.gamma = gamma 14 | 15 | def take_action(self, 
state): 16 | """根据策略Q选取在state下的最有动作action""" 17 | if np.random.rand() < self.epsilon: 18 | action = np.random.randint(self.n_action) 19 | else: 20 | action = np.argmax(self.Q_table[state]) 21 | return action 22 | 23 | def best_action(self, state): 24 | """训练完成后选择最优动作""" 25 | Q_max = np.max(self.Q_table[state]) 26 | a = [0 for _ in range(self.n_action)] 27 | for i in range(self.n_action): 28 | if self.Q_table[state, i] == Q_max: 29 | a[i] = 1 30 | return a 31 | 32 | def update(self, s0, a0, r, s1): 33 | """更新Q表格""" 34 | td_error = r + self.gamma * self.Q_table[s1].max() - self.Q_table[s0, a0] 35 | self.Q_table[s0, a0] += self.alpha * td_error 36 | 37 | 38 | class DynaQ: 39 | def __init__(self, n_state, epsilon, alpha, gamma, n_planning, n_action=4): 40 | self.Q_table = np.zeros((n_state, n_action)) 41 | self.n_action = n_action 42 | self.alpha = alpha 43 | self.gamma = gamma 44 | self.epsilon = epsilon 45 | self.n_planning = n_planning # 每执行一次Q-learning,执行n_planning次Q-planning 46 | self.model = dict() # 每次在真实环境中收集到新数据,就加入到字典中(如果之前不存在的话) 47 | 48 | def take_action(self, state): 49 | """根据状态选取下一步的动作""" 50 | if np.random.rand() < self.epsilon: 51 | action = np.random.randint(self.n_action) 52 | else: 53 | action = np.argmax(self.Q_table[state]) 54 | return action 55 | 56 | def q_learning(self, s0, a0, r, s1): 57 | """使用Q-learning的方法更新Q表格""" 58 | td_error = r + self.gamma * self.Q_table[s1].max() - self.Q_table[s0, a0] 59 | self.Q_table[s0, a0] += self.alpha * td_error 60 | 61 | def update(self, s0, a0, r, s1): 62 | """Dyna-Q算法的主要部分,更新Q表格 63 | 使用Q-learning更新一次,在使用Q-planning从历史数据中更新n_planning次""" 64 | self.q_learning(s0, a0, r, s1) 65 | self.model[(s0, a0)] = r, s1 # 将新数据加入到model中 66 | for _ in range(self.n_planning): # Q-planning循环 67 | (s, a), (r, s_) = random.choice(list(self.model.items())) # 随机选择之前的数据 68 | self.q_learning(s, a, r, s_) 69 | -------------------------------------------------------------------------------- /life/base/sarsa.py: 
# /life/base/sarsa.py
import numpy as np


class Sarsa:
    """Tabular one-step Sarsa agent with an epsilon-greedy policy.

    Args:
        n_state: number of discrete states (rows of the Q-table).
        epsilon: probability of taking a uniformly random action.
        alpha: learning rate.
        gamma: discount factor.
        n_action: number of discrete actions (default 4).
    """

    def __init__(self, n_state, epsilon, alpha, gamma, n_action=4):
        self.Q_table = np.zeros((n_state, n_action))
        self.n_action = n_action
        self.alpha = alpha
        self.epsilon = epsilon
        self.gamma = gamma

    def take_action(self, state):
        """Return an epsilon-greedy action index for ``state``."""
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.n_action)
        return np.argmax(self.Q_table[state])

    def best_action(self, state):
        """Return a 0/1 list flagging every action whose Q-value is maximal."""
        row = self.Q_table[state]
        return (row == row.max()).astype(int).tolist()

    def update(self, s0, a0, r, s1, a1):
        """Apply one Sarsa backup for the transition (s0, a0, r, s1, a1)."""
        td_error = r + self.gamma * self.Q_table[s1, a1] - self.Q_table[s0, a0]
        self.Q_table[s0, a0] += self.alpha * td_error


class MultiSarsa:
    """Tabular n-step Sarsa agent.

    Args:
        n: number of steps accumulated before a state-action pair is updated.
        n_state: number of discrete states.
        epsilon: probability of taking a uniformly random action.
        alpha: learning rate.
        gamma: discount factor.
        n_action: number of discrete actions (default 4).
    """

    def __init__(self, n, n_state, epsilon, alpha, gamma, n_action=4):
        self.Q_table = np.zeros((n_state, n_action))
        self.n_action = n_action
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.n = n
        # Sliding window over the most recent (at most n) transitions.
        self.state_list = []
        self.action_list = []
        self.reward_list = []

    def take_action(self, state):
        """Return an epsilon-greedy action index for ``state``."""
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.n_action)
        return np.argmax(self.Q_table[state])

    def best_action(self, state):
        """Return a 0/1 list flagging every action whose Q-value is maximal."""
        row = self.Q_table[state]
        return (row == row.max()).astype(int).tolist()

    def update(self, s0, a0, r, s1, a1, done):
        """Record one transition and apply the n-step Sarsa backups it enables.

        Once ``n`` transitions are buffered the oldest pair is updated towards
        its n-step return, bootstrapped from ``Q[s1, a1]``. When ``done`` is
        true every buffered pair is updated with its truncated return and the
        buffers are cleared. This also happens when the episode ended before
        ``n`` transitions were collected; previously such short episodes were
        cleared without any update.
        """
        self.state_list.append(s0)
        self.action_list.append(a0)
        self.reward_list.append(r)

        buffered = len(self.state_list)
        if buffered == self.n or done:
            G = self.Q_table[s1, a1]  # bootstrap value Q(s_{t+n}, a_{t+n})
            for i in reversed(range(buffered)):
                G = self.gamma * G + self.reward_list[i]
                # At episode end the later pairs never reach a full n-step
                # window, so update them now with their truncated returns.
                if done and i > 0:
                    s = self.state_list[i]
                    a = self.action_list[i]
                    self.Q_table[s, a] += self.alpha * (G - self.Q_table[s, a])
            s = self.state_list.pop(0)
            a = self.action_list.pop(0)
            self.reward_list.pop(0)
            # Main n-step update for the oldest buffered pair.
            self.Q_table[s, a] += self.alpha * (G - self.Q_table[s, a])
        if done:
            # The next call starts a new episode.
            self.state_list.clear()
            self.action_list.clear()
            self.reward_list.clear()


# /life/base/trainer.py
try:
    from tqdm import tqdm
except ImportError:  # progress bars are cosmetic; training works without them
    tqdm = None

_NUM_PROGRESS_BARS = 10


class _SilentBar:
    """No-op stand-in for a tqdm progress bar when tqdm is not installed."""

    def __init__(self, *args, **kwargs):
        pass

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        return False

    def set_postfix(self, *args, **kwargs):
        pass

    def update(self, n=1):
        pass


def _on_policy_episode(env, agent, pass_done):
    """Run one Sarsa-style episode and return its undiscounted return."""
    episode_return = 0
    state = env.reset()
    action = agent.take_action(state)
    done = False
    while not done:
        next_state, reward, done = env.step(action)
        next_action = agent.take_action(next_state)
        episode_return += reward
        if pass_done:
            agent.update(state, action, reward, next_state, next_action, done)
        else:
            agent.update(state, action, reward, next_state, next_action)
        state, action = next_state, next_action
    return episode_return


def _sarsa_episode(env, agent):
    """Run one episode for an agent whose ``update`` takes no ``done`` flag."""
    return _on_policy_episode(env, agent, pass_done=False)


def _multi_sarsa_episode(env, agent):
    """Run one episode for an agent whose ``update`` also takes ``done``."""
    return _on_policy_episode(env, agent, pass_done=True)


def _qlearning_episode(env, agent):
    """Run one Q-learning style episode and return its undiscounted return."""
    episode_return = 0
    state = env.reset()
    done = False
    while not done:
        action = agent.take_action(state)
        next_state, reward, done = env.step(action)
        episode_return += reward
        agent.update(state, action, reward, next_state)
        state = next_state
    return episode_return


def _run_training(env, agent, num_episodes, return_agent, run_episode):
    """Run ``num_episodes`` episodes spread over ten progress bars.

    The remainder of ``num_episodes / 10`` is distributed over the first bars,
    so exactly ``num_episodes`` episodes are run even when the count is not a
    multiple of ten.
    """
    bar_cls = tqdm if tqdm is not None else _SilentBar
    per_bar, extra = divmod(int(num_episodes), _NUM_PROGRESS_BARS)
    return_list = []
    for i in range(_NUM_PROGRESS_BARS):
        n_in_bar = per_bar + (1 if i < extra else 0)
        with bar_cls(total=n_in_bar, desc='Iteration %d' % i) as pbar:
            for i_episode in range(n_in_bar):
                return_list.append(run_episode(env, agent))
                if (i_episode + 1) % 10 == 0:
                    # Report the mean return of the last ten episodes.
                    pbar.set_postfix({
                        'episode': '%d' % len(return_list),
                        'return': '%.3f' % np.mean(return_list[-10:])
                    })
                pbar.update(1)
    if return_agent:
        return return_list, agent
    return return_list


def train_sarsa(env, agent, num_episodes=500, return_agent=False):
    """Train a one-step Sarsa agent.

    :param env: environment whose ``reset()`` returns a state and whose
        ``step(action)`` returns ``(next_state, reward, done)``
    :param agent: agent with ``take_action(state)`` and
        ``update(s0, a0, r, s1, a1)``
    :param num_episodes: number of episodes to run
    :param return_agent: when true, return ``(return_list, agent)``
    :return: list of undiscounted episode returns, optionally with the agent
    """
    return _run_training(env, agent, num_episodes, return_agent, _sarsa_episode)


def train_multi_sarsa(env, agent, num_episodes=500, return_agent=False):
    """Train an n-step Sarsa agent.

    Same contract as :func:`train_sarsa`, except that the agent's ``update``
    also receives the ``done`` flag as its last argument.
    """
    return _run_training(env, agent, num_episodes, return_agent,
                         _multi_sarsa_episode)


def train_qlearning(env, agent, num_episodes=500, return_agent=False):
    """Train a Q-learning style agent.

    :param env: environment whose ``reset()`` returns a state and whose
        ``step(action)`` returns ``(next_state, reward, done)``
    :param agent: agent with ``take_action(state)`` and
        ``update(s0, a0, r, s1)``
    :param num_episodes: number of episodes to run
    :param return_agent: when true, return ``(return_list, agent)``
    :return: list of undiscounted episode returns, optionally with the agent
    """
    return _run_training(env, agent, num_episodes, return_agent,
                         _qlearning_episode)
# /life/dqn/dqn.py and the DoubleDQN part of /life/dqn/dqn_improved.py
import numpy as np
import torch
import torch.nn.functional as F


class Qnet(torch.nn.Module):
    """Q-network with a single ReLU hidden layer."""

    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        return self.fc2(F.relu(self.fc1(x)))


class DQN:
    """DQN agent with an epsilon-greedy policy and a periodically synced target net.

    :param state_dim: first argument passed to the ``q_net`` constructor
    :param hidden_dim: second argument passed to the ``q_net`` constructor
    :param action_dim: number of discrete actions; third ``q_net`` argument
    :param learning_rate: Adam learning rate
    :param gamma: discount factor
    :param epsilon: probability of taking a uniformly random action
    :param target_update: the target network is synced every this many updates
    :param device: torch device holding the networks and batches
    :param q_net: network class, instantiated as
        ``q_net(state_dim, hidden_dim, action_dim)``; defaults to :class:`Qnet`
    """

    def __init__(self, state_dim, hidden_dim, action_dim, learning_rate, gamma,
                 epsilon, target_update, device, q_net=Qnet):
        self.action_dim = action_dim
        self.q_net = q_net(state_dim, hidden_dim, self.action_dim).to(device)
        self.target_q_net = q_net(state_dim, hidden_dim,
                                  self.action_dim).to(device)
        self.optimizer = torch.optim.Adam(self.q_net.parameters(),
                                          lr=learning_rate)
        self.gamma = gamma
        self.epsilon = epsilon
        self.target_update = target_update
        self.count = 0  # number of completed update() calls
        self.device = device

    def _state_tensor(self, state):
        """Convert one state into a float tensor with a leading batch axis."""
        # Going through numpy avoids the slow list-of-ndarrays path of
        # torch.tensor([state]).
        return torch.as_tensor(np.asarray(state), dtype=torch.float,
                               device=self.device).unsqueeze(0)

    def _column(self, values, dtype):
        """Convert a batch field into an (N, 1) tensor on the agent's device."""
        return torch.as_tensor(np.asarray(values), dtype=dtype,
                               device=self.device).view(-1, 1)

    def take_action(self, state):
        """Return an epsilon-greedy action index for ``state``."""
        if np.random.random() < self.epsilon:
            return np.random.randint(self.action_dim)
        with torch.no_grad():  # inference only
            return self.q_net(self._state_tensor(state)).argmax().item()

    def max_q_value(self, state):
        """Return the largest Q-value the online network predicts for ``state``."""
        with torch.no_grad():
            return self.q_net(self._state_tensor(state)).max().item()

    def _next_q_values(self, next_states):
        """Return the target network's maximal Q-value per next state, (N, 1)."""
        return self.target_q_net(next_states).max(1)[0].view(-1, 1)

    def update(self, transition_dict):
        """Run one gradient step on a batch of transitions.

        ``transition_dict`` must provide the keys ``'states'``, ``'actions'``,
        ``'rewards'``, ``'next_states'`` and ``'dones'``.
        """
        states = torch.as_tensor(np.asarray(transition_dict['states']),
                                 dtype=torch.float, device=self.device)
        next_states = torch.as_tensor(
            np.asarray(transition_dict['next_states']),
            dtype=torch.float, device=self.device)
        # gather() needs int64 indices, whatever integer type the buffer stored.
        actions = self._column(transition_dict['actions'], torch.int64)
        rewards = self._column(transition_dict['rewards'], torch.float)
        dones = self._column(transition_dict['dones'], torch.float)

        q_values = self.q_net(states).gather(1, actions)
        # Targets are constants for the optimiser: building them without
        # autograd keeps gradients from piling up on the target network.
        with torch.no_grad():
            q_targets = rewards + self.gamma * self._next_q_values(
                next_states) * (1 - dones)
        loss = F.mse_loss(q_values, q_targets)  # already a mean over the batch
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.count % self.target_update == 0:
            self.target_q_net.load_state_dict(self.q_net.state_dict())
        self.count += 1


class DoubleDQN(DQN):
    """Double DQN: the online net picks the next action, the target net scores it.

    Constructor arguments and public methods are those of :class:`DQN`.
    """

    def _next_q_values(self, next_states):
        """Return target-net Q-values of the online net's greedy actions, (N, 1)."""
        best_actions = self.q_net(next_states).max(1)[1].view(-1, 1)
        return self.target_q_net(next_states).gather(1, best_actions)
class VAnet(torch.nn.Module):
    """Dueling Q-network with one shared hidden layer feeding an advantage
    head (A) and a state-value head (V)."""

    def __init__(self, state_dim, hidden_dim, action_dim):
        super(VAnet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)  # shared trunk
        self.fc_A = torch.nn.Linear(hidden_dim, action_dim)
        self.fc_V = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return Q = V + A - mean(A) over the last (action) dimension.

        :param x: state tensor whose last dimension is ``state_dim``; any
            number of leading batch dimensions (including none) is accepted.
        :return: tensor of Q-values whose last dimension is ``action_dim``.
        """
        # Evaluate the shared layer once and reuse it for both heads.
        hidden = F.relu(self.fc1(x))
        A = self.fc_A(hidden)
        V = self.fc_V(hidden)
        # keepdim=True keeps the mean broadcastable against A for batched
        # and unbatched inputs alike.
        Q = V + A - A.mean(dim=-1, keepdim=True)
        return Q
def train_dqn(agent, env, replay_buffer, minimal_size, batch_size, num_episodes=500,
              conti_action=False, return_agent=False):
    """
    Train a DQN-style agent.

    Training runs as 10 progress-bar iterations of ``int(num_episodes / 10)``
    episodes each.

    :param agent: object exposing ``take_action(state)`` and
        ``update(transition_dict)``; ``action_dim`` is also read when
        ``conti_action`` is True
    :param env: environment with ``reset()`` and a 4-tuple ``step(action)``
    :param replay_buffer: buffer exposing ``add``, ``size`` and ``sample``
    :param minimal_size: the agent is only updated once the buffer holds more
        than this many transitions
    :param batch_size: number of transitions sampled per update
    :param num_episodes: total number of episodes requested
    :param conti_action: if True, the discrete action index is mapped to a
        continuous action with ``dis2con`` before stepping the environment
    :param return_agent: if True, also return the trained agent
    :return: list of episode returns, or ``(returns, agent)`` when
        ``return_agent`` is True
    """
    return_list = []
    episodes_per_iter = int(num_episodes / 10)
    for i in range(10):
        with tqdm(total=episodes_per_iter, desc='Iteration %d' % i) as pbar:
            for i_episode in range(episodes_per_iter):
                episode_return = 0
                state = env.reset()
                done = False
                while not done:
                    action = agent.take_action(state)
                    if conti_action:
                        # Same conversion as the sibling ``train`` function:
                        # the environment receives the continuous value, the
                        # buffer keeps the discrete index the agent chose.
                        action_continuous = dis2con(action, env,
                                                    agent.action_dim)
                        next_state, reward, done, _ = env.step([action_continuous])
                    else:
                        next_state, reward, done, _ = env.step(action)
                    replay_buffer.add(state, action, reward, next_state, done)
                    state = next_state
                    episode_return += reward
                    # Only train the Q-network once the buffer is large enough.
                    if replay_buffer.size() > minimal_size:
                        b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size)
                        transition_dict = {
                            'states': b_s,
                            'actions': b_a,
                            'next_states': b_ns,
                            'rewards': b_r,
                            'dones': b_d
                        }
                        agent.update(transition_dict)
                return_list.append(episode_return)
                if (i_episode + 1) % 10 == 0:
                    pbar.set_postfix({
                        'episode':
                            '%d' % (num_episodes / 10 * i + i_episode + 1),
                        'return':
                            '%.3f' % np.mean(return_list[-10:])
                    })
                pbar.update(1)
    if return_agent:
        return return_list, agent
    return return_list
class CliffWalkingEnv:
    """Cliff-walking grid world with ``ncol`` columns and ``nrow`` rows.

    The origin (0, 0) is the top-left cell and the agent starts in the
    bottom-left cell. States are encoded as ``y * ncol + x``.
    """

    def __init__(self, ncol, nrow):
        self.nrow = nrow
        self.ncol = ncol
        self.x = 0  # current column of the agent
        self.y = self.nrow - 1  # current row of the agent

    def step(self, action):
        """Move the agent and return ``(next_state, reward, done)``.

        Actions: 0 = up, 1 = down, 2 = left, 3 = right. Moves are clipped at
        the grid border. Every step costs -1. Reaching any bottom-row cell
        with ``x > 0`` ends the episode, with reward -100 unless that cell is
        the right-most one.
        """
        moves = ((0, -1), (0, 1), (-1, 0), (1, 0))
        dx, dy = moves[action][0], moves[action][1]
        last_col = self.ncol - 1
        last_row = self.nrow - 1
        self.x = min(last_col, max(0, self.x + dx))
        self.y = min(last_row, max(0, self.y + dy))
        state_id = self.y * self.ncol + self.x

        terminal = self.y == last_row and self.x > 0
        if not terminal:
            return state_id, -1, False
        # Bottom row beyond the start: the last column is the goal, the rest
        # is the cliff.
        penalty = -1 if self.x == last_col else -100
        return state_id, penalty, True

    def reset(self):
        """Put the agent back on the bottom-left cell and return its state id."""
        self.x, self.y = 0, self.nrow - 1
        return self.y * self.ncol + self.x
class GAIL:
    """Generative adversarial imitation learning wrapper around an RL agent.

    A discriminator is trained to tell agent (state, action) pairs from
    expert pairs; its output on the agent's pairs is turned into the reward
    signal passed to ``agent.update``.
    """

    def __init__(self, agent, state_dim, action_dim, hidden_dim, lr_d, device, discriminator=Discriminator):
        """
        :param agent: policy learner exposing ``update(transition_dict)``
        :param state_dim: dimension of the state vector
        :param action_dim: number of discrete actions (width of the one-hot
            action encoding fed to the discriminator)
        :param hidden_dim: hidden width of the discriminator
        :param lr_d: discriminator learning rate
        :param device: torch device
        :param discriminator: discriminator class, called as
            ``discriminator(state_dim, hidden_dim, action_dim)``
        """
        # NOTE: the attribute names keep their original spelling
        # ("dicriminator") because external code may access them.
        self.dicriminator = discriminator(state_dim, hidden_dim, action_dim).to(device)
        self.dicriminator_optimizer = torch.optim.Adam(self.dicriminator.parameters(), lr=lr_d)
        self.agent = agent
        self.device = device
        # Remembered so the one-hot width always matches the discriminator
        # input size (previously hard-coded to 2).
        self.action_dim = action_dim

    def learn(self, expert_s, expert_a, agent_s, agent_a, next_s, dones):
        """Update the discriminator once, then update the agent with
        discriminator-derived rewards.

        :param expert_s: expert states
        :param expert_a: expert discrete actions
        :param agent_s: states visited by the agent
        :param agent_a: discrete actions taken by the agent
        :param next_s: next states of the agent transitions
        :param dones: done flags of the agent transitions
        """
        expert_states = torch.tensor(expert_s, dtype=torch.float).to(self.device)
        expert_actions = torch.tensor(expert_a).to(self.device)
        agent_states = torch.tensor(agent_s, dtype=torch.float).to(self.device)
        agent_actions = torch.tensor(agent_a).to(self.device)

        # One-hot encode with as many classes as the discriminator expects.
        expert_actions = F.one_hot(expert_actions.to(torch.int64),
                                   num_classes=self.action_dim).float()
        agent_actions = F.one_hot(agent_actions.to(torch.int64),
                                  num_classes=self.action_dim).float()

        expert_prob = self.dicriminator(expert_states, expert_actions)
        agent_prob = self.dicriminator(agent_states, agent_actions)
        # Discriminator loss: agent samples are labelled 1, expert samples 0.
        bce = nn.BCELoss()
        discriminator_loss = bce(agent_prob, torch.ones_like(agent_prob)) + \
            bce(expert_prob, torch.zeros_like(expert_prob))
        self.dicriminator_optimizer.zero_grad()
        discriminator_loss.backward()
        self.dicriminator_optimizer.step()

        # Turn the discriminator output into the policy's reward signal:
        # the more a pair looks agent-generated, the lower the reward.
        rewards = -torch.log(agent_prob).detach().cpu().numpy()
        transition_dict = {
            'states': agent_s,
            'actions': agent_a,
            'rewards': rewards,  # only the rewards are replaced
            'next_states': next_s,
            'dones': dones
        }
        self.agent.update(transition_dict)
def train_gail(agent, gail, env, expert_s, expert_a, n_episode=500, return_agent=False):
    """
    Training loop for GAIL.

    Each episode is rolled out with ``agent``; the collected states, actions,
    next states and done flags are then handed to ``gail.learn`` together
    with the expert data.

    :param agent: agent that interacts with the environment (the same agent
        that was passed to the GAIL object)
    :param gail: GAIL algorithm object exposing ``learn`` and ``agent``
    :param env: environment with ``reset()`` and a 4-tuple ``step(action)``
    :param expert_s: states of the expert (s, a) data
    :param expert_a: actions of the expert (s, a) data
    :param n_episode: number of episodes to run
    :param return_agent: if True, also return ``gail.agent``
    :return: list of episode returns, or ``(returns, gail.agent)``
    """
    episode_returns = []

    with tqdm(total=n_episode, desc="进度条") as pbar:
        for episode_no in range(1, n_episode + 1):
            visited, taken, reached, finished = [], [], [], []
            total_reward = 0
            state = env.reset()
            done = False

            while not done:
                action = agent.take_action(state)  # gail.agent could be used instead
                next_state, reward, done, _ = env.step(action)
                visited.append(state)
                taken.append(action)
                reached.append(next_state)
                finished.append(done)
                total_reward += reward
                state = next_state
            episode_returns.append(total_reward)

            gail.learn(expert_s, expert_a,
                       visited, taken, reached, finished)

            if episode_no % 10 == 0:
                recent_mean = np.mean(episode_returns[-10:])
                pbar.set_postfix({'return': '%.3f' % recent_mean})
            pbar.update(1)

    if not return_agent:
        return episode_returns
    return episode_returns, gail.agent
https://raw.githubusercontent.com/HanggeAi/Life/10c4d37fb1112ac017ca1239f85e7874cb51aa32/life/policy/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /life/policy/__pycache__/ddpg.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HanggeAi/Life/10c4d37fb1112ac017ca1239f85e7874cb51aa32/life/policy/__pycache__/ddpg.cpython-37.pyc -------------------------------------------------------------------------------- /life/policy/__pycache__/ppo.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HanggeAi/Life/10c4d37fb1112ac017ca1239f85e7874cb51aa32/life/policy/__pycache__/ppo.cpython-37.pyc -------------------------------------------------------------------------------- /life/policy/__pycache__/reinforce.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HanggeAi/Life/10c4d37fb1112ac017ca1239f85e7874cb51aa32/life/policy/__pycache__/reinforce.cpython-37.pyc -------------------------------------------------------------------------------- /life/policy/__pycache__/sac.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HanggeAi/Life/10c4d37fb1112ac017ca1239f85e7874cb51aa32/life/policy/__pycache__/sac.cpython-37.pyc -------------------------------------------------------------------------------- /life/policy/__pycache__/trainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HanggeAi/Life/10c4d37fb1112ac017ca1239f85e7874cb51aa32/life/policy/__pycache__/trainer.cpython-37.pyc -------------------------------------------------------------------------------- /life/policy/ac/a3c.py: 
class ValueNet(nn.Module):
    """State-value network: one ReLU hidden layer followed by a scalar,
    unbounded (regression) output."""

    def __init__(self, state_dim, hidden_dim):
        super(ValueNet, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, state):
        """Return the value estimate for ``state`` (no output activation)."""
        hidden = torch.relu(self.fc1(state))
        value = self.fc2(hidden)
        return value
class DDPG:
    """Deep Deterministic Policy Gradient agent with soft-updated target
    networks and Gaussian exploration noise."""

    def __init__(self, num_in_actor, num_out_actor, num_in_critic, hidden_dim,
                 discrete, action_bound, sigma, actor_lr, critic_lr,
                 tau, gamma, device, common_net=TwoLayerFC):
        """
        The first row of parameters are network-structure hyper-parameters.

        discrete: whether the agent is used for discrete actions (the actor
            output is then left unsquashed)
        action_bound: scale applied to the tanh-squashed actor output
        sigma: standard deviation of the zero-mean Gaussian exploration noise
        tau: soft-update coefficient of the target networks
        gamma: discount factor
        """
        out_fn = (lambda x: x) if discrete else (
            lambda x: torch.tanh(x) * action_bound)
        self.actor = common_net(num_in_actor, num_out_actor, hidden_dim,
                                activation=F.relu, out_fn=out_fn).to(device)
        self.target_actor = common_net(num_in_actor, num_out_actor, hidden_dim,
                                       activation=F.relu, out_fn=out_fn).to(device)
        self.critic = common_net(num_in_critic, 1, hidden_dim).to(device)
        self.target_critic = common_net(
            num_in_critic, 1, hidden_dim).to(device)

        # Start both target networks from the weights of their online twins.
        self.target_critic.load_state_dict(self.critic.state_dict())
        self.target_actor.load_state_dict(self.actor.state_dict())

        self.actor_optimizer = torch.optim.Adam(
            self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = torch.optim.Adam(
            self.critic.parameters(), lr=critic_lr)
        self.gamma = gamma
        self.sigma = sigma  # std of the Gaussian noise; its mean is 0
        self.action_bound = action_bound
        self.tau = tau  # soft-update coefficient
        self.action_dim = num_out_actor
        self.device = device

    def take_action(self, state):
        """Return the actor's action for ``state`` plus exploration noise.

        The result is a NumPy array of length ``self.action_dim``.
        """
        state = torch.tensor([state], dtype=torch.float).to(self.device)
        action = self.actor(state).item()
        # Exploration noise is scaled by sigma (the noise std), not by the
        # discount factor gamma.
        action = action + self.sigma * np.random.randn(self.action_dim)
        return action

    def soft_update(self, net, target_net):
        """Blend ``net`` into ``target_net``:
        target <- (1 - tau) * target + tau * net."""
        for param_target, param in zip(target_net.parameters(), net.parameters()):
            param_target.data.copy_(
                param_target.data * (1 - self.tau) + param.data * self.tau)

    def update(self, transition_dict):
        """Run one critic step, one actor step and a soft target update.

        :param transition_dict: batch with keys 'states', 'actions',
            'rewards', 'next_states' and 'dones'.
        """
        states = torch.tensor(
            transition_dict['states'], dtype=torch.float).to(self.device)
        actions = torch.tensor(
            transition_dict['actions'], dtype=torch.float).view(-1, 1).to(self.device)
        rewards = torch.tensor(
            transition_dict['rewards'], dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(
            transition_dict['next_states'], dtype=torch.float).to(self.device)
        dones = torch.tensor(
            transition_dict['dones'], dtype=torch.float).view(-1, 1).to(self.device)

        # Critic loss: TD target built from the two target networks.
        next_q_values = self.target_critic(torch.cat([next_states,
                                                      self.target_actor(next_states)],
                                                     dim=1))
        q_targets = rewards + self.gamma * next_q_values * (1 - dones)
        critic_loss = torch.mean(F.mse_loss(
            self.critic(torch.cat([states, actions], dim=1)),
            q_targets
        ))
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor loss: maximise the critic's value of the actor's own actions.
        actor_loss = - \
            torch.mean(self.critic(
                torch.cat([states, self.actor(states)], dim=1)))
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft-update both target networks.
        self.soft_update(self.critic, self.target_critic)
        self.soft_update(self.actor, self.target_actor)
class PPO:
    """PPO algorithm using the clipped surrogate objective."""

    def __init__(self, state_dim, hidden_dim, action_dim, actor_lr, critic_lr,
                 lmbda, epochs, eps, gamma, device, policy_net=PolicyNet, value_net=ValueNet):
        """
        lmbda: lambda factor of generalized advantage estimation
        epochs: number of optimisation passes over one collected trajectory
        eps: clipping range of the probability ratio
        """
        self.device = device
        self.gamma = gamma
        self.lmbda = lmbda
        self.epochs = epochs
        self.eps = eps

        # Actor is built before the critic, as in the original.
        self.actor = policy_net(state_dim, hidden_dim, action_dim).to(device)
        self.critic = value_net(state_dim, hidden_dim).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=critic_lr)

    def take_action(self, state):
        """Sample a discrete action from the actor's distribution for ``state``."""
        obs = torch.tensor([state], dtype=torch.float).to(self.device)
        policy = torch.distributions.Categorical(self.actor(obs))
        return policy.sample().item()

    def update(self, transition_dict):
        """Optimise actor and critic for ``self.epochs`` passes on one batch.

        :param transition_dict: batch with keys 'states', 'actions',
            'rewards', 'next_states' and 'dones'.
        """
        dev = self.device

        def as_float(key):
            return torch.tensor(transition_dict[key], dtype=torch.float)

        states = as_float('states').to(dev)
        actions = torch.tensor(transition_dict['actions']).view(-1, 1).to(dev)
        rewards = as_float('rewards').view(-1, 1).to(dev)
        next_states = as_float('next_states').to(dev)
        dones = as_float('dones').view(-1, 1).to(dev)

        # TD target and TD error from the current critic.
        bootstrap = self.critic(next_states)
        td_target = rewards + self.gamma * bootstrap * (1 - dones)
        td_delta = td_target - self.critic(states)

        advantage = compute_advantage(self.gamma, self.lmbda, td_delta.cpu()).to(dev)

        # Log-probabilities under the data-collecting (old) policy, held fixed.
        old_log_probs = torch.log(self.actor(states).gather(1, actions)).detach()
        lower, upper = 1 - self.eps, 1 + self.eps
        fixed_target = td_target.detach()

        for _ in range(self.epochs):
            new_log_probs = torch.log(self.actor(states).gather(1, actions))
            ratio = torch.exp(new_log_probs - old_log_probs)

            unclipped = ratio * advantage
            clipped = torch.clamp(ratio, lower, upper) * advantage

            # Clipped surrogate objective for the actor, MSE for the critic.
            actor_loss = (-torch.min(unclipped, clipped)).mean()
            critic_loss = F.mse_loss(self.critic(states), fixed_target).mean()

            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            actor_loss.backward()
            critic_loss.backward()
            self.actor_optimizer.step()
            self.critic_optimizer.step()
class PolicyNetContinuous(nn.Module):
    """Tanh-squashed Gaussian policy for continuous actions."""

    def __init__(self, state_dim, hidden_dim, action_dim, action_bound):
        super(PolicyNetContinuous, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc_mu = nn.Linear(hidden_dim, action_dim)
        self.fc_std = nn.Linear(hidden_dim, action_dim)
        self.action_bound = action_bound

    def forward(self, x):
        """Sample an action and return ``(action, log_prob)``.

        :param x: batch of states
        :return: ``action`` is ``tanh(u) * action_bound`` for a
            reparameterised Gaussian sample ``u``; ``log_prob`` is the
            Gaussian log-density of ``u`` corrected for the tanh squashing.
        """
        x = F.relu(self.fc1(x))
        mu = self.fc_mu(x)
        std = F.softplus(self.fc_std(x))  # softplus keeps the std positive

        dist = Normal(mu, std)
        normal_sample = dist.rsample()  # reparameterised sample
        log_prob = dist.log_prob(normal_sample)

        action = torch.tanh(normal_sample)
        # Change of variables for a = tanh(u):
        # log pi(a) = log N(u) - log(1 - tanh(u)^2).
        # ``action`` already equals tanh(u), so tanh must not be applied a
        # second time here. The 1e-7 guards against log(0).
        log_prob = log_prob - torch.log(1 - action.pow(2) + 1e-7)
        action = action * self.action_bound

        return action, log_prob
class SACContinuous:
    """Soft Actor-Critic agent for continuous action spaces.

    Holds a stochastic actor, two Q critics with a target copy each, and a
    learnable entropy temperature stored as ``log_alpha``.

    Args:
        state_dim: Number of state features.
        hidden_dim: Hidden width forwarded to the networks.
        action_dim: Number of action components.
        action_bound: Scale forwarded to the actor network.
        actor_lr: Adam learning rate of the actor.
        critic_lr: Adam learning rate of both critics.
        alpha_lr: Adam learning rate of ``log_alpha``.
        target_entropy: Entropy level the temperature update steers towards.
        tau: Soft-update rate of the target critics.
        gamma: Discount factor.
        device: Torch device the networks and batches are moved to.
        actor_net: Class used to build the actor.
        critic_net: Class used to build the four critics.
        reward_fn: Optional callable applied to the ``(batch, 1)`` reward
            tensor inside ``update``. ``None`` keeps the historical
            Pendulum shaping ``(r + 8) / 8``; pass ``lambda r: r`` to train
            on raw rewards.
    """

    def __init__(self, state_dim, hidden_dim, action_dim, action_bound,
                 actor_lr, critic_lr, alpha_lr,
                 target_entropy, tau, gamma, device,
                 actor_net=PolicyNetContinuous, critic_net=QValueNetContinuous,
                 reward_fn=None):
        # Five networks: the policy, two Q networks and their two targets.
        self.actor = actor_net(state_dim, hidden_dim, action_dim,
                               action_bound).to(device)
        self.critic_1 = critic_net(state_dim, hidden_dim, action_dim).to(device)
        self.critic_2 = critic_net(state_dim, hidden_dim, action_dim).to(device)
        self.target_critic_1 = critic_net(state_dim, hidden_dim,
                                          action_dim).to(device)
        self.target_critic_2 = critic_net(state_dim, hidden_dim,
                                          action_dim).to(device)
        # Targets start as exact copies of their online critics.
        self.target_critic_1.load_state_dict(self.critic_1.state_dict())
        self.target_critic_2.load_state_dict(self.critic_2.state_dict())

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=actor_lr)
        self.critic_1_optimizer = torch.optim.Adam(self.critic_1.parameters(),
                                                   lr=critic_lr)
        self.critic_2_optimizer = torch.optim.Adam(self.critic_2.parameters(),
                                                   lr=critic_lr)
        # Optimise log(alpha) rather than alpha itself for stabler training.
        self.log_alpha = torch.tensor(np.log(0.01), dtype=torch.float,
                                      device=device, requires_grad=True)
        self.log_alpha_optimizer = torch.optim.Adam(
            [self.log_alpha], lr=alpha_lr)

        self.target_entropy = target_entropy
        self.gamma = gamma
        self.tau = tau
        self.device = device
        self.reward_fn = (reward_fn if reward_fn is not None
                          else self._pendulum_reward_shaping)

    @staticmethod
    def _pendulum_reward_shaping(rewards):
        """Legacy default shaping that used to be hard-coded in ``update``."""
        return (rewards + 8.0) / 8.0

    def take_action(self, state):
        """Sample one action for a single state.

        Returns:
            A list of Python floats, one per action component (a one-element
            list for a 1-D action space, as before).
        """
        # np.array first: building a tensor from a list of ndarrays is slow.
        state = torch.tensor(np.array([state]),
                             dtype=torch.float).to(self.device)
        with torch.no_grad():
            action = self.actor(state)[0]
        return action.cpu().numpy().reshape(-1).tolist()

    def calc_target(self, rewards, next_states, dones):
        """Compute the entropy-regularised TD target for both critics."""
        next_actions, log_prob = self.actor(next_states)
        # Joint log-prob of the action vector = sum over its components
        # (a no-op when there is a single component).
        entropy = -log_prob.sum(dim=1, keepdim=True)
        q1_value = self.target_critic_1(next_states, next_actions)
        q2_value = self.target_critic_2(next_states, next_actions)
        next_value = (torch.min(q1_value, q2_value)
                      + self.log_alpha.exp() * entropy)
        td_target = rewards + self.gamma * next_value * (1 - dones)
        return td_target

    def soft_update(self, net, target_net):
        """Move ``target_net`` towards ``net`` by a factor ``tau`` (in place)."""
        for param_target, param in zip(target_net.parameters(),
                                       net.parameters()):
            param_target.data.copy_(
                param_target.data * (1 - self.tau) + param.data * self.tau)

    def update(self, transition_dict):
        """Run one SAC update on a sampled batch.

        Args:
            transition_dict: Mapping with the keys ``'states'``,
                ``'actions'``, ``'rewards'``, ``'next_states'`` and
                ``'dones'``, each holding one entry per transition.
        """
        rewards = torch.tensor(transition_dict['rewards'],
                               dtype=torch.float).view(-1, 1).to(self.device)
        batch_size = rewards.shape[0]
        states = torch.tensor(transition_dict['states'],
                              dtype=torch.float).to(self.device)
        # One row per transition, one column per action component.
        actions = torch.tensor(transition_dict['actions'],
                               dtype=torch.float).view(batch_size, -1).to(
            self.device)
        next_states = torch.tensor(transition_dict['next_states'],
                                   dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'],
                             dtype=torch.float).view(-1, 1).to(self.device)
        rewards = self.reward_fn(rewards)

        # --- critics ---
        td_target = self.calc_target(rewards, next_states, dones).detach()
        critic_1_loss = F.mse_loss(self.critic_1(states, actions), td_target)
        critic_2_loss = F.mse_loss(self.critic_2(states, actions), td_target)

        self.critic_1_optimizer.zero_grad()
        critic_1_loss.backward()
        self.critic_1_optimizer.step()

        self.critic_2_optimizer.zero_grad()
        critic_2_loss.backward()
        self.critic_2_optimizer.step()

        # --- actor ---
        new_actions, log_prob = self.actor(states)
        entropy = -log_prob.sum(dim=1, keepdim=True)
        q1_value = self.critic_1(states, new_actions)
        q2_value = self.critic_2(states, new_actions)
        actor_loss = torch.mean(-self.log_alpha.exp() * entropy
                                - torch.min(q1_value, q2_value))

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # --- temperature ---
        alpha_loss = torch.mean(
            (entropy - self.target_entropy).detach() * self.log_alpha.exp())
        self.log_alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.log_alpha_optimizer.step()

        self.soft_update(self.critic_1, self.target_critic_1)
        self.soft_update(self.critic_2, self.target_critic_2)
class PolicyNet(nn.Module):
    """Categorical policy: maps a state batch to action probabilities."""

    def __init__(self, state_dim, hidden_dim, action_dim) -> None:
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        """Return softmax probabilities over dim 1."""
        x = F.relu(self.fc1(x))
        return F.softmax(self.fc2(x), dim=1)


class QValueNet(nn.Module):
    """Q network that outputs one value per discrete action."""

    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x)


class SACDiscrete:
    """Soft Actor-Critic agent for discrete action spaces.

    Args:
        state_dim: Number of state features.
        hidden_dim: Hidden width forwarded to the networks.
        action_dim: Number of discrete actions.
        actor_lr: Adam learning rate of the actor.
        critic_lr: Adam learning rate of both critics.
        alpha_lr: Adam learning rate of ``log_alpha``.
        target_entropy: Entropy level the temperature update steers towards.
        tau: Soft-update rate of the target critics.
        gamma: Discount factor.
        device: Torch device the networks and batches are moved to.
        actor_net: Class used to build the actor.
        critic_net: Class used to build the four critics.
    """

    def __init__(self, state_dim, hidden_dim, action_dim,
                 actor_lr, critic_lr, alpha_lr,
                 target_entropy, tau, gamma, device,
                 actor_net=PolicyNet, critic_net=QValueNet):
        self.actor = actor_net(state_dim, hidden_dim, action_dim).to(device)
        self.critic_1 = critic_net(state_dim, hidden_dim, action_dim).to(device)
        self.critic_2 = critic_net(state_dim, hidden_dim, action_dim).to(device)
        self.target_critic_1 = critic_net(state_dim, hidden_dim,
                                          action_dim).to(device)
        self.target_critic_2 = critic_net(state_dim, hidden_dim,
                                          action_dim).to(device)
        # Targets start as exact copies of their online critics.
        self.target_critic_1.load_state_dict(self.critic_1.state_dict())
        self.target_critic_2.load_state_dict(self.critic_2.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=actor_lr)
        self.critic_1_optimizer = torch.optim.Adam(self.critic_1.parameters(),
                                                   lr=critic_lr)
        self.critic_2_optimizer = torch.optim.Adam(self.critic_2.parameters(),
                                                   lr=critic_lr)
        # Optimise log(alpha) rather than alpha itself for stabler training.
        self.log_alpha = torch.tensor(np.log(0.01), dtype=torch.float,
                                      device=device, requires_grad=True)
        self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha],
                                                    lr=alpha_lr)
        self.target_entropy = target_entropy
        self.gamma = gamma
        self.tau = tau
        self.device = device

    def take_action(self, state):
        """Sample one action index (Python int) for a single state."""
        state = torch.tensor(np.array([state]),
                             dtype=torch.float).to(self.device)
        probs = self.actor(state)
        action_dist = torch.distributions.Categorical(probs)
        return action_dist.sample().item()

    def calc_target(self, rewards, next_states, dones):
        """Compute the TD target from the policy's action probabilities.

        The expectation over next actions is taken in closed form
        (probability-weighted sum) instead of by sampling.
        """
        next_probs = self.actor(next_states)
        next_log_probs = torch.log(next_probs + 1e-8)
        entropy = -torch.sum(next_probs * next_log_probs, dim=1, keepdim=True)

        q1_value = self.target_critic_1(next_states)
        q2_value = self.target_critic_2(next_states)
        q_value = torch.min(q1_value, q2_value)
        min_value = torch.sum(next_probs * q_value, dim=1, keepdim=True)

        next_value = min_value + self.log_alpha.exp() * entropy
        td_target = rewards + self.gamma * next_value * (1 - dones)
        return td_target

    def soft_update(self, net, target_net):
        """Move ``target_net`` towards ``net`` by a factor ``tau`` (in place)."""
        for param_target, param in zip(target_net.parameters(),
                                       net.parameters()):
            param_target.data.copy_(
                param_target.data * (1 - self.tau) + param.data * self.tau)

    def update(self, transition_dict):
        """Run one SAC update on a sampled batch.

        Args:
            transition_dict: Mapping with the keys ``'states'``,
                ``'actions'``, ``'rewards'``, ``'next_states'`` and
                ``'dones'``, each holding one entry per transition.
        """
        states = torch.tensor(transition_dict['states'],
                              dtype=torch.float).to(self.device)
        # Action indices feed `gather`, which needs int64.
        actions = torch.tensor(transition_dict['actions'],
                               dtype=torch.long).view(-1, 1).to(self.device)
        rewards = torch.tensor(transition_dict['rewards'],
                               dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'],
                                   dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'],
                             dtype=torch.float).view(-1, 1).to(self.device)

        # --- critics ---
        td_target = self.calc_target(rewards, next_states, dones).detach()
        critic1_q_values = self.critic_1(states).gather(1, actions)
        critic1_loss = F.mse_loss(critic1_q_values, td_target)
        critic2_q_values = self.critic_2(states).gather(1, actions)
        critic2_loss = F.mse_loss(critic2_q_values, td_target)

        self.critic_1_optimizer.zero_grad()
        critic1_loss.backward()
        self.critic_1_optimizer.step()

        self.critic_2_optimizer.zero_grad()
        critic2_loss.backward()
        self.critic_2_optimizer.step()

        # --- actor ---
        probs = self.actor(states)
        log_probs = torch.log(probs + 1e-8)
        entropy = -torch.sum(probs * log_probs, dim=1, keepdim=True)
        q1_value = self.critic_1(states)
        q2_value = self.critic_2(states)
        q_value = torch.min(q1_value, q2_value)
        min_qvalue = torch.sum(probs * q_value, dim=1, keepdim=True)
        actor_loss = torch.mean(-self.log_alpha.exp() * entropy - min_qvalue)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # --- temperature ---
        # Must read the instance attribute: the bare name `target_entropy`
        # is not defined in this scope and raised NameError.
        alpha_loss = torch.mean(
            (entropy - self.target_entropy).detach() * self.log_alpha.exp())
        self.log_alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.log_alpha_optimizer.step()

        self.soft_update(self.critic_1, self.target_critic_1)
        self.soft_update(self.critic_2, self.target_critic_2)
def train_reinforce(agent, env, num_episodes, return_agent=False):
    """Train an on-policy agent one full episode at a time.

    Each episode is rolled out to termination, collected into a transition
    dict, and handed to ``agent.update`` once.

    Args:
        agent: Object providing ``take_action(state)`` and
            ``update(transition_dict)``.
        env: Environment whose ``reset()`` returns a state and whose
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        num_episodes: Total number of episodes to run.
        return_agent: When true, also return the trained agent.

    Returns:
        The list of episode returns, or ``(returns, agent)`` when
        ``return_agent`` is true.
    """
    return_list = []
    # Split the run over 10 progress bars. The previous
    # `int(num_episodes / 10)` per bar dropped the remainder (and ran nothing
    # at all for fewer than 10 episodes); hand the remainder out to the first
    # bars instead so exactly `num_episodes` episodes are executed.
    base, extra = divmod(max(int(num_episodes), 0), 10)
    for i in range(10):
        episodes_this_bar = base + (1 if i < extra else 0)
        with tqdm(total=episodes_this_bar, desc='Iteration %d' % i) as pbar:
            for i_episode in range(episodes_this_bar):
                episode_return = 0
                transition_dict = {
                    'states': [],
                    'actions': [],
                    'next_states': [],
                    'rewards': [],
                    'dones': []
                }
                state = env.reset()
                done = False
                while not done:
                    action = agent.take_action(state)
                    next_state, reward, done, _ = env.step(action)
                    transition_dict['states'].append(state)
                    transition_dict['actions'].append(action)
                    transition_dict['next_states'].append(next_state)
                    transition_dict['rewards'].append(reward)
                    transition_dict['dones'].append(done)
                    state = next_state
                    episode_return += reward
                return_list.append(episode_return)
                agent.update(transition_dict)
                if (i_episode + 1) % 10 == 0:
                    pbar.set_postfix({
                        # Episodes finished so far across all bars.
                        'episode': '%d' % len(return_list),
                        'return': '%.3f' % np.mean(return_list[-10:])
                    })
                pbar.update(1)
    if return_agent:
        return return_list, agent
    return return_list


def train_ac(agent, env, num_episodes, return_agent=False):
    """Train an actor-critic agent; same episode loop as ``train_reinforce``."""
    return train_reinforce(agent, env, num_episodes, return_agent=return_agent)
def train_ppo(agent, env, num_episodes, return_agent=False):
    """Train a PPO agent; same episode loop as ``train_reinforce``."""
    return train_reinforce(agent, env, num_episodes, return_agent=return_agent)


# #################################################
# Deep deterministic policy gradient is off-policy.
def train_ddpg(env, agent, num_episodes, replay_buffer, minimal_size, batch_size, return_agent=False):
    """Train an off-policy agent from a replay buffer.

    Every environment step is stored in ``replay_buffer``; once the buffer
    holds more than ``minimal_size`` transitions, each step is followed by one
    ``agent.update`` on a sampled batch.

    Args:
        env: Environment whose ``reset()`` returns a state and whose
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        agent: Object providing ``take_action(state)`` and
            ``update(transition_dict)``.
        num_episodes: Total number of episodes to run.
        replay_buffer: Object providing ``add``, ``size`` and
            ``sample(batch_size)`` returning
            ``(states, actions, rewards, next_states, dones)``.
        minimal_size: Buffer size that must be exceeded before updates start.
        batch_size: Number of transitions sampled per update.
        return_agent: When true, also return the trained agent.

    Returns:
        The list of episode returns, or ``(returns, agent)`` when
        ``return_agent`` is true.
    """
    return_list = []
    # Split the run over 10 progress bars. The previous
    # `int(num_episodes / 10)` per bar dropped the remainder (and ran nothing
    # at all for fewer than 10 episodes); hand the remainder out to the first
    # bars instead so exactly `num_episodes` episodes are executed.
    base, extra = divmod(max(int(num_episodes), 0), 10)
    for i in range(10):
        episodes_this_bar = base + (1 if i < extra else 0)
        with tqdm(total=episodes_this_bar, desc='Iteration %d' % i) as pbar:
            for i_episode in range(episodes_this_bar):
                episode_return = 0
                state = env.reset()
                done = False
                while not done:
                    action = agent.take_action(state)
                    next_state, reward, done, _ = env.step(action)
                    replay_buffer.add(state, action, reward, next_state, done)
                    state = next_state
                    episode_return += reward
                    if replay_buffer.size() > minimal_size:
                        b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size)
                        transition_dict = {'states': b_s, 'actions': b_a, 'next_states': b_ns, 'rewards': b_r,
                                           'dones': b_d}
                        agent.update(transition_dict)
                return_list.append(episode_return)
                if (i_episode + 1) % 10 == 0:
                    pbar.set_postfix({'episode': '%d' % len(return_list),
                                      'return': '%.3f' % np.mean(return_list[-10:])})
                pbar.update(1)
    if return_agent:
        return return_list, agent
    return return_list


def train_sac(env, agent, num_episodes, replay_buffer, minimal_size, batch_size, return_agent=False):
    """Train a SAC agent; same replay-buffer loop as ``train_ddpg``."""
    return train_ddpg(env, agent, num_episodes, replay_buffer, minimal_size, batch_size, return_agent=return_agent)
3 | from life.utils.replay.replay_buffer import ReplayBuffer 4 | from life.envs.con_env_demo import make 5 | import gym 6 | import torch 7 | 8 | lr = 2e-3 9 | num_episodes = 500 10 | hidden_dim = 128 11 | gamma = 0.98 12 | epsilon = 0.01 13 | target_update = 10 14 | buffer_size = 10000 15 | minimal_size = 500 16 | batch_size = 64 17 | device = torch.device("cpu") 18 | env = make() 19 | state_dim = env.observation_space.shape[0] 20 | action_dim = 11 21 | agent = DuelingDQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, 22 | target_update, device) 23 | replay_buffer = ReplayBuffer(buffer_size) 24 | result = train(agent, env, replay_buffer, minimal_size, batch_size, con_act=True) 25 | print(result) 26 | -------------------------------------------------------------------------------- /life/test/test_off_policy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from life.policy.sac import SACContinuous 3 | from life.policy.trainer import train_sac 4 | from life.envs.con_env_demo import make 5 | from life.utils.replay.replay_buffer import ReplayBuffer 6 | 7 | env = make() 8 | state_dim = env.observation_space.shape[0] 9 | action_dim = env.action_space.shape[0] 10 | action_bound = env.action_space.high[0] # 动作最大值 11 | 12 | actor_lr = 3e-4 13 | critic_lr = 3e-3 14 | alpha_lr = 3e-4 15 | num_episodes = 100 16 | hidden_dim = 128 17 | gamma = 0.99 18 | tau = 0.005 # 软更新参数 19 | buffer_size = 100000 20 | minimal_size = 1000 21 | batch_size = 64 22 | target_entropy = -env.action_space.shape[0] 23 | device = torch.device("cpu") 24 | 25 | replay_buffer = ReplayBuffer(buffer_size) 26 | agent = SACContinuous(state_dim, hidden_dim, action_dim, action_bound, 27 | actor_lr, critic_lr, alpha_lr, target_entropy, tau, 28 | gamma, device) 29 | 30 | result = train_sac(env, agent, num_episodes, replay_buffer, 31 | minimal_size, batch_size) 32 | -------------------------------------------------------------------------------- 
/life/test/test_on_policy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from life.policy.ppo import PPO 3 | from life.policy.trainer import train_ppo 4 | from life.envs.dis_env_demo import make 5 | 6 | actor_lr = 1e-3 7 | critic_lr = 1e-2 8 | num_episodes = 500 9 | hidden_dim = 128 10 | gamma = 0.98 11 | lmbda = 0.95 12 | epochs = 10 13 | eps = 0.2 14 | device = torch.device("cpu") 15 | 16 | env = make() 17 | env.seed(0) 18 | torch.manual_seed(0) 19 | state_dim = env.observation_space.shape[0] 20 | action_dim = env.action_space.n 21 | agent = PPO(state_dim, hidden_dim, action_dim, actor_lr, critic_lr, lmbda, 22 | epochs, eps, gamma, device) 23 | result = train_ppo(agent, env, num_episodes) 24 | -------------------------------------------------------------------------------- /life/test/test_ql.py: -------------------------------------------------------------------------------- 1 | from life.base.q_learning import QLearning 2 | from life.base.trainer import train_qlearning 3 | from life.envs.cliffwalking import CliffWalkingEnv 4 | 5 | agent = QLearning(12 * 4, 0.1, 0.1, 0.9) 6 | env = CliffWalkingEnv(12, 4) 7 | result = train_qlearning(env, agent, ) 8 | 9 | print(result) 10 | -------------------------------------------------------------------------------- /life/utils/__pycache__/calculator.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HanggeAi/Life/10c4d37fb1112ac017ca1239f85e7874cb51aa32/life/utils/__pycache__/calculator.cpython-37.pyc -------------------------------------------------------------------------------- /life/utils/__pycache__/cont2disp.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HanggeAi/Life/10c4d37fb1112ac017ca1239f85e7874cb51aa32/life/utils/__pycache__/cont2disp.cpython-37.pyc 
# ---- life/utils/calculator.py ----
def compute_advantage(gamma, lmbda, td_delta):
    """Accumulate discounted TD errors into per-step advantages.

    Walks the TD errors from the last step to the first, keeping the running
    value ``A_t = gamma * lmbda * A_{t+1} + delta_t``.

    Args:
        gamma: Discount factor.
        lmbda: Extra decay applied together with ``gamma``.
        td_delta: Tensor of TD errors ordered along dim 0.

    Returns:
        A float32 tensor shaped like ``td_delta``, placed on the same device
        as ``td_delta``.
    """
    device = td_delta.device
    # .cpu() is required before .numpy(): a CUDA tensor cannot be converted
    # directly, which made the previous version fail for GPU inputs.
    deltas = td_delta.detach().cpu().numpy()
    advantage_list = []
    advantage = 0.0
    for delta in deltas[::-1]:
        advantage = gamma * lmbda * advantage + delta
        advantage_list.append(advantage)  # one entry per time step
    advantage_list.reverse()
    # Stack in numpy first; torch.tensor on a list of ndarrays is very slow.
    return torch.tensor(np.array(advantage_list), dtype=torch.float).to(device)


def sample_expert_data(env, agent, n_episodes):
    """Roll out ``agent`` in ``env`` and record the visited state/action pairs.

    Args:
        env: Environment the expert acts in; ``reset()`` returns a state and
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        agent: Expert providing ``take_action(state)``.
        n_episodes: Number of complete episodes to record.

    Returns:
        ``(states, actions)`` as two numpy arrays with one entry per step.
    """
    states = []
    actions = []
    for _ in range(n_episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.take_action(state)
            states.append(state)
            actions.append(action)
            state, _reward, done, _ = env.step(action)
    return np.array(states), np.array(actions)


def moving_average(a, window_size):
    """Smooth ``a`` with a centred moving average.

    The interior uses a full window; the two ends use growing/shrinking
    windows of odd size 1, 3, 5, ... so that, for an odd ``window_size``, the
    output has the same length as ``a``.
    """
    cumulative_sum = np.cumsum(np.insert(a, 0, 0))
    middle = (cumulative_sum[window_size:] - cumulative_sum[:-window_size]) / window_size
    r = np.arange(1, window_size - 1, 2)
    begin = np.cumsum(a[:window_size - 1])[::2] / r
    end = (np.cumsum(a[:-window_size:-1])[::2] / r)[::-1]
    return np.concatenate((begin, middle, end))


# ---- life/utils/cont2disp.py ----
def dis2con(discrete_action, env, action_dim):
    """Map a discrete action index onto the env's continuous action range.

    Index 0 maps to ``env.action_space.low[0]`` and index ``action_dim - 1``
    to ``env.action_space.high[0]``, with even spacing in between
    (e.g. ``[0, 1, ..., 10]`` -> ``[-2, -1.6, ..., 1.6, 2]``).
    """
    action_low = env.action_space.low[0]  # smallest continuous action
    action_up = env.action_space.high[0]  # largest continuous action
    return action_low + (discrete_action / (action_dim - 1)) * (action_up - action_low)
class SumTree:
    """Array-backed binary sum tree used for proportional sampling.

    Leaves hold per-item priorities; every internal node holds the sum of its
    two children, so the root holds the total priority.
    """

    def __init__(self, capacity: int):
        self.capacity = capacity  # number of leaves
        self.data_pointer = 0  # next leaf slot to (over)write
        self.n_entries = 0  # number of stored items, capped at capacity
        self.tree = np.zeros(2 * capacity - 1)  # internal nodes + leaves
        self.data = np.zeros(capacity, dtype=object)

    def update(self, tree_idx, p):
        """Set the priority at ``tree_idx`` to ``p`` and fix all ancestor sums."""
        change = p - self.tree[tree_idx]
        self.tree[tree_idx] = p

        while tree_idx != 0:
            tree_idx = (tree_idx - 1) // 2
            self.tree[tree_idx] += change

    def add(self, p, data):
        """Store ``data`` with priority ``p``, overwriting the oldest slot when full."""
        tree_idx = self.data_pointer + self.capacity - 1
        self.data[self.data_pointer] = data
        self.update(tree_idx, p)

        self.data_pointer += 1
        if self.data_pointer >= self.capacity:
            self.data_pointer = 0

        if self.n_entries < self.capacity:
            self.n_entries += 1

    def get_leaf(self, v):
        """Find the leaf whose cumulative-priority interval contains ``v``.

        Returns:
            ``(leaf_idx, priority, data)``.
        """
        parent_idx = 0
        while True:
            cl_idx = 2 * parent_idx + 1
            cr_idx = cl_idx + 1
            if cl_idx >= len(self.tree):
                leaf_idx = parent_idx
                break
            # Never walk into a subtree with no priority mass: a tiny
            # floating-point overshoot of `v` would otherwise land on an
            # unused leaf whose data slot is still the 0 placeholder.
            if v <= self.tree[cl_idx] or self.tree[cr_idx] <= 0:
                parent_idx = cl_idx
            else:
                v -= self.tree[cl_idx]
                parent_idx = cr_idx

        data_idx = leaf_idx - self.capacity + 1
        return leaf_idx, self.tree[leaf_idx], self.data[data_idx]

    def total(self):
        """Return the total priority (the root) as a float.

        The value used to be truncated with ``int()``, which turned any total
        below 1 into 0 and skewed sampling probabilities.
        """
        return float(self.tree[0])


class ReplayTree:
    """ReplayTree for the per(Prioritized Experience Replay) DQN.
    """

    def __init__(self, capacity):
        self.capacity = capacity  # the capacity for memory replay
        self.tree = SumTree(capacity)

        self.beta_increment_per_sampling = 0.001
        self.alpha = 0.6  # exponent applied to errors to get priorities
        self.beta = 0.4  # importance-sampling exponent, annealed towards 1
        self.epsilon = 0.01  # keeps every priority strictly positive
        self.abs_err_upper = 1.  # clip for errors in batch_update

    def __len__(self):
        """Return the number of stored samples."""
        # Count of items, not the priority sum the old code returned.
        return self.tree.n_entries

    def push(self, error, sample):
        """Store ``sample`` with priority ``(|error| + epsilon) ** alpha``."""
        p = (np.abs(error) + self.epsilon) ** self.alpha
        self.tree.add(p, sample)

    def sample(self, batch_size):
        """Draw a stratified, priority-proportional batch.

        The original code is from:
        https://github.com/rlcode/per/blob/master/prioritized_memory.py

        Returns:
            ``(columns, idxs, is_weights)`` where ``columns`` is
            ``zip(*samples)``, ``idxs`` are tree indices to pass to
            ``batch_update``, and ``is_weights`` are importance-sampling
            weights normalised by their maximum.

        Raises:
            ValueError: If nothing has been pushed yet.
        """
        if self.tree.n_entries == 0:
            raise ValueError("cannot sample from an empty ReplayTree")

        total_priority = self.tree.total()
        pri_segment = total_priority / batch_size

        priorities = []
        batch = []
        idxs = []

        self.beta = min(1., self.beta + self.beta_increment_per_sampling)

        for i in range(batch_size):
            # One uniform draw inside each equal-mass segment.
            a = pri_segment * i
            b = pri_segment * (i + 1)

            s = random.uniform(a, b)
            idx, p, data = self.tree.get_leaf(s)

            priorities.append(p)
            batch.append(data)
            idxs.append(idx)

        sampling_probabilities = np.array(priorities) / total_priority
        is_weights = np.power(self.tree.n_entries * sampling_probabilities, -self.beta)
        is_weights /= is_weights.max()

        return zip(*batch), idxs, is_weights

    def batch_update(self, tree_idx, abs_errors):
        """Refresh the priorities of previously sampled leaves.

        Args:
            tree_idx: Tree indices as returned by ``sample``.
            abs_errors: Absolute errors, one per index. Not modified.
        """
        # Work on a copy: `abs_errors += eps` used to mutate the caller's array.
        shifted = np.asarray(abs_errors, dtype=float) + self.epsilon

        clipped_errors = np.minimum(shifted, self.abs_err_upper)
        ps = np.power(clipped_errors, self.alpha)

        for ti, p in zip(tree_idx, ps):
            self.tree.update(ti, p)
# ---- life/utils/replay/replay_buffer.py ----
class ReplayBuffer:
    """Fixed-capacity experience replay pool with uniform sampling."""

    def __init__(self, capacity):
        # A bounded deque drops the oldest transition once it is full (FIFO).
        self.buffer = collections.deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        """Append one transition to the pool."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            ``(states, actions, rewards, next_states, dones)``; the two state
            fields are numpy arrays, the other three are tuples.
        """
        picked = random.sample(self.buffer, batch_size)
        columns = tuple(zip(*picked))
        states, actions, rewards, next_states, dones = columns
        return np.array(states), actions, rewards, np.array(next_states), dones

    def size(self):
        """Return how many transitions are currently stored."""
        return len(self.buffer)


# ---- main.py ----
def print_hi(name):
    """Print a greeting for ``name``."""
    print(f'Hello , {name}')


if __name__ == '__main__':
    print_hi('Life !')