├── .gitignore
├── LICENSE
├── README.md
├── benchmarks
└── README.md
├── docs
├── .nojekyll
├── BCQ.md
├── C51.md
├── CONTRIBUTE.md
├── MOQ-Learning.md
├── NoisyDQN.md
├── PER_DQN.md
├── PPO.md
├── QLearning.md
├── README.md
├── Rainbow.md
├── _sidebar.md
├── algo_cfg.md
├── api.md
├── basic_concept.md
├── dev
│ └── offline_run.md
├── devlop.md
├── docsify.md
├── figs
│ ├── DQN_pseu.png
│ ├── branch_merge.png
│ ├── collector.png
│ ├── data_flow.png
│ ├── interaction_mdp.png
│ ├── interactor_learner.png
│ ├── overall_framework.png
│ ├── recorder.png
│ └── tasks_dir.png
├── general_cfg.md
├── hyper_cfg.md
├── index.html
├── multiprocessing.md
├── multiprocessing_DRL
│ ├── mp_DQN.svg
│ └── multiprocessing_DQN.md
├── ray_DRL
│ ├── ray_DRL.md
│ └── results.png
└── usage.md
├── examples
├── custom_env.py
├── quick_start_1.py
└── quick_start_2.py
├── joyrl
├── __init__.py
├── algos
│ ├── A3C
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── data_handler.py
│ │ └── policy.py
│ ├── CategoricalDQN
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── data_handler.py
│ │ └── policy.py
│ ├── DDPG
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── data_handler.py
│ │ ├── model.py
│ │ └── policy.py
│ ├── DQN
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── data_handler.py
│ │ └── policy.py
│ ├── DoubleDQN
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── data_handler.py
│ │ └── policy.py
│ ├── DuelingDQN
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── data_handler.py
│ │ └── policy.py
│ ├── HierarchicalDQN
│ │ ├── README.md
│ │ ├── agent.py
│ │ ├── task0.py
│ │ └── train.py
│ ├── MonteCarlo
│ │ ├── agent.py
│ │ └── task0.py
│ ├── NoisyDQN
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── data_handler.py
│ │ └── policy.py
│ ├── PPO
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── data_handler.py
│ │ └── policy.py
│ ├── PolicyGradient
│ │ ├── main.py
│ │ └── pg.py
│ ├── QLearning
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── data_handler.py
│ │ └── policy.py
│ ├── RainbowDQN
│ │ ├── rainbow_dqn.py
│ │ └── task0.py
│ ├── SAC-S
│ │ └── sac.py
│ ├── SAC
│ │ └── sacd_cnn.py
│ ├── Sarsa
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── data_handler.py
│ │ └── policy.py
│ ├── SoftActorCritic
│ │ ├── env_wrapper.py
│ │ ├── model.py
│ │ ├── sac.py
│ │ ├── task0.py
│ │ └── task0_train.ipynb
│ ├── SoftQ
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── data_handler.py
│ │ └── policy.py
│ ├── TD3
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── data_handler.py
│ │ ├── model.py
│ │ └── policy.py
│ ├── __init__.py
│ └── base
│ │ ├── __init__.py
│ │ ├── action_layer.py
│ │ ├── base_layer.py
│ │ ├── buffer.py
│ │ ├── data_handler.py
│ │ ├── experience.py
│ │ ├── network.py
│ │ ├── noise.py
│ │ ├── optm.py
│ │ └── policy.py
├── envs
│ ├── README.md
│ ├── __init__.py
│ ├── assets
│ │ ├── action_grid.png
│ │ ├── gym_info_20211130180023.png
│ │ ├── image-20200820174307301.png
│ │ ├── image-20200820174814084.png
│ │ ├── image-20201007211441036.png
│ │ ├── image-20201007211858925.png
│ │ ├── image-20210429150622353.png
│ │ ├── image-20210429150630806.png
│ │ └── track_big.png
│ ├── blackjack.py
│ ├── cliff_walking.py
│ ├── gridworld.py
│ ├── gridworld_env.py
│ ├── gym
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── toy_text
│ │ │ └── cliff_walking.py
│ │ └── wrappers.py
│ ├── gym_info.md
│ ├── mujoco_info.md
│ ├── racetrack.py
│ ├── racetrack_env.md
│ ├── register.py
│ ├── snake
│ │ ├── README.md
│ │ ├── agent.py
│ │ ├── example_assignment_and_report2.pdf
│ │ ├── main.py
│ │ ├── snake_env.py
│ │ └── utils.py
│ ├── stochastic_mdp.py
│ ├── track.txt
│ └── windy_gridworld.py
├── framework
│ ├── __init__.py
│ ├── base.py
│ ├── collector.py
│ ├── config.py
│ ├── core_types.py
│ ├── interactor.py
│ ├── learner.py
│ ├── message.py
│ ├── policy_mgr.py
│ ├── recorder.py
│ ├── tester.py
│ ├── tracker.py
│ ├── trainer.py
│ └── utils.py
├── run.py
└── scripts
│ ├── __init__.py
│ └── scripts.py
├── offline_run.py
├── presets
├── Atari
│ ├── AirRaid-v5
│ │ └── AirRaid-v5_PPO.yaml
│ ├── Breakout-v5
│ │ ├── Breakout-v5_DQN.yaml
│ │ ├── Breakout-v5_PPO.yaml
│ │ └── Breakout-v5_PPO_test.yaml
│ ├── DemonAttack-v5
│ │ ├── DoubleDQN_DemonAttack-v5_Test.yaml
│ │ └── DoubleDQN_DemonAttack-v5_Train.yaml
│ └── Enduro-v5
│ │ └── Enduro-v5_DQN.yaml
├── BipedalWalker-v3_DDPG_mp_Train.yaml
├── Box2D
│ ├── BipedalWalker-v3
│ │ ├── BipedalWalker-v3_DDPG_Test.yaml
│ │ ├── BipedalWalker-v3_DDPG_Train.yaml
│ │ ├── BipedalWalker-v3_PPO.yaml
│ │ ├── BipedalWalker-v3_SAC_Test.yaml
│ │ └── BipedalWalker-v3_SAC_Train.yaml
│ ├── BipedalWalkerHardcore-v3
│ │ └── TD3_BipedalWalkerHardcore-v3.yaml
│ ├── CarRacing-v2
│ │ ├── CarRacing-v2_PPO.yaml
│ │ ├── DQN_carRacing-v2_Test.yaml
│ │ ├── DQN_carRacing-v2_Train.yaml
│ │ └── TD3_carRacing-v2_Train.yaml
│ └── LunarLander-v2
│ │ ├── LunarLander-v2_PPO.yaml
│ │ ├── LunarLander-v2_PPO_Test.yaml
│ │ ├── LunarLanderContinuous-v2_SAC_Test.yaml
│ │ └── LunarLanderContinuous-v2_SAC_Train.yaml
├── ClassControl
│ ├── Acrobot-v1
│ │ ├── Acrobot-v1_DQN.yaml
│ │ ├── Acrobot-v1_DoubleDQN.yaml
│ │ ├── Acrobot-v1_DuelingDQN.yaml
│ │ ├── Acrobot-v1_NoisyDQN.yaml
│ │ └── Acrobot-v1_PPO.yaml
│ ├── CartPole-v1
│ │ ├── CartPole-v1_A3C.yaml
│ │ ├── CartPole-v1_BC_Test.yaml
│ │ ├── CartPole-v1_BC_Train.yaml
│ │ ├── CartPole-v1_C51_Test.yaml
│ │ ├── CartPole-v1_C51_Train.yaml
│ │ ├── CartPole-v1_CQL_Test.yaml
│ │ ├── CartPole-v1_CQL_Train.yaml
│ │ ├── CartPole-v1_CategoricalDQN.yaml
│ │ ├── CartPole-v1_DQN.yaml
│ │ ├── CartPole-v1_DQN_1.yaml
│ │ ├── CartPole-v1_DRQN_Test.yaml
│ │ ├── CartPole-v1_DRQN_Train.yaml
│ │ ├── CartPole-v1_DoubleDQN.yaml
│ │ ├── CartPole-v1_DuelingDQN.yaml
│ │ ├── CartPole-v1_GAIL_Test.yaml
│ │ ├── CartPole-v1_GAIL_Train.yaml
│ │ ├── CartPole-v1_NoisyDQN.yaml
│ │ ├── CartPole-v1_PER_DQN.yaml
│ │ ├── CartPole-v1_PPO.yaml
│ │ ├── CartPole-v1_PPO_off_policy.yaml
│ │ ├── CartPole-v1_REINFORCE_Test.yaml
│ │ ├── CartPole-v1_REINFORCE_Train.yaml
│ │ ├── CartPole-v1_RainbowDQN_Test.yaml
│ │ ├── CartPole-v1_RainbowDQN_Train.yaml
│ │ ├── CartPole-v1_RainbowDQN_Train_mp.yaml
│ │ ├── CartPole-v1_SAC_D_Test.yaml
│ │ ├── CartPole-v1_SAC_D_Train.yaml
│ │ └── CartPole-v1_SoftQ.yaml
│ ├── MountainCar-v0
│ │ ├── MountainCar-v0_DQN.yaml
│ │ └── MountainCar-v0_PPO.yaml
│ ├── MountainCarContinuous-v0
│ │ ├── MountainCarContinuous-v0_PPO-test.yaml
│ │ └── MountainCarContinuous-v0_PPO.yaml
│ └── Pendulum-v1
│ │ ├── Pendulum-v1_DDPG.yaml
│ │ ├── Pendulum-v1_DDPG_HER_Test.yaml
│ │ ├── Pendulum-v1_DDPG_HER_Train.yaml
│ │ ├── Pendulum-v1_PPO.yaml
│ │ ├── Pendulum-v1_PPO_off_policy.yaml
│ │ ├── Pendulum-v1_SAC_Train.yaml
│ │ ├── Pendulum-v1_TD3.yaml
│ │ ├── Pendulum-v1_TD3_BC_Test.yaml
│ │ └── Pendulum-v1_TD3_BC_Train.yaml
├── External
│ └── Mario
│ │ ├── Mario_DQN_CNN_Test.yaml
│ │ └── Mario_DQN_CNN_Train.yaml
├── Mujoco
│ ├── Ant-v4
│ │ └── Ant-v4_PPO.yaml
│ ├── HalfCheetah-v4
│ │ ├── HalfCheetah-v2_TD3_Test.yaml
│ │ ├── HalfCheetah-v2_TD3_Train.yaml
│ │ └── HalfCheetah-v4_PPO.yaml
│ ├── Hopper-v4
│ │ └── Hopper-v4_PPO.yaml
│ └── Reacher-v4
│ │ └── Reacher-v4_PPO.yaml
├── Others
│ ├── Racetrack-v0
│ │ ├── Racetrack-v0_QLearning_Test.yaml
│ │ ├── Racetrack-v0_QLearning_Train.yaml
│ │ └── Train_Racetrack-v0_FirstVisitMC.yaml
│ ├── deep-sea-treasure-v0
│ │ ├── DeepSeaTreasure-v0_MOQLearning_Test.yaml
│ │ └── DeepSeaTreasure-v0_MOQLearning_Train.yaml
│ └── theAlley
│ │ ├── theAlley_VI_Test.yaml
│ │ └── theAlley_VI_Train.yaml
├── Pendulum-v1_TD3_mp_Test.yaml
├── Pendulum-v1_TD3_mp_Train.yaml
└── ToyText
│ ├── CliffWalking-v0
│ ├── CliffWalking-v0_DynaQ_Test.yaml
│ ├── CliffWalking-v0_DynaQ_Train.yaml
│ ├── CliffWalking-v0_QLearning.yaml
│ ├── CliffWalking-v0_Sarsa.yaml
│ └── CustomCliffWalking-v0_DQN.yaml
│ └── FrozenLake-v1
│ └── FrozenLake-v1_NoSlippery_QLearning.yaml
├── requirements.txt
├── run.bat
├── run.sh
├── setup.cfg
├── setup.py
├── setup.sh
├── stop.bat
└── stop.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .ipynb_checkpoints
3 | __pycache__
4 | .vscode
5 | .pypirc
6 | dist
7 | build
8 | joyrl.egg-info
9 | tasks
10 | test
11 | test.py
12 | *.log
13 | presets/*/*/*.ipynb
14 | presets/*/*/*/*.ckpt
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Tianshou contributors
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/benchmarks/README.md:
--------------------------------------------------------------------------------
1 | # benchmarks
2 |
3 | Save well trained models and test results here.
4 |
5 | Now we have moved to: [Mega](https://mega.nz/folder/dwxEnRoT#qPiDkhL4eyzvcSfgLxIsHQ) and [Tianyi Cloud Drive, access code: 88yf](https://cloud.189.cn/web/share?code=7JrQRzfQf6Nn).
--------------------------------------------------------------------------------
/docs/.nojekyll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/.nojekyll
--------------------------------------------------------------------------------
/docs/BCQ.md:
--------------------------------------------------------------------------------
1 | # BCQ Algorithm Parameter Description
2 |
3 | ```python
4 | class AlgoConfig:
5 | def __init__(self):
6 | self.critic_hidden_dims = [400,300] # critic hidden units
7 | self.actor_hidden_dims = [400,300] # actor hidden units
8 |
9 | self.vae_hidden_dims = [750,750] # VAE hidden units
10 |
11 | self.critic_lr = 1e-3
12 | self.actor_lr = 1e-3
13 | self.vae_lr = 1e-3
14 | self.batch_size = 128
15 |
16 | self.gamma = 0.99
17 | self.tau = 0.005 # how fast the target critic/actor are updated
18 | self.lmbda = 0.75 # soft double Q learning: target_Q = lmbda * min(q1,q2) + (1-lmbda) * max(q1,q2)
19 | self.phi = 0.05 # BCQ-specific: maximum perturbation of an action relative to the actions in the replay buffer (used in the actor)
20 |
21 | # train parameters
22 | self.iters_per_ep = 10 # number of iterations in each train_one_episode when training the BCQ agent; each iteration uses batch_size experiences
23 | self.max_buffer_size = int(1e5) # size of the BCQ agent's replay memory
24 | self.start_learn_buffer_size = 1e3 # minimum number of experiences in memory; fewer than this raises an error
25 |
26 | # parameters for collecting data
27 | self.collect_explore_data = True # whether DDPG adds noise when collecting data
28 | self.behavior_agent_name = "DDPG" # algorithm used for the behavior agent
29 | self.behavior_agent_parameters_path = "/behave_param.yaml" # behavior agent parameters, under the BCQ directory; see that algorithm's own parameter description
30 | self.behavior_policy_path = "/behaviour_models" # behavior agent models, needed when collecting data
31 | self.collect_eps = 500 # number of episodes of data to collect
32 | ```
33 | * `z_dim`: the dimension of the VAE latent space; it is fixed to twice the action dimension, so it does not need to be set.
34 |
35 |
36 | # BCQ Training Process
37 | BCQ is an offline RL algorithm: a behavior agent collects data first, and BCQ then learns from that data without interacting with the environment.
38 | Steps:
39 | 1. **Train the behavior agent**: use DDPG (or another algorithm of your choice) to interact with the environment, and save the model once it has learned well.
40 | 2. **Collect training data**: in the main config, switch to "collect" mode with the "BCQ" algorithm and copy the DDPG model into "BCQ/behaviour_models";
41 | the memory data will then be generated under "tasks/<your training folder>/traj".
42 | 3. **Train the BCQ agent**: in the main config, switch to "train" mode with the "BCQ" algorithm and copy the "traj" generated in the previous step into "BCQ/traj";
43 | training can then proceed. **Note**: since training never interacts with the environment, we run test_one_episode after each training round to produce a reward.
44 |
45 |
46 | # BCQ Learning Resources
47 |
48 | ## VAE Introduction
49 |
50 | [一文理解变分自编码器(VAE)](https://zhuanlan.zhihu.com/p/64485020)
51 | [VAE手写体识别项目实现(详细注释)从小项目通俗理解变分自编码器(Variational Autoencoder, VAE)](https://blog.csdn.net/weixin_40015791/article/details/89735233)
52 |
53 | ## BCQ Algorithm Introduction
54 |
55 | 1. [BCQ 张楚珩](https://zhuanlan.zhihu.com/p/136844574)
56 | 2. [(RL)BCQ](https://zhuanlan.zhihu.com/p/206489894)
57 | 3. [BCQ github code](https://github.com/sfujim/BCQ/tree/master/continuous_BCQ)
58 | 4. [Batch RL与BCQ算法](https://zhuanlan.zhihu.com/p/269475418)
59 |
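60 | A minimal sketch of the soft double-Q target controlled by the `lmbda` parameter above (illustrative PyTorch code; the tensor names are assumptions, not the actual BCQ implementation):
61 |
62 | ```python
63 | import torch
64 |
65 | def soft_double_q_target(reward, q1, q2, done, gamma=0.99, lmbda=0.75):
66 |     # target_Q = lmbda * min(q1, q2) + (1 - lmbda) * max(q1, q2)
67 |     target_q = lmbda * torch.min(q1, q2) + (1 - lmbda) * torch.max(q1, q2)
68 |     return reward + gamma * (1 - done) * target_q
69 | ```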
--------------------------------------------------------------------------------
/docs/C51.md:
--------------------------------------------------------------------------------
1 | ## Algorithm Parameter Description
2 |
3 | The C51 parameters are as follows:
4 |
5 | ```python
6 | class AlgoConfig(DefaultConfig):
7 | def __init__(self):
8 | self.gamma = 0.99 # discount factor
9 | self.tau = 1.0 # 1.0 means hard update
10 | self.Vmin = 0. # support of C51
11 | self.Vmax = 200. # support of C51
12 | self.num_atoms = 51 # support of C51
13 | self.support = torch.linspace(self.Vmin, self.Vmax, self.num_atoms) # support of C51
14 | self.delta_z = (self.Vmax - self.Vmin) / (self.num_atoms - 1) # support of C51
15 |
16 | self.batch_size = 32 # batch size
17 | self.lr = 0.0001 # learning rate
18 | self.target_update = 200 # target network update frequency
19 | self.memory_capacity = 10000 # size of replay buffer
20 | self.epsilon_start = 0.95 # epsilon start value
21 | self.epsilon_end = 0.01 # epsilon end value
22 | self.epsilon_decay = 500 # epsilon decay rate
23 | ```
24 |
25 | Here `Vmin` is the minimum value of the support, `Vmax` the maximum value, `num_atoms` the number of support atoms, `support` the set of values the return distribution can take in C51, and `delta_z` the spacing between adjacent support atoms.
26 |
27 | The remaining parameters are essentially the same as in DQN.
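28 |
29 | A minimal sketch of how the support described above is constructed (plain PyTorch mirroring the config fields; not part of the config itself):
30 |
31 | ```python
32 | import torch
33 |
34 | Vmin, Vmax, num_atoms = 0., 200., 51
35 | support = torch.linspace(Vmin, Vmax, num_atoms)  # the 51 values the return distribution can take
36 | delta_z = (Vmax - Vmin) / (num_atoms - 1)  # spacing between adjacent atoms, 4.0 here
37 | ```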
--------------------------------------------------------------------------------
/docs/CONTRIBUTE.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | Developers are welcome to contribute code to JoyRL. If you want to help fix bugs, simply modify the corresponding code and open a PR; for a PR tutorial see [VS Code快速实现Git PR操作](https://blog.csdn.net/JohnJim0816/article/details/128156442). If you want to contribute a new algorithm, follow the steps below, and feel free to reach out with any questions (WeChat: johnjim0816).
4 |
5 | ## Create the Algorithm
6 |
7 | First create a new folder under `algos`, named after the algorithm you want to add, and set the default parameters in its `config.py`.
8 |
9 | ## Configure Parameters
10 |
11 | Add the corresponding `yaml` files under `presets`, including both `Train` and `Test` parts.
12 |
13 | ## Run the Code
14 |
15 | After debugging your algorithm, run one training and one test, and put the corresponding output folders under the `benchmarks` directory.
16 |
17 | ## Update the Docs
18 |
19 | Write a parameter description for the algorithm you contributed in `docs/hyper_tuning.md`, and finally open the PR.
--------------------------------------------------------------------------------
/docs/MOQ-Learning.md:
--------------------------------------------------------------------------------
1 | ## MOQ-learning
2 |
3 | ```python
4 | class AlgoConfig:
5 | def __init__(self) -> None:
6 | # set epsilon_start=epsilon_end can obtain fixed epsilon=epsilon_end
7 | self.epsilon_start = 0.95 # epsilon start value
8 | self.epsilon_end = 0.01 # epsilon end value
9 | self.epsilon_decay = 300 # epsilon decay rate
10 | self.gamma = 0.90 # discount factor
11 | self.lr = 0.1 # learning rate
12 | self.weights = [0.5, 0.5] # weights for scalarization
13 | ```
14 |
15 | Here `gamma` is the discount factor in reinforcement learning, usually tuned between 0.9 and 0.999 and defaulting to 0.99. `weights` is the weight vector over the objectives; `max_buffer_size`, `target_update`, and `epsilon` all need to be tuned empirically for the actual environment.
16 |
17 | The epsilon decay mechanism in MOQ-Learning is the same as in DQN. Overall, the MOQ-Learning parameters are largely the same as DQN's, so they are not repeated here.
18 |
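19 | A minimal sketch of how the `weights` vector above scalarizes multiple objectives (illustrative only; the per-objective Q-values are made-up numbers):
20 |
21 | ```python
22 | import numpy as np
23 |
24 | weights = np.array([0.5, 0.5])  # weights for scalarization, as in AlgoConfig
25 | q_values = np.array([[1.0, 3.0],   # objective-wise Q-values of action 0
26 |                      [2.0, 0.5]])  # objective-wise Q-values of action 1
27 | scalarized = q_values @ weights  # linear scalarization per action
28 | best_action = int(np.argmax(scalarized))  # greedy action under the scalarized Q
29 | ```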
--------------------------------------------------------------------------------
/docs/NoisyDQN.md:
--------------------------------------------------------------------------------
1 | ## Algorithm Parameter Description
2 |
3 | The NoisyDQN parameters are as follows and are essentially the same as those of DQN:
4 |
5 | ```python
6 | class AlgoConfig(DefaultConfig):
7 | def __init__(self) -> None:
8 | # set epsilon_start=epsilon_end can obtain fixed epsilon=epsilon_end
9 | self.epsilon_start = 0.95 # epsilon start value
10 | self.epsilon_end = 0.01 # epsilon end value
11 | self.epsilon_decay = 500 # epsilon decay rate
12 | self.hidden_dim = 256 # hidden_dim for MLP
13 | self.gamma = 0.95 # discount factor
14 | self.lr = 0.0001 # learning rate
15 | self.max_buffer_size = 100000 # size of replay buffer
16 | self.batch_size = 64 # batch size
17 | self.target_update = 4 # target network update frequency
18 | ```
19 |
20 | Here `gamma` is the discount factor in reinforcement learning, usually tuned between 0.9 and 0.999 and defaulting to 0.99. `max_buffer_size`, `target_update`, and `epsilon` all need to be tuned empirically for the actual environment.
21 |
22 | The epsilon decay mechanism in NoisyDQN is the same as in DQN. Overall, the NoisyDQN parameters are largely the same as DQN's, so they are not repeated here.
--------------------------------------------------------------------------------
/docs/PER_DQN.md:
--------------------------------------------------------------------------------
1 | ## Algorithm Parameter Description
2 |
3 | The PER_DQN parameters are as follows and are essentially the same as those of DQN:
4 |
5 | ```python
6 | class AlgoConfig(DefaultConfig):
7 | def __init__(self) -> None:
8 | # set epsilon_start=epsilon_end can obtain fixed epsilon=epsilon_end
9 | self.epsilon_start = 0.95 # epsilon start value
10 | self.epsilon_end = 0.01 # epsilon end value
11 | self.epsilon_decay = 500 # epsilon decay rate
12 | self.hidden_dim = 256 # hidden_dim for MLP
13 | self.gamma = 0.95 # discount factor
14 | self.lr = 0.0001 # learning rate
15 | self.max_buffer_size = 100000 # size of replay buffer
16 | self.batch_size = 64 # batch size
17 | self.target_update = 4 # target network update frequency
18 | self.value_layers = [
19 | {'layer_type': 'linear', 'layer_dim': ['n_states', 256],
20 | 'activation': 'relu'},
21 | {'layer_type': 'linear', 'layer_dim': [256, 256],
22 | 'activation': 'relu'},
23 | {'layer_type': 'linear', 'layer_dim': [256, 'n_actions'],
24 | 'activation': 'none'}]
25 | ```
26 |
27 |
28 | Here `gamma` is the discount factor in reinforcement learning, usually tuned between 0.9 and 0.999 and defaulting to 0.99. `max_buffer_size`, `target_update`, and `epsilon` all need to be tuned empirically for the actual environment.
29 |
30 | The epsilon decay mechanism in PER_DQN is the same as in DQN.
31 |
32 | Since PER_DQN only changes the replay buffer, its parameters differ little from DQN's.
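33 |
34 | A minimal sketch of the prioritized sampling a PER buffer performs (the standard PER formulation; `alpha`, `beta`, and the priorities below are illustrative values, not fields of the config above):
35 |
36 | ```python
37 | import numpy as np
38 |
39 | priorities = np.array([2.0, 0.5, 1.0, 4.0])  # |TD-error|-based priorities of stored transitions
40 | alpha, beta = 0.6, 0.4  # prioritization strength / importance-sampling correction strength
41 | probs = priorities ** alpha / np.sum(priorities ** alpha)  # P(i) = p_i^alpha / sum_k p_k^alpha
42 | idx = np.random.choice(len(probs), size=2, p=probs)  # sample a mini-batch according to priority
43 | weights = (len(probs) * probs[idx]) ** (-beta)  # importance-sampling weights
44 | weights /= weights.max()  # normalize for stability
45 | ```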
--------------------------------------------------------------------------------
/docs/PPO.md:
--------------------------------------------------------------------------------
1 |
2 | ## Algorithm Parameter Description
3 |
4 | The PPO parameters are as follows:
5 |
6 | ```python
7 | class AlgoConfig:
8 | def __init__(self):
9 | self.ppo_type = 'clip' # clip or kl
10 | self.gamma = 0.99 # discount factor
11 | self.k_epochs = 4 # update policy for K epochs
12 | self.actor_lr = 0.0003 # learning rate for actor
13 | self.critic_lr = 0.001 # learning rate for critic
14 | self.eps_clip = 0.2 # clip parameter for PPO
15 | self.entropy_coef = 0.01 # entropy coefficient
16 | self.update_freq = 100 # update policy every n steps
17 | self.actor_hidden_dim = 256 # hidden dimension for actor
18 | self.critic_hidden_dim = 256 # hidden dimension for critic
19 | # batch size
20 | self.train_batch_size = 100 # ppo train batch size
21 | self.sgd_batch_size = 64 # sgd batch size
22 | # continuous PPO
23 | self.continuous = False # continuous action space
24 | # KL parameter
25 | self.kl_target = 0.1 # target KL divergence
26 | self.kl_lambda = 0.5 # lambda for KL penalty, 0.5 is the default value in the paper
27 | self.kl_beta = 1.5 # beta for KL penalty, 1.5 is the default value in the paper
28 | self.kl_alpha = 2 # alpha for KL penalty, 2 is the default value in the paper
29 | ```
30 |
31 | * `ppo_type`: PPO has two ways to update its loss: the clip method and the KL-penalty method. The clip method is the usual choice nowadays, partly because tuning the KL version is tedious and partly because clipping covers almost all needs (see the sketch after this list)
32 | * `eps_clip`: the clip parameter, usually set between 0.1 and 0.2
33 | * `entropy_coef`: the coefficient of the policy entropy loss; increasing it improves the actor's stability. Keeping it between 0.0001 and 0.02 is fine, and setting it to 0 makes little difference on some problems
34 | * `update_freq`: the update frequency. In JoyRL it means updating every n steps, and it usually scales with the environment's maximum steps per episode. For example, CartPole-v1 allows at most 500 steps per episode, so the update frequency can be set to 50, 100, 200, and so on; this parameter needs to be tuned from experience
35 | * `k_epochs`: the number of epochs per update; it should be neither too large nor too small. Too large slows convergence and risks overfitting; too small risks underfitting
36 | * `train_batch_size`: usually fairly large (100 here is mainly for convenience). A larger batch size gives more accurate training results but slower training
37 | * `sgd_batch_size`: the mini-batch size, usually 64 or 128. A very small batch size trains fast but inaccurately, so mini-batch computation is the usual compromise
38 | * `continuous`: whether the action space is continuous
39 | * `kl_target`: the target KL divergence
40 | * `kl_lambda`: the coefficient of the KL penalty term; 0.5 is the default value in the paper
41 | * `kl_beta`: the coefficient on the KL target; the default value is 1.5
42 | * `kl_alpha`: the update parameter for the KL penalty coefficient; the default value is 2
43 |
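44 | A minimal sketch of the clipped surrogate objective that the `clip` variant optimizes (illustrative PyTorch code; the tensor names are assumptions, not JoyRL's actual API):
45 |
46 | ```python
47 | import torch
48 |
49 | def ppo_clip_loss(new_log_probs, old_log_probs, advantages, eps_clip=0.2):
50 |     ratio = torch.exp(new_log_probs - old_log_probs)  # r_t(theta) = pi_new / pi_old
51 |     surr1 = ratio * advantages
52 |     surr2 = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantages
53 |     return -torch.min(surr1, surr2).mean()  # maximize the clipped surrogate = minimize its negative
54 | ```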
--------------------------------------------------------------------------------
/docs/QLearning.md:
--------------------------------------------------------------------------------
1 |
2 | ## Algorithm Parameter Description
3 |
4 | The Q-learning parameters are as follows:
5 |
6 | ```python
7 | class AlgoConfig:
8 | def __init__(self) -> None:
9 | ## setting epsilon_start=epsilon_end gives a fixed epsilon equal to epsilon_end
10 | self.epsilon_start = 0.95 # epsilon start value
11 | self.epsilon_end = 0.01 # epsilon end value
12 | self.epsilon_decay = 300 # epsilon decay rate
13 | self.gamma = 0.90 # reward discount factor
14 | self.lr = 0.1 # learning rate
15 | ```
16 |
17 | * Tune `epsilon_decay` so that `epsilon` does not decay too early during training (a sketch of the decay schedule follows below).
18 | * Since the environments faced by traditional reinforcement learning algorithms are relatively simple, `gamma` is usually set to `0.9` and `lr` can be set fairly large; there is no need to worry much about overfitting.
19 |
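20 | A minimal sketch of a common exponential decay schedule consistent with the parameters above (illustrative; not necessarily the exact JoyRL implementation):
21 |
22 | ```python
23 | import math
24 |
25 | epsilon_start, epsilon_end, epsilon_decay = 0.95, 0.01, 300
26 |
27 | def epsilon(sample_count: int) -> float:
28 |     # decays from epsilon_start towards epsilon_end as sample_count grows
29 |     return epsilon_end + (epsilon_start - epsilon_end) * math.exp(-sample_count / epsilon_decay)
30 | ```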
--------------------------------------------------------------------------------
/docs/Rainbow.md:
--------------------------------------------------------------------------------
1 | ## Algorithm Parameter Description
2 |
3 | The Rainbow parameters are as follows:
4 |
5 | ```python
6 | class AlgoConfig(DefaultConfig):
7 | def __init__(self):
8 | self.gamma = 0.99 # discount factor
9 | self.tau = 1.0 # 1.0 means hard update
10 | self.hidden_dim = 256 # hidden_dim for MLP
11 | self.Vmin = 0. # support of C51
12 | self.Vmax = 200. # support of C51
13 | self.num_atoms = 51 # support of C51
14 | self.support = torch.linspace(self.Vmin, self.Vmax, self.num_atoms) # support of C51
15 | self.delta_z = (self.Vmax - self.Vmin) / (self.num_atoms - 1) # support of C51
16 |
17 | self.n_step = 1 #the n_step for N-step DQN
18 | self.batch_size = 32 # batch size
19 | self.lr = 0.0001 # learning rate
20 | self.target_update = 200 # target network update frequency
21 | self.memory_capacity = 10000 # size of replay buffer
22 | self.epsilon_start = 0.95 # epsilon start value
23 | self.epsilon_end = 0.01 # epsilon end value
24 | self.epsilon_decay = 500 # epsilon decay rate
25 | ```
26 |
27 | The configuration is basically the same as C51; the added `n_step` is the step length for n-step DQN. As in C51, `Vmin` is the minimum value of the support, `Vmax` the maximum value, `num_atoms` the number of support atoms, `support` the set of values the distribution can take, and `delta_z` the spacing between adjacent support atoms.
28 |
29 | The remaining parameters are essentially the same as in DQN and are not repeated here.
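30 |
31 | A minimal sketch of the n-step return that `n_step` controls (illustrative; the rewards and bootstrap value are made-up numbers):
32 |
33 | ```python
34 | gamma, n_step = 0.99, 3
35 | rewards = [1.0, 0.0, 2.0]  # r_t, r_{t+1}, r_{t+2}
36 | bootstrap_value = 5.0  # e.g. max_a Q_target(s_{t+n}, a)
37 |
38 | # G_t^(n) = sum_{k=0}^{n-1} gamma^k * r_{t+k} + gamma^n * bootstrap_value
39 | n_step_return = sum(gamma ** k * r for k, r in enumerate(rewards)) + gamma ** n_step * bootstrap_value
40 | ```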
--------------------------------------------------------------------------------
/docs/_sidebar.md:
--------------------------------------------------------------------------------
1 | - [README](/)
2 | - [Basic Concept](./basic_concept.md)
3 | - [Usage](./usage.md)
4 | - [HyperParameter Config](./hyper_cfg.md)
5 | - [General Config](./general_cfg.md)
6 | - [Algo Config](./algo_cfg.md)
7 | - [API](./api.md)
8 | - [Contribution](./CONTRIBUTE.md)
--------------------------------------------------------------------------------
/docs/algo_cfg.md:
--------------------------------------------------------------------------------
1 | The default parameter settings for each algorithm are stored in `joyrl/algos/[algo_name]/config.py`, as follows:
2 |
3 | ### Q-learning
4 |
5 | ```python
6 | class AlgoConfig:
7 | def __init__(self) -> None:
8 | self.epsilon_start = 0.95 # epsilon start value
9 | self.epsilon_end = 0.01 # epsilon end value
10 | self.epsilon_decay = 300 # epsilon decay rate
11 | self.gamma = 0.90 # discount factor
12 | self.lr = 0.1 # learning rate
13 | ```
14 |
15 | Note:
16 |
17 | * Set `epsilon_start=epsilon_end` can obtain fixed `epsilon=epsilon_end`.
18 | * Adjust `epsilon_decay` appropriately to ensure that `epsilon` will not decay too early during the training process.
19 | * Since the traditional reinforcement learning algorithm faces a relatively simple environment, `gamma` is generally set to `0.9`, and `lr` can be set to a relatively large value such as `0.1`, and there is no need to worry too much about overfitting.
20 |
21 | ### DQN
22 |
23 | ```python
24 | class AlgoConfig(DefaultConfig):
25 | def __init__(self) -> None:
26 | # set epsilon_start=epsilon_end can obtain fixed epsilon=epsilon_end
27 | self.epsilon_start = 0.95 # epsilon start value
28 | self.epsilon_end = 0.01 # epsilon end value
29 | self.epsilon_decay = 500 # epsilon decay rate
30 | self.gamma = 0.95 # discount factor
31 | self.lr = 0.0001 # learning rate
32 | self.max_buffer_size = 100000 # size of replay buffer
33 | self.batch_size = 64 # batch size
34 | self.target_update = 4 # target network update frequency
35 | self.value_layers = [
36 | {'layer_type': 'linear', 'layer_dim': ['n_states', 256],
37 | 'activation': 'relu'},
38 | {'layer_type': 'linear', 'layer_dim': [256, 256],
39 | 'activation': 'relu'},
40 | {'layer_type': 'linear', 'layer_dim': [256, 'n_actions'],
41 | 'activation': 'none'}]
42 | ```
--------------------------------------------------------------------------------
/docs/api.md:
--------------------------------------------------------------------------------
1 | # API
2 |
3 | Coming soon
--------------------------------------------------------------------------------
/docs/dev/offline_run.md:
--------------------------------------------------------------------------------
1 |
2 | ### Offline Run
3 |
4 | If you want to run from source code for debugging or other purposes, you can clone this repo:
5 |
6 | ```bash
7 | git clone https://github.com/datawhalechina/joyrl.git
8 | ```
9 |
10 | Then install the dependencies:
11 |
12 | ```bash
13 | pip install -r requirements.txt
14 | # if you have installed joyrl, you'd better uninstall it to avoid conflicts
15 | pip uninstall joyrl
16 | ```
17 |
18 | Then you can run the following command to train a DQN agent on CartPole-v1 environment.
19 |
20 | ```bash
21 | python offline_run.py --yaml ./presets/ClassControl/CartPole-v1/CartPole-v1_DQN.yaml
22 | ```
--------------------------------------------------------------------------------
/docs/docsify.md:
--------------------------------------------------------------------------------
1 | Installation
2 |
3 | ```bash
4 | # install node (macOS)
5 | brew install node
6 | # Windows: install from the link below, then restart
7 | https://nodejs.org/en/
8 | # install docsify-cli globally; without a proxy you can use the cnpm mirror for faster installation
9 | npm i docsify-cli -g
10 | # initialize: this generates README and index.html under the docs folder
11 | docsify init ./docs
12 | ```
13 |
14 | For local deployment, the preview site opens at http://localhost:3000
15 |
16 | ```bash
17 | cd docs
18 | docsify serve
19 | ## or
20 | docsify serve ./docs
21 | ```
22 |
23 | Configure the sidebar
24 |
25 | Add `loadSidebar: true` to `window.$docsify` in index.html, then create `_sidebar.md` under docs
26 |
27 | ```html
28 | window.$docsify = {
29 | name: '',
30 | repo: '',
31 | loadSidebar: true,
32 | }
33 | ```
34 |
35 | LaTeX formula rendering
36 |
37 | https://github.com/scruel/docsify-latex
38 |
39 | ## Writing Conventions
40 |
41 | 1. Do not put `\` inside code blocks, otherwise the formulas after it will not render
42 | 2. Wrap English words and inline numbers in `$\text{}$`
43 | 3. Number equations with `\tag{}`; reference an equation or figure before the equation or figure itself, for easier reading
44 | ### References
45 |
46 | [docsify中文文档](https://jingping-ye.github.io/docsify-docs-zh/#/%E5%BF%AB%E9%80%9F%E4%B8%8A%E6%89%8B/%E5%BC%80%E5%A7%8B)
--------------------------------------------------------------------------------
/docs/figs/DQN_pseu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/figs/DQN_pseu.png
--------------------------------------------------------------------------------
/docs/figs/branch_merge.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/figs/branch_merge.png
--------------------------------------------------------------------------------
/docs/figs/collector.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/figs/collector.png
--------------------------------------------------------------------------------
/docs/figs/data_flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/figs/data_flow.png
--------------------------------------------------------------------------------
/docs/figs/interaction_mdp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/figs/interaction_mdp.png
--------------------------------------------------------------------------------
/docs/figs/interactor_learner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/figs/interactor_learner.png
--------------------------------------------------------------------------------
/docs/figs/overall_framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/figs/overall_framework.png
--------------------------------------------------------------------------------
/docs/figs/recorder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/figs/recorder.png
--------------------------------------------------------------------------------
/docs/figs/tasks_dir.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/figs/tasks_dir.png
--------------------------------------------------------------------------------
/docs/hyper_cfg.md:
--------------------------------------------------------------------------------
1 | # HyperParameter Config
2 |
3 | This part mainly introduces the parameter configuration and description of `JoyRL`.
4 |
5 |
--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/docs/multiprocessing.md:
--------------------------------------------------------------------------------
1 | [Official Python multiprocessing tutorial](https://docs.python.org/zh-cn/3/library/multiprocessing.html)
2 |
3 | Common misconceptions:
4 |
5 | * The number of CPU cores is not the number of processes the machine can support; it can actually run more, typically two (hardware threads) per core
6 | * Processes are also not the same thing as threads
7 |
8 | Run the following code to check the maximum number of processes (logical CPUs) your machine supports:
9 | ```python
10 | import multiprocessing as mp
11 | print(mp.cpu_count())
12 | ```
13 |
14 | ## Ways to Start Child Processes
15 |
16 | There are generally three: fork, spawn, and forkserver. On Unix the default is fork; Windows does not support fork and must use spawn.
17 |
18 | In fork mode, apart from the resources needed to start, the child process inherits variables, packages, data, and so on from the parent, so it starts quickly; but since most of the data it uses belongs to the parent, it is a less safe mode.
19 |
20 | In spawn mode, the child process is created from scratch and its variables, packages, data, and so on are copied from the parent, so startup is slower but the mode is safer.
21 |
22 | ```python
23 | import multiprocessing as mp
24 | print(mp.get_all_start_methods()) # list all available ways to start child processes
25 | print(mp.get_start_method()) # show the system's current default start method
26 | ```
27 |
28 |
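29 | A minimal, self-contained example of explicitly selecting the spawn start method (recommended for cross-platform code; unrelated to JoyRL itself):
30 |
31 | ```python
32 | import multiprocessing as mp
33 |
34 | def worker(name):
35 |     print(f"hello from {name}")
36 |
37 | if __name__ == "__main__":
38 |     mp.set_start_method("spawn")  # must be called once, before any Process is created
39 |     p = mp.Process(target=worker, args=("child",))
40 |     p.start()
41 |     p.join()
42 | ```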
--------------------------------------------------------------------------------
/docs/ray_DRL/results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/ray_DRL/results.png
--------------------------------------------------------------------------------
/examples/quick_start_1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-12-22 13:42:56
7 | LastEditor: JiangJi
8 | LastEditTime: 2023-12-22 13:49:09
9 | Discription:
10 | '''
11 | import joyrl
12 |
13 | if __name__ == "__main__":
14 | print(joyrl.__version__)
15 | yaml_path = "./presets/ClassControl/CartPole-v1/CartPole-v1_DQN.yaml"
16 | joyrl.run(yaml_path = yaml_path)
--------------------------------------------------------------------------------
/examples/quick_start_2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-12-23 10:45:01
7 | LastEditor: JiangJi
8 | LastEditTime: 2023-12-24 00:05:32
9 | Discription:
10 | '''
11 | import joyrl
12 |
13 | class GeneralConfig:
14 | ''' General parameters for running
15 | '''
16 | def __init__(self) -> None:
17 | # basic settings
18 | self.env_name = "gym" # name of environment
19 | self.algo_name = "DQN" # name of algorithm
20 | self.mode = "train" # train, test
21 | self.interactor_mode = "dummy" # dummy, only works when learner_mode is serial
22 | self.learner_mode = "serial" # serial, parallel, whether workers and learners are in parallel
23 | self.device = "cpu" # device to use
24 | self.seed = 0 # random seed
25 | self.max_episode = -1 # number of episodes for training, set -1 to keep running
26 | self.max_step = 200 # max steps per episode, set -1 means unlimited steps
27 | self.collect_traj = False # if collect trajectory or not
28 | # multiprocessing settings
29 | self.n_interactors = 1 # number of workers
30 | # online evaluation settings
31 | self.online_eval = True # online evaluation or not
32 | self.online_eval_episode = 10 # online eval episodes
33 | self.model_save_fre = 500 # model save frequency per update step
34 | # load model settings
35 | self.load_checkpoint = False # if load checkpoint
36 | self.load_path = "Train_single_CartPole-v1_DQN_20230515-211721" # path to load model
37 | self.load_model_step = 'best' # load model at which step
38 |
39 | class EnvConfig(object):
40 | def __init__(self) -> None:
41 | self.id = "CartPole-v1" # environment id
42 |
43 | if __name__ == "__main__":
44 | general_cfg = GeneralConfig()
45 | env_cfg = EnvConfig()
46 | joyrl.run(general_cfg = general_cfg, env_cfg = env_cfg)
--------------------------------------------------------------------------------
/joyrl/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-01-01 16:20:49
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-12-19 13:44:27
9 | Discription:
10 | '''
11 | from joyrl import algos, framework, envs
12 | from joyrl.run import run
13 |
14 | __version__ = "0.6.8"
15 |
16 | __all__ = [
17 | "algos",
18 | "config",
19 | "envs",
20 | "framework",
21 | "run"
22 | ]
23 |
--------------------------------------------------------------------------------
/joyrl/algos/A3C/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/A3C/__init__.py
--------------------------------------------------------------------------------
/joyrl/algos/A3C/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2024-06-03 13:37:11
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-06-23 17:49:41
9 | Discription:
10 | '''
11 |
12 | class AlgoConfig(object):
13 | def __init__(self):
14 | self.independ_actor = True # whether to use independent actor
15 | self.action_type_list = "continuous" # continuous action space
16 | self.gae_lambda = 0.95 # lambda for GAE
17 | self.gamma = 0.99 # discount factor
18 | self.lr = 0.0001 # for shared optimizer
19 | self.actor_lr = 0.0003 # learning rate for actor, must be specified if share_optimizer is False
20 | self.critic_lr = 0.001 # learning rate for critic, must be specified if share_optimizer is False
21 | self.critic_loss_coef = 0.001 # critic loss coefficient
22 | self.entropy_coef = 0.01 # entropy coefficient
23 | self.batch_size = 256 # ppo train batch size
24 | self.min_policy = 0 # min value for policy (for discrete action space)
25 | self.buffer_type = 'REPLAY_QUE'
26 | self.branch_layers = []
27 | self.merge_layers = []
28 | self.actor_branch_layers = []
29 | self.actor_merge_layers = []
30 | self.critic_branch_layers = []
31 | self.critic_merge_layers = []
32 |
--------------------------------------------------------------------------------
/joyrl/algos/CategoricalDQN/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/CategoricalDQN/__init__.py
--------------------------------------------------------------------------------
/joyrl/algos/CategoricalDQN/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2024-12-18 13:15:04
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-12-19 13:24:11
9 | Discription:
10 | '''
11 | class AlgoConfig():
12 | ''' algorithm parameters
13 | '''
14 | def __init__(self) -> None:
15 | # set epsilon_start=epsilon_end to get fixed epsilon, i.e. epsilon=epsilon_end
16 | self.epsilon_start = 0.95 # epsilon start value
17 | self.epsilon_end = 0.01 # epsilon end value
18 | self.epsilon_decay = 500 # epsilon decay
19 | self.gamma = 0.95 # reward discount factor
20 | self.lr = 0.0001 # learning rate
21 | self.buffer_type = 'REPLAY_QUE' # replay buffer type
22 | self.max_buffer_size = 100000 # replay buffer size
23 | self.batch_size = 256 # batch size
24 | self.target_update = 4 # target network update frequency
25 | self.distributional = True # if use distributional dqn
26 | self.n_atoms = 51 # number of atoms
27 | self.v_min = -10 # min value
28 | self.v_max = 10 # max value
29 | self.enable_soft_update = True # enable soft update
30 | self.tau_soft_update = 0.005 # soft update tau
31 | # value network layers config
32 | # [{'name': 'feature_1', 'layers': [{'layer_type': 'linear', 'layer_size': [256], 'activation': 'relu'}, {'layer_type': 'linear', 'layer_size': [256], 'activation': 'relu'}]}]
33 | self.branch_layers = [
34 | # {
35 | # 'name': 'feature_1',
36 | # 'layers':
37 | # [
38 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
39 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
40 | # ]
41 | # },
42 | # {
43 | # 'name': 'feature_2',
44 | # 'layers':
45 | # [
46 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
47 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
48 | # ]
49 | # }
50 | ]
51 | self.merge_layers = [
52 | # {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'},
53 | # {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'},
54 | ]
55 |
--------------------------------------------------------------------------------
/joyrl/algos/CategoricalDQN/data_handler.py:
--------------------------------------------------------------------------------
1 | from joyrl.algos.base.data_handler import BaseDataHandler
2 |
3 | class DataHandler(BaseDataHandler):
4 | def __init__(self, cfg):
5 | super().__init__(cfg)
6 |
--------------------------------------------------------------------------------
/joyrl/algos/DDPG/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/DDPG/__init__.py
--------------------------------------------------------------------------------
/joyrl/algos/DDPG/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-12-15 13:16:24
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-01-25 12:01:27
9 | Discription:
10 | '''
11 | import numpy as np
12 | class AlgoConfig:
13 | def __init__(self):
14 | self.action_type_list = 'dpg' # action type, dpg: deterministic policy gradient
15 | self.buffer_type = 'REPLAY_QUE' # replay buffer type
16 | self.max_buffer_size = 100000 # replay buffer size
17 | self.batch_size = 128 # batch size
18 | self.gamma = 0.99 # discount factor
19 | self.policy_loss_weight = 0.002 # policy loss weight
20 | self.critic_lr = 1e-3 # learning rate of critic
21 | self.actor_lr = 1e-4 # learning rate of actor
22 | self.tau = 0.001 # soft update parameter
23 | self.value_min = -np.inf # clip min critic value
24 | self.value_max = np.inf # clip max critic value
25 | # self.actor_layers = [
26 | # {'layer_type': 'Linear', 'layer_size': [256], 'activation': 'ReLU'},
27 | # {'layer_type': 'Linear', 'layer_size': [256], 'activation': 'ReLU'},
28 | # ]
29 | # self.critic_layers = [
30 | # {'layer_type': 'Linear', 'layer_size': [256], 'activation': 'ReLU'},
31 | # {'layer_type': 'Linear', 'layer_size': [256], 'activation': 'ReLU'},
32 | # ]
33 | self.branch_layers = []
34 | self.merge_layers = []
35 | self.actor_branch_layers = []
36 | self.actor_merge_layers = []
37 | self.critic_branch_layers = []
38 | self.critic_merge_layers = []
--------------------------------------------------------------------------------
/joyrl/algos/DDPG/data_handler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2024-02-25 15:46:04
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-07-21 14:45:35
9 | Discription:
10 | '''
11 | from joyrl.algos.base.data_handler import BaseDataHandler
12 | import numpy as np
13 | class DataHandler(BaseDataHandler):
14 | def __init__(self, cfg):
15 | super().__init__(cfg)
16 |
--------------------------------------------------------------------------------
/joyrl/algos/DDPG/model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2024-07-20 14:15:24
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-07-21 16:38:14
9 | Discription:
10 | '''
11 | import torch.nn as nn
12 | from joyrl.algos.base.network import *
13 |
14 | class Model(nn.Module):
15 | def __init__(self, cfg ):
16 | super(Model, self).__init__()
17 | state_size_list = cfg.obs_space_info.size
18 | action_size_list = cfg.action_space_info.size
19 | critic_input_size_list = state_size_list+ [[None, len(action_size_list)]]
20 | self.actor = ActorNetwork(cfg, input_size_list = state_size_list)
21 | self.critic = CriticNetwork(cfg, input_size_list = critic_input_size_list)
--------------------------------------------------------------------------------
/joyrl/algos/DQN/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/DQN/__init__.py
--------------------------------------------------------------------------------
/joyrl/algos/DQN/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-12-20 23:39:18
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-06-14 22:44:09
9 | Discription:
10 | '''
11 | class AlgoConfig():
12 | ''' algorithm parameters
13 | '''
14 | def __init__(self) -> None:
15 | # set epsilon_start=epsilon_end to get fixed epsilon, i.e. epsilon=epsilon_end
16 | self.epsilon_start = 0.95 # epsilon start value
17 | self.epsilon_end = 0.01 # epsilon end value
18 | self.epsilon_decay = 500 # epsilon decay
19 | self.gamma = 0.95 # reward discount factor
20 | self.lr = 0.0001 # learning rate
21 | self.buffer_type = 'REPLAY_QUE' # replay buffer type
22 | self.max_buffer_size = 100000 # replay buffer size
23 | self.batch_size = 64 # batch size
24 | self.target_update = 4 # target network update frequency
25 | # value network layers config
26 | # [{'name': 'feature_1', 'layers': [{'layer_type': 'linear', 'layer_size': [256], 'activation': 'relu'}, {'layer_type': 'linear', 'layer_size': [256], 'activation': 'relu'}]}]
27 | self.branch_layers = [
28 | # {
29 | # 'name': 'feature_1',
30 | # 'layers':
31 | # [
32 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
33 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
34 | # ]
35 | # },
36 | # {
37 | # 'name': 'feature_2',
38 | # 'layers':
39 | # [
40 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
41 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
42 | # ]
43 | # }
44 | ]
45 | self.merge_layers = [
46 | # {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'},
47 | # {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'},
48 | ]
49 |
--------------------------------------------------------------------------------
/joyrl/algos/DQN/data_handler.py:
--------------------------------------------------------------------------------
1 | from joyrl.algos.base.data_handler import BaseDataHandler
2 |
3 | class DataHandler(BaseDataHandler):
4 | def __init__(self, cfg):
5 | super().__init__(cfg)
6 |
--------------------------------------------------------------------------------
/joyrl/algos/DoubleDQN/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/DoubleDQN/__init__.py
--------------------------------------------------------------------------------
/joyrl/algos/DoubleDQN/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-03-15 22:04:42
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-06-15 14:03:54
9 | Discription:
10 | '''
11 | class AlgoConfig(object):
12 | ''' algorithm parameters
13 | '''
14 | def __init__(self) -> None:
15 | # set epsilon_start=epsilon_end can obtain fixed epsilon=epsilon_end
16 | self.epsilon_start = 0.95 # epsilon start value
17 | self.epsilon_end = 0.01 # epsilon end value
18 | self.epsilon_decay = 500 # epsilon decay rate
19 | self.gamma = 0.99 # discount factor
20 | self.lr = 0.0001 # learning rate
21 | self.max_buffer_size = 100000 # size of replay buffer
22 | self.batch_size = 64 # batch size
23 | self.target_update = 4 # target network update frequency
24 | # value network layers config
25 | self.branch_layers = [
26 | # {
27 | # 'name': 'feature_1',
28 | # 'layers':
29 | # [
30 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
31 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
32 | # ]
33 | # },
34 | # {
35 | # 'name': 'feature_2',
36 | # 'layers':
37 | # [
38 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
39 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
40 | # ]
41 | # }
42 | ]
43 | self.merge_layers = [
44 | # {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'},
45 | # {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'},
46 | ]
47 |
48 |
--------------------------------------------------------------------------------
/joyrl/algos/DoubleDQN/data_handler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | @Author: John
5 | @Email: johnjim0816@gmail.com
6 | @Date: 2020-06-12 00:50:49
7 | @LastEditor: John
8 | LastEditTime: 2023-12-24 19:57:50
9 | @Discription:
10 | @Environment:
11 | '''
12 | from joyrl.algos.base.data_handler import BaseDataHandler
13 | class DataHandler(BaseDataHandler):
14 | def __init__(self, cfg):
15 | super().__init__(cfg)
--------------------------------------------------------------------------------
/joyrl/algos/DoubleDQN/policy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-12-22 23:02:13
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-06-14 22:49:54
9 | Discription:
10 | '''
11 | import torch
12 | import torch.nn as nn
13 | from joyrl.algos.DQN.policy import Policy as DQNPolicy
14 | class Policy(DQNPolicy):
15 | def __init__(self,cfg) -> None:
16 | super(Policy, self).__init__(cfg)
17 |
18 | def learn(self, **kwargs):
19 | ''' learn policy
20 | '''
21 | self.prepare_data_before_learn(**kwargs)
22 | actor_outputs = self.model(self.states)['actor_outputs']
23 | target_actor_outputs = self.target_model(self.next_states)['actor_outputs']
24 | tot_loss = 0
25 | self.summary_loss = []
26 | for i in range(len(self.action_size_list)):
27 | actual_q_value = actor_outputs[i]['q_value'].gather(1, self.actions[i].long())
28 | next_q_values = self.model(self.next_states)['actor_outputs'][i]['q_value'] # Double DQN: online net selects the next action
29 | next_target_q_values_action = target_actor_outputs[i]['q_value'].gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)) # target net evaluates it
30 | expected_q_value = self.rewards + self.gamma * next_target_q_values_action * (1 - self.dones)
31 | loss_i = nn.MSELoss()(actual_q_value, expected_q_value)
32 | tot_loss += loss_i
33 | self.summary_loss.append(loss_i.item())
34 | self.optimizer.zero_grad()
35 | tot_loss.backward()
36 | # clip to avoid gradient explosion
37 | for param in self.model.parameters():
38 | param.grad.data.clamp_(-1, 1)
39 | self.optimizer.step()
40 | # update target net every C steps
41 | if self.update_step % self.target_update == 0:
42 | self.target_model.load_state_dict(self.model.state_dict())
43 | self.update_step += 1
44 | self.update_summary() # update summary
45 |
46 |
--------------------------------------------------------------------------------
/joyrl/algos/DuelingDQN/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/DuelingDQN/__init__.py
--------------------------------------------------------------------------------
/joyrl/algos/DuelingDQN/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2024-02-25 15:46:04
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-06-14 22:54:08
9 | Discription:
10 | '''
11 | class AlgoConfig(object):
12 | def __init__(self) -> None:
13 | # set epsilon_start=epsilon_end can obtain fixed epsilon=epsilon_end
14 | self.dueling = True # use dueling network
15 | self.epsilon_start = 0.95 # epsilon start value
16 | self.epsilon_end = 0.01 # epsilon end value
17 | self.epsilon_decay = 500 # epsilon decay rate
18 | self.gamma = 0.99 # discount factor
19 | self.lr = 0.0001 # learning rate
20 | self.buffer_type = 'REPLAY_QUE' # replay buffer type
21 | self.max_buffer_size = 100000 # replay buffer size
22 | self.batch_size = 64 # batch size
23 | self.target_update = 4 # target network update frequency
24 | # value network layers config
25 | self.branch_layers = [
26 | # {
27 | # 'name': 'feature_1',
28 | # 'layers':
29 | # [
30 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
31 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
32 | # ]
33 | # },
34 | # {
35 | # 'name': 'feature_2',
36 | # 'layers':
37 | # [
38 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
39 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
40 | # ]
41 | # }
42 | ]
43 | self.merge_layers = [
44 | # {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'},
45 | # {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'},
46 | ]
47 |
--------------------------------------------------------------------------------
/joyrl/algos/DuelingDQN/data_handler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-02-21 20:32:11
7 | LastEditor: JiangJi
8 | LastEditTime: 2023-05-17 11:39:38
9 | Discription:
10 | '''
11 | from joyrl.algos.base.data_handler import BaseDataHandler
12 | class DataHandler(BaseDataHandler):
13 | def __init__(self, cfg):
14 | super().__init__(cfg)
15 |
--------------------------------------------------------------------------------
/joyrl/algos/DuelingDQN/policy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2022-11-14 23:50:59
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-06-14 22:49:36
9 | Discription:
10 | '''
11 | from joyrl.algos.DQN.policy import Policy as DQNPolicy
12 |
13 | class Policy(DQNPolicy):
14 | def __init__(self,cfg) -> None:
15 | super(Policy, self).__init__(cfg)
16 |
--------------------------------------------------------------------------------
/joyrl/algos/HierarchicalDQN/README.md:
--------------------------------------------------------------------------------
1 | # Hierarchical DQN
2 |
3 | ## How It Works
4 |
5 | Hierarchical DQN is a hierarchical reinforcement learning method. Compared with DQN it adds a meta controller:
6 |
7 |
8 |
9 | During learning, the meta controller generates a goal each time, and the controller (i.e. the lower-level actor) then works to reach that goal until done. This is like giving the agent a team captain who is good at setting local objectives and guiding the agent forward, which helps with problems that have long episodes or sparse rewards.
10 |
11 | ## Pseudocode
12 |
13 |
--------------------------------------------------------------------------------
/joyrl/algos/NoisyDQN/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-05-30 23:57:29
7 | LastEditor: JiangJi
8 | LastEditTime: 2023-05-30 23:57:30
9 | Discription:
10 | '''
11 |
--------------------------------------------------------------------------------
/joyrl/algos/NoisyDQN/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-02-21 20:32:11
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-06-14 22:54:02
9 | Discription:
10 | '''
11 | class AlgoConfig(object):
12 | ''' algorithm parameters
13 | '''
14 | def __init__(self) -> None:
15 | # set epsilon_start=epsilon_end to get fixed epsilon, i.e. epsilon=epsilon_end
16 | self.epsilon_start = 0.95 # epsilon start value
17 | self.epsilon_end = 0.01 # epsilon end value
18 | self.epsilon_decay = 500 # epsilon decay
19 | self.gamma = 0.95 # reward discount factor
20 | self.lr = 0.0001 # learning rate
21 | self.max_buffer_size = 100000 # replay buffer size
22 | self.batch_size = 64 # batch size
23 | self.target_update = 4 # target network update frequency
24 | self.branch_layers = []
25 | self.merge_layers = [
26 | # {'layer_type': 'noisy_linear', 'layer_size': [64], 'activation': 'ReLU','std_init': 0.4},
27 | # {'layer_type': 'noisy_linear', 'layer_size': [64], 'activation': 'ReLU','std_init': 0.4},
28 | ]
29 |
--------------------------------------------------------------------------------
/joyrl/algos/NoisyDQN/data_handler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-05-18 13:21:15
7 | LastEditor: JiangJi
8 | LastEditTime: 2023-12-25 00:07:41
9 | Discription:
10 | '''
11 | from joyrl.algos.base.data_handler import BaseDataHandler
12 | class DataHandler(BaseDataHandler):
13 | def __init__(self, cfg):
14 | super().__init__(cfg)
15 |
--------------------------------------------------------------------------------
/joyrl/algos/NoisyDQN/policy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-04-17 11:23:49
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-06-14 22:53:19
9 | Discription:
10 | '''
11 | import torch
12 | import torch.nn as nn
13 | import torch.optim as optim
14 | import torch.nn.functional as F
15 | import numpy as np
16 | import math
17 | import random
18 |
19 | from joyrl.algos.DQN.policy import Policy as DQNPolicy
20 |
21 |
22 | class Policy(DQNPolicy):
23 | def __init__(self,cfg) -> None:
24 | super(Policy, self).__init__(cfg)
25 |
26 |
27 | def learn(self, **kwargs):
28 | ''' train policy
29 | '''
30 | self.prepare_data_before_learn(**kwargs)
31 | self.summary_loss = []
32 | tot_loss = 0
33 | actor_outputs = self.model(self.states)['actor_outputs']
34 | target_actor_outputs = self.target_model(self.next_states)['actor_outputs']
35 | for i in range(len(self.action_size_list)):
36 | actual_q_value = actor_outputs[i]['q_value'].gather(1, self.actions[i].long())
37 | # compute next max q value
38 | next_q_value_max = target_actor_outputs[i]['q_value'].max(1)[0].unsqueeze(dim=1)
39 | # compute target Q values
40 | target_q_value = self.rewards + (1 - self.dones) * self.gamma * next_q_value_max
41 | # compute loss
42 | loss_i = nn.MSELoss()(actual_q_value, target_q_value)
43 | tot_loss += loss_i
44 | self.summary_loss.append(loss_i.item())
45 | self.optimizer.zero_grad()
46 | tot_loss.backward()
47 | # clip to avoid gradient explosion
48 | for param in self.model.parameters():
49 | param.grad.data.clamp_(-1, 1)
50 | self.optimizer.step()
51 | # update target net every C steps
52 | if self.update_step % self.target_update == 0:
53 | self.target_model.load_state_dict(self.model.state_dict())
54 | self.update_step += 1
55 | self.model.reset_noise()
56 | self.target_model.reset_noise()
57 | self.update_summary() # update summary
58 |
59 |
--------------------------------------------------------------------------------
/joyrl/algos/PPO/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/PPO/__init__.py
--------------------------------------------------------------------------------
/joyrl/algos/PPO/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-02-20 21:53:39
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-06-23 17:50:12
9 | Discription:
10 | '''
11 | class AlgoConfig(object):
12 | def __init__(self):
13 | self.independ_actor = False # whether to use independent actor
14 | # whether actor and critic share the same optimizer
15 | self.ppo_type = 'clip' # clip or kl
16 | self.eps_clip = 0.2 # clip parameter for PPO
17 | self.gae_lambda = 0.95 # lambda for GAE
18 | # for kl penalty version of PPO
19 | self.kl_target = 0.1 # target KL divergence
20 | self.kl_lambda = 0.5 # lambda for KL penalty, 0.5 is the default value in the paper
21 | self.kl_beta = 1.5 # beta for KL penalty, 1.5 is the default value in the paper
22 | self.kl_alpha = 2 # alpha for KL penalty, 2 is the default value in the paper
23 | self.action_type_list = "continuous" # continuous action space
24 | self.return_form = 'mc' # 'mc' or 'td' or 'gae'
25 | self.gamma = 0.99 # discount factor
26 | self.k_epochs = 4 # update policy for K epochs
27 | self.lr = 0.0001 # for shared optimizer
28 | self.actor_lr = 0.0003 # learning rate for actor, must be specified if share_optimizer is False
29 | self.critic_lr = 0.001 # learning rate for critic, must be specified if share_optimizer is False
30 | self.critic_loss_coef = 0.001 # critic loss coefficient
31 | self.entropy_coef = 0.01 # entropy coefficient
32 | self.batch_size = 256 # ppo train batch size
33 | self.sgd_batch_size = 32 # sgd batch size
34 | self.min_policy = 0 # min value for policy (for discrete action space)
35 | self.buffer_type = 'REPLAY_QUE'
36 | self.branch_layers = []
37 | self.merge_layers = []
38 | self.actor_branch_layers = []
39 | self.actor_merge_layers = []
40 | self.critic_branch_layers = []
41 | self.critic_merge_layers = []
42 |
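For reference, `eps_clip` enters the standard PPO clipped surrogate; the sketch below shows that objective in isolation (generic formulation, not copied from this repository's policy.py):

```python
# Generic PPO-clip surrogate controlled by eps_clip (standard formulation).
import torch

def ppo_clip_loss(new_log_probs, old_log_probs, advantages, eps_clip=0.2):
    ratio = torch.exp(new_log_probs - old_log_probs)  # pi_new(a|s) / pi_old(a|s)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - eps_clip, 1.0 + eps_clip) * advantages
    return -torch.min(surr1, surr2).mean()  # maximizing the surrogate = minimizing its negative
```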
--------------------------------------------------------------------------------
/joyrl/algos/QLearning/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/QLearning/__init__.py
--------------------------------------------------------------------------------
/joyrl/algos/QLearning/config.py:
--------------------------------------------------------------------------------
1 |
2 | class AlgoConfig():
3 | ''' algorithm parameters
4 | '''
5 | def __init__(self) -> None:
6 | # set epsilon_start=epsilon_end to get fixed epsilon, i.e. epsilon=epsilon_end
7 | self.epsilon_start = 0.95 # epsilon start value
8 | self.epsilon_end = 0.01 # epsilon end value
9 | self.epsilon_decay = 500 # epsilon decay
10 | self.gamma = 0.95 # reward discount factor
11 | self.lr = 0.0001 # learning rate
12 |
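policy.py below decays epsilon exponentially from epsilon_start to epsilon_end with time constant epsilon_decay; the snippet below simply evaluates that schedule for the defaults above:

```python
# epsilon = epsilon_end + (epsilon_start - epsilon_end) * exp(-sample_count / epsilon_decay)
import math

epsilon_start, epsilon_end, epsilon_decay = 0.95, 0.01, 500
for t in (0, 100, 500, 1000, 2000):
    eps = epsilon_end + (epsilon_start - epsilon_end) * math.exp(-t / epsilon_decay)
    print(t, round(eps, 3))
# 0 0.95, 100 0.78, 500 0.356, 1000 0.137, 2000 0.027
```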
--------------------------------------------------------------------------------
/joyrl/algos/QLearning/data_handler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-12-24 19:13:11
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-01-13 16:03:27
9 | Discription:
10 | '''
11 | import numpy as np
12 | from joyrl.algos.base.data_handler import BaseDataHandler
13 |
14 |
15 | class DataHandler(BaseDataHandler):
16 | def __init__(self,cfg) -> None:
17 | self.cfg = cfg
18 | self.buffer = []
19 | self.data_after_train = {}
20 | def add_exps(self, exps):
21 | ''' add transition to buffer
22 | '''
23 | self.buffer.append(exps)
24 |
25 | def sample_training_data(self):
26 | ''' sample training data from buffer
27 | '''
28 | if len(self.buffer) == 0:
29 | return None
30 | exp = self.buffer.pop()[0]
31 | return self._handle_exps_before_train(exp)
32 | def _handle_exps_before_train(self, exp, **kwargs):
33 | ''' convert exps to training data
34 | '''
35 | state = np.array(exp.state)
36 | action = np.array(exp.action)
37 | reward = np.array(exp.reward)
38 | next_state = np.array(exp.next_state)
39 | done = np.array(exp.done)
40 | data = {'state': state, 'action': action, 'reward': reward, 'next_state': next_state, 'done': done}
41 | return data
42 | def handle_exps_after_train(self):
43 | ''' handle exps after train
44 | '''
45 | pass
46 |
--------------------------------------------------------------------------------
/joyrl/algos/QLearning/policy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-12-24 15:09:47
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-01-13 18:26:43
9 | Discription:
10 | '''
11 | import math
12 | import numpy as np
13 | from collections import defaultdict
14 | from joyrl.algos.base.policy import ToyPolicy
15 |
16 | class Policy(ToyPolicy):
17 | def __init__(self,cfg) -> None:
18 | super(Policy, self).__init__(cfg)
19 | self.lr = cfg.lr
20 | self.gamma = cfg.gamma
21 | self.epsilon = cfg.epsilon_start
22 | self.epsilon_start = cfg.epsilon_start
23 | self.epsilon_end = cfg.epsilon_end
24 | self.epsilon_decay = cfg.epsilon_decay
25 | self.Q_table = defaultdict(lambda: np.zeros(self.n_actions))
26 | self.sample_count = 0
27 | self.create_summary()
28 |
29 | def sample_action(self, state, **kwargs):
30 | self.sample_count += 1
31 | self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
32 | math.exp(-1. * self.sample_count / self.epsilon_decay)
33 | if np.random.uniform(0, 1) > self.epsilon:
34 | action = np.argmax(self.Q_table[str(state)]) # select the action with max Q value
35 | else:
36 | action = np.random.choice(self.n_actions) # random select an action
37 | return action
38 |
39 | def predict_action(self, state, **kwargs):
40 | action = np.argmax(self.Q_table[str(state)])
41 | return action
42 |
43 | def learn(self, **kwargs):
44 | state, action, reward, next_state, done = kwargs.get('state'), kwargs.get('action'), kwargs.get('reward'), kwargs.get('next_state'), kwargs.get('done')
45 | Q_predict = self.Q_table[str(state)][action]
46 | if done:
47 | Q_target = reward
48 | else:
49 | Q_target = reward + self.gamma * np.max(self.Q_table[str(next_state)])
50 | self.Q_table[str(state)][action] += self.lr * (Q_target - Q_predict)
51 | self.loss = (Q_target - Q_predict) ** 2
52 | self.update_summary() # update summary
53 |
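One worked step of the tabular update in learn() above, with made-up numbers (a larger lr than the default so the change is visible):

```python
# Q_target = reward + gamma * max_a' Q[next_state][a'];  Q[s][a] += lr * (Q_target - Q[s][a])
import numpy as np

lr, gamma = 0.1, 0.95
Q_s = np.array([0.0, 0.2, 0.0, 0.0])       # Q values of the current state
Q_next = np.array([0.0, 0.0, 0.5, 0.0])    # Q values of the next state
action, reward = 1, -1.0
Q_target = reward + gamma * Q_next.max()       # -1 + 0.95 * 0.5 = -0.525
Q_s[action] += lr * (Q_target - Q_s[action])   # 0.2 + 0.1 * (-0.725)
print(round(Q_s[action], 4))                   # 0.1275
```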
--------------------------------------------------------------------------------
/joyrl/algos/SAC-S/sac.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.optim as optim
3 | import torch.nn as nn
4 | import numpy as np
5 | class SAC:
6 | def __init__(self,n_actions,models,memory,cfg):
7 | self.device = cfg.device
self.n_actions = n_actions
self.memory = memory  # replay buffer; update() samples from it
self.batch_size = cfg.batch_size  # sample size used by update(); assumed to be provided in cfg
8 | self.value_net = models['ValueNet'].to(self.device) # $\psi$
9 | self.target_value_net = models['ValueNet'].to(self.device) # $\bar{\psi}$
10 | self.soft_q_net = models['SoftQNet'].to(self.device) # $\theta$
11 | self.policy_net = models['PolicyNet'].to(self.device) # $\phi$
12 | self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=cfg.value_lr)
13 | self.soft_q_optimizer = optim.Adam(self.soft_q_net.parameters(), lr=cfg.soft_q_lr)
14 | self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.policy_lr)
15 | for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
16 | target_param.data.copy_(param.data)
17 | self.value_criterion = nn.MSELoss()
18 | self.soft_q_criterion = nn.MSELoss()
19 | def update(self):
20 | # sample a batch of transitions from replay buffer
21 | state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
22 | self.batch_size)
23 | state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float) # shape(batchsize,n_states)
24 | action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) # shape(batchsize,1)
25 | reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1) # shape(batchsize)
26 | next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float) # shape(batchsize,n_states)
27 | done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1) # shape(batchsize,1)
28 |
--------------------------------------------------------------------------------
/joyrl/algos/SAC/sacd_cnn.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/SAC/sacd_cnn.py
--------------------------------------------------------------------------------
/joyrl/algos/Sarsa/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/Sarsa/__init__.py
--------------------------------------------------------------------------------
/joyrl/algos/Sarsa/config.py:
--------------------------------------------------------------------------------
1 |
2 | class AlgoConfig():
3 | ''' algorithm parameters
4 | '''
5 | def __init__(self) -> None:
6 | # set epsilon_start=epsilon_end to get fixed epsilon, i.e. epsilon=epsilon_end
7 | self.epsilon_start = 0.95 # epsilon start value
8 | self.epsilon_end = 0.01 # epsilon end value
9 | self.epsilon_decay = 500 # epsilon decay
10 | self.gamma = 0.95 # reward discount factor
11 | self.lr = 0.0001 # learning rate
12 |
--------------------------------------------------------------------------------
/joyrl/algos/Sarsa/data_handler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-12-24 19:13:11
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-01-13 16:03:27
9 | Discription:
10 | '''
11 | import numpy as np
12 | from joyrl.algos.base.data_handler import BaseDataHandler
13 |
14 |
15 | class DataHandler(BaseDataHandler):
16 | def __init__(self,cfg) -> None:
17 | self.cfg = cfg
18 | self.buffer = []
19 | self.data_after_train = {}
20 | def add_exps(self, exps):
21 | ''' add transition to buffer
22 | '''
23 | self.buffer.append(exps)
24 |
25 | def sample_training_data(self):
26 | ''' sample training data from buffer
27 | '''
28 | if len(self.buffer) == 0:
29 | return None
30 | exp = self.buffer.pop()[0]
31 | return self._handle_exps_before_train(exp)
32 | def _handle_exps_before_train(self, exp, **kwargs):
33 | ''' convert exps to training data
34 | '''
35 | state = np.array(exp.state)
36 | action = np.array(exp.action)
37 | reward = np.array(exp.reward)
38 | next_state = np.array(exp.next_state)
39 | done = np.array(exp.done)
40 | data = {'state': state, 'action': action, 'reward': reward, 'next_state': next_state, 'done': done}
41 | return data
42 | def handle_exps_after_train(self):
43 | ''' handle exps after train
44 | '''
45 | pass
46 |
--------------------------------------------------------------------------------
/joyrl/algos/Sarsa/policy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-12-24 15:09:47
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-01-13 18:37:13
9 | Discription:
10 | '''
11 | import math
12 | import numpy as np
13 | from collections import defaultdict
14 | from joyrl.algos.base.policy import ToyPolicy
15 |
16 | class Policy(ToyPolicy):
17 | def __init__(self,cfg) -> None:
18 | super(Policy, self).__init__(cfg)
19 | self.lr = cfg.lr
20 | self.gamma = cfg.gamma
21 | self.epsilon = cfg.epsilon_start
22 | self.epsilon_start = cfg.epsilon_start
23 | self.epsilon_end = cfg.epsilon_end
24 | self.epsilon_decay = cfg.epsilon_decay
25 | self.Q_table = defaultdict(lambda: np.zeros(self.n_actions))
26 | self.sample_count = 0
27 | self.next_action = None
28 | self.create_summary()
29 |
30 | def sample_action(self, state, **kwargs):
31 | self.sample_count += 1
32 | self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
33 | math.exp(-1. * self.sample_count / self.epsilon_decay)
34 | if np.random.uniform(0, 1) > self.epsilon:
35 | action = self.predict_action(state)
36 | if self.next_action is not None:
37 | action = self.next_action
38 | self.next_action = None
39 | else:
40 | action = np.random.choice(self.n_actions) # random select an action
41 | return action
42 |
43 | def predict_action(self, state, **kwargs):
44 | action = np.argmax(self.Q_table[str(state)])
45 | return action
46 |
47 | def learn(self, **kwargs):
48 | state, action, reward, next_state, done = kwargs.get('state'), kwargs.get('action'), kwargs.get('reward'), kwargs.get('next_state'), kwargs.get('done')
49 | Q_predict = self.Q_table[str(state)][action]
50 | self.next_action = self.predict_action(next_state)
51 | if done:
52 | Q_target = reward
53 | else:
54 | Q_target = reward + self.gamma * self.Q_table[str(next_state)][self.next_action]
55 | self.Q_table[str(state)][action] += self.lr * (Q_target - Q_predict)
56 | self.loss = (Q_target - Q_predict) ** 2
57 | self.update_summary() # update summary
58 |
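The only difference from QLearning/policy.py is the bootstrap target: SARSA uses the action the epsilon-greedy policy will actually take next, while Q-learning uses the greedy action. A small numeric contrast:

```python
# SARSA (on-policy) vs Q-learning (off-policy) targets for the same transition.
import numpy as np

gamma = 0.95
Q_next = np.array([0.1, 0.4, -0.2, 0.0])   # Q values of the next state
reward, next_action = -1.0, 2              # next_action sampled by the behaviour policy
sarsa_target = reward + gamma * Q_next[next_action]   # -1 + 0.95 * (-0.2) = -1.19
qlearning_target = reward + gamma * Q_next.max()      # -1 + 0.95 * 0.4   = -0.62
```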
--------------------------------------------------------------------------------
/joyrl/algos/SoftActorCritic/env_wrapper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2021-04-29 12:52:11
7 | LastEditor: JiangJi
8 | LastEditTime: 2021-12-22 15:36:36
9 | Discription:
10 | Environment:
11 | '''
12 | import gym
13 | import numpy as np
14 |
15 | class NormalizedActions(gym.ActionWrapper):
16 | def action(self, action):
17 | low = self.action_space.low
18 | high = self.action_space.high
19 |
20 | action = low + (action + 1.0) * 0.5 * (high - low)
21 | action = np.clip(action, low, high)
22 |
23 | return action
24 |
25 | def reverse_action(self, action):
26 | low = self.action_space.low
27 | high = self.action_space.high
28 | action = 2 * (action - low) / (high - low) - 1
29 | action = np.clip(action, low, high)
30 | return action
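A minimal usage sketch, assuming the classic `gym` API this wrapper was written for (reset returning a state, step returning four values) and a continuous-action task such as Pendulum:

```python
# The agent always emits actions in [-1, 1]; the wrapper rescales them to the
# environment's [low, high] box, e.g. [-2, 2] for Pendulum.
import gym
import numpy as np

env = NormalizedActions(gym.make('Pendulum-v1'))
state = env.reset()
next_state, reward, done, info = env.step(np.array([0.5]))  # 0.5 in [-1, 1] -> 1.0 in [-2, 2]
```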
--------------------------------------------------------------------------------
/joyrl/algos/SoftQ/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2024-07-30 13:40:26
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-07-30 13:40:27
9 | Discription:
10 | '''
11 |
--------------------------------------------------------------------------------
/joyrl/algos/SoftQ/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-12-20 23:39:18
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-07-30 13:40:49
9 | Discription:
10 | '''
11 | class AlgoConfig():
12 | ''' algorithm parameters
13 | '''
14 | def __init__(self) -> None:
15 | # set epsilon_start=epsilon_end to get fixed epsilon, i.e. epsilon=epsilon_end
16 | self.epsilon_start = 0.95 # epsilon start value
17 | self.epsilon_end = 0.01 # epsilon end value
18 | self.epsilon_decay = 500 # epsilon decay
19 | self.gamma = 0.95 # reward discount factor
20 | self.alpha = 0.4 # temperature parameter of softmax
21 | self.lr = 0.0001 # learning rate
22 | self.buffer_type = 'REPLAY_QUE' # replay buffer type
23 | self.max_buffer_size = 100000 # replay buffer size
24 | self.batch_size = 64 # batch size
25 | self.target_update = 4 # target network update frequency
26 | # value network layers config
27 | # [{'name': 'feature_1', 'layers': [{'layer_type': 'linear', 'layer_size': [256], 'activation': 'relu'}, {'layer_type': 'linear', 'layer_size': [256], 'activation': 'relu'}]}]
28 | self.branch_layers = [
29 | # {
30 | # 'name': 'feature_1',
31 | # 'layers':
32 | # [
33 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
34 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
35 | # ]
36 | # },
37 | # {
38 | # 'name': 'feature_2',
39 | # 'layers':
40 | # [
41 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
42 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
43 | # ]
44 | # }
45 | ]
46 | self.merge_layers = [
47 | # {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'},
48 | # {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'},
49 | ]
50 |
--------------------------------------------------------------------------------
/joyrl/algos/SoftQ/data_handler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2024-07-30 13:40:11
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-07-30 13:40:12
9 | Discription:
10 | '''
11 | from joyrl.algos.base.data_handler import BaseDataHandler
12 |
13 | class DataHandler(BaseDataHandler):
14 | def __init__(self, cfg):
15 | super().__init__(cfg)
16 |
--------------------------------------------------------------------------------
/joyrl/algos/TD3/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/TD3/__init__.py
--------------------------------------------------------------------------------
/joyrl/algos/TD3/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-12-25 00:37:19
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-01-27 11:54:20
9 | Discription:
10 | '''
11 | class AlgoConfig:
12 | def __init__(self) -> None:
13 | self.action_type_list = 'dpg' # action type, dpg: deterministic policy gradient
14 | self.buffer_type = 'REPLAY_QUE' # replay buffer type
15 | self.explore_steps = 100 # exploration steps before training
16 | self.policy_freq = 2 # policy update frequency
17 | self.actor_lr = 1e-4 # actor learning rate 3e-4
18 | self.critic_lr = 1e-3 # critic learning rate
19 | self.gamma = 0.99 # discount factor
20 | self.tau = 0.005 # target smoothing coefficient
21 | self.policy_noise = 0.2 # noise added to target policy during critic update
22 | self.expl_noise = 0.1 # std of Gaussian exploration noise
23 | self.noise_clip = 0.5 # range to clip target policy noise
24 | self.batch_size = 100 # batch size for both actor and critic
25 | self.max_buffer_size = 1000000 # replay buffer size
26 | self.branch_layers = []
27 | self.merge_layers = []
28 | self.actor_branch_layers = []
29 | self.actor_merge_layers = []
30 | self.critic_branch_layers = []
31 | self.critic_merge_layers = []
--------------------------------------------------------------------------------
/joyrl/algos/TD3/data_handler.py:
--------------------------------------------------------------------------------
1 | from joyrl.algos.base.data_handler import BaseDataHandler
2 |
3 | class DataHandler(BaseDataHandler):
4 | def __init__(self, cfg):
5 | super().__init__(cfg)
--------------------------------------------------------------------------------
/joyrl/algos/TD3/model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2024-07-21 16:37:59
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-07-21 16:38:00
9 | Discription:
10 | '''
11 | import torch.nn as nn
12 | from joyrl.algos.base.network import *
13 |
14 | class Model(nn.Module):
15 | def __init__(self, cfg ):
16 | super(Model, self).__init__()
17 | state_size_list = cfg.obs_space_info.size
18 | action_size_list = cfg.action_space_info.size
19 | critic_input_size_list = state_size_list+ [[None, len(action_size_list)]]
20 | self.actor = ActorNetwork(cfg, input_size_list = state_size_list)
21 | self.critic_1 = CriticNetwork(cfg, input_size_list = critic_input_size_list)
22 | self.critic_2 = CriticNetwork(cfg, input_size_list = critic_input_size_list)
--------------------------------------------------------------------------------
/joyrl/algos/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-01-01 16:20:49
7 | LastEditor: JiangJi
8 | LastEditTime: 2023-12-25 12:52:47
9 | Discription:
10 | '''
11 | from joyrl.algos import base,DQN,DoubleDQN,DuelingDQN,NoisyDQN,PPO
12 | __all__ = [
13 | "base",
14 | "QLearning",
15 | "DQN",
16 | "DoubleDQN",
17 | "DuelingDQN",
18 | "NoisyDQN",
19 | "PPO"
20 | ]
21 |
--------------------------------------------------------------------------------
/joyrl/algos/base/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/base/__init__.py
--------------------------------------------------------------------------------
/joyrl/algos/base/experience.py:
--------------------------------------------------------------------------------
1 | class Exp:
2 | def __init__(self, **kwargs) -> None:
3 | for k,v in kwargs.items():
4 | setattr(self,k,v)
5 |
6 |
--------------------------------------------------------------------------------
/joyrl/algos/base/optm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import math
3 | class SharedAdam(torch.optim.Adam):
4 | """Implements Adam algorithm with shared states.
5 | """
6 |
7 | def __init__(self,
8 | params,
9 | lr=1e-3,
10 | betas=(0.9, 0.999),
11 | eps=1e-8,
12 | weight_decay=0):
13 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay)
14 |
15 | for group in self.param_groups:
16 | for p in group['params']:
17 | state = self.state[p]
18 | state['step'] = torch.zeros(1)
19 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_()
20 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_()
21 |
22 | def share_memory(self):
23 | for group in self.param_groups:
24 | for p in group['params']:
25 | state = self.state[p]
26 | state['step'].share_memory_()
27 | state['exp_avg'].share_memory_()
28 | state['exp_avg_sq'].share_memory_()
29 |
30 | def step(self, closure=None):
31 | """Performs a single optimization step.
32 | Arguments:
33 | closure (callable, optional): A closure that reevaluates the model
34 | and returns the loss.
35 | """
36 | loss = None
37 | if closure is not None:
38 | loss = closure()
39 |
40 | for group in self.param_groups:
41 | for p in group['params']:
42 | if p.grad is None:
43 | continue
44 | grad = p.grad.data
45 | state = self.state[p]
46 |
47 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
48 | beta1, beta2 = group['betas']
49 |
50 | state['step'] += 1
51 |
52 | if group['weight_decay'] != 0:
53 | grad = grad.add(p.data, alpha = group['weight_decay'])  # weight decay (new-style add signature)
54 |
55 | # Decay the first and second moment running average coefficient
56 | exp_avg.mul_(beta1).add_(grad,alpha = 1 - beta1)
57 | exp_avg_sq.mul_(beta2).addcmul_(grad, grad,value = 1 - beta2)
58 |
59 | denom = exp_avg_sq.sqrt().add_(group['eps'])
60 |
61 | bias_correction1 = 1 - beta1 ** state['step'].item()
62 | bias_correction2 = 1 - beta2 ** state['step'].item()
63 | step_size = group['lr'] * math.sqrt(
64 | bias_correction2) / bias_correction1
65 |
66 | p.data.addcdiv_(exp_avg, denom,value = -step_size)
67 | return loss
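A typical A3C-style usage sketch: both the model parameters and the optimizer's Adam moments are moved to shared memory before spawning worker processes (the worker body is left empty here):

```python
import torch.multiprocessing as mp
import torch.nn as nn

def worker(model, opt):
    # each worker would compute gradients on its own rollouts and call opt.step()
    pass

if __name__ == '__main__':
    shared_model = nn.Linear(4, 2)     # stand-in for the real policy network
    shared_model.share_memory()        # parameters into shared memory
    optimizer = SharedAdam(shared_model.parameters(), lr=1e-3)
    optimizer.share_memory()           # Adam moment buffers into shared memory
    processes = [mp.Process(target=worker, args=(shared_model, optimizer)) for _ in range(4)]
    for p in processes: p.start()
    for p in processes: p.join()
```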
--------------------------------------------------------------------------------
/joyrl/envs/README.md:
--------------------------------------------------------------------------------
1 | # Environment Notes
2 |
3 | ## SAR at a Glance
4 |
5 | Note: SAR stands for state (S), action (A) and reward (R). In the table below, Reward Range is the range of total reward obtainable per episode, and Steps is the maximum number of steps per episode.
6 |
7 | | Environment ID | Observation Space | Action Space | Reward Range | Steps |
8 | | :--------------------------------: | :---------------: | :----------: | :----------: | :------: |
9 | | CartPole-v0 | Box(4,) | Discrete(2) | [0,200] | 200 |
10 | | CartPole-v1 | Box(4,) | Discrete(2) | [0,500] | 500 |
11 | | CliffWalking-v0 | Discrete(48) | Discrete(4) | [-inf,-13] | [13,inf] |
12 | | FrozenLake-v1(*is_slippery*=False) | Discrete(16) | Discrete(4) | 0 or 1 | [6,inf] |
13 |
14 | ## Environment Descriptions
15 |
16 | [OpenAI Gym](./gym_info.md)
17 | [MuJoCo](./mujoco_info.md)
18 |
19 |
--------------------------------------------------------------------------------
/joyrl/envs/__init__.py:
--------------------------------------------------------------------------------
1 | from joyrl.envs import gym
2 | __all__ = [
3 | "gym",
4 | ]
--------------------------------------------------------------------------------
/joyrl/envs/assets/action_grid.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/envs/assets/action_grid.png
--------------------------------------------------------------------------------
/joyrl/envs/assets/gym_info_20211130180023.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/envs/assets/gym_info_20211130180023.png
--------------------------------------------------------------------------------
/joyrl/envs/assets/image-20200820174307301.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/envs/assets/image-20200820174307301.png
--------------------------------------------------------------------------------
/joyrl/envs/assets/image-20200820174814084.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/envs/assets/image-20200820174814084.png
--------------------------------------------------------------------------------
/joyrl/envs/assets/image-20201007211441036.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/envs/assets/image-20201007211441036.png
--------------------------------------------------------------------------------
/joyrl/envs/assets/image-20201007211858925.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/envs/assets/image-20201007211858925.png
--------------------------------------------------------------------------------
/joyrl/envs/assets/image-20210429150622353.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/envs/assets/image-20210429150622353.png
--------------------------------------------------------------------------------
/joyrl/envs/assets/image-20210429150630806.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/envs/assets/image-20210429150630806.png
--------------------------------------------------------------------------------
/joyrl/envs/assets/track_big.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/envs/assets/track_big.png
--------------------------------------------------------------------------------
/joyrl/envs/gym/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-05-28 18:49:43
7 | LastEditor: JiangJi
8 | LastEditTime: 2023-05-28 18:49:46
9 | Discription:
10 | '''
11 | from joyrl.envs.gym.wrappers import *
--------------------------------------------------------------------------------
/joyrl/envs/gym/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-05-27 20:55:27
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-06-17 01:31:52
9 | Discription:
10 | '''
11 | from joyrl.envs.register import register_env
12 | class EnvConfig(object):
13 | def __init__(self) -> None:
14 | self.id = "CartPole-v1" # environment id
15 | register_env(self.id)
16 | self.render_mode = None # render mode: None, rgb_array, human
17 | self.wrappers = []
18 | self.ignore_params = ["wrappers", "ignore_params"]
--------------------------------------------------------------------------------
/joyrl/envs/gym_info.md:
--------------------------------------------------------------------------------
1 | # OpenAI Gym Environment Notes
2 | ## Classic Control
3 |
4 | ### [CartPole v0](https://github.com/openai/gym/wiki/CartPole-v0)
5 |
6 |
7 |
8 | The pole is balanced by pushing the cart left or right, so the action space consists of two actions. Every step yields a reward (+1); if the pole can no longer be balanced, done becomes true and the episode ends in failure. Ideally an episode runs for the full 200 steps, i.e. the total reward per episode reaches 200 and so does the step count.
9 |
10 | ### CartPole-v1
11 |
12 | The ```CartPole v1``` environment is essentially the same as ```CartPole v0```; the only differences are the maximum steps per episode (max_episode_steps) and the reward threshold (reward_threshold). The relevant source is shown below:
13 |
14 | 
15 |
16 | First, about the reward threshold (reward_threshold): it is a pass mark defined by Gym. For ```CartPole v0```, for instance, an algorithm that converges to a reward above 195 is considered to have solved the task. In practice ```CartPole v0``` has max_episode_steps of 200 and a maximum per-step reward of 1, so the maximum episode reward is 200, higher than Gym's threshold. Presumably Gym intends this as a reference line for learners; it is not actually used when implementing algorithms, so it can be ignored.
17 |
18 | As for the maximum steps per episode, ```CartPole v1``` runs longer and accordingly requires a higher reward, so ```v1``` can be seen as a harder version of ```v0```.
19 |
20 |
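A quick way to check the two numbers discussed above directly from the registry (gymnasium API, values as registered for the two CartPole versions):

```python
import gymnasium as gym

for env_id in ('CartPole-v0', 'CartPole-v1'):
    spec = gym.spec(env_id)
    print(env_id, spec.max_episode_steps, spec.reward_threshold)
# CartPole-v0 200 195.0
# CartPole-v1 500 475.0
```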
21 | ### [Pendulum-v0](https://github.com/openai/gym/wiki/Pendulum-v0)
22 |
23 | Note: in gym versions after 0.18.0, Pendulum-v0 has been renamed to Pendulum-v1.
24 |
25 |
26 | The pendulum starts at a random position, and the goal is to swing it up so that it stays upright. The action space is continuous, with values in [-2, 2]. The per-step reward is at least -16.27 and at most 0. The best reported result so far is a total reward of -123.11 ± 6.86 over 100 episodes.
27 |
28 | ### CliffWalking-v0
29 |
30 | The cliff walking problem (CliffWalking) is set on a 4 x 12 grid. The agent starts at the bottom-left cell, the goal is the bottom-right cell, and the objective is to move the agent to the goal. At each step the agent moves one cell up, down, left or right, and every move yields a reward of -1.
31 |
32 |
33 |
34 | As shown in the figure, the red cells are the cliff, and the numbers are the positions the agent can observe (the observation), taking 48 distinct values from 0 to 47. The agent's movement is subject to the following constraints:
35 |
36 | * The agent cannot move off the grid; if it tries to, it stays in place for that step but still receives a reward of -1.
37 |
38 | * If the agent "falls off the cliff", it is immediately sent back to the start and receives a reward of -100.
39 |
40 | * When the agent reaches the goal, the episode ends, and the episode's total reward is the sum of its per-step rewards.
41 |
42 | The actual rendering looks like this:
43 |
44 |
45 |
46 | Since the shortest path from start to goal takes 13 steps and each step yields -1, an optimal policy obtains a total reward of -13 per episode.
47 |
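A minimal interaction sketch for the environment described above, using the gymnasium API that this repo relies on elsewhere (random actions, capped at 200 steps):

```python
import gymnasium as gym

env = gym.make('CliffWalking-v0')
obs, info = env.reset(seed=0)      # obs is an integer in [0, 47]
total_reward = 0
for _ in range(200):               # cap the random rollout
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
    total_reward += reward         # -1 per step, -100 for falling off the cliff
    if terminated or truncated:
        break
print(total_reward)                # an optimal policy finishes at -13
```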
48 | ## References
49 |
50 | [Gym environment source code](https://github.com/openai/gym/tree/master/gym/envs)
--------------------------------------------------------------------------------
/joyrl/envs/mujoco_info.md:
--------------------------------------------------------------------------------
1 | # MuJoCo
2 |
3 | MuJoCo (Multi-Joint dynamics with Contact) is a physics simulator that can be used for research such as robot control and optimization. For installation see [Installing MuJoCo and mujoco_py on Mac](https://blog.csdn.net/JohnJim0/article/details/115656392?spm=1001.2014.3001.5501).
4 |
5 |
6 |
7 | ## HalfCheetah-v2
8 |
9 |
10 |
11 | This environment is based on the MuJoCo simulation engine. The goal is to make a two-legged "cheetah" run as fast as possible (the image below is from the HalfCheetah-v2 page, https://gym.openai.com/envs/HalfCheetah-v2/).
12 |
13 |
14 |
15 | Action space: Box(6,). Each leg has three controllable joints, 6 joints in total, and each joint's action ranges over [-1, 1].
16 |
17 | State space: Box(17,), a continuous vector that mainly describes the cheetah's pose and related information.
18 |
19 | Reward definition: each step's reward depends on the cheetah's velocity during the step and on the control cost of the action; the reward is defined by the code below.
20 |
21 | ```python
22 | def step(self, action):
23 | xposbefore = self.sim.data.qpos[0]
24 | self.do_simulation(action, self.frame_skip)
25 | xposafter = self.sim.data.qpos[0]
26 | ob = self._get_obs()
27 | reward_ctrl = - 0.1 * np.square(action).sum()
28 | reward_run = (xposafter - xposbefore)/self.dt
29 | # =========== reward ===========
30 | reward = reward_ctrl + reward_run
31 | # =========== reward ===========
32 | done = False
33 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl)
34 | ```
35 |
36 | An episode ends when the cheetah loses its balance and falls over.
37 |
38 | This environment has a few quirks: it is unclear (even after searching) what the per-episode reward upper bound is, and in experiments a well-trained episode can run the cheetah off the end of the platform:
39 |
40 |
41 |
42 | With limited time, the reward stayed on a slow but steady upward trend throughout training; the author suspects this may be a bug in gym.
--------------------------------------------------------------------------------
/joyrl/envs/register.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-12-22 23:02:13
7 | LastEditor: JiangJi
8 | LastEditTime: 2023-12-24 22:52:08
9 | Discription:
10 | '''
11 | import gymnasium as gym
12 | from gymnasium.envs.registration import register
13 |
14 | def register_env(env_name):
15 | if env_name == 'Racetrack-v0':
16 | register(
17 | id='Racetrack-v0',
18 | entry_point='envs.racetrack:RacetrackEnv',
19 | max_episode_steps=1000,
20 | kwargs={}
21 | )
22 | elif env_name == 'FrozenLakeNoSlippery-v1':
23 | register(
24 | id='FrozenLakeNoSlippery-v1',
25 | entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv',
26 | kwargs={'map_name':"4x4",'is_slippery':False},
27 | )
28 | elif env_name == 'CustomCliffWalking-v0':
29 | register(
30 | id='CustomCliffWalking-v0',
31 | entry_point='joyrl.envs.gym.toy_text.cliff_walking:CustomCliffWalkingEnv',
32 | max_episode_steps=1000,
33 | kwargs={}
34 | )
35 | else:
36 | pass
37 |
38 | # if __name__ == "__main__":
39 | # import random
40 | # import gym
41 | # env = gym.make('FrozenLakeNoSlippery-v1')
42 | # num_steps = 1000000
43 | # state = env.reset()
44 | # n_actions = env.action_space.n
45 | # print(state)
46 | # for _ in range(num_steps) :
47 | # next_state, reward, done,_ = env.step(random.choice(range(n_actions)))
48 | # print(next_state)
49 | # if (done) :
50 | # _ = env.reset()
51 |
--------------------------------------------------------------------------------
/joyrl/envs/snake/README.md:
--------------------------------------------------------------------------------
1 | # Snake
2 |
3 | Snake originates from Blockade, an arcade game from 1976. The player steers the snake up, down, left and right to eat food and grow its body; after eating, the snake gradually speeds up, and the game ends when it runs into a wall or its own body.
4 |
5 | 
6 |
7 | As shown above, the game board for this task is 560x560. The green part is our agent, the snake; the red square is the food; the walls sit around the border. Once a piece of food is eaten, a new one appears at a random position. Each snake segment and the food are 40x40, and excluding the walls (also 40 thick), the snake can move within a 480x480 area, i.e. a 12x12 grid. The environment's state and related information are as follows:
8 |
9 | * state: a tuple (adjoining_wall_x, adjoining_wall_y, food_dir_x, food_dir_y, adjoining_body_top, adjoining_body_bottom, adjoining_body_left, adjoining_body_right); a Q-table lookup using this tuple is sketched after this list.
10 |
11 | * [adjoining_wall_x, adjoining_wall_y]: whether the snake's head is adjacent to a wall, with 9 possible combinations.
12 |
13 | adjoining_wall_x: 0 means no wall adjacent to the head along the x axis, 1 means a wall on the head's left, 2 means a wall on its right. adjoining_wall_y: 0 means no wall adjacent along the y axis, 1 means a wall above the head, 2 means a wall below.
14 |
15 | Note that [0,0] also covers the case where the snake runs outside the 480x480 area.
16 |
17 | * [food_dir_x, food_dir_y]: the food's position relative to the snake's head.
18 |
19 | food_dir_x: 0 means the food and the head are aligned along the x axis, 1 means the food is to the head's left (not necessarily adjacent), 2 means it is to the right.
20 |
21 | food_dir_y: 0 means the food and the head are aligned along the y axis, 1 means the food is above the head, 2 means it is below.
22 |
23 | * [adjoining_body_top, adjoining_body_bottom, adjoining_body_left, adjoining_body_right]: whether the snake's body is adjacent to its head.
24 |
25 | adjoining_body_top: 1 means there is body directly above the head, 0 means there is not.
26 |
27 | adjoining_body_bottom: 1 means there is body directly below the head, 0 means there is not.
28 |
29 | adjoining_body_left: 1 means there is body directly to the head's left, 0 means there is not.
30 |
31 | adjoining_body_right: 1 means there is body directly to the head's right, 0 means there is not.
32 |
33 | * action: up, down, left or right.
34 |
35 | * reward: +1 for eating food, -1 when the snake dies, and -0.1 otherwise.
36 |
37 |
38 |
39 |
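The 8-component state tuple above indexes the Q-table created by `create_q_table()` in utils.py. A lookup sketch, assuming it is run next to utils.py and that the component order matches the tuple above:

```python
import numpy as np
from utils import create_q_table

Q = create_q_table()                      # shape (3, 3, 3, 3, 2, 2, 2, 2, 4)
state = (0, 2, 1, 0, 0, 0, 1, 0)          # e.g. wall below, food to the left, body on the left
q_values = Q[state]                       # the 4 action values for this state
greedy_action = int(np.argmax(q_values))  # index of the best of the 4 actions
```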
--------------------------------------------------------------------------------
/joyrl/envs/snake/example_assignment_and_report2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/envs/snake/example_assignment_and_report2.pdf
--------------------------------------------------------------------------------
/joyrl/envs/snake/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | DISPLAY_SIZE = 560
3 | GRID_SIZE = 40
4 | WALL_SIZE = 40
5 | WHITE = (255, 255, 255)
6 | RED = (255, 0, 0)
7 | BLUE = (72, 61, 139)
8 | BLACK = (0, 0, 0)
9 | GREEN = (0, 255, 0)
10 |
11 | NUM_ADJOINING_WALL_X_STATES=3
12 | NUM_ADJOINING_WALL_Y_STATES=3
13 | NUM_FOOD_DIR_X=3
14 | NUM_FOOD_DIR_Y=3
15 | NUM_ADJOINING_BODY_TOP_STATES=2
16 | NUM_ADJOINING_BODY_BOTTOM_STATES=2
17 | NUM_ADJOINING_BODY_LEFT_STATES=2
18 | NUM_ADJOINING_BODY_RIGHT_STATES=2
19 | NUM_ACTIONS = 4
20 |
21 | CHECKPOINT = 'checkpoint.npy'
22 |
23 | def create_q_table():
24 | return np.zeros((NUM_ADJOINING_WALL_X_STATES, NUM_ADJOINING_WALL_Y_STATES, NUM_FOOD_DIR_X, NUM_FOOD_DIR_Y,
25 | NUM_ADJOINING_BODY_TOP_STATES, NUM_ADJOINING_BODY_BOTTOM_STATES, NUM_ADJOINING_BODY_LEFT_STATES,
26 | NUM_ADJOINING_BODY_RIGHT_STATES, NUM_ACTIONS))
27 |
28 | def sanity_check(arr):
29 | if (type(arr) is np.ndarray and
30 | arr.shape==(NUM_ADJOINING_WALL_X_STATES, NUM_ADJOINING_WALL_Y_STATES, NUM_FOOD_DIR_X, NUM_FOOD_DIR_Y,
31 | NUM_ADJOINING_BODY_TOP_STATES, NUM_ADJOINING_BODY_BOTTOM_STATES, NUM_ADJOINING_BODY_LEFT_STATES,
32 | NUM_ADJOINING_BODY_RIGHT_STATES,NUM_ACTIONS)):
33 | return True
34 | else:
35 | return False
36 |
37 | def save(filename, arr):
38 | if sanity_check(arr):
39 | np.save(filename,arr)
40 | return True
41 | else:
42 | print("Failed to save model")
43 | return False
44 |
45 | def load(filename):
46 | try:
47 | arr = np.load(filename)
48 | if sanity_check(arr):
49 | print("Loaded model successfully")
50 | return arr
51 | print("Model loaded is not in the required format")
52 | return None
53 | except:
54 | print("Filename doesnt exist")
55 | return None
--------------------------------------------------------------------------------
/joyrl/envs/stochastic_mdp.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: John
5 | Email: johnjim0816@gmail.com
6 | Date: 2021-03-24 22:12:19
7 | LastEditor: John
8 | LastEditTime: 2021-03-26 17:12:43
9 | Discription:
10 | Environment:
11 | '''
12 | import numpy as np
13 | import random
14 |
15 |
16 | class StochasticMDP:
17 | def __init__(self):
18 | self.end = False
19 | self.curr_state = 2
20 | self.n_actions = 2
21 | self.n_states = 6
22 | self.p_right = 0.5
23 |
24 | def reset(self):
25 | self.end = False
26 | self.curr_state = 2
27 | state = np.zeros(self.n_states)
28 | state[self.curr_state - 1] = 1.
29 | return state
30 |
31 | def step(self, action):
32 | if self.curr_state != 1:
33 | if action == 1:
34 | if random.random() < self.p_right and self.curr_state < self.n_states:
35 | self.curr_state += 1
36 | else:
37 | self.curr_state -= 1
38 |
39 | if action == 0:
40 | self.curr_state -= 1
41 | if self.curr_state == self.n_states:
42 | self.end = True
43 |
44 | state = np.zeros(self.n_states)
45 | state[self.curr_state - 1] = 1.
46 |
47 | if self.curr_state == 1:
48 | if self.end:
49 | return state, 1.00, True, {}
50 | else:
51 | return state, 1.00/100.00, True, {}
52 | else:
53 | return state, 0.0, False, {}
54 |
--------------------------------------------------------------------------------
/joyrl/envs/track.txt:
--------------------------------------------------------------------------------
1 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2 | 1 1 1 1 1 1 1 1 0 0 0 0 0 3 3 3 3 3 1
3 | 1 1 1 1 1 1 0 0 0 0 0 0 0 3 3 3 3 3 1
4 | 1 1 1 1 1 0 0 0 0 0 0 0 0 3 3 3 3 3 1
5 | 1 1 1 1 0 0 0 0 0 0 0 0 0 3 3 3 3 3 1
6 | 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
7 | 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
8 | 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
9 | 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
10 | 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
11 | 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
12 | 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
13 | 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
14 | 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1
15 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
--------------------------------------------------------------------------------
/joyrl/envs/windy_gridworld.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | import sys
from io import StringIO  # needed by _render for 'ansi' mode
4 | from gym.envs.toy_text import discrete
5 |
6 | UP = 0
7 | RIGHT = 1
8 | DOWN = 2
9 | LEFT = 3
10 |
11 | class WindyGridworldEnv(discrete.DiscreteEnv):
12 |
13 | metadata = {'render.modes': ['human', 'ansi']}
14 |
15 | def _limit_coordinates(self, coord):
16 | coord[0] = min(coord[0], self.shape[0] - 1)
17 | coord[0] = max(coord[0], 0)
18 | coord[1] = min(coord[1], self.shape[1] - 1)
19 | coord[1] = max(coord[1], 0)
20 | return coord
21 |
22 | def _calculate_transition_prob(self, current, delta, winds):
23 | new_position = np.array(current) + np.array(delta) + np.array([-1, 0]) * winds[tuple(current)]
24 | new_position = self._limit_coordinates(new_position).astype(int)
25 | new_state = np.ravel_multi_index(tuple(new_position), self.shape)
26 | is_done = tuple(new_position) == (3, 7)
27 | return [(1.0, new_state, -1.0, is_done)]
28 |
29 | def __init__(self):
30 | self.shape = (7, 10)
31 |
32 | nS = np.prod(self.shape)
33 | n_actions = 4
34 |
35 | # Wind strength
36 | winds = np.zeros(self.shape)
37 | winds[:,[3,4,5,8]] = 1
38 | winds[:,[6,7]] = 2
39 |
40 | # Calculate transition probabilities
41 | P = {}
42 | for s in range(nS):
43 | position = np.unravel_index(s, self.shape)
44 | P[s] = { a : [] for a in range(n_actions) }
45 | P[s][UP] = self._calculate_transition_prob(position, [-1, 0], winds)
46 | P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1], winds)
47 | P[s][DOWN] = self._calculate_transition_prob(position, [1, 0], winds)
48 | P[s][LEFT] = self._calculate_transition_prob(position, [0, -1], winds)
49 |
50 | # We always start in state (3, 0)
51 | isd = np.zeros(nS)
52 | isd[np.ravel_multi_index((3,0), self.shape)] = 1.0
53 |
54 | super(WindyGridworldEnv, self).__init__(nS, n_actions, P, isd)
55 |
56 | def render(self, mode='human', close=False):
57 | self._render(mode, close)
58 |
59 | def _render(self, mode='human', close=False):
60 | if close:
61 | return
62 |
63 | outfile = StringIO() if mode == 'ansi' else sys.stdout
64 |
65 | for s in range(self.nS):
66 | position = np.unravel_index(s, self.shape)
67 | # print(self.s)
68 | if self.s == s:
69 | output = " x "
70 | elif position == (3,7):
71 | output = " T "
72 | else:
73 | output = " o "
74 |
75 | if position[1] == 0:
76 | output = output.lstrip()
77 | if position[1] == self.shape[1] - 1:
78 | output = output.rstrip()
79 | output += "\n"
80 |
81 | outfile.write(output)
82 | outfile.write("\n")
83 |
--------------------------------------------------------------------------------
/joyrl/framework/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/framework/__init__.py
--------------------------------------------------------------------------------
/joyrl/framework/base.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-12-02 17:30:36
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-06-02 10:50:42
9 | Discription:
10 | '''
11 | import ray
12 | from joyrl.framework.config import MergedConfig
13 | from joyrl.framework.message import Msg
14 | from joyrl.framework.utils import Logger, create_module
15 |
16 |
17 | class Moduler(object):
18 | def __init__(self, cfg: MergedConfig, **kwargs) -> None:
19 | self.cfg = cfg
20 | self.name = kwargs.get('name', 'Moduler')
21 | self.logger = Logger(self.cfg.log_dir, log_name = self.name)
22 |
23 | def _t_start(self):
24 | ''' start threads
25 | '''
26 | raise NotImplementedError
27 |
28 | def _p_start(self):
29 | ''' start processes
30 | '''
31 | raise NotImplementedError
32 |
33 | def pub_msg(self, msg: Msg):
34 | ''' publish message
35 | '''
36 | raise NotImplementedError
37 |
38 | def init(self):
39 | ''' init module
40 | '''
41 | raise NotImplementedError
42 |
43 | def run(self):
44 | ''' run module
45 | '''
46 | raise NotImplementedError
47 |
48 |
49 |
--------------------------------------------------------------------------------
/joyrl/framework/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-12-02 15:30:09
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-07-20 13:03:03
9 | Discription:
10 | '''
11 | class DefaultConfig:
12 | ''' Default parameters for running
13 | '''
14 | def __init__(self) -> None:
15 | pass
16 | def print_cfg(self):
17 | ''' Print all parameters
18 | '''
19 | print(self.__dict__)
20 |
21 | class MergedConfig(object):
22 | ''' Merge general, algorithm and environment config
23 | '''
24 | def __init__(self) -> None:
25 | self.general_cfg = None
26 | self.algo_cfg = None
27 | self.env_cfg = None
28 |
29 | class GeneralConfig(object):
30 | ''' General parameters for running
31 | '''
32 | def __init__(self) -> None:
33 | # basic settings
34 | self.joyrl_version = "0.5.0"
35 | self.env_name = "gym" # name of environment
36 | self.algo_name = "DQN" # name of algorithm
37 | self.mode = "train" # train, test
38 | self.device = "custom" # set all device to cpu, cuda, custom
39 | self.interactor_device = "cpu" # device to use for interactor
40 | self.learner_device = "cpu" # device to use for learner
41 | self.seed = 0 # random seed
42 | self.is_learner_async = False # if learner is async
43 | self.max_episode = -1 # number of episodes for training, set -1 to keep running
44 | self.max_step = -1 # max number of steps per episode, set -1 for unlimited steps
45 | self.reward_threshold = float('inf') # reward threshold to stop training
46 | self.reward_threshold_limit = 10 # number of episodes to check reward threshold
47 | self.collect_traj = False # if collect trajectory or not
48 | # multiprocessing settings
49 | self.n_interactors = 1 # number of workers
50 | self.exps_trucation_size = 1 # size of exps to truncate
51 | self.n_learners = 1 # number of learners if using multi-processing, default 1
52 | self.share_buffer = True # if all learners share the same buffer
53 | # online evaluation settings
54 | self.online_eval = False # online evaluation or not
55 | self.online_eval_episode = 10 # online eval episodes
56 | self.model_save_fre = 500 # model save frequency per update step
57 | # load model settings
58 | self.load_checkpoint = False # if load checkpoint
59 | self.restore_model_meta = True # if restore model meta
60 | self.load_path = "Train_single_CartPole-v1_DQN_20230515-211721" # path to load model
61 | self.load_model_step = 'best' # load model at which step
62 | # stats recorder settings
63 | self.interact_summary_fre = 10 # record interact stats per episode
64 | self.policy_summary_fre = 100 # record update stats per update step
65 |
--------------------------------------------------------------------------------
/joyrl/framework/message.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2024-02-25 15:46:04
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-06-14 17:49:49
9 | Discription:
10 | '''
11 | from enum import Enum, unique
12 | from typing import Optional, Any
13 | from dataclasses import dataclass
14 |
15 | @unique
16 | class MsgType(Enum):
17 | # tracker
18 | TRACKER_GET_EPISODE = 0
19 | TRACKER_INCREASE_EPISODE = 1
20 | TRACKER_INCREASE_UPDATE_STEP = 2
21 | TRACKER_GET_UPDATE_STEP = 3
22 | TRACKER_CHECK_TASK_END = 4
23 | TRACKER_FORCE_TASK_END = 5
24 |
25 | # interactor
26 | INTERACTOR_SAMPLE = 10
27 | INTERACTOR_GET_SAMPLE_DATA = 11
28 |
29 | # learner
30 | LEARNER_UPDATE_POLICY = 20
31 | LEARNER_GET_UPDATED_MODEL_PARAMS_QUEUE = 21
32 |
33 | # collector
34 | COLLECTOR_PUT_EXPS = 30
35 | COLLECTOR_GET_TRAINING_DATA = 31
36 | COLLECTOR_GET_BUFFER_LENGTH = 32
37 |
38 | # recorder
39 | RECORDER_PUT_SUMMARY = 40
40 |
41 | # policy_mgr
42 | POLICY_MGR_PUT_MODEL_PARAMS = 70
43 | POLICY_MGR_PUT_MODEL_META = 71
44 |
45 | @dataclass
46 | class Msg(object):
47 | type: MsgType
48 | data: Optional[Any] = None
--------------------------------------------------------------------------------
/joyrl/framework/tracker.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-04-28 16:16:04
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-06-14 17:49:11
9 | Discription:
10 | '''
11 | from joyrl.framework.message import Msg, MsgType
12 | from joyrl.framework.config import MergedConfig
13 | from joyrl.framework.base import Moduler
14 |
15 | class Tracker(Moduler):
16 | ''' tracks global information
17 | '''
18 | def __init__(self, cfg: MergedConfig, *args, **kwargs) -> None:
19 | super().__init__(cfg, *args, **kwargs)
20 | self.global_episode = 0 # current global episode
21 | self.global_sample_count = 0 # global sample count
22 | self.global_update_step = 0 # global update step
23 | self.force_task_end = False # force task end
24 | self.max_episode = cfg.max_episode # max episode
25 |
26 | def pub_msg(self, msg: Msg):
27 | msg_type, msg_data = msg.type, msg.data
28 | if msg_type == MsgType.TRACKER_GET_EPISODE:
29 | return self._get_episode()
30 | elif msg_type == MsgType.TRACKER_INCREASE_EPISODE:
31 | episode_delta = 1 if msg_data is None else msg_data
32 | self._increase_episode(i = episode_delta)
33 | elif msg_type == MsgType.TRACKER_GET_UPDATE_STEP:
34 | return self._get_update_step()
35 | elif msg_type == MsgType.TRACKER_INCREASE_UPDATE_STEP:
36 | update_step_delta = 1 if msg_data is None else msg_data
37 | self._increase_update_step(i = update_step_delta)
38 | elif msg_type == MsgType.TRACKER_CHECK_TASK_END:
39 | return self._check_task_end()
40 | elif msg_type == MsgType.TRACKER_FORCE_TASK_END:
41 | self.force_task_end = True
42 | else:
43 | raise NotImplementedError
44 |
45 | def _increase_episode(self, i: int = 1):
46 | ''' increase episode
47 | '''
48 | self.global_episode += i
49 |
50 | def _get_episode(self):
51 | ''' get current episode
52 | '''
53 | return self.global_episode
54 |
55 | def _check_task_end(self):
56 | ''' check if episode reaches the max episode
57 | '''
58 | if self.force_task_end:
59 | return True
60 | if self.max_episode < 0:
61 | return False
62 | return self.global_episode >= self.max_episode
63 |
64 | def _increase_update_step(self, i: int = 1):
65 | ''' increase update step
66 | '''
67 | self.global_update_step += i
68 |
69 | def _get_update_step(self):
70 | ''' get update step
71 | '''
72 | return self.global_update_step
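A sketch of how other modules drive the Tracker through pub_msg() with the Msg/MsgType types from message.py. The cfg setup here is deliberately minimal and partly hypothetical (a real MergedConfig is produced by joyrl's config merging and carries many more fields):

```python
from joyrl.framework.config import MergedConfig
from joyrl.framework.message import Msg, MsgType
from joyrl.framework.tracker import Tracker

cfg = MergedConfig()
cfg.log_dir = './logs'     # assumed: Moduler's Logger needs a log_dir on the config
cfg.max_episode = 100
tracker = Tracker(cfg, name='tracker')

tracker.pub_msg(Msg(type=MsgType.TRACKER_INCREASE_EPISODE))             # global_episode += 1
tracker.pub_msg(Msg(type=MsgType.TRACKER_INCREASE_UPDATE_STEP, data=5))
print(tracker.pub_msg(Msg(type=MsgType.TRACKER_GET_EPISODE)))           # 1
print(tracker.pub_msg(Msg(type=MsgType.TRACKER_GET_UPDATE_STEP)))       # 5
print(tracker.pub_msg(Msg(type=MsgType.TRACKER_CHECK_TASK_END)))        # False until 100 episodes
```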
--------------------------------------------------------------------------------
/joyrl/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-12-22 14:07:32
7 | LastEditor: JiangJi
8 | LastEditTime: 2023-12-22 14:07:32
9 | Discription:
10 | '''
11 |
--------------------------------------------------------------------------------
/joyrl/scripts/scripts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-12-22 13:55:56
7 | LastEditor: JiangJi
8 | LastEditTime: 2023-12-22 14:07:35
9 | Discription:
10 | '''
11 | import argparse
12 | from joyrl import run
13 |
14 | def main():
15 | parser = argparse.ArgumentParser(description="hyperparameters")
16 | parser.add_argument('--yaml', default=None, type=str,
17 | help='the path of config file')
18 | args = parser.parse_args()
19 | run(yaml_path = args.yaml)
20 | if __name__ == "__main__":
21 | main()
--------------------------------------------------------------------------------
/presets/Atari/Breakout-v5/Breakout-v5_DQN.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: DQN # algo name
3 | env_name: gym # env name, differ from env_id in env_cfgs
4 | device: cuda # device, cpu or cuda
5 | mode: train # run mode: train, test
6 | collect_traj: false # if collect trajectories or not
7 | mp_backend: single # multi-processing mode: single(default), ray
8 | n_workers: 2 # number of workers if using multi-processing, default 1
9 | load_checkpoint: false # if load checkpoint or not
10 | load_path: Train_single_CartPole-v1_DQN_20230515-211721 # if load checkpoint, then config path in 'tasks' dir
11 | load_model_step: best # load model step
12 | max_episode: 300 # max episodes, set -1 to keep running
13 | max_step: 500 # max steps per episode
14 | seed: 1 # random seed, set 0 not to use seed
15 | online_eval: true # if online eval or not
16 | online_eval_episode: 10 # online eval episodes
17 | model_save_fre: 500 # update step frequency of saving model
18 | algo_cfg:
19 | value_layers:
20 | - layer_type: conv2d
21 | in_channel: 4
22 | out_channel: 32
23 | kernel_size: 8
24 | stride: 4
25 | activation: relu
26 | - layer_type: conv2d
27 | in_channel: 32
28 | out_channel: 64
29 | kernel_size: 4
30 | stride: 2
31 | activation: relu
32 | - layer_type: conv2d
33 | in_channel: 64
34 | out_channel: 64
35 | kernel_size: 3
36 | stride: 1
37 | activation: relu
38 | - layer_type: flatten
39 | - layer_type: linear
40 | layer_size: [512]
41 | activation: relu
42 | batch_size: 64
43 | buffer_type: REPLAY_QUE
44 | max_buffer_size: 100000
45 | epsilon_decay: 500
46 | epsilon_end: 0.01
47 | epsilon_start: 0.95
48 | gamma: 0.95
49 | lr: 0.0001
50 | target_update: 4
51 | env_cfg:
52 | id: ALE/Breakout-v5
53 | wrapper: envs.wrappers.AtariWrapper
54 | render_mode: null
--------------------------------------------------------------------------------
/presets/Atari/Breakout-v5/Breakout-v5_PPO.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.2.2
3 | algo_name: PPO
4 | env_name: gym
5 | interactor_device: cpu
6 | learner_device: cuda
7 | mode: train # test # test
8 | exps_trucation_size: 1024
9 | exps_trucation_size_input_only: true
10 | is_learner_async: false
11 | load_checkpoint: false # true # false # true # test
12 | # load_path: Train_ALE/Breakout-v5_PPO_20240721-190921 # td 58
13 | # load_path: Train_ALE/Breakout-v5_PPO_20240722-001214 # td 67 258
14 | # load_path: Train_ALE/Breakout-v5_PPO_20240724-001646 # continue-train td 87 369
15 | # load_path: Train_ALE/Breakout-v5_PPO_20240724-223629 # 5.5e-5 continue-train td 99 408
16 | # load_path: Train_ALE/Breakout-v5_PPO_20240725-145732 # 2.5e-4 98
17 | load_path: Train_ALE/Breakout-v5_PPO_20240725-225306 # 2.5e-4 103 419
18 | load_model_step: best
19 | n_interactors: 10 # 1 # test
20 | max_episode: 45000 # 60000 # 3 # test
21 | max_step: 1200
22 | seed: 202407
23 | online_eval: true
24 | online_eval_episode: 15 # 1 # test
25 | model_save_fre: 10 # 1 # test
26 | policy_summary_fre: 10 # 1 # test
27 | interact_summary_fre: 100 # 1 # test
28 | algo_cfg:
29 | independ_actor: false
30 | return_form: gae # td
31 | branch_layers:
32 | - name: feature_1
33 | layers:
34 | - layer_type: conv2d
35 | in_channel: 4
36 | out_channel: 32
37 | kernel_size: 8
38 | stride: 4
39 | activation: relu
40 | - layer_type: conv2d
41 | in_channel: 32
42 | out_channel: 64
43 | kernel_size: 4
44 | stride: 2
45 | activation: relu
46 | - layer_type: conv2d
47 | in_channel: 64
48 | out_channel: 64
49 | kernel_size: 3
50 | stride: 1
51 | activation: relu
52 | - layer_type: flatten
53 | - layer_type: linear
54 | layer_size: [512]
55 | activation: relu
56 | buffer_type: ONPOLICY_QUE
57 | lr: 4.5e-4 # 3.0e-4 # 2.5e-4
58 | actor_lr: 3.0e-4
59 | critic_lr: 5.5e-4
60 | entropy_coef: 0.01
61 | critic_loss_coef: 0.5
62 | eps_clip: 0.105
63 | gamma: 0.99
64 | gae_lambda: 0.95
65 | k_epochs: 3
66 | batch_size: 256
67 | sgd_batch_size: 200
68 | max_norm: 0.5
69 | mini_batch_normalize: true
70 | env_cfg:
71 | id: ALE/Breakout-v5
72 | render_mode: null
73 | wrappers:
74 | - wrapper_name: BaseSkipFrame
75 | start_skip: 30
76 | skip: 4
77 | terminal_done_flag: true
78 | max_no_reward_count: 200
79 | - wrapper_name: EpisodicLifeEnv
80 | - wrapper_name: GrayScaleObservation
81 | - wrapper_name: ResizeObservation
82 | shape: 84
83 | - wrapper_name: ClipRewardEnv
84 | - wrapper_name: FrameStack
85 | num_stack: 4
86 | - wrapper_name: MultiHeadObsWrapper
87 | - wrapper_name: MultiHeadActionWrapper
88 | - wrapper_name: FrameStack2Numpy
89 |
90 |
--------------------------------------------------------------------------------
/presets/Atari/Breakout-v5/Breakout-v5_PPO_test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.2.2
3 | algo_name: PPO
4 | env_name: gym
5 | interactor_device: cpu
6 | learner_device: cuda
7 | mode: test # test
8 | exps_trucation_size: 1024
9 | exps_trucation_size_input_only: true
10 | is_learner_async: false
11 | load_checkpoint: true
12 | # load_path: Train_ALE/Breakout-v5_PPO_20240814-141323 # 373.000 in 800 steps
13 | # load_path: Train_ALE/Breakout-v5_PPO_20240814-004435 # 252.000 in 800 steps
14 | load_path: Train_ALE/Breakout-v5_PPO_20240923-204459 # 408 in 512
15 | load_model_step: best
16 | n_interactors: 1 # test
17 | max_episode: 3 # test
18 | max_step: 1200
19 | seed: 202407
20 | online_eval: true
21 | online_eval_episode: 1 # test
22 | model_save_fre: 1 # test
23 | policy_summary_fre: 1 # test
24 | interact_summary_fre: 1 # test
25 | algo_cfg:
26 | mini_batch_normalize: true
27 | independ_actor: false
28 | return_form: gae # td
29 | branch_layers:
30 | - name: feature_1
31 | layers:
32 | - layer_type: conv2d
33 | in_channel: 4
34 | out_channel: 32
35 | kernel_size: 8
36 | stride: 4
37 | activation: relu
38 | - layer_type: conv2d
39 | in_channel: 32
40 | out_channel: 64
41 | kernel_size: 4
42 | stride: 2
43 | activation: relu
44 | - layer_type: conv2d
45 | in_channel: 64
46 | out_channel: 64
47 | kernel_size: 3
48 | stride: 1
49 | activation: relu
50 | - layer_type: flatten
51 | - layer_type: linear
52 | layer_size: [512]
53 | activation: relu
54 | buffer_type: ONPOLICY_QUE
55 | lr: 3.5e-4 # 2.5e-4
56 | actor_lr: 3.0e-4
57 | critic_lr: 5.5e-4
58 | entropy_coef: 0.001
59 | critic_loss_coef: 0.5
60 | eps_clip: 0.105
61 | gamma: 0.99
62 | gae_lambda: 0.95
63 | k_epochs: 3
64 | batch_size: 256
65 | sgd_batch_size: 128
66 | max_norm: 0.5
67 | env_cfg:
68 | id: ALE/Breakout-v5
69 | render_mode: null
70 | wrappers:
71 | - wrapper_name: BaseSkipFrame
72 | start_skip: 30
73 | skip: 4
74 | terminal_done_flag: true
75 | max_no_reward_count: 200
76 | - wrapper_name: GrayScaleObservation
77 | - wrapper_name: ResizeObservation
78 | shape: 84
79 | # - wrapper_name: ClipRewardEnv
80 | - wrapper_name: EpisodicLifeEnv
81 | - wrapper_name: FrameStack
82 | num_stack: 4
83 | - wrapper_name: MultiHeadObsWrapper
84 | - wrapper_name: MultiHeadActionWrapper
85 | - wrapper_name: FrameStack2Numpy
86 |
87 |
--------------------------------------------------------------------------------
/presets/Atari/DemonAttack-v5/DoubleDQN_DemonAttack-v5_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: DoubleDQN
3 | env_name: gym
4 | device: cuda
5 | mode: test
6 | collect_traj: false
7 | n_interactors: 1
8 | load_checkpoint: true
9 | # reward 860.000 in 552 steps
10 | load_path: Train_ALE/DemonAttack-v5_DoubleDQN_20240120-200251
11 | load_model_step: best
12 | max_episode: 2
13 | max_step: 1000
14 | seed: 2023
15 |
16 | algo_cfg:
17 | action_type: DISCRETE
18 | merge_layers:
19 | branch_layers:
20 | - name: feature_1
21 | layers:
22 | - layer_type: conv2d
23 | in_channel: 4
24 | out_channel: 16
25 | kernel_size: 4
26 | stride: 2
27 | activation: relu
28 | - layer_type: pooling
29 | pooling_type: max2d
30 | kernel_size: 2
31 | stride: 2
32 | padding: 0
33 | - layer_type: conv2d
34 | in_channel: 16
35 | out_channel: 32
36 | kernel_size: 4
37 | stride: 2
38 | activation: relu
39 | - layer_type: pooling
40 | pooling_type: avg2d
41 | kernel_size: 2
42 | stride: 2
43 | padding: 0
44 | - layer_type: flatten
45 | - layer_type: norm
46 | norm_type: LayerNorm
47 | normalized_shape: 512
48 | - layer_type: linear
49 | layer_size: [200]
50 | activation: relu
51 | - layer_type: linear
52 | layer_size: [200]
53 | activation: relu
54 | batch_size: 32
55 | buffer_type: REPLAY_QUE
56 | buffer_size: 12000
57 | epsilon_decay: 20000
58 | epsilon_end: 0.05
59 | epsilon_start: 0.95
60 | gamma: 0.99
61 | lr: 1.5e-4
62 | target_update: 16
63 | env_cfg:
64 | id: ALE/DemonAttack-v5
65 | render_mode: human
66 | obs_type: rgb
67 | wrappers:
68 | - wrapper_name: BaseSkipFrame
69 | skip: 5
70 | cut_slices:
71 | - [15, 188]
72 | - [0, 160]
73 | start_skip: 14
74 | int_action_flag: true
75 | - wrapper_name: GrayScaleObservation
76 | - wrapper_name: ResizeObservation
77 | shape: 84
78 | - wrapper_name: FrameStack
79 | num_stack: 4
80 |
81 | version: '0.5.2'
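
A note on the normalized_shape of 512 in the norm layer above: assuming the cropped frames are resized to 84x84 (ResizeObservation shape 84) and stacked 4 deep, the conv/pool chain maps 84 -> 41 (conv k4 s2) -> 20 (max pool k2 s2) -> 9 (conv k4 s2) -> 4 (avg pool k2 s2), so the flatten output is 32 * 4 * 4 = 512 features, matching the LayerNorm's normalized_shape. The same arithmetic applies to the Train preset below.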
--------------------------------------------------------------------------------
/presets/Atari/DemonAttack-v5/DoubleDQN_DemonAttack-v5_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: DoubleDQN
3 | env_name: gym
4 | device: cuda
5 | mode: train
6 | collect_traj: false
7 | n_interactors: 1
8 | load_checkpoint: false
9 | load_path: Train_ALE/DemonAttack-v5_DoubleDQN_20240114-101724
10 | load_model_step: best
11 | max_episode: 2000
12 | max_step: 300
13 | seed: 2023
14 | online_eval: true
15 | online_eval_episode: 10
16 | model_save_fre: 1500
17 |
18 | algo_cfg:
19 | action_type: DISCRETE
20 | merge_layers:
21 | branch_layers:
22 | - name: feature_1
23 | layers:
24 | - layer_type: conv2d
25 | in_channel: 4
26 | out_channel: 16
27 | kernel_size: 4
28 | stride: 2
29 | activation: relu
30 | - layer_type: pooling
31 | pooling_type: max2d
32 | kernel_size: 2
33 | stride: 2
34 | padding: 0
35 | - layer_type: conv2d
36 | in_channel: 16
37 | out_channel: 32
38 | kernel_size: 4
39 | stride: 2
40 | activation: relu
41 | - layer_type: pooling
42 | pooling_type: avg2d
43 | kernel_size: 2
44 | stride: 2
45 | padding: 0
46 | - layer_type: flatten
47 | - layer_type: norm
48 | norm_type: LayerNorm
49 | normalized_shape: 512
50 | - layer_type: linear
51 | layer_size: [200]
52 | activation: relu
53 | - layer_type: linear
54 | layer_size: [200]
55 | activation: relu
56 | batch_size: 32
57 | buffer_type: REPLAY_QUE
58 | buffer_size: 12000
59 | epsilon_decay: 20000
60 | epsilon_end: 0.05
61 | epsilon_start: 0.95
62 | gamma: 0.99
63 | lr: 1.5e-4
64 | target_update: 16
65 | env_cfg:
66 | id: ALE/DemonAttack-v5
67 | render_mode: null
68 | obs_type: rgb
69 | wrappers:
70 | - wrapper_name: BaseSkipFrame
71 | skip: 5
72 | cut_slices:
73 | - [15, 188]
74 | - [0, 160]
75 | start_skip: 14
76 | int_action_flag: true
77 | - wrapper_name: GrayScaleObservation
78 | - wrapper_name: ResizeObservation
79 | shape: 84
80 | - wrapper_name: FrameStack
81 | num_stack: 4
82 |
83 | version: '0.5.2'
--------------------------------------------------------------------------------
/presets/Atari/Enduro-v5/Enduro-v5_DQN.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: DQN # algo name
3 | env_name: gym # env name, differ from env_id in env_cfgs
4 | device: cuda # device, cpu or cuda
5 | mode: train # run mode: train, test
6 | collect_traj: false # if collect trajectories or not
7 | mp_backend: single # multi-processing mode: single(default), ray
8 | n_workers: 2 # number of workers if using multi-processing, default 1
9 | load_checkpoint: false # if load checkpoint or not
10 | load_path: Train_single_CartPole-v1_DQN_20230515-211721 # if load checkpoint, then config path in 'tasks' dir
11 | load_model_step: best # load model step
12 | max_episode: 1000 # max episodes, set -1 to keep running
13 | max_step: 500 # max steps per episode
14 | seed: 1 # random seed, set 0 not to use seed
15 | online_eval: true # if online eval or not
16 | online_eval_episode: 10 # online eval episodes
17 | model_save_fre: 500 # update step frequency of saving model
18 | algo_cfg:
19 | value_layers:
20 | - layer_type: conv2d
21 | in_channel: 4
22 | out_channel: 32
23 | kernel_size: 8
24 | stride: 4
25 | activation: relu
26 | - layer_type: conv2d
27 | in_channel: 32
28 | out_channel: 64
29 | kernel_size: 4
30 | stride: 2
31 | activation: relu
32 | - layer_type: conv2d
33 | in_channel: 64
34 | out_channel: 64
35 | kernel_size: 3
36 | stride: 1
37 | activation: relu
38 | - layer_type: flatten
39 | - layer_type: linear
40 | layer_size: [512]
41 | activation: relu
42 | batch_size: 256
43 | buffer_type: REPLAY_QUE
44 | max_buffer_size: 100000
45 | epsilon_decay: 500
46 | epsilon_end: 0.01
47 | epsilon_start: 0.95
48 | gamma: 0.95
49 | lr: 0.0001
50 | target_update: 4
51 | env_cfg:
52 | id: ALE/Enduro-v5
53 | wrapper: envs.wrappers.AtariWrapper
--------------------------------------------------------------------------------
/presets/BipedalWalker-v3_DDPG_mp_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: DDPG
3 | device: cpu
4 | env_name: gym
5 | mode: train
6 | eval_per_episode: 20
7 | load_checkpoint: false
8 | load_path: Train_BipedalWalker-v3_DDPG_20230414-214211
9 | max_steps: 300
10 | save_fig: true
11 | seed: 0
12 | show_fig: false
13 | test_eps: 3
14 | train_eps: 700
15 | n_workers: 2
16 | render: false
17 | render_mode: rgb_array
18 | algo_cfg:
19 | critic_hidden_dim: 128
20 | actor_hidden_dim: 128
21 | gamma: 0.99
22 | actor_lr: 5.0e-5
23 | critic_lr: 1.0e-3
24 | max_buffer_size: 20480
25 | tau: 0.01
26 | batch_size: 256
27 | env_cfg:
28 | id: BipedalWalker-v3
29 | new_step_api: true
30 | render_mode: rgb_array
--------------------------------------------------------------------------------
/presets/Box2D/BipedalWalker-v3/BipedalWalker-v3_DDPG_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: DDPG
3 | device: cpu
4 | env_name: gym
5 | mode: test
6 | load_checkpoint: true
7 | load_path: Train_BipedalWalker-v3_DDPG_20230414-214211
8 | max_steps: 900
9 | save_fig: true
10 | seed: 2023
11 | show_fig: false
12 | test_eps: 3
13 | train_eps: 2000
14 | n_workers: 1
15 | render: true
16 | render_mode: rgb_array
17 | algo_cfg:
18 | critic_hidden_dim: 128
19 | actor_hidden_dim: 128
20 | gamma: 0.99
21 | actor_lr: 5.0e-5
22 | critic_lr: 1.0e-3
23 | max_buffer_size: 20480
24 | tau: 0.01
25 | batch_size: 256
26 | env_cfg:
27 | id: BipedalWalker-v3
28 | new_step_api: true
29 | render: true
30 | render_mode: rgb_array
--------------------------------------------------------------------------------
/presets/Box2D/BipedalWalker-v3/BipedalWalker-v3_DDPG_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: DDPG
3 | device: cpu
4 | env_name: gym
5 | mode: train
6 | eval_per_episode: 20
7 | load_checkpoint: false
8 | load_path: Train_BipedalWalker-v3_DDPG_20230414-214211
9 | max_steps: 300
10 | save_fig: true
11 | seed: 2023
12 | show_fig: false
13 | test_eps: 3
14 | train_eps: 700
15 | n_workers: 1
16 | render: false
17 | render_mode: rgb_array
18 | algo_cfg:
19 | critic_hidden_dim: 128
20 | actor_hidden_dim: 128
21 | gamma: 0.99
22 | actor_lr: 5.0e-5
23 | critic_lr: 1.0e-3
24 | max_buffer_size: 20480
25 | tau: 0.01
26 | batch_size: 256
27 | env_cfg:
28 | id: BipedalWalker-v3
29 | new_step_api: true
30 | render_mode: rgb_array
--------------------------------------------------------------------------------
/presets/Box2D/BipedalWalker-v3/BipedalWalker-v3_PPO.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.2.2
3 | algo_name: PPO
4 | env_name: gym
5 | device: cuda
6 | interactor_device: cpu
7 | learner_device: cuda
8 | mode: train
9 | exps_trucation_size: 1024
10 | is_learner_async: false
11 | load_checkpoint: false
12 | load_path: Train_BipedalWalker-v3_PPO_20240619-222052 # if load checkpoint, then config path in 'tasks' dir
13 | load_model_step: best
14 | n_interactors: 10
15 | max_episode: 50000
16 | max_step: 500
17 | seed: 202406
18 | online_eval: true
19 | online_eval_episode: 15
20 | model_save_fre: 10
21 | policy_summary_fre: 10
22 | interact_summary_fre: 100
23 | algo_cfg:
24 | independ_actor: true
25 | return_form: td
26 | actor_branch_layers:
27 | - name: feature_1
28 | layers:
29 | - layer_type: linear
30 | layer_size: [200]
31 | activation: tanh
32 | - layer_type: linear
33 | layer_size: [200]
34 | activation: tanh
35 | critic_branch_layers:
36 | - name: feature_1
37 | layers:
38 | - layer_type: linear
39 | layer_size: [200]
40 | activation: tanh
41 | - layer_type: linear
42 | layer_size: [200]
43 | activation: tanh
44 | buffer_type: ONPOLICY_QUE
45 | lr: 2.5e-4
46 | actor_lr: 2.5e-4 # 1
47 | critic_lr: 3.0e-4 # 3
48 | entropy_coef: 0.001
49 | critic_loss_coef: 0.001
50 | eps_clip: 0.25
51 | gamma: 0.99
52 | gae_lambda: 0.95
53 | k_epochs: 2
54 | batch_size: 512
55 | sgd_batch_size: 256
56 | env_cfg:
57 | id: BipedalWalker-v3
58 | render_mode: null
59 | wrappers:
60 | - wrapper_name: ClipAction
61 | - wrapper_name: MultiHeadObsWrapper
62 | - wrapper_name: BipedalWalkerV3TFReward
63 |
64 |
65 |
66 |
--------------------------------------------------------------------------------
/presets/Box2D/BipedalWalker-v3/BipedalWalker-v3_SAC_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: SAC
3 | continous: true
4 | device: cuda
5 | env_name: gym
6 | eval_eps: 10
7 | eval_per_episode: 5
8 | load_checkpoint: true
9 | load_path: Train_gym_SAC_20230415-140928
10 | max_steps: 800
11 | mode: test
12 | mp_backend: mp
13 | new_step_api: true
14 | render: false
15 | render_mode: human
16 | save_fig: true
17 | seed: 666
18 | show_fig: false
19 | test_eps: 20
20 | train_eps: 2000
21 | wrapper: null
22 | algo_cfg:
23 | alpha: 0.01
24 | automatic_entropy_tuning: false
25 | batch_size: 64
26 | max_buffer_size: 1000000
27 | gamma: 0.98
28 | hidden_dim: 256
29 | lr: 0.0004
30 | n_epochs: 1
31 | policy_type: Gaussian
32 | start_steps: 10000
33 | target_update_fre: 1
34 | tau: 0.01
35 | env_cfg:
36 | id: BipedalWalker-v3
37 | new_step_api: true
38 | render_mode: null
39 |
--------------------------------------------------------------------------------
/presets/Box2D/BipedalWalker-v3/BipedalWalker-v3_SAC_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: SAC
3 | continous: true
4 | device: cuda
5 | env_name: gym
6 | eval_eps: 10
7 | eval_per_episode: 5
8 | load_checkpoint: false
9 | load_path: Train_gym_SAC_20230415-140928
10 | max_steps: 500
11 | mode: train
12 | mp_backend: mp
13 | new_step_api: true
14 | render: false
15 | render_mode: human
16 | save_fig: true
17 | seed: 666
18 | show_fig: false
19 | test_eps: 20
20 | train_eps: 2000
21 | wrapper: null
22 | algo_cfg:
23 | alpha: 0.01
24 | automatic_entropy_tuning: false
25 | batch_size: 64
26 | max_buffer_size: 1000000
27 | gamma: 0.98
28 | hidden_dim: 256
29 | lr: 0.0004
30 | n_epochs: 1
31 | policy_type: Gaussian
32 | start_steps: 10000
33 | target_update_fre: 1
34 | tau: 0.01
35 | env_cfg:
36 | id: BipedalWalker-v3
37 | new_step_api: true
38 | render_mode: null
39 |
--------------------------------------------------------------------------------
/presets/Box2D/BipedalWalkerHardcore-v3/TD3_BipedalWalkerHardcore-v3.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: TD3
3 | device: cuda
4 | env_name: gym
5 | mode: train
6 | load_checkpoint: false
7 | load_path: Train_single_BipedalWalkerHardcore-v3_TD3_20230528-151042
8 | eval_per_episode: 50
9 | max_episode: 10000
10 | max_step: 1000
11 | seed: 2023
12 | model_save_fre: 70000
13 | online_eval: true
14 | algo_cfg:
15 | action_type: DPG
16 | buffer_type: REPLAY_QUE
17 | actor_layers:
18 | - layer_type: linear
19 | layer_size: [200]
20 | activation: relu
21 | - layer_type: linear
22 | layer_size: [200]
23 | activation: relu
24 | actor_lr: 1.0e-4
25 | batch_size: 256
26 | max_buffer_size: 60000
27 | critic_layers:
28 | - layer_type: linear
29 | layer_size: [200]
30 | activation: relu
31 | - layer_type: linear
32 | layer_size: [200]
33 | activation: relu
34 | critic_lr: 1.2e-4
35 | expl_noise: 0.25
36 | explore_steps: 2048
37 | gamma: 0.99
38 | noise_clip: 0.5
39 | policy_freq: 2
40 | policy_noise: 0.2
41 | tau: 0.005
42 | env_cfg:
43 | id: BipedalWalkerHardcore-v3
44 |
--------------------------------------------------------------------------------
/presets/Box2D/CarRacing-v2/CarRacing-v2_PPO.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.1.4
3 | algo_name: PPO
4 | env_name: gym
5 | device: cpu
6 | mode: train
7 | exps_trucation_size: 200
8 | is_learner_async: false
9 | load_checkpoint: false
10 | load_path: Train_CartPole-v1_PPO_20231225-124842 # if load checkpoint, then config path in 'tasks' dir
11 | load_model_step: best
12 | n_interactors: 10
13 | max_episode: -1
14 | max_step: 200
15 | seed: 1
16 | online_eval: true
17 | online_eval_episode: 10
18 | model_save_fre: 10
19 | policy_summary_fre: 2
20 | interact_summary_fre: 100
21 | algo_cfg:
22 | branch_layers:
23 | - name: feature_1
24 | layers:
25 | - layer_type: conv2d
26 | in_channel: 3
27 | out_channel: 32
28 | kernel_size: 8
29 | stride: 4
30 | activation: relu
31 | # - layer_type: pooling
32 | # pooling_type: max2d
33 | # kernel_size: 2
34 | # stride: 2
35 | # padding: 0
36 | - layer_type: conv2d
37 | in_channel: 32
38 | out_channel: 64
39 | kernel_size: 4
40 | stride: 2
41 | activation: relu
42 | - layer_type: conv2d
43 | in_channel: 64
44 | out_channel: 64
45 | kernel_size: 3
46 | stride: 1
47 | activation: relu
48 | - layer_type: norm
49 | norm_type: LayerNorm
50 | # - layer_type: pooling
51 | # pooling_type: avg2d
52 | # kernel_size: 2
53 | # stride: 2
54 | # padding: 0
55 | - layer_type: flatten
56 | - layer_type: linear
57 | layer_size: [512]
58 | activation: relu
59 | - layer_type: norm
60 | norm_type: LayerNorm
61 | - layer_type: linear
62 | layer_size: [128]
63 | activation: relu
64 | buffer_type: ONPOLICY_QUE
65 | lr: 0.0003
66 | actor_lr: 0.003
67 | critic_lr: 0.01
68 | entropy_coef: 0.001
69 | critic_loss_coef: 0.5
70 | eps_clip: 0.2
71 | gamma: 0.95
72 | gae_lambda: 0.95
73 | k_epochs: 4
74 | batch_size: 2000
75 | sgd_batch_size: 50
76 | env_cfg:
77 | id: CarRacing-v2
78 | render_mode: null
79 | wrappers:
80 | - wrapper_name: ReshapeImageObsWrapper
81 | - wrapper_name: MultiHeadObsWrapper
82 |
83 |
--------------------------------------------------------------------------------
/presets/Box2D/CarRacing-v2/DQN_carRacing-v2_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: DQN
3 | env_name: gym
4 | device: cuda
5 | mode: test
6 | collect_traj: false
7 | n_interactors: 1
8 | load_checkpoint: true
9 | load_path: Train_CarRacing-v2_DQN_20240109-221840
10 | load_model_step: best # 754
11 | max_episode: 3
12 | max_step: 1200
13 | seed: 2023
14 |
15 | algo_cfg:
16 | merge_layers:
17 | branch_layers:
18 | - name: feature_1
19 | layers:
20 | - layer_type: conv2d
21 | in_channel: 4
22 | out_channel: 16
23 | kernel_size: 4
24 | stride: 2
25 | activation: relu
26 | - layer_type: pooling
27 | pooling_type: max2d
28 | kernel_size: 2
29 | stride: 2
30 | padding: 0
31 | - layer_type: conv2d
32 | in_channel: 16
33 | out_channel: 32
34 | kernel_size: 4
35 | stride: 2
36 | activation: relu
37 | - layer_type: pooling
38 | pooling_type: avg2d
39 | kernel_size: 2
40 | stride: 2
41 | padding: 0
42 | - layer_type: flatten
43 | - layer_type: norm
44 | norm_type: LayerNorm
45 | normalized_shape: 512
46 | - layer_type: linear
47 | layer_size: [128]
48 | activation: relu
49 | batch_size: 128
50 | buffer_type: REPLAY_QUE
51 | max_buffer_size: 12000
52 | epsilon_decay: 2000
53 | epsilon_end: 0.02
54 | epsilon_start: 0.99
55 | gamma: 0.99
56 | lr: 1.5e-4 # 2.0e-4
57 | target_update: 4
58 | env_cfg:
59 | id: CarRacing-v2
60 | render_mode: human
61 | continuous: False
62 | wrappers:
63 | - wrapper_name: CarV2SkipFrame
64 | skip: 5
65 | - wrapper_name: GrayScaleObservation
66 | - wrapper_name: ResizeObservation
67 | shape: 84
68 | - wrapper_name: FrameStack
69 | num_stack: 4
70 |
--------------------------------------------------------------------------------
/presets/Box2D/CarRacing-v2/DQN_carRacing-v2_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: DQN
3 | env_name: gym
4 | device: cuda
5 | mode: train
6 | collect_traj: false
7 | n_interactors: 1
8 | load_checkpoint: false
9 | load_path: Train_single_carRacing-v2_DQN
10 | load_model_step: best
11 | max_episode: 1200
12 | max_step: 1600 # 1200
13 | seed: 2023
14 | online_eval: true
15 | online_eval_episode: 100
16 | model_save_fre: 1000
17 |
18 | algo_cfg:
19 | merge_layers:
20 | branch_layers:
21 | - name: feature_1
22 | layers:
23 | - layer_type: conv2d
24 | in_channel: 4
25 | out_channel: 16
26 | kernel_size: 4
27 | stride: 2
28 | activation: relu
29 | - layer_type: pooling
30 | pooling_type: max2d
31 | kernel_size: 2
32 | stride: 2
33 | padding: 0
34 | - layer_type: conv2d
35 | in_channel: 16
36 | out_channel: 32
37 | kernel_size: 4
38 | stride: 2
39 | activation: relu
40 | - layer_type: pooling
41 | pooling_type: avg2d
42 | kernel_size: 2
43 | stride: 2
44 | padding: 0
45 | - layer_type: flatten
46 | - layer_type: norm
47 | norm_type: LayerNorm
48 | normalized_shape: 512
49 | - layer_type: linear
50 | layer_size: [128]
51 | activation: relu
52 | batch_size: 128
53 | buffer_type: REPLAY_QUE
54 | max_buffer_size: 12000
55 | epsilon_decay: 2000
56 | epsilon_end: 0.02
57 | epsilon_start: 0.99
58 | gamma: 0.99
59 | lr: 1.5e-4 # 2.0e-4
60 | target_update: 4
61 | env_cfg:
62 | id: CarRacing-v2
63 | render_mode: null
64 | continuous: False
65 | wrappers:
66 | - wrapper_name: CarV2SkipFrame
67 | skip: 5
68 | - wrapper_name: GrayScaleObservation
69 | - wrapper_name: ResizeObservation
70 | shape: 84
71 | - wrapper_name: FrameStack
72 | num_stack: 4
73 |
--------------------------------------------------------------------------------
/presets/Box2D/LunarLander-v2/LunarLander-v2_PPO.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.5
3 | algo_name: PPO
4 | env_name: gym
5 | device: cpu
6 | mode: train
7 | exps_trucation_size: 200
8 | is_learner_async: false
9 | load_checkpoint: true
10 | load_path: Train_LunarLander-v2_PPO_20240617-175014
11 | load_model_step: best
12 | n_interactors: 1
13 | max_episode: -1
14 | max_step: 1000
15 | seed: 1
16 | online_eval: true
17 | online_eval_episode: 10
18 | reward_threshold: 210
19 | model_save_fre: 10
20 | policy_summary_fre: 5
21 | interact_summary_fre: 100
22 | algo_cfg:
23 | independ_actor: false
24 | return_form: td
25 | actor_branch_layers:
26 | - name: state
27 | layers:
28 | - layer_type: linear
29 | layer_size: [256]
30 | activation: relu
31 | critic_branch_layers:
32 | - name: state
33 | layers:
34 | - layer_type: linear
35 | layer_size: [256]
36 | activation: relu
37 | branch_layers:
38 | - name: state
39 | layers:
40 | - layer_type: linear
41 | layer_size: [256]
42 | activation: relu
43 | buffer_type: ONPOLICY_QUE
44 | eps_clip: 0.2
45 | entropy_coef: 0.002
46 | lr: 0.0003
47 | actor_lr: 0.003
48 | critic_lr: 0.01
49 | critic_loss_coef: 0.5
50 | gamma: 0.99
51 | gae_lambda: 0.95
52 | k_epochs: 4
53 | batch_size: 2000
54 | sgd_batch_size: 32
55 | env_cfg:
56 | id: LunarLander-v2
57 | render_mode: null
58 | wrappers:
59 | - wrapper_name: MultiHeadObsWrapper
60 | - wrapper_name: MultiHeadActionWrapper
61 |
--------------------------------------------------------------------------------
/presets/Box2D/LunarLander-v2/LunarLander-v2_PPO_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: PPO
3 | device: cpu
4 | env_name: LunarLander-v2
5 | eval_eps: 10
6 | eval_per_episode: 5
7 | load_checkpoint: true
8 | load_path: Train_LunarLander-v2_PPO_20230402-223154
9 | max_steps: 1000
10 | mode: test
11 | mp_backend: mp
12 | new_step_api: true
13 | render: false
14 | save_fig: true
15 | seed: 1
16 | show_fig: false
17 | test_eps: 20
18 | train_eps: 600
19 | wrapper: null
20 | algo_cfg:
21 | actor_hidden_dim: 256
22 | actor_lr: 0.0003
23 | continuous: false
24 | critic_hidden_dim: 256
25 | critic_lr: 0.001
26 | entropy_coef: 0.01
27 | eps_clip: 0.2
28 | gamma: 0.99
29 | k_epochs: 4
30 | ppo_type: clip
31 | sgd_batch_size: 32
32 | train_batch_size: 256
33 |
--------------------------------------------------------------------------------
/presets/Box2D/LunarLander-v2/LunarLanderContinuous-v2_SAC_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: SAC
3 | continuous: true
4 | device: cpu
5 | env_name: LunarLanderContinuous-v2
6 | eval_eps: 10
7 | eval_per_episode: 5
8 | load_checkpoint: true
9 | load_path: Train_LunarLanderContinuous-v2_SAC_20230402-170158
10 | max_steps: 500
11 | mode: test
12 | mp_backend: mp
13 | new_step_api: true
14 | render: true
15 | render_mode: human
16 | save_fig: true
17 | seed: 666
18 | show_fig: false
19 | test_eps: 20
20 | train_eps: 2000
21 | wrapper: null
22 | algo_cfg:
23 | alpha: 0.2
24 | automatic_entropy_tuning: false
25 | batch_size: 64
26 | max_buffer_size: 1000000
27 | gamma: 0.99
28 | hidden_dim: 256
29 | lr: 0.001
30 | n_epochs: 1
31 | policy_type: Gaussian
32 | start_steps: 10000
33 | target_update_fre: 1
34 | tau: 0.005
35 |
--------------------------------------------------------------------------------
/presets/Box2D/LunarLander-v2/LunarLanderContinuous-v2_SAC_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: SAC
3 | continous: true
4 | device: cuda
5 | env_name: gym
6 | eval_eps: 10
7 | eval_per_episode: 5
8 | load_checkpoint: false
9 | load_path: Train_LunarLanderContinuous-v2_PPO_20230401-102521
10 | max_steps: 500
11 | mode: train
12 | new_step_api: true
13 | render: false
14 | save_fig: true
15 | seed: 666
16 | show_fig: false
17 | test_eps: 20
18 | train_eps: 2000
19 | wrapper: null
20 | algo_cfg:
21 | alpha: 0.1
22 | automatic_entropy_tuning: false
23 | batch_size: 64
24 | max_buffer_size: 1000000
25 | gamma: 0.99
26 | hidden_dim: 256
27 | lr: 0.001
28 | n_epochs: 1
29 | policy_type: Gaussian
30 | start_steps: 10000
31 | target_update_fre: 1
32 | tau: 0.005
33 | env_cfg:
34 | id: LunarLanderContinuous-v2
--------------------------------------------------------------------------------
/presets/ClassControl/Acrobot-v1/Acrobot-v1_DQN.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.5
3 | algo_name: DQN
4 | env_name: gym
5 | interactor_device: cpu
6 | learner_device: cpu
7 | mode: train
8 | is_learner_async: false
9 | collect_traj: false
10 | n_interactors: 1
11 | load_checkpoint: true
12 | load_path: Train_Acrobot-v1_DQN_2024613
13 | load_model_step: best
14 | max_episode: -1
15 | max_step: 200
16 | seed: 1
17 | online_eval: true
18 | online_eval_episode: 10
19 | model_save_fre: 500
20 | policy_summary_fre: 100
21 |
22 | algo_cfg:
23 | learn_frequency: 1
24 | merge_layers:
25 | - layer_type: linear
26 | layer_size: [256]
27 | activation: relu
28 | - layer_type: linear
29 | layer_size: [256]
30 | activation: relu
31 | batch_size: 64
32 | max_buffer_size: 100000
33 | epsilon_decay: 500
34 | epsilon_end: 0.01
35 | epsilon_start: 0.95
36 | gamma: 0.99
37 | lr: 0.0001
38 | target_update: 4
39 |
40 | env_cfg:
41 | id: Acrobot-v1
42 | render_mode: null
43 | wrappers:
44 | - wrapper_name: MultiHeadObsWrapper
45 | - wrapper_name: MultiHeadActionWrapper
46 |
--------------------------------------------------------------------------------
/presets/ClassControl/Acrobot-v1/Acrobot-v1_DoubleDQN.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.5
3 | algo_name: DoubleDQN
4 | env_name: gym
5 | mode: train
6 | is_learner_async: false
7 | collect_traj: false
8 | n_interactors: 1
9 | load_checkpoint: false
10 | load_path: Train_ALE/Acrobot-v1_DoubleDQN_20240114-101724
11 | load_model_step: best
12 | max_episode: -1
13 | max_step: 200
14 | seed: 1
15 | online_eval: true
16 | online_eval_episode: 10
17 | model_save_fre: 500
18 | policy_summary_fre: 100
19 |
20 | algo_cfg:
21 | merge_layers: []
22 | branch_layers:
23 | - name: feature1
24 | layers:
25 | - layer_type: linear
26 | layer_size: [256]
27 | activation: relu
28 | - layer_type: linear
29 | layer_size: [256]
30 | activation: relu
31 | buffer_type: REPLAY_QUE
32 | batch_size: 64
33 | max_buffer_size: 100000
34 | epsilon_decay: 500
35 | epsilon_end: 0.01
36 | epsilon_start: 0.95
37 | gamma: 0.99
38 | lr: 0.0001
39 | target_update: 4
40 | env_cfg:
41 | id: Acrobot-v1
42 | render_mode: null
43 | wrappers:
44 | - wrapper_name: MultiHeadObsWrapper
45 | - wrapper_name: MultiHeadActionWrapper
--------------------------------------------------------------------------------
/presets/ClassControl/Acrobot-v1/Acrobot-v1_DuelingDQN.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.5
3 | algo_name: DuelingDQN
4 | env_name: gym
5 | interactor_device: cuda
6 | learner_device: cuda
7 | mode: train
8 | is_learner_async: false
9 | collect_traj: false
10 | n_interactors: 1
11 | load_checkpoint: false
12 | load_path: Train_CartPole-v1_DQN_20221026-054757
13 | load_model_step: best
14 | max_episode: -1
15 | max_step: 200
16 | seed: 1
17 | online_eval: true
18 | online_eval_episode: 10
19 | model_save_fre: 500
20 | policy_summary_fre: 100
21 |
22 | algo_cfg:
23 | merge_layers:
24 | - layer_type: linear
25 | layer_size: [256]
26 | activation: relu
27 | - layer_type: linear
28 | layer_size: [256]
29 | activation: relu
30 | buffer_type: REPLAY_QUE
31 | batch_size: 64
32 | max_buffer_size: 100000
33 | epsilon_decay: 500
34 | epsilon_end: 0.01
35 | epsilon_start: 0.95
36 | gamma: 0.99
37 | lr: 0.0001
38 | target_update: 4
39 | env_cfg:
40 | id: Acrobot-v1
41 | render_mode: null
42 | wrappers:
43 | - wrapper_name: MultiHeadObsWrapper
44 | - wrapper_name: MultiHeadActionWrapper
--------------------------------------------------------------------------------
/presets/ClassControl/Acrobot-v1/Acrobot-v1_NoisyDQN.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.5
3 | algo_name: NoisyDQN
4 | env_name: gym
5 | interactor_device: cuda
6 | learner_device: cuda
7 | mode: train
8 | is_learner_async: false
9 | collect_traj: false
10 | n_interactors: 1
11 | load_checkpoint: false
12 | load_path: Train_CartPole-v1_NoisyDQN_20231225-000846
13 | load_model_step: best
14 | max_episode: -1
15 | max_step: 200
16 | seed: 1
17 | online_eval: true
18 | online_eval_episode: 10
19 | model_save_fre: 500
20 | policy_summary_fre: 100
21 |
22 | algo_cfg:
23 | merge_layers:
24 | - layer_type: noisy_linear
25 | layer_size: [256]
26 | activation: relu
27 | std_init: 0.4
28 | - layer_type: noisy_linear
29 | layer_size: [256]
30 | activation: relu
31 | std_init: 0.4
32 | buffer_type: REPLAY_QUE
33 | batch_size: 64
34 | max_buffer_size: 100000
35 | epsilon_decay: 500
36 | epsilon_end: 0.01
37 | epsilon_start: 0.95
38 | gamma: 0.99
39 | lr: 0.0001
40 | target_update: 4
41 | env_cfg:
42 | id: Acrobot-v1
43 | render_mode: null
44 | wrappers:
45 | - wrapper_name: MultiHeadObsWrapper
46 | - wrapper_name: MultiHeadActionWrapper
--------------------------------------------------------------------------------
/presets/ClassControl/Acrobot-v1/Acrobot-v1_PPO.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.5
3 | algo_name: PPO
4 | env_name: gym
5 | device: cpu
6 | mode: train
7 | exps_trucation_size: 200
8 | is_learner_async: false
9 | load_checkpoint: false
10 | load_path: Train_CartPole-v1_PPO_20231225-124842 # if load checkpoint, then config path in 'tasks' dir
11 | load_model_step: best
12 | n_interactors: 15
13 | max_episode: -1
14 | max_step: -1
15 | seed: 1
16 | online_eval: true
17 | online_eval_episode: 15
18 | model_save_fre: 10
19 | policy_summary_fre: 10
20 | interact_summary_fre: 100
21 | algo_cfg:
22 | independ_actor: false
23 | return_form: gae
24 | actor_branch_layers:
25 | - name: feature_1
26 | layers:
27 | - layer_type: linear
28 | layer_size: [256]
29 | activation: relu
30 | - layer_type: linear
31 | layer_size: [256]
32 | activation: relu
33 | critic_branch_layers:
34 | - name: feature_1
35 | layers:
36 | - layer_type: linear
37 | layer_size: [256]
38 | activation: relu
39 | - layer_type: linear
40 | layer_size: [256]
41 | activation: relu
42 | branch_layers:
43 | - name: state
44 | layers:
45 | - layer_type: linear
46 | layer_size: [256]
47 | activation: relu
48 | - layer_type: linear
49 | layer_size: [256]
50 | activation: relu
51 | buffer_type: ONPOLICY_QUE
52 | lr: 0.0005
53 | actor_lr: 0.0003
54 | critic_lr: 0.001
55 | entropy_coef: 0.001
56 | critic_loss_coef: 0.001
57 | eps_clip: 0.1
58 | gamma: 0.99
59 | gae_lambda: 0.94
60 | k_epochs: 4
61 | batch_size: 3000
62 | sgd_batch_size: 300
63 | # min_policy: 0.001
64 | env_cfg:
65 | id: Acrobot-v1
66 | render_mode: null
67 | wrappers:
68 | - wrapper_name: MultiHeadObsWrapper
69 | - wrapper_name: MultiHeadActionWrapper
70 |
71 |
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_A3C.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.6
3 | algo_name: A3C
4 | env_name: gym
5 | device: cpu
6 | mode: train
7 | exps_trucation_size: 20
8 | is_learner_async: false
9 | load_checkpoint: false
10 | load_path: Train_CartPole-v1_PPO_20231225-124842 # if load checkpoint, then config path in 'tasks' dir
11 | load_model_step: best
12 | n_interactors: 10
13 | max_episode: -1
14 | max_step: 200
15 | seed: 1
16 | online_eval: true
17 | online_eval_episode: 15
18 | model_save_fre: 10
19 | policy_summary_fre: 10
20 | interact_summary_fre: 100
21 | algo_cfg:
22 | independ_actor: false
23 | return_form: td
24 | actor_branch_layers:
25 | - name: feature_1
26 | layers:
27 | - layer_type: linear
28 | layer_size: [256]
29 | activation: relu
30 | - layer_type: linear
31 | layer_size: [256]
32 | activation: relu
33 | critic_branch_layers:
34 | - name: feature_1
35 | layers:
36 | - layer_type: linear
37 | layer_size: [256]
38 | activation: relu
39 | - layer_type: linear
40 | layer_size: [256]
41 | activation: relu
42 | branch_layers:
43 | - name: state
44 | layers:
45 | - layer_type: linear
46 | layer_size: [256]
47 | activation: relu
48 | - layer_type: linear
49 | layer_size: [256]
50 | activation: relu
51 | buffer_type: ONPOLICY_QUE
52 | lr: 0.0003
53 | actor_lr: 0.0003
54 | critic_lr: 0.001
55 | entropy_coef: 0.001
56 | critic_loss_coef: 0.5
57 | gamma: 0.95
58 | gae_lambda: 0.95
59 | batch_size: 200
60 | # min_policy: 0.001
61 | env_cfg:
62 | id: CartPole-v1
63 | render_mode: null
64 | wrappers:
65 | - wrapper_name: MultiHeadObsWrapper
66 | - wrapper_name: MultiHeadActionWrapper
67 |
68 |
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_BC_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: BC
3 | device: cuda
4 | env_name: CartPole-v1
5 | eval_eps: 10
6 | eval_per_episode: 1
7 | load_checkpoint: True
8 | load_path: Train_CartPole-v1_BC_20230319-190431
9 | max_steps: 200
10 | mode: test
11 | render: false
12 | save_fig: true
13 | seed: 1
14 | show_fig: false
15 | test_eps: 10
16 | train_eps: 1
17 | wrapper: null
18 | algo_cfg:
19 | actor_hidden_dim: 256
20 | lr: 0.0003
21 | critic_hidden_dim: 256
22 | batch_size: 256
23 | train_iterations: 500
24 | expert_path: tasks/Collect_CartPole-v1_PPO_20230319-170351/traj/traj.pkl # expert data path
25 |
26 |
27 |
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_BC_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: BC
3 | device: cuda
4 | env_name: CartPole-v1
5 | eval_eps: 10
6 | eval_per_episode: 1
7 | load_checkpoint: False
8 | load_path: Train_CartPole-v1_BC_20230319-114100
9 | max_steps: 200
10 | mode: train
11 | new_step_api: true
12 | render: false
13 | save_fig: true
14 | seed: 1
15 | show_fig: false
16 | test_eps: 10
17 | train_eps: 1
18 | wrapper: null
19 | algo_cfg:
20 | actor_hidden_dim: 256
21 | lr: 0.0003
22 | critic_hidden_dim: 256
23 | batch_size: 128
24 | train_iterations: 500
25 | expert_path: tasks/Collect_CartPole-v1_PPO_20230319-170351/traj/traj.pkl # expert data path
26 |
27 |
28 |
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_C51_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: C51
3 | device: cuda
4 | env_name: CartPole-v1
5 | mode: test
6 | load_checkpoint: true
7 | load_path: Train_CartPole-v1_C51_20230114-222523 # model path under tasks folder
8 | max_steps: 200
9 | save_fig: true
10 | seed: 0
11 | show_fig: false
12 | test_eps: 10
13 | train_eps: 100
14 | Vmin: 0 # support of C51
15 | Vmax: 200 # support of C51
16 | n_atoms: 51 # support of C51
17 | algo_cfg:
18 | batch_size: 64
19 | max_buffer_size: 100000
20 | epsilon_decay: 500
21 | epsilon_end: 0.01
22 | epsilon_start: 0.95
23 | gamma: 0.95
24 | lr: 0.0001
25 | target_update: 4
26 |
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_C51_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: C51
3 | device: cuda
4 | env_name: CartPole-v1
5 | new_step_api: True
6 | mode: train
7 | load_checkpoint: false
8 | load_path: Train_CartPole-v1_C51_20221026-054757
9 | max_steps: 200
10 | save_fig: true
11 | seed: 1
12 | show_fig: false
13 | test_eps: 10
14 | train_eps: 100
15 | Vmin: 0 # support of C51
16 | Vmax: 200 # support of C51
17 | n_atoms: 51 # support of C51
18 | algo_cfg:
19 | value_layers:
20 | - layer_type: linear
21 | layer_dim: ['n_states',256]
22 | activation: relu
23 | - layer_type: linear
24 | layer_dim: [256,256]
25 | activation: relu
26 | - layer_type: linear
27 | layer_dim: [256,'n_actions']
28 | activation: none
29 | batch_size: 64
30 | max_buffer_size: 100000
31 | epsilon_decay: 500
32 | epsilon_end: 0.01
33 | epsilon_start: 0.95
34 | gamma: 0.95
35 | lr: 0.0001
36 | target_update: 4
37 |
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_CQL_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: CQL
3 | device: cpu
4 | env_name: CartPole-v1
5 | eval_eps: 10
6 | eval_per_episode: 5
7 | load_checkpoint: true
8 | load_path: Train_CartPole-v1_CQL_20230408-183652
9 | max_steps: 200
10 | mode: test
11 | new_step_api: true
12 | render: false
13 | save_fig: true
14 | seed: 1
15 | show_fig: true
16 | test_eps: 10
17 | train_eps: 300
18 | wrapper: null
19 | algo_cfg:
20 | batch_size: 64
21 | max_buffer_size: 100000
22 | epsilon_decay: 500
23 | epsilon_end: 0.01
24 | epsilon_start: 0.95
25 | gamma: 0.99
26 | lr: 0.001
27 | target_update: 4
28 | tau: 0.001
29 | value_layers:
30 | - activation: relu
31 | layer_dim:
32 | - n_states
33 | - 256
34 | layer_type: linear
35 | - activation: relu
36 | layer_dim:
37 | - 256
38 | - 256
39 | layer_type: linear
40 | - activation: none
41 | layer_dim:
42 | - 256
43 | - n_actions
44 | layer_type: linear
45 |
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_CQL_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: CQL
3 | device: cpu
4 | env_name: CartPole-v1
5 | eval_eps: 10
6 | eval_per_episode: 5
7 | load_checkpoint: false
8 | load_path: tasks
9 | max_steps: 200
10 | mode: train
11 | new_step_api: true
12 | render: false
13 | save_fig: true
14 | seed: 1
15 | show_fig: true
16 | test_eps: 10
17 | train_eps: 300
18 | wrapper: null
19 | algo_cfg:
20 | batch_size: 64
21 | max_buffer_size: 100000
22 | epsilon_decay: 500
23 | epsilon_end: 0.01
24 | epsilon_start: 0.95
25 | gamma: 0.99
26 | lr: 0.001
27 | target_update: 4
28 | tau: 0.001
29 | value_layers:
30 | - activation: relu
31 | layer_dim:
32 | - n_states
33 | - 256
34 | layer_type: linear
35 | - activation: relu
36 | layer_dim:
37 | - 256
38 | - 256
39 | layer_type: linear
40 | - activation: none
41 | layer_dim:
42 | - 256
43 | - n_actions
44 | layer_type: linear
45 |
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_CategoricalDQN.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.8
3 | algo_name: CategoricalDQN
4 | env_name: gym
5 | interactor_device: cpu
6 | learner_device: cpu
7 | mode: train
8 | is_learner_async: false
9 | collect_traj: false
10 | n_interactors: 1
11 | load_checkpoint: false
12 | load_path: Train_single_CartPole-v1_DQN_20230515-211721
13 | load_model_step: best
14 | reward_threshold: 200
15 | max_episode: -1
16 | max_step: 200
17 | seed: 1
18 | online_eval: true
19 | online_eval_episode: 10
20 | model_save_fre: 500
21 | policy_summary_fre: 100
22 |
23 | algo_cfg:
24 | enable_soft_update: True
25 | distributional: True
26 | learn_frequency: 1
27 | # branch_layers:
28 | # - name: feature_1
29 | # layers:
30 | # - layer_type: linear
31 | # layer_size: [256]
32 | # activation: relu
33 | # - layer_type: linear
34 | # layer_size: [256]
35 | # activation: relu
36 | # - name: feature_2
37 | # layers:
38 | # - layer_type: linear
39 | # layer_size: [256]
40 | # activation: relu
41 | # - layer_type: linear
42 | # layer_size: [256]
43 | # activation: relu
44 | merge_layers:
45 | - layer_type: linear
46 | layer_size: [256]
47 | activation: relu
48 | - layer_type: linear
49 | layer_size: [256]
50 | activation: relu
51 | batch_size: 128
52 | buffer_type: REPLAY_QUE
53 | max_buffer_size: 100000
54 | epsilon_decay: 500
55 | epsilon_end: 0.01
56 | epsilon_start: 0.95
57 | gamma: 0.95
58 | lr: 0.0001
59 | target_update: 4
60 | env_cfg:
61 | id: CartPole-v1
62 | render_mode: null
63 | wrappers:
64 | - wrapper_name: MultiHeadObsWrapper
65 | - wrapper_name: MultiHeadActionWrapper
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_DQN.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.5
3 | algo_name: DQN
4 | env_name: gym
5 | interactor_device: cpu
6 | learner_device: cpu
7 | mode: train
8 | is_learner_async: false
9 | collect_traj: false
10 | n_interactors: 1
11 | load_checkpoint: false
12 | load_path: Train_single_CartPole-v1_DQN_20230515-211721
13 | load_model_step: best
14 | reward_threshold: 200
15 | max_episode: -1
16 | max_step: 200
17 | seed: 1
18 | online_eval: true
19 | online_eval_episode: 10
20 | model_save_fre: 500
21 | policy_summary_fre: 100
22 |
23 | algo_cfg:
24 | learn_frequency: 1
25 | # branch_layers:
26 | # - name: feature_1
27 | # layers:
28 | # - layer_type: linear
29 | # layer_size: [256]
30 | # activation: relu
31 | # - layer_type: linear
32 | # layer_size: [256]
33 | # activation: relu
34 | # - name: feature_2
35 | # layers:
36 | # - layer_type: linear
37 | # layer_size: [256]
38 | # activation: relu
39 | # - layer_type: linear
40 | # layer_size: [256]
41 | # activation: relu
42 | merge_layers:
43 | - layer_type: linear
44 | layer_size: [256]
45 | activation: relu
46 | - layer_type: linear
47 | layer_size: [256]
48 | activation: relu
49 | batch_size: 128
50 | buffer_type: REPLAY_QUE
51 | max_buffer_size: 100000
52 | epsilon_decay: 500
53 | epsilon_end: 0.01
54 | epsilon_start: 0.95
55 | gamma: 0.95
56 | lr: 0.0001
57 | target_update: 4
58 | env_cfg:
59 | id: CartPole-v1
60 | render_mode: null
61 | wrappers:
62 | - wrapper_name: MultiHeadObsWrapper
63 | - wrapper_name: MultiHeadActionWrapper
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_DQN_1.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.5
3 | algo_name: DQN
4 | env_name: gym
5 | device: cpu
6 | mode: train
7 | is_learner_async: true
8 | collect_traj: false
9 | n_interactors: 5
10 | load_checkpoint: false
11 | load_path: Train_single_CartPole-v1_DQN_20230515-211721
12 | load_model_step: best
13 | max_episode: -1
14 | max_step: 200
15 | seed: 1
16 | online_eval: true
17 | online_eval_episode: 10
18 | model_save_fre: 500
19 | interact_summary_fre: 10
20 | policy_summary_fre: 100
21 |
22 | algo_cfg:
23 | exps_trucation_size: 20
24 | learn_frequency: 200
25 | # branch_layers:
26 | # - name: feature_1
27 | # layers:
28 | # - layer_type: linear
29 | # layer_size: [256]
30 | # activation: relu
31 | # - layer_type: linear
32 | # layer_size: [256]
33 | # activation: relu
34 | # - name: feature_2
35 | # layers:
36 | # - layer_type: linear
37 | # layer_size: [256]
38 | # activation: relu
39 | # - layer_type: linear
40 | # layer_size: [256]
41 | # activation: relu
42 | merge_layers:
43 | - layer_type: linear
44 | layer_size: [256]
45 | activation: relu
46 | - layer_type: linear
47 | layer_size: [256]
48 | activation: relu
49 | batch_size: 128
50 | buffer_type: REPLAY_QUE
51 | max_buffer_size: 100000
52 | epsilon_decay: 500
53 | epsilon_end: 0.01
54 | epsilon_start: 0.95
55 | gamma: 0.95
56 | lr: 0.0001
57 | target_update: 4
58 | env_cfg:
59 | id: CartPole-v1
60 | render_mode: null
61 | wrappers:
62 | - wrapper_name: MultiHeadObsWrapper
63 | - wrapper_name: MultiHeadActionWrapper
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_DRQN_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: DRQN
3 | device: cuda
4 | env_name: CartPole-v1
5 | mode: test
6 | load_checkpoint: true
7 | load_path: Train_CartPole-v1_DRQN_20230204-223146 # model path under tasks folder
8 | max_steps: 200
9 | save_fig: true
10 | seed: 0
11 | show_fig: false
12 | test_eps: 10
13 | train_eps: 100
14 | algo_cfg:
15 | batch_size: 64
16 | max_buffer_size: 100000
17 | epsilon_decay: 0.995
18 | epsilon_end: 0.001
19 | epsilon_start: 0.1
20 | gamma: 0.99
21 | lr: 0.001
22 | target_update: 4
23 |
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_DRQN_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: DRQN
3 | device: cuda
4 | env_name: CartPole-v1
5 | mode: train
6 | load_checkpoint: false
7 | load_path: Train_CartPole-v1_
8 | max_steps: 200
9 | save_fig: true
10 | seed: 1
11 | show_fig: false
12 | test_eps: 10
13 | train_eps: 200 ###
14 | algo_cfg:
15 | value_layers:
16 | - layer_type: linear
17 | layer_dim: ['n_states',64]
18 | activation: relu
19 | - layer_type: linear
20 | layer_dim: [64,64]
21 | activation: relu
22 | - layer_type: linear
23 | layer_dim: [64,'n_actions']
24 | activation: none
25 | batch_size: 8
26 | min_epi_num: 16
27 | max_epi_len: 100
28 | lookup_step: 10
29 | max_epi_num: 100
30 |
31 | max_buffer_size: 100000
32 | epsilon_decay: 0.995
33 | epsilon_end: 0.001
34 | epsilon_start: 0.1
35 | gamma: 0.99
36 | lr: 0.001
37 | target_update: 4
38 |
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_DoubleDQN.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.5
3 | algo_name: DoubleDQN
4 | env_name: gym
5 | interactor_device: cpu
6 | learner_device: cpu
7 | seed: 1
8 | mode: train
9 | max_episode: -1
10 | max_step: 200
11 | load_checkpoint: false
12 | load_path: Train_CartPole-v1_DQN_20221026-054757
13 | load_model_step: best
14 | online_eval: true
15 | online_eval_episode: 10
16 | model_save_fre: 500
17 | policy_summary_fre: 100
18 |
19 | algo_cfg:
20 | learn_frequency: 1
21 | # branch_layers:
22 | # - name: feature_1
23 | # layers:
24 | # - layer_type: linear
25 | # layer_size: [256]
26 | # activation: relu
27 | # - layer_type: linear
28 | # layer_size: [256]
29 | # activation: relu
30 | # - name: feature_2
31 | # layers:
32 | # - layer_type: linear
33 | # layer_size: [256]
34 | # activation: relu
35 | # - layer_type: linear
36 | # layer_size: [256]
37 | # activation: relu
38 | merge_layers:
39 | - layer_type: linear
40 | layer_size: [256]
41 | activation: relu
42 | - layer_type: linear
43 | layer_size: [256]
44 | activation: relu
45 | batch_size: 128
46 | buffer_type: REPLAY_QUE
47 | max_buffer_size: 100000
48 | epsilon_decay: 500
49 | epsilon_end: 0.01
50 | epsilon_start: 0.95
51 | gamma: 0.99
52 | lr: 0.0001
53 | target_update: 4
54 | env_cfg:
55 | id: CartPole-v1
56 | render_mode: null
57 | wrappers:
58 | - wrapper_name: MultiHeadObsWrapper
59 | - wrapper_name: MultiHeadActionWrapper
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_DuelingDQN.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.5
3 | algo_name: DuelingDQN
4 | env_name: gym
5 | interactor_device: cpu
6 | learner_device: cpu
7 | seed: 1
8 | mode: train
9 | max_episode: -1
10 | max_step: 200
11 | load_checkpoint: false
12 | load_path: Train_CartPole-v1_DQN_20221026-054757
13 | load_model_step: best
14 | online_eval: true
15 | online_eval_episode: 10
16 | model_save_fre: 500
17 | policy_summary_fre: 100
18 | algo_cfg:
19 | dueling: true
20 | merge_layers:
21 | - layer_type: linear
22 | layer_size: [256]
23 | activation: relu
24 | - layer_type: linear
25 | layer_size: [256]
26 | activation: relu
27 | batch_size: 128
28 | buffer_type: REPLAY_QUE
29 | max_buffer_size: 100000
30 | epsilon_decay: 500
31 | epsilon_end: 0.01
32 | epsilon_start: 0.95
33 | gamma: 0.95
34 | lr: 0.0001
35 | target_update: 4
36 | env_cfg:
37 | id: CartPole-v1
38 | render_mode: null
39 | wrappers:
40 | - wrapper_name: MultiHeadObsWrapper
41 | - wrapper_name: MultiHeadActionWrapper
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_GAIL_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: GAIL
3 | batch_size: 2048
4 | device: cuda
5 | discount: 0.99
6 | env_name: CartPole-v1
7 | eval_eps: 10
8 | eval_per_episode: 5
9 | load_checkpoint: true
10 | load_path: Train_CartPole-v1_GAIL_20221207-160945
11 | max_steps: 200
12 | mode: test
13 | hidden_dim: 32
14 | lr: 0.001
15 | imitation_replay_size: 4
16 | imitation_epochs: 5
17 | imitation_batch_size: 128
18 | new_step_api: true
19 | ppo_epochs: 5
20 | render: false
21 | r1_reg_coeff: 1
22 | save_fig: true
23 | seed: 1
24 | show_fig: false
25 | test_eps: 10
26 | train_eps: 400
27 | wrapper: null
28 | algo_cfg:
29 |
30 |
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_GAIL_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: GAIL
3 | adversarial_batch_size: 128
4 | adversarial_epochs: 5
5 | batch_size: 2048
6 | device: cuda
7 | discount: 0.99
8 | env_name: CartPole-v1
9 | eval_eps: 10
10 | eval_per_episode: 5
11 | load_checkpoint: false
12 | load_path: Collect_CartPole-v1_PPO_20221206-173222
13 | max_steps: 200
14 | mode: train
15 | new_step_api: true
16 | num_workers: 8
17 | hidden_dim: 32
18 | lr: 0.001
19 | imitation_replay_size: 4
20 | r1_reg_coeff: 1
21 | render: false
22 | ppo_epochs: 5
23 | save_fig: true
24 | seed: 1
25 | show_fig: false
26 | test_eps: 10
27 | train_eps: 1500
28 | wrapper: null
29 | algo_cfg:
30 |
31 |
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_NoisyDQN.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.4.1
3 | algo_name: NoisyDQN
4 | env_name: gym
5 | interactor_device: cpu
6 | learner_device: cpu
7 | seed: 1
8 | mode: train
9 | collect_traj: false
10 | max_episode: -1
11 | max_step: 200
12 | load_checkpoint: false
13 | load_path: Train_CartPole-v1_NoisyDQN_20231225-000846
14 | load_model_step: best
15 | online_eval: true
16 | online_eval_episode: 10
17 | model_save_fre: 500
18 | policy_summary_fre: 100
19 | algo_cfg:
20 | merge_layers:
21 | - layer_type: noisy_linear
22 | layer_size: [256]
23 | activation: relu
24 | std_init: 0.4
25 | - layer_type: noisy_linear
26 | layer_size: [256]
27 | activation: relu
28 | std_init: 0.4
29 | batch_size: 128
30 | buffer_type: REPLAY_QUE
31 | max_buffer_size: 100000
32 | epsilon_decay: 500
33 | epsilon_end: 0.01
34 | epsilon_start: 0.95
35 | gamma: 0.99
36 | lr: 0.0001
37 | target_update: 4
38 | env_cfg:
39 | id: CartPole-v1
40 | render_mode: null
41 | wrappers:
42 | - wrapper_name: MultiHeadObsWrapper
43 | - wrapper_name: MultiHeadActionWrapper
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_PER_DQN.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: PER_DQN # algo name
3 | env_name: gym # env name, differ from env_id in env_cfgs
4 | device: cuda # device, cpu or cuda
5 | mode: test # run mode: train, test
6 | collect_traj: false # if collect trajectories or not
7 | mp_backend: ray # multi-processing mode: single(default), ray
8 | n_workers: 2 # number of workers if using multi-processing, default 1
9 | load_checkpoint: false # if load checkpoint or not
10 | load_path: Train_single_CartPole-v1_PER_DQN_20230518-232215 # if load checkpoint, then config path in 'tasks' dir
11 | load_model_step: best # load model step
12 | max_episode: 100 # max episodes, set -1 to keep running
13 | max_step: 200 # max steps per episode
14 | seed: 1 # random seed, set 0 not to use seed
15 | online_eval: true # if online eval or not
16 | online_eval_episode: 10 # online eval episodes
17 | model_save_fre: 500 # update step frequency of saving model
18 |
19 | algo_cfg:
20 | value_layers:
21 | - layer_type: linear
22 | layer_size: [256]
23 | activation: relu
24 | - layer_type: linear
25 | layer_size: [256]
26 | activation: relu
27 | batch_size: 64
28 | buffer_type: PER_QUE
29 | max_buffer_size: 100000
30 | per_alpha: 0.6
31 | per_beta: 0.4
32 | per_beta_annealing: 0.001
33 | per_epsilon: 0.01
34 | epsilon_decay: 1000
35 | epsilon_end: 0.01
36 | epsilon_start: 0.95
37 | gamma: 0.99
38 | lr: 0.0001
39 | target_update: 4
40 | env_cfg:
41 | id: CartPole-v1
42 | render_mode: null
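
The per_* keys above belong to proportional prioritized replay (buffer_type: PER_QUE). The numpy sketch below only illustrates how per_alpha, per_beta, per_beta_annealing and per_epsilon are conventionally turned into sampling probabilities and importance weights; it is not joyrl's buffer implementation.

import numpy as np

def per_sample(td_errors, batch_size, alpha=0.6, beta=0.4, eps=0.01):
    """Proportional prioritized sampling -- illustrative, not joyrl's PER_QUE."""
    priorities = (np.abs(td_errors) + eps) ** alpha        # p_i = (|delta_i| + per_epsilon)^per_alpha
    probs = priorities / priorities.sum()                  # P(i) = p_i / sum_k p_k
    idxs = np.random.choice(len(td_errors), size=batch_size, p=probs)
    weights = (len(td_errors) * probs[idxs]) ** (-beta)    # importance-sampling correction
    weights /= weights.max()                               # normalize to [0, 1] for stability
    return idxs, weights

# per_beta is annealed towards 1 over training, e.g. beta = min(1.0, beta + per_beta_annealing)
idxs, weights = per_sample(np.random.randn(1000), batch_size=64)
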
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_PPO.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.5
3 | algo_name: PPO
4 | env_name: gym
5 | device: cpu
6 | mode: train
7 | exps_trucation_size: 200
8 | is_learner_async: false
9 | load_checkpoint: false
10 | load_path: Train_CartPole-v1_PPO_20231225-124842 # if load checkpoint, then config path in 'tasks' dir
11 | load_model_step: best
12 | n_interactors: 10
13 | max_episode: -1
14 | max_step: 200
15 | seed: 1
16 | online_eval: true
17 | online_eval_episode: 15
18 | model_save_fre: 10
19 | policy_summary_fre: 10
20 | interact_summary_fre: 100
21 | algo_cfg:
22 | independ_actor: false
23 | return_form: td
24 | actor_branch_layers:
25 | - name: feature_1
26 | layers:
27 | - layer_type: linear
28 | layer_size: [256]
29 | activation: relu
30 | - layer_type: linear
31 | layer_size: [256]
32 | activation: relu
33 | critic_branch_layers:
34 | - name: feature_1
35 | layers:
36 | - layer_type: linear
37 | layer_size: [256]
38 | activation: relu
39 | - layer_type: linear
40 | layer_size: [256]
41 | activation: relu
42 | branch_layers:
43 | - name: state
44 | layers:
45 | - layer_type: linear
46 | layer_size: [256]
47 | activation: relu
48 | - layer_type: linear
49 | layer_size: [256]
50 | activation: relu
51 | buffer_type: ONPOLICY_QUE
52 | lr: 0.0003
53 | actor_lr: 0.0003
54 | critic_lr: 0.001
55 | entropy_coef: 0.001
56 | critic_loss_coef: 0.001
57 | eps_clip: 0.1
58 | gamma: 0.95
59 | gae_lambda: 0.95
60 | k_epochs: 4
61 | batch_size: 2000
62 | sgd_batch_size: 200
63 | # min_policy: 0.001
64 | env_cfg:
65 | id: CartPole-v1
66 | render_mode: null
67 | wrappers:
68 | - wrapper_name: MultiHeadObsWrapper
69 | - wrapper_name: MultiHeadActionWrapper
70 |
71 |
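
In this preset, batch_size: 2000 is the rollout size, split into sgd_batch_size: 200 minibatches that are reused for k_epochs: 4 passes. The clipped-surrogate loss that eps_clip, entropy_coef and critic_loss_coef enter is sketched below in PyTorch; it is the textbook PPO objective, not a copy of joyrl's code.

import torch

def ppo_loss(new_logprob, old_logprob, advantage, value, value_target, entropy,
             eps_clip=0.1, entropy_coef=0.001, critic_loss_coef=0.001):
    """Clipped-surrogate PPO loss for one SGD minibatch -- illustrative sketch."""
    ratio = (new_logprob - old_logprob).exp()                          # pi_new(a|s) / pi_old(a|s)
    surr1 = ratio * advantage
    surr2 = torch.clamp(ratio, 1.0 - eps_clip, 1.0 + eps_clip) * advantage
    actor_loss = -torch.min(surr1, surr2).mean()                       # clipped policy objective
    critic_loss = (value - value_target).pow(2).mean()                 # value-function regression
    return actor_loss + critic_loss_coef * critic_loss - entropy_coef * entropy.mean()
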
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_PPO_off_policy.yaml:
--------------------------------------------------------------------------------
1 |
2 | general_cfg:
3 | joyrl_version: 0.6.5
4 | algo_name: PPO
5 | env_name: gym
6 | device: cpu
7 | mode: train
8 | exps_trucation_size: 200
9 | is_learner_async: true
10 | load_checkpoint: false
11 | load_path: Train_CartPole-v1_PPO_20231225-124842 # if load checkpoint, then config path in 'tasks' dir
12 | load_model_step: best
13 | n_interactors: 10
14 | max_episode: -1
15 | max_step: 200
16 | seed: 1
17 | online_eval: true
18 | online_eval_episode: 15
19 | model_save_fre: 10
20 | policy_summary_fre: 10
21 | interact_summary_fre: 100
22 | algo_cfg:
23 | independ_actor: false
24 | return_form: td
25 | actor_branch_layers:
26 | - name: feature_1
27 | layers:
28 | - layer_type: linear
29 | layer_size: [256]
30 | activation: relu
31 | - layer_type: linear
32 | layer_size: [256]
33 | activation: relu
34 | critic_branch_layers:
35 | - name: feature_1
36 | layers:
37 | - layer_type: linear
38 | layer_size: [256]
39 | activation: relu
40 | - layer_type: linear
41 | layer_size: [256]
42 | activation: relu
43 | branch_layers:
44 | - name: state
45 | layers:
46 | - layer_type: linear
47 | layer_size: [256]
48 | activation: relu
49 | - layer_type: linear
50 | layer_size: [256]
51 | activation: relu
52 | buffer_type: REPLAY_QUE
53 | max_buffer_size: 4000
54 | lr: 0.0003
55 | actor_lr: 0.0003
56 | critic_lr: 0.001
57 | entropy_coef: 0.001
58 | critic_loss_coef: 0.001
59 | eps_clip: 0.1
60 | gamma: 0.95
61 | gae_lambda: 0.95
62 | k_epochs: 4
63 | batch_size: 2000
64 | sgd_batch_size: 200
65 | # min_policy: 0.001
66 | env_cfg:
67 | id: CartPole-v1
68 | render_mode: null
69 | wrappers:
70 | - wrapper_name: MultiHeadObsWrapper
71 | - wrapper_name: MultiHeadActionWrapper
72 |
73 |
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_REINFORCE_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: REINFORCE
3 | device: cpu
4 | env_name: CartPole-v1
5 | mode: test
6 | load_checkpoint: true
7 | load_path: Train_CartPole-v1_REINFORCE_20221203-143307
8 | max_steps: 200
9 | save_fig: true
10 | seed: 1
11 | show_fig: false
12 | test_eps: 10
13 | train_eps: 200
14 | algo_cfg:
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_REINFORCE_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: REINFORCE
3 | device: cpu
4 | env_name: CartPole-v1
5 | eval_eps: 10
6 | eval_per_episode: 5
7 | load_checkpoint: false
8 | load_path: Train_CartPole-v1_DQN_20221026-054757
9 | max_steps: 200
10 | mode: train
11 | new_step_api: true
12 | render: false
13 | save_fig: true
14 | seed: 1
15 | show_fig: false
16 | test_eps: 10
17 | train_eps: 200
18 | wrapper: null
19 | algo_cfg:
20 | gamma: 0.99
21 | hidden_dim: 36
22 | lr: 0.01
23 | update_freq: 200
24 |
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_RainbowDQN_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: RainbowDQN
3 | device: cuda
4 | env_name: CartPole-v1
5 | mode: test
6 | load_checkpoint: true
7 | load_path: Train_CartPole-v1_RainbowDQN_20230114-222012 # model path under tasks folder
8 | max_steps: 200
9 | save_fig: true
10 | seed: 0
11 | show_fig: false
12 | test_eps: 10
13 | train_eps: 100
14 | Vmin: 0 # lower bound of the C51 value support
15 | Vmax: 200 # upper bound of the C51 value support
16 | n_atoms: 51 # number of atoms in the C51 value support
17 | algo_cfg:
18 | batch_size: 64
19 | max_buffer_size: 100000
20 | epsilon_decay: 500
21 | epsilon_end: 0.01
22 | epsilon_start: 0.95
23 | gamma: 0.95
24 | lr: 0.0001
25 | target_update: 4
26 |
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_RainbowDQN_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: RainbowDQN
3 | device: cpu
4 | env_name: gym
5 | mode: train
6 | load_checkpoint: false
7 | load_path: Train_CartPole-v1_
8 | max_steps: 200
9 | save_fig: true
10 | seed: 1
11 | show_fig: false
12 | test_eps: 10
13 | train_eps: 100
14 | Vmin: 0 # lower bound of the C51 value support
15 | Vmax: 200 # upper bound of the C51 value support
16 | n_atoms: 51 # number of atoms in the C51 value support
17 | algo_cfg:
18 | value_layers:
19 | - layer_type: linear
20 | layer_dim: ['n_states',256]
21 | activation: relu
22 | - layer_type: linear
23 | layer_dim: [256,256]
24 | activation: relu
25 | - layer_type: linear
26 | layer_dim: [256,'n_actions']
27 | activation: none
28 | batch_size: 64
29 | max_buffer_size: 100000
30 | epsilon_decay: 500
31 | epsilon_end: 0.01
32 | epsilon_start: 0.95
33 | gamma: 0.95
34 | lr: 0.0001
35 | target_update: 4
36 | env_cfg:
37 | id: CartPole-v1
38 | new_step_api: true
39 | render_mode: null
40 |
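
Vmin, Vmax and n_atoms above define the fixed value support of the C51/Rainbow head. A short sketch of that support and of reading Q-values off a categorical output; the distributional projection step of the full update is deliberately left out.

import torch

v_min, v_max, n_atoms = 0.0, 200.0, 51                 # Vmin, Vmax, n_atoms from the preset
support = torch.linspace(v_min, v_max, n_atoms)        # fixed atom locations z_1 .. z_51
delta_z = (v_max - v_min) / (n_atoms - 1)              # atom spacing (4.0 here)

# probs: (batch, n_actions, n_atoms), softmax output of a distributional value head
probs = torch.softmax(torch.randn(32, 2, n_atoms), dim=-1)
q_values = (probs * support).sum(dim=-1)               # Q(s, a) = sum_i z_i * p_i(s, a)
greedy_action = q_values.argmax(dim=-1)
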
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_RainbowDQN_Train_mp.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: RainbowDQN
3 | device: cpu
4 | env_name: gym
5 | mode: train
6 | mp_backend: mp # multiprocessing backend: ray or mp (multiprocessing), defaults to mp
7 | n_workers: 2 # number of workers for parallel training
8 | load_checkpoint: false
9 | load_path: Train_CartPole-v1_
10 | max_steps: 200
11 | save_fig: true
12 | seed: 1
13 | show_fig: false
14 | test_eps: 10
15 | train_eps: 100
16 | Vmin: 0 # lower bound of the C51 value support
17 | Vmax: 200 # upper bound of the C51 value support
18 | n_atoms: 51 # number of atoms in the C51 value support
19 | algo_cfg:
20 | value_layers:
21 | - layer_type: linear
22 | layer_dim: ['n_states',256]
23 | activation: relu
24 | - layer_type: linear
25 | layer_dim: [256,256]
26 | activation: relu
27 | - layer_type: linear
28 | layer_dim: [256,'n_actions']
29 | activation: none
30 | batch_size: 64
31 | max_buffer_size: 100000
32 | epsilon_decay: 500
33 | epsilon_end: 0.01
34 | epsilon_start: 0.95
35 | gamma: 0.95
36 | lr: 0.0001
37 | target_update: 4
38 | env_cfg:
39 | id: CartPole-v1
40 | new_step_api: true
41 | render_mode: null
42 |
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_SAC_D_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: SAC_D
3 | device: cuda
4 | env_name: CartPole-v1
5 | mode: test
6 | load_checkpoint: true
7 | load_path: Train_CartPole-v1_SAC_D_20230305-112849 # model path under tasks folder
8 | max_steps: 200
9 | save_fig: true
10 | seed: 0
11 | show_fig: false
12 | test_eps: 10
13 | train_eps: 200
14 | algo_cfg:
15 | batch_size: 64
16 | max_buffer_size: 100000
17 | alpha: 0.2
18 | epsilon_decay: 500
19 | epsilon_end: 0.01
20 | epsilon_start: 0.95
21 | gamma: 0.95
22 | lr: 0.0001
23 | target_update: 1
24 |
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_SAC_D_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: SAC_D
3 | device: cuda
4 | env_name: CartPole-v1
5 | new_step_api: True
6 | wrapper: null
7 | mode: train
8 | load_checkpoint: false
9 | load_path: Train_CartPole-v1_DQN_20221026-054757
10 | max_steps: 200
11 | save_fig: true
12 | seed: 0
13 | show_fig: false
14 | test_eps: 10
15 | train_eps: 200
16 | algo_cfg:
17 | alpha: 0.2
18 | epsilon_decay: 500
19 | epsilon_end: 0.01
20 | epsilon_start: 0.95
21 | gamma: 0.95
22 | lr: 0.0001
23 | target_update: 1
--------------------------------------------------------------------------------
/presets/ClassControl/CartPole-v1/CartPole-v1_SoftQ.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.5
3 | algo_name: SoftQ
4 | env_name: gym
5 | interactor_device: cpu
6 | learner_device: cpu
7 | mode: train
8 | is_learner_async: false
9 | collect_traj: false
10 | n_interactors: 1
11 | load_checkpoint: false
12 | load_path: Train_single_CartPole-v1_DQN_20230515-211721
13 | load_model_step: best
14 | reward_threshold: 200
15 | max_episode: -1
16 | max_step: 200
17 | seed: 1
18 | online_eval: true
19 | online_eval_episode: 10
20 | model_save_fre: 500
21 | policy_summary_fre: 100
22 |
23 | algo_cfg:
24 | alpha: 4
25 | learn_frequency: 1
26 | # branch_layers:
27 | # - name: feature_1
28 | # layers:
29 | # - layer_type: linear
30 | # layer_size: [256]
31 | # activation: relu
32 | # - layer_type: linear
33 | # layer_size: [256]
34 | # activation: relu
35 | # - name: feature_2
36 | # layers:
37 | # - layer_type: linear
38 | # layer_size: [256]
39 | # activation: relu
40 | # - layer_type: linear
41 | # layer_size: [256]
42 | # activation: relu
43 | merge_layers:
44 | - layer_type: linear
45 | layer_size: [256]
46 | activation: relu
47 | - layer_type: linear
48 | layer_size: [256]
49 | activation: relu
50 | batch_size: 128
51 | buffer_type: REPLAY_QUE
52 | max_buffer_size: 100000
53 | epsilon_decay: 500
54 | epsilon_end: 0.01
55 | epsilon_start: 0.95
56 | gamma: 0.95
57 | lr: 0.0001
58 | target_update: 4
59 | env_cfg:
60 | id: CartPole-v1
61 | render_mode: null
62 | wrappers:
63 | - wrapper_name: MultiHeadObsWrapper
64 | - wrapper_name: MultiHeadActionWrapper
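
In the SoftQ preset, alpha acts as an entropy temperature. The sketch below shows the soft state value and Boltzmann policy that such a temperature conventionally implies; whether joyrl's SoftQ policy follows exactly this form is an assumption.

import torch

alpha = 4.0                                        # entropy temperature from the preset
q = torch.randn(32, 2)                             # Q(s, a) for a batch of states
v = alpha * torch.logsumexp(q / alpha, dim=-1)     # soft value: V(s) = alpha * log sum_a exp(Q(s,a)/alpha)
pi = torch.softmax(q / alpha, dim=-1)              # Boltzmann policy over actions
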
--------------------------------------------------------------------------------
/presets/ClassControl/MountainCar-v0/MountainCar-v0_DQN.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.2.5
3 | algo_name: DQN
4 | env_name: gym
5 | interactor_device: cpu
6 | learner_device: cpu
7 | mode: train
8 | is_learner_async: false
9 | collect_traj: false
10 | n_interactors: 1
11 | load_checkpoint: false
12 | load_path: "Train_MountainCar-v0_DQN"
13 | load_model_step: "best"
14 | max_episode: -1
15 | max_step: 200
16 | seed: 1
17 | online_eval: true
18 | online_eval_episode: 10
19 | algo_cfg:
20 | learn_frequency: 1
21 | merge_layers:
22 | - layer_type: linear
23 | layer_size: [256]
24 | activation: relu
25 | - layer_type: linear
26 | layer_size: [256]
27 | activation: relu
28 | batch_size: 64
29 | max_buffer_size: 10000
30 | epsilon_decay: 1500
31 | epsilon_end: 0.01
32 | epsilon_start: 0.98
33 | gamma: 0.98
34 | lr: 0.001
35 | target_update: 10
36 | env_cfg:
37 | id: MountainCar-v0
38 | render_mode: null
39 | wrappers:
40 | - wrapper_name: MultiHeadActionWrapper
41 |
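
epsilon_start, epsilon_end and epsilon_decay above parameterize epsilon-greedy exploration. A common schedule, and the one this sketch assumes (joyrl's exact formula is not reproduced here), decays the rate exponentially in the number of sampled steps:

import math

def epsilon_by_step(step, eps_start=0.98, eps_end=0.01, eps_decay=1500):
    """Exponentially decayed exploration rate (assumed schedule, for illustration)."""
    return eps_end + (eps_start - eps_end) * math.exp(-step / eps_decay)

print(epsilon_by_step(0), epsilon_by_step(1500), epsilon_by_step(10000))
# roughly 0.98 at the start, ~0.37 after one decay constant, ~0.01 in the limit
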
--------------------------------------------------------------------------------
/presets/ClassControl/MountainCar-v0/MountainCar-v0_PPO.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.5
3 | algo_name: PPO
4 | env_name: gym
5 | device: cpu
6 | interactor_device: cuda
7 | learner_device: cuda
8 | mode: train
9 | exps_trucation_size: 200
10 | is_learner_async: false
11 | load_checkpoint: false
12 | load_path: Train_MountainCar-v0_PPO_20240618-192707
13 | load_model_step: best
14 | n_interactors: 1
15 | max_episode: -1
16 | max_step: 200
17 | seed: 1
18 | online_eval: true
19 | online_eval_episode: 10
20 | reward_threshold: -110
21 | model_save_fre: 10
22 | policy_summary_fre: 5
23 | interact_summary_fre: 100
24 | algo_cfg:
25 | independ_actor: false
26 | return_form: td
27 | actor_branch_layers:
28 | - name: state
29 | layers:
30 | - layer_type: linear
31 | layer_size: [256]
32 | activation: relu
33 | critic_branch_layers:
34 | - name: state
35 | layers:
36 | - layer_type: linear
37 | layer_size: [256]
38 | activation: relu
39 | branch_layers:
40 | - name: state
41 | layers:
42 | - layer_type: linear
43 | layer_size: [256]
44 | activation: relu
45 | buffer_type: ONPOLICY_QUE
46 | eps_clip: 0.2
47 | entropy_coef: 0.002
48 | lr: 0.0003
49 | actor_lr: 0.003
50 | critic_lr: 0.01
51 | critic_loss_coef: 0.5
52 | gamma: 0.99
53 | gae_lambda: 0.95
54 | k_epochs: 4
55 | batch_size: 2000
56 | sgd_batch_size: 32
57 | env_cfg:
58 | id: MountainCar-v0
59 | render_mode: null
60 | wrappers:
61 | - wrapper_name: MultiHeadObsWrapper
62 | - wrapper_name: MultiHeadActionWrapper
63 |
--------------------------------------------------------------------------------
/presets/ClassControl/MountainCarContinuous-v0/MountainCarContinuous-v0_PPO-test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.5.1
3 | algo_name: PPO
4 | device: cpu
5 | env_name: gym
6 | interactor_device: cuda
7 | learner_device: cuda
8 | mode: test
9 | exps_trucation_size: 1024
10 | is_learner_async: false
11 | load_checkpoint: true
12 | load_path: Train_MountainCarContinuous-v0_PPO_20240715-161812
13 | load_model_step: best
14 | n_interactors: 20
15 | max_episode: -1
16 | max_step: -1
17 | seed: 1
18 | reward_threshold: 90
19 | online_eval: true
20 | online_eval_episode: 20
21 | model_save_fre: 10000
22 | policy_summary_fre: 5000
23 | interact_summary_fre: 5000
24 | algo_cfg:
25 | actor_branch_layers:
26 | - name: action
27 | layers:
28 | - layer_type: linear
29 | layer_size: [256]
30 | activation: tanh
31 | - layer_type: linear
32 | layer_size: [256]
33 | activation: tanh
34 |
35 | critic_branch_layers:
36 | - name: critic
37 | layers:
38 | - layer_type: linear
39 | layer_size: [256]
40 | activation: relu
41 | - layer_type: linear
42 | layer_size: [256]
43 | activation: relu
44 | buffer_type: REPLAY_QUE
45 | max_buffer_size: 100000
46 | action_type_list: continuous
47 | lr: 0.0003
48 | actor_lr: 0.003
49 | critic_lr: 0.005
50 | entropy_coef: 0.003
51 | critic_loss_coef: 0.5
52 | eps_clip: 0.2
53 | gamma: 0.99
54 | return_form: mc
55 | gae_lambda: 0.95
56 | k_epochs: 4
57 | batch_size: 64
58 | env_cfg:
59 | id: MountainCarContinuous-v0
60 | render_mode: human
61 | wrappers:
62 | - wrapper_name: MultiHeadObsWrapper
63 |
--------------------------------------------------------------------------------
/presets/ClassControl/MountainCarContinuous-v0/MountainCarContinuous-v0_PPO.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.5.1
3 | algo_name: PPO
4 | env_name: gym
5 | device: cpu
6 | interactor_device: cuda
7 | learner_device: cuda
8 | mode: train
9 | exps_trucation_size: 1024
10 | is_learner_async: false
11 | load_checkpoint: false
12 | load_path: Train_MountainCar-v0_PPO_20240618-192707
13 | load_model_step: best
14 | n_interactors: 1
15 | n_learners: 1
16 | max_episode: -1
17 | max_step: -1
18 | seed: 1
19 | reward_threshold: 90
20 | online_eval: true
21 | online_eval_episode: 20
22 | model_save_fre: 10000
23 | policy_summary_fre: 10000
24 | interact_summary_fre: 10000
25 | algo_cfg:
26 | actor_branch_layers:
27 | - name: action
28 | layers:
29 | - layer_type: linear
30 | layer_size: [256]
31 | activation: tanh
32 | - layer_type: linear
33 | layer_size: [256]
34 | activation: tanh
35 |
36 | critic_branch_layers:
37 | - name: critic
38 | layers:
39 | - layer_type: linear
40 | layer_size: [256]
41 | activation: relu
42 | - layer_type: linear
43 | layer_size: [256]
44 | activation: relu
45 | buffer_type: REPLAY_QUE
46 | max_buffer_size: 100000
47 | action_type_list: continuous
48 | lr: 0.0003
49 | actor_lr: 0.003
50 | critic_lr: 0.005
51 | entropy_coef: 0.003
52 | critic_loss_coef: 0.5
53 | eps_clip: 0.2
54 | gamma: 0.99
55 | return_form: mc
56 | gae_lambda: 0.95
57 | k_epochs: 4
58 | batch_size: 64
59 | action_std_bias: 0.8
60 | env_cfg:
61 | id: MountainCarContinuous-v0
62 | render_mode: null
63 | wrappers:
64 | - wrapper_name: MultiHeadObsWrapper
65 |
--------------------------------------------------------------------------------
/presets/ClassControl/Pendulum-v1/Pendulum-v1_DDPG.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.7.2
3 | algo_name: DDPG
4 | env_name: gym
5 | device: cpu
6 | mode: train
7 | load_checkpoint: false
8 | load_path: Train_ray_Pendulum-v1_DDPG_20230527-001715
9 | load_model_step: best
10 | max_episode: -1
11 | max_step: 200
12 | seed: 1
13 | online_eval: true
14 | online_eval_episode: 20
15 | model_save_fre: 500
16 | algo_cfg:
17 | action_space:
18 | type: [dpg]
19 | actor_branch_layers:
20 | - name: state
21 | layers:
22 | - layer_type: linear
23 | layer_size: [256]
24 | activation: relu
25 | - layer_type: linear
26 | layer_size: [256]
27 | activation: relu
28 | critic_branch_layers:
29 | - name: state
30 | layers:
31 | - layer_type: none
32 | - name: action
33 | layers:
34 | - layer_type: none
35 | critic_merge_layers:
36 | - layer_type: linear
37 | layer_size: [256]
38 | activation: relu
39 | - layer_type: linear
40 | layer_size: [256]
41 | activation: relu
42 | batch_size: 256
43 | buffer_type: REPLAY_QUE
44 | max_buffer_size: 8000
45 | actor_lr: 0.0001
46 | critic_lr: 0.001
47 | policy_loss_weight: 0.002
48 | gamma: 0.99
49 | tau: 0.01
50 | env_cfg:
51 | id: Pendulum-v1
52 | render_mode: null # null, rgb_array, human
53 | wrappers:
54 | - wrapper_name: MultiHeadObsWrapper
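
Here tau: 0.01 is the soft (Polyak) target-update coefficient that replaces the hard target_update of the DQN presets. A minimal PyTorch sketch with generic parameter names:

import torch

def soft_update(target_net: torch.nn.Module, online_net: torch.nn.Module, tau: float = 0.01) -> None:
    """theta_target <- tau * theta_online + (1 - tau) * theta_target -- illustrative sketch."""
    with torch.no_grad():
        for t_param, o_param in zip(target_net.parameters(), online_net.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * o_param)
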
--------------------------------------------------------------------------------
/presets/ClassControl/Pendulum-v1/Pendulum-v1_DDPG_HER_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: DDPG_HER
3 | device: cpu
4 | env_name: gym
5 | eval_eps: 10
6 | eval_per_episode: 5
7 | her_sample_num: 4
8 | load_checkpoint: true
9 | load_path: tasks
10 | max_steps: 200
11 | mode: test
12 | mp_backend: mp
13 | new_step_api: true
14 | render: false
15 | render_mode: human
16 | save_fig: true
17 | seed: 0
18 | show_fig: false
19 | test_eps: 20
20 | train_eps: 150
21 | update_every: 100
22 | wrapper: null
23 | algo_cfg:
24 | actor_hidden_dim: 256
25 | actor_lr: 0.0001
26 | batch_size: 128
27 | max_buffer_size: 8000
28 | critic_hidden_dim: 256
29 | critic_lr: 0.001
30 | gamma: 0.99
31 | tau: 0.001
32 | env_cfg:
33 | id: Pendulum-v1
--------------------------------------------------------------------------------
/presets/ClassControl/Pendulum-v1/Pendulum-v1_DDPG_HER_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: DDPG_HER
3 | device: cpu
4 | env_name: gym
5 | eval_eps: 10
6 | eval_per_episode: 5
7 | her_sample_num: 4
8 | load_checkpoint: false
9 | load_path: tasks
10 | max_steps: 200
11 | mode: train
12 | mp_backend: mp
13 | new_step_api: true
14 | render: false
15 | render_mode: human
16 | save_fig: true
17 | seed: 0
18 | show_fig: false
19 | test_eps: 20
20 | train_eps: 100
21 | update_every: 100
22 | wrapper: null
23 | algo_cfg:
24 | actor_hidden_dim: 256
25 | actor_lr: 0.0001
26 | batch_size: 128
27 | max_buffer_size: 8000
28 | critic_hidden_dim: 256
29 | critic_lr: 0.001
30 | gamma: 0.99
31 | tau: 0.001
32 | env_cfg:
33 | id: Pendulum-v1
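
her_sample_num: 4 above sets how many hindsight goals are relabelled per stored transition. The sketch below illustrates the common "future" relabelling strategy; the episode layout and the compute_reward hook are hypothetical stand-ins, not joyrl's data structures.

import random

def her_relabel(episode, her_sample_num=4, compute_reward=None):
    """Hindsight relabelling with the 'future' strategy -- schematic sketch only.

    episode: list of dicts with keys state, action, next_state, achieved_goal, goal.
    compute_reward(achieved_goal, goal) is an environment-specific hook (hypothetical here).
    """
    relabelled = []
    for t, transition in enumerate(episode):
        for _ in range(her_sample_num):                       # her_sample_num new goals per step
            future = random.randint(t, len(episode) - 1)      # sample a later timestep
            new_goal = episode[future]["achieved_goal"]       # pretend that outcome was the goal
            relabelled.append(dict(transition,
                                   goal=new_goal,
                                   reward=compute_reward(transition["achieved_goal"], new_goal)))
    return relabelled
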
--------------------------------------------------------------------------------
/presets/ClassControl/Pendulum-v1/Pendulum-v1_PPO.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.5
3 | algo_name: PPO
4 | env_name: gym
5 | device: cpu
6 | mode: train
7 | exps_trucation_size: 200
8 | is_learner_async: false
9 | load_checkpoint: false
10 | load_path: Train_CartPole-v1_PPO_20231225-124842 # if load checkpoint, then config path in 'tasks' dir
11 | load_model_step: best
12 | n_interactors: 10
13 | max_episode: -1
14 | max_step: 200
15 | seed: 1
16 | online_eval: true
17 | online_eval_episode: 10
18 | model_save_fre: 10
19 | policy_summary_fre: 5
20 | interact_summary_fre: 100
21 | algo_cfg:
22 | actor_branch_layers:
23 | - name: state
24 | layers:
25 | - layer_type: linear
26 | layer_size: [256]
27 | activation: relu
28 | # - layer_type: linear
29 | # layer_size: [256]
30 | # activation: tanh
31 | critic_branch_layers:
32 | - name: state
33 | layers:
34 | - layer_type: linear
35 | layer_size: [256]
36 | activation: relu
37 | # - layer_type: linear
38 | # layer_size: [256]
39 | # activation: tanh
40 | branch_layers:
41 | - name: state
42 | layers:
43 | - layer_type: linear
44 | layer_size: [256]
45 | activation: relu
46 | buffer_type: ONPOLICY_QUE
47 | lr: 0.0003
48 | actor_lr: 0.003
49 | critic_lr: 0.01
50 | entropy_coef: 0.001
51 | critic_loss_coef: 0.5
52 | eps_clip: 0.2
53 | gamma: 0.95
54 | gae_lambda: 0.95
55 | k_epochs: 4
56 | batch_size: 2000
57 | sgd_batch_size: 50
58 | env_cfg:
59 | id: Pendulum-v1
60 | render_mode: null
61 | wrappers:
62 | - wrapper_name: MultiHeadObsWrapper
63 |
64 |
--------------------------------------------------------------------------------
/presets/ClassControl/Pendulum-v1/Pendulum-v1_PPO_off_policy.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.2.3
3 | algo_name: PPO
4 | env_name: gym
5 | device: cpu
6 | mode: train
7 | exps_trucation_size: 200
8 | is_learner_async: true
9 | load_checkpoint: false
10 | load_path: Train_CartPole-v1_PPO_20231225-124842 # if load checkpoint, then config path in 'tasks' dir
11 | load_model_step: best
12 | n_interactors: 10
13 | max_episode: -1
14 | max_step: 200
15 | seed: 1
16 | online_eval: true
17 | online_eval_episode: 10
18 | model_save_fre: 10
19 | policy_summary_fre: 10
20 | interact_summary_fre: 100
21 | algo_cfg:
22 | actor_branch_layers:
23 | - name: state
24 | layers:
25 | - layer_type: linear
26 | layer_size: [256]
27 | activation: relu
28 | # - layer_type: linear
29 | # layer_size: [256]
30 | # activation: tanh
31 | critic_branch_layers:
32 | - name: state
33 | layers:
34 | - layer_type: linear
35 | layer_size: [256]
36 | activation: relu
37 | # - layer_type: linear
38 | # layer_size: [256]
39 | # activation: tanh
40 | branch_layers:
41 | - name: state
42 | layers:
43 | - layer_type: linear
44 | layer_size: [256]
45 | activation: relu
46 | buffer_type: REPLAY_QUE
47 | max_buffer_size: 4000
48 | lr: 0.0003
49 | actor_lr: 0.003
50 | critic_lr: 0.01
51 | entropy_coef: 0.001
52 | critic_loss_coef: 0.5
53 | eps_clip: 0.2
54 | gamma: 0.95
55 | gae_lambda: 0.95
56 | k_epochs: 4
57 | batch_size: 2000
58 | sgd_batch_size: 50
59 | env_cfg:
60 | id: Pendulum-v1
61 | render_mode: null
62 |
63 |
--------------------------------------------------------------------------------
/presets/ClassControl/Pendulum-v1/Pendulum-v1_SAC_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: SAC
3 | device: cuda
4 | env_name: Pendulum-v1
5 | new_step_api: True
6 | wrapper: null
7 | mode: train
8 | load_checkpoint: false
9 | load_path: Train_CartPole-v1_DQN_20221026-054757
10 | max_steps: 200
11 | save_fig: true
12 | seed: 10
13 | show_fig: false
14 | test_eps: 10
15 | train_eps: 400
16 | algo_cfg:
17 | continous: false
18 | alpha: 0.2
19 |
20 |
--------------------------------------------------------------------------------
/presets/ClassControl/Pendulum-v1/Pendulum-v1_TD3.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.7.2
3 | algo_name: TD3
4 | device: cpu
5 | mode: train
6 | load_checkpoint: false
7 | load_path: Train_ray_Pendulum-v1_DDPG_20230527-001715
8 | load_model_step: best
9 | max_episode: 400
10 | max_step: 200
11 | seed: 1
12 | online_eval: true
13 | online_eval_episode: 20
14 | model_save_fre: 500
15 | algo_cfg:
16 | action_space:
17 | type: [dpg]
18 | actor_branch_layers:
19 | - name: state
20 | layers:
21 | - layer_type: linear
22 | layer_size: [256]
23 | activation: tanh
24 | - layer_type: linear
25 | layer_size: [256]
26 | activation: tanh
27 | critic_branch_layers:
28 | - name: state
29 | layers:
30 | - layer_type: none
31 | - name: action
32 | layers:
33 | - layer_type: none
34 | critic_merge_layers:
35 | - layer_type: linear
36 | layer_size: [256]
37 | activation: tanh
38 | - layer_type: linear
39 | layer_size: [256]
40 | activation: tanh
41 | batch_size: 128
42 | buffer_type: REPLAY_QUE
43 | max_buffer_size: 8000
44 | actor_lr: 0.001
45 | critic_lr: 0.001
46 | policy_loss_weight: 0.002
47 | gamma: 0.99
48 | tau: 0.005
49 | env_cfg:
50 | id: Pendulum-v1
51 | render_mode: null # null, rgb_array, human
52 | wrappers:
53 | - wrapper_name: MultiHeadObsWrapper
--------------------------------------------------------------------------------
/presets/ClassControl/Pendulum-v1/Pendulum-v1_TD3_BC_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: TD3_BC
3 | device: cuda
4 | env_name: gym
5 | eval_eps: 5
6 | eval_per_episode: 10
7 | load_checkpoint: true
8 | load_path: Train_gym_TD3_BC_20230416-111154
9 | max_steps: 200
10 | mode: test
11 | new_step_api: true
12 | render: false
13 | save_fig: true
14 | seed: 1
15 | show_fig: false
16 | test_eps: 10
17 | train_eps: 1
18 | wrapper: null
19 | algo_cfg:
20 | actor_hidden_dim: 256
21 | actor_lr: 0.0003
22 | batch_size: 100
23 | max_buffer_size: 1000000
24 | critic_hidden_dim: 256
25 | critic_lr: 0.0003
26 | expl_noise: 0.1
27 | explore_steps: 1000
28 | gamma: 0.99
29 | noise_clip: 0.5
30 | policy_freq: 2
31 | policy_noise: 0.2
32 | tau: 0.005
33 | alpha: 5
34 | lmbda: 1
35 | normalize: false
36 | train_iterations: 2000 # number of training iterations
37 | expert_path: tasks/Collect_gym_TD3_20230416-111040/traj/traj.pkl # path to the expert trajectory data
38 | env_cfg:
39 | id: Pendulum-v1
40 | new_step_api: true
--------------------------------------------------------------------------------
/presets/ClassControl/Pendulum-v1/Pendulum-v1_TD3_BC_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: TD3_BC
3 | device: cuda
4 | env_name: gym
5 | eval_eps: 5
6 | eval_per_episode: 1
7 | load_checkpoint: false
8 | load_path: Train_CartPole-v1_DQN_20221026-054757
9 | max_steps: 200
10 | mode: train
11 | new_step_api: true
12 | render: false
13 | save_fig: true
14 | seed: 1
15 | show_fig: false
16 | test_eps: 10
17 | train_eps: 1
18 | wrapper: null
19 | algo_cfg:
20 | actor_hidden_dim: 256
21 | actor_lr: 0.0003
22 | batch_size: 100
23 | max_buffer_size: 1000000
24 | critic_hidden_dim: 256
25 | critic_lr: 0.0003
26 | expl_noise: 0.1
27 | explore_steps: 1000
28 | gamma: 0.99
29 | noise_clip: 0.5
30 | policy_freq: 2
31 | policy_noise: 0.2
32 | tau: 0.005
33 | alpha: 5
34 | lmbda: 1
35 | normalize: false
36 | train_iterations: 1500 # number of training iterations
37 | expert_path: tasks/Collect_gym_TD3_20230416-111040/traj/traj.pkl # path to the expert trajectory data
38 | env_cfg:
39 | id: Pendulum-v1
40 | new_step_api: true
41 |
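
alpha and lmbda above weight the behaviour-cloning regularizer of TD3_BC. The sketch below shows the actor objective from the original TD3+BC paper, where the RL term is scaled by alpha / mean|Q|; how joyrl combines its separate alpha and lmbda keys is not shown here, so treat this purely as background.

import torch
import torch.nn.functional as F

def td3_bc_actor_loss(q_values: torch.Tensor, policy_actions: torch.Tensor,
                      dataset_actions: torch.Tensor, alpha: float = 2.5) -> torch.Tensor:
    """Actor objective of TD3+BC (Fujimoto & Gu, 2021) -- background sketch, not joyrl's code."""
    lmbda = alpha / q_values.abs().mean().detach()          # adaptive weight on the RL term
    rl_term = -lmbda * q_values.mean()                      # maximize Q under the current policy
    bc_term = F.mse_loss(policy_actions, dataset_actions)   # stay close to the dataset actions
    return rl_term + bc_term
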
--------------------------------------------------------------------------------
/presets/External/Mario/Mario_DQN_CNN_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: DQN_CNN
3 | device: cuda
4 | env_name: Mario
5 | mode: test
6 | load_checkpoint: true
7 | load_path: Train_Mario_DQN_CNN_20221207-155552
8 | max_steps: 1000
9 | save_fig: true
10 | seed: 1
11 | show_fig: false
12 | test_eps: 10
13 | train_eps: 100
14 | wrapper: envs.wrappers.MarioWrappers
15 | new_step_api: True
16 | algo_cfg:
17 | batch_size: 64
18 | max_buffer_size: 100000
19 | epsilon_decay: 500
20 | epsilon_end: 0.01
21 | epsilon_start: 0.95
22 | gamma: 0.99
23 | lr: 0.0001
24 | target_update: 4
25 |
--------------------------------------------------------------------------------
/presets/External/Mario/Mario_DQN_CNN_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: DQN_CNN
3 | device: cuda
4 | env_name: Mario
5 | mode: train
6 | load_checkpoint: false
7 | load_path: Train_Mario_DQN_CNN_20221207-155552
8 | max_steps: 1000
9 | save_fig: true
10 | seed: 1
11 | show_fig: false
12 | test_eps: 10
13 | train_eps: 200
14 | wrapper: envs.wrappers.MarioWrappers
15 | new_step_api: True
16 | algo_cfg:
17 | batch_size: 64
18 | max_buffer_size: 100000
19 | epsilon_decay: 500
20 | epsilon_end: 0.01
21 | epsilon_start: 0.95
22 | gamma: 0.99
23 | lr: 0.0001
24 | target_update: 4
25 |
--------------------------------------------------------------------------------
/presets/Mujoco/Ant-v4/Ant-v4_PPO.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: PPO # algo name
3 | env_name: gym # env name, differs from env_id in env_cfg
4 | device: cuda # device, cpu or cuda
5 | mode: train # run mode: train, test
6 | collect_traj: false # if collect trajectories or not
7 | mp_backend: single # multi-processing mode: single(default), ray
8 | n_workers: 2 # number of workers if using multi-processing, default 1
9 | load_checkpoint: false # if load checkpoint or not
10 | load_path: Train_single_CartPole-v1_DQN_20230515-211721 # if load checkpoint, then config path in 'tasks' dir
11 | load_model_step: best # load model step
12 | max_episode: 2000 # max episodes, set -1 to keep running
13 | max_step: 1000 # max steps per episode
14 | seed: 1 # random seed, set 0 not to use seed
15 | online_eval: true # if online eval or not
16 | online_eval_episode: 10 # online eval episodes
17 | model_save_fre: 500 # update step frequency of saving model
18 | algo_cfg:
19 | actor_layers:
20 | - layer_type: linear
21 | layer_size: [256]
22 | activation: relu
23 | - layer_type: linear
24 | layer_size: [256]
25 | activation: relu
26 | critic_layers:
27 | - layer_type: linear
28 | layer_size: [256]
29 | activation: relu
30 | - layer_type: linear
31 | layer_size: [256]
32 | activation: relu
33 | batch_size: 256
34 | sgd_batch_size: 128
35 | k_epochs: 8
36 | buffer_type: ONPOLICY_QUE
37 | max_buffer_size: 100000
38 | epsilon_decay: 500
39 | epsilon_end: 0.01
40 | epsilon_start: 0.95
41 | gamma: 0.95
42 | lr: 0.0001
43 | target_update: 4
44 | env_cfg:
45 | id: Ant-v4
46 | render_mode: null
--------------------------------------------------------------------------------
/presets/Mujoco/HalfCheetah-v4/HalfCheetah-v2_TD3_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: TD3
3 | device: cuda
4 | env_name: HalfCheetah-v2
5 | eval_eps: 5
6 | eval_per_episode: 10
7 | load_checkpoint: false
8 | load_path: Train_HalfCheetah-v2_TD3_20230221-213446
9 | max_steps: 1000
10 | mode: test
11 | new_step_api: true
12 | render: false
13 | save_fig: true
14 | seed: 1
15 | show_fig: false
16 | test_eps: 10
17 | train_eps: 1200
18 | wrapper: null
19 | algo_cfg:
20 | actor_hidden_dim: 256
21 | actor_lr: 0.0005
22 | batch_size: 256
23 | max_buffer_size: 1000000
24 | critic_hidden_dim: 256
25 | critic_lr: 0.0005
26 | expl_noise: 0.1
27 | explore_steps: 10000
28 | gamma: 0.99
29 | noise_clip: 0.5
30 | policy_freq: 2
31 | policy_noise: 0.2
32 | tau: 0.005
33 |
34 |
--------------------------------------------------------------------------------
/presets/Mujoco/HalfCheetah-v4/HalfCheetah-v2_TD3_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: TD3
3 | device: cuda
4 | env_name: HalfCheetah-v2
5 | eval_eps: 5
6 | eval_per_episode: 10
7 | load_checkpoint: false
8 | load_path: Train_CartPole-v1_DQN_20221026-054757
9 | max_steps: 1000
10 | mode: train
11 | new_step_api: true
12 | render: false
13 | save_fig: true
14 | seed: 1
15 | show_fig: false
16 | test_eps: 10
17 | train_eps: 1200
18 | wrapper: null
19 | algo_cfg:
20 | actor_hidden_dim: 256
21 | actor_lr: 0.0005
22 | batch_size: 256
23 | max_buffer_size: 1000000
24 | critic_hidden_dim: 256
25 | critic_lr: 0.0005
26 | expl_noise: 0.1
27 | explore_steps: 10000
28 | gamma: 0.99
29 | noise_clip: 0.5
30 | policy_freq: 2
31 | policy_noise: 0.2
32 | tau: 0.005
33 |
34 |
--------------------------------------------------------------------------------
/presets/Mujoco/HalfCheetah-v4/HalfCheetah-v4_PPO.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: PPO # algo name
3 | env_name: gym # env name, differs from env_id in env_cfg
4 | device: cuda # device, cpu or cuda
5 | mode: train # run mode: train, test
6 | collect_traj: false # if collect trajectories or not
7 | mp_backend: single # multi-processing mode: single(default), ray
8 | n_workers: 2 # number of workers if using multi-processing, default 1
9 | load_checkpoint: false # if load checkpoint or not
10 | load_path: Train_single_CartPole-v1_DQN_20230515-211721 # if load checkpoint, then config path in 'tasks' dir
11 | load_model_step: best # load model step
12 | max_episode: 2000 # max episodes, set -1 to keep running
13 | max_step: 1000 # max steps per episode
14 | seed: 1 # random seed, set 0 not to use seed
15 | online_eval: true # if online eval or not
16 | online_eval_episode: 10 # online eval episodes
17 | model_save_fre: 500 # update step frequency of saving model
18 | algo_cfg:
19 | # value_layers:
20 | # - layer_type: linear
21 | # layer_size: [256]
22 | # activation: relu
23 | # - layer_type: linear
24 | # layer_size: [256]
25 | # activation: relu
26 | actor_layers:
27 | - layer_type: linear
28 | layer_size: [256]
29 | activation: relu
30 | - layer_type: linear
31 | layer_size: [256]
32 | activation: relu
33 | critic_layers:
34 | - layer_type: linear
35 | layer_size: [256]
36 | activation: relu
37 | - layer_type: linear
38 | layer_size: [256]
39 | activation: relu
40 | batch_size: 256
41 | sgd_batch_size: 128
42 | k_epochs: 8
43 | buffer_type: ONPOLICY_QUE
44 | max_buffer_size: 100000
45 | epsilon_decay: 500
46 | epsilon_end: 0.01
47 | epsilon_start: 0.95
48 | gamma: 0.95
49 | lr: 0.0001
50 | target_update: 4
51 | env_cfg:
52 | id: HalfCheetah-v4
53 | render_mode: null
--------------------------------------------------------------------------------
/presets/Mujoco/Hopper-v4/Hopper-v4_PPO.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.2.2
3 | algo_name: PPO
4 | env_name: gym
5 | device: cuda
6 | interactor_device: cpu
7 | learner_device: cuda
8 | mode: train # set to test when evaluating a trained model
9 | exps_trucation_size: 512
10 | is_learner_async: false
11 | load_checkpoint: false # set to true when testing
12 | load_path: Train_Hopper-v4_PPO_20240625-123656 # if load checkpoint, then config path in 'tasks' dir
13 | load_model_step: best
14 | n_interactors: 10 # use 1 when testing
15 | max_episode: 10000 # use 3 when testing
16 | max_step: 500
17 | seed: 202406
18 | online_eval: true
19 | online_eval_episode: 15 # use 1 when testing
20 | model_save_fre: 10
21 | policy_summary_fre: 10
22 | interact_summary_fre: 100
23 | algo_cfg:
24 | independ_actor: true
25 | return_form: td
26 | actor_branch_layers:
27 | - name: feature_1
28 | layers:
29 | - layer_type: linear
30 | layer_size: [256]
31 | activation: tanh
32 | - layer_type: linear
33 | layer_size: [256]
34 | activation: tanh
35 | critic_branch_layers:
36 | - name: feature_1
37 | layers:
38 | - layer_type: linear
39 | layer_size: [256]
40 | activation: tanh
41 | - layer_type: linear
42 | layer_size: [256]
43 | activation: tanh
44 | buffer_type: ONPOLICY_QUE
45 | lr: 2.5e-4
46 | actor_lr: 1.5e-4
47 | critic_lr: 3.5e-4
48 | entropy_coef: 0.001
49 | critic_loss_coef: 0.001
50 | eps_clip: 0.25
51 | gamma: 0.99
52 | gae_lambda: 0.9
53 | k_epochs: 2
54 | batch_size: 256
55 | sgd_batch_size: 24
56 | env_cfg:
57 | id: Hopper-v4
58 | render_mode: null # use human when testing
59 | wrappers:
60 | - wrapper_name: ClipAction
61 | - wrapper_name: MultiHeadObsWrapper
62 |
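
gamma and gae_lambda above feed generalized advantage estimation, and return_form: td indicates bootstrapped targets. A minimal numpy sketch of GAE(lambda) over one truncated rollout; the variable names are mine, not joyrl's.

import numpy as np

def gae(rewards, values, next_value, dones, gamma=0.99, lam=0.9):
    """Generalized advantage estimation over one truncated rollout -- illustrative sketch."""
    advantages = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        v_next = next_value if t == len(rewards) - 1 else values[t + 1]
        delta = rewards[t] + gamma * v_next * (1 - dones[t]) - values[t]   # TD residual
        running = delta + gamma * lam * (1 - dones[t]) * running
        advantages[t] = running
    returns = advantages + values      # critic regression targets
    return advantages, returns

adv, ret = gae(np.ones(5), np.zeros(5), next_value=0.0, dones=np.zeros(5))
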
--------------------------------------------------------------------------------
/presets/Mujoco/Reacher-v4/Reacher-v4_PPO.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | joyrl_version: 0.6.2.2
3 | algo_name: PPO
4 | env_name: gym
5 | device: cuda
6 | interactor_device: cpu
7 | learner_device: cuda
8 | mode: train
9 | exps_trucation_size: 512
10 | is_learner_async: false
11 | load_checkpoint: false
12 | load_path: Train_Reacher-v4_PPO_20240702-005711
13 | load_model_step: best
14 | n_interactors: 10
15 | max_episode: 42000
16 | max_step: 100
17 | seed: 202406
18 | online_eval: true
19 | online_eval_episode: 15
20 | model_save_fre: 10
21 | policy_summary_fre: 10
22 | interact_summary_fre: 100
23 | algo_cfg:
24 | independ_actor: true
25 | return_form: td
26 | actor_branch_layers:
27 | - name: feature_1
28 | layers:
29 | - layer_type: linear
30 | layer_size: [240]
31 | activation: tanh
32 | - layer_type: linear
33 | layer_size: [240]
34 | activation: tanh
35 | critic_branch_layers:
36 | - name: feature_1
37 | layers:
38 | - layer_type: linear
39 | layer_size: [240]
40 | activation: tanh
41 | - layer_type: linear
42 | layer_size: [240]
43 | activation: tanh
44 | buffer_type: ONPOLICY_QUE
45 | lr: 2.5e-4
46 | actor_lr: 5.5e-4
47 | critic_lr: 7.5e-4
48 | entropy_coef: 0.001
49 | critic_loss_coef: 0.001
50 | eps_clip: 0.185
51 | gamma: 0.99
52 | gae_lambda: 0.985
53 | k_epochs: 2
54 | batch_size: 256
55 | sgd_batch_size: 128
56 | env_cfg:
57 | id: Reacher-v4
58 | render_mode: null
59 | max_episode_steps: 100
60 | wrappers:
61 | - wrapper_name: ClipAction
62 | - wrapper_name: MultiHeadObsWrapper
63 | - wrapper_name: ReacherDistReward
64 | dis_weight: 0.4
65 |
66 |
--------------------------------------------------------------------------------
/presets/Others/Racetrack-v0/Racetrack-v0_QLearning_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: QLearning
3 | device: cpu
4 | env_name: Racetrack-v0
5 | render: True
6 | render_mode: human
7 | mode: test
8 | load_checkpoint: true
9 | load_path: Train_Racetrack-v0_QLearning_20221128-154935
10 | max_steps: 200
11 | save_fig: true
12 | seed: 10
13 | show_fig: false
14 | test_eps: 10
15 | train_eps: 400
16 | algo_cfg:
17 | epsilon_decay: 300
18 | epsilon_end: 0.01
19 | epsilon_start: 0.95
20 | gamma: 0.9
21 | lr: 0.1
22 |
--------------------------------------------------------------------------------
/presets/Others/Racetrack-v0/Racetrack-v0_QLearning_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: QLearning
3 | device: cpu
4 | env_name: Racetrack-v0
5 | new_step_api: True
6 | wrapper: null
7 | mode: train
8 | load_checkpoint: false
9 | load_path: Train_CartPole-v1_DQN_20221026-054757
10 | max_steps: 200
11 | save_fig: true
12 | seed: 10
13 | show_fig: false
14 | test_eps: 20
15 | train_eps: 400
16 | algo_cfg:
17 | epsilon_decay: 300
18 | epsilon_end: 0.01
19 | epsilon_start: 0.95
20 | gamma: 0.9
21 | lr: 0.1
22 |
--------------------------------------------------------------------------------
/presets/Others/Racetrack-v0/Train_Racetrack-v0_FirstVisitMC.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: FirstVisitMC
3 | device: cpu
4 | env_name: Racetrack-v0
5 | eval_eps: 10
6 | eval_per_episode: 5
7 | load_checkpoint: false
8 | load_path: tasks
9 | max_steps: 200
10 | mode: train
11 | save_fig: true
12 | seed: 1
13 | show_fig: false
14 | test_eps: 20
15 | train_eps: 200
16 | algo_cfg:
17 | epsilon: 0.15
18 | gamma: 0.9
19 | lr: 0.1
20 |
--------------------------------------------------------------------------------
/presets/Others/deep-sea-treasure-v0/DeepSeaTreasure-v0_MOQLearning_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: MO-QLearning
3 | device: cpu
4 | env_name: deep-sea-treasure-v0
5 | wrapper: envs.wrappers.DeepSeaTreasure
6 | render: True
7 | mode: test
8 | load_checkpoint: true
9 | load_path: Test_deep-sea-treasure-v0_MO-QLearning_20230329-234802
10 | max_steps: 100
11 | save_fig: true
12 | seed: 1
13 | show_fig: false
14 | test_eps: 10
15 | train_eps: 300
16 | algo_cfg:
17 | epsilon_decay: 300
18 | epsilon_end: 0.01
19 | epsilon_start: 0.95
20 | gamma: 0.99
21 | lr: 0.1
22 |
--------------------------------------------------------------------------------
/presets/Others/deep-sea-treasure-v0/DeepSeaTreasure-v0_MOQLearning_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: MO-QLearning
3 | device: cpu
4 | env_name: deep-sea-treasure-v0
5 | wrapper: envs.wrappers.DeepSeaTreasure
6 | mode: train
7 | load_checkpoint: false
8 | load_path: Train_deep-sea-treasure-v0_MO-QLearning_20230329-234319
9 | max_steps: 100
10 | save_fig: true
11 | seed: 1
12 | show_fig: false
13 | test_eps: 10
14 | train_eps: 300
15 | algo_cfg:
16 | exploration_type: e-greedy # softmax, ucb
17 | epsilon_decay: 300
18 | epsilon_end: 0.01
19 | epsilon_start: 0.95
20 | gamma: 0.99
21 | lr: 0.1
22 |
--------------------------------------------------------------------------------
/presets/Others/theAlley/theAlley_VI_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: VI
3 | device: cpu
4 | env_name: theAlley
5 | mode: test
6 | load_checkpoint: true
7 | load_path: Train_theAlley_VI_20221122-215228
8 | max_steps: 200
9 | save_fig: true
10 | seed: 1
11 | show_fig: false
12 | test_eps: 10
13 | train_eps: 100
14 | algo_cfg:
15 | # value_layers:
16 | # - layer_type: linear
17 | # layer_dim: ['n_states',256]
18 | # activation: relu
19 | # - layer_type: linear
20 | # layer_dim: [256,256]
21 | # activation: relu
22 | # - layer_type: linear
23 | # layer_dim: [256,'n_actions']
24 | # activation: none
25 | # batch_size: 64
26 | # max_buffer_size: 100000
27 | # epsilon_decay: 500
28 | # epsilon_end: 0.01
29 | # epsilon_start: 0.95
30 | gamma: 0.95
31 | lr: 0.0001
32 | # target_update: 4
33 |
--------------------------------------------------------------------------------
/presets/Others/theAlley/theAlley_VI_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: VI
3 | device: cpu
4 | env_name: theAlley
5 | mode: train
6 | load_checkpoint: false
7 | load_path: Train_theAlley_VI_20221122-215228
8 | max_steps: 200
9 | save_fig: true
10 | seed: 1
11 | show_fig: false
12 | test_eps: 10
13 | train_eps: 100
14 | algo_cfg:
15 | # value_layers:
16 | # - layer_type: linear
17 | # layer_dim: ['n_states',256]
18 | # activation: relu
19 | # - layer_type: linear
20 | # layer_dim: [256,256]
21 | # activation: relu
22 | # - layer_type: linear
23 | # layer_dim: [256,'n_actions']
24 | # activation: none
25 | # batch_size: 64
26 | # max_buffer_size: 100000
27 | # epsilon_decay: 500
28 | # epsilon_end: 0.01
29 | # epsilon_start: 0.95
30 | gamma: 0.95
31 | lr: 0.0001
32 | # target_update: 4
33 |
--------------------------------------------------------------------------------
/presets/Pendulum-v1_TD3_mp_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: TD3
3 | device: cpu
4 | env_name: gym
5 | eval_eps: 5
6 | eval_per_episode: 10
7 | load_checkpoint: true
8 | load_path: Train_gym_TD3_20230416-214019
9 | max_steps: 200
10 | mode: test
11 | n_workers: 1 # number of workers for parallel training
12 | render: false
13 | save_fig: true
14 | seed: 1
15 | show_fig: false
16 | test_eps: 10
17 | train_eps: 200
18 | wrapper: null
19 | algo_cfg:
20 | actor_hidden_dim: 256
21 | actor_lr: 0.001
22 | batch_size: 100
23 | max_buffer_size: 1000000
24 | critic_hidden_dim: 256
25 | critic_lr: 0.001
26 | expl_noise: 0.1
27 | explore_steps: 1000
28 | gamma: 0.99
29 | noise_clip: 0.5
30 | policy_freq: 2
31 | policy_noise: 0.2
32 | tau: 0.005
33 | global_best_reward: -1800
34 | env_cfg:
35 | id: Pendulum-v1
36 | new_step_api: true
37 |
--------------------------------------------------------------------------------
/presets/Pendulum-v1_TD3_mp_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: TD3
3 | device: cpu
4 | env_name: gym
5 | eval_eps: 1
6 | eval_per_episode: 10
7 | load_checkpoint: false
8 | load_path: Train_CartPole-v1_DQN_20221026-054757
9 | max_steps: 200
10 | mode: train
11 | mp_backend: mp # multiprocessing backend: ray or mp (multiprocessing), defaults to mp
12 | n_workers: 4 # number of workers for parallel training
13 | render: false
14 | save_fig: true
15 | seed: 1
16 | show_fig: false
17 | test_eps: 10
18 | train_eps: 200
19 | wrapper: null
20 | algo_cfg:
21 | actor_hidden_dim: 256
22 | actor_lr: 0.001
23 | batch_size: 100
24 | max_buffer_size: 1000000
25 | critic_hidden_dim: 256
26 | critic_lr: 0.001
27 | expl_noise: 0.1
28 | explore_steps: 1000
29 | gamma: 0.99
30 | noise_clip: 0.5
31 | policy_freq: 2
32 | policy_noise: 0.2
33 | tau: 0.005
34 | global_best_reward: -1800
35 | env_cfg:
36 | id: Pendulum-v1
37 | new_step_api: true
38 |
--------------------------------------------------------------------------------
/presets/ToyText/CliffWalking-v0/CliffWalking-v0_DynaQ_Test.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: DynaQ
3 | device: cpu
4 | env_name: CliffWalking-v0
5 | wrapper: envs.wrappers.CliffWalkingWapper
6 | mode: test
7 | load_checkpoint: true
8 | load_path: Train_CliffWalking-v0_DynaQ_20221210-095808
9 | max_steps: 100
10 | save_fig: true
11 | seed: 1
12 | show_fig: false
13 | test_eps: 10
14 | train_eps: 100
15 | algo_cfg:
16 | exploration_type: e-greedy # softmax, ucb
17 | epsilon_decay: 300
18 | epsilon_end: 0.01
19 | epsilon_start: 0.95
20 | gamma: 0.99
21 | lr: 0.1
22 |
--------------------------------------------------------------------------------
/presets/ToyText/CliffWalking-v0/CliffWalking-v0_DynaQ_Train.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: DynaQ
3 | device: cpu
4 | env_name: CliffWalking-v0
5 | wrapper: envs.wrappers.CliffWalkingWapper
6 | mode: train
7 | load_checkpoint: false
8 | load_path: Train_CliffWalking-v0_DynaQ_20221210-095808
9 | max_steps: 100
10 | save_fig: true
11 | seed: 1
12 | show_fig: false
13 | test_eps: 10
14 | train_eps: 100
15 | algo_cfg:
16 | exploration_type: e-greedy # softmax, ucb
17 | epsilon_decay: 300
18 | epsilon_end: 0.01
19 | epsilon_start: 0.95
20 | gamma: 0.99
21 | lr: 0.1
22 |
--------------------------------------------------------------------------------
/presets/ToyText/CliffWalking-v0/CliffWalking-v0_QLearning.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: QLearning # algo name
3 | env_name: gym # env name, differs from env_id in env_cfg
4 | mode: train # run mode: train, test
5 | collect_traj: false # if collect trajectories or not
6 | load_checkpoint: false # if load checkpoint or not
7 | load_path: Train_CliffWalking-v0_QLearning_20231224-173215 # if load checkpoint, then config path in 'tasks' dir
8 | load_model_step: 12000 # load model step
9 | max_episode: 500 # max episodes, set -1 to keep running
10 | max_step: 100 # max steps per episode
11 | seed: 1 # random seed, set 0 not to use seed
12 | model_save_fre: 200
13 | online_eval: true
14 | algo_cfg:
15 | epsilon_decay: 500
16 | epsilon_end: 0.01
17 | epsilon_start: 0.95
18 | gamma: 0.99
19 | lr: 0.1
20 | env_cfg:
21 | id: CliffWalking-v0
22 | render_mode: null
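
For the tabular presets (QLearning, Sarsa, DynaQ), lr and gamma enter the usual temporal-difference update. A short, self-contained sketch of Q-learning steps on CliffWalking-v0 with gymnasium (as pinned in requirements.txt); the table layout and loop are illustrative, not joyrl's trainer.

import numpy as np
import gymnasium as gym

env = gym.make("CliffWalking-v0")
q_table = np.zeros((env.observation_space.n, env.action_space.n))
lr, gamma, epsilon = 0.1, 0.99, 0.1

state, _ = env.reset(seed=1)
for _ in range(100):
    # epsilon-greedy action selection
    if np.random.rand() < epsilon:
        action = env.action_space.sample()
    else:
        action = int(q_table[state].argmax())
    next_state, reward, terminated, truncated, _ = env.step(action)
    # Q(s,a) <- Q(s,a) + lr * (r + gamma * max_a' Q(s',a') - Q(s,a))
    td_target = reward + gamma * q_table[next_state].max() * (not terminated)
    q_table[state, action] += lr * (td_target - q_table[state, action])
    state = next_state
    if terminated or truncated:
        state, _ = env.reset()
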
--------------------------------------------------------------------------------
/presets/ToyText/CliffWalking-v0/CliffWalking-v0_Sarsa.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: Sarsa # algo name
3 | env_name: gym # env name, differs from env_id in env_cfg
4 | mode: train # run mode: train, test
5 | collect_traj: false # if collect trajectories or not
6 | load_checkpoint: false # if load checkpoint or not
7 | load_path: Train_single_CliffWalking-v0_Sarsa_20230519-010804 # if load checkpoint, then config path in 'tasks' dir
8 | load_model_step: 12000 # load model step
9 | max_episode: 500 # max episodes, set -1 to keep running
10 | max_step: 100 # max steps per episode
11 | seed: 1 # random seed, set 0 not to use seed
12 | model_save_fre: 200
13 | online_eval: true
14 |
15 | algo_cfg:
16 | epsilon_decay: 500
17 | epsilon_end: 0.01
18 | epsilon_start: 0.95
19 | gamma: 0.99
20 | lr: 0.1
21 |
22 | env_cfg:
23 | id: CliffWalking-v0
24 | render_mode: null
--------------------------------------------------------------------------------
/presets/ToyText/CliffWalking-v0/CustomCliffWalking-v0_DQN.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: DQN
3 | env_name: gym
4 | device: cpu
5 | mode: train
6 | collect_traj: false
7 | n_interactors: 1
8 | load_checkpoint: false
9 | load_path: Train_single_CartPole-v1_DQN_20230515-211721
10 | load_model_step: best
11 | max_episode: -1
12 | max_step: 20
13 | seed: 1
14 | online_eval: true
15 | online_eval_episode: 10
16 | model_save_fre: 500
17 |
18 | algo_cfg:
19 | value_layers:
20 | - layer_type: embed
21 | n_embeddings: 48
22 | embedding_dim: 4
23 | - layer_type: linear
24 | layer_size: [256]
25 | activation: relu
26 | - layer_type: linear
27 | layer_size: [256]
28 | activation: relu
29 | batch_size: 128
30 | buffer_type: REPLAY_QUE
31 | max_buffer_size: 10000
32 | epsilon_decay: 1000
33 | epsilon_end: 0.01
34 | epsilon_start: 0.99
35 | gamma: 0.95
36 | lr: 0.001
37 | target_update: 4
38 | env_cfg:
39 | id: CustomCliffWalking-v0
40 | render_mode: null
--------------------------------------------------------------------------------
/presets/ToyText/FrozenLake-v1/FrozenLake-v1_NoSlippery_QLearning.yaml:
--------------------------------------------------------------------------------
1 | general_cfg:
2 | algo_name: QLearning # algo name
3 | env_name: gym # env name, differs from env_id in env_cfg
4 | mode: train # run mode: train, test
5 | collect_traj: false # if collect trajectories or not
6 | load_checkpoint: false # if load checkpoint or not
7 | load_path: Train_CliffWalking-v0_QLearning_20231224-173215 # if load checkpoint, then config path in 'tasks' dir
8 | load_model_step: 12000 # load model step
9 | max_episode: 800 # max episodes, set -1 to keep running
10 | max_step: 100 # max steps per episode
11 | seed: 10 # random seed, set 0 not to use seed
12 | model_save_fre: 200
13 | online_eval: true
14 | algo_cfg:
15 | epsilon_decay: 2000
16 | epsilon_end: 0.1
17 | epsilon_start: 0.7
18 | gamma: 0.95
19 | lr: 0.9
20 | env_cfg:
21 | id: FrozenLake-v1
22 | is_slippery: false
23 | render_mode: null
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | ray[default]==2.6.3
2 | gymnasium==0.29.1
3 | tensorboard==2.16.2
4 | matplotlib==3.8.4
5 | seaborn==0.13.2
6 | dill==0.3.8
7 | scipy==1.13.0
8 | swig==4.2.1
9 | pygame==2.6.0
10 | gymnasium[box2d]==0.29.1
11 | numpy==1.26.4
12 | pandas==2.2.2
13 | six==1.16.0
14 | setuptools==69.5.1
15 | scipy==1.13.0
16 | PyYAML==6.0.1
17 | pydantic==1.10.17
18 | psutil==6.0.0
19 | colorlog==6.8.2
--------------------------------------------------------------------------------
/run.bat:
--------------------------------------------------------------------------------
1 | python offline_run.py --yaml presets/ClassControl/CartPole-v1/CartPole-v1_DQN.yaml
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
1 | # conda activate joyrl
2 | python offline_run.py --yaml presets/ClassControl/CartPole-v1/CartPole-v1_DQN.yaml
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | exclude =
3 | .git
4 | log
5 | __pycache__
6 | docs
7 | build
8 | dist
9 | *.egg-info
10 | .DS_Store
11 | benchmarks
12 | max-line-length = 87
13 | ignore = B305,W504,B006,B008,B024
14 |
15 | [yapf]
16 | based_on_style = pep8
17 | dedent_closing_brackets = true
18 | column_limit = 87
19 | blank_line_before_nested_class_or_def = true
20 |
21 | [isort]
22 | profile = black
23 | multi_line_output = 3
24 | line_length = 87
25 |
26 | [mypy]
27 | files = joyrl/**/*.py
28 | allow_redefinition = True
29 | check_untyped_defs = True
30 | disallow_incomplete_defs = True
31 | disallow_untyped_defs = True
32 | ignore_missing_imports = True
33 | no_implicit_optional = True
34 | pretty = True
35 | show_error_codes = True
36 | show_error_context = True
37 | show_traceback = True
38 | strict_equality = True
39 | strict_optional = True
40 | warn_no_return = True
41 | warn_redundant_casts = True
42 | warn_unreachable = True
43 | warn_unused_configs = True
44 | warn_unused_ignores = True
45 |
46 | [pydocstyle]
47 | ignore = D100,D102,D104,D105,D107,D203,D213,D401,D402
48 |
49 | [doc8]
50 | max-line-length = 1000
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | '''
4 | Author: JiangJi
5 | Email: johnjim0816@gmail.com
6 | Date: 2023-12-22 13:01:23
7 | LastEditor: JiangJi
8 | LastEditTime: 2024-06-17 14:43:29
9 | Description:
10 | '''
11 | import sys,os
12 | from setuptools import setup, find_packages
13 | curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
14 |
15 | def get_version() -> str:
16 | # https://packaging.python.org/guides/single-sourcing-package-version/
17 | init = open(os.path.join("joyrl", "__init__.py"), "r").read().split()
18 | return init[init.index("__version__") + 2][1:-1]
19 |
20 | def get_install_requires() -> str:
21 | return [
22 | "ray[default]==2.6.3",
23 | "gymnasium==0.29.1",
24 | "gymnasium[box2d]==0.29.1",
25 | "tensorboard==2.16.2",
26 | "matplotlib==3.8.4",
27 | "seaborn==0.13.2",
28 | "dill==0.3.8",
29 | "scipy==1.13.0",
30 | "pygame==2.5.2",
31 | "swig==4.2.1",
32 | "numpy==1.26.4",
33 | "pandas==2.2.2",
34 | "six==1.16.0",
35 | "setuptools==69.5.1",
36 | "scipy==1.13.0",
37 | "PyYAML==6.0.1",
38 | "pydantic==1.10.15",
39 | "psutil==0.3.14",
40 | ""
41 | ]
42 |
43 | def get_extras_require() -> str:
44 | req = {
45 | "atari": ["atari_py", "opencv-python"],
46 | "mujoco": ["mujoco_py"],
47 | "pybullet": ["pybullet"],
48 | }
49 | return req
50 |
51 | setup(
52 | name="joyrl",
53 | version=get_version(),
54 | description="A Library for Deep Reinforcement Learning",
55 | long_description=open(f"{curr_path}/README.md", encoding="utf8").read(),
56 | long_description_content_type="text/markdown",
57 | url="https://github.com/datawhalechina/joyrl",
58 | author="johnjim0816",
59 | author_email="johnjim0816@gmail.com",
60 | license="MIT",
61 | python_requires=">=3.7",
62 | keywords="reinforcement learning platform pytorch",
63 | packages=find_packages(
64 | exclude=["test", "test.*", "examples", "examples.*", "docs", "docs.*"]
65 | ),
66 | platforms = "any",
67 | install_requires=get_install_requires(),
68 | extras_require=get_extras_require(),
69 | entry_points={
70 | "console_scripts": [
71 | "joyrl=joyrl.scripts.scripts:main",
72 | ],
73 | },
74 | )
--------------------------------------------------------------------------------
/setup.sh:
--------------------------------------------------------------------------------
1 | echo "rm -rf old dist"
2 | rm -rf dist/
3 | echo "rm -rf old build"
4 | rm -rf build/
5 | echo "run: python setup.py sdist bdist_wheel"
6 | python setup.py sdist bdist_wheel
7 | echo "run: test pypi"
8 | twine upload --repository pypitest dist/*
9 | echo "run: upload!"
10 | twine upload dist/*
--------------------------------------------------------------------------------
/stop.bat:
--------------------------------------------------------------------------------
1 | ray stop
--------------------------------------------------------------------------------
/stop.sh:
--------------------------------------------------------------------------------
1 | ray stop
--------------------------------------------------------------------------------